Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
8df347a
run on tasks allows dataframes
amueller Sep 3, 2019
ce8640c
don't force third subcomponent part to be list
amueller Sep 3, 2019
d8a2347
Merge branch 'develop' into dataframe_run_on_task
amueller Oct 16, 2019
b16a937
Making DataFrame default behaviour for runs; Fixing test cases for th…
Neeratyoy Dec 4, 2019
df7b496
Merge branch 'develop' into dataframe_run_on_task
Neeratyoy Dec 4, 2019
b31c1dc
Fixing PEP8 + Adding docstring to CustomImputer()
Neeratyoy Dec 11, 2019
05b55a3
Merge branch 'dataframe_run_on_task' of https://github.com/amueller/o…
Neeratyoy Dec 11, 2019
5cd59af
run on tasks allows dataframes
amueller Sep 3, 2019
1f8c37b
Attempting rebase
Neeratyoy Jan 21, 2020
9d45f6f
Fixing merge
Neeratyoy Jan 21, 2020
453fbdf
Fixing test cases
anonymous99199 Feb 12, 2020
6c7172c
Trying test case fixes
anonymous99199 Feb 14, 2020
b98fdb4
run on tasks allows dataframes
amueller Sep 3, 2019
8ac6468
don't force third subcomponent part to be list
amueller Sep 3, 2019
5c5fb31
Making DataFrame default behaviour for runs; Fixing test cases for th…
Neeratyoy Dec 4, 2019
b5729d1
Fixing PEP8 + Adding docstring to CustomImputer()
Neeratyoy Dec 11, 2019
b619361
Attempting rebase
Neeratyoy Jan 21, 2020
25e1b8e
Fixing test cases
anonymous99199 Feb 12, 2020
08b1be1
Trying test case fixes
anonymous99199 Feb 14, 2020
6a4eae3
Rebasing
Neeratyoy Mar 9, 2020
ae9c312
Allowing functions in subcomponents
Neeratyoy Mar 24, 2020
7f60589
Fixing test cases
Neeratyoy Jun 5, 2020
49584de
Adding dataset output param to run
Neeratyoy Jun 16, 2020
b25bbc4
Fixing test cases
Neeratyoy Jun 16, 2020
c0116e4
Changes suggested by mfeurer
Neeratyoy Jun 27, 2020
7379f0c
Editing predict_proba function
Neeratyoy Jun 27, 2020
ba9c1a2
Merge branch 'develop' into dataframe_run_on_task
Neeratyoy Jul 5, 2020
3305a12
Test case fix
Neeratyoy Jul 6, 2020
1b05089
Test case fix
Neeratyoy Jul 7, 2020
053beb6
Edit unit test to bypass server issue
Neeratyoy Jul 9, 2020
15743ee
Fixing unit test
Neeratyoy Jul 9, 2020
440c0ad
Reiterating with @PGijsbers comments
Neeratyoy Jul 10, 2020
7967624
Minor fixes to test cases
Neeratyoy Jul 24, 2020
c06eb0d
Adding unit test and suggestions from @mfeurer
Neeratyoy Aug 1, 2020
83f309a
Fixing test case for all sklearn versions
Neeratyoy Aug 1, 2020
c2a090a
Merge branch 'develop' into dataframe_run_on_task
Neeratyoy Aug 3, 2020
2cb2028
Testing changes
Neeratyoy Aug 10, 2020
5ea4d31
Merge branch 'dataframe_run_on_task' of https://github.com/amueller/o…
Neeratyoy Aug 10, 2020
f0ff562
Fixing import in example
Neeratyoy Aug 10, 2020
7200418
Triggering unit tests
Neeratyoy Aug 14, 2020
29af032
Debugging failed example script
Neeratyoy Aug 14, 2020
001ee74
Merge branch 'develop' into dataframe_run_on_task
Neeratyoy Aug 17, 2020
ae57bea
Adding unit tests
Neeratyoy Aug 17, 2020
1fee939
Merge branch 'dataframe_run_on_task' of https://github.com/amueller/o…
Neeratyoy Aug 17, 2020
418d9e6
Push for debugging
Neeratyoy Aug 24, 2020
c00c060
Push for @mfeurer to debug
Neeratyoy Aug 26, 2020
463a326
Merging with latest updates on [develop]
Neeratyoy Sep 7, 2020
44c9e65
Resetting to debug
Neeratyoy Oct 19, 2020
d439d73
Updating branch and fixing test cases
Neeratyoy Oct 19, 2020
3ae777e
Updating branch
Neeratyoy Oct 19, 2020
9fc6c10
Merging branches
Neeratyoy Oct 19, 2020
3acce3f
pre-commit fixes
Neeratyoy Oct 19, 2020
39011df
Handling failing examples
Neeratyoy Oct 20, 2020
4908237
Reiteration with clean ups and minor fixes
Neeratyoy Oct 21, 2020
90ad9e2
Closing comments
Neeratyoy Oct 21, 2020
9982afe
Black fixes
Neeratyoy Oct 21, 2020
da8dbb9
feedback from @mfeurer
Neeratyoy Oct 21, 2020
0a0f71f
Minor fix
Neeratyoy Oct 22, 2020
78ab677
suggestions from @PGijsbers
Neeratyoy Oct 22, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Making DataFrame default behaviour for runs; Fixing test cases for th…
…e same
  • Loading branch information
Neeratyoy committed Mar 9, 2020
commit 5c5fb31c5531832865ba3de2f2cfee946f6b73c8
35 changes: 26 additions & 9 deletions openml/extensions/sklearn/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -762,6 +762,7 @@ def _get_external_version_string(
# requirements for their subcomponents. The external version string is a
# sorted concatenation of all modules which are present in this run.
model_package_name = model.__module__.split('.')[0]

module = importlib.import_module(model_package_name)
model_package_version_number = module.__version__ # type: ignore
external_version = self._format_external_version(
Expand Down Expand Up @@ -1512,10 +1513,11 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
if not isinstance(classes, list):
raise ValueError('please convert model classes to list prior to '
'calling this fn')
result = np.zeros((len(y), len(classes)), dtype=np.float32)
for obs, prediction_idx in enumerate(y):
result[obs][prediction_idx] = 1.0
return result
# DataFrame allows more accurate mapping of classes as column names
Comment thread
mfeurer marked this conversation as resolved.
result = pd.DataFrame(0, index=np.arange(len(y)), columns=classes, dtype=np.float32)
for obs, prediction in enumerate(y):
result.loc[obs, prediction] = 1.0
return result.to_numpy()

if isinstance(task, OpenMLSupervisedTask):
Comment thread
PGijsbers marked this conversation as resolved.
if y_train is None:
Expand Down Expand Up @@ -1573,6 +1575,11 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
else:
model_classes = used_estimator.classes_

# to handle the case when dataset is numpy and categories are encoded
# however the class labels stored in task are still categories
if isinstance(y_train, np.ndarray) and isinstance(task.class_labels[0], str):
model_classes = [task.class_labels[i] for i in model_classes]

modelpredict_start_cputime = time.process_time()
modelpredict_start_walltime = time.time()

Expand Down Expand Up @@ -1601,9 +1608,16 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra

try:
proba_y = model_copy.predict_proba(X_test)
except AttributeError:
except AttributeError: # predict_proba is not available when probability=False
if task.class_labels is not None:
proba_y = _prediction_to_probabilities(pred_y, list(task.class_labels))
if isinstance(y_train, np.ndarray) and isinstance(task.class_labels[0], str):
# mapping (decoding) the predictions to the categories
# creating a separate copy to not change the expected pred_y type
preds = [task.class_labels[pred] for pred in pred_y]
Comment thread
mfeurer marked this conversation as resolved.
Outdated
proba_y = _prediction_to_probabilities(preds, model_classes)
else:
proba_y = _prediction_to_probabilities(pred_y, model_classes)

else:
raise ValueError('The task has no class labels')

Expand All @@ -1619,10 +1633,13 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
# then we need to add a column full of zeros into the probabilities
# for class 3 because the rest of the library expects that the
# probabilities are ordered the same way as the classes are ordered).
proba_y_new = np.zeros((proba_y.shape[0], len(task.class_labels)))

# DataFrame allows more accurate mapping of classes as column names
proba_y_new = pd.DataFrame(0, index=np.arange(proba_y.shape[0]),
columns=task.class_labels, dtype=np.float32)
for idx, model_class in enumerate(model_classes):
proba_y_new[:, model_class] = proba_y[:, idx]
proba_y = proba_y_new
proba_y_new.loc[:, model_class] = proba_y[:, idx]
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it perhaps not be clearer to:

  • keep the result of prediction_to_probability as a dataframe.
  • convert proba_y to a dataframe if predict_proba was called:
    proba_y = pd.DataFrame(proba_y, columns=model_classes)
  • then add missing columns
  • then reorder the dataframe:
    proba_y = proba_y[task.class_labels]

Advantages:

  • Don't produce a whole copy in memory.
  • Keep indexing by class label only, never integers.
  • Can make (and keep) _prediction_to_proba store the probabilities as e.g. uint8 for much smaller footprint (though conversion might be required later anyway, so I don't know if it is a win).

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good spot. Thanks.
I made some changes to not have the extra copy and retain it as dataframe whenever applicable, without a conversion back and forth.

However, right at the end, needed to convert the probability array to numpy since _run_task_get_arffcontent appears to require a numpy array as the probability matrix.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, right at the end, needed to convert the probability array to numpy since _run_task_get_arffcontent appears to require a numpy array as the probability matrix.

You could also change that function to accept a dataframe - it appears easier and safer to work with.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey, I just checked but I think you did not yet update the return value type annotation for the functions openml.extensions.sklearn.extension._prediction_to_probabilities and openml.extensions.sklearn.extension._run_model_on_fold.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated it, thanks for pointing it out.
However, not sure how to edit the docstrings since they don't appear to bear semblance to what the function is actually returning.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That appears to have been wrong for two years; you can just update the docstring.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the update. I think Pieter's original suggestion is not yet addressed - do you think you could have a look whether that's still possible?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this resolved now then?

proba_y = proba_y_new.to_numpy()

if proba_y.shape[1] != len(task.class_labels):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a repeated clause from Line#1666.
If it was not true there (they are equal), then it is not true here.
If it was true there (they were unequal), then Line#1682 specifically creates proba_y_new to have the columns task.class_labels, and proba_y is later assigned the value of proba_y_new.
So I see no scenario where this statement is true (i.e. at this point proba_y should always have a column for each label in task.class_labels). Am I missing something? Or is it forgotten code?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, you're right. I was scratching my head too. I think it makes sense to remove the redundant if but retain the warning message that is being logged.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me.

message = "Estimator only predicted for {}/{} classes!".format(
Expand Down
2 changes: 1 addition & 1 deletion openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ def _calculate_local_measure(sklearn_fn, openml_name):
elif isinstance(task, OpenMLRegressionTask):

for i in range(0, len(test_indices)):
Comment thread
Neeratyoy marked this conversation as resolved.
Outdated
arff_line = [rep_no, fold_no, test_indices[i], pred_y[i], test_y[i]]
arff_line = [rep_no, fold_no, test_indices[i], pred_y[i], test_y.iloc[i]]
arff_datacontent.append(arff_line)

if add_local_measures:
Expand Down
14 changes: 13 additions & 1 deletion openml/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,4 +249,16 @@ def _check_fold_timing_evaluations(
from sklearn.preprocessing import Imputer as SimpleImputer


__all__ = ['TestBase', 'SimpleImputer']
class CustomImputer(SimpleImputer):
pass


def cont(X):
return X.dtypes != 'category'


def cat(X):
return X.dtypes == 'category'


__all__ = ['TestBase', 'SimpleImputer', 'CustomImputer', 'cat', 'cont']
Original file line number Diff line number Diff line change
Expand Up @@ -1340,7 +1340,8 @@ def test_run_model_on_task(self):
class MyPipe(sklearn.pipeline.Pipeline):
pass
task = openml.tasks.get_task(1)
pipe = MyPipe([('imp', SimpleImputer()),
# using most_frequent imputer since dataset has mixed types and to keep things simple
pipe = MyPipe([('imp', SimpleImputer(strategy='most_frequent')),
('dummy', sklearn.dummy.DummyClassifier())])
openml.runs.run_model_on_task(pipe, task)

Expand Down
116 changes: 89 additions & 27 deletions tests/test_runs/test_run_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import sys
import unittest.mock

import scipy
import numpy as np
import pytest

Expand All @@ -20,7 +21,7 @@
import pandas as pd

import openml.extensions.sklearn
from openml.testing import TestBase, SimpleImputer
from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
from openml.runs.functions import (
_run_task_get_arffcontent,
run_exists,
Expand All @@ -31,17 +32,16 @@
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection._search import BaseSearchCV
from sklearn.tree import DecisionTreeClassifier

from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression, SGDClassifier, \
LinearRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, \
StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline, make_pipeline


class TestRun(TestBase):
Expand Down Expand Up @@ -342,7 +342,9 @@ def test_run_regression_on_classif_task(self):

clf = LinearRegression()
task = openml.tasks.get_task(task_id)
with self.assertRaises(AttributeError):
# internally dataframe is loaded and targets are categorical
# which LinearRegression() cannot handle
with self.assertRaises(ValueError):
Comment thread
mfeurer marked this conversation as resolved.
Outdated
openml.runs.run_model_on_task(
model=clf,
task=task,
Expand Down Expand Up @@ -537,15 +539,17 @@ def test_run_and_upload_column_transformer_pipeline(self):
def get_ct_cf(nominal_indices, numeric_indices):
inner = sklearn.compose.ColumnTransformer(
transformers=[
('numeric', sklearn.preprocessing.StandardScaler(),
nominal_indices),
('nominal', sklearn.preprocessing.OneHotEncoder(
handle_unknown='ignore'), numeric_indices)],
('numeric',
make_pipeline(SimpleImputer(strategy='most_frequent'),
sklearn.preprocessing.StandardScaler()),
numeric_indices),
('nominal',
make_pipeline(CustomImputer(),
sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')),
nominal_indices)],
remainder='passthrough')
return sklearn.pipeline.Pipeline(
steps=[
('imputer', sklearn.impute.SimpleImputer(
strategy='constant', fill_value=-1)),
('transformer', inner),
('classifier', sklearn.tree.DecisionTreeClassifier())
]
Expand All @@ -567,8 +571,17 @@ def get_ct_cf(nominal_indices, numeric_indices):
self.TEST_SERVER_TASK_MISSING_VALS[2],
'62501', sentinel=sentinel)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="columntransformer introduction in 0.20.0")
def test_run_and_upload_decision_tree_pipeline(self):
pipeline2 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')),

cat_imp = make_pipeline(SimpleImputer(strategy='most_frequent'),
OneHotEncoder(handle_unknown='ignore'))
cont_imp = make_pipeline(CustomImputer(), StandardScaler())
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('cat', cat_imp, cat),
('cont', cont_imp, cont)])
pipeline2 = Pipeline(steps=[('Imputer', ct),
('VarianceThreshold', VarianceThreshold()),
('Estimator', RandomizedSearchCV(
DecisionTreeClassifier(),
Expand Down Expand Up @@ -689,6 +702,8 @@ def test_learning_curve_task_2(self):
self._check_sample_evaluations(run.sample_evaluations, num_repeats,
num_folds, num_samples)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.21",
reason="Pipelines don't support indexing (used for the assert check)")
def test_initialize_cv_from_run(self):
randomsearch = RandomizedSearchCV(
RandomForestClassifier(n_estimators=5),
Expand All @@ -701,9 +716,11 @@ def test_initialize_cv_from_run(self):
cv=StratifiedKFold(n_splits=2, shuffle=True),
n_iter=2)

clf = make_pipeline(OneHotEncoder(handle_unknown='ignore'), randomsearch)
Comment thread
mfeurer marked this conversation as resolved.
Outdated

task = openml.tasks.get_task(11)
run = openml.runs.run_model_on_task(
model=randomsearch,
model=clf,
task=task,
avoid_duplicate_runs=False,
seed=1,
Expand All @@ -716,8 +733,8 @@ def test_initialize_cv_from_run(self):
modelR = openml.runs.initialize_model_from_run(run_id=run.run_id)
modelS = openml.setups.initialize_model(setup_id=run.setup_id)

self.assertEqual(modelS.cv.random_state, 62501)
self.assertEqual(modelR.cv.random_state, 62501)
self.assertEqual(modelS[-1].cv.random_state, 62501)
self.assertEqual(modelR[-1].cv.random_state, 62501)

def _test_local_evaluations(self, run):

Expand Down Expand Up @@ -749,10 +766,14 @@ def _test_local_evaluations(self, run):
self.assertGreaterEqual(alt_scores[idx], 0)
self.assertLessEqual(alt_scores[idx], 1)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer doesn't handle mixed type DataFrame as input")
def test_local_run_swapped_parameter_order_model(self):

# construct sci-kit learn classifier
clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
clf = Pipeline(steps=[('imputer', make_pipeline(SimpleImputer(strategy='most_frequent'),
OneHotEncoder(handle_unknown='ignore'))),
# random forest doesn't take categoricals
('estimator', RandomForestClassifier())])

# download task
Expand All @@ -767,11 +788,14 @@ def test_local_run_swapped_parameter_order_model(self):

self._test_local_evaluations(run)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer doesn't handle mixed type DataFrame as input")
def test_local_run_swapped_parameter_order_flow(self):

# construct sci-kit learn classifier
clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
('estimator', RandomForestClassifier())])
clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(handle_unknown='ignore')),
('estimator', RandomForestClassifier(n_estimators=10))])

flow = self.extension.model_to_flow(clf)
# download task
Expand All @@ -786,11 +810,13 @@ def test_local_run_swapped_parameter_order_flow(self):

self._test_local_evaluations(run)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer doesn't handle mixed type DataFrame as input")
def test_local_run_metric_score(self):

# construct sci-kit learn classifier
clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
('estimator', RandomForestClassifier())])
clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(handle_unknown='ignore')),
('estimator', RandomForestClassifier(n_estimators=10))])

# download task
task = openml.tasks.get_task(7)
Expand All @@ -814,11 +840,31 @@ def test_online_run_metric_score(self):

self._test_local_evaluations(run)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer doesn't handle mixed type DataFrame as input")
def test_initialize_model_from_run(self):
class MyGaussianNB(GaussianNB):
def fit(self, X, y, **kwargs):
if scipy.sparse.issparse(X):
X = X.toarray()
return super().fit(X=X, y=y, **kwargs)

def predict(self, X, **kwargs):
if scipy.sparse.issparse(X):
X = X.toarray()
return super().predict(X=X, **kwargs)

def predict_proba(self, X, **kwargs):
# if isinstance(X, scipy.sparse.csr.csr_matrix):
if scipy.sparse.issparse(X):
X = X.toarray()
return super().predict_proba(X=X, **kwargs)

clf = sklearn.pipeline.Pipeline(steps=[
('Imputer', SimpleImputer(strategy='median')),
('Imputer', SimpleImputer(strategy='most_frequent')),
('Encoder', OneHotEncoder(handle_unknown='ignore')),
('VarianceThreshold', VarianceThreshold(threshold=0.05)),
('Estimator', GaussianNB())])
('Estimator', MyGaussianNB())])
task = openml.tasks.get_task(11)
run = openml.runs.run_model_on_task(
model=clf,
Expand Down Expand Up @@ -894,6 +940,8 @@ def test_get_run_trace(self):
run_trace = openml.runs.get_run_trace(run_id)
self.assertEqual(len(run_trace.trace_iterations), num_iterations * num_folds)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer doesn't handle mixed type DataFrame as input")
def test__run_exists(self):
# would be better to not sentinel these clfs,
# so we do not have to perform the actual runs
Expand Down Expand Up @@ -1059,6 +1107,8 @@ def test_run_with_illegal_flow_id_1_after_load(self):
loaded_run.publish
)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="OneHotEncoder cannot handle mixed type DataFrame as input")
def test__run_task_get_arffcontent(self):
task = openml.tasks.get_task(7)
num_instances = 3196
Expand All @@ -1067,7 +1117,8 @@ def test__run_task_get_arffcontent(self):

flow = unittest.mock.Mock()
flow.name = 'dummy'
clf = SGDClassifier(loss='log', random_state=1)
clf = make_pipeline(OneHotEncoder(handle_unknown='ignore'),
SGDClassifier(loss='log', random_state=1))
res = openml.runs.functions._run_task_get_arffcontent(
flow=flow,
extension=self.extension,
Expand Down Expand Up @@ -1272,17 +1323,28 @@ def test_get_runs_list_by_tag(self):
runs = openml.runs.list_runs(tag='curves')
self.assertGreaterEqual(len(runs), 1)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="columntransformer introduction in 0.20.0")
def test_run_on_dataset_with_missing_labels(self):
# Check that _run_task_get_arffcontent works when one of the class
# labels only declared in the arff file, but is not present in the
# actual data

flow = unittest.mock.Mock()
flow.name = 'dummy'
task = openml.tasks.get_task(2)

model = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')),
('Estimator', DecisionTreeClassifier())])
from sklearn.compose import ColumnTransformer
cat_imp = make_pipeline(SimpleImputer(strategy='most_frequent'),
OneHotEncoder(handle_unknown='ignore'))
cont_imp = make_pipeline(CustomImputer(), StandardScaler())
ct = ColumnTransformer([('cat', cat_imp, cat),
('cont', cont_imp, cont)])
model = Pipeline(
steps=[
('preprocess', ct),
('estimator', sklearn.tree.DecisionTreeClassifier())
]
) # build a sklearn classifier

data_content, _, _, _ = _run_task_get_arffcontent(
flow=flow,
Expand Down
Loading