Skip to content
Merged
Show file tree
Hide file tree
Changes from 48 commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
8df347a
run on tasks allows dataframes
amueller Sep 3, 2019
ce8640c
don't force third subcomponent part to be list
amueller Sep 3, 2019
d8a2347
Merge branch 'develop' into dataframe_run_on_task
amueller Oct 16, 2019
b16a937
Making DataFrame default behaviour for runs; Fixing test cases for th…
Neeratyoy Dec 4, 2019
df7b496
Merge branch 'develop' into dataframe_run_on_task
Neeratyoy Dec 4, 2019
b31c1dc
Fixing PEP8 + Adding docstring to CustomImputer()
Neeratyoy Dec 11, 2019
05b55a3
Merge branch 'dataframe_run_on_task' of https://github.com/amueller/o…
Neeratyoy Dec 11, 2019
5cd59af
run on tasks allows dataframes
amueller Sep 3, 2019
1f8c37b
Attempting rebase
Neeratyoy Jan 21, 2020
9d45f6f
Fixing merge
Neeratyoy Jan 21, 2020
453fbdf
Fixing test cases
anonymous99199 Feb 12, 2020
6c7172c
Trying test case fixes
anonymous99199 Feb 14, 2020
b98fdb4
run on tasks allows dataframes
amueller Sep 3, 2019
8ac6468
don't force third subcomponent part to be list
amueller Sep 3, 2019
5c5fb31
Making DataFrame default behaviour for runs; Fixing test cases for th…
Neeratyoy Dec 4, 2019
b5729d1
Fixing PEP8 + Adding docstring to CustomImputer()
Neeratyoy Dec 11, 2019
b619361
Attempting rebase
Neeratyoy Jan 21, 2020
25e1b8e
Fixing test cases
anonymous99199 Feb 12, 2020
08b1be1
Trying test case fixes
anonymous99199 Feb 14, 2020
6a4eae3
Rebasing
Neeratyoy Mar 9, 2020
ae9c312
Allowing functions in subcomponents
Neeratyoy Mar 24, 2020
7f60589
Fixing test cases
Neeratyoy Jun 5, 2020
49584de
Adding dataset output param to run
Neeratyoy Jun 16, 2020
b25bbc4
Fixing test cases
Neeratyoy Jun 16, 2020
c0116e4
Changes suggested by mfeurer
Neeratyoy Jun 27, 2020
7379f0c
Editing predict_proba function
Neeratyoy Jun 27, 2020
ba9c1a2
Merge branch 'develop' into dataframe_run_on_task
Neeratyoy Jul 5, 2020
3305a12
Test case fix
Neeratyoy Jul 6, 2020
1b05089
Test case fix
Neeratyoy Jul 7, 2020
053beb6
Edit unit test to bypass server issue
Neeratyoy Jul 9, 2020
15743ee
Fixing unit test
Neeratyoy Jul 9, 2020
440c0ad
Reiterating with @PGijsbers comments
Neeratyoy Jul 10, 2020
7967624
Minor fixes to test cases
Neeratyoy Jul 24, 2020
c06eb0d
Adding unit test and suggestions from @mfeurer
Neeratyoy Aug 1, 2020
83f309a
Fixing test case for all sklearn versions
Neeratyoy Aug 1, 2020
c2a090a
Merge branch 'develop' into dataframe_run_on_task
Neeratyoy Aug 3, 2020
2cb2028
Testing changes
Neeratyoy Aug 10, 2020
5ea4d31
Merge branch 'dataframe_run_on_task' of https://github.com/amueller/o…
Neeratyoy Aug 10, 2020
f0ff562
Fixing import in example
Neeratyoy Aug 10, 2020
7200418
Triggering unit tests
Neeratyoy Aug 14, 2020
29af032
Debugging failed example script
Neeratyoy Aug 14, 2020
001ee74
Merge branch 'develop' into dataframe_run_on_task
Neeratyoy Aug 17, 2020
ae57bea
Adding unit tests
Neeratyoy Aug 17, 2020
1fee939
Merge branch 'dataframe_run_on_task' of https://github.com/amueller/o…
Neeratyoy Aug 17, 2020
418d9e6
Push for debugging
Neeratyoy Aug 24, 2020
c00c060
Push for @mfeurer to debug
Neeratyoy Aug 26, 2020
463a326
Merging with latest updates on [develop]
Neeratyoy Sep 7, 2020
44c9e65
Resetting to debug
Neeratyoy Oct 19, 2020
d439d73
Updating branch and fixing test cases
Neeratyoy Oct 19, 2020
3ae777e
Updating branch
Neeratyoy Oct 19, 2020
9fc6c10
Merging branches
Neeratyoy Oct 19, 2020
3acce3f
pre-commit fixes
Neeratyoy Oct 19, 2020
39011df
Handling failing examples
Neeratyoy Oct 20, 2020
4908237
Reiteration with clean ups and minor fixes
Neeratyoy Oct 21, 2020
90ad9e2
Closing comments
Neeratyoy Oct 21, 2020
9982afe
Black fixes
Neeratyoy Oct 21, 2020
da8dbb9
feedback from @mfeurer
Neeratyoy Oct 21, 2020
0a0f71f
Minor fix
Neeratyoy Oct 22, 2020
78ab677
suggestions from @PGijsbers
Neeratyoy Oct 22, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions openml/extensions/sklearn/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,9 +326,6 @@ def _deserialize_sklearn(
else:
raise NotImplementedError(serialized_type)
assert components is not None # Necessary for mypy
# value = self._deserialize_sklearn(
# value, recursion_depth=depth_pp, strict_version=strict_version
# )
step_name = value["step_name"]
key = value["key"]
component = self._deserialize_sklearn(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1751,6 +1751,77 @@ def test_run_model_on_fold_classification_1_dataframe(self):
check_scores=False,
)

@unittest.skipIf(
    LooseVersion(sklearn.__version__) < "0.21",
    reason="SimpleImputer, ColumnTransformer available only after 0.19 and "
    "Pipeline till 0.20 doesn't support indexing and 'passthrough'",
)
def test_run_model_on_fold_classification_1_dataframe(self):
    # Fits a ColumnTransformer + DecisionTreeClassifier pipeline on one fold of
    # task 1 using pandas inputs, then checks the shape and type of the
    # predictions, the per-class probabilities and the recorded measures.

    # NOTE(review): imported inside the test, presumably so that collecting this
    # module does not fail on sklearn versions lacking ColumnTransformer - confirm.
    from sklearn.compose import ColumnTransformer

    task = openml.tasks.get_task(1)

    # DataFrame counterpart of test_run_model_on_fold_classification_1_array().
    features, target = task.get_X_and_y(dataset_format="dataframe")
    train_idx, test_idx = task.get_train_test_split_indices(
        repeat=0, fold=0, sample=0
    )
    X_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
    X_test, y_test = features.iloc[test_idx], target.iloc[test_idx]

    # `cat` and `cont` are the column selectors handed to the ColumnTransformer;
    # each one routes its columns through a dedicated preprocessing pipeline.
    categorical_branch = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(handle_unknown="ignore", sparse=False),
    )
    continuous_branch = make_pipeline(CustomImputer(strategy="mean"), StandardScaler())
    column_transformer = ColumnTransformer(
        [("cat", categorical_branch, cat), ("cont", continuous_branch, cont)]
    )
    model = sklearn.pipeline.Pipeline(
        steps=[
            ("transform", column_transformer),
            ("estimator", sklearn.tree.DecisionTreeClassifier()),
        ]
    )

    # TODO add some mocking here to actually test the innards of this function, too!
    y_hat, y_hat_proba, user_defined_measures, trace = self.extension._run_model_on_fold(
        model=model,
        task=task,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        rep_no=0,
        fold_no=0,
    )

    # Predictions: one label per test row, and a six-column probability frame.
    self.assertIsInstance(y_hat, np.ndarray)
    self.assertEqual(y_hat.shape, y_test.shape)
    self.assertIsInstance(y_hat_proba, pd.DataFrame)
    self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 6))

    # Each row of class probabilities has to sum to one.
    np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape))

    # The class '4' (at index 3) is not present in the training data, so its
    # predicted probability must be zero for every test row ...
    np.testing.assert_array_almost_equal(
        y_hat_proba.iloc[:, 3].to_numpy(), np.zeros(y_test.shape)
    )
    # ... while every other class needs at least one non-zero probability.
    for class_idx in (0, 1, 2, 4, 5):
        column = y_hat_proba.iloc[:, class_idx].to_numpy()
        self.assertTrue(np.any(column != np.zeros(y_test.shape)))

    # Re-key the user defined measures as measure -> repeat 0 -> fold 0, the
    # layout consumed by _check_fold_timing_evaluations below.
    fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict))
    for measure_name, measure_value in user_defined_measures.items():
        fold_evaluations[measure_name][0][0] = measure_value

    # No optimization trace is expected for this model.
    self.assertIsNone(trace)

    self._check_fold_timing_evaluations(
        fold_evaluations,
        num_repeats=1,
        num_folds=1,
        task_type=task.task_type_id,
        check_scores=False,
    )

def test_run_model_on_fold_classification_2(self):
task = openml.tasks.get_task(7)

Expand Down