Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7a6f845
refactor: Remove "array" format
eddiebergman Oct 16, 2024
6296f37
refactor: Explicitly name parameter to listing functions
eddiebergman Oct 16, 2024
a31dafc
fix: Don't double call list
eddiebergman Oct 17, 2024
3bd924c
update...
eddiebergman Nov 4, 2024
e98c70a
attempted to fixed merge conflicts for examples
SubhadityaMukherjee Apr 1, 2025
2c82122
rename target_names back to target
SubhadityaMukherjee Apr 1, 2025
ce57881
fix: resolve kdd_rijn example to be without dataframe parameter and r…
LennartPurucker Jun 16, 2025
ddda711
fix: ensure dtypes as in original code
LennartPurucker Jun 16, 2025
063a8e6
fix: remove incorrect parsing of sparse pandas
LennartPurucker Jun 16, 2025
2e6c4c7
fix: make sklearn tests work with pandas
LennartPurucker Jun 16, 2025
12dedb0
fix: fix listing calls and test for utils
LennartPurucker Jun 16, 2025
4aae48b
Merge remote-tracking branch 'upstream/develop' into refactor-default…
LennartPurucker Jun 16, 2025
6517f6a
fix/maint: update and fix tests for new dataframe default
LennartPurucker Jun 16, 2025
466022e
fix/maint: resolve tests that used old default format
LennartPurucker Jun 16, 2025
bd120f5
fix: remove OrdinalEncoder
LennartPurucker Jun 16, 2025
de597b5
fix: update test to new assert with onehot
LennartPurucker Jun 16, 2025
32e6fbf
fix/maint: update examples
LennartPurucker Jun 16, 2025
bae06ca
fix: example revert
LennartPurucker Jun 16, 2025
78b1888
fix: add impute for tests to work with older sklearn version
LennartPurucker Jun 16, 2025
22b6b52
fix: make examples work
LennartPurucker Jun 16, 2025
232b37c
Update openml/utils.py
LennartPurucker Jun 17, 2025
f14fce6
Update openml/utils.py
LennartPurucker Jun 17, 2025
7fb5eb2
Update openml/setups/setup.py
LennartPurucker Jun 17, 2025
f45530f
Update openml/setups/setup.py
LennartPurucker Jun 17, 2025
7fb31ce
remove comment we do not understand
LennartPurucker Jun 17, 2025
9c2800e
Merge remote-tracking branch 'upstream/refactor-default-dataframe' in…
LennartPurucker Jun 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix/maint: update and fix tests for new dataframe default
  • Loading branch information
LennartPurucker committed Jun 16, 2025
commit 6517f6abef063b2ec0190f5074b35a4a2d664926
Original file line number Diff line number Diff line change
Expand Up @@ -61,27 +61,31 @@ def __init__(self, boolean, integer, floating_point_value):
def fit(self, X, y):
pass


def _cat_col_selector(X):
    """Return the labels of the columns of ``X`` that hold categorical data.

    A column counts as categorical when its dtype is ``object`` or
    ``category``. The function is handed to ``ColumnTransformer`` as a
    callable column specifier, so it is evaluated against the frame that
    is passed in at call time rather than against a fixed column list.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    pandas.Index
        Labels of the ``object`` / ``category`` columns, in frame order
        (empty when there are none).
    """
    return X.select_dtypes(include=["object", "category"]).columns


def _get_sklearn_preprocessing():
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

return [
(
"cat_handling",
ColumnTransformer(
transformers=[
(
"cat",
OrdinalEncoder(
handle_unknown="use_encoded_value", unknown_value=np.nan
),
make_column_selector(dtype_include=["object", "category"]),
)
],
remainder="passthrough",
),
(
"cat_handling",
ColumnTransformer(
transformers=[
(
"cat",
OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
_cat_col_selector,
)
],
remainder="passthrough",
),
("imp", SimpleImputer())]
),
("imp", SimpleImputer()),
]


class TestSklearnExtensionFlowFunctions(TestBase):
Expand Down Expand Up @@ -1904,7 +1908,10 @@ def test_run_model_on_fold_classification_2(self):

pipeline = sklearn.model_selection.GridSearchCV(
sklearn.pipeline.Pipeline(
steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.tree.DecisionTreeClassifier())],
steps=[
*_get_sklearn_preprocessing(),
("clf", sklearn.tree.DecisionTreeClassifier()),
],
),
{"clf__max_depth": [1, 2]},
)
Expand Down
32 changes: 28 additions & 4 deletions tests/test_runs/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,17 +204,42 @@ def test_to_from_filesystem_no_model(self):
with self.assertRaises(ValueError, msg="Could not find model.pkl"):
openml.runs.OpenMLRun.from_filesystem(cache_path)

@staticmethod
def _cat_col_selector(X):
    """Return the labels of the ``object`` / ``category`` columns of ``X``.

    Used as the callable column specifier of the ``"cat"`` transformer, so
    the selection is computed from whatever frame is passed in.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    pandas.Index
        Labels of the matching columns (empty when there are none).
    """
    return X.select_dtypes(include=["object", "category"]).columns

@staticmethod
def _get_models_tasks_for_tests():
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

basic_preprocessing = [
(
"cat_handling",
ColumnTransformer(
transformers=[
(
"cat",
OrdinalEncoder(
handle_unknown="use_encoded_value", unknown_value=np.nan
),
TestRun._cat_col_selector,
)
],
remainder="passthrough",
),
),
("imp", SimpleImputer()),
]
model_clf = Pipeline(
[
("imputer", SimpleImputer(strategy="mean")),
*basic_preprocessing,
("classifier", DummyClassifier(strategy="prior")),
],
)
model_reg = Pipeline(
[
("imputer", SimpleImputer(strategy="mean")),
*basic_preprocessing,
(
"regressor",
# LR because dummy does not produce enough float-like values
Expand Down Expand Up @@ -263,9 +288,8 @@ def assert_run_prediction_data(task, run, model):

assert_method = np.testing.assert_array_almost_equal
if task.task_type == "Supervised Classification":
y_pred = np.take(task.class_labels, y_pred)
y_test = np.take(task.class_labels, y_test)
assert_method = np.testing.assert_array_equal
y_test = y_test.values

# Assert correctness
assert_method(y_pred, saved_y_pred)
Expand Down
51 changes: 35 additions & 16 deletions tests/test_runs/test_run_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@
from sklearn.model_selection._search import BaseSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer

import openml
import openml._api_calls
Expand Down Expand Up @@ -130,9 +131,9 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
time.sleep(10)
continue

assert (
len(run.evaluations) > 0
), "Expect not-None evaluations to always contain elements."
assert len(run.evaluations) > 0, (
"Expect not-None evaluations to always contain elements."
)
return

raise RuntimeError(
Expand Down Expand Up @@ -306,7 +307,7 @@ def _remove_random_state(flow):
flow_server = self.extension.model_to_flow(clf_server)

if flow.class_name not in classes_without_random_state:
error_msg = "Flow class %s (id=%d) does not have a random " "state parameter" % (
error_msg = "Flow class %s (id=%d) does not have a random state parameter" % (
flow.class_name,
flow.flow_id,
)
Expand Down Expand Up @@ -479,7 +480,7 @@ def determine_grid_size(param_grid):
grid_iterations += determine_grid_size(sub_grid)
return grid_iterations
else:
raise TypeError("Param Grid should be of type list " "(GridSearch only) or dict")
raise TypeError("Param Grid should be of type list (GridSearch only) or dict")

run = self._perform_run(
task_id,
Expand Down Expand Up @@ -1286,7 +1287,7 @@ def test_run_with_illegal_flow_id_1(self):
flow_new = self.extension.model_to_flow(clf)

flow_new.flow_id = -1
expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'"
expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex):
openml.runs.run_flow_on_task(
task=task,
Expand Down Expand Up @@ -1326,7 +1327,7 @@ def test_run_with_illegal_flow_id_1_after_load(self):
run.to_filesystem(cache_path)
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)

expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'"
expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
self.assertRaisesRegex(
openml.exceptions.PyOpenMLError,
expected_message_regex,
Expand Down Expand Up @@ -1827,14 +1828,33 @@ def test_joblib_backends(self, parallel_mock):
(1, "sequential", 40),
]:
clf = sklearn.model_selection.RandomizedSearchCV(
estimator=sklearn.ensemble.RandomForestClassifier(n_estimators=5),
estimator=sklearn.pipeline.Pipeline(
[
(
"cat_handling",
ColumnTransformer(
transformers=[
(
"cat",
OrdinalEncoder(
handle_unknown="use_encoded_value", unknown_value=-1
),
x.select_dtypes(include=["object", "category"]).columns,
)
],
remainder="passthrough",
),
),
("clf", sklearn.ensemble.RandomForestClassifier(n_estimators=5)),
]
),
param_distributions={
"max_depth": [3, None],
"max_features": [1, 2, 3, 4],
"min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
"min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"bootstrap": [True, False],
"criterion": ["gini", "entropy"],
"clf__max_depth": [3, None],
"clf__max_features": [1, 2, 3, 4],
"clf__min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
"clf__min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"clf__bootstrap": [True, False],
"clf__criterion": ["gini", "entropy"],
},
random_state=1,
cv=sklearn.model_selection.StratifiedKFold(
Expand All @@ -1851,7 +1871,6 @@ def test_joblib_backends(self, parallel_mock):
model=clf,
task=task,
add_local_measures=True,
# dataset_format="array", # "dataframe" would require handling of categoricals
n_jobs=n_jobs,
)
assert type(res[0]) == list
Expand Down
8 changes: 4 additions & 4 deletions tests/test_tasks/test_classification_task.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# License: BSD 3-Clause
from __future__ import annotations

import numpy as np
import pandas as pd

from openml.tasks import TaskType, get_task

Expand All @@ -20,10 +20,10 @@ def setUp(self, n_levels: int = 1):
def test_get_X_and_Y(self):
X, Y = super().test_get_X_and_Y()
assert X.shape == (768, 8)
assert isinstance(X, np.ndarray)
assert isinstance(X, pd.DataFrame)
assert Y.shape == (768,)
assert isinstance(Y, np.ndarray)
assert Y.dtype == int
assert isinstance(Y, pd.Series)
assert pd.api.types.is_categorical_dtype(Y)

def test_download_task(self):
task = super().test_download_task()
Expand Down
8 changes: 4 additions & 4 deletions tests/test_tasks/test_learning_curve_task.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# License: BSD 3-Clause
from __future__ import annotations

import numpy as np
import pandas as pd

from openml.tasks import TaskType, get_task

Expand All @@ -20,10 +20,10 @@ def setUp(self, n_levels: int = 1):
def test_get_X_and_Y(self):
X, Y = super().test_get_X_and_Y()
assert X.shape == (768, 8)
assert isinstance(X, np.ndarray)
assert isinstance(X, pd.DataFrame)
assert Y.shape == (768,)
assert isinstance(Y, np.ndarray)
assert Y.dtype == int
assert isinstance(Y, pd.Series)
assert pd.api.types.is_categorical_dtype(Y)

def test_download_task(self):
task = super().test_download_task()
Expand Down
8 changes: 4 additions & 4 deletions tests/test_tasks/test_regression_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import ast

import numpy as np
import pandas as pd

import openml
from openml.exceptions import OpenMLServerException
Expand Down Expand Up @@ -51,10 +51,10 @@ def setUp(self, n_levels: int = 1):
def test_get_X_and_Y(self):
X, Y = super().test_get_X_and_Y()
assert X.shape == (194, 32)
assert isinstance(X, np.ndarray)
assert isinstance(X, pd.DataFrame)
assert Y.shape == (194,)
assert isinstance(Y, np.ndarray)
assert Y.dtype == float
assert isinstance(Y, pd.Series)
assert pd.api.types.is_numeric_dtype(Y)

def test_download_task(self):
task = super().test_download_task()
Expand Down
4 changes: 2 additions & 2 deletions tests/test_tasks/test_supervised_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import unittest

import numpy as np
import pandas as pd

from openml.tasks import get_task

Expand All @@ -27,7 +27,7 @@ def setUpClass(cls):
def setUp(self, n_levels: int = 1):
super().setUp()

def test_get_X_and_Y(self) -> tuple[np.ndarray, np.ndarray]:
def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]:
task = get_task(self.task_id)
X, Y = task.get_X_and_y()
return X, Y