Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7a6f845
refactor: Remove "array" format
eddiebergman Oct 16, 2024
6296f37
refactor: Explicitly name parameter to listing functions
eddiebergman Oct 16, 2024
a31dafc
fix: Don't double call list
eddiebergman Oct 17, 2024
3bd924c
update...
eddiebergman Nov 4, 2024
e98c70a
attempted to fix merge conflicts for examples
SubhadityaMukherjee Apr 1, 2025
2c82122
rename target_names back to target
SubhadityaMukherjee Apr 1, 2025
ce57881
fix: resolve kdd_rijn example to be without dataframe parameter and r…
LennartPurucker Jun 16, 2025
ddda711
fix: ensure dtypes as in original code
LennartPurucker Jun 16, 2025
063a8e6
fix: remove incorrect parsing of sparse pandas
LennartPurucker Jun 16, 2025
2e6c4c7
fix: make sklearn tests work with pandas
LennartPurucker Jun 16, 2025
12dedb0
fix: fix listing calls and test for utils
LennartPurucker Jun 16, 2025
4aae48b
Merge remote-tracking branch 'upstream/develop' into refactor-default…
LennartPurucker Jun 16, 2025
6517f6a
fix/maint: update and fix tests for new dataframe default
LennartPurucker Jun 16, 2025
466022e
fix/maint: resolve tests that used old default format
LennartPurucker Jun 16, 2025
bd120f5
fix: remove OrdinalEncoder
LennartPurucker Jun 16, 2025
de597b5
fix: update test to new assert with onehot
LennartPurucker Jun 16, 2025
32e6fbf
fix/maint: update examples
LennartPurucker Jun 16, 2025
bae06ca
fix: example revert
LennartPurucker Jun 16, 2025
78b1888
fix: add impute for tests to work with older sklearn version
LennartPurucker Jun 16, 2025
22b6b52
fix: make examples work
LennartPurucker Jun 16, 2025
232b37c
Update openml/utils.py
LennartPurucker Jun 17, 2025
f14fce6
Update openml/utils.py
LennartPurucker Jun 17, 2025
7fb5eb2
Update openml/setups/setup.py
LennartPurucker Jun 17, 2025
f45530f
Update openml/setups/setup.py
LennartPurucker Jun 17, 2025
7fb31ce
remove comment we do not understand
LennartPurucker Jun 17, 2025
9c2800e
Merge remote-tracking branch 'upstream/refactor-default-dataframe' in…
LennartPurucker Jun 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix/maint: resolve tests that used old default format
  • Loading branch information
LennartPurucker committed Jun 16, 2025
commit 466022e38a67f9d7978faf293a03311b5bb127a6
29 changes: 21 additions & 8 deletions openml/extensions/sklearn/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -1144,7 +1144,7 @@ def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> tuple[dict, set]
optional_params[param] = default_val
return optional_params, required_params

def _deserialize_model(
def _deserialize_model( # noqa: C901
self,
flow: OpenMLFlow,
keep_defaults: bool, # noqa: FBT001
Expand Down Expand Up @@ -1219,6 +1219,20 @@ def _deserialize_model(
if param not in components:
del parameter_dict[param]

if not strict_version:
# Ignore incompatible parameters
allowed_parameter = list(inspect.signature(model_class.__init__).parameters)
for p in list(parameter_dict.keys()):
if p not in allowed_parameter:
warnings.warn(
f"While deserializing in a non-strict way, parameter {p} is not "
f"allowed for {model_class.__name__} likely due to a version mismatch. "
"We ignore the parameter.",
UserWarning,
stacklevel=2,
)
del parameter_dict[p]

return model_class(**parameter_dict)

def _check_dependencies(
Expand Down Expand Up @@ -1254,8 +1268,7 @@ def _check_dependencies(
else:
raise NotImplementedError(f"operation '{operation}' is not supported")
message = (
"Trying to deserialize a model with dependency "
f"{dependency_string} not satisfied."
f"Trying to deserialize a model with dependency {dependency_string} not satisfied."
)
if not check:
if strict_version:
Expand Down Expand Up @@ -1497,7 +1510,7 @@ def _prevent_optimize_n_jobs(self, model):
)
if len(n_jobs_vals) > 0:
raise PyOpenMLError(
"openml-python should not be used to " "optimize the n_jobs parameter.",
"openml-python should not be used to optimize the n_jobs parameter.",
)

################################################################################################
Expand Down Expand Up @@ -1555,7 +1568,7 @@ def _seed_current_object(current_value):

if current_value is not None:
raise ValueError(
"Models should be seeded with int or None (this should never " "happen). ",
"Models should be seeded with int or None (this should never happen). ",
)

return True
Expand Down Expand Up @@ -1780,10 +1793,10 @@ def _prediction_to_probabilities(
# to handle the case when dataset is numpy and categories are encoded
# however the class labels stored in task are still categories
if isinstance(y_train, np.ndarray) and isinstance(
cast(List, task.class_labels)[0],
cast("List", task.class_labels)[0],
str,
):
model_classes = [cast(List[str], task.class_labels)[i] for i in model_classes]
model_classes = [cast("List[str]", task.class_labels)[i] for i in model_classes]

modelpredict_start_cputime = time.process_time()
modelpredict_start_walltime = time.time()
Expand Down Expand Up @@ -2006,7 +2019,7 @@ def is_subcomponent_specification(values):
# (mixed)). OpenML replaces the subcomponent by an
# OpenMLFlow object.
if len(subcomponent) < 2 or len(subcomponent) > 3:
raise ValueError("Component reference should be " "size {2,3}. ")
raise ValueError("Component reference should be size {2,3}. ")

subcomponent_identifier = subcomponent[0]
subcomponent_flow = subcomponent[1]
Expand Down
7 changes: 6 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
# License: BSD 3-Clause
from __future__ import annotations

import multiprocessing

multiprocessing.set_start_method("spawn", force=True)

from collections.abc import Iterator
import logging
import os
Expand All @@ -33,6 +37,7 @@
import openml
from openml.testing import TestBase


# creating logger for unit test file deletion status
logger = logging.getLogger("unit_tests")
logger.setLevel(logging.DEBUG)
Expand Down Expand Up @@ -170,7 +175,7 @@ def pytest_sessionfinish() -> None:
# Delete any test dirs that remain
# In edge cases due to a mixture of pytest parametrization and oslo concurrency,
# some file lock are created after leaving the test. This removes these files!
test_files_dir=Path(__file__).parent.parent / "openml"
test_files_dir = Path(__file__).parent.parent / "openml"
for f in test_files_dir.glob("tests.*"):
if f.is_dir():
shutil.rmtree(f)
Expand Down
40 changes: 29 additions & 11 deletions tests/test_runs/test_run_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def _remove_random_state(flow):
task = openml.tasks.get_task(task_id)

X, y = task.get_X_and_y()
assert np.count_nonzero(np.isnan(X)) == n_missing_vals
assert X.isna().sum().sum() == n_missing_vals
run = openml.runs.run_flow_on_task(
flow=flow,
task=task,
Expand Down Expand Up @@ -401,7 +401,7 @@ def _check_sample_evaluations(

@pytest.mark.sklearn()
def test_run_regression_on_classif_task(self):
task_id = 115 # diabetes; crossvalidation
task_id = 259 # collins; crossvalidation; has numeric targets

clf = LinearRegression()
task = openml.tasks.get_task(task_id)
Expand Down Expand Up @@ -1758,7 +1758,26 @@ def test__run_task_get_arffcontent_2(self, parallel_mock):
num_instances = x.shape[0]
line_length = 6 + len(task.class_labels)
loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss"
clf = SGDClassifier(loss=loss, random_state=1)
clf = sklearn.pipeline.Pipeline(
[
(
"cat_handling",
ColumnTransformer(
transformers=[
(
"cat",
OrdinalEncoder(
handle_unknown="use_encoded_value", unknown_value=-1
),
x.select_dtypes(include=["object", "category"]).columns,
)
],
remainder="passthrough",
),
),
("clf", SGDClassifier(loss=loss, random_state=1)),
]
)
n_jobs = 2
backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
with parallel_backend(backend, n_jobs=n_jobs):
Expand All @@ -1767,7 +1786,6 @@ def test__run_task_get_arffcontent_2(self, parallel_mock):
model=clf,
task=task,
add_local_measures=True,
# dataset_format="array", # "dataframe" would require handling of categoricals
n_jobs=n_jobs,
)
# This unit test will fail if joblib is unable to distribute successfully since the
Expand All @@ -1784,16 +1802,16 @@ def test__run_task_get_arffcontent_2(self, parallel_mock):
assert len(res[2]) == 7
assert len(res[3]) == 7
expected_scores = [
0.965625,
0.94375,
0.946875,
0.953125,
0.95625,
0.959375,
0.96875,
0.96875,
0.96875,
0.965625,
0.9435736677115988,
0.9467084639498433,
0.9749216300940439,
0.9655172413793104,
0.9373040752351097,
0.9561128526645768,
0.9467084639498433
]
scores = [v for k, v in res[2]["predictive_accuracy"][0].items()]
np.testing.assert_array_almost_equal(
Expand Down
13 changes: 7 additions & 6 deletions tests/test_study/test_study_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,20 +183,21 @@ def test_publish_study(self):
self.assertSetEqual(set(study_downloaded.tasks), set(fixt_task_ids))

# test whether the list run function also handles study data fine
run_ids = openml.runs.list_runs(study=study.id)
self.assertSetEqual(set(run_ids), set(study_downloaded.runs))
run_ids = openml.runs.list_runs(study=study.id) # returns DF
self.assertSetEqual(set(run_ids["run_id"]), set(study_downloaded.runs))

# test whether the list evaluation function also handles study data fine
run_ids = openml.evaluations.list_evaluations(
run_ids = openml.evaluations.list_evaluations( # returns list of objects
"predictive_accuracy",
size=None,
study=study.id,
output_format="object", # making the default explicit
)
self.assertSetEqual(set(run_ids), set(study_downloaded.runs))

# attach more runs, since we fetch 11 here, at least one is non-overlapping
run_list_additional = openml.runs.list_runs(size=11, offset=10)
run_list_additional = set(run_list_additional) - set(run_ids)
run_list_additional = set(run_list_additional["run_id"]) - set(run_ids)
openml.study.attach_to_study(study.id, list(run_list_additional))
study_downloaded = openml.study.get_study(study.id)
# verify again
Expand Down Expand Up @@ -227,7 +228,7 @@ def test_study_attach_illegal(self):
benchmark_suite=None,
name="study with illegal runs",
description="none",
run_ids=list(run_list.keys()),
run_ids=list(run_list["run_id"]),
)
study.publish()
TestBase._mark_entity_for_removal("study", study.id)
Expand All @@ -246,7 +247,7 @@ def test_study_attach_illegal(self):
match="Problem attaching entities.",
):
# some runs already attached
openml.study.attach_to_study(study.id, list(run_list_more.keys()))
openml.study.attach_to_study(study.id, list(run_list_more["run_id"]))
study_downloaded = openml.study.get_study(study.id)
self.assertListEqual(study_original.runs, study_downloaded.runs)

Expand Down
Loading