Skip to content

Commit 25ba6f8

Browse files
PGijsbers, satvshr, and geetu040
authored
[MNT] Update CI/CD local server deployment and dependency matrix (openml#1697)
Originally started in openml#1629, this PR spins up the services within CI and uses them as a local test server for the ubuntu-based tests, closing openml#1614. It also updates the test matrix to make sure included dependencies are only those that have a release on PyPI, and further restricts pandas 3.x installs to only scikit-learn 1.7 and up, since pandas 3.x does not play well with scikit-learn 1.6 or below. Finally, it updates some tests to reflect the new test database image state. --------- Co-authored-by: Satvik Mishra <112589278+satvshr@users.noreply.github.com> Co-authored-by: Armaghan Shakir <raoarmaghanshakir040@gmail.com>
1 parent 8cc6429 commit 25ba6f8

14 files changed

Lines changed: 76 additions & 261 deletions

File tree

.github/workflows/test.yml

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,27 @@ jobs:
3434
sklearn-only: ["true"]
3535

3636
exclude:
37-
# incompatible version combinations
37+
# (python, sklearn) combinations for which there is no PyPI release
38+
# scikit-learn 1.3
3839
- python-version: "3.13"
3940
scikit-learn: "1.3.*"
40-
- python-version: "3.13"
41-
scikit-learn: "1.4.*"
4241
- python-version: "3.14"
4342
scikit-learn: "1.3.*"
43+
# scikit-learn 1.4
44+
- python-version: "3.13"
45+
scikit-learn: "1.4.*"
4446
- python-version: "3.14"
4547
scikit-learn: "1.4.*"
48+
# scikit-learn 1.5
49+
- python-version: "3.14"
50+
scikit-learn: "1.5.*"
51+
# scikit-learn 1.6
52+
- python-version: "3.14"
53+
scikit-learn: "1.6.*"
54+
# scikit-learn 1.7 is installed with pandas 3
55+
- python-version: "3.10"
56+
scikit-learn: "1.7.*"
57+
4658

4759
include:
4860
# Full test run on ubuntu, 3.14
@@ -64,14 +76,6 @@ jobs:
6476
sklearn-only: "false"
6577
code-cov: true
6678

67-
# Pandas 2 run
68-
- os: ubuntu-latest
69-
python-version: "3.12"
70-
scikit-learn: "1.5.*"
71-
sklearn-only: "false"
72-
pandas-version: "2.*"
73-
code-cov: false
74-
7579
steps:
7680
- uses: actions/checkout@v6
7781
with:
@@ -82,15 +86,21 @@ jobs:
8286
with:
8387
python-version: ${{ matrix.python-version }}
8488

85-
- name: Install test dependencies, scikit-learn, and optional pandas
89+
- name: Install test dependencies, scikit-learn, and pandas
8690
shell: bash
8791
run: |
8892
python -m pip install --upgrade pip
8993
pip install -e .[test] scikit-learn==${{ matrix.scikit-learn }}
90-
91-
if [ "${{ matrix.pandas-version }}" != "" ]; then
92-
echo "Installing specific pandas version: ${{ matrix.pandas-version }}"
93-
pip install "pandas==${{ matrix.pandas-version }}"
94+
95+
# scikit-learn 1.7+ requires pandas 3.x, earlier versions use pandas 2.x
96+
version="${{ matrix.scikit-learn }}"
97+
major=$(echo "$version" | cut -d. -f1)
98+
minor=$(echo "$version" | cut -d. -f2)
99+
100+
if [[ "$major" -gt 1 ]] || { [[ "$major" -eq 1 ]] && [[ "$minor" -ge 7 ]]; }; then
101+
pip install "pandas==3.*"
102+
else
103+
pip install "pandas==2.*"
94104
fi
95105
96106
- name: Store repository status
@@ -103,21 +113,27 @@ jobs:
103113
104114
- name: Clone Services
105115
if: matrix.os == 'ubuntu-latest'
116+
id: clone-services
106117
run: |
107118
git clone --depth 1 https://github.com/openml/services.git
108119
109120
- name: Start Docker Services
121+
id: start-services
110122
if: matrix.os == 'ubuntu-latest'
111123
working-directory: ./services
112124
run: |
113-
docker compose --profile rest-api --profile minio up -d
125+
chmod -R a+rw ./data
126+
chmod -R a+rw ./logs
127+
docker compose --profile rest-api --profile minio --profile evaluation-engine up -d
114128
115129
echo "Waiting for PHP API to boot..."
116130
timeout 60s bash -c 'until [ "$(docker inspect -f {{.State.Health.Status}} openml-php-rest-api)" == "healthy" ]; do sleep 5; done'
117131
118132
echo "Final Verification: Gateway Connectivity..."
119133
curl -sSfL http://localhost:8000/api/v1/xml/data/1 | head -n 15
120134
135+
docker container ls
136+
121137
- name: Show installed dependencies
122138
run: python -m pip list
123139

@@ -173,8 +189,13 @@ jobs:
173189
fail_ci_if_error: true
174190
verbose: true
175191

192+
- name: Dump server logs
193+
if: always() && steps.start-services.outcome == 'success'
194+
run: |
195+
docker logs openml-php-rest-api -t
196+
176197
- name: Cleanup Docker setup
177-
if: matrix.os == 'ubuntu-latest' && always()
198+
if: always() && steps.clone-services.outcome == 'success'
178199
run: |
179200
sudo rm -rf services
180201

openml/tasks/task.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
# License: BSD 3-Clause
22
from __future__ import annotations
33

4+
import logging
45
import warnings
56
from abc import ABC
67
from collections.abc import Sequence
78
from enum import Enum
8-
from pathlib import Path
99
from typing import TYPE_CHECKING, Any, ClassVar
1010
from typing_extensions import TypedDict
1111

12+
import arff
13+
1214
import openml._api_calls
1315
import openml.config
1416
from openml import datasets
@@ -22,6 +24,9 @@
2224
import pandas as pd
2325

2426

27+
logger = logging.getLogger(__name__)
28+
29+
2530
# TODO(eddiebergman): Should use `auto()` but might be too late if these numbers are used
2631
# and stored on server.
2732
class TaskType(Enum):
@@ -178,18 +183,6 @@ def get_train_test_split_indices(
178183

179184
return self.split.get(repeat=repeat, fold=fold, sample=sample)
180185

181-
def _download_split(self, cache_file: Path) -> None:
182-
# TODO(eddiebergman): Not sure about this try to read and error approach
183-
try:
184-
with cache_file.open(encoding="utf8"):
185-
pass
186-
except OSError:
187-
split_url = self.estimation_procedure["data_splits_url"]
188-
openml._api_calls._download_text_file(
189-
source=str(split_url),
190-
output_path=str(cache_file),
191-
)
192-
193186
def download_split(self) -> OpenMLSplit:
194187
"""Download the OpenML split for a given task."""
195188
# TODO(eddiebergman): Can this every be `None`?
@@ -199,9 +192,23 @@ def download_split(self) -> OpenMLSplit:
199192

200193
try:
201194
split = OpenMLSplit._from_arff_file(cached_split_file)
202-
except OSError:
195+
logger.debug("Loaded file from cache: %s", str(cached_split_file))
196+
except (OSError, arff.BadDataFormat):
197+
logger.info("Failed to load file from cache: %s", str(cached_split_file))
198+
if cached_split_file.exists():
199+
logger.debug("Cleaning up old file")
200+
cached_split_file.unlink()
203201
# Next, download and cache the associated split file
204-
self._download_split(cached_split_file)
202+
split_url = self.estimation_procedure["data_splits_url"]
203+
openml._api_calls._download_text_file(
204+
source=str(split_url),
205+
output_path=str(cached_split_file),
206+
)
207+
if cached_split_file.exists():
208+
logger.info("New file created of size %d", cached_split_file.stat().st_size)
209+
else:
210+
logger.info("Failed to create new file")
211+
205212
split = OpenMLSplit._from_arff_file(cached_split_file)
206213

207214
return split

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ version = {attr = "openml.__version__.__version__"}
126126

127127
# https://docs.pytest.org/en/7.2.x/reference/reference.html#ini-options-ref
128128
[tool.pytest.ini_options]
129+
log_level="DEBUG"
129130
testpaths = ["tests"]
130131
minversion = "7.0"
131132
xfail_strict = true

tests/conftest.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,12 +286,19 @@ def with_server(request):
286286

287287
@pytest.fixture(autouse=True)
288288
def with_test_cache(test_files_directory, request):
289+
# Skip this fixture for TestBase subclasses - they manage their own cache directory
290+
# in setUp()/tearDown(). Having both mechanisms fight over the global config
291+
# causes race conditions.
292+
if request.instance is not None and isinstance(request.instance, TestBase):
293+
yield
294+
return
295+
289296
if not test_files_directory.exists():
290297
raise ValueError(
291298
f"Cannot find test cache dir, expected it to be {test_files_directory!s}!",
292299
)
293300
_root_cache_directory = openml.config._root_cache_directory
294-
tmp_cache = test_files_directory / request.node.name
301+
tmp_cache = test_files_directory / request.node.nodeid.replace("/", ".").replace("::", ".")
295302
openml.config.set_root_cache_directory(tmp_cache)
296303
yield
297304
openml.config.set_root_cache_directory(_root_cache_directory)

tests/test_datasets/test_dataset_functions.py

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -530,10 +530,6 @@ def test_deletion_of_cache_dir_faulty_download(self, patch):
530530
datasets_cache_dir = os.path.join(openml.config.get_cache_directory(), "datasets")
531531
assert len(os.listdir(datasets_cache_dir)) == 0
532532

533-
@pytest.mark.skipif(
534-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
535-
reason="Pending resolution of #1657",
536-
)
537533
@pytest.mark.test_server()
538534
def test_publish_dataset(self):
539535
arff_file_path = self.static_cache_dir / "org" / "openml" / "test" / "datasets" / "2" / "dataset.arff"
@@ -570,10 +566,6 @@ def test__retrieve_class_labels(self):
570566
labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name)
571567
assert labels == ["COIL", "SHEET"]
572568

573-
@pytest.mark.skipif(
574-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
575-
reason="Pending resolution of #1657",
576-
)
577569
@pytest.mark.test_server()
578570
def test_upload_dataset_with_url(self):
579571
dataset = OpenMLDataset(
@@ -697,10 +689,6 @@ def test_attributes_arff_from_df_unknown_dtype(self):
697689
with pytest.raises(ValueError, match=err_msg):
698690
attributes_arff_from_df(df)
699691

700-
@pytest.mark.skipif(
701-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
702-
reason="Pending resolution of #1657",
703-
)
704692
@pytest.mark.test_server()
705693
def test_create_dataset_numpy(self):
706694
data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T
@@ -735,10 +723,6 @@ def test_create_dataset_numpy(self):
735723
), "Uploaded arff does not match original one"
736724
assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
737725

738-
@pytest.mark.skipif(
739-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
740-
reason="Pending resolution of #1657",
741-
)
742726
@pytest.mark.test_server()
743727
def test_create_dataset_list(self):
744728
data = [
@@ -794,10 +778,6 @@ def test_create_dataset_list(self):
794778
), "Uploaded ARFF does not match original one"
795779
assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
796780

797-
@pytest.mark.skipif(
798-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
799-
reason="Pending resolution of #1657",
800-
)
801781
@pytest.mark.test_server()
802782
def test_create_dataset_sparse(self):
803783
# test the scipy.sparse.coo_matrix
@@ -946,10 +926,6 @@ def test_get_online_dataset_format(self):
946926
dataset_id
947927
), "The format of the ARFF files is different"
948928

949-
@pytest.mark.skipif(
950-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
951-
reason="Pending resolution of #1657",
952-
)
953929
@pytest.mark.test_server()
954930
def test_create_dataset_pandas(self):
955931
data = [
@@ -1175,10 +1151,6 @@ def test_ignore_attributes_dataset(self):
11751151
paper_url=paper_url,
11761152
)
11771153

1178-
@pytest.mark.skipif(
1179-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
1180-
reason="Pending resolution of #1657",
1181-
)
11821154
@pytest.mark.test_server()
11831155
def test_publish_fetch_ignore_attribute(self):
11841156
"""Test to upload and retrieve dataset and check ignore_attributes"""
@@ -1298,10 +1270,6 @@ def test_create_dataset_row_id_attribute_error(self):
12981270
paper_url=paper_url,
12991271
)
13001272

1301-
@pytest.mark.skipif(
1302-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
1303-
reason="Pending resolution of #1657",
1304-
)
13051273
@pytest.mark.test_server()
13061274
def test_create_dataset_row_id_attribute_inference(self):
13071275
# meta-information
@@ -1470,10 +1438,6 @@ def test_data_edit_non_critical_field(self):
14701438
edited_dataset = openml.datasets.get_dataset(did)
14711439
assert edited_dataset.description == desc
14721440

1473-
@pytest.mark.skipif(
1474-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
1475-
reason="Pending resolution of #1657",
1476-
)
14771441
@pytest.mark.test_server()
14781442
def test_data_edit_critical_field(self):
14791443
# Case 2
@@ -1526,10 +1490,6 @@ def test_data_edit_requires_valid_dataset(self):
15261490
description="xor operation dataset",
15271491
)
15281492

1529-
@pytest.mark.skipif(
1530-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
1531-
reason="Pending resolution of #1657",
1532-
)
15331493
@pytest.mark.test_server()
15341494
def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self):
15351495
# Need to own a dataset to be able to edit meta-data

tests/test_flows/test_flow.py

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,6 @@ def test_to_xml_from_xml(self):
180180
openml.flows.functions.assert_flows_equal(new_flow, flow)
181181
assert new_flow is not flow
182182

183-
@pytest.mark.skipif(
184-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
185-
reason="Pending resolution of #1657",
186-
)
187183
@pytest.mark.sklearn()
188184
@pytest.mark.test_server()
189185
def test_publish_flow(self):
@@ -226,10 +222,6 @@ def test_publish_existing_flow(self, flow_exists_mock):
226222
f"collected from {__file__.split('/')[-1]}: {flow.flow_id}",
227223
)
228224

229-
@pytest.mark.skipif(
230-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
231-
reason="Pending resolution of #1657",
232-
)
233225
@pytest.mark.sklearn()
234226
@pytest.mark.test_server()
235227
def test_publish_flow_with_similar_components(self):
@@ -281,10 +273,6 @@ def test_publish_flow_with_similar_components(self):
281273
TestBase._mark_entity_for_removal("flow", flow3.flow_id, flow3.name)
282274
TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}")
283275

284-
@pytest.mark.skipif(
285-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
286-
reason="Pending resolution of #1657",
287-
)
288276
@pytest.mark.sklearn()
289277
@pytest.mark.test_server()
290278
def test_semi_legal_flow(self):
@@ -395,10 +383,6 @@ def get_sentinel():
395383
flow_id = openml.flows.flow_exists(name, version)
396384
assert not flow_id
397385

398-
@pytest.mark.skipif(
399-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
400-
reason="Pending resolution of #1657",
401-
)
402386
@pytest.mark.sklearn()
403387
@pytest.mark.test_server()
404388
def test_existing_flow_exists(self):
@@ -440,10 +424,6 @@ def test_existing_flow_exists(self):
440424
)
441425
assert downloaded_flow_id == flow.flow_id
442426

443-
@pytest.mark.skipif(
444-
os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
445-
reason="Pending resolution of #1657",
446-
)
447427
@pytest.mark.sklearn()
448428
@pytest.mark.test_server()
449429
def test_sklearn_to_upload_to_flow(self):

0 commit comments

Comments
 (0)