diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index e50d67710..773dda6f2 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -29,10 +29,7 @@ jobs: python-version: 3.8 - name: Install dependencies run: | - pip install -e .[docs,examples,examples_unix] - # dependency "fanova" does not work with numpy 1.24 or later - # https://github.com/automl/fanova/issues/108 - pip install numpy==1.23.5 + pip install -e .[docs,examples] - name: Make docs run: | cd doc @@ -64,4 +61,4 @@ jobs: git config --global user.email 'not@mail.com' git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }} git commit -am "$last_commit" - git push + git diff --quiet @{u} HEAD || git push diff --git a/.github/workflows/release_docker.yaml b/.github/workflows/release_docker.yaml index c8f8c59f8..fc629a4e4 100644 --- a/.github/workflows/release_docker.yaml +++ b/.github/workflows/release_docker.yaml @@ -8,9 +8,6 @@ on: - 'docker' tags: - 'v*' - pull_request: - branches: - - 'develop' concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -47,7 +44,7 @@ jobs: - name: Build and push id: docker_build - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: ./docker/ tags: ${{ steps.meta_dockerhub.outputs.tags }} @@ -57,7 +54,7 @@ jobs: - name: Update repo description if: ${{ startsWith(github.ref, 'refs/tags/v') }} - uses: peter-evans/dockerhub-description@v3 + uses: peter-evans/dockerhub-description@v4 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6a0408137..f2543bc53 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -118,7 +118,7 @@ jobs: fi - name: Upload coverage if: matrix.code-cov && always() - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 with: files: coverage.xml token: ${{ secrets.CODECOV_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5f13625a0..95e2a5239 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,20 +7,20 @@ files: | )/.*\.py$ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.14 + rev: v0.7.3 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix, --no-cache] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.8.0 + rev: v1.13.0 hooks: - id: mypy additional_dependencies: - types-requests - types-python-dateutil - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.27.3 + rev: 0.29.4 hooks: - id: check-github-workflows files: '^github/workflows/.*\.ya?ml$' @@ -28,7 +28,7 @@ repos: - id: check-dependabot files: '^\.github/dependabot\.ya?ml$' - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v5.0.0 hooks: - id: check-added-large-files files: ".*" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c2b4be187..cc8633f84 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -141,7 +141,7 @@ following rules before you submit a pull request: - If your pull request addresses an issue, please use the pull request title to describe the issue and mention the issue number in the pull request description. This will make sure a link back to the original issue is - created. + created. Make sure the title is descriptive enough to understand what the pull request does! 
- An incomplete contribution -- where you expect to do more work before receiving a full review -- should be submitted as a `draft`. These may be useful @@ -174,8 +174,6 @@ following rules before you submit a pull request: For the Bug-fixes case, at the time of the PR, this tests should fail for the code base in develop and pass for the PR code. - - Add your changes to the changelog in the file doc/progress.rst. - - If any source file is being added to the repository, please add the BSD 3-Clause license to it. @@ -201,17 +199,12 @@ Make sure your code has good unittest **coverage** (at least 80%). Pre-commit is used for various style checking and code formatting. Before each commit, it will automatically run: - - [black](https://black.readthedocs.io/en/stable/) a code formatter. + - [ruff](https://docs.astral.sh/ruff/) a code formatter and linter. This will automatically format your code. Make sure to take a second look after any formatting takes place, if the resulting code is very bloated, consider a (small) refactor. - *note*: If Black reformats your code, the commit will automatically be aborted. - Make sure to add the formatted files (back) to your commit after checking them. - [mypy](https://mypy.readthedocs.io/en/stable/) a static type checker. In particular, make sure each function you work on has type hints. - - [flake8](https://flake8.pycqa.org/en/latest/index.html) style guide enforcement. - Almost all of the black-formatted code should automatically pass this check, - but make sure to make adjustments if it does fail. If you want to run the pre-commit tests without doing a commit, run: ```bash @@ -224,23 +217,6 @@ $ pre-commit run --all-files Make sure to do this at least once before your first commit to check your setup works. Executing a specific unit test can be done by specifying the module, test case, and test. -To obtain a hierarchical list of all tests, run - -```bash -$ pytest --collect-only - - - - - - - - - - - -``` - You may then run a specific module, test case, or unit test respectively: ```bash $ pytest tests/test_datasets/test_dataset.py @@ -271,7 +247,7 @@ information. For building the documentation, you will need to install a few additional dependencies: ```bash -$ pip install -e .[docs] +$ pip install -e .[examples,docs] ``` When dependencies are installed, run ```bash diff --git a/README.md b/README.md index f13038faa..0bad7ac66 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,76 @@ -# OpenML-Python - -[![All Contributors](https://img.shields.io/badge/all_contributors-2-orange.svg?style=flat-square)](#contributors-) - -A python interface for [OpenML](http://openml.org), an online platform for open science collaboration in machine learning. -It can be used to download or upload OpenML data such as datasets and machine learning experiment results. -## General +
-* [Documentation](https://openml.github.io/openml-python). -* [Contribution guidelines](https://github.com/openml/openml-python/blob/develop/CONTRIBUTING.md). +
+
    + + OpenML Logo +

    OpenML-Python

    + Python Logo +
    +
+
+## The Python API for a World of Data and More :dizzy: + +[![Latest Release](https://img.shields.io/github/v/release/openml/openml-python)](https://github.com/openml/openml-python/releases) +[![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue)](https://pypi.org/project/openml/) +[![Downloads](https://static.pepy.tech/badge/openml)](https://pepy.tech/project/openml) [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) + + +[Installation](https://openml.github.io/openml-python/main/#how-to-get-openml-for-python) | [Documentation](https://openml.github.io/openml-python) | [Contribution guidelines](https://github.com/openml/openml-python/blob/develop/CONTRIBUTING.md) +
+ +OpenML-Python provides an easy-to-use and straightforward Python interface for [OpenML](http://openml.org), an online platform for open science collaboration in machine learning. +It can download or upload data from OpenML, such as datasets and machine learning experiment results. + +## :joystick: Minimal Example -## Citing OpenML-Python +Use the following code to get the [credit-g](https://www.openml.org/search?type=data&sort=runs&status=active&id=31) [dataset](https://docs.openml.org/concepts/data/): + +```python +import openml + +dataset = openml.datasets.get_dataset("credit-g") # or by ID get_dataset(31) +X, y, categorical_indicator, attribute_names = dataset.get_data(target="class") +``` -If you use OpenML-Python in a scientific publication, we would appreciate a reference to the -following paper: +Get a [task](https://docs.openml.org/concepts/tasks/) for [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31): + +```python +import openml + +task = openml.tasks.get_task(31) +dataset = task.get_dataset() +X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name) +# get splits for the first fold of 10-fold cross-validation +train_indices, test_indices = task.get_train_test_split_indices(fold=0) +``` + +Use an [OpenML benchmarking suite](https://docs.openml.org/concepts/benchmarking/) to get a curated list of machine-learning tasks: +```python +import openml + +suite = openml.study.get_suite("amlb-classification-all") # Get a curated list of tasks for classification +for task_id in suite.tasks: + task = openml.tasks.get_task(task_id) +``` + +## :magic_wand: Installation + +OpenML-Python is supported on Python 3.8 - 3.13 and is available on Linux, MacOS, and Windows. + +You can install OpenML-Python with: + +```bash +pip install openml +``` + +## :page_facing_up: Citing OpenML-Python + +If you use OpenML-Python in a scientific publication, we would appreciate a reference to the following paper: [Matthias Feurer, Jan N. van Rijn, Arlind Kadra, Pieter Gijsbers, Neeratyoy Mallik, Sahithya Ravi, Andreas Mรผller, Joaquin Vanschoren, Frank Hutter
**OpenML-Python: an extensible Python API for OpenML**
@@ -35,23 +89,3 @@ Bibtex entry: url = {http://jmlr.org/papers/v22/19-920.html} } ``` - -## Contributors ✨ - -Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): - - - - - - - - -

a-moadel

📖 💡

Neeratyoy Mallik

💻 📖 💡
- - - - - -This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! diff --git a/doc/index.rst b/doc/index.rst index a3b13c9e8..4ab56f5c3 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -71,7 +71,7 @@ Further information * `OpenML documentation `_ * `OpenML client APIs `_ -* `OpenML developer guide `_ +* `OpenML developer guide `_ * `Contact information `_ * `Citation request `_ * `OpenML blog `_ diff --git a/doc/progress.rst b/doc/progress.rst index 6496db7a8..3bf7c05aa 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -2,12 +2,12 @@ .. _progress: -========= -Changelog -========= +============================================= +Changelog (discontinued after version 0.15.0) +============================================= -next -~~~~~~ +See GitHub releases for the latest changes. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 0.15.0 ~~~~~~ diff --git a/doc/usage.rst b/doc/usage.rst index 8c713b586..f6476407e 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -118,7 +118,7 @@ this should be repeated several times. Also, the task defines a target metric for which a flow should be optimized. Below you can find our tutorial regarding tasks and if you want to know more -you can read the `OpenML guide `_: +you can read the `OpenML guide `_: * :ref:`sphx_glr_examples_30_extended_tasks_tutorial.py` diff --git a/examples/20_basic/simple_datasets_tutorial.py b/examples/20_basic/simple_datasets_tutorial.py index 35b325fd9..b90d53660 100644 --- a/examples/20_basic/simple_datasets_tutorial.py +++ b/examples/20_basic/simple_datasets_tutorial.py @@ -27,7 +27,7 @@ # ================== # Iris dataset https://www.openml.org/d/61 -dataset = openml.datasets.get_dataset(61) +dataset = openml.datasets.get_dataset(dataset_id="iris", version=1) # Print a summary print( diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py index 0176328b6..eec6d7e8b 100644 --- a/examples/20_basic/simple_flows_and_runs_tutorial.py +++ b/examples/20_basic/simple_flows_and_runs_tutorial.py @@ -20,8 +20,8 @@ # Train a machine learning model # ============================== -# NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20 -dataset = openml.datasets.get_dataset(20) +# NOTE: We are using dataset "diabetes" from the test server: https://test.openml.org/d/20 +dataset = openml.datasets.get_dataset(dataset_id="diabetes", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( target=dataset.default_target_attribute ) diff --git a/examples/20_basic/simple_suites_tutorial.py b/examples/20_basic/simple_suites_tutorial.py index 92dfb3c04..3daf7b992 100644 --- a/examples/20_basic/simple_suites_tutorial.py +++ b/examples/20_basic/simple_suites_tutorial.py @@ -39,7 +39,9 @@ # Downloading benchmark suites # ============================ -suite = openml.study.get_suite(99) +# OpenML Benchmarking Suites and the OpenML-CC18 +# https://www.openml.org/s/99 +suite = openml.study.get_suite("OpenML-CC18") print(suite) #################################################################################################### diff --git a/examples/30_extended/configure_logging.py b/examples/30_extended/configure_logging.py index 3d33f1546..3878b0436 100644 --- a/examples/30_extended/configure_logging.py +++ b/examples/30_extended/configure_logging.py @@ -24,7 +24,7 @@ import openml -openml.datasets.get_dataset("iris") +openml.datasets.get_dataset("iris", 
version=1) # With default configuration, the above example will show no output to console. # However, in your cache directory you should find a file named 'openml_python.log', @@ -39,7 +39,7 @@ openml.config.set_console_log_level(logging.DEBUG) openml.config.set_file_log_level(logging.WARNING) -openml.datasets.get_dataset("iris") +openml.datasets.get_dataset("iris", version=1) # Now the log level that was previously written to file should also be shown in the console. # The message is now no longer written to file as the `file_log` was set to level `WARNING`. diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index 764cb8f36..606455dd8 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -51,7 +51,7 @@ # ================= # This is done based on the dataset ID. -dataset = openml.datasets.get_dataset(1471) +dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1) # Print a summary print( @@ -87,8 +87,7 @@ # Starting from 0.15, not downloading data will be the default behavior instead. # The data will be downloading automatically when you try to access it through # openml objects, e.g., using `dataset.features`. -dataset = openml.datasets.get_dataset(1471, download_data=False) - +dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1, download_data=False) ############################################################################ # Exercise 2 # ********** diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py index 38b0d23cf..b7c000101 100644 --- a/examples/30_extended/flows_and_runs_tutorial.py +++ b/examples/30_extended/flows_and_runs_tutorial.py @@ -25,7 +25,7 @@ # Train a scikit-learn model on the data manually. # NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68 -dataset = openml.datasets.get_dataset(68) +dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( target=dataset.default_target_attribute ) @@ -36,7 +36,7 @@ # You can also ask for meta-data to automatically preprocess the data. # # * e.g. 
categorical features -> do feature encoding -dataset = openml.datasets.get_dataset(17) +dataset = openml.datasets.get_dataset(dataset_id="credit-g", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( target=dataset.default_target_attribute ) @@ -101,7 +101,7 @@ [ ( "categorical", - preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"), + preprocessing.OneHotEncoder(handle_unknown="ignore"), cat, # returns the categorical feature indices ), ( @@ -145,7 +145,7 @@ [ ( "categorical", - preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"), + preprocessing.OneHotEncoder(handle_unknown="ignore"), categorical_feature_indices, ), ( diff --git a/examples/30_extended/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py index a2bc3a4df..477e49fa6 100644 --- a/examples/30_extended/run_setup_tutorial.py +++ b/examples/30_extended/run_setup_tutorial.py @@ -58,7 +58,7 @@ cat_imp = make_pipeline( - OneHotEncoder(handle_unknown="ignore", sparse=False), + OneHotEncoder(handle_unknown="ignore"), TruncatedSVD(), ) cont_imp = SimpleImputer(strategy="median") diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py index d5bfcd88a..8715dfb4a 100644 --- a/examples/30_extended/study_tutorial.py +++ b/examples/30_extended/study_tutorial.py @@ -79,7 +79,8 @@ tasks = [115, 259, 307] # To verify -suite = openml.study.get_suite(1) +# https://test.openml.org/api/v1/study/1 +suite = openml.study.get_suite("OpenML100") print(all([t_id in suite.tasks for t_id in tasks])) run_ids = [] diff --git a/examples/30_extended/suites_tutorial.py b/examples/30_extended/suites_tutorial.py index ff9902356..935d4c529 100644 --- a/examples/30_extended/suites_tutorial.py +++ b/examples/30_extended/suites_tutorial.py @@ -37,7 +37,8 @@ ############################################################################ # This is done based on the dataset ID. -suite = openml.study.get_suite(99) +# https://www.openml.org/api/v1/study/99 +suite = openml.study.get_suite("OpenML-CC18") print(suite) ############################################################################ diff --git a/examples/40_paper/2015_neurips_feurer_example.py b/examples/40_paper/2015_neurips_feurer_example.py index 3960c3852..ae59c9ced 100644 --- a/examples/40_paper/2015_neurips_feurer_example.py +++ b/examples/40_paper/2015_neurips_feurer_example.py @@ -49,14 +49,14 @@ # this does not allow reproducibility (unclear splitting). Please do not use datasets but the # respective tasks as basis for a paper and publish task IDS. This example is only given to # showcase the use of OpenML-Python for a published paper and as a warning on how not to do it. -# Please check the `OpenML documentation of tasks `_ if you +# Please check the `OpenML documentation of tasks `_ if you # want to learn more about them. #################################################################################################### # This lists both active and inactive tasks (because of ``status='all'``). Unfortunately, # this is necessary as some of the datasets contain issues found after the publication and became # deactivated, which also deactivated the tasks on them. More information on active or inactive -# datasets can be found in the `online docs `_. +# datasets can be found in the `online docs `_. 
tasks = openml.tasks.list_tasks( task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION, status="all", diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py index d3ce59f35..6522013e3 100644 --- a/examples/40_paper/2018_kdd_rijn_example.py +++ b/examples/40_paper/2018_kdd_rijn_example.py @@ -4,8 +4,10 @@ A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). +Example Deprecation Warning! +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. Publication ~~~~~~~~~~~ @@ -14,6 +16,16 @@ | Jan N. van Rijn and Frank Hutter | In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 | Available at https://dl.acm.org/doi/10.1145/3219819.3220058 + +Requirements +~~~~~~~~~~~~ + +This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other +systems). + +The following Python packages are required: + +pip install openml[examples,docs] fanova ConfigSpace<1.0 """ # License: BSD 3-Clause @@ -26,148 +38,151 @@ ) exit() -import json -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. -# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. 
For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 - -suite = openml.study.get_suite("OpenML100") -flow_id = 7707 -parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} -evaluation_measure = "predictive_accuracy" -limit_per_task = 500 -limit_nr_tasks = 15 -n_trees = 16 - -fanova_results = [] -# we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. -for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] +# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline +print("This example is deprecated, remove the `if False` in this code to use it manually.") +if False: + import json + import fanova + import matplotlib.pyplot as plt + import pandas as pd + import seaborn as sns + + import openml + + + ############################################################################## + # With the advent of automated machine learning, automated hyperparameter + # optimization methods are by now routinely used in data mining. However, this + # progress is not yet matched by equal progress on automatic analyses that + # yield information beyond performance-optimizing hyperparameter settings. + # In this example, we aim to answer the following two questions: Given an + # algorithm, what are generally its most important hyperparameters? + # + # This work is carried out on the OpenML-100 benchmark suite, which can be + # obtained by ``openml.study.get_suite('OpenML100')``. In this example, we + # conduct the experiment on the Support Vector Machine (``flow_id=7707``) + # with specific kernel (we will perform a post-process filter operation for + # this). We should set some other experimental parameters (number of results + # per task, evaluation measure and the number of trees of the internal + # functional Anova) before the fun can begin. 
+ # + # Note that we simplify the example in several ways: + # + # 1) We only consider numerical hyperparameters + # 2) We consider all hyperparameters that are numerical (in reality, some + # hyperparameters might be inactive (e.g., ``degree``) or irrelevant + # (e.g., ``random_state``) + # 3) We assume all hyperparameters to be on uniform scale + # + # Any difference in conclusion between the actual paper and the presented + # results is most likely due to one of these simplifications. For example, + # the hyperparameter C looks rather insignificant, whereas it is quite + # important when it is put on a log-scale. All these simplifications can be + # addressed by defining a ConfigSpace. For a more elaborated example that uses + # this, please see: + # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 + + suite = openml.study.get_suite("OpenML100") + flow_id = 7707 + parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} + evaluation_measure = "predictive_accuracy" + limit_per_task = 500 + limit_nr_tasks = 15 + n_trees = 16 + + fanova_results = [] + # we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the + # communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. + for idx, task_id in enumerate(suite.tasks): + if limit_nr_tasks is not None and idx >= limit_nr_tasks: + continue + print( + "Starting with task %d (%d/%d)" + % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. 
Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): + # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) + evals = openml.evaluations.list_evaluations_setups( + evaluation_measure, + flows=[flow_id], + tasks=[task_id], + size=limit_per_task, + output_format="dataframe", + ) + + performance_column = "value" + # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance + # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine + # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format + # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for + # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the + # setups that belong to the flows embedded in this example though. try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } + setups_evals = pd.DataFrame( + [ + dict( + **{name: json.loads(value) for name, value in setup["parameters"].items()}, + **{performance_column: setup[performance_column]} + ) + for _, setup in evals.iterrows() + ] ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). + except json.decoder.JSONDecodeError as e: print("Task %d error: %s" % (task_id, e)) continue + # apply our filters, to have only the setups that comply to the hyperparameters we want + for filter_key, filter_value in parameter_filters.items(): + setups_evals = setups_evals[setups_evals[filter_key] == filter_value] + # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, + # the fanova library needs to be informed by using a configspace object. + setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) + # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, + # ``verbose``. + setups_evals = setups_evals[ + [ + c + for c in list(setups_evals) + if len(setups_evals[c].unique()) > 1 or c == performance_column + ] + ] + # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., + # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: -# transform ``fanova_results`` from a list of dicts into a DataFrame -fanova_results = pd.DataFrame(fanova_results) - -############################################################################## -# make the boxplot of the variance contribution. Obviously, we can also use -# this data to make the Nemenyi plot, but this relies on the rather complex -# ``Orange`` dependency (``pip install Orange3``). 
For the complete example, -# the reader is referred to the more elaborate script (referred to earlier) -fig, ax = plt.subplots() -sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) -ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") -ax.set_ylabel("Variance Contribution") -ax.set_xlabel(None) -plt.tight_layout() -plt.show() + # determine x values to pass to fanova library + parameter_names = [ + pname for pname in setups_evals.columns.to_numpy() if pname != performance_column + ] + evaluator = fanova.fanova.fANOVA( + X=setups_evals[parameter_names].to_numpy(), + Y=setups_evals[performance_column].to_numpy(), + n_trees=n_trees, + ) + for idx, pname in enumerate(parameter_names): + try: + fanova_results.append( + { + "hyperparameter": pname.split(".")[-1], + "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], + } + ) + except RuntimeError as e: + # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant + # for all configurations (there is no variance). We will skip these tasks (like the authors did in the + # paper). + print("Task %d error: %s" % (task_id, e)) + continue + + # transform ``fanova_results`` from a list of dicts into a DataFrame + fanova_results = pd.DataFrame(fanova_results) + + ############################################################################## + # make the boxplot of the variance contribution. Obviously, we can also use + # this data to make the Nemenyi plot, but this relies on the rather complex + # ``Orange`` dependency (``pip install Orange3``). For the complete example, + # the reader is referred to the more elaborate script (referred to earlier) + fig, ax = plt.subplots() + sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) + ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") + ax.set_ylabel("Variance Contribution") + ax.set_xlabel(None) + plt.tight_layout() + plt.show() diff --git a/openml/__version__.py b/openml/__version__.py index 6632a85f4..392bf4b37 100644 --- a/openml/__version__.py +++ b/openml/__version__.py @@ -5,4 +5,4 @@ # The following line *must* be the last in the module, exactly as formatted: from __future__ import annotations -__version__ = "0.15.0" +__version__ = "0.15.1" diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 4f673186e..3509f18e7 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -24,6 +24,7 @@ from .__version__ import __version__ from .exceptions import ( OpenMLHashException, + OpenMLNotAuthorizedError, OpenMLServerError, OpenMLServerException, OpenMLServerNoResult, @@ -36,6 +37,8 @@ FILE_ELEMENTS_TYPE = Dict[str, Union[str, Tuple[str, str]]] DATABASE_CONNECTION_ERRCODE = 107 +API_TOKEN_HELP_LINK = "https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial.html#authentication" # noqa: S105 + def _robot_delay(n: int) -> float: wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60 @@ -208,6 +211,8 @@ def _download_minio_bucket(source: str, destination: str | Path) -> None: for file_object in client.list_objects(bucket, prefix=prefix, recursive=True): if file_object.object_name is None: raise ValueError(f"Object name is None for object {file_object!r}") + if file_object.etag is None: + raise ValueError(f"Object etag is None for object {file_object!r}") marker = destination / file_object.etag if marker.exists(): @@ -351,7 +356,7 @@ def __is_checksum_equal(downloaded_file_binary: bytes, md5_checksum: str | None return md5_checksum == 
md5_checksum_download -def _send_request( # noqa: C901 +def _send_request( # noqa: C901, PLR0912 request_method: str, url: str, data: DATA_TYPE, @@ -387,18 +392,15 @@ # -- Check if encoding is not UTF-8 perhaps if __is_checksum_equal(response.content, md5_checksum): raise OpenMLHashException( - "Checksum of downloaded file is unequal to the expected checksum {}" - "because the text encoding is not UTF-8 when downloading {}. " - "There might be a sever-sided issue with the file, " - "see: https://github.com/openml/openml-python/issues/1180.".format( - md5_checksum, - url, - ), + f"Checksum of downloaded file is unequal to the expected checksum " + f"{md5_checksum} because the text encoding is not UTF-8 when " + f"downloading {url}. There might be a server-side issue with the file, " + "see: https://github.com/openml/openml-python/issues/1180.", ) raise OpenMLHashException( - "Checksum of downloaded file is unequal to the expected checksum {} " - "when downloading {}.".format(md5_checksum, url), + f"Checksum of downloaded file is unequal to the expected checksum " + f"{md5_checksum} when downloading {url}.", ) return response @@ -457,26 +459,33 @@ def __parse_server_exception( url: str, file_elements: FILE_ELEMENTS_TYPE | None, ) -> OpenMLServerError: - if response.status_code == 414: + if response.status_code == requests.codes.URI_TOO_LONG: raise OpenMLServerError(f"URI too long! ({url})") + # OpenML has a sophisticated error system where information about failures is provided + # in the response body itself. + # First, we need to parse it out. try: server_exception = xmltodict.parse(response.text) except xml.parsers.expat.ExpatError as e: raise e - except Exception as e: # noqa: BLE001 - # OpenML has a sophisticated error system - # where information about failures is provided. try to parse this + except Exception as e: + # If we failed to parse it out, then something has gone wrong in the body sent back + # from the server and there is little extra information we can capture. raise OpenMLServerError( f"Unexpected server error when calling {url}. Please contact the developers!\n" f"Status code: {response.status_code}\n{response.text}", ) from e + # Now we can parse out the specific error codes that we return. These + # are in addition to the typical HTTP error codes, but encode more + # specific information.
You can find these codes here: + # https://github.com/openml/OpenML/blob/develop/openml_OS/views/pages/api_new/v1/xml/pre.php server_error = server_exception["oml:error"] code = int(server_error["oml:code"]) message = server_error["oml:message"] additional_information = server_error.get("oml:additional_information") - if code in [372, 512, 500, 482, 542, 674]: + if code in [111, 372, 512, 500, 482, 542, 674]: if additional_information: full_message = f"{message} - {additional_information}" else: @@ -484,10 +493,9 @@ def __parse_server_exception( # 512 for runs, 372 for datasets, 500 for flows # 482 for tasks, 542 for evaluations, 674 for setups - return OpenMLServerNoResult( - code=code, - message=full_message, - ) + # 111 for dataset descriptions + return OpenMLServerNoResult(code=code, message=full_message, url=url) + # 163: failure to validate flow XML (https://www.openml.org/api_docs#!/flow/post_flow) if code in [163] and file_elements is not None and "description" in file_elements: # file_elements['description'] is the XML file description of the flow @@ -498,4 +506,21 @@ def __parse_server_exception( ) else: full_message = f"{message} - {additional_information}" + + if code in [ + 102, # flow/exists post + 137, # dataset post + 350, # dataset/42 delete + 310, # flow/ post + 320, # flow/42 delete + 400, # run/42 delete + 460, # task/42 delete + ]: + msg = ( + f"The API call {url} requires authentication via an API key.\nPlease configure " + "OpenML-Python to use your API as described in this example:" + "\nhttps://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial.html#authentication" + ) + return OpenMLNotAuthorizedError(message=msg) + return OpenMLServerException(code=code, message=full_message, url=url) diff --git a/openml/cli.py b/openml/cli.py index 5732442d0..d0a46e498 100644 --- a/openml/cli.py +++ b/openml/cli.py @@ -1,4 +1,5 @@ -""""Command Line Interface for `openml` to configure its settings.""" +"""Command Line Interface for `openml` to configure its settings.""" + from __future__ import annotations import argparse diff --git a/openml/config.py b/openml/config.py index 6a37537dc..d838b070a 100644 --- a/openml/config.py +++ b/openml/config.py @@ -8,10 +8,12 @@ import logging.handlers import os import platform +import shutil import warnings +from contextlib import contextmanager from io import StringIO from pathlib import Path -from typing import Any, cast +from typing import Any, Iterator, cast from typing_extensions import Literal, TypedDict from urllib.parse import urlparse @@ -20,6 +22,9 @@ console_handler: logging.StreamHandler | None = None file_handler: logging.handlers.RotatingFileHandler | None = None +OPENML_CACHE_DIR_ENV_VAR = "OPENML_CACHE_DIR" +OPENML_SKIP_PARQUET_ENV_VAR = "OPENML_SKIP_PARQUET" + class _Config(TypedDict): apikey: str @@ -101,14 +106,50 @@ def set_file_log_level(file_output_level: int) -> None: # Default values (see also https://github.com/openml/OpenML/wiki/Client-API-Standards) _user_path = Path("~").expanduser().absolute() + + +def _resolve_default_cache_dir() -> Path: + user_defined_cache_dir = os.environ.get(OPENML_CACHE_DIR_ENV_VAR) + if user_defined_cache_dir is not None: + return Path(user_defined_cache_dir) + + if platform.system().lower() != "linux": + return _user_path / ".openml" + + xdg_cache_home = os.environ.get("XDG_CACHE_HOME") + if xdg_cache_home is None: + return Path("~", ".cache", "openml") + + # This is the proper XDG_CACHE_HOME directory, but + # we unfortunately had a problem where we used 
XDG_CACHE_HOME/org, + # we check heuristically if this old directory still exists and issue + # a warning if it does. There's too much data to move to do this for the user. + + # The new cache directory exists + cache_dir = Path(xdg_cache_home) / "openml" + if cache_dir.exists(): + return cache_dir + + # The old cache directory *does not* exist + heuristic_dir_for_backwards_compat = Path(xdg_cache_home) / "org" / "openml" + if not heuristic_dir_for_backwards_compat.exists(): + return cache_dir + + root_dir_to_delete = Path(xdg_cache_home) / "org" + openml_logger.warning( + "An old cache directory was found at '%s'. This directory is no longer used by " + "OpenML-Python. To silence this warning you would need to delete the old cache " + "directory. The cached files will then be located in '%s'.", + root_dir_to_delete, + cache_dir, + ) + return Path(xdg_cache_home) + + _defaults: _Config = { "apikey": "", "server": "https://www.openml.org/api/v1/xml", - "cachedir": ( - Path(os.environ.get("XDG_CACHE_HOME", _user_path / ".cache" / "openml")) - if platform.system() == "Linux" - else _user_path / ".openml" - ), + "cachedir": _resolve_default_cache_dir(), "avoid_duplicate_runs": True, "retry_policy": "human", "connection_n_retries": 5, @@ -135,11 +176,11 @@ def get_server_base_url() -> str: apikey: str = _defaults["apikey"] show_progress: bool = _defaults["show_progress"] # The current cache directory (without the server name) -_root_cache_directory = Path(_defaults["cachedir"]) +_root_cache_directory: Path = Path(_defaults["cachedir"]) avoid_duplicate_runs = _defaults["avoid_duplicate_runs"] -retry_policy = _defaults["retry_policy"] -connection_n_retries = _defaults["connection_n_retries"] +retry_policy: Literal["human", "robot"] = _defaults["retry_policy"] +connection_n_retries: int = _defaults["connection_n_retries"] def set_retry_policy(value: Literal["human", "robot"], n_retries: int | None = None) -> None: @@ -218,11 +259,66 @@ def stop_using_configuration_for_example(cls) -> None: cls._start_last_called = False +def _handle_xdg_config_home_backwards_compatibility( + xdg_home: str, +) -> Path: + # NOTE(eddiebergman): A previous bug resulted in the config + # file being located at `${XDG_CONFIG_HOME}/config` instead + # of `${XDG_CONFIG_HOME}/openml/config`. To maintain backwards + # compatibility, where users may already have had a configuration, + # we copy it over and issue a warning until it's deleted. + # As a heuristic to ensure that it's "our" config file, we try to parse it first. + config_dir = Path(xdg_home) / "openml" + + backwards_compat_config_file = Path(xdg_home) / "config" + if not backwards_compat_config_file.exists(): + return config_dir + + # If it errors, that's a good sign it's not ours and we can + # safely ignore it, jumping out of this block. This is a heuristic + try: + _parse_config(backwards_compat_config_file) + except Exception: # noqa: BLE001 + return config_dir + + # Looks like it's ours, let's try to copy it to the correct place + correct_config_location = config_dir / "config" + try: + # We copy and return the new copied location + shutil.copy(backwards_compat_config_file, correct_config_location) + openml_logger.warning( + "An openml configuration file was found at the old location " + f"at {backwards_compat_config_file}. We have copied it to the new " + f"location at {correct_config_location}. 
" + "\nTo silence this warning please verify that the configuration file " + f"at {correct_config_location} is correct and delete the file at " + f"{backwards_compat_config_file}." + ) + return config_dir + except Exception as e: # noqa: BLE001 + # We failed to copy and its ours, return the old one. + openml_logger.warning( + "While attempting to perform a backwards compatible fix, we " + f"failed to copy the openml config file at " + f"{backwards_compat_config_file}' to {correct_config_location}" + f"\n{type(e)}: {e}", + "\n\nTo silence this warning, please copy the file " + "to the new location and delete the old file at " + f"{backwards_compat_config_file}.", + ) + return backwards_compat_config_file + + def determine_config_file_path() -> Path: - if platform.system() == "Linux": - config_dir = Path(os.environ.get("XDG_CONFIG_HOME", Path("~") / ".config" / "openml")) + if platform.system().lower() == "linux": + xdg_home = os.environ.get("XDG_CONFIG_HOME") + if xdg_home is not None: + config_dir = _handle_xdg_config_home_backwards_compatibility(xdg_home) + else: + config_dir = Path("~", ".config", "openml") else: config_dir = Path("~") / ".openml" + # Still use os.path.expanduser to trigger the mock in the unit test config_dir = Path(config_dir).expanduser().resolve() return config_dir / "config" @@ -251,7 +347,10 @@ def _setup(config: _Config | None = None) -> None: if not config_dir.exists(): config_dir.mkdir(exist_ok=True, parents=True) except PermissionError: - pass + openml_logger.warning( + f"No permission to create OpenML directory at {config_dir}!" + " This can result in OpenML-Python not working properly." + ) if config is None: config = _parse_config(config_file) @@ -260,36 +359,29 @@ def _setup(config: _Config | None = None) -> None: apikey = config["apikey"] server = config["server"] show_progress = config["show_progress"] - short_cache_dir = Path(config["cachedir"]) n_retries = int(config["connection_n_retries"]) set_retry_policy(config["retry_policy"], n_retries) + user_defined_cache_dir = os.environ.get(OPENML_CACHE_DIR_ENV_VAR) + if user_defined_cache_dir is not None: + short_cache_dir = Path(user_defined_cache_dir) + else: + short_cache_dir = Path(config["cachedir"]) _root_cache_directory = short_cache_dir.expanduser().resolve() try: cache_exists = _root_cache_directory.exists() - except PermissionError: - cache_exists = False - - # create the cache subdirectory - try: - if not _root_cache_directory.exists(): + # create the cache subdirectory + if not cache_exists: _root_cache_directory.mkdir(exist_ok=True, parents=True) + _create_log_handlers() except PermissionError: openml_logger.warning( - "No permission to create openml cache directory at %s! This can result in " - "OpenML-Python not working properly." % _root_cache_directory, + f"No permission to create OpenML directory at {_root_cache_directory}!" + " This can result in OpenML-Python not working properly." ) - - if cache_exists: - _create_log_handlers() - else: _create_log_handlers(create_file_handler=False) - openml_logger.warning( - "No permission to create OpenML directory at %s! This can result in OpenML-Python " - "not working properly." 
% config_dir, - ) def set_field_in_config_file(field: str, value: Any) -> None: @@ -407,6 +499,18 @@ def set_root_cache_directory(root_cache_directory: str | Path) -> None: stop_using_configuration_for_example = ConfigurationForExamples.stop_using_configuration_for_example +@contextmanager +def overwrite_config_context(config: dict[str, Any]) -> Iterator[_Config]: + """A context manager to temporarily override variables in the configuration.""" + existing_config = get_config_as_dict() + merged_config = {**existing_config, **config} + + _setup(merged_config) # type: ignore + yield merged_config # type: ignore + + _setup(existing_config) + + __all__ = [ "get_cache_directory", "set_root_cache_directory", diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 30febcba5..5190ac522 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -3,6 +3,7 @@ import gzip import logging +import os import pickle import re import warnings @@ -17,6 +18,7 @@ import xmltodict from openml.base import OpenMLBase +from openml.config import OPENML_SKIP_PARQUET_ENV_VAR from openml.exceptions import PyOpenMLError from .data_feature import OpenMLDataFeature @@ -156,14 +158,14 @@ def find_invalid_characters(string: str, pattern: str) -> str: ) if dataset_id is None: - pattern = "^[\x00-\x7F]*$" + pattern = "^[\x00-\x7f]*$" if description and not re.match(pattern, description): # not basiclatin (XSD complains) invalid_characters = find_invalid_characters(description, pattern) raise ValueError( f"Invalid symbols {invalid_characters} in description: {description}", ) - pattern = "^[\x00-\x7F]*$" + pattern = "^[\x00-\x7f]*$" if citation and not re.match(pattern, citation): # not basiclatin (XSD complains) invalid_characters = find_invalid_characters(citation, pattern) @@ -329,13 +331,26 @@ def __eq__(self, other: Any) -> bool: "version", "upload_date", "url", + "_parquet_url", "dataset", "data_file", + "format", + "cache_format", + } + + cache_fields = { + "_dataset", + "data_file", + "data_pickle_file", + "data_feather_file", + "feather_attribute_file", + "parquet_file", } # check that common keys and values are identical - self_keys = set(self.__dict__.keys()) - server_fields - other_keys = set(other.__dict__.keys()) - server_fields + ignore_fields = server_fields | cache_fields + self_keys = set(self.__dict__.keys()) - ignore_fields + other_keys = set(other.__dict__.keys()) - ignore_fields return self_keys == other_keys and all( self.__dict__[key] == other.__dict__[key] for key in self_keys ) @@ -345,8 +360,10 @@ def _download_data(self) -> None: # import required here to avoid circular import. 
from .functions import _get_dataset_arff, _get_dataset_parquet - if self._parquet_url is not None: - self.parquet_file = str(_get_dataset_parquet(self)) + skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true" + if self._parquet_url is not None and not skip_parquet: + parquet_file = _get_dataset_parquet(self) + self.parquet_file = None if parquet_file is None else str(parquet_file) if self.parquet_file is None: self.data_file = str(_get_dataset_arff(self)) @@ -574,7 +591,7 @@ def _parse_data_from_file(self, data_file: Path) -> tuple[list[str], list[bool], def _parse_data_from_pq(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]: try: data = pd.read_parquet(data_file) - except Exception as e: # noqa: BLE001 + except Exception as e: raise Exception(f"File: {data_file}") from e categorical = [data[c].dtype.name == "category" for c in data.columns] attribute_names = list(data.columns) @@ -816,7 +833,7 @@ def get_data( # noqa: C901, PLR0912, PLR0915 to_exclude.extend(self.ignore_attribute) if len(to_exclude) > 0: - logger.info("Going to remove the following attributes: %s" % to_exclude) + logger.info(f"Going to remove the following attributes: {to_exclude}") keep = np.array([column not in to_exclude for column in attribute_names]) data = data.loc[:, keep] if isinstance(data, pd.DataFrame) else data[:, keep] @@ -1077,7 +1094,9 @@ def _read_features(features_file: Path) -> dict[int, OpenMLDataFeature]: def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature]: - xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value")) + xml_dict = xmltodict.parse( + features_xml_string, force_list=("oml:feature", "oml:nominal_value"), strip_whitespace=False + ) features_xml = xml_dict["oml:data_features"] features: dict[int, OpenMLDataFeature] = {} diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 410867b01..3f3c709f9 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -3,9 +3,11 @@ from __future__ import annotations import logging +import os import warnings from collections import OrderedDict from pathlib import Path +from pyexpat import ExpatError from typing import TYPE_CHECKING, Any, overload from typing_extensions import Literal @@ -15,11 +17,11 @@ import pandas as pd import urllib3 import xmltodict -from pyexpat import ExpatError from scipy.sparse import coo_matrix import openml._api_calls import openml.utils +from openml.config import OPENML_SKIP_PARQUET_ENV_VAR from openml.exceptions import ( OpenMLHashException, OpenMLPrivateDatasetError, @@ -85,8 +87,7 @@ def list_datasets( *, output_format: Literal["dataframe"], **kwargs: Any, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... @overload @@ -98,8 +99,7 @@ def list_datasets( tag: str | None, output_format: Literal["dataframe"], **kwargs: Any, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... @overload @@ -111,8 +111,7 @@ def list_datasets( tag: str | None = ..., output_format: Literal["dict"] = "dict", **kwargs: Any, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... def list_datasets( @@ -207,8 +206,7 @@ def _list_datasets( data_id: list | None = ..., output_format: Literal["dict"] = "dict", **kwargs: Any, -) -> dict: - ... +) -> dict: ... @overload @@ -216,8 +214,7 @@ def _list_datasets( data_id: list | None = ..., output_format: Literal["dataframe"] = "dataframe", **kwargs: Any, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... 
def _list_datasets( @@ -256,18 +253,16 @@ def _list_datasets( for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" if data_id is not None: - api_call += "/data_id/%s" % ",".join([str(int(i)) for i in data_id]) + api_call += "/data_id/{}".format(",".join([str(int(i)) for i in data_id])) return __list_datasets(api_call=api_call, output_format=output_format) @overload -def __list_datasets(api_call: str, output_format: Literal["dict"] = "dict") -> dict: - ... +def __list_datasets(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ... @overload -def __list_datasets(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: - ... +def __list_datasets(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ... def __list_datasets( @@ -484,7 +479,7 @@ def get_dataset( # noqa: C901, PLR0912 Parameters ---------- dataset_id : int or str - Dataset ID of the dataset to download + The ID or name of the dataset to download. download_data : bool (default=False) If True, also download the data file. Beware that some datasets are large and it might make the operation noticeably slower. Metadata is also still retrieved. @@ -567,7 +562,10 @@ def get_dataset( # noqa: C901, PLR0912 if download_qualities: qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id) - if "oml:parquet_url" in description and download_data: + parquet_file = None + skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true" + download_parquet = "oml:parquet_url" in description and not skip_parquet + if download_parquet and (download_data or download_all_files): try: parquet_file = _get_dataset_parquet( description, @@ -575,12 +573,11 @@ def get_dataset( # noqa: C901, PLR0912 ) except urllib3.exceptions.MaxRetryError: parquet_file = None - else: - parquet_file = None arff_file = None if parquet_file is None and download_data: - logger.warning("Failed to download parquet, fallback on ARFF.") + if download_parquet: + logger.warning("Failed to download parquet, fallback on ARFF.") arff_file = _get_dataset_arff(description) remove_dataset_cache = False @@ -785,10 +782,8 @@ def create_dataset( # noqa: C901, PLR0912, PLR0915 if not is_row_id_an_attribute: raise ValueError( "'row_id_attribute' should be one of the data attribute. " - " Got '{}' while candidates are {}.".format( - row_id_attribute, - [attr[0] for attr in attributes_], - ), + f" Got '{row_id_attribute}' while candidates are" + f" {[attr[0] for attr in attributes_]}.", ) if isinstance(data, pd.DataFrame): @@ -870,7 +865,7 @@ def status_update(data_id: int, status: Literal["active", "deactivated"]) -> Non Updates the status of a dataset to either 'active' or 'deactivated'. Please see the OpenML API documentation for a description of the status and all legal status transitions: - https://docs.openml.org/#dataset-status + https://docs.openml.org/concepts/data/#dataset-status Parameters ---------- diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index a854686d1..a39096a58 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -32,8 +32,7 @@ def list_evaluations( per_fold: bool | None = ..., sort_order: str | None = ..., output_format: Literal["dict", "object"] = "dict", -) -> dict: - ... +) -> dict: ... @overload @@ -51,8 +50,7 @@ def list_evaluations( per_fold: bool | None = ..., sort_order: str | None = ..., output_format: Literal["dataframe"] = ..., -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... 
def list_evaluations( @@ -204,24 +202,24 @@ def _list_evaluations( ------- dict of objects, or dataframe """ - api_call = "evaluation/list/function/%s" % function + api_call = f"evaluation/list/function/{function}" if kwargs is not None: for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" if tasks is not None: - api_call += "/task/%s" % ",".join([str(int(i)) for i in tasks]) + api_call += "/task/{}".format(",".join([str(int(i)) for i in tasks])) if setups is not None: - api_call += "/setup/%s" % ",".join([str(int(i)) for i in setups]) + api_call += "/setup/{}".format(",".join([str(int(i)) for i in setups])) if flows is not None: - api_call += "/flow/%s" % ",".join([str(int(i)) for i in flows]) + api_call += "/flow/{}".format(",".join([str(int(i)) for i in flows])) if runs is not None: - api_call += "/run/%s" % ",".join([str(int(i)) for i in runs]) + api_call += "/run/{}".format(",".join([str(int(i)) for i in runs])) if uploaders is not None: - api_call += "/uploader/%s" % ",".join([str(int(i)) for i in uploaders]) + api_call += "/uploader/{}".format(",".join([str(int(i)) for i in uploaders])) if study is not None: api_call += "/study/%d" % study if sort_order is not None: - api_call += "/sort_order/%s" % sort_order + api_call += f"/sort_order/{sort_order}" return __list_evaluations(api_call, output_format=output_format) @@ -236,7 +234,7 @@ def __list_evaluations( # Minimalistic check if the XML is useful if "oml:evaluations" not in evals_dict: raise ValueError( - "Error in return XML, does not contain " '"oml:evaluations": %s' % str(evals_dict), + "Error in return XML, does not contain " f'"oml:evaluations": {evals_dict!s}', ) assert isinstance(evals_dict["oml:evaluations"]["oml:evaluation"], list), type( diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 02322196e..2d40d03b8 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -48,12 +48,27 @@ r"(?P(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$", ) -sctypes = np.sctypes if Version(np.__version__) < Version("2.0") else np.core.sctypes +# NOTE(eddiebergman): This was imported before but became deprecated, +# as a result I just enumerated them manually by copy-ing and pasting, +# recommended solution in Numpy 2.0 guide was to explicitly list them. 
SIMPLE_NUMPY_TYPES = [ - nptype - for type_cat, nptypes in sctypes.items() - for nptype in nptypes # type: ignore - if type_cat != "others" + np.int8, + np.int16, + np.int32, + np.int64, + np.longlong, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.ulonglong, + np.float16, + np.float32, + np.float64, + np.longdouble, + np.complex64, + np.complex128, + np.clongdouble, ] SIMPLE_TYPES = (bool, int, float, str, *SIMPLE_NUMPY_TYPES) @@ -312,7 +327,7 @@ def flow_to_model( strict_version=strict_version, ) - def _deserialize_sklearn( # noqa: PLR0915, C901, PLR0913, PLR0912 + def _deserialize_sklearn( # noqa: PLR0915, C901, PLR0912 self, o: Any, components: dict | None = None, @@ -419,7 +434,7 @@ def _deserialize_sklearn( # noqa: PLR0915, C901, PLR0913, PLR0912 strict_version=strict_version, ) else: - raise ValueError("Cannot flow_to_sklearn %s" % serialized_type) + raise ValueError(f"Cannot flow_to_sklearn {serialized_type}") else: rval = OrderedDict( @@ -979,17 +994,17 @@ def flatten_all(list_): # length 2 is for {VotingClassifier.estimators, # Pipeline.steps, FeatureUnion.transformer_list} # length 3 is for ColumnTransformer - msg = "Length of tuple of type {} does not match assumptions".format( - sub_component_type, + raise ValueError( + f"Length of tuple of type {sub_component_type}" + " does not match assumptions" ) - raise ValueError(msg) if isinstance(sub_component, str): if sub_component not in SKLEARN_PIPELINE_STRING_COMPONENTS: msg = ( "Second item of tuple does not match assumptions. " "If string, can be only 'drop' or 'passthrough' but" - "got %s" % sub_component + f"got {sub_component}" ) raise ValueError(msg) elif sub_component is None: @@ -1002,15 +1017,15 @@ def flatten_all(list_): elif not isinstance(sub_component, OpenMLFlow): msg = ( "Second item of tuple does not match assumptions. " - "Expected OpenMLFlow, got %s" % type(sub_component) + f"Expected OpenMLFlow, got {type(sub_component)}" ) raise TypeError(msg) if identifier in reserved_keywords: parent_model = f"{model.__module__}.{model.__class__.__name__}" - msg = "Found element shadowing official " "parameter for {}: {}".format( - parent_model, - identifier, + msg = ( + "Found element shadowing official " + f"parameter for {parent_model}: {identifier}" ) raise PyOpenMLError(msg) @@ -1035,9 +1050,9 @@ def flatten_all(list_): model=None, ) component_reference: OrderedDict[str, str | dict] = OrderedDict() - component_reference[ - "oml-python:serialized_object" - ] = COMPOSITION_STEP_CONSTANT + component_reference["oml-python:serialized_object"] = ( + COMPOSITION_STEP_CONSTANT + ) cr_value: dict[str, Any] = OrderedDict() cr_value["key"] = identifier cr_value["step_name"] = identifier @@ -1218,7 +1233,7 @@ def _check_dependencies( for dependency_string in dependencies_list: match = DEPENDENCIES_PATTERN.match(dependency_string) if not match: - raise ValueError("Cannot parse dependency %s" % dependency_string) + raise ValueError(f"Cannot parse dependency {dependency_string}") dependency_name = match.group("name") operation = match.group("operation") @@ -1237,7 +1252,7 @@ def _check_dependencies( installed_version > required_version or installed_version == required_version ) else: - raise NotImplementedError("operation '%s' is not supported" % operation) + raise NotImplementedError(f"operation '{operation}' is not supported") message = ( "Trying to deserialize a model with dependency " f"{dependency_string} not satisfied." 
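On the `np.sctypes` replacement at the top of this hunk: NumPy 2.0 removed `np.sctypes`, so the scalar types are now listed explicitly rather than collected dynamically. A small sketch (with a deliberately abridged list) of why the explicit tuple supports the same membership checks as before:

```python
import numpy as np

# Abridged stand-in for the explicit enumeration above; np.sctypes is gone in NumPy 2.0.
SIMPLE_NUMPY_TYPES = [np.int8, np.int32, np.int64, np.float32, np.float64, np.complex128]
SIMPLE_TYPES = (bool, int, float, str, *SIMPLE_NUMPY_TYPES)

assert isinstance(np.float64(3.14), SIMPLE_TYPES)   # numpy scalars are covered
assert isinstance(3, SIMPLE_TYPES)                   # plain Python ints are covered
assert not isinstance(np.datetime64("2024-01-01"), SIMPLE_TYPES)  # not a "simple" type
```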
@@ -1363,7 +1378,7 @@ def _serialize_cross_validator(self, o: Any) -> OrderedDict[str, str | dict]: with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always", DeprecationWarning) value = getattr(o, key, None) - if w is not None and len(w) and w[0].category == DeprecationWarning: + if w is not None and len(w) and w[0].category is DeprecationWarning: # if the parameter is deprecated, don't show it continue @@ -1812,9 +1827,9 @@ def _prediction_to_probabilities( # then we need to add a column full of zeros into the probabilities # for class 3 because the rest of the library expects that the # probabilities are ordered the same way as the classes are ordered). - message = "Estimator only predicted for {}/{} classes!".format( - proba_y.shape[1], - len(task.class_labels), + message = ( + f"Estimator only predicted for {proba_y.shape[1]}/{len(task.class_labels)}" + " classes!" ) warnings.warn(message, stacklevel=2) openml.config.logger.warning(message) @@ -2008,9 +2023,8 @@ def is_subcomponent_specification(values): pass else: raise TypeError( - "Subcomponent flow should be of type flow, but is {}".format( - type(subcomponent_flow), - ), + "Subcomponent flow should be of type flow, but is" + f" {type(subcomponent_flow)}", ) current = { @@ -2129,8 +2143,8 @@ def instantiate_model_from_hpo_class( """ if not self._is_hpo_class(model): raise AssertionError( - "Flow model %s is not an instance of sklearn.model_selection._search.BaseSearchCV" - % model, + f"Flow model {model} is not an instance of" + " sklearn.model_selection._search.BaseSearchCV", ) base_estimator = model.estimator base_estimator.set_params(**trace_iteration.get_parameters()) @@ -2197,8 +2211,8 @@ def _obtain_arff_trace( """ if not self._is_hpo_class(model): raise AssertionError( - "Flow model %s is not an instance of sklearn.model_selection._search.BaseSearchCV" - % model, + f"Flow model {model} is not an instance of " + "sklearn.model_selection._search.BaseSearchCV", ) if not hasattr(model, "cv_results_"): raise ValueError("model should contain `cv_results_`") @@ -2235,7 +2249,7 @@ def _obtain_arff_trace( # hyperparameter layer_sizes of MLPClassifier type = "STRING" # noqa: A001 else: - raise TypeError("Unsupported param type in param grid: %s" % key) + raise TypeError(f"Unsupported param type in param grid: {key}") # renamed the attribute param to parameter, as this is a required # OpenML convention - this also guards against name collisions diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 4e437e35c..a3ff50ca1 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -135,15 +135,13 @@ def __init__( # noqa: PLR0913 keys_parameters_meta_info = set(parameters_meta_info.keys()) if len(keys_parameters.difference(keys_parameters_meta_info)) > 0: raise ValueError( - "Parameter %s only in parameters, but not in " - "parameters_meta_info." - % str(keys_parameters.difference(keys_parameters_meta_info)), + f"Parameter {keys_parameters.difference(keys_parameters_meta_info)!s} only in " + "parameters, but not in parameters_meta_info.", ) if len(keys_parameters_meta_info.difference(keys_parameters)) > 0: raise ValueError( - "Parameter %s only in parameters_meta_info, " - "but not in parameters." 
- % str(keys_parameters_meta_info.difference(keys_parameters)), + f"Parameter {keys_parameters_meta_info.difference(keys_parameters)!s} only in " + " parameters_meta_info, but not in parameters.", ) self.external_version = external_version diff --git a/openml/flows/functions.py b/openml/flows/functions.py index b01e54b44..3d056ac60 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -140,8 +140,7 @@ def list_flows( tag: str | None = ..., output_format: Literal["dict"] = "dict", **kwargs: Any, -) -> dict: - ... +) -> dict: ... @overload @@ -152,8 +151,7 @@ def list_flows( *, output_format: Literal["dataframe"], **kwargs: Any, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... @overload @@ -163,8 +161,7 @@ def list_flows( tag: str | None, output_format: Literal["dataframe"], **kwargs: Any, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... def list_flows( @@ -243,18 +240,15 @@ def list_flows( @overload -def _list_flows(output_format: Literal["dict"] = ..., **kwargs: Any) -> dict: - ... +def _list_flows(output_format: Literal["dict"] = ..., **kwargs: Any) -> dict: ... @overload -def _list_flows(*, output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: - ... +def _list_flows(*, output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ... @overload -def _list_flows(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: - ... +def _list_flows(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ... def _list_flows( @@ -391,13 +385,11 @@ def get_flow_id( @overload -def __list_flows(api_call: str, output_format: Literal["dict"] = "dict") -> dict: - ... +def __list_flows(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ... @overload -def __list_flows(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: - ... +def __list_flows(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ... def __list_flows( @@ -453,7 +445,7 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None: while len(stack) > 0: current = stack.pop() if current.flow_id is None: - raise ValueError("Flow %s has no flow_id!" % current.name) + raise ValueError(f"Flow {current.name} has no flow_id!") for component in current.components.values(): stack.append(component) @@ -492,10 +484,10 @@ def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915 Whether to ignore matching of flow descriptions. """ if not isinstance(flow1, OpenMLFlow): - raise TypeError("Argument 1 must be of type OpenMLFlow, but is %s" % type(flow1)) + raise TypeError(f"Argument 1 must be of type OpenMLFlow, but is {type(flow1)}") if not isinstance(flow2, OpenMLFlow): - raise TypeError("Argument 2 must be of type OpenMLFlow, but is %s" % type(flow2)) + raise TypeError(f"Argument 2 must be of type OpenMLFlow, but is {type(flow2)}") # TODO as they are actually now saved during publish, it might be good to # check for the equality of these as well. @@ -522,11 +514,11 @@ def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915 for name in set(attr1.keys()).union(attr2.keys()): if name not in attr1: raise ValueError( - "Component %s only available in " "argument2, but not in argument1." % name, + f"Component {name} only available in " "argument2, but not in argument1.", ) if name not in attr2: raise ValueError( - "Component %s only available in " "argument2, but not in argument1." 
% name, + f"Component {name} only available in " "argument2, but not in argument1.", ) assert_flows_equal( attr1[name], @@ -549,9 +541,9 @@ def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915 symmetric_difference = params_flow_1 ^ params_flow_2 if len(symmetric_difference) > 0: raise ValueError( - "Flow %s: parameter set of flow " + f"Flow {flow1.name}: parameter set of flow " "differs from the parameters stored " - "on the server." % flow1.name, + "on the server.", ) if ignore_parameter_values_on_older_children: diff --git a/openml/runs/functions.py b/openml/runs/functions.py index f7963297d..b6f950020 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -74,8 +74,7 @@ def run_model_on_task( # noqa: PLR0913 ---------- model : sklearn model A model which has a function fit(X,Y) and predict(X), - all supervised estimators of scikit learn follow this definition of a model - (https://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html) + all supervised estimators of scikit learn follow this definition of a model. task : OpenMLTask or int or str Task to perform or Task id. This may be a model instead if the first argument is an OpenMLTask. @@ -199,16 +198,12 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 flow : OpenMLFlow A flow wraps a machine learning model together with relevant information. The model has a function fit(X,Y) and predict(X), - all supervised estimators of scikit learn follow this definition of a model - (https://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html) + all supervised estimators of scikit learn follow this definition of a model. task : OpenMLTask Task to perform. This may be an OpenMLFlow instead if the first argument is an OpenMLTask. avoid_duplicate_runs : bool, optional (default=True) If True, the run will throw an error if the setup/task combination is already present on the server. This feature requires an internet connection. - avoid_duplicate_runs : bool, optional (default=True) - If True, the run will throw an error if the setup/task combination is already present on - the server. This feature requires an internet connection. flow_tags : List[str], optional (default=None) A list of tags that the flow should have at creation. seed: int, optional (default=None) @@ -367,7 +362,7 @@ def get_run_trace(run_id: int) -> OpenMLRunTrace: return OpenMLRunTrace.trace_from_xml(trace_xml) -def initialize_model_from_run(run_id: int) -> Any: +def initialize_model_from_run(run_id: int, *, strict_version: bool = True) -> Any: """ Initialized a model based on a run_id (i.e., using the exact same parameter settings) @@ -376,6 +371,8 @@ def initialize_model_from_run(run_id: int) -> Any: ---------- run_id : int The Openml run_id + strict_version: bool (default=True) + See `flow_to_model` strict_version. 
Returns ------- @@ -385,7 +382,7 @@ def initialize_model_from_run(run_id: int) -> Any: # TODO(eddiebergman): I imagine this is None if it's not published, # might need to raise an explicit error for that assert run.setup_id is not None - return initialize_model(run.setup_id) + return initialize_model(setup_id=run.setup_id, strict_version=strict_version) def initialize_model_from_trace( @@ -679,9 +676,9 @@ def _calculate_local_measure( # type: ignore user_defined_measures_per_fold[measure][rep_no][fold_no] = user_defined_measures_fold[ measure ] - user_defined_measures_per_sample[measure][rep_no][fold_no][ - sample_no - ] = user_defined_measures_fold[measure] + user_defined_measures_per_sample[measure][rep_no][fold_no][sample_no] = ( + user_defined_measures_fold[measure] + ) trace: OpenMLRunTrace | None = None if len(traces) > 0: @@ -783,13 +780,9 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913 raise NotImplementedError(task.task_type) config.logger.info( - "Going to run model {} on dataset {} for repeat {} fold {} sample {}".format( - str(model), - openml.datasets.get_dataset(task.dataset_id).name, - rep_no, - fold_no, - sample_no, - ), + f"Going to run model {model!s} on " + f"dataset {openml.datasets.get_dataset(task.dataset_id).name} " + f"for repeat {rep_no} fold {fold_no} sample {sample_no}" ) ( pred_y, @@ -865,7 +858,7 @@ def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun: # noqa: FBT0 return _create_run_from_xml(run_xml) -def _create_run_from_xml(xml: str, from_server: bool = True) -> OpenMLRun: # noqa: PLR0915, PLR0912, C901, , FBT001, FBT002FBT +def _create_run_from_xml(xml: str, from_server: bool = True) -> OpenMLRun: # noqa: PLR0915, PLR0912, C901, FBT001, FBT002 """Create a run object from xml returned from server. Parameters @@ -978,7 +971,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore else: raise ValueError( 'Could not find keys "value" or ' - '"array_data" in %s' % str(evaluation_dict.keys()), + f'"array_data" in {evaluation_dict.keys()!s}', ) if ( "@repeat" in evaluation_dict @@ -1211,15 +1204,15 @@ def _list_runs( # noqa: PLR0913 for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" if id is not None: - api_call += "/run/%s" % ",".join([str(int(i)) for i in id]) + api_call += "/run/{}".format(",".join([str(int(i)) for i in id])) if task is not None: - api_call += "/task/%s" % ",".join([str(int(i)) for i in task]) + api_call += "/task/{}".format(",".join([str(int(i)) for i in task])) if setup is not None: - api_call += "/setup/%s" % ",".join([str(int(i)) for i in setup]) + api_call += "/setup/{}".format(",".join([str(int(i)) for i in setup])) if flow is not None: - api_call += "/flow/%s" % ",".join([str(int(i)) for i in flow]) + api_call += "/flow/{}".format(",".join([str(int(i)) for i in flow])) if uploader is not None: - api_call += "/uploader/%s" % ",".join([str(int(i)) for i in uploader]) + api_call += "/uploader/{}".format(",".join([str(int(i)) for i in uploader])) if study is not None: api_call += "/study/%d" % study if display_errors: diff --git a/openml/runs/run.py b/openml/runs/run.py index 766f8c97f..945264131 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -480,7 +480,7 @@ def _generate_arff_dict(self) -> OrderedDict[str, Any]: ] else: - raise NotImplementedError("Task type %s is not yet supported." 
% str(task.task_type)) + raise NotImplementedError(f"Task type {task.task_type!s} is not yet supported.") return arff_dict diff --git a/openml/runs/trace.py b/openml/runs/trace.py index 3b7d60c2f..bc9e1b5d6 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -80,8 +80,8 @@ def __post_init__(self) -> None: if self.parameters is not None and not isinstance(self.parameters, dict): raise TypeError( - "argument parameters is not an instance of OrderedDict, but %s" - % str(type(self.parameters)), + f"argument parameters is not an instance of OrderedDict, but" + f" {type(self.parameters)!s}", ) def get_parameters(self) -> dict[str, Any]: @@ -351,7 +351,7 @@ def _trace_from_arff_struct( for required_attribute in REQUIRED_ATTRIBUTES: if required_attribute not in attribute_idx: - raise ValueError("arff misses required attribute: %s" % required_attribute) + raise ValueError(f"arff misses required attribute: {required_attribute}") if "setup_string" in attribute_idx: raise ValueError(error_message) @@ -383,7 +383,7 @@ def _trace_from_arff_struct( else: raise ValueError( 'expected {"true", "false"} value for selected field, ' - "received: %s" % selected_value, + f"received: {selected_value}", ) parameters = { @@ -448,7 +448,7 @@ def trace_from_xml(cls, xml: str | Path | IO) -> OpenMLRunTrace: else: raise ValueError( 'expected {"true", "false"} value for ' - "selected field, received: %s" % selected_value, + f"selected field, received: {selected_value}", ) current = OpenMLTraceIteration( @@ -504,10 +504,8 @@ def merge_traces(cls, traces: list[OpenMLRunTrace]) -> OpenMLRunTrace: if list(param_keys) != list(trace_itr_keys): raise ValueError( "Cannot merge traces because the parameters are not equal: " - "{} vs {}".format( - list(trace_itr.parameters.keys()), - list(iteration.parameters.keys()), - ), + f"{list(trace_itr.parameters.keys())} vs " + f"{list(iteration.parameters.keys())}", ) if key in merged_trace: @@ -521,9 +519,9 @@ def merge_traces(cls, traces: list[OpenMLRunTrace]) -> OpenMLRunTrace: return cls(None, merged_trace) def __repr__(self) -> str: - return "[Run id: {}, {} trace iterations]".format( - -1 if self.run_id is None else self.run_id, - len(self.trace_iterations), + return ( + f"[Run id: {-1 if self.run_id is None else self.run_id}, " + f"{len(self.trace_iterations)} trace iterations]" ) def __iter__(self) -> Iterator[OpenMLTraceIteration]: diff --git a/openml/setups/functions.py b/openml/setups/functions.py index ee0c6d707..877384636 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -212,7 +212,7 @@ def _list_setups( """ api_call = "setup/list" if setup is not None: - api_call += "/setup/%s" % ",".join([str(int(i)) for i in setup]) + api_call += "/setup/{}".format(",".join([str(int(i)) for i in setup])) if kwargs is not None: for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" @@ -230,13 +230,12 @@ def __list_setups( # Minimalistic check if the XML is useful if "oml:setups" not in setups_dict: raise ValueError( - 'Error in return XML, does not contain "oml:setups":' " %s" % str(setups_dict), + 'Error in return XML, does not contain "oml:setups":' f" {setups_dict!s}", ) if "@xmlns:oml" not in setups_dict["oml:setups"]: raise ValueError( - "Error in return XML, does not contain " - '"oml:setups"/@xmlns:oml: %s' % str(setups_dict), + "Error in return XML, does not contain " f'"oml:setups"/@xmlns:oml: {setups_dict!s}', ) if setups_dict["oml:setups"]["@xmlns:oml"] != openml_uri: @@ -266,7 +265,7 @@ def __list_setups( return setups 
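The runs hunk above and the setups hunk that follows thread a new keyword-only `strict_version` flag from `initialize_model_from_run` through `initialize_model` and into `flow_to_model`. A hedged usage sketch; the run and setup ids are placeholders:

```python
import openml

# Rebuild the model behind an existing run while tolerating dependency-version
# differences between the stored flow and the local environment.
model = openml.runs.initialize_model_from_run(run_id=12345, strict_version=False)

# The same flag is available one level down, starting from a setup id.
model = openml.setups.initialize_model(setup_id=67890, strict_version=False)
```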
-def initialize_model(setup_id: int) -> Any: +def initialize_model(setup_id: int, *, strict_version: bool = True) -> Any: """ Initialized a model based on a setup_id (i.e., using the exact same parameter settings) @@ -275,6 +274,8 @@ def initialize_model(setup_id: int) -> Any: ---------- setup_id : int The Openml setup_id + strict_version: bool (default=True) + See `flow_to_model` strict_version. Returns ------- @@ -295,7 +296,7 @@ def initialize_model(setup_id: int) -> Any: subflow = flow subflow.parameters[hyperparameter.parameter_name] = hyperparameter.value - return flow.extension.flow_to_model(flow) + return flow.extension.flow_to_model(flow, strict_version=strict_version) def _to_dict( @@ -364,7 +365,7 @@ def _create_setup_from_xml( else: raise ValueError( "Expected None, list or dict, received " - "something else: %s" % str(type(xml_parameters)), + f"something else: {type(xml_parameters)!s}", ) if _output_format in ["dataframe", "dict"]: diff --git a/openml/study/functions.py b/openml/study/functions.py index 9d726d286..7fdc6f636 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -90,7 +90,7 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy: ) result_dict = xmltodict.parse(xml_string, force_list=force_list_tags)["oml:study"] study_id = int(result_dict["oml:id"]) - alias = result_dict["oml:alias"] if "oml:alias" in result_dict else None + alias = result_dict.get("oml:alias", None) main_entity_type = result_dict["oml:main_entity_type"] if entity_type != main_entity_type: @@ -99,9 +99,7 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy: f", expected '{entity_type}'" ) - benchmark_suite = ( - result_dict["oml:benchmark_suite"] if "oml:benchmark_suite" in result_dict else None - ) + benchmark_suite = result_dict.get("oml:benchmark_suite", None) name = result_dict["oml:name"] description = result_dict["oml:description"] status = result_dict["oml:status"] @@ -300,7 +298,7 @@ def update_study_status(study_id: int, status: str) -> None: """ legal_status = {"active", "deactivated"} if status not in legal_status: - raise ValueError("Illegal status value. " "Legal values: %s" % legal_status) + raise ValueError("Illegal status value. " f"Legal values: {legal_status}") data = {"study_id": study_id, "status": status} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call("study/status/update", "post", data=data) result = xmltodict.parse(result_xml) @@ -442,8 +440,7 @@ def list_suites( status: str | None = ..., uploader: list[int] | None = ..., output_format: Literal["dict"] = "dict", -) -> dict: - ... +) -> dict: ... @overload @@ -453,8 +450,7 @@ def list_suites( status: str | None = ..., uploader: list[int] | None = ..., output_format: Literal["dataframe"] = "dataframe", -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... def list_suites( @@ -538,8 +534,7 @@ def list_studies( uploader: list[str] | None = ..., benchmark_suite: int | None = ..., output_format: Literal["dict"] = "dict", -) -> dict: - ... +) -> dict: ... @overload @@ -550,8 +545,7 @@ def list_studies( uploader: list[str] | None = ..., benchmark_suite: int | None = ..., output_format: Literal["dataframe"] = "dataframe", -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... def list_studies( @@ -637,13 +631,11 @@ def list_studies( @overload -def _list_studies(output_format: Literal["dict"] = "dict", **kwargs: Any) -> dict: - ... +def _list_studies(output_format: Literal["dict"] = "dict", **kwargs: Any) -> dict: ... 
@overload -def _list_studies(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: - ... +def _list_studies(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ... def _list_studies( @@ -674,13 +666,11 @@ def _list_studies( @overload -def __list_studies(api_call: str, output_format: Literal["dict"] = "dict") -> dict: - ... +def __list_studies(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ... @overload -def __list_studies(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: - ... +def __list_studies(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ... def __list_studies( diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 9fd2e4be1..54030422d 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -98,8 +98,9 @@ def _get_estimation_procedure_list() -> list[dict[str, Any]]: raise ValueError( "Error in return XML, value of " "oml:estimationprocedures/@xmlns:oml is not " - "http://openml.org/openml, but %s" - % str(procs_dict["oml:estimationprocedures"]["@xmlns:oml"]), + "http://openml.org/openml, but {}".format( + str(procs_dict["oml:estimationprocedures"]["@xmlns:oml"]) + ), ) procs: list[dict[str, Any]] = [] @@ -276,7 +277,7 @@ def __list_tasks( # noqa: PLR0912, C901 raise ValueError( "Error in return XML, value of " '"oml:runs"/@xmlns:oml is not ' - '"http://openml.org/openml": %s' % str(tasks_dict), + f'"http://openml.org/openml": {tasks_dict!s}', ) assert isinstance(tasks_dict["oml:tasks"]["oml:task"], list), type(tasks_dict["oml:tasks"]) @@ -527,7 +528,7 @@ def _create_task_from_xml(xml: str) -> OpenMLTask: TaskType.LEARNING_CURVE: OpenMLLearningCurveTask, }.get(task_type) if cls is None: - raise NotImplementedError("Task type %s not supported." % common_kwargs["task_type"]) + raise NotImplementedError("Task type {} not supported.".format(common_kwargs["task_type"])) return cls(**common_kwargs) # type: ignore diff --git a/openml/tasks/split.py b/openml/tasks/split.py index 81105f1fd..ac538496e 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -177,9 +177,9 @@ def get(self, repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[np.ndarr If the specified repeat, fold, or sample is not known. """ if repeat not in self.split: - raise ValueError("Repeat %s not known" % str(repeat)) + raise ValueError(f"Repeat {repeat!s} not known") if fold not in self.split[repeat]: - raise ValueError("Fold %s not known" % str(fold)) + raise ValueError(f"Fold {fold!s} not known") if sample not in self.split[repeat][fold]: - raise ValueError("Sample %s not known" % str(sample)) + raise ValueError(f"Sample {sample!s} not known") return self.split[repeat][fold][sample] diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 064b834ba..e7d19bdce 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -207,7 +207,7 @@ def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]: {"@name": "source_data", "#text": str(self.dataset_id)}, {"@name": "estimation_procedure", "#text": str(self.estimation_procedure_id)}, ] - if self.evaluation_measure is not None: # + if self.evaluation_measure is not None: oml_input.append({"@name": "evaluation_measures", "#text": self.evaluation_measure}) return { @@ -283,8 +283,7 @@ def get_X_and_y( ) -> tuple[ np.ndarray | scipy.sparse.spmatrix, np.ndarray | None, - ]: - ... + ]: ... 
@overload def get_X_and_y( @@ -292,8 +291,7 @@ def get_X_and_y( ) -> tuple[ pd.DataFrame, pd.Series | pd.DataFrame | None, - ]: - ... + ]: ... # TODO(eddiebergman): Do all OpenMLSupervisedTask have a `y`? def get_X_and_y( @@ -542,12 +540,10 @@ def __init__( # noqa: PLR0913 def get_X( self, dataset_format: Literal["array"] = "array", - ) -> np.ndarray | scipy.sparse.spmatrix: - ... + ) -> np.ndarray | scipy.sparse.spmatrix: ... @overload - def get_X(self, dataset_format: Literal["dataframe"]) -> pd.DataFrame: - ... + def get_X(self, dataset_format: Literal["dataframe"]) -> pd.DataFrame: ... def get_X( self, diff --git a/openml/testing.py b/openml/testing.py index 529a304d4..9016ff6a9 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -182,7 +182,7 @@ def _get_sentinel(self, sentinel: str | None = None) -> str: md5.update(str(time.time()).encode("utf-8")) md5.update(str(os.getpid()).encode("utf-8")) sentinel = md5.hexdigest()[:10] - sentinel = "TEST%s" % sentinel + sentinel = f"TEST{sentinel}" return sentinel def _add_sentinel_to_flow_name( diff --git a/openml/utils.py b/openml/utils.py index a03610512..82859fd40 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -35,8 +35,7 @@ def extract_xml_tags( node: Mapping[str, Any], *, allow_none: Literal[True] = ..., -) -> Any | None: - ... +) -> Any | None: ... @overload @@ -45,8 +44,7 @@ def extract_xml_tags( node: Mapping[str, Any], *, allow_none: Literal[False], -) -> Any: - ... +) -> Any: ... def extract_xml_tags( @@ -198,7 +196,7 @@ def _delete_entity(entity_type: str, entity_id: int) -> bool: "user", } if entity_type not in legal_entities: - raise ValueError("Can't delete a %s" % entity_type) + raise ValueError(f"Can't delete a {entity_type}") url_suffix = "%s/%d" % (entity_type, entity_id) try: @@ -236,7 +234,7 @@ def _delete_entity(entity_type: str, entity_id: int) -> bool: " please open an issue at: https://github.com/openml/openml/issues/new" ), ) from e - raise + raise e @overload @@ -245,8 +243,7 @@ def _list_all( list_output_format: Literal["dict"] = ..., *args: P.args, **filters: P.kwargs, -) -> dict: - ... +) -> dict: ... @overload @@ -255,8 +252,7 @@ def _list_all( list_output_format: Literal["object"], *args: P.args, **filters: P.kwargs, -) -> dict: - ... +) -> dict: ... @overload @@ -265,8 +261,7 @@ def _list_all( list_output_format: Literal["dataframe"], *args: P.args, **filters: P.kwargs, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... def _list_all( # noqa: C901, PLR0912 @@ -376,7 +371,7 @@ def _create_cache_directory(key: str) -> Path: try: cache_dir.mkdir(exist_ok=True, parents=True) - except Exception as e: # noqa: BLE001 + except Exception as e: raise openml.exceptions.OpenMLCacheException( f"Cannot create cache directory {cache_dir}." ) from e @@ -412,7 +407,7 @@ def _create_cache_directory_for_id(key: str, id_: int) -> Path: """ cache_dir = _get_cache_dir_for_id(key, id_, create=True) if cache_dir.exists() and not cache_dir.is_dir(): - raise ValueError("%s cache dir exists but is not a directory!" 
% key) + raise ValueError(f"{key} cache dir exists but is not a directory!") cache_dir.mkdir(exist_ok=True, parents=True) return cache_dir diff --git a/pyproject.toml b/pyproject.toml index ffb1eb001..83f0793f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,7 +86,6 @@ examples=[ "ipykernel", "seaborn", ] -examples_unix=["fanova"] docs=[ "sphinx>=3", "sphinx-gallery", @@ -127,12 +126,79 @@ markers = [ # https://github.com/charliermarsh/ruff [tool.ruff] -target-version = "py37" +target-version = "py38" line-length = 100 -show-source = true +output-format = "grouped" src = ["openml", "tests", "examples"] unsafe-fixes = true +exclude = [ + # TODO(eddiebergman): Tests should be re-enabled after the refactor + "tests", + # + ".bzr", + ".direnv", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv", + "docs", +] + +# Exclude a variety of commonly ignored directories. +[tool.ruff.lint.per-file-ignores] +"tests/*.py" = [ + "D100", # Undocumented public module + "D101", # Missing docstring in public class + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "S101", # Use of assert + "ANN201", # Missing return type annotation for public function + "FBT001", # Positional boolean argument + "PLR2004",# No use of magic numbers + "PD901", # X is a bad variable name. (pandas) + "TCH", # https://docs.astral.sh/ruff/rules/#flake8-type-checking-tch + "N803", # Argument name {name} should be lowercase +] +"openml/cli.py" = [ + "T201", # print found + "T203", # pprint found +] +"openml/__version__.py" = [ + "D100", # Undocumented public module +] +"__init__.py" = [ + "I002", # Missing required import (i.e. from __future__ import annotations) +] +"examples/*.py" = [ + "D101", # Missing docstring in public class + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D415", # First line should end with a . or ? or ! + "INP001", # File is part of an implicit namespace package, add an __init__.py + "I002", # Missing required import (i.e. from __future__ import annotations) + "E741", # Ambigiuous variable name + "T201", # print found + "T203", # pprint found + "ERA001", # found commeneted out code + "E402", # Module level import not at top of cell + "E501", # Line too long +] + +[tool.ruff.lint] # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" @@ -212,74 +278,9 @@ ignore = [ "N802", # Public function name should be lower case (i.e. get_X()) ] -exclude = [ - # TODO(eddiebergman): Tests should be re-enabled after the refactor - "tests", - # - ".bzr", - ".direnv", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".nox", - ".pants.d", - ".ruff_cache", - ".svn", - ".tox", - ".venv", - "__pypackages__", - "_build", - "buck-out", - "build", - "dist", - "node_modules", - "venv", - "docs", -] - -# Exclude a variety of commonly ignored directories. -[tool.ruff.per-file-ignores] -"tests/*.py" = [ - "D100", # Undocumented public module - "D101", # Missing docstring in public class - "D102", # Missing docstring in public method - "D103", # Missing docstring in public function - "S101", # Use of assert - "ANN201", # Missing return type annotation for public function - "FBT001", # Positional boolean argument - "PLR2004",# No use of magic numbers - "PD901", # X is a bad variable name. 
(pandas) - "TCH", # https://docs.astral.sh/ruff/rules/#flake8-type-checking-tch - "N803", # Argument name {name} should be lowercase -] -"openml/cli.py" = [ - "T201", # print found - "T203", # pprint found -] -"openml/__version__.py" = [ - "D100", # Undocumented public module -] -"__init__.py" = [ - "I002", # Missing required import (i.e. from __future__ import annotations) -] -"examples/*.py" = [ - "D101", # Missing docstring in public class - "D102", # Missing docstring in public method - "D103", # Missing docstring in public function - "D415", # First line should end with a . or ? or ! - "INP001", # File is part of an implicit namespace package, add an __init__.py - "I002", # Missing required import (i.e. from __future__ import annotations) - "E741", # Ambigiuous variable name - "T201", # print found - "T203", # pprint found - "ERA001", # found commeneted out code - "E402", # Module level import not at top of cell - "E501", # Line too long -] -[tool.ruff.isort] +[tool.ruff.lint.isort] known-first-party = ["openml"] no-lines-before = ["future"] required-imports = ["from __future__ import annotations"] @@ -287,11 +288,11 @@ combine-as-imports = true extra-standard-library = ["typing_extensions"] force-wrap-aliases = true -[tool.ruff.pydocstyle] +[tool.ruff.lint.pydocstyle] convention = "numpy" [tool.mypy] -python_version = "3.7" +python_version = "3.8" packages = ["openml", "tests"] show_error_codes = true diff --git a/tests/conftest.py b/tests/conftest.py index 62fe3c7e8..79ee2bbd3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,8 +23,10 @@ # License: BSD 3-Clause from __future__ import annotations +from collections.abc import Iterator import logging import os +import shutil from pathlib import Path import pytest @@ -164,6 +166,15 @@ def pytest_sessionfinish() -> None: # Local file deletion new_file_list = read_file_list() compare_delete_files(file_list, new_file_list) + + # Delete any test dirs that remain + # In edge cases due to a mixture of pytest parametrization and oslo concurrency, + # some file lock are created after leaving the test. This removes these files! 
+ test_files_dir=Path(__file__).parent.parent / "openml" + for f in test_files_dir.glob("tests.*"): + if f.is_dir(): + shutil.rmtree(f) + logger.info("Local files deleted") logger.info(f"{worker} is killed") @@ -185,55 +196,90 @@ def pytest_addoption(parser): def _expected_static_cache_state(root_dir: Path) -> list[Path]: _c_root_dir = root_dir / "org" / "openml" / "test" res_paths = [root_dir, _c_root_dir] - + for _d in ["datasets", "tasks", "runs", "setups"]: res_paths.append(_c_root_dir / _d) - for _id in ["-1","2"]: + for _id in ["-1", "2"]: tmp_p = _c_root_dir / "datasets" / _id - res_paths.extend([ - tmp_p / "dataset.arff", - tmp_p / "features.xml", - tmp_p / "qualities.xml", - tmp_p / "description.xml", - ]) + res_paths.extend( + [ + tmp_p / "dataset.arff", + tmp_p / "features.xml", + tmp_p / "qualities.xml", + tmp_p / "description.xml", + ] + ) res_paths.append(_c_root_dir / "datasets" / "30" / "dataset_30.pq") res_paths.append(_c_root_dir / "runs" / "1" / "description.xml") res_paths.append(_c_root_dir / "setups" / "1" / "description.xml") - + for _id in ["1", "3", "1882"]: tmp_p = _c_root_dir / "tasks" / _id - res_paths.extend([ - tmp_p / "datasplits.arff", - tmp_p / "task.xml", - ]) - + res_paths.extend( + [ + tmp_p / "datasplits.arff", + tmp_p / "task.xml", + ] + ) + return res_paths def assert_static_test_cache_correct(root_dir: Path) -> None: for p in _expected_static_cache_state(root_dir): - assert p.exists(), f"Expected path {p} does not exist" - + assert p.exists(), f"Expected path {p} exists" + @pytest.fixture(scope="class") def long_version(request): request.cls.long_version = request.config.getoption("--long") -@pytest.fixture() +@pytest.fixture(scope="session") def test_files_directory() -> Path: return Path(__file__).parent / "files" -@pytest.fixture() +@pytest.fixture(scope="session") def test_api_key() -> str: return "c0c42819af31e706efe1f4b88c23c6c1" -@pytest.fixture(autouse=True) -def verify_cache_state(test_files_directory) -> None: +@pytest.fixture(autouse=True, scope="function") +def verify_cache_state(test_files_directory) -> Iterator[None]: assert_static_test_cache_correct(test_files_directory) yield assert_static_test_cache_correct(test_files_directory) + + +@pytest.fixture(autouse=True, scope="session") +def as_robot() -> Iterator[None]: + policy = openml.config.retry_policy + n_retries = openml.config.connection_n_retries + openml.config.set_retry_policy("robot", n_retries=20) + yield + openml.config.set_retry_policy(policy, n_retries) + + +@pytest.fixture(autouse=True, scope="session") +def with_test_server(): + openml.config.start_using_configuration_for_example() + yield + openml.config.stop_using_configuration_for_example() + + +@pytest.fixture(autouse=True) +def with_test_cache(test_files_directory, request): + if not test_files_directory.exists(): + raise ValueError( + f"Cannot find test cache dir, expected it to be {test_files_directory!s}!", + ) + _root_cache_directory = openml.config._root_cache_directory + tmp_cache = test_files_directory / request.node.name + openml.config.set_root_cache_directory(tmp_cache) + yield + openml.config.set_root_cache_directory(_root_cache_directory) + if tmp_cache.exists(): + shutil.rmtree(tmp_cache) diff --git a/tests/files/misc/features_with_whitespaces.xml b/tests/files/misc/features_with_whitespaces.xml new file mode 100644 index 000000000..2b542d167 --- /dev/null +++ b/tests/files/misc/features_with_whitespaces.xml @@ -0,0 +1,22 @@ + + + 0 + V1 + numeric + false + false + false + 0 + + + 1 + V42 + nominal + - 
50000. + 50000+. + false + false + false + 0 + + diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 80da9c842..4598b8985 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -309,6 +309,10 @@ def test_lazy_loading_metadata(self): assert _dataset.features == _compare_dataset.features assert _dataset.qualities == _compare_dataset.qualities + def test_equality_comparison(self): + self.assertEqual(self.iris, self.iris) + self.assertNotEqual(self.iris, self.titanic) + self.assertNotEqual(self.titanic, 'Wrong_object') class OpenMLDatasetTestOnTestServer(TestBase): def setUp(self): diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 47e97496d..a15100070 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -43,6 +43,7 @@ OpenMLNotAuthorizedError, OpenMLPrivateDatasetError, OpenMLServerException, + OpenMLServerNoResult, ) from openml.tasks import TaskType, create_task from openml.testing import TestBase, create_request_response @@ -274,9 +275,7 @@ def test_get_dataset_cannot_access_private_data(self): @pytest.mark.skip("Need to find dataset name of private dataset") def test_dataset_by_name_cannot_access_private_data(self): openml.config.server = self.production_server - self.assertRaises( - OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE" - ) + self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE") def test_get_dataset_lazy_all_functions(self): """Test that all expected functionality is available without downloading the dataset.""" @@ -285,9 +284,7 @@ def test_get_dataset_lazy_all_functions(self): def ensure_absence_of_real_data(): assert not os.path.exists( - os.path.join( - openml.config.get_cache_directory(), "datasets", "1", "dataset.arff" - ) + os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset.arff") ) tag = "test_lazy_tag_%d" % random.randint(1, 1000000) @@ -509,12 +506,8 @@ def test_deletion_of_cache_dir(self): @mock.patch("openml.datasets.functions._get_dataset_description") def test_deletion_of_cache_dir_faulty_download(self, patch): patch.side_effect = Exception("Boom!") - self.assertRaisesRegex( - Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1 - ) - datasets_cache_dir = os.path.join( - self.workdir, "org", "openml", "test", "datasets" - ) + self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1) + datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets") assert len(os.listdir(datasets_cache_dir)) == 0 def test_publish_dataset(self): @@ -555,9 +548,7 @@ def test__retrieve_class_labels(self): # Test workaround for string-typed class labels custom_ds = openml.datasets.get_dataset(2) custom_ds.features[31].data_type = "string" - labels = custom_ds.retrieve_class_labels( - target_name=custom_ds.features[31].name - ) + labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name) assert labels == ["COIL", "SHEET"] def test_upload_dataset_with_url(self): @@ -600,9 +591,7 @@ def test_data_status(self): ) dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) - ) + TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) did = dataset.id # admin key for test server (only 
adminds can activate datasets. @@ -678,8 +667,7 @@ def test_attributes_arff_from_df_unknown_dtype(self): for arr, dt in zip(data, dtype): df = pd.DataFrame(arr) err_msg = ( - f"The dtype '{dt}' of the column '0' is not currently " - "supported by liac-arff" + f"The dtype '{dt}' of the column '0' is not currently " "supported by liac-arff" ) with pytest.raises(ValueError, match=err_msg): attributes_arff_from_df(df) @@ -710,16 +698,12 @@ def test_create_dataset_numpy(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) - ) + TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) assert ( _get_online_dataset_arff(dataset.id) == dataset._dataset ), "Uploaded arff does not match original one" - assert ( - _get_online_dataset_format(dataset.id) == "arff" - ), "Wrong format for dataset" + assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" def test_create_dataset_list(self): data = [ @@ -769,15 +753,11 @@ def test_create_dataset_list(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) - ) + TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) assert ( _get_online_dataset_arff(dataset.id) == dataset._dataset ), "Uploaded ARFF does not match original one" - assert ( - _get_online_dataset_format(dataset.id) == "arff" - ), "Wrong format for dataset" + assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" def test_create_dataset_sparse(self): # test the scipy.sparse.coo_matrix @@ -974,9 +954,7 @@ def test_create_dataset_pandas(self): ) dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) - ) + TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) assert ( _get_online_dataset_arff(dataset.id) == dataset._dataset ), "Uploaded ARFF does not match original one" @@ -991,9 +969,7 @@ def test_create_dataset_pandas(self): column_names = ["input1", "input2", "y"] df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names) # meta-information - description = ( - "Synthetic dataset created from a Pandas DataFrame with Sparse columns" - ) + description = "Synthetic dataset created from a Pandas DataFrame with Sparse columns" dataset = openml.datasets.functions.create_dataset( name=name, description=description, @@ -1014,15 +990,11 @@ def test_create_dataset_pandas(self): ) dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) - ) + TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) assert ( _get_online_dataset_arff(dataset.id) == dataset._dataset ), "Uploaded ARFF does not match original one" - assert ( - _get_online_dataset_format(dataset.id) == "sparse_arff" - ), "Wrong format for dataset" + assert _get_online_dataset_format(dataset.id) == "sparse_arff", "Wrong format for dataset" # Check that we can overwrite the attributes data = [["a"], ["b"], ["c"], ["d"], ["e"]] @@ -1050,13 +1022,9 @@ def test_create_dataset_pandas(self): ) dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info( - "collected 
from {}: {}".format(__file__.split("/")[-1], dataset.id) - ) + TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) downloaded_data = _get_online_dataset_arff(dataset.id) - assert ( - downloaded_data == dataset._dataset - ), "Uploaded ARFF does not match original one" + assert downloaded_data == dataset._dataset, "Uploaded ARFF does not match original one" assert "@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data def test_ignore_attributes_dataset(self): @@ -1217,9 +1185,7 @@ def test_publish_fetch_ignore_attribute(self): # publish dataset dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info( - "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) - ) + TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) # test if publish was successful assert isinstance(dataset.id, int) @@ -1403,9 +1369,7 @@ def test_get_dataset_cache_format_feather(self): cache_dir = openml.config.get_cache_directory() cache_dir_for_id = os.path.join(cache_dir, "datasets", "128") feather_file = os.path.join(cache_dir_for_id, "dataset.feather") - pickle_file = os.path.join( - cache_dir_for_id, "dataset.feather.attributes.pkl.py3" - ) + pickle_file = os.path.join(cache_dir_for_id, "dataset.feather.attributes.pkl.py3") data = pd.read_feather(feather_file) assert os.path.isfile(feather_file), "Feather file is missing" assert os.path.isfile(pickle_file), "Attributes pickle file is missing" @@ -1450,9 +1414,7 @@ def test_data_edit_critical_field(self): # for this, we need to first clone a dataset to do changes did = fork_dataset(1) self._wait_for_dataset_being_processed(did) - result = edit_dataset( - did, default_target_attribute="shape", ignore_attribute="oil" - ) + result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil") assert did == result n_tries = 10 @@ -1460,9 +1422,7 @@ def test_data_edit_critical_field(self): for i in range(n_tries): edited_dataset = openml.datasets.get_dataset(did) try: - assert ( - edited_dataset.default_target_attribute == "shape" - ), edited_dataset + assert edited_dataset.default_target_attribute == "shape", edited_dataset assert edited_dataset.ignore_attribute == ["oil"], edited_dataset break except AssertionError as e: @@ -1471,9 +1431,7 @@ def test_data_edit_critical_field(self): time.sleep(10) # Delete the cache dir to get the newer version of the dataset shutil.rmtree( - os.path.join( - self.workdir, "org", "openml", "test", "datasets", str(did) - ), + os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)), ) def test_data_edit_requires_field(self): @@ -1564,9 +1522,7 @@ def test_list_datasets_with_high_size_parameter(self): openml.config.server = self.production_server datasets_a = openml.datasets.list_datasets(output_format="dataframe") - datasets_b = openml.datasets.list_datasets( - output_format="dataframe", size=np.inf - ) + datasets_b = openml.datasets.list_datasets(output_format="dataframe", size=np.inf) # Reverting to test server openml.config.server = self.test_server @@ -1646,9 +1602,7 @@ def test_invalid_attribute_validations( (None, None, ["outlook", "windy"]), ], ) -def test_valid_attribute_validations( - default_target_attribute, row_id_attribute, ignore_attribute -): +def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], ["b", "sunny", 80.0, 90.0, "TRUE", "no"], @@ -1749,10 +1703,7 @@ def 
test_delete_dataset(self): def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = ( - test_files_directory - / "mock_responses" - / "datasets" - / "data_delete_not_owned.xml" + test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml" ) mock_delete.return_value = create_request_response( status_code=412, @@ -1774,10 +1725,7 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_ke def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = ( - test_files_directory - / "mock_responses" - / "datasets" - / "data_delete_has_tasks.xml" + test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml" ) mock_delete.return_value = create_request_response( status_code=412, @@ -1799,10 +1747,7 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = ( - test_files_directory - / "mock_responses" - / "datasets" - / "data_delete_successful.xml" + test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml" ) mock_delete.return_value = create_request_response( status_code=200, @@ -1821,10 +1766,7 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key) def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = ( - test_files_directory - / "mock_responses" - / "datasets" - / "data_delete_not_exist.xml" + test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml" ) mock_delete.return_value = create_request_response( status_code=412, @@ -1861,9 +1803,7 @@ def test_list_datasets(all_datasets: pd.DataFrame): def test_list_datasets_by_tag(all_datasets: pd.DataFrame): - tag_datasets = openml.datasets.list_datasets( - tag="study_14", output_format="dataframe" - ) + tag_datasets = openml.datasets.list_datasets(tag="study_14", output_format="dataframe") assert 0 < len(tag_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(tag_datasets) @@ -2001,15 +1941,22 @@ def test_get_dataset_lazy_behavior( with_features=with_features, with_data=with_data, ) - assert ( - dataset.features - ), "Features should be downloaded on-demand if not during get_dataset" - assert ( - dataset.qualities - ), "Qualities should be downloaded on-demand if not during get_dataset" - assert ( - dataset.get_data() - ), "Data should be downloaded on-demand if not during get_dataset" + assert dataset.features, "Features should be downloaded on-demand if not during get_dataset" + assert dataset.qualities, "Qualities should be downloaded on-demand if not during get_dataset" + assert dataset.get_data(), "Data should be downloaded on-demand if not during get_dataset" _assert_datasets_retrieved_successfully( [1], with_qualities=True, with_features=True, with_data=True ) + + +def test_get_dataset_with_invalid_id() -> None: + INVALID_ID = 123819023109238 # Well, at some point this will probably be valid... 
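The new regression test just below (`test_read_features_from_xml_with_whitespace`) pairs with the `strip_whitespace=False` argument added to `_parse_features_xml` earlier in this diff: both protect nominal values whose labels begin or end with spaces. A small illustration of the xmltodict behaviour being worked around (element names shortened for brevity):

```python
import xmltodict

xml = "<f><v> - 50000.</v></f>"

stripped = xmltodict.parse(xml)                           # default: strip_whitespace=True
preserved = xmltodict.parse(xml, strip_whitespace=False)

print(repr(stripped["f"]["v"]))   # '- 50000.'  -> leading space silently dropped
print(repr(preserved["f"]["v"]))  # ' - 50000.' -> label preserved exactly as stored
```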
+ with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as e: + openml.datasets.get_dataset(INVALID_ID) + assert e.value.code == 111 + +def test_read_features_from_xml_with_whitespace() -> None: + from openml.datasets.dataset import _read_features + features_file = Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml" + dict = _read_features(features_file) + assert dict[1].nominal_values == [" - 50000.", " 50000+."] diff --git a/tests/test_evaluations/test_evaluations_example.py b/tests/test_evaluations/test_evaluations_example.py index bf5b03f3f..a0980f5f9 100644 --- a/tests/test_evaluations/test_evaluations_example.py +++ b/tests/test_evaluations/test_evaluations_example.py @@ -3,35 +3,47 @@ import unittest +from openml.config import overwrite_config_context + class TestEvaluationsExample(unittest.TestCase): def test_example_python_paper(self): # Example script which will appear in the upcoming OpenML-Python paper # This test ensures that the example will keep running! - - import matplotlib.pyplot as plt - import numpy as np - - import openml - - df = openml.evaluations.list_evaluations_setups( - "predictive_accuracy", - flows=[8353], - tasks=[6], - output_format="dataframe", - parameters_in_separate_columns=True, - ) # Choose an SVM flow, for example 8353, and a task. - - hp_names = ["sklearn.svm.classes.SVC(16)_C", "sklearn.svm.classes.SVC(16)_gamma"] - df[hp_names] = df[hp_names].astype(float).apply(np.log) - C, gamma, score = df[hp_names[0]], df[hp_names[1]], df["value"] - - cntr = plt.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r") - plt.colorbar(cntr, label="accuracy") - plt.xlim((min(C), max(C))) - plt.ylim((min(gamma), max(gamma))) - plt.xlabel("C (log10)", size=16) - plt.ylabel("gamma (log10)", size=16) - plt.title("SVM performance landscape", size=20) - - plt.tight_layout() + with overwrite_config_context( + { + "server": "https://www.openml.org/api/v1/xml", + "apikey": None, + } + ): + import matplotlib.pyplot as plt + import numpy as np + + import openml + + df = openml.evaluations.list_evaluations_setups( + "predictive_accuracy", + flows=[8353], + tasks=[6], + output_format="dataframe", + parameters_in_separate_columns=True, + ) # Choose an SVM flow, for example 8353, and a task. + + assert len(df) > 0, ( + "No evaluation found for flow 8353 on task 6, could " + "be that this task is not available on the test server." 
+            )
+
+            hp_names = ["sklearn.svm.classes.SVC(16)_C", "sklearn.svm.classes.SVC(16)_gamma"]
+            df[hp_names] = df[hp_names].astype(float).apply(np.log)
+            C, gamma, score = df[hp_names[0]], df[hp_names[1]], df["value"]
+
+            cntr = plt.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r")
+            plt.colorbar(cntr, label="accuracy")
+            plt.xlim((min(C), max(C)))
+            plt.ylim((min(gamma), max(gamma)))
+            plt.xlabel("C (log10)", size=16)
+            plt.ylabel("gamma (log10)", size=16)
+            plt.title("SVM performance landscape", size=20)
+
+            plt.tight_layout()
diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py
index c6df73e0a..51123b0d8 100644
--- a/tests/test_openml/test_api_calls.py
+++ b/tests/test_openml/test_api_calls.py
@@ -9,8 +9,9 @@
import pytest

import openml
+from openml.config import ConfigurationForExamples
import openml.testing
-from openml._api_calls import _download_minio_bucket
+from openml._api_calls import _download_minio_bucket, API_TOKEN_HELP_LINK


class TestConfig(openml.testing.TestBase):
@@ -36,8 +37,12 @@ def test_retry_on_database_error(self, Session_class_mock, _):
        assert Session_class_mock.return_value.__enter__.return_value.get.call_count == 20

+
class FakeObject(NamedTuple):
    object_name: str
+    etag: str
+    """We use the etag of a Minio object as the name of a marker file that records we already downloaded it."""
+

class FakeMinio:
    def __init__(self, objects: Iterable[FakeObject] | None = None):
@@ -60,7 +65,7 @@ def test_download_all_files_observes_cache(mock_minio, tmp_path: Path) -> None:
    some_url = f"https://not.real.com/bucket/{some_object_path}"
    mock_minio.return_value = FakeMinio(
        objects=[
-            FakeObject(some_object_path),
+            FakeObject(object_name=some_object_path, etag=str(hash(some_object_path))),
        ],
    )
@@ -71,3 +76,50 @@ def test_download_all_files_observes_cache(mock_minio, tmp_path: Path) -> None:
    time_modified = (tmp_path / some_filename).stat().st_mtime

    assert time_created == time_modified
+
+
+@mock.patch.object(minio, "Minio")
+def test_download_minio_failure(mock_minio, tmp_path: Path) -> None:
+    some_prefix, some_filename = "some/prefix", "dataset.arff"
+    some_object_path = f"{some_prefix}/{some_filename}"
+    some_url = f"https://not.real.com/bucket/{some_object_path}"
+    mock_minio.return_value = FakeMinio(
+        objects=[
+            FakeObject(object_name=None, etag="tmp"),
+        ],
+    )
+
+    with pytest.raises(ValueError):
+        _download_minio_bucket(source=some_url, destination=tmp_path)
+
+    mock_minio.return_value = FakeMinio(
+        objects=[
+            FakeObject(object_name="tmp", etag=None),
+        ],
+    )
+
+    with pytest.raises(ValueError):
+        _download_minio_bucket(source=some_url, destination=tmp_path)
+
+
+@pytest.mark.parametrize(
+    "endpoint, method",
+    [
+        # https://github.com/openml/OpenML/blob/develop/openml_OS/views/pages/api_new/v1/xml/pre.php
+        ("flow/exists", "post"),  # 102
+        ("dataset", "post"),  # 137
+        ("dataset/42", "delete"),  # 350
+        # ("flow/owned", "post"),  # 310 - Couldn't find what would trigger this
+        ("flow/42", "delete"),  # 320
+        ("run/42", "delete"),  # 400
+        ("task/42", "delete"),  # 460
+    ],
+)
+def test_authentication_endpoints_requiring_api_key_show_relevant_help_link(
+    endpoint: str,
+    method: str,
+) -> None:
+    # We need to temporarily disable the API key to test the error message
+    with openml.config.overwrite_config_context({"apikey": None}):
+        with pytest.raises(openml.exceptions.OpenMLNotAuthorizedError, match=API_TOKEN_HELP_LINK):
+            openml._api_calls._perform_api_call(call=endpoint, request_method=method, data=None)
diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py
index a92cd0cfd..f9ab5eb9f 100644
--- a/tests/test_openml/test_config.py
+++ b/tests/test_openml/test_config.py
@@ -1,11 +1,14 @@
# License: BSD 3-Clause
from __future__ import annotations

+from contextlib import contextmanager
import os
import tempfile
import unittest.mock
from copy import copy
+from typing import Any, Iterator
from pathlib import Path
+import platform

import pytest

@@ -13,10 +16,32 @@
import openml.testing


+@contextmanager
+def safe_environ_patcher(key: str, value: Any) -> Iterator[None]:
+    """Context manager to temporarily set an environment variable.
+
+    The previous value is restored even if the code inside the context raises.
+    """
+    _prev = os.environ.get(key)
+    os.environ[key] = value
+    try:
+        yield
+    except Exception as e:
+        raise e
+    finally:
+        os.environ.pop(key)
+        if _prev is not None:
+            os.environ[key] = _prev
+
+
class TestConfig(openml.testing.TestBase):
    @unittest.mock.patch("openml.config.openml_logger.warning")
    @unittest.mock.patch("openml.config._create_log_handlers")
    @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033")
+    @unittest.skipIf(
+        platform.uname().release.endswith(("-Microsoft", "microsoft-standard-WSL2")),
+        "WSL does not support chmod as we would need here, see https://github.com/microsoft/WSL/issues/81",
+    )
    def test_non_writable_home(self, log_handler_mock, warnings_mock):
        with tempfile.TemporaryDirectory(dir=self.workdir) as td:
            os.chmod(td, 0o444)
@@ -24,20 +49,28 @@ def test_non_writable_home(self, log_handler_mock, warnings_mock):
            _dd["cachedir"] = Path(td) / "something-else"
            openml.config._setup(_dd)

-        assert warnings_mock.call_count == 2
+        assert warnings_mock.call_count == 1
        assert log_handler_mock.call_count == 1
        assert not log_handler_mock.call_args_list[0][1]["create_file_handler"]
        assert openml.config._root_cache_directory == Path(td) / "something-else"

-    @unittest.mock.patch("os.path.expanduser")
-    def test_XDG_directories_do_not_exist(self, expanduser_mock):
+    @unittest.skipIf(platform.system() != "Linux", "XDG only exists for Linux systems.")
+    def test_XDG_directories_do_not_exist(self):
        with tempfile.TemporaryDirectory(dir=self.workdir) as td:
+            # Use a fake XDG_CONFIG_HOME that does not exist yet
+            path = Path(td) / "fake_xdg_cache_home"
+            with safe_environ_patcher("XDG_CONFIG_HOME", str(path)):
+                expected_config_dir = path / "openml"
+                expected_determined_config_file_path = expected_config_dir / "config"

-            def side_effect(path_):
-                return os.path.join(td, str(path_).replace("~/", ""))
+                # Ensure that it correctly determines the path to the config file
+                determined_config_file_path = openml.config.determine_config_file_path()
+                assert determined_config_file_path == expected_determined_config_file_path

-            expanduser_mock.side_effect = side_effect
-            openml.config._setup()
+                # Ensure that setup will create the config folder as the configuration
+                # will be written to that location.
+                openml.config._setup()
+                assert expected_config_dir.exists()

    def test_get_config_as_dict(self):
        """Checks if the current configuration is returned accurately as a dict."""
@@ -121,7 +154,7 @@ def test_example_configuration_start_twice(self):


def test_configuration_file_not_overwritten_on_load():
-    """ Regression test for #1337 """
+    """Regression test for #1337"""
    config_file_content = "apikey = abcd"
    with tempfile.TemporaryDirectory() as tmpdir:
        config_file_path = Path(tmpdir) / "config"
@@ -136,12 +169,22 @@ def test_configuration_file_not_overwritten_on_load():
    assert config_file_content == new_file_content
    assert "abcd" == read_config["apikey"]

+
def test_configuration_loads_booleans(tmp_path):
    config_file_content = "avoid_duplicate_runs=true\nshow_progress=false"
-    with (tmp_path/"config").open("w") as config_file:
+    with (tmp_path / "config").open("w") as config_file:
        config_file.write(config_file_content)
    read_config = openml.config._parse_config(tmp_path)

    # Explicit test to avoid truthy/falsy modes of other types
    assert True == read_config["avoid_duplicate_runs"]
    assert False == read_config["show_progress"]
+
+
+def test_openml_cache_dir_env_var(tmp_path: Path) -> None:
+    expected_path = tmp_path / "test-cache"
+
+    with safe_environ_patcher("OPENML_CACHE_DIR", str(expected_path)):
+        openml.config._setup()
+        assert openml.config._root_cache_directory == expected_path
+        assert openml.config.get_cache_directory() == str(expected_path / "org" / "openml" / "www")
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 40a778d8b..2bd9ee0ed 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -119,7 +119,6 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
        # time.time() works in seconds
        start_time = time.time()
        while time.time() - start_time < max_waiting_time_seconds:
-
            try:
                openml.runs.get_run_trace(run_id)
            except openml.exceptions.OpenMLServerException:
@@ -131,7 +130,9 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
                time.sleep(10)
                continue

-            assert len(run.evaluations) > 0, "Expect not-None evaluations to always contain elements."
+            assert (
+                len(run.evaluations) > 0
+            ), "Expect not-None evaluations to always contain elements."
            return

        raise RuntimeError(
@@ -557,7 +558,7 @@ def determine_grid_size(param_grid):
            fold_evaluations=run.fold_evaluations,
            num_repeats=1,
            num_folds=num_folds,
-            task_type=task_type
+            task_type=task_type,
        )

        # Check if run string and print representation do not run into an error
@@ -796,7 +797,9 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):

    @pytest.mark.sklearn()
    def test_run_and_upload_gridsearch(self):
-        estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+        estimator_name = (
+            "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+        )
        gridsearch = GridSearchCV(
            BaggingClassifier(**{estimator_name: SVC()}),
            {f"{estimator_name}__C": [0.01, 0.1, 10], f"{estimator_name}__gamma": [0.01, 0.1, 10]},
@@ -1826,7 +1829,9 @@ def test_joblib_backends(self, parallel_mock):
        num_instances = x.shape[0]
        line_length = 6 + len(task.class_labels)

-        backend_choice = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+        backend_choice = (
+            "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+        )
        for n_jobs, backend, call_count in [
            (1, backend_choice, 10),
            (2, backend_choice, 10),
@@ -1877,20 +1882,39 @@ def test_joblib_backends(self, parallel_mock):
        reason="SimpleImputer doesn't handle mixed type DataFrame as input",
    )
    def test_delete_run(self):
-        rs = 1
+        rs = np.random.randint(1, 2**31 - 1)
        clf = sklearn.pipeline.Pipeline(
-            steps=[("imputer", SimpleImputer()), ("estimator", DecisionTreeClassifier())],
+            steps=[
+                (f"test_server_imputer_{rs}", SimpleImputer()),
+                ("estimator", DecisionTreeClassifier()),
+            ],
        )
        task = openml.tasks.get_task(32)  # diabetes; crossvalidation

-        run = openml.runs.run_model_on_task(model=clf, task=task, seed=rs)
+        run = openml.runs.run_model_on_task(
+            model=clf, task=task, seed=rs, avoid_duplicate_runs=False
+        )
        run.publish()
+
+        with pytest.raises(openml.exceptions.OpenMLRunsExistError):
+            openml.runs.run_model_on_task(model=clf, task=task, seed=rs, avoid_duplicate_runs=True)
+
        TestBase._mark_entity_for_removal("run", run.run_id)
        TestBase.logger.info(f"collected from test_run_functions: {run.run_id}")

        _run_id = run.run_id
        assert delete_run(_run_id)

+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="SimpleImputer doesn't handle mixed type DataFrame as input",
+    )
+    def test_initialize_model_from_run_nonstrict(self):
+        # We cannot guarantee that a run created with an older version exists on the server.
+        # Thus, we simply test with a run that we know exists, even if it does not strictly
+        # require loose version matching. This exercises the OpenML code paths, but not the
+        # model initialization itself, which we do not want to guarantee anyhow.
+        _ = openml.runs.initialize_model_from_run(run_id=1, strict_version=False)
+

@mock.patch.object(requests.Session, "delete")
def test_delete_run_not_owned(mock_delete, test_files_directory, test_api_key):
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
index 9e357f6aa..259cb98b4 100644
--- a/tests/test_setups/test_setup_functions.py
+++ b/tests/test_setups/test_setup_functions.py
@@ -115,6 +115,7 @@ def test_existing_setup_exists_3(self):
            ),
        )

+    @pytest.mark.production()
    def test_get_setup(self):
        # no setups in default test server
        openml.config.server = "https://www.openml.org/api/v1/xml/"
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index cae947917..d900671b7 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -8,37 +8,6 @@
from openml.testing import _check_dataset


-@pytest.fixture(autouse=True)
-def as_robot():
-    policy = openml.config.retry_policy
-    n_retries = openml.config.connection_n_retries
-    openml.config.set_retry_policy("robot", n_retries=20)
-    yield
-    openml.config.set_retry_policy(policy, n_retries)
-
-
-@pytest.fixture(autouse=True)
-def with_test_server():
-    openml.config.start_using_configuration_for_example()
-    yield
-    openml.config.stop_using_configuration_for_example()
-
-
-@pytest.fixture(autouse=True)
-def with_test_cache(test_files_directory, request):
-    if not test_files_directory.exists():
-        raise ValueError(
-            f"Cannot find test cache dir, expected it to be {test_files_directory!s}!",
-        )
-    _root_cache_directory = openml.config._root_cache_directory
-    tmp_cache = test_files_directory / request.node.name
-    openml.config.set_root_cache_directory(tmp_cache)
-    yield
-    openml.config.set_root_cache_directory(_root_cache_directory)
-    if tmp_cache.exists():
-        shutil.rmtree(tmp_cache)
-
-
@pytest.fixture()
def min_number_tasks_on_test_server() -> int:
    """After a reset at least 1068 tasks are on the test server"""