Merged
Changes from 1 commit

Commits (40):
846a539  style: Fix linting split.py (eddiebergman, Jan 8, 2024)
053b053  typing: Fix mypy errors split.py (eddiebergman, Jan 8, 2024)
48d9471  typing: data_feature (eddiebergman, Jan 8, 2024)
7dbc9b6  typing: trace (eddiebergman, Jan 8, 2024)
2712c71  more linting fixes (LennartPurucker, Jan 8, 2024)
e3e432e  Merge branch 'fix_linter_lennart' of https://github.com/openml/openml… (LennartPurucker, Jan 8, 2024)
69f033e  typing: finish up trace (eddiebergman, Jan 8, 2024)
798cb8e  typing: config.py (eddiebergman, Jan 8, 2024)
5fbb36a  typing: More fixes on config.py (eddiebergman, Jan 8, 2024)
c88f8f4  typing: setup.py (eddiebergman, Jan 8, 2024)
f911c30  finalize runs linting (LennartPurucker, Jan 8, 2024)
92d9b26  Merge branch 'fix_linter_lennart' of https://github.com/openml/openml… (LennartPurucker, Jan 8, 2024)
38bcd5e  typing: evaluation.py (eddiebergman, Jan 8, 2024)
869f9c4  typing: setup (eddiebergman, Jan 8, 2024)
abc6117  ruff fixes across different files and mypy fixes for run files (LennartPurucker, Jan 8, 2024)
54aca64  Merge branch 'fix_linter_lennart' of https://github.com/openml/openml… (LennartPurucker, Jan 8, 2024)
f6c2ae5  typing: _api_calls (eddiebergman, Jan 8, 2024)
960afa1  adjust setup files' linting and minor ruff changes (LennartPurucker, Jan 8, 2024)
bea95cc  Merge branch 'fix_linter_lennart' of https://github.com/openml/openml… (LennartPurucker, Jan 8, 2024)
5ea4287  typing: utils (eddiebergman, Jan 8, 2024)
cffd7ed  late night push (LennartPurucker, Jan 8, 2024)
6d3ae4a  Merge branch 'fix_linter_lennart' of https://github.com/openml/openml… (LennartPurucker, Jan 8, 2024)
bef753e  typing: utils.py (eddiebergman, Jan 8, 2024)
1df08b5  typing: tip tap tippity (eddiebergman, Jan 9, 2024)
d4f79f8  typing: mypy 78, ruff ~200 (eddiebergman, Jan 9, 2024)
cecc746  refactor output format name and minor linting stuff (LennartPurucker, Jan 9, 2024)
3804220  other: midway merge (eddiebergman, Jan 9, 2024)
57db7f0  merge (eddiebergman, Jan 9, 2024)
c9c96b1  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jan 9, 2024)
bb0cdd1  typing: I'm runnign out of good messages (eddiebergman, Jan 9, 2024)
e38fdd1  Merge branch 'fix_linter_lennart' of github.com:openml/openml-python … (eddiebergman, Jan 9, 2024)
dcc60f5  typing: datasets (eddiebergman, Jan 9, 2024)
a19bc26  leinting for flows and some ruff changes (LennartPurucker, Jan 9, 2024)
93b83eb  Merge branch 'fix_linter_lennart' of https://github.com/openml/openml… (LennartPurucker, Jan 9, 2024)
9174f20  no more mypy errors (LennartPurucker, Jan 9, 2024)
a87109a  ruff runs and setups (LennartPurucker, Jan 9, 2024)
10a2f5e  typing: Finish off mypy and ruff errors (eddiebergman, Jan 9, 2024)
66e3c97  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jan 9, 2024)
66a3ab1  style: File wide ignores of PLR0913 (eddiebergman, Jan 9, 2024)
290578c  Merge branch 'fix_linter_lennart' of github.com:openml/openml-python … (eddiebergman, Jan 9, 2024)
other: midway merge
eddiebergman committed Jan 9, 2024
commit 38042200b31e23c599bf49b95eee498820e49f33
openml/datasets/functions.py: 125 changes (79 additions, 46 deletions)
@@ -5,8 +5,9 @@
 import os
 import warnings
 from collections import OrderedDict
-from typing import cast
-from typing_extensions import Literal
+from pathlib import Path
+from typing import Any, cast
+from typing_extensions import Literal, overload
 
 import arff
 import minio.error
@@ -41,8 +42,9 @@
 # Local getters/accessors to the cache directory
 
 
-def _get_cache_directory(dataset: OpenMLDataset) -> str:
-    """Return the cache directory of the OpenMLDataset"""
+def _get_cache_directory(dataset: OpenMLDataset) -> Path:
+    """Creates and returns the cache directory of the OpenMLDataset."""
+    assert dataset.dataset_id is not None
     return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id)
 
 
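Reviewer note (not part of the diff): the added `assert dataset.dataset_id is not None` is the usual way to narrow an Optional for mypy. A minimal sketch of the pattern:

```python
# Minimal sketch of the Optional-narrowing pattern used above.
def bump(x: int | None) -> int:
    assert x is not None  # mypy narrows `x` from `int | None` to `int` here
    return x + 1
```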
@@ -61,20 +63,48 @@ def list_qualities() -> list[str]:
     qualities = xmltodict.parse(xml_string, force_list=("oml:quality"))
     # Minimalistic check if the XML is useful
     if "oml:data_qualities_list" not in qualities:
-        raise ValueError("Error in return XML, does not contain " '"oml:data_qualities_list"')
+        raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"')
 
     if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list):
         raise TypeError("Error in return XML, does not contain " '"oml:quality" as a list')
 
     return qualities["oml:data_qualities_list"]["oml:quality"]
 
 
+@overload
 def list_datasets(
     data_id: list[int] | None = None,
     offset: int | None = None,
     size: int | None = None,
     status: str | None = None,
     tag: str | None = None,
-    output_format: str = "dict",
-    **kwargs,
+    output_format: Literal["dataframe"] = "dataframe",
+    **kwargs: Any,
+) -> pd.DataFrame:
+    ...
+
+
+@overload
+def list_datasets(
+    data_id: list[int] | None = None,
+    offset: int | None = None,
+    size: int | None = None,
+    status: str | None = None,
+    tag: str | None = None,
+    output_format: Literal["dict"] = "dict",
+    **kwargs: Any,
+) -> pd.DataFrame:
+    ...
+
+
+def list_datasets(  # noqa: PLR0913
+    data_id: list[int] | None = None,
+    offset: int | None = None,
+    size: int | None = None,
+    status: str | None = None,
+    tag: str | None = None,
+    output_format: Literal["dataframe", "dict"] = "dict",
+    **kwargs: Any,
 ) -> dict | pd.DataFrame:
     """
     Return a list of all dataset which are on OpenML.
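Reviewer note: a sketch (not in this diff) of what the overloads buy at call sites, assuming the public `openml.datasets.list_datasets` entry point. With `output_format="dataframe"`, mypy selects the first overload and types the result as `pd.DataFrame`, so the `cast(...)` previously needed by callers can be dropped.

```python
import pandas as pd
import openml

# mypy infers pd.DataFrame statically from the Literal["dataframe"] overload.
datasets = openml.datasets.list_datasets(output_format="dataframe")
assert isinstance(datasets, pd.DataFrame)
```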
@@ -244,13 +274,14 @@ def _validated_data_attributes(
         if not is_attribute_a_data_attribute:
             raise ValueError(
                 f"all attribute of '{parameter_name}' should be one of the data attribute. "
-                f" Got '{attribute_}' while candidates are {[attr[0] for attr in data_attributes]}.",
+                f" Got '{attribute_}' while candidates are"
+                f" {[attr[0] for attr in data_attributes]}.",
             )
 
 
 def check_datasets_active(
     dataset_ids: list[int],
-    raise_error_if_not_exist: bool = True,
+    raise_error_if_not_exist: bool = True,  # noqa: FBT001, FBT002
 ) -> dict[int, bool]:
     """
     Check if the dataset ids provided are active.
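Reviewer note: the `# noqa: FBT001, FBT002` markers silence ruff's flake8-boolean-trap rules, which flag boolean positional parameters because a bare boolean at the call site is opaque. A sketch of the distinction; keeping the public signature unchanged is why the noqa is used here instead of making the argument keyword-only:

```python
import openml

# What FBT001/FBT002 warn about: the bare boolean is unreadable at the call site.
openml.datasets.check_datasets_active([2], False)
# The keyword form stays clear:
openml.datasets.check_datasets_active([2], raise_error_if_not_exist=False)
```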
@@ -283,7 +314,7 @@ def check_datasets_active(
 def _name_to_id(
     dataset_name: str,
     version: int | None = None,
-    error_if_multiple: bool = False,
+    error_if_multiple: bool = False,  # noqa: FBT001, FBT002
 ) -> int:
     """Attempt to find the dataset id of the dataset with the given name.
 
@@ -311,31 +342,29 @@ def _name_to_id(
         The id of the dataset.
     """
     status = None if version is not None else "active"
-    candidates = cast(
-        pd.DataFrame,
-        list_datasets(
-            data_name=dataset_name,
-            status=status,
-            data_version=version,
-            output_format="dataframe",
-        ),
+    candidates = list_datasets(
+        data_name=dataset_name,
+        status=status,
+        data_version=version,
+        output_format="dataframe",
     )
     if error_if_multiple and len(candidates) > 1:
         msg = f"Multiple active datasets exist with name '{dataset_name}'."
         raise ValueError(msg)
 
     if candidates.empty:
         no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'"
         and_version = f" and version '{version}'." if version is not None else "."
         raise RuntimeError(no_dataset_for_name + and_version)
 
     # Dataset ids are chronological so we can just sort based on ids (instead of version)
-    return candidates["did"].min()
+    return candidates["did"].min()  # type: ignore
 
 
 def get_datasets(
     dataset_ids: list[str | int],
-    download_data: bool = True,
-    download_qualities: bool = True,
+    download_data: bool = True,  # noqa: FBT001, FBT002
+    download_qualities: bool = True,  # noqa: FBT001, FBT002
 ) -> list[OpenMLDataset]:
     """Download datasets.
 
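Reviewer note: the `# type: ignore` is needed because the pandas stubs type `Series.min()` as returning a general scalar rather than `int`, so mypy cannot verify the declared `-> int`. A self-contained sketch of an alternative that would avoid the ignore by coercing explicitly:

```python
import pandas as pd

candidates = pd.DataFrame({"did": [61, 2, 31]})
# Explicit coercion satisfies a `-> int` annotation without a type: ignore.
dataset_id = int(candidates["did"].min())
assert dataset_id == 2
```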
@@ -368,16 +397,16 @@
 
 
 @openml.utils.thread_safe_if_oslo_installed
-def get_dataset(
+def get_dataset(  # noqa: PLR0915, C901, PLR0913, PLR0912
     dataset_id: int | str,
     download_data: bool | None = None,  # Optional for deprecation warning; later again only bool
     version: int | None = None,
-    error_if_multiple: bool = False,
+    error_if_multiple: bool = False,  # noqa: FBT002, FBT001
     cache_format: str = "pickle",
     download_qualities: bool | None = None,  # Same as above
     download_features_meta_data: bool | None = None,  # Same as above
-    download_all_files: bool = False,
-    force_refresh_cache: bool = False,
+    download_all_files: bool = False,  # noqa: FBT002, FBT001
+    force_refresh_cache: bool = False,  # noqa: FBT001, FBT002
 ) -> OpenMLDataset:
     """Download the OpenML dataset representation, optionally also download actual data file.
 
@@ -454,6 +483,7 @@ def get_dataset(
             "`download_qualities`, and `download_features_meta_data` to a bool while calling "
             "`get_dataset`.",
             FutureWarning,
+            stacklevel=2,
         )
 
     download_data = True if download_data is None else download_data
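Reviewer note: `stacklevel=2` makes the FutureWarning point at the user's call to `get_dataset` rather than at openml's internals. A minimal sketch of the mechanism:

```python
import warnings

def deprecated_entry_point() -> None:
    # stacklevel=2 attributes the warning to whoever called this function,
    # not to this warnings.warn(...) line itself.
    warnings.warn("pass explicit booleans instead", FutureWarning, stacklevel=2)

deprecated_entry_point()  # the warning is reported against this line
```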
@@ -465,6 +495,8 @@
     if download_all_files:
         warnings.warn(
             "``download_all_files`` is experimental and is likely to break with new releases.",
+            FutureWarning,
+            stacklevel=2,
         )
 
     if cache_format not in ["feather", "pickle"]:
@@ -485,7 +517,7 @@
 
     if force_refresh_cache:
         did_cache_dir = _get_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, dataset_id)
-        if os.path.exists(did_cache_dir):
+        if did_cache_dir.exists():
             _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir)
 
     did_cache_dir = _create_cache_directory_for_id(
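Reviewer note: this hunk belongs to the file-wide migration from `os.path` calls to `pathlib.Path` methods, relying on the cache helpers now returning `Path`. The equivalences assumed throughout, as a sketch (the cache location shown is hypothetical):

```python
from pathlib import Path

cache = Path("~/.openml/cache").expanduser()  # hypothetical location
# Equivalences used in this migration:
#   os.path.exists(p)     -> Path(p).exists()
#   os.path.join(p, "f")  -> Path(p) / "f"
#   os.path.isfile(p)     -> Path(p).is_file()
#   os.rename(a, b)       -> Path(a).rename(b)
parquet = cache / "dataset_61.pq"
print(parquet, parquet.is_file())
```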
@@ -1060,7 +1092,7 @@ def _get_dataset_description(did_cache_dir, dataset_id):
 def _get_dataset_parquet(
     description: dict | OpenMLDataset,
     cache_directory: str | None = None,
-    download_all_files: bool = False,
+    download_all_files: bool = False,  # noqa: FBT001, FBT002
 ) -> str | None:
     """Return the path to the local parquet file of the dataset. If is not cached, it is downloaded.
 

Expand Down Expand Up @@ -1090,20 +1122,22 @@ def _get_dataset_parquet(
Location of the Parquet file if successfully downloaded, None otherwise.
"""
if isinstance(description, dict):
url = cast(str, description.get("oml:parquet_url"))
did = description.get("oml:id")
url = str(description.get("oml:parquet_url"))
did = int(description.get("oml:id"))
elif isinstance(description, OpenMLDataset):
url = cast(str, description._parquet_url)
did = description.dataset_id
url = str(description._parquet_url)
did = int(description.dataset_id)
else:
raise TypeError("`description` should be either OpenMLDataset or Dict.")

if cache_directory is None:
cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did)

output_file_path = os.path.join(cache_directory, f"dataset_{did}.pq")

old_file_path = os.path.join(cache_directory, "dataset.pq")
if os.path.isfile(old_file_path):
if old_file_path.is_file():
old_file_path.rename(output_file_path)
os.rename(old_file_path, output_file_path)

# For this release, we want to be able to force a new download even if the
@@ -1113,16 +1147,17 @@
     if download_all_files:
         if url.endswith(".pq"):
             url, _ = url.rsplit("/", maxsplit=1)
-        openml._api_calls._download_minio_bucket(source=cast(str, url), destination=cache_directory)
+        openml._api_calls._download_minio_bucket(source=url, destination=cache_directory)
 
-    if not os.path.isfile(output_file_path):
+    if not output_file_path.is_file():
         try:
             openml._api_calls._download_minio_file(
-                source=cast(str, url),
+                source=url,
                 destination=output_file_path,
             )
         except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e:
-            logger.warning(f"Could not download file from {cast(str, url)}: {e}")
+            logger.warning(f"Could not download file from {url}: {e}")
             return None
     return output_file_path
 
@@ -1182,12 +1217,12 @@ def _get_dataset_arff(
     return output_file_path
 
 
-def _get_features_xml(dataset_id):
+def _get_features_xml(dataset_id: int) -> str:
     url_extension = f"data/features/{dataset_id}"
     return openml._api_calls._perform_api_call(url_extension, "get")
 
 
-def _get_dataset_features_file(did_cache_dir: str | None, dataset_id: int) -> str:
+def _get_dataset_features_file(did_cache_dir: str | Path | None, dataset_id: int) -> Path:
     """API call to load dataset features. Loads from cache or downloads them.
 
     Features are feature descriptions for each column.
@@ -1205,21 +1240,19 @@ def _get_dataset_features_file(did_cache_dir: str | None, dataset_id: int) -> str:
 
     Returns
     -------
-    str
+    Path
         Path of the cached dataset feature file
     """
+    did_cache_dir = Path(did_cache_dir) if did_cache_dir is not None else None
     if did_cache_dir is None:
-        did_cache_dir = _create_cache_directory_for_id(
-            DATASETS_CACHE_DIR_NAME,
-            dataset_id,
-        )
+        did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id)
 
-    features_file = os.path.join(did_cache_dir, "features.xml")
+    features_file = did_cache_dir / "features.xml"
 
     # Dataset features aren't subject to change...
-    if not os.path.isfile(features_file):
+    if not features_file.is_file():
         features_xml = _get_features_xml(dataset_id)
-        with open(features_file, "w", encoding="utf8") as fh:
+        with features_file.open("w", encoding="utf8") as fh:
             fh.write(features_xml)
 
     return features_file
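Reviewer note: with the `Path` return type, callers can read the cached features file directly. A sketch of a hypothetical caller, assuming the `xmltodict` dependency already used in this module (dataset id 61 is only an example):

```python
import xmltodict
from openml.datasets.functions import _get_dataset_features_file

features_file = _get_dataset_features_file(None, dataset_id=61)  # returns a Path
features = xmltodict.parse(features_file.read_text(encoding="utf8"))
```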