Merged
Changes from 1 commit
Commits (20)
5db6c08
made dataset features optional
LennartPurucker Jun 13, 2023
e00cbb6
fix check for qualities
LennartPurucker Jun 13, 2023
3e808a5
add lazy loading for dataset metadata and add option to refresh cache
LennartPurucker Jun 13, 2023
45aa03c
Merge branch 'develop' of https://github.com/openml/openml-python int…
LennartPurucker Jun 13, 2023
d5c40c1
adjust progress.rst
LennartPurucker Jun 13, 2023
eda6c9a
minor fixes
LennartPurucker Jun 13, 2023
32c6099
break line to keep link and respect line length
LennartPurucker Jun 13, 2023
c3e0074
[no ci] changes for pull request review
LennartPurucker Jun 14, 2023
490f072
refactor and add cache usage to load_metadata
LennartPurucker Jun 14, 2023
f8bcafd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2023
23ebb05
fix precommit
LennartPurucker Jun 14, 2023
e0c9e37
Merge branch 'download_updates' of https://github.com/openml/openml-p…
LennartPurucker Jun 14, 2023
6b21e9d
[no ci] adjust task loading to new dataset loading
LennartPurucker Jun 14, 2023
004fd85
[no ci] add actual lazy loading based on properties and adjusted test…
LennartPurucker Jun 14, 2023
722ff52
switch deprecation to future warning, adjusted deprecation cycle to v…
LennartPurucker Jun 15, 2023
2cb3b57
Update openml/tasks/functions.py
LennartPurucker Jun 15, 2023
a934586
Merge branch 'develop' of https://github.com/openml/openml-python int…
LennartPurucker Jun 15, 2023
b93ab89
changes based on pr review feedback
LennartPurucker Jun 15, 2023
f326be8
fix test w.r.t. server state
LennartPurucker Jun 15, 2023
e29f25f
Merge branch 'download_updates' of https://github.com/openml/openml-p…
LennartPurucker Jun 15, 2023
refactor and add cache usage to load_metadata
LennartPurucker committed Jun 14, 2023
commit 490f07219812cce4ba1611f9fae61796b9655c56
31 changes: 14 additions & 17 deletions openml/datasets/dataset.py
@@ -16,7 +16,6 @@
 import xmltodict
 
 from openml.base import OpenMLBase
-from openml._api_calls import _perform_api_call
 from .data_feature import OpenMLDataFeature
 from ..exceptions import PyOpenMLError
 
@@ -801,20 +800,28 @@ def load_metadata(self, features: bool = False, qualities: bool = False):
         qualities: bool (default=False)
             If True, load the `self.qualities` data if not already loaded.
         """
+        # Delayed import to avoid circular imports or having to import all of datasets.functions to import OpenMLDataset
+        from openml.datasets.functions import _get_dataset_metadata
+
         if self.dataset_id is None:
             raise ValueError(
                 "No dataset id specified. Please set the dataset id, "
                 "otherwise the metadata cannot be loaded."
             )
 
-        if features and self.features is None:
-            feature_xml = _get_features_xml(self.dataset_id)
-            self.features = _parse_features_xml(feature_xml)
+        # Only load the metadata if it is not already stored in the dataset object.
+        features = features if self.features is None else False
+        qualities = qualities if self.qualities is None else False
 
-        if qualities and self.qualities is None:
-            qualities_xml = _get_qualities_xml(self.dataset_id)
-            self.qualities = _parse_qualities_xml(qualities_xml)
+        features_file, qualities_file = _get_dataset_metadata(
+            self.dataset_id, features=features, qualities=qualities
+        )
+
+        if features_file is not None:
+            self.features = _read_features(features_file)
+
+        if qualities_file is not None:
+            self.qualities = _read_qualities(qualities_file)
 
     def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[str]]:
         """Reads the dataset's ARFF to determine the class labels.
@@ -982,11 +989,6 @@ def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
     return features
 
 
-def _get_features_xml(dataset_id):
-    url_extension = "data/features/{}".format(dataset_id)
-    return _perform_api_call(url_extension, "get")
-
-
 def _parse_features_xml(features_xml_string):
     xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value"))
     features_xml = xml_dict["oml:data_features"]
@@ -1028,11 +1030,6 @@ def _read_qualities(qualities_file: str) -> Dict[str, float]:
     return qualities
 
 
-def _get_qualities_xml(dataset_id):
-    url_extension = "data/qualities/{}".format(dataset_id)
-    return _perform_api_call(url_extension, "get")
-
-
 def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]:
     qualities_ = {}
     for xmlquality in qualities:
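For context, a minimal usage sketch of the lazy loading this refactor targets (not part of the diff; it assumes a reachable OpenML server and reuses dataset id 2 from the tests below):

import openml

# Fetch only the description; features and qualities are not downloaded yet.
dataset = openml.datasets.get_dataset(
    2, download_data=False, download_features_meta_data=False, download_qualities=False
)

# First call downloads (or reads from cache) and parses the metadata;
# later calls are no-ops once self.features / self.qualities are populated.
dataset.load_metadata(features=True, qualities=True)
print(type(dataset.features), type(dataset.qualities))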
84 changes: 66 additions & 18 deletions openml/datasets/functions.py
@@ -4,7 +4,7 @@
 import logging
 import os
 from pyexpat import ExpatError
-from typing import List, Dict, Union, Optional, cast
+from typing import List, Dict, Union, Optional, cast, Tuple
 import warnings
 
 import numpy as np
@@ -18,7 +18,7 @@
 
 import openml.utils
 import openml._api_calls
-from .dataset import OpenMLDataset, _get_features_xml, _get_qualities_xml
+from .dataset import OpenMLDataset
 from ..exceptions import (
     OpenMLHashException,
     OpenMLServerError,
@@ -475,22 +475,9 @@ def get_dataset(
     try:
         description = _get_dataset_description(did_cache_dir, dataset_id)
 
-        if download_features_meta_data:
-            features_file = _get_dataset_features_file(did_cache_dir, dataset_id)
-        else:
-            features_file = None
-
-        try:
-            if download_qualities:
-                qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
-            else:
-                qualities_file = ""
-        except OpenMLServerException as e:
-            if e.code == 362 and str(e) == "No qualities found - None":
-                logger.warning("No qualities found for dataset {}".format(dataset_id))
-                qualities_file = None
-            else:
-                raise
+        features_file, qualities_file = _get_dataset_metadata(
+            dataset_id, download_features_meta_data, download_qualities, did_cache_dir
+        )
 
         arff_file = _get_dataset_arff(description) if download_data else None
         if "oml:minio_url" in description and download_data:
@@ -1152,6 +1139,11 @@ def _get_dataset_arff(
     return output_file_path
 
 
+def _get_features_xml(dataset_id):
+    url_extension = "data/features/{}".format(dataset_id)
+    return openml._api_calls._perform_api_call(url_extension, "get")
+
+
 def _get_dataset_features_file(did_cache_dir: str, dataset_id: int) -> str:
     """API call to load dataset features. Loads from cache or downloads them.
 
@@ -1184,6 +1176,11 @@ def _get_dataset_features_file(did_cache_dir: str, dataset_id: int) -> str:
     return features_file
 
 
+def _get_qualities_xml(dataset_id):
+    url_extension = "data/qualities/{}".format(dataset_id)
+    return openml._api_calls._perform_api_call(url_extension, "get")
+
+
 def _get_dataset_qualities_file(did_cache_dir, dataset_id):
     """API call to load dataset qualities. Loads from cache or downloads them.
 
@@ -1218,6 +1215,57 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
     return qualities_file
 
 
+def _get_dataset_metadata(
+    dataset_id: int, features: bool, qualities: bool, did_cache_dir: Optional[str] = None
+) -> Tuple[Union[str, None], Union[str, None]]:
+    """Download the metadata files for a dataset and initialize their cache.
+    If the cache is already initialized, the files are only loaded from the cache.
+
+    This includes the features and qualities of the dataset.
+
+    Parameters
+    ----------
+    dataset_id: int
+        ID of the dataset for which the metadata is requested.
+    features: bool
+        Whether to return the features in the metadata.
+    qualities: bool
+        Whether to return the qualities in the metadata.
+    did_cache_dir: str, optional
+        Path to the cache directory of the dataset. If None, it is created
+        (or looked up) automatically.
+
+    Returns
+    -------
+    features_file: str or None
+        Path to the features file. None if features=False.
+    qualities_file: str or None
+        Path to the qualities file. None if qualities=False.
+    """
+    # Init cache directory if needed
+    if did_cache_dir is None:
+        did_cache_dir = _create_cache_directory_for_id(
+            DATASETS_CACHE_DIR_NAME,
+            dataset_id,
+        )
+
+    features_file = None
+    qualities_file = None
+
+    if features:
+        features_file = _get_dataset_features_file(did_cache_dir, dataset_id)
+
+    if qualities:
+        try:
+            qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
+        except OpenMLServerException as e:
+            if e.code == 362 and str(e) == "No qualities found - None":
+                # qualities_file stays None
+                logger.warning("No qualities found for dataset {}".format(dataset_id))
+            else:
+                raise
+
+    return features_file, qualities_file
+
+
 def _create_dataset_from_description(
     description: Dict[str, str],
     features_file: Optional[str] = None,
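For reviewers, a sketch of the cache flow the new private helper centralizes (not part of the diff; _get_dataset_metadata is internal API, defined above):

from openml.datasets.functions import _get_dataset_metadata

# With did_cache_dir=None the helper creates (or looks up) the per-dataset
# cache directory itself before downloading anything.
features_file, qualities_file = _get_dataset_metadata(
    dataset_id=2, features=True, qualities=True, did_cache_dir=None
)
# Each return value is a file path, or None when the corresponding flag was
# False (or, for qualities, when the server reports error 362: none found).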
18 changes: 18 additions & 0 deletions tests/test_datasets/test_dataset.py
@@ -263,15 +263,33 @@ def test_get_data_corrupt_pickle(self):
        self.assertEqual(xy.shape, (150, 5))

    def test_load_metadata(self):
        # Initial setup
        did_cache_dir = openml.utils._create_cache_directory_for_id(
            openml.datasets.functions.DATASETS_CACHE_DIR_NAME, 2
        )
        _compare_dataset = openml.datasets.get_dataset(
            2, download_data=False, download_features_meta_data=True, download_qualities=True
        )
        change_time = os.stat(did_cache_dir).st_mtime

        # Test with cache
        _dataset = openml.datasets.get_dataset(
            2, download_data=False, download_features_meta_data=False, download_qualities=False
        )
        _dataset.load_metadata(features=True, qualities=True)

        self.assertEqual(change_time, os.stat(did_cache_dir).st_mtime)
        self.assertEqual(_dataset.features, _compare_dataset.features)
        self.assertEqual(_dataset.qualities, _compare_dataset.qualities)

        # Test without cache
        openml.utils._remove_cache_dir_for_id(
            openml.datasets.functions.DATASETS_CACHE_DIR_NAME, did_cache_dir
        )

        _dataset = openml.datasets.get_dataset(
            2, download_data=False, download_features_meta_data=False, download_qualities=False
        )
        _dataset.load_metadata(features=True, qualities=True)

        self.assertNotEqual(change_time, os.stat(did_cache_dir).st_mtime)
        self.assertEqual(_dataset.features, _compare_dataset.features)
        self.assertEqual(_dataset.qualities, _compare_dataset.qualities)