Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
5db6c08
made dataset features optional
LennartPurucker Jun 13, 2023
e00cbb6
fix check for qualities
LennartPurucker Jun 13, 2023
3e808a5
add lazy loading for dataset metadata and add option to refresh cache
LennartPurucker Jun 13, 2023
45aa03c
Merge branch 'develop' of https://github.com/openml/openml-python int…
LennartPurucker Jun 13, 2023
d5c40c1
adjust progress.rst
LennartPurucker Jun 13, 2023
eda6c9a
minor fixes
LennartPurucker Jun 13, 2023
32c6099
break line to keep link and respect line length
LennartPurucker Jun 13, 2023
c3e0074
[no ci] changes for pull request review
LennartPurucker Jun 14, 2023
490f072
refactor and add cache usage to load_metadata
LennartPurucker Jun 14, 2023
f8bcafd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2023
23ebb05
fix precommit
LennartPurucker Jun 14, 2023
e0c9e37
Merge branch 'download_updates' of https://github.com/openml/openml-p…
LennartPurucker Jun 14, 2023
6b21e9d
[no ci] adjust task loading to new dataset loading
LennartPurucker Jun 14, 2023
004fd85
[no ci] add actual lazy loading based on properties and adjusted test…
LennartPurucker Jun 14, 2023
722ff52
switch deprecation to future warning, adjusted deprecation cycle to v…
LennartPurucker Jun 15, 2023
2cb3b57
Update openml/tasks/functions.py
LennartPurucker Jun 15, 2023
a934586
Merge branch 'develop' of https://github.com/openml/openml-python int…
LennartPurucker Jun 15, 2023
b93ab89
changes based on pr review feedback
LennartPurucker Jun 15, 2023
f326be8
fix test w.r.t. server state
LennartPurucker Jun 15, 2023
e29f25f
Merge branch 'download_updates' of https://github.com/openml/openml-p…
LennartPurucker Jun 15, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add lazy loading for dataset metadata and add option to refresh cache
  • Loading branch information
LennartPurucker committed Jun 13, 2023
commit 3e808a598c31442a7414a35cf329cd5b7ceff635
12 changes: 12 additions & 0 deletions openml/datasets/data_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,5 +62,17 @@ def __init__(
def __repr__(self):
return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)

def __eq__(self, other):
    """Field-wise equality with another ``OpenMLDataFeature``.

    NOTE(review): defining ``__eq__`` without ``__hash__`` implicitly sets
    ``__hash__ = None`` on the class, making instances unhashable — confirm
    that instances are never used as dict keys or set members.
    """
    if not isinstance(other, OpenMLDataFeature):
        # NotImplemented (rather than False) lets Python try the reflected
        # comparison, per the conventional __eq__ contract; `feature == x`
        # still evaluates to False for unrelated types.
        return NotImplemented
    return (
        self.index == other.index
        and self.name == other.name
        and self.data_type == other.data_type
        and self.nominal_values == other.nominal_values
        and self.number_missing_values == other.number_missing_values
    )

def _repr_pretty_(self, pp, cycle):
    # IPython rich-display hook: delegate to __repr__ via str().
    # `cycle` is ignored because this object cannot contain itself.
    pp.text(str(self))
95 changes: 71 additions & 24 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import xmltodict

from openml.base import OpenMLBase
from openml._api_calls import _perform_api_call
from .data_feature import OpenMLDataFeature
from ..exceptions import PyOpenMLError

Expand Down Expand Up @@ -787,6 +788,31 @@ def get_data(

return data, targets, categorical, attribute_names

def load_metadata(self, features: bool = False, qualities: bool = False) -> None:
    """Load missing metadata information from the server and store it on this object.

    The purpose of the function is to support lazy loading: a dataset can be
    created without features/qualities metadata and fetch it later on demand.

    Parameters
    ----------
    features : bool (default=False)
        If True, load the `self.features` data if not already loaded.
    qualities: bool (default=False)
        If True, load the `self.qualities` data if not already loaded.

    Raises
    ------
    ValueError
        If `self.dataset_id` is None, since the server cannot be queried
        without a dataset id.
    """

    if self.dataset_id is None:
        raise ValueError(
            """No dataset id specified. Please set the dataset id.
            Otherwise we cannot load metadata."""
        )

    # Only contact the server when the caller asked for the data AND it is
    # still missing, so repeated calls are cheap no-ops.
    if features and self.features is None:
        self.features = _parse_features_xml(_get_features_xml(self.dataset_id))

    if qualities and self.qualities is None:
        self.qualities = _parse_qualities_xml(_get_qualities_xml(self.dataset_id))

def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[str]]:
"""Reads the datasets arff to determine the class-labels.

Expand Down Expand Up @@ -936,6 +962,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
return data_container


# -- Code for Features Property
def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
features_pickle_file = _get_features_pickle_file(features_file)
try:
Expand All @@ -944,35 +971,46 @@ def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
except: # noqa E722
with open(features_file, encoding="utf8") as fh:
features_xml_string = fh.read()
xml_dict = xmltodict.parse(
features_xml_string, force_list=("oml:feature", "oml:nominal_value")
)
features_xml = xml_dict["oml:data_features"]

features = {}
for idx, xmlfeature in enumerate(features_xml["oml:feature"]):
nr_missing = xmlfeature.get("oml:number_of_missing_values", 0)
feature = OpenMLDataFeature(
int(xmlfeature["oml:index"]),
xmlfeature["oml:name"],
xmlfeature["oml:data_type"],
xmlfeature.get("oml:nominal_value"),
int(nr_missing),
)
if idx != feature.index:
raise ValueError("Data features not provided in right order")
features[feature.index] = feature

features = _parse_features_xml(features_xml_string)

with open(features_pickle_file, "wb") as fh_binary:
pickle.dump(features, fh_binary)
return features


def _get_features_xml(dataset_id):
    """Download the raw features XML for the given dataset id from the server."""
    return _perform_api_call(f"data/features/{dataset_id}", "get")


def _parse_features_xml(features_xml_string):
    """Parse an OpenML ``data_features`` XML document into a feature mapping.

    Parameters
    ----------
    features_xml_string : str
        XML payload as returned by the ``data/features/{id}`` endpoint.

    Returns
    -------
    dict
        ``OpenMLDataFeature`` objects keyed by their integer index.

    Raises
    ------
    ValueError
        If the features are not listed in ascending index order.
    """
    document = xmltodict.parse(
        features_xml_string, force_list=("oml:feature", "oml:nominal_value")
    )
    feature_entries = document["oml:data_features"]["oml:feature"]

    features = {}
    for expected_index, entry in enumerate(feature_entries):
        feature = OpenMLDataFeature(
            int(entry["oml:index"]),
            entry["oml:name"],
            entry["oml:data_type"],
            entry.get("oml:nominal_value"),
            int(entry.get("oml:number_of_missing_values", 0)),
        )
        # The server is expected to emit features in index order; a mismatch
        # indicates a malformed response.
        if feature.index != expected_index:
            raise ValueError("Data features not provided in right order")
        features[feature.index] = feature

    return features


def _get_features_pickle_file(features_file: str) -> str:
"""This function only exists so it can be mocked during unit testing"""
return features_file + ".pkl"


# -- Code for Qualities Property
def _read_qualities(qualities_file: str) -> Dict[str, float]:
qualities_pickle_file = _get_qualities_pickle_file(qualities_file)
try:
Expand All @@ -981,17 +1019,15 @@ def _read_qualities(qualities_file: str) -> Dict[str, float]:
except: # noqa E722
with open(qualities_file, encoding="utf8") as fh:
qualities_xml = fh.read()
xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
qualities = _check_qualities(qualities)
qualities = _parse_qualities_xml(qualities_xml)
with open(qualities_pickle_file, "wb") as fh_binary:
pickle.dump(qualities, fh_binary)
return qualities


def _get_qualities_pickle_file(qualities_file: str) -> str:
"""This function only exists so it can be mocked during unit testing"""
return qualities_file + ".pkl"
def _get_qualities_xml(dataset_id):
    """Download the raw qualities XML for the given dataset id from the server."""
    return _perform_api_call(f"data/qualities/{dataset_id}", "get")


def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]:
Expand All @@ -1006,3 +1042,14 @@ def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]:
value = float(xmlquality["oml:value"])
qualities_[name] = value
return qualities_


def _parse_qualities_xml(qualities_xml):
    """Parse a ``data_qualities`` XML document and convert it via _check_qualities."""
    document = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
    quality_entries = document["oml:data_qualities"]["oml:quality"]
    return _check_qualities(quality_entries)


def _get_qualities_pickle_file(qualities_file: str) -> str:
"""This function only exists so it can be mocked during unit testing"""
return qualities_file + ".pkl"
54 changes: 41 additions & 13 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,14 @@

import openml.utils
import openml._api_calls
from .dataset import OpenMLDataset
from .dataset import OpenMLDataset, _get_features_xml, _get_qualities_xml
from ..exceptions import (
OpenMLHashException,
OpenMLServerError,
OpenMLServerException,
OpenMLPrivateDatasetError,
)
from ..utils import (
_remove_cache_dir_for_id,
_create_cache_directory_for_id,
)
from ..utils import _remove_cache_dir_for_id, _create_cache_directory_for_id, _get_cache_dir_for_id

DATASETS_CACHE_DIR_NAME = "datasets"
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -357,12 +354,17 @@ def get_dataset(
download_qualities: bool = True,
download_features_meta_data: bool = True,
download_all_files: bool = False,
force_refresh_cache: bool = False,
) -> OpenMLDataset:
"""Download the OpenML dataset representation, optionally also download actual data file.

This function is thread/multiprocessing safe.
This function uses caching. A check will be performed to determine if the information has
previously been downloaded, and if so be loaded from disk instead of retrieved from the server.
This function is by default NOT thread/multiprocessing safe, as this function uses caching.
A check will be performed to determine if the information has previously been downloaded to a
cache, and if so be loaded from disk instead of retrieved from the server.

To make this function thread/multiprocessing safe initialize the cache first by calling
`get_dataset(args)` once before calling `get_dataset(args)` many times in parallel. This will
initialize the cache and later calls will use the cache in a thread/multiprocessing safe way.

If dataset is retrieved by name, a version may be specified.
If no version is specified and multiple versions of the dataset exist,
Expand All @@ -384,23 +386,42 @@ def get_dataset(
If no version is specified, retrieve the least recent still active version.
error_if_multiple : bool (default=False)
If ``True`` raise an error if multiple datasets are found with matching criteria.
cache_format : str (default='pickle')
cache_format : str (default='pickle') in {'pickle', 'feather'}
Format for caching the dataset - may be feather or pickle
Note that the default 'pickle' option may load slower than feather when
no.of.rows is very high.
download_qualities : bool (default=True)
Option to download 'qualities' meta-data in addition to the minimal dataset description.
If True, download and cache the qualities file.
If False, create the OpenMLDataset without qualities metadata. The data may later be added
to the OpenMLDataset through the `OpenMLDataset.load_metadata(qualities=True)` method.
download_features_meta_data : bool (default=True)
Option to download 'features' meta-data in addition to the minimal dataset description.
If True, download and cache the features file.
If False, create the OpenMLDataset without features metadata. The data may later be added
to the OpenMLDataset through the `OpenMLDataset.load_metadata(features=True)` method.
download_all_files: bool (default=False)
EXPERIMENTAL. Download all files related to the dataset that reside on the server.
Useful for datasets which refer to auxiliary files (e.g., meta-album).
force_refresh_cache : bool (default=False)
Force the cache to be refreshed by deleting the cache directory and re-downloading the data.
Note that if `force_refresh_cache` is True, `get_dataset` is NOT thread/multiprocessing safe,
because concurrent calls then race between creating and deleting the cache — as is true of
the cache in general.

Returns
-------
dataset : :class:`openml.OpenMLDataset`
The downloaded dataset.
"""
if any([download_qualities, download_features_meta_data]):
warnings.warn(
"""Starting from Version 0.14 `download_data`, `download_qualities`, and
`download_features_meta_data` will all be ``False`` by default to enable
lazy loading.""",
DeprecationWarning,
)

if download_all_files:
warnings.warn(
"``download_all_files`` is experimental and is likely to break with new releases."
Expand All @@ -422,6 +443,15 @@ def get_dataset(
"`dataset_id` must be one of `str` or `int`, not {}.".format(type(dataset_id))
)

# Note: we could also (quite heavily) re-implement the below to only download the
# data and do not cache the data at all. This would always be thread/multiprocessing
# safe. However, this would likely drastically increase the strain on the server.
# Hence, we should stick to the alternative mentioned in the docstring.
if force_refresh_cache:
did_cache_dir = _get_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, dataset_id)
if os.path.exists(did_cache_dir):
_remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir)

did_cache_dir = _create_cache_directory_for_id(
DATASETS_CACHE_DIR_NAME,
dataset_id,
Expand Down Expand Up @@ -1133,8 +1163,7 @@ def _get_dataset_features_file(did_cache_dir: str, dataset_id: int) -> str:

# Dataset features aren't subject to change...
if not os.path.isfile(features_file):
url_extension = "data/features/{}".format(dataset_id)
features_xml = openml._api_calls._perform_api_call(url_extension, "get")
features_xml = _get_features_xml(dataset_id)
with io.open(features_file, "w", encoding="utf8") as fh:
fh.write(features_xml)

Expand Down Expand Up @@ -1169,8 +1198,7 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
with io.open(qualities_file, encoding="utf8") as fh:
qualities_xml = fh.read()
except (OSError, IOError):
url_extension = "data/qualities/{}".format(dataset_id)
qualities_xml = openml._api_calls._perform_api_call(url_extension, "get")
qualities_xml = _get_qualities_xml(dataset_id)
with io.open(qualities_file, "w", encoding="utf8") as fh:
fh.write(qualities_xml)
return qualities_file
Expand Down
22 changes: 18 additions & 4 deletions openml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
if TYPE_CHECKING:
from openml.base import OpenMLBase


oslo_installed = False
try:
# Currently, importing oslo raises a lot of warning that it will stop working
Expand Down Expand Up @@ -303,18 +302,33 @@ def _list_all(listing_call, output_format="dict", *args, **filters):
return result


def _create_cache_directory(key):
def _get_cache_dir_for_key(key):
    """Return the (possibly not yet existing) cache directory path for *key*."""
    return os.path.join(config.get_cache_directory(), key)


def _create_cache_directory(key):
    """Return the cache directory for *key*, creating it if necessary.

    Raises
    ------
    openml.exceptions.OpenMLCacheException
        If the directory cannot be created.
    """
    cache_dir = _get_cache_dir_for_key(key)

    try:
        # exist_ok makes this safe to call when the directory already exists.
        os.makedirs(cache_dir, exist_ok=True)
    except Exception as err:
        message = f"Cannot create cache directory {cache_dir}."
        raise openml.exceptions.OpenMLCacheException(message) from err

    return cache_dir


def _get_cache_dir_for_id(key, id_, create=False):
    """Return the cache directory path for *key*/*id_*.

    If *create* is True the key-level cache directory is created first;
    otherwise the path is computed without touching the filesystem.
    """
    base = _create_cache_directory(key) if create else _get_cache_dir_for_key(key)
    return os.path.join(base, str(id_))


def _create_cache_directory_for_id(key, id_):
"""Create the cache directory for a specific ID

Expand All @@ -336,7 +350,7 @@ def _create_cache_directory_for_id(key, id_):
str
Path of the created dataset cache directory.
"""
cache_dir = os.path.join(_create_cache_directory(key), str(id_))
cache_dir = _get_cache_dir_for_id(key, id_, create=True)
if os.path.isdir(cache_dir):
pass
elif os.path.exists(cache_dir):
Expand Down
13 changes: 13 additions & 0 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,19 @@ def test_get_data_corrupt_pickle(self):
self.assertIsInstance(xy, pd.DataFrame)
self.assertEqual(xy.shape, (150, 5))

def test_load_metadata(self):
    # Reference dataset: features/qualities downloaded eagerly via get_dataset flags.
    _compare_dataset = openml.datasets.get_dataset(
        2, download_data=False, download_features_meta_data=True, download_qualities=True
    )

    # Lazily-created dataset: no metadata at construction time; fetch it explicitly.
    _dataset = openml.datasets.get_dataset(
        2, download_data=False, download_features_meta_data=False, download_qualities=False
    )
    _dataset.load_metadata(features=True, qualities=True)

    # Lazy loading must yield the same metadata as the eager download path.
    self.assertEqual(_dataset.features, _compare_dataset.features)
    self.assertEqual(_dataset.qualities, _compare_dataset.qualities)


class OpenMLDatasetTestOnTestServer(TestBase):
def setUp(self):
Expand Down
Loading