Skip to content

Commit 91b4bf0

Browse files
LennartPurucker, pre-commit-ci[bot], and mfeurer
authored
Download updates (#1256)
* made dataset features optional
* fix check for qualities
* add lazy loading for dataset metadata and add option to refresh cache
* adjust progress.rst
* minor fixes
* break line to keep link and respect line length
* [no ci] changes for pull request review
* refactor and add cache usage to load_metadata
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci
* fix precommit
* [no ci] adjust task loading to new dataset loading
* [no ci] add actual lazy loading based on properties and adjusted test on how to use it
* switch deprecation to future warning, adjusted deprecation cycle to version 0.15.0, update documentation.
* Update openml/tasks/functions.py

Co-authored-by: Matthias Feurer <feurerm@informatik.uni-freiburg.de>

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Matthias Feurer <feurerm@informatik.uni-freiburg.de>
1 parent a7f2639 commit 91b4bf0

File tree

9 files changed

+376
-94
lines changed

9 files changed

+376
-94
lines changed

doc/progress.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ Changelog
99
0.13.1
1010
~~~~~~
1111

12+
* ADD #1081 #1132: Add additional options for (not) downloading datasets ``openml.datasets.get_dataset`` and cache management.
1213
* ADD #1028: Add functions to delete runs, flows, datasets, and tasks (e.g., ``openml.datasets.delete_dataset``).
1314
* ADD #1144: Add locally computed results to the ``OpenMLRun`` object's representation if the run was created locally and not downloaded from the server.
1415
* ADD #1180: Improve the error message when the checksum of a downloaded dataset does not match the checksum provided by the API.

openml/datasets/data_feature.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,5 +62,11 @@ def __init__(
6262
def __repr__(self):
6363
return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)
6464

65+
def __eq__(self, other):
66+
if not isinstance(other, OpenMLDataFeature):
67+
return False
68+
69+
return self.__dict__ == other.__dict__
70+
6571
def _repr_pretty_(self, pp, cycle):
6672
pp.text(str(self))

openml/datasets/dataset.py

Lines changed: 112 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import os
88
import pickle
99
from typing import List, Optional, Union, Tuple, Iterable, Dict
10+
import warnings
1011

1112
import arff
1213
import numpy as np
@@ -18,7 +19,6 @@
1819
from .data_feature import OpenMLDataFeature
1920
from ..exceptions import PyOpenMLError
2021

21-
2222
logger = logging.getLogger(__name__)
2323

2424

@@ -212,17 +212,22 @@ def find_invalid_characters(string, pattern):
212212
self._dataset = dataset
213213
self._minio_url = minio_url
214214

215+
self._features = None # type: Optional[Dict[int, OpenMLDataFeature]]
216+
self._qualities = None # type: Optional[Dict[str, float]]
217+
self._no_qualities_found = False
218+
215219
if features_file is not None:
216-
self.features = _read_features(
217-
features_file
218-
) # type: Optional[Dict[int, OpenMLDataFeature]]
219-
else:
220-
self.features = None
220+
self._features = _read_features(features_file)
221+
222+
if qualities_file == "":
223+
# TODO(0.15): to switch to "qualities_file is not None" below and remove warning
224+
warnings.warn(
225+
"Starting from Version 0.15 `qualities_file` must be None and not an empty string.",
226+
FutureWarning,
227+
)
221228

222229
if qualities_file:
223-
self.qualities = _read_qualities(qualities_file) # type: Optional[Dict[str, float]]
224-
else:
225-
self.qualities = None
230+
self._qualities = _read_qualities(qualities_file)
226231

227232
if data_file is not None:
228233
rval = self._compressed_cache_file_paths(data_file)
@@ -234,12 +239,36 @@ def find_invalid_characters(string, pattern):
234239
self.data_feather_file = None
235240
self.feather_attribute_file = None
236241

242+
@property
243+
def features(self):
244+
# Lazy loading of features
245+
if self._features is None:
246+
self._load_metadata(features=True)
247+
248+
return self._features
249+
250+
@property
251+
def qualities(self):
252+
# Lazy loading of qualities
253+
# We have to check `_no_qualities_found` as there might not be qualities for a dataset
254+
if self._qualities is None and (not self._no_qualities_found):
255+
self._load_metadata(qualities=True)
256+
257+
return self._qualities
258+
237259
@property
238260
def id(self) -> Optional[int]:
239261
return self.dataset_id
240262

241263
def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
242264
"""Collect all information to display in the __repr__ body."""
265+
266+
# Obtain number of features in accordance with lazy loading.
267+
if self._qualities is not None and self._qualities["NumberOfFeatures"] is not None:
268+
n_features = int(self._qualities["NumberOfFeatures"]) # type: Optional[int]
269+
else:
270+
n_features = len(self._features) if self._features is not None else None
271+
243272
fields = {
244273
"Name": self.name,
245274
"Version": self.version,
@@ -248,14 +277,14 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
248277
"Download URL": self.url,
249278
"Data file": self.data_file,
250279
"Pickle file": self.data_pickle_file,
251-
"# of features": len(self.features) if self.features is not None else None,
280+
"# of features": n_features,
252281
}
253282
if self.upload_date is not None:
254283
fields["Upload Date"] = self.upload_date.replace("T", " ")
255284
if self.dataset_id is not None:
256285
fields["OpenML URL"] = self.openml_url
257-
if self.qualities is not None and self.qualities["NumberOfInstances"] is not None:
258-
fields["# of instances"] = int(self.qualities["NumberOfInstances"])
286+
if self._qualities is not None and self._qualities["NumberOfInstances"] is not None:
287+
fields["# of instances"] = int(self._qualities["NumberOfInstances"])
259288

260289
# determines the order in which the information will be printed
261290
order = [
@@ -773,6 +802,40 @@ def get_data(
773802

774803
return data, targets, categorical, attribute_names
775804

805+
def _load_metadata(self, features: bool = False, qualities: bool = False):
806+
"""Load the missing metadata information from the server and store it in the
807+
dataset object.
808+
809+
The purpose of the function is to support lazy loading.
810+
811+
Parameters
812+
----------
813+
features : bool (default=False)
814+
If True, load the `self.features` data if not already loaded.
815+
qualities: bool (default=False)
816+
If True, load the `self.qualities` data if not already loaded.
817+
"""
818+
# Delayed Import to avoid circular imports or having to import all of dataset.functions to
819+
# import OpenMLDataset
820+
from openml.datasets.functions import _get_dataset_metadata
821+
822+
if self.dataset_id is None:
823+
raise ValueError(
824+
"""No dataset id specified. Please set the dataset id.
825+
Otherwise we cannot load metadata."""
826+
)
827+
828+
features_file, qualities_file = _get_dataset_metadata(
829+
self.dataset_id, features=features, qualities=qualities
830+
)
831+
832+
if features_file is not None:
833+
self._features = _read_features(features_file)
834+
835+
if qualities_file is not None:
836+
self._qualities = _read_qualities(qualities_file)
837+
self._no_qualities_found = self._qualities is None
838+
776839
def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[str]]:
777840
"""Reads the datasets arff to determine the class-labels.
778841
@@ -790,10 +853,6 @@ def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[
790853
-------
791854
list
792855
"""
793-
if self.features is None:
794-
raise ValueError(
795-
"retrieve_class_labels can only be called if feature information is available."
796-
)
797856
for feature in self.features.values():
798857
if (feature.name == target_name) and (feature.data_type == "nominal"):
799858
return feature.nominal_values
@@ -922,6 +981,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
922981
return data_container
923982

924983

984+
# -- Code for Features Property
925985
def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
926986
features_pickle_file = _get_features_pickle_file(features_file)
927987
try:
@@ -930,35 +990,41 @@ def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
930990
except: # noqa E722
931991
with open(features_file, encoding="utf8") as fh:
932992
features_xml_string = fh.read()
933-
xml_dict = xmltodict.parse(
934-
features_xml_string, force_list=("oml:feature", "oml:nominal_value")
935-
)
936-
features_xml = xml_dict["oml:data_features"]
937-
938-
features = {}
939-
for idx, xmlfeature in enumerate(features_xml["oml:feature"]):
940-
nr_missing = xmlfeature.get("oml:number_of_missing_values", 0)
941-
feature = OpenMLDataFeature(
942-
int(xmlfeature["oml:index"]),
943-
xmlfeature["oml:name"],
944-
xmlfeature["oml:data_type"],
945-
xmlfeature.get("oml:nominal_value"),
946-
int(nr_missing),
947-
)
948-
if idx != feature.index:
949-
raise ValueError("Data features not provided in right order")
950-
features[feature.index] = feature
993+
994+
features = _parse_features_xml(features_xml_string)
951995

952996
with open(features_pickle_file, "wb") as fh_binary:
953997
pickle.dump(features, fh_binary)
954998
return features
955999

9561000

1001+
def _parse_features_xml(features_xml_string):
1002+
xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value"))
1003+
features_xml = xml_dict["oml:data_features"]
1004+
1005+
features = {}
1006+
for idx, xmlfeature in enumerate(features_xml["oml:feature"]):
1007+
nr_missing = xmlfeature.get("oml:number_of_missing_values", 0)
1008+
feature = OpenMLDataFeature(
1009+
int(xmlfeature["oml:index"]),
1010+
xmlfeature["oml:name"],
1011+
xmlfeature["oml:data_type"],
1012+
xmlfeature.get("oml:nominal_value"),
1013+
int(nr_missing),
1014+
)
1015+
if idx != feature.index:
1016+
raise ValueError("Data features not provided in right order")
1017+
features[feature.index] = feature
1018+
1019+
return features
1020+
1021+
9571022
def _get_features_pickle_file(features_file: str) -> str:
9581023
"""This function only exists so it can be mocked during unit testing"""
9591024
return features_file + ".pkl"
9601025

9611026

1027+
# -- Code for Qualities Property
9621028
def _read_qualities(qualities_file: str) -> Dict[str, float]:
9631029
qualities_pickle_file = _get_qualities_pickle_file(qualities_file)
9641030
try:
@@ -967,19 +1033,12 @@ def _read_qualities(qualities_file: str) -> Dict[str, float]:
9671033
except: # noqa E722
9681034
with open(qualities_file, encoding="utf8") as fh:
9691035
qualities_xml = fh.read()
970-
xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
971-
qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
972-
qualities = _check_qualities(qualities)
1036+
qualities = _parse_qualities_xml(qualities_xml)
9731037
with open(qualities_pickle_file, "wb") as fh_binary:
9741038
pickle.dump(qualities, fh_binary)
9751039
return qualities
9761040

9771041

978-
def _get_qualities_pickle_file(qualities_file: str) -> str:
979-
"""This function only exists so it can be mocked during unit testing"""
980-
return qualities_file + ".pkl"
981-
982-
9831042
def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]:
9841043
qualities_ = {}
9851044
for xmlquality in qualities:
@@ -992,3 +1051,14 @@ def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]:
9921051
value = float(xmlquality["oml:value"])
9931052
qualities_[name] = value
9941053
return qualities_
1054+
1055+
1056+
def _parse_qualities_xml(qualities_xml):
1057+
xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
1058+
qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
1059+
return _check_qualities(qualities)
1060+
1061+
1062+
def _get_qualities_pickle_file(qualities_file: str) -> str:
1063+
"""This function only exists so it can be mocked during unit testing"""
1064+
return qualities_file + ".pkl"

0 commit comments

Comments (0)