Skip to content

Commit 4acdac4

Browse files
Merge pull request #1259 from openml/revert-1256-download_updates
Revert "Download updates"
2 parents 91b4bf0 + 3b3553b commit 4acdac4

File tree

9 files changed: 94 additions and 376 deletions.

doc/progress.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ Changelog
99
0.13.1
1010
~~~~~~
1111

12-
* ADD #1081 #1132: Add additional options for (not) downloading datasets ``openml.datasets.get_dataset`` and cache management.
1312
* ADD #1028: Add functions to delete runs, flows, datasets, and tasks (e.g., ``openml.datasets.delete_dataset``).
1413
* ADD #1144: Add locally computed results to the ``OpenMLRun`` object's representation if the run was created locally and not downloaded from the server.
1514
* ADD #1180: Improve the error message when the checksum of a downloaded dataset does not match the checksum provided by the API.

openml/datasets/data_feature.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,5 @@ def __init__(
6262
def __repr__(self):
6363
return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)
6464

65-
def __eq__(self, other):
66-
if not isinstance(other, OpenMLDataFeature):
67-
return False
68-
69-
return self.__dict__ == other.__dict__
70-
7165
def _repr_pretty_(self, pp, cycle):
7266
pp.text(str(self))

openml/datasets/dataset.py

Lines changed: 42 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import os
88
import pickle
99
from typing import List, Optional, Union, Tuple, Iterable, Dict
10-
import warnings
1110

1211
import arff
1312
import numpy as np
@@ -19,6 +18,7 @@
1918
from .data_feature import OpenMLDataFeature
2019
from ..exceptions import PyOpenMLError
2120

21+
2222
logger = logging.getLogger(__name__)
2323

2424

@@ -212,22 +212,17 @@ def find_invalid_characters(string, pattern):
212212
self._dataset = dataset
213213
self._minio_url = minio_url
214214

215-
self._features = None # type: Optional[Dict[int, OpenMLDataFeature]]
216-
self._qualities = None # type: Optional[Dict[str, float]]
217-
self._no_qualities_found = False
218-
219215
if features_file is not None:
220-
self._features = _read_features(features_file)
221-
222-
if qualities_file == "":
223-
# TODO(0.15): to switch to "qualities_file is not None" below and remove warning
224-
warnings.warn(
225-
"Starting from Version 0.15 `qualities_file` must be None and not an empty string.",
226-
FutureWarning,
227-
)
216+
self.features = _read_features(
217+
features_file
218+
) # type: Optional[Dict[int, OpenMLDataFeature]]
219+
else:
220+
self.features = None
228221

229222
if qualities_file:
230-
self._qualities = _read_qualities(qualities_file)
223+
self.qualities = _read_qualities(qualities_file) # type: Optional[Dict[str, float]]
224+
else:
225+
self.qualities = None
231226

232227
if data_file is not None:
233228
rval = self._compressed_cache_file_paths(data_file)
@@ -239,36 +234,12 @@ def find_invalid_characters(string, pattern):
239234
self.data_feather_file = None
240235
self.feather_attribute_file = None
241236

242-
@property
243-
def features(self):
244-
# Lazy loading of features
245-
if self._features is None:
246-
self._load_metadata(features=True)
247-
248-
return self._features
249-
250-
@property
251-
def qualities(self):
252-
# Lazy loading of qualities
253-
# We have to check `_no_qualities_found` as there might not be qualities for a dataset
254-
if self._qualities is None and (not self._no_qualities_found):
255-
self._load_metadata(qualities=True)
256-
257-
return self._qualities
258-
259237
@property
260238
def id(self) -> Optional[int]:
261239
return self.dataset_id
262240

263241
def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
264242
"""Collect all information to display in the __repr__ body."""
265-
266-
# Obtain number of features in accordance with lazy loading.
267-
if self._qualities is not None and self._qualities["NumberOfFeatures"] is not None:
268-
n_features = int(self._qualities["NumberOfFeatures"]) # type: Optional[int]
269-
else:
270-
n_features = len(self._features) if self._features is not None else None
271-
272243
fields = {
273244
"Name": self.name,
274245
"Version": self.version,
@@ -277,14 +248,14 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
277248
"Download URL": self.url,
278249
"Data file": self.data_file,
279250
"Pickle file": self.data_pickle_file,
280-
"# of features": n_features,
251+
"# of features": len(self.features) if self.features is not None else None,
281252
}
282253
if self.upload_date is not None:
283254
fields["Upload Date"] = self.upload_date.replace("T", " ")
284255
if self.dataset_id is not None:
285256
fields["OpenML URL"] = self.openml_url
286-
if self._qualities is not None and self._qualities["NumberOfInstances"] is not None:
287-
fields["# of instances"] = int(self._qualities["NumberOfInstances"])
257+
if self.qualities is not None and self.qualities["NumberOfInstances"] is not None:
258+
fields["# of instances"] = int(self.qualities["NumberOfInstances"])
288259

289260
# determines the order in which the information will be printed
290261
order = [
@@ -802,40 +773,6 @@ def get_data(
802773

803774
return data, targets, categorical, attribute_names
804775

805-
def _load_metadata(self, features: bool = False, qualities: bool = False):
806-
"""Load the missing metadata information from the server and store it in the
807-
dataset object.
808-
809-
The purpose of the function is to support lazy loading.
810-
811-
Parameters
812-
----------
813-
features : bool (default=False)
814-
If True, load the `self.features` data if not already loaded.
815-
qualities: bool (default=False)
816-
If True, load the `self.qualities` data if not already loaded.
817-
"""
818-
# Delayed Import to avoid circular imports or having to import all of dataset.functions to
819-
# import OpenMLDataset
820-
from openml.datasets.functions import _get_dataset_metadata
821-
822-
if self.dataset_id is None:
823-
raise ValueError(
824-
"""No dataset id specified. Please set the dataset id.
825-
Otherwise we cannot load metadata."""
826-
)
827-
828-
features_file, qualities_file = _get_dataset_metadata(
829-
self.dataset_id, features=features, qualities=qualities
830-
)
831-
832-
if features_file is not None:
833-
self._features = _read_features(features_file)
834-
835-
if qualities_file is not None:
836-
self._qualities = _read_qualities(qualities_file)
837-
self._no_qualities_found = self._qualities is None
838-
839776
def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[str]]:
840777
"""Reads the datasets arff to determine the class-labels.
841778
@@ -853,6 +790,10 @@ def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[
853790
-------
854791
list
855792
"""
793+
if self.features is None:
794+
raise ValueError(
795+
"retrieve_class_labels can only be called if feature information is available."
796+
)
856797
for feature in self.features.values():
857798
if (feature.name == target_name) and (feature.data_type == "nominal"):
858799
return feature.nominal_values
@@ -981,7 +922,6 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
981922
return data_container
982923

983924

984-
# -- Code for Features Property
985925
def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
986926
features_pickle_file = _get_features_pickle_file(features_file)
987927
try:
@@ -990,41 +930,35 @@ def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
990930
except: # noqa E722
991931
with open(features_file, encoding="utf8") as fh:
992932
features_xml_string = fh.read()
993-
994-
features = _parse_features_xml(features_xml_string)
933+
xml_dict = xmltodict.parse(
934+
features_xml_string, force_list=("oml:feature", "oml:nominal_value")
935+
)
936+
features_xml = xml_dict["oml:data_features"]
937+
938+
features = {}
939+
for idx, xmlfeature in enumerate(features_xml["oml:feature"]):
940+
nr_missing = xmlfeature.get("oml:number_of_missing_values", 0)
941+
feature = OpenMLDataFeature(
942+
int(xmlfeature["oml:index"]),
943+
xmlfeature["oml:name"],
944+
xmlfeature["oml:data_type"],
945+
xmlfeature.get("oml:nominal_value"),
946+
int(nr_missing),
947+
)
948+
if idx != feature.index:
949+
raise ValueError("Data features not provided in right order")
950+
features[feature.index] = feature
995951

996952
with open(features_pickle_file, "wb") as fh_binary:
997953
pickle.dump(features, fh_binary)
998954
return features
999955

1000956

1001-
def _parse_features_xml(features_xml_string):
1002-
xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value"))
1003-
features_xml = xml_dict["oml:data_features"]
1004-
1005-
features = {}
1006-
for idx, xmlfeature in enumerate(features_xml["oml:feature"]):
1007-
nr_missing = xmlfeature.get("oml:number_of_missing_values", 0)
1008-
feature = OpenMLDataFeature(
1009-
int(xmlfeature["oml:index"]),
1010-
xmlfeature["oml:name"],
1011-
xmlfeature["oml:data_type"],
1012-
xmlfeature.get("oml:nominal_value"),
1013-
int(nr_missing),
1014-
)
1015-
if idx != feature.index:
1016-
raise ValueError("Data features not provided in right order")
1017-
features[feature.index] = feature
1018-
1019-
return features
1020-
1021-
1022957
def _get_features_pickle_file(features_file: str) -> str:
1023958
"""This function only exists so it can be mocked during unit testing"""
1024959
return features_file + ".pkl"
1025960

1026961

1027-
# -- Code for Qualities Property
1028962
def _read_qualities(qualities_file: str) -> Dict[str, float]:
1029963
qualities_pickle_file = _get_qualities_pickle_file(qualities_file)
1030964
try:
@@ -1033,12 +967,19 @@ def _read_qualities(qualities_file: str) -> Dict[str, float]:
1033967
except: # noqa E722
1034968
with open(qualities_file, encoding="utf8") as fh:
1035969
qualities_xml = fh.read()
1036-
qualities = _parse_qualities_xml(qualities_xml)
970+
xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
971+
qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
972+
qualities = _check_qualities(qualities)
1037973
with open(qualities_pickle_file, "wb") as fh_binary:
1038974
pickle.dump(qualities, fh_binary)
1039975
return qualities
1040976

1041977

978+
def _get_qualities_pickle_file(qualities_file: str) -> str:
979+
"""This function only exists so it can be mocked during unit testing"""
980+
return qualities_file + ".pkl"
981+
982+
1042983
def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]:
1043984
qualities_ = {}
1044985
for xmlquality in qualities:
@@ -1051,14 +992,3 @@ def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]:
1051992
value = float(xmlquality["oml:value"])
1052993
qualities_[name] = value
1053994
return qualities_
1054-
1055-
1056-
def _parse_qualities_xml(qualities_xml):
1057-
xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
1058-
qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
1059-
return _check_qualities(qualities)
1060-
1061-
1062-
def _get_qualities_pickle_file(qualities_file: str) -> str:
1063-
"""This function only exists so it can be mocked during unit testing"""
1064-
return qualities_file + ".pkl"

0 commit comments

Comments
 (0)