77import os
88import pickle
99from typing import List , Optional , Union , Tuple , Iterable , Dict
10+ import warnings
1011
1112import arff
1213import numpy as np
1819from .data_feature import OpenMLDataFeature
1920from ..exceptions import PyOpenMLError
2021
21-
2222logger = logging .getLogger (__name__ )
2323
2424
@@ -212,17 +212,22 @@ def find_invalid_characters(string, pattern):
212212 self ._dataset = dataset
213213 self ._minio_url = minio_url
214214
215+ self ._features = None # type: Optional[Dict[int, OpenMLDataFeature]]
216+ self ._qualities = None # type: Optional[Dict[str, float]]
217+ self ._no_qualities_found = False
218+
215219 if features_file is not None :
216- self .features = _read_features (
217- features_file
218- ) # type: Optional[Dict[int, OpenMLDataFeature]]
219- else :
220- self .features = None
220+ self ._features = _read_features (features_file )
221+
222+ if qualities_file == "" :
223+ # TODO(0.15): to switch to "qualities_file is not None" below and remove warning
224+ warnings .warn (
225+ "Starting from Version 0.15 `qualities_file` must be None and not an empty string." ,
226+ FutureWarning ,
227+ )
221228
222229 if qualities_file :
223- self .qualities = _read_qualities (qualities_file ) # type: Optional[Dict[str, float]]
224- else :
225- self .qualities = None
230+ self ._qualities = _read_qualities (qualities_file )
226231
227232 if data_file is not None :
228233 rval = self ._compressed_cache_file_paths (data_file )
@@ -234,12 +239,36 @@ def find_invalid_characters(string, pattern):
234239 self .data_feather_file = None
235240 self .feather_attribute_file = None
236241
@property
def features(self):
    """Return the dataset's feature metadata, loading it on first access.

    When ``self._features`` is still ``None``, ``self._load_metadata(features=True)``
    is invoked before the value is returned.

    Returns
    -------
    The current value of ``self._features`` (may still be ``None`` if the
    load did not populate it).
    """
    # Lazy loading of features
    if self._features is None:
        self._load_metadata(features=True)

    return self._features
249+
@property
def qualities(self):
    """Return the dataset's qualities, loading them on first access.

    ``self._load_metadata(qualities=True)`` is invoked only while
    ``self._qualities`` is ``None`` *and* ``self._no_qualities_found`` is
    falsy.

    Returns
    -------
    The current value of ``self._qualities``; ``None`` when no qualities
    are available.
    """
    # Lazy loading of qualities.
    # We have to check `_no_qualities_found` as there might not be qualities for a
    # dataset; without it every access would trigger another load attempt.
    if self._qualities is None and not self._no_qualities_found:
        self._load_metadata(qualities=True)

    return self._qualities
258+
@property
def id(self) -> Optional[int]:
    """Return ``self.dataset_id`` (``None`` when no id has been set)."""
    return self.dataset_id
240262
241263 def _get_repr_body_fields (self ) -> List [Tuple [str , Union [str , int , List [str ]]]]:
242264 """Collect all information to display in the __repr__ body."""
265+
266+ # Obtain number of features in accordance with lazy loading.
267+ if self ._qualities is not None and self ._qualities ["NumberOfFeatures" ] is not None :
268+ n_features = int (self ._qualities ["NumberOfFeatures" ]) # type: Optional[int]
269+ else :
270+ n_features = len (self ._features ) if self ._features is not None else None
271+
243272 fields = {
244273 "Name" : self .name ,
245274 "Version" : self .version ,
@@ -248,14 +277,14 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
248277 "Download URL" : self .url ,
249278 "Data file" : self .data_file ,
250279 "Pickle file" : self .data_pickle_file ,
251- "# of features" : len ( self . features ) if self . features is not None else None ,
280+ "# of features" : n_features ,
252281 }
253282 if self .upload_date is not None :
254283 fields ["Upload Date" ] = self .upload_date .replace ("T" , " " )
255284 if self .dataset_id is not None :
256285 fields ["OpenML URL" ] = self .openml_url
257- if self .qualities is not None and self .qualities ["NumberOfInstances" ] is not None :
258- fields ["# of instances" ] = int (self .qualities ["NumberOfInstances" ])
286+ if self ._qualities is not None and self ._qualities ["NumberOfInstances" ] is not None :
287+ fields ["# of instances" ] = int (self ._qualities ["NumberOfInstances" ])
259288
260289 # determines the order in which the information will be printed
261290 order = [
@@ -773,6 +802,40 @@ def get_data(
773802
774803 return data , targets , categorical , attribute_names
775804
805+ def _load_metadata (self , features : bool = False , qualities : bool = False ):
806+ """Load the missing metadata information from the server and store it in the
807+ dataset object.
808+
809+ The purpose of the function is to support lazy loading.
810+
811+ Parameters
812+ ----------
813+ features : bool (default=False)
814+ If True, load the `self.features` data if not already loaded.
815+ qualities: bool (default=False)
816+ If True, load the `self.qualities` data if not already loaded.
817+ """
818+ # Delayed Import to avoid circular imports or having to import all of dataset.functions to
819+ # import OpenMLDataset
820+ from openml .datasets .functions import _get_dataset_metadata
821+
822+ if self .dataset_id is None :
823+ raise ValueError (
824+ """No dataset id specified. Please set the dataset id.
825+ Otherwise we cannot load metadata."""
826+ )
827+
828+ features_file , qualities_file = _get_dataset_metadata (
829+ self .dataset_id , features = features , qualities = qualities
830+ )
831+
832+ if features_file is not None :
833+ self ._features = _read_features (features_file )
834+
835+ if qualities_file is not None :
836+ self ._qualities = _read_qualities (qualities_file )
837+ self ._no_qualities_found = self ._qualities is None
838+
776839 def retrieve_class_labels (self , target_name : str = "class" ) -> Union [None , List [str ]]:
777840 """Reads the datasets arff to determine the class-labels.
778841
@@ -790,10 +853,6 @@ def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[
790853 -------
791854 list
792855 """
793- if self .features is None :
794- raise ValueError (
795- "retrieve_class_labels can only be called if feature information is available."
796- )
797856 for feature in self .features .values ():
798857 if (feature .name == target_name ) and (feature .data_type == "nominal" ):
799858 return feature .nominal_values
@@ -922,6 +981,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
922981 return data_container
923982
924983
984+ # -- Code for Features Property
925985def _read_features (features_file : str ) -> Dict [int , OpenMLDataFeature ]:
926986 features_pickle_file = _get_features_pickle_file (features_file )
927987 try :
@@ -930,35 +990,41 @@ def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
930990 except : # noqa E722
931991 with open (features_file , encoding = "utf8" ) as fh :
932992 features_xml_string = fh .read ()
933- xml_dict = xmltodict .parse (
934- features_xml_string , force_list = ("oml:feature" , "oml:nominal_value" )
935- )
936- features_xml = xml_dict ["oml:data_features" ]
937-
938- features = {}
939- for idx , xmlfeature in enumerate (features_xml ["oml:feature" ]):
940- nr_missing = xmlfeature .get ("oml:number_of_missing_values" , 0 )
941- feature = OpenMLDataFeature (
942- int (xmlfeature ["oml:index" ]),
943- xmlfeature ["oml:name" ],
944- xmlfeature ["oml:data_type" ],
945- xmlfeature .get ("oml:nominal_value" ),
946- int (nr_missing ),
947- )
948- if idx != feature .index :
949- raise ValueError ("Data features not provided in right order" )
950- features [feature .index ] = feature
993+
994+ features = _parse_features_xml (features_xml_string )
951995
952996 with open (features_pickle_file , "wb" ) as fh_binary :
953997 pickle .dump (features , fh_binary )
954998 return features
955999
9561000
def _parse_features_xml(features_xml_string: str) -> "Dict[int, OpenMLDataFeature]":
    """Parse a features XML document into ``OpenMLDataFeature`` objects.

    Parameters
    ----------
    features_xml_string : str
        XML text whose root element is ``oml:data_features`` and which
        contains ``oml:feature`` entries.

    Returns
    -------
    dict
        Maps each feature's ``index`` to the ``OpenMLDataFeature`` built
        from it.

    Raises
    ------
    ValueError
        If a feature's index differs from its position in the document.
    """
    # `force_list` keeps single-element feature / nominal-value entries as lists,
    # so the loop below never iterates over a bare dict or string.
    xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value"))
    features_xml = xml_dict["oml:data_features"]

    features = {}
    for idx, xmlfeature in enumerate(features_xml["oml:feature"]):
        # A missing count is treated as zero missing values.
        nr_missing = xmlfeature.get("oml:number_of_missing_values", 0)
        feature = OpenMLDataFeature(
            int(xmlfeature["oml:index"]),
            xmlfeature["oml:name"],
            xmlfeature["oml:data_type"],
            xmlfeature.get("oml:nominal_value"),
            int(nr_missing),
        )
        if idx != feature.index:
            raise ValueError("Data features not provided in right order")
        features[feature.index] = feature

    return features
1020+
1021+
9571022def _get_features_pickle_file (features_file : str ) -> str :
9581023 """This function only exists so it can be mocked during unit testing"""
9591024 return features_file + ".pkl"
9601025
9611026
1027+ # -- Code for Qualities Property
9621028def _read_qualities (qualities_file : str ) -> Dict [str , float ]:
9631029 qualities_pickle_file = _get_qualities_pickle_file (qualities_file )
9641030 try :
@@ -967,19 +1033,12 @@ def _read_qualities(qualities_file: str) -> Dict[str, float]:
9671033 except : # noqa E722
9681034 with open (qualities_file , encoding = "utf8" ) as fh :
9691035 qualities_xml = fh .read ()
970- xml_as_dict = xmltodict .parse (qualities_xml , force_list = ("oml:quality" ,))
971- qualities = xml_as_dict ["oml:data_qualities" ]["oml:quality" ]
972- qualities = _check_qualities (qualities )
1036+ qualities = _parse_qualities_xml (qualities_xml )
9731037 with open (qualities_pickle_file , "wb" ) as fh_binary :
9741038 pickle .dump (qualities , fh_binary )
9751039 return qualities
9761040
9771041
978- def _get_qualities_pickle_file (qualities_file : str ) -> str :
979- """This function only exists so it can be mocked during unit testing"""
980- return qualities_file + ".pkl"
981-
982-
9831042def _check_qualities (qualities : List [Dict [str , str ]]) -> Dict [str , float ]:
9841043 qualities_ = {}
9851044 for xmlquality in qualities :
@@ -992,3 +1051,14 @@ def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]:
9921051 value = float (xmlquality ["oml:value" ])
9931052 qualities_ [name ] = value
9941053 return qualities_
1054+
1055+
def _parse_qualities_xml(qualities_xml: str) -> Dict[str, float]:
    """Parse a qualities XML document into a name-to-value mapping.

    Parameters
    ----------
    qualities_xml : str
        XML text whose root element is ``oml:data_qualities`` and which
        contains ``oml:quality`` entries.

    Returns
    -------
    dict
        The result of passing the parsed ``oml:quality`` entries through
        ``_check_qualities``.
    """
    # `force_list` guarantees a list even when the document holds a single quality.
    xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
    qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
    return _check_qualities(qualities)
1060+
1061+
1062+ def _get_qualities_pickle_file (qualities_file : str ) -> str :
1063+ """This function only exists so it can be mocked during unit testing"""
1064+ return qualities_file + ".pkl"
0 commit comments