77import os
88import pickle
99from typing import List , Optional , Union , Tuple , Iterable , Dict
10- import warnings
1110
1211import arff
1312import numpy as np
1918from .data_feature import OpenMLDataFeature
2019from ..exceptions import PyOpenMLError
2120
21+
2222logger = logging .getLogger (__name__ )
2323
2424
@@ -212,22 +212,17 @@ def find_invalid_characters(string, pattern):
212212 self ._dataset = dataset
213213 self ._minio_url = minio_url
214214
215- self ._features = None # type: Optional[Dict[int, OpenMLDataFeature]]
216- self ._qualities = None # type: Optional[Dict[str, float]]
217- self ._no_qualities_found = False
218-
219215 if features_file is not None :
220- self ._features = _read_features (features_file )
221-
222- if qualities_file == "" :
223- # TODO(0.15): to switch to "qualities_file is not None" below and remove warning
224- warnings .warn (
225- "Starting from Version 0.15 `qualities_file` must be None and not an empty string." ,
226- FutureWarning ,
227- )
216+ self .features = _read_features (
217+ features_file
218+ ) # type: Optional[Dict[int, OpenMLDataFeature]]
219+ else :
220+ self .features = None
228221
229222 if qualities_file :
230- self ._qualities = _read_qualities (qualities_file )
223+ self .qualities = _read_qualities (qualities_file ) # type: Optional[Dict[str, float]]
224+ else :
225+ self .qualities = None
231226
232227 if data_file is not None :
233228 rval = self ._compressed_cache_file_paths (data_file )
@@ -239,36 +234,12 @@ def find_invalid_characters(string, pattern):
239234 self .data_feather_file = None
240235 self .feather_attribute_file = None
241236
242- @property
243- def features (self ):
244- # Lazy loading of features
245- if self ._features is None :
246- self ._load_metadata (features = True )
247-
248- return self ._features
249-
250- @property
251- def qualities (self ):
252- # Lazy loading of qualities
253- # We have to check `_no_qualities_found` as there might not be qualities for a dataset
254- if self ._qualities is None and (not self ._no_qualities_found ):
255- self ._load_metadata (qualities = True )
256-
257- return self ._qualities
258-
@property
def id(self) -> Optional[int]:
    """Return the identifier of this dataset.

    Read-only alias for the ``dataset_id`` attribute.

    Returns
    -------
    int or None
        The value currently stored in ``self.dataset_id``.
    """
    return self.dataset_id
262240
263241 def _get_repr_body_fields (self ) -> List [Tuple [str , Union [str , int , List [str ]]]]:
264242 """Collect all information to display in the __repr__ body."""
265-
266- # Obtain number of features in accordance with lazy loading.
267- if self ._qualities is not None and self ._qualities ["NumberOfFeatures" ] is not None :
268- n_features = int (self ._qualities ["NumberOfFeatures" ]) # type: Optional[int]
269- else :
270- n_features = len (self ._features ) if self ._features is not None else None
271-
272243 fields = {
273244 "Name" : self .name ,
274245 "Version" : self .version ,
@@ -277,14 +248,14 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
277248 "Download URL" : self .url ,
278249 "Data file" : self .data_file ,
279250 "Pickle file" : self .data_pickle_file ,
280- "# of features" : n_features ,
251+ "# of features" : len ( self . features ) if self . features is not None else None ,
281252 }
282253 if self .upload_date is not None :
283254 fields ["Upload Date" ] = self .upload_date .replace ("T" , " " )
284255 if self .dataset_id is not None :
285256 fields ["OpenML URL" ] = self .openml_url
286- if self ._qualities is not None and self ._qualities ["NumberOfInstances" ] is not None :
287- fields ["# of instances" ] = int (self ._qualities ["NumberOfInstances" ])
257+ if self .qualities is not None and self .qualities ["NumberOfInstances" ] is not None :
258+ fields ["# of instances" ] = int (self .qualities ["NumberOfInstances" ])
288259
289260 # determines the order in which the information will be printed
290261 order = [
@@ -802,40 +773,6 @@ def get_data(
802773
803774 return data , targets , categorical , attribute_names
804775
805- def _load_metadata (self , features : bool = False , qualities : bool = False ):
806- """Load the missing metadata information from the server and store it in the
807- dataset object.
808-
809- The purpose of the function is to support lazy loading.
810-
811- Parameters
812- ----------
813- features : bool (default=False)
814- If True, load the `self.features` data if not already loaded.
815- qualities: bool (default=False)
816- If True, load the `self.qualities` data if not already loaded.
817- """
818- # Delayed Import to avoid circular imports or having to import all of dataset.functions to
819- # import OpenMLDataset
820- from openml .datasets .functions import _get_dataset_metadata
821-
822- if self .dataset_id is None :
823- raise ValueError (
824- """No dataset id specified. Please set the dataset id.
825- Otherwise we cannot load metadata."""
826- )
827-
828- features_file , qualities_file = _get_dataset_metadata (
829- self .dataset_id , features = features , qualities = qualities
830- )
831-
832- if features_file is not None :
833- self ._features = _read_features (features_file )
834-
835- if qualities_file is not None :
836- self ._qualities = _read_qualities (qualities_file )
837- self ._no_qualities_found = self ._qualities is None
838-
839776 def retrieve_class_labels (self , target_name : str = "class" ) -> Union [None , List [str ]]:
840777 """Reads the datasets arff to determine the class-labels.
841778
@@ -853,6 +790,10 @@ def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[
853790 -------
854791 list
855792 """
793+ if self .features is None :
794+ raise ValueError (
795+ "retrieve_class_labels can only be called if feature information is available."
796+ )
856797 for feature in self .features .values ():
857798 if (feature .name == target_name ) and (feature .data_type == "nominal" ):
858799 return feature .nominal_values
@@ -981,7 +922,6 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
981922 return data_container
982923
983924
984- # -- Code for Features Property
985925def _read_features (features_file : str ) -> Dict [int , OpenMLDataFeature ]:
986926 features_pickle_file = _get_features_pickle_file (features_file )
987927 try :
@@ -990,41 +930,35 @@ def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
990930 except : # noqa E722
991931 with open (features_file , encoding = "utf8" ) as fh :
992932 features_xml_string = fh .read ()
993-
994- features = _parse_features_xml (features_xml_string )
933+ xml_dict = xmltodict .parse (
934+ features_xml_string , force_list = ("oml:feature" , "oml:nominal_value" )
935+ )
936+ features_xml = xml_dict ["oml:data_features" ]
937+
938+ features = {}
939+ for idx , xmlfeature in enumerate (features_xml ["oml:feature" ]):
940+ nr_missing = xmlfeature .get ("oml:number_of_missing_values" , 0 )
941+ feature = OpenMLDataFeature (
942+ int (xmlfeature ["oml:index" ]),
943+ xmlfeature ["oml:name" ],
944+ xmlfeature ["oml:data_type" ],
945+ xmlfeature .get ("oml:nominal_value" ),
946+ int (nr_missing ),
947+ )
948+ if idx != feature .index :
949+ raise ValueError ("Data features not provided in right order" )
950+ features [feature .index ] = feature
995951
996952 with open (features_pickle_file , "wb" ) as fh_binary :
997953 pickle .dump (features , fh_binary )
998954 return features
999955
1000956
1001- def _parse_features_xml (features_xml_string ):
1002- xml_dict = xmltodict .parse (features_xml_string , force_list = ("oml:feature" , "oml:nominal_value" ))
1003- features_xml = xml_dict ["oml:data_features" ]
1004-
1005- features = {}
1006- for idx , xmlfeature in enumerate (features_xml ["oml:feature" ]):
1007- nr_missing = xmlfeature .get ("oml:number_of_missing_values" , 0 )
1008- feature = OpenMLDataFeature (
1009- int (xmlfeature ["oml:index" ]),
1010- xmlfeature ["oml:name" ],
1011- xmlfeature ["oml:data_type" ],
1012- xmlfeature .get ("oml:nominal_value" ),
1013- int (nr_missing ),
1014- )
1015- if idx != feature .index :
1016- raise ValueError ("Data features not provided in right order" )
1017- features [feature .index ] = feature
1018-
1019- return features
1020-
1021-
1022957def _get_features_pickle_file (features_file : str ) -> str :
1023958 """This function only exists so it can be mocked during unit testing"""
1024959 return features_file + ".pkl"
1025960
1026961
1027- # -- Code for Qualities Property
1028962def _read_qualities (qualities_file : str ) -> Dict [str , float ]:
1029963 qualities_pickle_file = _get_qualities_pickle_file (qualities_file )
1030964 try :
@@ -1033,12 +967,19 @@ def _read_qualities(qualities_file: str) -> Dict[str, float]:
1033967 except : # noqa E722
1034968 with open (qualities_file , encoding = "utf8" ) as fh :
1035969 qualities_xml = fh .read ()
1036- qualities = _parse_qualities_xml (qualities_xml )
970+ xml_as_dict = xmltodict .parse (qualities_xml , force_list = ("oml:quality" ,))
971+ qualities = xml_as_dict ["oml:data_qualities" ]["oml:quality" ]
972+ qualities = _check_qualities (qualities )
1037973 with open (qualities_pickle_file , "wb" ) as fh_binary :
1038974 pickle .dump (qualities , fh_binary )
1039975 return qualities
1040976
1041977
978+ def _get_qualities_pickle_file (qualities_file : str ) -> str :
979+ """This function only exists so it can be mocked during unit testing"""
980+ return qualities_file + ".pkl"
981+
982+
1042983def _check_qualities (qualities : List [Dict [str , str ]]) -> Dict [str , float ]:
1043984 qualities_ = {}
1044985 for xmlquality in qualities :
@@ -1051,14 +992,3 @@ def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]:
1051992 value = float (xmlquality ["oml:value" ])
1052993 qualities_ [name ] = value
1053994 return qualities_
1054-
1055-
1056- def _parse_qualities_xml (qualities_xml ):
1057- xml_as_dict = xmltodict .parse (qualities_xml , force_list = ("oml:quality" ,))
1058- qualities = xml_as_dict ["oml:data_qualities" ]["oml:quality" ]
1059- return _check_qualities (qualities )
1060-
1061-
1062- def _get_qualities_pickle_file (qualities_file : str ) -> str :
1063- """This function only exists so it can be mocked during unit testing"""
1064- return qualities_file + ".pkl"
0 commit comments