@@ -36,6 +36,8 @@ class OpenMLDataset(OpenMLBase):
3636 Description of the dataset.
3737 format : str
3838 Format of the dataset which can be either 'arff' or 'sparse_arff'.
39+ cache_format : str
40+ Format for caching the dataset which can be either 'feather' or 'pickle'.
3941 dataset_id : int, optional
4042 Id autogenerated by the server.
4143 version : int, optional
@@ -99,7 +101,8 @@ class OpenMLDataset(OpenMLBase):
99101 Serialized arff dataset string.
100102 """
101103 def __init__ (self , name , description , format = None ,
102- data_format = 'arff' , dataset_id = None , version = None ,
104+ data_format = 'arff' , cache_format = 'pickle' ,
105+ dataset_id = None , version = None ,
103106 creator = None , contributor = None , collection_date = None ,
104107 upload_date = None , language = None , licence = None ,
105108 url = None , default_target_attribute = None ,
@@ -127,6 +130,11 @@ def __init__(self, name, description, format=None,
127130 self .name = name
128131 self .version = int (version ) if version is not None else None
129132 self .description = description
133+ if cache_format not in ['feather' , 'pickle' ]:
134+ raise ValueError ("cache_format must be one of 'feather' or 'pickle'. "
135+ "Invalid format specified: {}" .format (cache_format ))
136+
137+ self .cache_format = cache_format
130138 if format is None :
131139 self .format = data_format
132140 else :
@@ -180,9 +188,11 @@ def __init__(self, name, description, format=None,
180188 self .qualities = _check_qualities (qualities )
181189
182190 if data_file is not None :
183- self .data_pickle_file = self ._create_pickle_in_cache (data_file )
191+ self .data_pickle_file , self .data_feather_file ,\
192+ self .feather_attribute_file = self ._create_pickle_in_cache (data_file )
184193 else :
185- self .data_pickle_file = None
194+ self .data_pickle_file , self .data_feather_file , \
195+ self .feather_attribute_file = None , None , None
186196
187197 @property
188198 def id (self ) -> Optional [int ]:
@@ -396,18 +406,20 @@ def _parse_data_from_arff(
396406
397407 return X , categorical , attribute_names
398408
399- def _create_pickle_in_cache (self , data_file : str ) -> str :
409+ def _create_pickle_in_cache (self , data_file : str ) -> Tuple [ str , str , str ] :
400410 """ Parse the arff and pickle the result. Update any old pickle objects. """
401411 data_pickle_file = data_file .replace ('.arff' , '.pkl.py3' )
402- if os .path .exists (data_pickle_file ):
412+ data_feather_file = data_file .replace ('.arff' , '.feather' )
413+ feather_attribute_file = data_file .replace ('.arff' , '.feather.attributes.pkl.py3' )
414+ if os .path .exists (data_pickle_file ) and self .cache_format == 'pickle' :
403415 # Load the data to check if the pickle file is outdated (i.e. contains numpy array)
404416 with open (data_pickle_file , "rb" ) as fh :
405417 try :
406418 data , categorical , attribute_names = pickle .load (fh )
407419 except EOFError :
408420 # The file is likely corrupt, see #780.
409421 # We deal with this when loading the data in `_load_data`.
410- return data_pickle_file
422+ return data_pickle_file , data_feather_file , feather_attribute_file
411423
412424 # Between v0.8 and v0.9 the format of pickled data changed from
413425 # np.ndarray to pd.DataFrame. This breaks some backwards compatibility,
@@ -416,32 +428,62 @@ def _create_pickle_in_cache(self, data_file: str) -> str:
416428 # pd.DataFrame blob. See also #646.
417429 if isinstance (data , pd .DataFrame ) or scipy .sparse .issparse (data ):
418430 logger .debug ("Data pickle file already exists and is up to date." )
419- return data_pickle_file
431+ return data_pickle_file , data_feather_file , feather_attribute_file
432+ elif os .path .exists (data_feather_file ) and self .cache_format == 'feather' :
433+ # Try loading the cached feather file to check that it is readable and up to date
434+ try :
435+ data = pd .read_feather (data_feather_file )
436+ except EOFError :
437+ # The file is likely corrupt, see #780.
438+ # We deal with this when loading the data in `_load_data`.
439+ return data_pickle_file , data_feather_file , feather_attribute_file
440+
441+ logger .debug ("Data feather file already exists and is up to date." )
442+ return data_pickle_file , data_feather_file , feather_attribute_file
420443
421444 # At this point either the pickle file does not exist, or it had outdated formatting.
422445 # We parse the data from arff again and populate the cache with a recent pickle file.
423446 X , categorical , attribute_names = self ._parse_data_from_arff (data_file )
424447
425- with open (data_pickle_file , "wb" ) as fh :
426- pickle .dump ((X , categorical , attribute_names ), fh , pickle .HIGHEST_PROTOCOL )
427- logger .debug ("Saved dataset {did}: {name} to file {path}"
428- .format (did = int (self .dataset_id or - 1 ),
429- name = self .name ,
430- path = data_pickle_file )
431- )
448+ # Feather format does not work for sparse datasets, so we use pickle for sparse datasets
432449
433- return data_pickle_file
450+ if self .cache_format == "feather" and not scipy .sparse .issparse (X ):
451+ logger .info ("feather write {}" .format (self .name ))
452+ X .to_feather (data_feather_file )
453+ with open (feather_attribute_file , "wb" ) as fh :
454+ pickle .dump ((categorical , attribute_names ), fh , pickle .HIGHEST_PROTOCOL )
455+ else :
456+ logger .info ("pickle write {}" .format (self .name ))
457+ self .cache_format = 'pickle'
458+ with open (data_pickle_file , "wb" ) as fh :
459+ pickle .dump ((X , categorical , attribute_names ), fh , pickle .HIGHEST_PROTOCOL )
460+ logger .debug ("Saved dataset {did}: {name} to file {path}"
461+ .format (did = int (self .dataset_id or - 1 ),
462+ name = self .name ,
463+ path = data_pickle_file )
464+ )
465+ return data_pickle_file , data_feather_file , feather_attribute_file
434466
435467 def _load_data (self ):
436468 """ Load data from pickle or arff. Download data first if not present on disk. """
437- if self .data_pickle_file is None :
469+ if (self .cache_format == 'pickle' and self .data_pickle_file is None ) or \
470+ (self .cache_format == 'feather' and self .data_feather_file is None ):
438471 if self .data_file is None :
439472 self ._download_data ()
440- self .data_pickle_file = self ._create_pickle_in_cache (self .data_file )
473+ self .data_pickle_file , self .data_feather_file , self .feather_attribute_file = \
474+ self ._create_pickle_in_cache (self .data_file )
441475
442476 try :
443- with open (self .data_pickle_file , "rb" ) as fh :
444- data , categorical , attribute_names = pickle .load (fh )
477+ if self .cache_format == 'feather' :
478+ logger .info ("feather load data {}" .format (self .name ))
479+ data = pd .read_feather (self .data_feather_file )
480+
481+ with open (self .feather_attribute_file , "rb" ) as fh :
482+ categorical , attribute_names = pickle .load (fh )
483+ else :
484+ logger .info ("pickle load data {}" .format (self .name ))
485+ with open (self .data_pickle_file , "rb" ) as fh :
486+ data , categorical , attribute_names = pickle .load (fh )
445487 except EOFError :
446488 logger .warning (
447489 "Detected a corrupt cache file loading dataset %d: '%s'. "
0 commit comments