@@ -217,16 +217,14 @@ def find_invalid_characters(string, pattern):
217217 self .qualities = None
218218
219219 if data_file is not None :
220- rval = self ._create_pickle_in_cache (data_file )
221- self .data_pickle_file = rval [0 ] # type: Optional[str]
222- self .data_feather_file = rval [1 ] # type: Optional[str]
223- self .feather_attribute_file = rval [2 ] # type: Optional[str]
220+ rval = self ._compressed_cache_file_paths (data_file )
221+ self .data_pickle_file = rval [0 ] if os . path . exists ( rval [ 0 ]) else None
222+ self .data_feather_file = rval [1 ] if os . path . exists ( rval [ 1 ]) else None
223+ self .feather_attribute_file = rval [2 ] if os . path . exists ( rval [ 2 ]) else None
224224 else :
225- self .data_pickle_file , self .data_feather_file , self .feather_attribute_file = (
226- None ,
227- None ,
228- None ,
229- )
225+ self .data_pickle_file = None
226+ self .data_feather_file = None
227+ self .feather_attribute_file = None
230228
231229 @property
232230 def id (self ) -> Optional [int ]:
@@ -455,152 +453,97 @@ def _parse_data_from_arff(
455453
456454 return X , categorical , attribute_names
457455
458- def _create_pickle_in_cache (self , data_file : str ) -> Tuple [str , str , str ]:
459- """ Parse the arff and pickle the result. Update any old pickle objects. """
456+ def _compressed_cache_file_paths (self , data_file : str ) -> Tuple [str , str , str ]:
460457 data_pickle_file = data_file .replace (".arff" , ".pkl.py3" )
461458 data_feather_file = data_file .replace (".arff" , ".feather" )
462459 feather_attribute_file = data_file .replace (".arff" , ".feather.attributes.pkl.py3" )
463- if os .path .exists (data_pickle_file ) and self .cache_format == "pickle" :
464- # Load the data to check if the pickle file is outdated (i.e. contains numpy array)
465- with open (data_pickle_file , "rb" ) as fh :
466- try :
467- data , categorical , attribute_names = pickle .load (fh )
468- except EOFError :
469- # The file is likely corrupt, see #780.
470- # We deal with this when loading the data in `_load_data`.
471- return data_pickle_file , data_feather_file , feather_attribute_file
472- except ModuleNotFoundError :
473- # There was some issue loading the file, see #918
474- # We deal with this when loading the data in `_load_data`.
475- return data_pickle_file , data_feather_file , feather_attribute_file
476- except ValueError as e :
477- if "unsupported pickle protocol" in e .args [0 ]:
478- # There was some issue loading the file, see #898
479- # We deal with this when loading the data in `_load_data`.
480- return data_pickle_file , data_feather_file , feather_attribute_file
481- else :
482- raise
483-
484- # Between v0.8 and v0.9 the format of pickled data changed from
485- # np.ndarray to pd.DataFrame. This breaks some backwards compatibility,
486- # e.g. for `run_model_on_task`. If a local file still exists with
487- # np.ndarray data, we reprocess the data file to store a pickled
488- # pd.DataFrame blob. See also #646.
489- if isinstance (data , pd .DataFrame ) or scipy .sparse .issparse (data ):
490- logger .debug ("Data pickle file already exists and is up to date." )
491- return data_pickle_file , data_feather_file , feather_attribute_file
492- elif os .path .exists (data_feather_file ) and self .cache_format == "feather" :
493- # Load the data to check if the pickle file is outdated (i.e. contains numpy array)
494- try :
495- data = pd .read_feather (data_feather_file )
496- except EOFError :
497- # The file is likely corrupt, see #780.
498- # We deal with this when loading the data in `_load_data`.
499- return data_pickle_file , data_feather_file , feather_attribute_file
500- except ModuleNotFoundError :
501- # There was some issue loading the file, see #918
502- # We deal with this when loading the data in `_load_data`.
503- return data_pickle_file , data_feather_file , feather_attribute_file
504- except ValueError as e :
505- if "unsupported pickle protocol" in e .args [0 ]:
506- # There was some issue loading the file, see #898
507- # We deal with this when loading the data in `_load_data`.
508- return data_pickle_file , data_feather_file , feather_attribute_file
509- else :
510- raise
460+ return data_pickle_file , data_feather_file , feather_attribute_file
511461
512- logger .debug ("Data feather file already exists and is up to date." )
513- return data_pickle_file , data_feather_file , feather_attribute_file
def _cache_compressed_file_from_arff(
    self, arff_file: str
) -> Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]]:
    """Store data from the arff file in compressed format.

    Sets ``cache_format`` to 'pickle' if data is sparse.

    Parameters
    ----------
    arff_file : str
        Path of the arff file to parse and cache.

    Returns
    -------
    tuple
        The ``(data, categorical, attribute_names)`` triple produced by
        ``_parse_data_from_arff``, so callers need not read the cache back.
    """
    (
        data_pickle_file,
        data_feather_file,
        feather_attribute_file,
    ) = self._compressed_cache_file_paths(arff_file)

    data, categorical, attribute_names = self._parse_data_from_arff(arff_file)

    # Feather format does not work for sparse datasets, so we use pickle for sparse datasets
    if scipy.sparse.issparse(data):
        self.cache_format = "pickle"

    logger.info(f"{self.cache_format} write {self.name}")
    if self.cache_format == "feather":
        # Write the attribute sidecar before the feather file so that an
        # interruption between the two writes cannot leave a feather file
        # without its attributes.
        with open(feather_attribute_file, "wb") as fh:
            pickle.dump((categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
        data.to_feather(data_feather_file)
        cache_file = data_feather_file
    else:
        with open(data_pickle_file, "wb") as fh:
            pickle.dump((data, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
        cache_file = data_pickle_file

    # `cache_file` is set by the branch that actually wrote, so the logged
    # path always matches the file on disk.
    logger.debug(f"Saved dataset {int(self.dataset_id or -1)}: {self.name} to file {cache_file}")
    return data, categorical, attribute_names
537490
538491 def _load_data (self ):
539- """ Load data from pickle or arff. Download data first if not present on disk. """
540- if (self .cache_format == "pickle" and self .data_pickle_file is None ) or (
541- self .cache_format == "feather" and self .data_feather_file is None
542- ):
492+ """ Load data from compressed format or arff. Download data if not present on disk. """
493+ need_to_create_pickle = self .cache_format == "pickle" and self .data_pickle_file is None
494+ need_to_create_feather = self .cache_format == "feather" and self .data_feather_file is None
495+
496+ if need_to_create_pickle or need_to_create_feather :
543497 if self .data_file is None :
544498 self ._download_data ()
545- (
546- self .data_pickle_file ,
547- self .data_feather_file ,
548- self .feather_attribute_file ,
549- ) = self ._create_pickle_in_cache (self .data_file )
550-
499+ res = self ._compressed_cache_file_paths (self .data_file )
500+ self .data_pickle_file , self .data_feather_file , self .feather_attribute_file = res
501+ # Since our recently stored data is exists in memory, there is no need to load from disk
502+ return self ._cache_compressed_file_from_arff (self .data_file )
503+
504+ # helper variable to help identify where errors occur
505+ fpath = self .data_feather_file if self .cache_format == "feather" else self .data_pickle_file
506+ logger .info (f"{ self .cache_format } load data { self .name } " )
551507 try :
552508 if self .cache_format == "feather" :
553- logger .info ("feather load data {}" .format (self .name ))
554509 data = pd .read_feather (self .data_feather_file )
555-
510+ fpath = self . feather_attribute_file
556511 with open (self .feather_attribute_file , "rb" ) as fh :
557512 categorical , attribute_names = pickle .load (fh )
558513 else :
559- logger .info ("pickle load data {}" .format (self .name ))
560514 with open (self .data_pickle_file , "rb" ) as fh :
561515 data , categorical , attribute_names = pickle .load (fh )
562- except EOFError :
563- logger .warning (
564- "Detected a corrupt cache file loading dataset %d: '%s'. "
565- "We will continue loading data from the arff-file, "
566- "but this will be much slower for big datasets. "
567- "Please manually delete the cache file if you want OpenML-Python "
568- "to attempt to reconstruct it."
569- "" % (self .dataset_id , self .data_pickle_file )
570- )
571- data , categorical , attribute_names = self ._parse_data_from_arff (self .data_file )
572516 except FileNotFoundError :
573- raise ValueError (
574- "Cannot find a pickle file for dataset {} at "
575- "location {} " .format (self .name , self .data_pickle_file )
576- )
577- except ModuleNotFoundError as e :
517+ raise ValueError (f"Cannot find file for dataset { self .name } at location '{ fpath } '." )
518+ except (EOFError , ModuleNotFoundError , ValueError ) as e :
519+ error_message = e .message if hasattr (e , "message" ) else e .args [0 ]
520+ hint = ""
521+
522+ if isinstance (e , EOFError ):
523+ readable_error = "Detected a corrupt cache file"
524+ elif isinstance (e , ModuleNotFoundError ):
525+ readable_error = "Detected likely dependency issues"
526+ hint = "This is most likely due to https://github.com/openml/openml-python/issues/918. " # noqa: 501
527+ elif isinstance (e , ValueError ) and "unsupported pickle protocol" in e .args [0 ]:
528+ readable_error = "Encountered unsupported pickle protocol"
529+ else :
530+ raise # an unknown ValueError is raised, should crash and file bug report
531+
578532 logger .warning (
579- "Encountered error message when loading cached dataset %d: '%s '. "
580- "Error message was: %s. "
581- "This is most likely due to https://github.com/openml/openml-python/issues/918 . "
533+ f" { readable_error } when loading dataset { self . id } from ' { fpath } '. "
534+ f" { hint } "
535+ f"Error message was: { error_message } . "
582536 "We will continue loading data from the arff-file, "
583537 "but this will be much slower for big datasets. "
584538 "Please manually delete the cache file if you want OpenML-Python "
585539 "to attempt to reconstruct it."
586- "" % (self .dataset_id , self .data_pickle_file , e .args [0 ]),
587540 )
588541 data , categorical , attribute_names = self ._parse_data_from_arff (self .data_file )
589- except ValueError as e :
590- if "unsupported pickle protocol" in e .args [0 ]:
591- logger .warning (
592- "Encountered unsupported pickle protocol when loading cached dataset %d: '%s'. "
593- "Error message was: %s. "
594- "We will continue loading data from the arff-file, "
595- "but this will be much slower for big datasets. "
596- "Please manually delete the cache file if you want OpenML-Python "
597- "to attempt to reconstruct it."
598- "" % (self .dataset_id , self .data_pickle_file , e .args [0 ]),
599- )
600- data , categorical , attribute_names = self ._parse_data_from_arff (self .data_file )
601- else :
602- raise
603542
543+ data_up_to_date = isinstance (data , pd .DataFrame ) or scipy .sparse .issparse (data )
544+ if self .cache_format == "pickle" and not data_up_to_date :
545+ logger .info ("Updating outdated pickle file." )
546+ return self ._cache_compressed_file_from_arff (self .data_file )
604547 return data , categorical , attribute_names
605548
606549 @staticmethod
0 commit comments