Skip to content

Commit e074c14

Browse files
authored
Refactor data loading/storing (openml#1018)
* Refactor flow of loading/compressing data There was a lot of code duplication, and the general flow of loading/storing the data in compressed format was hard to navigate. * Only set data file members for files that exist * Call get_data to create compressed pickle Otherwise the data would actually be loaded from arff (first load). * Add data load refactor * Revert aggressive text replacement from PyCharm My editor incorrectly renamed too many instances of 'data_file' to 'arff_file'. * Avoid duplicate exists/isdir
1 parent fba6aab commit e074c14

4 files changed

Lines changed: 71 additions & 124 deletions

File tree

doc/progress.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Changelog
88

99
0.11.1
1010
~~~~~~
11+
* MAINT #1018 : Refactor data loading and storage. Data is now compressed on the first call to `get_data`.
1112
* MAINT #891: Changed the way that numerical features are stored. Numerical features that range from 0 to 255 are now stored as uint8, which reduces the storage space required as well as storing and loading times.
1213
* MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets in contrast to querying all datasets. Modified the corresponding unit test.
1314
* FIX #964 : Validate `ignore_attribute`, `default_target_attribute`, `row_id_attribute` are set to attributes that exist on the dataset when calling ``create_dataset``.

openml/datasets/dataset.py

Lines changed: 65 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -217,16 +217,14 @@ def find_invalid_characters(string, pattern):
217217
self.qualities = None
218218

219219
if data_file is not None:
220-
rval = self._create_pickle_in_cache(data_file)
221-
self.data_pickle_file = rval[0] # type: Optional[str]
222-
self.data_feather_file = rval[1] # type: Optional[str]
223-
self.feather_attribute_file = rval[2] # type: Optional[str]
220+
rval = self._compressed_cache_file_paths(data_file)
221+
self.data_pickle_file = rval[0] if os.path.exists(rval[0]) else None
222+
self.data_feather_file = rval[1] if os.path.exists(rval[1]) else None
223+
self.feather_attribute_file = rval[2] if os.path.exists(rval[2]) else None
224224
else:
225-
self.data_pickle_file, self.data_feather_file, self.feather_attribute_file = (
226-
None,
227-
None,
228-
None,
229-
)
225+
self.data_pickle_file = None
226+
self.data_feather_file = None
227+
self.feather_attribute_file = None
230228

231229
@property
232230
def id(self) -> Optional[int]:
@@ -455,152 +453,97 @@ def _parse_data_from_arff(
455453

456454
return X, categorical, attribute_names
457455

458-
def _create_pickle_in_cache(self, data_file: str) -> Tuple[str, str, str]:
459-
""" Parse the arff and pickle the result. Update any old pickle objects. """
456+
def _compressed_cache_file_paths(self, data_file: str) -> Tuple[str, str, str]:
460457
data_pickle_file = data_file.replace(".arff", ".pkl.py3")
461458
data_feather_file = data_file.replace(".arff", ".feather")
462459
feather_attribute_file = data_file.replace(".arff", ".feather.attributes.pkl.py3")
463-
if os.path.exists(data_pickle_file) and self.cache_format == "pickle":
464-
# Load the data to check if the pickle file is outdated (i.e. contains numpy array)
465-
with open(data_pickle_file, "rb") as fh:
466-
try:
467-
data, categorical, attribute_names = pickle.load(fh)
468-
except EOFError:
469-
# The file is likely corrupt, see #780.
470-
# We deal with this when loading the data in `_load_data`.
471-
return data_pickle_file, data_feather_file, feather_attribute_file
472-
except ModuleNotFoundError:
473-
# There was some issue loading the file, see #918
474-
# We deal with this when loading the data in `_load_data`.
475-
return data_pickle_file, data_feather_file, feather_attribute_file
476-
except ValueError as e:
477-
if "unsupported pickle protocol" in e.args[0]:
478-
# There was some issue loading the file, see #898
479-
# We deal with this when loading the data in `_load_data`.
480-
return data_pickle_file, data_feather_file, feather_attribute_file
481-
else:
482-
raise
483-
484-
# Between v0.8 and v0.9 the format of pickled data changed from
485-
# np.ndarray to pd.DataFrame. This breaks some backwards compatibility,
486-
# e.g. for `run_model_on_task`. If a local file still exists with
487-
# np.ndarray data, we reprocess the data file to store a pickled
488-
# pd.DataFrame blob. See also #646.
489-
if isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data):
490-
logger.debug("Data pickle file already exists and is up to date.")
491-
return data_pickle_file, data_feather_file, feather_attribute_file
492-
elif os.path.exists(data_feather_file) and self.cache_format == "feather":
493-
# Load the data to check if the pickle file is outdated (i.e. contains numpy array)
494-
try:
495-
data = pd.read_feather(data_feather_file)
496-
except EOFError:
497-
# The file is likely corrupt, see #780.
498-
# We deal with this when loading the data in `_load_data`.
499-
return data_pickle_file, data_feather_file, feather_attribute_file
500-
except ModuleNotFoundError:
501-
# There was some issue loading the file, see #918
502-
# We deal with this when loading the data in `_load_data`.
503-
return data_pickle_file, data_feather_file, feather_attribute_file
504-
except ValueError as e:
505-
if "unsupported pickle protocol" in e.args[0]:
506-
# There was some issue loading the file, see #898
507-
# We deal with this when loading the data in `_load_data`.
508-
return data_pickle_file, data_feather_file, feather_attribute_file
509-
else:
510-
raise
460+
return data_pickle_file, data_feather_file, feather_attribute_file
511461

512-
logger.debug("Data feather file already exists and is up to date.")
513-
return data_pickle_file, data_feather_file, feather_attribute_file
462+
def _cache_compressed_file_from_arff(
463+
self, arff_file: str
464+
) -> Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]]:
465+
""" Store data from the arff file in compressed format. Sets cache_format to 'pickle' if data is sparse. """ # noqa: 501
466+
(
467+
data_pickle_file,
468+
data_feather_file,
469+
feather_attribute_file,
470+
) = self._compressed_cache_file_paths(arff_file)
514471

515-
# At this point either the pickle file does not exist, or it had outdated formatting.
516-
# We parse the data from arff again and populate the cache with a recent pickle file.
517-
X, categorical, attribute_names = self._parse_data_from_arff(data_file)
472+
data, categorical, attribute_names = self._parse_data_from_arff(arff_file)
518473

519474
# Feather format does not work for sparse datasets, so we use pickle for sparse datasets
475+
if scipy.sparse.issparse(data):
476+
self.cache_format = "pickle"
520477

521-
if self.cache_format == "feather" and not scipy.sparse.issparse(X):
522-
logger.info("feather write {}".format(self.name))
523-
X.to_feather(data_feather_file)
478+
logger.info(f"{self.cache_format} write {self.name}")
479+
if self.cache_format == "feather":
480+
data.to_feather(data_feather_file)
524481
with open(feather_attribute_file, "wb") as fh:
525482
pickle.dump((categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
526483
else:
527-
logger.info("pickle write {}".format(self.name))
528-
self.cache_format = "pickle"
529484
with open(data_pickle_file, "wb") as fh:
530-
pickle.dump((X, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
531-
logger.debug(
532-
"Saved dataset {did}: {name} to file {path}".format(
533-
did=int(self.dataset_id or -1), name=self.name, path=data_pickle_file
534-
)
535-
)
536-
return data_pickle_file, data_feather_file, feather_attribute_file
485+
pickle.dump((data, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
486+
487+
data_file = data_pickle_file if self.cache_format == "pickle" else data_feather_file
488+
logger.debug(f"Saved dataset {int(self.dataset_id or -1)}: {self.name} to file {data_file}")
489+
return data, categorical, attribute_names
537490

538491
def _load_data(self):
539-
""" Load data from pickle or arff. Download data first if not present on disk. """
540-
if (self.cache_format == "pickle" and self.data_pickle_file is None) or (
541-
self.cache_format == "feather" and self.data_feather_file is None
542-
):
492+
""" Load data from compressed format or arff. Download data if not present on disk. """
493+
need_to_create_pickle = self.cache_format == "pickle" and self.data_pickle_file is None
494+
need_to_create_feather = self.cache_format == "feather" and self.data_feather_file is None
495+
496+
if need_to_create_pickle or need_to_create_feather:
543497
if self.data_file is None:
544498
self._download_data()
545-
(
546-
self.data_pickle_file,
547-
self.data_feather_file,
548-
self.feather_attribute_file,
549-
) = self._create_pickle_in_cache(self.data_file)
550-
499+
res = self._compressed_cache_file_paths(self.data_file)
500+
self.data_pickle_file, self.data_feather_file, self.feather_attribute_file = res
501+
# Since our recently stored data exists in memory, there is no need to load from disk
502+
return self._cache_compressed_file_from_arff(self.data_file)
503+
504+
# helper variable to help identify where errors occur
505+
fpath = self.data_feather_file if self.cache_format == "feather" else self.data_pickle_file
506+
logger.info(f"{self.cache_format} load data {self.name}")
551507
try:
552508
if self.cache_format == "feather":
553-
logger.info("feather load data {}".format(self.name))
554509
data = pd.read_feather(self.data_feather_file)
555-
510+
fpath = self.feather_attribute_file
556511
with open(self.feather_attribute_file, "rb") as fh:
557512
categorical, attribute_names = pickle.load(fh)
558513
else:
559-
logger.info("pickle load data {}".format(self.name))
560514
with open(self.data_pickle_file, "rb") as fh:
561515
data, categorical, attribute_names = pickle.load(fh)
562-
except EOFError:
563-
logger.warning(
564-
"Detected a corrupt cache file loading dataset %d: '%s'. "
565-
"We will continue loading data from the arff-file, "
566-
"but this will be much slower for big datasets. "
567-
"Please manually delete the cache file if you want OpenML-Python "
568-
"to attempt to reconstruct it."
569-
"" % (self.dataset_id, self.data_pickle_file)
570-
)
571-
data, categorical, attribute_names = self._parse_data_from_arff(self.data_file)
572516
except FileNotFoundError:
573-
raise ValueError(
574-
"Cannot find a pickle file for dataset {} at "
575-
"location {} ".format(self.name, self.data_pickle_file)
576-
)
577-
except ModuleNotFoundError as e:
517+
raise ValueError(f"Cannot find file for dataset {self.name} at location '{fpath}'.")
518+
except (EOFError, ModuleNotFoundError, ValueError) as e:
519+
error_message = e.message if hasattr(e, "message") else e.args[0]
520+
hint = ""
521+
522+
if isinstance(e, EOFError):
523+
readable_error = "Detected a corrupt cache file"
524+
elif isinstance(e, ModuleNotFoundError):
525+
readable_error = "Detected likely dependency issues"
526+
hint = "This is most likely due to https://github.com/openml/openml-python/issues/918. " # noqa: 501
527+
elif isinstance(e, ValueError) and "unsupported pickle protocol" in e.args[0]:
528+
readable_error = "Encountered unsupported pickle protocol"
529+
else:
530+
raise # an unknown ValueError is raised, should crash and file bug report
531+
578532
logger.warning(
579-
"Encountered error message when loading cached dataset %d: '%s'. "
580-
"Error message was: %s. "
581-
"This is most likely due to https://github.com/openml/openml-python/issues/918. "
533+
f"{readable_error} when loading dataset {self.id} from '{fpath}'. "
534+
f"{hint}"
535+
f"Error message was: {error_message}. "
582536
"We will continue loading data from the arff-file, "
583537
"but this will be much slower for big datasets. "
584538
"Please manually delete the cache file if you want OpenML-Python "
585539
"to attempt to reconstruct it."
586-
"" % (self.dataset_id, self.data_pickle_file, e.args[0]),
587540
)
588541
data, categorical, attribute_names = self._parse_data_from_arff(self.data_file)
589-
except ValueError as e:
590-
if "unsupported pickle protocol" in e.args[0]:
591-
logger.warning(
592-
"Encountered unsupported pickle protocol when loading cached dataset %d: '%s'. "
593-
"Error message was: %s. "
594-
"We will continue loading data from the arff-file, "
595-
"but this will be much slower for big datasets. "
596-
"Please manually delete the cache file if you want OpenML-Python "
597-
"to attempt to reconstruct it."
598-
"" % (self.dataset_id, self.data_pickle_file, e.args[0]),
599-
)
600-
data, categorical, attribute_names = self._parse_data_from_arff(self.data_file)
601-
else:
602-
raise
603542

543+
data_up_to_date = isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data)
544+
if self.cache_format == "pickle" and not data_up_to_date:
545+
logger.info("Updating outdated pickle file.")
546+
return self._cache_compressed_file_from_arff(self.data_file)
604547
return data, categorical, attribute_names
605548

606549
@staticmethod

openml/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -305,9 +305,9 @@ def _create_cache_directory_for_id(key, id_):
305305
Path of the created dataset cache directory.
306306
"""
307307
cache_dir = os.path.join(_create_cache_directory(key), str(id_))
308-
if os.path.exists(cache_dir) and os.path.isdir(cache_dir):
308+
if os.path.isdir(cache_dir):
309309
pass
310-
elif os.path.exists(cache_dir) and not os.path.isdir(cache_dir):
310+
elif os.path.exists(cache_dir):
311311
raise ValueError("%s cache dir exists but is not a directory!" % key)
312312
else:
313313
os.makedirs(cache_dir)

tests/test_datasets/test_dataset_functions.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1258,6 +1258,8 @@ def test_list_qualities(self):
12581258

12591259
def test_get_dataset_cache_format_pickle(self):
12601260
dataset = openml.datasets.get_dataset(1)
1261+
dataset.get_data()
1262+
12611263
self.assertEqual(type(dataset), OpenMLDataset)
12621264
self.assertEqual(dataset.name, "anneal")
12631265
self.assertGreater(len(dataset.features), 1)
@@ -1272,6 +1274,7 @@ def test_get_dataset_cache_format_pickle(self):
12721274
def test_get_dataset_cache_format_feather(self):
12731275

12741276
dataset = openml.datasets.get_dataset(128, cache_format="feather")
1277+
dataset.get_data()
12751278

12761279
# Check if dataset is written to cache directory using feather
12771280
cache_dir = openml.config.get_cache_directory()

0 commit comments

Comments
 (0)