Skip to content

Commit 07d429c

Browse files
authored
Feather investigation (openml#894)
* init feather implementation * sparse matrix * test notebook * feather pickle compare * test arrow vs feather * add columns condition * Testing * get_dataset add cache format * add pyarrow * sparse matrix check * pep8 and remove files * return type * fix type annotation * value check * change feather condition * fixes and test * fix errors * testing file * feather new file for attributes * change feather attribute file path * delete testing file * testing changes * delete pkls * fixes * fixes * add comments * change default caching * pip version * review comment fixes * newline * fix if condition * Update install.sh * pandas version due to sparse data * review openml#2 * Update appveyor.yml * Update appveyor.yml * rename cache dir
1 parent b37b261 commit 07d429c

7 files changed

Lines changed: 123 additions & 28 deletions

File tree

appveyor.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ environment:
55
# CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\appveyor\\scikit-learn-contrib\\run_with_env.cmd"
66

77
matrix:
8-
- PYTHON: "C:\\Python35-x64"
9-
PYTHON_VERSION: "3.5"
8+
- PYTHON: "C:\\Python3-x64"
9+
PYTHON_VERSION: "3.6"
1010
PYTHON_ARCH: "64"
11-
MINICONDA: "C:\\Miniconda35-x64"
11+
MINICONDA: "C:\\Miniconda36-x64"
1212

1313
matrix:
1414
fast_finish: true

ci_scripts/install.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ fi
3535
python --version
3636

3737
if [[ "$TEST_DIST" == "true" ]]; then
38-
pip install twine nbconvert jupyter_client matplotlib pytest pytest-xdist pytest-timeout \
38+
pip install twine nbconvert jupyter_client matplotlib pyarrow pytest pytest-xdist pytest-timeout \
3939
nbformat oslo.concurrency flaky
4040
python setup.py sdist
4141
# Find file which was modified last as done in https://stackoverflow.com/a/4561987

doc/progress.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ Changelog
1515
logging to console and file.
1616
* MAINT #767: Source distribution installation is now unit-tested.
1717
* MAINT #865: OpenML no longer bundles test files in the source distribution.
18+
* ADD #894: Support caching of datasets using feather format as an option.
1819

1920
0.10.2
2021
~~~~~~

openml/datasets/dataset.py

Lines changed: 61 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ class OpenMLDataset(OpenMLBase):
3636
Description of the dataset.
3737
format : str
3838
Format of the dataset which can be either 'arff' or 'sparse_arff'.
39+
cache_format : str
40+
Format for caching the dataset which can be either 'feather' or 'pickle'.
3941
dataset_id : int, optional
4042
Id autogenerated by the server.
4143
version : int, optional
@@ -99,7 +101,8 @@ class OpenMLDataset(OpenMLBase):
99101
Serialized arff dataset string.
100102
"""
101103
def __init__(self, name, description, format=None,
102-
data_format='arff', dataset_id=None, version=None,
104+
data_format='arff', cache_format='pickle',
105+
dataset_id=None, version=None,
103106
creator=None, contributor=None, collection_date=None,
104107
upload_date=None, language=None, licence=None,
105108
url=None, default_target_attribute=None,
@@ -127,6 +130,11 @@ def __init__(self, name, description, format=None,
127130
self.name = name
128131
self.version = int(version) if version is not None else None
129132
self.description = description
133+
if cache_format not in ['feather', 'pickle']:
134+
raise ValueError("cache_format must be one of 'feather' or 'pickle'. "
135+
"Invalid format specified: {}".format(cache_format))
136+
137+
self.cache_format = cache_format
130138
if format is None:
131139
self.format = data_format
132140
else:
@@ -180,9 +188,11 @@ def __init__(self, name, description, format=None,
180188
self.qualities = _check_qualities(qualities)
181189

182190
if data_file is not None:
183-
self.data_pickle_file = self._create_pickle_in_cache(data_file)
191+
self.data_pickle_file, self.data_feather_file,\
192+
self.feather_attribute_file = self._create_pickle_in_cache(data_file)
184193
else:
185-
self.data_pickle_file = None
194+
self.data_pickle_file, self.data_feather_file, \
195+
self.feather_attribute_file = None, None, None
186196

187197
@property
188198
def id(self) -> Optional[int]:
@@ -396,18 +406,20 @@ def _parse_data_from_arff(
396406

397407
return X, categorical, attribute_names
398408

399-
def _create_pickle_in_cache(self, data_file: str) -> str:
409+
def _create_pickle_in_cache(self, data_file: str) -> Tuple[str, str, str]:
400410
""" Parse the arff and pickle the result. Update any old pickle objects. """
401411
data_pickle_file = data_file.replace('.arff', '.pkl.py3')
402-
if os.path.exists(data_pickle_file):
412+
data_feather_file = data_file.replace('.arff', '.feather')
413+
feather_attribute_file = data_file.replace('.arff', '.feather.attributes.pkl.py3')
414+
if os.path.exists(data_pickle_file) and self.cache_format == 'pickle':
403415
# Load the data to check if the pickle file is outdated (i.e. contains numpy array)
404416
with open(data_pickle_file, "rb") as fh:
405417
try:
406418
data, categorical, attribute_names = pickle.load(fh)
407419
except EOFError:
408420
# The file is likely corrupt, see #780.
409421
# We deal with this when loading the data in `_load_data`.
410-
return data_pickle_file
422+
return data_pickle_file, data_feather_file, feather_attribute_file
411423

412424
# Between v0.8 and v0.9 the format of pickled data changed from
413425
# np.ndarray to pd.DataFrame. This breaks some backwards compatibility,
@@ -416,32 +428,62 @@ def _create_pickle_in_cache(self, data_file: str) -> str:
416428
# pd.DataFrame blob. See also #646.
417429
if isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data):
418430
logger.debug("Data pickle file already exists and is up to date.")
419-
return data_pickle_file
431+
return data_pickle_file, data_feather_file, feather_attribute_file
432+
elif os.path.exists(data_feather_file) and self.cache_format == 'feather':
433+
# Load the data to check if the feather file is readable (i.e. not corrupt)
434+
try:
435+
data = pd.read_feather(data_feather_file)
436+
except EOFError:
437+
# The file is likely corrupt, see #780.
438+
# We deal with this when loading the data in `_load_data`.
439+
return data_pickle_file, data_feather_file, feather_attribute_file
440+
441+
logger.debug("Data feather file already exists and is up to date.")
442+
return data_pickle_file, data_feather_file, feather_attribute_file
420443

421444
# At this point either the pickle file does not exist, or it had outdated formatting.
422445
# We parse the data from arff again and populate the cache with a recent pickle file.
423446
X, categorical, attribute_names = self._parse_data_from_arff(data_file)
424447

425-
with open(data_pickle_file, "wb") as fh:
426-
pickle.dump((X, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
427-
logger.debug("Saved dataset {did}: {name} to file {path}"
428-
.format(did=int(self.dataset_id or -1),
429-
name=self.name,
430-
path=data_pickle_file)
431-
)
448+
# Feather format does not work for sparse datasets, so we use pickle for sparse datasets
432449

433-
return data_pickle_file
450+
if self.cache_format == "feather" and not scipy.sparse.issparse(X):
451+
logger.info("feather write {}".format(self.name))
452+
X.to_feather(data_feather_file)
453+
with open(feather_attribute_file, "wb") as fh:
454+
pickle.dump((categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
455+
else:
456+
logger.info("pickle write {}".format(self.name))
457+
self.cache_format = 'pickle'
458+
with open(data_pickle_file, "wb") as fh:
459+
pickle.dump((X, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
460+
logger.debug("Saved dataset {did}: {name} to file {path}"
461+
.format(did=int(self.dataset_id or -1),
462+
name=self.name,
463+
path=data_pickle_file)
464+
)
465+
return data_pickle_file, data_feather_file, feather_attribute_file
434466

435467
def _load_data(self):
436468
""" Load data from pickle or arff. Download data first if not present on disk. """
437-
if self.data_pickle_file is None:
469+
if (self.cache_format == 'pickle' and self.data_pickle_file is None) or \
470+
(self.cache_format == 'feather' and self.data_feather_file is None):
438471
if self.data_file is None:
439472
self._download_data()
440-
self.data_pickle_file = self._create_pickle_in_cache(self.data_file)
473+
self.data_pickle_file, self.data_feather_file, self.feather_attribute_file = \
474+
self._create_pickle_in_cache(self.data_file)
441475

442476
try:
443-
with open(self.data_pickle_file, "rb") as fh:
444-
data, categorical, attribute_names = pickle.load(fh)
477+
if self.cache_format == 'feather':
478+
logger.info("feather load data {}".format(self.name))
479+
data = pd.read_feather(self.data_feather_file)
480+
481+
with open(self.feather_attribute_file, "rb") as fh:
482+
categorical, attribute_names = pickle.load(fh)
483+
else:
484+
logger.info("pickle load data {}".format(self.name))
485+
with open(self.data_pickle_file, "rb") as fh:
486+
data, categorical, attribute_names = pickle.load(fh)
445487
except EOFError:
446488
logger.warning(
447489
"Detected a corrupt cache file loading dataset %d: '%s'. "

openml/datasets/functions.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -451,7 +451,8 @@ def get_dataset(
451451
dataset_id: Union[int, str],
452452
download_data: bool = True,
453453
version: int = None,
454-
error_if_multiple: bool = False
454+
error_if_multiple: bool = False,
455+
cache_format: str = 'pickle'
455456
) -> OpenMLDataset:
456457
""" Download the OpenML dataset representation, optionally also download actual data file.
457458
@@ -479,12 +480,19 @@ def get_dataset(
479480
If no version is specified, retrieve the least recent still active version.
480481
error_if_multiple : bool, optional (default=False)
481482
If ``True`` raise an error if multiple datasets are found with matching criteria.
482-
483+
cache_format : str, optional (default='pickle')
484+
Format for caching the dataset - may be feather or pickle
485+
Note that the default 'pickle' option may load slower than feather when
486+
the number of rows is very high.
483487
Returns
484488
-------
485489
dataset : :class:`openml.OpenMLDataset`
486490
The downloaded dataset.
487491
"""
492+
if cache_format not in ['feather', 'pickle']:
493+
raise ValueError("cache_format must be one of 'feather' or 'pickle'. "
494+
"Invalid format specified: {}".format(cache_format))
495+
488496
if isinstance(dataset_id, str):
489497
try:
490498
dataset_id = int(dataset_id)
@@ -527,7 +535,7 @@ def get_dataset(
527535
did_cache_dir)
528536

529537
dataset = _create_dataset_from_description(
530-
description, features, qualities, arff_file
538+
description, features, qualities, arff_file, cache_format
531539
)
532540
return dataset
533541

@@ -975,6 +983,7 @@ def _create_dataset_from_description(
975983
features: Dict,
976984
qualities: List,
977985
arff_file: str = None,
986+
cache_format: str = 'pickle',
978987
) -> OpenMLDataset:
979988
"""Create a dataset object from a description dict.
980989
@@ -988,6 +997,8 @@ def _create_dataset_from_description(
988997
Description of a dataset qualities.
989998
arff_file : string, optional
990999
Path of dataset ARFF file.
1000+
cache_format: string, optional
1001+
Caching option for datasets (feather/pickle)
9911002
9921003
Returns
9931004
-------
@@ -1019,6 +1030,7 @@ def _create_dataset_from_description(
10191030
update_comment=description.get("oml:update_comment"),
10201031
md5_checksum=description.get("oml:md5_checksum"),
10211032
data_file=arff_file,
1033+
cache_format=cache_format,
10221034
features=features,
10231035
qualities=qualities,
10241036
)

setup.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,9 @@
4949
'requests',
5050
'scikit-learn>=0.18',
5151
'python-dateutil', # Installed through pandas anyway.
52-
'pandas>=0.19.2',
52+
'pandas>=0.19.2, <1.0.0',
5353
'scipy>=0.13.3',
54-
'numpy>=1.6.2'
54+
'numpy>=1.6.2',
5555
],
5656
extras_require={
5757
'test': [
@@ -64,6 +64,7 @@
6464
'nbformat',
6565
'oslo.concurrency',
6666
'flaky',
67+
'pyarrow'
6768
],
6869
'examples': [
6970
'matplotlib',

tests/test_datasets/test_dataset_functions.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1316,3 +1316,42 @@ def test_list_qualities(self):
13161316
qualities = openml.datasets.list_qualities()
13171317
self.assertEqual(isinstance(qualities, list), True)
13181318
self.assertEqual(all([isinstance(q, str) for q in qualities]), True)
1319+
1320+
def test_get_dataset_cache_format_pickle(self):
1321+
dataset = openml.datasets.get_dataset(1)
1322+
self.assertEqual(type(dataset), OpenMLDataset)
1323+
self.assertEqual(dataset.name, 'anneal')
1324+
self.assertGreater(len(dataset.features), 1)
1325+
self.assertGreater(len(dataset.qualities), 4)
1326+
1327+
X, y, categorical, attribute_names = dataset.get_data()
1328+
self.assertIsInstance(X, pd.DataFrame)
1329+
self.assertEqual(X.shape, (898, 39))
1330+
self.assertEqual(len(categorical), X.shape[1])
1331+
self.assertEqual(len(attribute_names), X.shape[1])
1332+
1333+
def test_get_dataset_cache_format_feather(self):
1334+
1335+
dataset = openml.datasets.get_dataset(128, cache_format='feather')
1336+
1337+
# Check if dataset is written to cache directory using feather
1338+
cache_dir = openml.config.get_cache_directory()
1339+
cache_dir_for_id = os.path.join(cache_dir, 'datasets', '128')
1340+
feather_file = os.path.join(cache_dir_for_id, 'dataset.feather')
1341+
pickle_file = os.path.join(cache_dir_for_id, 'dataset.feather.attributes.pkl.py3')
1342+
data = pd.read_feather(feather_file)
1343+
self.assertTrue(os.path.isfile(feather_file), msg='Feather file is missing')
1344+
self.assertTrue(os.path.isfile(pickle_file), msg='Attributes pickle file is missing')
1345+
self.assertEqual(data.shape, (150, 5))
1346+
1347+
# Check if get_data is able to retrieve feather data
1348+
self.assertEqual(type(dataset), OpenMLDataset)
1349+
self.assertEqual(dataset.name, 'iris')
1350+
self.assertGreater(len(dataset.features), 1)
1351+
self.assertGreater(len(dataset.qualities), 4)
1352+
1353+
X, y, categorical, attribute_names = dataset.get_data()
1354+
self.assertIsInstance(X, pd.DataFrame)
1355+
self.assertEqual(X.shape, (150, 5))
1356+
self.assertEqual(len(categorical), X.shape[1])
1357+
self.assertEqual(len(attribute_names), X.shape[1])

0 commit comments

Comments
 (0)