Skip to content

Commit 07d429c

Browse files
authored
Feather investigation (openml#894)
* init feather implementation * sparse matrix * test notebook * feather pickle compare * test arrow vs feather * add columns condition * Testing * get_dataset add cache format * add pyarrow * sparse matrix check * pep8 and remove files * return type * fix type annotation * value check * change feather condition * fixes and test * fix errors * testing file * feather new file for attributes * change feather attribute file path * delete testing file * testing changes * delete pkls * fixes * fixes * add comments * change default caching * pip version * review comment fixes * newline * fix if condition * Update install.sh * pandas version due to sparse data * review openml#2 * Update appveyor.yml * Update appveyor.yml * rename cache dir
1 parent b37b261 commit 07d429c

7 files changed

Lines changed: 123 additions & 28 deletions

File tree

appveyor.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ environment:
55
# CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\appveyor\\scikit-learn-contrib\\run_with_env.cmd"
66

77
matrix:
8-
- PYTHON: "C:\\Python35-x64"
9-
PYTHON_VERSION: "3.5"
8+
- PYTHON: "C:\\Python3-x64"
9+
PYTHON_VERSION: "3.6"
1010
PYTHON_ARCH: "64"
11-
MINICONDA: "C:\\Miniconda35-x64"
11+
MINICONDA: "C:\\Miniconda36-x64"
1212

1313
matrix:
1414
fast_finish: true

ci_scripts/install.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ fi
3535
python --version
3636

3737
if [[ "$TEST_DIST" == "true" ]]; then
38-
pip install twine nbconvert jupyter_client matplotlib pytest pytest-xdist pytest-timeout \
38+
pip install twine nbconvert jupyter_client matplotlib pyarrow pytest pytest-xdist pytest-timeout \
3939
nbformat oslo.concurrency flaky
4040
python setup.py sdist
4141
# Find file which was modified last as done in https://stackoverflow.com/a/4561987

doc/progress.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ Changelog
1515
logging to console and file.
1616
* MAINT #767: Source distribution installation is now unit-tested.
1717
* MAINT #865: OpenML no longer bundles test files in the source distribution.
18+
* ADD #894: Support caching of datasets using feather format as an option.
1819

1920
0.10.2
2021
~~~~~~

openml/datasets/dataset.py

Lines changed: 61 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ class OpenMLDataset(OpenMLBase):
3636
Description of the dataset.
3737
format : str
3838
Format of the dataset which can be either 'arff' or 'sparse_arff'.
39+
cache_format : str
40+
Format for caching the dataset which can be either 'feather' or 'pickle'.
3941
dataset_id : int, optional
4042
Id autogenerated by the server.
4143
version : int, optional
@@ -99,7 +101,8 @@ class OpenMLDataset(OpenMLBase):
99101
Serialized arff dataset string.
100102
"""
101103
def __init__(self, name, description, format=None,
102-
data_format='arff', dataset_id=None, version=None,
104+
data_format='arff', cache_format='pickle',
105+
dataset_id=None, version=None,
103106
creator=None, contributor=None, collection_date=None,
104107
upload_date=None, language=None, licence=None,
105108
url=None, default_target_attribute=None,
@@ -127,6 +130,11 @@ def __init__(self, name, description, format=None,
127130
self.name = name
128131
self.version = int(version) if version is not None else None
129132
self.description = description
133+
if cache_format not in ['feather', 'pickle']:
134+
raise ValueError("cache_format must be one of 'feather' or 'pickle'. "
135+
"Invalid format specified: {}".format(cache_format))
136+
137+
self.cache_format = cache_format
130138
if format is None:
131139
self.format = data_format
132140
else:
@@ -180,9 +188,11 @@ def __init__(self, name, description, format=None,
180188
self.qualities = _check_qualities(qualities)
181189

182190
if data_file is not None:
183-
self.data_pickle_file = self._create_pickle_in_cache(data_file)
191+
self.data_pickle_file, self.data_feather_file,\
192+
self.feather_attribute_file = self._create_pickle_in_cache(data_file)
184193
else:
185-
self.data_pickle_file = None
194+
self.data_pickle_file, self.data_feather_file, \
195+
self.feather_attribute_file = None, None, None
186196

187197
@property
188198
def id(self) -> Optional[int]:
@@ -396,18 +406,20 @@ def _parse_data_from_arff(
396406

397407
return X, categorical, attribute_names
398408

399-
def _create_pickle_in_cache(self, data_file: str) -> str:
409+
def _create_pickle_in_cache(self, data_file: str) -> Tuple[str, str, str]:
400410
""" Parse the arff and pickle the result. Update any old pickle objects. """
401411
data_pickle_file = data_file.replace('.arff', '.pkl.py3')
402-
if os.path.exists(data_pickle_file):
412+
data_feather_file = data_file.replace('.arff', '.feather')
413+
feather_attribute_file = data_file.replace('.arff', '.feather.attributes.pkl.py3')
414+
if os.path.exists(data_pickle_file) and self.cache_format == 'pickle':
403415
# Load the data to check if the pickle file is outdated (i.e. contains numpy array)
404416
with open(data_pickle_file, "rb") as fh:
405417
try:
406418
data, categorical, attribute_names = pickle.load(fh)
407419
except EOFError:
408420
# The file is likely corrupt, see #780.
409421
# We deal with this when loading the data in `_load_data`.
410-
return data_pickle_file
422+
return data_pickle_file, data_feather_file, feather_attribute_file
411423

412424
# Between v0.8 and v0.9 the format of pickled data changed from
413425
# np.ndarray to pd.DataFrame. This breaks some backwards compatibility,
@@ -416,32 +428,62 @@ def _create_pickle_in_cache(self, data_file: str) -> str:
416428
# pd.DataFrame blob. See also #646.
417429
if isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data):
418430
logger.debug("Data pickle file already exists and is up to date.")
419-
return data_pickle_file
431+
return data_pickle_file, data_feather_file, feather_attribute_file
432+
elif os.path.exists(data_feather_file) and self.cache_format == 'feather':
433+
# Load the data to check if the feather file is readable (i.e. not corrupt)
434+
try:
435+
data = pd.read_feather(data_feather_file)
436+
except EOFError:
437+
# The file is likely corrupt, see #780.
438+
# We deal with this when loading the data in `_load_data`.
439+
return data_pickle_file, data_feather_file, feather_attribute_file
440+
441+
logger.debug("Data feather file already exists and is up to date.")
442+
return data_pickle_file, data_feather_file, feather_attribute_file
420443

421444
# At this point either the pickle file does not exist, or it had outdated formatting.
422445
# We parse the data from arff again and populate the cache with a recent pickle file.
423446
X, categorical, attribute_names = self._parse_data_from_arff(data_file)
424447

425-
with open(data_pickle_file, "wb") as fh:
426-
pickle.dump((X, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
427-
logger.debug("Saved dataset {did}: {name} to file {path}"
428-
.format(did=int(self.dataset_id or -1),
429-
name=self.name,
430-
path=data_pickle_file)
431-
)
448+
# Feather format does not work for sparse datasets, so we use pickle for sparse datasets
432449

433-
return data_pickle_file
450+
if self.cache_format == "feather" and not scipy.sparse.issparse(X):
451+
logger.info("feather write {}".format(self.name))
452+
X.to_feather(data_feather_file)
453+
with open(feather_attribute_file, "wb") as fh:
454+
pickle.dump((categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
455+
else:
456+
logger.info("pickle write {}".format(self.name))
457+
self.cache_format = 'pickle'
458+
with open(data_pickle_file, "wb") as fh:
459+
pickle.dump((X, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
460+
logger.debug("Saved dataset {did}: {name} to file {path}"
461+
.format(did=int(self.dataset_id or -1),
462+
name=self.name,
463+
path=data_pickle_file)
464+
)
465+
return data_pickle_file, data_feather_file, feather_attribute_file
434466

435467
def _load_data(self):
436468
""" Load data from pickle or arff. Download data first if not present on disk. """
437-
if self.data_pickle_file is None:
469+
if (self.cache_format == 'pickle' and self.data_pickle_file is None) or \
470+
(self.cache_format == 'feather' and self.data_feather_file is None):
438471
if self.data_file is None:
439472
self._download_data()
440-
self.data_pickle_file = self._create_pickle_in_cache(self.data_file)
473+
self.data_pickle_file, self.data_feather_file, self.feather_attribute_file = \
474+
self._create_pickle_in_cache(self.data_file)
441475

442476
try:
443-
with open(self.data_pickle_file, "rb") as fh:
444-
data, categorical, attribute_names = pickle.load(fh)
477+
if self.cache_format == 'feather':
478+
logger.info("feather load data {}".format(self.name))
479+
data = pd.read_feather(self.data_feather_file)
480+
481+
with open(self.feather_attribute_file, "rb") as fh:
482+
categorical, attribute_names = pickle.load(fh)
483+
else:
484+
logger.info("pickle load data {}".format(self.name))
485+
with open(self.data_pickle_file, "rb") as fh:
486+
data, categorical, attribute_names = pickle.load(fh)
445487
except EOFError:
446488
logger.warning(
447489
"Detected a corrupt cache file loading dataset %d: '%s'. "

openml/datasets/functions.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -451,7 +451,8 @@ def get_dataset(
451451
dataset_id: Union[int, str],
452452
download_data: bool = True,
453453
version: int = None,
454-
error_if_multiple: bool = False
454+
error_if_multiple: bool = False,
455+
cache_format: str = 'pickle'
455456
) -> OpenMLDataset:
456457
""" Download the OpenML dataset representation, optionally also download actual data file.
457458
@@ -479,12 +480,19 @@ def get_dataset(
479480
If no version is specified, retrieve the least recent still active version.
480481
error_if_multiple : bool, optional (default=False)
481482
If ``True`` raise an error if multiple datasets are found with matching criteria.
482-
483+
cache_format : str, optional (default='pickle')
484+
Format for caching the dataset - may be feather or pickle
485+
Note that the default 'pickle' option may load slower than feather when
486+
the number of rows is very high.
483487
Returns
484488
-------
485489
dataset : :class:`openml.OpenMLDataset`
486490
The downloaded dataset.
487491
"""
492+
if cache_format not in ['feather', 'pickle']:
493+
raise ValueError("cache_format must be one of 'feather' or 'pickle'. "
494+
"Invalid format specified: {}".format(cache_format))
495+
488496
if isinstance(dataset_id, str):
489497
try:
490498
dataset_id = int(dataset_id)
@@ -527,7 +535,7 @@ def get_dataset(
527535
did_cache_dir)
528536

529537
dataset = _create_dataset_from_description(
530-
description, features, qualities, arff_file
538+
description, features, qualities, arff_file, cache_format
531539
)
532540
return dataset
533541

@@ -975,6 +983,7 @@ def _create_dataset_from_description(
975983
features: Dict,
976984
qualities: List,
977985
arff_file: str = None,
986+
cache_format: str = 'pickle',
978987
) -> OpenMLDataset:
979988
"""Create a dataset object from a description dict.
980989
@@ -988,6 +997,8 @@ def _create_dataset_from_description(
988997
Description of a dataset qualities.
989998
arff_file : string, optional
990999
Path of dataset ARFF file.
1000+
cache_format: string, optional
1001+
Caching option for datasets (feather/pickle)
9911002
9921003
Returns
9931004
-------
@@ -1019,6 +1030,7 @@ def _create_dataset_from_description(
10191030
update_comment=description.get("oml:update_comment"),
10201031
md5_checksum=description.get("oml:md5_checksum"),
10211032
data_file=arff_file,
1033+
cache_format=cache_format,
10221034
features=features,
10231035
qualities=qualities,
10241036
)

setup.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,9 @@
4949
'requests',
5050
'scikit-learn>=0.18',
5151
'python-dateutil', # Installed through pandas anyway.
52-
'pandas>=0.19.2',
52+
'pandas>=0.19.2, <1.0.0',
5353
'scipy>=0.13.3',
54-
'numpy>=1.6.2'
54+
'numpy>=1.6.2',
5555
],
5656
extras_require={
5757
'test': [
@@ -64,6 +64,7 @@
6464
'nbformat',
6565
'oslo.concurrency',
6666
'flaky',
67+
'pyarrow'
6768
],
6869
'examples': [
6970
'matplotlib',

tests/test_datasets/test_dataset_functions.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1316,3 +1316,42 @@ def test_list_qualities(self):
13161316
qualities = openml.datasets.list_qualities()
13171317
self.assertEqual(isinstance(qualities, list), True)
13181318
self.assertEqual(all([isinstance(q, str) for q in qualities]), True)
1319+
1320+
def test_get_dataset_cache_format_pickle(self):
1321+
dataset = openml.datasets.get_dataset(1)
1322+
self.assertEqual(type(dataset), OpenMLDataset)
1323+
self.assertEqual(dataset.name, 'anneal')
1324+
self.assertGreater(len(dataset.features), 1)
1325+
self.assertGreater(len(dataset.qualities), 4)
1326+
1327+
X, y, categorical, attribute_names = dataset.get_data()
1328+
self.assertIsInstance(X, pd.DataFrame)
1329+
self.assertEqual(X.shape, (898, 39))
1330+
self.assertEqual(len(categorical), X.shape[1])
1331+
self.assertEqual(len(attribute_names), X.shape[1])
1332+
1333+
def test_get_dataset_cache_format_feather(self):
1334+
1335+
dataset = openml.datasets.get_dataset(128, cache_format='feather')
1336+
1337+
# Check if dataset is written to cache directory using feather
1338+
cache_dir = openml.config.get_cache_directory()
1339+
cache_dir_for_id = os.path.join(cache_dir, 'datasets', '128')
1340+
feather_file = os.path.join(cache_dir_for_id, 'dataset.feather')
1341+
pickle_file = os.path.join(cache_dir_for_id, 'dataset.feather.attributes.pkl.py3')
1342+
data = pd.read_feather(feather_file)
1343+
self.assertTrue(os.path.isfile(feather_file), msg='Feather file is missing')
1344+
self.assertTrue(os.path.isfile(pickle_file), msg='Attributes pickle file is missing')
1345+
self.assertEqual(data.shape, (150, 5))
1346+
1347+
# Check if get_data is able to retrieve feather data
1348+
self.assertEqual(type(dataset), OpenMLDataset)
1349+
self.assertEqual(dataset.name, 'iris')
1350+
self.assertGreater(len(dataset.features), 1)
1351+
self.assertGreater(len(dataset.qualities), 4)
1352+
1353+
X, y, categorical, attribute_names = dataset.get_data()
1354+
self.assertIsInstance(X, pd.DataFrame)
1355+
self.assertEqual(X.shape, (150, 5))
1356+
self.assertEqual(len(categorical), X.shape[1])
1357+
self.assertEqual(len(attribute_names), X.shape[1])

0 commit comments

Comments
 (0)