Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
24faeb7
bump to 0.11.1dev to continue developing (#971)
mfeurer Oct 25, 2020
e84cdf9
update home page example to numerical dataset (pendigits) (#976)
a-moadel Oct 26, 2020
07e87ad
Speed up tests (#977)
PGijsbers Oct 29, 2020
4923e5b
Additional fixes to PR 777 (#967)
Neeratyoy Oct 29, 2020
f2af798
Improving the performance of check_datasets_active (#980)
ArlindKadra Oct 29, 2020
756e747
Add CI through Github Actions (#975)
PGijsbers Oct 29, 2020
3132dac
add validation for ignore_attributes and default_target_attribute at …
a-moadel Oct 29, 2020
6afc880
Updated the way 'image features' are stored, updated old unit tests, …
ArlindKadra Oct 29, 2020
5b6de8a
Retry on database error to reduce number of test failures (#984)
mfeurer Oct 30, 2020
63ec0ae
Transition other Travis jobs to Github Actions (#988)
PGijsbers Nov 2, 2020
9a3a6dd
update progress file (#991)
a-moadel Nov 2, 2020
81cc423
docs: add a-moadel as a contributor (#992)
allcontributors[bot] Nov 2, 2020
51eaff6
docs: add Neeratyoy as a contributor (#998)
allcontributors[bot] Nov 2, 2020
a629562
Improve unit tests (#985)
mfeurer Nov 3, 2020
accde88
Warning if fitted sklearn model being used (#989)
Neeratyoy Nov 3, 2020
560e952
Cache dataset features and qualities as pickle (#979)
mfeurer Nov 3, 2020
5d5a48e
Update string formatting (#1001)
PGijsbers Nov 17, 2020
16799ad
Specify encoding for README file (#1004)
PGijsbers Nov 18, 2020
fba6aab
Making some unit tests work (#1000)
Neeratyoy Dec 24, 2020
e074c14
Refactor data loading/storing (#1018)
PGijsbers Jan 19, 2021
ab793a6
Adding helper functions to support ColumnTransformer (#982)
Neeratyoy Jan 28, 2021
47cda65
Rework local openml directory (#987)
mfeurer Feb 10, 2021
80ae046
Feature/give possibility to not download the dataset qualities (#1017)
a-moadel Feb 11, 2021
d2945ba
Adding sklearn 0.24 support (#1016)
Neeratyoy Feb 11, 2021
3c680c1
improve path detection (#1021)
mfeurer Feb 12, 2021
7553281
Removing flaky decorator for study unit test (#1024)
Neeratyoy Feb 16, 2021
ff7a251
Adding sklearn min. dependencies for all versions (#1022)
Neeratyoy Feb 18, 2021
4ff66ed
Parallel evaluation of tasks (#1020)
Neeratyoy Feb 18, 2021
38f9bf0
Parquet Support (#1029)
PGijsbers Mar 4, 2021
6c609b8
API for topics (#1023)
sahithyaravi Mar 9, 2021
4aec00a
Remove nan-likes from category header (#1037)
PGijsbers Mar 12, 2021
f94672e
Measuring runtimes (#1031)
Neeratyoy Mar 12, 2021
bd8ae14
Fix 1013: Store run `setup_string` (#1015)
PGijsbers Mar 25, 2021
11e6235
Fix #1033: skip two unit tests on Windows (#1040)
mfeurer Mar 26, 2021
d9037e7
bump version for new release (#1041)
mfeurer Mar 29, 2021
5511fa0
fix loky/concurrency issue (#1042)
mfeurer Mar 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Updated the way 'image features' are stored, updated old unit tests, …
…added unit test, fixed typo (#983)
  • Loading branch information
ArlindKadra authored Oct 29, 2020
commit 6afc8806d97be3c2ba3bc067ce3d3c3cab9d5bc8
1 change: 1 addition & 0 deletions doc/progress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Changelog

0.11.1
~~~~~~
* MAINT #891: Changed the way numerical features are stored. Numerical features whose values all lie in the range 0–255 are now stored as uint8, which reduces both the required storage space and the time needed to store and load the data.
* MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets in contrast to querying all datasets. Modified the corresponding unit test.

0.11.0
Expand Down
14 changes: 13 additions & 1 deletion openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ def _parse_data_from_arff(
categories_names = {}
categorical = []
for i, (name, type_) in enumerate(data["attributes"]):
# if the feature is nominal and the a sparse matrix is
# if the feature is nominal and a sparse matrix is
# requested, the categories need to be numeric
if isinstance(type_, list) and self.format.lower() == "sparse_arff":
try:
Expand Down Expand Up @@ -456,6 +456,18 @@ def _parse_data_from_arff(
col.append(
self._unpack_categories(X[column_name], categories_names[column_name])
)
elif attribute_dtype[column_name] in ('floating',
'integer'):
X_col = X[column_name]
if X_col.min() >= 0 and X_col.max() <= 255:
try:
X_col_uint = X_col.astype('uint8')
if (X_col == X_col_uint).all():
col.append(X_col_uint)
continue
except ValueError:
pass
col.append(X[column_name])
else:
col.append(X[column_name])
X = pd.concat(col, axis=1)
Expand Down
41 changes: 23 additions & 18 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,13 @@ def test_get_data_pandas(self):
self.assertEqual(data.shape[1], len(self.titanic.features))
self.assertEqual(data.shape[0], 1309)
col_dtype = {
"pclass": "float64",
"pclass": "uint8",
"survived": "category",
"name": "object",
"sex": "category",
"age": "float64",
"sibsp": "float64",
"parch": "float64",
"sibsp": "uint8",
"parch": "uint8",
"ticket": "object",
"fare": "float64",
"cabin": "object",
Expand Down Expand Up @@ -118,21 +118,29 @@ def test_get_data_no_str_data_for_nparrays(self):
with pytest.raises(PyOpenMLError, match=err_msg):
self.titanic.get_data(dataset_format="array")

def _check_expected_type(self, dtype, is_cat, col):
if is_cat:
expected_type = 'category'
elif not col.isna().any() and (col.astype('uint8') == col).all():
expected_type = 'uint8'
else:
expected_type = 'float64'

self.assertEqual(dtype.name, expected_type)

def test_get_data_with_rowid(self):
    """Row-id column is included or dropped according to ``include_row_id``.

    NOTE(review): the source span contained leftover pre-change assertion
    lines (a duplicate dtype loop expecting only category/float64) from a
    diff view; this is the reconstructed post-change method.
    """
    self.dataset.row_id_attribute = "condition"
    rval, _, categorical, _ = self.dataset.get_data(include_row_id=True)
    self.assertIsInstance(rval, pd.DataFrame)
    for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):
        self._check_expected_type(dtype, is_cat, rval[col])
    self.assertEqual(rval.shape, (898, 39))
    self.assertEqual(len(categorical), 39)

    rval, _, categorical, _ = self.dataset.get_data()
    self.assertIsInstance(rval, pd.DataFrame)
    for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):
        self._check_expected_type(dtype, is_cat, rval[col])
    self.assertEqual(rval.shape, (898, 38))
    self.assertEqual(len(categorical), 38)

Expand All @@ -149,9 +157,8 @@ def test_get_data_with_target_array(self):
def test_get_data_with_target_pandas(self):
    """``get_data(target=...)`` splits X/y with correct pandas dtypes.

    NOTE(review): the source span contained a leftover pre-change dtype
    loop (diff residue); this is the reconstructed post-change method.
    """
    X, y, categorical, attribute_names = self.dataset.get_data(target="class")
    self.assertIsInstance(X, pd.DataFrame)
    for (dtype, is_cat, col) in zip(X.dtypes, categorical, X):
        self._check_expected_type(dtype, is_cat, X[col])
    self.assertIsInstance(y, pd.Series)
    # The target itself is nominal and must be loaded as a category.
    self.assertEqual(y.dtype.name, "category")

Expand All @@ -174,16 +181,14 @@ def test_get_data_rowid_and_ignore_and_target(self):
def test_get_data_with_ignore_attributes(self):
    """Ignored attributes are kept or dropped per ``include_ignore_attribute``.

    NOTE(review): the source span contained leftover pre-change dtype
    loops (diff residue); this is the reconstructed post-change method.
    """
    self.dataset.ignore_attribute = ["condition"]
    rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True)
    for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):
        self._check_expected_type(dtype, is_cat, rval[col])
    self.assertEqual(rval.shape, (898, 39))
    self.assertEqual(len(categorical), 39)

    rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False)
    for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):
        self._check_expected_type(dtype, is_cat, rval[col])
    self.assertEqual(rval.shape, (898, 38))
    self.assertEqual(len(categorical), 38)

Expand Down
7 changes: 7 additions & 0 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,13 @@ def test_get_dataset_by_name(self):
openml.config.server = self.production_server
self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)

def test_get_dataset_uint8_dtype(self):
    """Integer-valued columns in [0, 255] are loaded as uint8.

    Uses the anneal dataset (id 1); its 'carbon' feature holds small
    non-negative integers, so it must be downcast on load.
    """
    dataset = openml.datasets.get_dataset(1)
    # isinstance, not type(...) ==, is the idiomatic (and subclass-safe) check.
    self.assertIsInstance(dataset, OpenMLDataset)
    self.assertEqual(dataset.name, 'anneal')
    df, _, _, _ = dataset.get_data()
    self.assertEqual(df['carbon'].dtype, 'uint8')

def test_get_dataset(self):
# This is the only non-lazy load to ensure default behaviour works.
dataset = openml.datasets.get_dataset(1)
Expand Down