Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
5db6c08
made dataset features optional
LennartPurucker Jun 13, 2023
e00cbb6
fix check for qualities
LennartPurucker Jun 13, 2023
3e808a5
add lazy loading for dataset metadata and add option to refresh cache
LennartPurucker Jun 13, 2023
45aa03c
Merge branch 'develop' of https://github.com/openml/openml-python int…
LennartPurucker Jun 13, 2023
d5c40c1
adjust progress.rst
LennartPurucker Jun 13, 2023
eda6c9a
minor fixes
LennartPurucker Jun 13, 2023
32c6099
break line to keep link and respect line length
LennartPurucker Jun 13, 2023
c3e0074
[no ci] changes for pull request review
LennartPurucker Jun 14, 2023
490f072
refactor and add cache usage to load_metadata
LennartPurucker Jun 14, 2023
f8bcafd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2023
23ebb05
fix precommit
LennartPurucker Jun 14, 2023
e0c9e37
Merge branch 'download_updates' of https://github.com/openml/openml-p…
LennartPurucker Jun 14, 2023
6b21e9d
[no ci] adjust task loading to new dataset loading
LennartPurucker Jun 14, 2023
004fd85
[no ci] add actual lazy loading based on properties and adjusted test…
LennartPurucker Jun 14, 2023
722ff52
switch deprecation to future warning, adjusted deprecation cycle to v…
LennartPurucker Jun 15, 2023
2cb3b57
Update openml/tasks/functions.py
LennartPurucker Jun 15, 2023
a934586
Merge branch 'develop' of https://github.com/openml/openml-python int…
LennartPurucker Jun 15, 2023
b93ab89
changes based on pr review feedback
LennartPurucker Jun 15, 2023
f326be8
fix test w.r.t. server state
LennartPurucker Jun 15, 2023
e29f25f
Merge branch 'download_updates' of https://github.com/openml/openml-p…
LennartPurucker Jun 15, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add lazy loading for dataset metadata and add option to refresh cache
  • Loading branch information
LennartPurucker committed Jun 13, 2023
commit 3e808a598c31442a7414a35cf329cd5b7ceff635
12 changes: 12 additions & 0 deletions openml/datasets/data_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,5 +62,17 @@ def __init__(
def __repr__(self):
return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)

def __eq__(self, other):
    """Field-wise equality with another ``OpenMLDataFeature``.

    NOTE(review): defining ``__eq__`` without ``__hash__`` implicitly sets
    ``__hash__ = None`` on the class, making instances unhashable — confirm
    that instances are never used as dict keys or set members.
    """
    if not isinstance(other, OpenMLDataFeature):
        # NotImplemented (rather than False) lets Python try the reflected
        # comparison, per the conventional __eq__ contract; `feature == x`
        # still evaluates to False for unrelated types.
        return NotImplemented
    return (
        self.index == other.index
        and self.name == other.name
        and self.data_type == other.data_type
        and self.nominal_values == other.nominal_values
        and self.number_missing_values == other.number_missing_values
    )

def _repr_pretty_(self, pp, cycle):
    # IPython rich-display hook: delegate to __repr__ via str().
    # `cycle` is ignored because this object cannot contain itself.
    pp.text(str(self))
95 changes: 71 additions & 24 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import xmltodict

from openml.base import OpenMLBase
from openml._api_calls import _perform_api_call
from .data_feature import OpenMLDataFeature
from ..exceptions import PyOpenMLError

Expand Down Expand Up @@ -787,6 +788,31 @@ def get_data(

return data, targets, categorical, attribute_names

def load_metadata(self, features: bool = False, qualities: bool = False) -> None:
    """Load missing metadata information from the server and store it on this object.

    The purpose of the function is to support lazy loading: a dataset can be
    created without features/qualities metadata and fetch it later on demand.

    Parameters
    ----------
    features : bool (default=False)
        If True, load the `self.features` data if not already loaded.
    qualities: bool (default=False)
        If True, load the `self.qualities` data if not already loaded.

    Raises
    ------
    ValueError
        If `self.dataset_id` is None, since the server cannot be queried
        without a dataset id.
    """

    if self.dataset_id is None:
        raise ValueError(
            """No dataset id specified. Please set the dataset id.
            Otherwise we cannot load metadata."""
        )

    # Only contact the server when the caller asked for the data AND it is
    # still missing, so repeated calls are cheap no-ops.
    if features and self.features is None:
        self.features = _parse_features_xml(_get_features_xml(self.dataset_id))

    if qualities and self.qualities is None:
        self.qualities = _parse_qualities_xml(_get_qualities_xml(self.dataset_id))

def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[str]]:
"""Reads the datasets arff to determine the class-labels.

Expand Down Expand Up @@ -936,6 +962,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
return data_container


# -- Code for Features Property
def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
features_pickle_file = _get_features_pickle_file(features_file)
try:
Expand All @@ -944,35 +971,46 @@ def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
except: # noqa E722
with open(features_file, encoding="utf8") as fh:
features_xml_string = fh.read()
xml_dict = xmltodict.parse(
features_xml_string, force_list=("oml:feature", "oml:nominal_value")
)
features_xml = xml_dict["oml:data_features"]

features = {}
for idx, xmlfeature in enumerate(features_xml["oml:feature"]):
nr_missing = xmlfeature.get("oml:number_of_missing_values", 0)
feature = OpenMLDataFeature(
int(xmlfeature["oml:index"]),
xmlfeature["oml:name"],
xmlfeature["oml:data_type"],
xmlfeature.get("oml:nominal_value"),
int(nr_missing),
)
if idx != feature.index:
raise ValueError("Data features not provided in right order")
features[feature.index] = feature

features = _parse_features_xml(features_xml_string)

with open(features_pickle_file, "wb") as fh_binary:
pickle.dump(features, fh_binary)
return features


def _get_features_xml(dataset_id):
    """Download the raw features XML for the given dataset id from the server."""
    return _perform_api_call(f"data/features/{dataset_id}", "get")


def _parse_features_xml(features_xml_string):
    """Parse an OpenML ``data_features`` XML document into a feature mapping.

    Parameters
    ----------
    features_xml_string : str
        XML payload as returned by the ``data/features/{id}`` endpoint.

    Returns
    -------
    dict
        ``OpenMLDataFeature`` objects keyed by their integer index.

    Raises
    ------
    ValueError
        If the features are not listed in ascending index order.
    """
    document = xmltodict.parse(
        features_xml_string, force_list=("oml:feature", "oml:nominal_value")
    )
    feature_entries = document["oml:data_features"]["oml:feature"]

    features = {}
    for expected_index, entry in enumerate(feature_entries):
        feature = OpenMLDataFeature(
            int(entry["oml:index"]),
            entry["oml:name"],
            entry["oml:data_type"],
            entry.get("oml:nominal_value"),
            int(entry.get("oml:number_of_missing_values", 0)),
        )
        # The server is expected to emit features in index order; a mismatch
        # indicates a malformed response.
        if feature.index != expected_index:
            raise ValueError("Data features not provided in right order")
        features[feature.index] = feature

    return features


def _get_features_pickle_file(features_file: str) -> str:
"""This function only exists so it can be mocked during unit testing"""
return features_file + ".pkl"


# -- Code for Qualities Property
def _read_qualities(qualities_file: str) -> Dict[str, float]:
qualities_pickle_file = _get_qualities_pickle_file(qualities_file)
try:
Expand All @@ -981,17 +1019,15 @@ def _read_qualities(qualities_file: str) -> Dict[str, float]:
except: # noqa E722
with open(qualities_file, encoding="utf8") as fh:
qualities_xml = fh.read()
xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
qualities = _check_qualities(qualities)
qualities = _parse_qualities_xml(qualities_xml)
with open(qualities_pickle_file, "wb") as fh_binary:
pickle.dump(qualities, fh_binary)
return qualities


def _get_qualities_pickle_file(qualities_file: str) -> str:
"""This function only exists so it can be mocked during unit testing"""
return qualities_file + ".pkl"
def _get_qualities_xml(dataset_id):
    """Download the raw qualities XML for the given dataset id from the server."""
    return _perform_api_call(f"data/qualities/{dataset_id}", "get")


def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]:
Expand All @@ -1006,3 +1042,14 @@ def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]:
value = float(xmlquality["oml:value"])
qualities_[name] = value
return qualities_


def _parse_qualities_xml(qualities_xml):
    """Parse a ``data_qualities`` XML document and convert it via _check_qualities."""
    document = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
    quality_entries = document["oml:data_qualities"]["oml:quality"]
    return _check_qualities(quality_entries)


def _get_qualities_pickle_file(qualities_file: str) -> str:
"""This function only exists so it can be mocked during unit testing"""
return qualities_file + ".pkl"
54 changes: 41 additions & 13 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,14 @@

import openml.utils
import openml._api_calls
from .dataset import OpenMLDataset
from .dataset import OpenMLDataset, _get_features_xml, _get_qualities_xml
from ..exceptions import (
OpenMLHashException,
OpenMLServerError,
OpenMLServerException,
OpenMLPrivateDatasetError,
)
from ..utils import (
_remove_cache_dir_for_id,
_create_cache_directory_for_id,
)
from ..utils import _remove_cache_dir_for_id, _create_cache_directory_for_id, _get_cache_dir_for_id

DATASETS_CACHE_DIR_NAME = "datasets"
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -357,12 +354,17 @@ def get_dataset(
download_qualities: bool = True,
download_features_meta_data: bool = True,
download_all_files: bool = False,
force_refresh_cache: bool = False,
) -> OpenMLDataset:
"""Download the OpenML dataset representation, optionally also download actual data file.

This function is thread/multiprocessing safe.
This function uses caching. A check will be performed to determine if the information has
previously been downloaded, and if so be loaded from disk instead of retrieved from the server.
This function is by default NOT thread/multiprocessing safe, as this function uses caching.
A check will be performed to determine if the information has previously been downloaded to a
cache, and if so be loaded from disk instead of retrieved from the server.

To make this function thread/multiprocessing safe initialize the cache first by calling
`get_dataset(args)` once before calling `get_dataset(args)` many times in parallel. This will
initialize the cache and later calls will use the cache in a thread/multiprocessing safe way.

If dataset is retrieved by name, a version may be specified.
If no version is specified and multiple versions of the dataset exist,
Expand All @@ -384,23 +386,42 @@ def get_dataset(
If no version is specified, retrieve the least recent still active version.
error_if_multiple : bool (default=False)
If ``True`` raise an error if multiple datasets are found with matching criteria.
cache_format : str (default='pickle')
cache_format : str (default='pickle') in {'pickle', 'feather'}
Format for caching the dataset - may be feather or pickle
Note that the default 'pickle' option may load slower than feather when
no.of.rows is very high.
download_qualities : bool (default=True)
Option to download 'qualities' meta-data in addition to the minimal dataset description.
If True, download and cache the qualities file.
If False, create the OpenMLDataset without qualities metadata. The data may later be added
to the OpenMLDataset through the `OpenMLDataset.load_metadata(qualities=True)` method.
download_features_meta_data : bool (default=True)
Option to download 'features' meta-data in addition to the minimal dataset description.
If True, download and cache the features file.
If False, create the OpenMLDataset without features metadata. The data may later be added
to the OpenMLDataset through the `OpenMLDataset.load_metadata(features=True)` method.
download_all_files: bool (default=False)
EXPERIMENTAL. Download all files related to the dataset that reside on the server.
Useful for datasets which refer to auxiliary files (e.g., meta-album).
force_refresh_cache : bool (default=False)
Force the cache to be refreshed by deleting the cache directory and re-downloading the data.
Note that if `force_refresh_cache` is True, `get_dataset` is NOT thread/multiprocessing safe,
because concurrent calls then race between creating and deleting the cache — as is true of
the cache in general.

Returns
-------
dataset : :class:`openml.OpenMLDataset`
The downloaded dataset.
"""
if any([download_qualities, download_features_meta_data]):
warnings.warn(
"""Starting from Version 0.14 `download_data`, `download_qualities`, and
`download_features_meta_data` will all be ``False`` by default to enable
lazy loading.""",
DeprecationWarning,
)

if download_all_files:
warnings.warn(
"``download_all_files`` is experimental and is likely to break with new releases."
Expand All @@ -422,6 +443,15 @@ def get_dataset(
"`dataset_id` must be one of `str` or `int`, not {}.".format(type(dataset_id))
)

# Note: we could also (quite heavily) re-implement the below to only download the
# data and do not cache the data at all. This would always be thread/multiprocessing
# safe. However, this would likely drastically increase the strain on the server.
# Hence, we should stick to the alternative mentioned in the docstring.
if force_refresh_cache:
did_cache_dir = _get_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, dataset_id)
if os.path.exists(did_cache_dir):
_remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir)

did_cache_dir = _create_cache_directory_for_id(
DATASETS_CACHE_DIR_NAME,
dataset_id,
Expand Down Expand Up @@ -1133,8 +1163,7 @@ def _get_dataset_features_file(did_cache_dir: str, dataset_id: int) -> str:

# Dataset features aren't subject to change...
if not os.path.isfile(features_file):
url_extension = "data/features/{}".format(dataset_id)
features_xml = openml._api_calls._perform_api_call(url_extension, "get")
features_xml = _get_features_xml(dataset_id)
with io.open(features_file, "w", encoding="utf8") as fh:
fh.write(features_xml)

Expand Down Expand Up @@ -1169,8 +1198,7 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
with io.open(qualities_file, encoding="utf8") as fh:
qualities_xml = fh.read()
except (OSError, IOError):
url_extension = "data/qualities/{}".format(dataset_id)
qualities_xml = openml._api_calls._perform_api_call(url_extension, "get")
qualities_xml = _get_qualities_xml(dataset_id)
with io.open(qualities_file, "w", encoding="utf8") as fh:
fh.write(qualities_xml)
return qualities_file
Expand Down
22 changes: 18 additions & 4 deletions openml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
if TYPE_CHECKING:
from openml.base import OpenMLBase


oslo_installed = False
try:
# Currently, importing oslo raises a lot of warning that it will stop working
Expand Down Expand Up @@ -303,18 +302,33 @@ def _list_all(listing_call, output_format="dict", *args, **filters):
return result


def _create_cache_directory(key):
def _get_cache_dir_for_key(key):
    """Return the (possibly not yet existing) cache directory path for *key*."""
    return os.path.join(config.get_cache_directory(), key)


def _create_cache_directory(key):
    """Return the cache directory for *key*, creating it if necessary.

    Raises
    ------
    openml.exceptions.OpenMLCacheException
        If the directory cannot be created.
    """
    cache_dir = _get_cache_dir_for_key(key)

    try:
        # exist_ok makes this safe to call when the directory already exists.
        os.makedirs(cache_dir, exist_ok=True)
    except Exception as err:
        message = f"Cannot create cache directory {cache_dir}."
        raise openml.exceptions.OpenMLCacheException(message) from err

    return cache_dir


def _get_cache_dir_for_id(key, id_, create=False):
    """Return the cache directory path for *key*/*id_*.

    If *create* is True the key-level cache directory is created first;
    otherwise the path is computed without touching the filesystem.
    """
    base = _create_cache_directory(key) if create else _get_cache_dir_for_key(key)
    return os.path.join(base, str(id_))


def _create_cache_directory_for_id(key, id_):
"""Create the cache directory for a specific ID

Expand All @@ -336,7 +350,7 @@ def _create_cache_directory_for_id(key, id_):
str
Path of the created dataset cache directory.
"""
cache_dir = os.path.join(_create_cache_directory(key), str(id_))
cache_dir = _get_cache_dir_for_id(key, id_, create=True)
if os.path.isdir(cache_dir):
pass
elif os.path.exists(cache_dir):
Expand Down
13 changes: 13 additions & 0 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,19 @@ def test_get_data_corrupt_pickle(self):
self.assertIsInstance(xy, pd.DataFrame)
self.assertEqual(xy.shape, (150, 5))

def test_load_metadata(self):
    # Reference dataset: features/qualities downloaded eagerly via get_dataset flags.
    _compare_dataset = openml.datasets.get_dataset(
        2, download_data=False, download_features_meta_data=True, download_qualities=True
    )

    # Lazily-created dataset: no metadata at construction time; fetch it explicitly.
    _dataset = openml.datasets.get_dataset(
        2, download_data=False, download_features_meta_data=False, download_qualities=False
    )
    _dataset.load_metadata(features=True, qualities=True)

    # Lazy loading must yield the same metadata as the eager download path.
    self.assertEqual(_dataset.features, _compare_dataset.features)
    self.assertEqual(_dataset.qualities, _compare_dataset.qualities)


class OpenMLDatasetTestOnTestServer(TestBase):
def setUp(self):
Expand Down
Loading