Skip to content

Commit a7f2639

Browse files
authored
Fix 1124: provide clear naming for cache directories (#1254)
* Fix #1124 Make variable `openml.config.cache_directory` private so that there is no confusion on how to retrieve the cache directory (since this should be done via `openml.config.get_cache_directory`) * Improve docstrings and method names * Rename base_ to root_ * Update based on Pieter's feedback
1 parent 333b068 commit a7f2639

File tree

7 files changed

+46
-30
lines changed

7 files changed

+46
-30
lines changed

openml/config.py

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def _create_log_handlers(create_file_handler=True):
3737

3838
if create_file_handler:
3939
one_mb = 2**20
40-
log_path = os.path.join(cache_directory, "openml_python.log")
40+
log_path = os.path.join(_root_cache_directory, "openml_python.log")
4141
file_handler = logging.handlers.RotatingFileHandler(
4242
log_path, maxBytes=one_mb, backupCount=1, delay=True
4343
)
@@ -125,7 +125,7 @@ def get_server_base_url() -> str:
125125

126126
apikey = _defaults["apikey"]
127127
# The current cache directory (without the server name)
128-
cache_directory = str(_defaults["cachedir"]) # so mypy knows it is a string
128+
_root_cache_directory = str(_defaults["cachedir"]) # so mypy knows it is a string
129129
avoid_duplicate_runs = True if _defaults["avoid_duplicate_runs"] == "True" else False
130130

131131
retry_policy = _defaults["retry_policy"]
@@ -226,7 +226,7 @@ def _setup(config=None):
226226
"""
227227
global apikey
228228
global server
229-
global cache_directory
229+
global _root_cache_directory
230230
global avoid_duplicate_runs
231231

232232
config_file = determine_config_file_path()
@@ -266,15 +266,15 @@ def _get(config, key):
266266

267267
set_retry_policy(_get(config, "retry_policy"), n_retries)
268268

269-
cache_directory = os.path.expanduser(short_cache_dir)
269+
_root_cache_directory = os.path.expanduser(short_cache_dir)
270270
# create the cache subdirectory
271-
if not os.path.exists(cache_directory):
271+
if not os.path.exists(_root_cache_directory):
272272
try:
273-
os.makedirs(cache_directory, exist_ok=True)
273+
os.makedirs(_root_cache_directory, exist_ok=True)
274274
except PermissionError:
275275
openml_logger.warning(
276276
"No permission to create openml cache directory at %s! This can result in "
277-
"OpenML-Python not working properly." % cache_directory
277+
"OpenML-Python not working properly." % _root_cache_directory
278278
)
279279

280280
if cache_exists:
@@ -333,7 +333,7 @@ def get_config_as_dict():
333333
config = dict()
334334
config["apikey"] = apikey
335335
config["server"] = server
336-
config["cachedir"] = cache_directory
336+
config["cachedir"] = _root_cache_directory
337337
config["avoid_duplicate_runs"] = avoid_duplicate_runs
338338
config["connection_n_retries"] = connection_n_retries
339339
config["retry_policy"] = retry_policy
@@ -343,6 +343,17 @@ def get_config_as_dict():
343343
def get_cache_directory():
344344
"""Get the current cache directory.
345345
346+
This gets the cache directory for the current server relative
347+
to the root cache directory that can be set via
348+
``set_root_cache_directory()``. The cache directory is the
349+
``root_cache_directory`` with additional information on which
350+
subdirectory to use based on the server name. By default it is
351+
``root_cache_directory / org / openml / www`` for the standard
352+
OpenML.org server and is defined as
353+
``root_cache_directory / top-level domain / second-level domain /
354+
hostname``
355+
```
356+
346357
Returns
347358
-------
348359
cachedir : string
@@ -351,27 +362,32 @@ def get_cache_directory():
351362
"""
352363
url_suffix = urlparse(server).netloc
353364
reversed_url_suffix = os.sep.join(url_suffix.split(".")[::-1])
354-
_cachedir = os.path.join(cache_directory, reversed_url_suffix)
365+
_cachedir = os.path.join(_root_cache_directory, reversed_url_suffix)
355366
return _cachedir
356367

357368

358-
def set_cache_directory(cachedir):
359-
"""Set module-wide cache directory.
369+
def set_root_cache_directory(root_cache_directory):
370+
"""Set module-wide root cache directory.
360371
361-
Sets the cache directory into which to download datasets, tasks etc.
372+
Sets the root cache directory, wherein the cache directories are
373+
created to store content from different OpenML servers. For example,
374+
by default, cached data for the standard OpenML.org server is stored
375+
at ``root_cache_directory / org / openml / www``, and the general
376+
pattern is ``root_cache_directory / top-level domain / second-level
377+
domain / hostname``.
362378
363379
Parameters
364380
----------
365-
cachedir : string
381+
root_cache_directory : string
366382
Path to use as root cache directory.
367383
368384
See also
369385
--------
370386
get_cache_directory
371387
"""
372388

373-
global cache_directory
374-
cache_directory = cachedir
389+
global _root_cache_directory
390+
_root_cache_directory = root_cache_directory
375391

376392

377393
start_using_configuration_for_example = (
@@ -382,7 +398,7 @@ def set_cache_directory(cachedir):
382398

383399
__all__ = [
384400
"get_cache_directory",
385-
"set_cache_directory",
401+
"set_root_cache_directory",
386402
"start_using_configuration_for_example",
387403
"stop_using_configuration_for_example",
388404
"get_config_as_dict",

openml/testing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def setUp(self, n_levels: int = 1):
9393
self.production_server = "https://openml.org/api/v1/xml"
9494
openml.config.server = TestBase.test_server
9595
openml.config.avoid_duplicate_runs = False
96-
openml.config.cache_directory = self.workdir
96+
openml.config.set_root_cache_directory(self.workdir)
9797

9898
# Increase the number of retries to avoid spurious server failures
9999
self.retry_policy = openml.config.retry_policy

tests/test_datasets/test_dataset_functions.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,7 @@ def test__get_dataset_description(self):
420420
self.assertTrue(os.path.exists(description_xml_path))
421421

422422
def test__getarff_path_dataset_arff(self):
423-
openml.config.cache_directory = self.static_cache_dir
423+
openml.config.set_root_cache_directory(self.static_cache_dir)
424424
description = _get_dataset_description(self.workdir, 2)
425425
arff_path = _get_dataset_arff(description, cache_directory=self.workdir)
426426
self.assertIsInstance(arff_path, str)
@@ -494,7 +494,7 @@ def test__get_dataset_parquet_not_cached(self):
494494

495495
@mock.patch("openml._api_calls._download_minio_file")
496496
def test__get_dataset_parquet_is_cached(self, patch):
497-
openml.config.cache_directory = self.static_cache_dir
497+
openml.config.set_root_cache_directory(self.static_cache_dir)
498498
patch.side_effect = RuntimeError(
499499
"_download_minio_file should not be called when loading from cache"
500500
)
@@ -594,7 +594,7 @@ def test_publish_dataset(self):
594594
self.assertIsInstance(dataset.dataset_id, int)
595595

596596
def test__retrieve_class_labels(self):
597-
openml.config.cache_directory = self.static_cache_dir
597+
openml.config.set_root_cache_directory(self.static_cache_dir)
598598
labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels()
599599
self.assertEqual(labels, ["1", "2", "3", "4", "5", "U"])
600600
labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels(

tests/test_runs/test_run_functions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1569,11 +1569,11 @@ def test_run_on_dataset_with_missing_labels_array(self):
15691569
self.assertEqual(len(row), 12)
15701570

15711571
def test_get_cached_run(self):
1572-
openml.config.cache_directory = self.static_cache_dir
1572+
openml.config.set_root_cache_directory(self.static_cache_dir)
15731573
openml.runs.functions._get_cached_run(1)
15741574

15751575
def test_get_uncached_run(self):
1576-
openml.config.cache_directory = self.static_cache_dir
1576+
openml.config.set_root_cache_directory(self.static_cache_dir)
15771577
with self.assertRaises(openml.exceptions.OpenMLCacheException):
15781578
openml.runs.functions._get_cached_run(10)
15791579

tests/test_setups/test_setup_functions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,10 +182,10 @@ def test_setuplist_offset(self):
182182
self.assertEqual(len(all), size * 2)
183183

184184
def test_get_cached_setup(self):
185-
openml.config.cache_directory = self.static_cache_dir
185+
openml.config.set_root_cache_directory(self.static_cache_dir)
186186
openml.setups.functions._get_cached_setup(1)
187187

188188
def test_get_uncached_setup(self):
189-
openml.config.cache_directory = self.static_cache_dir
189+
openml.config.set_root_cache_directory(self.static_cache_dir)
190190
with self.assertRaises(openml.exceptions.OpenMLCacheException):
191191
openml.setups.functions._get_cached_setup(10)

tests/test_tasks/test_task_functions.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,19 +25,19 @@ def tearDown(self):
2525
super(TestTask, self).tearDown()
2626

2727
def test__get_cached_tasks(self):
28-
openml.config.cache_directory = self.static_cache_dir
28+
openml.config.set_root_cache_directory(self.static_cache_dir)
2929
tasks = openml.tasks.functions._get_cached_tasks()
3030
self.assertIsInstance(tasks, dict)
3131
self.assertEqual(len(tasks), 3)
3232
self.assertIsInstance(list(tasks.values())[0], OpenMLTask)
3333

3434
def test__get_cached_task(self):
35-
openml.config.cache_directory = self.static_cache_dir
35+
openml.config.set_root_cache_directory(self.static_cache_dir)
3636
task = openml.tasks.functions._get_cached_task(1)
3737
self.assertIsInstance(task, OpenMLTask)
3838

3939
def test__get_cached_task_not_cached(self):
40-
openml.config.cache_directory = self.static_cache_dir
40+
openml.config.set_root_cache_directory(self.static_cache_dir)
4141
self.assertRaisesRegex(
4242
OpenMLCacheException,
4343
"Task file for tid 2 not cached",
@@ -129,7 +129,7 @@ def test_list_tasks_per_type_paginate(self):
129129
self._check_task(tasks[tid])
130130

131131
def test__get_task(self):
132-
openml.config.cache_directory = self.static_cache_dir
132+
openml.config.set_root_cache_directory(self.static_cache_dir)
133133
openml.tasks.get_task(1882)
134134

135135
@unittest.skip(
@@ -224,7 +224,7 @@ def assert_and_raise(*args, **kwargs):
224224
self.assertFalse(os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml")))
225225

226226
def test_get_task_with_cache(self):
227-
openml.config.cache_directory = self.static_cache_dir
227+
openml.config.set_root_cache_directory(self.static_cache_dir)
228228
task = openml.tasks.get_task(1)
229229
self.assertIsInstance(task, OpenMLTask)
230230

tests/test_tasks/test_task_methods.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def test_tagging(self):
2828
self.assertEqual(len(task_list), 0)
2929

3030
def test_get_train_and_test_split_indices(self):
31-
openml.config.cache_directory = self.static_cache_dir
31+
openml.config.set_root_cache_directory(self.static_cache_dir)
3232
task = openml.tasks.get_task(1882)
3333
train_indices, test_indices = task.get_train_test_split_indices(0, 0)
3434
self.assertEqual(16, train_indices[0])

0 commit comments

Comments
 (0)