Skip to content

Commit a505162

Browse files
authored
Create a non-linear retry policy. (#1065)
Create a second configurable retry policy. The configuration now allows for a `human` and `robot` retry policy, intended for interactive use and scripts, respectively.
1 parent 97d67e7 commit a505162

9 files changed

Lines changed: 97 additions & 32 deletions

File tree

doc/progress.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Changelog
88

99
0.12.2
1010
~~~~~~
11+
* ADD #1065: Add a ``retry_policy`` configuration option that determines the frequency and number of times to attempt to retry server requests.
1112
* ADD #1075: A docker image is now automatically built on a push to develop. It can be used to build docs or run tests in an isolated environment.
1213
* DOC: Fixes a few broken links in the documentation.
1314
* MAINT/DOC: Automatically check for broken external links when building the documentation.

doc/usage.rst

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,14 @@ which are separated by newlines. The following keys are defined:
5252
* if set to ``True``, when ``run_flow_on_task`` or similar methods are called a lookup is performed to see if there already exists such a run on the server. If so, download those results instead.
5353
* if not given, will default to ``True``.
5454

55+
* retry_policy:
56+
* Defines how to react when the server is unavailable or experiencing high load. It determines both how often to attempt to reconnect and how quickly to do so. Please don't use ``human`` in an automated script that you run more than one instance of, it might increase the time to complete your jobs and that of others.
57+
* human (default): For people running openml in interactive fashion. Try only a few times, but in quick succession.
58+
* robot: For people using openml in an automated fashion. Keep trying to reconnect for a longer time, quickly increasing the time between retries.
59+
5560
* connection_n_retries:
56-
* number of connection retries.
57-
* default: 2. Maximum number of retries: 20.
61+
* number of connection retries
62+
* default depends on retry_policy (5 for ``human``, 50 for ``robot``)
5863

5964
* verbosity:
6065
* 0: normal output

openml/_api_calls.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import time
44
import hashlib
55
import logging
6+
import math
67
import pathlib
8+
import random
79
import requests
810
import urllib.parse
911
import xml
@@ -217,7 +219,7 @@ def __is_checksum_equal(downloaded_file, md5_checksum=None):
217219

218220

219221
def _send_request(request_method, url, data, files=None, md5_checksum=None):
220-
n_retries = max(1, min(config.connection_n_retries, config.max_retries))
222+
n_retries = max(1, config.connection_n_retries)
221223

222224
response = None
223225
with requests.Session() as session:
@@ -261,7 +263,17 @@ def _send_request(request_method, url, data, files=None, md5_checksum=None):
261263
if retry_counter >= n_retries:
262264
raise
263265
else:
264-
time.sleep(retry_counter)
266+
267+
def robot(n: int) -> float:
268+
wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60
269+
variation = random.gauss(0, wait / 10)
270+
return max(1.0, wait + variation)
271+
272+
def human(n: int) -> float:
273+
return max(1.0, n)
274+
275+
delay = {"human": human, "robot": robot}[config.retry_policy](retry_counter)
276+
time.sleep(delay)
265277
if response is None:
266278
raise ValueError("This should never happen!")
267279
return response

openml/cli.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -149,19 +149,17 @@ def check_cache_dir(path: str) -> str:
149149
def configure_connection_n_retries(value: str) -> None:
150150
def valid_connection_retries(n: str) -> str:
151151
if not n.isdigit():
152-
return f"Must be an integer number (smaller than {config.max_retries})."
153-
if int(n) > config.max_retries:
154-
return f"connection_n_retries may not exceed {config.max_retries}."
155-
if int(n) == 0:
156-
return "connection_n_retries must be non-zero."
152+
return f"'{n}' is not a valid positive integer."
153+
if int(n) <= 0:
154+
return "connection_n_retries must be positive."
157155
return ""
158156

159157
configure_field(
160158
field="connection_n_retries",
161159
value=value,
162160
check_with_message=valid_connection_retries,
163161
intro_message="Configuring the number of times to attempt to connect to the OpenML Server",
164-
input_message=f"Enter an integer between 0 and {config.max_retries}: ",
162+
input_message="Enter a positive integer: ",
165163
)
166164

167165

@@ -217,6 +215,35 @@ def is_zero_through_two(verbosity: str) -> str:
217215
)
218216

219217

218+
def configure_retry_policy(value: str) -> None:
219+
def is_known_policy(policy: str) -> str:
220+
if policy in ["human", "robot"]:
221+
return ""
222+
return "Must be 'human' or 'robot'."
223+
224+
def autocomplete_policy(policy: str) -> str:
225+
for option in ["human", "robot"]:
226+
if option.startswith(policy.lower()):
227+
return option
228+
return policy
229+
230+
intro_message = (
231+
"Set the retry policy which determines how to react if the server is unresponsive."
232+
"We recommend 'human' for interactive usage and 'robot' for scripts."
233+
"'human': try a few times in quick succession, less reliable but quicker response."
234+
"'robot': try many times with increasing intervals, more reliable but slower response."
235+
)
236+
237+
configure_field(
238+
field="retry_policy",
239+
value=value,
240+
check_with_message=is_known_policy,
241+
intro_message=intro_message,
242+
input_message="Enter 'human' or 'robot': ",
243+
sanitize=autocomplete_policy,
244+
)
245+
246+
220247
def configure_field(
221248
field: str,
222249
value: Union[None, str],
@@ -272,6 +299,7 @@ def configure(args: argparse.Namespace):
272299
"apikey": configure_apikey,
273300
"server": configure_server,
274301
"cachedir": configure_cachedir,
302+
"retry_policy": configure_retry_policy,
275303
"connection_n_retries": configure_connection_n_retries,
276304
"avoid_duplicate_runs": configure_avoid_duplicate_runs,
277305
"verbosity": configure_verbosity,

openml/config.py

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import os
1010
from pathlib import Path
1111
import platform
12-
from typing import Tuple, cast, Any
12+
from typing import Tuple, cast, Any, Optional
1313
import warnings
1414

1515
from io import StringIO
@@ -95,11 +95,10 @@ def set_file_log_level(file_output_level: int):
9595
else os.path.join("~", ".openml")
9696
),
9797
"avoid_duplicate_runs": "True",
98-
"connection_n_retries": "10",
99-
"max_retries": "20",
98+
"retry_policy": "human",
99+
"connection_n_retries": "5",
100100
}
101101

102-
103102
# Default values are actually added here in the _setup() function which is
104103
# called at the end of this module
105104
server = str(_defaults["server"]) # so mypy knows it is a string
@@ -122,9 +121,26 @@ def get_server_base_url() -> str:
122121
cache_directory = str(_defaults["cachedir"]) # so mypy knows it is a string
123122
avoid_duplicate_runs = True if _defaults["avoid_duplicate_runs"] == "True" else False
124123

125-
# Number of retries if the connection breaks
124+
retry_policy = _defaults["retry_policy"]
126125
connection_n_retries = int(_defaults["connection_n_retries"])
127-
max_retries = int(_defaults["max_retries"])
126+
127+
128+
def set_retry_policy(value: str, n_retries: Optional[int] = None) -> None:
129+
global retry_policy
130+
global connection_n_retries
131+
default_retries_by_policy = dict(human=5, robot=50)
132+
133+
if value not in default_retries_by_policy:
134+
raise ValueError(
135+
f"Detected retry_policy '{value}' but must be one of {default_retries_by_policy}"
136+
)
137+
if n_retries is not None and not isinstance(n_retries, int):
138+
raise TypeError(f"`n_retries` must be of type `int` or `None` but is `{type(n_retries)}`.")
139+
if isinstance(n_retries, int) and n_retries < 1:
140+
raise ValueError(f"`n_retries` is '{n_retries}' but must be positive.")
141+
142+
retry_policy = value
143+
connection_n_retries = default_retries_by_policy[value] if n_retries is None else n_retries
128144

129145

130146
class ConfigurationForExamples:
@@ -205,8 +221,6 @@ def _setup(config=None):
205221
global server
206222
global cache_directory
207223
global avoid_duplicate_runs
208-
global connection_n_retries
209-
global max_retries
210224

211225
config_file = determine_config_file_path()
212226
config_dir = config_file.parent
@@ -238,8 +252,12 @@ def _get(config, key):
238252
apikey = _get(config, "apikey")
239253
server = _get(config, "server")
240254
short_cache_dir = _get(config, "cachedir")
241-
connection_n_retries = int(_get(config, "connection_n_retries"))
242-
max_retries = int(_get(config, "max_retries"))
255+
256+
n_retries = _get(config, "connection_n_retries")
257+
if n_retries is not None:
258+
n_retries = int(n_retries)
259+
260+
set_retry_policy(_get(config, "retry_policy"), n_retries)
243261

244262
cache_directory = os.path.expanduser(short_cache_dir)
245263
# create the cache subdirectory
@@ -261,12 +279,6 @@ def _get(config, key):
261279
"not working properly." % config_dir
262280
)
263281

264-
if connection_n_retries > max_retries:
265-
raise ValueError(
266-
"A higher number of retries than {} is not allowed to keep the "
267-
"server load reasonable".format(max_retries)
268-
)
269-
270282

271283
def set_field_in_config_file(field: str, value: Any):
272284
""" Overwrites the `field` in the configuration file with the new `value`. """
@@ -317,7 +329,7 @@ def get_config_as_dict():
317329
config["cachedir"] = cache_directory
318330
config["avoid_duplicate_runs"] = avoid_duplicate_runs
319331
config["connection_n_retries"] = connection_n_retries
320-
config["max_retries"] = max_retries
332+
config["retry_policy"] = retry_policy
321333
return config
322334

323335

openml/testing.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,9 @@ def setUp(self, n_levels: int = 1):
9494
openml.config.cache_directory = self.workdir
9595

9696
# Increase the number of retries to avoid spurious server failures
97+
self.retry_policy = openml.config.retry_policy
9798
self.connection_n_retries = openml.config.connection_n_retries
98-
openml.config.connection_n_retries = 10
99+
openml.config.set_retry_policy("robot", n_retries=20)
99100

100101
def tearDown(self):
101102
os.chdir(self.cwd)
@@ -109,6 +110,7 @@ def tearDown(self):
109110
raise
110111
openml.config.server = self.production_server
111112
openml.config.connection_n_retries = self.connection_n_retries
113+
openml.config.retry_policy = self.retry_policy
112114

113115
@classmethod
114116
def _mark_entity_for_removal(self, entity_type, entity_id):

tests/test_datasets/test_dataset_functions.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,9 @@ def test__getarff_md5_issue(self):
506506
"oml:md5_checksum": "abc",
507507
"oml:url": "https://www.openml.org/data/download/61",
508508
}
509+
n = openml.config.connection_n_retries
510+
openml.config.connection_n_retries = 1
511+
509512
self.assertRaisesRegex(
510513
OpenMLHashException,
511514
"Checksum of downloaded file is unequal to the expected checksum abc when downloading "
@@ -514,6 +517,8 @@ def test__getarff_md5_issue(self):
514517
description,
515518
)
516519

520+
openml.config.connection_n_retries = n
521+
517522
def test__get_dataset_features(self):
518523
features_file = _get_dataset_features_file(self.workdir, 2)
519524
self.assertIsInstance(features_file, str)

tests/test_openml/test_api_calls.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,4 @@ def test_retry_on_database_error(self, Session_class_mock, _):
2929
):
3030
openml._api_calls._send_request("get", "/abc", {})
3131

32-
self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 10)
32+
self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 20)

tests/test_openml/test_config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ def test_get_config_as_dict(self):
4444
_config["server"] = "https://test.openml.org/api/v1/xml"
4545
_config["cachedir"] = self.workdir
4646
_config["avoid_duplicate_runs"] = False
47-
_config["connection_n_retries"] = 10
48-
_config["max_retries"] = 20
47+
_config["connection_n_retries"] = 20
48+
_config["retry_policy"] = "robot"
4949
self.assertIsInstance(config, dict)
5050
self.assertEqual(len(config), 6)
5151
self.assertDictEqual(config, _config)
@@ -57,8 +57,8 @@ def test_setup_with_config(self):
5757
_config["server"] = "https://www.openml.org/api/v1/xml"
5858
_config["cachedir"] = self.workdir
5959
_config["avoid_duplicate_runs"] = True
60+
_config["retry_policy"] = "human"
6061
_config["connection_n_retries"] = 100
61-
_config["max_retries"] = 1000
6262
orig_config = openml.config.get_config_as_dict()
6363
openml.config._setup(_config)
6464
updated_config = openml.config.get_config_as_dict()

0 commit comments

Comments
 (0)