Create a non-linear retry policy. (#1065)

PGijsbers · web-flow · commit a505162b9741 · 2021-05-10T09:33:44.000+02:00
Create a second configurable retry policy. The configuration now allows for a `human` and `robot` retry policy, intended for interactive use and scripts, respectively.
diff --git a/doc/progress.rst b/doc/progress.rst
@@ -8,6 +8,7 @@ Changelog
 
 0.12.2
 ~~~~~~
+* ADD #1065: Add a ``retry_policy`` configuration option that determines the frequency and number of times to attempt to retry server requests.
 * ADD #1075: A docker image is now automatically built on a push to develop. It can be used to build docs or run tests in an isolated environment.
 * DOC: Fixes a few broken links in the documentation.
 * MAINT/DOC: Automatically check for broken external links when building the documentation.
diff --git a/doc/usage.rst b/doc/usage.rst
@@ -52,9 +52,14 @@ which are separated by newlines. The following keys are defined:
     * if set to ``True``, when ``run_flow_on_task`` or similar methods are called a lookup is performed to see if there already exists such a run on the server. If so, download those results instead.
     * if not given, will default to ``True``.
 
+* retry_policy:
+    * Defines how to react when the server is unavailable or experiencing high load. It determines both how often to attempt to reconnect and how quickly to do so. Please don't use ``human`` in an automated script that you run more than one instance of, it might increase the time to complete your jobs and that of others.
+    * human (default): For people running openml in interactive fashion. Try only a few times, but in quick succession.
+    * robot: For people using openml in an automated fashion. Keep trying to reconnect for a longer time, quickly increasing the time between retries.
+
 * connection_n_retries:
-    * number of connection retries.
-    * default: 2. Maximum number of retries: 20.
+    * number of connection retries
+    * default depends on retry_policy (5 for ``human``, 50 for ``robot``)
 
 * verbosity:
     * 0: normal output
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
@@ -3,7 +3,9 @@
 import time
 import hashlib
 import logging
+import math
 import pathlib
+import random
 import requests
 import urllib.parse
 import xml
@@ -217,7 +219,7 @@ def __is_checksum_equal(downloaded_file, md5_checksum=None):
 
 
 def _send_request(request_method, url, data, files=None, md5_checksum=None):
-    n_retries = max(1, min(config.connection_n_retries, config.max_retries))
+    n_retries = max(1, config.connection_n_retries)
 
     response = None
     with requests.Session() as session:
@@ -261,7 +263,17 @@ def _send_request(request_method, url, data, files=None, md5_checksum=None):
                 if retry_counter >= n_retries:
                     raise
                 else:
-                    time.sleep(retry_counter)
+
+                    def robot(n: int) -> float:
+                        wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60
+                        variation = random.gauss(0, wait / 10)
+                        return max(1.0, wait + variation)
+
+                    def human(n: int) -> float:
+                        return max(1.0, n)
+
+                    delay = {"human": human, "robot": robot}[config.retry_policy](retry_counter)
+                    time.sleep(delay)
     if response is None:
         raise ValueError("This should never happen!")
     return response
diff --git a/openml/cli.py b/openml/cli.py
@@ -149,19 +149,17 @@ def check_cache_dir(path: str) -> str:
 def configure_connection_n_retries(value: str) -> None:
     def valid_connection_retries(n: str) -> str:
         if not n.isdigit():
-            return f"Must be an integer number (smaller than {config.max_retries})."
-        if int(n) > config.max_retries:
-            return f"connection_n_retries may not exceed {config.max_retries}."
-        if int(n) == 0:
-            return "connection_n_retries must be non-zero."
+            return f"'{n}' is not a valid positive integer."
+        if int(n) <= 0:
+            return "connection_n_retries must be positive."
         return ""
 
     configure_field(
         field="connection_n_retries",
         value=value,
         check_with_message=valid_connection_retries,
         intro_message="Configuring the number of times to attempt to connect to the OpenML Server",
-        input_message=f"Enter an integer between 0 and {config.max_retries}: ",
+        input_message="Enter a positive integer: ",
     )
 
 
@@ -217,6 +215,35 @@ def is_zero_through_two(verbosity: str) -> str:
     )
 
 
+def configure_retry_policy(value: str) -> None:
+    def is_known_policy(policy: str) -> str:
+        if policy in ["human", "robot"]:
+            return ""
+        return "Must be 'human' or 'robot'."
+
+    def autocomplete_policy(policy: str) -> str:
+        for option in ["human", "robot"]:
+            if option.startswith(policy.lower()):
+                return option
+        return policy
+
+    intro_message = (
+        "Set the retry policy which determines how to react if the server is unresponsive."
+        "We recommend 'human' for interactive usage and 'robot' for scripts."
+        "'human': try a few times in quick succession, less reliable but quicker response."
+        "'robot': try many times with increasing intervals, more reliable but slower response."
+    )
+
+    configure_field(
+        field="retry_policy",
+        value=value,
+        check_with_message=is_known_policy,
+        intro_message=intro_message,
+        input_message="Enter 'human' or 'robot': ",
+        sanitize=autocomplete_policy,
+    )
+
+
 def configure_field(
     field: str,
     value: Union[None, str],
@@ -272,6 +299,7 @@ def configure(args: argparse.Namespace):
         "apikey": configure_apikey,
         "server": configure_server,
         "cachedir": configure_cachedir,
+        "retry_policy": configure_retry_policy,
         "connection_n_retries": configure_connection_n_retries,
         "avoid_duplicate_runs": configure_avoid_duplicate_runs,
         "verbosity": configure_verbosity,
diff --git a/openml/config.py b/openml/config.py
@@ -9,7 +9,7 @@
 import os
 from pathlib import Path
 import platform
-from typing import Tuple, cast, Any
+from typing import Tuple, cast, Any, Optional
 import warnings
 
 from io import StringIO
@@ -95,11 +95,10 @@ def set_file_log_level(file_output_level: int):
         else os.path.join("~", ".openml")
     ),
     "avoid_duplicate_runs": "True",
-    "connection_n_retries": "10",
-    "max_retries": "20",
+    "retry_policy": "human",
+    "connection_n_retries": "5",
 }
 
-
 # Default values are actually added here in the _setup() function which is
 # called at the end of this module
 server = str(_defaults["server"])  # so mypy knows it is a string
@@ -122,9 +121,26 @@ def get_server_base_url() -> str:
 cache_directory = str(_defaults["cachedir"])  # so mypy knows it is a string
 avoid_duplicate_runs = True if _defaults["avoid_duplicate_runs"] == "True" else False
 
-# Number of retries if the connection breaks
+retry_policy = _defaults["retry_policy"]
 connection_n_retries = int(_defaults["connection_n_retries"])
-max_retries = int(_defaults["max_retries"])
+
+
+def set_retry_policy(value: str, n_retries: Optional[int] = None) -> None:
+    global retry_policy
+    global connection_n_retries
+    default_retries_by_policy = dict(human=5, robot=50)
+
+    if value not in default_retries_by_policy:
+        raise ValueError(
+            f"Detected retry_policy '{value}' but must be one of {default_retries_by_policy}"
+        )
+    if n_retries is not None and not isinstance(n_retries, int):
+        raise TypeError(f"`n_retries` must be of type `int` or `None` but is `{type(n_retries)}`.")
+    if isinstance(n_retries, int) and n_retries < 1:
+        raise ValueError(f"`n_retries` is '{n_retries}' but must be positive.")
+
+    retry_policy = value
+    connection_n_retries = default_retries_by_policy[value] if n_retries is None else n_retries
 
 
 class ConfigurationForExamples:
@@ -205,8 +221,6 @@ def _setup(config=None):
     global server
     global cache_directory
     global avoid_duplicate_runs
-    global connection_n_retries
-    global max_retries
 
     config_file = determine_config_file_path()
     config_dir = config_file.parent
@@ -238,8 +252,12 @@ def _get(config, key):
     apikey = _get(config, "apikey")
     server = _get(config, "server")
     short_cache_dir = _get(config, "cachedir")
-    connection_n_retries = int(_get(config, "connection_n_retries"))
-    max_retries = int(_get(config, "max_retries"))
+
+    n_retries = _get(config, "connection_n_retries")
+    if n_retries is not None:
+        n_retries = int(n_retries)
+
+    set_retry_policy(_get(config, "retry_policy"), n_retries)
 
     cache_directory = os.path.expanduser(short_cache_dir)
     # create the cache subdirectory
@@ -261,12 +279,6 @@ def _get(config, key):
             "not working properly." % config_dir
         )
 
-    if connection_n_retries > max_retries:
-        raise ValueError(
-            "A higher number of retries than {} is not allowed to keep the "
-            "server load reasonable".format(max_retries)
-        )
-
 
 def set_field_in_config_file(field: str, value: Any):
     """ Overwrites the `field` in the configuration file with the new `value`. """
@@ -317,7 +329,7 @@ def get_config_as_dict():
     config["cachedir"] = cache_directory
     config["avoid_duplicate_runs"] = avoid_duplicate_runs
     config["connection_n_retries"] = connection_n_retries
-    config["max_retries"] = max_retries
+    config["retry_policy"] = retry_policy
     return config
 
 
diff --git a/openml/testing.py b/openml/testing.py
@@ -94,8 +94,9 @@ def setUp(self, n_levels: int = 1):
         openml.config.cache_directory = self.workdir
 
         # Increase the number of retries to avoid spurious server failures
+        self.retry_policy = openml.config.retry_policy
         self.connection_n_retries = openml.config.connection_n_retries
-        openml.config.connection_n_retries = 10
+        openml.config.set_retry_policy("robot", n_retries=20)
 
     def tearDown(self):
         os.chdir(self.cwd)
@@ -109,6 +110,7 @@ def tearDown(self):
                 raise
         openml.config.server = self.production_server
         openml.config.connection_n_retries = self.connection_n_retries
+        openml.config.retry_policy = self.retry_policy
 
     @classmethod
     def _mark_entity_for_removal(self, entity_type, entity_id):
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -506,6 +506,9 @@ def test__getarff_md5_issue(self):
             "oml:md5_checksum": "abc",
             "oml:url": "https://www.openml.org/data/download/61",
         }
+        n = openml.config.connection_n_retries
+        openml.config.connection_n_retries = 1
+
         self.assertRaisesRegex(
             OpenMLHashException,
             "Checksum of downloaded file is unequal to the expected checksum abc when downloading "
@@ -514,6 +517,8 @@ def test__getarff_md5_issue(self):
             description,
         )
 
+        openml.config.connection_n_retries = n
+
     def test__get_dataset_features(self):
         features_file = _get_dataset_features_file(self.workdir, 2)
         self.assertIsInstance(features_file, str)
diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py
@@ -29,4 +29,4 @@ def test_retry_on_database_error(self, Session_class_mock, _):
         ):
             openml._api_calls._send_request("get", "/abc", {})
 
-        self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 10)
+        self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 20)
diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py
@@ -44,8 +44,8 @@ def test_get_config_as_dict(self):
         _config["server"] = "https://test.openml.org/api/v1/xml"
         _config["cachedir"] = self.workdir
         _config["avoid_duplicate_runs"] = False
-        _config["connection_n_retries"] = 10
-        _config["max_retries"] = 20
+        _config["connection_n_retries"] = 20
+        _config["retry_policy"] = "robot"
         self.assertIsInstance(config, dict)
         self.assertEqual(len(config), 6)
         self.assertDictEqual(config, _config)
@@ -57,8 +57,8 @@ def test_setup_with_config(self):
         _config["server"] = "https://www.openml.org/api/v1/xml"
         _config["cachedir"] = self.workdir
         _config["avoid_duplicate_runs"] = True
+        _config["retry_policy"] = "human"
         _config["connection_n_retries"] = 100
-        _config["max_retries"] = 1000
         orig_config = openml.config.get_config_as_dict()
         openml.config._setup(_config)
         updated_config = openml.config.get_config_as_dict()