Skip to content

Commit 3bc4866

Browse files
author
Josh Gachnang
committed
Adding DynamicLoopingCall around lookup
Currently, if a single lookup call to the Ironic API fails, the entire agent errors out and restarts. This change lets the agent retry the lookup for a configurable amount of time before raising an uncaught exception that forces a restart. Change-Id: I39752fb3f42ad3e4f15a49194f1554e1d3463cf8 Closes-Bug: 1297019
1 parent 6329ae4 commit 3bc4866

File tree

9 files changed

+366
-55
lines changed

9 files changed

+366
-55
lines changed

ironic_python_agent/agent.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,8 @@ def stop(self):
106106

107107

108108
class IronicPythonAgent(object):
109-
def __init__(self, api_url, advertise_address, listen_address):
109+
def __init__(self, api_url, advertise_address, listen_address,
110+
lookup_timeout, lookup_interval):
110111
self.api_url = api_url
111112
self.api_client = ironic_api_client.APIClient(self.api_url)
112113
self.listen_address = listen_address
@@ -127,6 +128,9 @@ def __init__(self, api_url, advertise_address, listen_address):
127128
invoke_on_load=True,
128129
propagate_map_exceptions=True,
129130
)
131+
# lookup timeout in seconds
132+
self.lookup_timeout = lookup_timeout
133+
self.lookup_interval = lookup_interval
130134

131135
def get_status(self):
132136
"""Retrieve a serializable status."""
@@ -195,13 +199,14 @@ def execute_command(self, command_name, **kwargs):
195199

196200
def run(self):
197201
"""Run the Ironic Python Agent."""
202+
# Get the UUID so we can heartbeat to Ironic. Raises LookupNodeError
203+
# if there is an issue (uncaught, restart agent)
198204
self.started_at = _time()
199-
# Get the UUID so we can heartbeat to Ironic
200205
content = self.api_client.lookup_node(
201-
hardware_info=self.hardware.list_hardware_info()
202-
)
203-
if 'node' not in content or 'heartbeat_timeout' not in content:
204-
raise LookupError('Lookup return needs node and heartbeat_timeout')
206+
hardware_info=self.hardware.list_hardware_info(),
207+
timeout=self.lookup_timeout,
208+
starting_interval=self.lookup_interval)
209+
205210
self.node = content['node']
206211
self.heartbeat_timeout = content['heartbeat_timeout']
207212
self.heartbeater.start()
@@ -223,8 +228,12 @@ def build_agent(api_url,
223228
advertise_host,
224229
advertise_port,
225230
listen_host,
226-
listen_port):
231+
listen_port,
232+
lookup_timeout,
233+
lookup_interval):
227234

228235
return IronicPythonAgent(api_url,
229236
(advertise_host, advertise_port),
230-
(listen_host, listen_port))
237+
(listen_host, listen_port),
238+
lookup_timeout,
239+
lookup_interval)

ironic_python_agent/cmd/agent.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,24 @@ def run():
4747
type=int,
4848
help='The port to tell Ironic to reply and send '
4949
'commands to.')
50+
parser.add_argument('--lookup-timeout',
51+
default=300,
52+
type=int,
53+
help='The amount of time to retry the initial lookup '
54+
'call to Ironic. After the timeout, the agent '
55+
'will exit with a non-zero exit code.')
56+
parser.add_argument('--lookup-interval',
57+
default=1,
58+
type=int,
59+
help='The initial interval for retries on the initial '
60+
'lookup call to Ironic. The interval will be '
61+
'doubled after each failure until timeout is '
62+
'exceeded.')
5063
args = parser.parse_args()
5164
agent.build_agent(args.api_url,
5265
args.advertise_host,
5366
args.advertise_port,
5467
args.listen_host,
55-
args.listen_port).run()
68+
args.listen_port,
69+
args.lookup_timeout,
70+
args.lookup_interval).run()

ironic_python_agent/ironic_api_client.py

Lines changed: 65 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,12 @@
1515
"""
1616

1717
import json
18-
1918
import requests
2019

2120
from ironic_python_agent import encoding
2221
from ironic_python_agent import errors
22+
from ironic_python_agent.openstack.common import log
23+
from ironic_python_agent.openstack.common import loopingcall
2324

2425

2526
class APIClient(object):
@@ -29,6 +30,7 @@ def __init__(self, api_url):
2930
self.api_url = api_url.rstrip('/')
3031
self.session = requests.Session()
3132
self.encoder = encoding.RESTJSONEncoder()
33+
self.log = log.getLogger(__name__)
3234

3335
def _request(self, method, path, data=None):
3436
request_url = '{api_url}{path}'.format(api_url=self.api_url, path=path)
@@ -70,38 +72,86 @@ def heartbeat(self, uuid, advertise_address):
7072
except Exception:
7173
raise errors.HeartbeatError('Invalid Heartbeat-Before header')
7274

73-
def lookup_node(self, hardware_info):
75+
def lookup_node(self, hardware_info, timeout, starting_interval):
    """Look this node up in Ironic, retrying until success or timeout.

    Drives ``self._do_lookup`` through a DynamicLoopingCall and blocks
    until that looping call finishes.

    :param hardware_info: hardware description forwarded to the lookup.
    :param timeout: retry budget in seconds, forwarded to the lookup.
    :param starting_interval: first entry of the backoff interval list.
    :returns: whatever the looping call finished with (the lookup
              response content on success).
    :raises errors.LookupNodeError: if the looping call finished with
        ``True``, which is how a timed-out lookup reports itself.
    """
    # The two lists are the mutable state _do_lookup updates in place
    # from one attempt to the next.
    lookup_kwargs = {
        'hardware_info': hardware_info,
        'intervals': [starting_interval],
        'total_time': [0],
        'timeout': timeout,
    }
    looper = loopingcall.DynamicLoopingCall(self._do_lookup,
                                            **lookup_kwargs)
    finished = looper.start()
    result = finished.wait()

    # Anything other than the literal True is real node content.
    if result is not True:
        return result

    raise errors.LookupNodeError('Could not look up node info. Check '
                                 'logs for details.')
89+
90+
def _do_lookup(self, hardware_info, timeout, intervals=None,
               total_time=None):
    """Make one lookup attempt against the Ironic API.

    Meant to be called repeatedly by loopingcall.DynamicLoopingCall: a
    failed attempt returns how long to sleep before the next one, and a
    successful attempt raises LoopingCallDone carrying the response.

    :param hardware_info: value sent as the 'hardware' key of the POST
                          body.
    :param timeout: upper bound, in seconds, on the sum of the retry
                    intervals handed out.
    :param intervals: list of intervals used so far; its last entry is
                      doubled to produce the next one. It is mutated in
                      place, so the caller must pass the same list on
                      every attempt for the backoff to progress.
                      Defaults to a fresh ``[1]`` per call.
    :param total_time: one-element list holding the sum of the intervals
                       handed out so far (request duration is not
                       counted). Mutated in place like ``intervals``.
                       Defaults to a fresh ``[0]`` per call.
    :returns: number of seconds to wait before the next attempt, when
              this attempt failed.
    :raises loopingcall.LoopingCallDone: with ``retvalue`` set to the
        decoded response on success, or with no explicit retvalue once
        the next interval would push the total past ``timeout``.
    """
    # Build defaults per call: list defaults in the signature would be
    # shared by every invocation and leak backoff state between lookups.
    if intervals is None:
        intervals = [1]
    if total_time is None:
        total_time = [0]

    def next_interval():
        """Return the next backoff interval, or end the looping call.

        Doubles the most recent interval. Raises LoopingCallDone (which
        _do_lookup deliberately does not catch) when adding that
        interval would exceed the timeout.
        """
        new_interval = intervals[-1] * 2
        if total_time[0] + new_interval > timeout:
            # No retvalue signifies error
            raise loopingcall.LoopingCallDone()

        total_time[0] += new_interval
        intervals.append(new_interval)
        return new_interval

    path = '/{api_version}/drivers/teeth/vendor_passthru/lookup'.format(
        api_version=self.api_version
    )
    # This hardware won't be saved on the node currently, because of
    # how driver_vendor_passthru is implemented (no node saving).
    data = {
        'hardware': hardware_info
    }

    # Make the POST, make sure we get back normal data/status codes and
    # content
    try:
        response = self._request('POST', path, data=data)
    except Exception as e:
        self.log.warning('POST failed: %s', e)
        return next_interval()

    if response.status_code != requests.codes.OK:
        self.log.warning('Invalid status code: %s', response.status_code)
        return next_interval()

    try:
        content = json.loads(response.content)
    except Exception as e:
        self.log.warning('Error decoding response: %s', e)
        return next_interval()

    # Check for valid response data. Anything that is not a JSON object
    # with an object-valued 'node' is treated as invalid and retried,
    # rather than letting a TypeError escape the retry loop.
    node = content.get('node') if isinstance(content, dict) else None
    if not isinstance(node, dict) or 'uuid' not in node:
        self.log.warning('Got invalid node data from the API: %s',
                         content)
        return next_interval()

    if 'heartbeat_timeout' not in content:
        self.log.warning('Got invalid heartbeat from the API: %s',
                         content)
        return next_interval()

    # Got valid content
    raise loopingcall.LoopingCallDone(retvalue=content)
105155

106156
def _get_agent_url(self, advertise_address):
107157
return 'http://{0}:{1}'.format(advertise_address[0],
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
# Copyright 2010 United States Government as represented by the
2+
# Administrator of the National Aeronautics and Space Administration.
3+
# Copyright 2011 Justin Santa Barbara
4+
# All Rights Reserved.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
7+
# not use this file except in compliance with the License. You may obtain
8+
# a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15+
# License for the specific language governing permissions and limitations
16+
# under the License.
17+
18+
import sys
19+
20+
from eventlet import event
21+
from eventlet import greenthread
22+
23+
from ironic_python_agent.openstack.common.gettextutils import _LE, _LW
24+
from ironic_python_agent.openstack.common import log as logging
25+
from ironic_python_agent.openstack.common import timeutils
26+
27+
LOG = logging.getLogger(__name__)
28+
29+
30+
class LoopingCallDone(Exception):
    """Raised by a poll function to end its looping call cleanly.

    Plays a role similar to StopIteration: the function handed to a
    looping call raises this to break out of the loop without it being
    treated as a failure.

    The exception may carry a value; that value is what the looping
    call's wait() hands back to the waiter.
    """

    def __init__(self, retvalue=True):
        """Store the value to deliver to the waiter.

        :param retvalue: Value that LoopingCall.wait() should return.
        """
        self.retvalue = retvalue
45+
46+
47+
class LoopingCallBase(object):
    """Shared state and controls for looping calls.

    Holds the callable together with the positional and keyword
    arguments it will be invoked with, a running flag, and the object
    that wait() delegates to once a subclass has set it.
    """

    def __init__(self, f=None, *args, **kw):
        """Record the callable and its arguments; nothing is started."""
        self.f = f
        self.args = args
        self.kw = kw
        # Not running and nothing to wait on until a subclass starts it.
        self.done = None
        self._running = False

    def stop(self):
        """Clear the running flag."""
        self._running = False

    def wait(self):
        """Return the result of waiting on ``self.done``."""
        completion = self.done
        return completion.wait()
60+
61+
62+
class FixedIntervalLoopingCall(LoopingCallBase):
    """Looping call that runs its function at a fixed interval."""

    def start(self, interval, initial_delay=None):
        """Spawn the loop in a greenthread and return its done event.

        :param interval: target number of seconds between the start of
                         one run and the start of the next.
        :param initial_delay: optional number of seconds to sleep before
                              the first run.
        :returns: the event that receives the loop's outcome: the
                  LoopingCallDone retvalue, True if the loop ended
                  because it was stopped, or the exception raised by the
                  function.
        """
        self._running = True
        finished = event.Event()

        def _loop():
            if initial_delay:
                greenthread.sleep(initial_delay)

            try:
                while self._running:
                    began = timeutils.utcnow()
                    self.f(*self.args, **self.kw)
                    ended = timeutils.utcnow()
                    if not self._running:
                        break
                    # Time left in this interval once the run's own
                    # duration is subtracted; may be zero or negative.
                    remaining = interval - timeutils.delta_seconds(began,
                                                                   ended)
                    if remaining <= 0:
                        LOG.warn(_LW('task run outlasted interval by %s sec') %
                                 -remaining)
                    greenthread.sleep(max(remaining, 0))
            except LoopingCallDone as finish:
                self.stop()
                finished.send(finish.retvalue)
            except Exception:
                LOG.exception(_LE('in fixed duration looping call'))
                finished.send_exception(*sys.exc_info())
                return
            else:
                # Loop left without an exception, i.e. it was stopped.
                finished.send(True)

        self.done = finished

        greenthread.spawn_n(_loop)
        return self.done
99+
100+
101+
# TODO(mikal): this class name is deprecated in Havana and should be removed
102+
# in the I release
103+
LoopingCall = FixedIntervalLoopingCall
104+
105+
106+
class DynamicLoopingCall(LoopingCallBase):
    """Looping call whose function decides how long to sleep.

    After every run the function's return value is used as the number
    of seconds to sleep before it is called again.
    """

    def start(self, initial_delay=None, periodic_interval_max=None):
        """Spawn the loop in a greenthread and return its done event.

        :param initial_delay: optional number of seconds to sleep before
                              the first run.
        :param periodic_interval_max: optional cap, in seconds, applied
                                      to each sleep the function asks
                                      for.
        :returns: the event that receives the loop's outcome: the
                  LoopingCallDone retvalue, True if the loop ended
                  because it was stopped, or the exception raised by the
                  function.
        """
        self._running = True
        finished = event.Event()

        def _loop():
            if initial_delay:
                greenthread.sleep(initial_delay)

            try:
                while self._running:
                    pause = self.f(*self.args, **self.kw)
                    if not self._running:
                        break

                    if periodic_interval_max is not None:
                        pause = min(pause, periodic_interval_max)
                    LOG.debug('Dynamic looping call sleeping for %.02f '
                              'seconds', pause)
                    greenthread.sleep(pause)
            except LoopingCallDone as finish:
                self.stop()
                finished.send(finish.retvalue)
            except Exception:
                LOG.exception(_LE('in dynamic looping call'))
                finished.send_exception(*sys.exc_info())
                return
            else:
                # Loop left without an exception, i.e. it was stopped.
                finished.send(True)

        self.done = finished

        greenthread.spawn(_loop)
        return self.done

ironic_python_agent/tests/agent.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,9 @@ def setUp(self):
123123
self.agent = agent.IronicPythonAgent('https://fake_api.example.'
124124
'org:8081/',
125125
('203.0.113.1', 9990),
126-
('192.0.2.1', 9999))
126+
('192.0.2.1', 9999),
127+
lookup_timeout=300,
128+
lookup_interval=1)
127129

128130
def assertEqualEncoded(self, a, b):
129131
# Evidently JSONEncoder.default() can't handle None (??) so we have to

0 commit comments

Comments
 (0)