Skip to content

Commit 59281ec

Browse files
author
Josh Gachnang
committed
Improve IPA logging and exception handling
Debugging the agent is a huge pain point. Tracebacks are rarely logged, error messages are often only returned via the API, and lack of info logging makes it hard to determine where some failures occur. Some errors only return a 500 with no error message or logs. Change-Id: I0a127de6e4abf62e20d5c5ad583ba46738604d2d
1 parent c641371 commit 59281ec

File tree

5 files changed

+67
-15
lines changed

5 files changed

+67
-15
lines changed

ironic_python_agent/api/controllers/v1/base.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,12 @@ def validate(self, value):
2626
return value
2727

2828
def tobasetype(self, value):
29-
"""Turn a RESTError into a dict."""
29+
"""Turn an Exception into a dict."""
3030
return {
3131
'type': value.__class__.__name__,
32-
'code': value.status_code,
33-
'message': value.message,
34-
'details': value.details,
32+
'code': getattr(value, 'status_code', 500),
33+
'message': str(value),
34+
'details': getattr(value, 'details', ''),
3535
}
3636

3737
frombasetype = tobasetype

ironic_python_agent/extensions/base.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
from ironic_python_agent import errors
2525
from ironic_python_agent import utils
2626

27+
LOG = log.getLogger()
28+
2729

2830
class AgentCommandStatus(object):
2931
"""Mapping of agent command statuses."""
@@ -150,7 +152,8 @@ def run(self):
150152

151153
if isinstance(result, (bytes, six.text_type)):
152154
result = {'result': '{}: {}'.format(self.command_name, result)}
153-
155+
LOG.info('Command: %(name)s, result: %(result)s',
156+
{'name': self.command_name, 'result': result})
154157
with self.command_state_lock:
155158
self.command_result = result
156159
self.command_status = AgentCommandStatus.SUCCEEDED
@@ -159,7 +162,11 @@ def run(self):
159162
self.command_error = e
160163
self.command_status = AgentCommandStatus.CLEAN_VERSION_MISMATCH
161164
self.command_result = None
165+
LOG.info('Clean version mismatch for command %s',
166+
self.command_name)
162167
except Exception as e:
168+
LOG.exception('Command failed: %(name)s, error: %(err)s',
169+
{'name': self.command_name, 'err': e})
163170
if not isinstance(e, errors.RESTError):
164171
e = errors.CommandExecutionError(str(e))
165172

@@ -220,29 +227,38 @@ def split_command(self, command_name):
220227
def execute_command(self, command_name, **kwargs):
221228
"""Execute an agent command."""
222229
with self.command_lock:
230+
LOG.info('Executing command: %(name)s with args: %(args)s',
231+
{'name': command_name, 'args': kwargs})
223232
extension_part, command_part = self.split_command(command_name)
224233

225234
if len(self.command_results) > 0:
226235
last_command = list(self.command_results.values())[-1]
227236
if not last_command.is_done():
237+
LOG.info('Tried to execute %(command)s, agent is still '
238+
'executing %(last)s', {'command': command_name,
239+
'last': last_command})
228240
raise errors.CommandExecutionError('agent is busy')
229241

230242
try:
231243
ext = self.get_extension(extension_part)
232244
result = ext.execute(command_part, **kwargs)
233245
except KeyError:
234246
# Extension Not found
247+
LOG.exception('Extension %s not found', extension_part)
235248
raise errors.RequestedObjectNotFoundError('Extension',
236249
extension_part)
237250
except errors.InvalidContentError as e:
238251
# Any command may raise a InvalidContentError which will be
239252
# returned to the caller directly.
253+
LOG.exception('Invalid content error: %s', e)
240254
raise e
241255
except Exception as e:
242256
# Other errors are considered command execution errors, and are
243-
# recorded as an
257+
# recorded as a failed SyncCommandResult with an error message
258+
LOG.exception('Command execution error: %s', e)
244259
result = SyncCommandResult(command_name, kwargs, False, e)
245-
260+
LOG.info('Command %(name)s completed: %(result)s',
261+
{'name': command_name, 'result': result})
246262
self.command_results[result.id] = result
247263
return result
248264

ironic_python_agent/extensions/clean.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,14 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from oslo_log import log
16+
1517
from ironic_python_agent import errors
1618
from ironic_python_agent.extensions import base
1719
from ironic_python_agent import hardware
1820

21+
LOG = log.getLogger()
22+
1923

2024
class CleanExtension(base.BaseAgentExtension):
2125
@base.sync_command('get_clean_steps')
@@ -28,10 +32,13 @@ def get_clean_steps(self, node, ports):
2832
:returns: A list of clean steps with keys step, priority, and
2933
reboot_requested
3034
"""
35+
LOG.debug('Getting clean steps, called with node: %(node)s, '
36+
'ports: %(ports)s', {'node': node, 'ports': ports})
3137
# Results should be a dict, not a list
3238
steps = hardware.dispatch_to_all_managers('get_clean_steps',
3339
node, ports)
3440

41+
LOG.debug('Returning clean steps: %s', steps)
3542
return {
3643
'clean_steps': steps,
3744
'hardware_manager_version': _get_current_clean_version()
@@ -52,17 +59,23 @@ def execute_clean_step(self, step, node, ports, clean_version=None,
5259
the step returns.
5360
"""
5461
# Ensure the agent is still the same version, or raise an exception
62+
LOG.info('Executing clean step %s', step)
5563
_check_clean_version(clean_version)
5664

5765
if 'step' not in step:
58-
raise ValueError('Malformed clean_step, no "step" key: %s'.format(
59-
step))
66+
msg = 'Malformed clean_step, no "step" key: %s' % step
67+
LOG.error(msg)
68+
raise ValueError(msg)
6069
try:
6170
result = hardware.dispatch_to_managers(step['step'], node, ports)
6271
except Exception as e:
63-
raise errors.CleaningError(
64-
'Error performing clean_step %(step)s: %(err)s' %
65-
{'step': step['step'], 'err': e})
72+
msg = ('Error performing clean_step %(step)s: %(err)s' %
73+
{'step': step['step'], 'err': e})
74+
LOG.exception(msg)
75+
raise errors.CleaningError(msg)
76+
77+
LOG.info('Clean step completed: %(step)s, result: %(result)s',
78+
{'step': step, 'result': result})
6679
# Return the step that was executed so we can dispatch
6780
# to the appropriate Ironic interface
6881
return {
@@ -78,6 +91,9 @@ def _check_clean_version(clean_version=None):
7891
return
7992
agent_version = _get_current_clean_version()
8093
if clean_version != agent_version:
94+
LOG.warning('Mismatched clean versions. Agent version: %(agent), '
95+
'node version: %(node)', {'agent': agent_version,
96+
'node': clean_version})
8197
raise errors.CleanVersionMismatch(agent_version=agent_version,
8298
node_version=clean_version)
8399

ironic_python_agent/extensions/standby.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -207,35 +207,45 @@ def __init__(self, agent=None):
207207

208208
@base.async_command('cache_image', _validate_image_info)
209209
def cache_image(self, image_info=None, force=False):
210+
LOG.debug('Caching image %s', image_info['id'])
210211
device = hardware.dispatch_to_managers('get_os_install_device')
211212

212213
result_msg = 'image ({0}) already present on device {1}'
213214

214215
if self.cached_image_id != image_info['id'] or force:
216+
LOG.debug('Already had %s cached, overwriting',
217+
self.cached_image_id)
215218
_download_image(image_info)
216219
_write_image(image_info, device)
217220
self.cached_image_id = image_info['id']
218221
result_msg = 'image ({0}) cached to device {1}'
219222

220-
return result_msg.format(image_info['id'], device)
223+
msg = result_msg.format(image_info['id'], device)
224+
LOG.info(msg)
225+
return msg
221226

222227
@base.async_command('prepare_image', _validate_image_info)
223228
def prepare_image(self,
224229
image_info=None,
225230
configdrive=None):
231+
LOG.debug('Preparing image %s', image_info['id'])
226232
device = hardware.dispatch_to_managers('get_os_install_device')
227233

228234
# don't write image again if already cached
229235
if self.cached_image_id != image_info['id']:
236+
LOG.debug('Already had %s cached, overwriting',
237+
self.cached_image_id)
230238
_download_image(image_info)
231239
_write_image(image_info, device)
232240
self.cached_image_id = image_info['id']
233241

234242
if configdrive is not None:
235243
_write_configdrive_to_partition(configdrive, device)
236244

237-
return 'image ({0}) written to device {1}'.format(image_info['id'],
238-
device)
245+
msg = ('image ({0}) written to device {1}'.format(
246+
image_info['id'], device))
247+
LOG.info(msg)
248+
return msg
239249

240250
def _run_shutdown_script(self, parameter):
241251
script = _path_to_script('shell/shutdown.sh')

ironic_python_agent/hardware.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,11 @@ def dispatch_to_all_managers(method, *args, **kwargs):
614614
LOG.debug('HardwareManager {0} does not support {1}'
615615
.format(manager, method))
616616
continue
617+
except Exception as e:
618+
LOG.exception('Unexpected error dispatching %(method)s to '
619+
'manager %(manager)s: %(e)s',
620+
{'method': method, 'manager': manager, 'e': e})
621+
raise
617622
responses[manager.__class__.__name__] = response
618623
else:
619624
LOG.debug('HardwareManager {0} does not have method {1}'
@@ -650,6 +655,11 @@ def dispatch_to_managers(method, *args, **kwargs):
650655
except(errors.IncompatibleHardwareMethodError):
651656
LOG.debug('HardwareManager {0} does not support {1}'
652657
.format(manager, method))
658+
except Exception as e:
659+
LOG.exception('Unexpected error dispatching %(method)s to '
660+
'manager %(manager)s: %(e)s',
661+
{'method': method, 'manager': manager, 'e': e})
662+
raise
653663
else:
654664
LOG.debug('HardwareManager {0} does not have method {1}'
655665
.format(manager, method))

0 commit comments

Comments
 (0)