Skip to content

Commit bd8c6c7

Browse files
umagodtantsur
andcommitted
Fix waiting for target disk to appear
This patch is changing the _wait_for_disks() method behavior to wait to a specific disk if any device hints is specified. There are cases where the deployment might fail or succeed randomly depending on the order and time that the disks shows up. If no root device hints is specified, the method will just wait for any suitable disk to show up, like before. The _wait_for_disks call was made into a proper hardware manager method. It is now also called each time the cached node is updated, not only on start up. This is to ensure that we wait for the device, matching root device hints (which are part of the node). The loop was corrected to avoid redundant sleeps and warnings. Finally, this patch adds more logging around detecting the root device. Co-Authored-By: Dmitry Tantsur <dtantsur@redhat.com> Change-Id: I10ca70d6a390ed802505c0d10d440dfb52beb56c Closes-Bug: #1670916 (cherry picked from commit 3189c16)
1 parent f05702e commit bd8c6c7

File tree

8 files changed

+157
-101
lines changed

8 files changed

+157
-101
lines changed

ironic_python_agent/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@
180180
default=APARAMS.get('ipa-disk-wait-delay', 3),
181181
help='How much time (in seconds) to wait between attempts '
182182
'to check if at least one suitable disk has appeared '
183-
'in inventory. '
183+
'in inventory. Set to zero to disable. '
184184
'Can be supplied as "ipa-disk-wait-delay" '
185185
'kernel parameter.'),
186186
cfg.BoolOpt('insecure',

ironic_python_agent/hardware.py

Lines changed: 49 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,33 @@ def erase_devices(self, node, ports):
354354
erase_results[block_device.name] = result
355355
return erase_results
356356

357+
def wait_for_disks(self):
358+
"""Wait for the root disk to appear.
359+
360+
Wait for at least one suitable disk to show up or a specific disk
361+
if any device hint is specified. Otherwise neither inspection
362+
not deployment have any chances to succeed.
363+
364+
"""
365+
if not CONF.disk_wait_attempts:
366+
return
367+
368+
for attempt in range(CONF.disk_wait_attempts):
369+
try:
370+
self.get_os_install_device()
371+
except errors.DeviceNotFound:
372+
LOG.debug('Still waiting for the root device to appear, '
373+
'attempt %d of %d', attempt + 1,
374+
CONF.disk_wait_attempts)
375+
376+
if attempt < CONF.disk_wait_attempts - 1:
377+
time.sleep(CONF.disk_wait_delay)
378+
else:
379+
break
380+
else:
381+
LOG.warning('The root device was not detected in %d seconds',
382+
CONF.disk_wait_delay * CONF.disk_wait_attempts)
383+
357384
def list_hardware_info(self):
358385
"""Return full hardware inventory as a serializable dict.
359386
@@ -462,32 +489,9 @@ def __init__(self):
462489
def evaluate_hardware_support(self):
463490
# Do some initialization before we declare ourself ready
464491
_check_for_iscsi()
465-
self._wait_for_disks()
492+
self.wait_for_disks()
466493
return HardwareSupport.GENERIC
467494

468-
def _wait_for_disks(self):
469-
"""Wait for disk to appear
470-
471-
Wait for at least one suitable disk to show up, otherwise neither
472-
inspection not deployment have any chances to succeed.
473-
474-
"""
475-
476-
for attempt in range(CONF.disk_wait_attempts):
477-
try:
478-
block_devices = self.list_block_devices()
479-
utils.guess_root_disk(block_devices)
480-
except errors.DeviceNotFound:
481-
LOG.debug('Still waiting for at least one disk to appear, '
482-
'attempt %d of %d', attempt + 1,
483-
CONF.disk_wait_attempts)
484-
time.sleep(CONF.disk_wait_delay)
485-
else:
486-
break
487-
else:
488-
LOG.warning('No disks detected in %d seconds',
489-
CONF.disk_wait_delay * CONF.disk_wait_attempts)
490-
491495
def collect_lldp_data(self, interface_names):
492496
"""Collect and convert LLDP info from the node.
493497
@@ -678,10 +682,12 @@ def get_os_install_device(self):
678682
root_device_hints = None
679683
if cached_node is not None:
680684
root_device_hints = cached_node['properties'].get('root_device')
685+
LOG.debug('Looking for a device matching root hints %s',
686+
root_device_hints)
681687

682688
block_devices = self.list_block_devices()
683689
if not root_device_hints:
684-
return utils.guess_root_disk(block_devices).name
690+
dev_name = utils.guess_root_disk(block_devices).name
685691
else:
686692
serialized_devs = [dev.serialize() for dev in block_devices]
687693
try:
@@ -702,7 +708,13 @@ def get_os_install_device(self):
702708
"No suitable device was found for "
703709
"deployment using these hints %s" % root_device_hints)
704710

705-
return device['name']
711+
dev_name = device['name']
712+
713+
LOG.info('Picked root device %(dev)s for node %(node)s based on '
714+
'root device hints %(hints)s',
715+
{'dev': dev_name, 'hints': root_device_hints,
716+
'node': cached_node['uuid'] if cached_node else None})
717+
return dev_name
706718

707719
def get_system_vendor_info(self):
708720
product_name = None
@@ -1133,11 +1145,22 @@ def cache_node(node):
11331145
Stores the node object in the hardware module to facilitate the
11341146
access of a node information in the hardware extensions.
11351147
1148+
If the new node does not match the previously cached one, wait for the
1149+
expected root device to appear.
1150+
11361151
:param node: Ironic node object
11371152
"""
11381153
global NODE
1154+
new_node = NODE is None or NODE['uuid'] != node['uuid']
11391155
NODE = node
11401156

1157+
if new_node:
1158+
LOG.info('Cached node %s, waiting for its root device to appear',
1159+
node['uuid'])
1160+
# Root device hints, stored in the new node, can change the expected
1161+
# root device. So let us wait for it to appear again.
1162+
dispatch_to_managers('wait_for_disks')
1163+
11411164

11421165
def get_cached_node():
11431166
"""Guard function around the module variable NODE."""

ironic_python_agent/tests/functional/test_commands.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ class TestCommands(base.FunctionalBase):
2424
different test runs.
2525
"""
2626

27+
node = {'uuid': '1', 'properties': {}}
28+
2729
def step_1_get_empty_commands(self):
2830
response = self.request('get', 'commands')
2931
self.assertEqual({'commands': []}, response)
@@ -34,7 +36,7 @@ def step_2_run_command(self):
3436
# this command succeeds even with an empty node and port. This test's
3537
# success is required for steps 3 and 4 to succeed.
3638
command = {'name': 'clean.get_clean_steps',
37-
'params': {'node': {}, 'ports': {}}}
39+
'params': {'node': self.node, 'ports': {}}}
3840
response = self.request('post', 'commands', json=command,
3941
headers={'Content-Type': 'application/json'})
4042
self.assertIsNone(response['command_error'])

ironic_python_agent/tests/unit/extensions/test_clean.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from ironic_python_agent.extensions import clean
2020

2121

22+
@mock.patch('ironic_python_agent.hardware.cache_node', autospec=True)
2223
class TestCleanExtension(test_base.BaseTestCase):
2324
def setUp(self):
2425
super(TestCleanExtension, self).setUp()
@@ -37,7 +38,8 @@ def setUp(self):
3738
'_get_current_clean_version', autospec=True)
3839
@mock.patch('ironic_python_agent.hardware.dispatch_to_all_managers',
3940
autospec=True)
40-
def test_get_clean_steps(self, mock_dispatch, mock_version):
41+
def test_get_clean_steps(self, mock_dispatch, mock_version,
42+
mock_cache_node):
4143
mock_version.return_value = self.version
4244

4345
manager_steps = {
@@ -135,12 +137,14 @@ def test_get_clean_steps(self, mock_dispatch, mock_version):
135137
# 'priority' in Ironic
136138
self.assertEqual(expected_return,
137139
async_results.join().command_result)
140+
mock_cache_node.assert_called_once_with(self.node)
138141

139142
@mock.patch('ironic_python_agent.hardware.dispatch_to_managers',
140143
autospec=True)
141144
@mock.patch('ironic_python_agent.extensions.clean._check_clean_version',
142145
autospec=True)
143-
def test_execute_clean_step(self, mock_version, mock_dispatch):
146+
def test_execute_clean_step(self, mock_version, mock_dispatch,
147+
mock_cache_node):
144148
result = 'cleaned'
145149
mock_dispatch.return_value = result
146150

@@ -159,13 +163,14 @@ def test_execute_clean_step(self, mock_version, mock_dispatch):
159163
self.step['GenericHardwareManager'][0]['step'],
160164
self.node, self.ports)
161165
self.assertEqual(expected_result, async_result.command_result)
166+
mock_cache_node.assert_called_once_with(self.node)
162167

163168
@mock.patch('ironic_python_agent.hardware.dispatch_to_managers',
164169
autospec=True)
165170
@mock.patch('ironic_python_agent.extensions.clean._check_clean_version',
166171
autospec=True)
167172
def test_execute_clean_step_tuple_result(self, mock_version,
168-
mock_dispatch):
173+
mock_dispatch, mock_cache_node):
169174
result = ('stdout', 'stderr')
170175
mock_dispatch.return_value = result
171176

@@ -184,23 +189,26 @@ def test_execute_clean_step_tuple_result(self, mock_version,
184189
self.step['GenericHardwareManager'][0]['step'],
185190
self.node, self.ports)
186191
self.assertEqual(expected_result, async_result.command_result)
192+
mock_cache_node.assert_called_once_with(self.node)
187193

188194
@mock.patch('ironic_python_agent.extensions.clean._check_clean_version',
189195
autospec=True)
190-
def test_execute_clean_step_no_step(self, mock_version):
196+
def test_execute_clean_step_no_step(self, mock_version, mock_cache_node):
191197
async_result = self.agent_extension.execute_clean_step(
192198
step={}, node=self.node, ports=self.ports,
193199
clean_version=self.version)
194200
async_result.join()
195201

196202
self.assertEqual('FAILED', async_result.command_status)
197203
mock_version.assert_called_once_with(self.version)
204+
mock_cache_node.assert_called_once_with(self.node)
198205

199206
@mock.patch('ironic_python_agent.hardware.dispatch_to_managers',
200207
autospec=True)
201208
@mock.patch('ironic_python_agent.extensions.clean._check_clean_version',
202209
autospec=True)
203-
def test_execute_clean_step_fail(self, mock_version, mock_dispatch):
210+
def test_execute_clean_step_fail(self, mock_version, mock_dispatch,
211+
mock_cache_node):
204212
mock_dispatch.side_effect = RuntimeError
205213

206214
async_result = self.agent_extension.execute_clean_step(
@@ -214,13 +222,15 @@ def test_execute_clean_step_fail(self, mock_version, mock_dispatch):
214222
mock_dispatch.assert_called_once_with(
215223
self.step['GenericHardwareManager'][0]['step'],
216224
self.node, self.ports)
225+
mock_cache_node.assert_called_once_with(self.node)
217226

218227
@mock.patch('ironic_python_agent.hardware.dispatch_to_managers',
219228
autospec=True)
220229
@mock.patch('ironic_python_agent.extensions.clean._check_clean_version',
221230
autospec=True)
222231
def test_execute_clean_step_version_mismatch(self, mock_version,
223-
mock_dispatch):
232+
mock_dispatch,
233+
mock_cache_node):
224234
mock_version.side_effect = errors.CleanVersionMismatch(
225235
{'GenericHardwareManager': 1}, {'GenericHardwareManager': 2})
226236

@@ -232,17 +242,19 @@ def test_execute_clean_step_version_mismatch(self, mock_version,
232242

233243
mock_version.assert_called_once_with(self.version)
234244

235-
@mock.patch('ironic_python_agent.hardware.dispatch_to_all_managers',
236-
autospec=True)
237-
def _get_current_clean_version(self, mock_dispatch):
245+
246+
@mock.patch('ironic_python_agent.hardware.dispatch_to_all_managers',
247+
autospec=True)
248+
class TestCleanVersion(test_base.BaseTestCase):
249+
version = {'generic': '1', 'specific': '1'}
250+
251+
def test__get_current_clean_version(self, mock_dispatch):
238252
mock_dispatch.return_value = {'SpecificHardwareManager':
239253
{'name': 'specific', 'version': '1'},
240254
'GenericHardwareManager':
241255
{'name': 'generic', 'version': '1'}}
242256
self.assertEqual(self.version, clean._get_current_clean_version())
243257

244-
@mock.patch('ironic_python_agent.hardware.dispatch_to_all_managers',
245-
autospec=True)
246258
def test__check_clean_version_fail(self, mock_dispatch):
247259
mock_dispatch.return_value = {'SpecificHardwareManager':
248260
{'name': 'specific', 'version': '1'}}

ironic_python_agent/tests/unit/test_agent.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def test_heartbeat(self, mock_uniform, mock_time, mock_poll, mock_read):
127127
self.assertEqual(2.7, self.heartbeater.error_delay)
128128

129129

130-
@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks',
130+
@mock.patch.object(hardware.GenericHardwareManager, 'wait_for_disks',
131131
lambda self: None)
132132
class TestBaseAgent(ironic_agent_base.IronicAgentTest):
133133

@@ -151,6 +151,7 @@ def setUp(self):
151151
FakeExtension())])
152152
self.sample_nw_iface = hardware.NetworkInterface(
153153
"eth9", "AA:BB:CC:DD:EE:FF", "1.2.3.4", True)
154+
hardware.NODE = None
154155

155156
def assertEqualEncoded(self, a, b):
156157
# Evidently JSONEncoder.default() can't handle None (??) so we have to
@@ -204,7 +205,9 @@ def test_run(self, mock_make_server, mock_dispatch, mock_wait):
204205
server_class=simple_server.WSGIServer)
205206
wsgi_server.serve_forever.assert_called_once_with()
206207
mock_wait.assert_called_once_with(mock.ANY)
207-
mock_dispatch.assert_called_once_with("list_hardware_info")
208+
self.assertEqual([mock.call('list_hardware_info'),
209+
mock.call('wait_for_disks')],
210+
mock_dispatch.call_args_list)
208211
self.agent.heartbeater.start.assert_called_once_with()
209212

210213
@mock.patch.object(hardware, '_check_for_iscsi', mock.Mock())
@@ -252,7 +255,9 @@ def test_run_with_inspection(self, mock_list_hardware, mock_make_server,
252255
self.agent.api_client.lookup_node.call_args[1]['node_uuid'])
253256

254257
mock_wait.assert_called_once_with(mock.ANY)
255-
mock_dispatch.assert_called_once_with("list_hardware_info")
258+
self.assertEqual([mock.call('list_hardware_info'),
259+
mock.call('wait_for_disks')],
260+
mock_dispatch.call_args_list)
256261
self.agent.heartbeater.start.assert_called_once_with()
257262

258263
@mock.patch.object(hardware, '_check_for_iscsi', mock.Mock())
@@ -419,7 +424,9 @@ def test_run_with_sleep(self, mock_make_server, mock_dispatch,
419424
mock_sleep.assert_called_once_with(10)
420425
self.assertTrue(mock_load_managers.called)
421426
self.assertTrue(mock_wait.called)
422-
mock_dispatch.assert_called_once_with('list_hardware_info')
427+
self.assertEqual([mock.call('list_hardware_info'),
428+
mock.call('wait_for_disks')],
429+
mock_dispatch.call_args_list)
423430

424431
def test_async_command_success(self):
425432
result = base.AsyncCommandResult('foo_command', {'fail': False},
@@ -507,7 +514,7 @@ def test_get_route_source_indexerror(self, mock_execute, mock_log):
507514
mock_log.warning.assert_called_once()
508515

509516

510-
@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks',
517+
@mock.patch.object(hardware.GenericHardwareManager, 'wait_for_disks',
511518
lambda self: None)
512519
class TestAgentStandalone(ironic_agent_base.IronicAgentTest):
513520

@@ -563,7 +570,7 @@ def test_run(self, mock_list_hardware, mock_make_server):
563570

564571

565572
@mock.patch.object(hardware, '_check_for_iscsi', lambda: None)
566-
@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks',
573+
@mock.patch.object(hardware.GenericHardwareManager, 'wait_for_disks',
567574
lambda self: None)
568575
@mock.patch.object(socket, 'gethostbyname', autospec=True)
569576
@mock.patch.object(utils, 'execute', autospec=True)

0 commit comments

Comments
 (0)