Skip to content

Commit 3deb25a

Browse files
committed
Wait for the interfaces to get IP addresses before inspection
In the DIB build the DHCP code (provided by the dhcp-all-interfaces element) races with the service starting IPA. It does not matter for deployment itself, as we're waiting for the route to the Ironic API to appear. However, for inspection it may result in reporting back all NIC's without IP addresses. Inspection fails in this case. This change makes inspection wait for *all* NIC's to get their IP addresses up to a small timeout. The timeout is 60 seconds by default and can be changed via the new ipa-inspection-dhcp-wait-timeout kernel option (0 to not wait). After the wait inspection proceedes in any case, so the worst downside is making inspection 60 seconds longer. To avoid waiting for NIC's that are not even connected, this change extends the NetworkInterface class with 'has_carrier' field. Closes-Bug: #1564954 Change-Id: I5bf14de4c1c622f4bf6e3eadbe20c44759da5d66
1 parent 3cf5369 commit 3deb25a

7 files changed

Lines changed: 163 additions & 10 deletions

File tree

ironic_python_agent/cmd/agent.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,13 @@
114114
help='Comma-separated list of plugins providing additional '
115115
'hardware data for inspection, empty value gives '
116116
'a minimum required set of plugins.'),
117+
118+
cfg.IntOpt('inspection_dhcp_wait_timeout',
119+
default=APARAMS.get('ipa-inspection-dhcp-wait-timeout',
120+
inspector.DEFAULT_DHCP_WAIT_TIMEOUT),
121+
help='Maximum time (in seconds) to wait for all NIC\'s '
122+
'to get their IP addresses via DHCP before inspection. '
123+
'Set to 0 to disable waiting completely.'),
117124
]
118125

119126
CONF.register_cli_opts(cli_opts)

ironic_python_agent/hardware.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -175,12 +175,14 @@ def __init__(self, name, model, size, rotational, wwn=None, serial=None,
175175

176176
class NetworkInterface(encoding.SerializableComparable):
177177
serializable_fields = ('name', 'mac_address', 'switch_port_descr',
178-
'switch_chassis_descr', 'ipv4_address')
178+
'switch_chassis_descr', 'ipv4_address',
179+
'has_carrier')
179180

180-
def __init__(self, name, mac_addr, ipv4_address=None):
181+
def __init__(self, name, mac_addr, ipv4_address=None, has_carrier=True):
181182
self.name = name
182183
self.mac_address = mac_addr
183184
self.ipv4_address = ipv4_address
185+
self.has_carrier = has_carrier
184186
# TODO(russellhaering): Pull these from LLDP
185187
self.switch_port_descr = None
186188
self.switch_chassis_descr = None
@@ -402,7 +404,8 @@ def _get_interface_info(self, interface_name):
402404

403405
return NetworkInterface(
404406
interface_name, mac_addr,
405-
ipv4_address=self.get_ipv4_addr(interface_name))
407+
ipv4_address=self.get_ipv4_addr(interface_name),
408+
has_carrier=self._interface_has_carrier(interface_name))
406409

407410
def get_ipv4_addr(self, interface_id):
408411
try:
@@ -412,6 +415,17 @@ def get_ipv4_addr(self, interface_id):
412415
# No default IPv4 address found
413416
return None
414417

418+
def _interface_has_carrier(self, interface_name):
419+
path = '{0}/class/net/{1}/carrier'.format(self.sys_path,
420+
interface_name)
421+
try:
422+
with open(path, 'rt') as fp:
423+
return fp.read().strip() == '1'
424+
except EnvironmentError:
425+
LOG.debug('No carrier information for interface %s',
426+
interface_name)
427+
return False
428+
415429
def _is_device(self, interface_name):
416430
device_path = '{0}/class/net/{1}/device'.format(self.sys_path,
417431
interface_name)

ironic_python_agent/inspector.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import io
1818
import json
1919
import tarfile
20+
import time
2021

2122
import netaddr
2223
from oslo_concurrency import processutils
@@ -36,6 +37,9 @@
3637
LOG = logging.getLogger(__name__)
3738
CONF = cfg.CONF
3839
DEFAULT_COLLECTOR = 'default'
40+
DEFAULT_DHCP_WAIT_TIMEOUT = 60
41+
42+
_DHCP_RETRY_INTERVAL = 2
3943
_COLLECTOR_NS = 'ironic_python_agent.inspector.collectors'
4044
_NO_LOGGING_FIELDS = ('logs',)
4145

@@ -215,6 +219,38 @@ def discover_scheduling_properties(inventory, data, root_disk=None):
215219
LOG.info('value for %s is %s', key, data[key])
216220

217221

222+
def wait_for_dhcp():
223+
"""Wait until all NIC's get their IP addresses via DHCP or timeout happens.
224+
225+
Ignores interfaces which do not even have a carrier.
226+
227+
Note: only supports IPv4 addresses for now.
228+
229+
:return: True if all NIC's got IP addresses, False if timeout happened.
230+
Also returns True if waiting is disabled via configuration.
231+
"""
232+
if not CONF.inspection_dhcp_wait_timeout:
233+
return True
234+
235+
threshold = time.time() + CONF.inspection_dhcp_wait_timeout
236+
while time.time() <= threshold:
237+
interfaces = hardware.dispatch_to_managers('list_network_interfaces')
238+
missing = [iface.name for iface in interfaces
239+
if iface.has_carrier and not iface.ipv4_address]
240+
if not missing:
241+
return True
242+
243+
LOG.debug('Still waiting for interfaces %s to get IP addresses',
244+
missing)
245+
time.sleep(_DHCP_RETRY_INTERVAL)
246+
247+
LOG.warning('Not all network interfaces received IP addresses in '
248+
'%(timeout)d seconds: %(missing)s',
249+
{'timeout': CONF.inspection_dhcp_wait_timeout,
250+
'missing': missing})
251+
return False
252+
253+
218254
def collect_default(data, failures):
219255
"""The default inspection collector.
220256
@@ -229,6 +265,7 @@ def collect_default(data, failures):
229265
:param data: mutable data that we'll send to inspector
230266
:param failures: AccumulatedFailures object
231267
"""
268+
wait_for_dhcp()
232269
inventory = hardware.dispatch_to_managers('list_hardware_info')
233270

234271
# In the future we will only need the current version of inventory,

ironic_python_agent/tests/unit/test_hardware.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ def test_list_network_interfaces(self,
268268
mocked_open.return_value.__enter__ = lambda s: s
269269
mocked_open.return_value.__exit__ = mock.Mock()
270270
read_mock = mocked_open.return_value.read
271-
read_mock.return_value = '00:0c:29:8c:11:b1\n'
271+
read_mock.side_effect = ['00:0c:29:8c:11:b1\n', '1']
272272
mocked_ifaddresses.return_value = {
273273
netifaces.AF_INET: [{'addr': '192.168.1.2'}]
274274
}
@@ -277,6 +277,32 @@ def test_list_network_interfaces(self,
277277
self.assertEqual('eth0', interfaces[0].name)
278278
self.assertEqual('00:0c:29:8c:11:b1', interfaces[0].mac_address)
279279
self.assertEqual('192.168.1.2', interfaces[0].ipv4_address)
280+
self.assertTrue(interfaces[0].has_carrier)
281+
282+
@mock.patch('netifaces.ifaddresses')
283+
@mock.patch('os.listdir')
284+
@mock.patch('os.path.exists')
285+
@mock.patch('six.moves.builtins.open')
286+
def test_list_network_interfaces_no_carrier(self,
287+
mocked_open,
288+
mocked_exists,
289+
mocked_listdir,
290+
mocked_ifaddresses):
291+
mocked_listdir.return_value = ['lo', 'eth0']
292+
mocked_exists.side_effect = [False, True]
293+
mocked_open.return_value.__enter__ = lambda s: s
294+
mocked_open.return_value.__exit__ = mock.Mock()
295+
read_mock = mocked_open.return_value.read
296+
read_mock.side_effect = ['00:0c:29:8c:11:b1\n', OSError('boom')]
297+
mocked_ifaddresses.return_value = {
298+
netifaces.AF_INET: [{'addr': '192.168.1.2'}]
299+
}
300+
interfaces = self.hardware.list_network_interfaces()
301+
self.assertEqual(1, len(interfaces))
302+
self.assertEqual('eth0', interfaces[0].name)
303+
self.assertEqual('00:0c:29:8c:11:b1', interfaces[0].mac_address)
304+
self.assertEqual('192.168.1.2', interfaces[0].ipv4_address)
305+
self.assertFalse(interfaces[0].has_carrier)
280306

281307
@mock.patch.object(utils, 'execute')
282308
def test_get_os_install_device(self, mocked_execute):

ironic_python_agent/tests/unit/test_inspector.py

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import copy
1919
import io
2020
import tarfile
21+
import time
2122
import unittest
2223

2324
import mock
@@ -328,11 +329,13 @@ def test_no_local_gb(self):
328329

329330
@mock.patch.object(utils, 'get_agent_params',
330331
lambda: {'BOOTIF': 'boot:if'})
332+
@mock.patch.object(inspector, 'wait_for_dhcp', autospec=True)
331333
@mock.patch.object(inspector, 'discover_scheduling_properties', autospec=True)
332334
@mock.patch.object(inspector, 'discover_network_properties', autospec=True)
333335
@mock.patch.object(hardware, 'dispatch_to_managers', autospec=True)
334336
class TestCollectDefault(BaseDiscoverTest):
335-
def test_ok(self, mock_dispatch, mock_discover_net, mock_discover_sched):
337+
def test_ok(self, mock_dispatch, mock_discover_net, mock_discover_sched,
338+
mock_wait_for_dhcp):
336339
mock_dispatch.return_value = self.inventory
337340

338341
inspector.collect_default(self.data, self.failures)
@@ -351,9 +354,10 @@ def test_ok(self, mock_dispatch, mock_discover_net, mock_discover_sched):
351354
mock_discover_sched.assert_called_once_with(
352355
self.inventory, self.data,
353356
root_disk=self.inventory['disks'][0])
357+
mock_wait_for_dhcp.assert_called_once_with()
354358

355359
def test_no_root_disk(self, mock_dispatch, mock_discover_net,
356-
mock_discover_sched):
360+
mock_discover_sched, mock_wait_for_dhcp):
357361
mock_dispatch.return_value = self.inventory
358362
self.inventory['disks'] = []
359363

@@ -371,6 +375,7 @@ def test_no_root_disk(self, mock_dispatch, mock_discover_net,
371375
self.failures)
372376
mock_discover_sched.assert_called_once_with(
373377
self.inventory, self.data, root_disk=None)
378+
mock_wait_for_dhcp.assert_called_once_with()
374379

375380

376381
@mock.patch.object(utils, 'execute', autospec=True)
@@ -453,3 +458,56 @@ def test_parsing_failed(self, mock_execute):
453458
self.assertNotIn('data', self.data)
454459
self.assertTrue(self.failures)
455460
mock_execute.assert_called_once_with('hardware-detect')
461+
462+
463+
@mock.patch.object(hardware, 'dispatch_to_managers', autospec=True)
464+
class TestWaitForDhcp(unittest.TestCase):
465+
def setUp(self):
466+
super(TestWaitForDhcp, self).setUp()
467+
CONF.set_override('inspection_dhcp_wait_timeout',
468+
inspector.DEFAULT_DHCP_WAIT_TIMEOUT)
469+
470+
@mock.patch.object(time, 'sleep', autospec=True)
471+
def test_ok(self, mocked_sleep, mocked_dispatch):
472+
mocked_dispatch.side_effect = [
473+
[hardware.NetworkInterface(name='em0', mac_addr='abcd',
474+
ipv4_address=None),
475+
hardware.NetworkInterface(name='em1', mac_addr='abcd',
476+
ipv4_address='1.2.3.4')],
477+
[hardware.NetworkInterface(name='em0', mac_addr='abcd',
478+
ipv4_address=None),
479+
hardware.NetworkInterface(name='em1', mac_addr='abcd',
480+
ipv4_address='1.2.3.4')],
481+
[hardware.NetworkInterface(name='em0', mac_addr='abcd',
482+
ipv4_address='1.1.1.1'),
483+
hardware.NetworkInterface(name='em1', mac_addr='abcd',
484+
ipv4_address='1.2.3.4')],
485+
]
486+
487+
self.assertTrue(inspector.wait_for_dhcp())
488+
489+
mocked_dispatch.assert_called_with('list_network_interfaces')
490+
self.assertEqual(2, mocked_sleep.call_count)
491+
self.assertEqual(3, mocked_dispatch.call_count)
492+
493+
@mock.patch.object(inspector, '_DHCP_RETRY_INTERVAL', 0.01)
494+
def test_timeout(self, mocked_dispatch):
495+
CONF.set_override('inspection_dhcp_wait_timeout', 0.02)
496+
497+
mocked_dispatch.return_value = [
498+
hardware.NetworkInterface(name='em0', mac_addr='abcd',
499+
ipv4_address=None),
500+
hardware.NetworkInterface(name='em1', mac_addr='abcd',
501+
ipv4_address='1.2.3.4'),
502+
]
503+
504+
self.assertFalse(inspector.wait_for_dhcp())
505+
506+
mocked_dispatch.assert_called_with('list_network_interfaces')
507+
508+
def test_disabled(self, mocked_dispatch):
509+
CONF.set_override('inspection_dhcp_wait_timeout', 0)
510+
511+
self.assertTrue(inspector.wait_for_dhcp())
512+
513+
self.assertFalse(mocked_dispatch.called)

ironic_python_agent/tests/unit/test_ironic_api_client.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -160,14 +160,16 @@ def test_do_lookup(self):
160160
u'name': u'eth0',
161161
u'ipv4_address': None,
162162
u'switch_chassis_descr': None,
163-
u'switch_port_descr': None
163+
u'switch_port_descr': None,
164+
u'has_carrier': True,
164165
},
165166
{
166167
u'mac_address': u'00:0c:29:8c:11:b2',
167168
u'name': u'eth1',
168169
u'ipv4_address': None,
169170
u'switch_chassis_descr': None,
170-
'switch_port_descr': None
171+
u'switch_port_descr': None,
172+
u'has_carrier': True,
171173
}
172174
],
173175
u'cpu': {
@@ -295,14 +297,16 @@ def test_do_lookup_with_node_uuid(self):
295297
u'name': u'eth0',
296298
u'ipv4_address': None,
297299
u'switch_chassis_descr': None,
298-
u'switch_port_descr': None
300+
u'switch_port_descr': None,
301+
u'has_carrier': True,
299302
},
300303
{
301304
u'mac_address': u'00:0c:29:8c:11:b2',
302305
u'name': u'eth1',
303306
u'ipv4_address': None,
304307
u'switch_chassis_descr': None,
305-
'switch_port_descr': None
308+
u'switch_port_descr': None,
309+
u'has_carrier': True,
306310
}
307311
],
308312
u'cpu': {
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
features:
3+
- The "has_carrier" flag was added to the network interface information.
4+
fixes:
5+
- Inspection code now waits up to 1 minute for all NICs to get their IP addresses.
6+
Otherwise inspection fails for some users due to race between DIB DHCP code
7+
and IPA startup. See https://bugs.launchpad.net/bugs/1564954 for details.

0 commit comments

Comments
 (0)