Skip to content

Commit fdadcb5

Browse files
jasonzioServer Team CI Bot
authored andcommitted
net: Wait for dhclient to daemonize before reading lease file
cloud-init uses dhclient to fetch the DHCP lease so it can extract DHCP options. dhclient creates the leasefile, then writes to it; simply waiting for the leasefile to appear creates a race between dhclient and cloud-init. Instead, wait for dhclient to be parented by init. At that point, we know it has written to the leasefile, so it's safe to copy the file and kill the process. cloud-init creates a temporary directory in which to execute dhclient, and deletes that directory after it has killed the process. If cloud-init abandons waiting for dhclient to daemonize, it will still attempt to delete the temporary directory, but will not report an exception should that attempt fail. LP: #1794399
1 parent f19dc8f commit fdadcb5

6 files changed

Lines changed: 84 additions & 19 deletions

File tree

cloudinit/net/dhcp.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import os
1010
import re
1111
import signal
12+
import time
1213

1314
from cloudinit.net import (
1415
EphemeralIPv4Network, find_fallback_nic, get_devicelist,
@@ -127,7 +128,9 @@ def maybe_perform_dhcp_discovery(nic=None):
127128
if not dhclient_path:
128129
LOG.debug('Skip dhclient configuration: No dhclient command found.')
129130
return []
130-
with temp_utils.tempdir(prefix='cloud-init-dhcp-', needs_exe=True) as tdir:
131+
with temp_utils.tempdir(rmtree_ignore_errors=True,
132+
prefix='cloud-init-dhcp-',
133+
needs_exe=True) as tdir:
131134
# Use /var/tmp because /run/cloud-init/tmp is mounted noexec
132135
return dhcp_discovery(dhclient_path, nic, tdir)
133136

@@ -195,24 +198,39 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
195198
'-pf', pid_file, interface, '-sf', '/bin/true']
196199
util.subp(cmd, capture=True)
197200

198-
# dhclient doesn't write a pid file until after it forks when it gets a
199-
# proper lease response. Since cleandir is a temp directory that gets
200-
# removed, we need to wait for that pidfile creation before the
201-
# cleandir is removed, otherwise we get FileNotFound errors.
201+
# Wait for pid file and lease file to appear, and for the process
202+
# named by the pid file to daemonize (have pid 1 as its parent). If we
203+
# try to read the lease file before daemonization happens, we might try
204+
# to read it before the dhclient has actually written it. We also have
205+
# to wait until the dhclient has become a daemon so we can be sure to
206+
# kill the correct process, thus freeing cleandir to be deleted back
207+
# up the callstack.
202208
missing = util.wait_for_files(
203209
[pid_file, lease_file], maxwait=5, naplen=0.01)
204210
if missing:
205211
LOG.warning("dhclient did not produce expected files: %s",
206212
', '.join(os.path.basename(f) for f in missing))
207213
return []
208-
pid_content = util.load_file(pid_file).strip()
209-
try:
210-
pid = int(pid_content)
211-
except ValueError:
212-
LOG.debug(
213-
"pid file contains non-integer content '%s'", pid_content)
214-
else:
215-
os.kill(pid, signal.SIGKILL)
214+
215+
ppid = 'unknown'
216+
for _ in range(0, 1000):
217+
pid_content = util.load_file(pid_file).strip()
218+
try:
219+
pid = int(pid_content)
220+
except ValueError:
221+
pass
222+
else:
223+
ppid = util.get_proc_ppid(pid)
224+
if ppid == 1:
225+
LOG.debug('killing dhclient with pid=%s', pid)
226+
os.kill(pid, signal.SIGKILL)
227+
return parse_dhcp_lease_file(lease_file)
228+
time.sleep(0.01)
229+
230+
LOG.error(
231+
'dhclient(pid=%s, parentpid=%s) failed to daemonize after %s seconds',
232+
pid_content, ppid, 0.01 * 1000
233+
)
216234
return parse_dhcp_lease_file(lease_file)
217235

218236

cloudinit/net/tests/test_dhcp.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,16 +145,20 @@ def test_dhcp_discovery_run_in_sandbox_warns_invalid_pid(self, m_subp,
145145
'subnet-mask': '255.255.255.0', 'routers': '192.168.2.1'}],
146146
dhcp_discovery(dhclient_script, 'eth9', tmpdir))
147147
self.assertIn(
148-
"pid file contains non-integer content ''", self.logs.getvalue())
148+
"dhclient(pid=, parentpid=unknown) failed "
149+
"to daemonize after 10.0 seconds",
150+
self.logs.getvalue())
149151
m_kill.assert_not_called()
150152

153+
@mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
151154
@mock.patch('cloudinit.net.dhcp.os.kill')
152155
@mock.patch('cloudinit.net.dhcp.util.wait_for_files')
153156
@mock.patch('cloudinit.net.dhcp.util.subp')
154157
def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
155158
m_subp,
156159
m_wait,
157-
m_kill):
160+
m_kill,
161+
m_getppid):
158162
"""dhcp_discovery waits for the presence of pidfile and dhcp.leases."""
159163
tmpdir = self.tmp_dir()
160164
dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
@@ -164,6 +168,7 @@ def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
164168
pidfile = self.tmp_path('dhclient.pid', tmpdir)
165169
leasefile = self.tmp_path('dhcp.leases', tmpdir)
166170
m_wait.return_value = [pidfile] # Return the missing pidfile wait for
171+
m_getppid.return_value = 1 # Indicate that dhclient has daemonized
167172
self.assertEqual([], dhcp_discovery(dhclient_script, 'eth9', tmpdir))
168173
self.assertEqual(
169174
mock.call([pidfile, leasefile], maxwait=5, naplen=0.01),
@@ -173,9 +178,10 @@ def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
173178
self.logs.getvalue())
174179
m_kill.assert_not_called()
175180

181+
@mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
176182
@mock.patch('cloudinit.net.dhcp.os.kill')
177183
@mock.patch('cloudinit.net.dhcp.util.subp')
178-
def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill):
184+
def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill, m_getppid):
179185
"""dhcp_discovery brings up the interface and runs dhclient.
180186
181187
It also returns the parsed dhcp.leases file generated in the sandbox.
@@ -197,6 +203,7 @@ def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill):
197203
pid_file = os.path.join(tmpdir, 'dhclient.pid')
198204
my_pid = 1
199205
write_file(pid_file, "%d\n" % my_pid)
206+
m_getppid.return_value = 1 # Indicate that dhclient has daemonized
200207

201208
self.assertItemsEqual(
202209
[{'interface': 'eth9', 'fixed-address': '192.168.2.74',
@@ -355,3 +362,5 @@ def test_ephemeral_dhcp_setup_network_if_url_connectivity(
355362
self.assertEqual(fake_lease, lease)
356363
# Ensure that dhcp discovery occurs
357364
m_dhcp.called_once_with()
365+
366+
# vi: ts=4 expandtab

cloudinit/temp_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,15 +81,15 @@ def unlink_now():
8181

8282

8383
@contextlib.contextmanager
84-
def tempdir(**kwargs):
84+
def tempdir(rmtree_ignore_errors=False, **kwargs):
8585
# This seems like it was only added in python 3.2
8686
# Make it since its useful...
8787
# See: http://bugs.python.org/file12970/tempdir.patch
8888
tdir = mkdtemp(**kwargs)
8989
try:
9090
yield tdir
9191
finally:
92-
shutil.rmtree(tdir)
92+
shutil.rmtree(tdir, ignore_errors=rmtree_ignore_errors)
9393

9494

9595
def mkdtemp(**kwargs):

cloudinit/tests/test_temp_utils.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22

33
"""Tests for cloudinit.temp_utils"""
44

5-
from cloudinit.temp_utils import mkdtemp, mkstemp
5+
from cloudinit.temp_utils import mkdtemp, mkstemp, tempdir
66
from cloudinit.tests.helpers import CiTestCase, wrap_and_call
7+
import os
78

89

910
class TestTempUtils(CiTestCase):
@@ -98,4 +99,19 @@ def fake_mkstemp(*args, **kwargs):
9899
self.assertEqual('/fake/return/path', retval)
99100
self.assertEqual([{'dir': '/run/cloud-init/tmp'}], calls)
100101

102+
def test_tempdir_error_suppression(self):
103+
"""test tempdir suppresses errors during directory removal."""
104+
105+
with self.assertRaises(OSError):
106+
with tempdir(prefix='cloud-init-dhcp-') as tdir:
107+
os.rmdir(tdir)
108+
# As a result, the directory is already gone,
109+
# so shutil.rmtree should raise OSError
110+
111+
with tempdir(rmtree_ignore_errors=True,
112+
prefix='cloud-init-dhcp-') as tdir:
113+
os.rmdir(tdir)
114+
# Since the directory is already gone, shutil.rmtree would raise
115+
# OSError, but we suppress that
116+
101117
# vi: ts=4 expandtab

cloudinit/util.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2876,4 +2876,20 @@ def udevadm_settle(exists=None, timeout=None):
28762876
return subp(settle_cmd)
28772877

28782878

2879+
def get_proc_ppid(pid):
2880+
"""
2881+
Return the parent pid of a process.
2882+
"""
2883+
ppid = 0
2884+
try:
2885+
contents = load_file("/proc/%s/stat" % pid, quiet=True)
2886+
except IOError as e:
2887+
LOG.warning('Failed to load /proc/%s/stat. %s', pid, e)
2888+
if contents:
2889+
parts = contents.split(" ", 4)
2890+
# man proc says
2891+
# ppid %d (4) The PID of the parent.
2892+
ppid = int(parts[3])
2893+
return ppid
2894+
28792895
# vi: ts=4 expandtab

tests/unittests/test_util.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1171,4 +1171,10 @@ def test_non_existing_file_returns_empty_dict(self, m_load_file):
11711171
self.assertEqual({}, util.get_proc_env(1))
11721172
self.assertEqual(1, m_load_file.call_count)
11731173

1174+
def test_get_proc_ppid(self):
1175+
"""get_proc_ppid returns correct parent pid value."""
1176+
my_pid = os.getpid()
1177+
my_ppid = os.getppid()
1178+
self.assertEqual(my_ppid, util.get_proc_ppid(my_pid))
1179+
11741180
# vi: ts=4 expandtab

0 commit comments

Comments
 (0)