diff options
author | Chad Smith <chad.smith@canonical.com> | 2017-11-30 13:09:30 -0700 |
---|---|---|
committer | Scott Moser <smoser@brickies.net> | 2017-11-30 21:51:14 -0500 |
commit | 7acc9e68fafbbd7c56587aebe752ba6ba8c8a3db (patch) | |
tree | 521cc4ebca1fb7b66f6e1dba97e09b4861bb72d7 | |
parent | 88368f9851b29dddb5a12e4b21868cbdef906c5c (diff) | |
download | vyos-cloud-init-7acc9e68fafbbd7c56587aebe752ba6ba8c8a3db.tar.gz vyos-cloud-init-7acc9e68fafbbd7c56587aebe752ba6ba8c8a3db.zip |
ec2: Fix sandboxed dhclient background process cleanup.
There is a race condition where our sandboxed dhclient properly writes a
lease file but has not yet written a pid file. If the sandbox temporary
directory is torn down before the dhclient subprocess writes a pidfile
DataSourceEc2Local gets a traceback and the instance will fallback to
DataSourceEc2 in the init-network stage. This wastes boot cycles we'd
rather not spend.
Fix handling of sandboxed dhclient to wait for both pidfile and leasefile
before proceding. If either file doesn't show in 5 seconds, log a warning
and return empty lease results {}.
LP: #1735331
-rw-r--r-- | cloudinit/net/dhcp.py | 44 | ||||
-rw-r--r-- | cloudinit/net/tests/test_dhcp.py | 66 | ||||
-rw-r--r-- | cloudinit/sources/DataSourceAzure.py | 29 | ||||
-rw-r--r-- | cloudinit/util.py | 22 | ||||
-rw-r--r-- | tests/unittests/test_datasource/test_azure.py | 5 |
5 files changed, 118 insertions, 48 deletions
diff --git a/cloudinit/net/dhcp.py b/cloudinit/net/dhcp.py index d8624d82..875a4609 100644 --- a/cloudinit/net/dhcp.py +++ b/cloudinit/net/dhcp.py @@ -36,22 +36,23 @@ def maybe_perform_dhcp_discovery(nic=None): skip dhcp_discovery and return an empty dict. @param nic: Name of the network interface we want to run dhclient on. - @return: A dict of dhcp options from the dhclient discovery if run, - otherwise an empty dict is returned. + @return: A list of dicts representing dhcp options for each lease obtained + from the dhclient discovery if run, otherwise an empty list is + returned. """ if nic is None: nic = find_fallback_nic() if nic is None: LOG.debug('Skip dhcp_discovery: Unable to find fallback nic.') - return {} + return [] elif nic not in get_devicelist(): LOG.debug( 'Skip dhcp_discovery: nic %s not found in get_devicelist.', nic) - return {} + return [] dhclient_path = util.which('dhclient') if not dhclient_path: LOG.debug('Skip dhclient configuration: No dhclient command found.') - return {} + return [] with temp_utils.tempdir(prefix='cloud-init-dhcp-', needs_exe=True) as tdir: # Use /var/tmp because /run/cloud-init/tmp is mounted noexec return dhcp_discovery(dhclient_path, nic, tdir) @@ -60,8 +61,8 @@ def maybe_perform_dhcp_discovery(nic=None): def parse_dhcp_lease_file(lease_file): """Parse the given dhcp lease file for the most recent lease. - Return a dict of dhcp options as key value pairs for the most recent lease - block. + Return a list of dicts of dhcp options. Each dict contains key value pairs + a specific lease in order from oldest to newest. @raises: InvalidDHCPLeaseFileError on empty of unparseable leasefile content. @@ -96,8 +97,8 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir): @param cleandir: The directory from which to run dhclient as well as store dhcp leases. - @return: A dict of dhcp options parsed from the dhcp.leases file or empty - dict. + @return: A list of dicts of representing the dhcp leases parsed from the + dhcp.leases file or empty list. """ LOG.debug('Performing a dhcp discovery on %s', interface) @@ -119,13 +120,26 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir): cmd = [sandbox_dhclient_cmd, '-1', '-v', '-lf', lease_file, '-pf', pid_file, interface, '-sf', '/bin/true'] util.subp(cmd, capture=True) - pid = None + + # dhclient doesn't write a pid file until after it forks when it gets a + # proper lease response. Since cleandir is a temp directory that gets + # removed, we need to wait for that pidfile creation before the + # cleandir is removed, otherwise we get FileNotFound errors. + missing = util.wait_for_files( + [pid_file, lease_file], maxwait=5, naplen=0.01) + if missing: + LOG.warning("dhclient did not produce expected files: %s", + ', '.join(os.path.basename(f) for f in missing)) + return [] + pid_content = util.load_file(pid_file).strip() try: - pid = int(util.load_file(pid_file).strip()) - return parse_dhcp_lease_file(lease_file) - finally: - if pid: - os.kill(pid, signal.SIGKILL) + pid = int(pid_content) + except ValueError: + LOG.debug( + "pid file contains non-integer content '%s'", pid_content) + else: + os.kill(pid, signal.SIGKILL) + return parse_dhcp_lease_file(lease_file) def networkd_parse_lease(content): diff --git a/cloudinit/net/tests/test_dhcp.py b/cloudinit/net/tests/test_dhcp.py index 3d8e15c0..db25b6f2 100644 --- a/cloudinit/net/tests/test_dhcp.py +++ b/cloudinit/net/tests/test_dhcp.py @@ -1,6 +1,5 @@ # This file is part of cloud-init. See LICENSE file for license information. -import mock import os import signal from textwrap import dedent @@ -9,7 +8,8 @@ from cloudinit.net.dhcp import ( InvalidDHCPLeaseFileError, maybe_perform_dhcp_discovery, parse_dhcp_lease_file, dhcp_discovery, networkd_load_leases) from cloudinit.util import ensure_file, write_file -from cloudinit.tests.helpers import CiTestCase, wrap_and_call, populate_dir +from cloudinit.tests.helpers import ( + CiTestCase, mock, populate_dir, wrap_and_call) class TestParseDHCPLeasesFile(CiTestCase): @@ -69,14 +69,14 @@ class TestDHCPDiscoveryClean(CiTestCase): def test_no_fallback_nic_found(self, m_fallback_nic): """Log and do nothing when nic is absent and no fallback is found.""" m_fallback_nic.return_value = None # No fallback nic found - self.assertEqual({}, maybe_perform_dhcp_discovery()) + self.assertEqual([], maybe_perform_dhcp_discovery()) self.assertIn( 'Skip dhcp_discovery: Unable to find fallback nic.', self.logs.getvalue()) def test_provided_nic_does_not_exist(self): """When the provided nic doesn't exist, log a message and no-op.""" - self.assertEqual({}, maybe_perform_dhcp_discovery('idontexist')) + self.assertEqual([], maybe_perform_dhcp_discovery('idontexist')) self.assertIn( 'Skip dhcp_discovery: nic idontexist not found in get_devicelist.', self.logs.getvalue()) @@ -87,7 +87,7 @@ class TestDHCPDiscoveryClean(CiTestCase): """When dhclient doesn't exist in the OS, log the issue and no-op.""" m_fallback.return_value = 'eth9' m_which.return_value = None # dhclient isn't found - self.assertEqual({}, maybe_perform_dhcp_discovery()) + self.assertEqual([], maybe_perform_dhcp_discovery()) self.assertIn( 'Skip dhclient configuration: No dhclient command found.', self.logs.getvalue()) @@ -117,6 +117,62 @@ class TestDHCPDiscoveryClean(CiTestCase): @mock.patch('cloudinit.net.dhcp.os.kill') @mock.patch('cloudinit.net.dhcp.util.subp') + def test_dhcp_discovery_run_in_sandbox_warns_invalid_pid(self, m_subp, + m_kill): + """dhcp_discovery logs a warning when pidfile contains invalid content. + + Lease processing still occurs and no proc kill is attempted. + """ + tmpdir = self.tmp_dir() + dhclient_script = os.path.join(tmpdir, 'dhclient.orig') + script_content = '#!/bin/bash\necho fake-dhclient' + write_file(dhclient_script, script_content, mode=0o755) + write_file(self.tmp_path('dhclient.pid', tmpdir), '') # Empty pid '' + lease_content = dedent(""" + lease { + interface "eth9"; + fixed-address 192.168.2.74; + option subnet-mask 255.255.255.0; + option routers 192.168.2.1; + } + """) + write_file(self.tmp_path('dhcp.leases', tmpdir), lease_content) + + self.assertItemsEqual( + [{'interface': 'eth9', 'fixed-address': '192.168.2.74', + 'subnet-mask': '255.255.255.0', 'routers': '192.168.2.1'}], + dhcp_discovery(dhclient_script, 'eth9', tmpdir)) + self.assertIn( + "pid file contains non-integer content ''", self.logs.getvalue()) + m_kill.assert_not_called() + + @mock.patch('cloudinit.net.dhcp.os.kill') + @mock.patch('cloudinit.net.dhcp.util.wait_for_files') + @mock.patch('cloudinit.net.dhcp.util.subp') + def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self, + m_subp, + m_wait, + m_kill): + """dhcp_discovery waits for the presence of pidfile and dhcp.leases.""" + tmpdir = self.tmp_dir() + dhclient_script = os.path.join(tmpdir, 'dhclient.orig') + script_content = '#!/bin/bash\necho fake-dhclient' + write_file(dhclient_script, script_content, mode=0o755) + # Don't create pid or leases file + pidfile = self.tmp_path('dhclient.pid', tmpdir) + leasefile = self.tmp_path('dhcp.leases', tmpdir) + m_wait.return_value = [pidfile] # Return the missing pidfile wait for + self.assertEqual([], dhcp_discovery(dhclient_script, 'eth9', tmpdir)) + self.assertEqual( + mock.call([pidfile, leasefile], maxwait=5, naplen=0.01), + m_wait.call_args_list[0]) + self.assertIn( + 'WARNING: dhclient did not produce expected files: dhclient.pid', + self.logs.getvalue()) + m_kill.assert_not_called() + + @mock.patch('cloudinit.net.dhcp.os.kill') + @mock.patch('cloudinit.net.dhcp.util.subp') def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill): """dhcp_discovery brings up the interface and runs dhclient. diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index 8c3492d9..14367e9c 100644 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -11,7 +11,6 @@ from functools import partial import os import os.path import re -import time from xml.dom import minidom import xml.etree.ElementTree as ET @@ -321,7 +320,7 @@ class DataSourceAzure(sources.DataSource): # https://bugs.launchpad.net/cloud-init/+bug/1717611 missing = util.log_time(logfunc=LOG.debug, msg="waiting for SSH public key files", - func=wait_for_files, + func=util.wait_for_files, args=(fp_files, 900)) if len(missing): @@ -556,8 +555,8 @@ def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120, is_new_instance=False): # wait for ephemeral disk to come up naplen = .2 - missing = wait_for_files([devpath], maxwait=maxwait, naplen=naplen, - log_pre="Azure ephemeral disk: ") + missing = util.wait_for_files([devpath], maxwait=maxwait, naplen=naplen, + log_pre="Azure ephemeral disk: ") if missing: LOG.warning("ephemeral device '%s' did not appear after %d seconds.", @@ -639,28 +638,6 @@ def pubkeys_from_crt_files(flist): return pubkeys -def wait_for_files(flist, maxwait, naplen=.5, log_pre=""): - need = set(flist) - waited = 0 - while True: - need -= set([f for f in need if os.path.exists(f)]) - if len(need) == 0: - LOG.debug("%sAll files appeared after %s seconds: %s", - log_pre, waited, flist) - return [] - if waited == 0: - LOG.info("%sWaiting up to %s seconds for the following files: %s", - log_pre, maxwait, flist) - if waited + naplen > maxwait: - break - time.sleep(naplen) - waited += naplen - - LOG.warning("%sStill missing files after %s seconds: %s", - log_pre, maxwait, need) - return need - - def write_files(datadir, files, dirmode=None): def _redact_password(cnt, fname): diff --git a/cloudinit/util.py b/cloudinit/util.py index e1290aa8..6c014ba5 100644 --- a/cloudinit/util.py +++ b/cloudinit/util.py @@ -2541,4 +2541,26 @@ def load_shell_content(content, add_empty=False, empty_val=None): return data +def wait_for_files(flist, maxwait, naplen=.5, log_pre=""): + need = set(flist) + waited = 0 + while True: + need -= set([f for f in need if os.path.exists(f)]) + if len(need) == 0: + LOG.debug("%sAll files appeared after %s seconds: %s", + log_pre, waited, flist) + return [] + if waited == 0: + LOG.debug("%sWaiting up to %s seconds for the following files: %s", + log_pre, maxwait, flist) + if waited + naplen > maxwait: + break + time.sleep(naplen) + waited += naplen + + LOG.debug("%sStill missing files after %s seconds: %s", + log_pre, maxwait, need) + return need + + # vi: ts=4 expandtab diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py index 0a117771..7cb1812a 100644 --- a/tests/unittests/test_datasource/test_azure.py +++ b/tests/unittests/test_datasource/test_azure.py @@ -171,7 +171,6 @@ scbus-1 on xpt0 bus 0 self.apply_patches([ (dsaz, 'list_possible_azure_ds_devs', dsdevs), (dsaz, 'invoke_agent', _invoke_agent), - (dsaz, 'wait_for_files', _wait_for_files), (dsaz, 'pubkeys_from_crt_files', _pubkeys_from_crt_files), (dsaz, 'perform_hostname_bounce', mock.MagicMock()), (dsaz, 'get_hostname', mock.MagicMock()), @@ -179,6 +178,8 @@ scbus-1 on xpt0 bus 0 (dsaz, 'get_metadata_from_fabric', self.get_metadata_from_fabric), (dsaz.util, 'read_dmi_data', mock.MagicMock( side_effect=_dmi_mocks)), + (dsaz.util, 'wait_for_files', mock.MagicMock( + side_effect=_wait_for_files)), ]) dsrc = dsaz.DataSourceAzure( @@ -647,7 +648,7 @@ class TestAzureBounce(TestCase): self.patches.enter_context( mock.patch.object(dsaz, 'invoke_agent')) self.patches.enter_context( - mock.patch.object(dsaz, 'wait_for_files')) + mock.patch.object(dsaz.util, 'wait_for_files')) self.patches.enter_context( mock.patch.object(dsaz, 'list_possible_azure_ds_devs', mock.MagicMock(return_value=[]))) |