diff options
-rw-r--r-- | cloudinit/distros/networking.py | 15 | ||||
-rw-r--r-- | cloudinit/distros/tests/test_networking.py | 31 | ||||
-rwxr-xr-x | cloudinit/sources/DataSourceAzure.py | 515 | ||||
-rw-r--r-- | cloudinit/sources/helpers/netlink.py | 102 | ||||
-rw-r--r-- | cloudinit/sources/helpers/tests/test_netlink.py | 74 | ||||
-rw-r--r-- | tests/unittests/test_datasource/test_azure.py | 436 | ||||
-rw-r--r-- | tools/.github-cla-signers | 1 |
7 files changed, 1109 insertions, 65 deletions
diff --git a/cloudinit/distros/networking.py b/cloudinit/distros/networking.py index e407fa29..c291196a 100644 --- a/cloudinit/distros/networking.py +++ b/cloudinit/distros/networking.py @@ -2,6 +2,7 @@ import abc import logging import os +from cloudinit import subp from cloudinit import net, util @@ -175,6 +176,10 @@ class Networking(metaclass=abc.ABCMeta): if strict: raise RuntimeError(msg) + @abc.abstractmethod + def try_set_link_up(self, devname: DeviceName) -> bool: + """Try setting the link to up explicitly and return if it is up.""" + class BSDNetworking(Networking): """Implementation of networking functionality shared across BSDs.""" @@ -185,6 +190,9 @@ class BSDNetworking(Networking): def settle(self, *, exists=None) -> None: """BSD has no equivalent to `udevadm settle`; noop.""" + def try_set_link_up(self, devname: DeviceName) -> bool: + raise NotImplementedError() + class LinuxNetworking(Networking): """Implementation of networking functionality common to Linux distros.""" @@ -214,3 +222,10 @@ class LinuxNetworking(Networking): if exists is not None: exists = net.sys_dev_path(exists) util.udevadm_settle(exists=exists) + + def try_set_link_up(self, devname: DeviceName) -> bool: + """Try setting the link to up explicitly and return if it is up. + Not guaranteed to bring the interface up. The caller is expected to + add wait times before retrying.""" + subp.subp(['ip', 'link', 'set', devname, 'up']) + return self.is_up(devname) diff --git a/cloudinit/distros/tests/test_networking.py b/cloudinit/distros/tests/test_networking.py index b9a63842..ec508f4d 100644 --- a/cloudinit/distros/tests/test_networking.py +++ b/cloudinit/distros/tests/test_networking.py @@ -30,6 +30,9 @@ def generic_networking_cls(): def settle(self, *args, **kwargs): raise NotImplementedError + def try_set_link_up(self, *args, **kwargs): + raise NotImplementedError + error = AssertionError("Unexpectedly used /sys in generic networking code") with mock.patch( "cloudinit.net.get_sys_class_path", side_effect=error, @@ -74,6 +77,34 @@ class TestLinuxNetworkingIsPhysical: assert LinuxNetworking().is_physical(devname) +class TestBSDNetworkingTrySetLinkUp: + def test_raises_notimplementederror(self): + with pytest.raises(NotImplementedError): + BSDNetworking().try_set_link_up("eth0") + + +@mock.patch("cloudinit.net.is_up") +@mock.patch("cloudinit.distros.networking.subp.subp") +class TestLinuxNetworkingTrySetLinkUp: + def test_calls_subp_return_true(self, m_subp, m_is_up): + devname = "eth0" + m_is_up.return_value = True + is_success = LinuxNetworking().try_set_link_up(devname) + + assert (mock.call(['ip', 'link', 'set', devname, 'up']) == + m_subp.call_args_list[-1]) + assert is_success + + def test_calls_subp_return_false(self, m_subp, m_is_up): + devname = "eth0" + m_is_up.return_value = False + is_success = LinuxNetworking().try_set_link_up(devname) + + assert (mock.call(['ip', 'link', 'set', devname, 'up']) == + m_subp.call_args_list[-1]) + assert not is_success + + class TestBSDNetworkingSettle: def test_settle_doesnt_error(self): # This also implicitly tests that it doesn't use subp.subp diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index f777a007..04ff2131 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -12,8 +12,10 @@ import os import os.path import re from time import time +from time import sleep from xml.dom import minidom import xml.etree.ElementTree as ET +from enum import Enum from cloudinit import dmi from cloudinit import log as logging @@ -67,13 +69,27 @@ DEFAULT_FS = 'ext4' # DMI chassis-asset-tag is set static for all azure instances AZURE_CHASSIS_ASSET_TAG = '7783-7084-3265-9085-8269-3286-77' REPROVISION_MARKER_FILE = "/var/lib/cloud/data/poll_imds" +REPROVISION_NIC_ATTACH_MARKER_FILE = "/var/lib/cloud/data/wait_for_nic_attach" +REPROVISION_NIC_DETACHED_MARKER_FILE = "/var/lib/cloud/data/nic_detached" REPORTED_READY_MARKER_FILE = "/var/lib/cloud/data/reported_ready" AGENT_SEED_DIR = '/var/lib/waagent' + # In the event where the IMDS primary server is not # available, it takes 1s to fallback to the secondary one IMDS_TIMEOUT_IN_SECONDS = 2 IMDS_URL = "http://169.254.169.254/metadata/" +IMDS_VER = "2019-06-01" +IMDS_VER_PARAM = "api-version={}".format(IMDS_VER) + + +class metadata_type(Enum): + compute = "{}instance?{}".format(IMDS_URL, IMDS_VER_PARAM) + network = "{}instance/network?{}".format(IMDS_URL, + IMDS_VER_PARAM) + reprovisiondata = "{}reprovisiondata?{}".format(IMDS_URL, + IMDS_VER_PARAM) + PLATFORM_ENTROPY_SOURCE = "/sys/firmware/acpi/tables/OEM0" @@ -434,20 +450,39 @@ class DataSourceAzure(sources.DataSource): # need to look in the datadir and consider that valid ddir = self.ds_cfg['data_dir'] + # The order in which the candidates are inserted matters here, because + # it determines the value of ret. More specifically, the first one in + # the candidate list determines the path to take in order to get the + # metadata we need. candidates = [self.seed_dir] if os.path.isfile(REPROVISION_MARKER_FILE): candidates.insert(0, "IMDS") + report_diagnostic_event("Reprovision marker file already present " + "before crawling Azure metadata: %s" % + REPROVISION_MARKER_FILE, + logger_func=LOG.debug) + elif os.path.isfile(REPROVISION_NIC_ATTACH_MARKER_FILE): + candidates.insert(0, "NIC_ATTACH_MARKER_PRESENT") + report_diagnostic_event("Reprovision nic attach marker file " + "already present before crawling Azure " + "metadata: %s" % + REPROVISION_NIC_ATTACH_MARKER_FILE, + logger_func=LOG.debug) candidates.extend(list_possible_azure_ds_devs()) if ddir: candidates.append(ddir) found = None reprovision = False + reprovision_after_nic_attach = False for cdev in candidates: try: if cdev == "IMDS": ret = None reprovision = True + elif cdev == "NIC_ATTACH_MARKER_PRESENT": + ret = None + reprovision_after_nic_attach = True elif cdev.startswith("/dev/"): if util.is_FreeBSD(): ret = util.mount_cb(cdev, load_azure_ds_dir, @@ -472,12 +507,19 @@ class DataSourceAzure(sources.DataSource): continue perform_reprovision = reprovision or self._should_reprovision(ret) - if perform_reprovision: + perform_reprovision_after_nic_attach = ( + reprovision_after_nic_attach or + self._should_reprovision_after_nic_attach(ret)) + + if perform_reprovision or perform_reprovision_after_nic_attach: if util.is_FreeBSD(): msg = "Free BSD is not supported for PPS VMs" report_diagnostic_event(msg, logger_func=LOG.error) raise sources.InvalidMetaDataException(msg) + if perform_reprovision_after_nic_attach: + self._wait_for_all_nics_ready() ret = self._reprovision() + imds_md = get_metadata_from_imds( self.fallback_interface, retries=10) (md, userdata_raw, cfg, files) = ret @@ -508,7 +550,7 @@ class DataSourceAzure(sources.DataSource): crawled_data['metadata']['random_seed'] = seed crawled_data['metadata']['instance-id'] = self._iid() - if perform_reprovision: + if perform_reprovision or perform_reprovision_after_nic_attach: LOG.info("Reporting ready to Azure after getting ReprovisionData") use_cached_ephemeral = ( self.distro.networking.is_up(self.fallback_interface) and @@ -617,14 +659,12 @@ class DataSourceAzure(sources.DataSource): LOG.debug('Retrieved SSH keys from IMDS') except KeyError: log_msg = 'Unable to get keys from IMDS, falling back to OVF' - LOG.debug(log_msg) report_diagnostic_event(log_msg, logger_func=LOG.debug) try: ssh_keys = self.metadata['public-keys'] LOG.debug('Retrieved keys from OVF') except KeyError: log_msg = 'No keys available from OVF' - LOG.debug(log_msg) report_diagnostic_event(log_msg, logger_func=LOG.debug) return ssh_keys @@ -660,10 +700,293 @@ class DataSourceAzure(sources.DataSource): LOG.debug("negotiating already done for %s", self.get_instance_id()) + @azure_ds_telemetry_reporter + def _wait_for_nic_detach(self, nl_sock): + """Use the netlink socket provided to wait for nic detach event. + NOTE: The function doesn't close the socket. The caller owns closing + the socket and disposing it safely. + """ + try: + ifname = None + + # Preprovisioned VM will only have one NIC, and it gets + # detached immediately after deployment. + with events.ReportEventStack( + name="wait-for-nic-detach", + description=("wait for nic detach"), + parent=azure_ds_reporter): + ifname = netlink.wait_for_nic_detach_event(nl_sock) + if ifname is None: + msg = ("Preprovisioned nic not detached as expected. " + "Proceeding without failing.") + report_diagnostic_event(msg, logger_func=LOG.warning) + else: + report_diagnostic_event("The preprovisioned nic %s is detached" + % ifname, logger_func=LOG.warning) + path = REPROVISION_NIC_DETACHED_MARKER_FILE + LOG.info("Creating a marker file for nic detached: %s", path) + util.write_file(path, "{pid}: {time}\n".format( + pid=os.getpid(), time=time())) + except AssertionError as error: + report_diagnostic_event(error, logger_func=LOG.error) + raise + + @azure_ds_telemetry_reporter + def wait_for_link_up(self, ifname): + """In cases where the link state is still showing down after a nic is + hot-attached, we can attempt to bring it up by forcing the hv_netvsc + drivers to query the link state by unbinding and then binding the + device. This function attempts infinitely until the link is up, + because we cannot proceed further until we have a stable link.""" + + if self.distro.networking.try_set_link_up(ifname): + report_diagnostic_event("The link %s is already up." % ifname, + logger_func=LOG.info) + return + + LOG.info("Attempting to bring %s up", ifname) + + attempts = 0 + while True: + + LOG.info("Unbinding and binding the interface %s", ifname) + devicename = net.read_sys_net(ifname, + 'device/device_id').strip('{}') + util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/unbind', + devicename) + util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/bind', + devicename) + + attempts = attempts + 1 + if self.distro.networking.try_set_link_up(ifname): + msg = "The link %s is up after %s attempts" % (ifname, + attempts) + report_diagnostic_event(msg, logger_func=LOG.info) + return + + sleep_duration = 1 + msg = ("Link is not up after %d attempts with %d seconds sleep " + "between attempts." % (attempts, sleep_duration)) + + if attempts % 10 == 0: + report_diagnostic_event(msg, logger_func=LOG.info) + else: + LOG.info(msg) + + sleep(sleep_duration) + + @azure_ds_telemetry_reporter + def _create_report_ready_marker(self): + path = REPORTED_READY_MARKER_FILE + LOG.info( + "Creating a marker file to report ready: %s", path) + util.write_file(path, "{pid}: {time}\n".format( + pid=os.getpid(), time=time())) + report_diagnostic_event( + 'Successfully created reported ready marker file ' + 'while in the preprovisioning pool.', + logger_func=LOG.debug) + + @azure_ds_telemetry_reporter + def _report_ready_if_needed(self): + """Report ready to the platform if the marker file is not present, + and create the marker file. + """ + have_not_reported_ready = ( + not os.path.isfile(REPORTED_READY_MARKER_FILE)) + + if have_not_reported_ready: + report_diagnostic_event("Reporting ready before nic detach", + logger_func=LOG.info) + try: + with EphemeralDHCPv4WithReporting(azure_ds_reporter) as lease: + self._report_ready(lease=lease) + except Exception as e: + report_diagnostic_event("Exception reporting ready during " + "preprovisioning before nic detach: %s" + % e, logger_func=LOG.error) + raise + self._create_report_ready_marker() + else: + report_diagnostic_event("Already reported ready before nic detach." + " The marker file already exists: %s" % + REPORTED_READY_MARKER_FILE, + logger_func=LOG.error) + + @azure_ds_telemetry_reporter + def _check_if_nic_is_primary(self, ifname): + """Check if a given interface is the primary nic or not. If it is the + primary nic, then we also get the expected total nic count from IMDS. + IMDS will process the request and send a response only for primary NIC. + """ + is_primary = False + expected_nic_count = -1 + imds_md = None + + # For now, only a VM's primary NIC can contact IMDS and WireServer. If + # DHCP fails for a NIC, we have no mechanism to determine if the NIC is + # primary or secondary. In this case, the desired behavior is to fail + # VM provisioning if there is any DHCP failure when trying to determine + # the primary NIC. + try: + with events.ReportEventStack( + name="obtain-dhcp-lease", + description=("obtain dhcp lease for %s when attempting to " + "determine primary NIC during reprovision of " + "a pre-provisioned VM" % ifname), + parent=azure_ds_reporter): + dhcp_ctx = EphemeralDHCPv4( + iface=ifname, + dhcp_log_func=dhcp_log_cb) + dhcp_ctx.obtain_lease() + except Exception as e: + report_diagnostic_event("Giving up. Failed to obtain dhcp lease " + "for %s when attempting to determine " + "primary NIC during reprovision due to %s" + % (ifname, e), logger_func=LOG.error) + raise + + # Primary nic detection will be optimized in the future. The fact that + # primary nic is being attached first helps here. Otherwise each nic + # could add several seconds of delay. + try: + imds_md = get_metadata_from_imds( + ifname, + 5, + metadata_type.network) + except Exception as e: + LOG.warning( + "Failed to get network metadata using nic %s. Attempt to " + "contact IMDS failed with error %s. Assuming this is not the " + "primary nic.", ifname, e) + finally: + # If we are not the primary nic, then clean the dhcp context. + if imds_md is None: + dhcp_ctx.clean_network() + + if imds_md is not None: + # Only primary NIC will get a response from IMDS. + LOG.info("%s is the primary nic", ifname) + is_primary = True + + # If primary, set ephemeral dhcp ctx so we can report ready + self._ephemeral_dhcp_ctx = dhcp_ctx + + # Set the expected nic count based on the response received. + expected_nic_count = len( + imds_md['interface']) + report_diagnostic_event("Expected nic count: %d" % + expected_nic_count, logger_func=LOG.info) + + return is_primary, expected_nic_count + + @azure_ds_telemetry_reporter + def _wait_for_hot_attached_nics(self, nl_sock): + """Wait until all the expected nics for the vm are hot-attached. + The expected nic count is obtained by requesting the network metadata + from IMDS. + """ + LOG.info("Waiting for nics to be hot-attached") + try: + # Wait for nics to be attached one at a time, until we know for + # sure that all nics have been attached. + nics_found = [] + primary_nic_found = False + expected_nic_count = -1 + + # Wait for netlink nic attach events. After the first nic is + # attached, we are already in the customer vm deployment path and + # so eerything from then on should happen fast and avoid + # unnecessary delays wherever possible. + while True: + ifname = None + with events.ReportEventStack( + name="wait-for-nic-attach", + description=("wait for nic attach after %d nics have " + "been attached" % len(nics_found)), + parent=azure_ds_reporter): + ifname = netlink.wait_for_nic_attach_event(nl_sock, + nics_found) + + # wait_for_nic_attach_event guarantees that ifname it not None + nics_found.append(ifname) + report_diagnostic_event("Detected nic %s attached." % ifname, + logger_func=LOG.info) + + # Attempt to bring the interface's operating state to + # UP in case it is not already. + self.wait_for_link_up(ifname) + + # If primary nic is not found, check if this is it. The + # platform will attach the primary nic first so we + # won't be in primary_nic_found = false state for long. + if not primary_nic_found: + LOG.info("Checking if %s is the primary nic", + ifname) + (primary_nic_found, expected_nic_count) = ( + self._check_if_nic_is_primary(ifname)) + + # Exit criteria: check if we've discovered all nics + if (expected_nic_count != -1 + and len(nics_found) >= expected_nic_count): + LOG.info("Found all the nics for this VM.") + break + + except AssertionError as error: + report_diagnostic_event(error, logger_func=LOG.error) + + @azure_ds_telemetry_reporter + def _wait_for_all_nics_ready(self): + """Wait for nic(s) to be hot-attached. There may be multiple nics + depending on the customer request. + But only primary nic would be able to communicate with wireserver + and IMDS. So we detect and save the primary nic to be used later. + """ + + nl_sock = None + try: + nl_sock = netlink.create_bound_netlink_socket() + + report_ready_marker_present = bool( + os.path.isfile(REPORTED_READY_MARKER_FILE)) + + # Report ready if the marker file is not already present. + # The nic of the preprovisioned vm gets hot-detached as soon as + # we report ready. So no need to save the dhcp context. + self._report_ready_if_needed() + + has_nic_been_detached = bool( + os.path.isfile(REPROVISION_NIC_DETACHED_MARKER_FILE)) + + if not has_nic_been_detached: + LOG.info("NIC has not been detached yet.") + self._wait_for_nic_detach(nl_sock) + + # If we know that the preprovisioned nic has been detached, and we + # still have a fallback nic, then it means the VM must have + # rebooted as part of customer assignment, and all the nics have + # already been attached by the Azure platform. So there is no need + # to wait for nics to be hot-attached. + if not self.fallback_interface: + self._wait_for_hot_attached_nics(nl_sock) + else: + report_diagnostic_event("Skipping waiting for nic attach " + "because we already have a fallback " + "interface. Report Ready marker " + "present before detaching nics: %s" % + report_ready_marker_present, + logger_func=LOG.info) + except netlink.NetlinkCreateSocketError as e: + report_diagnostic_event(e, logger_func=LOG.warning) + raise + finally: + if nl_sock: + nl_sock.close() + def _poll_imds(self): """Poll IMDS for the new provisioning data until we get a valid response. Then return the returned JSON object.""" - url = IMDS_URL + "reprovisiondata?api-version=2017-04-02" + url = metadata_type.reprovisiondata.value headers = {"Metadata": "true"} nl_sock = None report_ready = bool(not os.path.isfile(REPORTED_READY_MARKER_FILE)) @@ -705,17 +1028,31 @@ class DataSourceAzure(sources.DataSource): logger_func=LOG.warning) return False - LOG.debug("Wait for vnetswitch to happen") + # When the interface is hot-attached, we would have already + # done dhcp and set the dhcp context. In that case, skip + # the attempt to do dhcp. + is_ephemeral_ctx_present = self._ephemeral_dhcp_ctx is not None + msg = ("Unexpected error. Dhcp context is not expected to be already " + "set when we need to wait for vnet switch") + if is_ephemeral_ctx_present and report_ready: + report_diagnostic_event(msg, logger_func=LOG.error) + raise RuntimeError(msg) + while True: try: - # Save our EphemeralDHCPv4 context to avoid repeated dhcp - with events.ReportEventStack( - name="obtain-dhcp-lease", - description="obtain dhcp lease", - parent=azure_ds_reporter): - self._ephemeral_dhcp_ctx = EphemeralDHCPv4( - dhcp_log_func=dhcp_log_cb) - lease = self._ephemeral_dhcp_ctx.obtain_lease() + # Since is_ephemeral_ctx_present is set only once, this ensures + # that with regular reprovisioning, dhcp is always done every + # time the loop runs. + if not is_ephemeral_ctx_present: + # Save our EphemeralDHCPv4 context to avoid repeated dhcp + # later when we report ready + with events.ReportEventStack( + name="obtain-dhcp-lease", + description="obtain dhcp lease", + parent=azure_ds_reporter): + self._ephemeral_dhcp_ctx = EphemeralDHCPv4( + dhcp_log_func=dhcp_log_cb) + lease = self._ephemeral_dhcp_ctx.obtain_lease() if vnet_switched: dhcp_attempts += 1 @@ -737,17 +1074,10 @@ class DataSourceAzure(sources.DataSource): self._ephemeral_dhcp_ctx.clean_network() raise sources.InvalidMetaDataException(msg) - path = REPORTED_READY_MARKER_FILE - LOG.info( - "Creating a marker file to report ready: %s", path) - util.write_file(path, "{pid}: {time}\n".format( - pid=os.getpid(), time=time())) - report_diagnostic_event( - 'Successfully created reported ready marker file ' - 'while in the preprovisioning pool.', - logger_func=LOG.debug) + self._create_report_ready_marker() report_ready = False + LOG.debug("Wait for vnetswitch to happen") with events.ReportEventStack( name="wait-for-media-disconnect-connect", description="wait for vnet switch", @@ -863,6 +1193,35 @@ class DataSourceAzure(sources.DataSource): "connectivity issues: %s" % e, logger_func=LOG.warning) return False + def _should_reprovision_after_nic_attach(self, candidate_metadata) -> bool: + """Whether or not we should wait for nic attach and then poll + IMDS for reprovisioning data. Also sets a marker file to poll IMDS. + + The marker file is used for the following scenario: the VM boots into + wait for nic attach, which we expect to be proceeding infinitely until + the nic is attached. If for whatever reason the platform moves us to a + new host (for instance a hardware issue), we need to keep waiting. + However, since the VM reports ready to the Fabric, we will not attach + the ISO, thus cloud-init needs to have a way of knowing that it should + jump back into the waiting mode in order to retrieve the ovf_env. + + @param candidate_metadata: Metadata obtained from reading ovf-env. + @return: Whether to reprovision after waiting for nics to be attached. + """ + if not candidate_metadata: + return False + (_md, _userdata_raw, cfg, _files) = candidate_metadata + path = REPROVISION_NIC_ATTACH_MARKER_FILE + if (cfg.get('PreprovisionedVMType', None) == "Savable" or + os.path.isfile(path)): + if not os.path.isfile(path): + LOG.info("Creating a marker file to wait for nic attach: %s", + path) + util.write_file(path, "{pid}: {time}\n".format( + pid=os.getpid(), time=time())) + return True + return False + def _should_reprovision(self, ret): """Whether or not we should poll IMDS for reprovisioning data. Also sets a marker file to poll IMDS. @@ -879,6 +1238,7 @@ class DataSourceAzure(sources.DataSource): (_md, _userdata_raw, cfg, _files) = ret path = REPROVISION_MARKER_FILE if (cfg.get('PreprovisionedVm') is True or + cfg.get('PreprovisionedVMType', None) == 'Running' or os.path.isfile(path)): if not os.path.isfile(path): LOG.info("Creating a marker file to poll imds: %s", @@ -944,6 +1304,8 @@ class DataSourceAzure(sources.DataSource): util.del_file(REPORTED_READY_MARKER_FILE) util.del_file(REPROVISION_MARKER_FILE) + util.del_file(REPROVISION_NIC_ATTACH_MARKER_FILE) + util.del_file(REPROVISION_NIC_DETACHED_MARKER_FILE) return fabric_data @azure_ds_telemetry_reporter @@ -1399,34 +1761,109 @@ def read_azure_ovf(contents): if 'ssh_pwauth' not in cfg and password: cfg['ssh_pwauth'] = True - cfg['PreprovisionedVm'] = _extract_preprovisioned_vm_setting(dom) + preprovisioning_cfg = _get_preprovisioning_cfgs(dom) + cfg = util.mergemanydict([cfg, preprovisioning_cfg]) return (md, ud, cfg) @azure_ds_telemetry_reporter -def _extract_preprovisioned_vm_setting(dom): - """Read the preprovision flag from the ovf. It should not - exist unless true.""" +def _get_preprovisioning_cfgs(dom): + """Read the preprovisioning related flags from ovf and populates a dict + with the info. + + Two flags are in use today: PreprovisionedVm bool and + PreprovisionedVMType enum. In the long term, the PreprovisionedVm bool + will be deprecated in favor of PreprovisionedVMType string/enum. + + Only these combinations of values are possible today: + - PreprovisionedVm=True and PreprovisionedVMType=Running + - PreprovisionedVm=False and PreprovisionedVMType=Savable + - PreprovisionedVm is missing and PreprovisionedVMType=Running/Savable + - PreprovisionedVm=False and PreprovisionedVMType is missing + + More specifically, this will never happen: + - PreprovisionedVm=True and PreprovisionedVMType=Savable + """ + cfg = { + "PreprovisionedVm": False, + "PreprovisionedVMType": None + } + platform_settings_section = find_child( dom.documentElement, lambda n: n.localName == "PlatformSettingsSection") if not platform_settings_section or len(platform_settings_section) == 0: LOG.debug("PlatformSettingsSection not found") - return False + return cfg platform_settings = find_child( platform_settings_section[0], lambda n: n.localName == "PlatformSettings") if not platform_settings or len(platform_settings) == 0: LOG.debug("PlatformSettings not found") - return False - preprovisionedVm = find_child( + return cfg + + # Read the PreprovisionedVm bool flag. This should be deprecated when the + # platform has removed PreprovisionedVm and only surfaces + # PreprovisionedVMType. + cfg["PreprovisionedVm"] = _get_preprovisionedvm_cfg_value( + platform_settings) + + cfg["PreprovisionedVMType"] = _get_preprovisionedvmtype_cfg_value( + platform_settings) + return cfg + + +@azure_ds_telemetry_reporter +def _get_preprovisionedvm_cfg_value(platform_settings): + preprovisionedVm = False + + # Read the PreprovisionedVm bool flag. This should be deprecated when the + # platform has removed PreprovisionedVm and only surfaces + # PreprovisionedVMType. + preprovisionedVmVal = find_child( platform_settings[0], lambda n: n.localName == "PreprovisionedVm") - if not preprovisionedVm or len(preprovisionedVm) == 0: + if not preprovisionedVmVal or len(preprovisionedVmVal) == 0: LOG.debug("PreprovisionedVm not found") - return False - return util.translate_bool(preprovisionedVm[0].firstChild.nodeValue) + return preprovisionedVm + preprovisionedVm = util.translate_bool( + preprovisionedVmVal[0].firstChild.nodeValue) + + report_diagnostic_event( + "PreprovisionedVm: %s" % preprovisionedVm, logger_func=LOG.info) + + return preprovisionedVm + + +@azure_ds_telemetry_reporter +def _get_preprovisionedvmtype_cfg_value(platform_settings): + preprovisionedVMType = None + + # Read the PreprovisionedVMType value from the ovf. It can be + # 'Running' or 'Savable' or not exist. This enum value is intended to + # replace PreprovisionedVm bool flag in the long term. + # A Running VM is the same as preprovisioned VMs of today. This is + # equivalent to having PreprovisionedVm=True. + # A Savable VM is one whose nic is hot-detached immediately after it + # reports ready the first time to free up the network resources. + # Once assigned to customer, the customer-requested nics are + # hot-attached to it and reprovision happens like today. + preprovisionedVMTypeVal = find_child( + platform_settings[0], + lambda n: n.localName == "PreprovisionedVMType") + if (not preprovisionedVMTypeVal or len(preprovisionedVMTypeVal) == 0 or + preprovisionedVMTypeVal[0].firstChild is None): + LOG.debug("PreprovisionedVMType not found") + return preprovisionedVMType + + preprovisionedVMType = preprovisionedVMTypeVal[0].firstChild.nodeValue + + report_diagnostic_event( + "PreprovisionedVMType: %s" % preprovisionedVMType, + logger_func=LOG.info) + + return preprovisionedVMType def encrypt_pass(password, salt_id="$6$"): @@ -1588,8 +2025,10 @@ def _generate_network_config_from_fallback_config() -> dict: @azure_ds_telemetry_reporter -def get_metadata_from_imds(fallback_nic, retries): - """Query Azure's network metadata service, returning a dictionary. +def get_metadata_from_imds(fallback_nic, + retries, + md_type=metadata_type.compute): + """Query Azure's instance metadata service, returning a dictionary. If network is not up, setup ephemeral dhcp on fallback_nic to talk to the IMDS. For more info on IMDS: @@ -1604,7 +2043,7 @@ def get_metadata_from_imds(fallback_nic, retries): """ kwargs = {'logfunc': LOG.debug, 'msg': 'Crawl of Azure Instance Metadata Service (IMDS)', - 'func': _get_metadata_from_imds, 'args': (retries,)} + 'func': _get_metadata_from_imds, 'args': (retries, md_type,)} if net.is_up(fallback_nic): return util.log_time(**kwargs) else: @@ -1620,9 +2059,9 @@ def get_metadata_from_imds(fallback_nic, retries): @azure_ds_telemetry_reporter -def _get_metadata_from_imds(retries): +def _get_metadata_from_imds(retries, md_type=metadata_type.compute): - url = IMDS_URL + "instance?api-version=2019-06-01" + url = md_type.value headers = {"Metadata": "true"} try: response = readurl( diff --git a/cloudinit/sources/helpers/netlink.py b/cloudinit/sources/helpers/netlink.py index c2ad587b..e13d6834 100644 --- a/cloudinit/sources/helpers/netlink.py +++ b/cloudinit/sources/helpers/netlink.py @@ -185,6 +185,54 @@ def read_rta_oper_state(data): return InterfaceOperstate(ifname, operstate) +def wait_for_nic_attach_event(netlink_socket, existing_nics): + '''Block until a single nic is attached. + + :param: netlink_socket: netlink_socket to receive events + :param: existing_nics: List of existing nics so that we can skip them. + :raises: AssertionError if netlink_socket is none. + ''' + LOG.debug("Preparing to wait for nic attach.") + ifname = None + + def should_continue_cb(iname, carrier, prevCarrier): + if iname in existing_nics: + return True + nonlocal ifname + ifname = iname + return False + + # We can return even if the operational state of the new nic is DOWN + # because we set it to UP before doing dhcp. + read_netlink_messages(netlink_socket, + None, + [RTM_NEWLINK], + [OPER_UP, OPER_DOWN], + should_continue_cb) + return ifname + + +def wait_for_nic_detach_event(netlink_socket): + '''Block until a single nic is detached and its operational state is down. + + :param: netlink_socket: netlink_socket to receive events. + ''' + LOG.debug("Preparing to wait for nic detach.") + ifname = None + + def should_continue_cb(iname, carrier, prevCarrier): + nonlocal ifname + ifname = iname + return False + + read_netlink_messages(netlink_socket, + None, + [RTM_DELLINK], + [OPER_DOWN], + should_continue_cb) + return ifname + + def wait_for_media_disconnect_connect(netlink_socket, ifname): '''Block until media disconnect and connect has happened on an interface. Listens on netlink socket to receive netlink events and when the carrier @@ -198,10 +246,42 @@ def wait_for_media_disconnect_connect(netlink_socket, ifname): assert (netlink_socket is not None), ("netlink socket is none") assert (ifname is not None), ("interface name is none") assert (len(ifname) > 0), ("interface name cannot be empty") + + def should_continue_cb(iname, carrier, prevCarrier): + # check for carrier down, up sequence + isVnetSwitch = (prevCarrier == OPER_DOWN) and (carrier == OPER_UP) + if isVnetSwitch: + LOG.debug("Media switch happened on %s.", ifname) + return False + return True + + LOG.debug("Wait for media disconnect and reconnect to happen") + read_netlink_messages(netlink_socket, + ifname, + [RTM_NEWLINK, RTM_DELLINK], + [OPER_UP, OPER_DOWN], + should_continue_cb) + + +def read_netlink_messages(netlink_socket, + ifname_filter, + rtm_types, + operstates, + should_continue_callback): + ''' Reads from the netlink socket until the condition specified by + the continuation callback is met. + + :param: netlink_socket: netlink_socket to receive events. + :param: ifname_filter: if not None, will only listen for this interface. + :param: rtm_types: Type of netlink events to listen for. + :param: operstates: Operational states to listen. + :param: should_continue_callback: Specifies when to stop listening. + ''' + if netlink_socket is None: + raise RuntimeError("Netlink socket is none") + data = bytes() carrier = OPER_UP prevCarrier = OPER_UP - data = bytes() - LOG.debug("Wait for media disconnect and reconnect to happen") while True: recv_data = read_netlink_socket(netlink_socket, SELECT_TIMEOUT) if recv_data is None: @@ -223,26 +303,26 @@ def wait_for_media_disconnect_connect(netlink_socket, ifname): padlen = (nlheader.length+PAD_ALIGNMENT-1) & ~(PAD_ALIGNMENT-1) offset = offset + padlen LOG.debug('offset to next netlink message: %d', offset) - # Ignore any messages not new link or del link - if nlheader.type not in [RTM_NEWLINK, RTM_DELLINK]: + # Continue if we are not interested in this message. + if nlheader.type not in rtm_types: continue interface_state = read_rta_oper_state(nl_msg) if interface_state is None: LOG.debug('Failed to read rta attributes: %s', interface_state) continue - if interface_state.ifname != ifname: + if (ifname_filter is not None and + interface_state.ifname != ifname_filter): LOG.debug( "Ignored netlink event on interface %s. Waiting for %s.", - interface_state.ifname, ifname) + interface_state.ifname, ifname_filter) continue - if interface_state.operstate not in [OPER_UP, OPER_DOWN]: + if interface_state.operstate not in operstates: continue prevCarrier = carrier carrier = interface_state.operstate - # check for carrier down, up sequence - isVnetSwitch = (prevCarrier == OPER_DOWN) and (carrier == OPER_UP) - if isVnetSwitch: - LOG.debug("Media switch happened on %s.", ifname) + if not should_continue_callback(interface_state.ifname, + carrier, + prevCarrier): return data = data[offset:] diff --git a/cloudinit/sources/helpers/tests/test_netlink.py b/cloudinit/sources/helpers/tests/test_netlink.py index 10760bd6..cafe3961 100644 --- a/cloudinit/sources/helpers/tests/test_netlink.py +++ b/cloudinit/sources/helpers/tests/test_netlink.py @@ -9,9 +9,10 @@ import codecs from cloudinit.sources.helpers.netlink import ( NetlinkCreateSocketError, create_bound_netlink_socket, read_netlink_socket, read_rta_oper_state, unpack_rta_attr, wait_for_media_disconnect_connect, + wait_for_nic_attach_event, wait_for_nic_detach_event, OPER_DOWN, OPER_UP, OPER_DORMANT, OPER_LOWERLAYERDOWN, OPER_NOTPRESENT, - OPER_TESTING, OPER_UNKNOWN, RTATTR_START_OFFSET, RTM_NEWLINK, RTM_SETLINK, - RTM_GETLINK, MAX_SIZE) + OPER_TESTING, OPER_UNKNOWN, RTATTR_START_OFFSET, RTM_NEWLINK, RTM_DELLINK, + RTM_SETLINK, RTM_GETLINK, MAX_SIZE) def int_to_bytes(i): @@ -135,6 +136,75 @@ class TestParseNetlinkMessage(CiTestCase): @mock.patch('cloudinit.sources.helpers.netlink.socket.socket') @mock.patch('cloudinit.sources.helpers.netlink.read_netlink_socket') +class TestNicAttachDetach(CiTestCase): + with_logs = True + + def _media_switch_data(self, ifname, msg_type, operstate): + '''construct netlink data with specified fields''' + if ifname and operstate is not None: + data = bytearray(48) + bytes = ifname.encode("utf-8") + struct.pack_into("HH4sHHc", data, RTATTR_START_OFFSET, 8, 3, + bytes, 5, 16, int_to_bytes(operstate)) + elif ifname: + data = bytearray(40) + bytes = ifname.encode("utf-8") + struct.pack_into("HH4s", data, RTATTR_START_OFFSET, 8, 3, bytes) + elif operstate: + data = bytearray(40) + struct.pack_into("HHc", data, RTATTR_START_OFFSET, 5, 16, + int_to_bytes(operstate)) + struct.pack_into("=LHHLL", data, 0, len(data), msg_type, 0, 0, 0) + return data + + def test_nic_attached_oper_down(self, m_read_netlink_socket, m_socket): + '''Test for a new nic attached''' + ifname = "eth0" + data_op_down = self._media_switch_data(ifname, RTM_NEWLINK, OPER_DOWN) + m_read_netlink_socket.side_effect = [data_op_down] + ifread = wait_for_nic_attach_event(m_socket, []) + self.assertEqual(m_read_netlink_socket.call_count, 1) + self.assertEqual(ifname, ifread) + + def test_nic_attached_oper_up(self, m_read_netlink_socket, m_socket): + '''Test for a new nic attached''' + ifname = "eth0" + data_op_up = self._media_switch_data(ifname, RTM_NEWLINK, OPER_UP) + m_read_netlink_socket.side_effect = [data_op_up] + ifread = wait_for_nic_attach_event(m_socket, []) + self.assertEqual(m_read_netlink_socket.call_count, 1) + self.assertEqual(ifname, ifread) + + def test_nic_attach_ignore_existing(self, m_read_netlink_socket, m_socket): + '''Test that we read only the interfaces we are interested in.''' + data_eth0 = self._media_switch_data("eth0", RTM_NEWLINK, OPER_DOWN) + data_eth1 = self._media_switch_data("eth1", RTM_NEWLINK, OPER_DOWN) + m_read_netlink_socket.side_effect = [data_eth0, data_eth1] + ifread = wait_for_nic_attach_event(m_socket, ["eth0"]) + self.assertEqual(m_read_netlink_socket.call_count, 2) + self.assertEqual("eth1", ifread) + + def test_nic_attach_read_first(self, m_read_netlink_socket, m_socket): + '''Test that we read only the interfaces we are interested in.''' + data_eth0 = self._media_switch_data("eth0", RTM_NEWLINK, OPER_DOWN) + data_eth1 = self._media_switch_data("eth1", RTM_NEWLINK, OPER_DOWN) + m_read_netlink_socket.side_effect = [data_eth0, data_eth1] + ifread = wait_for_nic_attach_event(m_socket, ["eth1"]) + self.assertEqual(m_read_netlink_socket.call_count, 1) + self.assertEqual("eth0", ifread) + + def test_nic_detached(self, m_read_netlink_socket, m_socket): + '''Test for an existing nic detached''' + ifname = "eth0" + data_op_down = self._media_switch_data(ifname, RTM_DELLINK, OPER_DOWN) + m_read_netlink_socket.side_effect = [data_op_down] + ifread = wait_for_nic_detach_event(m_socket) + self.assertEqual(m_read_netlink_socket.call_count, 1) + self.assertEqual(ifname, ifread) + + +@mock.patch('cloudinit.sources.helpers.netlink.socket.socket') +@mock.patch('cloudinit.sources.helpers.netlink.read_netlink_socket') class TestWaitForMediaDisconnectConnect(CiTestCase): with_logs = True diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py index 534314aa..e363c1f9 100644 --- a/tests/unittests/test_datasource/test_azure.py +++ b/tests/unittests/test_datasource/test_azure.py @@ -11,6 +11,7 @@ from cloudinit.version import version_string as vs from cloudinit.tests.helpers import ( HttprettyTestCase, CiTestCase, populate_dir, mock, wrap_and_call, ExitStack, resourceLocation) +from cloudinit.sources.helpers import netlink import copy import crypt @@ -78,6 +79,8 @@ def construct_valid_ovf_env(data=None, pubkeys=None, if platform_settings: for k, v in platform_settings.items(): content += "<%s>%s</%s>\n" % (k, v, k) + if "PreprovisionedVMType" not in platform_settings: + content += """<PreprovisionedVMType i:nil="true" />""" content += """</PlatformSettings></wa:PlatformSettingsSection> </Environment>""" @@ -156,6 +159,31 @@ SECONDARY_INTERFACE = { } } +IMDS_NETWORK_METADATA = { + "interface": [ + { + "macAddress": "000D3A047598", + "ipv6": { + "ipAddress": [] + }, + "ipv4": { + "subnet": [ + { + "prefix": "24", + "address": "10.0.0.0" + } + ], + "ipAddress": [ + { + "privateIpAddress": "10.0.0.4", + "publicIpAddress": "104.46.124.81" + } + ] + } + } + ] +} + MOCKPATH = 'cloudinit.sources.DataSourceAzure.' @@ -366,8 +394,8 @@ class TestGetMetadataFromIMDS(HttprettyTestCase): self.network_md_url = dsaz.IMDS_URL + "instance?api-version=2019-06-01" @mock.patch(MOCKPATH + 'readurl') - @mock.patch(MOCKPATH + 'EphemeralDHCPv4') - @mock.patch(MOCKPATH + 'net.is_up') + @mock.patch(MOCKPATH + 'EphemeralDHCPv4', autospec=True) + @mock.patch(MOCKPATH + 'net.is_up', autospec=True) def test_get_metadata_does_not_dhcp_if_network_is_up( self, m_net_is_up, m_dhcp, m_readurl): """Do not perform DHCP setup when nic is already up.""" @@ -384,9 +412,66 @@ class TestGetMetadataFromIMDS(HttprettyTestCase): "Crawl of Azure Instance Metadata Service (IMDS) took", # log_time self.logs.getvalue()) - @mock.patch(MOCKPATH + 'readurl') - @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting') + @mock.patch(MOCKPATH + 'readurl', autospec=True) + @mock.patch(MOCKPATH + 'EphemeralDHCPv4') + @mock.patch(MOCKPATH + 'net.is_up') + def test_get_compute_metadata_uses_compute_url( + self, m_net_is_up, m_dhcp, m_readurl): + """Make sure readurl is called with the correct url when accessing + network metadata""" + m_net_is_up.return_value = True + m_readurl.return_value = url_helper.StringResponse( + json.dumps(IMDS_NETWORK_METADATA).encode('utf-8')) + + dsaz.get_metadata_from_imds( + 'eth0', retries=3, md_type=dsaz.metadata_type.compute) + m_readurl.assert_called_with( + "http://169.254.169.254/metadata/instance?api-version=" + "2019-06-01", exception_cb=mock.ANY, + headers=mock.ANY, retries=mock.ANY, + timeout=mock.ANY) + + @mock.patch(MOCKPATH + 'readurl', autospec=True) + @mock.patch(MOCKPATH + 'EphemeralDHCPv4') + @mock.patch(MOCKPATH + 'net.is_up') + def test_get_network_metadata_uses_network_url( + self, m_net_is_up, m_dhcp, m_readurl): + """Make sure readurl is called with the correct url when accessing + network metadata""" + m_net_is_up.return_value = True + m_readurl.return_value = url_helper.StringResponse( + json.dumps(IMDS_NETWORK_METADATA).encode('utf-8')) + + dsaz.get_metadata_from_imds( + 'eth0', retries=3, md_type=dsaz.metadata_type.network) + m_readurl.assert_called_with( + "http://169.254.169.254/metadata/instance/network?api-version=" + "2019-06-01", exception_cb=mock.ANY, + headers=mock.ANY, retries=mock.ANY, + timeout=mock.ANY) + + @mock.patch(MOCKPATH + 'readurl', autospec=True) + @mock.patch(MOCKPATH + 'EphemeralDHCPv4') @mock.patch(MOCKPATH + 'net.is_up') + def test_get_default_metadata_uses_compute_url( + self, m_net_is_up, m_dhcp, m_readurl): + """Make sure readurl is called with the correct url when accessing + network metadata""" + m_net_is_up.return_value = True + m_readurl.return_value = url_helper.StringResponse( + json.dumps(IMDS_NETWORK_METADATA).encode('utf-8')) + + dsaz.get_metadata_from_imds( + 'eth0', retries=3) + m_readurl.assert_called_with( + "http://169.254.169.254/metadata/instance?api-version=" + "2019-06-01", exception_cb=mock.ANY, + headers=mock.ANY, retries=mock.ANY, + timeout=mock.ANY) + + @mock.patch(MOCKPATH + 'readurl', autospec=True) + @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting', autospec=True) + @mock.patch(MOCKPATH + 'net.is_up', autospec=True) def test_get_metadata_performs_dhcp_when_network_is_down( self, m_net_is_up, m_dhcp, m_readurl): """Perform DHCP setup when nic is not up.""" @@ -410,7 +495,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase): timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS) @mock.patch('cloudinit.url_helper.time.sleep') - @mock.patch(MOCKPATH + 'net.is_up') + @mock.patch(MOCKPATH + 'net.is_up', autospec=True) def test_get_metadata_from_imds_empty_when_no_imds_present( self, m_net_is_up, m_sleep): """Return empty dict when IMDS network metadata is absent.""" @@ -431,7 +516,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase): @mock.patch('requests.Session.request') @mock.patch('cloudinit.url_helper.time.sleep') - @mock.patch(MOCKPATH + 'net.is_up') + @mock.patch(MOCKPATH + 'net.is_up', autospec=True) def test_get_metadata_from_imds_retries_on_timeout( self, m_net_is_up, m_sleep, m_request): """Retry IMDS network metadata on timeout errors.""" @@ -801,6 +886,7 @@ scbus-1 on xpt0 bus 0 'sys_cfg': {}} dsrc = self._get_ds(data) expected_cfg = { + 'PreprovisionedVMType': None, 'PreprovisionedVm': False, 'datasource': {'Azure': {'agent_command': 'my_command'}}, 'system_info': {'default_user': {'name': u'myuser'}}} @@ -864,6 +950,66 @@ scbus-1 on xpt0 bus 0 dsrc.crawl_metadata() self.assertEqual(1, m_report_ready.call_count) + @mock.patch( + 'cloudinit.sources.DataSourceAzure.EphemeralDHCPv4WithReporting') + @mock.patch('cloudinit.sources.DataSourceAzure.util.write_file') + @mock.patch( + 'cloudinit.sources.DataSourceAzure.DataSourceAzure._report_ready') + @mock.patch('cloudinit.sources.DataSourceAzure.DataSourceAzure._poll_imds') + @mock.patch( + 'cloudinit.sources.DataSourceAzure.DataSourceAzure.' + '_wait_for_all_nics_ready') + def test_crawl_metadata_waits_for_nic_on_savable_vms( + self, detect_nics, poll_imds_func, report_ready_func, m_write, m_dhcp + ): + """If reprovisioning, report ready at the end""" + ovfenv = construct_valid_ovf_env( + platform_settings={"PreprovisionedVMType": "Savable", + "PreprovisionedVm": "True"} + ) + + data = { + 'ovfcontent': ovfenv, + 'sys_cfg': {} + } + dsrc = self._get_ds(data) + poll_imds_func.return_value = ovfenv + dsrc.crawl_metadata() + self.assertEqual(1, report_ready_func.call_count) + self.assertEqual(1, detect_nics.call_count) + + @mock.patch( + 'cloudinit.sources.DataSourceAzure.EphemeralDHCPv4WithReporting') + @mock.patch('cloudinit.sources.DataSourceAzure.util.write_file') + @mock.patch( + 'cloudinit.sources.DataSourceAzure.DataSourceAzure._report_ready') + @mock.patch('cloudinit.sources.DataSourceAzure.DataSourceAzure._poll_imds') + @mock.patch( + 'cloudinit.sources.DataSourceAzure.DataSourceAzure.' + '_wait_for_all_nics_ready') + @mock.patch('os.path.isfile') + def test_detect_nics_when_marker_present( + self, is_file, detect_nics, poll_imds_func, report_ready_func, m_write, + m_dhcp): + """If reprovisioning, wait for nic attach if marker present""" + + def is_file_ret(key): + return key == dsaz.REPROVISION_NIC_ATTACH_MARKER_FILE + + is_file.side_effect = is_file_ret + ovfenv = construct_valid_ovf_env() + + data = { + 'ovfcontent': ovfenv, + 'sys_cfg': {} + } + + dsrc = self._get_ds(data) + poll_imds_func.return_value = ovfenv + dsrc.crawl_metadata() + self.assertEqual(1, report_ready_func.call_count) + self.assertEqual(1, detect_nics.call_count) + @mock.patch('cloudinit.sources.DataSourceAzure.util.write_file') @mock.patch('cloudinit.sources.helpers.netlink.' 'wait_for_media_disconnect_connect') @@ -1526,7 +1672,7 @@ scbus-1 on xpt0 bus 0 @mock.patch('cloudinit.net.get_interface_mac') @mock.patch('cloudinit.net.get_devicelist') @mock.patch('cloudinit.net.device_driver') - @mock.patch('cloudinit.net.generate_fallback_config') + @mock.patch('cloudinit.net.generate_fallback_config', autospec=True) def test_fallback_network_config(self, mock_fallback, mock_dd, mock_devlist, mock_get_mac): """On absent IMDS network data, generate network fallback config.""" @@ -1561,7 +1707,7 @@ scbus-1 on xpt0 bus 0 blacklist_drivers=['mlx4_core', 'mlx5_core'], config_driver=True) - @mock.patch(MOCKPATH + 'net.get_interfaces') + @mock.patch(MOCKPATH + 'net.get_interfaces', autospec=True) @mock.patch(MOCKPATH + 'util.is_FreeBSD') def test_blacklist_through_distro( self, m_is_freebsd, m_net_get_interfaces): @@ -1583,17 +1729,17 @@ scbus-1 on xpt0 bus 0 m_net_get_interfaces.assert_called_with( blacklist_drivers=dsaz.BLACKLIST_DRIVERS) - @mock.patch(MOCKPATH + 'subp.subp') + @mock.patch(MOCKPATH + 'subp.subp', autospec=True) def test_get_hostname_with_no_args(self, m_subp): dsaz.get_hostname() m_subp.assert_called_once_with(("hostname",), capture=True) - @mock.patch(MOCKPATH + 'subp.subp') + @mock.patch(MOCKPATH + 'subp.subp', autospec=True) def test_get_hostname_with_string_arg(self, m_subp): dsaz.get_hostname(hostname_command="hostname") m_subp.assert_called_once_with(("hostname",), capture=True) - @mock.patch(MOCKPATH + 'subp.subp') + @mock.patch(MOCKPATH + 'subp.subp', autospec=True) def test_get_hostname_with_iterable_arg(self, m_subp): dsaz.get_hostname(hostname_command=("hostname",)) m_subp.assert_called_once_with(("hostname",), capture=True) @@ -2224,6 +2370,29 @@ class TestPreprovisioningReadAzureOvfFlag(CiTestCase): ret = dsaz.read_azure_ovf(content) cfg = ret[2] self.assertFalse(cfg['PreprovisionedVm']) + self.assertEqual(None, cfg["PreprovisionedVMType"]) + + def test_read_azure_ovf_with_running_type(self): + """The read_azure_ovf method should set PreprovisionedVMType + cfg flag to Running.""" + content = construct_valid_ovf_env( + platform_settings={"PreprovisionedVMType": "Running", + "PreprovisionedVm": "True"}) + ret = dsaz.read_azure_ovf(content) + cfg = ret[2] + self.assertTrue(cfg['PreprovisionedVm']) + self.assertEqual("Running", cfg['PreprovisionedVMType']) + + def test_read_azure_ovf_with_savable_type(self): + """The read_azure_ovf method should set PreprovisionedVMType + cfg flag to Savable.""" + content = construct_valid_ovf_env( + platform_settings={"PreprovisionedVMType": "Savable", + "PreprovisionedVm": "True"}) + ret = dsaz.read_azure_ovf(content) + cfg = ret[2] + self.assertTrue(cfg['PreprovisionedVm']) + self.assertEqual("Savable", cfg['PreprovisionedVMType']) @mock.patch('os.path.isfile') @@ -2273,6 +2442,227 @@ class TestPreprovisioningShouldReprovision(CiTestCase): _poll_imds.assert_called_with() +class TestPreprovisioningHotAttachNics(CiTestCase): + + def setUp(self): + super(TestPreprovisioningHotAttachNics, self).setUp() + self.tmp = self.tmp_dir() + self.waagent_d = self.tmp_path('/var/lib/waagent', self.tmp) + self.paths = helpers.Paths({'cloud_dir': self.tmp}) + dsaz.BUILTIN_DS_CONFIG['data_dir'] = self.waagent_d + self.paths = helpers.Paths({'cloud_dir': self.tmp}) + + @mock.patch('cloudinit.sources.helpers.netlink.wait_for_nic_detach_event', + autospec=True) + @mock.patch(MOCKPATH + 'util.write_file', autospec=True) + def test_nic_detach_writes_marker(self, m_writefile, m_detach): + """When we detect that a nic gets detached, we write a marker for it""" + dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) + nl_sock = mock.MagicMock() + dsa._wait_for_nic_detach(nl_sock) + m_detach.assert_called_with(nl_sock) + self.assertEqual(1, m_detach.call_count) + m_writefile.assert_called_with( + dsaz.REPROVISION_NIC_DETACHED_MARKER_FILE, mock.ANY) + + @mock.patch(MOCKPATH + 'util.write_file', autospec=True) + @mock.patch(MOCKPATH + 'DataSourceAzure.fallback_interface') + @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting') + @mock.patch(MOCKPATH + 'DataSourceAzure._report_ready') + @mock.patch(MOCKPATH + 'DataSourceAzure._wait_for_nic_detach') + def test_detect_nic_attach_reports_ready_and_waits_for_detach( + self, m_detach, m_report_ready, m_dhcp, m_fallback_if, + m_writefile): + """Report ready first and then wait for nic detach""" + dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) + dsa._wait_for_all_nics_ready() + m_fallback_if.return_value = "Dummy interface" + self.assertEqual(1, m_report_ready.call_count) + self.assertEqual(1, m_detach.call_count) + self.assertEqual(1, m_writefile.call_count) + self.assertEqual(1, m_dhcp.call_count) + m_writefile.assert_called_with(dsaz.REPORTED_READY_MARKER_FILE, + mock.ANY) + + @mock.patch('os.path.isfile') + @mock.patch(MOCKPATH + 'DataSourceAzure.fallback_interface') + @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting') + @mock.patch(MOCKPATH + 'DataSourceAzure._report_ready') + @mock.patch(MOCKPATH + 'DataSourceAzure._wait_for_nic_detach') + def test_detect_nic_attach_skips_report_ready_when_marker_present( + self, m_detach, m_report_ready, m_dhcp, m_fallback_if, m_isfile): + """Skip reporting ready if we already have a marker file.""" + dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) + + def isfile(key): + return key == dsaz.REPORTED_READY_MARKER_FILE + + m_isfile.side_effect = isfile + dsa._wait_for_all_nics_ready() + m_fallback_if.return_value = "Dummy interface" + self.assertEqual(0, m_report_ready.call_count) + self.assertEqual(0, m_dhcp.call_count) + self.assertEqual(1, m_detach.call_count) + + @mock.patch('os.path.isfile') + @mock.patch(MOCKPATH + 'DataSourceAzure.fallback_interface') + @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting') + @mock.patch(MOCKPATH + 'DataSourceAzure._report_ready') + @mock.patch(MOCKPATH + 'DataSourceAzure._wait_for_nic_detach') + def test_detect_nic_attach_skips_nic_detach_when_marker_present( + self, m_detach, m_report_ready, m_dhcp, m_fallback_if, m_isfile): + """Skip wait for nic detach if it already happened.""" + dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) + + m_isfile.return_value = True + dsa._wait_for_all_nics_ready() + m_fallback_if.return_value = "Dummy interface" + self.assertEqual(0, m_report_ready.call_count) + self.assertEqual(0, m_dhcp.call_count) + self.assertEqual(0, m_detach.call_count) + + @mock.patch(MOCKPATH + 'DataSourceAzure.wait_for_link_up', autospec=True) + @mock.patch('cloudinit.sources.helpers.netlink.wait_for_nic_attach_event') + @mock.patch('cloudinit.sources.net.find_fallback_nic') + @mock.patch(MOCKPATH + 'get_metadata_from_imds') + @mock.patch(MOCKPATH + 'EphemeralDHCPv4') + @mock.patch(MOCKPATH + 'DataSourceAzure._wait_for_nic_detach') + @mock.patch('os.path.isfile') + def test_wait_for_nic_attach_if_no_fallback_interface( + self, m_isfile, m_detach, m_dhcpv4, m_imds, m_fallback_if, + m_attach, m_link_up): + """Wait for nic attach if we do not have a fallback interface""" + dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) + lease = { + 'interface': 'eth9', 'fixed-address': '192.168.2.9', + 'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0', + 'unknown-245': '624c3620'} + + m_isfile.return_value = True + m_attach.return_value = "eth0" + dhcp_ctx = mock.MagicMock(lease=lease) + dhcp_ctx.obtain_lease.return_value = lease + m_dhcpv4.return_value = dhcp_ctx + m_imds.return_value = IMDS_NETWORK_METADATA + m_fallback_if.return_value = None + + dsa._wait_for_all_nics_ready() + + self.assertEqual(0, m_detach.call_count) + self.assertEqual(1, m_attach.call_count) + self.assertEqual(1, m_dhcpv4.call_count) + self.assertEqual(1, m_imds.call_count) + self.assertEqual(1, m_link_up.call_count) + m_link_up.assert_called_with(mock.ANY, "eth0") + + @mock.patch(MOCKPATH + 'DataSourceAzure.wait_for_link_up') + @mock.patch('cloudinit.sources.helpers.netlink.wait_for_nic_attach_event') + @mock.patch('cloudinit.sources.net.find_fallback_nic') + @mock.patch(MOCKPATH + 'get_metadata_from_imds') + @mock.patch(MOCKPATH + 'EphemeralDHCPv4') + @mock.patch(MOCKPATH + 'DataSourceAzure._wait_for_nic_detach') + @mock.patch('os.path.isfile') + def test_wait_for_nic_attach_multinic_attach( + self, m_isfile, m_detach, m_dhcpv4, m_imds, m_fallback_if, + m_attach, m_link_up): + """Wait for nic attach if we do not have a fallback interface""" + dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) + lease = { + 'interface': 'eth9', 'fixed-address': '192.168.2.9', + 'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0', + 'unknown-245': '624c3620'} + m_attach_call_count = 0 + + def nic_attach_ret(nl_sock, nics_found): + nonlocal m_attach_call_count + if m_attach_call_count == 0: + m_attach_call_count = m_attach_call_count + 1 + return "eth0" + return "eth1" + + def network_metadata_ret(ifname, retries, type): + # Simulate two NICs by adding the same one twice. + md = IMDS_NETWORK_METADATA + md['interface'].append(md['interface'][0]) + if ifname == "eth0": + return md + raise requests.Timeout('Fake connection timeout') + + m_isfile.return_value = True + m_attach.side_effect = nic_attach_ret + dhcp_ctx = mock.MagicMock(lease=lease) + dhcp_ctx.obtain_lease.return_value = lease + m_dhcpv4.return_value = dhcp_ctx + m_imds.side_effect = network_metadata_ret + m_fallback_if.return_value = None + + dsa._wait_for_all_nics_ready() + + self.assertEqual(0, m_detach.call_count) + self.assertEqual(2, m_attach.call_count) + # DHCP and network metadata calls will only happen on the primary NIC. + self.assertEqual(1, m_dhcpv4.call_count) + self.assertEqual(1, m_imds.call_count) + self.assertEqual(2, m_link_up.call_count) + + @mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up') + def test_wait_for_link_up_returns_if_already_up( + self, m_is_link_up): + """Waiting for link to be up should return immediately if the link is + already up.""" + + distro_cls = distros.fetch('ubuntu') + distro = distro_cls('ubuntu', {}, self.paths) + dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths) + m_is_link_up.return_value = True + + dsa.wait_for_link_up("eth0") + self.assertEqual(1, m_is_link_up.call_count) + + @mock.patch(MOCKPATH + 'util.write_file') + @mock.patch('cloudinit.net.read_sys_net') + @mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up') + def test_wait_for_link_up_writes_to_device_file( + self, m_is_link_up, m_read_sys_net, m_writefile): + """Waiting for link to be up should return immediately if the link is + already up.""" + + distro_cls = distros.fetch('ubuntu') + distro = distro_cls('ubuntu', {}, self.paths) + dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths) + + callcount = 0 + + def linkup(key): + nonlocal callcount + if callcount == 0: + callcount += 1 + return False + return True + + m_is_link_up.side_effect = linkup + + dsa.wait_for_link_up("eth0") + self.assertEqual(2, m_is_link_up.call_count) + self.assertEqual(1, m_read_sys_net.call_count) + self.assertEqual(2, m_writefile.call_count) + + @mock.patch('cloudinit.sources.helpers.netlink.' + 'create_bound_netlink_socket') + def test_wait_for_all_nics_ready_raises_if_socket_fails(self, m_socket): + """Waiting for all nics should raise exception if netlink socket + creation fails.""" + + m_socket.side_effect = netlink.NetlinkCreateSocketError + distro_cls = distros.fetch('ubuntu') + distro = distro_cls('ubuntu', {}, self.paths) + dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths) + + self.assertRaises(netlink.NetlinkCreateSocketError, + dsa._wait_for_all_nics_ready) + # dsa._wait_for_all_nics_ready() + + @mock.patch('cloudinit.net.dhcp.EphemeralIPv4Network') @mock.patch('cloudinit.net.dhcp.maybe_perform_dhcp_discovery') @mock.patch('cloudinit.sources.helpers.netlink.' @@ -2330,6 +2720,24 @@ class TestPreprovisioningPollIMDS(CiTestCase): self.assertEqual(3, m_dhcpv4.call_count, 'Expected 3 DHCP calls') self.assertEqual(4, self.tries, 'Expected 4 total reads from IMDS') + @mock.patch('os.path.isfile') + def test_poll_imds_skips_dhcp_if_ctx_present( + self, m_isfile, report_ready_func, fake_resp, m_media_switch, + m_dhcp, m_net): + """The poll_imds function should reuse the dhcp ctx if it is already + present. This happens when we wait for nic to be hot-attached before + polling for reprovisiondata. Note that if this ctx is set when + _poll_imds is called, then it is not expected to be waiting for + media_disconnect_connect either.""" + report_file = self.tmp_path('report_marker', self.tmp) + m_isfile.return_value = True + dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) + dsa._ephemeral_dhcp_ctx = "Dummy dhcp ctx" + with mock.patch(MOCKPATH + 'REPORTED_READY_MARKER_FILE', report_file): + dsa._poll_imds() + self.assertEqual(0, m_dhcp.call_count) + self.assertEqual(0, m_media_switch.call_count) + def test_does_not_poll_imds_report_ready_when_marker_file_exists( self, m_report_ready, m_request, m_media_switch, m_dhcp, m_net): """poll_imds should not call report ready when the reported ready @@ -2390,7 +2798,7 @@ class TestPreprovisioningPollIMDS(CiTestCase): @mock.patch(MOCKPATH + 'util.is_FreeBSD') @mock.patch('cloudinit.sources.helpers.netlink.' 'wait_for_media_disconnect_connect') -@mock.patch('cloudinit.net.dhcp.EphemeralIPv4Network') +@mock.patch('cloudinit.net.dhcp.EphemeralIPv4Network', autospec=True) @mock.patch('cloudinit.net.dhcp.maybe_perform_dhcp_discovery') @mock.patch('requests.Session.request') class TestAzureDataSourcePreprovisioning(CiTestCase): @@ -2412,7 +2820,7 @@ class TestAzureDataSourcePreprovisioning(CiTestCase): m_dhcp.return_value = [{ 'interface': 'eth9', 'fixed-address': '192.168.2.9', 'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0'}] - url = 'http://{0}/metadata/reprovisiondata?api-version=2017-04-02' + url = 'http://{0}/metadata/reprovisiondata?api-version=2019-06-01' host = "169.254.169.254" full_url = url.format(host) m_request.return_value = mock.MagicMock(status_code=200, text="ovf", @@ -2445,7 +2853,7 @@ class TestAzureDataSourcePreprovisioning(CiTestCase): 'interface': 'eth9', 'fixed-address': '192.168.2.9', 'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0', 'unknown-245': '624c3620'}] - url = 'http://{0}/metadata/reprovisiondata?api-version=2017-04-02' + url = 'http://{0}/metadata/reprovisiondata?api-version=2019-06-01' host = "169.254.169.254" full_url = url.format(host) hostname = "myhost" diff --git a/tools/.github-cla-signers b/tools/.github-cla-signers index 9b594a44..1e0c3ea4 100644 --- a/tools/.github-cla-signers +++ b/tools/.github-cla-signers @@ -1,6 +1,7 @@ ader1990 AlexBaranowski Aman306 +aswinrajamannar beezly bipinbachhao BirknerAlex |