diff options
| -rw-r--r-- | cloudinit/distros/networking.py | 15 | ||||
| -rw-r--r-- | cloudinit/distros/tests/test_networking.py | 31 | ||||
| -rwxr-xr-x | cloudinit/sources/DataSourceAzure.py | 515 | ||||
| -rw-r--r-- | cloudinit/sources/helpers/netlink.py | 102 | ||||
| -rw-r--r-- | cloudinit/sources/helpers/tests/test_netlink.py | 74 | ||||
| -rw-r--r-- | tests/unittests/test_datasource/test_azure.py | 436 | ||||
| -rw-r--r-- | tools/.github-cla-signers | 1 | 
7 files changed, 1109 insertions, 65 deletions
| diff --git a/cloudinit/distros/networking.py b/cloudinit/distros/networking.py index e407fa29..c291196a 100644 --- a/cloudinit/distros/networking.py +++ b/cloudinit/distros/networking.py @@ -2,6 +2,7 @@ import abc  import logging  import os +from cloudinit import subp  from cloudinit import net, util @@ -175,6 +176,10 @@ class Networking(metaclass=abc.ABCMeta):          if strict:              raise RuntimeError(msg) +    @abc.abstractmethod +    def try_set_link_up(self, devname: DeviceName) -> bool: +        """Try setting the link to up explicitly and return if it is up.""" +  class BSDNetworking(Networking):      """Implementation of networking functionality shared across BSDs.""" @@ -185,6 +190,9 @@ class BSDNetworking(Networking):      def settle(self, *, exists=None) -> None:          """BSD has no equivalent to `udevadm settle`; noop.""" +    def try_set_link_up(self, devname: DeviceName) -> bool: +        raise NotImplementedError() +  class LinuxNetworking(Networking):      """Implementation of networking functionality common to Linux distros.""" @@ -214,3 +222,10 @@ class LinuxNetworking(Networking):          if exists is not None:              exists = net.sys_dev_path(exists)          util.udevadm_settle(exists=exists) + +    def try_set_link_up(self, devname: DeviceName) -> bool: +        """Try setting the link to up explicitly and return if it is up. +           Not guaranteed to bring the interface up. The caller is expected to +           add wait times before retrying.""" +        subp.subp(['ip', 'link', 'set', devname, 'up']) +        return self.is_up(devname) diff --git a/cloudinit/distros/tests/test_networking.py b/cloudinit/distros/tests/test_networking.py index b9a63842..ec508f4d 100644 --- a/cloudinit/distros/tests/test_networking.py +++ b/cloudinit/distros/tests/test_networking.py @@ -30,6 +30,9 @@ def generic_networking_cls():          def settle(self, *args, **kwargs):              raise NotImplementedError +        def try_set_link_up(self, *args, **kwargs): +            raise NotImplementedError +      error = AssertionError("Unexpectedly used /sys in generic networking code")      with mock.patch(          "cloudinit.net.get_sys_class_path", side_effect=error, @@ -74,6 +77,34 @@ class TestLinuxNetworkingIsPhysical:          assert LinuxNetworking().is_physical(devname) +class TestBSDNetworkingTrySetLinkUp: +    def test_raises_notimplementederror(self): +        with pytest.raises(NotImplementedError): +            BSDNetworking().try_set_link_up("eth0") + + +@mock.patch("cloudinit.net.is_up") +@mock.patch("cloudinit.distros.networking.subp.subp") +class TestLinuxNetworkingTrySetLinkUp: +    def test_calls_subp_return_true(self, m_subp, m_is_up): +        devname = "eth0" +        m_is_up.return_value = True +        is_success = LinuxNetworking().try_set_link_up(devname) + +        assert (mock.call(['ip', 'link', 'set', devname, 'up']) == +                m_subp.call_args_list[-1]) +        assert is_success + +    def test_calls_subp_return_false(self, m_subp, m_is_up): +        devname = "eth0" +        m_is_up.return_value = False +        is_success = LinuxNetworking().try_set_link_up(devname) + +        assert (mock.call(['ip', 'link', 'set', devname, 'up']) == +                m_subp.call_args_list[-1]) +        assert not is_success + +  class TestBSDNetworkingSettle:      def test_settle_doesnt_error(self):          # This also implicitly tests that it doesn't use subp.subp diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index f777a007..04ff2131 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -12,8 +12,10 @@ import os  import os.path  import re  from time import time +from time import sleep  from xml.dom import minidom  import xml.etree.ElementTree as ET +from enum import Enum  from cloudinit import dmi  from cloudinit import log as logging @@ -67,13 +69,27 @@ DEFAULT_FS = 'ext4'  # DMI chassis-asset-tag is set static for all azure instances  AZURE_CHASSIS_ASSET_TAG = '7783-7084-3265-9085-8269-3286-77'  REPROVISION_MARKER_FILE = "/var/lib/cloud/data/poll_imds" +REPROVISION_NIC_ATTACH_MARKER_FILE = "/var/lib/cloud/data/wait_for_nic_attach" +REPROVISION_NIC_DETACHED_MARKER_FILE = "/var/lib/cloud/data/nic_detached"  REPORTED_READY_MARKER_FILE = "/var/lib/cloud/data/reported_ready"  AGENT_SEED_DIR = '/var/lib/waagent' +  # In the event where the IMDS primary server is not  # available, it takes 1s to fallback to the secondary one  IMDS_TIMEOUT_IN_SECONDS = 2  IMDS_URL = "http://169.254.169.254/metadata/" +IMDS_VER = "2019-06-01" +IMDS_VER_PARAM = "api-version={}".format(IMDS_VER) + + +class metadata_type(Enum): +    compute = "{}instance?{}".format(IMDS_URL, IMDS_VER_PARAM) +    network = "{}instance/network?{}".format(IMDS_URL, +                                             IMDS_VER_PARAM) +    reprovisiondata = "{}reprovisiondata?{}".format(IMDS_URL, +                                                    IMDS_VER_PARAM) +  PLATFORM_ENTROPY_SOURCE = "/sys/firmware/acpi/tables/OEM0" @@ -434,20 +450,39 @@ class DataSourceAzure(sources.DataSource):          # need to look in the datadir and consider that valid          ddir = self.ds_cfg['data_dir'] +        # The order in which the candidates are inserted matters here, because +        # it determines the value of ret. More specifically, the first one in +        # the candidate list determines the path to take in order to get the +        # metadata we need.          candidates = [self.seed_dir]          if os.path.isfile(REPROVISION_MARKER_FILE):              candidates.insert(0, "IMDS") +            report_diagnostic_event("Reprovision marker file already present " +                                    "before crawling Azure metadata: %s" % +                                    REPROVISION_MARKER_FILE, +                                    logger_func=LOG.debug) +        elif os.path.isfile(REPROVISION_NIC_ATTACH_MARKER_FILE): +            candidates.insert(0, "NIC_ATTACH_MARKER_PRESENT") +            report_diagnostic_event("Reprovision nic attach marker file " +                                    "already present before crawling Azure " +                                    "metadata: %s" % +                                    REPROVISION_NIC_ATTACH_MARKER_FILE, +                                    logger_func=LOG.debug)          candidates.extend(list_possible_azure_ds_devs())          if ddir:              candidates.append(ddir)          found = None          reprovision = False +        reprovision_after_nic_attach = False          for cdev in candidates:              try:                  if cdev == "IMDS":                      ret = None                      reprovision = True +                elif cdev == "NIC_ATTACH_MARKER_PRESENT": +                    ret = None +                    reprovision_after_nic_attach = True                  elif cdev.startswith("/dev/"):                      if util.is_FreeBSD():                          ret = util.mount_cb(cdev, load_azure_ds_dir, @@ -472,12 +507,19 @@ class DataSourceAzure(sources.DataSource):                  continue              perform_reprovision = reprovision or self._should_reprovision(ret) -            if perform_reprovision: +            perform_reprovision_after_nic_attach = ( +                reprovision_after_nic_attach or +                self._should_reprovision_after_nic_attach(ret)) + +            if perform_reprovision or perform_reprovision_after_nic_attach:                  if util.is_FreeBSD():                      msg = "Free BSD is not supported for PPS VMs"                      report_diagnostic_event(msg, logger_func=LOG.error)                      raise sources.InvalidMetaDataException(msg) +                if perform_reprovision_after_nic_attach: +                    self._wait_for_all_nics_ready()                  ret = self._reprovision() +              imds_md = get_metadata_from_imds(                  self.fallback_interface, retries=10)              (md, userdata_raw, cfg, files) = ret @@ -508,7 +550,7 @@ class DataSourceAzure(sources.DataSource):              crawled_data['metadata']['random_seed'] = seed          crawled_data['metadata']['instance-id'] = self._iid() -        if perform_reprovision: +        if perform_reprovision or perform_reprovision_after_nic_attach:              LOG.info("Reporting ready to Azure after getting ReprovisionData")              use_cached_ephemeral = (                  self.distro.networking.is_up(self.fallback_interface) and @@ -617,14 +659,12 @@ class DataSourceAzure(sources.DataSource):              LOG.debug('Retrieved SSH keys from IMDS')          except KeyError:              log_msg = 'Unable to get keys from IMDS, falling back to OVF' -            LOG.debug(log_msg)              report_diagnostic_event(log_msg, logger_func=LOG.debug)              try:                  ssh_keys = self.metadata['public-keys']                  LOG.debug('Retrieved keys from OVF')              except KeyError:                  log_msg = 'No keys available from OVF' -                LOG.debug(log_msg)                  report_diagnostic_event(log_msg, logger_func=LOG.debug)          return ssh_keys @@ -660,10 +700,293 @@ class DataSourceAzure(sources.DataSource):              LOG.debug("negotiating already done for %s",                        self.get_instance_id()) +    @azure_ds_telemetry_reporter +    def _wait_for_nic_detach(self, nl_sock): +        """Use the netlink socket provided to wait for nic detach event. +           NOTE: The function doesn't close the socket. The caller owns closing +           the socket and disposing it safely. +        """ +        try: +            ifname = None + +            # Preprovisioned VM will only have one NIC, and it gets +            # detached immediately after deployment. +            with events.ReportEventStack( +                    name="wait-for-nic-detach", +                    description=("wait for nic detach"), +                    parent=azure_ds_reporter): +                ifname = netlink.wait_for_nic_detach_event(nl_sock) +            if ifname is None: +                msg = ("Preprovisioned nic not detached as expected. " +                       "Proceeding without failing.") +                report_diagnostic_event(msg, logger_func=LOG.warning) +            else: +                report_diagnostic_event("The preprovisioned nic %s is detached" +                                        % ifname, logger_func=LOG.warning) +            path = REPROVISION_NIC_DETACHED_MARKER_FILE +            LOG.info("Creating a marker file for nic detached: %s", path) +            util.write_file(path, "{pid}: {time}\n".format( +                pid=os.getpid(), time=time())) +        except AssertionError as error: +            report_diagnostic_event(error, logger_func=LOG.error) +            raise + +    @azure_ds_telemetry_reporter +    def wait_for_link_up(self, ifname): +        """In cases where the link state is still showing down after a nic is +           hot-attached, we can attempt to bring it up by forcing the hv_netvsc +           drivers to query the link state by unbinding and then binding the +           device. This function attempts infinitely until the link is up, +           because we cannot proceed further until we have a stable link.""" + +        if self.distro.networking.try_set_link_up(ifname): +            report_diagnostic_event("The link %s is already up." % ifname, +                                    logger_func=LOG.info) +            return + +        LOG.info("Attempting to bring %s up", ifname) + +        attempts = 0 +        while True: + +            LOG.info("Unbinding and binding the interface %s", ifname) +            devicename = net.read_sys_net(ifname, +                                          'device/device_id').strip('{}') +            util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/unbind', +                            devicename) +            util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/bind', +                            devicename) + +            attempts = attempts + 1 +            if self.distro.networking.try_set_link_up(ifname): +                msg = "The link %s is up after %s attempts" % (ifname, +                                                               attempts) +                report_diagnostic_event(msg, logger_func=LOG.info) +                return + +            sleep_duration = 1 +            msg = ("Link is not up after %d attempts with %d seconds sleep " +                   "between attempts." % (attempts, sleep_duration)) + +            if attempts % 10 == 0: +                report_diagnostic_event(msg, logger_func=LOG.info) +            else: +                LOG.info(msg) + +            sleep(sleep_duration) + +    @azure_ds_telemetry_reporter +    def _create_report_ready_marker(self): +        path = REPORTED_READY_MARKER_FILE +        LOG.info( +            "Creating a marker file to report ready: %s", path) +        util.write_file(path, "{pid}: {time}\n".format( +            pid=os.getpid(), time=time())) +        report_diagnostic_event( +            'Successfully created reported ready marker file ' +            'while in the preprovisioning pool.', +            logger_func=LOG.debug) + +    @azure_ds_telemetry_reporter +    def _report_ready_if_needed(self): +        """Report ready to the platform if the marker file is not present, +        and create the marker file. +        """ +        have_not_reported_ready = ( +            not os.path.isfile(REPORTED_READY_MARKER_FILE)) + +        if have_not_reported_ready: +            report_diagnostic_event("Reporting ready before nic detach", +                                    logger_func=LOG.info) +            try: +                with EphemeralDHCPv4WithReporting(azure_ds_reporter) as lease: +                    self._report_ready(lease=lease) +            except Exception as e: +                report_diagnostic_event("Exception reporting ready during " +                                        "preprovisioning before nic detach: %s" +                                        % e, logger_func=LOG.error) +                raise +            self._create_report_ready_marker() +        else: +            report_diagnostic_event("Already reported ready before nic detach." +                                    " The marker file already exists: %s" % +                                    REPORTED_READY_MARKER_FILE, +                                    logger_func=LOG.error) + +    @azure_ds_telemetry_reporter +    def _check_if_nic_is_primary(self, ifname): +        """Check if a given interface is the primary nic or not. If it is the +        primary nic, then we also get the expected total nic count from IMDS. +        IMDS will process the request and send a response only for primary NIC. +        """ +        is_primary = False +        expected_nic_count = -1 +        imds_md = None + +        # For now, only a VM's primary NIC can contact IMDS and WireServer. If +        # DHCP fails for a NIC, we have no mechanism to determine if the NIC is +        # primary or secondary. In this case, the desired behavior is to fail +        # VM provisioning if there is any DHCP failure when trying to determine +        # the primary NIC. +        try: +            with events.ReportEventStack( +                    name="obtain-dhcp-lease", +                    description=("obtain dhcp lease for %s when attempting to " +                                 "determine primary NIC during reprovision of " +                                 "a pre-provisioned VM" % ifname), +                    parent=azure_ds_reporter): +                dhcp_ctx = EphemeralDHCPv4( +                    iface=ifname, +                    dhcp_log_func=dhcp_log_cb) +                dhcp_ctx.obtain_lease() +        except Exception as e: +            report_diagnostic_event("Giving up. Failed to obtain dhcp lease " +                                    "for %s when attempting to determine " +                                    "primary NIC during reprovision due to %s" +                                    % (ifname, e), logger_func=LOG.error) +            raise + +        # Primary nic detection will be optimized in the future. The fact that +        # primary nic is being attached first helps here. Otherwise each nic +        # could add several seconds of delay. +        try: +            imds_md = get_metadata_from_imds( +                ifname, +                5, +                metadata_type.network) +        except Exception as e: +            LOG.warning( +                "Failed to get network metadata using nic %s. Attempt to " +                "contact IMDS failed with error %s. Assuming this is not the " +                "primary nic.", ifname, e) +        finally: +            # If we are not the primary nic, then clean the dhcp context. +            if imds_md is None: +                dhcp_ctx.clean_network() + +        if imds_md is not None: +            # Only primary NIC will get a response from IMDS. +            LOG.info("%s is the primary nic", ifname) +            is_primary = True + +            # If primary, set ephemeral dhcp ctx so we can report ready +            self._ephemeral_dhcp_ctx = dhcp_ctx + +            # Set the expected nic count based on the response received. +            expected_nic_count = len( +                imds_md['interface']) +            report_diagnostic_event("Expected nic count: %d" % +                                    expected_nic_count, logger_func=LOG.info) + +        return is_primary, expected_nic_count + +    @azure_ds_telemetry_reporter +    def _wait_for_hot_attached_nics(self, nl_sock): +        """Wait until all the expected nics for the vm are hot-attached. +        The expected nic count is obtained by requesting the network metadata +        from IMDS. +        """ +        LOG.info("Waiting for nics to be hot-attached") +        try: +            # Wait for nics to be attached one at a time, until we know for +            # sure that all nics have been attached. +            nics_found = [] +            primary_nic_found = False +            expected_nic_count = -1 + +            # Wait for netlink nic attach events. After the first nic is +            # attached, we are already in the customer vm deployment path and +            # so eerything from then on should happen fast and avoid +            # unnecessary delays wherever possible. +            while True: +                ifname = None +                with events.ReportEventStack( +                        name="wait-for-nic-attach", +                        description=("wait for nic attach after %d nics have " +                                     "been attached" % len(nics_found)), +                        parent=azure_ds_reporter): +                    ifname = netlink.wait_for_nic_attach_event(nl_sock, +                                                               nics_found) + +                # wait_for_nic_attach_event guarantees that ifname it not None +                nics_found.append(ifname) +                report_diagnostic_event("Detected nic %s attached." % ifname, +                                        logger_func=LOG.info) + +                # Attempt to bring the interface's operating state to +                # UP in case it is not already. +                self.wait_for_link_up(ifname) + +                # If primary nic is not found, check if this is it. The +                # platform will attach the primary nic first so we +                # won't be in primary_nic_found = false state for long. +                if not primary_nic_found: +                    LOG.info("Checking if %s is the primary nic", +                             ifname) +                    (primary_nic_found, expected_nic_count) = ( +                        self._check_if_nic_is_primary(ifname)) + +                # Exit criteria: check if we've discovered all nics +                if (expected_nic_count != -1 +                        and len(nics_found) >= expected_nic_count): +                    LOG.info("Found all the nics for this VM.") +                    break + +        except AssertionError as error: +            report_diagnostic_event(error, logger_func=LOG.error) + +    @azure_ds_telemetry_reporter +    def _wait_for_all_nics_ready(self): +        """Wait for nic(s) to be hot-attached. There may be multiple nics +           depending on the customer request. +           But only primary nic would be able to communicate with wireserver +           and IMDS. So we detect and save the primary nic to be used later. +        """ + +        nl_sock = None +        try: +            nl_sock = netlink.create_bound_netlink_socket() + +            report_ready_marker_present = bool( +                os.path.isfile(REPORTED_READY_MARKER_FILE)) + +            # Report ready if the marker file is not already present. +            # The nic of the preprovisioned vm gets hot-detached as soon as +            # we report ready. So no need to save the dhcp context. +            self._report_ready_if_needed() + +            has_nic_been_detached = bool( +                os.path.isfile(REPROVISION_NIC_DETACHED_MARKER_FILE)) + +            if not has_nic_been_detached: +                LOG.info("NIC has not been detached yet.") +                self._wait_for_nic_detach(nl_sock) + +            # If we know that the preprovisioned nic has been detached, and we +            # still have a fallback nic, then it means the VM must have +            # rebooted as part of customer assignment, and all the nics have +            # already been attached by the Azure platform. So there is no need +            # to wait for nics to be hot-attached. +            if not self.fallback_interface: +                self._wait_for_hot_attached_nics(nl_sock) +            else: +                report_diagnostic_event("Skipping waiting for nic attach " +                                        "because we already have a fallback " +                                        "interface. Report Ready marker " +                                        "present before detaching nics: %s" % +                                        report_ready_marker_present, +                                        logger_func=LOG.info) +        except netlink.NetlinkCreateSocketError as e: +            report_diagnostic_event(e, logger_func=LOG.warning) +            raise +        finally: +            if nl_sock: +                nl_sock.close() +      def _poll_imds(self):          """Poll IMDS for the new provisioning data until we get a valid          response. Then return the returned JSON object.""" -        url = IMDS_URL + "reprovisiondata?api-version=2017-04-02" +        url = metadata_type.reprovisiondata.value          headers = {"Metadata": "true"}          nl_sock = None          report_ready = bool(not os.path.isfile(REPORTED_READY_MARKER_FILE)) @@ -705,17 +1028,31 @@ class DataSourceAzure(sources.DataSource):                  logger_func=LOG.warning)              return False -        LOG.debug("Wait for vnetswitch to happen") +        # When the interface is hot-attached, we would have already +        # done dhcp and set the dhcp context. In that case, skip +        # the attempt to do dhcp. +        is_ephemeral_ctx_present = self._ephemeral_dhcp_ctx is not None +        msg = ("Unexpected error. Dhcp context is not expected to be already " +               "set when we need to wait for vnet switch") +        if is_ephemeral_ctx_present and report_ready: +            report_diagnostic_event(msg, logger_func=LOG.error) +            raise RuntimeError(msg) +          while True:              try: -                # Save our EphemeralDHCPv4 context to avoid repeated dhcp -                with events.ReportEventStack( -                        name="obtain-dhcp-lease", -                        description="obtain dhcp lease", -                        parent=azure_ds_reporter): -                    self._ephemeral_dhcp_ctx = EphemeralDHCPv4( -                        dhcp_log_func=dhcp_log_cb) -                    lease = self._ephemeral_dhcp_ctx.obtain_lease() +                # Since is_ephemeral_ctx_present is set only once, this ensures +                # that with regular reprovisioning, dhcp is always done every +                # time the loop runs. +                if not is_ephemeral_ctx_present: +                    # Save our EphemeralDHCPv4 context to avoid repeated dhcp +                    # later when we report ready +                    with events.ReportEventStack( +                            name="obtain-dhcp-lease", +                            description="obtain dhcp lease", +                            parent=azure_ds_reporter): +                        self._ephemeral_dhcp_ctx = EphemeralDHCPv4( +                            dhcp_log_func=dhcp_log_cb) +                        lease = self._ephemeral_dhcp_ctx.obtain_lease()                  if vnet_switched:                      dhcp_attempts += 1 @@ -737,17 +1074,10 @@ class DataSourceAzure(sources.DataSource):                          self._ephemeral_dhcp_ctx.clean_network()                          raise sources.InvalidMetaDataException(msg) -                    path = REPORTED_READY_MARKER_FILE -                    LOG.info( -                        "Creating a marker file to report ready: %s", path) -                    util.write_file(path, "{pid}: {time}\n".format( -                        pid=os.getpid(), time=time())) -                    report_diagnostic_event( -                        'Successfully created reported ready marker file ' -                        'while in the preprovisioning pool.', -                        logger_func=LOG.debug) +                    self._create_report_ready_marker()                      report_ready = False +                    LOG.debug("Wait for vnetswitch to happen")                      with events.ReportEventStack(                              name="wait-for-media-disconnect-connect",                              description="wait for vnet switch", @@ -863,6 +1193,35 @@ class DataSourceAzure(sources.DataSource):                  "connectivity issues: %s" % e, logger_func=LOG.warning)              return False +    def _should_reprovision_after_nic_attach(self, candidate_metadata) -> bool: +        """Whether or not we should wait for nic attach and then poll +        IMDS for reprovisioning data. Also sets a marker file to poll IMDS. + +        The marker file is used for the following scenario: the VM boots into +        wait for nic attach, which we expect to be proceeding infinitely until +        the nic is attached. If for whatever reason the platform moves us to a +        new host (for instance a hardware issue), we need to keep waiting. +        However, since the VM reports ready to the Fabric, we will not attach +        the ISO, thus cloud-init needs to have a way of knowing that it should +        jump back into the waiting mode in order to retrieve the ovf_env. + +        @param candidate_metadata: Metadata obtained from reading ovf-env. +        @return: Whether to reprovision after waiting for nics to be attached. +        """ +        if not candidate_metadata: +            return False +        (_md, _userdata_raw, cfg, _files) = candidate_metadata +        path = REPROVISION_NIC_ATTACH_MARKER_FILE +        if (cfg.get('PreprovisionedVMType', None) == "Savable" or +                os.path.isfile(path)): +            if not os.path.isfile(path): +                LOG.info("Creating a marker file to wait for nic attach: %s", +                         path) +                util.write_file(path, "{pid}: {time}\n".format( +                    pid=os.getpid(), time=time())) +            return True +        return False +      def _should_reprovision(self, ret):          """Whether or not we should poll IMDS for reprovisioning data.          Also sets a marker file to poll IMDS. @@ -879,6 +1238,7 @@ class DataSourceAzure(sources.DataSource):          (_md, _userdata_raw, cfg, _files) = ret          path = REPROVISION_MARKER_FILE          if (cfg.get('PreprovisionedVm') is True or +                cfg.get('PreprovisionedVMType', None) == 'Running' or                  os.path.isfile(path)):              if not os.path.isfile(path):                  LOG.info("Creating a marker file to poll imds: %s", @@ -944,6 +1304,8 @@ class DataSourceAzure(sources.DataSource):          util.del_file(REPORTED_READY_MARKER_FILE)          util.del_file(REPROVISION_MARKER_FILE) +        util.del_file(REPROVISION_NIC_ATTACH_MARKER_FILE) +        util.del_file(REPROVISION_NIC_DETACHED_MARKER_FILE)          return fabric_data      @azure_ds_telemetry_reporter @@ -1399,34 +1761,109 @@ def read_azure_ovf(contents):      if 'ssh_pwauth' not in cfg and password:          cfg['ssh_pwauth'] = True -    cfg['PreprovisionedVm'] = _extract_preprovisioned_vm_setting(dom) +    preprovisioning_cfg = _get_preprovisioning_cfgs(dom) +    cfg = util.mergemanydict([cfg, preprovisioning_cfg])      return (md, ud, cfg)  @azure_ds_telemetry_reporter -def _extract_preprovisioned_vm_setting(dom): -    """Read the preprovision flag from the ovf. It should not -       exist unless true.""" +def _get_preprovisioning_cfgs(dom): +    """Read the preprovisioning related flags from ovf and populates a dict +    with the info. + +    Two flags are in use today: PreprovisionedVm bool and +    PreprovisionedVMType enum. In the long term, the PreprovisionedVm bool +    will be deprecated in favor of PreprovisionedVMType string/enum. + +    Only these combinations of values are possible today: +        - PreprovisionedVm=True and PreprovisionedVMType=Running +        - PreprovisionedVm=False and PreprovisionedVMType=Savable +        - PreprovisionedVm is missing and PreprovisionedVMType=Running/Savable +        - PreprovisionedVm=False and PreprovisionedVMType is missing + +    More specifically, this will never happen: +        - PreprovisionedVm=True and PreprovisionedVMType=Savable +    """ +    cfg = { +        "PreprovisionedVm": False, +        "PreprovisionedVMType": None +    } +      platform_settings_section = find_child(          dom.documentElement,          lambda n: n.localName == "PlatformSettingsSection")      if not platform_settings_section or len(platform_settings_section) == 0:          LOG.debug("PlatformSettingsSection not found") -        return False +        return cfg      platform_settings = find_child(          platform_settings_section[0],          lambda n: n.localName == "PlatformSettings")      if not platform_settings or len(platform_settings) == 0:          LOG.debug("PlatformSettings not found") -        return False -    preprovisionedVm = find_child( +        return cfg + +    # Read the PreprovisionedVm bool flag. This should be deprecated when the +    # platform has removed PreprovisionedVm and only surfaces +    # PreprovisionedVMType. +    cfg["PreprovisionedVm"] = _get_preprovisionedvm_cfg_value( +        platform_settings) + +    cfg["PreprovisionedVMType"] = _get_preprovisionedvmtype_cfg_value( +        platform_settings) +    return cfg + + +@azure_ds_telemetry_reporter +def _get_preprovisionedvm_cfg_value(platform_settings): +    preprovisionedVm = False + +    # Read the PreprovisionedVm bool flag. This should be deprecated when the +    # platform has removed PreprovisionedVm and only surfaces +    # PreprovisionedVMType. +    preprovisionedVmVal = find_child(          platform_settings[0],          lambda n: n.localName == "PreprovisionedVm") -    if not preprovisionedVm or len(preprovisionedVm) == 0: +    if not preprovisionedVmVal or len(preprovisionedVmVal) == 0:          LOG.debug("PreprovisionedVm not found") -        return False -    return util.translate_bool(preprovisionedVm[0].firstChild.nodeValue) +        return preprovisionedVm +    preprovisionedVm = util.translate_bool( +        preprovisionedVmVal[0].firstChild.nodeValue) + +    report_diagnostic_event( +        "PreprovisionedVm: %s" % preprovisionedVm, logger_func=LOG.info) + +    return preprovisionedVm + + +@azure_ds_telemetry_reporter +def _get_preprovisionedvmtype_cfg_value(platform_settings): +    preprovisionedVMType = None + +    # Read the PreprovisionedVMType value from the ovf. It can be +    # 'Running' or 'Savable' or not exist. This enum value is intended to +    # replace PreprovisionedVm bool flag in the long term. +    # A Running VM is the same as preprovisioned VMs of today. This is +    # equivalent to having PreprovisionedVm=True. +    # A Savable VM is one whose nic is hot-detached immediately after it +    # reports ready the first time to free up the network resources. +    # Once assigned to customer, the customer-requested nics are +    # hot-attached to it and reprovision happens like today. +    preprovisionedVMTypeVal = find_child( +        platform_settings[0], +        lambda n: n.localName == "PreprovisionedVMType") +    if (not preprovisionedVMTypeVal or len(preprovisionedVMTypeVal) == 0 or +            preprovisionedVMTypeVal[0].firstChild is None): +        LOG.debug("PreprovisionedVMType not found") +        return preprovisionedVMType + +    preprovisionedVMType = preprovisionedVMTypeVal[0].firstChild.nodeValue + +    report_diagnostic_event( +        "PreprovisionedVMType: %s" % preprovisionedVMType, +        logger_func=LOG.info) + +    return preprovisionedVMType  def encrypt_pass(password, salt_id="$6$"): @@ -1588,8 +2025,10 @@ def _generate_network_config_from_fallback_config() -> dict:  @azure_ds_telemetry_reporter -def get_metadata_from_imds(fallback_nic, retries): -    """Query Azure's network metadata service, returning a dictionary. +def get_metadata_from_imds(fallback_nic, +                           retries, +                           md_type=metadata_type.compute): +    """Query Azure's instance metadata service, returning a dictionary.      If network is not up, setup ephemeral dhcp on fallback_nic to talk to the      IMDS. For more info on IMDS: @@ -1604,7 +2043,7 @@ def get_metadata_from_imds(fallback_nic, retries):      """      kwargs = {'logfunc': LOG.debug,                'msg': 'Crawl of Azure Instance Metadata Service (IMDS)', -              'func': _get_metadata_from_imds, 'args': (retries,)} +              'func': _get_metadata_from_imds, 'args': (retries, md_type,)}      if net.is_up(fallback_nic):          return util.log_time(**kwargs)      else: @@ -1620,9 +2059,9 @@ def get_metadata_from_imds(fallback_nic, retries):  @azure_ds_telemetry_reporter -def _get_metadata_from_imds(retries): +def _get_metadata_from_imds(retries, md_type=metadata_type.compute): -    url = IMDS_URL + "instance?api-version=2019-06-01" +    url = md_type.value      headers = {"Metadata": "true"}      try:          response = readurl( diff --git a/cloudinit/sources/helpers/netlink.py b/cloudinit/sources/helpers/netlink.py index c2ad587b..e13d6834 100644 --- a/cloudinit/sources/helpers/netlink.py +++ b/cloudinit/sources/helpers/netlink.py @@ -185,6 +185,54 @@ def read_rta_oper_state(data):      return InterfaceOperstate(ifname, operstate) +def wait_for_nic_attach_event(netlink_socket, existing_nics): +    '''Block until a single nic is attached. + +    :param: netlink_socket: netlink_socket to receive events +    :param: existing_nics: List of existing nics so that we can skip them. +    :raises: AssertionError if netlink_socket is none. +    ''' +    LOG.debug("Preparing to wait for nic attach.") +    ifname = None + +    def should_continue_cb(iname, carrier, prevCarrier): +        if iname in existing_nics: +            return True +        nonlocal ifname +        ifname = iname +        return False + +    # We can return even if the operational state of the new nic is DOWN +    # because we set it to UP before doing dhcp. +    read_netlink_messages(netlink_socket, +                          None, +                          [RTM_NEWLINK], +                          [OPER_UP, OPER_DOWN], +                          should_continue_cb) +    return ifname + + +def wait_for_nic_detach_event(netlink_socket): +    '''Block until a single nic is detached and its operational state is down. + +    :param: netlink_socket: netlink_socket to receive events. +    ''' +    LOG.debug("Preparing to wait for nic detach.") +    ifname = None + +    def should_continue_cb(iname, carrier, prevCarrier): +        nonlocal ifname +        ifname = iname +        return False + +    read_netlink_messages(netlink_socket, +                          None, +                          [RTM_DELLINK], +                          [OPER_DOWN], +                          should_continue_cb) +    return ifname + +  def wait_for_media_disconnect_connect(netlink_socket, ifname):      '''Block until media disconnect and connect has happened on an interface.      Listens on netlink socket to receive netlink events and when the carrier @@ -198,10 +246,42 @@ def wait_for_media_disconnect_connect(netlink_socket, ifname):      assert (netlink_socket is not None), ("netlink socket is none")      assert (ifname is not None), ("interface name is none")      assert (len(ifname) > 0), ("interface name cannot be empty") + +    def should_continue_cb(iname, carrier, prevCarrier): +        # check for carrier down, up sequence +        isVnetSwitch = (prevCarrier == OPER_DOWN) and (carrier == OPER_UP) +        if isVnetSwitch: +            LOG.debug("Media switch happened on %s.", ifname) +            return False +        return True + +    LOG.debug("Wait for media disconnect and reconnect to happen") +    read_netlink_messages(netlink_socket, +                          ifname, +                          [RTM_NEWLINK, RTM_DELLINK], +                          [OPER_UP, OPER_DOWN], +                          should_continue_cb) + + +def read_netlink_messages(netlink_socket, +                          ifname_filter, +                          rtm_types, +                          operstates, +                          should_continue_callback): +    ''' Reads from the netlink socket until the condition specified by +    the continuation callback is met. + +    :param: netlink_socket: netlink_socket to receive events. +    :param: ifname_filter: if not None, will only listen for this interface. +    :param: rtm_types: Type of netlink events to listen for. +    :param: operstates: Operational states to listen. +    :param: should_continue_callback: Specifies when to stop listening. +    ''' +    if netlink_socket is None: +        raise RuntimeError("Netlink socket is none") +    data = bytes()      carrier = OPER_UP      prevCarrier = OPER_UP -    data = bytes() -    LOG.debug("Wait for media disconnect and reconnect to happen")      while True:          recv_data = read_netlink_socket(netlink_socket, SELECT_TIMEOUT)          if recv_data is None: @@ -223,26 +303,26 @@ def wait_for_media_disconnect_connect(netlink_socket, ifname):              padlen = (nlheader.length+PAD_ALIGNMENT-1) & ~(PAD_ALIGNMENT-1)              offset = offset + padlen              LOG.debug('offset to next netlink message: %d', offset) -            # Ignore any messages not new link or del link -            if nlheader.type not in [RTM_NEWLINK, RTM_DELLINK]: +            # Continue if we are not interested in this message. +            if nlheader.type not in rtm_types:                  continue              interface_state = read_rta_oper_state(nl_msg)              if interface_state is None:                  LOG.debug('Failed to read rta attributes: %s', interface_state)                  continue -            if interface_state.ifname != ifname: +            if (ifname_filter is not None and +                    interface_state.ifname != ifname_filter):                  LOG.debug(                      "Ignored netlink event on interface %s. Waiting for %s.", -                    interface_state.ifname, ifname) +                    interface_state.ifname, ifname_filter)                  continue -            if interface_state.operstate not in [OPER_UP, OPER_DOWN]: +            if interface_state.operstate not in operstates:                  continue              prevCarrier = carrier              carrier = interface_state.operstate -            # check for carrier down, up sequence -            isVnetSwitch = (prevCarrier == OPER_DOWN) and (carrier == OPER_UP) -            if isVnetSwitch: -                LOG.debug("Media switch happened on %s.", ifname) +            if not should_continue_callback(interface_state.ifname, +                                            carrier, +                                            prevCarrier):                  return          data = data[offset:] diff --git a/cloudinit/sources/helpers/tests/test_netlink.py b/cloudinit/sources/helpers/tests/test_netlink.py index 10760bd6..cafe3961 100644 --- a/cloudinit/sources/helpers/tests/test_netlink.py +++ b/cloudinit/sources/helpers/tests/test_netlink.py @@ -9,9 +9,10 @@ import codecs  from cloudinit.sources.helpers.netlink import (      NetlinkCreateSocketError, create_bound_netlink_socket, read_netlink_socket,      read_rta_oper_state, unpack_rta_attr, wait_for_media_disconnect_connect, +    wait_for_nic_attach_event, wait_for_nic_detach_event,      OPER_DOWN, OPER_UP, OPER_DORMANT, OPER_LOWERLAYERDOWN, OPER_NOTPRESENT, -    OPER_TESTING, OPER_UNKNOWN, RTATTR_START_OFFSET, RTM_NEWLINK, RTM_SETLINK, -    RTM_GETLINK, MAX_SIZE) +    OPER_TESTING, OPER_UNKNOWN, RTATTR_START_OFFSET, RTM_NEWLINK, RTM_DELLINK, +    RTM_SETLINK, RTM_GETLINK, MAX_SIZE)  def int_to_bytes(i): @@ -135,6 +136,75 @@ class TestParseNetlinkMessage(CiTestCase):  @mock.patch('cloudinit.sources.helpers.netlink.socket.socket')  @mock.patch('cloudinit.sources.helpers.netlink.read_netlink_socket') +class TestNicAttachDetach(CiTestCase): +    with_logs = True + +    def _media_switch_data(self, ifname, msg_type, operstate): +        '''construct netlink data with specified fields''' +        if ifname and operstate is not None: +            data = bytearray(48) +            bytes = ifname.encode("utf-8") +            struct.pack_into("HH4sHHc", data, RTATTR_START_OFFSET, 8, 3, +                             bytes, 5, 16, int_to_bytes(operstate)) +        elif ifname: +            data = bytearray(40) +            bytes = ifname.encode("utf-8") +            struct.pack_into("HH4s", data, RTATTR_START_OFFSET, 8, 3, bytes) +        elif operstate: +            data = bytearray(40) +            struct.pack_into("HHc", data, RTATTR_START_OFFSET, 5, 16, +                             int_to_bytes(operstate)) +        struct.pack_into("=LHHLL", data, 0, len(data), msg_type, 0, 0, 0) +        return data + +    def test_nic_attached_oper_down(self, m_read_netlink_socket, m_socket): +        '''Test for a new nic attached''' +        ifname = "eth0" +        data_op_down = self._media_switch_data(ifname, RTM_NEWLINK, OPER_DOWN) +        m_read_netlink_socket.side_effect = [data_op_down] +        ifread = wait_for_nic_attach_event(m_socket, []) +        self.assertEqual(m_read_netlink_socket.call_count, 1) +        self.assertEqual(ifname, ifread) + +    def test_nic_attached_oper_up(self, m_read_netlink_socket, m_socket): +        '''Test for a new nic attached''' +        ifname = "eth0" +        data_op_up = self._media_switch_data(ifname, RTM_NEWLINK, OPER_UP) +        m_read_netlink_socket.side_effect = [data_op_up] +        ifread = wait_for_nic_attach_event(m_socket, []) +        self.assertEqual(m_read_netlink_socket.call_count, 1) +        self.assertEqual(ifname, ifread) + +    def test_nic_attach_ignore_existing(self, m_read_netlink_socket, m_socket): +        '''Test that we read only the interfaces we are interested in.''' +        data_eth0 = self._media_switch_data("eth0", RTM_NEWLINK, OPER_DOWN) +        data_eth1 = self._media_switch_data("eth1", RTM_NEWLINK, OPER_DOWN) +        m_read_netlink_socket.side_effect = [data_eth0, data_eth1] +        ifread = wait_for_nic_attach_event(m_socket, ["eth0"]) +        self.assertEqual(m_read_netlink_socket.call_count, 2) +        self.assertEqual("eth1", ifread) + +    def test_nic_attach_read_first(self, m_read_netlink_socket, m_socket): +        '''Test that we read only the interfaces we are interested in.''' +        data_eth0 = self._media_switch_data("eth0", RTM_NEWLINK, OPER_DOWN) +        data_eth1 = self._media_switch_data("eth1", RTM_NEWLINK, OPER_DOWN) +        m_read_netlink_socket.side_effect = [data_eth0, data_eth1] +        ifread = wait_for_nic_attach_event(m_socket, ["eth1"]) +        self.assertEqual(m_read_netlink_socket.call_count, 1) +        self.assertEqual("eth0", ifread) + +    def test_nic_detached(self, m_read_netlink_socket, m_socket): +        '''Test for an existing nic detached''' +        ifname = "eth0" +        data_op_down = self._media_switch_data(ifname, RTM_DELLINK, OPER_DOWN) +        m_read_netlink_socket.side_effect = [data_op_down] +        ifread = wait_for_nic_detach_event(m_socket) +        self.assertEqual(m_read_netlink_socket.call_count, 1) +        self.assertEqual(ifname, ifread) + + +@mock.patch('cloudinit.sources.helpers.netlink.socket.socket') +@mock.patch('cloudinit.sources.helpers.netlink.read_netlink_socket')  class TestWaitForMediaDisconnectConnect(CiTestCase):      with_logs = True diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py index 534314aa..e363c1f9 100644 --- a/tests/unittests/test_datasource/test_azure.py +++ b/tests/unittests/test_datasource/test_azure.py @@ -11,6 +11,7 @@ from cloudinit.version import version_string as vs  from cloudinit.tests.helpers import (      HttprettyTestCase, CiTestCase, populate_dir, mock, wrap_and_call,      ExitStack, resourceLocation) +from cloudinit.sources.helpers import netlink  import copy  import crypt @@ -78,6 +79,8 @@ def construct_valid_ovf_env(data=None, pubkeys=None,      if platform_settings:          for k, v in platform_settings.items():              content += "<%s>%s</%s>\n" % (k, v, k) +        if "PreprovisionedVMType" not in platform_settings: +            content += """<PreprovisionedVMType i:nil="true" />"""      content += """</PlatformSettings></wa:PlatformSettingsSection>  </Environment>""" @@ -156,6 +159,31 @@ SECONDARY_INTERFACE = {      }  } +IMDS_NETWORK_METADATA = { +    "interface": [ +        { +            "macAddress": "000D3A047598", +            "ipv6": { +                "ipAddress": [] +            }, +            "ipv4": { +                "subnet": [ +                    { +                        "prefix": "24", +                        "address": "10.0.0.0" +                    } +                ], +                "ipAddress": [ +                    { +                        "privateIpAddress": "10.0.0.4", +                        "publicIpAddress": "104.46.124.81" +                    } +                ] +            } +        } +    ] +} +  MOCKPATH = 'cloudinit.sources.DataSourceAzure.' @@ -366,8 +394,8 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):          self.network_md_url = dsaz.IMDS_URL + "instance?api-version=2019-06-01"      @mock.patch(MOCKPATH + 'readurl') -    @mock.patch(MOCKPATH + 'EphemeralDHCPv4') -    @mock.patch(MOCKPATH + 'net.is_up') +    @mock.patch(MOCKPATH + 'EphemeralDHCPv4', autospec=True) +    @mock.patch(MOCKPATH + 'net.is_up', autospec=True)      def test_get_metadata_does_not_dhcp_if_network_is_up(              self, m_net_is_up, m_dhcp, m_readurl):          """Do not perform DHCP setup when nic is already up.""" @@ -384,9 +412,66 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):              "Crawl of Azure Instance Metadata Service (IMDS) took",  # log_time              self.logs.getvalue()) -    @mock.patch(MOCKPATH + 'readurl') -    @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting') +    @mock.patch(MOCKPATH + 'readurl', autospec=True) +    @mock.patch(MOCKPATH + 'EphemeralDHCPv4') +    @mock.patch(MOCKPATH + 'net.is_up') +    def test_get_compute_metadata_uses_compute_url( +            self, m_net_is_up, m_dhcp, m_readurl): +        """Make sure readurl is called with the correct url when accessing +        network metadata""" +        m_net_is_up.return_value = True +        m_readurl.return_value = url_helper.StringResponse( +            json.dumps(IMDS_NETWORK_METADATA).encode('utf-8')) + +        dsaz.get_metadata_from_imds( +            'eth0', retries=3, md_type=dsaz.metadata_type.compute) +        m_readurl.assert_called_with( +            "http://169.254.169.254/metadata/instance?api-version=" +            "2019-06-01", exception_cb=mock.ANY, +            headers=mock.ANY, retries=mock.ANY, +            timeout=mock.ANY) + +    @mock.patch(MOCKPATH + 'readurl', autospec=True) +    @mock.patch(MOCKPATH + 'EphemeralDHCPv4') +    @mock.patch(MOCKPATH + 'net.is_up') +    def test_get_network_metadata_uses_network_url( +            self, m_net_is_up, m_dhcp, m_readurl): +        """Make sure readurl is called with the correct url when accessing +        network metadata""" +        m_net_is_up.return_value = True +        m_readurl.return_value = url_helper.StringResponse( +            json.dumps(IMDS_NETWORK_METADATA).encode('utf-8')) + +        dsaz.get_metadata_from_imds( +            'eth0', retries=3, md_type=dsaz.metadata_type.network) +        m_readurl.assert_called_with( +            "http://169.254.169.254/metadata/instance/network?api-version=" +            "2019-06-01", exception_cb=mock.ANY, +            headers=mock.ANY, retries=mock.ANY, +            timeout=mock.ANY) + +    @mock.patch(MOCKPATH + 'readurl', autospec=True) +    @mock.patch(MOCKPATH + 'EphemeralDHCPv4')      @mock.patch(MOCKPATH + 'net.is_up') +    def test_get_default_metadata_uses_compute_url( +            self, m_net_is_up, m_dhcp, m_readurl): +        """Make sure readurl is called with the correct url when accessing +        network metadata""" +        m_net_is_up.return_value = True +        m_readurl.return_value = url_helper.StringResponse( +            json.dumps(IMDS_NETWORK_METADATA).encode('utf-8')) + +        dsaz.get_metadata_from_imds( +            'eth0', retries=3) +        m_readurl.assert_called_with( +            "http://169.254.169.254/metadata/instance?api-version=" +            "2019-06-01", exception_cb=mock.ANY, +            headers=mock.ANY, retries=mock.ANY, +            timeout=mock.ANY) + +    @mock.patch(MOCKPATH + 'readurl', autospec=True) +    @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting', autospec=True) +    @mock.patch(MOCKPATH + 'net.is_up', autospec=True)      def test_get_metadata_performs_dhcp_when_network_is_down(              self, m_net_is_up, m_dhcp, m_readurl):          """Perform DHCP setup when nic is not up.""" @@ -410,7 +495,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):              timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS)      @mock.patch('cloudinit.url_helper.time.sleep') -    @mock.patch(MOCKPATH + 'net.is_up') +    @mock.patch(MOCKPATH + 'net.is_up', autospec=True)      def test_get_metadata_from_imds_empty_when_no_imds_present(              self, m_net_is_up, m_sleep):          """Return empty dict when IMDS network metadata is absent.""" @@ -431,7 +516,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):      @mock.patch('requests.Session.request')      @mock.patch('cloudinit.url_helper.time.sleep') -    @mock.patch(MOCKPATH + 'net.is_up') +    @mock.patch(MOCKPATH + 'net.is_up', autospec=True)      def test_get_metadata_from_imds_retries_on_timeout(              self, m_net_is_up, m_sleep, m_request):          """Retry IMDS network metadata on timeout errors.""" @@ -801,6 +886,7 @@ scbus-1 on xpt0 bus 0                  'sys_cfg': {}}          dsrc = self._get_ds(data)          expected_cfg = { +            'PreprovisionedVMType': None,              'PreprovisionedVm': False,              'datasource': {'Azure': {'agent_command': 'my_command'}},              'system_info': {'default_user': {'name': u'myuser'}}} @@ -864,6 +950,66 @@ scbus-1 on xpt0 bus 0          dsrc.crawl_metadata()          self.assertEqual(1, m_report_ready.call_count) +    @mock.patch( +        'cloudinit.sources.DataSourceAzure.EphemeralDHCPv4WithReporting') +    @mock.patch('cloudinit.sources.DataSourceAzure.util.write_file') +    @mock.patch( +        'cloudinit.sources.DataSourceAzure.DataSourceAzure._report_ready') +    @mock.patch('cloudinit.sources.DataSourceAzure.DataSourceAzure._poll_imds') +    @mock.patch( +        'cloudinit.sources.DataSourceAzure.DataSourceAzure.' +        '_wait_for_all_nics_ready') +    def test_crawl_metadata_waits_for_nic_on_savable_vms( +        self, detect_nics, poll_imds_func, report_ready_func, m_write, m_dhcp +    ): +        """If reprovisioning, report ready at the end""" +        ovfenv = construct_valid_ovf_env( +            platform_settings={"PreprovisionedVMType": "Savable", +                               "PreprovisionedVm": "True"} +        ) + +        data = { +            'ovfcontent': ovfenv, +            'sys_cfg': {} +        } +        dsrc = self._get_ds(data) +        poll_imds_func.return_value = ovfenv +        dsrc.crawl_metadata() +        self.assertEqual(1, report_ready_func.call_count) +        self.assertEqual(1, detect_nics.call_count) + +    @mock.patch( +        'cloudinit.sources.DataSourceAzure.EphemeralDHCPv4WithReporting') +    @mock.patch('cloudinit.sources.DataSourceAzure.util.write_file') +    @mock.patch( +        'cloudinit.sources.DataSourceAzure.DataSourceAzure._report_ready') +    @mock.patch('cloudinit.sources.DataSourceAzure.DataSourceAzure._poll_imds') +    @mock.patch( +        'cloudinit.sources.DataSourceAzure.DataSourceAzure.' +        '_wait_for_all_nics_ready') +    @mock.patch('os.path.isfile') +    def test_detect_nics_when_marker_present( +        self, is_file, detect_nics, poll_imds_func, report_ready_func, m_write, +            m_dhcp): +        """If reprovisioning, wait for nic attach if marker present""" + +        def is_file_ret(key): +            return key == dsaz.REPROVISION_NIC_ATTACH_MARKER_FILE + +        is_file.side_effect = is_file_ret +        ovfenv = construct_valid_ovf_env() + +        data = { +            'ovfcontent': ovfenv, +            'sys_cfg': {} +        } + +        dsrc = self._get_ds(data) +        poll_imds_func.return_value = ovfenv +        dsrc.crawl_metadata() +        self.assertEqual(1, report_ready_func.call_count) +        self.assertEqual(1, detect_nics.call_count) +      @mock.patch('cloudinit.sources.DataSourceAzure.util.write_file')      @mock.patch('cloudinit.sources.helpers.netlink.'                  'wait_for_media_disconnect_connect') @@ -1526,7 +1672,7 @@ scbus-1 on xpt0 bus 0      @mock.patch('cloudinit.net.get_interface_mac')      @mock.patch('cloudinit.net.get_devicelist')      @mock.patch('cloudinit.net.device_driver') -    @mock.patch('cloudinit.net.generate_fallback_config') +    @mock.patch('cloudinit.net.generate_fallback_config', autospec=True)      def test_fallback_network_config(self, mock_fallback, mock_dd,                                       mock_devlist, mock_get_mac):          """On absent IMDS network data, generate network fallback config.""" @@ -1561,7 +1707,7 @@ scbus-1 on xpt0 bus 0              blacklist_drivers=['mlx4_core', 'mlx5_core'],              config_driver=True) -    @mock.patch(MOCKPATH + 'net.get_interfaces') +    @mock.patch(MOCKPATH + 'net.get_interfaces', autospec=True)      @mock.patch(MOCKPATH + 'util.is_FreeBSD')      def test_blacklist_through_distro(              self, m_is_freebsd, m_net_get_interfaces): @@ -1583,17 +1729,17 @@ scbus-1 on xpt0 bus 0          m_net_get_interfaces.assert_called_with(              blacklist_drivers=dsaz.BLACKLIST_DRIVERS) -    @mock.patch(MOCKPATH + 'subp.subp') +    @mock.patch(MOCKPATH + 'subp.subp', autospec=True)      def test_get_hostname_with_no_args(self, m_subp):          dsaz.get_hostname()          m_subp.assert_called_once_with(("hostname",), capture=True) -    @mock.patch(MOCKPATH + 'subp.subp') +    @mock.patch(MOCKPATH + 'subp.subp', autospec=True)      def test_get_hostname_with_string_arg(self, m_subp):          dsaz.get_hostname(hostname_command="hostname")          m_subp.assert_called_once_with(("hostname",), capture=True) -    @mock.patch(MOCKPATH + 'subp.subp') +    @mock.patch(MOCKPATH + 'subp.subp', autospec=True)      def test_get_hostname_with_iterable_arg(self, m_subp):          dsaz.get_hostname(hostname_command=("hostname",))          m_subp.assert_called_once_with(("hostname",), capture=True) @@ -2224,6 +2370,29 @@ class TestPreprovisioningReadAzureOvfFlag(CiTestCase):          ret = dsaz.read_azure_ovf(content)          cfg = ret[2]          self.assertFalse(cfg['PreprovisionedVm']) +        self.assertEqual(None, cfg["PreprovisionedVMType"]) + +    def test_read_azure_ovf_with_running_type(self): +        """The read_azure_ovf method should set PreprovisionedVMType +           cfg flag to Running.""" +        content = construct_valid_ovf_env( +            platform_settings={"PreprovisionedVMType": "Running", +                               "PreprovisionedVm": "True"}) +        ret = dsaz.read_azure_ovf(content) +        cfg = ret[2] +        self.assertTrue(cfg['PreprovisionedVm']) +        self.assertEqual("Running", cfg['PreprovisionedVMType']) + +    def test_read_azure_ovf_with_savable_type(self): +        """The read_azure_ovf method should set PreprovisionedVMType +           cfg flag to Savable.""" +        content = construct_valid_ovf_env( +            platform_settings={"PreprovisionedVMType": "Savable", +                               "PreprovisionedVm": "True"}) +        ret = dsaz.read_azure_ovf(content) +        cfg = ret[2] +        self.assertTrue(cfg['PreprovisionedVm']) +        self.assertEqual("Savable", cfg['PreprovisionedVMType'])  @mock.patch('os.path.isfile') @@ -2273,6 +2442,227 @@ class TestPreprovisioningShouldReprovision(CiTestCase):          _poll_imds.assert_called_with() +class TestPreprovisioningHotAttachNics(CiTestCase): + +    def setUp(self): +        super(TestPreprovisioningHotAttachNics, self).setUp() +        self.tmp = self.tmp_dir() +        self.waagent_d = self.tmp_path('/var/lib/waagent', self.tmp) +        self.paths = helpers.Paths({'cloud_dir': self.tmp}) +        dsaz.BUILTIN_DS_CONFIG['data_dir'] = self.waagent_d +        self.paths = helpers.Paths({'cloud_dir': self.tmp}) + +    @mock.patch('cloudinit.sources.helpers.netlink.wait_for_nic_detach_event', +                autospec=True) +    @mock.patch(MOCKPATH + 'util.write_file', autospec=True) +    def test_nic_detach_writes_marker(self, m_writefile, m_detach): +        """When we detect that a nic gets detached, we write a marker for it""" +        dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) +        nl_sock = mock.MagicMock() +        dsa._wait_for_nic_detach(nl_sock) +        m_detach.assert_called_with(nl_sock) +        self.assertEqual(1, m_detach.call_count) +        m_writefile.assert_called_with( +            dsaz.REPROVISION_NIC_DETACHED_MARKER_FILE, mock.ANY) + +    @mock.patch(MOCKPATH + 'util.write_file', autospec=True) +    @mock.patch(MOCKPATH + 'DataSourceAzure.fallback_interface') +    @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting') +    @mock.patch(MOCKPATH + 'DataSourceAzure._report_ready') +    @mock.patch(MOCKPATH + 'DataSourceAzure._wait_for_nic_detach') +    def test_detect_nic_attach_reports_ready_and_waits_for_detach( +            self, m_detach, m_report_ready, m_dhcp, m_fallback_if, +            m_writefile): +        """Report ready first and then wait for nic detach""" +        dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) +        dsa._wait_for_all_nics_ready() +        m_fallback_if.return_value = "Dummy interface" +        self.assertEqual(1, m_report_ready.call_count) +        self.assertEqual(1, m_detach.call_count) +        self.assertEqual(1, m_writefile.call_count) +        self.assertEqual(1, m_dhcp.call_count) +        m_writefile.assert_called_with(dsaz.REPORTED_READY_MARKER_FILE, +                                       mock.ANY) + +    @mock.patch('os.path.isfile') +    @mock.patch(MOCKPATH + 'DataSourceAzure.fallback_interface') +    @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting') +    @mock.patch(MOCKPATH + 'DataSourceAzure._report_ready') +    @mock.patch(MOCKPATH + 'DataSourceAzure._wait_for_nic_detach') +    def test_detect_nic_attach_skips_report_ready_when_marker_present( +            self, m_detach, m_report_ready, m_dhcp, m_fallback_if, m_isfile): +        """Skip reporting ready if we already have a marker file.""" +        dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) + +        def isfile(key): +            return key == dsaz.REPORTED_READY_MARKER_FILE + +        m_isfile.side_effect = isfile +        dsa._wait_for_all_nics_ready() +        m_fallback_if.return_value = "Dummy interface" +        self.assertEqual(0, m_report_ready.call_count) +        self.assertEqual(0, m_dhcp.call_count) +        self.assertEqual(1, m_detach.call_count) + +    @mock.patch('os.path.isfile') +    @mock.patch(MOCKPATH + 'DataSourceAzure.fallback_interface') +    @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting') +    @mock.patch(MOCKPATH + 'DataSourceAzure._report_ready') +    @mock.patch(MOCKPATH + 'DataSourceAzure._wait_for_nic_detach') +    def test_detect_nic_attach_skips_nic_detach_when_marker_present( +            self, m_detach, m_report_ready, m_dhcp, m_fallback_if, m_isfile): +        """Skip wait for nic detach if it already happened.""" +        dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) + +        m_isfile.return_value = True +        dsa._wait_for_all_nics_ready() +        m_fallback_if.return_value = "Dummy interface" +        self.assertEqual(0, m_report_ready.call_count) +        self.assertEqual(0, m_dhcp.call_count) +        self.assertEqual(0, m_detach.call_count) + +    @mock.patch(MOCKPATH + 'DataSourceAzure.wait_for_link_up', autospec=True) +    @mock.patch('cloudinit.sources.helpers.netlink.wait_for_nic_attach_event') +    @mock.patch('cloudinit.sources.net.find_fallback_nic') +    @mock.patch(MOCKPATH + 'get_metadata_from_imds') +    @mock.patch(MOCKPATH + 'EphemeralDHCPv4') +    @mock.patch(MOCKPATH + 'DataSourceAzure._wait_for_nic_detach') +    @mock.patch('os.path.isfile') +    def test_wait_for_nic_attach_if_no_fallback_interface( +            self, m_isfile, m_detach, m_dhcpv4, m_imds, m_fallback_if, +            m_attach, m_link_up): +        """Wait for nic attach if we do not have a fallback interface""" +        dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) +        lease = { +            'interface': 'eth9', 'fixed-address': '192.168.2.9', +            'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0', +            'unknown-245': '624c3620'} + +        m_isfile.return_value = True +        m_attach.return_value = "eth0" +        dhcp_ctx = mock.MagicMock(lease=lease) +        dhcp_ctx.obtain_lease.return_value = lease +        m_dhcpv4.return_value = dhcp_ctx +        m_imds.return_value = IMDS_NETWORK_METADATA +        m_fallback_if.return_value = None + +        dsa._wait_for_all_nics_ready() + +        self.assertEqual(0, m_detach.call_count) +        self.assertEqual(1, m_attach.call_count) +        self.assertEqual(1, m_dhcpv4.call_count) +        self.assertEqual(1, m_imds.call_count) +        self.assertEqual(1, m_link_up.call_count) +        m_link_up.assert_called_with(mock.ANY, "eth0") + +    @mock.patch(MOCKPATH + 'DataSourceAzure.wait_for_link_up') +    @mock.patch('cloudinit.sources.helpers.netlink.wait_for_nic_attach_event') +    @mock.patch('cloudinit.sources.net.find_fallback_nic') +    @mock.patch(MOCKPATH + 'get_metadata_from_imds') +    @mock.patch(MOCKPATH + 'EphemeralDHCPv4') +    @mock.patch(MOCKPATH + 'DataSourceAzure._wait_for_nic_detach') +    @mock.patch('os.path.isfile') +    def test_wait_for_nic_attach_multinic_attach( +            self, m_isfile, m_detach, m_dhcpv4, m_imds, m_fallback_if, +            m_attach, m_link_up): +        """Wait for nic attach if we do not have a fallback interface""" +        dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) +        lease = { +            'interface': 'eth9', 'fixed-address': '192.168.2.9', +            'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0', +            'unknown-245': '624c3620'} +        m_attach_call_count = 0 + +        def nic_attach_ret(nl_sock, nics_found): +            nonlocal m_attach_call_count +            if m_attach_call_count == 0: +                m_attach_call_count = m_attach_call_count + 1 +                return "eth0" +            return "eth1" + +        def network_metadata_ret(ifname, retries, type): +            # Simulate two NICs by adding the same one twice. +            md = IMDS_NETWORK_METADATA +            md['interface'].append(md['interface'][0]) +            if ifname == "eth0": +                return md +            raise requests.Timeout('Fake connection timeout') + +        m_isfile.return_value = True +        m_attach.side_effect = nic_attach_ret +        dhcp_ctx = mock.MagicMock(lease=lease) +        dhcp_ctx.obtain_lease.return_value = lease +        m_dhcpv4.return_value = dhcp_ctx +        m_imds.side_effect = network_metadata_ret +        m_fallback_if.return_value = None + +        dsa._wait_for_all_nics_ready() + +        self.assertEqual(0, m_detach.call_count) +        self.assertEqual(2, m_attach.call_count) +        # DHCP and network metadata calls will only happen on the primary NIC. +        self.assertEqual(1, m_dhcpv4.call_count) +        self.assertEqual(1, m_imds.call_count) +        self.assertEqual(2, m_link_up.call_count) + +    @mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up') +    def test_wait_for_link_up_returns_if_already_up( +            self, m_is_link_up): +        """Waiting for link to be up should return immediately if the link is +           already up.""" + +        distro_cls = distros.fetch('ubuntu') +        distro = distro_cls('ubuntu', {}, self.paths) +        dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths) +        m_is_link_up.return_value = True + +        dsa.wait_for_link_up("eth0") +        self.assertEqual(1, m_is_link_up.call_count) + +    @mock.patch(MOCKPATH + 'util.write_file') +    @mock.patch('cloudinit.net.read_sys_net') +    @mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up') +    def test_wait_for_link_up_writes_to_device_file( +            self, m_is_link_up, m_read_sys_net, m_writefile): +        """Waiting for link to be up should return immediately if the link is +           already up.""" + +        distro_cls = distros.fetch('ubuntu') +        distro = distro_cls('ubuntu', {}, self.paths) +        dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths) + +        callcount = 0 + +        def linkup(key): +            nonlocal callcount +            if callcount == 0: +                callcount += 1 +                return False +            return True + +        m_is_link_up.side_effect = linkup + +        dsa.wait_for_link_up("eth0") +        self.assertEqual(2, m_is_link_up.call_count) +        self.assertEqual(1, m_read_sys_net.call_count) +        self.assertEqual(2, m_writefile.call_count) + +    @mock.patch('cloudinit.sources.helpers.netlink.' +                'create_bound_netlink_socket') +    def test_wait_for_all_nics_ready_raises_if_socket_fails(self, m_socket): +        """Waiting for all nics should raise exception if netlink socket +           creation fails.""" + +        m_socket.side_effect = netlink.NetlinkCreateSocketError +        distro_cls = distros.fetch('ubuntu') +        distro = distro_cls('ubuntu', {}, self.paths) +        dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths) + +        self.assertRaises(netlink.NetlinkCreateSocketError, +                          dsa._wait_for_all_nics_ready) +        # dsa._wait_for_all_nics_ready() + +  @mock.patch('cloudinit.net.dhcp.EphemeralIPv4Network')  @mock.patch('cloudinit.net.dhcp.maybe_perform_dhcp_discovery')  @mock.patch('cloudinit.sources.helpers.netlink.' @@ -2330,6 +2720,24 @@ class TestPreprovisioningPollIMDS(CiTestCase):          self.assertEqual(3, m_dhcpv4.call_count, 'Expected 3 DHCP calls')          self.assertEqual(4, self.tries, 'Expected 4 total reads from IMDS') +    @mock.patch('os.path.isfile') +    def test_poll_imds_skips_dhcp_if_ctx_present( +            self, m_isfile, report_ready_func, fake_resp, m_media_switch, +            m_dhcp, m_net): +        """The poll_imds function should reuse the dhcp ctx if it is already +           present. This happens when we wait for nic to be hot-attached before +           polling for reprovisiondata. Note that if this ctx is set when +           _poll_imds is called, then it is not expected to be waiting for +           media_disconnect_connect either.""" +        report_file = self.tmp_path('report_marker', self.tmp) +        m_isfile.return_value = True +        dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) +        dsa._ephemeral_dhcp_ctx = "Dummy dhcp ctx" +        with mock.patch(MOCKPATH + 'REPORTED_READY_MARKER_FILE', report_file): +            dsa._poll_imds() +        self.assertEqual(0, m_dhcp.call_count) +        self.assertEqual(0, m_media_switch.call_count) +      def test_does_not_poll_imds_report_ready_when_marker_file_exists(              self, m_report_ready, m_request, m_media_switch, m_dhcp, m_net):          """poll_imds should not call report ready when the reported ready @@ -2390,7 +2798,7 @@ class TestPreprovisioningPollIMDS(CiTestCase):  @mock.patch(MOCKPATH + 'util.is_FreeBSD')  @mock.patch('cloudinit.sources.helpers.netlink.'              'wait_for_media_disconnect_connect') -@mock.patch('cloudinit.net.dhcp.EphemeralIPv4Network') +@mock.patch('cloudinit.net.dhcp.EphemeralIPv4Network', autospec=True)  @mock.patch('cloudinit.net.dhcp.maybe_perform_dhcp_discovery')  @mock.patch('requests.Session.request')  class TestAzureDataSourcePreprovisioning(CiTestCase): @@ -2412,7 +2820,7 @@ class TestAzureDataSourcePreprovisioning(CiTestCase):          m_dhcp.return_value = [{              'interface': 'eth9', 'fixed-address': '192.168.2.9',              'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0'}] -        url = 'http://{0}/metadata/reprovisiondata?api-version=2017-04-02' +        url = 'http://{0}/metadata/reprovisiondata?api-version=2019-06-01'          host = "169.254.169.254"          full_url = url.format(host)          m_request.return_value = mock.MagicMock(status_code=200, text="ovf", @@ -2445,7 +2853,7 @@ class TestAzureDataSourcePreprovisioning(CiTestCase):              'interface': 'eth9', 'fixed-address': '192.168.2.9',              'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0',              'unknown-245': '624c3620'}] -        url = 'http://{0}/metadata/reprovisiondata?api-version=2017-04-02' +        url = 'http://{0}/metadata/reprovisiondata?api-version=2019-06-01'          host = "169.254.169.254"          full_url = url.format(host)          hostname = "myhost" diff --git a/tools/.github-cla-signers b/tools/.github-cla-signers index 9b594a44..1e0c3ea4 100644 --- a/tools/.github-cla-signers +++ b/tools/.github-cla-signers @@ -1,6 +1,7 @@  ader1990  AlexBaranowski  Aman306 +aswinrajamannar  beezly  bipinbachhao  BirknerAlex | 
