diff options
Diffstat (limited to 'cloudinit/sources')
27 files changed, 1308 insertions, 306 deletions
diff --git a/cloudinit/sources/DataSourceAliYun.py b/cloudinit/sources/DataSourceAliYun.py index 45cc9f00..09052873 100644 --- a/cloudinit/sources/DataSourceAliYun.py +++ b/cloudinit/sources/DataSourceAliYun.py @@ -1,8 +1,8 @@ # This file is part of cloud-init. See LICENSE file for license information. +from cloudinit import dmi from cloudinit import sources from cloudinit.sources import DataSourceEc2 as EC2 -from cloudinit import util ALIYUN_PRODUCT = "Alibaba Cloud ECS" @@ -30,7 +30,7 @@ class DataSourceAliYun(EC2.DataSourceEc2): def _is_aliyun(): - return util.read_dmi_data('system-product-name') == ALIYUN_PRODUCT + return dmi.read_dmi_data('system-product-name') == ALIYUN_PRODUCT def parse_public_keys(public_keys): diff --git a/cloudinit/sources/DataSourceAltCloud.py b/cloudinit/sources/DataSourceAltCloud.py index ac3ecc3d..cd93412a 100644 --- a/cloudinit/sources/DataSourceAltCloud.py +++ b/cloudinit/sources/DataSourceAltCloud.py @@ -16,6 +16,7 @@ import errno import os import os.path +from cloudinit import dmi from cloudinit import log as logging from cloudinit import sources from cloudinit import subp @@ -109,7 +110,7 @@ class DataSourceAltCloud(sources.DataSource): CLOUD_INFO_FILE) return 'UNKNOWN' return cloud_type - system_name = util.read_dmi_data("system-product-name") + system_name = dmi.read_dmi_data("system-product-name") if not system_name: return 'UNKNOWN' diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index f3c6452b..04ff2131 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -12,9 +12,12 @@ import os import os.path import re from time import time +from time import sleep from xml.dom import minidom import xml.etree.ElementTree as ET +from enum import Enum +from cloudinit import dmi from cloudinit import log as logging from cloudinit import net from cloudinit.event import EventType @@ -28,6 +31,7 @@ from cloudinit import util from cloudinit.reporting import events from cloudinit.sources.helpers.azure import ( + DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE, azure_ds_reporter, azure_ds_telemetry_reporter, get_metadata_from_fabric, @@ -37,7 +41,8 @@ from cloudinit.sources.helpers.azure import ( EphemeralDHCPv4WithReporting, is_byte_swapped, dhcp_log_cb, - push_log_to_kvp) + push_log_to_kvp, + report_failure_to_fabric) LOG = logging.getLogger(__name__) @@ -64,13 +69,27 @@ DEFAULT_FS = 'ext4' # DMI chassis-asset-tag is set static for all azure instances AZURE_CHASSIS_ASSET_TAG = '7783-7084-3265-9085-8269-3286-77' REPROVISION_MARKER_FILE = "/var/lib/cloud/data/poll_imds" +REPROVISION_NIC_ATTACH_MARKER_FILE = "/var/lib/cloud/data/wait_for_nic_attach" +REPROVISION_NIC_DETACHED_MARKER_FILE = "/var/lib/cloud/data/nic_detached" REPORTED_READY_MARKER_FILE = "/var/lib/cloud/data/reported_ready" AGENT_SEED_DIR = '/var/lib/waagent' + # In the event where the IMDS primary server is not # available, it takes 1s to fallback to the secondary one IMDS_TIMEOUT_IN_SECONDS = 2 IMDS_URL = "http://169.254.169.254/metadata/" +IMDS_VER = "2019-06-01" +IMDS_VER_PARAM = "api-version={}".format(IMDS_VER) + + +class metadata_type(Enum): + compute = "{}instance?{}".format(IMDS_URL, IMDS_VER_PARAM) + network = "{}instance/network?{}".format(IMDS_URL, + IMDS_VER_PARAM) + reprovisiondata = "{}reprovisiondata?{}".format(IMDS_URL, + IMDS_VER_PARAM) + PLATFORM_ENTROPY_SOURCE = "/sys/firmware/acpi/tables/OEM0" @@ -83,6 +102,25 @@ UBUNTU_EXTENDED_NETWORK_SCRIPTS = [ '/run/network/interfaces.ephemeral.d', ] +# This list is used to blacklist devices that will be considered +# for renaming or fallback interfaces. +# +# On Azure network devices using these drivers are automatically +# configured by the platform and should not be configured by +# cloud-init's network configuration. +# +# Note: +# Azure Dv4 and Ev4 series VMs always have mlx5 hardware. +# https://docs.microsoft.com/en-us/azure/virtual-machines/dv4-dsv4-series +# https://docs.microsoft.com/en-us/azure/virtual-machines/ev4-esv4-series +# Earlier D and E series VMs (such as Dv2, Dv3, and Ev3 series VMs) +# can have either mlx4 or mlx5 hardware, with the older series VMs +# having a higher chance of coming with mlx4 hardware. +# https://docs.microsoft.com/en-us/azure/virtual-machines/dv2-dsv2-series +# https://docs.microsoft.com/en-us/azure/virtual-machines/dv3-dsv3-series +# https://docs.microsoft.com/en-us/azure/virtual-machines/ev3-esv3-series +BLACKLIST_DRIVERS = ['mlx4_core', 'mlx5_core'] + def find_storvscid_from_sysctl_pnpinfo(sysctl_out, deviceid): # extract the 'X' from dev.storvsc.X. if deviceid matches @@ -280,9 +318,9 @@ def temporary_hostname(temp_hostname, cfg, hostname_command='hostname'): try: set_hostname(temp_hostname, hostname_command) except Exception as e: - msg = 'Failed setting temporary hostname: %s' % e - report_diagnostic_event(msg) - LOG.warning(msg) + report_diagnostic_event( + 'Failed setting temporary hostname: %s' % e, + logger_func=LOG.warning) yield None return try: @@ -337,7 +375,9 @@ class DataSourceAzure(sources.DataSource): cfg=cfg, prev_hostname=previous_hn) except Exception as e: - LOG.warning("Failed publishing hostname: %s", e) + report_diagnostic_event( + "Failed publishing hostname: %s" % e, + logger_func=LOG.warning) util.logexc(LOG, "handling set_hostname failed") return False @@ -410,20 +450,39 @@ class DataSourceAzure(sources.DataSource): # need to look in the datadir and consider that valid ddir = self.ds_cfg['data_dir'] + # The order in which the candidates are inserted matters here, because + # it determines the value of ret. More specifically, the first one in + # the candidate list determines the path to take in order to get the + # metadata we need. candidates = [self.seed_dir] if os.path.isfile(REPROVISION_MARKER_FILE): candidates.insert(0, "IMDS") + report_diagnostic_event("Reprovision marker file already present " + "before crawling Azure metadata: %s" % + REPROVISION_MARKER_FILE, + logger_func=LOG.debug) + elif os.path.isfile(REPROVISION_NIC_ATTACH_MARKER_FILE): + candidates.insert(0, "NIC_ATTACH_MARKER_PRESENT") + report_diagnostic_event("Reprovision nic attach marker file " + "already present before crawling Azure " + "metadata: %s" % + REPROVISION_NIC_ATTACH_MARKER_FILE, + logger_func=LOG.debug) candidates.extend(list_possible_azure_ds_devs()) if ddir: candidates.append(ddir) found = None reprovision = False + reprovision_after_nic_attach = False for cdev in candidates: try: if cdev == "IMDS": ret = None reprovision = True + elif cdev == "NIC_ATTACH_MARKER_PRESENT": + ret = None + reprovision_after_nic_attach = True elif cdev.startswith("/dev/"): if util.is_FreeBSD(): ret = util.mount_cb(cdev, load_azure_ds_dir, @@ -435,26 +494,32 @@ class DataSourceAzure(sources.DataSource): except NonAzureDataSource: report_diagnostic_event( - "Did not find Azure data source in %s" % cdev) + "Did not find Azure data source in %s" % cdev, + logger_func=LOG.debug) continue except BrokenAzureDataSource as exc: msg = 'BrokenAzureDataSource: %s' % exc - report_diagnostic_event(msg) + report_diagnostic_event(msg, logger_func=LOG.error) raise sources.InvalidMetaDataException(msg) except util.MountFailedError: - msg = '%s was not mountable' % cdev - report_diagnostic_event(msg) - LOG.warning(msg) + report_diagnostic_event( + '%s was not mountable' % cdev, logger_func=LOG.warning) continue perform_reprovision = reprovision or self._should_reprovision(ret) - if perform_reprovision: + perform_reprovision_after_nic_attach = ( + reprovision_after_nic_attach or + self._should_reprovision_after_nic_attach(ret)) + + if perform_reprovision or perform_reprovision_after_nic_attach: if util.is_FreeBSD(): msg = "Free BSD is not supported for PPS VMs" - LOG.error(msg) - report_diagnostic_event(msg) + report_diagnostic_event(msg, logger_func=LOG.error) raise sources.InvalidMetaDataException(msg) + if perform_reprovision_after_nic_attach: + self._wait_for_all_nics_ready() ret = self._reprovision() + imds_md = get_metadata_from_imds( self.fallback_interface, retries=10) (md, userdata_raw, cfg, files) = ret @@ -467,26 +532,29 @@ class DataSourceAzure(sources.DataSource): 'userdata_raw': userdata_raw}) found = cdev - LOG.debug("found datasource in %s", cdev) + report_diagnostic_event( + 'found datasource in %s' % cdev, logger_func=LOG.debug) break if not found: msg = 'No Azure metadata found' - report_diagnostic_event(msg) + report_diagnostic_event(msg, logger_func=LOG.error) raise sources.InvalidMetaDataException(msg) if found == ddir: - LOG.debug("using files cached in %s", ddir) + report_diagnostic_event( + "using files cached in %s" % ddir, logger_func=LOG.debug) seed = _get_random_seed() if seed: crawled_data['metadata']['random_seed'] = seed crawled_data['metadata']['instance-id'] = self._iid() - if perform_reprovision: + if perform_reprovision or perform_reprovision_after_nic_attach: LOG.info("Reporting ready to Azure after getting ReprovisionData") - use_cached_ephemeral = (net.is_up(self.fallback_interface) and - getattr(self, '_ephemeral_dhcp_ctx', None)) + use_cached_ephemeral = ( + self.distro.networking.is_up(self.fallback_interface) and + getattr(self, '_ephemeral_dhcp_ctx', None)) if use_cached_ephemeral: self._report_ready(lease=self._ephemeral_dhcp_ctx.lease) self._ephemeral_dhcp_ctx.clean_network() # Teardown ephemeral @@ -497,7 +565,8 @@ class DataSourceAzure(sources.DataSource): self._report_ready(lease=lease) except Exception as e: report_diagnostic_event( - "exception while reporting ready: %s" % e) + "exception while reporting ready: %s" % e, + logger_func=LOG.error) raise return crawled_data @@ -529,14 +598,21 @@ class DataSourceAzure(sources.DataSource): except Exception as e: LOG.warning("Failed to get system information: %s", e) + self.distro.networking.blacklist_drivers = BLACKLIST_DRIVERS + try: crawled_data = util.log_time( logfunc=LOG.debug, msg='Crawl of metadata service', func=self.crawl_metadata ) - except sources.InvalidMetaDataException as e: - LOG.warning('Could not crawl Azure metadata: %s', e) + except Exception as e: + report_diagnostic_event( + 'Could not crawl Azure metadata: %s' % e, + logger_func=LOG.error) + self._report_failure( + description=DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE) return False + if (self.distro and self.distro.name == 'ubuntu' and self.ds_cfg.get('apply_network_config')): maybe_remove_ubuntu_network_config_scripts() @@ -561,6 +637,38 @@ class DataSourceAzure(sources.DataSource): def device_name_to_device(self, name): return self.ds_cfg['disk_aliases'].get(name) + @azure_ds_telemetry_reporter + def get_public_ssh_keys(self): + """ + Try to get the ssh keys from IMDS first, and if that fails + (i.e. IMDS is unavailable) then fallback to getting the ssh + keys from OVF. + + The benefit to getting keys from IMDS is a large performance + advantage, so this is a strong preference. But we must keep + OVF as a second option for environments that don't have IMDS. + """ + LOG.debug('Retrieving public SSH keys') + ssh_keys = [] + try: + ssh_keys = [ + public_key['keyData'] + for public_key + in self.metadata['imds']['compute']['publicKeys'] + ] + LOG.debug('Retrieved SSH keys from IMDS') + except KeyError: + log_msg = 'Unable to get keys from IMDS, falling back to OVF' + report_diagnostic_event(log_msg, logger_func=LOG.debug) + try: + ssh_keys = self.metadata['public-keys'] + LOG.debug('Retrieved keys from OVF') + except KeyError: + log_msg = 'No keys available from OVF' + report_diagnostic_event(log_msg, logger_func=LOG.debug) + + return ssh_keys + def get_config_obj(self): return self.cfg @@ -571,7 +679,7 @@ class DataSourceAzure(sources.DataSource): def _iid(self, previous=None): prev_iid_path = os.path.join( self.paths.get_cpath('data'), 'instance-id') - iid = util.read_dmi_data('system-uuid') + iid = dmi.read_dmi_data('system-uuid') if os.path.exists(prev_iid_path): previous = util.load_file(prev_iid_path).strip() if is_byte_swapped(previous, iid): @@ -592,10 +700,293 @@ class DataSourceAzure(sources.DataSource): LOG.debug("negotiating already done for %s", self.get_instance_id()) + @azure_ds_telemetry_reporter + def _wait_for_nic_detach(self, nl_sock): + """Use the netlink socket provided to wait for nic detach event. + NOTE: The function doesn't close the socket. The caller owns closing + the socket and disposing it safely. + """ + try: + ifname = None + + # Preprovisioned VM will only have one NIC, and it gets + # detached immediately after deployment. + with events.ReportEventStack( + name="wait-for-nic-detach", + description=("wait for nic detach"), + parent=azure_ds_reporter): + ifname = netlink.wait_for_nic_detach_event(nl_sock) + if ifname is None: + msg = ("Preprovisioned nic not detached as expected. " + "Proceeding without failing.") + report_diagnostic_event(msg, logger_func=LOG.warning) + else: + report_diagnostic_event("The preprovisioned nic %s is detached" + % ifname, logger_func=LOG.warning) + path = REPROVISION_NIC_DETACHED_MARKER_FILE + LOG.info("Creating a marker file for nic detached: %s", path) + util.write_file(path, "{pid}: {time}\n".format( + pid=os.getpid(), time=time())) + except AssertionError as error: + report_diagnostic_event(error, logger_func=LOG.error) + raise + + @azure_ds_telemetry_reporter + def wait_for_link_up(self, ifname): + """In cases where the link state is still showing down after a nic is + hot-attached, we can attempt to bring it up by forcing the hv_netvsc + drivers to query the link state by unbinding and then binding the + device. This function attempts infinitely until the link is up, + because we cannot proceed further until we have a stable link.""" + + if self.distro.networking.try_set_link_up(ifname): + report_diagnostic_event("The link %s is already up." % ifname, + logger_func=LOG.info) + return + + LOG.info("Attempting to bring %s up", ifname) + + attempts = 0 + while True: + + LOG.info("Unbinding and binding the interface %s", ifname) + devicename = net.read_sys_net(ifname, + 'device/device_id').strip('{}') + util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/unbind', + devicename) + util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/bind', + devicename) + + attempts = attempts + 1 + if self.distro.networking.try_set_link_up(ifname): + msg = "The link %s is up after %s attempts" % (ifname, + attempts) + report_diagnostic_event(msg, logger_func=LOG.info) + return + + sleep_duration = 1 + msg = ("Link is not up after %d attempts with %d seconds sleep " + "between attempts." % (attempts, sleep_duration)) + + if attempts % 10 == 0: + report_diagnostic_event(msg, logger_func=LOG.info) + else: + LOG.info(msg) + + sleep(sleep_duration) + + @azure_ds_telemetry_reporter + def _create_report_ready_marker(self): + path = REPORTED_READY_MARKER_FILE + LOG.info( + "Creating a marker file to report ready: %s", path) + util.write_file(path, "{pid}: {time}\n".format( + pid=os.getpid(), time=time())) + report_diagnostic_event( + 'Successfully created reported ready marker file ' + 'while in the preprovisioning pool.', + logger_func=LOG.debug) + + @azure_ds_telemetry_reporter + def _report_ready_if_needed(self): + """Report ready to the platform if the marker file is not present, + and create the marker file. + """ + have_not_reported_ready = ( + not os.path.isfile(REPORTED_READY_MARKER_FILE)) + + if have_not_reported_ready: + report_diagnostic_event("Reporting ready before nic detach", + logger_func=LOG.info) + try: + with EphemeralDHCPv4WithReporting(azure_ds_reporter) as lease: + self._report_ready(lease=lease) + except Exception as e: + report_diagnostic_event("Exception reporting ready during " + "preprovisioning before nic detach: %s" + % e, logger_func=LOG.error) + raise + self._create_report_ready_marker() + else: + report_diagnostic_event("Already reported ready before nic detach." + " The marker file already exists: %s" % + REPORTED_READY_MARKER_FILE, + logger_func=LOG.error) + + @azure_ds_telemetry_reporter + def _check_if_nic_is_primary(self, ifname): + """Check if a given interface is the primary nic or not. If it is the + primary nic, then we also get the expected total nic count from IMDS. + IMDS will process the request and send a response only for primary NIC. + """ + is_primary = False + expected_nic_count = -1 + imds_md = None + + # For now, only a VM's primary NIC can contact IMDS and WireServer. If + # DHCP fails for a NIC, we have no mechanism to determine if the NIC is + # primary or secondary. In this case, the desired behavior is to fail + # VM provisioning if there is any DHCP failure when trying to determine + # the primary NIC. + try: + with events.ReportEventStack( + name="obtain-dhcp-lease", + description=("obtain dhcp lease for %s when attempting to " + "determine primary NIC during reprovision of " + "a pre-provisioned VM" % ifname), + parent=azure_ds_reporter): + dhcp_ctx = EphemeralDHCPv4( + iface=ifname, + dhcp_log_func=dhcp_log_cb) + dhcp_ctx.obtain_lease() + except Exception as e: + report_diagnostic_event("Giving up. Failed to obtain dhcp lease " + "for %s when attempting to determine " + "primary NIC during reprovision due to %s" + % (ifname, e), logger_func=LOG.error) + raise + + # Primary nic detection will be optimized in the future. The fact that + # primary nic is being attached first helps here. Otherwise each nic + # could add several seconds of delay. + try: + imds_md = get_metadata_from_imds( + ifname, + 5, + metadata_type.network) + except Exception as e: + LOG.warning( + "Failed to get network metadata using nic %s. Attempt to " + "contact IMDS failed with error %s. Assuming this is not the " + "primary nic.", ifname, e) + finally: + # If we are not the primary nic, then clean the dhcp context. + if imds_md is None: + dhcp_ctx.clean_network() + + if imds_md is not None: + # Only primary NIC will get a response from IMDS. + LOG.info("%s is the primary nic", ifname) + is_primary = True + + # If primary, set ephemeral dhcp ctx so we can report ready + self._ephemeral_dhcp_ctx = dhcp_ctx + + # Set the expected nic count based on the response received. + expected_nic_count = len( + imds_md['interface']) + report_diagnostic_event("Expected nic count: %d" % + expected_nic_count, logger_func=LOG.info) + + return is_primary, expected_nic_count + + @azure_ds_telemetry_reporter + def _wait_for_hot_attached_nics(self, nl_sock): + """Wait until all the expected nics for the vm are hot-attached. + The expected nic count is obtained by requesting the network metadata + from IMDS. + """ + LOG.info("Waiting for nics to be hot-attached") + try: + # Wait for nics to be attached one at a time, until we know for + # sure that all nics have been attached. + nics_found = [] + primary_nic_found = False + expected_nic_count = -1 + + # Wait for netlink nic attach events. After the first nic is + # attached, we are already in the customer vm deployment path and + # so eerything from then on should happen fast and avoid + # unnecessary delays wherever possible. + while True: + ifname = None + with events.ReportEventStack( + name="wait-for-nic-attach", + description=("wait for nic attach after %d nics have " + "been attached" % len(nics_found)), + parent=azure_ds_reporter): + ifname = netlink.wait_for_nic_attach_event(nl_sock, + nics_found) + + # wait_for_nic_attach_event guarantees that ifname it not None + nics_found.append(ifname) + report_diagnostic_event("Detected nic %s attached." % ifname, + logger_func=LOG.info) + + # Attempt to bring the interface's operating state to + # UP in case it is not already. + self.wait_for_link_up(ifname) + + # If primary nic is not found, check if this is it. The + # platform will attach the primary nic first so we + # won't be in primary_nic_found = false state for long. + if not primary_nic_found: + LOG.info("Checking if %s is the primary nic", + ifname) + (primary_nic_found, expected_nic_count) = ( + self._check_if_nic_is_primary(ifname)) + + # Exit criteria: check if we've discovered all nics + if (expected_nic_count != -1 + and len(nics_found) >= expected_nic_count): + LOG.info("Found all the nics for this VM.") + break + + except AssertionError as error: + report_diagnostic_event(error, logger_func=LOG.error) + + @azure_ds_telemetry_reporter + def _wait_for_all_nics_ready(self): + """Wait for nic(s) to be hot-attached. There may be multiple nics + depending on the customer request. + But only primary nic would be able to communicate with wireserver + and IMDS. So we detect and save the primary nic to be used later. + """ + + nl_sock = None + try: + nl_sock = netlink.create_bound_netlink_socket() + + report_ready_marker_present = bool( + os.path.isfile(REPORTED_READY_MARKER_FILE)) + + # Report ready if the marker file is not already present. + # The nic of the preprovisioned vm gets hot-detached as soon as + # we report ready. So no need to save the dhcp context. + self._report_ready_if_needed() + + has_nic_been_detached = bool( + os.path.isfile(REPROVISION_NIC_DETACHED_MARKER_FILE)) + + if not has_nic_been_detached: + LOG.info("NIC has not been detached yet.") + self._wait_for_nic_detach(nl_sock) + + # If we know that the preprovisioned nic has been detached, and we + # still have a fallback nic, then it means the VM must have + # rebooted as part of customer assignment, and all the nics have + # already been attached by the Azure platform. So there is no need + # to wait for nics to be hot-attached. + if not self.fallback_interface: + self._wait_for_hot_attached_nics(nl_sock) + else: + report_diagnostic_event("Skipping waiting for nic attach " + "because we already have a fallback " + "interface. Report Ready marker " + "present before detaching nics: %s" % + report_ready_marker_present, + logger_func=LOG.info) + except netlink.NetlinkCreateSocketError as e: + report_diagnostic_event(e, logger_func=LOG.warning) + raise + finally: + if nl_sock: + nl_sock.close() + def _poll_imds(self): """Poll IMDS for the new provisioning data until we get a valid response. Then return the returned JSON object.""" - url = IMDS_URL + "reprovisiondata?api-version=2017-04-02" + url = metadata_type.reprovisiondata.value headers = {"Metadata": "true"} nl_sock = None report_ready = bool(not os.path.isfile(REPORTED_READY_MARKER_FILE)) @@ -611,16 +1002,14 @@ class DataSourceAzure(sources.DataSource): if self.imds_poll_counter == self.imds_logging_threshold: # Reducing the logging frequency as we are polling IMDS self.imds_logging_threshold *= 2 - LOG.debug("Call to IMDS with arguments %s failed " - "with status code %s after %s retries", - msg, exception.code, self.imds_poll_counter) LOG.debug("Backing off logging threshold for the same " "exception to %d", self.imds_logging_threshold) report_diagnostic_event("poll IMDS with %s failed. " "Exception: %s and code: %s" % (msg, exception.cause, - exception.code)) + exception.code), + logger_func=LOG.debug) self.imds_poll_counter += 1 return True else: @@ -629,24 +1018,41 @@ class DataSourceAzure(sources.DataSource): report_diagnostic_event("poll IMDS with %s failed. " "Exception: %s and code: %s" % (msg, exception.cause, - exception.code)) + exception.code), + logger_func=LOG.warning) return False - LOG.debug("poll IMDS failed with an unexpected exception: %s", - exception) - return False + report_diagnostic_event( + "poll IMDS failed with an " + "unexpected exception: %s" % exception, + logger_func=LOG.warning) + return False + + # When the interface is hot-attached, we would have already + # done dhcp and set the dhcp context. In that case, skip + # the attempt to do dhcp. + is_ephemeral_ctx_present = self._ephemeral_dhcp_ctx is not None + msg = ("Unexpected error. Dhcp context is not expected to be already " + "set when we need to wait for vnet switch") + if is_ephemeral_ctx_present and report_ready: + report_diagnostic_event(msg, logger_func=LOG.error) + raise RuntimeError(msg) - LOG.debug("Wait for vnetswitch to happen") while True: try: - # Save our EphemeralDHCPv4 context to avoid repeated dhcp - with events.ReportEventStack( - name="obtain-dhcp-lease", - description="obtain dhcp lease", - parent=azure_ds_reporter): - self._ephemeral_dhcp_ctx = EphemeralDHCPv4( - dhcp_log_func=dhcp_log_cb) - lease = self._ephemeral_dhcp_ctx.obtain_lease() + # Since is_ephemeral_ctx_present is set only once, this ensures + # that with regular reprovisioning, dhcp is always done every + # time the loop runs. + if not is_ephemeral_ctx_present: + # Save our EphemeralDHCPv4 context to avoid repeated dhcp + # later when we report ready + with events.ReportEventStack( + name="obtain-dhcp-lease", + description="obtain dhcp lease", + parent=azure_ds_reporter): + self._ephemeral_dhcp_ctx = EphemeralDHCPv4( + dhcp_log_func=dhcp_log_cb) + lease = self._ephemeral_dhcp_ctx.obtain_lease() if vnet_switched: dhcp_attempts += 1 @@ -654,19 +1060,24 @@ class DataSourceAzure(sources.DataSource): try: nl_sock = netlink.create_bound_netlink_socket() except netlink.NetlinkCreateSocketError as e: - report_diagnostic_event(e) - LOG.warning(e) + report_diagnostic_event( + 'Failed to create bound netlink socket: %s' % e, + logger_func=LOG.warning) self._ephemeral_dhcp_ctx.clean_network() break - path = REPORTED_READY_MARKER_FILE - LOG.info( - "Creating a marker file to report ready: %s", path) - util.write_file(path, "{pid}: {time}\n".format( - pid=os.getpid(), time=time())) - self._report_ready(lease=lease) + report_ready_succeeded = self._report_ready(lease=lease) + if not report_ready_succeeded: + msg = ('Failed reporting ready while in ' + 'the preprovisioning pool.') + report_diagnostic_event(msg, logger_func=LOG.error) + self._ephemeral_dhcp_ctx.clean_network() + raise sources.InvalidMetaDataException(msg) + + self._create_report_ready_marker() report_ready = False + LOG.debug("Wait for vnetswitch to happen") with events.ReportEventStack( name="wait-for-media-disconnect-connect", description="wait for vnet switch", @@ -674,9 +1085,10 @@ class DataSourceAzure(sources.DataSource): try: netlink.wait_for_media_disconnect_connect( nl_sock, lease['interface']) - except AssertionError as error: - report_diagnostic_event(error) - LOG.error(error) + except AssertionError as e: + report_diagnostic_event( + 'Error while waiting for vnet switch: %s' % e, + logger_func=LOG.error) break vnet_switched = True @@ -702,21 +1114,113 @@ class DataSourceAzure(sources.DataSource): if vnet_switched: report_diagnostic_event("attempted dhcp %d times after reuse" % - dhcp_attempts) + dhcp_attempts, + logger_func=LOG.debug) report_diagnostic_event("polled imds %d times after reuse" % - self.imds_poll_counter) + self.imds_poll_counter, + logger_func=LOG.debug) return return_val @azure_ds_telemetry_reporter - def _report_ready(self, lease): - """Tells the fabric provisioning has completed """ + def _report_failure(self, description=None) -> bool: + """Tells the Azure fabric that provisioning has failed. + + @param description: A description of the error encountered. + @return: The success status of sending the failure signal. + """ + unknown_245_key = 'unknown-245' + + try: + if (self.distro.networking.is_up(self.fallback_interface) and + getattr(self, '_ephemeral_dhcp_ctx', None) and + getattr(self._ephemeral_dhcp_ctx, 'lease', None) and + unknown_245_key in self._ephemeral_dhcp_ctx.lease): + report_diagnostic_event( + 'Using cached ephemeral dhcp context ' + 'to report failure to Azure', logger_func=LOG.debug) + report_failure_to_fabric( + dhcp_opts=self._ephemeral_dhcp_ctx.lease[unknown_245_key], + description=description) + self._ephemeral_dhcp_ctx.clean_network() # Teardown ephemeral + return True + except Exception as e: + report_diagnostic_event( + 'Failed to report failure using ' + 'cached ephemeral dhcp context: %s' % e, + logger_func=LOG.error) + + try: + report_diagnostic_event( + 'Using new ephemeral dhcp to report failure to Azure', + logger_func=LOG.debug) + with EphemeralDHCPv4WithReporting(azure_ds_reporter) as lease: + report_failure_to_fabric( + dhcp_opts=lease[unknown_245_key], + description=description) + return True + except Exception as e: + report_diagnostic_event( + 'Failed to report failure using new ephemeral dhcp: %s' % e, + logger_func=LOG.debug) + + try: + report_diagnostic_event( + 'Using fallback lease to report failure to Azure') + report_failure_to_fabric( + fallback_lease_file=self.dhclient_lease_file, + description=description) + return True + except Exception as e: + report_diagnostic_event( + 'Failed to report failure using fallback lease: %s' % e, + logger_func=LOG.debug) + + return False + + def _report_ready(self, lease: dict) -> bool: + """Tells the fabric provisioning has completed. + + @param lease: dhcp lease to use for sending the ready signal. + @return: The success status of sending the ready signal. + """ try: get_metadata_from_fabric(None, lease['unknown-245']) - except Exception: - LOG.warning( - "Error communicating with Azure fabric; You may experience." - "connectivity issues.", exc_info=True) + return True + except Exception as e: + report_diagnostic_event( + "Error communicating with Azure fabric; You may experience " + "connectivity issues: %s" % e, logger_func=LOG.warning) + return False + + def _should_reprovision_after_nic_attach(self, candidate_metadata) -> bool: + """Whether or not we should wait for nic attach and then poll + IMDS for reprovisioning data. Also sets a marker file to poll IMDS. + + The marker file is used for the following scenario: the VM boots into + wait for nic attach, which we expect to be proceeding infinitely until + the nic is attached. If for whatever reason the platform moves us to a + new host (for instance a hardware issue), we need to keep waiting. + However, since the VM reports ready to the Fabric, we will not attach + the ISO, thus cloud-init needs to have a way of knowing that it should + jump back into the waiting mode in order to retrieve the ovf_env. + + @param candidate_metadata: Metadata obtained from reading ovf-env. + @return: Whether to reprovision after waiting for nics to be attached. + """ + if not candidate_metadata: + return False + (_md, _userdata_raw, cfg, _files) = candidate_metadata + path = REPROVISION_NIC_ATTACH_MARKER_FILE + if (cfg.get('PreprovisionedVMType', None) == "Savable" or + os.path.isfile(path)): + if not os.path.isfile(path): + LOG.info("Creating a marker file to wait for nic attach: %s", + path) + util.write_file(path, "{pid}: {time}\n".format( + pid=os.getpid(), time=time())) + return True + return False def _should_reprovision(self, ret): """Whether or not we should poll IMDS for reprovisioning data. @@ -734,6 +1238,7 @@ class DataSourceAzure(sources.DataSource): (_md, _userdata_raw, cfg, _files) = ret path = REPROVISION_MARKER_FILE if (cfg.get('PreprovisionedVm') is True or + cfg.get('PreprovisionedVMType', None) == 'Running' or os.path.isfile(path)): if not os.path.isfile(path): LOG.info("Creating a marker file to poll imds: %s", @@ -764,7 +1269,22 @@ class DataSourceAzure(sources.DataSource): if self.ds_cfg['agent_command'] == AGENT_START_BUILTIN: self.bounce_network_with_azure_hostname() - pubkey_info = self.cfg.get('_pubkeys', None) + pubkey_info = None + try: + public_keys = self.metadata['imds']['compute']['publicKeys'] + LOG.debug( + 'Successfully retrieved %s key(s) from IMDS', + len(public_keys) + if public_keys is not None + else 0 + ) + except KeyError: + LOG.debug( + 'Unable to retrieve SSH keys from IMDS during ' + 'negotiation, falling back to OVF' + ) + pubkey_info = self.cfg.get('_pubkeys', None) + metadata_func = partial(get_metadata_from_fabric, fallback_lease_file=self. dhclient_lease_file, @@ -779,14 +1299,13 @@ class DataSourceAzure(sources.DataSource): except Exception as e: report_diagnostic_event( "Error communicating with Azure fabric; You may experience " - "connectivity issues: %s" % e) - LOG.warning( - "Error communicating with Azure fabric; You may experience " - "connectivity issues.", exc_info=True) + "connectivity issues: %s" % e, logger_func=LOG.warning) return False util.del_file(REPORTED_READY_MARKER_FILE) util.del_file(REPROVISION_MARKER_FILE) + util.del_file(REPROVISION_NIC_ATTACH_MARKER_FILE) + util.del_file(REPROVISION_NIC_DETACHED_MARKER_FILE) return fabric_data @azure_ds_telemetry_reporter @@ -947,9 +1466,10 @@ def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120, log_pre="Azure ephemeral disk: ") if missing: - LOG.warning("ephemeral device '%s' did" - " not appear after %d seconds.", - devpath, maxwait) + report_diagnostic_event( + "ephemeral device '%s' did not appear after %d seconds." % + (devpath, maxwait), + logger_func=LOG.warning) return result = False @@ -1034,7 +1554,9 @@ def pubkeys_from_crt_files(flist): errors.append(fname) if errors: - LOG.warning("failed to convert the crt files to pubkey: %s", errors) + report_diagnostic_event( + "failed to convert the crt files to pubkey: %s" % errors, + logger_func=LOG.warning) return pubkeys @@ -1146,7 +1668,7 @@ def read_azure_ovf(contents): dom = minidom.parseString(contents) except Exception as e: error_str = "Invalid ovf-env.xml: %s" % e - report_diagnostic_event(error_str) + report_diagnostic_event(error_str, logger_func=LOG.warning) raise BrokenAzureDataSource(error_str) from e results = find_child(dom.documentElement, @@ -1231,7 +1753,7 @@ def read_azure_ovf(contents): if password: defuser['lock_passwd'] = False if DEF_PASSWD_REDACTION != password: - defuser['passwd'] = encrypt_pass(password) + defuser['passwd'] = cfg['password'] = encrypt_pass(password) if defuser: cfg['system_info'] = {'default_user': defuser} @@ -1239,34 +1761,109 @@ def read_azure_ovf(contents): if 'ssh_pwauth' not in cfg and password: cfg['ssh_pwauth'] = True - cfg['PreprovisionedVm'] = _extract_preprovisioned_vm_setting(dom) + preprovisioning_cfg = _get_preprovisioning_cfgs(dom) + cfg = util.mergemanydict([cfg, preprovisioning_cfg]) return (md, ud, cfg) @azure_ds_telemetry_reporter -def _extract_preprovisioned_vm_setting(dom): - """Read the preprovision flag from the ovf. It should not - exist unless true.""" +def _get_preprovisioning_cfgs(dom): + """Read the preprovisioning related flags from ovf and populates a dict + with the info. + + Two flags are in use today: PreprovisionedVm bool and + PreprovisionedVMType enum. In the long term, the PreprovisionedVm bool + will be deprecated in favor of PreprovisionedVMType string/enum. + + Only these combinations of values are possible today: + - PreprovisionedVm=True and PreprovisionedVMType=Running + - PreprovisionedVm=False and PreprovisionedVMType=Savable + - PreprovisionedVm is missing and PreprovisionedVMType=Running/Savable + - PreprovisionedVm=False and PreprovisionedVMType is missing + + More specifically, this will never happen: + - PreprovisionedVm=True and PreprovisionedVMType=Savable + """ + cfg = { + "PreprovisionedVm": False, + "PreprovisionedVMType": None + } + platform_settings_section = find_child( dom.documentElement, lambda n: n.localName == "PlatformSettingsSection") if not platform_settings_section or len(platform_settings_section) == 0: LOG.debug("PlatformSettingsSection not found") - return False + return cfg platform_settings = find_child( platform_settings_section[0], lambda n: n.localName == "PlatformSettings") if not platform_settings or len(platform_settings) == 0: LOG.debug("PlatformSettings not found") - return False - preprovisionedVm = find_child( + return cfg + + # Read the PreprovisionedVm bool flag. This should be deprecated when the + # platform has removed PreprovisionedVm and only surfaces + # PreprovisionedVMType. + cfg["PreprovisionedVm"] = _get_preprovisionedvm_cfg_value( + platform_settings) + + cfg["PreprovisionedVMType"] = _get_preprovisionedvmtype_cfg_value( + platform_settings) + return cfg + + +@azure_ds_telemetry_reporter +def _get_preprovisionedvm_cfg_value(platform_settings): + preprovisionedVm = False + + # Read the PreprovisionedVm bool flag. This should be deprecated when the + # platform has removed PreprovisionedVm and only surfaces + # PreprovisionedVMType. + preprovisionedVmVal = find_child( platform_settings[0], lambda n: n.localName == "PreprovisionedVm") - if not preprovisionedVm or len(preprovisionedVm) == 0: + if not preprovisionedVmVal or len(preprovisionedVmVal) == 0: LOG.debug("PreprovisionedVm not found") - return False - return util.translate_bool(preprovisionedVm[0].firstChild.nodeValue) + return preprovisionedVm + preprovisionedVm = util.translate_bool( + preprovisionedVmVal[0].firstChild.nodeValue) + + report_diagnostic_event( + "PreprovisionedVm: %s" % preprovisionedVm, logger_func=LOG.info) + + return preprovisionedVm + + +@azure_ds_telemetry_reporter +def _get_preprovisionedvmtype_cfg_value(platform_settings): + preprovisionedVMType = None + + # Read the PreprovisionedVMType value from the ovf. It can be + # 'Running' or 'Savable' or not exist. This enum value is intended to + # replace PreprovisionedVm bool flag in the long term. + # A Running VM is the same as preprovisioned VMs of today. This is + # equivalent to having PreprovisionedVm=True. + # A Savable VM is one whose nic is hot-detached immediately after it + # reports ready the first time to free up the network resources. + # Once assigned to customer, the customer-requested nics are + # hot-attached to it and reprovision happens like today. + preprovisionedVMTypeVal = find_child( + platform_settings[0], + lambda n: n.localName == "PreprovisionedVMType") + if (not preprovisionedVMTypeVal or len(preprovisionedVMTypeVal) == 0 or + preprovisionedVMTypeVal[0].firstChild is None): + LOG.debug("PreprovisionedVMType not found") + return preprovisionedVMType + + preprovisionedVMType = preprovisionedVMTypeVal[0].firstChild.nodeValue + + report_diagnostic_event( + "PreprovisionedVMType: %s" % preprovisionedVMType, + logger_func=LOG.info) + + return preprovisionedVMType def encrypt_pass(password, salt_id="$6$"): @@ -1338,81 +1935,100 @@ def load_azure_ds_dir(source_dir): return (md, ud, cfg, {'ovf-env.xml': contents}) -def parse_network_config(imds_metadata): +@azure_ds_telemetry_reporter +def parse_network_config(imds_metadata) -> dict: """Convert imds_metadata dictionary to network v2 configuration. - Parses network configuration from imds metadata if present or generate fallback network config excluding mlx4_core devices. @param: imds_metadata: Dict of content read from IMDS network service. @return: Dictionary containing network version 2 standard configuration. """ - with events.ReportEventStack( - name="parse_network_config", - description="", - parent=azure_ds_reporter - ) as evt: - if imds_metadata != sources.UNSET and imds_metadata: - netconfig = {'version': 2, 'ethernets': {}} - LOG.debug('Azure: generating network configuration from IMDS') - network_metadata = imds_metadata['network'] - for idx, intf in enumerate(network_metadata['interface']): - # First IPv4 and/or IPv6 address will be obtained via DHCP. - # Any additional IPs of each type will be set as static - # addresses. - nicname = 'eth{idx}'.format(idx=idx) - dhcp_override = {'route-metric': (idx + 1) * 100} - dev_config = {'dhcp4': True, 'dhcp4-overrides': dhcp_override, - 'dhcp6': False} - for addr_type in ('ipv4', 'ipv6'): - addresses = intf.get(addr_type, {}).get('ipAddress', []) - if addr_type == 'ipv4': - default_prefix = '24' - else: - default_prefix = '128' - if addresses: - dev_config['dhcp6'] = True - # non-primary interfaces should have a higher - # route-metric (cost) so default routes prefer - # primary nic due to lower route-metric value - dev_config['dhcp6-overrides'] = dhcp_override - for addr in addresses[1:]: - # Append static address config for ip > 1 - netPrefix = intf[addr_type]['subnet'][0].get( - 'prefix', default_prefix) - privateIp = addr['privateIpAddress'] - if not dev_config.get('addresses'): - dev_config['addresses'] = [] - dev_config['addresses'].append( - '{ip}/{prefix}'.format( - ip=privateIp, prefix=netPrefix)) - if dev_config: - mac = ':'.join(re.findall(r'..', intf['macAddress'])) - dev_config.update({ - 'match': {'macaddress': mac.lower()}, - 'set-name': nicname - }) - # With netvsc, we can get two interfaces that - # share the same MAC, so we need to make sure - # our match condition also contains the driver - driver = device_driver(nicname) - if driver and driver == 'hv_netvsc': - dev_config['match']['driver'] = driver - netconfig['ethernets'][nicname] = dev_config - evt.description = "network config from imds" - else: - blacklist = ['mlx4_core'] - LOG.debug('Azure: generating fallback configuration') - # generate a network config, blacklist picking mlx4_core devs - netconfig = net.generate_fallback_config( - blacklist_drivers=blacklist, config_driver=True) - evt.description = "network config from fallback" - return netconfig + if imds_metadata != sources.UNSET and imds_metadata: + try: + return _generate_network_config_from_imds_metadata(imds_metadata) + except Exception as e: + LOG.error( + 'Failed generating network config ' + 'from IMDS network metadata: %s', str(e)) + try: + return _generate_network_config_from_fallback_config() + except Exception as e: + LOG.error('Failed generating fallback network config: %s', str(e)) + return {} + + +@azure_ds_telemetry_reporter +def _generate_network_config_from_imds_metadata(imds_metadata) -> dict: + """Convert imds_metadata dictionary to network v2 configuration. + Parses network configuration from imds metadata. + + @param: imds_metadata: Dict of content read from IMDS network service. + @return: Dictionary containing network version 2 standard configuration. + """ + netconfig = {'version': 2, 'ethernets': {}} + network_metadata = imds_metadata['network'] + for idx, intf in enumerate(network_metadata['interface']): + # First IPv4 and/or IPv6 address will be obtained via DHCP. + # Any additional IPs of each type will be set as static + # addresses. + nicname = 'eth{idx}'.format(idx=idx) + dhcp_override = {'route-metric': (idx + 1) * 100} + dev_config = {'dhcp4': True, 'dhcp4-overrides': dhcp_override, + 'dhcp6': False} + for addr_type in ('ipv4', 'ipv6'): + addresses = intf.get(addr_type, {}).get('ipAddress', []) + if addr_type == 'ipv4': + default_prefix = '24' + else: + default_prefix = '128' + if addresses: + dev_config['dhcp6'] = True + # non-primary interfaces should have a higher + # route-metric (cost) so default routes prefer + # primary nic due to lower route-metric value + dev_config['dhcp6-overrides'] = dhcp_override + for addr in addresses[1:]: + # Append static address config for ip > 1 + netPrefix = intf[addr_type]['subnet'][0].get( + 'prefix', default_prefix) + privateIp = addr['privateIpAddress'] + if not dev_config.get('addresses'): + dev_config['addresses'] = [] + dev_config['addresses'].append( + '{ip}/{prefix}'.format( + ip=privateIp, prefix=netPrefix)) + if dev_config: + mac = ':'.join(re.findall(r'..', intf['macAddress'])) + dev_config.update({ + 'match': {'macaddress': mac.lower()}, + 'set-name': nicname + }) + # With netvsc, we can get two interfaces that + # share the same MAC, so we need to make sure + # our match condition also contains the driver + driver = device_driver(nicname) + if driver and driver == 'hv_netvsc': + dev_config['match']['driver'] = driver + netconfig['ethernets'][nicname] = dev_config + return netconfig + + +@azure_ds_telemetry_reporter +def _generate_network_config_from_fallback_config() -> dict: + """Generate fallback network config excluding blacklisted devices. + + @return: Dictionary containing network version 2 standard configuration. + """ + return net.generate_fallback_config( + blacklist_drivers=BLACKLIST_DRIVERS, config_driver=True) @azure_ds_telemetry_reporter -def get_metadata_from_imds(fallback_nic, retries): - """Query Azure's network metadata service, returning a dictionary. +def get_metadata_from_imds(fallback_nic, + retries, + md_type=metadata_type.compute): + """Query Azure's instance metadata service, returning a dictionary. If network is not up, setup ephemeral dhcp on fallback_nic to talk to the IMDS. For more info on IMDS: @@ -1427,7 +2043,7 @@ def get_metadata_from_imds(fallback_nic, retries): """ kwargs = {'logfunc': LOG.debug, 'msg': 'Crawl of Azure Instance Metadata Service (IMDS)', - 'func': _get_metadata_from_imds, 'args': (retries,)} + 'func': _get_metadata_from_imds, 'args': (retries, md_type,)} if net.is_up(fallback_nic): return util.log_time(**kwargs) else: @@ -1436,23 +2052,26 @@ def get_metadata_from_imds(fallback_nic, retries): azure_ds_reporter, fallback_nic): return util.log_time(**kwargs) except Exception as e: - report_diagnostic_event("exception while getting metadata: %s" % e) + report_diagnostic_event( + "exception while getting metadata: %s" % e, + logger_func=LOG.warning) raise @azure_ds_telemetry_reporter -def _get_metadata_from_imds(retries): +def _get_metadata_from_imds(retries, md_type=metadata_type.compute): - url = IMDS_URL + "instance?api-version=2017-12-01" + url = md_type.value headers = {"Metadata": "true"} try: response = readurl( url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers, retries=retries, exception_cb=retry_on_url_exc) except Exception as e: - msg = 'Ignoring IMDS instance metadata: %s' % e - report_diagnostic_event(msg) - LOG.debug(msg) + report_diagnostic_event( + 'Ignoring IMDS instance metadata. ' + 'Get metadata from IMDS failed: %s' % e, + logger_func=LOG.warning) return {} try: from json.decoder import JSONDecodeError @@ -1463,9 +2082,10 @@ def _get_metadata_from_imds(retries): try: return util.load_json(str(response)) except json_decode_error as e: - report_diagnostic_event('non-json imds response' % e) - LOG.warning( - 'Ignoring non-json IMDS instance metadata: %s', str(response)) + report_diagnostic_event( + 'Ignoring non-json IMDS instance metadata response: %s. ' + 'Loading non-json IMDS response failed: %s' % (str(response), e), + logger_func=LOG.warning) return {} @@ -1513,13 +2133,12 @@ def _is_platform_viable(seed_dir): description="found azure asset tag", parent=azure_ds_reporter ) as evt: - asset_tag = util.read_dmi_data('chassis-asset-tag') + asset_tag = dmi.read_dmi_data('chassis-asset-tag') if asset_tag == AZURE_CHASSIS_ASSET_TAG: return True msg = "Non-Azure DMI asset tag '%s' discovered." % asset_tag - LOG.debug(msg) evt.description = msg - report_diagnostic_event(msg) + report_diagnostic_event(msg, logger_func=LOG.debug) if os.path.exists(os.path.join(seed_dir, 'ovf-env.xml')): return True return False diff --git a/cloudinit/sources/DataSourceBigstep.py b/cloudinit/sources/DataSourceBigstep.py index 52fff20a..63435279 100644 --- a/cloudinit/sources/DataSourceBigstep.py +++ b/cloudinit/sources/DataSourceBigstep.py @@ -7,13 +7,10 @@ import errno import json -from cloudinit import log as logging from cloudinit import sources from cloudinit import url_helper from cloudinit import util -LOG = logging.getLogger(__name__) - class DataSourceBigstep(sources.DataSource): diff --git a/cloudinit/sources/DataSourceCloudSigma.py b/cloudinit/sources/DataSourceCloudSigma.py index df88f677..f63baf74 100644 --- a/cloudinit/sources/DataSourceCloudSigma.py +++ b/cloudinit/sources/DataSourceCloudSigma.py @@ -9,9 +9,9 @@ import re from cloudinit.cs_utils import Cepko, SERIAL_PORT +from cloudinit import dmi from cloudinit import log as logging from cloudinit import sources -from cloudinit import util LOG = logging.getLogger(__name__) @@ -38,7 +38,7 @@ class DataSourceCloudSigma(sources.DataSource): """ LOG.debug("determining hypervisor product name via dmi data") - sys_product_name = util.read_dmi_data("system-product-name") + sys_product_name = dmi.read_dmi_data("system-product-name") if not sys_product_name: LOG.debug("system-product-name not available in dmi data") return False diff --git a/cloudinit/sources/DataSourceEc2.py b/cloudinit/sources/DataSourceEc2.py index 1d09c12a..1930a509 100644 --- a/cloudinit/sources/DataSourceEc2.py +++ b/cloudinit/sources/DataSourceEc2.py @@ -11,6 +11,7 @@ import os import time +from cloudinit import dmi from cloudinit import ec2_utils as ec2 from cloudinit import log as logging from cloudinit import net @@ -699,26 +700,26 @@ def _collect_platform_data(): uuid = util.load_file("/sys/hypervisor/uuid").strip() data['uuid_source'] = 'hypervisor' except Exception: - uuid = util.read_dmi_data('system-uuid') + uuid = dmi.read_dmi_data('system-uuid') data['uuid_source'] = 'dmi' if uuid is None: uuid = '' data['uuid'] = uuid.lower() - serial = util.read_dmi_data('system-serial-number') + serial = dmi.read_dmi_data('system-serial-number') if serial is None: serial = '' data['serial'] = serial.lower() - asset_tag = util.read_dmi_data('chassis-asset-tag') + asset_tag = dmi.read_dmi_data('chassis-asset-tag') if asset_tag is None: asset_tag = '' data['asset_tag'] = asset_tag.lower() - vendor = util.read_dmi_data('system-manufacturer') + vendor = dmi.read_dmi_data('system-manufacturer') data['vendor'] = (vendor if vendor else '').lower() return data diff --git a/cloudinit/sources/DataSourceExoscale.py b/cloudinit/sources/DataSourceExoscale.py index d59aefd1..adee6d79 100644 --- a/cloudinit/sources/DataSourceExoscale.py +++ b/cloudinit/sources/DataSourceExoscale.py @@ -3,6 +3,7 @@ # # This file is part of cloud-init. See LICENSE file for license information. +from cloudinit import dmi from cloudinit import ec2_utils as ec2 from cloudinit import log as logging from cloudinit import sources @@ -135,7 +136,7 @@ class DataSourceExoscale(sources.DataSource): return self.extra_config def _is_platform_viable(self): - return util.read_dmi_data('system-product-name').startswith( + return dmi.read_dmi_data('system-product-name').startswith( EXOSCALE_DMI_NAME) diff --git a/cloudinit/sources/DataSourceGCE.py b/cloudinit/sources/DataSourceGCE.py index 0ec5f6ec..746caddb 100644 --- a/cloudinit/sources/DataSourceGCE.py +++ b/cloudinit/sources/DataSourceGCE.py @@ -7,6 +7,7 @@ import json from base64 import b64decode +from cloudinit import dmi from cloudinit.distros import ug_util from cloudinit import log as logging from cloudinit import sources @@ -248,12 +249,12 @@ def read_md(address=None, platform_check=True): def platform_reports_gce(): - pname = util.read_dmi_data('system-product-name') or "N/A" + pname = dmi.read_dmi_data('system-product-name') or "N/A" if pname == "Google Compute Engine": return True # system-product-name is not always guaranteed (LP: #1674861) - serial = util.read_dmi_data('system-serial-number') or "N/A" + serial = dmi.read_dmi_data('system-serial-number') or "N/A" if serial.startswith("GoogleCloud-"): return True diff --git a/cloudinit/sources/DataSourceHetzner.py b/cloudinit/sources/DataSourceHetzner.py index a86035e0..c7c88dd7 100644 --- a/cloudinit/sources/DataSourceHetzner.py +++ b/cloudinit/sources/DataSourceHetzner.py @@ -3,9 +3,10 @@ # # This file is part of cloud-init. See LICENSE file for license information. # -"""Hetzner Cloud API Documentation. +"""Hetzner Cloud API Documentation https://docs.hetzner.cloud/""" +from cloudinit import dmi from cloudinit import log as logging from cloudinit import net as cloudnet from cloudinit import sources @@ -46,9 +47,12 @@ class DataSourceHetzner(sources.DataSource): self._network_config = None self.dsmode = sources.DSMODE_NETWORK - def get_data(self): - if not on_hetzner(): + def _get_data(self): + (on_hetzner, serial) = get_hcloud_data() + + if not on_hetzner: return False + nic = cloudnet.find_fallback_nic() with cloudnet.EphemeralIPv4Network(nic, "169.254.0.1", 16, "169.254.255.255"): @@ -78,8 +82,18 @@ class DataSourceHetzner(sources.DataSource): self.metadata['public-keys'] = md.get('public-keys', None) self.vendordata_raw = md.get("vendor_data", None) + # instance-id and serial from SMBIOS should be identical + if self.get_instance_id() != serial: + raise RuntimeError( + "SMBIOS serial does not match instance ID from metadata" + ) + return True + def check_instance_id(self, sys_cfg): + return sources.instance_id_matches_system_uuid( + self.get_instance_id(), 'system-serial-number') + @property def network_config(self): """Configure the networking. This needs to be done each boot, since @@ -99,8 +113,18 @@ class DataSourceHetzner(sources.DataSource): return self._network_config -def on_hetzner(): - return util.read_dmi_data('system-manufacturer') == "Hetzner" +def get_hcloud_data(): + vendor_name = dmi.read_dmi_data('system-manufacturer') + if vendor_name != "Hetzner": + return (False, None) + + serial = dmi.read_dmi_data("system-serial-number") + if serial: + LOG.debug("Running on Hetzner Cloud: serial=%s", serial) + else: + raise RuntimeError("Hetzner Cloud detected, but no serial found") + + return (True, serial) # Used to match classes to dependencies diff --git a/cloudinit/sources/DataSourceNoCloud.py b/cloudinit/sources/DataSourceNoCloud.py index e408d730..a126aad3 100644 --- a/cloudinit/sources/DataSourceNoCloud.py +++ b/cloudinit/sources/DataSourceNoCloud.py @@ -11,6 +11,7 @@ import errno import os +from cloudinit import dmi from cloudinit import log as logging from cloudinit.net import eni from cloudinit import sources @@ -61,7 +62,7 @@ class DataSourceNoCloud(sources.DataSource): # Parse the system serial label from dmi. If not empty, try parsing # like the commandline md = {} - serial = util.read_dmi_data('system-serial-number') + serial = dmi.read_dmi_data('system-serial-number') if serial and load_cmdline_data(md, serial): found.append("dmi") mydata = _merge_new_seed(mydata, {'meta-data': md}) @@ -157,13 +158,14 @@ class DataSourceNoCloud(sources.DataSource): # This could throw errors, but the user told us to do it # so if errors are raised, let them raise - (md_seed, ud) = util.read_seeded(seedfrom, timeout=None) + (md_seed, ud, vd) = util.read_seeded(seedfrom, timeout=None) LOG.debug("Using seeded cache data from %s", seedfrom) # Values in the command line override those from the seed mydata['meta-data'] = util.mergemanydict([mydata['meta-data'], md_seed]) mydata['user-data'] = ud + mydata['vendor-data'] = vd found.append(seedfrom) # Now that we have exhausted any other places merge in the defaults diff --git a/cloudinit/sources/DataSourceNone.py b/cloudinit/sources/DataSourceNone.py index e6250801..b7656ac5 100644 --- a/cloudinit/sources/DataSourceNone.py +++ b/cloudinit/sources/DataSourceNone.py @@ -4,11 +4,8 @@ # # This file is part of cloud-init. See LICENSE file for license information. -from cloudinit import log as logging from cloudinit import sources -LOG = logging.getLogger(__name__) - class DataSourceNone(sources.DataSource): diff --git a/cloudinit/sources/DataSourceOVF.py b/cloudinit/sources/DataSourceOVF.py index e53d2eb1..741c140a 100644 --- a/cloudinit/sources/DataSourceOVF.py +++ b/cloudinit/sources/DataSourceOVF.py @@ -14,6 +14,7 @@ import re import time from xml.dom import minidom +from cloudinit import dmi from cloudinit import log as logging from cloudinit import sources from cloudinit import subp @@ -73,6 +74,7 @@ class DataSourceOVF(sources.DataSource): found = [] md = {} ud = "" + vd = "" vmwareImcConfigFilePath = None nicspath = None @@ -82,7 +84,7 @@ class DataSourceOVF(sources.DataSource): (seedfile, contents) = get_ovf_env(self.paths.seed_dir) - system_type = util.read_dmi_data("system-product-name") + system_type = dmi.read_dmi_data("system-product-name") if system_type is None: LOG.debug("No system-product-name found") @@ -304,7 +306,7 @@ class DataSourceOVF(sources.DataSource): seedfrom, self) return False - (md_seed, ud) = util.read_seeded(seedfrom, timeout=None) + (md_seed, ud, vd) = util.read_seeded(seedfrom, timeout=None) LOG.debug("Using seeded cache data from %s", seedfrom) md = util.mergemanydict([md, md_seed]) @@ -316,11 +318,12 @@ class DataSourceOVF(sources.DataSource): self.seed = ",".join(found) self.metadata = md self.userdata_raw = ud + self.vendordata_raw = vd self.cfg = cfg return True def _get_subplatform(self): - system_type = util.read_dmi_data("system-product-name").lower() + system_type = dmi.read_dmi_data("system-product-name").lower() if system_type == 'vmware': return 'vmware (%s)' % self.seed return 'ovf (%s)' % self.seed diff --git a/cloudinit/sources/DataSourceOpenNebula.py b/cloudinit/sources/DataSourceOpenNebula.py index 45481938..730ec586 100644 --- a/cloudinit/sources/DataSourceOpenNebula.py +++ b/cloudinit/sources/DataSourceOpenNebula.py @@ -350,7 +350,8 @@ def parse_shell_config(content, keylist=None, bash=None, asuser=None, # exclude vars in bash that change on their own or that we used excluded = ( "EPOCHREALTIME", "EPOCHSECONDS", "RANDOM", "LINENO", "SECONDS", "_", - "__v") + "SRANDOM", "__v", + ) preset = {} ret = {} target = None diff --git a/cloudinit/sources/DataSourceOpenStack.py b/cloudinit/sources/DataSourceOpenStack.py index d4b43f44..b3406c67 100644 --- a/cloudinit/sources/DataSourceOpenStack.py +++ b/cloudinit/sources/DataSourceOpenStack.py @@ -6,6 +6,7 @@ import time +from cloudinit import dmi from cloudinit import log as logging from cloudinit.net.dhcp import EphemeralDHCPv4, NoDHCPLeaseError from cloudinit import sources @@ -32,7 +33,8 @@ DMI_ASSET_TAG_OPENTELEKOM = 'OpenTelekomCloud' # See github.com/sapcc/helm-charts/blob/master/openstack/nova/values.yaml # -> compute.defaults.vmware.smbios_asset_tag for this value DMI_ASSET_TAG_SAPCCLOUD = 'SAP CCloud VM' -VALID_DMI_ASSET_TAGS = [DMI_ASSET_TAG_OPENTELEKOM, DMI_ASSET_TAG_SAPCCLOUD] +VALID_DMI_ASSET_TAGS = VALID_DMI_PRODUCT_NAMES +VALID_DMI_ASSET_TAGS += [DMI_ASSET_TAG_OPENTELEKOM, DMI_ASSET_TAG_SAPCCLOUD] class DataSourceOpenStack(openstack.SourceMixin, sources.DataSource): @@ -224,10 +226,10 @@ def detect_openstack(accept_oracle=False): """Return True when a potential OpenStack platform is detected.""" if not util.is_x86(): return True # Non-Intel cpus don't properly report dmi product names - product_name = util.read_dmi_data('system-product-name') + product_name = dmi.read_dmi_data('system-product-name') if product_name in VALID_DMI_PRODUCT_NAMES: return True - elif util.read_dmi_data('chassis-asset-tag') in VALID_DMI_ASSET_TAGS: + elif dmi.read_dmi_data('chassis-asset-tag') in VALID_DMI_ASSET_TAGS: return True elif accept_oracle and oracle._is_platform_viable(): return True diff --git a/cloudinit/sources/DataSourceOracle.py b/cloudinit/sources/DataSourceOracle.py index 20d6487d..bf81b10b 100644 --- a/cloudinit/sources/DataSourceOracle.py +++ b/cloudinit/sources/DataSourceOracle.py @@ -17,6 +17,7 @@ import base64 from collections import namedtuple from contextlib import suppress as noop +from cloudinit import dmi from cloudinit import log as logging from cloudinit import net, sources, util from cloudinit.net import ( @@ -273,12 +274,12 @@ class DataSourceOracle(sources.DataSource): def _read_system_uuid(): - sys_uuid = util.read_dmi_data('system-uuid') + sys_uuid = dmi.read_dmi_data('system-uuid') return None if sys_uuid is None else sys_uuid.lower() def _is_platform_viable(): - asset_tag = util.read_dmi_data('chassis-asset-tag') + asset_tag = dmi.read_dmi_data('chassis-asset-tag') return asset_tag == CHASSIS_ASSET_TAG diff --git a/cloudinit/sources/DataSourceRbxCloud.py b/cloudinit/sources/DataSourceRbxCloud.py index e064c8d6..0b8994bf 100644 --- a/cloudinit/sources/DataSourceRbxCloud.py +++ b/cloudinit/sources/DataSourceRbxCloud.py @@ -71,11 +71,13 @@ def gratuitous_arp(items, distro): def get_md(): - rbx_data = None + """Returns False (not found or error) or a dictionary with metadata.""" devices = set( util.find_devs_with('LABEL=CLOUDMD') + util.find_devs_with('LABEL=cloudmd') ) + if not devices: + return False for device in devices: try: rbx_data = util.mount_cb( @@ -84,17 +86,17 @@ def get_md(): mtype=['vfat', 'fat', 'msdosfs'] ) if rbx_data: - break + return rbx_data except OSError as err: if err.errno != errno.ENOENT: raise except util.MountFailedError: util.logexc(LOG, "Failed to mount %s when looking for user " "data", device) - if not rbx_data: - util.logexc(LOG, "Failed to load metadata and userdata") - return False - return rbx_data + + LOG.debug("Did not find RbxCloud data, searched devices: %s", + ",".join(devices)) + return False def generate_network_config(netadps): @@ -223,6 +225,8 @@ class DataSourceRbxCloud(sources.DataSource): is used to perform instance configuration. """ rbx_data = get_md() + if rbx_data is False: + return False self.userdata_raw = rbx_data['userdata'] self.metadata = rbx_data['metadata'] self.gratuitous_arp = rbx_data['gratuitous_arp'] diff --git a/cloudinit/sources/DataSourceScaleway.py b/cloudinit/sources/DataSourceScaleway.py index 83c2bf65..41be7665 100644 --- a/cloudinit/sources/DataSourceScaleway.py +++ b/cloudinit/sources/DataSourceScaleway.py @@ -25,6 +25,7 @@ import requests from requests.packages.urllib3.connection import HTTPConnection from requests.packages.urllib3.poolmanager import PoolManager +from cloudinit import dmi from cloudinit import log as logging from cloudinit import sources from cloudinit import url_helper @@ -56,7 +57,7 @@ def on_scaleway(): * the initrd created the file /var/run/scaleway. * "scaleway" is in the kernel cmdline. """ - vendor_name = util.read_dmi_data('system-manufacturer') + vendor_name = dmi.read_dmi_data('system-manufacturer') if vendor_name == 'Scaleway': return True diff --git a/cloudinit/sources/DataSourceSmartOS.py b/cloudinit/sources/DataSourceSmartOS.py index f1f903bc..fd292baa 100644 --- a/cloudinit/sources/DataSourceSmartOS.py +++ b/cloudinit/sources/DataSourceSmartOS.py @@ -30,6 +30,7 @@ import random import re import socket +from cloudinit import dmi from cloudinit import log as logging from cloudinit import serial from cloudinit import sources @@ -767,7 +768,7 @@ def get_smartos_environ(uname_version=None, product_name=None): return SMARTOS_ENV_LX_BRAND if product_name is None: - system_type = util.read_dmi_data("system-product-name") + system_type = dmi.read_dmi_data("system-product-name") else: system_type = product_name diff --git a/cloudinit/sources/__init__.py b/cloudinit/sources/__init__.py index c4d60fff..9dccc687 100644 --- a/cloudinit/sources/__init__.py +++ b/cloudinit/sources/__init__.py @@ -14,6 +14,7 @@ import json import os from collections import namedtuple +from cloudinit import dmi from cloudinit import importer from cloudinit import log as logging from cloudinit import net @@ -809,7 +810,7 @@ def instance_id_matches_system_uuid(instance_id, field='system-uuid'): if not instance_id: return False - dmi_value = util.read_dmi_data(field) + dmi_value = dmi.read_dmi_data(field) if not dmi_value: return False return instance_id.lower() == dmi_value.lower() diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index b968a96f..d3055d08 100755 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -9,6 +9,7 @@ import struct import time import textwrap import zlib +from errno import ENOENT from cloudinit.settings import CFG_BUILTIN from cloudinit.net import dhcp @@ -16,6 +17,7 @@ from cloudinit import stages from cloudinit import temp_utils from contextlib import contextmanager from xml.etree import ElementTree +from xml.sax.saxutils import escape from cloudinit import subp from cloudinit import url_helper @@ -41,13 +43,19 @@ COMPRESSED_EVENT_TYPE = 'compressed' # cloud-init.log files where the P95 of the file sizes was 537KB and the time # consumed to dump 500KB file was (P95:76, P99:233, P99.9:1170) in ms MAX_LOG_TO_KVP_LENGTH = 512000 -# Marker file to indicate whether cloud-init.log is pushed to KVP -LOG_PUSHED_TO_KVP_MARKER_FILE = '/var/lib/cloud/data/log_pushed_to_kvp' +# File to store the last byte of cloud-init.log that was pushed to KVP. This +# file will be deleted with every VM reboot. +LOG_PUSHED_TO_KVP_INDEX_FILE = '/run/cloud-init/log_pushed_to_kvp_index' azure_ds_reporter = events.ReportEventStack( name="azure-ds", description="initialize reporter for azure ds", reporting_enabled=True) +DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE = ( + 'The VM encountered an error during deployment. ' + 'Please visit https://aka.ms/linuxprovisioningerror ' + 'for more information on remediation.') + def azure_ds_telemetry_reporter(func): def impl(*args, **kwargs): @@ -180,12 +188,15 @@ def get_system_info(): return evt -def report_diagnostic_event(str): +def report_diagnostic_event( + msg: str, *, logger_func=None) -> events.ReportingEvent: """Report a diagnostic event""" + if callable(logger_func): + logger_func(msg) evt = events.ReportingEvent( DIAGNOSTIC_EVENT_TYPE, 'diagnostic message', - str, events.DEFAULT_EVENT_ORIGIN) - events.report_event(evt) + msg, events.DEFAULT_EVENT_ORIGIN) + events.report_event(evt, excluded_handler_types={"log"}) # return the event for unit testing purpose return evt @@ -211,27 +222,58 @@ def report_compressed_event(event_name, event_content): def push_log_to_kvp(file_name=CFG_BUILTIN['def_log_file']): """Push a portion of cloud-init.log file or the whole file to KVP based on the file size. - If called more than once, it skips pushing the log file to KVP again.""" + The first time this function is called after VM boot, It will push the last + n bytes of the log file such that n < MAX_LOG_TO_KVP_LENGTH + If called again on the same boot, it continues from where it left off. + In addition to cloud-init.log, dmesg log will also be collected.""" - log_pushed_to_kvp = bool(os.path.isfile(LOG_PUSHED_TO_KVP_MARKER_FILE)) - if log_pushed_to_kvp: - report_diagnostic_event("cloud-init.log is already pushed to KVP") - return + start_index = get_last_log_byte_pushed_to_kvp_index() LOG.debug("Dumping cloud-init.log file to KVP") try: with open(file_name, "rb") as f: f.seek(0, os.SEEK_END) - seek_index = max(f.tell() - MAX_LOG_TO_KVP_LENGTH, 0) + seek_index = max(f.tell() - MAX_LOG_TO_KVP_LENGTH, start_index) report_diagnostic_event( - "Dumping last {} bytes of cloud-init.log file to KVP".format( - f.tell() - seek_index)) + "Dumping last {0} bytes of cloud-init.log file to KVP starting" + " from index: {1}".format(f.tell() - seek_index, seek_index), + logger_func=LOG.debug) f.seek(seek_index, os.SEEK_SET) report_compressed_event("cloud-init.log", f.read()) - util.write_file(LOG_PUSHED_TO_KVP_MARKER_FILE, '') + util.write_file(LOG_PUSHED_TO_KVP_INDEX_FILE, str(f.tell())) + except Exception as ex: + report_diagnostic_event( + "Exception when dumping log file: %s" % repr(ex), + logger_func=LOG.warning) + + LOG.debug("Dumping dmesg log to KVP") + try: + out, _ = subp.subp(['dmesg'], decode=False, capture=True) + report_compressed_event("dmesg", out) except Exception as ex: - report_diagnostic_event("Exception when dumping log file: %s" % - repr(ex)) + report_diagnostic_event( + "Exception when dumping dmesg log: %s" % repr(ex), + logger_func=LOG.warning) + + +@azure_ds_telemetry_reporter +def get_last_log_byte_pushed_to_kvp_index(): + try: + with open(LOG_PUSHED_TO_KVP_INDEX_FILE, "r") as f: + return int(f.read()) + except IOError as e: + if e.errno != ENOENT: + report_diagnostic_event("Reading LOG_PUSHED_TO_KVP_INDEX_FILE" + " failed: %s." % repr(e), + logger_func=LOG.warning) + except ValueError as e: + report_diagnostic_event("Invalid value in LOG_PUSHED_TO_KVP_INDEX_FILE" + ": %s." % repr(e), + logger_func=LOG.warning) + except Exception as e: + report_diagnostic_event("Failed to get the last log byte pushed to KVP" + ": %s." % repr(e), logger_func=LOG.warning) + return 0 @contextmanager @@ -252,6 +294,54 @@ def _get_dhcp_endpoint_option_name(): return azure_endpoint +@azure_ds_telemetry_reporter +def http_with_retries(url, **kwargs) -> str: + """Wrapper around url_helper.readurl() with custom telemetry logging + that url_helper.readurl() does not provide. + """ + exc = None + + max_readurl_attempts = 240 + default_readurl_timeout = 5 + periodic_logging_attempts = 12 + + if 'timeout' not in kwargs: + kwargs['timeout'] = default_readurl_timeout + + # remove kwargs that cause url_helper.readurl to retry, + # since we are already implementing our own retry logic. + if kwargs.pop('retries', None): + LOG.warning( + 'Ignoring retries kwarg passed in for ' + 'communication with Azure endpoint.') + if kwargs.pop('infinite', None): + LOG.warning( + 'Ignoring infinite kwarg passed in for communication ' + 'with Azure endpoint.') + + for attempt in range(1, max_readurl_attempts + 1): + try: + ret = url_helper.readurl(url, **kwargs) + + report_diagnostic_event( + 'Successful HTTP request with Azure endpoint %s after ' + '%d attempts' % (url, attempt), + logger_func=LOG.debug) + + return ret + + except Exception as e: + exc = e + if attempt % periodic_logging_attempts == 0: + report_diagnostic_event( + 'Failed HTTP request with Azure endpoint %s during ' + 'attempt %d with exception: %s' % + (url, attempt, e), + logger_func=LOG.debug) + + raise exc + + class AzureEndpointHttpClient: headers = { @@ -270,16 +360,15 @@ class AzureEndpointHttpClient: if secure: headers = self.headers.copy() headers.update(self.extra_secure_headers) - return url_helper.readurl(url, headers=headers, - timeout=5, retries=10, sec_between=5) + return http_with_retries(url, headers=headers) def post(self, url, data=None, extra_headers=None): headers = self.headers if extra_headers is not None: headers = self.headers.copy() headers.update(extra_headers) - return url_helper.readurl(url, data=data, headers=headers, - timeout=5, retries=10, sec_between=5) + return http_with_retries( + url, data=data, headers=headers) class InvalidGoalStateXMLException(Exception): @@ -288,11 +377,16 @@ class InvalidGoalStateXMLException(Exception): class GoalState: - def __init__(self, unparsed_xml, azure_endpoint_client): + def __init__( + self, + unparsed_xml: str, + azure_endpoint_client: AzureEndpointHttpClient, + need_certificate: bool = True) -> None: """Parses a GoalState XML string and returns a GoalState object. @param unparsed_xml: string representing a GoalState XML. - @param azure_endpoint_client: instance of AzureEndpointHttpClient + @param azure_endpoint_client: instance of AzureEndpointHttpClient. + @param need_certificate: switch to know if certificates is needed. @return: GoalState object representing the GoalState XML string. """ self.azure_endpoint_client = azure_endpoint_client @@ -300,9 +394,9 @@ class GoalState: try: self.root = ElementTree.fromstring(unparsed_xml) except ElementTree.ParseError as e: - msg = 'Failed to parse GoalState XML: %s' - LOG.warning(msg, e) - report_diagnostic_event(msg % (e,)) + report_diagnostic_event( + 'Failed to parse GoalState XML: %s' % e, + logger_func=LOG.warning) raise self.container_id = self._text_from_xpath('./Container/ContainerId') @@ -312,16 +406,15 @@ class GoalState: for attr in ("container_id", "instance_id", "incarnation"): if getattr(self, attr) is None: - msg = 'Missing %s in GoalState XML' - LOG.warning(msg, attr) - report_diagnostic_event(msg % (attr,)) + msg = 'Missing %s in GoalState XML' % attr + report_diagnostic_event(msg, logger_func=LOG.warning) raise InvalidGoalStateXMLException(msg) self.certificates_xml = None url = self._text_from_xpath( './Container/RoleInstanceList/RoleInstance' '/Configuration/Certificates') - if url is not None: + if url is not None and need_certificate: with events.ReportEventStack( name="get-certificates-xml", description="get certificates xml", @@ -349,12 +442,20 @@ class OpenSSLManager: def __init__(self): self.tmpdir = temp_utils.mkdtemp() - self.certificate = None + self._certificate = None self.generate_certificate() def clean_up(self): util.del_dir(self.tmpdir) + @property + def certificate(self): + return self._certificate + + @certificate.setter + def certificate(self, value): + self._certificate = value + @azure_ds_telemetry_reporter def generate_certificate(self): LOG.debug('Generating certificate for communication with fabric...') @@ -477,8 +578,15 @@ class GoalStateHealthReporter: ''') PROVISIONING_SUCCESS_STATUS = 'Ready' + PROVISIONING_NOT_READY_STATUS = 'NotReady' + PROVISIONING_FAILURE_SUBSTATUS = 'ProvisioningFailed' + + HEALTH_REPORT_DESCRIPTION_TRIM_LEN = 512 - def __init__(self, goal_state, azure_endpoint_client, endpoint): + def __init__( + self, goal_state: GoalState, + azure_endpoint_client: AzureEndpointHttpClient, + endpoint: str) -> None: """Creates instance that will report provisioning status to an endpoint @param goal_state: An instance of class GoalState that contains @@ -495,7 +603,7 @@ class GoalStateHealthReporter: self._endpoint = endpoint @azure_ds_telemetry_reporter - def send_ready_signal(self): + def send_ready_signal(self) -> None: document = self.build_report( incarnation=self._goal_state.incarnation, container_id=self._goal_state.container_id, @@ -505,32 +613,52 @@ class GoalStateHealthReporter: try: self._post_health_report(document=document) except Exception as e: - msg = "exception while reporting ready: %s" % e - LOG.error(msg) - report_diagnostic_event(msg) + report_diagnostic_event( + "exception while reporting ready: %s" % e, + logger_func=LOG.error) raise LOG.info('Reported ready to Azure fabric.') + @azure_ds_telemetry_reporter + def send_failure_signal(self, description: str) -> None: + document = self.build_report( + incarnation=self._goal_state.incarnation, + container_id=self._goal_state.container_id, + instance_id=self._goal_state.instance_id, + status=self.PROVISIONING_NOT_READY_STATUS, + substatus=self.PROVISIONING_FAILURE_SUBSTATUS, + description=description) + try: + self._post_health_report(document=document) + except Exception as e: + msg = "exception while reporting failure: %s" % e + report_diagnostic_event(msg, logger_func=LOG.error) + raise + + LOG.warning('Reported failure to Azure fabric.') + def build_report( - self, incarnation, container_id, instance_id, - status, substatus=None, description=None): + self, incarnation: str, container_id: str, instance_id: str, + status: str, substatus=None, description=None) -> str: health_detail = '' if substatus is not None: health_detail = self.HEALTH_DETAIL_SUBSECTION_XML_TEMPLATE.format( - health_substatus=substatus, health_description=description) + health_substatus=escape(substatus), + health_description=escape( + description[:self.HEALTH_REPORT_DESCRIPTION_TRIM_LEN])) health_report = self.HEALTH_REPORT_XML_TEMPLATE.format( - incarnation=incarnation, - container_id=container_id, - instance_id=instance_id, - health_status=status, + incarnation=escape(str(incarnation)), + container_id=escape(container_id), + instance_id=escape(instance_id), + health_status=escape(status), health_detail_subsection=health_detail) return health_report @azure_ds_telemetry_reporter - def _post_health_report(self, document): + def _post_health_report(self, document: str) -> None: push_log_to_kvp() # Whenever report_diagnostic_event(diagnostic_msg) is invoked in code, @@ -690,43 +818,52 @@ class WALinuxAgentShim: value = dhcp245 LOG.debug("Using Azure Endpoint from dhcp options") if value is None: - report_diagnostic_event("No Azure endpoint from dhcp options") - LOG.debug('Finding Azure endpoint from networkd...') + report_diagnostic_event( + 'No Azure endpoint from dhcp options. ' + 'Finding Azure endpoint from networkd...', + logger_func=LOG.debug) value = WALinuxAgentShim._networkd_get_value_from_leases() if value is None: # Option-245 stored in /run/cloud-init/dhclient.hooks/<ifc>.json # a dhclient exit hook that calls cloud-init-dhclient-hook - report_diagnostic_event("No Azure endpoint from networkd") - LOG.debug('Finding Azure endpoint from hook json...') + report_diagnostic_event( + 'No Azure endpoint from networkd. ' + 'Finding Azure endpoint from hook json...', + logger_func=LOG.debug) dhcp_options = WALinuxAgentShim._load_dhclient_json() value = WALinuxAgentShim._get_value_from_dhcpoptions(dhcp_options) if value is None: # Fallback and check the leases file if unsuccessful - report_diagnostic_event("No Azure endpoint from dhclient logs") - LOG.debug("Unable to find endpoint in dhclient logs. " - " Falling back to check lease files") + report_diagnostic_event( + 'No Azure endpoint from dhclient logs. ' + 'Unable to find endpoint in dhclient logs. ' + 'Falling back to check lease files', + logger_func=LOG.debug) if fallback_lease_file is None: - LOG.warning("No fallback lease file was specified.") + report_diagnostic_event( + 'No fallback lease file was specified.', + logger_func=LOG.warning) value = None else: - LOG.debug("Looking for endpoint in lease file %s", - fallback_lease_file) + report_diagnostic_event( + 'Looking for endpoint in lease file %s' + % fallback_lease_file, logger_func=LOG.debug) value = WALinuxAgentShim._get_value_from_leases_file( fallback_lease_file) if value is None: - msg = "No lease found; using default endpoint" - report_diagnostic_event(msg) - LOG.warning(msg) value = DEFAULT_WIRESERVER_ENDPOINT + report_diagnostic_event( + 'No lease found; using default endpoint: %s' % value, + logger_func=LOG.warning) endpoint_ip_address = WALinuxAgentShim.get_ip_from_lease_value(value) - msg = 'Azure endpoint found at %s' % endpoint_ip_address - report_diagnostic_event(msg) - LOG.debug(msg) + report_diagnostic_event( + 'Azure endpoint found at %s' % endpoint_ip_address, + logger_func=LOG.debug) return endpoint_ip_address @azure_ds_telemetry_reporter - def register_with_azure_and_fetch_data(self, pubkey_info=None): + def register_with_azure_and_fetch_data(self, pubkey_info=None) -> dict: """Gets the VM's GoalState from Azure, uses the GoalState information to report ready/send the ready signal/provisioning complete signal to Azure, and then uses pubkey_info to filter and obtain the user's @@ -737,30 +874,56 @@ class WALinuxAgentShim: GoalState. @return: The list of user's authorized pubkey values. """ - if self.openssl_manager is None: + http_client_certificate = None + if self.openssl_manager is None and pubkey_info is not None: self.openssl_manager = OpenSSLManager() + http_client_certificate = self.openssl_manager.certificate if self.azure_endpoint_client is None: self.azure_endpoint_client = AzureEndpointHttpClient( - self.openssl_manager.certificate) - goal_state = self._fetch_goal_state_from_azure() - ssh_keys = self._get_user_pubkeys(goal_state, pubkey_info) + http_client_certificate) + goal_state = self._fetch_goal_state_from_azure( + need_certificate=http_client_certificate is not None + ) + ssh_keys = None + if pubkey_info is not None: + ssh_keys = self._get_user_pubkeys(goal_state, pubkey_info) health_reporter = GoalStateHealthReporter( goal_state, self.azure_endpoint_client, self.endpoint) health_reporter.send_ready_signal() return {'public-keys': ssh_keys} @azure_ds_telemetry_reporter - def _fetch_goal_state_from_azure(self): + def register_with_azure_and_report_failure(self, description: str) -> None: + """Gets the VM's GoalState from Azure, uses the GoalState information + to report failure/send provisioning failure signal to Azure. + + @param: user visible error description of provisioning failure. + """ + if self.azure_endpoint_client is None: + self.azure_endpoint_client = AzureEndpointHttpClient(None) + goal_state = self._fetch_goal_state_from_azure(need_certificate=False) + health_reporter = GoalStateHealthReporter( + goal_state, self.azure_endpoint_client, self.endpoint) + health_reporter.send_failure_signal(description=description) + + @azure_ds_telemetry_reporter + def _fetch_goal_state_from_azure( + self, + need_certificate: bool) -> GoalState: """Fetches the GoalState XML from the Azure endpoint, parses the XML, and returns a GoalState object. + @param need_certificate: switch to know if certificates is needed. @return: GoalState object representing the GoalState XML """ unparsed_goal_state_xml = self._get_raw_goal_state_xml_from_azure() - return self._parse_raw_goal_state_xml(unparsed_goal_state_xml) + return self._parse_raw_goal_state_xml( + unparsed_goal_state_xml, + need_certificate + ) @azure_ds_telemetry_reporter - def _get_raw_goal_state_xml_from_azure(self): + def _get_raw_goal_state_xml_from_azure(self) -> str: """Fetches the GoalState XML from the Azure endpoint and returns the XML as a string. @@ -770,40 +933,51 @@ class WALinuxAgentShim: LOG.info('Registering with Azure...') url = 'http://{}/machine/?comp=goalstate'.format(self.endpoint) try: - response = self.azure_endpoint_client.get(url) + with events.ReportEventStack( + name="goalstate-retrieval", + description="retrieve goalstate", + parent=azure_ds_reporter): + response = self.azure_endpoint_client.get(url) except Exception as e: - msg = 'failed to register with Azure: %s' % e - LOG.warning(msg) - report_diagnostic_event(msg) + report_diagnostic_event( + 'failed to register with Azure and fetch GoalState XML: %s' + % e, logger_func=LOG.warning) raise LOG.debug('Successfully fetched GoalState XML.') return response.contents @azure_ds_telemetry_reporter - def _parse_raw_goal_state_xml(self, unparsed_goal_state_xml): + def _parse_raw_goal_state_xml( + self, + unparsed_goal_state_xml: str, + need_certificate: bool) -> GoalState: """Parses a GoalState XML string and returns a GoalState object. @param unparsed_goal_state_xml: GoalState XML string + @param need_certificate: switch to know if certificates is needed. @return: GoalState object representing the GoalState XML """ try: goal_state = GoalState( - unparsed_goal_state_xml, self.azure_endpoint_client) + unparsed_goal_state_xml, + self.azure_endpoint_client, + need_certificate + ) except Exception as e: - msg = 'Error processing GoalState XML: %s' % e - LOG.warning(msg) - report_diagnostic_event(msg) + report_diagnostic_event( + 'Error processing GoalState XML: %s' % e, + logger_func=LOG.warning) raise msg = ', '.join([ 'GoalState XML container id: %s' % goal_state.container_id, 'GoalState XML instance id: %s' % goal_state.instance_id, 'GoalState XML incarnation: %s' % goal_state.incarnation]) - LOG.debug(msg) - report_diagnostic_event(msg) + report_diagnostic_event(msg, logger_func=LOG.debug) return goal_state @azure_ds_telemetry_reporter - def _get_user_pubkeys(self, goal_state, pubkey_info): + def _get_user_pubkeys( + self, goal_state: GoalState, pubkey_info: list) -> list: """Gets and filters the VM admin user's authorized pubkeys. The admin user in this case is the username specified as "admin" @@ -838,7 +1012,7 @@ class WALinuxAgentShim: return ssh_keys @staticmethod - def _filter_pubkeys(keys_by_fingerprint, pubkey_info): + def _filter_pubkeys(keys_by_fingerprint: dict, pubkey_info: list) -> list: """ Filter and return only the user's actual pubkeys. @param keys_by_fingerprint: pubkey fingerprint -> pubkey value dict @@ -879,9 +1053,25 @@ def get_metadata_from_fabric(fallback_lease_file=None, dhcp_opts=None, shim.clean_up() +@azure_ds_telemetry_reporter +def report_failure_to_fabric(fallback_lease_file=None, dhcp_opts=None, + description=None): + shim = WALinuxAgentShim(fallback_lease_file=fallback_lease_file, + dhcp_options=dhcp_opts) + if not description: + description = DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE + try: + shim.register_with_azure_and_report_failure( + description=description) + finally: + shim.clean_up() + + def dhcp_log_cb(out, err): - report_diagnostic_event("dhclient output stream: %s" % out) - report_diagnostic_event("dhclient error stream: %s" % err) + report_diagnostic_event( + "dhclient output stream: %s" % out, logger_func=LOG.debug) + report_diagnostic_event( + "dhclient error stream: %s" % err, logger_func=LOG.debug) class EphemeralDHCPv4WithReporting: diff --git a/cloudinit/sources/helpers/digitalocean.py b/cloudinit/sources/helpers/digitalocean.py index b545c4d6..f9be4ecb 100644 --- a/cloudinit/sources/helpers/digitalocean.py +++ b/cloudinit/sources/helpers/digitalocean.py @@ -5,6 +5,7 @@ import json import random +from cloudinit import dmi from cloudinit import log as logging from cloudinit import net as cloudnet from cloudinit import url_helper @@ -195,11 +196,11 @@ def read_sysinfo(): # SMBIOS information # Detect if we are on DigitalOcean and return the Droplet's ID - vendor_name = util.read_dmi_data("system-manufacturer") + vendor_name = dmi.read_dmi_data("system-manufacturer") if vendor_name != "DigitalOcean": return (False, None) - droplet_id = util.read_dmi_data("system-serial-number") + droplet_id = dmi.read_dmi_data("system-serial-number") if droplet_id: LOG.debug("system identified via SMBIOS as DigitalOcean Droplet: %s", droplet_id) diff --git a/cloudinit/sources/helpers/hetzner.py b/cloudinit/sources/helpers/hetzner.py index 72edb023..33dc4c53 100644 --- a/cloudinit/sources/helpers/hetzner.py +++ b/cloudinit/sources/helpers/hetzner.py @@ -3,15 +3,12 @@ # # This file is part of cloud-init. See LICENSE file for license information. -from cloudinit import log as logging from cloudinit import url_helper from cloudinit import util import base64 import binascii -LOG = logging.getLogger(__name__) - def read_metadata(url, timeout=2, sec_between=2, retries=30): response = url_helper.readurl(url, timeout=timeout, diff --git a/cloudinit/sources/helpers/netlink.py b/cloudinit/sources/helpers/netlink.py index c2ad587b..e13d6834 100644 --- a/cloudinit/sources/helpers/netlink.py +++ b/cloudinit/sources/helpers/netlink.py @@ -185,6 +185,54 @@ def read_rta_oper_state(data): return InterfaceOperstate(ifname, operstate) +def wait_for_nic_attach_event(netlink_socket, existing_nics): + '''Block until a single nic is attached. + + :param: netlink_socket: netlink_socket to receive events + :param: existing_nics: List of existing nics so that we can skip them. + :raises: AssertionError if netlink_socket is none. + ''' + LOG.debug("Preparing to wait for nic attach.") + ifname = None + + def should_continue_cb(iname, carrier, prevCarrier): + if iname in existing_nics: + return True + nonlocal ifname + ifname = iname + return False + + # We can return even if the operational state of the new nic is DOWN + # because we set it to UP before doing dhcp. + read_netlink_messages(netlink_socket, + None, + [RTM_NEWLINK], + [OPER_UP, OPER_DOWN], + should_continue_cb) + return ifname + + +def wait_for_nic_detach_event(netlink_socket): + '''Block until a single nic is detached and its operational state is down. + + :param: netlink_socket: netlink_socket to receive events. + ''' + LOG.debug("Preparing to wait for nic detach.") + ifname = None + + def should_continue_cb(iname, carrier, prevCarrier): + nonlocal ifname + ifname = iname + return False + + read_netlink_messages(netlink_socket, + None, + [RTM_DELLINK], + [OPER_DOWN], + should_continue_cb) + return ifname + + def wait_for_media_disconnect_connect(netlink_socket, ifname): '''Block until media disconnect and connect has happened on an interface. Listens on netlink socket to receive netlink events and when the carrier @@ -198,10 +246,42 @@ def wait_for_media_disconnect_connect(netlink_socket, ifname): assert (netlink_socket is not None), ("netlink socket is none") assert (ifname is not None), ("interface name is none") assert (len(ifname) > 0), ("interface name cannot be empty") + + def should_continue_cb(iname, carrier, prevCarrier): + # check for carrier down, up sequence + isVnetSwitch = (prevCarrier == OPER_DOWN) and (carrier == OPER_UP) + if isVnetSwitch: + LOG.debug("Media switch happened on %s.", ifname) + return False + return True + + LOG.debug("Wait for media disconnect and reconnect to happen") + read_netlink_messages(netlink_socket, + ifname, + [RTM_NEWLINK, RTM_DELLINK], + [OPER_UP, OPER_DOWN], + should_continue_cb) + + +def read_netlink_messages(netlink_socket, + ifname_filter, + rtm_types, + operstates, + should_continue_callback): + ''' Reads from the netlink socket until the condition specified by + the continuation callback is met. + + :param: netlink_socket: netlink_socket to receive events. + :param: ifname_filter: if not None, will only listen for this interface. + :param: rtm_types: Type of netlink events to listen for. + :param: operstates: Operational states to listen. + :param: should_continue_callback: Specifies when to stop listening. + ''' + if netlink_socket is None: + raise RuntimeError("Netlink socket is none") + data = bytes() carrier = OPER_UP prevCarrier = OPER_UP - data = bytes() - LOG.debug("Wait for media disconnect and reconnect to happen") while True: recv_data = read_netlink_socket(netlink_socket, SELECT_TIMEOUT) if recv_data is None: @@ -223,26 +303,26 @@ def wait_for_media_disconnect_connect(netlink_socket, ifname): padlen = (nlheader.length+PAD_ALIGNMENT-1) & ~(PAD_ALIGNMENT-1) offset = offset + padlen LOG.debug('offset to next netlink message: %d', offset) - # Ignore any messages not new link or del link - if nlheader.type not in [RTM_NEWLINK, RTM_DELLINK]: + # Continue if we are not interested in this message. + if nlheader.type not in rtm_types: continue interface_state = read_rta_oper_state(nl_msg) if interface_state is None: LOG.debug('Failed to read rta attributes: %s', interface_state) continue - if interface_state.ifname != ifname: + if (ifname_filter is not None and + interface_state.ifname != ifname_filter): LOG.debug( "Ignored netlink event on interface %s. Waiting for %s.", - interface_state.ifname, ifname) + interface_state.ifname, ifname_filter) continue - if interface_state.operstate not in [OPER_UP, OPER_DOWN]: + if interface_state.operstate not in operstates: continue prevCarrier = carrier carrier = interface_state.operstate - # check for carrier down, up sequence - isVnetSwitch = (prevCarrier == OPER_DOWN) and (carrier == OPER_UP) - if isVnetSwitch: - LOG.debug("Media switch happened on %s.", ifname) + if not should_continue_callback(interface_state.ifname, + carrier, + prevCarrier): return data = data[offset:] diff --git a/cloudinit/sources/helpers/openstack.py b/cloudinit/sources/helpers/openstack.py index 65e020c5..3e6365f1 100644 --- a/cloudinit/sources/helpers/openstack.py +++ b/cloudinit/sources/helpers/openstack.py @@ -602,11 +602,17 @@ def convert_net_json(network_json=None, known_macs=None): elif network['type'] in ['ipv6_slaac', 'ipv6_dhcpv6-stateless', 'ipv6_dhcpv6-stateful']: subnet.update({'type': network['type']}) - elif network['type'] in ['ipv4', 'ipv6']: + elif network['type'] in ['ipv4', 'static']: subnet.update({ 'type': 'static', 'address': network.get('ip_address'), }) + elif network['type'] in ['ipv6', 'static6']: + cfg.update({'accept-ra': False}) + subnet.update({ + 'type': 'static6', + 'address': network.get('ip_address'), + }) # Enable accept_ra for stateful and legacy ipv6_dhcp types if network['type'] in ['ipv6_dhcpv6-stateful', 'ipv6_dhcp']: diff --git a/cloudinit/sources/helpers/tests/test_netlink.py b/cloudinit/sources/helpers/tests/test_netlink.py index 10760bd6..cafe3961 100644 --- a/cloudinit/sources/helpers/tests/test_netlink.py +++ b/cloudinit/sources/helpers/tests/test_netlink.py @@ -9,9 +9,10 @@ import codecs from cloudinit.sources.helpers.netlink import ( NetlinkCreateSocketError, create_bound_netlink_socket, read_netlink_socket, read_rta_oper_state, unpack_rta_attr, wait_for_media_disconnect_connect, + wait_for_nic_attach_event, wait_for_nic_detach_event, OPER_DOWN, OPER_UP, OPER_DORMANT, OPER_LOWERLAYERDOWN, OPER_NOTPRESENT, - OPER_TESTING, OPER_UNKNOWN, RTATTR_START_OFFSET, RTM_NEWLINK, RTM_SETLINK, - RTM_GETLINK, MAX_SIZE) + OPER_TESTING, OPER_UNKNOWN, RTATTR_START_OFFSET, RTM_NEWLINK, RTM_DELLINK, + RTM_SETLINK, RTM_GETLINK, MAX_SIZE) def int_to_bytes(i): @@ -135,6 +136,75 @@ class TestParseNetlinkMessage(CiTestCase): @mock.patch('cloudinit.sources.helpers.netlink.socket.socket') @mock.patch('cloudinit.sources.helpers.netlink.read_netlink_socket') +class TestNicAttachDetach(CiTestCase): + with_logs = True + + def _media_switch_data(self, ifname, msg_type, operstate): + '''construct netlink data with specified fields''' + if ifname and operstate is not None: + data = bytearray(48) + bytes = ifname.encode("utf-8") + struct.pack_into("HH4sHHc", data, RTATTR_START_OFFSET, 8, 3, + bytes, 5, 16, int_to_bytes(operstate)) + elif ifname: + data = bytearray(40) + bytes = ifname.encode("utf-8") + struct.pack_into("HH4s", data, RTATTR_START_OFFSET, 8, 3, bytes) + elif operstate: + data = bytearray(40) + struct.pack_into("HHc", data, RTATTR_START_OFFSET, 5, 16, + int_to_bytes(operstate)) + struct.pack_into("=LHHLL", data, 0, len(data), msg_type, 0, 0, 0) + return data + + def test_nic_attached_oper_down(self, m_read_netlink_socket, m_socket): + '''Test for a new nic attached''' + ifname = "eth0" + data_op_down = self._media_switch_data(ifname, RTM_NEWLINK, OPER_DOWN) + m_read_netlink_socket.side_effect = [data_op_down] + ifread = wait_for_nic_attach_event(m_socket, []) + self.assertEqual(m_read_netlink_socket.call_count, 1) + self.assertEqual(ifname, ifread) + + def test_nic_attached_oper_up(self, m_read_netlink_socket, m_socket): + '''Test for a new nic attached''' + ifname = "eth0" + data_op_up = self._media_switch_data(ifname, RTM_NEWLINK, OPER_UP) + m_read_netlink_socket.side_effect = [data_op_up] + ifread = wait_for_nic_attach_event(m_socket, []) + self.assertEqual(m_read_netlink_socket.call_count, 1) + self.assertEqual(ifname, ifread) + + def test_nic_attach_ignore_existing(self, m_read_netlink_socket, m_socket): + '''Test that we read only the interfaces we are interested in.''' + data_eth0 = self._media_switch_data("eth0", RTM_NEWLINK, OPER_DOWN) + data_eth1 = self._media_switch_data("eth1", RTM_NEWLINK, OPER_DOWN) + m_read_netlink_socket.side_effect = [data_eth0, data_eth1] + ifread = wait_for_nic_attach_event(m_socket, ["eth0"]) + self.assertEqual(m_read_netlink_socket.call_count, 2) + self.assertEqual("eth1", ifread) + + def test_nic_attach_read_first(self, m_read_netlink_socket, m_socket): + '''Test that we read only the interfaces we are interested in.''' + data_eth0 = self._media_switch_data("eth0", RTM_NEWLINK, OPER_DOWN) + data_eth1 = self._media_switch_data("eth1", RTM_NEWLINK, OPER_DOWN) + m_read_netlink_socket.side_effect = [data_eth0, data_eth1] + ifread = wait_for_nic_attach_event(m_socket, ["eth1"]) + self.assertEqual(m_read_netlink_socket.call_count, 1) + self.assertEqual("eth0", ifread) + + def test_nic_detached(self, m_read_netlink_socket, m_socket): + '''Test for an existing nic detached''' + ifname = "eth0" + data_op_down = self._media_switch_data(ifname, RTM_DELLINK, OPER_DOWN) + m_read_netlink_socket.side_effect = [data_op_down] + ifread = wait_for_nic_detach_event(m_socket) + self.assertEqual(m_read_netlink_socket.call_count, 1) + self.assertEqual(ifname, ifread) + + +@mock.patch('cloudinit.sources.helpers.netlink.socket.socket') +@mock.patch('cloudinit.sources.helpers.netlink.read_netlink_socket') class TestWaitForMediaDisconnectConnect(CiTestCase): with_logs = True diff --git a/cloudinit/sources/helpers/vmware/imc/config_nic.py b/cloudinit/sources/helpers/vmware/imc/config_nic.py index 3745a262..9cd2c0c0 100644 --- a/cloudinit/sources/helpers/vmware/imc/config_nic.py +++ b/cloudinit/sources/helpers/vmware/imc/config_nic.py @@ -275,6 +275,7 @@ class NicConfigurator(object): "# DO NOT EDIT THIS FILE BY HAND --" " AUTOMATICALLY GENERATED BY cloud-init", "source /etc/network/interfaces.d/*.cfg", + "source-directory /etc/network/interfaces.d", ] util.write_file(interfaceFile, content='\n'.join(lines)) diff --git a/cloudinit/sources/tests/test_oracle.py b/cloudinit/sources/tests/test_oracle.py index 7bd23813..a7bbdfd9 100644 --- a/cloudinit/sources/tests/test_oracle.py +++ b/cloudinit/sources/tests/test_oracle.py @@ -153,20 +153,20 @@ class TestDataSourceOracle: class TestIsPlatformViable(test_helpers.CiTestCase): - @mock.patch(DS_PATH + ".util.read_dmi_data", + @mock.patch(DS_PATH + ".dmi.read_dmi_data", return_value=oracle.CHASSIS_ASSET_TAG) def test_expected_viable(self, m_read_dmi_data): """System with known chassis tag is viable.""" self.assertTrue(oracle._is_platform_viable()) m_read_dmi_data.assert_has_calls([mock.call('chassis-asset-tag')]) - @mock.patch(DS_PATH + ".util.read_dmi_data", return_value=None) + @mock.patch(DS_PATH + ".dmi.read_dmi_data", return_value=None) def test_expected_not_viable_dmi_data_none(self, m_read_dmi_data): """System without known chassis tag is not viable.""" self.assertFalse(oracle._is_platform_viable()) m_read_dmi_data.assert_has_calls([mock.call('chassis-asset-tag')]) - @mock.patch(DS_PATH + ".util.read_dmi_data", return_value="LetsGoCubs") + @mock.patch(DS_PATH + ".dmi.read_dmi_data", return_value="LetsGoCubs") def test_expected_not_viable_other(self, m_read_dmi_data): """System with unnown chassis tag is not viable.""" self.assertFalse(oracle._is_platform_viable()) |