summaryrefslogtreecommitdiff
path: root/cloudinit
diff options
context:
space:
mode:
Diffstat (limited to 'cloudinit')
-rw-r--r--cloudinit/distros/networking.py15
-rw-r--r--cloudinit/distros/tests/test_networking.py31
-rwxr-xr-xcloudinit/sources/DataSourceAzure.py515
-rw-r--r--cloudinit/sources/helpers/netlink.py102
-rw-r--r--cloudinit/sources/helpers/tests/test_netlink.py74
5 files changed, 686 insertions, 51 deletions
diff --git a/cloudinit/distros/networking.py b/cloudinit/distros/networking.py
index e407fa29..c291196a 100644
--- a/cloudinit/distros/networking.py
+++ b/cloudinit/distros/networking.py
@@ -2,6 +2,7 @@ import abc
import logging
import os
+from cloudinit import subp
from cloudinit import net, util
@@ -175,6 +176,10 @@ class Networking(metaclass=abc.ABCMeta):
if strict:
raise RuntimeError(msg)
+ @abc.abstractmethod
+ def try_set_link_up(self, devname: DeviceName) -> bool:
+ """Try setting the link to up explicitly and return if it is up."""
+
class BSDNetworking(Networking):
"""Implementation of networking functionality shared across BSDs."""
@@ -185,6 +190,9 @@ class BSDNetworking(Networking):
def settle(self, *, exists=None) -> None:
"""BSD has no equivalent to `udevadm settle`; noop."""
+ def try_set_link_up(self, devname: DeviceName) -> bool:
+ raise NotImplementedError()
+
class LinuxNetworking(Networking):
"""Implementation of networking functionality common to Linux distros."""
@@ -214,3 +222,10 @@ class LinuxNetworking(Networking):
if exists is not None:
exists = net.sys_dev_path(exists)
util.udevadm_settle(exists=exists)
+
+ def try_set_link_up(self, devname: DeviceName) -> bool:
+ """Try setting the link to up explicitly and return if it is up.
+ Not guaranteed to bring the interface up. The caller is expected to
+ add wait times before retrying."""
+ subp.subp(['ip', 'link', 'set', devname, 'up'])
+ return self.is_up(devname)
diff --git a/cloudinit/distros/tests/test_networking.py b/cloudinit/distros/tests/test_networking.py
index b9a63842..ec508f4d 100644
--- a/cloudinit/distros/tests/test_networking.py
+++ b/cloudinit/distros/tests/test_networking.py
@@ -30,6 +30,9 @@ def generic_networking_cls():
def settle(self, *args, **kwargs):
raise NotImplementedError
+ def try_set_link_up(self, *args, **kwargs):
+ raise NotImplementedError
+
error = AssertionError("Unexpectedly used /sys in generic networking code")
with mock.patch(
"cloudinit.net.get_sys_class_path", side_effect=error,
@@ -74,6 +77,34 @@ class TestLinuxNetworkingIsPhysical:
assert LinuxNetworking().is_physical(devname)
+class TestBSDNetworkingTrySetLinkUp:
+ def test_raises_notimplementederror(self):
+ with pytest.raises(NotImplementedError):
+ BSDNetworking().try_set_link_up("eth0")
+
+
+@mock.patch("cloudinit.net.is_up")
+@mock.patch("cloudinit.distros.networking.subp.subp")
+class TestLinuxNetworkingTrySetLinkUp:
+ def test_calls_subp_return_true(self, m_subp, m_is_up):
+ devname = "eth0"
+ m_is_up.return_value = True
+ is_success = LinuxNetworking().try_set_link_up(devname)
+
+ assert (mock.call(['ip', 'link', 'set', devname, 'up']) ==
+ m_subp.call_args_list[-1])
+ assert is_success
+
+ def test_calls_subp_return_false(self, m_subp, m_is_up):
+ devname = "eth0"
+ m_is_up.return_value = False
+ is_success = LinuxNetworking().try_set_link_up(devname)
+
+ assert (mock.call(['ip', 'link', 'set', devname, 'up']) ==
+ m_subp.call_args_list[-1])
+ assert not is_success
+
+
class TestBSDNetworkingSettle:
def test_settle_doesnt_error(self):
# This also implicitly tests that it doesn't use subp.subp
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index f777a007..04ff2131 100755
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -12,8 +12,10 @@ import os
import os.path
import re
from time import time
+from time import sleep
from xml.dom import minidom
import xml.etree.ElementTree as ET
+from enum import Enum
from cloudinit import dmi
from cloudinit import log as logging
@@ -67,13 +69,27 @@ DEFAULT_FS = 'ext4'
# DMI chassis-asset-tag is set static for all azure instances
AZURE_CHASSIS_ASSET_TAG = '7783-7084-3265-9085-8269-3286-77'
REPROVISION_MARKER_FILE = "/var/lib/cloud/data/poll_imds"
+REPROVISION_NIC_ATTACH_MARKER_FILE = "/var/lib/cloud/data/wait_for_nic_attach"
+REPROVISION_NIC_DETACHED_MARKER_FILE = "/var/lib/cloud/data/nic_detached"
REPORTED_READY_MARKER_FILE = "/var/lib/cloud/data/reported_ready"
AGENT_SEED_DIR = '/var/lib/waagent'
+
# In the event where the IMDS primary server is not
# available, it takes 1s to fallback to the secondary one
IMDS_TIMEOUT_IN_SECONDS = 2
IMDS_URL = "http://169.254.169.254/metadata/"
+IMDS_VER = "2019-06-01"
+IMDS_VER_PARAM = "api-version={}".format(IMDS_VER)
+
+
+class metadata_type(Enum):
+ compute = "{}instance?{}".format(IMDS_URL, IMDS_VER_PARAM)
+ network = "{}instance/network?{}".format(IMDS_URL,
+ IMDS_VER_PARAM)
+ reprovisiondata = "{}reprovisiondata?{}".format(IMDS_URL,
+ IMDS_VER_PARAM)
+
PLATFORM_ENTROPY_SOURCE = "/sys/firmware/acpi/tables/OEM0"
@@ -434,20 +450,39 @@ class DataSourceAzure(sources.DataSource):
# need to look in the datadir and consider that valid
ddir = self.ds_cfg['data_dir']
+ # The order in which the candidates are inserted matters here, because
+ # it determines the value of ret. More specifically, the first one in
+ # the candidate list determines the path to take in order to get the
+ # metadata we need.
candidates = [self.seed_dir]
if os.path.isfile(REPROVISION_MARKER_FILE):
candidates.insert(0, "IMDS")
+ report_diagnostic_event("Reprovision marker file already present "
+ "before crawling Azure metadata: %s" %
+ REPROVISION_MARKER_FILE,
+ logger_func=LOG.debug)
+ elif os.path.isfile(REPROVISION_NIC_ATTACH_MARKER_FILE):
+ candidates.insert(0, "NIC_ATTACH_MARKER_PRESENT")
+ report_diagnostic_event("Reprovision nic attach marker file "
+ "already present before crawling Azure "
+ "metadata: %s" %
+ REPROVISION_NIC_ATTACH_MARKER_FILE,
+ logger_func=LOG.debug)
candidates.extend(list_possible_azure_ds_devs())
if ddir:
candidates.append(ddir)
found = None
reprovision = False
+ reprovision_after_nic_attach = False
for cdev in candidates:
try:
if cdev == "IMDS":
ret = None
reprovision = True
+ elif cdev == "NIC_ATTACH_MARKER_PRESENT":
+ ret = None
+ reprovision_after_nic_attach = True
elif cdev.startswith("/dev/"):
if util.is_FreeBSD():
ret = util.mount_cb(cdev, load_azure_ds_dir,
@@ -472,12 +507,19 @@ class DataSourceAzure(sources.DataSource):
continue
perform_reprovision = reprovision or self._should_reprovision(ret)
- if perform_reprovision:
+ perform_reprovision_after_nic_attach = (
+ reprovision_after_nic_attach or
+ self._should_reprovision_after_nic_attach(ret))
+
+ if perform_reprovision or perform_reprovision_after_nic_attach:
if util.is_FreeBSD():
msg = "Free BSD is not supported for PPS VMs"
report_diagnostic_event(msg, logger_func=LOG.error)
raise sources.InvalidMetaDataException(msg)
+ if perform_reprovision_after_nic_attach:
+ self._wait_for_all_nics_ready()
ret = self._reprovision()
+
imds_md = get_metadata_from_imds(
self.fallback_interface, retries=10)
(md, userdata_raw, cfg, files) = ret
@@ -508,7 +550,7 @@ class DataSourceAzure(sources.DataSource):
crawled_data['metadata']['random_seed'] = seed
crawled_data['metadata']['instance-id'] = self._iid()
- if perform_reprovision:
+ if perform_reprovision or perform_reprovision_after_nic_attach:
LOG.info("Reporting ready to Azure after getting ReprovisionData")
use_cached_ephemeral = (
self.distro.networking.is_up(self.fallback_interface) and
@@ -617,14 +659,12 @@ class DataSourceAzure(sources.DataSource):
LOG.debug('Retrieved SSH keys from IMDS')
except KeyError:
log_msg = 'Unable to get keys from IMDS, falling back to OVF'
- LOG.debug(log_msg)
report_diagnostic_event(log_msg, logger_func=LOG.debug)
try:
ssh_keys = self.metadata['public-keys']
LOG.debug('Retrieved keys from OVF')
except KeyError:
log_msg = 'No keys available from OVF'
- LOG.debug(log_msg)
report_diagnostic_event(log_msg, logger_func=LOG.debug)
return ssh_keys
@@ -660,10 +700,293 @@ class DataSourceAzure(sources.DataSource):
LOG.debug("negotiating already done for %s",
self.get_instance_id())
+ @azure_ds_telemetry_reporter
+ def _wait_for_nic_detach(self, nl_sock):
+ """Use the netlink socket provided to wait for nic detach event.
+ NOTE: The function doesn't close the socket. The caller owns closing
+ the socket and disposing it safely.
+ """
+ try:
+ ifname = None
+
+ # Preprovisioned VM will only have one NIC, and it gets
+ # detached immediately after deployment.
+ with events.ReportEventStack(
+ name="wait-for-nic-detach",
+ description=("wait for nic detach"),
+ parent=azure_ds_reporter):
+ ifname = netlink.wait_for_nic_detach_event(nl_sock)
+ if ifname is None:
+ msg = ("Preprovisioned nic not detached as expected. "
+ "Proceeding without failing.")
+ report_diagnostic_event(msg, logger_func=LOG.warning)
+ else:
+ report_diagnostic_event("The preprovisioned nic %s is detached"
+ % ifname, logger_func=LOG.warning)
+ path = REPROVISION_NIC_DETACHED_MARKER_FILE
+ LOG.info("Creating a marker file for nic detached: %s", path)
+ util.write_file(path, "{pid}: {time}\n".format(
+ pid=os.getpid(), time=time()))
+ except AssertionError as error:
+ report_diagnostic_event(error, logger_func=LOG.error)
+ raise
+
+ @azure_ds_telemetry_reporter
+ def wait_for_link_up(self, ifname):
+ """In cases where the link state is still showing down after a nic is
+ hot-attached, we can attempt to bring it up by forcing the hv_netvsc
+ drivers to query the link state by unbinding and then binding the
+ device. This function attempts infinitely until the link is up,
+ because we cannot proceed further until we have a stable link."""
+
+ if self.distro.networking.try_set_link_up(ifname):
+ report_diagnostic_event("The link %s is already up." % ifname,
+ logger_func=LOG.info)
+ return
+
+ LOG.info("Attempting to bring %s up", ifname)
+
+ attempts = 0
+ while True:
+
+ LOG.info("Unbinding and binding the interface %s", ifname)
+ devicename = net.read_sys_net(ifname,
+ 'device/device_id').strip('{}')
+ util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/unbind',
+ devicename)
+ util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/bind',
+ devicename)
+
+ attempts = attempts + 1
+ if self.distro.networking.try_set_link_up(ifname):
+ msg = "The link %s is up after %s attempts" % (ifname,
+ attempts)
+ report_diagnostic_event(msg, logger_func=LOG.info)
+ return
+
+ sleep_duration = 1
+ msg = ("Link is not up after %d attempts with %d seconds sleep "
+ "between attempts." % (attempts, sleep_duration))
+
+ if attempts % 10 == 0:
+ report_diagnostic_event(msg, logger_func=LOG.info)
+ else:
+ LOG.info(msg)
+
+ sleep(sleep_duration)
+
+ @azure_ds_telemetry_reporter
+ def _create_report_ready_marker(self):
+ path = REPORTED_READY_MARKER_FILE
+ LOG.info(
+ "Creating a marker file to report ready: %s", path)
+ util.write_file(path, "{pid}: {time}\n".format(
+ pid=os.getpid(), time=time()))
+ report_diagnostic_event(
+ 'Successfully created reported ready marker file '
+ 'while in the preprovisioning pool.',
+ logger_func=LOG.debug)
+
+ @azure_ds_telemetry_reporter
+ def _report_ready_if_needed(self):
+ """Report ready to the platform if the marker file is not present,
+ and create the marker file.
+ """
+ have_not_reported_ready = (
+ not os.path.isfile(REPORTED_READY_MARKER_FILE))
+
+ if have_not_reported_ready:
+ report_diagnostic_event("Reporting ready before nic detach",
+ logger_func=LOG.info)
+ try:
+ with EphemeralDHCPv4WithReporting(azure_ds_reporter) as lease:
+ self._report_ready(lease=lease)
+ except Exception as e:
+ report_diagnostic_event("Exception reporting ready during "
+ "preprovisioning before nic detach: %s"
+ % e, logger_func=LOG.error)
+ raise
+ self._create_report_ready_marker()
+ else:
+ report_diagnostic_event("Already reported ready before nic detach."
+ " The marker file already exists: %s" %
+ REPORTED_READY_MARKER_FILE,
+ logger_func=LOG.error)
+
+ @azure_ds_telemetry_reporter
+ def _check_if_nic_is_primary(self, ifname):
+ """Check if a given interface is the primary nic or not. If it is the
+ primary nic, then we also get the expected total nic count from IMDS.
+ IMDS will process the request and send a response only for primary NIC.
+ """
+ is_primary = False
+ expected_nic_count = -1
+ imds_md = None
+
+ # For now, only a VM's primary NIC can contact IMDS and WireServer. If
+ # DHCP fails for a NIC, we have no mechanism to determine if the NIC is
+ # primary or secondary. In this case, the desired behavior is to fail
+ # VM provisioning if there is any DHCP failure when trying to determine
+ # the primary NIC.
+ try:
+ with events.ReportEventStack(
+ name="obtain-dhcp-lease",
+ description=("obtain dhcp lease for %s when attempting to "
+ "determine primary NIC during reprovision of "
+ "a pre-provisioned VM" % ifname),
+ parent=azure_ds_reporter):
+ dhcp_ctx = EphemeralDHCPv4(
+ iface=ifname,
+ dhcp_log_func=dhcp_log_cb)
+ dhcp_ctx.obtain_lease()
+ except Exception as e:
+ report_diagnostic_event("Giving up. Failed to obtain dhcp lease "
+ "for %s when attempting to determine "
+ "primary NIC during reprovision due to %s"
+ % (ifname, e), logger_func=LOG.error)
+ raise
+
+ # Primary nic detection will be optimized in the future. The fact that
+ # primary nic is being attached first helps here. Otherwise each nic
+ # could add several seconds of delay.
+ try:
+ imds_md = get_metadata_from_imds(
+ ifname,
+ 5,
+ metadata_type.network)
+ except Exception as e:
+ LOG.warning(
+ "Failed to get network metadata using nic %s. Attempt to "
+ "contact IMDS failed with error %s. Assuming this is not the "
+ "primary nic.", ifname, e)
+ finally:
+ # If we are not the primary nic, then clean the dhcp context.
+ if imds_md is None:
+ dhcp_ctx.clean_network()
+
+ if imds_md is not None:
+ # Only primary NIC will get a response from IMDS.
+ LOG.info("%s is the primary nic", ifname)
+ is_primary = True
+
+ # If primary, set ephemeral dhcp ctx so we can report ready
+ self._ephemeral_dhcp_ctx = dhcp_ctx
+
+ # Set the expected nic count based on the response received.
+ expected_nic_count = len(
+ imds_md['interface'])
+ report_diagnostic_event("Expected nic count: %d" %
+ expected_nic_count, logger_func=LOG.info)
+
+ return is_primary, expected_nic_count
+
+ @azure_ds_telemetry_reporter
+ def _wait_for_hot_attached_nics(self, nl_sock):
+ """Wait until all the expected nics for the vm are hot-attached.
+ The expected nic count is obtained by requesting the network metadata
+ from IMDS.
+ """
+ LOG.info("Waiting for nics to be hot-attached")
+ try:
+ # Wait for nics to be attached one at a time, until we know for
+ # sure that all nics have been attached.
+ nics_found = []
+ primary_nic_found = False
+ expected_nic_count = -1
+
+ # Wait for netlink nic attach events. After the first nic is
+ # attached, we are already in the customer vm deployment path and
+ # so eerything from then on should happen fast and avoid
+ # unnecessary delays wherever possible.
+ while True:
+ ifname = None
+ with events.ReportEventStack(
+ name="wait-for-nic-attach",
+ description=("wait for nic attach after %d nics have "
+ "been attached" % len(nics_found)),
+ parent=azure_ds_reporter):
+ ifname = netlink.wait_for_nic_attach_event(nl_sock,
+ nics_found)
+
+ # wait_for_nic_attach_event guarantees that ifname it not None
+ nics_found.append(ifname)
+ report_diagnostic_event("Detected nic %s attached." % ifname,
+ logger_func=LOG.info)
+
+ # Attempt to bring the interface's operating state to
+ # UP in case it is not already.
+ self.wait_for_link_up(ifname)
+
+ # If primary nic is not found, check if this is it. The
+ # platform will attach the primary nic first so we
+ # won't be in primary_nic_found = false state for long.
+ if not primary_nic_found:
+ LOG.info("Checking if %s is the primary nic",
+ ifname)
+ (primary_nic_found, expected_nic_count) = (
+ self._check_if_nic_is_primary(ifname))
+
+ # Exit criteria: check if we've discovered all nics
+ if (expected_nic_count != -1
+ and len(nics_found) >= expected_nic_count):
+ LOG.info("Found all the nics for this VM.")
+ break
+
+ except AssertionError as error:
+ report_diagnostic_event(error, logger_func=LOG.error)
+
+ @azure_ds_telemetry_reporter
+ def _wait_for_all_nics_ready(self):
+ """Wait for nic(s) to be hot-attached. There may be multiple nics
+ depending on the customer request.
+ But only primary nic would be able to communicate with wireserver
+ and IMDS. So we detect and save the primary nic to be used later.
+ """
+
+ nl_sock = None
+ try:
+ nl_sock = netlink.create_bound_netlink_socket()
+
+ report_ready_marker_present = bool(
+ os.path.isfile(REPORTED_READY_MARKER_FILE))
+
+ # Report ready if the marker file is not already present.
+ # The nic of the preprovisioned vm gets hot-detached as soon as
+ # we report ready. So no need to save the dhcp context.
+ self._report_ready_if_needed()
+
+ has_nic_been_detached = bool(
+ os.path.isfile(REPROVISION_NIC_DETACHED_MARKER_FILE))
+
+ if not has_nic_been_detached:
+ LOG.info("NIC has not been detached yet.")
+ self._wait_for_nic_detach(nl_sock)
+
+ # If we know that the preprovisioned nic has been detached, and we
+ # still have a fallback nic, then it means the VM must have
+ # rebooted as part of customer assignment, and all the nics have
+ # already been attached by the Azure platform. So there is no need
+ # to wait for nics to be hot-attached.
+ if not self.fallback_interface:
+ self._wait_for_hot_attached_nics(nl_sock)
+ else:
+ report_diagnostic_event("Skipping waiting for nic attach "
+ "because we already have a fallback "
+ "interface. Report Ready marker "
+ "present before detaching nics: %s" %
+ report_ready_marker_present,
+ logger_func=LOG.info)
+ except netlink.NetlinkCreateSocketError as e:
+ report_diagnostic_event(e, logger_func=LOG.warning)
+ raise
+ finally:
+ if nl_sock:
+ nl_sock.close()
+
def _poll_imds(self):
"""Poll IMDS for the new provisioning data until we get a valid
response. Then return the returned JSON object."""
- url = IMDS_URL + "reprovisiondata?api-version=2017-04-02"
+ url = metadata_type.reprovisiondata.value
headers = {"Metadata": "true"}
nl_sock = None
report_ready = bool(not os.path.isfile(REPORTED_READY_MARKER_FILE))
@@ -705,17 +1028,31 @@ class DataSourceAzure(sources.DataSource):
logger_func=LOG.warning)
return False
- LOG.debug("Wait for vnetswitch to happen")
+ # When the interface is hot-attached, we would have already
+ # done dhcp and set the dhcp context. In that case, skip
+ # the attempt to do dhcp.
+ is_ephemeral_ctx_present = self._ephemeral_dhcp_ctx is not None
+ msg = ("Unexpected error. Dhcp context is not expected to be already "
+ "set when we need to wait for vnet switch")
+ if is_ephemeral_ctx_present and report_ready:
+ report_diagnostic_event(msg, logger_func=LOG.error)
+ raise RuntimeError(msg)
+
while True:
try:
- # Save our EphemeralDHCPv4 context to avoid repeated dhcp
- with events.ReportEventStack(
- name="obtain-dhcp-lease",
- description="obtain dhcp lease",
- parent=azure_ds_reporter):
- self._ephemeral_dhcp_ctx = EphemeralDHCPv4(
- dhcp_log_func=dhcp_log_cb)
- lease = self._ephemeral_dhcp_ctx.obtain_lease()
+ # Since is_ephemeral_ctx_present is set only once, this ensures
+ # that with regular reprovisioning, dhcp is always done every
+ # time the loop runs.
+ if not is_ephemeral_ctx_present:
+ # Save our EphemeralDHCPv4 context to avoid repeated dhcp
+ # later when we report ready
+ with events.ReportEventStack(
+ name="obtain-dhcp-lease",
+ description="obtain dhcp lease",
+ parent=azure_ds_reporter):
+ self._ephemeral_dhcp_ctx = EphemeralDHCPv4(
+ dhcp_log_func=dhcp_log_cb)
+ lease = self._ephemeral_dhcp_ctx.obtain_lease()
if vnet_switched:
dhcp_attempts += 1
@@ -737,17 +1074,10 @@ class DataSourceAzure(sources.DataSource):
self._ephemeral_dhcp_ctx.clean_network()
raise sources.InvalidMetaDataException(msg)
- path = REPORTED_READY_MARKER_FILE
- LOG.info(
- "Creating a marker file to report ready: %s", path)
- util.write_file(path, "{pid}: {time}\n".format(
- pid=os.getpid(), time=time()))
- report_diagnostic_event(
- 'Successfully created reported ready marker file '
- 'while in the preprovisioning pool.',
- logger_func=LOG.debug)
+ self._create_report_ready_marker()
report_ready = False
+ LOG.debug("Wait for vnetswitch to happen")
with events.ReportEventStack(
name="wait-for-media-disconnect-connect",
description="wait for vnet switch",
@@ -863,6 +1193,35 @@ class DataSourceAzure(sources.DataSource):
"connectivity issues: %s" % e, logger_func=LOG.warning)
return False
+ def _should_reprovision_after_nic_attach(self, candidate_metadata) -> bool:
+ """Whether or not we should wait for nic attach and then poll
+ IMDS for reprovisioning data. Also sets a marker file to poll IMDS.
+
+ The marker file is used for the following scenario: the VM boots into
+ wait for nic attach, which we expect to be proceeding infinitely until
+ the nic is attached. If for whatever reason the platform moves us to a
+ new host (for instance a hardware issue), we need to keep waiting.
+ However, since the VM reports ready to the Fabric, we will not attach
+ the ISO, thus cloud-init needs to have a way of knowing that it should
+ jump back into the waiting mode in order to retrieve the ovf_env.
+
+ @param candidate_metadata: Metadata obtained from reading ovf-env.
+ @return: Whether to reprovision after waiting for nics to be attached.
+ """
+ if not candidate_metadata:
+ return False
+ (_md, _userdata_raw, cfg, _files) = candidate_metadata
+ path = REPROVISION_NIC_ATTACH_MARKER_FILE
+ if (cfg.get('PreprovisionedVMType', None) == "Savable" or
+ os.path.isfile(path)):
+ if not os.path.isfile(path):
+ LOG.info("Creating a marker file to wait for nic attach: %s",
+ path)
+ util.write_file(path, "{pid}: {time}\n".format(
+ pid=os.getpid(), time=time()))
+ return True
+ return False
+
def _should_reprovision(self, ret):
"""Whether or not we should poll IMDS for reprovisioning data.
Also sets a marker file to poll IMDS.
@@ -879,6 +1238,7 @@ class DataSourceAzure(sources.DataSource):
(_md, _userdata_raw, cfg, _files) = ret
path = REPROVISION_MARKER_FILE
if (cfg.get('PreprovisionedVm') is True or
+ cfg.get('PreprovisionedVMType', None) == 'Running' or
os.path.isfile(path)):
if not os.path.isfile(path):
LOG.info("Creating a marker file to poll imds: %s",
@@ -944,6 +1304,8 @@ class DataSourceAzure(sources.DataSource):
util.del_file(REPORTED_READY_MARKER_FILE)
util.del_file(REPROVISION_MARKER_FILE)
+ util.del_file(REPROVISION_NIC_ATTACH_MARKER_FILE)
+ util.del_file(REPROVISION_NIC_DETACHED_MARKER_FILE)
return fabric_data
@azure_ds_telemetry_reporter
@@ -1399,34 +1761,109 @@ def read_azure_ovf(contents):
if 'ssh_pwauth' not in cfg and password:
cfg['ssh_pwauth'] = True
- cfg['PreprovisionedVm'] = _extract_preprovisioned_vm_setting(dom)
+ preprovisioning_cfg = _get_preprovisioning_cfgs(dom)
+ cfg = util.mergemanydict([cfg, preprovisioning_cfg])
return (md, ud, cfg)
@azure_ds_telemetry_reporter
-def _extract_preprovisioned_vm_setting(dom):
- """Read the preprovision flag from the ovf. It should not
- exist unless true."""
+def _get_preprovisioning_cfgs(dom):
+ """Read the preprovisioning related flags from ovf and populates a dict
+ with the info.
+
+ Two flags are in use today: PreprovisionedVm bool and
+ PreprovisionedVMType enum. In the long term, the PreprovisionedVm bool
+ will be deprecated in favor of PreprovisionedVMType string/enum.
+
+ Only these combinations of values are possible today:
+ - PreprovisionedVm=True and PreprovisionedVMType=Running
+ - PreprovisionedVm=False and PreprovisionedVMType=Savable
+ - PreprovisionedVm is missing and PreprovisionedVMType=Running/Savable
+ - PreprovisionedVm=False and PreprovisionedVMType is missing
+
+ More specifically, this will never happen:
+ - PreprovisionedVm=True and PreprovisionedVMType=Savable
+ """
+ cfg = {
+ "PreprovisionedVm": False,
+ "PreprovisionedVMType": None
+ }
+
platform_settings_section = find_child(
dom.documentElement,
lambda n: n.localName == "PlatformSettingsSection")
if not platform_settings_section or len(platform_settings_section) == 0:
LOG.debug("PlatformSettingsSection not found")
- return False
+ return cfg
platform_settings = find_child(
platform_settings_section[0],
lambda n: n.localName == "PlatformSettings")
if not platform_settings or len(platform_settings) == 0:
LOG.debug("PlatformSettings not found")
- return False
- preprovisionedVm = find_child(
+ return cfg
+
+ # Read the PreprovisionedVm bool flag. This should be deprecated when the
+ # platform has removed PreprovisionedVm and only surfaces
+ # PreprovisionedVMType.
+ cfg["PreprovisionedVm"] = _get_preprovisionedvm_cfg_value(
+ platform_settings)
+
+ cfg["PreprovisionedVMType"] = _get_preprovisionedvmtype_cfg_value(
+ platform_settings)
+ return cfg
+
+
+@azure_ds_telemetry_reporter
+def _get_preprovisionedvm_cfg_value(platform_settings):
+ preprovisionedVm = False
+
+ # Read the PreprovisionedVm bool flag. This should be deprecated when the
+ # platform has removed PreprovisionedVm and only surfaces
+ # PreprovisionedVMType.
+ preprovisionedVmVal = find_child(
platform_settings[0],
lambda n: n.localName == "PreprovisionedVm")
- if not preprovisionedVm or len(preprovisionedVm) == 0:
+ if not preprovisionedVmVal or len(preprovisionedVmVal) == 0:
LOG.debug("PreprovisionedVm not found")
- return False
- return util.translate_bool(preprovisionedVm[0].firstChild.nodeValue)
+ return preprovisionedVm
+ preprovisionedVm = util.translate_bool(
+ preprovisionedVmVal[0].firstChild.nodeValue)
+
+ report_diagnostic_event(
+ "PreprovisionedVm: %s" % preprovisionedVm, logger_func=LOG.info)
+
+ return preprovisionedVm
+
+
+@azure_ds_telemetry_reporter
+def _get_preprovisionedvmtype_cfg_value(platform_settings):
+ preprovisionedVMType = None
+
+ # Read the PreprovisionedVMType value from the ovf. It can be
+ # 'Running' or 'Savable' or not exist. This enum value is intended to
+ # replace PreprovisionedVm bool flag in the long term.
+ # A Running VM is the same as preprovisioned VMs of today. This is
+ # equivalent to having PreprovisionedVm=True.
+ # A Savable VM is one whose nic is hot-detached immediately after it
+ # reports ready the first time to free up the network resources.
+ # Once assigned to customer, the customer-requested nics are
+ # hot-attached to it and reprovision happens like today.
+ preprovisionedVMTypeVal = find_child(
+ platform_settings[0],
+ lambda n: n.localName == "PreprovisionedVMType")
+ if (not preprovisionedVMTypeVal or len(preprovisionedVMTypeVal) == 0 or
+ preprovisionedVMTypeVal[0].firstChild is None):
+ LOG.debug("PreprovisionedVMType not found")
+ return preprovisionedVMType
+
+ preprovisionedVMType = preprovisionedVMTypeVal[0].firstChild.nodeValue
+
+ report_diagnostic_event(
+ "PreprovisionedVMType: %s" % preprovisionedVMType,
+ logger_func=LOG.info)
+
+ return preprovisionedVMType
def encrypt_pass(password, salt_id="$6$"):
@@ -1588,8 +2025,10 @@ def _generate_network_config_from_fallback_config() -> dict:
@azure_ds_telemetry_reporter
-def get_metadata_from_imds(fallback_nic, retries):
- """Query Azure's network metadata service, returning a dictionary.
+def get_metadata_from_imds(fallback_nic,
+ retries,
+ md_type=metadata_type.compute):
+ """Query Azure's instance metadata service, returning a dictionary.
If network is not up, setup ephemeral dhcp on fallback_nic to talk to the
IMDS. For more info on IMDS:
@@ -1604,7 +2043,7 @@ def get_metadata_from_imds(fallback_nic, retries):
"""
kwargs = {'logfunc': LOG.debug,
'msg': 'Crawl of Azure Instance Metadata Service (IMDS)',
- 'func': _get_metadata_from_imds, 'args': (retries,)}
+ 'func': _get_metadata_from_imds, 'args': (retries, md_type,)}
if net.is_up(fallback_nic):
return util.log_time(**kwargs)
else:
@@ -1620,9 +2059,9 @@ def get_metadata_from_imds(fallback_nic, retries):
@azure_ds_telemetry_reporter
-def _get_metadata_from_imds(retries):
+def _get_metadata_from_imds(retries, md_type=metadata_type.compute):
- url = IMDS_URL + "instance?api-version=2019-06-01"
+ url = md_type.value
headers = {"Metadata": "true"}
try:
response = readurl(
diff --git a/cloudinit/sources/helpers/netlink.py b/cloudinit/sources/helpers/netlink.py
index c2ad587b..e13d6834 100644
--- a/cloudinit/sources/helpers/netlink.py
+++ b/cloudinit/sources/helpers/netlink.py
@@ -185,6 +185,54 @@ def read_rta_oper_state(data):
return InterfaceOperstate(ifname, operstate)
+def wait_for_nic_attach_event(netlink_socket, existing_nics):
+ '''Block until a single nic is attached.
+
+ :param: netlink_socket: netlink_socket to receive events
+ :param: existing_nics: List of existing nics so that we can skip them.
+ :raises: AssertionError if netlink_socket is none.
+ '''
+ LOG.debug("Preparing to wait for nic attach.")
+ ifname = None
+
+ def should_continue_cb(iname, carrier, prevCarrier):
+ if iname in existing_nics:
+ return True
+ nonlocal ifname
+ ifname = iname
+ return False
+
+ # We can return even if the operational state of the new nic is DOWN
+ # because we set it to UP before doing dhcp.
+ read_netlink_messages(netlink_socket,
+ None,
+ [RTM_NEWLINK],
+ [OPER_UP, OPER_DOWN],
+ should_continue_cb)
+ return ifname
+
+
+def wait_for_nic_detach_event(netlink_socket):
+ '''Block until a single nic is detached and its operational state is down.
+
+ :param: netlink_socket: netlink_socket to receive events.
+ '''
+ LOG.debug("Preparing to wait for nic detach.")
+ ifname = None
+
+ def should_continue_cb(iname, carrier, prevCarrier):
+ nonlocal ifname
+ ifname = iname
+ return False
+
+ read_netlink_messages(netlink_socket,
+ None,
+ [RTM_DELLINK],
+ [OPER_DOWN],
+ should_continue_cb)
+ return ifname
+
+
def wait_for_media_disconnect_connect(netlink_socket, ifname):
'''Block until media disconnect and connect has happened on an interface.
Listens on netlink socket to receive netlink events and when the carrier
@@ -198,10 +246,42 @@ def wait_for_media_disconnect_connect(netlink_socket, ifname):
assert (netlink_socket is not None), ("netlink socket is none")
assert (ifname is not None), ("interface name is none")
assert (len(ifname) > 0), ("interface name cannot be empty")
+
+ def should_continue_cb(iname, carrier, prevCarrier):
+ # check for carrier down, up sequence
+ isVnetSwitch = (prevCarrier == OPER_DOWN) and (carrier == OPER_UP)
+ if isVnetSwitch:
+ LOG.debug("Media switch happened on %s.", ifname)
+ return False
+ return True
+
+ LOG.debug("Wait for media disconnect and reconnect to happen")
+ read_netlink_messages(netlink_socket,
+ ifname,
+ [RTM_NEWLINK, RTM_DELLINK],
+ [OPER_UP, OPER_DOWN],
+ should_continue_cb)
+
+
+def read_netlink_messages(netlink_socket,
+ ifname_filter,
+ rtm_types,
+ operstates,
+ should_continue_callback):
+ ''' Reads from the netlink socket until the condition specified by
+ the continuation callback is met.
+
+ :param: netlink_socket: netlink_socket to receive events.
+ :param: ifname_filter: if not None, will only listen for this interface.
+ :param: rtm_types: Type of netlink events to listen for.
+ :param: operstates: Operational states to listen.
+ :param: should_continue_callback: Specifies when to stop listening.
+ '''
+ if netlink_socket is None:
+ raise RuntimeError("Netlink socket is none")
+ data = bytes()
carrier = OPER_UP
prevCarrier = OPER_UP
- data = bytes()
- LOG.debug("Wait for media disconnect and reconnect to happen")
while True:
recv_data = read_netlink_socket(netlink_socket, SELECT_TIMEOUT)
if recv_data is None:
@@ -223,26 +303,26 @@ def wait_for_media_disconnect_connect(netlink_socket, ifname):
padlen = (nlheader.length+PAD_ALIGNMENT-1) & ~(PAD_ALIGNMENT-1)
offset = offset + padlen
LOG.debug('offset to next netlink message: %d', offset)
- # Ignore any messages not new link or del link
- if nlheader.type not in [RTM_NEWLINK, RTM_DELLINK]:
+ # Continue if we are not interested in this message.
+ if nlheader.type not in rtm_types:
continue
interface_state = read_rta_oper_state(nl_msg)
if interface_state is None:
LOG.debug('Failed to read rta attributes: %s', interface_state)
continue
- if interface_state.ifname != ifname:
+ if (ifname_filter is not None and
+ interface_state.ifname != ifname_filter):
LOG.debug(
"Ignored netlink event on interface %s. Waiting for %s.",
- interface_state.ifname, ifname)
+ interface_state.ifname, ifname_filter)
continue
- if interface_state.operstate not in [OPER_UP, OPER_DOWN]:
+ if interface_state.operstate not in operstates:
continue
prevCarrier = carrier
carrier = interface_state.operstate
- # check for carrier down, up sequence
- isVnetSwitch = (prevCarrier == OPER_DOWN) and (carrier == OPER_UP)
- if isVnetSwitch:
- LOG.debug("Media switch happened on %s.", ifname)
+ if not should_continue_callback(interface_state.ifname,
+ carrier,
+ prevCarrier):
return
data = data[offset:]
diff --git a/cloudinit/sources/helpers/tests/test_netlink.py b/cloudinit/sources/helpers/tests/test_netlink.py
index 10760bd6..cafe3961 100644
--- a/cloudinit/sources/helpers/tests/test_netlink.py
+++ b/cloudinit/sources/helpers/tests/test_netlink.py
@@ -9,9 +9,10 @@ import codecs
from cloudinit.sources.helpers.netlink import (
NetlinkCreateSocketError, create_bound_netlink_socket, read_netlink_socket,
read_rta_oper_state, unpack_rta_attr, wait_for_media_disconnect_connect,
+ wait_for_nic_attach_event, wait_for_nic_detach_event,
OPER_DOWN, OPER_UP, OPER_DORMANT, OPER_LOWERLAYERDOWN, OPER_NOTPRESENT,
- OPER_TESTING, OPER_UNKNOWN, RTATTR_START_OFFSET, RTM_NEWLINK, RTM_SETLINK,
- RTM_GETLINK, MAX_SIZE)
+ OPER_TESTING, OPER_UNKNOWN, RTATTR_START_OFFSET, RTM_NEWLINK, RTM_DELLINK,
+ RTM_SETLINK, RTM_GETLINK, MAX_SIZE)
def int_to_bytes(i):
@@ -135,6 +136,75 @@ class TestParseNetlinkMessage(CiTestCase):
@mock.patch('cloudinit.sources.helpers.netlink.socket.socket')
@mock.patch('cloudinit.sources.helpers.netlink.read_netlink_socket')
+class TestNicAttachDetach(CiTestCase):
+ with_logs = True
+
+ def _media_switch_data(self, ifname, msg_type, operstate):
+ '''construct netlink data with specified fields'''
+ if ifname and operstate is not None:
+ data = bytearray(48)
+ bytes = ifname.encode("utf-8")
+ struct.pack_into("HH4sHHc", data, RTATTR_START_OFFSET, 8, 3,
+ bytes, 5, 16, int_to_bytes(operstate))
+ elif ifname:
+ data = bytearray(40)
+ bytes = ifname.encode("utf-8")
+ struct.pack_into("HH4s", data, RTATTR_START_OFFSET, 8, 3, bytes)
+ elif operstate:
+ data = bytearray(40)
+ struct.pack_into("HHc", data, RTATTR_START_OFFSET, 5, 16,
+ int_to_bytes(operstate))
+ struct.pack_into("=LHHLL", data, 0, len(data), msg_type, 0, 0, 0)
+ return data
+
+ def test_nic_attached_oper_down(self, m_read_netlink_socket, m_socket):
+ '''Test for a new nic attached'''
+ ifname = "eth0"
+ data_op_down = self._media_switch_data(ifname, RTM_NEWLINK, OPER_DOWN)
+ m_read_netlink_socket.side_effect = [data_op_down]
+ ifread = wait_for_nic_attach_event(m_socket, [])
+ self.assertEqual(m_read_netlink_socket.call_count, 1)
+ self.assertEqual(ifname, ifread)
+
+ def test_nic_attached_oper_up(self, m_read_netlink_socket, m_socket):
+ '''Test for a new nic attached'''
+ ifname = "eth0"
+ data_op_up = self._media_switch_data(ifname, RTM_NEWLINK, OPER_UP)
+ m_read_netlink_socket.side_effect = [data_op_up]
+ ifread = wait_for_nic_attach_event(m_socket, [])
+ self.assertEqual(m_read_netlink_socket.call_count, 1)
+ self.assertEqual(ifname, ifread)
+
+ def test_nic_attach_ignore_existing(self, m_read_netlink_socket, m_socket):
+ '''Test that we read only the interfaces we are interested in.'''
+ data_eth0 = self._media_switch_data("eth0", RTM_NEWLINK, OPER_DOWN)
+ data_eth1 = self._media_switch_data("eth1", RTM_NEWLINK, OPER_DOWN)
+ m_read_netlink_socket.side_effect = [data_eth0, data_eth1]
+ ifread = wait_for_nic_attach_event(m_socket, ["eth0"])
+ self.assertEqual(m_read_netlink_socket.call_count, 2)
+ self.assertEqual("eth1", ifread)
+
+ def test_nic_attach_read_first(self, m_read_netlink_socket, m_socket):
+ '''Test that we read only the interfaces we are interested in.'''
+ data_eth0 = self._media_switch_data("eth0", RTM_NEWLINK, OPER_DOWN)
+ data_eth1 = self._media_switch_data("eth1", RTM_NEWLINK, OPER_DOWN)
+ m_read_netlink_socket.side_effect = [data_eth0, data_eth1]
+ ifread = wait_for_nic_attach_event(m_socket, ["eth1"])
+ self.assertEqual(m_read_netlink_socket.call_count, 1)
+ self.assertEqual("eth0", ifread)
+
+ def test_nic_detached(self, m_read_netlink_socket, m_socket):
+ '''Test for an existing nic detached'''
+ ifname = "eth0"
+ data_op_down = self._media_switch_data(ifname, RTM_DELLINK, OPER_DOWN)
+ m_read_netlink_socket.side_effect = [data_op_down]
+ ifread = wait_for_nic_detach_event(m_socket)
+ self.assertEqual(m_read_netlink_socket.call_count, 1)
+ self.assertEqual(ifname, ifread)
+
+
+@mock.patch('cloudinit.sources.helpers.netlink.socket.socket')
+@mock.patch('cloudinit.sources.helpers.netlink.read_netlink_socket')
class TestWaitForMediaDisconnectConnect(CiTestCase):
with_logs = True