author    zsdc <taras@vyos.io>    2022-03-25 20:58:01 +0200
committer zsdc <taras@vyos.io>    2022-03-25 21:42:00 +0200
commit    31448cccedd8f841fb3ac7d0f2e3cdefe08a53ba (patch)
tree      349631a02467dae0158f6f663cc8aa8537974a97 /cloudinit/sources/DataSourceAzure.py
parent    5c4b3943343a85fbe517e5ec1fc670b3a8566b4b (diff)
parent    8537237d80a48c8f0cbf8e66aa4826bbc882b022 (diff)
T2117: Cloud-init updated to 22.1
Merged with the 22.1 tag from the upstream Cloud-init repository. Our modules were slightly modified for compatibility with the new version.
Diffstat (limited to 'cloudinit/sources/DataSourceAzure.py')
-rwxr-xr-x  cloudinit/sources/DataSourceAzure.py  2358
1 file changed, 1333 insertions(+), 1025 deletions(-)
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index 04ff2131..359dfbde 100755
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -5,101 +5,93 @@
# This file is part of cloud-init. See LICENSE file for license information.
import base64
-import contextlib
import crypt
-from functools import partial
+import datetime
import os
import os.path
import re
-from time import time
-from time import sleep
-from xml.dom import minidom
import xml.etree.ElementTree as ET
from enum import Enum
+from time import sleep, time
+from typing import Any, Dict, List, Optional
+from xml.dom import minidom
+
+import requests
from cloudinit import dmi
from cloudinit import log as logging
-from cloudinit import net
-from cloudinit.event import EventType
+from cloudinit import net, sources, ssh_util, subp, util
+from cloudinit.event import EventScope, EventType
from cloudinit.net import device_driver
-from cloudinit.net.dhcp import EphemeralDHCPv4
-from cloudinit import sources
-from cloudinit.sources.helpers import netlink
-from cloudinit import subp
-from cloudinit.url_helper import UrlError, readurl, retry_on_url_exc
-from cloudinit import util
+from cloudinit.net.dhcp import EphemeralDHCPv4, NoDHCPLeaseError
from cloudinit.reporting import events
-
+from cloudinit.sources.helpers import netlink
from cloudinit.sources.helpers.azure import (
DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE,
+ DEFAULT_WIRESERVER_ENDPOINT,
azure_ds_reporter,
azure_ds_telemetry_reporter,
- get_metadata_from_fabric,
+ build_minimal_ovf,
+ dhcp_log_cb,
get_boot_telemetry,
+ get_metadata_from_fabric,
get_system_info,
- report_diagnostic_event,
- EphemeralDHCPv4WithReporting,
is_byte_swapped,
- dhcp_log_cb,
push_log_to_kvp,
- report_failure_to_fabric)
+ report_diagnostic_event,
+ report_failure_to_fabric,
+)
+from cloudinit.url_helper import UrlError, readurl, retry_on_url_exc
LOG = logging.getLogger(__name__)
-DS_NAME = 'Azure'
+DS_NAME = "Azure"
DEFAULT_METADATA = {"instance-id": "iid-AZURE-NODE"}
-AGENT_START = ['service', 'walinuxagent', 'start']
-AGENT_START_BUILTIN = "__builtin__"
-BOUNCE_COMMAND_IFUP = [
- 'sh', '-xc',
- "i=$interface; x=0; ifdown $i || x=$?; ifup $i || x=$?; exit $x"
-]
-BOUNCE_COMMAND_FREEBSD = [
- 'sh', '-xc',
- ("i=$interface; x=0; ifconfig down $i || x=$?; "
- "ifconfig up $i || x=$?; exit $x")
-]
# azure systems will always have a resource disk, and 66-azure-ephemeral.rules
# ensures that it gets linked to this path.
-RESOURCE_DISK_PATH = '/dev/disk/cloud/azure_resource'
-DEFAULT_PRIMARY_NIC = 'eth0'
-LEASE_FILE = '/var/lib/dhcp/dhclient.eth0.leases'
-DEFAULT_FS = 'ext4'
+RESOURCE_DISK_PATH = "/dev/disk/cloud/azure_resource"
+LEASE_FILE = "/var/lib/dhcp/dhclient.eth0.leases"
+DEFAULT_FS = "ext4"
# DMI chassis-asset-tag is set static for all azure instances
-AZURE_CHASSIS_ASSET_TAG = '7783-7084-3265-9085-8269-3286-77'
+AZURE_CHASSIS_ASSET_TAG = "7783-7084-3265-9085-8269-3286-77"
REPROVISION_MARKER_FILE = "/var/lib/cloud/data/poll_imds"
-REPROVISION_NIC_ATTACH_MARKER_FILE = "/var/lib/cloud/data/wait_for_nic_attach"
REPROVISION_NIC_DETACHED_MARKER_FILE = "/var/lib/cloud/data/nic_detached"
REPORTED_READY_MARKER_FILE = "/var/lib/cloud/data/reported_ready"
-AGENT_SEED_DIR = '/var/lib/waagent'
-
+AGENT_SEED_DIR = "/var/lib/waagent"
+DEFAULT_PROVISIONING_ISO_DEV = "/dev/sr0"
# In the event where the IMDS primary server is not
# available, it takes 1s to fallback to the secondary one
IMDS_TIMEOUT_IN_SECONDS = 2
-IMDS_URL = "http://169.254.169.254/metadata/"
-IMDS_VER = "2019-06-01"
-IMDS_VER_PARAM = "api-version={}".format(IMDS_VER)
+IMDS_URL = "http://169.254.169.254/metadata"
+IMDS_VER_MIN = "2019-06-01"
+IMDS_VER_WANT = "2021-08-01"
+IMDS_EXTENDED_VER_MIN = "2021-03-01"
+
+class MetadataType(Enum):
+ ALL = "{}/instance".format(IMDS_URL)
+ NETWORK = "{}/instance/network".format(IMDS_URL)
+ REPROVISION_DATA = "{}/reprovisiondata".format(IMDS_URL)
-class metadata_type(Enum):
- compute = "{}instance?{}".format(IMDS_URL, IMDS_VER_PARAM)
- network = "{}instance/network?{}".format(IMDS_URL,
- IMDS_VER_PARAM)
- reprovisiondata = "{}reprovisiondata?{}".format(IMDS_URL,
- IMDS_VER_PARAM)
+class PPSType(Enum):
+ NONE = "None"
+ RUNNING = "Running"
+ SAVABLE = "Savable"
+ UNKNOWN = "Unknown"
-PLATFORM_ENTROPY_SOURCE = "/sys/firmware/acpi/tables/OEM0"
+
+PLATFORM_ENTROPY_SOURCE: Optional[str] = "/sys/firmware/acpi/tables/OEM0"
# List of static scripts and network config artifacts created by
# stock ubuntu supported images.
UBUNTU_EXTENDED_NETWORK_SCRIPTS = [
- '/etc/netplan/90-hotplug-azure.yaml',
- '/usr/local/sbin/ephemeral_eth.sh',
- '/etc/udev/rules.d/10-net-device-added.rules',
- '/run/network/interfaces.ephemeral.d',
+ "/etc/netplan/90-hotplug-azure.yaml",
+ "/usr/local/sbin/ephemeral_eth.sh",
+ "/etc/udev/rules.d/10-net-device-added.rules",
+ "/run/network/interfaces.ephemeral.d",
]
# This list is used to blacklist devices that will be considered
@@ -119,7 +111,7 @@ UBUNTU_EXTENDED_NETWORK_SCRIPTS = [
# https://docs.microsoft.com/en-us/azure/virtual-machines/dv2-dsv2-series
# https://docs.microsoft.com/en-us/azure/virtual-machines/dv3-dsv3-series
# https://docs.microsoft.com/en-us/azure/virtual-machines/ev3-esv3-series
-BLACKLIST_DRIVERS = ['mlx4_core', 'mlx5_core']
+BLACKLIST_DRIVERS = ["mlx4_core", "mlx5_core"]
def find_storvscid_from_sysctl_pnpinfo(sysctl_out, deviceid):
@@ -133,11 +125,13 @@ def find_storvscid_from_sysctl_pnpinfo(sysctl_out, deviceid):
if re.search(r"pnpinfo", line):
fields = line.split()
if len(fields) >= 3:
- columns = fields[2].split('=')
- if (len(columns) >= 2 and
- columns[0] == "deviceid" and
- columns[1].startswith(deviceid)):
- comps = fields[0].split('.')
+ columns = fields[2].split("=")
+ if (
+ len(columns) >= 2
+ and columns[0] == "deviceid"
+ and columns[1].startswith(deviceid)
+ ):
+ comps = fields[0].split(".")
return comps[2]
return None
@@ -161,7 +155,7 @@ def find_busdev_from_disk(camcontrol_out, disk_drv):
return None
-def find_dev_from_busdev(camcontrol_out, busdev):
+def find_dev_from_busdev(camcontrol_out: str, busdev: str) -> Optional[str]:
# find the daX from 'camcontrol devlist' output
# if busdev matches the specified value, i.e. 'scbus2'
"""
@@ -171,18 +165,38 @@ def find_dev_from_busdev(camcontrol_out, busdev):
"""
for line in camcontrol_out.splitlines():
if re.search(busdev, line):
- items = line.split('(')
+ items = line.split("(")
if len(items) == 2:
- dev_pass = items[1].split(',')
+ dev_pass = items[1].split(",")
return dev_pass[0]
return None
-def execute_or_debug(cmd, fail_ret=None):
+def normalize_mac_address(mac: str) -> str:
+ """Normalize mac address with colons and lower-case."""
+ if len(mac) == 12:
+ mac = ":".join(
+ [mac[0:2], mac[2:4], mac[4:6], mac[6:8], mac[8:10], mac[10:12]]
+ )
+
+ return mac.lower()
+
+
+@azure_ds_telemetry_reporter
+def get_hv_netvsc_macs_normalized() -> List[str]:
+ """Get Hyper-V NICs as normalized MAC addresses."""
+ return [
+ normalize_mac_address(n[1])
+ for n in net.get_interfaces()
+ if n[2] == "hv_netvsc"
+ ]
+
+
+def execute_or_debug(cmd, fail_ret=None) -> str:
try:
- return subp.subp(cmd)[0]
+ return subp.subp(cmd)[0] # type: ignore
except subp.ProcessExecutionError:
- LOG.debug("Failed to execute: %s", ' '.join(cmd))
+ LOG.debug("Failed to execute: %s", " ".join(cmd))
return fail_ret
@@ -191,14 +205,14 @@ def get_dev_storvsc_sysctl():
def get_camcontrol_dev_bus():
- return execute_or_debug(['camcontrol', 'devlist', '-b'])
+ return execute_or_debug(["camcontrol", "devlist", "-b"])
def get_camcontrol_dev():
- return execute_or_debug(['camcontrol', 'devlist'])
+ return execute_or_debug(["camcontrol", "devlist"])
-def get_resource_disk_on_freebsd(port_id):
+def get_resource_disk_on_freebsd(port_id) -> Optional[str]:
g0 = "00000000"
if port_id > 1:
g0 = "00000001"
@@ -242,9 +256,8 @@ def get_resource_disk_on_freebsd(port_id):
# update the FreeBSD specific information
if util.is_FreeBSD():
- DEFAULT_PRIMARY_NIC = 'hn0'
- LEASE_FILE = '/var/db/dhclient.leases.hn0'
- DEFAULT_FS = 'freebsd-ufs'
+ LEASE_FILE = "/var/db/dhclient.leases.hn0"
+ DEFAULT_FS = "freebsd-ufs"
res_disk = get_resource_disk_on_freebsd(1)
if res_disk is not None:
LOG.debug("resource disk is not None")
@@ -255,186 +268,152 @@ if util.is_FreeBSD():
PLATFORM_ENTROPY_SOURCE = None
BUILTIN_DS_CONFIG = {
- 'agent_command': AGENT_START_BUILTIN,
- 'data_dir': AGENT_SEED_DIR,
- 'set_hostname': True,
- 'hostname_bounce': {
- 'interface': DEFAULT_PRIMARY_NIC,
- 'policy': True,
- 'command': 'builtin',
- 'hostname_command': 'hostname',
- },
- 'disk_aliases': {'ephemeral0': RESOURCE_DISK_PATH},
- 'dhclient_lease_file': LEASE_FILE,
- 'apply_network_config': True, # Use IMDS published network configuration
+ "data_dir": AGENT_SEED_DIR,
+ "disk_aliases": {"ephemeral0": RESOURCE_DISK_PATH},
+ "dhclient_lease_file": LEASE_FILE,
+ "apply_network_config": True, # Use IMDS published network configuration
}
# RELEASE_BLOCKER: Xenial and earlier apply_network_config default is False
-BUILTIN_CLOUD_CONFIG = {
- 'disk_setup': {
- 'ephemeral0': {'table_type': 'gpt',
- 'layout': [100],
- 'overwrite': True},
+BUILTIN_CLOUD_EPHEMERAL_DISK_CONFIG = {
+ "disk_setup": {
+ "ephemeral0": {
+ "table_type": "gpt",
+ "layout": [100],
+ "overwrite": True,
+ },
},
- 'fs_setup': [{'filesystem': DEFAULT_FS,
- 'device': 'ephemeral0.1'}],
+ "fs_setup": [{"filesystem": DEFAULT_FS, "device": "ephemeral0.1"}],
}
-DS_CFG_PATH = ['datasource', DS_NAME]
-DS_CFG_KEY_PRESERVE_NTFS = 'never_destroy_ntfs'
-DEF_EPHEMERAL_LABEL = 'Temporary Storage'
+DS_CFG_PATH = ["datasource", DS_NAME]
+DS_CFG_KEY_PRESERVE_NTFS = "never_destroy_ntfs"
+DEF_EPHEMERAL_LABEL = "Temporary Storage"
# The redacted password fails to meet password complexity requirements
# so we can safely use this to mask/redact the password in the ovf-env.xml
-DEF_PASSWD_REDACTION = 'REDACTED'
-
-
-def get_hostname(hostname_command='hostname'):
- if not isinstance(hostname_command, (list, tuple)):
- hostname_command = (hostname_command,)
- return subp.subp(hostname_command, capture=True)[0].strip()
-
-
-def set_hostname(hostname, hostname_command='hostname'):
- subp.subp([hostname_command, hostname])
-
-
-@azure_ds_telemetry_reporter
-@contextlib.contextmanager
-def temporary_hostname(temp_hostname, cfg, hostname_command='hostname'):
- """
- Set a temporary hostname, restoring the previous hostname on exit.
-
- Will have the value of the previous hostname when used as a context
- manager, or None if the hostname was not changed.
- """
- policy = cfg['hostname_bounce']['policy']
- previous_hostname = get_hostname(hostname_command)
- if (not util.is_true(cfg.get('set_hostname')) or
- util.is_false(policy) or
- (previous_hostname == temp_hostname and policy != 'force')):
- yield None
- return
- try:
- set_hostname(temp_hostname, hostname_command)
- except Exception as e:
- report_diagnostic_event(
- 'Failed setting temporary hostname: %s' % e,
- logger_func=LOG.warning)
- yield None
- return
- try:
- yield previous_hostname
- finally:
- set_hostname(previous_hostname, hostname_command)
+DEF_PASSWD_REDACTION = "REDACTED"
class DataSourceAzure(sources.DataSource):
- dsname = 'Azure'
+ dsname = "Azure"
+ default_update_events = {
+ EventScope.NETWORK: {
+ EventType.BOOT_NEW_INSTANCE,
+ EventType.BOOT,
+ }
+ }
_negotiated = False
_metadata_imds = sources.UNSET
+ _ci_pkl_version = 1
def __init__(self, sys_cfg, distro, paths):
sources.DataSource.__init__(self, sys_cfg, distro, paths)
- self.seed_dir = os.path.join(paths.seed_dir, 'azure')
+ self.seed_dir = os.path.join(paths.seed_dir, "azure")
self.cfg = {}
self.seed = None
- self.ds_cfg = util.mergemanydict([
- util.get_cfg_by_path(sys_cfg, DS_CFG_PATH, {}),
- BUILTIN_DS_CONFIG])
- self.dhclient_lease_file = self.ds_cfg.get('dhclient_lease_file')
+ self.ds_cfg = util.mergemanydict(
+ [util.get_cfg_by_path(sys_cfg, DS_CFG_PATH, {}), BUILTIN_DS_CONFIG]
+ )
+ self.dhclient_lease_file = self.ds_cfg.get("dhclient_lease_file")
+ self._iso_dev = None
self._network_config = None
- # Regenerate network config new_instance boot and every boot
- self.update_events['network'].add(EventType.BOOT)
self._ephemeral_dhcp_ctx = None
+ self._wireserver_endpoint = DEFAULT_WIRESERVER_ENDPOINT
+
+ def _unpickle(self, ci_pkl_version: int) -> None:
+ super()._unpickle(ci_pkl_version)
+
+ self._ephemeral_dhcp_ctx = None
+ self._iso_dev = None
+ self._wireserver_endpoint = DEFAULT_WIRESERVER_ENDPOINT
def __str__(self):
root = sources.DataSource.__str__(self)
return "%s [seed=%s]" % (root, self.seed)
- @azure_ds_telemetry_reporter
- def bounce_network_with_azure_hostname(self):
- # When using cloud-init to provision, we have to set the hostname from
- # the metadata and "bounce" the network to force DDNS to update via
- # dhclient
- azure_hostname = self.metadata.get('local-hostname')
- LOG.debug("Hostname in metadata is %s", azure_hostname)
- hostname_command = self.ds_cfg['hostname_bounce']['hostname_command']
-
- with temporary_hostname(azure_hostname, self.ds_cfg,
- hostname_command=hostname_command) \
- as previous_hn:
- if (previous_hn is not None and
- util.is_true(self.ds_cfg.get('set_hostname'))):
- cfg = self.ds_cfg['hostname_bounce']
-
- # "Bouncing" the network
- try:
- return perform_hostname_bounce(hostname=azure_hostname,
- cfg=cfg,
- prev_hostname=previous_hn)
- except Exception as e:
- report_diagnostic_event(
- "Failed publishing hostname: %s" % e,
- logger_func=LOG.warning)
- util.logexc(LOG, "handling set_hostname failed")
- return False
+ def _get_subplatform(self):
+ """Return the subplatform metadata source details."""
+ if self.seed is None:
+ subplatform_type = "unknown"
+ elif self.seed.startswith("/dev"):
+ subplatform_type = "config-disk"
+ elif self.seed.lower() == "imds":
+ subplatform_type = "imds"
+ else:
+ subplatform_type = "seed-dir"
+ return "%s (%s)" % (subplatform_type, self.seed)
@azure_ds_telemetry_reporter
- def get_metadata_from_agent(self):
- temp_hostname = self.metadata.get('local-hostname')
- agent_cmd = self.ds_cfg['agent_command']
- LOG.debug("Getting metadata via agent. hostname=%s cmd=%s",
- temp_hostname, agent_cmd)
+ def _setup_ephemeral_networking(
+ self, *, iface: Optional[str] = None, timeout_minutes: int = 5
+ ) -> None:
+ """Setup ephemeral networking.
- self.bounce_network_with_azure_hostname()
+ Keep retrying DHCP up to specified number of minutes. This does
+ not kill dhclient, so the timeout in practice may be up to
+ timeout_minutes + the system-configured timeout for dhclient.
- try:
- invoke_agent(agent_cmd)
- except subp.ProcessExecutionError:
- # claim the datasource even if the command failed
- util.logexc(LOG, "agent command '%s' failed.",
- self.ds_cfg['agent_command'])
-
- ddir = self.ds_cfg['data_dir']
-
- fp_files = []
- key_value = None
- for pk in self.cfg.get('_pubkeys', []):
- if pk.get('value', None):
- key_value = pk['value']
- LOG.debug("SSH authentication: using value from fabric")
- else:
- bname = str(pk['fingerprint'] + ".crt")
- fp_files += [os.path.join(ddir, bname)]
- LOG.debug("SSH authentication: "
- "using fingerprint from fabric")
+ :param timeout_minutes: Number of minutes to keep retrying for.
+
+ :raises NoDHCPLeaseError: If unable to obtain DHCP lease.
+ """
+ if self._ephemeral_dhcp_ctx is not None:
+ raise RuntimeError(
+ "Bringing up networking when already configured."
+ )
+
+ LOG.debug("Requested ephemeral networking (iface=%s)", iface)
+
+ start = datetime.datetime.utcnow()
+ timeout = start + datetime.timedelta(minutes=timeout_minutes)
+ self._ephemeral_dhcp_ctx = EphemeralDHCPv4(
+ iface=iface, dhcp_log_func=dhcp_log_cb
+ )
+
+ lease = None
with events.ReportEventStack(
- name="waiting-for-ssh-public-key",
- description="wait for agents to retrieve SSH keys",
- parent=azure_ds_reporter):
- # wait very long for public SSH keys to arrive
- # https://bugs.launchpad.net/cloud-init/+bug/1717611
- missing = util.log_time(logfunc=LOG.debug,
- msg="waiting for SSH public key files",
- func=util.wait_for_files,
- args=(fp_files, 900))
- if len(missing):
- LOG.warning("Did not find files, but going on: %s", missing)
-
- metadata = {}
- metadata['public-keys'] = key_value or pubkeys_from_crt_files(fp_files)
- return metadata
+ name="obtain-dhcp-lease",
+ description="obtain dhcp lease",
+ parent=azure_ds_reporter,
+ ):
+ while datetime.datetime.utcnow() < timeout:
+ try:
+ lease = self._ephemeral_dhcp_ctx.obtain_lease()
+ break
+ except NoDHCPLeaseError:
+ continue
- def _get_subplatform(self):
- """Return the subplatform metadata source details."""
- if self.seed.startswith('/dev'):
- subplatform_type = 'config-disk'
- else:
- subplatform_type = 'seed-dir'
- return '%s (%s)' % (subplatform_type, self.seed)
+ if lease is None:
+ msg = "Failed to obtain DHCP lease (iface=%s)" % iface
+ report_diagnostic_event(msg, logger_func=LOG.error)
+ self._ephemeral_dhcp_ctx = None
+ raise NoDHCPLeaseError()
+ else:
+ # Ensure iface is set.
+ self._ephemeral_dhcp_ctx.iface = lease["interface"]
+
+ # Update wireserver IP from DHCP options.
+ if "unknown-245" in lease:
+ self._wireserver_endpoint = lease["unknown-245"]
+
+ @azure_ds_telemetry_reporter
+ def _teardown_ephemeral_networking(self) -> None:
+ """Teardown ephemeral networking."""
+ if self._ephemeral_dhcp_ctx is None:
+ return
+
+ self._ephemeral_dhcp_ctx.clean_network()
+ self._ephemeral_dhcp_ctx = None
+
+ def _is_ephemeral_networking_up(self) -> bool:
+ """Check if networking is configured."""
+ return not (
+ self._ephemeral_dhcp_ctx is None
+ or self._ephemeral_dhcp_ctx.lease is None
+ )
@azure_ds_telemetry_reporter
def crawl_metadata(self):
@@ -448,126 +427,205 @@ class DataSourceAzure(sources.DataSource):
# azure removes/ejects the cdrom containing the ovf-env.xml
# file on reboot. So, in order to successfully reboot we
# need to look in the datadir and consider that valid
- ddir = self.ds_cfg['data_dir']
+ ddir = self.ds_cfg["data_dir"]
# The order in which the candidates are inserted matters here, because
# it determines the value of ret. More specifically, the first one in
# the candidate list determines the path to take in order to get the
# metadata we need.
- candidates = [self.seed_dir]
+ ovf_is_accessible = False
+ metadata_source = None
+ md = {}
+ userdata_raw = ""
+ cfg = {}
+ files = {}
+
if os.path.isfile(REPROVISION_MARKER_FILE):
- candidates.insert(0, "IMDS")
- report_diagnostic_event("Reprovision marker file already present "
- "before crawling Azure metadata: %s" %
- REPROVISION_MARKER_FILE,
- logger_func=LOG.debug)
- elif os.path.isfile(REPROVISION_NIC_ATTACH_MARKER_FILE):
- candidates.insert(0, "NIC_ATTACH_MARKER_PRESENT")
- report_diagnostic_event("Reprovision nic attach marker file "
- "already present before crawling Azure "
- "metadata: %s" %
- REPROVISION_NIC_ATTACH_MARKER_FILE,
- logger_func=LOG.debug)
- candidates.extend(list_possible_azure_ds_devs())
- if ddir:
- candidates.append(ddir)
-
- found = None
- reprovision = False
- reprovision_after_nic_attach = False
- for cdev in candidates:
- try:
- if cdev == "IMDS":
- ret = None
- reprovision = True
- elif cdev == "NIC_ATTACH_MARKER_PRESENT":
- ret = None
- reprovision_after_nic_attach = True
- elif cdev.startswith("/dev/"):
- if util.is_FreeBSD():
- ret = util.mount_cb(cdev, load_azure_ds_dir,
- mtype="udf")
+ metadata_source = "IMDS"
+ report_diagnostic_event(
+ "Reprovision marker file already present "
+ "before crawling Azure metadata: %s" % REPROVISION_MARKER_FILE,
+ logger_func=LOG.debug,
+ )
+ else:
+ for src in list_possible_azure_ds(self.seed_dir, ddir):
+ try:
+ if src.startswith("/dev/"):
+ if util.is_FreeBSD():
+ md, userdata_raw, cfg, files = util.mount_cb(
+ src, load_azure_ds_dir, mtype="udf"
+ )
+ else:
+ md, userdata_raw, cfg, files = util.mount_cb(
+ src, load_azure_ds_dir
+ )
+ # save the device for ejection later
+ self._iso_dev = src
else:
- ret = util.mount_cb(cdev, load_azure_ds_dir)
- else:
- ret = load_azure_ds_dir(cdev)
+ md, userdata_raw, cfg, files = load_azure_ds_dir(src)
+ ovf_is_accessible = True
+ metadata_source = src
+ break
+ except NonAzureDataSource:
+ report_diagnostic_event(
+ "Did not find Azure data source in %s" % src,
+ logger_func=LOG.debug,
+ )
+ continue
+ except util.MountFailedError:
+ report_diagnostic_event(
+ "%s was not mountable" % src, logger_func=LOG.debug
+ )
+ md = {"local-hostname": ""}
+ cfg = {"system_info": {"default_user": {"name": ""}}}
+ metadata_source = "IMDS"
+ continue
+ except BrokenAzureDataSource as exc:
+ msg = "BrokenAzureDataSource: %s" % exc
+ report_diagnostic_event(msg, logger_func=LOG.error)
+ raise sources.InvalidMetaDataException(msg)
- except NonAzureDataSource:
- report_diagnostic_event(
- "Did not find Azure data source in %s" % cdev,
- logger_func=LOG.debug)
- continue
- except BrokenAzureDataSource as exc:
- msg = 'BrokenAzureDataSource: %s' % exc
+ report_diagnostic_event(
+ "Found provisioning metadata in %s" % metadata_source,
+ logger_func=LOG.debug,
+ )
+
+ # If we read OVF from attached media, we are provisioning. If OVF
+ # is not found, we are probably provisioning on a system which does
+ # not have UDF support. In either case, require IMDS metadata.
+ # If we require IMDS metadata, try harder to obtain networking, waiting
+ # for at least 20 minutes. Otherwise only wait 5 minutes.
+ requires_imds_metadata = bool(self._iso_dev) or not ovf_is_accessible
+ timeout_minutes = 5 if requires_imds_metadata else 20
+ try:
+ self._setup_ephemeral_networking(timeout_minutes=timeout_minutes)
+ except NoDHCPLeaseError:
+ pass
+
+ if self._is_ephemeral_networking_up():
+ imds_md = self.get_imds_data_with_api_fallback(retries=10)
+ else:
+ imds_md = {}
+
+ if not imds_md and not ovf_is_accessible:
+ msg = "No OVF or IMDS available"
+ report_diagnostic_event(msg)
+ raise sources.InvalidMetaDataException(msg)
+
+ # Refresh PPS type using metadata.
+ pps_type = self._determine_pps_type(cfg, imds_md)
+ if pps_type != PPSType.NONE:
+ if util.is_FreeBSD():
+ msg = "Free BSD is not supported for PPS VMs"
report_diagnostic_event(msg, logger_func=LOG.error)
raise sources.InvalidMetaDataException(msg)
- except util.MountFailedError:
- report_diagnostic_event(
- '%s was not mountable' % cdev, logger_func=LOG.warning)
- continue
- perform_reprovision = reprovision or self._should_reprovision(ret)
- perform_reprovision_after_nic_attach = (
- reprovision_after_nic_attach or
- self._should_reprovision_after_nic_attach(ret))
+ self._write_reprovision_marker()
+
+ if pps_type == PPSType.SAVABLE:
+ self._wait_for_all_nics_ready()
+
+ md, userdata_raw, cfg, files = self._reprovision()
+ # fetch metadata again as it has changed after reprovisioning
+ imds_md = self.get_imds_data_with_api_fallback(retries=10)
+
+ # Report errors if IMDS network configuration is missing data.
+ self.validate_imds_network_metadata(imds_md=imds_md)
+
+ self.seed = metadata_source
+ crawled_data.update(
+ {
+ "cfg": cfg,
+ "files": files,
+ "metadata": util.mergemanydict([md, {"imds": imds_md}]),
+ "userdata_raw": userdata_raw,
+ }
+ )
+ imds_username = _username_from_imds(imds_md)
+ imds_hostname = _hostname_from_imds(imds_md)
+ imds_disable_password = _disable_password_from_imds(imds_md)
+ if imds_username:
+ LOG.debug("Username retrieved from IMDS: %s", imds_username)
+ cfg["system_info"]["default_user"]["name"] = imds_username
+ if imds_hostname:
+ LOG.debug("Hostname retrieved from IMDS: %s", imds_hostname)
+ crawled_data["metadata"]["local-hostname"] = imds_hostname
+ if imds_disable_password:
+ LOG.debug(
+ "Disable password retrieved from IMDS: %s",
+ imds_disable_password,
+ )
+ crawled_data["metadata"][
+ "disable_password"
+ ] = imds_disable_password
- if perform_reprovision or perform_reprovision_after_nic_attach:
- if util.is_FreeBSD():
- msg = "Free BSD is not supported for PPS VMs"
- report_diagnostic_event(msg, logger_func=LOG.error)
- raise sources.InvalidMetaDataException(msg)
- if perform_reprovision_after_nic_attach:
- self._wait_for_all_nics_ready()
- ret = self._reprovision()
-
- imds_md = get_metadata_from_imds(
- self.fallback_interface, retries=10)
- (md, userdata_raw, cfg, files) = ret
- self.seed = cdev
- crawled_data.update({
- 'cfg': cfg,
- 'files': files,
- 'metadata': util.mergemanydict(
- [md, {'imds': imds_md}]),
- 'userdata_raw': userdata_raw})
- found = cdev
+ if metadata_source == "IMDS" and not crawled_data["files"]:
+ try:
+ contents = build_minimal_ovf(
+ username=imds_username, # type: ignore
+ hostname=imds_hostname, # type: ignore
+ disableSshPwd=imds_disable_password, # type: ignore
+ )
+ crawled_data["files"] = {"ovf-env.xml": contents}
+ except Exception as e:
+ report_diagnostic_event(
+ "Failed to construct OVF from IMDS data %s" % e,
+ logger_func=LOG.debug,
+ )
- report_diagnostic_event(
- 'found datasource in %s' % cdev, logger_func=LOG.debug)
- break
+ # only use userdata from imds if OVF did not provide custom data
+ # userdata provided by IMDS is always base64 encoded
+ if not userdata_raw:
+ imds_userdata = _userdata_from_imds(imds_md)
+ if imds_userdata:
+ LOG.debug("Retrieved userdata from IMDS")
+ try:
+ crawled_data["userdata_raw"] = base64.b64decode(
+ "".join(imds_userdata.split())
+ )
+ except Exception:
+ report_diagnostic_event(
+ "Bad userdata in IMDS", logger_func=LOG.warning
+ )
- if not found:
- msg = 'No Azure metadata found'
+ if not metadata_source:
+ msg = "No Azure metadata found"
report_diagnostic_event(msg, logger_func=LOG.error)
raise sources.InvalidMetaDataException(msg)
+ else:
+ report_diagnostic_event(
+ "found datasource in %s" % metadata_source,
+ logger_func=LOG.debug,
+ )
- if found == ddir:
+ if metadata_source == ddir:
report_diagnostic_event(
- "using files cached in %s" % ddir, logger_func=LOG.debug)
+ "using files cached in %s" % ddir, logger_func=LOG.debug
+ )
seed = _get_random_seed()
if seed:
- crawled_data['metadata']['random_seed'] = seed
- crawled_data['metadata']['instance-id'] = self._iid()
-
- if perform_reprovision or perform_reprovision_after_nic_attach:
- LOG.info("Reporting ready to Azure after getting ReprovisionData")
- use_cached_ephemeral = (
- self.distro.networking.is_up(self.fallback_interface) and
- getattr(self, '_ephemeral_dhcp_ctx', None))
- if use_cached_ephemeral:
- self._report_ready(lease=self._ephemeral_dhcp_ctx.lease)
- self._ephemeral_dhcp_ctx.clean_network() # Teardown ephemeral
+ crawled_data["metadata"]["random_seed"] = seed
+ crawled_data["metadata"]["instance-id"] = self._iid()
+
+ if self._negotiated is False and self._is_ephemeral_networking_up():
+ # Report ready and fetch public-keys from Wireserver, if required.
+ pubkey_info = self._determine_wireserver_pubkey_info(
+ cfg=cfg, imds_md=imds_md
+ )
+ try:
+ ssh_keys = self._report_ready(pubkey_info=pubkey_info)
+ except Exception:
+ # Failed to report ready, but continue with best effort.
+ pass
else:
- try:
- with EphemeralDHCPv4WithReporting(
- azure_ds_reporter) as lease:
- self._report_ready(lease=lease)
- except Exception as e:
- report_diagnostic_event(
- "exception while reporting ready: %s" % e,
- logger_func=LOG.error)
- raise
+ LOG.debug("negotiating returned %s", ssh_keys)
+ if ssh_keys:
+ crawled_data["metadata"]["public-keys"] = ssh_keys
+
+ self._cleanup_markers()
+ self._negotiated = True
+
return crawled_data
def _is_platform_viable(self):
@@ -602,28 +660,57 @@ class DataSourceAzure(sources.DataSource):
try:
crawled_data = util.log_time(
- logfunc=LOG.debug, msg='Crawl of metadata service',
- func=self.crawl_metadata
+ logfunc=LOG.debug,
+ msg="Crawl of metadata service",
+ func=self.crawl_metadata,
)
except Exception as e:
report_diagnostic_event(
- 'Could not crawl Azure metadata: %s' % e,
- logger_func=LOG.error)
+ "Could not crawl Azure metadata: %s" % e, logger_func=LOG.error
+ )
self._report_failure(
- description=DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE)
+ description=DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE
+ )
return False
+ finally:
+ self._teardown_ephemeral_networking()
- if (self.distro and self.distro.name == 'ubuntu' and
- self.ds_cfg.get('apply_network_config')):
+ if (
+ self.distro
+ and self.distro.name == "ubuntu"
+ and self.ds_cfg.get("apply_network_config")
+ ):
maybe_remove_ubuntu_network_config_scripts()
# Process crawled data and augment with various config defaults
- self.cfg = util.mergemanydict(
- [crawled_data['cfg'], BUILTIN_CLOUD_CONFIG])
- self._metadata_imds = crawled_data['metadata']['imds']
+
+ # Only merge in default cloud config related to the ephemeral disk
+ # if the ephemeral disk exists
+ devpath = RESOURCE_DISK_PATH
+ if os.path.exists(devpath):
+ report_diagnostic_event(
+ "Ephemeral resource disk '%s' exists. "
+ "Merging default Azure cloud ephemeral disk configs."
+ % devpath,
+ logger_func=LOG.debug,
+ )
+ self.cfg = util.mergemanydict(
+ [crawled_data["cfg"], BUILTIN_CLOUD_EPHEMERAL_DISK_CONFIG]
+ )
+ else:
+ report_diagnostic_event(
+ "Ephemeral resource disk '%s' does not exist. "
+ "Not merging default Azure cloud ephemeral disk configs."
+ % devpath,
+ logger_func=LOG.debug,
+ )
+ self.cfg = crawled_data["cfg"]
+
+ self._metadata_imds = crawled_data["metadata"]["imds"]
self.metadata = util.mergemanydict(
- [crawled_data['metadata'], DEFAULT_METADATA])
- self.userdata_raw = crawled_data['userdata_raw']
+ [crawled_data["metadata"], DEFAULT_METADATA]
+ )
+ self.userdata_raw = crawled_data["userdata_raw"]
user_ds_cfg = util.get_cfg_by_path(self.cfg, DS_CFG_PATH, {})
self.ds_cfg = util.mergemanydict([user_ds_cfg, self.ds_cfg])
@@ -631,41 +718,108 @@ class DataSourceAzure(sources.DataSource):
# walinux agent writes files world readable, but expects
# the directory to be protected.
write_files(
- self.ds_cfg['data_dir'], crawled_data['files'], dirmode=0o700)
+ self.ds_cfg["data_dir"], crawled_data["files"], dirmode=0o700
+ )
return True
+ @azure_ds_telemetry_reporter
+ def get_imds_data_with_api_fallback(
+ self,
+ *,
+ retries,
+ md_type=MetadataType.ALL,
+ exc_cb=retry_on_url_exc,
+ infinite=False,
+ ):
+ """
+ Wrapper for get_metadata_from_imds so that we can have flexibility
+ in which IMDS api-version we use. If a particular instance of IMDS
+ does not have the api version that is desired, we want to make
+ this fault tolerant and fall back to a good known minimum api
+ version.
+ """
+ for _ in range(retries):
+ try:
+ LOG.info("Attempting IMDS api-version: %s", IMDS_VER_WANT)
+ return get_metadata_from_imds(
+ retries=0,
+ md_type=md_type,
+ api_version=IMDS_VER_WANT,
+ exc_cb=exc_cb,
+ )
+ except UrlError as err:
+ LOG.info("UrlError with IMDS api-version: %s", IMDS_VER_WANT)
+ if err.code == 400:
+ log_msg = "Fall back to IMDS api-version: {}".format(
+ IMDS_VER_MIN
+ )
+ report_diagnostic_event(log_msg, logger_func=LOG.info)
+ break
+
+ LOG.info("Using IMDS api-version: %s", IMDS_VER_MIN)
+ return get_metadata_from_imds(
+ retries=retries,
+ md_type=md_type,
+ api_version=IMDS_VER_MIN,
+ exc_cb=exc_cb,
+ infinite=infinite,
+ )
+
def device_name_to_device(self, name):
- return self.ds_cfg['disk_aliases'].get(name)
+ return self.ds_cfg["disk_aliases"].get(name)
@azure_ds_telemetry_reporter
- def get_public_ssh_keys(self):
+ def get_public_ssh_keys(self) -> List[str]:
+ """
+ Retrieve public SSH keys.
"""
- Try to get the ssh keys from IMDS first, and if that fails
- (i.e. IMDS is unavailable) then fallback to getting the ssh
- keys from OVF.
+ try:
+ return self._get_public_keys_from_imds(self.metadata["imds"])
+ except (KeyError, ValueError):
+ pass
+
+ return self._get_public_keys_from_ovf()
+
+ def _get_public_keys_from_imds(self, imds_md: dict) -> List[str]:
+ """Get SSH keys from IMDS metadata.
- The benefit to getting keys from IMDS is a large performance
- advantage, so this is a strong preference. But we must keep
- OVF as a second option for environments that don't have IMDS.
+ :raises KeyError: if IMDS metadata is malformed/missing.
+ :raises ValueError: if key format is not supported.
+
+ :returns: List of keys.
"""
- LOG.debug('Retrieving public SSH keys')
- ssh_keys = []
try:
ssh_keys = [
- public_key['keyData']
- for public_key
- in self.metadata['imds']['compute']['publicKeys']
+ public_key["keyData"]
+ for public_key in imds_md["compute"]["publicKeys"]
]
- LOG.debug('Retrieved SSH keys from IMDS')
except KeyError:
- log_msg = 'Unable to get keys from IMDS, falling back to OVF'
+ log_msg = "No SSH keys found in IMDS metadata"
+ report_diagnostic_event(log_msg, logger_func=LOG.debug)
+ raise
+
+ if any(not _key_is_openssh_formatted(key=key) for key in ssh_keys):
+ log_msg = "Key(s) not in OpenSSH format"
+ report_diagnostic_event(log_msg, logger_func=LOG.debug)
+ raise ValueError(log_msg)
+
+ log_msg = "Retrieved {} keys from IMDS".format(len(ssh_keys))
+ report_diagnostic_event(log_msg, logger_func=LOG.debug)
+ return ssh_keys
+
+ def _get_public_keys_from_ovf(self) -> List[str]:
+ """Get SSH keys that were fetched from wireserver.
+
+ :returns: List of keys.
+ """
+ ssh_keys = []
+ try:
+ ssh_keys = self.metadata["public-keys"]
+ log_msg = "Retrieved {} keys from OVF".format(len(ssh_keys))
+ report_diagnostic_event(log_msg, logger_func=LOG.debug)
+ except KeyError:
+ log_msg = "No keys available from OVF"
report_diagnostic_event(log_msg, logger_func=LOG.debug)
- try:
- ssh_keys = self.metadata['public-keys']
- LOG.debug('Retrieved keys from OVF')
- except KeyError:
- log_msg = 'No keys available from OVF'
- report_diagnostic_event(log_msg, logger_func=LOG.debug)
return ssh_keys
@@ -678,33 +832,32 @@ class DataSourceAzure(sources.DataSource):
def _iid(self, previous=None):
prev_iid_path = os.path.join(
- self.paths.get_cpath('data'), 'instance-id')
- iid = dmi.read_dmi_data('system-uuid')
+ self.paths.get_cpath("data"), "instance-id"
+ )
+ # Older kernels than 4.15 will have UPPERCASE product_uuid.
+ # We don't want Azure to react to an UPPER/lower difference as a new
+ # instance id as it rewrites SSH host keys.
+ # LP: #1835584
+ system_uuid = dmi.read_dmi_data("system-uuid")
+ if system_uuid is None:
+ raise RuntimeError("failed to read system-uuid")
+
+ iid = system_uuid.lower()
if os.path.exists(prev_iid_path):
previous = util.load_file(prev_iid_path).strip()
- if is_byte_swapped(previous, iid):
+ if previous.lower() == iid:
+ # If uppercase/lowercase equivalent, return the previous value
+ # to avoid new instance id.
+ return previous
+ if is_byte_swapped(previous.lower(), iid):
return previous
return iid
@azure_ds_telemetry_reporter
- def setup(self, is_new_instance):
- if self._negotiated is False:
- LOG.debug("negotiating for %s (new_instance=%s)",
- self.get_instance_id(), is_new_instance)
- fabric_data = self._negotiate()
- LOG.debug("negotiating returned %s", fabric_data)
- if fabric_data:
- self.metadata.update(fabric_data)
- self._negotiated = True
- else:
- LOG.debug("negotiating already done for %s",
- self.get_instance_id())
-
- @azure_ds_telemetry_reporter
def _wait_for_nic_detach(self, nl_sock):
"""Use the netlink socket provided to wait for nic detach event.
- NOTE: The function doesn't close the socket. The caller owns closing
- the socket and disposing it safely.
+ NOTE: The function doesn't close the socket. The caller owns closing
+ the socket and disposing it safely.
"""
try:
ifname = None
@@ -712,106 +865,124 @@ class DataSourceAzure(sources.DataSource):
# Preprovisioned VM will only have one NIC, and it gets
# detached immediately after deployment.
with events.ReportEventStack(
- name="wait-for-nic-detach",
- description=("wait for nic detach"),
- parent=azure_ds_reporter):
+ name="wait-for-nic-detach",
+ description="wait for nic detach",
+ parent=azure_ds_reporter,
+ ):
ifname = netlink.wait_for_nic_detach_event(nl_sock)
if ifname is None:
- msg = ("Preprovisioned nic not detached as expected. "
- "Proceeding without failing.")
+ msg = (
+ "Preprovisioned nic not detached as expected. "
+ "Proceeding without failing."
+ )
report_diagnostic_event(msg, logger_func=LOG.warning)
else:
- report_diagnostic_event("The preprovisioned nic %s is detached"
- % ifname, logger_func=LOG.warning)
+ report_diagnostic_event(
+ "The preprovisioned nic %s is detached" % ifname,
+ logger_func=LOG.warning,
+ )
path = REPROVISION_NIC_DETACHED_MARKER_FILE
LOG.info("Creating a marker file for nic detached: %s", path)
- util.write_file(path, "{pid}: {time}\n".format(
- pid=os.getpid(), time=time()))
+ util.write_file(
+ path, "{pid}: {time}\n".format(pid=os.getpid(), time=time())
+ )
except AssertionError as error:
- report_diagnostic_event(error, logger_func=LOG.error)
+ report_diagnostic_event(str(error), logger_func=LOG.error)
raise
@azure_ds_telemetry_reporter
def wait_for_link_up(self, ifname):
"""In cases where the link state is still showing down after a nic is
- hot-attached, we can attempt to bring it up by forcing the hv_netvsc
- drivers to query the link state by unbinding and then binding the
- device. This function attempts infinitely until the link is up,
- because we cannot proceed further until we have a stable link."""
+ hot-attached, we can attempt to bring it up by forcing the hv_netvsc
+ drivers to query the link state by unbinding and then binding the
+ device. This function attempts infinitely until the link is up,
+ because we cannot proceed further until we have a stable link."""
if self.distro.networking.try_set_link_up(ifname):
- report_diagnostic_event("The link %s is already up." % ifname,
- logger_func=LOG.info)
+ report_diagnostic_event(
+ "The link %s is already up." % ifname, logger_func=LOG.info
+ )
return
- LOG.info("Attempting to bring %s up", ifname)
+ LOG.debug("Attempting to bring %s up", ifname)
attempts = 0
+ LOG.info("Unbinding and binding the interface %s", ifname)
while True:
-
- LOG.info("Unbinding and binding the interface %s", ifname)
- devicename = net.read_sys_net(ifname,
- 'device/device_id').strip('{}')
- util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/unbind',
- devicename)
- util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/bind',
- devicename)
+ device_id = net.read_sys_net(ifname, "device/device_id")
+ if device_id is False or not isinstance(device_id, str):
+ raise RuntimeError("Unable to read device ID: %s" % device_id)
+ devicename = device_id.strip("{}")
+ util.write_file(
+ "/sys/bus/vmbus/drivers/hv_netvsc/unbind", devicename
+ )
+ util.write_file(
+ "/sys/bus/vmbus/drivers/hv_netvsc/bind", devicename
+ )
attempts = attempts + 1
if self.distro.networking.try_set_link_up(ifname):
- msg = "The link %s is up after %s attempts" % (ifname,
- attempts)
+ msg = "The link %s is up after %s attempts" % (
+ ifname,
+ attempts,
+ )
report_diagnostic_event(msg, logger_func=LOG.info)
return
- sleep_duration = 1
- msg = ("Link is not up after %d attempts with %d seconds sleep "
- "between attempts." % (attempts, sleep_duration))
-
if attempts % 10 == 0:
+ msg = "Link is not up after %d attempts to rebind" % attempts
report_diagnostic_event(msg, logger_func=LOG.info)
- else:
LOG.info(msg)
- sleep(sleep_duration)
+ # It could take some time after rebind for the interface to be up.
+ # So poll for the status for some time before attempting to rebind
+ # again.
+ sleep_duration = 0.5
+ max_status_polls = 20
+ LOG.debug(
+ "Polling %d seconds for primary NIC link up after rebind.",
+ sleep_duration * max_status_polls,
+ )
+
+ for i in range(0, max_status_polls):
+ if self.distro.networking.is_up(ifname):
+ msg = (
+ "After %d attempts to rebind, link is up after "
+ "polling the link status %d times" % (attempts, i)
+ )
+ report_diagnostic_event(msg, logger_func=LOG.info)
+ LOG.debug(msg)
+ return
+ else:
+ sleep(sleep_duration)
@azure_ds_telemetry_reporter
def _create_report_ready_marker(self):
path = REPORTED_READY_MARKER_FILE
- LOG.info(
- "Creating a marker file to report ready: %s", path)
- util.write_file(path, "{pid}: {time}\n".format(
- pid=os.getpid(), time=time()))
+ LOG.info("Creating a marker file to report ready: %s", path)
+ util.write_file(
+ path, "{pid}: {time}\n".format(pid=os.getpid(), time=time())
+ )
report_diagnostic_event(
- 'Successfully created reported ready marker file '
- 'while in the preprovisioning pool.',
- logger_func=LOG.debug)
+ "Successfully created reported ready marker file "
+ "while in the preprovisioning pool.",
+ logger_func=LOG.debug,
+ )
@azure_ds_telemetry_reporter
- def _report_ready_if_needed(self):
- """Report ready to the platform if the marker file is not present,
- and create the marker file.
+ def _report_ready_for_pps(self) -> None:
+ """Report ready for PPS, creating the marker file upon completion.
+
+ :raises sources.InvalidMetaDataException: On error reporting ready.
"""
- have_not_reported_ready = (
- not os.path.isfile(REPORTED_READY_MARKER_FILE))
+ try:
+ self._report_ready()
+ except Exception as error:
+ msg = "Failed reporting ready while in the preprovisioning pool."
+ report_diagnostic_event(msg, logger_func=LOG.error)
+ raise sources.InvalidMetaDataException(msg) from error
- if have_not_reported_ready:
- report_diagnostic_event("Reporting ready before nic detach",
- logger_func=LOG.info)
- try:
- with EphemeralDHCPv4WithReporting(azure_ds_reporter) as lease:
- self._report_ready(lease=lease)
- except Exception as e:
- report_diagnostic_event("Exception reporting ready during "
- "preprovisioning before nic detach: %s"
- % e, logger_func=LOG.error)
- raise
- self._create_report_ready_marker()
- else:
- report_diagnostic_event("Already reported ready before nic detach."
- " The marker file already exists: %s" %
- REPORTED_READY_MARKER_FILE,
- logger_func=LOG.error)
+ self._create_report_ready_marker()
@azure_ds_telemetry_reporter
def _check_if_nic_is_primary(self, ifname):
@@ -822,61 +993,89 @@ class DataSourceAzure(sources.DataSource):
is_primary = False
expected_nic_count = -1
imds_md = None
+ metadata_poll_count = 0
+ metadata_logging_threshold = 1
+ expected_errors_count = 0
# For now, only a VM's primary NIC can contact IMDS and WireServer. If
# DHCP fails for a NIC, we have no mechanism to determine if the NIC is
- # primary or secondary. In this case, the desired behavior is to fail
- # VM provisioning if there is any DHCP failure when trying to determine
- # the primary NIC.
- try:
- with events.ReportEventStack(
- name="obtain-dhcp-lease",
- description=("obtain dhcp lease for %s when attempting to "
- "determine primary NIC during reprovision of "
- "a pre-provisioned VM" % ifname),
- parent=azure_ds_reporter):
- dhcp_ctx = EphemeralDHCPv4(
- iface=ifname,
- dhcp_log_func=dhcp_log_cb)
- dhcp_ctx.obtain_lease()
- except Exception as e:
- report_diagnostic_event("Giving up. Failed to obtain dhcp lease "
- "for %s when attempting to determine "
- "primary NIC during reprovision due to %s"
- % (ifname, e), logger_func=LOG.error)
- raise
+ # primary or secondary. In this case, retry DHCP until successful.
+ self._setup_ephemeral_networking(iface=ifname, timeout_minutes=20)
+
+ # Retry polling network metadata for a limited duration only when the
+ # calls fail due to network unreachable error or timeout.
+ # This is because the platform drops packets going towards IMDS
+ # when it is not a primary nic. If the calls fail due to other issues
+ # like 410, 503 etc, then it means we are primary but IMDS service
+ # is unavailable at the moment. Retry indefinitely in those cases
+ # since we cannot move on without the network metadata. In the future,
+ # all this will not be necessary, as a new dhcp option would tell
+ # whether the nic is primary or not.
+ def network_metadata_exc_cb(msg, exc):
+ nonlocal expected_errors_count, metadata_poll_count
+ nonlocal metadata_logging_threshold
+
+ metadata_poll_count = metadata_poll_count + 1
+
+ # Log when needed but back off exponentially to avoid exploding
+ # the log file.
+ if metadata_poll_count >= metadata_logging_threshold:
+ metadata_logging_threshold *= 2
+ report_diagnostic_event(
+ "Ran into exception when attempting to reach %s "
+ "after %d polls." % (msg, metadata_poll_count),
+ logger_func=LOG.error,
+ )
+
+ if isinstance(exc, UrlError):
+ report_diagnostic_event(
+ "poll IMDS with %s failed. Exception: %s and code: %s"
+ % (msg, exc.cause, exc.code),
+ logger_func=LOG.error,
+ )
+
+ # Retry up to a certain limit for both timeout and network
+ # unreachable errors.
+ if exc.cause and isinstance(
+ exc.cause, (requests.Timeout, requests.ConnectionError)
+ ):
+ expected_errors_count = expected_errors_count + 1
+ return expected_errors_count <= 10
+ return True
# Primary nic detection will be optimized in the future. The fact that
# primary nic is being attached first helps here. Otherwise each nic
# could add several seconds of delay.
try:
- imds_md = get_metadata_from_imds(
- ifname,
- 5,
- metadata_type.network)
+ imds_md = self.get_imds_data_with_api_fallback(
+ retries=0,
+ md_type=MetadataType.NETWORK,
+ exc_cb=network_metadata_exc_cb,
+ infinite=True,
+ )
except Exception as e:
LOG.warning(
"Failed to get network metadata using nic %s. Attempt to "
"contact IMDS failed with error %s. Assuming this is not the "
- "primary nic.", ifname, e)
- finally:
- # If we are not the primary nic, then clean the dhcp context.
- if imds_md is None:
- dhcp_ctx.clean_network()
+ "primary nic.",
+ ifname,
+ e,
+ )
- if imds_md is not None:
+ if imds_md:
# Only primary NIC will get a response from IMDS.
LOG.info("%s is the primary nic", ifname)
is_primary = True
- # If primary, set ephemeral dhcp ctx so we can report ready
- self._ephemeral_dhcp_ctx = dhcp_ctx
-
# Set the expected nic count based on the response received.
- expected_nic_count = len(
- imds_md['interface'])
- report_diagnostic_event("Expected nic count: %d" %
- expected_nic_count, logger_func=LOG.info)
+ expected_nic_count = len(imds_md["interface"])
+ report_diagnostic_event(
+ "Expected nic count: %d" % expected_nic_count,
+ logger_func=LOG.info,
+ )
+ else:
+ # If we are not the primary nic, then clean the dhcp context.
+ self._teardown_ephemeral_networking()
return is_primary, expected_nic_count
@@ -901,17 +1100,22 @@ class DataSourceAzure(sources.DataSource):
while True:
ifname = None
with events.ReportEventStack(
- name="wait-for-nic-attach",
- description=("wait for nic attach after %d nics have "
- "been attached" % len(nics_found)),
- parent=azure_ds_reporter):
- ifname = netlink.wait_for_nic_attach_event(nl_sock,
- nics_found)
+ name="wait-for-nic-attach",
+ description=(
+ "wait for nic attach after %d nics have been attached"
+ % len(nics_found)
+ ),
+ parent=azure_ds_reporter,
+ ):
+ ifname = netlink.wait_for_nic_attach_event(
+ nl_sock, nics_found
+ )
# wait_for_nic_attach_event guarantees that ifname is not None
nics_found.append(ifname)
- report_diagnostic_event("Detected nic %s attached." % ifname,
- logger_func=LOG.info)
+ report_diagnostic_event(
+ "Detected nic %s attached." % ifname, logger_func=LOG.info
+ )
# Attempt to bring the interface's operating state to
# UP in case it is not already.
@@ -921,26 +1125,29 @@ class DataSourceAzure(sources.DataSource):
# platform will attach the primary nic first so we
# won't be in primary_nic_found = false state for long.
if not primary_nic_found:
- LOG.info("Checking if %s is the primary nic",
- ifname)
- (primary_nic_found, expected_nic_count) = (
- self._check_if_nic_is_primary(ifname))
+ LOG.info("Checking if %s is the primary nic", ifname)
+ (
+ primary_nic_found,
+ expected_nic_count,
+ ) = self._check_if_nic_is_primary(ifname)
# Exit criteria: check if we've discovered all nics
- if (expected_nic_count != -1
- and len(nics_found) >= expected_nic_count):
+ if (
+ expected_nic_count != -1
+ and len(nics_found) >= expected_nic_count
+ ):
LOG.info("Found all the nics for this VM.")
break
except AssertionError as error:
- report_diagnostic_event(error, logger_func=LOG.error)
+ report_diagnostic_event(str(error), logger_func=LOG.error)
@azure_ds_telemetry_reporter
def _wait_for_all_nics_ready(self):
"""Wait for nic(s) to be hot-attached. There may be multiple nics
- depending on the customer request.
- But only primary nic would be able to communicate with wireserver
- and IMDS. So we detect and save the primary nic to be used later.
+ depending on the customer request.
+ But only primary nic would be able to communicate with wireserver
+ and IMDS. So we detect and save the primary nic to be used later.
"""
nl_sock = None
@@ -948,18 +1155,22 @@ class DataSourceAzure(sources.DataSource):
nl_sock = netlink.create_bound_netlink_socket()
report_ready_marker_present = bool(
- os.path.isfile(REPORTED_READY_MARKER_FILE))
+ os.path.isfile(REPORTED_READY_MARKER_FILE)
+ )
# Report ready if the marker file is not already present.
# The nic of the preprovisioned vm gets hot-detached as soon as
# we report ready. So no need to save the dhcp context.
- self._report_ready_if_needed()
+ if not os.path.isfile(REPORTED_READY_MARKER_FILE):
+ self._report_ready_for_pps()
has_nic_been_detached = bool(
- os.path.isfile(REPROVISION_NIC_DETACHED_MARKER_FILE))
+ os.path.isfile(REPROVISION_NIC_DETACHED_MARKER_FILE)
+ )
if not has_nic_been_detached:
LOG.info("NIC has not been detached yet.")
+ self._teardown_ephemeral_networking()
self._wait_for_nic_detach(nl_sock)
# If we know that the preprovisioned nic has been detached, and we
@@ -970,31 +1181,35 @@ class DataSourceAzure(sources.DataSource):
if not self.fallback_interface:
self._wait_for_hot_attached_nics(nl_sock)
else:
- report_diagnostic_event("Skipping waiting for nic attach "
- "because we already have a fallback "
- "interface. Report Ready marker "
- "present before detaching nics: %s" %
- report_ready_marker_present,
- logger_func=LOG.info)
+ report_diagnostic_event(
+ "Skipping waiting for nic attach "
+ "because we already have a fallback "
+ "interface. Report Ready marker "
+ "present before detaching nics: %s"
+ % report_ready_marker_present,
+ logger_func=LOG.info,
+ )
except netlink.NetlinkCreateSocketError as e:
- report_diagnostic_event(e, logger_func=LOG.warning)
+ report_diagnostic_event(str(e), logger_func=LOG.warning)
raise
finally:
if nl_sock:
nl_sock.close()
+ @azure_ds_telemetry_reporter
def _poll_imds(self):
"""Poll IMDS for the new provisioning data until we get a valid
response. Then return the returned JSON object."""
- url = metadata_type.reprovisiondata.value
+ url = "{}?api-version={}".format(
+ MetadataType.REPROVISION_DATA.value, IMDS_VER_MIN
+ )
headers = {"Metadata": "true"}
nl_sock = None
report_ready = bool(not os.path.isfile(REPORTED_READY_MARKER_FILE))
self.imds_logging_threshold = 1
self.imds_poll_counter = 1
dhcp_attempts = 0
- vnet_switched = False
- return_val = None
+ reprovision_data = None
def exc_cb(msg, exception):
if isinstance(exception, UrlError):
@@ -1002,339 +1217,328 @@ class DataSourceAzure(sources.DataSource):
if self.imds_poll_counter == self.imds_logging_threshold:
# Reducing the logging frequency as we are polling IMDS
self.imds_logging_threshold *= 2
- LOG.debug("Backing off logging threshold for the same "
- "exception to %d",
- self.imds_logging_threshold)
- report_diagnostic_event("poll IMDS with %s failed. "
- "Exception: %s and code: %s" %
- (msg, exception.cause,
- exception.code),
- logger_func=LOG.debug)
+ LOG.debug(
+ "Backing off logging threshold for the same "
+ "exception to %d",
+ self.imds_logging_threshold,
+ )
+ report_diagnostic_event(
+ "poll IMDS with %s failed. "
+ "Exception: %s and code: %s"
+ % (msg, exception.cause, exception.code),
+ logger_func=LOG.debug,
+ )
self.imds_poll_counter += 1
return True
else:
# If we get an exception while trying to call IMDS, we call
# DHCP and setup the ephemeral network to acquire a new IP.
- report_diagnostic_event("poll IMDS with %s failed. "
- "Exception: %s and code: %s" %
- (msg, exception.cause,
- exception.code),
- logger_func=LOG.warning)
+ report_diagnostic_event(
+ "poll IMDS with %s failed. Exception: %s and code: %s"
+ % (msg, exception.cause, exception.code),
+ logger_func=LOG.warning,
+ )
return False
report_diagnostic_event(
- "poll IMDS failed with an "
- "unexpected exception: %s" % exception,
- logger_func=LOG.warning)
+ "poll IMDS failed with an unexpected exception: %s"
+ % exception,
+ logger_func=LOG.warning,
+ )
return False
- # When the interface is hot-attached, we would have already
- # done dhcp and set the dhcp context. In that case, skip
- # the attempt to do dhcp.
- is_ephemeral_ctx_present = self._ephemeral_dhcp_ctx is not None
- msg = ("Unexpected error. Dhcp context is not expected to be already "
- "set when we need to wait for vnet switch")
- if is_ephemeral_ctx_present and report_ready:
- report_diagnostic_event(msg, logger_func=LOG.error)
- raise RuntimeError(msg)
+ if report_ready:
+ # Networking must be up for netlink to detect
+ # media disconnect/connect. It may be down due to
+ # initial DHCP failure, if so check for it and retry,
+ # ensuring we flag it as required.
+ if not self._is_ephemeral_networking_up():
+ self._setup_ephemeral_networking(timeout_minutes=20)
- while True:
try:
- # Since is_ephemeral_ctx_present is set only once, this ensures
- # that with regular reprovisioning, dhcp is always done every
- # time the loop runs.
- if not is_ephemeral_ctx_present:
- # Save our EphemeralDHCPv4 context to avoid repeated dhcp
- # later when we report ready
- with events.ReportEventStack(
- name="obtain-dhcp-lease",
- description="obtain dhcp lease",
- parent=azure_ds_reporter):
- self._ephemeral_dhcp_ctx = EphemeralDHCPv4(
- dhcp_log_func=dhcp_log_cb)
- lease = self._ephemeral_dhcp_ctx.obtain_lease()
-
- if vnet_switched:
- dhcp_attempts += 1
- if report_ready:
+ if (
+ self._ephemeral_dhcp_ctx is None
+ or self._ephemeral_dhcp_ctx.iface is None
+ ):
+ raise RuntimeError("Missing ephemeral context")
+ iface = self._ephemeral_dhcp_ctx.iface
+
+ nl_sock = netlink.create_bound_netlink_socket()
+ self._report_ready_for_pps()
+
+ LOG.debug(
+ "Wait for vnetswitch to happen on %s",
+ iface,
+ )
+ with events.ReportEventStack(
+ name="wait-for-media-disconnect-connect",
+ description="wait for vnet switch",
+ parent=azure_ds_reporter,
+ ):
try:
- nl_sock = netlink.create_bound_netlink_socket()
- except netlink.NetlinkCreateSocketError as e:
+ netlink.wait_for_media_disconnect_connect(
+ nl_sock, iface
+ )
+ except AssertionError as e:
report_diagnostic_event(
- 'Failed to create bound netlink socket: %s' % e,
- logger_func=LOG.warning)
- self._ephemeral_dhcp_ctx.clean_network()
- break
-
- report_ready_succeeded = self._report_ready(lease=lease)
- if not report_ready_succeeded:
- msg = ('Failed reporting ready while in '
- 'the preprovisioning pool.')
- report_diagnostic_event(msg, logger_func=LOG.error)
- self._ephemeral_dhcp_ctx.clean_network()
- raise sources.InvalidMetaDataException(msg)
-
- self._create_report_ready_marker()
- report_ready = False
-
- LOG.debug("Wait for vnetswitch to happen")
- with events.ReportEventStack(
- name="wait-for-media-disconnect-connect",
- description="wait for vnet switch",
- parent=azure_ds_reporter):
- try:
- netlink.wait_for_media_disconnect_connect(
- nl_sock, lease['interface'])
- except AssertionError as e:
- report_diagnostic_event(
- 'Error while waiting for vnet switch: %s' % e,
- logger_func=LOG.error)
- break
-
- vnet_switched = True
- self._ephemeral_dhcp_ctx.clean_network()
- else:
- with events.ReportEventStack(
- name="get-reprovision-data-from-imds",
- description="get reprovision data from imds",
- parent=azure_ds_reporter):
- return_val = readurl(url,
- timeout=IMDS_TIMEOUT_IN_SECONDS,
- headers=headers,
- exception_cb=exc_cb,
- infinite=True,
- log_req_resp=False).contents
- break
- except UrlError:
- # Teardown our EphemeralDHCPv4 context on failure as we retry
- self._ephemeral_dhcp_ctx.clean_network()
+ "Error while waiting for vnet switch: %s" % e,
+ logger_func=LOG.error,
+ )
+ except netlink.NetlinkCreateSocketError as e:
+ report_diagnostic_event(
+ "Failed to create bound netlink socket: %s" % e,
+ logger_func=LOG.warning,
+ )
+ raise sources.InvalidMetaDataException(
+ "Failed to report ready while in provisioning pool."
+ ) from e
+ except NoDHCPLeaseError as e:
+ report_diagnostic_event(
+ "DHCP failed while in provisioning pool",
+ logger_func=LOG.warning,
+ )
+ raise sources.InvalidMetaDataException(
+ "Failed to report ready while in provisioning pool."
+ ) from e
finally:
if nl_sock:
nl_sock.close()
- if vnet_switched:
- report_diagnostic_event("attempted dhcp %d times after reuse" %
- dhcp_attempts,
- logger_func=LOG.debug)
- report_diagnostic_event("polled imds %d times after reuse" %
- self.imds_poll_counter,
- logger_func=LOG.debug)
+ # Teardown old network configuration.
+ self._teardown_ephemeral_networking()
- return return_val
+ while not reprovision_data:
+ if not self._is_ephemeral_networking_up():
+ dhcp_attempts += 1
+ try:
+ self._setup_ephemeral_networking(timeout_minutes=5)
+ except NoDHCPLeaseError:
+ continue
+
+ with events.ReportEventStack(
+ name="get-reprovision-data-from-imds",
+ description="get reprovision data from imds",
+ parent=azure_ds_reporter,
+ ):
+ try:
+ reprovision_data = readurl(
+ url,
+ timeout=IMDS_TIMEOUT_IN_SECONDS,
+ headers=headers,
+ exception_cb=exc_cb,
+ infinite=True,
+ log_req_resp=False,
+ ).contents
+ except UrlError:
+ self._teardown_ephemeral_networking()
+ continue
+
+ report_diagnostic_event(
+ "attempted dhcp %d times after reuse" % dhcp_attempts,
+ logger_func=LOG.debug,
+ )
+ report_diagnostic_event(
+ "polled imds %d times after reuse" % self.imds_poll_counter,
+ logger_func=LOG.debug,
+ )
+
+ return reprovision_data
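
For illustration, a minimal standalone sketch of the polling pattern above: bring up DHCP whenever networking is down, fetch the reprovision data from IMDS, and tear down and retry on failure. Every helper here is a local stub rather than a cloud-init API, and the attempt cap exists only so the sketch terminates (the real loop retries indefinitely).

import time

def bring_up_dhcp():
    # stand-in for self._setup_ephemeral_networking(); pretend DHCP succeeds
    return True

def fetch_imds(attempt):
    # stand-in for readurl(url, ..., infinite=True).contents; fail twice
    if attempt < 3:
        raise OSError("IMDS not reachable yet")
    return b"<ovf-env/>"

def poll_until_data(max_attempts=5):
    data, attempts = None, 0
    while data is None and attempts < max_attempts:
        attempts += 1
        if not bring_up_dhcp():
            continue                   # no DHCP lease yet: loop and retry
        try:
            data = fetch_imds(attempts)
        except OSError:
            # tear down and retry with a fresh lease, as the loop above does
            time.sleep(0.1)
    return data

print(poll_until_data())               # b'<ovf-env/>' after two simulated failures
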
@azure_ds_telemetry_reporter
- def _report_failure(self, description=None) -> bool:
+ def _report_failure(self, description: Optional[str] = None) -> bool:
"""Tells the Azure fabric that provisioning has failed.
@param description: A description of the error encountered.
@return: The success status of sending the failure signal.
"""
- unknown_245_key = 'unknown-245'
-
- try:
- if (self.distro.networking.is_up(self.fallback_interface) and
- getattr(self, '_ephemeral_dhcp_ctx', None) and
- getattr(self._ephemeral_dhcp_ctx, 'lease', None) and
- unknown_245_key in self._ephemeral_dhcp_ctx.lease):
+ if self._is_ephemeral_networking_up():
+ try:
report_diagnostic_event(
- 'Using cached ephemeral dhcp context '
- 'to report failure to Azure', logger_func=LOG.debug)
+ "Using cached ephemeral dhcp context "
+ "to report failure to Azure",
+ logger_func=LOG.debug,
+ )
report_failure_to_fabric(
- dhcp_opts=self._ephemeral_dhcp_ctx.lease[unknown_245_key],
- description=description)
- self._ephemeral_dhcp_ctx.clean_network() # Teardown ephemeral
+ dhcp_opts=self._wireserver_endpoint,
+ description=description,
+ )
return True
- except Exception as e:
- report_diagnostic_event(
- 'Failed to report failure using '
- 'cached ephemeral dhcp context: %s' % e,
- logger_func=LOG.error)
-
- try:
- report_diagnostic_event(
- 'Using new ephemeral dhcp to report failure to Azure',
- logger_func=LOG.debug)
- with EphemeralDHCPv4WithReporting(azure_ds_reporter) as lease:
- report_failure_to_fabric(
- dhcp_opts=lease[unknown_245_key],
- description=description)
- return True
- except Exception as e:
- report_diagnostic_event(
- 'Failed to report failure using new ephemeral dhcp: %s' % e,
- logger_func=LOG.debug)
+ except Exception as e:
+ report_diagnostic_event(
+ "Failed to report failure using "
+ "cached ephemeral dhcp context: %s" % e,
+ logger_func=LOG.error,
+ )
try:
report_diagnostic_event(
- 'Using fallback lease to report failure to Azure')
+ "Using new ephemeral dhcp to report failure to Azure",
+ logger_func=LOG.debug,
+ )
+ self._teardown_ephemeral_networking()
+ try:
+ self._setup_ephemeral_networking(timeout_minutes=20)
+ except NoDHCPLeaseError:
+ # Reporting failure will fail, but it will emit telemetry.
+ pass
report_failure_to_fabric(
- fallback_lease_file=self.dhclient_lease_file,
- description=description)
+ dhcp_opts=self._wireserver_endpoint, description=description
+ )
return True
except Exception as e:
report_diagnostic_event(
- 'Failed to report failure using fallback lease: %s' % e,
- logger_func=LOG.debug)
+ "Failed to report failure using new ephemeral dhcp: %s" % e,
+ logger_func=LOG.debug,
+ )
return False
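
A hedged sketch of the two-stage failure reporting above: prefer the ephemeral network that is already up, otherwise attempt a fresh DHCP setup and report anyway. The helpers are local stubs, and the endpoint value assumes the well-known Azure wireserver address behind DEFAULT_WIRESERVER_ENDPOINT.

WIRESERVER = "168.63.129.16"           # assumed default wireserver endpoint

def report_via_wireserver(endpoint):
    # stand-in for report_failure_to_fabric(dhcp_opts=..., description=...)
    print("reported failure via", endpoint)

def networking_up():
    return False                       # pretend no cached ephemeral context

def bring_up_dhcp():
    pass                               # stand-in for _setup_ephemeral_networking

def report_failure():
    if networking_up():
        try:
            report_via_wireserver(WIRESERVER)
            return True
        except Exception:
            pass                       # fall through to a fresh DHCP attempt
    try:
        bring_up_dhcp()                # best effort; failures still emit telemetry
        report_via_wireserver(WIRESERVER)
        return True
    except Exception:
        return False

print(report_failure())                # True, via the fresh-DHCP path
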
- def _report_ready(self, lease: dict) -> bool:
+ @azure_ds_telemetry_reporter
+ def _report_ready(
+ self, *, pubkey_info: Optional[List[str]] = None
+ ) -> Optional[List[str]]:
"""Tells the fabric provisioning has completed.
- @param lease: dhcp lease to use for sending the ready signal.
- @return: The success status of sending the ready signal.
+ :param pubkey_info: Fingerprints of keys to request from Wireserver.
+
+ :raises Exception: if failed to report.
+
+ :returns: List of SSH keys, if requested.
"""
try:
- get_metadata_from_fabric(None, lease['unknown-245'])
- return True
+ data = get_metadata_from_fabric(
+ fallback_lease_file=None,
+ dhcp_opts=self._wireserver_endpoint,
+ iso_dev=self._iso_dev,
+ pubkey_info=pubkey_info,
+ )
except Exception as e:
report_diagnostic_event(
"Error communicating with Azure fabric; You may experience "
- "connectivity issues: %s" % e, logger_func=LOG.warning)
- return False
+ "connectivity issues: %s" % e,
+ logger_func=LOG.warning,
+ )
+ raise
- def _should_reprovision_after_nic_attach(self, candidate_metadata) -> bool:
- """Whether or not we should wait for nic attach and then poll
- IMDS for reprovisioning data. Also sets a marker file to poll IMDS.
+        # Reporting ready ejected the OVF media; no need to eject it again.
+ self._iso_dev = None
+ return data
- The marker file is used for the following scenario: the VM boots into
- wait for nic attach, which we expect to be proceeding infinitely until
- the nic is attached. If for whatever reason the platform moves us to a
- new host (for instance a hardware issue), we need to keep waiting.
- However, since the VM reports ready to the Fabric, we will not attach
- the ISO, thus cloud-init needs to have a way of knowing that it should
- jump back into the waiting mode in order to retrieve the ovf_env.
+ def _ppstype_from_imds(self, imds_md: dict) -> Optional[str]:
+ try:
+ return imds_md["extended"]["compute"]["ppsType"]
+ except Exception as e:
+ report_diagnostic_event(
+ "Could not retrieve pps configuration from IMDS: %s" % e,
+ logger_func=LOG.debug,
+ )
+ return None
- @param candidate_metadata: Metadata obtained from reading ovf-env.
- @return: Whether to reprovision after waiting for nics to be attached.
- """
- if not candidate_metadata:
- return False
- (_md, _userdata_raw, cfg, _files) = candidate_metadata
- path = REPROVISION_NIC_ATTACH_MARKER_FILE
- if (cfg.get('PreprovisionedVMType', None) == "Savable" or
- os.path.isfile(path)):
- if not os.path.isfile(path):
- LOG.info("Creating a marker file to wait for nic attach: %s",
- path)
- util.write_file(path, "{pid}: {time}\n".format(
- pid=os.getpid(), time=time()))
- return True
- return False
+ def _determine_pps_type(self, ovf_cfg: dict, imds_md: dict) -> PPSType:
+ """Determine PPS type using OVF, IMDS data, and reprovision marker."""
+ if os.path.isfile(REPROVISION_MARKER_FILE):
+ pps_type = PPSType.UNKNOWN
+ elif (
+ ovf_cfg.get("PreprovisionedVMType", None) == PPSType.SAVABLE.value
+ or self._ppstype_from_imds(imds_md) == PPSType.SAVABLE.value
+ ):
+ pps_type = PPSType.SAVABLE
+ elif (
+ ovf_cfg.get("PreprovisionedVm") is True
+ or ovf_cfg.get("PreprovisionedVMType", None)
+ == PPSType.RUNNING.value
+ or self._ppstype_from_imds(imds_md) == PPSType.RUNNING.value
+ ):
+ pps_type = PPSType.RUNNING
+ else:
+ pps_type = PPSType.NONE
- def _should_reprovision(self, ret):
- """Whether or not we should poll IMDS for reprovisioning data.
- Also sets a marker file to poll IMDS.
-
- The marker file is used for the following scenario: the VM boots into
- this polling loop, which we expect to be proceeding infinitely until
- the VM is picked. If for whatever reason the platform moves us to a
- new host (for instance a hardware issue), we need to keep polling.
- However, since the VM reports ready to the Fabric, we will not attach
- the ISO, thus cloud-init needs to have a way of knowing that it should
- jump back into the polling loop in order to retrieve the ovf_env."""
- if not ret:
- return False
- (_md, _userdata_raw, cfg, _files) = ret
- path = REPROVISION_MARKER_FILE
- if (cfg.get('PreprovisionedVm') is True or
- cfg.get('PreprovisionedVMType', None) == 'Running' or
- os.path.isfile(path)):
- if not os.path.isfile(path):
- LOG.info("Creating a marker file to poll imds: %s",
- path)
- util.write_file(path, "{pid}: {time}\n".format(
- pid=os.getpid(), time=time()))
- return True
- return False
+ report_diagnostic_event(
+ "PPS type: %s" % pps_type.value, logger_func=LOG.info
+ )
+ return pps_type
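
Restated as a self-contained sketch, the precedence in _determine_pps_type is: an existing reprovision marker wins, then Savable, then Running, else none. The enum values mirror the strings compared above; the marker path below is illustrative.

import os
from enum import Enum

class PPSType(Enum):
    NONE = "None"
    RUNNING = "Running"
    SAVABLE = "Savable"
    UNKNOWN = "Unknown"

def determine_pps_type(ovf_cfg, imds_pps, marker="/tmp/reprovision-marker"):
    if os.path.isfile(marker):                      # rebooted mid-reprovision
        return PPSType.UNKNOWN
    if (ovf_cfg.get("PreprovisionedVMType") == PPSType.SAVABLE.value
            or imds_pps == PPSType.SAVABLE.value):
        return PPSType.SAVABLE
    if (ovf_cfg.get("PreprovisionedVm") is True
            or ovf_cfg.get("PreprovisionedVMType") == PPSType.RUNNING.value
            or imds_pps == PPSType.RUNNING.value):
        return PPSType.RUNNING
    return PPSType.NONE

print(determine_pps_type({"PreprovisionedVm": True}, None))  # PPSType.RUNNING
print(determine_pps_type({}, "Savable"))                     # PPSType.SAVABLE
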
+ def _write_reprovision_marker(self):
+ """Write reprovision marker file in case system is rebooted."""
+ LOG.info(
+ "Creating a marker file to poll imds: %s", REPROVISION_MARKER_FILE
+ )
+ util.write_file(
+ REPROVISION_MARKER_FILE,
+ "{pid}: {time}\n".format(pid=os.getpid(), time=time()),
+ )
+
+ @azure_ds_telemetry_reporter
def _reprovision(self):
- """Initiate the reprovisioning workflow."""
+ """Initiate the reprovisioning workflow.
+
+ Ephemeral networking is up upon successful reprovisioning.
+ """
contents = self._poll_imds()
with events.ReportEventStack(
- name="reprovisioning-read-azure-ovf",
- description="read azure ovf during reprovisioning",
- parent=azure_ds_reporter):
+ name="reprovisioning-read-azure-ovf",
+ description="read azure ovf during reprovisioning",
+ parent=azure_ds_reporter,
+ ):
md, ud, cfg = read_azure_ovf(contents)
- return (md, ud, cfg, {'ovf-env.xml': contents})
+ return (md, ud, cfg, {"ovf-env.xml": contents})
@azure_ds_telemetry_reporter
- def _negotiate(self):
- """Negotiate with fabric and return data from it.
+ def _determine_wireserver_pubkey_info(
+ self, *, cfg: dict, imds_md: dict
+ ) -> Optional[List[str]]:
+ """Determine the fingerprints we need to retrieve from Wireserver.
- On success, returns a dictionary including 'public_keys'.
- On failure, returns False.
+ :return: List of keys to request from Wireserver, if any, else None.
"""
-
- if self.ds_cfg['agent_command'] == AGENT_START_BUILTIN:
- self.bounce_network_with_azure_hostname()
-
- pubkey_info = None
- try:
- public_keys = self.metadata['imds']['compute']['publicKeys']
- LOG.debug(
- 'Successfully retrieved %s key(s) from IMDS',
- len(public_keys)
- if public_keys is not None
- else 0
- )
- except KeyError:
- LOG.debug(
- 'Unable to retrieve SSH keys from IMDS during '
- 'negotiation, falling back to OVF'
- )
- pubkey_info = self.cfg.get('_pubkeys', None)
-
- metadata_func = partial(get_metadata_from_fabric,
- fallback_lease_file=self.
- dhclient_lease_file,
- pubkey_info=pubkey_info)
- else:
- metadata_func = self.get_metadata_from_agent
-
- LOG.debug("negotiating with fabric via agent command %s",
- self.ds_cfg['agent_command'])
+ pubkey_info: Optional[List[str]] = None
try:
- fabric_data = metadata_func()
- except Exception as e:
- report_diagnostic_event(
- "Error communicating with Azure fabric; You may experience "
- "connectivity issues: %s" % e, logger_func=LOG.warning)
- return False
+ self._get_public_keys_from_imds(imds_md)
+ except (KeyError, ValueError):
+ pubkey_info = cfg.get("_pubkeys", None)
+ log_msg = "Retrieved {} fingerprints from OVF".format(
+ len(pubkey_info) if pubkey_info is not None else 0
+ )
+ report_diagnostic_event(log_msg, logger_func=LOG.debug)
+ return pubkey_info
+ def _cleanup_markers(self):
+ """Cleanup any marker files."""
util.del_file(REPORTED_READY_MARKER_FILE)
util.del_file(REPROVISION_MARKER_FILE)
- util.del_file(REPROVISION_NIC_ATTACH_MARKER_FILE)
util.del_file(REPROVISION_NIC_DETACHED_MARKER_FILE)
- return fabric_data
@azure_ds_telemetry_reporter
def activate(self, cfg, is_new_instance):
try:
- address_ephemeral_resize(is_new_instance=is_new_instance,
- preserve_ntfs=self.ds_cfg.get(
- DS_CFG_KEY_PRESERVE_NTFS, False))
+ address_ephemeral_resize(
+ is_new_instance=is_new_instance,
+ preserve_ntfs=self.ds_cfg.get(DS_CFG_KEY_PRESERVE_NTFS, False),
+ )
finally:
- push_log_to_kvp(self.sys_cfg['def_log_file'])
+ push_log_to_kvp(self.sys_cfg["def_log_file"])
return
@property
def availability_zone(self):
- return self.metadata.get(
- 'imds', {}).get('compute', {}).get('platformFaultDomain')
+ return (
+ self.metadata.get("imds", {})
+ .get("compute", {})
+ .get("platformFaultDomain")
+ )
@property
def network_config(self):
"""Generate a network config like net.generate_fallback_network() with
- the following exceptions.
+ the following exceptions.
- 1. Probe the drivers of the net-devices present and inject them in
- the network configuration under params: driver: <driver> value
- 2. Generate a fallback network config that does not include any of
- the blacklisted devices.
+ 1. Probe the drivers of the net-devices present and inject them in
+ the network configuration under params: driver: <driver> value
+ 2. Generate a fallback network config that does not include any of
+ the blacklisted devices.
"""
if not self._network_config or self._network_config == sources.UNSET:
- if self.ds_cfg.get('apply_network_config'):
+ if self.ds_cfg.get("apply_network_config"):
nc_src = self._metadata_imds
else:
nc_src = None
@@ -1343,7 +1547,103 @@ class DataSourceAzure(sources.DataSource):
@property
def region(self):
- return self.metadata.get('imds', {}).get('compute', {}).get('location')
+ return self.metadata.get("imds", {}).get("compute", {}).get("location")
+
+ @azure_ds_telemetry_reporter
+ def validate_imds_network_metadata(self, imds_md: dict) -> bool:
+ """Validate IMDS network config and report telemetry for errors."""
+ local_macs = get_hv_netvsc_macs_normalized()
+
+ try:
+ network_config = imds_md["network"]
+ imds_macs = [
+ normalize_mac_address(i["macAddress"])
+ for i in network_config["interface"]
+ ]
+ except KeyError:
+ report_diagnostic_event(
+ "IMDS network metadata has incomplete configuration: %r"
+ % imds_md.get("network"),
+ logger_func=LOG.warning,
+ )
+ return False
+
+ missing_macs = [m for m in local_macs if m not in imds_macs]
+ if not missing_macs:
+ return True
+
+ report_diagnostic_event(
+ "IMDS network metadata is missing configuration for NICs %r: %r"
+ % (missing_macs, network_config),
+ logger_func=LOG.warning,
+ )
+
+ if not self._ephemeral_dhcp_ctx or not self._ephemeral_dhcp_ctx.iface:
+ # No primary interface to check against.
+ return False
+
+ primary_mac = net.get_interface_mac(self._ephemeral_dhcp_ctx.iface)
+ if not primary_mac or not isinstance(primary_mac, str):
+ # Unexpected data for primary interface.
+ return False
+
+ primary_mac = normalize_mac_address(primary_mac)
+ if primary_mac in missing_macs:
+ report_diagnostic_event(
+ "IMDS network metadata is missing primary NIC %r: %r"
+ % (primary_mac, network_config),
+ logger_func=LOG.warning,
+ )
+
+ return False
+
+
+def _username_from_imds(imds_data):
+ try:
+ return imds_data["compute"]["osProfile"]["adminUsername"]
+ except KeyError:
+ return None
+
+
+def _userdata_from_imds(imds_data):
+ try:
+ return imds_data["compute"]["userData"]
+ except KeyError:
+ return None
+
+
+def _hostname_from_imds(imds_data):
+ try:
+ return imds_data["compute"]["osProfile"]["computerName"]
+ except KeyError:
+ return None
+
+
+def _disable_password_from_imds(imds_data):
+ try:
+ return (
+ imds_data["compute"]["osProfile"]["disablePasswordAuthentication"]
+ == "true"
+ )
+ except KeyError:
+ return None
+
+
+def _key_is_openssh_formatted(key):
+ """
+ Validate whether or not the key is OpenSSH-formatted.
+ """
+ # See https://bugs.launchpad.net/cloud-init/+bug/1910835
+ if "\r\n" in key.strip():
+ return False
+
+ parser = ssh_util.AuthKeyLineParser()
+ try:
+ akl = parser.parse(key)
+ except TypeError:
+ return False
+
+ return akl.keytype is not None
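
A rough standalone approximation of _key_is_openssh_formatted, for experimentation only: certificate-style blobs carrying embedded CRLFs (the LP: #1910835 case) are rejected, and the first field must look like an OpenSSH keytype. The real check delegates to ssh_util.AuthKeyLineParser rather than the small keytype list assumed here.

def key_is_openssh_formatted(key: str) -> bool:
    if "\r\n" in key.strip():          # multi-line certificate blob, not a key
        return False
    keytypes = ("ssh-rsa", "ssh-ed25519", "ecdsa-sha2-nistp256")
    fields = key.strip().split()
    return len(fields) >= 2 and fields[0] in keytypes

print(key_is_openssh_formatted("ssh-ed25519 AAAAC3NzaIllustrative user@host"))  # True
print(key_is_openssh_formatted("MIICWg\r\nMIICXA certificate-style blob"))      # False
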
def _partitions_on_device(devpath, maxnum=16):
@@ -1362,7 +1662,7 @@ def _partitions_on_device(devpath, maxnum=16):
@azure_ds_telemetry_reporter
def _has_ntfs_filesystem(devpath):
ntfs_devices = util.find_devs_with("TYPE=ntfs", no_cache=True)
- LOG.debug('ntfs_devices found = %s', ntfs_devices)
+ LOG.debug("ntfs_devices found = %s", ntfs_devices)
return os.path.realpath(devpath) in ntfs_devices
@@ -1386,24 +1686,29 @@ def can_dev_be_reformatted(devpath, preserve_ntfs):
If cloud-init cannot mount the disk to check for data, destruction
will be allowed, unless the dscfg key is set."""
if preserve_ntfs:
- msg = ('config says to never destroy NTFS (%s.%s), skipping checks' %
- (".".join(DS_CFG_PATH), DS_CFG_KEY_PRESERVE_NTFS))
+ msg = "config says to never destroy NTFS (%s.%s), skipping checks" % (
+ ".".join(DS_CFG_PATH),
+ DS_CFG_KEY_PRESERVE_NTFS,
+ )
return False, msg
if not os.path.exists(devpath):
- return False, 'device %s does not exist' % devpath
+ return False, "device %s does not exist" % devpath
- LOG.debug('Resolving realpath of %s -> %s', devpath,
- os.path.realpath(devpath))
+ LOG.debug(
+ "Resolving realpath of %s -> %s", devpath, os.path.realpath(devpath)
+ )
# devpath of /dev/sd[a-z] or /dev/disk/cloud/azure_resource
# where partitions are "<devpath>1" or "<devpath>-part1" or "<devpath>p1"
partitions = _partitions_on_device(devpath)
if len(partitions) == 0:
- return False, 'device %s was not partitioned' % devpath
+ return False, "device %s was not partitioned" % devpath
elif len(partitions) > 2:
- msg = ('device %s had 3 or more partitions: %s' %
- (devpath, ' '.join([p[1] for p in partitions])))
+ msg = "device %s had 3 or more partitions: %s" % (
+ devpath,
+ " ".join([p[1] for p in partitions]),
+ )
return False, msg
elif len(partitions) == 2:
cand_part, cand_path = partitions[1]
@@ -1411,66 +1716,78 @@ def can_dev_be_reformatted(devpath, preserve_ntfs):
cand_part, cand_path = partitions[0]
if not _has_ntfs_filesystem(cand_path):
- msg = ('partition %s (%s) on device %s was not ntfs formatted' %
- (cand_part, cand_path, devpath))
+ msg = "partition %s (%s) on device %s was not ntfs formatted" % (
+ cand_part,
+ cand_path,
+ devpath,
+ )
return False, msg
@azure_ds_telemetry_reporter
def count_files(mp):
- ignored = set(['dataloss_warning_readme.txt'])
+ ignored = set(["dataloss_warning_readme.txt"])
return len([f for f in os.listdir(mp) if f.lower() not in ignored])
- bmsg = ('partition %s (%s) on device %s was ntfs formatted' %
- (cand_part, cand_path, devpath))
+ bmsg = "partition %s (%s) on device %s was ntfs formatted" % (
+ cand_part,
+ cand_path,
+ devpath,
+ )
with events.ReportEventStack(
name="mount-ntfs-and-count",
description="mount-ntfs-and-count",
- parent=azure_ds_reporter
+ parent=azure_ds_reporter,
) as evt:
try:
- file_count = util.mount_cb(cand_path, count_files, mtype="ntfs",
- update_env_for_mount={'LANG': 'C'})
+ file_count = util.mount_cb(
+ cand_path,
+ count_files,
+ mtype="ntfs",
+ update_env_for_mount={"LANG": "C"},
+ )
except util.MountFailedError as e:
evt.description = "cannot mount ntfs"
if "unknown filesystem type 'ntfs'" in str(e):
- return True, (bmsg + ' but this system cannot mount NTFS,'
- ' assuming there are no important files.'
- ' Formatting allowed.')
- return False, bmsg + ' but mount of %s failed: %s' % (cand_part, e)
+ return (
+ True,
+ (
+ bmsg + " but this system cannot mount NTFS,"
+ " assuming there are no important files."
+ " Formatting allowed."
+ ),
+ )
+ return False, bmsg + " but mount of %s failed: %s" % (cand_part, e)
if file_count != 0:
evt.description = "mounted and counted %d files" % file_count
- LOG.warning("it looks like you're using NTFS on the ephemeral"
- " disk, to ensure that filesystem does not get wiped,"
- " set %s.%s in config", '.'.join(DS_CFG_PATH),
- DS_CFG_KEY_PRESERVE_NTFS)
- return False, bmsg + ' but had %d files on it.' % file_count
+ LOG.warning(
+ "it looks like you're using NTFS on the ephemeral"
+ " disk, to ensure that filesystem does not get wiped,"
+ " set %s.%s in config",
+ ".".join(DS_CFG_PATH),
+ DS_CFG_KEY_PRESERVE_NTFS,
+ )
+ return False, bmsg + " but had %d files on it." % file_count
- return True, bmsg + ' and had no important files. Safe for reformatting.'
+ return True, bmsg + " and had no important files. Safe for reformatting."
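
Condensed into a decision table, the checks above allow reformatting only when NTFS preservation is not configured, the device exists with one or two partitions, the candidate partition is NTFS, and it holds no files beyond the data-loss readme. A toy predicate capturing that (not the real implementation):

def can_reformat(preserve_ntfs, exists, nparts, is_ntfs, file_count):
    if preserve_ntfs or not exists or nparts == 0 or nparts > 2:
        return False
    if not is_ntfs:
        return False
    return file_count == 0             # any user file blocks reformatting

assert can_reformat(False, True, 1, True, 0) is True
assert can_reformat(False, True, 1, True, 3) is False   # user files present
assert can_reformat(True, True, 1, True, 0) is False    # config preserves NTFS
print("decision table holds")
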
@azure_ds_telemetry_reporter
-def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120,
- is_new_instance=False, preserve_ntfs=False):
- # wait for ephemeral disk to come up
- naplen = .2
- with events.ReportEventStack(
- name="wait-for-ephemeral-disk",
- description="wait for ephemeral disk",
- parent=azure_ds_reporter
- ):
- missing = util.wait_for_files([devpath],
- maxwait=maxwait,
- naplen=naplen,
- log_pre="Azure ephemeral disk: ")
-
- if missing:
- report_diagnostic_event(
- "ephemeral device '%s' did not appear after %d seconds." %
- (devpath, maxwait),
- logger_func=LOG.warning)
- return
+def address_ephemeral_resize(
+ devpath=RESOURCE_DISK_PATH, is_new_instance=False, preserve_ntfs=False
+):
+ if not os.path.exists(devpath):
+ report_diagnostic_event(
+ "Ephemeral resource disk '%s' does not exist." % devpath,
+ logger_func=LOG.debug,
+ )
+ return
+ else:
+ report_diagnostic_event(
+ "Ephemeral resource disk '%s' exists." % devpath,
+ logger_func=LOG.debug,
+ )
result = False
msg = None
@@ -1483,94 +1800,32 @@ def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120,
if not result:
return
- for mod in ['disk_setup', 'mounts']:
- sempath = '/var/lib/cloud/instance/sem/config_' + mod
+ for mod in ["disk_setup", "mounts"]:
+ sempath = "/var/lib/cloud/instance/sem/config_" + mod
bmsg = 'Marker "%s" for module "%s"' % (sempath, mod)
if os.path.exists(sempath):
try:
os.unlink(sempath)
- LOG.debug('%s removed.', bmsg)
+ LOG.debug("%s removed.", bmsg)
except Exception as e:
# python3 throws FileNotFoundError, python2 throws OSError
- LOG.warning('%s: remove failed! (%s)', bmsg, e)
+ LOG.warning("%s: remove failed! (%s)", bmsg, e)
else:
- LOG.debug('%s did not exist.', bmsg)
+ LOG.debug("%s did not exist.", bmsg)
return
@azure_ds_telemetry_reporter
-def perform_hostname_bounce(hostname, cfg, prev_hostname):
- # set the hostname to 'hostname' if it is not already set to that.
- # then, if policy is not off, bounce the interface using command
- # Returns True if the network was bounced, False otherwise.
- command = cfg['command']
- interface = cfg['interface']
- policy = cfg['policy']
-
- msg = ("hostname=%s policy=%s interface=%s" %
- (hostname, policy, interface))
- env = os.environ.copy()
- env['interface'] = interface
- env['hostname'] = hostname
- env['old_hostname'] = prev_hostname
-
- if command == "builtin":
- if util.is_FreeBSD():
- command = BOUNCE_COMMAND_FREEBSD
- elif subp.which('ifup'):
- command = BOUNCE_COMMAND_IFUP
- else:
- LOG.debug(
- "Skipping network bounce: ifupdown utils aren't present.")
- # Don't bounce as networkd handles hostname DDNS updates
- return False
- LOG.debug("pubhname: publishing hostname [%s]", msg)
- shell = not isinstance(command, (list, tuple))
- # capture=False, see comments in bug 1202758 and bug 1206164.
- util.log_time(logfunc=LOG.debug, msg="publishing hostname",
- get_uptime=True, func=subp.subp,
- kwargs={'args': command, 'shell': shell, 'capture': False,
- 'env': env})
- return True
-
-
-@azure_ds_telemetry_reporter
-def crtfile_to_pubkey(fname, data=None):
- pipeline = ('openssl x509 -noout -pubkey < "$0" |'
- 'ssh-keygen -i -m PKCS8 -f /dev/stdin')
- (out, _err) = subp.subp(['sh', '-c', pipeline, fname],
- capture=True, data=data)
- return out.rstrip()
-
-
-@azure_ds_telemetry_reporter
-def pubkeys_from_crt_files(flist):
- pubkeys = []
- errors = []
- for fname in flist:
- try:
- pubkeys.append(crtfile_to_pubkey(fname))
- except subp.ProcessExecutionError:
- errors.append(fname)
-
- if errors:
- report_diagnostic_event(
- "failed to convert the crt files to pubkey: %s" % errors,
- logger_func=LOG.warning)
-
- return pubkeys
-
-
-@azure_ds_telemetry_reporter
def write_files(datadir, files, dirmode=None):
-
def _redact_password(cnt, fname):
"""Azure provides the UserPassword in plain text. So we redact it"""
try:
root = ET.fromstring(cnt)
for elem in root.iter():
- if ('UserPassword' in elem.tag and
- elem.text != DEF_PASSWD_REDACTION):
+ if (
+ "UserPassword" in elem.tag
+ and elem.text != DEF_PASSWD_REDACTION
+ ):
elem.text = DEF_PASSWD_REDACTION
return ET.tostring(root)
except Exception:
@@ -1584,21 +1839,11 @@ def write_files(datadir, files, dirmode=None):
util.ensure_dir(datadir, dirmode)
for (name, content) in files.items():
fname = os.path.join(datadir, name)
- if 'ovf-env.xml' in name:
+ if "ovf-env.xml" in name:
content = _redact_password(content, fname)
util.write_file(filename=fname, content=content, mode=0o600)
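
A runnable sketch of the redaction applied to ovf-env.xml above, using the same ElementTree walk; the sample document is illustrative and the redaction string assumes this file's DEF_PASSWD_REDACTION value.

import xml.etree.ElementTree as ET

DEF_PASSWD_REDACTION = "REDACTED"

def redact_password(cnt):
    root = ET.fromstring(cnt)
    for elem in root.iter():
        if "UserPassword" in elem.tag and elem.text != DEF_PASSWD_REDACTION:
            elem.text = DEF_PASSWD_REDACTION
    return ET.tostring(root)

xml = b"<Provisioning><UserPassword>hunter2</UserPassword></Provisioning>"
print(redact_password(xml))
# b'<Provisioning><UserPassword>REDACTED</UserPassword></Provisioning>'
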
-@azure_ds_telemetry_reporter
-def invoke_agent(cmd):
- # this is a function itself to simplify patching it for test
- if cmd:
- LOG.debug("invoking agent: %s", cmd)
- subp.subp(cmd, shell=(not isinstance(cmd, list)))
- else:
- LOG.debug("not invoking agent")
-
-
def find_child(node, filter_func):
ret = []
if not node.hasChildNodes():
@@ -1626,8 +1871,9 @@ def load_azure_ovf_pubkeys(sshnode):
if len(results) == 0:
return []
if len(results) > 1:
- raise BrokenAzureDataSource("Multiple 'PublicKeys'(%s) in SSH node" %
- len(results))
+ raise BrokenAzureDataSource(
+ "Multiple 'PublicKeys'(%s) in SSH node" % len(results)
+ )
pubkeys_node = results[0]
pubkeys = find_child(pubkeys_node, lambda n: n.localName == "PublicKey")
@@ -1642,7 +1888,7 @@ def load_azure_ovf_pubkeys(sshnode):
if not pk_node.hasChildNodes():
continue
- cur = {'fingerprint': "", 'path': "", 'value': ""}
+ cur = {"fingerprint": "", "path": "", "value": ""}
for child in pk_node.childNodes:
if child.nodeType == text_node or not child.localName:
continue
@@ -1652,8 +1898,10 @@ def load_azure_ovf_pubkeys(sshnode):
if name not in cur.keys():
continue
- if (len(child.childNodes) != 1 or
- child.childNodes[0].nodeType != text_node):
+ if (
+ len(child.childNodes) != 1
+ or child.childNodes[0].nodeType != text_node
+ ):
continue
cur[name] = child.childNodes[0].wholeText.strip()
@@ -1671,33 +1919,37 @@ def read_azure_ovf(contents):
report_diagnostic_event(error_str, logger_func=LOG.warning)
raise BrokenAzureDataSource(error_str) from e
- results = find_child(dom.documentElement,
- lambda n: n.localName == "ProvisioningSection")
+ results = find_child(
+ dom.documentElement, lambda n: n.localName == "ProvisioningSection"
+ )
if len(results) == 0:
raise NonAzureDataSource("No ProvisioningSection")
if len(results) > 1:
- raise BrokenAzureDataSource("found '%d' ProvisioningSection items" %
- len(results))
+ raise BrokenAzureDataSource(
+ "found '%d' ProvisioningSection items" % len(results)
+ )
provSection = results[0]
- lpcs_nodes = find_child(provSection,
- lambda n:
- n.localName == "LinuxProvisioningConfigurationSet")
+ lpcs_nodes = find_child(
+ provSection,
+ lambda n: n.localName == "LinuxProvisioningConfigurationSet",
+ )
if len(lpcs_nodes) == 0:
raise NonAzureDataSource("No LinuxProvisioningConfigurationSet")
if len(lpcs_nodes) > 1:
- raise BrokenAzureDataSource("found '%d' %ss" %
- (len(lpcs_nodes),
- "LinuxProvisioningConfigurationSet"))
+ raise BrokenAzureDataSource(
+ "found '%d' %ss"
+ % (len(lpcs_nodes), "LinuxProvisioningConfigurationSet")
+ )
lpcs = lpcs_nodes[0]
if not lpcs.hasChildNodes():
raise BrokenAzureDataSource("no child nodes of configuration set")
- md_props = 'seedfrom'
- md = {'azure_data': {}}
+ md_props = "seedfrom"
+ md: Dict[str, Any] = {"azure_data": {}}
cfg = {}
ud = ""
password = None
@@ -1711,8 +1963,10 @@ def read_azure_ovf(contents):
simple = False
value = ""
- if (len(child.childNodes) == 1 and
- child.childNodes[0].nodeType == dom.TEXT_NODE):
+ if (
+ len(child.childNodes) == 1
+ and child.childNodes[0].nodeType == dom.TEXT_NODE
+ ):
simple = True
value = child.childNodes[0].wholeText
@@ -1721,8 +1975,8 @@ def read_azure_ovf(contents):
# we accept either UserData or CustomData. If both are present
# then behavior is undefined.
if name == "userdata" or name == "customdata":
- if attrs.get('encoding') in (None, "base64"):
- ud = base64.b64decode(''.join(value.split()))
+ if attrs.get("encoding") in (None, "base64"):
+ ud = base64.b64decode("".join(value.split()))
else:
ud = value
elif name == "username":
@@ -1730,36 +1984,36 @@ def read_azure_ovf(contents):
elif name == "userpassword":
password = value
elif name == "hostname":
- md['local-hostname'] = value
+ md["local-hostname"] = value
elif name == "dscfg":
- if attrs.get('encoding') in (None, "base64"):
- dscfg = base64.b64decode(''.join(value.split()))
+ if attrs.get("encoding") in (None, "base64"):
+ dscfg = base64.b64decode("".join(value.split()))
else:
dscfg = value
- cfg['datasource'] = {DS_NAME: util.load_yaml(dscfg, default={})}
+ cfg["datasource"] = {DS_NAME: util.load_yaml(dscfg, default={})}
elif name == "ssh":
- cfg['_pubkeys'] = load_azure_ovf_pubkeys(child)
+ cfg["_pubkeys"] = load_azure_ovf_pubkeys(child)
elif name == "disablesshpasswordauthentication":
- cfg['ssh_pwauth'] = util.is_false(value)
+ cfg["ssh_pwauth"] = util.is_false(value)
elif simple:
if name in md_props:
md[name] = value
else:
- md['azure_data'][name] = value
+ md["azure_data"][name] = value
defuser = {}
if username:
- defuser['name'] = username
+ defuser["name"] = username
if password:
- defuser['lock_passwd'] = False
+ defuser["lock_passwd"] = False
if DEF_PASSWD_REDACTION != password:
- defuser['passwd'] = cfg['password'] = encrypt_pass(password)
+ defuser["passwd"] = cfg["password"] = encrypt_pass(password)
if defuser:
- cfg['system_info'] = {'default_user': defuser}
+ cfg["system_info"] = {"default_user": defuser}
- if 'ssh_pwauth' not in cfg and password:
- cfg['ssh_pwauth'] = True
+ if "ssh_pwauth" not in cfg and password:
+ cfg["ssh_pwauth"] = True
preprovisioning_cfg = _get_preprovisioning_cfgs(dom)
cfg = util.mergemanydict([cfg, preprovisioning_cfg])
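
For orientation, a minimal sketch of the ovf-env.xml shape that read_azure_ovf consumes; real Azure documents carry XML namespaces, which is why the parser above matches on localName. The snippet is illustrative, not a verbatim platform payload.

import xml.etree.ElementTree as ET

ovf = """<Environment>
  <ProvisioningSection>
    <LinuxProvisioningConfigurationSet>
      <HostName>example-vm</HostName>
      <UserName>azureuser</UserName>
      <DisableSshPasswordAuthentication>true</DisableSshPasswordAuthentication>
    </LinuxProvisioningConfigurationSet>
  </ProvisioningSection>
</Environment>"""

root = ET.fromstring(ovf)
print({e.tag: e.text for e in root.iter() if e.text and e.text.strip()})
# {'HostName': 'example-vm', 'UserName': 'azureuser',
#  'DisableSshPasswordAuthentication': 'true'}
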
@@ -1785,20 +2039,18 @@ def _get_preprovisioning_cfgs(dom):
More specifically, this will never happen:
- PreprovisionedVm=True and PreprovisionedVMType=Savable
"""
- cfg = {
- "PreprovisionedVm": False,
- "PreprovisionedVMType": None
- }
+ cfg = {"PreprovisionedVm": False, "PreprovisionedVMType": None}
platform_settings_section = find_child(
- dom.documentElement,
- lambda n: n.localName == "PlatformSettingsSection")
+ dom.documentElement, lambda n: n.localName == "PlatformSettingsSection"
+ )
if not platform_settings_section or len(platform_settings_section) == 0:
LOG.debug("PlatformSettingsSection not found")
return cfg
platform_settings = find_child(
platform_settings_section[0],
- lambda n: n.localName == "PlatformSettings")
+ lambda n: n.localName == "PlatformSettings",
+ )
if not platform_settings or len(platform_settings) == 0:
LOG.debug("PlatformSettings not found")
return cfg
@@ -1807,10 +2059,12 @@ def _get_preprovisioning_cfgs(dom):
# platform has removed PreprovisionedVm and only surfaces
# PreprovisionedVMType.
cfg["PreprovisionedVm"] = _get_preprovisionedvm_cfg_value(
- platform_settings)
+ platform_settings
+ )
cfg["PreprovisionedVMType"] = _get_preprovisionedvmtype_cfg_value(
- platform_settings)
+ platform_settings
+ )
return cfg
@@ -1822,16 +2076,18 @@ def _get_preprovisionedvm_cfg_value(platform_settings):
# platform has removed PreprovisionedVm and only surfaces
# PreprovisionedVMType.
preprovisionedVmVal = find_child(
- platform_settings[0],
- lambda n: n.localName == "PreprovisionedVm")
+ platform_settings[0], lambda n: n.localName == "PreprovisionedVm"
+ )
if not preprovisionedVmVal or len(preprovisionedVmVal) == 0:
LOG.debug("PreprovisionedVm not found")
return preprovisionedVm
preprovisionedVm = util.translate_bool(
- preprovisionedVmVal[0].firstChild.nodeValue)
+ preprovisionedVmVal[0].firstChild.nodeValue
+ )
report_diagnostic_event(
- "PreprovisionedVm: %s" % preprovisionedVm, logger_func=LOG.info)
+ "PreprovisionedVm: %s" % preprovisionedVm, logger_func=LOG.info
+ )
return preprovisionedVm
@@ -1850,18 +2106,21 @@ def _get_preprovisionedvmtype_cfg_value(platform_settings):
# Once assigned to customer, the customer-requested nics are
# hot-attached to it and reprovision happens like today.
preprovisionedVMTypeVal = find_child(
- platform_settings[0],
- lambda n: n.localName == "PreprovisionedVMType")
- if (not preprovisionedVMTypeVal or len(preprovisionedVMTypeVal) == 0 or
- preprovisionedVMTypeVal[0].firstChild is None):
+ platform_settings[0], lambda n: n.localName == "PreprovisionedVMType"
+ )
+ if (
+ not preprovisionedVMTypeVal
+ or len(preprovisionedVMTypeVal) == 0
+ or preprovisionedVMTypeVal[0].firstChild is None
+ ):
LOG.debug("PreprovisionedVMType not found")
return preprovisionedVMType
preprovisionedVMType = preprovisionedVMTypeVal[0].firstChild.nodeValue
report_diagnostic_event(
- "PreprovisionedVMType: %s" % preprovisionedVMType,
- logger_func=LOG.info)
+ "PreprovisionedVMType: %s" % preprovisionedVMType, logger_func=LOG.info
+ )
return preprovisionedVMType
@@ -1885,7 +2144,7 @@ def _check_freebsd_cdrom(cdrom_dev):
@azure_ds_telemetry_reporter
def _get_random_seed(source=PLATFORM_ENTROPY_SOURCE):
"""Return content random seed file if available, otherwise,
- return None."""
+ return None."""
# azure / hyper-v provides random data here
# now update ds_cfg to reflect contents pass in config
if source is None:
@@ -1901,24 +2160,22 @@ def _get_random_seed(source=PLATFORM_ENTROPY_SOURCE):
# string. Same number of bits of entropy, just with 25% more zeroes.
# There's no need to undo this base64-encoding when the random seed is
# actually used in cc_seed_random.py.
- seed = base64.b64encode(seed).decode()
-
- return seed
+ return base64.b64encode(seed).decode() # type: ignore
@azure_ds_telemetry_reporter
-def list_possible_azure_ds_devs():
- devlist = []
+def list_possible_azure_ds(seed, cache_dir):
+ yield seed
+ yield DEFAULT_PROVISIONING_ISO_DEV
if util.is_FreeBSD():
cdrom_dev = "/dev/cd0"
if _check_freebsd_cdrom(cdrom_dev):
- return [cdrom_dev]
+ yield cdrom_dev
else:
for fstype in ("iso9660", "udf"):
- devlist.extend(util.find_devs_with("TYPE=%s" % fstype))
-
- devlist.sort(reverse=True)
- return devlist
+ yield from util.find_devs_with("TYPE=%s" % fstype)
+ if cache_dir:
+ yield cache_dir
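
A sketch of the candidate ordering that generator yields: the explicit seed first, then the provisioning ISO device, then any detected iso9660/udf devices (or the FreeBSD cdrom), and finally the cache dir. The device paths below are stand-ins, not guaranteed values.

def candidate_sources(seed, cache_dir, detected=("/dev/sr1",)):
    yield seed
    yield "/dev/sr0"                   # stand-in for DEFAULT_PROVISIONING_ISO_DEV
    yield from detected                # stand-in for util.find_devs_with(...)
    if cache_dir:
        yield cache_dir

print(list(candidate_sources("/var/lib/waagent", "/var/lib/cloud/instance")))
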
@azure_ds_telemetry_reporter
@@ -1932,7 +2189,7 @@ def load_azure_ds_dir(source_dir):
contents = fp.read()
md, ud, cfg = read_azure_ovf(contents)
- return (md, ud, cfg, {'ovf-env.xml': contents})
+ return (md, ud, cfg, {"ovf-env.xml": contents})
@azure_ds_telemetry_reporter
@@ -1949,12 +2206,14 @@ def parse_network_config(imds_metadata) -> dict:
return _generate_network_config_from_imds_metadata(imds_metadata)
except Exception as e:
LOG.error(
- 'Failed generating network config '
- 'from IMDS network metadata: %s', str(e))
+ "Failed generating network config "
+ "from IMDS network metadata: %s",
+ str(e),
+ )
try:
return _generate_network_config_from_fallback_config()
except Exception as e:
- LOG.error('Failed generating fallback network config: %s', str(e))
+ LOG.error("Failed generating fallback network config: %s", str(e))
return {}
@@ -1966,51 +2225,69 @@ def _generate_network_config_from_imds_metadata(imds_metadata) -> dict:
@param: imds_metadata: Dict of content read from IMDS network service.
@return: Dictionary containing network version 2 standard configuration.
"""
- netconfig = {'version': 2, 'ethernets': {}}
- network_metadata = imds_metadata['network']
- for idx, intf in enumerate(network_metadata['interface']):
+ netconfig: Dict[str, Any] = {"version": 2, "ethernets": {}}
+ network_metadata = imds_metadata["network"]
+ for idx, intf in enumerate(network_metadata["interface"]):
+ has_ip_address = False
# First IPv4 and/or IPv6 address will be obtained via DHCP.
# Any additional IPs of each type will be set as static
# addresses.
- nicname = 'eth{idx}'.format(idx=idx)
- dhcp_override = {'route-metric': (idx + 1) * 100}
- dev_config = {'dhcp4': True, 'dhcp4-overrides': dhcp_override,
- 'dhcp6': False}
- for addr_type in ('ipv4', 'ipv6'):
- addresses = intf.get(addr_type, {}).get('ipAddress', [])
- if addr_type == 'ipv4':
- default_prefix = '24'
+ nicname = "eth{idx}".format(idx=idx)
+ dhcp_override = {"route-metric": (idx + 1) * 100}
+ dev_config: Dict[str, Any] = {
+ "dhcp4": True,
+ "dhcp4-overrides": dhcp_override,
+ "dhcp6": False,
+ }
+ for addr_type in ("ipv4", "ipv6"):
+ addresses = intf.get(addr_type, {}).get("ipAddress", [])
+ # If there are no available IP addresses, then we don't
+ # want to add this interface to the generated config.
+ if not addresses:
+ LOG.debug("No %s addresses found for: %r", addr_type, intf)
+ continue
+ has_ip_address = True
+ if addr_type == "ipv4":
+ default_prefix = "24"
else:
- default_prefix = '128'
+ default_prefix = "128"
if addresses:
- dev_config['dhcp6'] = True
+ dev_config["dhcp6"] = True
# non-primary interfaces should have a higher
# route-metric (cost) so default routes prefer
# primary nic due to lower route-metric value
- dev_config['dhcp6-overrides'] = dhcp_override
+ dev_config["dhcp6-overrides"] = dhcp_override
for addr in addresses[1:]:
# Append static address config for ip > 1
- netPrefix = intf[addr_type]['subnet'][0].get(
- 'prefix', default_prefix)
- privateIp = addr['privateIpAddress']
- if not dev_config.get('addresses'):
- dev_config['addresses'] = []
- dev_config['addresses'].append(
- '{ip}/{prefix}'.format(
- ip=privateIp, prefix=netPrefix))
- if dev_config:
- mac = ':'.join(re.findall(r'..', intf['macAddress']))
- dev_config.update({
- 'match': {'macaddress': mac.lower()},
- 'set-name': nicname
- })
+ netPrefix = intf[addr_type]["subnet"][0].get(
+ "prefix", default_prefix
+ )
+ privateIp = addr["privateIpAddress"]
+ if not dev_config.get("addresses"):
+ dev_config["addresses"] = []
+ dev_config["addresses"].append(
+ "{ip}/{prefix}".format(ip=privateIp, prefix=netPrefix)
+ )
+ if dev_config and has_ip_address:
+ mac = normalize_mac_address(intf["macAddress"])
+ dev_config.update(
+ {"match": {"macaddress": mac.lower()}, "set-name": nicname}
+ )
# With netvsc, we can get two interfaces that
# share the same MAC, so we need to make sure
# our match condition also contains the driver
driver = device_driver(nicname)
- if driver and driver == 'hv_netvsc':
- dev_config['match']['driver'] = driver
- netconfig['ethernets'][nicname] = dev_config
+ if driver and driver == "hv_netvsc":
+ dev_config["match"]["driver"] = driver
+ netconfig["ethernets"][nicname] = dev_config
+ continue
+
+ LOG.debug(
+ "No configuration for: %s (dev_config=%r) (has_ip_address=%r)",
+ nicname,
+ dev_config,
+ has_ip_address,
+ )
return netconfig
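
A worked example of the translation above, assuming a single-NIC IMDS payload with one IPv4 address (so the stanza stays DHCP-only): the interface is matched by its normalized MAC and gets a route-metric scaled by its index. The sample input is illustrative, not real IMDS output.

imds = {
    "network": {
        "interface": [
            {
                "macAddress": "000D3A123456",
                "ipv4": {
                    "ipAddress": [{"privateIpAddress": "10.0.0.4"}],
                    "subnet": [{"prefix": "24"}],
                },
            }
        ]
    }
}

netconfig = {"version": 2, "ethernets": {}}
for idx, intf in enumerate(imds["network"]["interface"]):
    mac = ":".join(intf["macAddress"][i:i + 2] for i in range(0, 12, 2)).lower()
    netconfig["ethernets"]["eth%d" % idx] = {
        "dhcp4": True,
        "dhcp4-overrides": {"route-metric": (idx + 1) * 100},
        "dhcp6": False,
        "match": {"macaddress": mac},    # 00:0d:3a:12:34:56
        "set-name": "eth%d" % idx,
    }
print(netconfig)
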
@@ -2020,72 +2297,101 @@ def _generate_network_config_from_fallback_config() -> dict:
@return: Dictionary containing network version 2 standard configuration.
"""
- return net.generate_fallback_config(
- blacklist_drivers=BLACKLIST_DRIVERS, config_driver=True)
+ cfg = net.generate_fallback_config(
+ blacklist_drivers=BLACKLIST_DRIVERS, config_driver=True
+ )
+ if cfg is None:
+ return {}
+ return cfg
@azure_ds_telemetry_reporter
-def get_metadata_from_imds(fallback_nic,
- retries,
- md_type=metadata_type.compute):
+def get_metadata_from_imds(
+ retries,
+ md_type=MetadataType.ALL,
+ api_version=IMDS_VER_MIN,
+ exc_cb=retry_on_url_exc,
+ infinite=False,
+):
"""Query Azure's instance metadata service, returning a dictionary.
- If network is not up, setup ephemeral dhcp on fallback_nic to talk to the
- IMDS. For more info on IMDS:
+ For more info on IMDS:
https://docs.microsoft.com/en-us/azure/virtual-machines/windows/instance-metadata-service
- @param fallback_nic: String. The name of the nic which requires active
- network in order to query IMDS.
@param retries: The number of retries of the IMDS_URL.
+ @param md_type: Metadata type for IMDS request.
+ @param api_version: IMDS api-version to use in the request.
@return: A dict of instance metadata containing compute and network
info.
"""
- kwargs = {'logfunc': LOG.debug,
- 'msg': 'Crawl of Azure Instance Metadata Service (IMDS)',
- 'func': _get_metadata_from_imds, 'args': (retries, md_type,)}
- if net.is_up(fallback_nic):
+ kwargs = {
+ "logfunc": LOG.debug,
+ "msg": "Crawl of Azure Instance Metadata Service (IMDS)",
+ "func": _get_metadata_from_imds,
+ "args": (retries, exc_cb, md_type, api_version, infinite),
+ }
+ try:
return util.log_time(**kwargs)
- else:
- try:
- with EphemeralDHCPv4WithReporting(
- azure_ds_reporter, fallback_nic):
- return util.log_time(**kwargs)
- except Exception as e:
- report_diagnostic_event(
- "exception while getting metadata: %s" % e,
- logger_func=LOG.warning)
- raise
+ except Exception as e:
+ report_diagnostic_event(
+ "exception while getting metadata: %s" % e,
+ logger_func=LOG.warning,
+ )
+ raise
@azure_ds_telemetry_reporter
-def _get_metadata_from_imds(retries, md_type=metadata_type.compute):
-
- url = md_type.value
+def _get_metadata_from_imds(
+ retries,
+ exc_cb,
+ md_type=MetadataType.ALL,
+ api_version=IMDS_VER_MIN,
+ infinite=False,
+):
+ url = "{}?api-version={}".format(md_type.value, api_version)
headers = {"Metadata": "true"}
+
+ # support for extended metadata begins with 2021-03-01
+ if api_version >= IMDS_EXTENDED_VER_MIN and md_type == MetadataType.ALL:
+ url = url + "&extended=true"
+
try:
response = readurl(
- url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers,
- retries=retries, exception_cb=retry_on_url_exc)
+ url,
+ timeout=IMDS_TIMEOUT_IN_SECONDS,
+ headers=headers,
+ retries=retries,
+ exception_cb=exc_cb,
+ infinite=infinite,
+ )
except Exception as e:
- report_diagnostic_event(
- 'Ignoring IMDS instance metadata. '
- 'Get metadata from IMDS failed: %s' % e,
- logger_func=LOG.warning)
- return {}
+ # pylint:disable=no-member
+ if isinstance(e, UrlError) and e.code == 400:
+ raise
+ else:
+ report_diagnostic_event(
+ "Ignoring IMDS instance metadata. "
+ "Get metadata from IMDS failed: %s" % e,
+ logger_func=LOG.warning,
+ )
+ return {}
try:
from json.decoder import JSONDecodeError
+
json_decode_error = JSONDecodeError
except ImportError:
json_decode_error = ValueError
try:
- return util.load_json(str(response))
+ return util.load_json(response.contents)
except json_decode_error as e:
report_diagnostic_event(
- 'Ignoring non-json IMDS instance metadata response: %s. '
- 'Loading non-json IMDS response failed: %s' % (str(response), e),
- logger_func=LOG.warning)
+ "Ignoring non-json IMDS instance metadata response: %s. "
+ "Loading non-json IMDS response failed: %s"
+ % (response.contents, e),
+ logger_func=LOG.warning,
+ )
return {}
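
The error policy above, restated as a runnable sketch: an HTTP 400 (typically an unsupported api-version) propagates so the caller can retry differently, while any other failure is logged and degrades to an empty dict. The fetchers are local stubs.

def get_imds(fetch):
    try:
        return fetch()
    except Exception as e:
        if getattr(e, "code", None) == 400:
            raise                      # bad request: let the caller handle it
        print("Ignoring IMDS instance metadata. Fetch failed:", e)
        return {}

def timing_out():
    raise OSError("timed out")

print(get_imds(lambda: {"compute": {"location": "eastus"}}))  # parsed metadata
print(get_imds(timing_out))                                   # {}
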
@@ -2115,10 +2421,11 @@ def maybe_remove_ubuntu_network_config_scripts(paths=None):
if os.path.exists(path):
if not logged:
LOG.info(
- 'Removing Ubuntu extended network scripts because'
- ' cloud-init updates Azure network configuration on the'
- ' following event: %s.',
- EventType.BOOT)
+ "Removing Ubuntu extended network scripts because"
+ " cloud-init updates Azure network configuration on the"
+ " following events: %s.",
+ [EventType.BOOT.value, EventType.BOOT_LEGACY.value],
+ )
logged = True
if os.path.isdir(path):
util.del_dir(path)
@@ -2131,15 +2438,15 @@ def _is_platform_viable(seed_dir):
with events.ReportEventStack(
name="check-platform-viability",
description="found azure asset tag",
- parent=azure_ds_reporter
+ parent=azure_ds_reporter,
) as evt:
- asset_tag = dmi.read_dmi_data('chassis-asset-tag')
+ asset_tag = dmi.read_dmi_data("chassis-asset-tag")
if asset_tag == AZURE_CHASSIS_ASSET_TAG:
return True
msg = "Non-Azure DMI asset tag '%s' discovered." % asset_tag
evt.description = msg
report_diagnostic_event(msg, logger_func=LOG.debug)
- if os.path.exists(os.path.join(seed_dir, 'ovf-env.xml')):
+ if os.path.exists(os.path.join(seed_dir, "ovf-env.xml")):
return True
return False
@@ -2157,7 +2464,7 @@ DataSourceAzureNet = DataSourceAzure
# Used to match classes to dependencies
datasources = [
- (DataSourceAzure, (sources.DEP_FILESYSTEM, )),
+ (DataSourceAzure, (sources.DEP_FILESYSTEM,)),
]
@@ -2165,4 +2472,5 @@ datasources = [
def get_datasource_list(depends):
return sources.list_from_depends(depends, datasources)
+
# vi: ts=4 expandtab