diff options
author | Douglas Jordan <dojordan@microsoft.com> | 2018-01-24 16:10:08 -0700 |
---|---|---|
committer | Chad Smith <chad.smith@canonical.com> | 2018-01-24 16:10:08 -0700 |
commit | c03bdd3d8ed762cada813c5e95a40b14d2047b57 (patch) | |
tree | 708422bb64a8804f649ad7558d5088f9e11011a4 /cloudinit | |
parent | 30597f28512fafbe25486df5865b628d859486c6 (diff) | |
download | vyos-cloud-init-c03bdd3d8ed762cada813c5e95a40b14d2047b57.tar.gz vyos-cloud-init-c03bdd3d8ed762cada813c5e95a40b14d2047b57.zip |
Azure VM Preprovisioning support.
This change will enable azure vms to report provisioning has completed
twice, first to tell the fabric it has completed then a second time to
enable customer settings. The datasource for the second provisioning is
the Instance Metadata Service (IMDS),and the VM will poll indefinitely for
the new ovf-env.xml from IMDS.
This branch introduces EphemeralDHCPv4 which encapsulates common logic
used by both DataSourceEc2 an DataSourceAzure for temporary DHCP
interactions without side-effects.
LP: #1734991
Diffstat (limited to 'cloudinit')
-rw-r--r-- | cloudinit/net/dhcp.py | 43 | ||||
-rw-r--r-- | cloudinit/net/network_state.py | 12 | ||||
-rw-r--r-- | cloudinit/sources/DataSourceAzure.py | 138 | ||||
-rw-r--r-- | cloudinit/sources/DataSourceEc2.py | 23 | ||||
-rw-r--r-- | cloudinit/sources/helpers/azure.py | 22 | ||||
-rw-r--r-- | cloudinit/temp_utils.py | 11 | ||||
-rw-r--r-- | cloudinit/url_helper.py | 29 |
7 files changed, 232 insertions, 46 deletions
diff --git a/cloudinit/net/dhcp.py b/cloudinit/net/dhcp.py index 875a4609..087c0c03 100644 --- a/cloudinit/net/dhcp.py +++ b/cloudinit/net/dhcp.py @@ -10,7 +10,9 @@ import os import re import signal -from cloudinit.net import find_fallback_nic, get_devicelist +from cloudinit.net import ( + EphemeralIPv4Network, find_fallback_nic, get_devicelist) +from cloudinit.net.network_state import mask_and_ipv4_to_bcast_addr as bcip from cloudinit import temp_utils from cloudinit import util from six import StringIO @@ -29,6 +31,45 @@ class InvalidDHCPLeaseFileError(Exception): pass +class NoDHCPLeaseError(Exception): + """Raised when unable to get a DHCP lease.""" + pass + + +class EphemeralDHCPv4(object): + def __init__(self, iface=None): + self.iface = iface + self._ephipv4 = None + + def __enter__(self): + try: + leases = maybe_perform_dhcp_discovery(self.iface) + except InvalidDHCPLeaseFileError: + raise NoDHCPLeaseError() + if not leases: + raise NoDHCPLeaseError() + lease = leases[-1] + LOG.debug("Received dhcp lease on %s for %s/%s", + lease['interface'], lease['fixed-address'], + lease['subnet-mask']) + nmap = {'interface': 'interface', 'ip': 'fixed-address', + 'prefix_or_mask': 'subnet-mask', + 'broadcast': 'broadcast-address', + 'router': 'routers'} + kwargs = dict([(k, lease.get(v)) for k, v in nmap.items()]) + if not kwargs['broadcast']: + kwargs['broadcast'] = bcip(kwargs['prefix_or_mask'], kwargs['ip']) + ephipv4 = EphemeralIPv4Network(**kwargs) + ephipv4.__enter__() + self._ephipv4 = ephipv4 + return lease + + def __exit__(self, excp_type, excp_value, excp_traceback): + if not self._ephipv4: + return + self._ephipv4.__exit__(excp_type, excp_value, excp_traceback) + + def maybe_perform_dhcp_discovery(nic=None): """Perform dhcp discovery if nic valid and dhclient command exists. diff --git a/cloudinit/net/network_state.py b/cloudinit/net/network_state.py index 31738c73..fe667d88 100644 --- a/cloudinit/net/network_state.py +++ b/cloudinit/net/network_state.py @@ -961,4 +961,16 @@ def mask_to_net_prefix(mask): return ipv4_mask_to_net_prefix(mask) +def mask_and_ipv4_to_bcast_addr(mask, ip): + """Calculate the broadcast address from the subnet mask and ip addr. + + Supports ipv4 only.""" + ip_bin = int(''.join([bin(int(x) + 256)[3:] for x in ip.split('.')]), 2) + mask_dec = ipv4_mask_to_net_prefix(mask) + bcast_bin = ip_bin | (2**(32 - mask_dec) - 1) + bcast_str = '.'.join([str(bcast_bin >> (i << 3) & 0xFF) + for i in range(4)[::-1]]) + return bcast_str + + # vi: ts=4 expandtab diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index d1d09757..4bcbf3a4 100644 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -11,13 +11,16 @@ from functools import partial import os import os.path import re +from time import time from xml.dom import minidom import xml.etree.ElementTree as ET from cloudinit import log as logging from cloudinit import net +from cloudinit.net.dhcp import EphemeralDHCPv4 from cloudinit import sources from cloudinit.sources.helpers.azure import get_metadata_from_fabric +from cloudinit.url_helper import readurl, wait_for_url, UrlError from cloudinit import util LOG = logging.getLogger(__name__) @@ -44,6 +47,9 @@ LEASE_FILE = '/var/lib/dhcp/dhclient.eth0.leases' DEFAULT_FS = 'ext4' # DMI chassis-asset-tag is set static for all azure instances AZURE_CHASSIS_ASSET_TAG = '7783-7084-3265-9085-8269-3286-77' +REPROVISION_MARKER_FILE = "/var/lib/cloud/data/poll_imds" +IMDS_URL = "http://169.254.169.254/metadata/reprovisiondata" +IMDS_RETRIES = 5 def find_storvscid_from_sysctl_pnpinfo(sysctl_out, deviceid): @@ -276,19 +282,20 @@ class DataSourceAzure(sources.DataSource): with temporary_hostname(azure_hostname, self.ds_cfg, hostname_command=hostname_command) \ - as previous_hostname: - if (previous_hostname is not None and + as previous_hn: + if (previous_hn is not None and util.is_true(self.ds_cfg.get('set_hostname'))): cfg = self.ds_cfg['hostname_bounce'] # "Bouncing" the network try: - perform_hostname_bounce(hostname=azure_hostname, - cfg=cfg, - prev_hostname=previous_hostname) + return perform_hostname_bounce(hostname=azure_hostname, + cfg=cfg, + prev_hostname=previous_hn) except Exception as e: LOG.warning("Failed publishing hostname: %s", e) util.logexc(LOG, "handling set_hostname failed") + return False def get_metadata_from_agent(self): temp_hostname = self.metadata.get('local-hostname') @@ -345,15 +352,20 @@ class DataSourceAzure(sources.DataSource): ddir = self.ds_cfg['data_dir'] candidates = [self.seed_dir] + if os.path.isfile(REPROVISION_MARKER_FILE): + candidates.insert(0, "IMDS") candidates.extend(list_possible_azure_ds_devs()) if ddir: candidates.append(ddir) found = None - + reprovision = False for cdev in candidates: try: - if cdev.startswith("/dev/"): + if cdev == "IMDS": + ret = None + reprovision = True + elif cdev.startswith("/dev/"): if util.is_FreeBSD(): ret = util.mount_cb(cdev, load_azure_ds_dir, mtype="udf", sync=False) @@ -370,6 +382,8 @@ class DataSourceAzure(sources.DataSource): LOG.warning("%s was not mountable", cdev) continue + if reprovision or self._should_reprovision(ret): + ret = self._reprovision() (md, self.userdata_raw, cfg, files) = ret self.seed = cdev self.metadata = util.mergemanydict([md, DEFAULT_METADATA]) @@ -428,6 +442,83 @@ class DataSourceAzure(sources.DataSource): LOG.debug("negotiating already done for %s", self.get_instance_id()) + def _poll_imds(self, report_ready=True): + """Poll IMDS for the new provisioning data until we get a valid + response. Then return the returned JSON object.""" + url = IMDS_URL + "?api-version=2017-04-02" + headers = {"Metadata": "true"} + LOG.debug("Start polling IMDS") + + def sleep_cb(response, loop_n): + return 1 + + def exception_cb(msg, exception): + if isinstance(exception, UrlError) and exception.code == 404: + return + LOG.warning("Exception during polling. Will try DHCP.", + exc_info=True) + + # If we get an exception while trying to call IMDS, we + # call DHCP and setup the ephemeral network to acquire the new IP. + raise exception + + need_report = report_ready + for i in range(IMDS_RETRIES): + try: + with EphemeralDHCPv4() as lease: + if need_report: + self._report_ready(lease=lease) + need_report = False + wait_for_url([url], max_wait=None, timeout=60, + status_cb=LOG.info, + headers_cb=lambda url: headers, sleep_time=1, + exception_cb=exception_cb, + sleep_time_cb=sleep_cb) + return str(readurl(url, headers=headers)) + except Exception: + LOG.debug("Exception during polling-retrying dhcp" + + " %d more time(s).", (IMDS_RETRIES - i), + exc_info=True) + + def _report_ready(self, lease): + """Tells the fabric provisioning has completed + before we go into our polling loop.""" + try: + get_metadata_from_fabric(None, lease['unknown-245']) + except Exception as exc: + LOG.warning( + "Error communicating with Azure fabric; You may experience." + "connectivity issues.", exc_info=True) + + def _should_reprovision(self, ret): + """Whether or not we should poll IMDS for reprovisioning data. + Also sets a marker file to poll IMDS. + + The marker file is used for the following scenario: the VM boots into + this polling loop, which we expect to be proceeding infinitely until + the VM is picked. If for whatever reason the platform moves us to a + new host (for instance a hardware issue), we need to keep polling. + However, since the VM reports ready to the Fabric, we will not attach + the ISO, thus cloud-init needs to have a way of knowing that it should + jump back into the polling loop in order to retrieve the ovf_env.""" + if not ret: + return False + (md, self.userdata_raw, cfg, files) = ret + path = REPROVISION_MARKER_FILE + if (cfg.get('PreprovisionedVm') is True or + os.path.isfile(path)): + if not os.path.isfile(path): + LOG.info("Creating a marker file to poll imds") + util.write_file(path, "%s: %s\n" % (os.getpid(), time())) + return True + return False + + def _reprovision(self): + """Initiate the reprovisioning workflow.""" + contents = self._poll_imds() + md, ud, cfg = read_azure_ovf(contents) + return (md, ud, cfg, {'ovf-env.xml': contents}) + def _negotiate(self): """Negotiate with fabric and return data from it. @@ -453,7 +544,7 @@ class DataSourceAzure(sources.DataSource): "Error communicating with Azure fabric; You may experience." "connectivity issues.", exc_info=True) return False - + util.del_file(REPROVISION_MARKER_FILE) return fabric_data def activate(self, cfg, is_new_instance): @@ -595,6 +686,7 @@ def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120, def perform_hostname_bounce(hostname, cfg, prev_hostname): # set the hostname to 'hostname' if it is not already set to that. # then, if policy is not off, bounce the interface using command + # Returns True if the network was bounced, False otherwise. command = cfg['command'] interface = cfg['interface'] policy = cfg['policy'] @@ -614,7 +706,8 @@ def perform_hostname_bounce(hostname, cfg, prev_hostname): else: LOG.debug( "Skipping network bounce: ifupdown utils aren't present.") - return # Don't bounce as networkd handles hostname DDNS updates + # Don't bounce as networkd handles hostname DDNS updates + return False LOG.debug("pubhname: publishing hostname [%s]", msg) shell = not isinstance(command, (list, tuple)) # capture=False, see comments in bug 1202758 and bug 1206164. @@ -622,6 +715,7 @@ def perform_hostname_bounce(hostname, cfg, prev_hostname): get_uptime=True, func=util.subp, kwargs={'args': command, 'shell': shell, 'capture': False, 'env': env}) + return True def crtfile_to_pubkey(fname, data=None): @@ -838,9 +932,35 @@ def read_azure_ovf(contents): if 'ssh_pwauth' not in cfg and password: cfg['ssh_pwauth'] = True + cfg['PreprovisionedVm'] = _extract_preprovisioned_vm_setting(dom) + return (md, ud, cfg) +def _extract_preprovisioned_vm_setting(dom): + """Read the preprovision flag from the ovf. It should not + exist unless true.""" + platform_settings_section = find_child( + dom.documentElement, + lambda n: n.localName == "PlatformSettingsSection") + if not platform_settings_section or len(platform_settings_section) == 0: + LOG.debug("PlatformSettingsSection not found") + return False + platform_settings = find_child( + platform_settings_section[0], + lambda n: n.localName == "PlatformSettings") + if not platform_settings or len(platform_settings) == 0: + LOG.debug("PlatformSettings not found") + return False + preprovisionedVm = find_child( + platform_settings[0], + lambda n: n.localName == "PreprovisionedVm") + if not preprovisionedVm or len(preprovisionedVm) == 0: + LOG.debug("PreprovisionedVm not found") + return False + return util.translate_bool(preprovisionedVm[0].firstChild.nodeValue) + + def encrypt_pass(password, salt_id="$6$"): return crypt.crypt(password, salt_id + util.rand_str(strlen=16)) diff --git a/cloudinit/sources/DataSourceEc2.py b/cloudinit/sources/DataSourceEc2.py index 0f89f34d..e14553b3 100644 --- a/cloudinit/sources/DataSourceEc2.py +++ b/cloudinit/sources/DataSourceEc2.py @@ -14,7 +14,7 @@ import time from cloudinit import ec2_utils as ec2 from cloudinit import log as logging from cloudinit import net -from cloudinit.net import dhcp +from cloudinit.net.dhcp import EphemeralDHCPv4, NoDHCPLeaseError from cloudinit import sources from cloudinit import url_helper as uhelp from cloudinit import util @@ -102,22 +102,13 @@ class DataSourceEc2(sources.DataSource): if util.is_FreeBSD(): LOG.debug("FreeBSD doesn't support running dhclient with -sf") return False - dhcp_leases = dhcp.maybe_perform_dhcp_discovery( - self.fallback_interface) - if not dhcp_leases: - # DataSourceEc2Local failed in init-local stage. DataSourceEc2 - # will still run in init-network stage. + try: + with EphemeralDHCPv4(self.fallback_interface): + return util.log_time( + logfunc=LOG.debug, msg='Crawl of metadata service', + func=self._crawl_metadata) + except NoDHCPLeaseError: return False - dhcp_opts = dhcp_leases[-1] - net_params = {'interface': dhcp_opts.get('interface'), - 'ip': dhcp_opts.get('fixed-address'), - 'prefix_or_mask': dhcp_opts.get('subnet-mask'), - 'broadcast': dhcp_opts.get('broadcast-address'), - 'router': dhcp_opts.get('routers')} - with net.EphemeralIPv4Network(**net_params): - return util.log_time( - logfunc=LOG.debug, msg='Crawl of metadata service', - func=self._crawl_metadata) else: return self._crawl_metadata() diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index 6cda5721..90c12df1 100644 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -199,10 +199,10 @@ class WALinuxAgentShim(object): ' </Container>', '</Health>']) - def __init__(self, fallback_lease_file=None): + def __init__(self, fallback_lease_file=None, dhcp_options=None): LOG.debug('WALinuxAgentShim instantiated, fallback_lease_file=%s', fallback_lease_file) - self.dhcpoptions = None + self.dhcpoptions = dhcp_options self._endpoint = None self.openssl_manager = None self.values = {} @@ -220,7 +220,8 @@ class WALinuxAgentShim(object): @property def endpoint(self): if self._endpoint is None: - self._endpoint = self.find_endpoint(self.lease_file) + self._endpoint = self.find_endpoint(self.lease_file, + self.dhcpoptions) return self._endpoint @staticmethod @@ -292,10 +293,14 @@ class WALinuxAgentShim(object): return _value @staticmethod - def find_endpoint(fallback_lease_file=None): + def find_endpoint(fallback_lease_file=None, dhcp245=None): value = None - LOG.debug('Finding Azure endpoint from networkd...') - value = WALinuxAgentShim._networkd_get_value_from_leases() + if dhcp245 is not None: + value = dhcp245 + LOG.debug("Using Azure Endpoint from dhcp options") + if value is None: + LOG.debug('Finding Azure endpoint from networkd...') + value = WALinuxAgentShim._networkd_get_value_from_leases() if value is None: # Option-245 stored in /run/cloud-init/dhclient.hooks/<ifc>.json # a dhclient exit hook that calls cloud-init-dhclient-hook @@ -367,8 +372,9 @@ class WALinuxAgentShim(object): LOG.info('Reported ready to Azure fabric.') -def get_metadata_from_fabric(fallback_lease_file=None): - shim = WALinuxAgentShim(fallback_lease_file=fallback_lease_file) +def get_metadata_from_fabric(fallback_lease_file=None, dhcp_opts=None): + shim = WALinuxAgentShim(fallback_lease_file=fallback_lease_file, + dhcp_options=dhcp_opts) try: return shim.register_with_azure_and_fetch_data() finally: diff --git a/cloudinit/temp_utils.py b/cloudinit/temp_utils.py index 5d7adf70..c98a1b53 100644 --- a/cloudinit/temp_utils.py +++ b/cloudinit/temp_utils.py @@ -28,13 +28,18 @@ def _tempfile_dir_arg(odir=None, needs_exe=False): if odir is not None: return odir + if needs_exe: + tdir = _EXE_ROOT_TMPDIR + if not os.path.isdir(tdir): + os.makedirs(tdir) + os.chmod(tdir, 0o1777) + return tdir + global _TMPDIR if _TMPDIR: return _TMPDIR - if needs_exe: - tdir = _EXE_ROOT_TMPDIR - elif os.getuid() == 0: + if os.getuid() == 0: tdir = _ROOT_TMPDIR else: tdir = os.environ.get('TMPDIR', '/tmp') diff --git a/cloudinit/url_helper.py b/cloudinit/url_helper.py index 0e0f5b4c..0a5be0b3 100644 --- a/cloudinit/url_helper.py +++ b/cloudinit/url_helper.py @@ -273,7 +273,7 @@ def readurl(url, data=None, timeout=None, retries=0, sec_between=1, def wait_for_url(urls, max_wait=None, timeout=None, status_cb=None, headers_cb=None, sleep_time=1, - exception_cb=None): + exception_cb=None, sleep_time_cb=None): """ urls: a list of urls to try max_wait: roughly the maximum time to wait before giving up @@ -286,6 +286,8 @@ def wait_for_url(urls, max_wait=None, timeout=None, for request. exception_cb: call method with 2 arguments 'msg' (per status_cb) and 'exception', the exception that occurred. + sleep_time_cb: call method with 2 arguments (response, loop_n) that + generates the next sleep time. the idea of this routine is to wait for the EC2 metdata service to come up. On both Eucalyptus and EC2 we have seen the case where @@ -301,6 +303,8 @@ def wait_for_url(urls, max_wait=None, timeout=None, service but is not going to find one. It is possible that the instance data host (169.254.169.254) may be firewalled off Entirely for a sytem, meaning that the connection will block forever unless a timeout is set. + + A value of None for max_wait will retry indefinitely. """ start_time = time.time() @@ -311,18 +315,24 @@ def wait_for_url(urls, max_wait=None, timeout=None, status_cb = log_status_cb def timeup(max_wait, start_time): - return ((max_wait <= 0 or max_wait is None) or - (time.time() - start_time > max_wait)) + if (max_wait is None): + return False + return ((max_wait <= 0) or (time.time() - start_time > max_wait)) loop_n = 0 + response = None while True: - sleep_time = int(loop_n / 5) + 1 + if sleep_time_cb is not None: + sleep_time = sleep_time_cb(response, loop_n) + else: + sleep_time = int(loop_n / 5) + 1 for url in urls: now = time.time() if loop_n != 0: if timeup(max_wait, start_time): break - if timeout and (now + timeout > (start_time + max_wait)): + if (max_wait is not None and + timeout and (now + timeout > (start_time + max_wait))): # shorten timeout to not run way over max_time timeout = int((start_time + max_wait) - now) @@ -354,10 +364,11 @@ def wait_for_url(urls, max_wait=None, timeout=None, url_exc = e time_taken = int(time.time() - start_time) - status_msg = "Calling '%s' failed [%s/%ss]: %s" % (url, - time_taken, - max_wait, - reason) + max_wait_str = "%ss" % max_wait if max_wait else "unlimited" + status_msg = "Calling '%s' failed [%s/%s]: %s" % (url, + time_taken, + max_wait_str, + reason) status_cb(status_msg) if exception_cb: # This can be used to alter the headers that will be sent |