From 34f54360fcc1e0f805002a0b639d0a84eb2cb8ee Mon Sep 17 00:00:00 2001 From: "Jason Zions (MSFT)" Date: Fri, 22 Feb 2019 13:26:31 +0000 Subject: azure: Filter list of ssh keys pulled from fabric The Azure data source is expected to expose a list of ssh keys for the user-to-be-provisioned in the crawled metadata. When configured to use the __builtin__ agent this list is built by the WALinuxAgentShim. The shim retrieves the full set of certificates and public keys exposed to the VM from the wireserver, extracts any ssh keys it can, and returns that list. This fix reduces that list of ssh keys to just the ones whose fingerprints appear in the "administrative user" section of the ovf-env.xml file. The Azure control plane exposes other ssh keys to the VM for other reasons, but those should not be added to the authorized_keys file for the provisioned user. --- cloudinit/sources/helpers/azure.py | 109 ++++++++++++++++++++++++++----------- 1 file changed, 78 insertions(+), 31 deletions(-) (limited to 'cloudinit/sources/helpers/azure.py') diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index e5696b1f..2829dd20 100644 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -138,9 +138,36 @@ class OpenSSLManager(object): self.certificate = certificate LOG.debug('New certificate generated.') - def parse_certificates(self, certificates_xml): - tag = ElementTree.fromstring(certificates_xml).find( - './/Data') + @staticmethod + def _run_x509_action(action, cert): + cmd = ['openssl', 'x509', '-noout', action] + result, _ = util.subp(cmd, data=cert) + return result + + def _get_ssh_key_from_cert(self, certificate): + pub_key = self._run_x509_action('-pubkey', certificate) + keygen_cmd = ['ssh-keygen', '-i', '-m', 'PKCS8', '-f', '/dev/stdin'] + ssh_key, _ = util.subp(keygen_cmd, data=pub_key) + return ssh_key + + def _get_fingerprint_from_cert(self, certificate): + """openssl x509 formats fingerprints as so: + 'SHA1 Fingerprint=07:3E:19:D1:4D:1C:79:92:24:C6:A0:FD:8D:DA:\ + B6:A8:BF:27:D4:73\n' + + Azure control plane passes that fingerprint as so: + '073E19D14D1C799224C6A0FD8DDAB6A8BF27D473' + """ + raw_fp = self._run_x509_action('-fingerprint', certificate) + eq = raw_fp.find('=') + octets = raw_fp[eq+1:-1].split(':') + return ''.join(octets) + + def _decrypt_certs_from_xml(self, certificates_xml): + """Decrypt the certificates XML document using the our private key; + return the list of certs and private keys contained in the doc. 
+ """ + tag = ElementTree.fromstring(certificates_xml).find('.//Data') certificates_content = tag.text lines = [ b'MIME-Version: 1.0', @@ -151,32 +178,30 @@ class OpenSSLManager(object): certificates_content.encode('utf-8'), ] with cd(self.tmpdir): - with open('Certificates.p7m', 'wb') as f: - f.write(b'\n'.join(lines)) out, _ = util.subp( - 'openssl cms -decrypt -in Certificates.p7m -inkey' + 'openssl cms -decrypt -in /dev/stdin -inkey' ' {private_key} -recip {certificate} | openssl pkcs12 -nodes' ' -password pass:'.format(**self.certificate_names), - shell=True) - private_keys, certificates = [], [] + shell=True, data=b'\n'.join(lines)) + return out + + def parse_certificates(self, certificates_xml): + """Given the Certificates XML document, return a dictionary of + fingerprints and associated SSH keys derived from the certs.""" + out = self._decrypt_certs_from_xml(certificates_xml) current = [] + keys = {} for line in out.splitlines(): current.append(line) if re.match(r'[-]+END .*?KEY[-]+$', line): - private_keys.append('\n'.join(current)) + # ignore private_keys current = [] elif re.match(r'[-]+END .*?CERTIFICATE[-]+$', line): - certificates.append('\n'.join(current)) + certificate = '\n'.join(current) + ssh_key = self._get_ssh_key_from_cert(certificate) + fingerprint = self._get_fingerprint_from_cert(certificate) + keys[fingerprint] = ssh_key current = [] - keys = [] - for certificate in certificates: - with cd(self.tmpdir): - public_key, _ = util.subp( - 'openssl x509 -noout -pubkey |' - 'ssh-keygen -i -m PKCS8 -f /dev/stdin', - data=certificate, - shell=True) - keys.append(public_key) return keys @@ -206,7 +231,6 @@ class WALinuxAgentShim(object): self.dhcpoptions = dhcp_options self._endpoint = None self.openssl_manager = None - self.values = {} self.lease_file = fallback_lease_file def clean_up(self): @@ -328,8 +352,9 @@ class WALinuxAgentShim(object): LOG.debug('Azure endpoint found at %s', endpoint_ip_address) return endpoint_ip_address - def register_with_azure_and_fetch_data(self): - self.openssl_manager = OpenSSLManager() + def register_with_azure_and_fetch_data(self, pubkey_info=None): + if self.openssl_manager is None: + self.openssl_manager = OpenSSLManager() http_client = AzureEndpointHttpClient(self.openssl_manager.certificate) LOG.info('Registering with Azure...') attempts = 0 @@ -347,16 +372,37 @@ class WALinuxAgentShim(object): attempts += 1 LOG.debug('Successfully fetched GoalState XML.') goal_state = GoalState(response.contents, http_client) - public_keys = [] - if goal_state.certificates_xml is not None: + ssh_keys = [] + if goal_state.certificates_xml is not None and pubkey_info is not None: LOG.debug('Certificate XML found; parsing out public keys.') - public_keys = self.openssl_manager.parse_certificates( + keys_by_fingerprint = self.openssl_manager.parse_certificates( goal_state.certificates_xml) - data = { - 'public-keys': public_keys, - } + ssh_keys = self._filter_pubkeys(keys_by_fingerprint, pubkey_info) self._report_ready(goal_state, http_client) - return data + return {'public-keys': ssh_keys} + + def _filter_pubkeys(self, keys_by_fingerprint, pubkey_info): + """cloud-init expects a straightforward array of keys to be dropped + into the user's authorized_keys file. Azure control plane exposes + multiple public keys to the VM via wireserver. Select just the + user's key(s) and return them, ignoring any other certs. 
+ """ + keys = [] + for pubkey in pubkey_info: + if 'value' in pubkey and pubkey['value']: + keys.append(pubkey['value']) + elif 'fingerprint' in pubkey and pubkey['fingerprint']: + fingerprint = pubkey['fingerprint'] + if fingerprint in keys_by_fingerprint: + keys.append(keys_by_fingerprint[fingerprint]) + else: + LOG.warning("ovf-env.xml specified PublicKey fingerprint " + "%s not found in goalstate XML", fingerprint) + else: + LOG.warning("ovf-env.xml specified PublicKey with neither " + "value nor fingerprint: %s", pubkey) + + return keys def _report_ready(self, goal_state, http_client): LOG.debug('Reporting ready to Azure fabric.') @@ -373,11 +419,12 @@ class WALinuxAgentShim(object): LOG.info('Reported ready to Azure fabric.') -def get_metadata_from_fabric(fallback_lease_file=None, dhcp_opts=None): +def get_metadata_from_fabric(fallback_lease_file=None, dhcp_opts=None, + pubkey_info=None): shim = WALinuxAgentShim(fallback_lease_file=fallback_lease_file, dhcp_options=dhcp_opts) try: - return shim.register_with_azure_and_fetch_data() + return shim.register_with_azure_and_fetch_data(pubkey_info=pubkey_info) finally: shim.clean_up() -- cgit v1.2.3 From 0d8c88393b51db6454491a379dcc2e691551217a Mon Sep 17 00:00:00 2001 From: Anh Vo Date: Wed, 3 Apr 2019 18:23:18 +0000 Subject: DatasourceAzure: add additional logging for azure datasource Create an Azure logging decorator and use additional ReportEventStack context managers to provide additional logging details. --- cloudinit/sources/DataSourceAzure.py | 231 ++++++++++++++++++++++------------- cloudinit/sources/helpers/azure.py | 31 +++++ 2 files changed, 179 insertions(+), 83 deletions(-) mode change 100644 => 100755 cloudinit/sources/DataSourceAzure.py mode change 100644 => 100755 cloudinit/sources/helpers/azure.py (limited to 'cloudinit/sources/helpers/azure.py') diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py old mode 100644 new mode 100755 index b4e3f061..d4230b3c --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -21,10 +21,14 @@ from cloudinit import net from cloudinit.event import EventType from cloudinit.net.dhcp import EphemeralDHCPv4 from cloudinit import sources -from cloudinit.sources.helpers.azure import get_metadata_from_fabric from cloudinit.sources.helpers import netlink from cloudinit.url_helper import UrlError, readurl, retry_on_url_exc from cloudinit import util +from cloudinit.reporting import events + +from cloudinit.sources.helpers.azure import (azure_ds_reporter, + azure_ds_telemetry_reporter, + get_metadata_from_fabric) LOG = logging.getLogger(__name__) @@ -244,6 +248,7 @@ def set_hostname(hostname, hostname_command='hostname'): util.subp([hostname_command, hostname]) +@azure_ds_telemetry_reporter @contextlib.contextmanager def temporary_hostname(temp_hostname, cfg, hostname_command='hostname'): """ @@ -290,6 +295,7 @@ class DataSourceAzure(sources.DataSource): root = sources.DataSource.__str__(self) return "%s [seed=%s]" % (root, self.seed) + @azure_ds_telemetry_reporter def bounce_network_with_azure_hostname(self): # When using cloud-init to provision, we have to set the hostname from # the metadata and "bounce" the network to force DDNS to update via @@ -315,6 +321,7 @@ class DataSourceAzure(sources.DataSource): util.logexc(LOG, "handling set_hostname failed") return False + @azure_ds_telemetry_reporter def get_metadata_from_agent(self): temp_hostname = self.metadata.get('local-hostname') agent_cmd = self.ds_cfg['agent_command'] @@ 
-344,15 +351,18 @@ class DataSourceAzure(sources.DataSource): LOG.debug("ssh authentication: " "using fingerprint from fabirc") - # wait very long for public SSH keys to arrive - # https://bugs.launchpad.net/cloud-init/+bug/1717611 - missing = util.log_time(logfunc=LOG.debug, - msg="waiting for SSH public key files", - func=util.wait_for_files, - args=(fp_files, 900)) - - if len(missing): - LOG.warning("Did not find files, but going on: %s", missing) + with events.ReportEventStack( + name="waiting-for-ssh-public-key", + description="wait for agents to retrieve ssh keys", + parent=azure_ds_reporter): + # wait very long for public SSH keys to arrive + # https://bugs.launchpad.net/cloud-init/+bug/1717611 + missing = util.log_time(logfunc=LOG.debug, + msg="waiting for SSH public key files", + func=util.wait_for_files, + args=(fp_files, 900)) + if len(missing): + LOG.warning("Did not find files, but going on: %s", missing) metadata = {} metadata['public-keys'] = key_value or pubkeys_from_crt_files(fp_files) @@ -366,6 +376,7 @@ class DataSourceAzure(sources.DataSource): subplatform_type = 'seed-dir' return '%s (%s)' % (subplatform_type, self.seed) + @azure_ds_telemetry_reporter def crawl_metadata(self): """Walk all instance metadata sources returning a dict on success. @@ -467,6 +478,7 @@ class DataSourceAzure(sources.DataSource): super(DataSourceAzure, self).clear_cached_attrs(attr_defaults) self._metadata_imds = sources.UNSET + @azure_ds_telemetry_reporter def _get_data(self): """Crawl and process datasource metadata caching metadata as attrs. @@ -513,6 +525,7 @@ class DataSourceAzure(sources.DataSource): # quickly (local check only) if self.instance_id is still valid return sources.instance_id_matches_system_uuid(self.get_instance_id()) + @azure_ds_telemetry_reporter def setup(self, is_new_instance): if self._negotiated is False: LOG.debug("negotiating for %s (new_instance=%s)", @@ -580,6 +593,7 @@ class DataSourceAzure(sources.DataSource): if nl_sock: nl_sock.close() + @azure_ds_telemetry_reporter def _report_ready(self, lease): """Tells the fabric provisioning has completed """ try: @@ -617,9 +631,14 @@ class DataSourceAzure(sources.DataSource): def _reprovision(self): """Initiate the reprovisioning workflow.""" contents = self._poll_imds() - md, ud, cfg = read_azure_ovf(contents) - return (md, ud, cfg, {'ovf-env.xml': contents}) - + with events.ReportEventStack( + name="reprovisioning-read-azure-ovf", + description="read azure ovf during reprovisioning", + parent=azure_ds_reporter): + md, ud, cfg = read_azure_ovf(contents) + return (md, ud, cfg, {'ovf-env.xml': contents}) + + @azure_ds_telemetry_reporter def _negotiate(self): """Negotiate with fabric and return data from it. @@ -652,6 +671,7 @@ class DataSourceAzure(sources.DataSource): util.del_file(REPROVISION_MARKER_FILE) return fabric_data + @azure_ds_telemetry_reporter def activate(self, cfg, is_new_instance): address_ephemeral_resize(is_new_instance=is_new_instance, preserve_ntfs=self.ds_cfg.get( @@ -690,12 +710,14 @@ def _partitions_on_device(devpath, maxnum=16): return [] +@azure_ds_telemetry_reporter def _has_ntfs_filesystem(devpath): ntfs_devices = util.find_devs_with("TYPE=ntfs", no_cache=True) LOG.debug('ntfs_devices found = %s', ntfs_devices) return os.path.realpath(devpath) in ntfs_devices +@azure_ds_telemetry_reporter def can_dev_be_reformatted(devpath, preserve_ntfs): """Determine if the ephemeral drive at devpath should be reformatted. 
@@ -744,43 +766,59 @@ def can_dev_be_reformatted(devpath, preserve_ntfs): (cand_part, cand_path, devpath)) return False, msg + @azure_ds_telemetry_reporter def count_files(mp): ignored = set(['dataloss_warning_readme.txt']) return len([f for f in os.listdir(mp) if f.lower() not in ignored]) bmsg = ('partition %s (%s) on device %s was ntfs formatted' % (cand_part, cand_path, devpath)) - try: - file_count = util.mount_cb(cand_path, count_files, mtype="ntfs", - update_env_for_mount={'LANG': 'C'}) - except util.MountFailedError as e: - if "unknown filesystem type 'ntfs'" in str(e): - return True, (bmsg + ' but this system cannot mount NTFS,' - ' assuming there are no important files.' - ' Formatting allowed.') - return False, bmsg + ' but mount of %s failed: %s' % (cand_part, e) - - if file_count != 0: - LOG.warning("it looks like you're using NTFS on the ephemeral disk, " - 'to ensure that filesystem does not get wiped, set ' - '%s.%s in config', '.'.join(DS_CFG_PATH), - DS_CFG_KEY_PRESERVE_NTFS) - return False, bmsg + ' but had %d files on it.' % file_count + + with events.ReportEventStack( + name="mount-ntfs-and-count", + description="mount-ntfs-and-count", + parent=azure_ds_reporter) as evt: + try: + file_count = util.mount_cb(cand_path, count_files, mtype="ntfs", + update_env_for_mount={'LANG': 'C'}) + except util.MountFailedError as e: + evt.description = "cannot mount ntfs" + if "unknown filesystem type 'ntfs'" in str(e): + return True, (bmsg + ' but this system cannot mount NTFS,' + ' assuming there are no important files.' + ' Formatting allowed.') + return False, bmsg + ' but mount of %s failed: %s' % (cand_part, e) + + if file_count != 0: + evt.description = "mounted and counted %d files" % file_count + LOG.warning("it looks like you're using NTFS on the ephemeral" + " disk, to ensure that filesystem does not get wiped," + " set %s.%s in config", '.'.join(DS_CFG_PATH), + DS_CFG_KEY_PRESERVE_NTFS) + return False, bmsg + ' but had %d files on it.' % file_count return True, bmsg + ' and had no important files. Safe for reformatting.' +@azure_ds_telemetry_reporter def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120, is_new_instance=False, preserve_ntfs=False): # wait for ephemeral disk to come up naplen = .2 - missing = util.wait_for_files([devpath], maxwait=maxwait, naplen=naplen, - log_pre="Azure ephemeral disk: ") - - if missing: - LOG.warning("ephemeral device '%s' did not appear after %d seconds.", - devpath, maxwait) - return + with events.ReportEventStack( + name="wait-for-ephemeral-disk", + description="wait for ephemeral disk", + parent=azure_ds_reporter): + missing = util.wait_for_files([devpath], + maxwait=maxwait, + naplen=naplen, + log_pre="Azure ephemeral disk: ") + + if missing: + LOG.warning("ephemeral device '%s' did" + " not appear after %d seconds.", + devpath, maxwait) + return result = False msg = None @@ -808,6 +846,7 @@ def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120, return +@azure_ds_telemetry_reporter def perform_hostname_bounce(hostname, cfg, prev_hostname): # set the hostname to 'hostname' if it is not already set to that. 
# then, if policy is not off, bounce the interface using command @@ -843,6 +882,7 @@ def perform_hostname_bounce(hostname, cfg, prev_hostname): return True +@azure_ds_telemetry_reporter def crtfile_to_pubkey(fname, data=None): pipeline = ('openssl x509 -noout -pubkey < "$0" |' 'ssh-keygen -i -m PKCS8 -f /dev/stdin') @@ -851,6 +891,7 @@ def crtfile_to_pubkey(fname, data=None): return out.rstrip() +@azure_ds_telemetry_reporter def pubkeys_from_crt_files(flist): pubkeys = [] errors = [] @@ -866,6 +907,7 @@ def pubkeys_from_crt_files(flist): return pubkeys +@azure_ds_telemetry_reporter def write_files(datadir, files, dirmode=None): def _redact_password(cnt, fname): @@ -893,6 +935,7 @@ def write_files(datadir, files, dirmode=None): util.write_file(filename=fname, content=content, mode=0o600) +@azure_ds_telemetry_reporter def invoke_agent(cmd): # this is a function itself to simplify patching it for test if cmd: @@ -912,6 +955,7 @@ def find_child(node, filter_func): return ret +@azure_ds_telemetry_reporter def load_azure_ovf_pubkeys(sshnode): # This parses a 'SSH' node formatted like below, and returns # an array of dicts. @@ -964,6 +1008,7 @@ def load_azure_ovf_pubkeys(sshnode): return found +@azure_ds_telemetry_reporter def read_azure_ovf(contents): try: dom = minidom.parseString(contents) @@ -1064,6 +1109,7 @@ def read_azure_ovf(contents): return (md, ud, cfg) +@azure_ds_telemetry_reporter def _extract_preprovisioned_vm_setting(dom): """Read the preprovision flag from the ovf. It should not exist unless true.""" @@ -1092,6 +1138,7 @@ def encrypt_pass(password, salt_id="$6$"): return crypt.crypt(password, salt_id + util.rand_str(strlen=16)) +@azure_ds_telemetry_reporter def _check_freebsd_cdrom(cdrom_dev): """Return boolean indicating path to cdrom device has content.""" try: @@ -1103,6 +1150,7 @@ def _check_freebsd_cdrom(cdrom_dev): return False +@azure_ds_telemetry_reporter def _get_random_seed(source=PLATFORM_ENTROPY_SOURCE): """Return content random seed file if available, otherwise, return None.""" @@ -1126,6 +1174,7 @@ def _get_random_seed(source=PLATFORM_ENTROPY_SOURCE): return seed +@azure_ds_telemetry_reporter def list_possible_azure_ds_devs(): devlist = [] if util.is_FreeBSD(): @@ -1140,6 +1189,7 @@ def list_possible_azure_ds_devs(): return devlist +@azure_ds_telemetry_reporter def load_azure_ds_dir(source_dir): ovf_file = os.path.join(source_dir, "ovf-env.xml") @@ -1162,47 +1212,54 @@ def parse_network_config(imds_metadata): @param: imds_metadata: Dict of content read from IMDS network service. @return: Dictionary containing network version 2 standard configuration. 
""" - if imds_metadata != sources.UNSET and imds_metadata: - netconfig = {'version': 2, 'ethernets': {}} - LOG.debug('Azure: generating network configuration from IMDS') - network_metadata = imds_metadata['network'] - for idx, intf in enumerate(network_metadata['interface']): - nicname = 'eth{idx}'.format(idx=idx) - dev_config = {} - for addr4 in intf['ipv4']['ipAddress']: - privateIpv4 = addr4['privateIpAddress'] - if privateIpv4: - if dev_config.get('dhcp4', False): - # Append static address config for nic > 1 - netPrefix = intf['ipv4']['subnet'][0].get( - 'prefix', '24') - if not dev_config.get('addresses'): - dev_config['addresses'] = [] - dev_config['addresses'].append( - '{ip}/{prefix}'.format( - ip=privateIpv4, prefix=netPrefix)) - else: - dev_config['dhcp4'] = True - for addr6 in intf['ipv6']['ipAddress']: - privateIpv6 = addr6['privateIpAddress'] - if privateIpv6: - dev_config['dhcp6'] = True - break - if dev_config: - mac = ':'.join(re.findall(r'..', intf['macAddress'])) - dev_config.update( - {'match': {'macaddress': mac.lower()}, - 'set-name': nicname}) - netconfig['ethernets'][nicname] = dev_config - else: - blacklist = ['mlx4_core'] - LOG.debug('Azure: generating fallback configuration') - # generate a network config, blacklist picking mlx4_core devs - netconfig = net.generate_fallback_config( - blacklist_drivers=blacklist, config_driver=True) - return netconfig + with events.ReportEventStack( + name="parse_network_config", + description="", + parent=azure_ds_reporter) as evt: + if imds_metadata != sources.UNSET and imds_metadata: + netconfig = {'version': 2, 'ethernets': {}} + LOG.debug('Azure: generating network configuration from IMDS') + network_metadata = imds_metadata['network'] + for idx, intf in enumerate(network_metadata['interface']): + nicname = 'eth{idx}'.format(idx=idx) + dev_config = {} + for addr4 in intf['ipv4']['ipAddress']: + privateIpv4 = addr4['privateIpAddress'] + if privateIpv4: + if dev_config.get('dhcp4', False): + # Append static address config for nic > 1 + netPrefix = intf['ipv4']['subnet'][0].get( + 'prefix', '24') + if not dev_config.get('addresses'): + dev_config['addresses'] = [] + dev_config['addresses'].append( + '{ip}/{prefix}'.format( + ip=privateIpv4, prefix=netPrefix)) + else: + dev_config['dhcp4'] = True + for addr6 in intf['ipv6']['ipAddress']: + privateIpv6 = addr6['privateIpAddress'] + if privateIpv6: + dev_config['dhcp6'] = True + break + if dev_config: + mac = ':'.join(re.findall(r'..', intf['macAddress'])) + dev_config.update( + {'match': {'macaddress': mac.lower()}, + 'set-name': nicname}) + netconfig['ethernets'][nicname] = dev_config + evt.description = "network config from imds" + else: + blacklist = ['mlx4_core'] + LOG.debug('Azure: generating fallback configuration') + # generate a network config, blacklist picking mlx4_core devs + netconfig = net.generate_fallback_config( + blacklist_drivers=blacklist, config_driver=True) + evt.description = "network config from fallback" + return netconfig +@azure_ds_telemetry_reporter def get_metadata_from_imds(fallback_nic, retries): """Query Azure's network metadata service, returning a dictionary. 
@@ -1227,6 +1284,7 @@ def get_metadata_from_imds(fallback_nic, retries): return util.log_time(**kwargs) +@azure_ds_telemetry_reporter def _get_metadata_from_imds(retries): url = IMDS_URL + "instance?api-version=2017-12-01" @@ -1246,6 +1304,7 @@ def _get_metadata_from_imds(retries): return {} +@azure_ds_telemetry_reporter def maybe_remove_ubuntu_network_config_scripts(paths=None): """Remove Azure-specific ubuntu network config for non-primary nics. @@ -1283,14 +1342,20 @@ def maybe_remove_ubuntu_network_config_scripts(paths=None): def _is_platform_viable(seed_dir): - """Check platform environment to report if this datasource may run.""" - asset_tag = util.read_dmi_data('chassis-asset-tag') - if asset_tag == AZURE_CHASSIS_ASSET_TAG: - return True - LOG.debug("Non-Azure DMI asset tag '%s' discovered.", asset_tag) - if os.path.exists(os.path.join(seed_dir, 'ovf-env.xml')): - return True - return False + with events.ReportEventStack( + name="check-platform-viability", + description="found azure asset tag", + parent=azure_ds_reporter) as evt: + + """Check platform environment to report if this datasource may run.""" + asset_tag = util.read_dmi_data('chassis-asset-tag') + if asset_tag == AZURE_CHASSIS_ASSET_TAG: + return True + LOG.debug("Non-Azure DMI asset tag '%s' discovered.", asset_tag) + evt.description = "Non-Azure DMI asset tag '%s' discovered.", asset_tag + if os.path.exists(os.path.join(seed_dir, 'ovf-env.xml')): + return True + return False class BrokenAzureDataSource(Exception): diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py old mode 100644 new mode 100755 index 2829dd20..d3af05ee --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -16,10 +16,27 @@ from xml.etree import ElementTree from cloudinit import url_helper from cloudinit import util +from cloudinit.reporting import events LOG = logging.getLogger(__name__) +azure_ds_reporter = events.ReportEventStack( + name="azure-ds", + description="initialize reporter for azure ds", + reporting_enabled=True) + + +def azure_ds_telemetry_reporter(func): + def impl(*args, **kwargs): + with events.ReportEventStack( + name=func.__name__, + description=func.__name__, + parent=azure_ds_reporter): + return func(*args, **kwargs) + return impl + + @contextmanager def cd(newdir): prevdir = os.getcwd() @@ -119,6 +136,7 @@ class OpenSSLManager(object): def clean_up(self): util.del_dir(self.tmpdir) + @azure_ds_telemetry_reporter def generate_certificate(self): LOG.debug('Generating certificate for communication with fabric...') if self.certificate is not None: @@ -139,17 +157,20 @@ class OpenSSLManager(object): LOG.debug('New certificate generated.') @staticmethod + @azure_ds_telemetry_reporter def _run_x509_action(action, cert): cmd = ['openssl', 'x509', '-noout', action] result, _ = util.subp(cmd, data=cert) return result + @azure_ds_telemetry_reporter def _get_ssh_key_from_cert(self, certificate): pub_key = self._run_x509_action('-pubkey', certificate) keygen_cmd = ['ssh-keygen', '-i', '-m', 'PKCS8', '-f', '/dev/stdin'] ssh_key, _ = util.subp(keygen_cmd, data=pub_key) return ssh_key + @azure_ds_telemetry_reporter def _get_fingerprint_from_cert(self, certificate): """openssl x509 formats fingerprints as so: 'SHA1 Fingerprint=07:3E:19:D1:4D:1C:79:92:24:C6:A0:FD:8D:DA:\ @@ -163,6 +184,7 @@ class OpenSSLManager(object): octets = raw_fp[eq+1:-1].split(':') return ''.join(octets) + @azure_ds_telemetry_reporter def _decrypt_certs_from_xml(self, certificates_xml): """Decrypt the 
certificates XML document using the our private key; return the list of certs and private keys contained in the doc. @@ -185,6 +207,7 @@ class OpenSSLManager(object): shell=True, data=b'\n'.join(lines)) return out + @azure_ds_telemetry_reporter def parse_certificates(self, certificates_xml): """Given the Certificates XML document, return a dictionary of fingerprints and associated SSH keys derived from the certs.""" @@ -265,11 +288,13 @@ class WALinuxAgentShim(object): return socket.inet_ntoa(packed_bytes) @staticmethod + @azure_ds_telemetry_reporter def _networkd_get_value_from_leases(leases_d=None): return dhcp.networkd_get_option_from_leases( 'OPTION_245', leases_d=leases_d) @staticmethod + @azure_ds_telemetry_reporter def _get_value_from_leases_file(fallback_lease_file): leases = [] content = util.load_file(fallback_lease_file) @@ -287,6 +312,7 @@ class WALinuxAgentShim(object): return leases[-1] @staticmethod + @azure_ds_telemetry_reporter def _load_dhclient_json(): dhcp_options = {} hooks_dir = WALinuxAgentShim._get_hooks_dir() @@ -305,6 +331,7 @@ class WALinuxAgentShim(object): return dhcp_options @staticmethod + @azure_ds_telemetry_reporter def _get_value_from_dhcpoptions(dhcp_options): if dhcp_options is None: return None @@ -318,6 +345,7 @@ class WALinuxAgentShim(object): return _value @staticmethod + @azure_ds_telemetry_reporter def find_endpoint(fallback_lease_file=None, dhcp245=None): value = None if dhcp245 is not None: @@ -352,6 +380,7 @@ class WALinuxAgentShim(object): LOG.debug('Azure endpoint found at %s', endpoint_ip_address) return endpoint_ip_address + @azure_ds_telemetry_reporter def register_with_azure_and_fetch_data(self, pubkey_info=None): if self.openssl_manager is None: self.openssl_manager = OpenSSLManager() @@ -404,6 +433,7 @@ class WALinuxAgentShim(object): return keys + @azure_ds_telemetry_reporter def _report_ready(self, goal_state, http_client): LOG.debug('Reporting ready to Azure fabric.') document = self.REPORT_READY_XML_TEMPLATE.format( @@ -419,6 +449,7 @@ class WALinuxAgentShim(object): LOG.info('Reported ready to Azure fabric.') +@azure_ds_telemetry_reporter def get_metadata_from_fabric(fallback_lease_file=None, dhcp_opts=None, pubkey_info=None): shim = WALinuxAgentShim(fallback_lease_file=fallback_lease_file, -- cgit v1.2.3 From baa478546d8cac98a706010699d64f8c2f70b5bf Mon Sep 17 00:00:00 2001 From: "Jason Zions (MSFT)" Date: Fri, 10 May 2019 18:38:55 +0000 Subject: Azure: Return static fallback address as if failed to find endpoint The Azure data source helper attempts to use information in the dhcp lease to find the Wireserver endpoint (IP address). Under some unusual circumstances, those attempts will fail. This change uses a static address, known to be always correct in the Azure public and sovereign clouds, when the helper fails to locate a valid dhcp lease. This address is not guaranteed to be correct in Azure Stack environments; it's still best to use the information from the lease whenever possible. 
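
For illustration, the static fallback is stored as the raw DHCP option 245 payload (DEFAULT_WIRESERVER_ENDPOINT = "a8:3f:81:10" in the diff below): four colon-separated hex octets, exactly as the value appears in a lease file. Decoded the same way WALinuxAgentShim.get_ip_from_lease_value handles lease values, it yields the well-known wireserver address 168.63.129.16. A minimal sketch of that decoding follows; the helper name is hypothetical, and the real method additionally unescapes backslashes and accepts short quoted-string forms:

    import socket
    import struct

    def lease_value_to_ip(value):
        # Join the colon-separated hex octets and pack them as a
        # big-endian 32-bit integer, then render as a dotted quad.
        hex_string = ''.join(pair.zfill(2) for pair in value.split(':'))
        return socket.inet_ntoa(struct.pack('>L', int(hex_string, 16)))

    print(lease_value_to_ip('a8:3f:81:10'))  # 168.63.129.16
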
--- cloudinit/sources/helpers/azure.py | 14 +++++++++++--- tests/unittests/test_datasource/test_azure_helper.py | 9 +++++++-- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'cloudinit/sources/helpers/azure.py') diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index d3af05ee..82c4c8c4 100755 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -20,6 +20,9 @@ from cloudinit.reporting import events LOG = logging.getLogger(__name__) +# This endpoint matches the format as found in dhcp lease files, since this +# value is applied if the endpoint can't be found within a lease file +DEFAULT_WIRESERVER_ENDPOINT = "a8:3f:81:10" azure_ds_reporter = events.ReportEventStack( name="azure-ds", @@ -297,7 +300,12 @@ class WALinuxAgentShim(object): @azure_ds_telemetry_reporter def _get_value_from_leases_file(fallback_lease_file): leases = [] - content = util.load_file(fallback_lease_file) + try: + content = util.load_file(fallback_lease_file) + except IOError as ex: + LOG.error("Failed to read %s: %s", fallback_lease_file, ex) + return None + LOG.debug("content is %s", content) option_name = _get_dhcp_endpoint_option_name() for line in content.splitlines(): @@ -372,9 +380,9 @@ class WALinuxAgentShim(object): fallback_lease_file) value = WALinuxAgentShim._get_value_from_leases_file( fallback_lease_file) - if value is None: - raise ValueError('No endpoint found.') + LOG.warning("No lease found; using default endpoint") + value = DEFAULT_WIRESERVER_ENDPOINT endpoint_ip_address = WALinuxAgentShim.get_ip_from_lease_value(value) LOG.debug('Azure endpoint found at %s', endpoint_ip_address) diff --git a/tests/unittests/test_datasource/test_azure_helper.py b/tests/unittests/test_datasource/test_azure_helper.py index 02556165..bd006aba 100644 --- a/tests/unittests/test_datasource/test_azure_helper.py +++ b/tests/unittests/test_datasource/test_azure_helper.py @@ -67,12 +67,17 @@ class TestFindEndpoint(CiTestCase): self.networkd_leases.return_value = None def test_missing_file(self): - self.assertRaises(ValueError, wa_shim.find_endpoint) + """wa_shim find_endpoint uses default endpoint if leasefile not found + """ + self.assertEqual(wa_shim.find_endpoint(), "168.63.129.16") def test_missing_special_azure_line(self): + """wa_shim find_endpoint uses default endpoint if leasefile is found + but does not contain DHCP Option 245 (whose value is the endpoint) + """ self.load_file.return_value = '' self.dhcp_options.return_value = {'eth0': {'key': 'value'}} - self.assertRaises(ValueError, wa_shim.find_endpoint) + self.assertEqual(wa_shim.find_endpoint(), "168.63.129.16") @staticmethod def _build_lease_content(encoded_address): -- cgit v1.2.3 From 2f3bb764626b9065f4102c7c0a67998a9c174444 Mon Sep 17 00:00:00 2001 From: Anh Vo Date: Wed, 14 Aug 2019 21:03:13 +0000 Subject: Azure: Record boot timestamps, system information, and diagnostic events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Collect and record the following information through KVP:  + timestamps related to kernel initialization and systemd activation    of cloud-init services  + system information including cloud-init version, kernel version,    distro version, and python version  + diagnostic events for the most common provisioning error issues    such as empty dhcp lease, corrupted ovf-env.xml, etc. + increasing the log frequency of polling IMDS during reprovision. 
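
The boot timestamps come from simple arithmetic over systemd's monotonic clocks: kernel start is wall-clock time minus uptime, and each *TimestampMonotonic property reported by systemctl is microseconds elapsed since that point. A minimal sketch of the calculation, assuming a systemd host (the patch below wraps the same calls in error handling and publishes the result as a reporting event):

    import time
    from datetime import datetime
    from subprocess import check_output

    # Wall-clock moment the kernel started: now minus uptime.
    with open('/proc/uptime') as f:
        kernel_start = time.time() - float(f.read().split()[0])

    # UserspaceTimestampMonotonic is microseconds since kernel start.
    out = check_output(
        ['systemctl', 'show', '-p', 'UserspaceTimestampMonotonic'])
    user_start = kernel_start + float(out.decode().split('=')[1]) / 1000000

    print(datetime.utcfromtimestamp(kernel_start).isoformat() + 'Z')
    print(datetime.utcfromtimestamp(user_start).isoformat() + 'Z')
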
--- cloudinit/sources/DataSourceAzure.py | 157 ++++++++++++++++++++----- cloudinit/sources/helpers/azure.py | 160 ++++++++++++++++++++++++-- tests/unittests/test_datasource/test_azure.py | 15 ++- tests/unittests/test_reporting_hyperv.py | 65 +++++++++++ 4 files changed, 353 insertions(+), 44 deletions(-) mode change 100755 => 100644 tests/unittests/test_reporting_hyperv.py (limited to 'cloudinit/sources/helpers/azure.py') diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index e6ed2f3b..4984fa84 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -26,9 +26,14 @@ from cloudinit.url_helper import UrlError, readurl, retry_on_url_exc from cloudinit import util from cloudinit.reporting import events -from cloudinit.sources.helpers.azure import (azure_ds_reporter, - azure_ds_telemetry_reporter, - get_metadata_from_fabric) +from cloudinit.sources.helpers.azure import ( + azure_ds_reporter, + azure_ds_telemetry_reporter, + get_metadata_from_fabric, + get_boot_telemetry, + get_system_info, + report_diagnostic_event, + EphemeralDHCPv4WithReporting) LOG = logging.getLogger(__name__) @@ -354,7 +359,7 @@ class DataSourceAzure(sources.DataSource): bname = str(pk['fingerprint'] + ".crt") fp_files += [os.path.join(ddir, bname)] LOG.debug("ssh authentication: " - "using fingerprint from fabirc") + "using fingerprint from fabric") with events.ReportEventStack( name="waiting-for-ssh-public-key", @@ -419,12 +424,17 @@ class DataSourceAzure(sources.DataSource): ret = load_azure_ds_dir(cdev) except NonAzureDataSource: + report_diagnostic_event( + "Did not find Azure data source in %s" % cdev) continue except BrokenAzureDataSource as exc: msg = 'BrokenAzureDataSource: %s' % exc + report_diagnostic_event(msg) raise sources.InvalidMetaDataException(msg) except util.MountFailedError: - LOG.warning("%s was not mountable", cdev) + msg = '%s was not mountable' % cdev + report_diagnostic_event(msg) + LOG.warning(msg) continue perform_reprovision = reprovision or self._should_reprovision(ret) @@ -432,6 +442,7 @@ class DataSourceAzure(sources.DataSource): if util.is_FreeBSD(): msg = "Free BSD is not supported for PPS VMs" LOG.error(msg) + report_diagnostic_event(msg) raise sources.InvalidMetaDataException(msg) ret = self._reprovision() imds_md = get_metadata_from_imds( @@ -450,7 +461,9 @@ class DataSourceAzure(sources.DataSource): break if not found: - raise sources.InvalidMetaDataException('No Azure metadata found') + msg = 'No Azure metadata found' + report_diagnostic_event(msg) + raise sources.InvalidMetaDataException(msg) if found == ddir: LOG.debug("using files cached in %s", ddir) @@ -469,9 +482,14 @@ class DataSourceAzure(sources.DataSource): self._report_ready(lease=self._ephemeral_dhcp_ctx.lease) self._ephemeral_dhcp_ctx.clean_network() # Teardown ephemeral else: - with EphemeralDHCPv4() as lease: - self._report_ready(lease=lease) - + try: + with EphemeralDHCPv4WithReporting( + azure_ds_reporter) as lease: + self._report_ready(lease=lease) + except Exception as e: + report_diagnostic_event( + "exception while reporting ready: %s" % e) + raise return crawled_data def _is_platform_viable(self): @@ -492,6 +510,16 @@ class DataSourceAzure(sources.DataSource): """ if not self._is_platform_viable(): return False + try: + get_boot_telemetry() + except Exception as e: + LOG.warning("Failed to get boot telemetry: %s", e) + + try: + get_system_info() + except Exception as e: + LOG.warning("Failed to get system information: %s", e) + try: 
crawled_data = util.log_time( logfunc=LOG.debug, msg='Crawl of metadata service', @@ -551,27 +579,55 @@ class DataSourceAzure(sources.DataSource): headers = {"Metadata": "true"} nl_sock = None report_ready = bool(not os.path.isfile(REPORTED_READY_MARKER_FILE)) + self.imds_logging_threshold = 1 + self.imds_poll_counter = 1 + dhcp_attempts = 0 + vnet_switched = False + return_val = None def exc_cb(msg, exception): if isinstance(exception, UrlError) and exception.code == 404: + if self.imds_poll_counter == self.imds_logging_threshold: + # Reducing the logging frequency as we are polling IMDS + self.imds_logging_threshold *= 2 + LOG.debug("Call to IMDS with arguments %s failed " + "with status code %s after %s retries", + msg, exception.code, self.imds_poll_counter) + LOG.debug("Backing off logging threshold for the same " + "exception to %d", self.imds_logging_threshold) + self.imds_poll_counter += 1 return True + # If we get an exception while trying to call IMDS, we # call DHCP and setup the ephemeral network to acquire the new IP. + LOG.debug("Call to IMDS with arguments %s failed with " + "status code %s", msg, exception.code) + report_diagnostic_event("polling IMDS failed with exception %s" + % exception.code) return False LOG.debug("Wait for vnetswitch to happen") while True: try: - # Save our EphemeralDHCPv4 context so we avoid repeated dhcp - self._ephemeral_dhcp_ctx = EphemeralDHCPv4() - lease = self._ephemeral_dhcp_ctx.obtain_lease() + # Save our EphemeralDHCPv4 context to avoid repeated dhcp + with events.ReportEventStack( + name="obtain-dhcp-lease", + description="obtain dhcp lease", + parent=azure_ds_reporter): + self._ephemeral_dhcp_ctx = EphemeralDHCPv4() + lease = self._ephemeral_dhcp_ctx.obtain_lease() + + if vnet_switched: + dhcp_attempts += 1 if report_ready: try: nl_sock = netlink.create_bound_netlink_socket() except netlink.NetlinkCreateSocketError as e: + report_diagnostic_event(e) LOG.warning(e) self._ephemeral_dhcp_ctx.clean_network() - return + break + path = REPORTED_READY_MARKER_FILE LOG.info( "Creating a marker file to report ready: %s", path) @@ -579,17 +635,33 @@ class DataSourceAzure(sources.DataSource): pid=os.getpid(), time=time())) self._report_ready(lease=lease) report_ready = False - try: - netlink.wait_for_media_disconnect_connect( - nl_sock, lease['interface']) - except AssertionError as error: - LOG.error(error) - return + + with events.ReportEventStack( + name="wait-for-media-disconnect-connect", + description="wait for vnet switch", + parent=azure_ds_reporter): + try: + netlink.wait_for_media_disconnect_connect( + nl_sock, lease['interface']) + except AssertionError as error: + report_diagnostic_event(error) + LOG.error(error) + break + + vnet_switched = True self._ephemeral_dhcp_ctx.clean_network() else: - return readurl(url, timeout=IMDS_TIMEOUT_IN_SECONDS, - headers=headers, exception_cb=exc_cb, - infinite=True, log_req_resp=False).contents + with events.ReportEventStack( + name="get-reprovision-data-from-imds", + description="get reprovision data from imds", + parent=azure_ds_reporter): + return_val = readurl(url, + timeout=IMDS_TIMEOUT_IN_SECONDS, + headers=headers, + exception_cb=exc_cb, + infinite=True, + log_req_resp=False).contents + break except UrlError: # Teardown our EphemeralDHCPv4 context on failure as we retry self._ephemeral_dhcp_ctx.clean_network() @@ -598,6 +670,14 @@ class DataSourceAzure(sources.DataSource): if nl_sock: nl_sock.close() + if vnet_switched: + report_diagnostic_event("attempted dhcp %d times after reuse" % + 
dhcp_attempts) + report_diagnostic_event("polled imds %d times after reuse" % + self.imds_poll_counter) + + return return_val + @azure_ds_telemetry_reporter def _report_ready(self, lease): """Tells the fabric provisioning has completed """ @@ -666,9 +746,12 @@ class DataSourceAzure(sources.DataSource): self.ds_cfg['agent_command']) try: fabric_data = metadata_func() - except Exception: + except Exception as e: + report_diagnostic_event( + "Error communicating with Azure fabric; You may experience " + "connectivity issues: %s" % e) LOG.warning( - "Error communicating with Azure fabric; You may experience." + "Error communicating with Azure fabric; You may experience " "connectivity issues.", exc_info=True) return False @@ -1027,7 +1110,9 @@ def read_azure_ovf(contents): try: dom = minidom.parseString(contents) except Exception as e: - raise BrokenAzureDataSource("Invalid ovf-env.xml: %s" % e) + error_str = "Invalid ovf-env.xml: %s" % e + report_diagnostic_event(error_str) + raise BrokenAzureDataSource(error_str) results = find_child(dom.documentElement, lambda n: n.localName == "ProvisioningSection") @@ -1299,8 +1384,13 @@ def get_metadata_from_imds(fallback_nic, retries): if net.is_up(fallback_nic): return util.log_time(**kwargs) else: - with EphemeralDHCPv4(fallback_nic): - return util.log_time(**kwargs) + try: + with EphemeralDHCPv4WithReporting( + azure_ds_reporter, fallback_nic): + return util.log_time(**kwargs) + except Exception as e: + report_diagnostic_event("exception while getting metadata: %s" % e) + raise @azure_ds_telemetry_reporter @@ -1313,11 +1403,14 @@ def _get_metadata_from_imds(retries): url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers, retries=retries, exception_cb=retry_on_url_exc) except Exception as e: - LOG.debug('Ignoring IMDS instance metadata: %s', e) + msg = 'Ignoring IMDS instance metadata: %s' % e + report_diagnostic_event(msg) + LOG.debug(msg) return {} try: return util.load_json(str(response)) - except json.decoder.JSONDecodeError: + except json.decoder.JSONDecodeError as e: + report_diagnostic_event('non-json imds response' % e) LOG.warning( 'Ignoring non-json IMDS instance metadata: %s', str(response)) return {} @@ -1370,8 +1463,10 @@ def _is_platform_viable(seed_dir): asset_tag = util.read_dmi_data('chassis-asset-tag') if asset_tag == AZURE_CHASSIS_ASSET_TAG: return True - LOG.debug("Non-Azure DMI asset tag '%s' discovered.", asset_tag) - evt.description = "Non-Azure DMI asset tag '%s' discovered.", asset_tag + msg = "Non-Azure DMI asset tag '%s' discovered." 
% asset_tag + LOG.debug(msg) + evt.description = msg + report_diagnostic_event(msg) if os.path.exists(os.path.join(seed_dir, 'ovf-env.xml')): return True return False diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index 82c4c8c4..f1fba175 100755 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -16,7 +16,11 @@ from xml.etree import ElementTree from cloudinit import url_helper from cloudinit import util +from cloudinit import version +from cloudinit import distros from cloudinit.reporting import events +from cloudinit.net.dhcp import EphemeralDHCPv4 +from datetime import datetime LOG = logging.getLogger(__name__) @@ -24,6 +28,10 @@ LOG = logging.getLogger(__name__) # value is applied if the endpoint can't be found within a lease file DEFAULT_WIRESERVER_ENDPOINT = "a8:3f:81:10" +BOOT_EVENT_TYPE = 'boot-telemetry' +SYSTEMINFO_EVENT_TYPE = 'system-info' +DIAGNOSTIC_EVENT_TYPE = 'diagnostic' + azure_ds_reporter = events.ReportEventStack( name="azure-ds", description="initialize reporter for azure ds", @@ -40,6 +48,105 @@ def azure_ds_telemetry_reporter(func): return impl +@azure_ds_telemetry_reporter +def get_boot_telemetry(): + """Report timestamps related to kernel initialization and systemd + activation of cloud-init""" + if not distros.uses_systemd(): + raise RuntimeError( + "distro not using systemd, skipping boot telemetry") + + LOG.debug("Collecting boot telemetry") + try: + kernel_start = float(time.time()) - float(util.uptime()) + except ValueError: + raise RuntimeError("Failed to determine kernel start timestamp") + + try: + out, _ = util.subp(['/bin/systemctl', + 'show', '-p', + 'UserspaceTimestampMonotonic'], + capture=True) + tsm = None + if out and '=' in out: + tsm = out.split("=")[1] + + if not tsm: + raise RuntimeError("Failed to parse " + "UserspaceTimestampMonotonic from systemd") + + user_start = kernel_start + (float(tsm) / 1000000) + except util.ProcessExecutionError as e: + raise RuntimeError("Failed to get UserspaceTimestampMonotonic: %s" + % e) + except ValueError as e: + raise RuntimeError("Failed to parse " + "UserspaceTimestampMonotonic from systemd: %s" + % e) + + try: + out, _ = util.subp(['/bin/systemctl', 'show', + 'cloud-init-local', '-p', + 'InactiveExitTimestampMonotonic'], + capture=True) + tsm = None + if out and '=' in out: + tsm = out.split("=")[1] + if not tsm: + raise RuntimeError("Failed to parse " + "InactiveExitTimestampMonotonic from systemd") + + cloudinit_activation = kernel_start + (float(tsm) / 1000000) + except util.ProcessExecutionError as e: + raise RuntimeError("Failed to get InactiveExitTimestampMonotonic: %s" + % e) + except ValueError as e: + raise RuntimeError("Failed to parse " + "InactiveExitTimestampMonotonic from systemd: %s" + % e) + + evt = events.ReportingEvent( + BOOT_EVENT_TYPE, 'boot-telemetry', + "kernel_start=%s user_start=%s cloudinit_activation=%s" % + (datetime.utcfromtimestamp(kernel_start).isoformat() + 'Z', + datetime.utcfromtimestamp(user_start).isoformat() + 'Z', + datetime.utcfromtimestamp(cloudinit_activation).isoformat() + 'Z'), + events.DEFAULT_EVENT_ORIGIN) + events.report_event(evt) + + # return the event for unit testing purpose + return evt + + +@azure_ds_telemetry_reporter +def get_system_info(): + """Collect and report system information""" + info = util.system_info() + evt = events.ReportingEvent( + SYSTEMINFO_EVENT_TYPE, 'system information', + "cloudinit_version=%s, kernel_version=%s, variant=%s, " + "distro_name=%s, 
distro_version=%s, flavor=%s, " + "python_version=%s" % + (version.version_string(), info['release'], info['variant'], + info['dist'][0], info['dist'][1], info['dist'][2], + info['python']), events.DEFAULT_EVENT_ORIGIN) + events.report_event(evt) + + # return the event for unit testing purpose + return evt + + +def report_diagnostic_event(str): + """Report a diagnostic event""" + evt = events.ReportingEvent( + DIAGNOSTIC_EVENT_TYPE, 'diagnostic message', + str, events.DEFAULT_EVENT_ORIGIN) + events.report_event(evt) + + # return the event for unit testing purpose + return evt + + @contextmanager def cd(newdir): prevdir = os.getcwd() @@ -360,16 +467,19 @@ class WALinuxAgentShim(object): value = dhcp245 LOG.debug("Using Azure Endpoint from dhcp options") if value is None: + report_diagnostic_event("No Azure endpoint from dhcp options") LOG.debug('Finding Azure endpoint from networkd...') value = WALinuxAgentShim._networkd_get_value_from_leases() if value is None: # Option-245 stored in /run/cloud-init/dhclient.hooks/.json # a dhclient exit hook that calls cloud-init-dhclient-hook + report_diagnostic_event("No Azure endpoint from networkd") LOG.debug('Finding Azure endpoint from hook json...') dhcp_options = WALinuxAgentShim._load_dhclient_json() value = WALinuxAgentShim._get_value_from_dhcpoptions(dhcp_options) if value is None: # Fallback and check the leases file if unsuccessful + report_diagnostic_event("No Azure endpoint from dhclient logs") LOG.debug("Unable to find endpoint in dhclient logs. " " Falling back to check lease files") if fallback_lease_file is None: @@ -381,11 +491,15 @@ class WALinuxAgentShim(object): value = WALinuxAgentShim._get_value_from_leases_file( fallback_lease_file) if value is None: - LOG.warning("No lease found; using default endpoint") + msg = "No lease found; using default endpoint" + report_diagnostic_event(msg) + LOG.warning(msg) value = DEFAULT_WIRESERVER_ENDPOINT endpoint_ip_address = WALinuxAgentShim.get_ip_from_lease_value(value) - LOG.debug('Azure endpoint found at %s', endpoint_ip_address) + msg = 'Azure endpoint found at %s' % endpoint_ip_address + report_diagnostic_event(msg) + LOG.debug(msg) return endpoint_ip_address @azure_ds_telemetry_reporter @@ -399,16 +513,19 @@ class WALinuxAgentShim(object): try: response = http_client.get( 'http://{0}/machine/?comp=goalstate'.format(self.endpoint)) - except Exception: + except Exception as e: if attempts < 10: time.sleep(attempts + 1) else: + report_diagnostic_event( + "failed to register with Azure: %s" % e) raise else: break attempts += 1 LOG.debug('Successfully fetched GoalState XML.') goal_state = GoalState(response.contents, http_client) + report_diagnostic_event("container_id %s" % goal_state.container_id) ssh_keys = [] if goal_state.certificates_xml is not None and pubkey_info is not None: LOG.debug('Certificate XML found; parsing out public keys.') @@ -449,11 +566,20 @@ class WALinuxAgentShim(object): container_id=goal_state.container_id, instance_id=goal_state.instance_id, ) - http_client.post( - "http://{0}/machine?comp=health".format(self.endpoint), - data=document, - extra_headers={'Content-Type': 'text/xml; charset=utf-8'}, - ) + # Host will collect kvps when cloud-init reports ready. + # some kvps might still be in the queue. We yield the scheduler + # to make sure we process all kvps up till this point. 
+ time.sleep(0) + try: + http_client.post( + "http://{0}/machine?comp=health".format(self.endpoint), + data=document, + extra_headers={'Content-Type': 'text/xml; charset=utf-8'}, + ) + except Exception as e: + report_diagnostic_event("exception while reporting ready: %s" % e) + raise + LOG.info('Reported ready to Azure fabric.') @@ -467,4 +593,22 @@ def get_metadata_from_fabric(fallback_lease_file=None, dhcp_opts=None, finally: shim.clean_up() + +class EphemeralDHCPv4WithReporting(object): + def __init__(self, reporter, nic=None): + self.reporter = reporter + self.ephemeralDHCPv4 = EphemeralDHCPv4(iface=nic) + + def __enter__(self): + with events.ReportEventStack( + name="obtain-dhcp-lease", + description="obtain dhcp lease", + parent=self.reporter): + return self.ephemeralDHCPv4.__enter__() + + def __exit__(self, excp_type, excp_value, excp_traceback): + self.ephemeralDHCPv4.__exit__( + excp_type, excp_value, excp_traceback) + + # vi: ts=4 expandtab diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py index 4d57cebc..3547dd94 100644 --- a/tests/unittests/test_datasource/test_azure.py +++ b/tests/unittests/test_datasource/test_azure.py @@ -181,7 +181,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase): self.logs.getvalue()) @mock.patch(MOCKPATH + 'readurl') - @mock.patch(MOCKPATH + 'EphemeralDHCPv4') + @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting') @mock.patch(MOCKPATH + 'net.is_up') def test_get_metadata_performs_dhcp_when_network_is_down( self, m_net_is_up, m_dhcp, m_readurl): @@ -195,7 +195,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase): dsaz.get_metadata_from_imds('eth9', retries=2)) m_net_is_up.assert_called_with('eth9') - m_dhcp.assert_called_with('eth9') + m_dhcp.assert_called_with(mock.ANY, 'eth9') self.assertIn( "Crawl of Azure Instance Metadata Service (IMDS) took", # log_time self.logs.getvalue()) @@ -552,7 +552,8 @@ scbus-1 on xpt0 bus 0 dsrc.crawl_metadata() self.assertEqual(str(cm.exception), error_msg) - @mock.patch('cloudinit.sources.DataSourceAzure.EphemeralDHCPv4') + @mock.patch( + 'cloudinit.sources.DataSourceAzure.EphemeralDHCPv4WithReporting') @mock.patch('cloudinit.sources.DataSourceAzure.util.write_file') @mock.patch( 'cloudinit.sources.DataSourceAzure.DataSourceAzure._report_ready') @@ -1308,7 +1309,9 @@ class TestAzureBounce(CiTestCase): self.assertEqual(initial_host_name, self.set_hostname.call_args_list[-1][0][0]) - def test_environment_correct_for_bounce_command(self): + @mock.patch.object(dsaz, 'get_boot_telemetry') + def test_environment_correct_for_bounce_command( + self, mock_get_boot_telemetry): interface = 'int0' hostname = 'my-new-host' old_hostname = 'my-old-host' @@ -1324,7 +1327,9 @@ class TestAzureBounce(CiTestCase): self.assertEqual(hostname, bounce_env['hostname']) self.assertEqual(old_hostname, bounce_env['old_hostname']) - def test_default_bounce_command_ifup_used_by_default(self): + @mock.patch.object(dsaz, 'get_boot_telemetry') + def test_default_bounce_command_ifup_used_by_default( + self, mock_get_boot_telemetry): cfg = {'hostname_bounce': {'policy': 'force'}} data = self.get_ovf_env_with_dscfg('some-hostname', cfg) dsrc = self._get_ds(data, agent_command=['not', '__builtin__']) diff --git a/tests/unittests/test_reporting_hyperv.py b/tests/unittests/test_reporting_hyperv.py old mode 100755 new mode 100644 index d01ed5b3..640895a4 --- a/tests/unittests/test_reporting_hyperv.py +++ b/tests/unittests/test_reporting_hyperv.py @@ -7,9 +7,12 @@ import json import os import struct 
import time +import re +import mock from cloudinit import util from cloudinit.tests.helpers import CiTestCase +from cloudinit.sources.helpers import azure class TestKvpEncoding(CiTestCase): @@ -126,3 +129,65 @@ class TextKvpReporter(CiTestCase): reporter = HyperVKvpReportingHandler(kvp_file_path=self.tmp_file_path) kvps = list(reporter._iterate_kvps(0)) self.assertEqual(0, len(kvps)) + + @mock.patch('cloudinit.distros.uses_systemd') + @mock.patch('cloudinit.util.subp') + def test_get_boot_telemetry(self, m_subp, m_sysd): + reporter = HyperVKvpReportingHandler(kvp_file_path=self.tmp_file_path) + datetime_pattern = r"\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]" + r"\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z)" + + # get_boot_telemetry makes two subp calls to systemctl. We provide + # a list of values that the subp calls should return + m_subp.side_effect = [ + ('UserspaceTimestampMonotonic=1844838', ''), + ('InactiveExitTimestampMonotonic=3068203', '')] + m_sysd.return_value = True + + reporter.publish_event(azure.get_boot_telemetry()) + reporter.q.join() + kvps = list(reporter._iterate_kvps(0)) + self.assertEqual(1, len(kvps)) + + evt_msg = kvps[0]['value'] + if not re.search("kernel_start=" + datetime_pattern, evt_msg): + raise AssertionError("missing kernel_start timestamp") + if not re.search("user_start=" + datetime_pattern, evt_msg): + raise AssertionError("missing user_start timestamp") + if not re.search("cloudinit_activation=" + datetime_pattern, + evt_msg): + raise AssertionError( + "missing cloudinit_activation timestamp") + + def test_get_system_info(self): + reporter = HyperVKvpReportingHandler(kvp_file_path=self.tmp_file_path) + pattern = r"[^=\s]+" + + reporter.publish_event(azure.get_system_info()) + reporter.q.join() + kvps = list(reporter._iterate_kvps(0)) + self.assertEqual(1, len(kvps)) + evt_msg = kvps[0]['value'] + + # the most important information is cloudinit version, + # kernel_version, and the distro variant. It is ok if + # if the rest is not available + if not re.search("cloudinit_version=" + pattern, evt_msg): + raise AssertionError("missing cloudinit_version string") + if not re.search("kernel_version=" + pattern, evt_msg): + raise AssertionError("missing kernel_version string") + if not re.search("variant=" + pattern, evt_msg): + raise AssertionError("missing distro variant string") + + def test_report_diagnostic_event(self): + reporter = HyperVKvpReportingHandler(kvp_file_path=self.tmp_file_path) + + reporter.publish_event( + azure.report_diagnostic_event("test_diagnostic")) + reporter.q.join() + kvps = list(reporter._iterate_kvps(0)) + self.assertEqual(1, len(kvps)) + evt_msg = kvps[0]['value'] + + if "test_diagnostic" not in evt_msg: + raise AssertionError("missing expected diagnostic message") -- cgit v1.2.3 From f69d33a723b805fec3ee70c3a6127c8cadcb02d8 Mon Sep 17 00:00:00 2001 From: Chad Smith Date: Mon, 2 Dec 2019 16:24:18 -0700 Subject: url_helper: read_file_or_url should pass headers param into readurl (#66) Headers param was accidentally omitted and no longer passed through to readurl due to a previous commit. To avoid this omission of params in the future, drop positional param definitions from read_file_or_url and pass all kwargs through to readurl when we are not operating on a file. In util:read_seeded, correct the case where invalid positional param file_retries was being passed into read_file_or_url. Also drop duplicated file:// prefix addition from read_seeded because read_file_or_url does that work anyway. 
LP: #1854084 --- cloudinit/sources/helpers/azure.py | 6 ++- cloudinit/tests/test_url_helper.py | 52 ++++++++++++++++++++++ cloudinit/url_helper.py | 47 +++++++++++++++---- cloudinit/user_data.py | 2 +- cloudinit/util.py | 15 ++----- .../unittests/test_datasource/test_azure_helper.py | 18 +++++--- 6 files changed, 112 insertions(+), 28 deletions(-) (limited to 'cloudinit/sources/helpers/azure.py') diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index f1fba175..f5cdb3fd 100755 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -183,14 +183,16 @@ class AzureEndpointHttpClient(object): if secure: headers = self.headers.copy() headers.update(self.extra_secure_headers) - return url_helper.read_file_or_url(url, headers=headers) + return url_helper.read_file_or_url(url, headers=headers, timeout=5, + retries=10) def post(self, url, data=None, extra_headers=None): headers = self.headers if extra_headers is not None: headers = self.headers.copy() headers.update(extra_headers) - return url_helper.read_file_or_url(url, data=data, headers=headers) + return url_helper.read_file_or_url(url, data=data, headers=headers, + timeout=5, retries=10) class GoalState(object): diff --git a/cloudinit/tests/test_url_helper.py b/cloudinit/tests/test_url_helper.py index aa9f3ec1..e883ddc2 100644 --- a/cloudinit/tests/test_url_helper.py +++ b/cloudinit/tests/test_url_helper.py @@ -4,6 +4,7 @@ from cloudinit.url_helper import ( NOT_FOUND, UrlError, oauth_headers, read_file_or_url, retry_on_url_exc) from cloudinit.tests.helpers import CiTestCase, mock, skipIf from cloudinit import util +from cloudinit import version import httpretty import requests @@ -17,6 +18,9 @@ except ImportError: _missing_oauthlib_dep = True +M_PATH = 'cloudinit.url_helper.' 
+
+
 class TestOAuthHeaders(CiTestCase):

     def test_oauth_headers_raises_not_implemented_when_oathlib_missing(self):
@@ -67,6 +71,54 @@ class TestReadFileOrUrl(CiTestCase):
         self.assertEqual(result.contents, data)
         self.assertEqual(str(result), data.decode('utf-8'))

+    @mock.patch(M_PATH + 'readurl')
+    def test_read_file_or_url_passes_params_to_readurl(self, m_readurl):
+        """read_file_or_url passes all params through to readurl."""
+        url = 'http://hostname/path'
+        response = 'This is my url content\n'
+        m_readurl.return_value = response
+        params = {'url': url, 'timeout': 1, 'retries': 2,
+                  'headers': {'somehdr': 'val'},
+                  'data': 'data', 'sec_between': 1,
+                  'ssl_details': {'cert_file': '/path/cert.pem'},
+                  'headers_cb': 'headers_cb', 'exception_cb': 'exception_cb'}
+        self.assertEqual(response, read_file_or_url(**params))
+        params.pop('url')  # url is passed in as a positional arg
+        self.assertEqual([mock.call(url, **params)], m_readurl.call_args_list)
+
+    def test_wb_read_url_defaults_honored_by_read_file_or_url_callers(self):
+        """Readurl param defaults used when unspecified by read_file_or_url
+
+        Param defaults tested are as follows:
+            retries: 0, additional headers None beyond default, method: GET,
+            data: None, check_status: True and allow_redirects: True
+        """
+        url = 'http://hostname/path'
+
+        m_response = mock.MagicMock()
+
+        class FakeSession(requests.Session):
+            def request(cls, **kwargs):
+                self.assertEqual(
+                    {'url': url, 'allow_redirects': True, 'method': 'GET',
+                     'headers': {
+                         'User-Agent': 'Cloud-Init/%s' % (
+                             version.version_string())}},
+                    kwargs)
+                return m_response
+
+        with mock.patch(M_PATH + 'requests.Session') as m_session:
+            error = requests.exceptions.HTTPError('broke')
+            m_session.side_effect = [error, FakeSession()]
+            # assert no retries and check_status == True
+            with self.assertRaises(UrlError) as context_manager:
+                response = read_file_or_url(url)
+            self.assertEqual('broke', str(context_manager.exception))
+            # assert default headers, method, url and allow_redirects True
+            # Success on 2nd call with FakeSession
+            response = read_file_or_url(url)
+            self.assertEqual(m_response, response._response)
+

 class TestRetryOnUrlExc(CiTestCase):

diff --git a/cloudinit/url_helper.py b/cloudinit/url_helper.py
index 48ddae45..1496a471 100644
--- a/cloudinit/url_helper.py
+++ b/cloudinit/url_helper.py
@@ -81,14 +81,19 @@ def combine_url(base, *add_ons):
     return url


-def read_file_or_url(url, timeout=5, retries=10,
-                     headers=None, data=None, sec_between=1, ssl_details=None,
-                     headers_cb=None, exception_cb=None):
+def read_file_or_url(url, **kwargs):
+    """Wrapper function around readurl to allow passing a file path as url.
+
+    When url is not a local file path, pass any kwargs through to readurl.
+
+    In the case of parameter passthrough to readurl, readurl applies its own
+    defaults for any parameters left unspecified; see the call signature of
+    readurl in this module for param docs.
+    """
     url = url.lstrip()
     if url.startswith("/"):
         url = "file://%s" % url
     if url.lower().startswith("file://"):
-        if data:
+        if kwargs.get("data"):
             LOG.warning("Unable to post data to file resource %s", url)
         file_path = url[len("file://"):]
         try:
@@ -101,10 +106,7 @@ def read_file_or_url(url, timeout=5, retries=10,
             raise UrlError(cause=e, code=code, headers=None, url=url)
         return FileResponse(file_path, contents=contents)
     else:
-        return readurl(url, timeout=timeout, retries=retries,
-                       headers_cb=headers_cb, data=data,
-                       sec_between=sec_between, ssl_details=ssl_details,
-                       exception_cb=exception_cb)
+        return readurl(url, **kwargs)


 # Made to have same accessors as UrlResponse so that the
@@ -201,6 +203,35 @@ def readurl(url, data=None, timeout=None, retries=0, sec_between=1,
             check_status=True, allow_redirects=True, exception_cb=None,
             session=None, infinite=False, log_req_resp=True,
             request_method=None):
+    """Wrapper around requests.Session to read the url and retry if necessary
+
+    :param url: Mandatory url to request.
+    :param data: Optional form data to post the URL. Will set request_method
+        to 'POST' if present.
+    :param timeout: Timeout in seconds to wait for a response.
+    :param retries: Number of times to retry on exception if exception_cb is
+        None or exception_cb returns True for the exception caught. Default is
+        to fail with 0 retries on exception.
+    :param sec_between: Default 1: amount of seconds passed to time.sleep
+        between retries. None or -1 means don't sleep.
+    :param headers: Optional dict of headers to send during request.
+    :param headers_cb: Optional callable returning a dict of values to send as
+        headers during request.
+    :param ssl_details: Optional dict providing key_file, ca_certs, and
+        cert_file keys for use in ssl connections.
+    :param check_status: Optional boolean set True to raise when HTTPError
+        occurs. Default: True.
+    :param allow_redirects: Optional boolean passed straight to
+        Session.request as 'allow_redirects'. Default: True.
+    :param exception_cb: Optional callable which accepts the params msg and
+        exception and returns a boolean True if retries are permitted.
+    :param session: Optional existing requests.Session instance to reuse.
+    :param infinite: Bool, set True to retry indefinitely. Default: False.
+    :param log_req_resp: Set False to turn off verbose debug messages.
+    :param request_method: String passed as 'method' to Session.request.
+        Typically GET, or POST. Default: POST if data is provided, GET
+        otherwise.
+    """
     url = _cleanurl(url)
     req_args = {
         'url': url,
diff --git a/cloudinit/user_data.py b/cloudinit/user_data.py
index ed83d2d8..15af1daf 100644
--- a/cloudinit/user_data.py
+++ b/cloudinit/user_data.py
@@ -224,7 +224,7 @@ class UserDataProcessor(object):
                     content = util.load_file(include_once_fn)
                 else:
                     try:
-                        resp = read_file_or_url(include_url,
+                        resp = read_file_or_url(include_url, timeout=5, retries=10,
                                                 ssl_details=self.ssl_details)
diff --git a/cloudinit/util.py b/cloudinit/util.py
index 78b6a2d0..9d9d5c72 100644
--- a/cloudinit/util.py
+++ b/cloudinit/util.py
@@ -986,13 +986,6 @@ def load_yaml(blob, default=None, allowed=(dict,)):


 def read_seeded(base="", ext="", timeout=5, retries=10, file_retries=0):
-    if base.startswith("/"):
-        base = "file://%s" % base
-
-    # default retries for file is 0. for network is 10
-    if base.startswith("file://"):
-        retries = file_retries
-
     if base.find("%s") >= 0:
         ud_url = base % ("user-data" + ext)
         md_url = base % ("meta-data" + ext)
@@ -1000,14 +993,14 @@
         ud_url = "%s%s%s" % (base, "user-data", ext)
         md_url = "%s%s%s" % (base, "meta-data", ext)

-    md_resp = url_helper.read_file_or_url(md_url, timeout, retries,
-                                          file_retries)
+    md_resp = url_helper.read_file_or_url(md_url, timeout=timeout,
+                                          retries=retries)
     md = None
     if md_resp.ok():
         md = load_yaml(decode_binary(md_resp.contents), default={})

-    ud_resp = url_helper.read_file_or_url(ud_url, timeout, retries,
-                                          file_retries)
+    ud_resp = url_helper.read_file_or_url(ud_url, timeout=timeout,
+                                          retries=retries)
     ud = None
     if ud_resp.ok():
         ud = ud_resp.contents
diff --git a/tests/unittests/test_datasource/test_azure_helper.py b/tests/unittests/test_datasource/test_azure_helper.py
index bd006aba..bd17f636 100644
--- a/tests/unittests/test_datasource/test_azure_helper.py
+++ b/tests/unittests/test_datasource/test_azure_helper.py
@@ -212,8 +212,10 @@ class TestAzureEndpointHttpClient(CiTestCase):
         response = client.get(url, secure=False)
         self.assertEqual(1, self.read_file_or_url.call_count)
         self.assertEqual(self.read_file_or_url.return_value, response)
-        self.assertEqual(mock.call(url, headers=self.regular_headers),
-                         self.read_file_or_url.call_args)
+        self.assertEqual(
+            mock.call(url, headers=self.regular_headers, retries=10,
+                      timeout=5),
+            self.read_file_or_url.call_args)

     def test_secure_get(self):
         url = 'MyTestUrl'
@@ -227,8 +229,10 @@ class TestAzureEndpointHttpClient(CiTestCase):
         response = client.get(url, secure=True)
         self.assertEqual(1, self.read_file_or_url.call_count)
         self.assertEqual(self.read_file_or_url.return_value, response)
-        self.assertEqual(mock.call(url, headers=expected_headers),
-                         self.read_file_or_url.call_args)
+        self.assertEqual(
+            mock.call(url, headers=expected_headers, retries=10,
+                      timeout=5),
+            self.read_file_or_url.call_args)

     def test_post(self):
         data = mock.MagicMock()
@@ -238,7 +242,8 @@ class TestAzureEndpointHttpClient(CiTestCase):
         self.assertEqual(1, self.read_file_or_url.call_count)
         self.assertEqual(self.read_file_or_url.return_value, response)
         self.assertEqual(
-            mock.call(url, data=data, headers=self.regular_headers),
+            mock.call(url, data=data, headers=self.regular_headers,
+                      retries=10, timeout=5),
             self.read_file_or_url.call_args)

     def test_post_with_extra_headers(self):
@@ -250,7 +255,8 @@ class TestAzureEndpointHttpClient(CiTestCase):
         expected_headers = self.regular_headers.copy()
         expected_headers.update(extra_headers)
         self.assertEqual(
-            mock.call(mock.ANY, data=mock.ANY, headers=expected_headers),
+            mock.call(mock.ANY, data=mock.ANY, headers=expected_headers,
+                      retries=10, timeout=5),
             self.read_file_or_url.call_args)
--
cgit v1.2.3


From 129b1c4ea250619bd7caed7aaffacc796b0139f2 Mon Sep 17 00:00:00 2001
From: AOhassan <37305877+AOhassan@users.noreply.github.com>
Date: Thu, 12 Dec 2019 13:51:42 -0800
Subject: azure: avoid re-running cloud-init when instance-id is byte-swapped
 (#84)

Azure stores the instance ID with an incorrect byte ordering for the
first three hyphen delimited parts. This results in invalid
is_new_instance checks forcing Azure datasource to recrawl the metadata
service.

When persisting instance-id from the metadata service, swap the
instance-id string byte order such that it is consistent with that
returned by dmi information.
Check whether the instance-id string is a byte-swapped match when
correctly determining whether the Azure platform instance-id has
actually changed.
---
 cloudinit/sources/DataSourceAzure.py               | 16 ++++++++++---
 cloudinit/sources/helpers/azure.py                 | 27 ++++++++++++++++++++++
 tests/unittests/test_datasource/test_azure.py      | 24 ++++++++++++++++---
 .../unittests/test_datasource/test_azure_helper.py | 19 +++++++++++++++
 4 files changed, 80 insertions(+), 6 deletions(-)

(limited to 'cloudinit/sources/helpers/azure.py')

diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index 87a848ce..24f448c5 100755
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -33,7 +33,8 @@ from cloudinit.sources.helpers.azure import (
     get_boot_telemetry,
     get_system_info,
     report_diagnostic_event,
-    EphemeralDHCPv4WithReporting)
+    EphemeralDHCPv4WithReporting,
+    is_byte_swapped)

 LOG = logging.getLogger(__name__)

@@ -471,8 +472,7 @@ class DataSourceAzure(sources.DataSource):
         seed = _get_random_seed()
         if seed:
             crawled_data['metadata']['random_seed'] = seed
-        crawled_data['metadata']['instance-id'] = util.read_dmi_data(
-            'system-uuid')
+        crawled_data['metadata']['instance-id'] = self._iid()

         if perform_reprovision:
             LOG.info("Reporting ready to Azure after getting ReprovisionData")
@@ -558,6 +558,16 @@ class DataSourceAzure(sources.DataSource):
         # quickly (local check only) if self.instance_id is still valid
         return sources.instance_id_matches_system_uuid(self.get_instance_id())

+    def _iid(self, previous=None):
+        prev_iid_path = os.path.join(
+            self.paths.get_cpath('data'), 'instance-id')
+        iid = util.read_dmi_data('system-uuid')
+        if os.path.exists(prev_iid_path):
+            previous = util.load_file(prev_iid_path).strip()
+            if is_byte_swapped(previous, iid):
+                return previous
+        return iid
+
     @azure_ds_telemetry_reporter
     def setup(self, is_new_instance):
         if self._negotiated is False:
diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py
index f5cdb3fd..fc760581 100755
--- a/cloudinit/sources/helpers/azure.py
+++ b/cloudinit/sources/helpers/azure.py
@@ -7,6 +7,7 @@ import re
 import socket
 import struct
 import time
+import textwrap

 from cloudinit.net import dhcp
 from cloudinit import stages
@@ -48,6 +49,32 @@ def azure_ds_telemetry_reporter(func):
     return impl


+def is_byte_swapped(previous_id, current_id):
+    """
+    Azure stores the instance ID with an incorrect byte ordering for the
+    first three hyphen-delimited parts. This corrects the byte order such
+    that it is consistent with that returned by the metadata service.
+    """
+    if previous_id == current_id:
+        return False
+
+    def swap_bytestring(s, width=2):
+        dd = [byte for byte in textwrap.wrap(s, width)]
+        dd.reverse()
+        return ''.join(dd)
+
+    parts = current_id.split('-')
+    swapped_id = '-'.join([
+        swap_bytestring(parts[0]),
+        swap_bytestring(parts[1]),
+        swap_bytestring(parts[2]),
+        parts[3],
+        parts[4]
+    ])
+
+    return previous_id == swapped_id
+
+
 @azure_ds_telemetry_reporter
 def get_boot_telemetry():
     """Report timestamps related to kernel initialization and systemd
diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
index 59e351de..a809fd87 100644
--- a/tests/unittests/test_datasource/test_azure.py
+++ b/tests/unittests/test_datasource/test_azure.py
@@ -477,7 +477,7 @@ scbus-1 on xpt0 bus 0
             'public-keys': [],
         })

-        self.instance_id = 'test-instance-id'
+        self.instance_id = 'D0DF4C54-4ECB-4A4B-9954-5BDF3ED5C3B8'

         def _dmi_mocks(key):
             if key == 'system-uuid':
@@ -645,7 +645,7 @@ scbus-1 on xpt0 bus 0
             'azure_data': {
                 'configurationsettype': 'LinuxProvisioningConfiguration'},
             'imds': NETWORK_METADATA,
-            'instance-id': 'test-instance-id',
+            'instance-id': 'D0DF4C54-4ECB-4A4B-9954-5BDF3ED5C3B8',
             'local-hostname': u'myhost',
             'random_seed': 'wild'}

@@ -1091,6 +1091,24 @@ scbus-1 on xpt0 bus 0
         self.assertTrue(ret)
         self.assertEqual('value', dsrc.metadata['test'])

+    def test_instance_id_endianness(self):
+        """Return the previous iid when dmi uuid is the byteswapped iid."""
+        ds = self._get_ds({'ovfcontent': construct_valid_ovf_env()})
+        # byte-swapped previous
+        write_file(
+            os.path.join(self.paths.cloud_dir, 'data', 'instance-id'),
+            '544CDFD0-CB4E-4B4A-9954-5BDF3ED5C3B8')
+        ds.get_data()
+        self.assertEqual(
+            '544CDFD0-CB4E-4B4A-9954-5BDF3ED5C3B8',
+            ds.metadata['instance-id'])
+        # not byte-swapped previous
+        write_file(
+            os.path.join(self.paths.cloud_dir, 'data', 'instance-id'),
+            '644CDFD0-CB4E-4B4A-9954-5BDF3ED5C3B8')
+        ds.get_data()
+        self.assertEqual(
+            'D0DF4C54-4ECB-4A4B-9954-5BDF3ED5C3B8',
+            ds.metadata['instance-id'])
+
     def test_instance_id_from_dmidecode_used(self):
         ds = self._get_ds({'ovfcontent': construct_valid_ovf_env()})
         ds.get_data()
@@ -1292,7 +1310,7 @@ class TestAzureBounce(CiTestCase):

         def _dmi_mocks(key):
             if key == 'system-uuid':
-                return 'test-instance-id'
+                return 'D0DF4C54-4ECB-4A4B-9954-5BDF3ED5C3B8'
             elif key == 'chassis-asset-tag':
                 return '7783-7084-3265-9085-8269-3286-77'
             raise RuntimeError('should not get here')
diff --git a/tests/unittests/test_datasource/test_azure_helper.py b/tests/unittests/test_datasource/test_azure_helper.py
index bd17f636..007df09f 100644
--- a/tests/unittests/test_datasource/test_azure_helper.py
+++ b/tests/unittests/test_datasource/test_azure_helper.py
@@ -170,6 +170,25 @@ class TestGoalStateParsing(CiTestCase):
         goal_state = self._get_goal_state(instance_id=instance_id)
         self.assertEqual(instance_id, goal_state.instance_id)

+    def test_instance_id_byte_swap(self):
+        """Return True when previous_iid is the byteswapped current_iid."""
+        previous_iid = "D0DF4C54-4ECB-4A4B-9954-5BDF3ED5C3B8"
+        current_iid = "544CDFD0-CB4E-4B4A-9954-5BDF3ED5C3B8"
+        self.assertTrue(
+            azure_helper.is_byte_swapped(previous_iid, current_iid))
+
+    def test_instance_id_no_byte_swap_same_instance_id(self):
+        previous_iid = "D0DF4C54-4ECB-4A4B-9954-5BDF3ED5C3B8"
+        current_iid = "D0DF4C54-4ECB-4A4B-9954-5BDF3ED5C3B8"
+        self.assertFalse(
+            azure_helper.is_byte_swapped(previous_iid, current_iid))
+
+    def test_instance_id_no_byte_swap_diff_instance_id(self):
+        previous_iid = "D0DF4C54-4ECB-4A4B-9954-5BDF3ED5C3B8"
+        current_iid = "G0DF4C54-4ECB-4A4B-9954-5BDF3ED5C3B8"
+        self.assertFalse(
+            azure_helper.is_byte_swapped(previous_iid, current_iid))
+
     def test_certificates_xml_parsed_and_fetched_correctly(self):
         http_client = mock.MagicMock()
         certificates_url = 'TestCertificatesUrl'
--
cgit v1.2.3