diff options
author | Christian Poessinger <christian@poessinger.com> | 2020-03-11 21:15:45 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-03-11 21:15:45 +0100 |
commit | fd87c24ff8f6f09a7e2cc223a4e8cdb8c08f1ef8 (patch) | |
tree | b754b3991e5e57a9ae9155819f73fa0cbd4be269 /cloudinit/sources/helpers/azure.py | |
parent | ca9a4eb26b41c204d1bd3a15586b14a5dde950bb (diff) | |
parent | c6627bc05a57645e6af8b9a5a67e452d9f37e487 (diff) | |
download | vyos-cloud-init-fd87c24ff8f6f09a7e2cc223a4e8cdb8c08f1ef8.tar.gz vyos-cloud-init-fd87c24ff8f6f09a7e2cc223a4e8cdb8c08f1ef8.zip |
Merge pull request #4 from zdc/T2117
Cloud-init: T2117: Updated to 20.1 version
Diffstat (limited to 'cloudinit/sources/helpers/azure.py')
-rwxr-xr-x[-rw-r--r--] | cloudinit/sources/helpers/azure.py | 345 |
1 files changed, 302 insertions, 43 deletions
diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index e5696b1f..fc760581 100644..100755 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -7,6 +7,7 @@ import re import socket import struct import time +import textwrap from cloudinit.net import dhcp from cloudinit import stages @@ -16,9 +17,162 @@ from xml.etree import ElementTree from cloudinit import url_helper from cloudinit import util +from cloudinit import version +from cloudinit import distros +from cloudinit.reporting import events +from cloudinit.net.dhcp import EphemeralDHCPv4 +from datetime import datetime LOG = logging.getLogger(__name__) +# This endpoint matches the format as found in dhcp lease files, since this +# value is applied if the endpoint can't be found within a lease file +DEFAULT_WIRESERVER_ENDPOINT = "a8:3f:81:10" + +BOOT_EVENT_TYPE = 'boot-telemetry' +SYSTEMINFO_EVENT_TYPE = 'system-info' +DIAGNOSTIC_EVENT_TYPE = 'diagnostic' + +azure_ds_reporter = events.ReportEventStack( + name="azure-ds", + description="initialize reporter for azure ds", + reporting_enabled=True) + + +def azure_ds_telemetry_reporter(func): + def impl(*args, **kwargs): + with events.ReportEventStack( + name=func.__name__, + description=func.__name__, + parent=azure_ds_reporter): + return func(*args, **kwargs) + return impl + + +def is_byte_swapped(previous_id, current_id): + """ + Azure stores the instance ID with an incorrect byte ordering for the + first parts. This corrects the byte order such that it is consistent with + that returned by the metadata service. + """ + if previous_id == current_id: + return False + + def swap_bytestring(s, width=2): + dd = [byte for byte in textwrap.wrap(s, 2)] + dd.reverse() + return ''.join(dd) + + parts = current_id.split('-') + swapped_id = '-'.join([ + swap_bytestring(parts[0]), + swap_bytestring(parts[1]), + swap_bytestring(parts[2]), + parts[3], + parts[4] + ]) + + return previous_id == swapped_id + + +@azure_ds_telemetry_reporter +def get_boot_telemetry(): + """Report timestamps related to kernel initialization and systemd + activation of cloud-init""" + if not distros.uses_systemd(): + raise RuntimeError( + "distro not using systemd, skipping boot telemetry") + + LOG.debug("Collecting boot telemetry") + try: + kernel_start = float(time.time()) - float(util.uptime()) + except ValueError: + raise RuntimeError("Failed to determine kernel start timestamp") + + try: + out, _ = util.subp(['/bin/systemctl', + 'show', '-p', + 'UserspaceTimestampMonotonic'], + capture=True) + tsm = None + if out and '=' in out: + tsm = out.split("=")[1] + + if not tsm: + raise RuntimeError("Failed to parse " + "UserspaceTimestampMonotonic from systemd") + + user_start = kernel_start + (float(tsm) / 1000000) + except util.ProcessExecutionError as e: + raise RuntimeError("Failed to get UserspaceTimestampMonotonic: %s" + % e) + except ValueError as e: + raise RuntimeError("Failed to parse " + "UserspaceTimestampMonotonic from systemd: %s" + % e) + + try: + out, _ = util.subp(['/bin/systemctl', 'show', + 'cloud-init-local', '-p', + 'InactiveExitTimestampMonotonic'], + capture=True) + tsm = None + if out and '=' in out: + tsm = out.split("=")[1] + if not tsm: + raise RuntimeError("Failed to parse " + "InactiveExitTimestampMonotonic from systemd") + + cloudinit_activation = kernel_start + (float(tsm) / 1000000) + except util.ProcessExecutionError as e: + raise RuntimeError("Failed to get InactiveExitTimestampMonotonic: %s" + % e) + except ValueError as e: + raise RuntimeError("Failed to parse " + "InactiveExitTimestampMonotonic from systemd: %s" + % e) + + evt = events.ReportingEvent( + BOOT_EVENT_TYPE, 'boot-telemetry', + "kernel_start=%s user_start=%s cloudinit_activation=%s" % + (datetime.utcfromtimestamp(kernel_start).isoformat() + 'Z', + datetime.utcfromtimestamp(user_start).isoformat() + 'Z', + datetime.utcfromtimestamp(cloudinit_activation).isoformat() + 'Z'), + events.DEFAULT_EVENT_ORIGIN) + events.report_event(evt) + + # return the event for unit testing purpose + return evt + + +@azure_ds_telemetry_reporter +def get_system_info(): + """Collect and report system information""" + info = util.system_info() + evt = events.ReportingEvent( + SYSTEMINFO_EVENT_TYPE, 'system information', + "cloudinit_version=%s, kernel_version=%s, variant=%s, " + "distro_name=%s, distro_version=%s, flavor=%s, " + "python_version=%s" % + (version.version_string(), info['release'], info['variant'], + info['dist'][0], info['dist'][1], info['dist'][2], + info['python']), events.DEFAULT_EVENT_ORIGIN) + events.report_event(evt) + + # return the event for unit testing purpose + return evt + + +def report_diagnostic_event(str): + """Report a diagnostic event""" + evt = events.ReportingEvent( + DIAGNOSTIC_EVENT_TYPE, 'diagnostic message', + str, events.DEFAULT_EVENT_ORIGIN) + events.report_event(evt) + + # return the event for unit testing purpose + return evt + @contextmanager def cd(newdir): @@ -56,14 +210,16 @@ class AzureEndpointHttpClient(object): if secure: headers = self.headers.copy() headers.update(self.extra_secure_headers) - return url_helper.read_file_or_url(url, headers=headers) + return url_helper.read_file_or_url(url, headers=headers, timeout=5, + retries=10) def post(self, url, data=None, extra_headers=None): headers = self.headers if extra_headers is not None: headers = self.headers.copy() headers.update(extra_headers) - return url_helper.read_file_or_url(url, data=data, headers=headers) + return url_helper.read_file_or_url(url, data=data, headers=headers, + timeout=5, retries=10) class GoalState(object): @@ -119,6 +275,7 @@ class OpenSSLManager(object): def clean_up(self): util.del_dir(self.tmpdir) + @azure_ds_telemetry_reporter def generate_certificate(self): LOG.debug('Generating certificate for communication with fabric...') if self.certificate is not None: @@ -138,9 +295,40 @@ class OpenSSLManager(object): self.certificate = certificate LOG.debug('New certificate generated.') - def parse_certificates(self, certificates_xml): - tag = ElementTree.fromstring(certificates_xml).find( - './/Data') + @staticmethod + @azure_ds_telemetry_reporter + def _run_x509_action(action, cert): + cmd = ['openssl', 'x509', '-noout', action] + result, _ = util.subp(cmd, data=cert) + return result + + @azure_ds_telemetry_reporter + def _get_ssh_key_from_cert(self, certificate): + pub_key = self._run_x509_action('-pubkey', certificate) + keygen_cmd = ['ssh-keygen', '-i', '-m', 'PKCS8', '-f', '/dev/stdin'] + ssh_key, _ = util.subp(keygen_cmd, data=pub_key) + return ssh_key + + @azure_ds_telemetry_reporter + def _get_fingerprint_from_cert(self, certificate): + """openssl x509 formats fingerprints as so: + 'SHA1 Fingerprint=07:3E:19:D1:4D:1C:79:92:24:C6:A0:FD:8D:DA:\ + B6:A8:BF:27:D4:73\n' + + Azure control plane passes that fingerprint as so: + '073E19D14D1C799224C6A0FD8DDAB6A8BF27D473' + """ + raw_fp = self._run_x509_action('-fingerprint', certificate) + eq = raw_fp.find('=') + octets = raw_fp[eq+1:-1].split(':') + return ''.join(octets) + + @azure_ds_telemetry_reporter + def _decrypt_certs_from_xml(self, certificates_xml): + """Decrypt the certificates XML document using the our private key; + return the list of certs and private keys contained in the doc. + """ + tag = ElementTree.fromstring(certificates_xml).find('.//Data') certificates_content = tag.text lines = [ b'MIME-Version: 1.0', @@ -151,32 +339,31 @@ class OpenSSLManager(object): certificates_content.encode('utf-8'), ] with cd(self.tmpdir): - with open('Certificates.p7m', 'wb') as f: - f.write(b'\n'.join(lines)) out, _ = util.subp( - 'openssl cms -decrypt -in Certificates.p7m -inkey' + 'openssl cms -decrypt -in /dev/stdin -inkey' ' {private_key} -recip {certificate} | openssl pkcs12 -nodes' ' -password pass:'.format(**self.certificate_names), - shell=True) - private_keys, certificates = [], [] + shell=True, data=b'\n'.join(lines)) + return out + + @azure_ds_telemetry_reporter + def parse_certificates(self, certificates_xml): + """Given the Certificates XML document, return a dictionary of + fingerprints and associated SSH keys derived from the certs.""" + out = self._decrypt_certs_from_xml(certificates_xml) current = [] + keys = {} for line in out.splitlines(): current.append(line) if re.match(r'[-]+END .*?KEY[-]+$', line): - private_keys.append('\n'.join(current)) + # ignore private_keys current = [] elif re.match(r'[-]+END .*?CERTIFICATE[-]+$', line): - certificates.append('\n'.join(current)) + certificate = '\n'.join(current) + ssh_key = self._get_ssh_key_from_cert(certificate) + fingerprint = self._get_fingerprint_from_cert(certificate) + keys[fingerprint] = ssh_key current = [] - keys = [] - for certificate in certificates: - with cd(self.tmpdir): - public_key, _ = util.subp( - 'openssl x509 -noout -pubkey |' - 'ssh-keygen -i -m PKCS8 -f /dev/stdin', - data=certificate, - shell=True) - keys.append(public_key) return keys @@ -206,7 +393,6 @@ class WALinuxAgentShim(object): self.dhcpoptions = dhcp_options self._endpoint = None self.openssl_manager = None - self.values = {} self.lease_file = fallback_lease_file def clean_up(self): @@ -241,14 +427,21 @@ class WALinuxAgentShim(object): return socket.inet_ntoa(packed_bytes) @staticmethod + @azure_ds_telemetry_reporter def _networkd_get_value_from_leases(leases_d=None): return dhcp.networkd_get_option_from_leases( 'OPTION_245', leases_d=leases_d) @staticmethod + @azure_ds_telemetry_reporter def _get_value_from_leases_file(fallback_lease_file): leases = [] - content = util.load_file(fallback_lease_file) + try: + content = util.load_file(fallback_lease_file) + except IOError as ex: + LOG.error("Failed to read %s: %s", fallback_lease_file, ex) + return None + LOG.debug("content is %s", content) option_name = _get_dhcp_endpoint_option_name() for line in content.splitlines(): @@ -263,6 +456,7 @@ class WALinuxAgentShim(object): return leases[-1] @staticmethod + @azure_ds_telemetry_reporter def _load_dhclient_json(): dhcp_options = {} hooks_dir = WALinuxAgentShim._get_hooks_dir() @@ -281,6 +475,7 @@ class WALinuxAgentShim(object): return dhcp_options @staticmethod + @azure_ds_telemetry_reporter def _get_value_from_dhcpoptions(dhcp_options): if dhcp_options is None: return None @@ -294,22 +489,26 @@ class WALinuxAgentShim(object): return _value @staticmethod + @azure_ds_telemetry_reporter def find_endpoint(fallback_lease_file=None, dhcp245=None): value = None if dhcp245 is not None: value = dhcp245 LOG.debug("Using Azure Endpoint from dhcp options") if value is None: + report_diagnostic_event("No Azure endpoint from dhcp options") LOG.debug('Finding Azure endpoint from networkd...') value = WALinuxAgentShim._networkd_get_value_from_leases() if value is None: # Option-245 stored in /run/cloud-init/dhclient.hooks/<ifc>.json # a dhclient exit hook that calls cloud-init-dhclient-hook + report_diagnostic_event("No Azure endpoint from networkd") LOG.debug('Finding Azure endpoint from hook json...') dhcp_options = WALinuxAgentShim._load_dhclient_json() value = WALinuxAgentShim._get_value_from_dhcpoptions(dhcp_options) if value is None: # Fallback and check the leases file if unsuccessful + report_diagnostic_event("No Azure endpoint from dhclient logs") LOG.debug("Unable to find endpoint in dhclient logs. " " Falling back to check lease files") if fallback_lease_file is None: @@ -320,16 +519,22 @@ class WALinuxAgentShim(object): fallback_lease_file) value = WALinuxAgentShim._get_value_from_leases_file( fallback_lease_file) - if value is None: - raise ValueError('No endpoint found.') + msg = "No lease found; using default endpoint" + report_diagnostic_event(msg) + LOG.warning(msg) + value = DEFAULT_WIRESERVER_ENDPOINT endpoint_ip_address = WALinuxAgentShim.get_ip_from_lease_value(value) - LOG.debug('Azure endpoint found at %s', endpoint_ip_address) + msg = 'Azure endpoint found at %s' % endpoint_ip_address + report_diagnostic_event(msg) + LOG.debug(msg) return endpoint_ip_address - def register_with_azure_and_fetch_data(self): - self.openssl_manager = OpenSSLManager() + @azure_ds_telemetry_reporter + def register_with_azure_and_fetch_data(self, pubkey_info=None): + if self.openssl_manager is None: + self.openssl_manager = OpenSSLManager() http_client = AzureEndpointHttpClient(self.openssl_manager.certificate) LOG.info('Registering with Azure...') attempts = 0 @@ -337,27 +542,52 @@ class WALinuxAgentShim(object): try: response = http_client.get( 'http://{0}/machine/?comp=goalstate'.format(self.endpoint)) - except Exception: + except Exception as e: if attempts < 10: time.sleep(attempts + 1) else: + report_diagnostic_event( + "failed to register with Azure: %s" % e) raise else: break attempts += 1 LOG.debug('Successfully fetched GoalState XML.') goal_state = GoalState(response.contents, http_client) - public_keys = [] - if goal_state.certificates_xml is not None: + report_diagnostic_event("container_id %s" % goal_state.container_id) + ssh_keys = [] + if goal_state.certificates_xml is not None and pubkey_info is not None: LOG.debug('Certificate XML found; parsing out public keys.') - public_keys = self.openssl_manager.parse_certificates( + keys_by_fingerprint = self.openssl_manager.parse_certificates( goal_state.certificates_xml) - data = { - 'public-keys': public_keys, - } + ssh_keys = self._filter_pubkeys(keys_by_fingerprint, pubkey_info) self._report_ready(goal_state, http_client) - return data + return {'public-keys': ssh_keys} + + def _filter_pubkeys(self, keys_by_fingerprint, pubkey_info): + """cloud-init expects a straightforward array of keys to be dropped + into the user's authorized_keys file. Azure control plane exposes + multiple public keys to the VM via wireserver. Select just the + user's key(s) and return them, ignoring any other certs. + """ + keys = [] + for pubkey in pubkey_info: + if 'value' in pubkey and pubkey['value']: + keys.append(pubkey['value']) + elif 'fingerprint' in pubkey and pubkey['fingerprint']: + fingerprint = pubkey['fingerprint'] + if fingerprint in keys_by_fingerprint: + keys.append(keys_by_fingerprint[fingerprint]) + else: + LOG.warning("ovf-env.xml specified PublicKey fingerprint " + "%s not found in goalstate XML", fingerprint) + else: + LOG.warning("ovf-env.xml specified PublicKey with neither " + "value nor fingerprint: %s", pubkey) + return keys + + @azure_ds_telemetry_reporter def _report_ready(self, goal_state, http_client): LOG.debug('Reporting ready to Azure fabric.') document = self.REPORT_READY_XML_TEMPLATE.format( @@ -365,20 +595,49 @@ class WALinuxAgentShim(object): container_id=goal_state.container_id, instance_id=goal_state.instance_id, ) - http_client.post( - "http://{0}/machine?comp=health".format(self.endpoint), - data=document, - extra_headers={'Content-Type': 'text/xml; charset=utf-8'}, - ) + # Host will collect kvps when cloud-init reports ready. + # some kvps might still be in the queue. We yield the scheduler + # to make sure we process all kvps up till this point. + time.sleep(0) + try: + http_client.post( + "http://{0}/machine?comp=health".format(self.endpoint), + data=document, + extra_headers={'Content-Type': 'text/xml; charset=utf-8'}, + ) + except Exception as e: + report_diagnostic_event("exception while reporting ready: %s" % e) + raise + LOG.info('Reported ready to Azure fabric.') -def get_metadata_from_fabric(fallback_lease_file=None, dhcp_opts=None): +@azure_ds_telemetry_reporter +def get_metadata_from_fabric(fallback_lease_file=None, dhcp_opts=None, + pubkey_info=None): shim = WALinuxAgentShim(fallback_lease_file=fallback_lease_file, dhcp_options=dhcp_opts) try: - return shim.register_with_azure_and_fetch_data() + return shim.register_with_azure_and_fetch_data(pubkey_info=pubkey_info) finally: shim.clean_up() + +class EphemeralDHCPv4WithReporting(object): + def __init__(self, reporter, nic=None): + self.reporter = reporter + self.ephemeralDHCPv4 = EphemeralDHCPv4(iface=nic) + + def __enter__(self): + with events.ReportEventStack( + name="obtain-dhcp-lease", + description="obtain dhcp lease", + parent=self.reporter): + return self.ephemeralDHCPv4.__enter__() + + def __exit__(self, excp_type, excp_value, excp_traceback): + self.ephemeralDHCPv4.__exit__( + excp_type, excp_value, excp_traceback) + + # vi: ts=4 expandtab |