diff options
author | Anh Vo <anhvo@microsoft.com> | 2019-08-14 21:03:13 +0000 |
---|---|---|
committer | Server Team CI Bot <josh.powers+server-team-bot@canonical.com> | 2019-08-14 21:03:13 +0000 |
commit | 2f3bb764626b9065f4102c7c0a67998a9c174444 (patch) | |
tree | 1cb2c86c2e17fc9373bdf3c875fa48e1cd8171f3 /cloudinit | |
parent | 0e79a1b89287358a77fe31fb82c4bcd83ff48894 (diff) | |
download | vyos-cloud-init-2f3bb764626b9065f4102c7c0a67998a9c174444.tar.gz vyos-cloud-init-2f3bb764626b9065f4102c7c0a67998a9c174444.zip |
Azure: Record boot timestamps, system information, and diagnostic events
Collect and record the following information through KVP:
+ timestamps related to kernel initialization and systemd activation
of cloud-init services
+ system information including cloud-init version, kernel version,
distro version, and python version
+ diagnostic events for the most common provisioning error issues
such as empty dhcp lease, corrupted ovf-env.xml, etc.
+ increasing the log frequency of polling IMDS during reprovision.
Diffstat (limited to 'cloudinit')
-rwxr-xr-x | cloudinit/sources/DataSourceAzure.py | 157 | ||||
-rwxr-xr-x | cloudinit/sources/helpers/azure.py | 160 |
2 files changed, 278 insertions, 39 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index e6ed2f3b..4984fa84 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -26,9 +26,14 @@ from cloudinit.url_helper import UrlError, readurl, retry_on_url_exc from cloudinit import util from cloudinit.reporting import events -from cloudinit.sources.helpers.azure import (azure_ds_reporter, - azure_ds_telemetry_reporter, - get_metadata_from_fabric) +from cloudinit.sources.helpers.azure import ( + azure_ds_reporter, + azure_ds_telemetry_reporter, + get_metadata_from_fabric, + get_boot_telemetry, + get_system_info, + report_diagnostic_event, + EphemeralDHCPv4WithReporting) LOG = logging.getLogger(__name__) @@ -354,7 +359,7 @@ class DataSourceAzure(sources.DataSource): bname = str(pk['fingerprint'] + ".crt") fp_files += [os.path.join(ddir, bname)] LOG.debug("ssh authentication: " - "using fingerprint from fabirc") + "using fingerprint from fabric") with events.ReportEventStack( name="waiting-for-ssh-public-key", @@ -419,12 +424,17 @@ class DataSourceAzure(sources.DataSource): ret = load_azure_ds_dir(cdev) except NonAzureDataSource: + report_diagnostic_event( + "Did not find Azure data source in %s" % cdev) continue except BrokenAzureDataSource as exc: msg = 'BrokenAzureDataSource: %s' % exc + report_diagnostic_event(msg) raise sources.InvalidMetaDataException(msg) except util.MountFailedError: - LOG.warning("%s was not mountable", cdev) + msg = '%s was not mountable' % cdev + report_diagnostic_event(msg) + LOG.warning(msg) continue perform_reprovision = reprovision or self._should_reprovision(ret) @@ -432,6 +442,7 @@ class DataSourceAzure(sources.DataSource): if util.is_FreeBSD(): msg = "Free BSD is not supported for PPS VMs" LOG.error(msg) + report_diagnostic_event(msg) raise sources.InvalidMetaDataException(msg) ret = self._reprovision() imds_md = get_metadata_from_imds( @@ -450,7 +461,9 @@ class DataSourceAzure(sources.DataSource): break if not found: - raise sources.InvalidMetaDataException('No Azure metadata found') + msg = 'No Azure metadata found' + report_diagnostic_event(msg) + raise sources.InvalidMetaDataException(msg) if found == ddir: LOG.debug("using files cached in %s", ddir) @@ -469,9 +482,14 @@ class DataSourceAzure(sources.DataSource): self._report_ready(lease=self._ephemeral_dhcp_ctx.lease) self._ephemeral_dhcp_ctx.clean_network() # Teardown ephemeral else: - with EphemeralDHCPv4() as lease: - self._report_ready(lease=lease) - + try: + with EphemeralDHCPv4WithReporting( + azure_ds_reporter) as lease: + self._report_ready(lease=lease) + except Exception as e: + report_diagnostic_event( + "exception while reporting ready: %s" % e) + raise return crawled_data def _is_platform_viable(self): @@ -493,6 +511,16 @@ class DataSourceAzure(sources.DataSource): if not self._is_platform_viable(): return False try: + get_boot_telemetry() + except Exception as e: + LOG.warning("Failed to get boot telemetry: %s", e) + + try: + get_system_info() + except Exception as e: + LOG.warning("Failed to get system information: %s", e) + + try: crawled_data = util.log_time( logfunc=LOG.debug, msg='Crawl of metadata service', func=self.crawl_metadata) @@ -551,27 +579,55 @@ class DataSourceAzure(sources.DataSource): headers = {"Metadata": "true"} nl_sock = None report_ready = bool(not os.path.isfile(REPORTED_READY_MARKER_FILE)) + self.imds_logging_threshold = 1 + self.imds_poll_counter = 1 + dhcp_attempts = 0 + vnet_switched = False + return_val = None def exc_cb(msg, exception): if isinstance(exception, UrlError) and exception.code == 404: + if self.imds_poll_counter == self.imds_logging_threshold: + # Reducing the logging frequency as we are polling IMDS + self.imds_logging_threshold *= 2 + LOG.debug("Call to IMDS with arguments %s failed " + "with status code %s after %s retries", + msg, exception.code, self.imds_poll_counter) + LOG.debug("Backing off logging threshold for the same " + "exception to %d", self.imds_logging_threshold) + self.imds_poll_counter += 1 return True + # If we get an exception while trying to call IMDS, we # call DHCP and setup the ephemeral network to acquire the new IP. + LOG.debug("Call to IMDS with arguments %s failed with " + "status code %s", msg, exception.code) + report_diagnostic_event("polling IMDS failed with exception %s" + % exception.code) return False LOG.debug("Wait for vnetswitch to happen") while True: try: - # Save our EphemeralDHCPv4 context so we avoid repeated dhcp - self._ephemeral_dhcp_ctx = EphemeralDHCPv4() - lease = self._ephemeral_dhcp_ctx.obtain_lease() + # Save our EphemeralDHCPv4 context to avoid repeated dhcp + with events.ReportEventStack( + name="obtain-dhcp-lease", + description="obtain dhcp lease", + parent=azure_ds_reporter): + self._ephemeral_dhcp_ctx = EphemeralDHCPv4() + lease = self._ephemeral_dhcp_ctx.obtain_lease() + + if vnet_switched: + dhcp_attempts += 1 if report_ready: try: nl_sock = netlink.create_bound_netlink_socket() except netlink.NetlinkCreateSocketError as e: + report_diagnostic_event(e) LOG.warning(e) self._ephemeral_dhcp_ctx.clean_network() - return + break + path = REPORTED_READY_MARKER_FILE LOG.info( "Creating a marker file to report ready: %s", path) @@ -579,17 +635,33 @@ class DataSourceAzure(sources.DataSource): pid=os.getpid(), time=time())) self._report_ready(lease=lease) report_ready = False - try: - netlink.wait_for_media_disconnect_connect( - nl_sock, lease['interface']) - except AssertionError as error: - LOG.error(error) - return + + with events.ReportEventStack( + name="wait-for-media-disconnect-connect", + description="wait for vnet switch", + parent=azure_ds_reporter): + try: + netlink.wait_for_media_disconnect_connect( + nl_sock, lease['interface']) + except AssertionError as error: + report_diagnostic_event(error) + LOG.error(error) + break + + vnet_switched = True self._ephemeral_dhcp_ctx.clean_network() else: - return readurl(url, timeout=IMDS_TIMEOUT_IN_SECONDS, - headers=headers, exception_cb=exc_cb, - infinite=True, log_req_resp=False).contents + with events.ReportEventStack( + name="get-reprovision-data-from-imds", + description="get reprovision data from imds", + parent=azure_ds_reporter): + return_val = readurl(url, + timeout=IMDS_TIMEOUT_IN_SECONDS, + headers=headers, + exception_cb=exc_cb, + infinite=True, + log_req_resp=False).contents + break except UrlError: # Teardown our EphemeralDHCPv4 context on failure as we retry self._ephemeral_dhcp_ctx.clean_network() @@ -598,6 +670,14 @@ class DataSourceAzure(sources.DataSource): if nl_sock: nl_sock.close() + if vnet_switched: + report_diagnostic_event("attempted dhcp %d times after reuse" % + dhcp_attempts) + report_diagnostic_event("polled imds %d times after reuse" % + self.imds_poll_counter) + + return return_val + @azure_ds_telemetry_reporter def _report_ready(self, lease): """Tells the fabric provisioning has completed """ @@ -666,9 +746,12 @@ class DataSourceAzure(sources.DataSource): self.ds_cfg['agent_command']) try: fabric_data = metadata_func() - except Exception: + except Exception as e: + report_diagnostic_event( + "Error communicating with Azure fabric; You may experience " + "connectivity issues: %s" % e) LOG.warning( - "Error communicating with Azure fabric; You may experience." + "Error communicating with Azure fabric; You may experience " "connectivity issues.", exc_info=True) return False @@ -1027,7 +1110,9 @@ def read_azure_ovf(contents): try: dom = minidom.parseString(contents) except Exception as e: - raise BrokenAzureDataSource("Invalid ovf-env.xml: %s" % e) + error_str = "Invalid ovf-env.xml: %s" % e + report_diagnostic_event(error_str) + raise BrokenAzureDataSource(error_str) results = find_child(dom.documentElement, lambda n: n.localName == "ProvisioningSection") @@ -1299,8 +1384,13 @@ def get_metadata_from_imds(fallback_nic, retries): if net.is_up(fallback_nic): return util.log_time(**kwargs) else: - with EphemeralDHCPv4(fallback_nic): - return util.log_time(**kwargs) + try: + with EphemeralDHCPv4WithReporting( + azure_ds_reporter, fallback_nic): + return util.log_time(**kwargs) + except Exception as e: + report_diagnostic_event("exception while getting metadata: %s" % e) + raise @azure_ds_telemetry_reporter @@ -1313,11 +1403,14 @@ def _get_metadata_from_imds(retries): url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers, retries=retries, exception_cb=retry_on_url_exc) except Exception as e: - LOG.debug('Ignoring IMDS instance metadata: %s', e) + msg = 'Ignoring IMDS instance metadata: %s' % e + report_diagnostic_event(msg) + LOG.debug(msg) return {} try: return util.load_json(str(response)) - except json.decoder.JSONDecodeError: + except json.decoder.JSONDecodeError as e: + report_diagnostic_event('non-json imds response' % e) LOG.warning( 'Ignoring non-json IMDS instance metadata: %s', str(response)) return {} @@ -1370,8 +1463,10 @@ def _is_platform_viable(seed_dir): asset_tag = util.read_dmi_data('chassis-asset-tag') if asset_tag == AZURE_CHASSIS_ASSET_TAG: return True - LOG.debug("Non-Azure DMI asset tag '%s' discovered.", asset_tag) - evt.description = "Non-Azure DMI asset tag '%s' discovered.", asset_tag + msg = "Non-Azure DMI asset tag '%s' discovered." % asset_tag + LOG.debug(msg) + evt.description = msg + report_diagnostic_event(msg) if os.path.exists(os.path.join(seed_dir, 'ovf-env.xml')): return True return False diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index 82c4c8c4..f1fba175 100755 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -16,7 +16,11 @@ from xml.etree import ElementTree from cloudinit import url_helper from cloudinit import util +from cloudinit import version +from cloudinit import distros from cloudinit.reporting import events +from cloudinit.net.dhcp import EphemeralDHCPv4 +from datetime import datetime LOG = logging.getLogger(__name__) @@ -24,6 +28,10 @@ LOG = logging.getLogger(__name__) # value is applied if the endpoint can't be found within a lease file DEFAULT_WIRESERVER_ENDPOINT = "a8:3f:81:10" +BOOT_EVENT_TYPE = 'boot-telemetry' +SYSTEMINFO_EVENT_TYPE = 'system-info' +DIAGNOSTIC_EVENT_TYPE = 'diagnostic' + azure_ds_reporter = events.ReportEventStack( name="azure-ds", description="initialize reporter for azure ds", @@ -40,6 +48,105 @@ def azure_ds_telemetry_reporter(func): return impl +@azure_ds_telemetry_reporter +def get_boot_telemetry(): + """Report timestamps related to kernel initialization and systemd + activation of cloud-init""" + if not distros.uses_systemd(): + raise RuntimeError( + "distro not using systemd, skipping boot telemetry") + + LOG.debug("Collecting boot telemetry") + try: + kernel_start = float(time.time()) - float(util.uptime()) + except ValueError: + raise RuntimeError("Failed to determine kernel start timestamp") + + try: + out, _ = util.subp(['/bin/systemctl', + 'show', '-p', + 'UserspaceTimestampMonotonic'], + capture=True) + tsm = None + if out and '=' in out: + tsm = out.split("=")[1] + + if not tsm: + raise RuntimeError("Failed to parse " + "UserspaceTimestampMonotonic from systemd") + + user_start = kernel_start + (float(tsm) / 1000000) + except util.ProcessExecutionError as e: + raise RuntimeError("Failed to get UserspaceTimestampMonotonic: %s" + % e) + except ValueError as e: + raise RuntimeError("Failed to parse " + "UserspaceTimestampMonotonic from systemd: %s" + % e) + + try: + out, _ = util.subp(['/bin/systemctl', 'show', + 'cloud-init-local', '-p', + 'InactiveExitTimestampMonotonic'], + capture=True) + tsm = None + if out and '=' in out: + tsm = out.split("=")[1] + if not tsm: + raise RuntimeError("Failed to parse " + "InactiveExitTimestampMonotonic from systemd") + + cloudinit_activation = kernel_start + (float(tsm) / 1000000) + except util.ProcessExecutionError as e: + raise RuntimeError("Failed to get InactiveExitTimestampMonotonic: %s" + % e) + except ValueError as e: + raise RuntimeError("Failed to parse " + "InactiveExitTimestampMonotonic from systemd: %s" + % e) + + evt = events.ReportingEvent( + BOOT_EVENT_TYPE, 'boot-telemetry', + "kernel_start=%s user_start=%s cloudinit_activation=%s" % + (datetime.utcfromtimestamp(kernel_start).isoformat() + 'Z', + datetime.utcfromtimestamp(user_start).isoformat() + 'Z', + datetime.utcfromtimestamp(cloudinit_activation).isoformat() + 'Z'), + events.DEFAULT_EVENT_ORIGIN) + events.report_event(evt) + + # return the event for unit testing purpose + return evt + + +@azure_ds_telemetry_reporter +def get_system_info(): + """Collect and report system information""" + info = util.system_info() + evt = events.ReportingEvent( + SYSTEMINFO_EVENT_TYPE, 'system information', + "cloudinit_version=%s, kernel_version=%s, variant=%s, " + "distro_name=%s, distro_version=%s, flavor=%s, " + "python_version=%s" % + (version.version_string(), info['release'], info['variant'], + info['dist'][0], info['dist'][1], info['dist'][2], + info['python']), events.DEFAULT_EVENT_ORIGIN) + events.report_event(evt) + + # return the event for unit testing purpose + return evt + + +def report_diagnostic_event(str): + """Report a diagnostic event""" + evt = events.ReportingEvent( + DIAGNOSTIC_EVENT_TYPE, 'diagnostic message', + str, events.DEFAULT_EVENT_ORIGIN) + events.report_event(evt) + + # return the event for unit testing purpose + return evt + + @contextmanager def cd(newdir): prevdir = os.getcwd() @@ -360,16 +467,19 @@ class WALinuxAgentShim(object): value = dhcp245 LOG.debug("Using Azure Endpoint from dhcp options") if value is None: + report_diagnostic_event("No Azure endpoint from dhcp options") LOG.debug('Finding Azure endpoint from networkd...') value = WALinuxAgentShim._networkd_get_value_from_leases() if value is None: # Option-245 stored in /run/cloud-init/dhclient.hooks/<ifc>.json # a dhclient exit hook that calls cloud-init-dhclient-hook + report_diagnostic_event("No Azure endpoint from networkd") LOG.debug('Finding Azure endpoint from hook json...') dhcp_options = WALinuxAgentShim._load_dhclient_json() value = WALinuxAgentShim._get_value_from_dhcpoptions(dhcp_options) if value is None: # Fallback and check the leases file if unsuccessful + report_diagnostic_event("No Azure endpoint from dhclient logs") LOG.debug("Unable to find endpoint in dhclient logs. " " Falling back to check lease files") if fallback_lease_file is None: @@ -381,11 +491,15 @@ class WALinuxAgentShim(object): value = WALinuxAgentShim._get_value_from_leases_file( fallback_lease_file) if value is None: - LOG.warning("No lease found; using default endpoint") + msg = "No lease found; using default endpoint" + report_diagnostic_event(msg) + LOG.warning(msg) value = DEFAULT_WIRESERVER_ENDPOINT endpoint_ip_address = WALinuxAgentShim.get_ip_from_lease_value(value) - LOG.debug('Azure endpoint found at %s', endpoint_ip_address) + msg = 'Azure endpoint found at %s' % endpoint_ip_address + report_diagnostic_event(msg) + LOG.debug(msg) return endpoint_ip_address @azure_ds_telemetry_reporter @@ -399,16 +513,19 @@ class WALinuxAgentShim(object): try: response = http_client.get( 'http://{0}/machine/?comp=goalstate'.format(self.endpoint)) - except Exception: + except Exception as e: if attempts < 10: time.sleep(attempts + 1) else: + report_diagnostic_event( + "failed to register with Azure: %s" % e) raise else: break attempts += 1 LOG.debug('Successfully fetched GoalState XML.') goal_state = GoalState(response.contents, http_client) + report_diagnostic_event("container_id %s" % goal_state.container_id) ssh_keys = [] if goal_state.certificates_xml is not None and pubkey_info is not None: LOG.debug('Certificate XML found; parsing out public keys.') @@ -449,11 +566,20 @@ class WALinuxAgentShim(object): container_id=goal_state.container_id, instance_id=goal_state.instance_id, ) - http_client.post( - "http://{0}/machine?comp=health".format(self.endpoint), - data=document, - extra_headers={'Content-Type': 'text/xml; charset=utf-8'}, - ) + # Host will collect kvps when cloud-init reports ready. + # some kvps might still be in the queue. We yield the scheduler + # to make sure we process all kvps up till this point. + time.sleep(0) + try: + http_client.post( + "http://{0}/machine?comp=health".format(self.endpoint), + data=document, + extra_headers={'Content-Type': 'text/xml; charset=utf-8'}, + ) + except Exception as e: + report_diagnostic_event("exception while reporting ready: %s" % e) + raise + LOG.info('Reported ready to Azure fabric.') @@ -467,4 +593,22 @@ def get_metadata_from_fabric(fallback_lease_file=None, dhcp_opts=None, finally: shim.clean_up() + +class EphemeralDHCPv4WithReporting(object): + def __init__(self, reporter, nic=None): + self.reporter = reporter + self.ephemeralDHCPv4 = EphemeralDHCPv4(iface=nic) + + def __enter__(self): + with events.ReportEventStack( + name="obtain-dhcp-lease", + description="obtain dhcp lease", + parent=self.reporter): + return self.ephemeralDHCPv4.__enter__() + + def __exit__(self, excp_type, excp_value, excp_traceback): + self.ephemeralDHCPv4.__exit__( + excp_type, excp_value, excp_traceback) + + # vi: ts=4 expandtab |