author      Johnson Shi <Johnson.Shi@microsoft.com>    2020-11-18 09:34:04 -0800
committer   GitHub <noreply@github.com>                2020-11-18 12:34:04 -0500
commit      d807df288f8cef29ca74f0b00c326b084e825782 (patch)
tree        4102e9fda39c03da57cd875692015c41b73f5055 /cloudinit
parent      96d21dfbee308cd8fe00809184f78da9231ece4a (diff)
download    vyos-cloud-init-d807df288f8cef29ca74f0b00c326b084e825782.tar.gz
            vyos-cloud-init-d807df288f8cef29ca74f0b00c326b084e825782.zip
DataSourceAzure: send failure signal on Azure datasource failure (#594)
On systems where the Azure datasource is a viable platform for crawling metadata, cloud-init occasionally encounters fatal, irrecoverable errors while crawling the Azure datasource. When this happens, cloud-init crashes and Azure VM provisioning fails. Instead of failing immediately, however, the user watches provisioning continue for a long time until it times out with an "OS Provisioning Timed Out" message.

In these situations, cloud-init should report failure to the Azure datasource endpoint to indicate that provisioning has failed. The user then sees provisioning terminate immediately, a much better failure experience than pointlessly waiting for the OS provisioning timeout.
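The patch implements this as a best-effort fallback chain in _report_failure: first the cached ephemeral DHCP context, then a freshly acquired ephemeral DHCP lease, then the fallback dhclient lease file, stopping at the first transport that successfully delivers the failure signal to the fabric. The following is a minimal, self-contained sketch of that control flow only; the transport callables are hypothetical placeholders, not the real cloud-init helpers.

    # Simplified sketch of the fallback chain introduced by this patch.
    # Each "transport" stands in for one way of reaching the Azure fabric
    # (cached ephemeral DHCP context, new ephemeral DHCP lease, fallback
    # lease file); the callables below are hypothetical placeholders.

    def report_failure(description, transports, log=print):
        """Try each transport in order; return True on the first success."""
        for name, send in transports:
            try:
                log('Using %s to report failure to Azure' % name)
                send(description)
                return True
            except Exception as error:
                log('Failed to report failure using %s: %s' % (name, error))
        return False


    def _cached_dhcp_transport(description):
        raise OSError('no cached ephemeral DHCP lease available')


    def _new_dhcp_transport(description):
        pass  # pretend the POST of the NotReady health report succeeded


    if __name__ == '__main__':
        ok = report_failure(
            'The VM encountered an error during deployment.',
            transports=[
                ('cached ephemeral dhcp context', _cached_dhcp_transport),
                ('new ephemeral dhcp', _new_dhcp_transport),
            ],
        )
        print('failure reported:', ok)

As in the real change, a transport failure is only logged and the next transport is tried; the caller returns False (and provisioning still fails) only when every transport has been exhausted.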
Diffstat (limited to 'cloudinit')
-rwxr-xr-x  cloudinit/sources/DataSourceAzure.py   73
-rwxr-xr-x  cloudinit/sources/helpers/azure.py     80
2 files changed, 142 insertions(+), 11 deletions(-)
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index fa3e0a2b..ab139b8d 100755
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -29,6 +29,7 @@ from cloudinit import util
from cloudinit.reporting import events
from cloudinit.sources.helpers.azure import (
+ DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE,
azure_ds_reporter,
azure_ds_telemetry_reporter,
get_metadata_from_fabric,
@@ -38,7 +39,8 @@ from cloudinit.sources.helpers.azure import (
EphemeralDHCPv4WithReporting,
is_byte_swapped,
dhcp_log_cb,
- push_log_to_kvp)
+ push_log_to_kvp,
+ report_failure_to_fabric)
LOG = logging.getLogger(__name__)
@@ -508,8 +510,9 @@ class DataSourceAzure(sources.DataSource):
if perform_reprovision:
LOG.info("Reporting ready to Azure after getting ReprovisionData")
- use_cached_ephemeral = (net.is_up(self.fallback_interface) and
- getattr(self, '_ephemeral_dhcp_ctx', None))
+ use_cached_ephemeral = (
+ self.distro.networking.is_up(self.fallback_interface) and
+ getattr(self, '_ephemeral_dhcp_ctx', None))
if use_cached_ephemeral:
self._report_ready(lease=self._ephemeral_dhcp_ctx.lease)
self._ephemeral_dhcp_ctx.clean_network() # Teardown ephemeral
@@ -560,9 +563,14 @@ class DataSourceAzure(sources.DataSource):
logfunc=LOG.debug, msg='Crawl of metadata service',
func=self.crawl_metadata
)
- except sources.InvalidMetaDataException as e:
- LOG.warning('Could not crawl Azure metadata: %s', e)
+ except Exception as e:
+ report_diagnostic_event(
+ 'Could not crawl Azure metadata: %s' % e,
+ logger_func=LOG.error)
+ self._report_failure(
+ description=DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE)
return False
+
if (self.distro and self.distro.name == 'ubuntu' and
self.ds_cfg.get('apply_network_config')):
maybe_remove_ubuntu_network_config_scripts()
@@ -785,6 +793,61 @@ class DataSourceAzure(sources.DataSource):
return return_val
@azure_ds_telemetry_reporter
+ def _report_failure(self, description=None) -> bool:
+ """Tells the Azure fabric that provisioning has failed.
+
+ @param description: A description of the error encountered.
+ @return: The success status of sending the failure signal.
+ """
+ unknown_245_key = 'unknown-245'
+
+ try:
+ if (self.distro.networking.is_up(self.fallback_interface) and
+ getattr(self, '_ephemeral_dhcp_ctx', None) and
+ getattr(self._ephemeral_dhcp_ctx, 'lease', None) and
+ unknown_245_key in self._ephemeral_dhcp_ctx.lease):
+ report_diagnostic_event(
+ 'Using cached ephemeral dhcp context '
+ 'to report failure to Azure', logger_func=LOG.debug)
+ report_failure_to_fabric(
+ dhcp_opts=self._ephemeral_dhcp_ctx.lease[unknown_245_key],
+ description=description)
+ self._ephemeral_dhcp_ctx.clean_network() # Teardown ephemeral
+ return True
+ except Exception as e:
+ report_diagnostic_event(
+ 'Failed to report failure using '
+ 'cached ephemeral dhcp context: %s' % e,
+ logger_func=LOG.error)
+
+ try:
+ report_diagnostic_event(
+ 'Using new ephemeral dhcp to report failure to Azure',
+ logger_func=LOG.debug)
+ with EphemeralDHCPv4WithReporting(azure_ds_reporter) as lease:
+ report_failure_to_fabric(
+ dhcp_opts=lease[unknown_245_key],
+ description=description)
+ return True
+ except Exception as e:
+ report_diagnostic_event(
+ 'Failed to report failure using new ephemeral dhcp: %s' % e,
+ logger_func=LOG.debug)
+
+ try:
+ report_diagnostic_event(
+ 'Using fallback lease to report failure to Azure')
+ report_failure_to_fabric(
+ fallback_lease_file=self.dhclient_lease_file,
+ description=description)
+ return True
+ except Exception as e:
+ report_diagnostic_event(
+ 'Failed to report failure using fallback lease: %s' % e,
+ logger_func=LOG.debug)
+
+ return False
+
def _report_ready(self, lease: dict) -> bool:
"""Tells the fabric provisioning has completed.
diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py
index 4071a50e..951c7a10 100755
--- a/cloudinit/sources/helpers/azure.py
+++ b/cloudinit/sources/helpers/azure.py
@@ -17,6 +17,7 @@ from cloudinit import stages
from cloudinit import temp_utils
from contextlib import contextmanager
from xml.etree import ElementTree
+from xml.sax.saxutils import escape
from cloudinit import subp
from cloudinit import url_helper
@@ -50,6 +51,11 @@ azure_ds_reporter = events.ReportEventStack(
description="initialize reporter for azure ds",
reporting_enabled=True)
+DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE = (
+ 'The VM encountered an error during deployment. '
+ 'Please visit https://aka.ms/linuxprovisioningerror '
+ 'for more information on remediation.')
+
def azure_ds_telemetry_reporter(func):
def impl(*args, **kwargs):
@@ -379,12 +385,20 @@ class OpenSSLManager:
def __init__(self):
self.tmpdir = temp_utils.mkdtemp()
- self.certificate = None
+ self._certificate = None
self.generate_certificate()
def clean_up(self):
util.del_dir(self.tmpdir)
+ @property
+ def certificate(self):
+ return self._certificate
+
+ @certificate.setter
+ def certificate(self, value):
+ self._certificate = value
+
@azure_ds_telemetry_reporter
def generate_certificate(self):
LOG.debug('Generating certificate for communication with fabric...')
@@ -507,6 +521,10 @@ class GoalStateHealthReporter:
''')
PROVISIONING_SUCCESS_STATUS = 'Ready'
+ PROVISIONING_NOT_READY_STATUS = 'NotReady'
+ PROVISIONING_FAILURE_SUBSTATUS = 'ProvisioningFailed'
+
+ HEALTH_REPORT_DESCRIPTION_TRIM_LEN = 512
def __init__(
self, goal_state: GoalState,
@@ -545,19 +563,39 @@ class GoalStateHealthReporter:
LOG.info('Reported ready to Azure fabric.')
+ @azure_ds_telemetry_reporter
+ def send_failure_signal(self, description: str) -> None:
+ document = self.build_report(
+ incarnation=self._goal_state.incarnation,
+ container_id=self._goal_state.container_id,
+ instance_id=self._goal_state.instance_id,
+ status=self.PROVISIONING_NOT_READY_STATUS,
+ substatus=self.PROVISIONING_FAILURE_SUBSTATUS,
+ description=description)
+ try:
+ self._post_health_report(document=document)
+ except Exception as e:
+ msg = "exception while reporting failure: %s" % e
+ report_diagnostic_event(msg, logger_func=LOG.error)
+ raise
+
+ LOG.warning('Reported failure to Azure fabric.')
+
def build_report(
self, incarnation: str, container_id: str, instance_id: str,
status: str, substatus=None, description=None) -> str:
health_detail = ''
if substatus is not None:
health_detail = self.HEALTH_DETAIL_SUBSECTION_XML_TEMPLATE.format(
- health_substatus=substatus, health_description=description)
+ health_substatus=escape(substatus),
+ health_description=escape(
+ description[:self.HEALTH_REPORT_DESCRIPTION_TRIM_LEN]))
health_report = self.HEALTH_REPORT_XML_TEMPLATE.format(
- incarnation=incarnation,
- container_id=container_id,
- instance_id=instance_id,
- health_status=status,
+ incarnation=escape(str(incarnation)),
+ container_id=escape(container_id),
+ instance_id=escape(instance_id),
+ health_status=escape(status),
health_detail_subsection=health_detail)
return health_report
@@ -798,12 +836,27 @@ class WALinuxAgentShim:
return {'public-keys': ssh_keys}
@azure_ds_telemetry_reporter
+ def register_with_azure_and_report_failure(self, description: str) -> None:
+ """Gets the VM's GoalState from Azure, uses the GoalState information
+ to report failure/send provisioning failure signal to Azure.
+
+ @param description: user-visible error description of the provisioning failure.
+ """
+ if self.azure_endpoint_client is None:
+ self.azure_endpoint_client = AzureEndpointHttpClient(None)
+ goal_state = self._fetch_goal_state_from_azure(need_certificate=False)
+ health_reporter = GoalStateHealthReporter(
+ goal_state, self.azure_endpoint_client, self.endpoint)
+ health_reporter.send_failure_signal(description=description)
+
+ @azure_ds_telemetry_reporter
def _fetch_goal_state_from_azure(
self,
need_certificate: bool) -> GoalState:
"""Fetches the GoalState XML from the Azure endpoint, parses the XML,
and returns a GoalState object.
+ @param need_certificate: whether certificates are needed.
@return: GoalState object representing the GoalState XML
"""
unparsed_goal_state_xml = self._get_raw_goal_state_xml_from_azure()
@@ -844,6 +897,7 @@ class WALinuxAgentShim:
"""Parses a GoalState XML string and returns a GoalState object.
@param unparsed_goal_state_xml: GoalState XML string
+ @param need_certificate: whether certificates are needed.
@return: GoalState object representing the GoalState XML
"""
try:
@@ -942,6 +996,20 @@ def get_metadata_from_fabric(fallback_lease_file=None, dhcp_opts=None,
shim.clean_up()
+@azure_ds_telemetry_reporter
+def report_failure_to_fabric(fallback_lease_file=None, dhcp_opts=None,
+ description=None):
+ shim = WALinuxAgentShim(fallback_lease_file=fallback_lease_file,
+ dhcp_options=dhcp_opts)
+ if not description:
+ description = DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE
+ try:
+ shim.register_with_azure_and_report_failure(
+ description=description)
+ finally:
+ shim.clean_up()
+
+
def dhcp_log_cb(out, err):
report_diagnostic_event(
"dhclient output stream: %s" % out, logger_func=LOG.debug)