summary | refs | log | tree | commit | diff
path: root/cloudinit/sources/helpers/azure.py
diff options
context:
space:
mode:
author: Johnson Shi <Johnson.Shi@microsoft.com> 2020-11-18 09:34:04 -0800
committer: GitHub <noreply@github.com> 2020-11-18 12:34:04 -0500
commit: d807df288f8cef29ca74f0b00c326b084e825782 (patch)
tree: 4102e9fda39c03da57cd875692015c41b73f5055 /cloudinit/sources/helpers/azure.py
parent: 96d21dfbee308cd8fe00809184f78da9231ece4a (diff)
download: vyos-cloud-init-d807df288f8cef29ca74f0b00c326b084e825782.tar.gz
vyos-cloud-init-d807df288f8cef29ca74f0b00c326b084e825782.zip
DataSourceAzure: send failure signal on Azure datasource failure (#594)
On systems where the Azure datasource is a viable platform for crawling metadata, cloud-init occasionally encounters fatal, irrecoverable errors while crawling the Azure datasource. When this happens, cloud-init crashes and Azure VM provisioning fails. However, instead of seeing the failure immediately, the user continues to see the VM as provisioning for a long time, until it times out with an "OS Provisioning Timed Out" message. In these situations, cloud-init should report failure to the Azure datasource endpoint to indicate that provisioning failed. The user then sees provisioning terminate immediately, which is a much better failure experience than pointlessly waiting for the OS provisioning timeout.
Diffstat (limited to 'cloudinit/sources/helpers/azure.py')
-rwxr-xr-x cloudinit/sources/helpers/azure.py 80
1 files changed, 74 insertions, 6 deletions
diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py
index 4071a50e..951c7a10 100755
--- a/cloudinit/sources/helpers/azure.py
+++ b/cloudinit/sources/helpers/azure.py
@@ -17,6 +17,7 @@ from cloudinit import stages
from cloudinit import temp_utils
from contextlib import contextmanager
from xml.etree import ElementTree
+from xml.sax.saxutils import escape
from cloudinit import subp
from cloudinit import url_helper
@@ -50,6 +51,11 @@ azure_ds_reporter = events.ReportEventStack(
description="initialize reporter for azure ds",
reporting_enabled=True)
+DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE = (
+ 'The VM encountered an error during deployment. '
+ 'Please visit https://aka.ms/linuxprovisioningerror '
+ 'for more information on remediation.')
+
def azure_ds_telemetry_reporter(func):
def impl(*args, **kwargs):
@@ -379,12 +385,20 @@ class OpenSSLManager:
def __init__(self):
self.tmpdir = temp_utils.mkdtemp()
- self.certificate = None
+ self._certificate = None
self.generate_certificate()
def clean_up(self):
util.del_dir(self.tmpdir)
+ @property
+ def certificate(self):
+ return self._certificate
+
+ @certificate.setter
+ def certificate(self, value):
+ self._certificate = value
+
@azure_ds_telemetry_reporter
def generate_certificate(self):
LOG.debug('Generating certificate for communication with fabric...')
@@ -507,6 +521,10 @@ class GoalStateHealthReporter:
''')
PROVISIONING_SUCCESS_STATUS = 'Ready'
+ PROVISIONING_NOT_READY_STATUS = 'NotReady'
+ PROVISIONING_FAILURE_SUBSTATUS = 'ProvisioningFailed'
+
+ HEALTH_REPORT_DESCRIPTION_TRIM_LEN = 512
def __init__(
self, goal_state: GoalState,
@@ -545,19 +563,39 @@ class GoalStateHealthReporter:
LOG.info('Reported ready to Azure fabric.')
+ @azure_ds_telemetry_reporter
+ def send_failure_signal(self, description: str) -> None:
+ document = self.build_report(
+ incarnation=self._goal_state.incarnation,
+ container_id=self._goal_state.container_id,
+ instance_id=self._goal_state.instance_id,
+ status=self.PROVISIONING_NOT_READY_STATUS,
+ substatus=self.PROVISIONING_FAILURE_SUBSTATUS,
+ description=description)
+ try:
+ self._post_health_report(document=document)
+ except Exception as e:
+ msg = "exception while reporting failure: %s" % e
+ report_diagnostic_event(msg, logger_func=LOG.error)
+ raise
+
+ LOG.warning('Reported failure to Azure fabric.')
+
def build_report(
self, incarnation: str, container_id: str, instance_id: str,
status: str, substatus=None, description=None) -> str:
health_detail = ''
if substatus is not None:
health_detail = self.HEALTH_DETAIL_SUBSECTION_XML_TEMPLATE.format(
- health_substatus=substatus, health_description=description)
+ health_substatus=escape(substatus),
+ health_description=escape(
+ description[:self.HEALTH_REPORT_DESCRIPTION_TRIM_LEN]))
health_report = self.HEALTH_REPORT_XML_TEMPLATE.format(
- incarnation=incarnation,
- container_id=container_id,
- instance_id=instance_id,
- health_status=status,
+ incarnation=escape(str(incarnation)),
+ container_id=escape(container_id),
+ instance_id=escape(instance_id),
+ health_status=escape(status),
health_detail_subsection=health_detail)
return health_report
@@ -798,12 +836,27 @@ class WALinuxAgentShim:
return {'public-keys': ssh_keys}
@azure_ds_telemetry_reporter
+ def register_with_azure_and_report_failure(self, description: str) -> None:
+ """Gets the VM's GoalState from Azure, uses the GoalState information
+ to report failure/send provisioning failure signal to Azure.
+
+ @param description: user visible error description of provisioning failure.
+ """
+ if self.azure_endpoint_client is None:
+ self.azure_endpoint_client = AzureEndpointHttpClient(None)
+ goal_state = self._fetch_goal_state_from_azure(need_certificate=False)
+ health_reporter = GoalStateHealthReporter(
+ goal_state, self.azure_endpoint_client, self.endpoint)
+ health_reporter.send_failure_signal(description=description)
+
+ @azure_ds_telemetry_reporter
def _fetch_goal_state_from_azure(
self,
need_certificate: bool) -> GoalState:
"""Fetches the GoalState XML from the Azure endpoint, parses the XML,
and returns a GoalState object.
+ @param need_certificate: switch indicating whether certificates are needed.
@return: GoalState object representing the GoalState XML
"""
unparsed_goal_state_xml = self._get_raw_goal_state_xml_from_azure()
@@ -844,6 +897,7 @@ class WALinuxAgentShim:
"""Parses a GoalState XML string and returns a GoalState object.
@param unparsed_goal_state_xml: GoalState XML string
+ @param need_certificate: switch indicating whether certificates are needed.
@return: GoalState object representing the GoalState XML
"""
try:
@@ -942,6 +996,20 @@ def get_metadata_from_fabric(fallback_lease_file=None, dhcp_opts=None,
shim.clean_up()
+@azure_ds_telemetry_reporter
+def report_failure_to_fabric(fallback_lease_file=None, dhcp_opts=None,
+ description=None):
+ shim = WALinuxAgentShim(fallback_lease_file=fallback_lease_file,
+ dhcp_options=dhcp_opts)
+ if not description:
+ description = DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE
+ try:
+ shim.register_with_azure_and_report_failure(
+ description=description)
+ finally:
+ shim.clean_up()
+
+
def dhcp_log_cb(out, err):
report_diagnostic_event(
"dhclient output stream: %s" % out, logger_func=LOG.debug)