summary refs log tree commit diff
path: root/cloudinit/sources/DataSourceAzure.py
diff options
context:
space:
mode:
author Johnson Shi <Johnson.Shi@microsoft.com> 2020-11-18 09:34:04 -0800
committer GitHub <noreply@github.com> 2020-11-18 12:34:04 -0500
commit d807df288f8cef29ca74f0b00c326b084e825782 (patch)
tree 4102e9fda39c03da57cd875692015c41b73f5055 /cloudinit/sources/DataSourceAzure.py
parent 96d21dfbee308cd8fe00809184f78da9231ece4a (diff)
download vyos-cloud-init-d807df288f8cef29ca74f0b00c326b084e825782.tar.gz
vyos-cloud-init-d807df288f8cef29ca74f0b00c326b084e825782.zip
DataSourceAzure: send failure signal on Azure datasource failure (#594)
On systems where the Azure datasource is a viable platform for crawling metadata, cloud-init occasionally encounters fatal, irrecoverable errors while crawling the Azure datasource. When this happens, cloud-init crashes and Azure VM provisioning fails. However, instead of seeing the failure immediately, the user continues to see the VM as provisioning for a long time, until it times out with an "OS Provisioning Timed Out" message. In these situations, cloud-init should report failure to the Azure datasource endpoint to indicate that provisioning has failed. The user will then see provisioning terminate immediately, which is a much better failure experience than pointlessly waiting for the OS provisioning timeout.
Diffstat (limited to 'cloudinit/sources/DataSourceAzure.py')
-rwxr-xr-x cloudinit/sources/DataSourceAzure.py 73
1 files changed, 68 insertions, 5 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index fa3e0a2b..ab139b8d 100755
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -29,6 +29,7 @@ from cloudinit import util
from cloudinit.reporting import events
from cloudinit.sources.helpers.azure import (
+ DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE,
azure_ds_reporter,
azure_ds_telemetry_reporter,
get_metadata_from_fabric,
@@ -38,7 +39,8 @@ from cloudinit.sources.helpers.azure import (
EphemeralDHCPv4WithReporting,
is_byte_swapped,
dhcp_log_cb,
- push_log_to_kvp)
+ push_log_to_kvp,
+ report_failure_to_fabric)
LOG = logging.getLogger(__name__)
@@ -508,8 +510,9 @@ class DataSourceAzure(sources.DataSource):
if perform_reprovision:
LOG.info("Reporting ready to Azure after getting ReprovisionData")
- use_cached_ephemeral = (net.is_up(self.fallback_interface) and
- getattr(self, '_ephemeral_dhcp_ctx', None))
+ use_cached_ephemeral = (
+ self.distro.networking.is_up(self.fallback_interface) and
+ getattr(self, '_ephemeral_dhcp_ctx', None))
if use_cached_ephemeral:
self._report_ready(lease=self._ephemeral_dhcp_ctx.lease)
self._ephemeral_dhcp_ctx.clean_network() # Teardown ephemeral
@@ -560,9 +563,14 @@ class DataSourceAzure(sources.DataSource):
logfunc=LOG.debug, msg='Crawl of metadata service',
func=self.crawl_metadata
)
- except sources.InvalidMetaDataException as e:
- LOG.warning('Could not crawl Azure metadata: %s', e)
+ except Exception as e:
+ report_diagnostic_event(
+ 'Could not crawl Azure metadata: %s' % e,
+ logger_func=LOG.error)
+ self._report_failure(
+ description=DEFAULT_REPORT_FAILURE_USER_VISIBLE_MESSAGE)
return False
+
if (self.distro and self.distro.name == 'ubuntu' and
self.ds_cfg.get('apply_network_config')):
maybe_remove_ubuntu_network_config_scripts()
@@ -785,6 +793,61 @@ class DataSourceAzure(sources.DataSource):
return return_val
@azure_ds_telemetry_reporter
+ def _report_failure(self, description=None) -> bool:
+ """Tells the Azure fabric that provisioning has failed.
+
+ @param description: A description of the error encountered.
+ @return: The success status of sending the failure signal.
+ """
+ unknown_245_key = 'unknown-245'
+
+ try:
+ if (self.distro.networking.is_up(self.fallback_interface) and
+ getattr(self, '_ephemeral_dhcp_ctx', None) and
+ getattr(self._ephemeral_dhcp_ctx, 'lease', None) and
+ unknown_245_key in self._ephemeral_dhcp_ctx.lease):
+ report_diagnostic_event(
+ 'Using cached ephemeral dhcp context '
+ 'to report failure to Azure', logger_func=LOG.debug)
+ report_failure_to_fabric(
+ dhcp_opts=self._ephemeral_dhcp_ctx.lease[unknown_245_key],
+ description=description)
+ self._ephemeral_dhcp_ctx.clean_network() # Teardown ephemeral
+ return True
+ except Exception as e:
+ report_diagnostic_event(
+ 'Failed to report failure using '
+ 'cached ephemeral dhcp context: %s' % e,
+ logger_func=LOG.error)
+
+ try:
+ report_diagnostic_event(
+ 'Using new ephemeral dhcp to report failure to Azure',
+ logger_func=LOG.debug)
+ with EphemeralDHCPv4WithReporting(azure_ds_reporter) as lease:
+ report_failure_to_fabric(
+ dhcp_opts=lease[unknown_245_key],
+ description=description)
+ return True
+ except Exception as e:
+ report_diagnostic_event(
+ 'Failed to report failure using new ephemeral dhcp: %s' % e,
+ logger_func=LOG.debug)
+
+ try:
+ report_diagnostic_event(
+ 'Using fallback lease to report failure to Azure')
+ report_failure_to_fabric(
+ fallback_lease_file=self.dhclient_lease_file,
+ description=description)
+ return True
+ except Exception as e:
+ report_diagnostic_event(
+ 'Failed to report failure using fallback lease: %s' % e,
+ logger_func=LOG.debug)
+
+ return False
+
def _report_ready(self, lease: dict) -> bool:
"""Tells the fabric provisioning has completed.