From d3271217e2745fb0e3405bd093b61c39fe0708a7 Mon Sep 17 00:00:00 2001 From: aswinrajamannar <39812128+aswinrajamannar@users.noreply.github.com> Date: Tue, 10 Aug 2021 12:28:00 -0700 Subject: Azure: Limit polling network metadata on connection errors (#961) --- cloudinit/sources/DataSourceAzure.py | 27 +++++++++++++++++---------- tests/unittests/test_datasource/test_azure.py | 10 ++++++++-- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index 01e2c959..6df9934b 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -972,7 +972,7 @@ class DataSourceAzure(sources.DataSource): imds_md = None metadata_poll_count = 0 metadata_logging_threshold = 1 - metadata_timeout_count = 0 + expected_errors_count = 0 # For now, only a VM's primary NIC can contact IMDS and WireServer. If # DHCP fails for a NIC, we have no mechanism to determine if the NIC is @@ -998,13 +998,16 @@ class DataSourceAzure(sources.DataSource): raise # Retry polling network metadata for a limited duration only when the - # calls fail due to timeout. This is because the platform drops packets - # going towards IMDS when it is not a primary nic. If the calls fail - # due to other issues like 410, 503 etc, then it means we are primary - # but IMDS service is unavailable at the moment. Retry indefinitely in - # those cases since we cannot move on without the network metadata. + # calls fail due to network unreachable error or timeout. + # This is because the platform drops packets going towards IMDS + # when it is not a primary nic. If the calls fail due to other issues + # like 410, 503 etc, then it means we are primary but IMDS service + # is unavailable at the moment. Retry indefinitely in those cases + # since we cannot move on without the network metadata. In the future, + # all this will not be necessary, as a new dhcp option would tell + # whether the nic is primary or not. def network_metadata_exc_cb(msg, exc): - nonlocal metadata_timeout_count, metadata_poll_count + nonlocal expected_errors_count, metadata_poll_count nonlocal metadata_logging_threshold metadata_poll_count = metadata_poll_count + 1 @@ -1024,9 +1027,13 @@ class DataSourceAzure(sources.DataSource): (msg, exc.cause, exc.code), logger_func=LOG.error) - if exc.cause and isinstance(exc.cause, requests.Timeout): - metadata_timeout_count = metadata_timeout_count + 1 - return (metadata_timeout_count <= 10) + # Retry up to a certain limit for both timeout and network + # unreachable errors. + if exc.cause and isinstance( + exc.cause, (requests.Timeout, requests.ConnectionError) + ): + expected_errors_count = expected_errors_count + 1 + return (expected_errors_count <= 10) return True # Primary nic detection will be optimized in the future. The fact that diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py index 3bf8fdb2..63eaf384 100644 --- a/tests/unittests/test_datasource/test_azure.py +++ b/tests/unittests/test_datasource/test_azure.py @@ -2825,7 +2825,8 @@ class TestPreprovisioningHotAttachNics(CiTestCase): @mock.patch(MOCKPATH + 'EphemeralDHCPv4') def test_check_if_nic_is_primary_retries_on_failures( self, m_dhcpv4, m_imds): - """Retry polling for network metadata on all failures except timeout""" + """Retry polling for network metadata on all failures except timeout + and network unreachable errors""" dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) lease = { 'interface': 'eth9', 'fixed-address': '192.168.2.9', @@ -2854,8 +2855,13 @@ class TestPreprovisioningHotAttachNics(CiTestCase): error = url_helper.UrlError(cause=cause, code=410) eth0Retries.append(exc_cb("No goal state.", error)) else: - cause = requests.Timeout('Fake connection timeout') for _ in range(0, 10): + # We are expected to retry for a certain period for both + # timeout errors and network unreachable errors. + if _ < 5: + cause = requests.Timeout('Fake connection timeout') + else: + cause = requests.ConnectionError('Network Unreachable') error = url_helper.UrlError(cause=cause) eth1Retries.append(exc_cb("Connection timeout", error)) # Should stop retrying after 10 retries -- cgit v1.2.3