diff options
-rwxr-xr-x | cloudinit/sources/DataSourceAzure.py | 65 | ||||
-rw-r--r-- | tests/unittests/test_datasource/test_azure.py | 95 |
2 files changed, 140 insertions, 20 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index f4fc91cd..38790c12 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -17,6 +17,7 @@ from time import sleep from xml.dom import minidom import xml.etree.ElementTree as ET from enum import Enum +import requests from cloudinit import dmi from cloudinit import log as logging @@ -665,7 +666,9 @@ class DataSourceAzure(sources.DataSource): self, fallback_nic, retries, - md_type=metadata_type.compute): + md_type=metadata_type.compute, + exc_cb=retry_on_url_exc, + infinite=False): """ Wrapper for get_metadata_from_imds so that we can have flexibility in which IMDS api-version we use. If a particular instance of IMDS @@ -685,7 +688,8 @@ class DataSourceAzure(sources.DataSource): fallback_nic=fallback_nic, retries=0, md_type=md_type, - api_version=IMDS_VER_WANT + api_version=IMDS_VER_WANT, + exc_cb=exc_cb ) except UrlError as err: LOG.info( @@ -708,7 +712,9 @@ class DataSourceAzure(sources.DataSource): fallback_nic=fallback_nic, retries=retries, md_type=md_type, - api_version=IMDS_VER_MIN + api_version=IMDS_VER_MIN, + exc_cb=exc_cb, + infinite=infinite ) def device_name_to_device(self, name): @@ -938,6 +944,9 @@ class DataSourceAzure(sources.DataSource): is_primary = False expected_nic_count = -1 imds_md = None + metadata_poll_count = 0 + metadata_logging_threshold = 1 + metadata_timeout_count = 0 # For now, only a VM's primary NIC can contact IMDS and WireServer. If # DHCP fails for a NIC, we have no mechanism to determine if the NIC is @@ -962,14 +971,48 @@ class DataSourceAzure(sources.DataSource): % (ifname, e), logger_func=LOG.error) raise + # Retry polling network metadata for a limited duration only when the + # calls fail due to timeout. This is because the platform drops packets + # going towards IMDS when it is not a primary nic. If the calls fail + # due to other issues like 410, 503 etc, then it means we are primary + # but IMDS service is unavailable at the moment. Retry indefinitely in + # those cases since we cannot move on without the network metadata. + def network_metadata_exc_cb(msg, exc): + nonlocal metadata_timeout_count, metadata_poll_count + nonlocal metadata_logging_threshold + + metadata_poll_count = metadata_poll_count + 1 + + # Log when needed but back off exponentially to avoid exploding + # the log file. + if metadata_poll_count >= metadata_logging_threshold: + metadata_logging_threshold *= 2 + report_diagnostic_event( + "Ran into exception when attempting to reach %s " + "after %d polls." % (msg, metadata_poll_count), + logger_func=LOG.error) + + if isinstance(exc, UrlError): + report_diagnostic_event("poll IMDS with %s failed. " + "Exception: %s and code: %s" % + (msg, exc.cause, exc.code), + logger_func=LOG.error) + + if exc.cause and isinstance(exc.cause, requests.Timeout): + metadata_timeout_count = metadata_timeout_count + 1 + return (metadata_timeout_count <= 10) + return True + # Primary nic detection will be optimized in the future. The fact that # primary nic is being attached first helps here. Otherwise each nic # could add several seconds of delay. try: imds_md = self.get_imds_data_with_api_fallback( ifname, - 5, - metadata_type.network + 0, + metadata_type.network, + network_metadata_exc_cb, + True ) except Exception as e: LOG.warning( @@ -2139,7 +2182,9 @@ def _generate_network_config_from_fallback_config() -> dict: def get_metadata_from_imds(fallback_nic, retries, md_type=metadata_type.compute, - api_version=IMDS_VER_MIN): + api_version=IMDS_VER_MIN, + exc_cb=retry_on_url_exc, + infinite=False): """Query Azure's instance metadata service, returning a dictionary. If network is not up, setup ephemeral dhcp on fallback_nic to talk to the @@ -2158,7 +2203,7 @@ def get_metadata_from_imds(fallback_nic, kwargs = {'logfunc': LOG.debug, 'msg': 'Crawl of Azure Instance Metadata Service (IMDS)', 'func': _get_metadata_from_imds, - 'args': (retries, md_type, api_version,)} + 'args': (retries, exc_cb, md_type, api_version, infinite)} if net.is_up(fallback_nic): return util.log_time(**kwargs) else: @@ -2176,14 +2221,16 @@ def get_metadata_from_imds(fallback_nic, @azure_ds_telemetry_reporter def _get_metadata_from_imds( retries, + exc_cb, md_type=metadata_type.compute, - api_version=IMDS_VER_MIN): + api_version=IMDS_VER_MIN, + infinite=False): url = "{}?api-version={}".format(md_type.value, api_version) headers = {"Metadata": "true"} try: response = readurl( url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers, - retries=retries, exception_cb=retry_on_url_exc) + retries=retries, exception_cb=exc_cb, infinite=infinite) except Exception as e: # pylint:disable=no-member if isinstance(e, UrlError) and e.code == 400: diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py index d9817d84..c4a8e08d 100644 --- a/tests/unittests/test_datasource/test_azure.py +++ b/tests/unittests/test_datasource/test_azure.py @@ -448,7 +448,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase): "http://169.254.169.254/metadata/instance?api-version=" "2019-06-01", exception_cb=mock.ANY, headers=mock.ANY, retries=mock.ANY, - timeout=mock.ANY) + timeout=mock.ANY, infinite=False) @mock.patch(MOCKPATH + 'readurl', autospec=True) @mock.patch(MOCKPATH + 'EphemeralDHCPv4') @@ -467,7 +467,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase): "http://169.254.169.254/metadata/instance/network?api-version=" "2019-06-01", exception_cb=mock.ANY, headers=mock.ANY, retries=mock.ANY, - timeout=mock.ANY) + timeout=mock.ANY, infinite=False) @mock.patch(MOCKPATH + 'readurl', autospec=True) @mock.patch(MOCKPATH + 'EphemeralDHCPv4') @@ -486,7 +486,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase): "http://169.254.169.254/metadata/instance?api-version=" "2019-06-01", exception_cb=mock.ANY, headers=mock.ANY, retries=mock.ANY, - timeout=mock.ANY) + timeout=mock.ANY, infinite=False) @mock.patch(MOCKPATH + 'readurl', autospec=True) @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting', autospec=True) @@ -511,7 +511,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase): m_readurl.assert_called_with( self.network_md_url, exception_cb=mock.ANY, headers={'Metadata': 'true'}, retries=2, - timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS) + timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS, infinite=False) @mock.patch('cloudinit.url_helper.time.sleep') @mock.patch(MOCKPATH + 'net.is_up', autospec=True) @@ -2694,15 +2694,22 @@ class TestPreprovisioningHotAttachNics(CiTestCase): def nic_attach_ret(nl_sock, nics_found): nonlocal m_attach_call_count - if m_attach_call_count == 0: - m_attach_call_count = m_attach_call_count + 1 + m_attach_call_count = m_attach_call_count + 1 + if m_attach_call_count == 1: return "eth0" - return "eth1" + elif m_attach_call_count == 2: + return "eth1" + raise RuntimeError("Must have found primary nic by now.") + + # Simulate two NICs by adding the same one twice. + md = { + "interface": [ + IMDS_NETWORK_METADATA['interface'][0], + IMDS_NETWORK_METADATA['interface'][0] + ] + } - def network_metadata_ret(ifname, retries, type): - # Simulate two NICs by adding the same one twice. - md = IMDS_NETWORK_METADATA - md['interface'].append(md['interface'][0]) + def network_metadata_ret(ifname, retries, type, exc_cb, infinite): if ifname == "eth0": return md raise requests.Timeout('Fake connection timeout') @@ -2724,6 +2731,72 @@ class TestPreprovisioningHotAttachNics(CiTestCase): self.assertEqual(1, m_imds.call_count) self.assertEqual(2, m_link_up.call_count) + @mock.patch(MOCKPATH + 'DataSourceAzure.get_imds_data_with_api_fallback') + @mock.patch(MOCKPATH + 'EphemeralDHCPv4') + def test_check_if_nic_is_primary_retries_on_failures( + self, m_dhcpv4, m_imds): + """Retry polling for network metadata on all failures except timeout""" + dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) + lease = { + 'interface': 'eth9', 'fixed-address': '192.168.2.9', + 'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0', + 'unknown-245': '624c3620'} + + eth0Retries = [] + eth1Retries = [] + # Simulate two NICs by adding the same one twice. + md = { + "interface": [ + IMDS_NETWORK_METADATA['interface'][0], + IMDS_NETWORK_METADATA['interface'][0] + ] + } + + def network_metadata_ret(ifname, retries, type, exc_cb, infinite): + nonlocal eth0Retries, eth1Retries + + # Simulate readurl functionality with retries and + # exception callbacks so that the callback logic can be + # validated. + if ifname == "eth0": + cause = requests.HTTPError() + for _ in range(0, 15): + error = url_helper.UrlError(cause=cause, code=410) + eth0Retries.append(exc_cb("No goal state.", error)) + else: + cause = requests.Timeout('Fake connection timeout') + for _ in range(0, 10): + error = url_helper.UrlError(cause=cause) + eth1Retries.append(exc_cb("Connection timeout", error)) + # Should stop retrying after 10 retries + eth1Retries.append(exc_cb("Connection timeout", error)) + raise cause + return md + + m_imds.side_effect = network_metadata_ret + + dhcp_ctx = mock.MagicMock(lease=lease) + dhcp_ctx.obtain_lease.return_value = lease + m_dhcpv4.return_value = dhcp_ctx + + is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth0") + self.assertEqual(True, is_primary) + self.assertEqual(2, expected_nic_count) + + # All Eth0 errors are non-timeout errors. So we should have been + # retrying indefinitely until success. + for i in eth0Retries: + self.assertTrue(i) + + is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth1") + self.assertEqual(False, is_primary) + + # All Eth1 errors are timeout errors. Retry happens for a max of 10 and + # then we should have moved on assuming it is not the primary nic. + for i in range(0, 10): + self.assertTrue(eth1Retries[i]) + self.assertFalse(eth1Retries[10]) + @mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up') def test_wait_for_link_up_returns_if_already_up( self, m_is_link_up): |