summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xcloudinit/sources/DataSourceAzure.py65
-rw-r--r--tests/unittests/test_datasource/test_azure.py95
2 files changed, 140 insertions, 20 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index f4fc91cd..38790c12 100755
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -17,6 +17,7 @@ from time import sleep
from xml.dom import minidom
import xml.etree.ElementTree as ET
from enum import Enum
+import requests
from cloudinit import dmi
from cloudinit import log as logging
@@ -665,7 +666,9 @@ class DataSourceAzure(sources.DataSource):
self,
fallback_nic,
retries,
- md_type=metadata_type.compute):
+ md_type=metadata_type.compute,
+ exc_cb=retry_on_url_exc,
+ infinite=False):
"""
Wrapper for get_metadata_from_imds so that we can have flexibility
in which IMDS api-version we use. If a particular instance of IMDS
@@ -685,7 +688,8 @@ class DataSourceAzure(sources.DataSource):
fallback_nic=fallback_nic,
retries=0,
md_type=md_type,
- api_version=IMDS_VER_WANT
+ api_version=IMDS_VER_WANT,
+ exc_cb=exc_cb
)
except UrlError as err:
LOG.info(
@@ -708,7 +712,9 @@ class DataSourceAzure(sources.DataSource):
fallback_nic=fallback_nic,
retries=retries,
md_type=md_type,
- api_version=IMDS_VER_MIN
+ api_version=IMDS_VER_MIN,
+ exc_cb=exc_cb,
+ infinite=infinite
)
def device_name_to_device(self, name):
@@ -938,6 +944,9 @@ class DataSourceAzure(sources.DataSource):
is_primary = False
expected_nic_count = -1
imds_md = None
+ metadata_poll_count = 0
+ metadata_logging_threshold = 1
+ metadata_timeout_count = 0
# For now, only a VM's primary NIC can contact IMDS and WireServer. If
# DHCP fails for a NIC, we have no mechanism to determine if the NIC is
@@ -962,14 +971,48 @@ class DataSourceAzure(sources.DataSource):
% (ifname, e), logger_func=LOG.error)
raise
+ # Retry polling network metadata for a limited duration only when the
+ # calls fail due to timeout. This is because the platform drops packets
+ # going towards IMDS when it is not a primary nic. If the calls fail
+ # due to other issues like 410, 503 etc, then it means we are primary
+ # but IMDS service is unavailable at the moment. Retry indefinitely in
+ # those cases since we cannot move on without the network metadata.
+ def network_metadata_exc_cb(msg, exc):
+ nonlocal metadata_timeout_count, metadata_poll_count
+ nonlocal metadata_logging_threshold
+
+ metadata_poll_count = metadata_poll_count + 1
+
+ # Log when needed but back off exponentially to avoid exploding
+ # the log file.
+ if metadata_poll_count >= metadata_logging_threshold:
+ metadata_logging_threshold *= 2
+ report_diagnostic_event(
+ "Ran into exception when attempting to reach %s "
+ "after %d polls." % (msg, metadata_poll_count),
+ logger_func=LOG.error)
+
+ if isinstance(exc, UrlError):
+ report_diagnostic_event("poll IMDS with %s failed. "
+ "Exception: %s and code: %s" %
+ (msg, exc.cause, exc.code),
+ logger_func=LOG.error)
+
+ if exc.cause and isinstance(exc.cause, requests.Timeout):
+ metadata_timeout_count = metadata_timeout_count + 1
+ return (metadata_timeout_count <= 10)
+ return True
+
# Primary nic detection will be optimized in the future. The fact that
# primary nic is being attached first helps here. Otherwise each nic
# could add several seconds of delay.
try:
imds_md = self.get_imds_data_with_api_fallback(
ifname,
- 5,
- metadata_type.network
+ 0,
+ metadata_type.network,
+ network_metadata_exc_cb,
+ True
)
except Exception as e:
LOG.warning(
@@ -2139,7 +2182,9 @@ def _generate_network_config_from_fallback_config() -> dict:
def get_metadata_from_imds(fallback_nic,
retries,
md_type=metadata_type.compute,
- api_version=IMDS_VER_MIN):
+ api_version=IMDS_VER_MIN,
+ exc_cb=retry_on_url_exc,
+ infinite=False):
"""Query Azure's instance metadata service, returning a dictionary.
If network is not up, setup ephemeral dhcp on fallback_nic to talk to the
@@ -2158,7 +2203,7 @@ def get_metadata_from_imds(fallback_nic,
kwargs = {'logfunc': LOG.debug,
'msg': 'Crawl of Azure Instance Metadata Service (IMDS)',
'func': _get_metadata_from_imds,
- 'args': (retries, md_type, api_version,)}
+ 'args': (retries, exc_cb, md_type, api_version, infinite)}
if net.is_up(fallback_nic):
return util.log_time(**kwargs)
else:
@@ -2176,14 +2221,16 @@ def get_metadata_from_imds(fallback_nic,
@azure_ds_telemetry_reporter
def _get_metadata_from_imds(
retries,
+ exc_cb,
md_type=metadata_type.compute,
- api_version=IMDS_VER_MIN):
+ api_version=IMDS_VER_MIN,
+ infinite=False):
url = "{}?api-version={}".format(md_type.value, api_version)
headers = {"Metadata": "true"}
try:
response = readurl(
url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers,
- retries=retries, exception_cb=retry_on_url_exc)
+ retries=retries, exception_cb=exc_cb, infinite=infinite)
except Exception as e:
# pylint:disable=no-member
if isinstance(e, UrlError) and e.code == 400:
diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
index d9817d84..c4a8e08d 100644
--- a/tests/unittests/test_datasource/test_azure.py
+++ b/tests/unittests/test_datasource/test_azure.py
@@ -448,7 +448,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
"http://169.254.169.254/metadata/instance?api-version="
"2019-06-01", exception_cb=mock.ANY,
headers=mock.ANY, retries=mock.ANY,
- timeout=mock.ANY)
+ timeout=mock.ANY, infinite=False)
@mock.patch(MOCKPATH + 'readurl', autospec=True)
@mock.patch(MOCKPATH + 'EphemeralDHCPv4')
@@ -467,7 +467,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
"http://169.254.169.254/metadata/instance/network?api-version="
"2019-06-01", exception_cb=mock.ANY,
headers=mock.ANY, retries=mock.ANY,
- timeout=mock.ANY)
+ timeout=mock.ANY, infinite=False)
@mock.patch(MOCKPATH + 'readurl', autospec=True)
@mock.patch(MOCKPATH + 'EphemeralDHCPv4')
@@ -486,7 +486,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
"http://169.254.169.254/metadata/instance?api-version="
"2019-06-01", exception_cb=mock.ANY,
headers=mock.ANY, retries=mock.ANY,
- timeout=mock.ANY)
+ timeout=mock.ANY, infinite=False)
@mock.patch(MOCKPATH + 'readurl', autospec=True)
@mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting', autospec=True)
@@ -511,7 +511,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
m_readurl.assert_called_with(
self.network_md_url, exception_cb=mock.ANY,
headers={'Metadata': 'true'}, retries=2,
- timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS)
+ timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS, infinite=False)
@mock.patch('cloudinit.url_helper.time.sleep')
@mock.patch(MOCKPATH + 'net.is_up', autospec=True)
@@ -2694,15 +2694,22 @@ class TestPreprovisioningHotAttachNics(CiTestCase):
def nic_attach_ret(nl_sock, nics_found):
nonlocal m_attach_call_count
- if m_attach_call_count == 0:
- m_attach_call_count = m_attach_call_count + 1
+ m_attach_call_count = m_attach_call_count + 1
+ if m_attach_call_count == 1:
return "eth0"
- return "eth1"
+ elif m_attach_call_count == 2:
+ return "eth1"
+ raise RuntimeError("Must have found primary nic by now.")
+
+ # Simulate two NICs by adding the same one twice.
+ md = {
+ "interface": [
+ IMDS_NETWORK_METADATA['interface'][0],
+ IMDS_NETWORK_METADATA['interface'][0]
+ ]
+ }
- def network_metadata_ret(ifname, retries, type):
- # Simulate two NICs by adding the same one twice.
- md = IMDS_NETWORK_METADATA
- md['interface'].append(md['interface'][0])
+ def network_metadata_ret(ifname, retries, type, exc_cb, infinite):
if ifname == "eth0":
return md
raise requests.Timeout('Fake connection timeout')
@@ -2724,6 +2731,72 @@ class TestPreprovisioningHotAttachNics(CiTestCase):
self.assertEqual(1, m_imds.call_count)
self.assertEqual(2, m_link_up.call_count)
+ @mock.patch(MOCKPATH + 'DataSourceAzure.get_imds_data_with_api_fallback')
+ @mock.patch(MOCKPATH + 'EphemeralDHCPv4')
+ def test_check_if_nic_is_primary_retries_on_failures(
+ self, m_dhcpv4, m_imds):
+ """Retry polling for network metadata on all failures except timeout"""
+ dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths)
+ lease = {
+ 'interface': 'eth9', 'fixed-address': '192.168.2.9',
+ 'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0',
+ 'unknown-245': '624c3620'}
+
+ eth0Retries = []
+ eth1Retries = []
+ # Simulate two NICs by adding the same one twice.
+ md = {
+ "interface": [
+ IMDS_NETWORK_METADATA['interface'][0],
+ IMDS_NETWORK_METADATA['interface'][0]
+ ]
+ }
+
+ def network_metadata_ret(ifname, retries, type, exc_cb, infinite):
+ nonlocal eth0Retries, eth1Retries
+
+ # Simulate readurl functionality with retries and
+ # exception callbacks so that the callback logic can be
+ # validated.
+ if ifname == "eth0":
+ cause = requests.HTTPError()
+ for _ in range(0, 15):
+ error = url_helper.UrlError(cause=cause, code=410)
+ eth0Retries.append(exc_cb("No goal state.", error))
+ else:
+ cause = requests.Timeout('Fake connection timeout')
+ for _ in range(0, 10):
+ error = url_helper.UrlError(cause=cause)
+ eth1Retries.append(exc_cb("Connection timeout", error))
+ # Should stop retrying after 10 retries
+ eth1Retries.append(exc_cb("Connection timeout", error))
+ raise cause
+ return md
+
+ m_imds.side_effect = network_metadata_ret
+
+ dhcp_ctx = mock.MagicMock(lease=lease)
+ dhcp_ctx.obtain_lease.return_value = lease
+ m_dhcpv4.return_value = dhcp_ctx
+
+ is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth0")
+ self.assertEqual(True, is_primary)
+ self.assertEqual(2, expected_nic_count)
+
+ # All Eth0 errors are non-timeout errors. So we should have been
+ # retrying indefinitely until success.
+ for i in eth0Retries:
+ self.assertTrue(i)
+
+ is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth1")
+ self.assertEqual(False, is_primary)
+
+ # All Eth1 errors are timeout errors. Retry happens for a max of 10 and
+ # then we should have moved on assuming it is not the primary nic.
+ for i in range(0, 10):
+ self.assertTrue(eth1Retries[i])
+ self.assertFalse(eth1Retries[10])
+
@mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up')
def test_wait_for_link_up_returns_if_already_up(
self, m_is_link_up):