diff options
-rwxr-xr-x | cloudinit/sources/DataSourceAzure.py | 38 | ||||
-rw-r--r-- | tests/unittests/test_datasource/test_azure.py | 20 |
2 files changed, 35 insertions, 23 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index ba23139b..fddfe363 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -892,12 +892,12 @@ class DataSourceAzure(sources.DataSource): logger_func=LOG.info) return - LOG.info("Attempting to bring %s up", ifname) + LOG.debug("Attempting to bring %s up", ifname) attempts = 0 + LOG.info("Unbinding and binding the interface %s", ifname) while True: - LOG.info("Unbinding and binding the interface %s", ifname) devicename = net.read_sys_net(ifname, 'device/device_id').strip('{}') util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/unbind', @@ -912,26 +912,28 @@ class DataSourceAzure(sources.DataSource): report_diagnostic_event(msg, logger_func=LOG.info) return - sleep_duration = 1 - msg = ("Link is not up after %d attempts with %d seconds sleep " - "between attempts." % (attempts, sleep_duration)) - if attempts % 10 == 0: + msg = ("Link is not up after %d attempts to rebind" % attempts) report_diagnostic_event(msg, logger_func=LOG.info) - else: LOG.info(msg) - sleep(sleep_duration) - - # Since we just did a unbind and bind, check again after sleep - # but before doing unbind and bind again to avoid races where the - # link might take a slight delay after bind to be up. - if self.distro.networking.is_up(ifname): - msg = ("Link is up after checking after sleeping for %d secs" - " after %d attempts" % - (sleep_duration, attempts)) - report_diagnostic_event(msg, logger_func=LOG.info) - return + # It could take some time after rebind for the interface to be up. + # So poll for the status for some time before attempting to rebind + # again. + sleep_duration = 0.5 + max_status_polls = 20 + LOG.debug("Polling %d seconds for primary NIC link up after " + "rebind.", sleep_duration * max_status_polls) + + for i in range(0, max_status_polls): + if self.distro.networking.is_up(ifname): + msg = ("After %d attempts to rebind, link is up after " + "polling the link status %d times" % (attempts, i)) + report_diagnostic_event(msg, logger_func=LOG.info) + LOG.debug(msg) + return + else: + sleep(sleep_duration) @azure_ds_telemetry_reporter def _create_report_ready_marker(self): diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py index 03609c3d..851cf82e 100644 --- a/tests/unittests/test_datasource/test_azure.py +++ b/tests/unittests/test_datasource/test_azure.py @@ -2912,19 +2912,29 @@ class TestPreprovisioningHotAttachNics(CiTestCase): @mock.patch('cloudinit.net.read_sys_net') @mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up') def test_wait_for_link_up_checks_link_after_sleep( - self, m_is_link_up, m_read_sys_net, m_writefile, m_is_up): + self, m_try_set_link_up, m_read_sys_net, m_writefile, m_is_up): """Waiting for link to be up should return immediately if the link is already up.""" distro_cls = distros.fetch('ubuntu') distro = distro_cls('ubuntu', {}, self.paths) dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths) - m_is_link_up.return_value = False - m_is_up.return_value = True + m_try_set_link_up.return_value = False + + callcount = 0 + + def is_up_mock(key): + nonlocal callcount + if callcount == 0: + callcount += 1 + return False + return True + + m_is_up.side_effect = is_up_mock dsa.wait_for_link_up("eth0") - self.assertEqual(2, m_is_link_up.call_count) - self.assertEqual(1, m_is_up.call_count) + self.assertEqual(2, m_try_set_link_up.call_count) + self.assertEqual(2, m_is_up.call_count) @mock.patch(MOCKPATH + 'util.write_file') @mock.patch('cloudinit.net.read_sys_net') |