From 3ec8ddde0d1d2fd8597f7d2915baa3e328552ab1 Mon Sep 17 00:00:00 2001 From: aswinrajamannar <39812128+aswinrajamannar@users.noreply.github.com> Date: Fri, 20 Aug 2021 15:53:18 -0700 Subject: Azure: During primary nic detection, check interface status continuously before rebinding again (#990) Add 10 second polling loop in wait_for_link_up after performing an unbind and re-bind of primary NIC in hv_netvsc driver. Also reduce cloud-init logging levels to debug for these operations. --- cloudinit/sources/DataSourceAzure.py | 38 +++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'cloudinit/sources') diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index ba23139b..fddfe363 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -892,12 +892,12 @@ class DataSourceAzure(sources.DataSource): logger_func=LOG.info) return - LOG.info("Attempting to bring %s up", ifname) + LOG.debug("Attempting to bring %s up", ifname) attempts = 0 + LOG.info("Unbinding and binding the interface %s", ifname) while True: - LOG.info("Unbinding and binding the interface %s", ifname) devicename = net.read_sys_net(ifname, 'device/device_id').strip('{}') util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/unbind', @@ -912,26 +912,28 @@ class DataSourceAzure(sources.DataSource): report_diagnostic_event(msg, logger_func=LOG.info) return - sleep_duration = 1 - msg = ("Link is not up after %d attempts with %d seconds sleep " - "between attempts." % (attempts, sleep_duration)) - if attempts % 10 == 0: + msg = ("Link is not up after %d attempts to rebind" % attempts) report_diagnostic_event(msg, logger_func=LOG.info) - else: LOG.info(msg) - sleep(sleep_duration) - - # Since we just did a unbind and bind, check again after sleep - # but before doing unbind and bind again to avoid races where the - # link might take a slight delay after bind to be up. - if self.distro.networking.is_up(ifname): - msg = ("Link is up after checking after sleeping for %d secs" - " after %d attempts" % - (sleep_duration, attempts)) - report_diagnostic_event(msg, logger_func=LOG.info) - return + # It could take some time after rebind for the interface to be up. + # So poll for the status for some time before attempting to rebind + # again. + sleep_duration = 0.5 + max_status_polls = 20 + LOG.debug("Polling %d seconds for primary NIC link up after " + "rebind.", sleep_duration * max_status_polls) + + for i in range(0, max_status_polls): + if self.distro.networking.is_up(ifname): + msg = ("After %d attempts to rebind, link is up after " + "polling the link status %d times" % (attempts, i)) + report_diagnostic_event(msg, logger_func=LOG.info) + LOG.debug(msg) + return + else: + sleep(sleep_duration) @azure_ds_telemetry_reporter def _create_report_ready_marker(self): -- cgit v1.2.3