summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohnson Shi <Johnson.Shi@microsoft.com>2020-10-16 08:54:38 -0700
committerGitHub <noreply@github.com>2020-10-16 11:54:38 -0400
commit8766784f4b1d1f9f6a9094e1268e4accb811ea7f (patch)
treed654af968ed5b185402802a3cafea7c7357e0b26
parentc0e8480678e3a9173c9de1271f651fb3ba375f22 (diff)
downloadvyos-cloud-init-8766784f4b1d1f9f6a9094e1268e4accb811ea7f.tar.gz
vyos-cloud-init-8766784f4b1d1f9f6a9094e1268e4accb811ea7f.zip
DataSourceAzure: write marker file after report ready in preprovisioning (#590)
DataSourceAzure previously writes the preprovisioning reported ready marker file before it goes through the report ready workflow. On certain VM instances, the marker file is successfully written but then reporting ready fails. Upon rare VM reboots by the platform, cloud-init sees that the report ready marker file already exists. The existence of this marker file tells cloud-init not to report ready again (because it mistakenly assumes that it already reported ready in preprovisioning). In this scenario, cloud-init instead erroneously takes the reprovisioning workflow instead of reporting ready again.
-rwxr-xr-xcloudinit/sources/DataSourceAzure.py23
-rw-r--r--tests/unittests/test_datasource/test_azure.py86
-rw-r--r--tests/unittests/test_datasource/test_azure_helper.py3
3 files changed, 90 insertions, 22 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index 8858fbd5..70e32f46 100755
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -720,12 +720,23 @@ class DataSourceAzure(sources.DataSource):
self._ephemeral_dhcp_ctx.clean_network()
break
+ report_ready_succeeded = self._report_ready(lease=lease)
+ if not report_ready_succeeded:
+ msg = ('Failed reporting ready while in '
+ 'the preprovisioning pool.')
+ report_diagnostic_event(msg, logger_func=LOG.error)
+ self._ephemeral_dhcp_ctx.clean_network()
+ raise sources.InvalidMetaDataException(msg)
+
path = REPORTED_READY_MARKER_FILE
LOG.info(
"Creating a marker file to report ready: %s", path)
util.write_file(path, "{pid}: {time}\n".format(
pid=os.getpid(), time=time()))
- self._report_ready(lease=lease)
+ report_diagnostic_event(
+ 'Successfully created reported ready marker file '
+ 'while in the preprovisioning pool.',
+ logger_func=LOG.debug)
report_ready = False
with events.ReportEventStack(
@@ -773,14 +784,20 @@ class DataSourceAzure(sources.DataSource):
return return_val
@azure_ds_telemetry_reporter
- def _report_ready(self, lease):
- """Tells the fabric provisioning has completed """
+ def _report_ready(self, lease: dict) -> bool:
+ """Tells the fabric provisioning has completed.
+
+ @param lease: dhcp lease to use for sending the ready signal.
+ @return: The success status of sending the ready signal.
+ """
try:
get_metadata_from_fabric(None, lease['unknown-245'])
+ return True
except Exception as e:
report_diagnostic_event(
"Error communicating with Azure fabric; You may experience "
"connectivity issues: %s" % e, logger_func=LOG.warning)
+ return False
def _should_reprovision(self, ret):
"""Whether or not we should poll IMDS for reprovisioning data.
diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
index 3b9456f7..56c1cf18 100644
--- a/tests/unittests/test_datasource/test_azure.py
+++ b/tests/unittests/test_datasource/test_azure.py
@@ -1161,6 +1161,19 @@ scbus-1 on xpt0 bus 0
dsrc = self._get_ds({'ovfcontent': xml})
dsrc.get_data()
+ def test_dsaz_report_ready_returns_true_when_report_succeeds(
+ self):
+ dsrc = self._get_ds({'ovfcontent': construct_valid_ovf_env()})
+ dsrc.ds_cfg['agent_command'] = '__builtin__'
+ self.assertTrue(dsrc._report_ready(lease=mock.MagicMock()))
+
+ def test_dsaz_report_ready_returns_false_and_does_not_propagate_exc(
+ self):
+ dsrc = self._get_ds({'ovfcontent': construct_valid_ovf_env()})
+ dsrc.ds_cfg['agent_command'] = '__builtin__'
+ self.get_metadata_from_fabric.side_effect = Exception
+ self.assertFalse(dsrc._report_ready(lease=mock.MagicMock()))
+
def test_exception_fetching_fabric_data_doesnt_propagate(self):
"""Errors communicating with fabric should warn, but return True."""
dsrc = self._get_ds({'ovfcontent': construct_valid_ovf_env()})
@@ -2041,7 +2054,7 @@ class TestPreprovisioningPollIMDS(CiTestCase):
@mock.patch('time.sleep', mock.MagicMock())
@mock.patch(MOCKPATH + 'EphemeralDHCPv4')
def test_poll_imds_re_dhcp_on_timeout(self, m_dhcpv4, report_ready_func,
- fake_resp, m_media_switch, m_dhcp,
+ m_request, m_media_switch, m_dhcp,
m_net):
"""The poll_imds will retry DHCP on IMDS timeout."""
report_file = self.tmp_path('report_marker', self.tmp)
@@ -2070,7 +2083,7 @@ class TestPreprovisioningPollIMDS(CiTestCase):
# Third try should succeed and stop retries or redhcp
return mock.MagicMock(status_code=200, text="good", content="good")
- fake_resp.side_effect = fake_timeout_once
+ m_request.side_effect = fake_timeout_once
dsa = dsaz.DataSourceAzure({}, distro=mock.Mock(), paths=self.paths)
with mock.patch(MOCKPATH + 'REPORTED_READY_MARKER_FILE', report_file):
@@ -2080,11 +2093,10 @@ class TestPreprovisioningPollIMDS(CiTestCase):
self.assertEqual(3, m_dhcpv4.call_count, 'Expected 3 DHCP calls')
self.assertEqual(4, self.tries, 'Expected 4 total reads from IMDS')
- def test_poll_imds_report_ready_false(self,
- report_ready_func, fake_resp,
- m_media_switch, m_dhcp, m_net):
- """The poll_imds should not call reporting ready
- when flag is false"""
+ def test_does_not_poll_imds_report_ready_when_marker_file_exists(
+ self, m_report_ready, m_request, m_media_switch, m_dhcp, m_net):
+ """poll_imds should not call report ready when the reported ready
+ marker file exists"""
report_file = self.tmp_path('report_marker', self.tmp)
write_file(report_file, content='dont run report_ready :)')
m_dhcp.return_value = [{
@@ -2095,11 +2107,49 @@ class TestPreprovisioningPollIMDS(CiTestCase):
dsa = dsaz.DataSourceAzure({}, distro=mock.Mock(), paths=self.paths)
with mock.patch(MOCKPATH + 'REPORTED_READY_MARKER_FILE', report_file):
dsa._poll_imds()
- self.assertEqual(report_ready_func.call_count, 0)
+ self.assertEqual(m_report_ready.call_count, 0)
+
+ def test_poll_imds_report_ready_success_writes_marker_file(
+ self, m_report_ready, m_request, m_media_switch, m_dhcp, m_net):
+ """poll_imds should write the report_ready marker file if
+ reporting ready succeeds"""
+ report_file = self.tmp_path('report_marker', self.tmp)
+ m_dhcp.return_value = [{
+ 'interface': 'eth9', 'fixed-address': '192.168.2.9',
+ 'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0',
+ 'unknown-245': '624c3620'}]
+ m_media_switch.return_value = None
+ dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths)
+ self.assertFalse(os.path.exists(report_file))
+ with mock.patch(MOCKPATH + 'REPORTED_READY_MARKER_FILE', report_file):
+ dsa._poll_imds()
+ self.assertEqual(m_report_ready.call_count, 1)
+ self.assertTrue(os.path.exists(report_file))
+
+ def test_poll_imds_report_ready_failure_raises_exc_and_doesnt_write_marker(
+ self, m_report_ready, m_request, m_media_switch, m_dhcp, m_net):
+ """poll_imds should write the report_ready marker file if
+ reporting ready succeeds"""
+ report_file = self.tmp_path('report_marker', self.tmp)
+ m_dhcp.return_value = [{
+ 'interface': 'eth9', 'fixed-address': '192.168.2.9',
+ 'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0',
+ 'unknown-245': '624c3620'}]
+ m_media_switch.return_value = None
+ m_report_ready.return_value = False
+ dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths)
+ self.assertFalse(os.path.exists(report_file))
+ with mock.patch(MOCKPATH + 'REPORTED_READY_MARKER_FILE', report_file):
+ self.assertRaises(
+ InvalidMetaDataException,
+ dsa._poll_imds)
+ self.assertEqual(m_report_ready.call_count, 1)
+ self.assertFalse(os.path.exists(report_file))
-@mock.patch(MOCKPATH + 'subp.subp')
-@mock.patch(MOCKPATH + 'util.write_file')
+@mock.patch(MOCKPATH + 'DataSourceAzure._report_ready', mock.MagicMock())
+@mock.patch(MOCKPATH + 'subp.subp', mock.MagicMock())
+@mock.patch(MOCKPATH + 'util.write_file', mock.MagicMock())
@mock.patch(MOCKPATH + 'util.is_FreeBSD')
@mock.patch('cloudinit.sources.helpers.netlink.'
'wait_for_media_disconnect_connect')
@@ -2115,10 +2165,10 @@ class TestAzureDataSourcePreprovisioning(CiTestCase):
self.paths = helpers.Paths({'cloud_dir': tmp})
dsaz.BUILTIN_DS_CONFIG['data_dir'] = self.waagent_d
- def test_poll_imds_returns_ovf_env(self, fake_resp,
+ def test_poll_imds_returns_ovf_env(self, m_request,
m_dhcp, m_net,
m_media_switch,
- m_is_bsd, write_f, subp):
+ m_is_bsd):
"""The _poll_imds method should return the ovf_env.xml."""
m_is_bsd.return_value = False
m_media_switch.return_value = None
@@ -2128,11 +2178,11 @@ class TestAzureDataSourcePreprovisioning(CiTestCase):
url = 'http://{0}/metadata/reprovisiondata?api-version=2017-04-02'
host = "169.254.169.254"
full_url = url.format(host)
- fake_resp.return_value = mock.MagicMock(status_code=200, text="ovf",
+ m_request.return_value = mock.MagicMock(status_code=200, text="ovf",
content="ovf")
dsa = dsaz.DataSourceAzure({}, distro=mock.Mock(), paths=self.paths)
self.assertTrue(len(dsa._poll_imds()) > 0)
- self.assertEqual(fake_resp.call_args_list,
+ self.assertEqual(m_request.call_args_list,
[mock.call(allow_redirects=True,
headers={'Metadata': 'true',
'User-Agent':
@@ -2147,10 +2197,10 @@ class TestAzureDataSourcePreprovisioning(CiTestCase):
static_routes=None)
self.assertEqual(m_net.call_count, 2)
- def test__reprovision_calls__poll_imds(self, fake_resp,
+ def test__reprovision_calls__poll_imds(self, m_request,
m_dhcp, m_net,
m_media_switch,
- m_is_bsd, write_f, subp):
+ m_is_bsd):
"""The _reprovision method should call poll IMDS."""
m_is_bsd.return_value = False
m_media_switch.return_value = None
@@ -2165,7 +2215,7 @@ class TestAzureDataSourcePreprovisioning(CiTestCase):
username = "myuser"
odata = {'HostName': hostname, 'UserName': username}
content = construct_valid_ovf_env(data=odata)
- fake_resp.return_value = mock.MagicMock(status_code=200, text=content,
+ m_request.return_value = mock.MagicMock(status_code=200, text=content,
content=content)
dsa = dsaz.DataSourceAzure({}, distro=mock.Mock(), paths=self.paths)
md, _ud, cfg, _d = dsa._reprovision()
@@ -2182,7 +2232,7 @@ class TestAzureDataSourcePreprovisioning(CiTestCase):
timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS,
url=full_url
),
- fake_resp.call_args_list)
+ m_request.call_args_list)
self.assertEqual(m_dhcp.call_count, 2)
m_net.assert_any_call(
broadcast='192.168.2.255', interface='eth9', ip='192.168.2.9',
diff --git a/tests/unittests/test_datasource/test_azure_helper.py b/tests/unittests/test_datasource/test_azure_helper.py
index 5c31b8be..6e004e34 100644
--- a/tests/unittests/test_datasource/test_azure_helper.py
+++ b/tests/unittests/test_datasource/test_azure_helper.py
@@ -742,7 +742,8 @@ class TestGetMetadataGoalStateXMLAndReportReadyToFabric(CiTestCase):
self.assertEqual(1, shim.return_value.clean_up.call_count)
@mock.patch.object(azure_helper, 'WALinuxAgentShim')
- def test_failure_in_registration_calls_clean_up(self, shim):
+ def test_failure_in_registration_propagates_exc_and_calls_clean_up(
+ self, shim):
shim.return_value.register_with_azure_and_fetch_data.side_effect = (
SentinelException)
self.assertRaises(SentinelException,