diff options
author | Johnson Shi <Johnson.Shi@microsoft.com> | 2020-10-16 08:54:38 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-10-16 11:54:38 -0400 |
commit | 8766784f4b1d1f9f6a9094e1268e4accb811ea7f (patch) | |
tree | d654af968ed5b185402802a3cafea7c7357e0b26 | |
parent | c0e8480678e3a9173c9de1271f651fb3ba375f22 (diff) | |
download | vyos-cloud-init-8766784f4b1d1f9f6a9094e1268e4accb811ea7f.tar.gz vyos-cloud-init-8766784f4b1d1f9f6a9094e1268e4accb811ea7f.zip |
DataSourceAzure: write marker file after report ready in preprovisioning (#590)
DataSourceAzure previously writes the preprovisioning
reported ready marker file before it goes through the
report ready workflow. On certain VM instances, the
marker file is successfully written but then reporting
ready fails.
Upon rare VM reboots by the platform, cloud-init sees
that the report ready marker file already exists.
The existence of this marker file tells cloud-init
not to report ready again (because it mistakenly
assumes that it already reported ready in
preprovisioning).
In this scenario, cloud-init instead erroneously
takes the reprovisioning workflow instead of
reporting ready again.
-rwxr-xr-x | cloudinit/sources/DataSourceAzure.py | 23 | ||||
-rw-r--r-- | tests/unittests/test_datasource/test_azure.py | 86 | ||||
-rw-r--r-- | tests/unittests/test_datasource/test_azure_helper.py | 3 |
3 files changed, 90 insertions, 22 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index 8858fbd5..70e32f46 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -720,12 +720,23 @@ class DataSourceAzure(sources.DataSource): self._ephemeral_dhcp_ctx.clean_network() break + report_ready_succeeded = self._report_ready(lease=lease) + if not report_ready_succeeded: + msg = ('Failed reporting ready while in ' + 'the preprovisioning pool.') + report_diagnostic_event(msg, logger_func=LOG.error) + self._ephemeral_dhcp_ctx.clean_network() + raise sources.InvalidMetaDataException(msg) + path = REPORTED_READY_MARKER_FILE LOG.info( "Creating a marker file to report ready: %s", path) util.write_file(path, "{pid}: {time}\n".format( pid=os.getpid(), time=time())) - self._report_ready(lease=lease) + report_diagnostic_event( + 'Successfully created reported ready marker file ' + 'while in the preprovisioning pool.', + logger_func=LOG.debug) report_ready = False with events.ReportEventStack( @@ -773,14 +784,20 @@ class DataSourceAzure(sources.DataSource): return return_val @azure_ds_telemetry_reporter - def _report_ready(self, lease): - """Tells the fabric provisioning has completed """ + def _report_ready(self, lease: dict) -> bool: + """Tells the fabric provisioning has completed. + + @param lease: dhcp lease to use for sending the ready signal. + @return: The success status of sending the ready signal. + """ try: get_metadata_from_fabric(None, lease['unknown-245']) + return True except Exception as e: report_diagnostic_event( "Error communicating with Azure fabric; You may experience " "connectivity issues: %s" % e, logger_func=LOG.warning) + return False def _should_reprovision(self, ret): """Whether or not we should poll IMDS for reprovisioning data. diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py index 3b9456f7..56c1cf18 100644 --- a/tests/unittests/test_datasource/test_azure.py +++ b/tests/unittests/test_datasource/test_azure.py @@ -1161,6 +1161,19 @@ scbus-1 on xpt0 bus 0 dsrc = self._get_ds({'ovfcontent': xml}) dsrc.get_data() + def test_dsaz_report_ready_returns_true_when_report_succeeds( + self): + dsrc = self._get_ds({'ovfcontent': construct_valid_ovf_env()}) + dsrc.ds_cfg['agent_command'] = '__builtin__' + self.assertTrue(dsrc._report_ready(lease=mock.MagicMock())) + + def test_dsaz_report_ready_returns_false_and_does_not_propagate_exc( + self): + dsrc = self._get_ds({'ovfcontent': construct_valid_ovf_env()}) + dsrc.ds_cfg['agent_command'] = '__builtin__' + self.get_metadata_from_fabric.side_effect = Exception + self.assertFalse(dsrc._report_ready(lease=mock.MagicMock())) + def test_exception_fetching_fabric_data_doesnt_propagate(self): """Errors communicating with fabric should warn, but return True.""" dsrc = self._get_ds({'ovfcontent': construct_valid_ovf_env()}) @@ -2041,7 +2054,7 @@ class TestPreprovisioningPollIMDS(CiTestCase): @mock.patch('time.sleep', mock.MagicMock()) @mock.patch(MOCKPATH + 'EphemeralDHCPv4') def test_poll_imds_re_dhcp_on_timeout(self, m_dhcpv4, report_ready_func, - fake_resp, m_media_switch, m_dhcp, + m_request, m_media_switch, m_dhcp, m_net): """The poll_imds will retry DHCP on IMDS timeout.""" report_file = self.tmp_path('report_marker', self.tmp) @@ -2070,7 +2083,7 @@ class TestPreprovisioningPollIMDS(CiTestCase): # Third try should succeed and stop retries or redhcp return mock.MagicMock(status_code=200, text="good", content="good") - fake_resp.side_effect = fake_timeout_once + m_request.side_effect = fake_timeout_once dsa = dsaz.DataSourceAzure({}, distro=mock.Mock(), paths=self.paths) with mock.patch(MOCKPATH + 'REPORTED_READY_MARKER_FILE', report_file): @@ -2080,11 +2093,10 @@ class TestPreprovisioningPollIMDS(CiTestCase): self.assertEqual(3, m_dhcpv4.call_count, 'Expected 3 DHCP calls') self.assertEqual(4, self.tries, 'Expected 4 total reads from IMDS') - def test_poll_imds_report_ready_false(self, - report_ready_func, fake_resp, - m_media_switch, m_dhcp, m_net): - """The poll_imds should not call reporting ready - when flag is false""" + def test_does_not_poll_imds_report_ready_when_marker_file_exists( + self, m_report_ready, m_request, m_media_switch, m_dhcp, m_net): + """poll_imds should not call report ready when the reported ready + marker file exists""" report_file = self.tmp_path('report_marker', self.tmp) write_file(report_file, content='dont run report_ready :)') m_dhcp.return_value = [{ @@ -2095,11 +2107,49 @@ class TestPreprovisioningPollIMDS(CiTestCase): dsa = dsaz.DataSourceAzure({}, distro=mock.Mock(), paths=self.paths) with mock.patch(MOCKPATH + 'REPORTED_READY_MARKER_FILE', report_file): dsa._poll_imds() - self.assertEqual(report_ready_func.call_count, 0) + self.assertEqual(m_report_ready.call_count, 0) + + def test_poll_imds_report_ready_success_writes_marker_file( + self, m_report_ready, m_request, m_media_switch, m_dhcp, m_net): + """poll_imds should write the report_ready marker file if + reporting ready succeeds""" + report_file = self.tmp_path('report_marker', self.tmp) + m_dhcp.return_value = [{ + 'interface': 'eth9', 'fixed-address': '192.168.2.9', + 'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0', + 'unknown-245': '624c3620'}] + m_media_switch.return_value = None + dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) + self.assertFalse(os.path.exists(report_file)) + with mock.patch(MOCKPATH + 'REPORTED_READY_MARKER_FILE', report_file): + dsa._poll_imds() + self.assertEqual(m_report_ready.call_count, 1) + self.assertTrue(os.path.exists(report_file)) + + def test_poll_imds_report_ready_failure_raises_exc_and_doesnt_write_marker( + self, m_report_ready, m_request, m_media_switch, m_dhcp, m_net): + """poll_imds should write the report_ready marker file if + reporting ready succeeds""" + report_file = self.tmp_path('report_marker', self.tmp) + m_dhcp.return_value = [{ + 'interface': 'eth9', 'fixed-address': '192.168.2.9', + 'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0', + 'unknown-245': '624c3620'}] + m_media_switch.return_value = None + m_report_ready.return_value = False + dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths) + self.assertFalse(os.path.exists(report_file)) + with mock.patch(MOCKPATH + 'REPORTED_READY_MARKER_FILE', report_file): + self.assertRaises( + InvalidMetaDataException, + dsa._poll_imds) + self.assertEqual(m_report_ready.call_count, 1) + self.assertFalse(os.path.exists(report_file)) -@mock.patch(MOCKPATH + 'subp.subp') -@mock.patch(MOCKPATH + 'util.write_file') +@mock.patch(MOCKPATH + 'DataSourceAzure._report_ready', mock.MagicMock()) +@mock.patch(MOCKPATH + 'subp.subp', mock.MagicMock()) +@mock.patch(MOCKPATH + 'util.write_file', mock.MagicMock()) @mock.patch(MOCKPATH + 'util.is_FreeBSD') @mock.patch('cloudinit.sources.helpers.netlink.' 'wait_for_media_disconnect_connect') @@ -2115,10 +2165,10 @@ class TestAzureDataSourcePreprovisioning(CiTestCase): self.paths = helpers.Paths({'cloud_dir': tmp}) dsaz.BUILTIN_DS_CONFIG['data_dir'] = self.waagent_d - def test_poll_imds_returns_ovf_env(self, fake_resp, + def test_poll_imds_returns_ovf_env(self, m_request, m_dhcp, m_net, m_media_switch, - m_is_bsd, write_f, subp): + m_is_bsd): """The _poll_imds method should return the ovf_env.xml.""" m_is_bsd.return_value = False m_media_switch.return_value = None @@ -2128,11 +2178,11 @@ class TestAzureDataSourcePreprovisioning(CiTestCase): url = 'http://{0}/metadata/reprovisiondata?api-version=2017-04-02' host = "169.254.169.254" full_url = url.format(host) - fake_resp.return_value = mock.MagicMock(status_code=200, text="ovf", + m_request.return_value = mock.MagicMock(status_code=200, text="ovf", content="ovf") dsa = dsaz.DataSourceAzure({}, distro=mock.Mock(), paths=self.paths) self.assertTrue(len(dsa._poll_imds()) > 0) - self.assertEqual(fake_resp.call_args_list, + self.assertEqual(m_request.call_args_list, [mock.call(allow_redirects=True, headers={'Metadata': 'true', 'User-Agent': @@ -2147,10 +2197,10 @@ class TestAzureDataSourcePreprovisioning(CiTestCase): static_routes=None) self.assertEqual(m_net.call_count, 2) - def test__reprovision_calls__poll_imds(self, fake_resp, + def test__reprovision_calls__poll_imds(self, m_request, m_dhcp, m_net, m_media_switch, - m_is_bsd, write_f, subp): + m_is_bsd): """The _reprovision method should call poll IMDS.""" m_is_bsd.return_value = False m_media_switch.return_value = None @@ -2165,7 +2215,7 @@ class TestAzureDataSourcePreprovisioning(CiTestCase): username = "myuser" odata = {'HostName': hostname, 'UserName': username} content = construct_valid_ovf_env(data=odata) - fake_resp.return_value = mock.MagicMock(status_code=200, text=content, + m_request.return_value = mock.MagicMock(status_code=200, text=content, content=content) dsa = dsaz.DataSourceAzure({}, distro=mock.Mock(), paths=self.paths) md, _ud, cfg, _d = dsa._reprovision() @@ -2182,7 +2232,7 @@ class TestAzureDataSourcePreprovisioning(CiTestCase): timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS, url=full_url ), - fake_resp.call_args_list) + m_request.call_args_list) self.assertEqual(m_dhcp.call_count, 2) m_net.assert_any_call( broadcast='192.168.2.255', interface='eth9', ip='192.168.2.9', diff --git a/tests/unittests/test_datasource/test_azure_helper.py b/tests/unittests/test_datasource/test_azure_helper.py index 5c31b8be..6e004e34 100644 --- a/tests/unittests/test_datasource/test_azure_helper.py +++ b/tests/unittests/test_datasource/test_azure_helper.py @@ -742,7 +742,8 @@ class TestGetMetadataGoalStateXMLAndReportReadyToFabric(CiTestCase): self.assertEqual(1, shim.return_value.clean_up.call_count) @mock.patch.object(azure_helper, 'WALinuxAgentShim') - def test_failure_in_registration_calls_clean_up(self, shim): + def test_failure_in_registration_propagates_exc_and_calls_clean_up( + self, shim): shim.return_value.register_with_azure_and_fetch_data.side_effect = ( SentinelException) self.assertRaises(SentinelException, |