From e27c30748e88409b1646a552f994edf9ed9d017e Mon Sep 17 00:00:00 2001 From: vteratipally <67723486+vteratipally@users.noreply.github.com> Date: Mon, 20 Sep 2021 21:53:05 -0700 Subject: Add retries to DataSourceGCE.py when connecting to GCE (#1005) Add retries to DataSourceGCE when connecting to GCE. Sometimes, when trying to fetch the metadata, cloud-init fails and the fallback datasource NoCloud is used, which is not expected. Add retries to ensure loading of the data source. --- cloudinit/sources/DataSourceGCE.py | 19 +++++++++++++------ cloudinit/sources/__init__.py | 23 ++++++++++++++++++----- cloudinit/sources/tests/test_init.py | 17 +++++++++++------ doc/rtd/topics/datasources/gce.rst | 22 ++++++++++++++++++++++ tools/.github-cla-signers | 1 + 5 files changed, 65 insertions(+), 17 deletions(-) diff --git a/cloudinit/sources/DataSourceGCE.py b/cloudinit/sources/DataSourceGCE.py index 746caddb..ecdc458b 100644 --- a/cloudinit/sources/DataSourceGCE.py +++ b/cloudinit/sources/DataSourceGCE.py @@ -27,8 +27,10 @@ HEADERS = {'Metadata-Flavor': 'Google'} class GoogleMetadataFetcher(object): - def __init__(self, metadata_address): + def __init__(self, metadata_address, num_retries, sec_between_retries): self.metadata_address = metadata_address + self.num_retries = num_retries + self.sec_between_retries = sec_between_retries def get_value(self, path, is_text, is_recursive=False): value = None @@ -36,7 +38,9 @@ class GoogleMetadataFetcher(object): url = self.metadata_address + path if is_recursive: url += '/?recursive=True' - resp = url_helper.readurl(url=url, headers=HEADERS) + resp = url_helper.readurl(url=url, headers=HEADERS, + retries=self.num_retries, + sec_between=self.sec_between_retries) except url_helper.UrlError as exc: msg = "url %s raised exception %s" LOG.debug(msg, path, exc) @@ -68,9 +72,11 @@ class DataSourceGCE(sources.DataSource): self.metadata_address = self.ds_cfg['metadata_url'] def _get_data(self): + url_params = self.get_url_params() ret = 
util.log_time( LOG.debug, 'Crawl of GCE metadata service', - read_md, kwargs={'address': self.metadata_address}) + read_md, kwargs={'address': self.metadata_address, + 'url_params': url_params}) if not ret['success']: if ret['platform_reports_gce']: @@ -176,7 +182,7 @@ def _parse_public_keys(public_keys_data, default_user=None): return public_keys -def read_md(address=None, platform_check=True): +def read_md(address=None, url_params=None, platform_check=True): if address is None: address = MD_V1_URL @@ -203,8 +209,9 @@ def read_md(address=None, platform_check=True): ('instance-data', ('instance/attributes',), False, False, True), ('project-data', ('project/attributes',), False, False, True), ] - - metadata_fetcher = GoogleMetadataFetcher(address) + metadata_fetcher = GoogleMetadataFetcher(address, + url_params.num_retries, + url_params.sec_between_retries) md = {} # Iterate over url_map keys to get metadata items. for (mkey, paths, required, is_text, is_recursive) in url_map: diff --git a/cloudinit/sources/__init__.py b/cloudinit/sources/__init__.py index 54b8240a..d61d280d 100644 --- a/cloudinit/sources/__init__.py +++ b/cloudinit/sources/__init__.py @@ -138,7 +138,8 @@ def redact_sensitive_keys(metadata, redact_value=REDACT_SENSITIVE_VALUE): URLParams = namedtuple( - 'URLParms', ['max_wait_seconds', 'timeout_seconds', 'num_retries']) + 'URLParms', ['max_wait_seconds', 'timeout_seconds', + 'num_retries', 'sec_between_retries']) class DataSource(CloudInitPickleMixin, metaclass=abc.ABCMeta): @@ -175,9 +176,10 @@ class DataSource(CloudInitPickleMixin, metaclass=abc.ABCMeta): NetworkConfigSource.ds) # read_url_params - url_max_wait = -1 # max_wait < 0 means do not wait - url_timeout = 10 # timeout for each metadata url read attempt - url_retries = 5 # number of times to retry url upon 404 + url_max_wait = -1 # max_wait < 0 means do not wait + url_timeout = 10 # timeout for each metadata url read attempt + url_retries = 5 # number of times to retry url upon 404 + 
url_sec_between_retries = 1 # amount of seconds to wait between retries # The datasource defines a set of supported EventTypes during which # the datasource can react to changes in metadata and regenerate @@ -422,7 +424,18 @@ class DataSource(CloudInitPickleMixin, metaclass=abc.ABCMeta): LOG, "Config retries '%s' is not an int, using default '%s'", self.ds_cfg.get('retries'), retries) - return URLParams(max_wait, timeout, retries) + sec_between_retries = self.url_sec_between_retries + try: + sec_between_retries = int(self.ds_cfg.get( + "sec_between_retries", + self.url_sec_between_retries)) + except Exception: + util.logexc( + LOG, "Config sec_between_retries '%s' is not an int," + " using default '%s'", + self.ds_cfg.get("sec_between_retries"), sec_between_retries) + + return URLParams(max_wait, timeout, retries, sec_between_retries) def get_userdata(self, apply_filter=False): if self.userdata is None: diff --git a/cloudinit/sources/tests/test_init.py b/cloudinit/sources/tests/test_init.py index a2b052a6..ae09cb17 100644 --- a/cloudinit/sources/tests/test_init.py +++ b/cloudinit/sources/tests/test_init.py @@ -97,6 +97,8 @@ class TestDataSource(CiTestCase): self.assertEqual(params.max_wait_seconds, self.datasource.url_max_wait) self.assertEqual(params.timeout_seconds, self.datasource.url_timeout) self.assertEqual(params.num_retries, self.datasource.url_retries) + self.assertEqual(params.sec_between_retries, + self.datasource.url_sec_between_retries) def test_datasource_get_url_params_subclassed(self): """Subclasses can override get_url_params defaults.""" @@ -104,7 +106,7 @@ class TestDataSource(CiTestCase): distro = 'distrotest' # generally should be a Distro object datasource = DataSourceTestSubclassNet(sys_cfg, distro, self.paths) expected = (datasource.url_max_wait, datasource.url_timeout, - datasource.url_retries) + datasource.url_retries, datasource.url_sec_between_retries) url_params = datasource.get_url_params() 
self.assertNotEqual(self.datasource.get_url_params(), url_params) self.assertEqual(expected, url_params) @@ -114,14 +116,16 @@ class TestDataSource(CiTestCase): sys_cfg = { 'datasource': { 'MyTestSubclass': { - 'max_wait': '1', 'timeout': '2', 'retries': '3'}}} + 'max_wait': '1', 'timeout': '2', + 'retries': '3', 'sec_between_retries': 4 + }}} datasource = DataSourceTestSubclassNet( sys_cfg, self.distro, self.paths) - expected = (1, 2, 3) + expected = (1, 2, 3, 4) url_params = datasource.get_url_params() self.assertNotEqual( (datasource.url_max_wait, datasource.url_timeout, - datasource.url_retries), + datasource.url_retries, datasource.url_sec_between_retries), url_params) self.assertEqual(expected, url_params) @@ -130,7 +134,8 @@ class TestDataSource(CiTestCase): # Set an override that is below 0 which gets ignored. sys_cfg = {'datasource': {'_undef': {'timeout': '-1'}}} datasource = DataSource(sys_cfg, self.distro, self.paths) - (_max_wait, timeout, _retries) = datasource.get_url_params() + (_max_wait, timeout, _retries, + _sec_between_retries) = datasource.get_url_params() self.assertEqual(0, timeout) def test_datasource_get_url_uses_defaults_on_errors(self): @@ -142,7 +147,7 @@ class TestDataSource(CiTestCase): datasource = DataSource(sys_cfg, self.distro, self.paths) url_params = datasource.get_url_params() expected = (datasource.url_max_wait, datasource.url_timeout, - datasource.url_retries) + datasource.url_retries, datasource.url_sec_between_retries) self.assertEqual(expected, url_params) logs = self.logs.getvalue() expected_logs = [ diff --git a/doc/rtd/topics/datasources/gce.rst b/doc/rtd/topics/datasources/gce.rst index 8406695c..f3590282 100644 --- a/doc/rtd/topics/datasources/gce.rst +++ b/doc/rtd/topics/datasources/gce.rst @@ -15,6 +15,28 @@ to provide ``public-keys``. ``user-data`` and ``user-data-encoding`` can be provided to cloud-init by setting those custom metadata keys for an *instance*. 
+Configuration +------------- +The following configuration can be set for the datasource in system +configuration (in ``/etc/cloud/cloud.cfg`` or ``/etc/cloud/cloud.cfg.d/``). + +The settings that may be configured are: + + * **retries**: The number of retries that should be done for an HTTP request. + This value is used only after metadata_url is selected. (default: 5) + * **sec_between_retries**: The number of seconds to wait between retries when + crawling the metadata service. (default: 1) + + +An example configuration with the default values is provided below: + +.. sourcecode:: yaml + + datasource: + GCE: + retries: 5 + sec_between_retries: 1 + .. _GCE metadata docs: https://cloud.google.com/compute/docs/storing-retrieving-metadata#querying .. vi: textwidth=78 diff --git a/tools/.github-cla-signers b/tools/.github-cla-signers index d9d43ba9..4f668112 100644 --- a/tools/.github-cla-signers +++ b/tools/.github-cla-signers @@ -66,6 +66,7 @@ timothegenzmer tnt-dev tomponline tsanghan +vteratipally Vultaire WebSpider xiachen-rh -- cgit v1.2.3