summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorScott Moser <smoser@ubuntu.com>2011-12-19 12:00:48 -0500
committerScott Moser <smoser@ubuntu.com>2011-12-19 12:00:48 -0500
commit8ea9a9c2c3621b6cb2890abac543885af3b40970 (patch)
tree00ecc38cd9d414e89862ef7db1fcd98acaa20978
parent6b799f1c4384094dabc59a08c8418f9eeec59573 (diff)
downloadvyos-cloud-init-8ea9a9c2c3621b6cb2890abac543885af3b40970.tar.gz
vyos-cloud-init-8ea9a9c2c3621b6cb2890abac543885af3b40970.zip
make DataSourceEc2 more resilient to slow metadata service (LP: #894279)
This increases the timeout for a metadata request to something that should be easily satisfiable (50 seconds), while hopefully still keeping the case of no metadata service in mind. Previously, there was a small timeout and many retries (30) would be done. Now there is: (1) a larger timeout (50 seconds) by default, and (2) retrying until a given "max_wait" is reached (120 seconds by default). The end result is that if we're hitting the timeout, there will only end up being a couple of attempts made. But if the requests are coming back quickly then we'll still make several attempts. There is one DataSourceEc2 config change: 'retries' is no longer used; instead 'max_wait' indicates generally how long it should try to find a metadata service.
-rw-r--r--cloudinit/DataSourceEc2.py166
-rw-r--r--doc/examples/cloud-config-datasources.txt14
2 files changed, 118 insertions, 62 deletions
diff --git a/cloudinit/DataSourceEc2.py b/cloudinit/DataSourceEc2.py
index ea56960b..aee10ffa 100644
--- a/cloudinit/DataSourceEc2.py
+++ b/cloudinit/DataSourceEc2.py
@@ -48,8 +48,10 @@ class DataSourceEc2(DataSource.DataSource):
try:
if not self.wait_for_metadata_service():
return False
+ start = time.time()
self.userdata_raw = boto_utils.get_instance_userdata(self.api_ver, None, self.metadata_address)
self.metadata = boto_utils.get_instance_metadata(self.api_ver, self.metadata_address)
+ log.debug("crawl of metadata service took %ds" % (time.time()-start))
return True
except Exception as e:
print e
@@ -81,37 +83,31 @@ class DataSourceEc2(DataSource.DataSource):
except:
return fallback
-
- def wait_for_metadata_service(self, sleeps = None):
+ def wait_for_metadata_service(self):
mcfg = self.ds_cfg
- if sleeps is None:
- sleeps = 30
- try:
- sleeps = int(mcfg.get("retries",sleeps))
- except Exception as e:
- util.logexc(log)
- log.warn("Failed to get number of sleeps, using %s" % sleeps)
- if sleeps == 0: return False
+ if not hasattr(mcfg, "get"):
+ mcfg = {}
- timeout=3
+ max_wait = 120
try:
- timeout = int(mcfg.get("timeout",timeout))
+ max_wait = int(mcfg.get("max_wait",max_wait))
except Exception as e:
util.logexc(log)
- log.warn("Failed to get timeout, using %s" % timeout)
+ log.warn("Failed to get max wait. using %s" % max_wait)
- sleeptime = 1
+ if max_wait == 0:
+ return False
- def_mdurls = ["http://169.254.169.254", "http://instance-data:8773"]
+ timeout = 50
try:
- mdurls = mcfg.get("metadata_urls", def_mdurls)
+ timeout = int(mcfg.get("timeout",timeout))
except Exception as e:
- mdurls = def_mdurls
util.logexc(log)
- log.warn("Failed to get metadata URLs, using defaults")
+ log.warn("Failed to get timeout, using %s" % timeout)
- starttime = time.time()
+ def_mdurls = ["http://169.254.169.254", "http://instance-data:8773"]
+ mdurls = mcfg.get("metadata_urls", def_mdurls)
# Remove addresses from the list that wont resolve.
filtered = [x for x in mdurls if util.is_resolvable_url(x)]
@@ -126,41 +122,25 @@ class DataSourceEc2(DataSource.DataSource):
log.warn("Empty metadata url list! using default list")
mdurls = def_mdurls
- log.debug("Searching the following metadata urls: %s" % mdurls)
-
- for x in range(sleeps):
- for url in mdurls:
- iurl="%s/%s/meta-data/instance-id" % (url, self.api_ver)
-
- # given 100 sleeps, this ends up total sleep time of 1050 sec
- sleeptime=int(x/5)+1
-
- reason = ""
- try:
- req = urllib2.Request(iurl)
- resp = urllib2.urlopen(req, timeout=timeout)
- if resp.read() != "":
- self.metadata_address = url
- log.debug("Using metadata source: '%s'" % url)
- return True
- reason = "empty data [%s]" % resp.getcode()
- except urllib2.HTTPError as e:
- reason = "http error [%s]" % e.code
- except urllib2.URLError as e:
- reason = "url error [%s]" % e.reason
- except socket.timeout as e:
- reason = "socket timeout [%s]" % e
-
- #not needed? Addresses being checked are displayed above
- #if x == 0:
- # log.warn("waiting for metadata service at %s" % url)
-
- log.warn("'%s' failed: %s" % (url, reason))
- time.sleep(sleeptime)
-
- log.critical("giving up on md after %i seconds\n" %
- int(time.time()-starttime))
- return False
+ urls = [ ]
+ url2base = { False: False }
+ for url in mdurls:
+ cur = "%s/%s/meta-data/instance-id" % (url, self.api_ver)
+ urls.append(cur)
+ url2base[cur] = url
+
+ starttime = time.time()
+ url = wait_for_metadata_service(urls=urls, max_wait=max_wait,
+ timeout=timeout, status_cb=log.warn)
+
+ if url:
+ log.debug("Using metadata source: '%s'" % url2base[url])
+ else:
+ log.critical("giving up on md after %i seconds\n" %
+ int(time.time()-starttime))
+
+ self.metadata_address = url2base[url]
+ return (bool(url))
def device_name_to_device(self, name):
# consult metadata service, that has
@@ -221,6 +201,84 @@ class DataSourceEc2(DataSource.DataSource):
return True
return False
+
+def wait_for_metadata_service(urls, max_wait=None, timeout=None, status_cb=None):
+ """
+ urls: a list of urls to try
+ max_wait: roughly the maximum time to wait before giving up
+ The max time is *actually* len(urls)*timeout as each url will
+ be tried once and given the timeout provided.
+ timeout: the timeout provided to urllib2.urlopen
+ status_cb: call method with string message when a url is not available
+
+ returns: the first url whose response body is non-empty, or False
+ once the wait is up without any url having answered.
+
+ the idea of this routine is to wait for the EC2 metadata service to
+ come up. On both Eucalyptus and EC2 we have seen the case where
+ the instance hit the MD before the MD service was up. EC2 seems
+ to have permanently fixed this, though.
+
+ In openstack, the metadata service might be painfully slow, and
+ unable to avoid hitting a timeout of even up to 10 seconds or more
+ (LP: #894279) for a simple GET.
+
+ Offset those needs with the need to not hang forever (and block boot)
+ on a system where cloud-init is configured to look for EC2 Metadata
+ service but is not going to find one. It is possible that the instance
+ data host (169.254.169.254) may be firewalled off entirely for a system,
+ meaning that the connection will block forever unless a timeout is set.
+ """
+ starttime = time.time()
+
+ sleeptime = 1
+ # NOTE(review): timeout_orig is never read again in this function
+ timeout_orig = timeout
+
+ if status_cb == None:
+ def status_cb(msg): return
+
+ def timeup(max_wait, starttime):
+ return((max_wait <= 0 or max_wait == None) or
+ (time.time()-starttime > max_wait))
+
+ loop_n = 0
+ while True:
+ # back off: the pause between passes grows by 1s every 5 passes
+ sleeptime=int(loop_n/5)+1
+ for url in urls:
+ now = time.time()
+ # the first pass always tries every url with the full timeout
+ if loop_n != 0:
+ if timeup(max_wait, starttime):
+ break
+ if timeout and (now + timeout > (starttime + max_wait)):
+ # shorten timeout to not run way over max_wait
+ timeout = int((starttime + max_wait) - now)
+
+ reason = ""
+ try:
+ req = urllib2.Request(url)
+ resp = urllib2.urlopen(req, timeout=timeout)
+ if resp.read() != "":
+ return url
+ reason = "empty data [%s]" % resp.getcode()
+ except urllib2.HTTPError as e:
+ reason = "http error [%s]" % e.code
+ except urllib2.URLError as e:
+ reason = "url error [%s]" % e.reason
+ except socket.timeout as e:
+ reason = "socket timeout [%s]" % e
+ except Exception as e:
+ reason = "unexpected error [%s]" % e
+
+ # NOTE(review): `log` is not a local here; presumably the module-level
+ # logger, so this guard does not depend on status_cb -- confirm
+ if log:
+ status_cb("'%s' failed [%s/%ss]: %s" %
+ (url, int(time.time()-starttime), max_wait, reason))
+
+ if timeup(max_wait, starttime):
+ break
+
+ loop_n = loop_n + 1
+ time.sleep(sleeptime)
+
+ return False
+
+
datasources = [
( DataSourceEc2, ( DataSource.DEP_FILESYSTEM , DataSource.DEP_NETWORK ) ),
]
diff --git a/doc/examples/cloud-config-datasources.txt b/doc/examples/cloud-config-datasources.txt
index c6708a2c..b86c5ba6 100644
--- a/doc/examples/cloud-config-datasources.txt
+++ b/doc/examples/cloud-config-datasources.txt
@@ -2,16 +2,14 @@
datasource:
# Ec2
Ec2:
- # timeout: the timeout value for attempt at metadata service
- timeout : 2
- # the number of tries that should be attempted at the metadata service
- # after each try, a sleep of int(try_number/5)+1 is done
- # default is 30
- retries : 30
+ # timeout: the timeout value for a request at metadata service
+ timeout : 50
+ # The length in seconds to wait before giving up on the metadata
+ # service. The actual total wait could be up to
+ # len(resolvable_metadata_urls)*timeout
+ max_wait : 120
# metadata_urls: a list of URLs to check for metadata services
metadata_urls:
- http://169.254.169.254:80
- http://instance-data:8773
-
-