From 0a078d626ae2b84f1e33a3e3eb5348e919e039c4 Mon Sep 17 00:00:00 2001 From: Joshua Harlow Date: Fri, 8 Jun 2012 18:01:24 -0700 Subject: Large amounts of refactoring. Now there exists a class which processes the user data down to a mime message and just some small utility methods to walk and determine types. Large amount of content type cleanups & constant creation. --- cloudinit/user_data.py | 387 +++++++++++++++++++++++++------------------------ 1 file changed, 198 insertions(+), 189 deletions(-) (limited to 'cloudinit/user_data.py') diff --git a/cloudinit/user_data.py b/cloudinit/user_data.py index ec914480..f35e5d38 100644 --- a/cloudinit/user_data.py +++ b/cloudinit/user_data.py @@ -18,19 +18,23 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -import email +import hashlib +import os +import urllib +import email from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText from email.mime.base import MIMEBase + import yaml -import cloudinit -import cloudinit.util as util -import hashlib -import urllib +from cloudinit import url_helper +from cloudinit import util -starts_with_mappings = { + +# Different file beginnings to there content type +INCLUSION_TYPES_MAP = { '#include': 'text/x-include-url', '#include-once': 'text/x-include-once-url', '#!': 'text/x-shellscript', @@ -41,170 +45,210 @@ starts_with_mappings = { '#cloud-config-archive': 'text/cloud-config-archive', } +# Various special content types +TYPE_NEEDED = ["text/plain", "text/x-not-multipart"] +INCLUDE_TYPES = ['text/x-include-url', 'text/x-include-once-url'] +ARCHIVE_TYPES = ["text/cloud-config-archive"] +UNDEF_TYPE = "text/plain" +ARCHIVE_UNDEF_TYPE = "text/cloud-config" +NOT_MULTIPART_TYPE = "text/x-not-multipart" +OCTET_TYPE = 'application/octet-stream' -# if 'string' is compressed return decompressed otherwise return it -def decomp_str(string): - import StringIO - import gzip - try: - uncomp = gzip.GzipFile(None, "rb", 1, StringIO.StringIO(string)).read() - return(uncomp) - except: - return(string) - - -def do_include(content, appendmsg): - import os - # is just a list of urls, one per line - # also support '#include ' - includeonce = False - for line in content.splitlines(): - if line == "#include": - continue - if line == "#include-once": - includeonce = True - continue - if line.startswith("#include-once"): - line = line[len("#include-once"):].lstrip() - includeonce = True - elif line.startswith("#include"): - line = line[len("#include"):].lstrip() - if line.startswith("#"): - continue - if line.strip() == "": - continue - - # urls cannot not have leading or trailing white space - msum = hashlib.md5() # pylint: disable=E1101 - msum.update(line.strip()) - includeonce_filename = "%s/urlcache/%s" % ( - cloudinit.get_ipath_cur("data"), msum.hexdigest()) - try: - if includeonce and os.path.isfile(includeonce_filename): - with open(includeonce_filename, "r") as fp: - content = fp.read() - else: - content = urllib.urlopen(line).read() - if includeonce: - util.write_file(includeonce_filename, content, mode=0600) - except Exception: - raise - - process_includes(message_from_string(decomp_str(content)), appendmsg) - - -def explode_cc_archive(archive, appendmsg): - for ent in yaml.load(archive): - # ent can be one of: - # dict { 'filename' : 'value', 'content' : 'value', 'type' : 'value' } - # filename and type not be present - # or - # scalar(payload) - - def_type = "text/cloud-config" - if isinstance(ent, str): - ent = {'content': ent} - - content = ent.get('content', '') - mtype = ent.get('type', None) - if mtype == None: - mtype = type_from_startswith(content, def_type) - - maintype, subtype = mtype.split('/', 1) - if maintype == "text": - msg = MIMEText(content, _subtype=subtype) - else: - msg = MIMEBase(maintype, subtype) - msg.set_payload(content) - - if 'filename' in ent: - msg.add_header('Content-Disposition', 'attachment', - filename=ent['filename']) - - for header in ent.keys(): - if header in ('content', 'filename', 'type'): - continue - msg.add_header(header, ent['header']) +# Sorted longest first +INCLUSION_SRCH = sorted(INCLUSION_TYPES_MAP.keys(), key=(lambda e: 0 - len(e))) - _attach_part(appendmsg, msg) +# Msg header used to track attachments +ATTACHMENT_FIELD = 'Number-Attachments' +# This will be used to create a filename from a url (or like) entry +# When we want to make sure a entry isn't included more than once across sessions. +INCLUDE_ONCE_HASHER = 'md5' -def multi_part_count(outermsg, newcount=None): - """ - Return the number of attachments to this MIMEMultipart by looking - at its 'Number-Attachments' header. - """ - nfield = 'Number-Attachments' - if nfield not in outermsg: - outermsg[nfield] = "0" +# For those pieces without filenames +PART_FN_TPL = 'part-%03d' - if newcount != None: - outermsg.replace_header(nfield, str(newcount)) - return(int(outermsg.get('Number-Attachments', 0))) +class UserDataProcessor(object): + def __init__(self, paths): + self.paths = paths + def process(self, blob): + base_msg = convert_string(blob) + process_msg = MIMEMultipart() + self._process_msg(base_msg, process_msg) + return process_msg -def _attach_part(outermsg, part): - """ - Attach an part to an outer message. outermsg must be a MIMEMultipart. - Modifies a header in outermsg to keep track of number of attachments. - """ - cur = multi_part_count(outermsg) - if not part.get_filename(None): - part.add_header('Content-Disposition', 'attachment', - filename='part-%03d' % (cur + 1)) - outermsg.attach(part) - multi_part_count(outermsg, cur + 1) + def _process_msg(self, base_msg, append_msg): + for part in base_msg.walk(): + # multipart/* are just containers + if part.get_content_maintype() == 'multipart': + continue + + ctype = None + ctype_orig = part.get_content_type() + payload = part.get_payload(decode=True) + + if not ctype_orig: + ctype_orig = UNDEF_TYPE + + if ctype_orig in TYPE_NEEDED: + ctype = type_from_starts_with(payload) + + if ctype is None: + ctype = ctype_orig + + if ctype in INCLUDE_TYPES: + self._do_include(payload, append_msg) + continue + + if ctype in ARCHIVE_TYPES: + self._explode_archive(payload, append_msg) + continue + + if 'Content-Type' in base_msg: + base_msg.replace_header('Content-Type', ctype) + else: + base_msg['Content-Type'] = ctype + + self._attach_part(append_msg, part) + + def _get_include_once_filename(self, entry): + msum = hashlib.new(INCLUDE_ONCE_HASHER) + msum.update(entry) + entry_fn = msum.hexdigest()[0:64] # Don't get to long now + return os.path.join(self.paths.get_ipath_cur('data'), 'urlcache', entry_fn) + + def _do_include(self, content, append_msg): + # is just a list of urls, one per line + # also support '#include ' + for line in content.splitlines(): + includeonce = False + if line in ("#include", "#include-once"): + continue + if line.startswith("#include-once"): + line = line[len("#include-once"):].lstrip() + includeonce = True + elif line.startswith("#include"): + line = line[len("#include"):].lstrip() + if line.startswith("#"): + continue + include_url = line.strip() + if not include_url: + continue + includeonce_filename = self._get_include_once_filename(include_url) + if includeonce and os.path.isfile(includeonce_filename): + content = util.load_file(includeonce_filename) + else: + (content, st) = url_helper.readurl(include_url) + if includeonce and url_helper.ok_http_code(st): + util.write_file(includeonce_filename, content, mode=0600) + if not url_helper.ok_http_code(st): + content = '' -def type_from_startswith(payload, default=None): - # slist is sorted longest first - slist = sorted(starts_with_mappings.keys(), key=lambda e: 0 - len(e)) - for sstr in slist: - if payload.startswith(sstr): - return(starts_with_mappings[sstr]) - return default + new_msg = convert_string(content) + self._process_msg(new_msg, append_msg) + def _explode_archive(self, archive, append_msg): + try: + entries = yaml.load(archive) + except: + entries = [] + if not isinstance(entries, (list, set)): + # TODO raise? + entries = [] + + for ent in entries: + # ent can be one of: + # dict { 'filename' : 'value', 'content' : 'value', 'type' : 'value' } + # filename and type not be present + # or + # scalar(payload) + if isinstance(ent, str): + ent = {'content': ent} + if not isinstance(ent, (dict)): + # TODO raise? + continue -def process_includes(msg, appendmsg=None): - if appendmsg == None: - appendmsg = MIMEMultipart() + content = ent.get('content', '') + mtype = ent.get('type') + if not mtype: + mtype = type_from_starts_with(content, ARCHIVE_UNDEF_TYPE) - for part in msg.walk(): + maintype, subtype = mtype.split('/', 1) + if maintype == "text": + msg = MIMEText(content, _subtype=subtype) + else: + msg = MIMEBase(maintype, subtype) + msg.set_payload(content) + + if 'filename' in ent: + msg.add_header('Content-Disposition', 'attachment', filename=ent['filename']) + + for header in ent.keys(): + if header in ('content', 'filename', 'type'): + continue + msg.add_header(header, ent['header']) + + self._attach_part(append_msg, msg) + + def _multi_part_count(self, outer_msg, new_count=None): + """ + Return the number of attachments to this MIMEMultipart by looking + at its 'Number-Attachments' header. + """ + if ATTACHMENT_FIELD not in outer_msg: + outer_msg[ATTACHMENT_FIELD] = str(0) + + if new_count is not None: + outer_msg.replace_header(ATTACHMENT_FIELD, str(new_count)) + + fetched_count = 0 + try: + fetched_count = int(outer_msg.get(ATTACHMENT_FIELD)) + except (ValueError, TypeError): + outer_msg.replace_header(ATTACHMENT_FIELD, str(fetched_count)) + return fetched_count + + def _attach_part(self, outer_msg, part): + """ + Attach an part to an outer message. outermsg must be a MIMEMultipart. + Modifies a header in the message to keep track of number of attachments. + """ + cur = self._multi_part_count(outer_msg) + if not part.get_filename(): + part.add_header('Content-Disposition', 'attachment', filename=PART_FN_TPL % (cur + 1)) + outer_msg.attach(part) + self._multi_part_count(outer_msg, cur + 1) + + +# Callback is a function that will be called with +# (data, content_type, filename, payload) +def walk(ud_msg, callback, data): + partnum = 0 + for part in ud_msg.walk(): # multipart/* are just containers if part.get_content_maintype() == 'multipart': continue - ctype = None - ctype_orig = part.get_content_type() - - payload = part.get_payload(decode=True) - - if ctype_orig in ("text/plain", "text/x-not-multipart"): - ctype = type_from_startswith(payload) - + ctype = part.get_content_type() if ctype is None: - ctype = ctype_orig + ctype = OCTET_TYPE - if ctype in ('text/x-include-url', 'text/x-include-once-url'): - do_include(payload, appendmsg) - continue - - if ctype == "text/cloud-config-archive": - explode_cc_archive(payload, appendmsg) - continue - - if 'Content-Type' in msg: - msg.replace_header('Content-Type', ctype) - else: - msg['Content-Type'] = ctype + filename = part.get_filename() + if not filename: + filename = PART_FN_TPL % partnum - _attach_part(appendmsg, part) + callback(data, ctype, filename, part.get_payload(decode=True)) + partnum = partnum + 1 -def message_from_string(data, headers=None): - if headers is None: +def convert_string(self, raw_data, headers=None): + if not data: + data = '' + if not headers: headers = {} + data = util.decomp_str(raw_data) if "mime-version:" in data[0:4096].lower(): msg = email.message_from_string(data) for (key, val) in headers.items(): @@ -213,50 +257,15 @@ def message_from_string(data, headers=None): else: msg[key] = val else: - mtype = headers.get("Content-Type", "text/x-not-multipart") + mtype = headers.get("Content-Type", NOT_MULTIPART_TYPE) maintype, subtype = mtype.split("/", 1) msg = MIMEBase(maintype, subtype, *headers) msg.set_payload(data) - - return(msg) + return msg -# this is heavily wasteful, reads through userdata string input -def preprocess_userdata(data): - newmsg = MIMEMultipart() - process_includes(message_from_string(decomp_str(data)), newmsg) - return(newmsg.as_string()) - - -# callback is a function that will be called with (data, content_type, -# filename, payload) -def walk_userdata(istr, callback, data=None): - partnum = 0 - for part in message_from_string(istr).walk(): - # multipart/* are just containers - if part.get_content_maintype() == 'multipart': - continue - - ctype = part.get_content_type() - if ctype is None: - ctype = 'application/octet-stream' - - filename = part.get_filename() - if not filename: - filename = 'part-%03d' % partnum - - callback(data, ctype, filename, part.get_payload(decode=True)) - - partnum = partnum + 1 - - -if __name__ == "__main__": - def main(): - import sys - data = decomp_str(file(sys.argv[1]).read()) - newmsg = MIMEMultipart() - process_includes(message_from_string(data), newmsg) - print newmsg - print "#found %s parts" % multi_part_count(newmsg) - - main() +def type_from_starts_with(payload, default=None): + for text in INCLUSION_SRCH: + if payload.startswith(text): + return INCLUSION_TYPES_MAP[text] + return default -- cgit v1.2.3