From c818ddba06ff7d486a085edae531896156c14e9d Mon Sep 17 00:00:00 2001 From: Joshua Harlow Date: Fri, 19 Jul 2013 15:49:35 -0700 Subject: Add the ability to decompress MIME gzip. Instead of being restricted to only gzip compressing the overall mime segment or individual included segments, allow for each mime segment to be gzip compressed. LP: #1203203 --- cloudinit/user_data.py | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/cloudinit/user_data.py b/cloudinit/user_data.py index df069ff8..23c31fde 100644 --- a/cloudinit/user_data.py +++ b/cloudinit/user_data.py @@ -48,6 +48,18 @@ ARCHIVE_TYPES = ["text/cloud-config-archive"] UNDEF_TYPE = "text/plain" ARCHIVE_UNDEF_TYPE = "text/cloud-config" +# This seems to hit most of the gzip possible content types. +DECOMP_TYPES = [ + 'application/gzip', + 'application/gzip-compressed', + 'application/gzipped', + 'application/x-compress', + 'application/x-compressed', + 'application/x-gunzip', + 'application/x-gzip', + 'application/x-gzip-compressed', +] + # Msg header used to track attachments ATTACHMENT_FIELD = 'Number-Attachments' @@ -67,6 +79,13 @@ class UserDataProcessor(object): return accumulating_msg def _process_msg(self, base_msg, append_msg): + + def replace_header(part, key, value): + if key in part: + part.replace_header(key, value) + else: + part[key] = value + for part in base_msg.walk(): if is_skippable(part): continue @@ -75,6 +94,18 @@ class UserDataProcessor(object): ctype_orig = part.get_content_type() payload = part.get_payload(decode=True) + # When the message states it is of a gzipped content type ensure + # that we attempt to decode said payload so that the decompressed + # data can be examined (instead of the compressed data). + if ctype_orig in DECOMP_TYPES: + try: + payload = util.decomp_gzip(payload, quiet=False) + ctype_orig = UNDEF_TYPE + # TODO(harlowja): should we also set the payload to the + # decompressed value?? + except util.DecompressionError: + pass + if not ctype_orig: ctype_orig = UNDEF_TYPE @@ -85,10 +116,7 @@ class UserDataProcessor(object): ctype = ctype_orig if ctype != ctype_orig: - if CONTENT_TYPE in part: - part.replace_header(CONTENT_TYPE, ctype) - else: - part[CONTENT_TYPE] = ctype + replace_header(part, CONTENT_TYPE, ctype) if ctype in INCLUDE_TYPES: self._do_include(payload, append_msg) @@ -100,10 +128,7 @@ class UserDataProcessor(object): # Should this be happening, shouldn't # the part header be modified and not the base? - if CONTENT_TYPE in base_msg: - base_msg.replace_header(CONTENT_TYPE, ctype) - else: - base_msg[CONTENT_TYPE] = ctype + replace_header(base_msg, CONTENT_TYPE, ctype) self._attach_part(append_msg, part) -- cgit v1.2.3 From 7880588f804ea035f03eba9335af71f3322dab97 Mon Sep 17 00:00:00 2001 From: Joshua Harlow Date: Sat, 20 Jul 2013 14:34:00 -0700 Subject: Ensure we reset the part after decompression. --- cloudinit/user_data.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/cloudinit/user_data.py b/cloudinit/user_data.py index 23c31fde..97853e51 100644 --- a/cloudinit/user_data.py +++ b/cloudinit/user_data.py @@ -23,8 +23,10 @@ import os import email + from email.mime.base import MIMEBase from email.mime.multipart import MIMEMultipart +from email.mime.nonmultipart import MIMENonMultipart from email.mime.text import MIMEText from cloudinit import handlers @@ -80,6 +82,10 @@ class UserDataProcessor(object): def _process_msg(self, base_msg, append_msg): + def find_ctype(payload): + ctype = handlers.type_from_starts_with(payload) + return ctype + def replace_header(part, key, value): if key in part: part.replace_header(key, value) @@ -93,6 +99,7 @@ class UserDataProcessor(object): ctype = None ctype_orig = part.get_content_type() payload = part.get_payload(decode=True) + was_compressed = False # When the message states it is of a gzipped content type ensure # that we attempt to decode said payload so that the decompressed @@ -100,21 +107,32 @@ class UserDataProcessor(object): if ctype_orig in DECOMP_TYPES: try: payload = util.decomp_gzip(payload, quiet=False) - ctype_orig = UNDEF_TYPE - # TODO(harlowja): should we also set the payload to the - # decompressed value?? - except util.DecompressionError: - pass + # At this point we don't know what the content-type is + # since we just decompressed it. + ctype_orig = None + was_compressed = True + except util.DecompressionError as e: + LOG.warn("Failed decompressing payload from %s of length" + " %s due to: %s", ctype_orig, len(payload), e) + continue + # Attempt to figure out the payloads content-type if not ctype_orig: ctype_orig = UNDEF_TYPE - if ctype_orig in TYPE_NEEDED: - ctype = handlers.type_from_starts_with(payload) - + ctype = find_ctype(payload) if ctype is None: ctype = ctype_orig + # In the case where the data was compressed, we want to make sure + # that we create a new message that contains the found content + # type with the uncompressed content since later traversals of the + # messages will expect a part not compressed. + if was_compressed: + maintype, subtype = ctype.split("/", 1) + part = MIMENonMultipart(maintype, subtype) + part.set_payload(payload) + if ctype != ctype_orig: replace_header(part, CONTENT_TYPE, ctype) @@ -126,7 +144,7 @@ class UserDataProcessor(object): self._explode_archive(payload, append_msg) continue - # Should this be happening, shouldn't + # TODO(harlowja): Should this be happening, shouldn't # the part header be modified and not the base? replace_header(base_msg, CONTENT_TYPE, ctype) -- cgit v1.2.3 From 64c69053c11385cc43b6c628dbe8a1bf28ccc49c Mon Sep 17 00:00:00 2001 From: Joshua Harlow Date: Sat, 20 Jul 2013 14:57:42 -0700 Subject: Keep filename from original part. --- cloudinit/user_data.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cloudinit/user_data.py b/cloudinit/user_data.py index 97853e51..e17bcaee 100644 --- a/cloudinit/user_data.py +++ b/cloudinit/user_data.py @@ -130,8 +130,12 @@ class UserDataProcessor(object): # messages will expect a part not compressed. if was_compressed: maintype, subtype = ctype.split("/", 1) - part = MIMENonMultipart(maintype, subtype) - part.set_payload(payload) + n_part = MIMENonMultipart(maintype, subtype) + n_part.set_payload(payload) + if part.get_filename(): + n_part.add_header('Content-Disposition', 'attachment', + filename=part.get_filename()) + part = n_part if ctype != ctype_orig: replace_header(part, CONTENT_TYPE, ctype) -- cgit v1.2.3 From 432778cf2890c19940f29f47f9efc2cb8e784f43 Mon Sep 17 00:00:00 2001 From: Joshua Harlow Date: Sat, 20 Jul 2013 16:34:39 -0700 Subject: Unify filename, header replacement. --- cloudinit/user_data.py | 56 ++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/cloudinit/user_data.py b/cloudinit/user_data.py index e17bcaee..454f3c06 100644 --- a/cloudinit/user_data.py +++ b/cloudinit/user_data.py @@ -70,6 +70,19 @@ ATTACHMENT_FIELD = 'Number-Attachments' EXAMINE_FOR_LAUNCH_INDEX = ["text/cloud-config"] +def _replace_header(msg, key, value): + del msg[key] + msg[key] = value + + +def _set_filename(msg, filename): + if not filename: + return + del msg['Content-Disposition'] + msg.add_header('Content-Disposition', + 'attachment', filename=str(filename)) + + class UserDataProcessor(object): def __init__(self, paths): self.paths = paths @@ -83,14 +96,7 @@ class UserDataProcessor(object): def _process_msg(self, base_msg, append_msg): def find_ctype(payload): - ctype = handlers.type_from_starts_with(payload) - return ctype - - def replace_header(part, key, value): - if key in part: - part.replace_header(key, value) - else: - part[key] = value + return handlers.type_from_starts_with(payload) for part in base_msg.walk(): if is_skippable(part): @@ -132,13 +138,17 @@ class UserDataProcessor(object): maintype, subtype = ctype.split("/", 1) n_part = MIMENonMultipart(maintype, subtype) n_part.set_payload(payload) - if part.get_filename(): - n_part.add_header('Content-Disposition', 'attachment', - filename=part.get_filename()) + # Copy various headers from the old part to the new one, + # but don't include all the headers since some are not useful + # after decoding and decompression. + _set_filename(n_part, part.get_filename()) + for h in ('Launch-Index',): + if h in part: + _replace_header(n_part, h, str(part[h])) part = n_part if ctype != ctype_orig: - replace_header(part, CONTENT_TYPE, ctype) + _replace_header(part, CONTENT_TYPE, ctype) if ctype in INCLUDE_TYPES: self._do_include(payload, append_msg) @@ -150,7 +160,7 @@ class UserDataProcessor(object): # TODO(harlowja): Should this be happening, shouldn't # the part header be modified and not the base? - replace_header(base_msg, CONTENT_TYPE, ctype) + _replace_header(base_msg, CONTENT_TYPE, ctype) self._attach_part(append_msg, part) @@ -185,8 +195,7 @@ class UserDataProcessor(object): def _process_before_attach(self, msg, attached_id): if not msg.get_filename(): - msg.add_header('Content-Disposition', - 'attachment', filename=PART_FN_TPL % (attached_id)) + _set_filename(msg, PART_FN_TPL % (attached_id)) self._attach_launch_index(msg) def _do_include(self, content, append_msg): @@ -264,13 +273,15 @@ class UserDataProcessor(object): msg.set_payload(content) if 'filename' in ent: - msg.add_header('Content-Disposition', - 'attachment', filename=ent['filename']) + _set_filename(msg, ent['filename']) if 'launch-index' in ent: msg.add_header('Launch-Index', str(ent['launch-index'])) for header in list(ent.keys()): - if header in ('content', 'filename', 'type', 'launch-index'): + if header.lower() in ('content', 'filename', 'type', + 'launch-index', 'content-disposition', + ATTACHMENT_FIELD.lower(), + CONTENT_TYPE.lower()): continue msg.add_header(header, ent[header]) @@ -285,13 +296,13 @@ class UserDataProcessor(object): outer_msg[ATTACHMENT_FIELD] = '0' if new_count is not None: - outer_msg.replace_header(ATTACHMENT_FIELD, str(new_count)) + _replace_header(outer_msg, ATTACHMENT_FIELD, str(new_count)) fetched_count = 0 try: fetched_count = int(outer_msg.get(ATTACHMENT_FIELD)) except (ValueError, TypeError): - outer_msg.replace_header(ATTACHMENT_FIELD, str(fetched_count)) + _replace_header(outer_msg, ATTACHMENT_FIELD, str(fetched_count)) return fetched_count def _attach_part(self, outer_msg, part): @@ -323,10 +334,7 @@ def convert_string(raw_data, headers=None): if "mime-version:" in data[0:4096].lower(): msg = email.message_from_string(data) for (key, val) in headers.iteritems(): - if key in msg: - msg.replace_header(key, val) - else: - msg[key] = val + _replace_header(msg, key, val) else: mtype = headers.get(CONTENT_TYPE, NOT_MULTIPART_TYPE) maintype, subtype = mtype.split("/", 1) -- cgit v1.2.3 From 251317563bd36a339e6fa7a08a0fc05b5ee975a4 Mon Sep 17 00:00:00 2001 From: Joshua Harlow Date: Sat, 20 Jul 2013 16:40:11 -0700 Subject: Just check the filename existing. --- cloudinit/user_data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cloudinit/user_data.py b/cloudinit/user_data.py index 454f3c06..d49ea094 100644 --- a/cloudinit/user_data.py +++ b/cloudinit/user_data.py @@ -76,8 +76,6 @@ def _replace_header(msg, key, value): def _set_filename(msg, filename): - if not filename: - return del msg['Content-Disposition'] msg.add_header('Content-Disposition', 'attachment', filename=str(filename)) @@ -141,7 +139,8 @@ class UserDataProcessor(object): # Copy various headers from the old part to the new one, # but don't include all the headers since some are not useful # after decoding and decompression. - _set_filename(n_part, part.get_filename()) + if part.get_filename(): + _set_filename(n_part, part.get_filename()) for h in ('Launch-Index',): if h in part: _replace_header(n_part, h, str(part[h])) -- cgit v1.2.3