From 0a078d626ae2b84f1e33a3e3eb5348e919e039c4 Mon Sep 17 00:00:00 2001
From: Joshua Harlow <harlowja@yahoo-inc.com>
Date: Fri, 8 Jun 2012 18:01:24 -0700
Subject: Large amounts of refactoring.

Now there exists a class which processes the user data down to a mime message and just some small utility methods to walk and determine types.

Large amount of content type cleanups & constant creation.
---
 cloudinit/user_data.py | 387 +++++++++++++++++++++++++------------------------
 1 file changed, 198 insertions(+), 189 deletions(-)

(limited to 'cloudinit/user_data.py')
diff --git a/cloudinit/user_data.py b/cloudinit/user_data.py
index ec914480..f35e5d38 100644
--- a/cloudinit/user_data.py
+++ b/cloudinit/user_data.py
@@ -18,19 +18,23 @@
 #    You should have received a copy of the GNU General Public License
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-import email
+import hashlib
+import os
+import urllib
 
+import email
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from email.mime.base import MIMEBase
+
 import yaml
-import cloudinit
-import cloudinit.util as util
-import hashlib
-import urllib
 
+from cloudinit import url_helper
+from cloudinit import util
 
-starts_with_mappings = {
+
+# Different file beginnings to there content type
+INCLUSION_TYPES_MAP = {
     '#include': 'text/x-include-url',
     '#include-once': 'text/x-include-once-url',
     '#!': 'text/x-shellscript',
@@ -41,170 +45,210 @@ starts_with_mappings = {
     '#cloud-config-archive': 'text/cloud-config-archive',
 }
 
+# Various special content types
+TYPE_NEEDED = ["text/plain", "text/x-not-multipart"]
+INCLUDE_TYPES = ['text/x-include-url', 'text/x-include-once-url']
+ARCHIVE_TYPES = ["text/cloud-config-archive"]
+UNDEF_TYPE = "text/plain"
+ARCHIVE_UNDEF_TYPE = "text/cloud-config"
+NOT_MULTIPART_TYPE = "text/x-not-multipart"
+OCTET_TYPE = 'application/octet-stream'
 
-# if 'string' is compressed return decompressed otherwise return it
-def decomp_str(string):
-    import StringIO
-    import gzip
-    try:
-        uncomp = gzip.GzipFile(None, "rb", 1, StringIO.StringIO(string)).read()
-        return(uncomp)
-    except:
-        return(string)
-
-
-def do_include(content, appendmsg):
-    import os
-    # is just a list of urls, one per line
-    # also support '#include <url here>'
-    includeonce = False
-    for line in content.splitlines():
-        if line == "#include":
-            continue
-        if line == "#include-once":
-            includeonce = True
-            continue
-        if line.startswith("#include-once"):
-            line = line[len("#include-once"):].lstrip()
-            includeonce = True
-        elif line.startswith("#include"):
-            line = line[len("#include"):].lstrip()
-        if line.startswith("#"):
-            continue
-        if line.strip() == "":
-            continue
-
-        # urls cannot not have leading or trailing white space
-        msum = hashlib.md5()  # pylint: disable=E1101
-        msum.update(line.strip())
-        includeonce_filename = "%s/urlcache/%s" % (
-            cloudinit.get_ipath_cur("data"), msum.hexdigest())
-        try:
-            if includeonce and os.path.isfile(includeonce_filename):
-                with open(includeonce_filename, "r") as fp:
-                    content = fp.read()
-            else:
-                content = urllib.urlopen(line).read()
-                if includeonce:
-                    util.write_file(includeonce_filename, content, mode=0600)
-        except Exception:
-            raise
-
-        process_includes(message_from_string(decomp_str(content)), appendmsg)
-
-
-def explode_cc_archive(archive, appendmsg):
-    for ent in yaml.load(archive):
-        # ent can be one of:
-        #  dict { 'filename' : 'value', 'content' : 'value', 'type' : 'value' }
-        #    filename and type not be present
-        # or
-        #  scalar(payload)
-
-        def_type = "text/cloud-config"
-        if isinstance(ent, str):
-            ent = {'content': ent}
-
-        content = ent.get('content', '')
-        mtype = ent.get('type', None)
-        if mtype == None:
-            mtype = type_from_startswith(content, def_type)
-
-        maintype, subtype = mtype.split('/', 1)
-        if maintype == "text":
-            msg = MIMEText(content, _subtype=subtype)
-        else:
-            msg = MIMEBase(maintype, subtype)
-            msg.set_payload(content)
-
-        if 'filename' in ent:
-            msg.add_header('Content-Disposition', 'attachment',
-                           filename=ent['filename'])
-
-        for header in ent.keys():
-            if header in ('content', 'filename', 'type'):
-                continue
-            msg.add_header(header, ent['header'])
+# Sorted longest first
+INCLUSION_SRCH = sorted(INCLUSION_TYPES_MAP.keys(), key=(lambda e: 0 - len(e)))
 
-        _attach_part(appendmsg, msg)
+# Msg header used to track attachments
+ATTACHMENT_FIELD = 'Number-Attachments'
 
+# This will be used to create a filename from a url (or like) entry
+# When we want to make sure a entry isn't included more than once across sessions.
+INCLUDE_ONCE_HASHER = 'md5'
 
-def multi_part_count(outermsg, newcount=None):
-    """
-    Return the number of attachments to this MIMEMultipart by looking
-    at its 'Number-Attachments' header.
-    """
-    nfield = 'Number-Attachments'
-    if nfield not in outermsg:
-        outermsg[nfield] = "0"
+# For those pieces without filenames
+PART_FN_TPL = 'part-%03d'
 
-    if newcount != None:
-        outermsg.replace_header(nfield, str(newcount))
 
-    return(int(outermsg.get('Number-Attachments', 0)))
+class UserDataProcessor(object):
+    def __init__(self, paths):
+        self.paths = paths
 
+    def process(self, blob):
+        base_msg = convert_string(blob)
+        process_msg = MIMEMultipart()
+        self._process_msg(base_msg, process_msg)
+        return process_msg
 
-def _attach_part(outermsg, part):
-    """
-    Attach an part to an outer message. outermsg must be a MIMEMultipart.
-    Modifies a header in outermsg to keep track of number of attachments.
-    """
-    cur = multi_part_count(outermsg)
-    if not part.get_filename(None):
-        part.add_header('Content-Disposition', 'attachment',
-            filename='part-%03d' % (cur + 1))
-    outermsg.attach(part)
-    multi_part_count(outermsg, cur + 1)
+    def _process_msg(self, base_msg, append_msg):
+        for part in base_msg.walk():
+            # multipart/* are just containers
+            if part.get_content_maintype() == 'multipart':
+                continue
+    
+            ctype = None
+            ctype_orig = part.get_content_type()
+            payload = part.get_payload(decode=True)
+    
+            if not ctype_orig:
+                ctype_orig = UNDEF_TYPE
+    
+            if ctype_orig in TYPE_NEEDED:
+                ctype = type_from_starts_with(payload)
+    
+            if ctype is None:
+                ctype = ctype_orig
+    
+            if ctype in INCLUDE_TYPES:
+                self._do_include(payload, append_msg)
+                continue
+    
+            if ctype in ARCHIVE_TYPES:
+                self._explode_archive(payload, append_msg)
+                continue
+    
+            if 'Content-Type' in base_msg:
+                base_msg.replace_header('Content-Type', ctype)
+            else:
+                base_msg['Content-Type'] = ctype
+    
+            self._attach_part(append_msg, part)
+
+    def _get_include_once_filename(self, entry):
+        msum = hashlib.new(INCLUDE_ONCE_HASHER)
+        msum.update(entry)
+        entry_fn = msum.hexdigest()[0:64]  # Don't get to long now
+        return os.path.join(self.paths.get_ipath_cur('data'), 'urlcache', entry_fn)
+
+    def _do_include(self, content, append_msg):
+        # is just a list of urls, one per line
+        # also support '#include <url here>'
+        for line in content.splitlines():
+            includeonce = False
+            if line in ("#include", "#include-once"):
+                continue
+            if line.startswith("#include-once"):
+                line = line[len("#include-once"):].lstrip()
+                includeonce = True
+            elif line.startswith("#include"):
+                line = line[len("#include"):].lstrip()
+            if line.startswith("#"):
+                continue
+            include_url = line.strip()
+            if not include_url:
+                continue
 
+            includeonce_filename = self._get_include_once_filename(include_url)
+            if includeonce and os.path.isfile(includeonce_filename):
+                content = util.load_file(includeonce_filename)
+            else:
+                (content, st) = url_helper.readurl(include_url)
+                if includeonce and url_helper.ok_http_code(st):
+                    util.write_file(includeonce_filename, content, mode=0600)
+                if not url_helper.ok_http_code(st):
+                    content = ''
 
-def type_from_startswith(payload, default=None):
-    # slist is sorted longest first
-    slist = sorted(starts_with_mappings.keys(), key=lambda e: 0 - len(e))
-    for sstr in slist:
-        if payload.startswith(sstr):
-            return(starts_with_mappings[sstr])
-    return default
+            new_msg = convert_string(content)
+            self._process_msg(new_msg, append_msg)
 
+    def _explode_archive(self, archive, append_msg):
+        try:
+            entries = yaml.load(archive)
+        except:
+            entries = []
+        if not isinstance(entries, (list, set)):
+            # TODO raise?
+            entries = []
+
+        for ent in entries:
+            # ent can be one of:
+            #  dict { 'filename' : 'value', 'content' : 'value', 'type' : 'value' }
+            #    filename and type not be present
+            # or
+            #  scalar(payload)
+            if isinstance(ent, str):
+                ent = {'content': ent}
+            if not isinstance(ent, (dict)):
+                # TODO raise?
+                continue
 
-def process_includes(msg, appendmsg=None):
-    if appendmsg == None:
-        appendmsg = MIMEMultipart()
+            content = ent.get('content', '')
+            mtype = ent.get('type')
+            if not mtype:
+                mtype = type_from_starts_with(content, ARCHIVE_UNDEF_TYPE)
 
-    for part in msg.walk():
+            maintype, subtype = mtype.split('/', 1)
+            if maintype == "text":
+                msg = MIMEText(content, _subtype=subtype)
+            else:
+                msg = MIMEBase(maintype, subtype)
+                msg.set_payload(content)
+
+            if 'filename' in ent:
+                msg.add_header('Content-Disposition', 'attachment', filename=ent['filename'])
+
+            for header in ent.keys():
+                if header in ('content', 'filename', 'type'):
+                    continue
+                msg.add_header(header, ent['header'])
+
+            self._attach_part(append_msg, msg)
+
+    def _multi_part_count(self, outer_msg, new_count=None):
+        """
+        Return the number of attachments to this MIMEMultipart by looking
+        at its 'Number-Attachments' header.
+        """
+        if ATTACHMENT_FIELD not in outer_msg:
+            outer_msg[ATTACHMENT_FIELD] = str(0)
+    
+        if new_count is not None:
+            outer_msg.replace_header(ATTACHMENT_FIELD, str(new_count))
+    
+        fetched_count = 0
+        try:
+            fetched_count = int(outer_msg.get(ATTACHMENT_FIELD))
+        except (ValueError, TypeError):
+            outer_msg.replace_header(ATTACHMENT_FIELD, str(fetched_count))
+        return fetched_count
+
+    def _attach_part(self, outer_msg, part):
+        """
+        Attach an part to an outer message. outermsg must be a MIMEMultipart.
+        Modifies a header in the message to keep track of number of attachments.
+        """
+        cur = self._multi_part_count(outer_msg)
+        if not part.get_filename():
+            part.add_header('Content-Disposition', 'attachment', filename=PART_FN_TPL % (cur + 1))
+        outer_msg.attach(part)
+        self._multi_part_count(outer_msg, cur + 1)
+
+
+# Callback is a function that will be called with 
+# (data, content_type, filename, payload)
+def walk(ud_msg, callback, data):
+    partnum = 0
+    for part in ud_msg.walk():
         # multipart/* are just containers
         if part.get_content_maintype() == 'multipart':
             continue
 
-        ctype = None
-        ctype_orig = part.get_content_type()
-
-        payload = part.get_payload(decode=True)
-
-        if ctype_orig in ("text/plain", "text/x-not-multipart"):
-            ctype = type_from_startswith(payload)
-
+        ctype = part.get_content_type()
         if ctype is None:
-            ctype = ctype_orig
+            ctype = OCTET_TYPE
 
-        if ctype in ('text/x-include-url', 'text/x-include-once-url'):
-            do_include(payload, appendmsg)
-            continue
-
-        if ctype == "text/cloud-config-archive":
-            explode_cc_archive(payload, appendmsg)
-            continue
-
-        if 'Content-Type' in msg:
-            msg.replace_header('Content-Type', ctype)
-        else:
-            msg['Content-Type'] = ctype
+        filename = part.get_filename()
+        if not filename:
+            filename = PART_FN_TPL % partnum
 
-        _attach_part(appendmsg, part)
+        callback(data, ctype, filename, part.get_payload(decode=True))
+        partnum = partnum + 1
 
 
-def message_from_string(data, headers=None):
-    if headers is None:
+def convert_string(self, raw_data, headers=None):
+    if not data:
+        data = ''
+    if not headers:
         headers = {}
+    data = util.decomp_str(raw_data)
     if "mime-version:" in data[0:4096].lower():
         msg = email.message_from_string(data)
         for (key, val) in headers.items():
@@ -213,50 +257,15 @@ def message_from_string(data, headers=None):
             else:
                 msg[key] = val
     else:
-        mtype = headers.get("Content-Type", "text/x-not-multipart")
+        mtype = headers.get("Content-Type", NOT_MULTIPART_TYPE)
         maintype, subtype = mtype.split("/", 1)
         msg = MIMEBase(maintype, subtype, *headers)
         msg.set_payload(data)
-
-    return(msg)
+    return msg
 
 
-# this is heavily wasteful, reads through userdata string input
-def preprocess_userdata(data):
-    newmsg = MIMEMultipart()
-    process_includes(message_from_string(decomp_str(data)), newmsg)
-    return(newmsg.as_string())
-
-
-# callback is a function that will be called with (data, content_type,
-# filename, payload)
-def walk_userdata(istr, callback, data=None):
-    partnum = 0
-    for part in message_from_string(istr).walk():
-        # multipart/* are just containers
-        if part.get_content_maintype() == 'multipart':
-            continue
-
-        ctype = part.get_content_type()
-        if ctype is None:
-            ctype = 'application/octet-stream'
-
-        filename = part.get_filename()
-        if not filename:
-            filename = 'part-%03d' % partnum
-
-        callback(data, ctype, filename, part.get_payload(decode=True))
-
-        partnum = partnum + 1
-
-
-if __name__ == "__main__":
-    def main():
-        import sys
-        data = decomp_str(file(sys.argv[1]).read())
-        newmsg = MIMEMultipart()
-        process_includes(message_from_string(data), newmsg)
-        print newmsg
-        print "#found %s parts" % multi_part_count(newmsg)
-
-    main()
+def type_from_starts_with(payload, default=None):
+    for text in INCLUSION_SRCH:
+        if payload.startswith(text):
+            return INCLUSION_TYPES_MAP[text]
+    return default
-- 
cgit v1.2.3