From 96d130e7732f1242d71c65a32412ae56cb229abf Mon Sep 17 00:00:00 2001 From: Barry Warsaw Date: Tue, 27 Jan 2015 15:11:53 -0500 Subject: Respond to review: - Refactor "fully" decoding the payload of a text/* part. In Python 3, decode=True only means to decode according to Content-Transfer-Encoding, not according to any charset in the Content-Type header. So do that. --- cloudinit/handlers/__init__.py | 11 +---------- cloudinit/user_data.py | 12 +----------- cloudinit/util.py | 15 +++++++++++++++ 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/cloudinit/handlers/__init__.py b/cloudinit/handlers/__init__.py index cdccf122..6b7abbcd 100644 --- a/cloudinit/handlers/__init__.py +++ b/cloudinit/handlers/__init__.py @@ -233,16 +233,7 @@ def walk(msg, callback, data): headers = dict(part) LOG.debug(headers) headers['Content-Type'] = ctype - payload = part.get_payload(decode=True) - # In Python 3, decoding the payload will ironically hand us a bytes - # object. 'decode' means to decode according to - # Content-Transfer-Encoding, not according to any charset in the - # Content-Type. So, if we end up with bytes, first try to decode to - # str via CT charset, and failing that, try utf-8 using surrogate - # escapes. - if six.PY3 and isinstance(payload, bytes): - charset = part.get_charset() or 'utf-8' - payload = payload.decode(charset, errors='surrogateescape') + payload = util.fully_decoded_payload(part) callback(data, filename, payload, headers) partnum = partnum + 1 diff --git a/cloudinit/user_data.py b/cloudinit/user_data.py index bf5642a5..5fdc46f2 100644 --- a/cloudinit/user_data.py +++ b/cloudinit/user_data.py @@ -108,17 +108,7 @@ class UserDataProcessor(object): ctype = None ctype_orig = part.get_content_type() - ctype_main = part.get_content_maintype() - payload = part.get_payload(decode=True) - # In Python 3, decoding the payload will ironically hand us a - # bytes object. 'decode' means to decode according to - # Content-Transfer-Encoding, not according to any charset in the - # Content-Type. So, if we end up with bytes, first try to decode - # to str via CT charset, and failing that, try utf-8 using - # surrogate escapes. - if six.PY3 and ctype_main == 'text' and isinstance(payload, bytes): - charset = part.get_charset() or 'utf-8' - payload = payload.decode(charset, errors='surrogateescape') + payload = util.fully_decoded_payload(part) was_compressed = False # When the message states it is of a gzipped content type ensure diff --git a/cloudinit/util.py b/cloudinit/util.py index 8916cc11..3a921afe 100644 --- a/cloudinit/util.py +++ b/cloudinit/util.py @@ -110,6 +110,21 @@ def b64e(source): return b64encode(source).decode('utf-8') +def fully_decoded_payload(part): + # In Python 3, decoding the payload will ironically hand us a bytes object. + # 'decode' means to decode according to Content-Transfer-Encoding, not + # according to any charset in the Content-Type. So, if we end up with + # bytes, first try to decode to str via CT charset, and failing that, try + # utf-8 using surrogate escapes. + cte_payload = part.get_payload(decode=True) + if ( six.PY3 and + part.get_content_maintype() == 'text' and + isinstance(cte_payload, bytes)): + charset = part.get_charset() or 'utf-8' + return cte_payload.decode(charset, errors='surrogateescape') + return cte_payload + + # Path for DMI Data DMI_SYS_PATH = "/sys/class/dmi/id" -- cgit v1.2.3