diff options
author | Moustafa Moustafa <momousta@microsoft.com> | 2020-08-20 12:04:03 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-08-20 13:04:03 -0600 |
commit | d941c7f7846e0216873384044d20f4b6723697c5 (patch) | |
tree | 7eeadd04459d2313f628834658c9449a2304fe25 /cloudinit/sources | |
parent | 79a8ce7e714ae1686c10bff77612eab0f6eccc95 (diff) | |
download | vyos-cloud-init-d941c7f7846e0216873384044d20f4b6723697c5.tar.gz vyos-cloud-init-d941c7f7846e0216873384044d20f4b6723697c5.zip |
Pushing cloud-init log to the KVP (#529)
Push the cloud-init.log file (Up to 500KB at once) to the KVP before reporting ready to the Azure platform.
Based on the analysis done on a large sample of cloud-init.log files, Here's the statistics collected on the log file size:
P50 P90 P95 P99 P99.9 P99.99
137K 423K 537K 3.5MB 6MB 16MB
This change limits the size of cloud-init.log file data that gets dumped to KVP to 500KB. So for ~95% of the cases, the whole log file will be dumped and for the remaining ~5%, we will get the last 500KB of the cloud-init.log file.
To asses the performance of the 500KB limit, 250 VM were deployed with a 500KB cloud-init.log file and the time taken to compress, encode and dump the entries to KVP was measured. Here's the time in milliseconds percentiles:
P50 P99 P999
75.705 232.701 1169.636
Another 250 VMs were deployed with this logic dumping their normal cloud-init.log file to KVP, the same timing was measured as above. Here's the time in milliseconds percentiles:
P50 P99 P999
1.88 5.277 6.992
Added excluded_handlers to the report_event function to be able to opt-out from reporting the events of the compressed cloud-init.log file to the cloud-init.log file.
The KVP break_down logic had a bug, where it will reuse the same key for all the split chunks of KVP which results in overwriting the split KVPs by the last one when consumed by Hyper-V. I added the split chunk index as a differentiator to the KVP key.
The Hyper-V consumes the KVPs from the KVP file as chunks whose key is 512KB and value is 2048KB but the Azure platform expects the value to be 1024KB, thus I introduced the Azure value limit.
Diffstat (limited to 'cloudinit/sources')
-rwxr-xr-x | cloudinit/sources/DataSourceAzure.py | 12 | ||||
-rwxr-xr-x | cloudinit/sources/helpers/azure.py | 58 |
2 files changed, 64 insertions, 6 deletions
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index a3810ca8..f572a4ef 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -35,7 +35,8 @@ from cloudinit.sources.helpers.azure import ( report_diagnostic_event, EphemeralDHCPv4WithReporting, is_byte_swapped, - dhcp_log_cb) + dhcp_log_cb, + push_log_to_kvp) LOG = logging.getLogger(__name__) @@ -789,9 +790,12 @@ class DataSourceAzure(sources.DataSource): @azure_ds_telemetry_reporter def activate(self, cfg, is_new_instance): - address_ephemeral_resize(is_new_instance=is_new_instance, - preserve_ntfs=self.ds_cfg.get( - DS_CFG_KEY_PRESERVE_NTFS, False)) + try: + address_ephemeral_resize(is_new_instance=is_new_instance, + preserve_ntfs=self.ds_cfg.get( + DS_CFG_KEY_PRESERVE_NTFS, False)) + finally: + push_log_to_kvp(self.sys_cfg['def_log_file']) return @property diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py index 6df28ccf..6156c75b 100755 --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -1,5 +1,5 @@ # This file is part of cloud-init. See LICENSE file for license information. - +import base64 import json import logging import os @@ -8,7 +8,9 @@ import socket import struct import time import textwrap +import zlib +from cloudinit.settings import CFG_BUILTIN from cloudinit.net import dhcp from cloudinit import stages from cloudinit import temp_utils @@ -33,7 +35,14 @@ DEFAULT_WIRESERVER_ENDPOINT = "a8:3f:81:10" BOOT_EVENT_TYPE = 'boot-telemetry' SYSTEMINFO_EVENT_TYPE = 'system-info' DIAGNOSTIC_EVENT_TYPE = 'diagnostic' - +COMPRESSED_EVENT_TYPE = 'compressed' +# Maximum number of bytes of the cloud-init.log file that can be dumped to KVP +# at once. This number is based on the analysis done on a large sample of +# cloud-init.log files where the P95 of the file sizes was 537KB and the time +# consumed to dump 500KB file was (P95:76, P99:233, P99.9:1170) in ms +MAX_LOG_TO_KVP_LENGTH = 512000 +# Marker file to indicate whether cloud-init.log is pushed to KVP +LOG_PUSHED_TO_KVP_MARKER_FILE = '/var/lib/cloud/data/log_pushed_to_kvp' azure_ds_reporter = events.ReportEventStack( name="azure-ds", description="initialize reporter for azure ds", @@ -177,6 +186,49 @@ def report_diagnostic_event(str): return evt +def report_compressed_event(event_name, event_content): + """Report a compressed event""" + compressed_data = base64.encodebytes(zlib.compress(event_content)) + event_data = {"encoding": "gz+b64", + "data": compressed_data.decode('ascii')} + evt = events.ReportingEvent( + COMPRESSED_EVENT_TYPE, event_name, + json.dumps(event_data), + events.DEFAULT_EVENT_ORIGIN) + events.report_event(evt, + excluded_handler_types={"log", "print", "webhook"}) + + # return the event for unit testing purpose + return evt + + +@azure_ds_telemetry_reporter +def push_log_to_kvp(file_name=CFG_BUILTIN['def_log_file']): + """Push a portion of cloud-init.log file or the whole file to KVP + based on the file size. + If called more than once, it skips pushing the log file to KVP again.""" + + log_pushed_to_kvp = bool(os.path.isfile(LOG_PUSHED_TO_KVP_MARKER_FILE)) + if log_pushed_to_kvp: + report_diagnostic_event("cloud-init.log is already pushed to KVP") + return + + LOG.debug("Dumping cloud-init.log file to KVP") + try: + with open(file_name, "rb") as f: + f.seek(0, os.SEEK_END) + seek_index = max(f.tell() - MAX_LOG_TO_KVP_LENGTH, 0) + report_diagnostic_event( + "Dumping last {} bytes of cloud-init.log file to KVP".format( + f.tell() - seek_index)) + f.seek(seek_index, os.SEEK_SET) + report_compressed_event("cloud-init.log", f.read()) + util.write_file(LOG_PUSHED_TO_KVP_MARKER_FILE, '') + except Exception as ex: + report_diagnostic_event("Exception when dumping log file: %s" % + repr(ex)) + + @contextmanager def cd(newdir): prevdir = os.getcwd() @@ -474,6 +526,8 @@ class GoalStateHealthReporter: @azure_ds_telemetry_reporter def _post_health_report(self, document): + push_log_to_kvp() + # Whenever report_diagnostic_event(diagnostic_msg) is invoked in code, # the diagnostic messages are written to special files # (/var/opt/hyperv/.kvp_pool_*) as Hyper-V KVP messages. |