diff options
author | Sam Gilson <t-sagils@microsoft.com> | 2019-07-15 21:50:33 +0000 |
---|---|---|
committer | Server Team CI Bot <josh.powers+server-team-bot@canonical.com> | 2019-07-15 21:50:33 +0000 |
commit | a24550aee4c7282cd3624bf63f9501444e517678 (patch) | |
tree | 9dd5d148266673021a56f29c4282b61e6f372898 /cloudinit | |
parent | a6faf3acef02bd8cd4d46ac9efeebf24b3f21d81 (diff) | |
download | vyos-cloud-init-a24550aee4c7282cd3624bf63f9501444e517678.tar.gz vyos-cloud-init-a24550aee4c7282cd3624bf63f9501444e517678.zip |
Cloud-init analyze module: Added ability to analyze boot events.
This branch introduces a new command line feature for cloud-init.
Currently, the cloud-init module has the capability to analyze events in
cloud-init.log in three ways: 'show', 'blame', 'dump'.
These changes add a fourth capability, called 'boot'.
Running the command 'cloud-init analyze boot' will provide the user three
timestamps.
1) Timestamp for when the kernel starts initializing.
2) Timestamp for when the kernel finishes its initialization.
3) Timestamp for when systemd activates cloud-init.
This feature enables cloud-init users to analyze different boot phases.
This would aid in debugging performance issues related
to cloud-init startup or tracking regression.
Diffstat (limited to 'cloudinit')
-rw-r--r-- | cloudinit/analyze/__main__.py | 88 | ||||
-rw-r--r-- | cloudinit/analyze/show.py | 202 | ||||
-rw-r--r-- | cloudinit/analyze/tests/test_boot.py | 170 |
3 files changed, 448 insertions, 12 deletions
diff --git a/cloudinit/analyze/__main__.py b/cloudinit/analyze/__main__.py index f8613656..99e5c203 100644 --- a/cloudinit/analyze/__main__.py +++ b/cloudinit/analyze/__main__.py @@ -7,7 +7,7 @@ import re import sys from cloudinit.util import json_dumps - +from datetime import datetime from . import dump from . import show @@ -52,9 +52,93 @@ def get_parser(parser=None): dest='outfile', default='-', help='specify where to write output. ') parser_dump.set_defaults(action=('dump', analyze_dump)) + parser_boot = subparsers.add_parser( + 'boot', help='Print list of boot times for kernel and cloud-init') + parser_boot.add_argument('-i', '--infile', action='store', + dest='infile', default='/var/log/cloud-init.log', + help='specify where to read input. ') + parser_boot.add_argument('-o', '--outfile', action='store', + dest='outfile', default='-', + help='specify where to write output.') + parser_boot.set_defaults(action=('boot', analyze_boot)) return parser +def analyze_boot(name, args): + """Report a list of how long different boot operations took. + + For Example: + -- Most Recent Boot Record -- + Kernel Started at: <time> + Kernel ended boot at: <time> + Kernel time to boot (seconds): <time> + Cloud-init activated by systemd at: <time> + Time between Kernel end boot and Cloud-init activation (seconds):<time> + Cloud-init start: <time> + """ + infh, outfh = configure_io(args) + kernel_info = show.dist_check_timestamp() + status_code, kernel_start, kernel_end, ci_sysd_start = \ + kernel_info + kernel_start_timestamp = datetime.utcfromtimestamp(kernel_start) + kernel_end_timestamp = datetime.utcfromtimestamp(kernel_end) + ci_sysd_start_timestamp = datetime.utcfromtimestamp(ci_sysd_start) + try: + last_init_local = \ + [e for e in _get_events(infh) if e['name'] == 'init-local' and + 'starting search' in e['description']][-1] + ci_start = datetime.utcfromtimestamp(last_init_local['timestamp']) + except IndexError: + ci_start = 'Could not find init-local log-line in cloud-init.log' + status_code = show.FAIL_CODE + + FAILURE_MSG = 'Your Linux distro or container does not support this ' \ + 'functionality.\n' \ + 'You must be running a Kernel Telemetry supported ' \ + 'distro.\nPlease check ' \ + 'https://cloudinit.readthedocs.io/en/latest' \ + '/topics/analyze.html for more ' \ + 'information on supported distros.\n' + + SUCCESS_MSG = '-- Most Recent Boot Record --\n' \ + ' Kernel Started at: {k_s_t}\n' \ + ' Kernel ended boot at: {k_e_t}\n' \ + ' Kernel time to boot (seconds): {k_r}\n' \ + ' Cloud-init activated by systemd at: {ci_sysd_t}\n' \ + ' Time between Kernel end boot and Cloud-init ' \ + 'activation (seconds): {bt_r}\n' \ + ' Cloud-init start: {ci_start}\n' + + CONTAINER_MSG = '-- Most Recent Container Boot Record --\n' \ + ' Container started at: {k_s_t}\n' \ + ' Cloud-init activated by systemd at: {ci_sysd_t}\n' \ + ' Cloud-init start: {ci_start}\n' \ + + status_map = { + show.FAIL_CODE: FAILURE_MSG, + show.CONTAINER_CODE: CONTAINER_MSG, + show.SUCCESS_CODE: SUCCESS_MSG + } + + kernel_runtime = kernel_end - kernel_start + between_process_runtime = ci_sysd_start - kernel_end + + kwargs = { + 'k_s_t': kernel_start_timestamp, + 'k_e_t': kernel_end_timestamp, + 'k_r': kernel_runtime, + 'bt_r': between_process_runtime, + 'k_e': kernel_end, + 'k_s': kernel_start, + 'ci_sysd': ci_sysd_start, + 'ci_sysd_t': ci_sysd_start_timestamp, + 'ci_start': ci_start + } + + outfh.write(status_map[status_code].format(**kwargs)) + return status_code + + def analyze_blame(name, args): """Report a list of records sorted by largest time delta. @@ -119,7 +203,7 @@ def analyze_dump(name, args): def _get_events(infile): rawdata = None - events, rawdata = show.load_events(infile, None) + events, rawdata = show.load_events_infile(infile) if not events: events, _ = dump.dump_events(rawdata=rawdata) return events diff --git a/cloudinit/analyze/show.py b/cloudinit/analyze/show.py index 3e778b8b..511b808e 100644 --- a/cloudinit/analyze/show.py +++ b/cloudinit/analyze/show.py @@ -8,8 +8,11 @@ import base64 import datetime import json import os +import time +import sys from cloudinit import util +from cloudinit.distros import uses_systemd # An event: ''' @@ -49,6 +52,10 @@ format_key = { formatting_help = " ".join(["{0}: {1}".format(k.replace('%', '%%'), v) for k, v in format_key.items()]) +SUCCESS_CODE = 'successful' +FAIL_CODE = 'failure' +CONTAINER_CODE = 'container' +TIMESTAMP_UNKNOWN = (FAIL_CODE, -1, -1, -1) def format_record(msg, event): @@ -125,9 +132,175 @@ def total_time_record(total_time): return 'Total Time: %3.5f seconds\n' % total_time +class SystemctlReader(object): + ''' + Class for dealing with all systemctl subp calls in a consistent manner. + ''' + def __init__(self, property, parameter=None): + self.epoch = None + self.args = ['/bin/systemctl', 'show'] + if parameter: + self.args.append(parameter) + self.args.extend(['-p', property]) + # Don't want the init of our object to break. Instead of throwing + # an exception, set an error code that gets checked when data is + # requested from the object + self.failure = self.subp() + + def subp(self): + ''' + Make a subp call based on set args and handle errors by setting + failure code + + :return: whether the subp call failed or not + ''' + try: + value, err = util.subp(self.args, capture=True) + if err: + return err + self.epoch = value + return None + except Exception as systemctl_fail: + return systemctl_fail + + def parse_epoch_as_float(self): + ''' + If subp call succeeded, return the timestamp from subp as a float. + + :return: timestamp as a float + ''' + # subp has 2 ways to fail: it either fails and throws an exception, + # or returns an error code. Raise an exception here in order to make + # sure both scenarios throw exceptions + if self.failure: + raise RuntimeError('Subprocess call to systemctl has failed, ' + 'returning error code ({})' + .format(self.failure)) + # Output from systemctl show has the format Property=Value. + # For example, UserspaceMonotonic=1929304 + timestamp = self.epoch.split('=')[1] + # Timestamps reported by systemctl are in microseconds, converting + return float(timestamp) / 1000000 + + +def dist_check_timestamp(): + ''' + Determine which init system a particular linux distro is using. + Each init system (systemd, upstart, etc) has a different way of + providing timestamps. + + :return: timestamps of kernelboot, kernelendboot, and cloud-initstart + or TIMESTAMP_UNKNOWN if the timestamps cannot be retrieved. + ''' + + if uses_systemd(): + return gather_timestamps_using_systemd() + + # Use dmesg to get timestamps if the distro does not have systemd + if util.is_FreeBSD() or 'gentoo' in \ + util.system_info()['system'].lower(): + return gather_timestamps_using_dmesg() + + # this distro doesn't fit anything that is supported by cloud-init. just + # return error codes + return TIMESTAMP_UNKNOWN + + +def gather_timestamps_using_dmesg(): + ''' + Gather timestamps that corresponds to kernel begin initialization, + kernel finish initialization using dmesg as opposed to systemctl + + :return: the two timestamps plus a dummy timestamp to keep consistency + with gather_timestamps_using_systemd + ''' + try: + data, _ = util.subp(['dmesg'], capture=True) + split_entries = data[0].splitlines() + for i in split_entries: + if i.decode('UTF-8').find('user') != -1: + splitup = i.decode('UTF-8').split() + stripped = splitup[1].strip(']') + + # kernel timestamp from dmesg is equal to 0, + # with the userspace timestamp relative to it. + user_space_timestamp = float(stripped) + kernel_start = float(time.time()) - float(util.uptime()) + kernel_end = kernel_start + user_space_timestamp + + # systemd wont start cloud-init in this case, + # so we cannot get that timestamp + return SUCCESS_CODE, kernel_start, kernel_end, \ + kernel_end + + except Exception: + pass + return TIMESTAMP_UNKNOWN + + +def gather_timestamps_using_systemd(): + ''' + Gather timestamps that corresponds to kernel begin initialization, + kernel finish initialization. and cloud-init systemd unit activation + + :return: the three timestamps + ''' + kernel_start = float(time.time()) - float(util.uptime()) + try: + delta_k_end = SystemctlReader('UserspaceTimestampMonotonic')\ + .parse_epoch_as_float() + delta_ci_s = SystemctlReader('InactiveExitTimestampMonotonic', + 'cloud-init-local').parse_epoch_as_float() + base_time = kernel_start + status = SUCCESS_CODE + # lxc based containers do not set their monotonic zero point to be when + # the container starts, instead keep using host boot as zero point + # time.CLOCK_MONOTONIC_RAW is only available in python 3.3 + if util.is_container(): + # clock.monotonic also uses host boot as zero point + if sys.version_info >= (3, 3): + base_time = float(time.time()) - float(time.monotonic()) + # TODO: lxcfs automatically truncates /proc/uptime to seconds + # in containers when https://github.com/lxc/lxcfs/issues/292 + # is fixed, util.uptime() should be used instead of stat on + try: + file_stat = os.stat('/proc/1/cmdline') + kernel_start = file_stat.st_atime + except OSError as err: + raise RuntimeError('Could not determine container boot ' + 'time from /proc/1/cmdline. ({})' + .format(err)) + status = CONTAINER_CODE + else: + status = FAIL_CODE + kernel_end = base_time + delta_k_end + cloudinit_sysd = base_time + delta_ci_s + + except Exception as e: + # Except ALL exceptions as Systemctl reader can throw many different + # errors, but any failure in systemctl means that timestamps cannot be + # obtained + print(e) + return TIMESTAMP_UNKNOWN + return status, kernel_start, kernel_end, cloudinit_sysd + + def generate_records(events, blame_sort=False, print_format="(%n) %d seconds in %I%D", dump_files=False, log_datafiles=False): + ''' + Take in raw events and create parent-child dependencies between events + in order to order events in chronological order. + + :param events: JSONs from dump that represents events taken from logs + :param blame_sort: whether to sort by timestamp or by time taken. + :param print_format: formatting to represent event, time stamp, + and time taken by the event in one line + :param dump_files: whether to dump files into JSONs + :param log_datafiles: whether or not to log events generated + + :return: boot records ordered chronologically + ''' sorted_events = sorted(events, key=lambda x: x['timestamp']) records = [] @@ -189,19 +362,28 @@ def generate_records(events, blame_sort=False, def show_events(events, print_format): + ''' + A passthrough method that makes it easier to call generate_records() + + :param events: JSONs from dump that represents events taken from logs + :param print_format: formatting to represent event, time stamp, + and time taken by the event in one line + + :return: boot records ordered chronologically + ''' return generate_records(events, print_format=print_format) -def load_events(infile, rawdata=None): - if rawdata: - data = rawdata.read() - else: - data = infile.read() +def load_events_infile(infile): + ''' + Takes in a log file, read it, and convert to json. + + :param infile: The Log file to be read - j = None + :return: json version of logfile, raw file + ''' + data = infile.read() try: - j = json.loads(data) + return json.loads(data), data except ValueError: - pass - - return j, data + return None, data diff --git a/cloudinit/analyze/tests/test_boot.py b/cloudinit/analyze/tests/test_boot.py new file mode 100644 index 00000000..706e2cc0 --- /dev/null +++ b/cloudinit/analyze/tests/test_boot.py @@ -0,0 +1,170 @@ +import os +from cloudinit.analyze.__main__ import (analyze_boot, get_parser) +from cloudinit.tests.helpers import CiTestCase, mock +from cloudinit.analyze.show import dist_check_timestamp, SystemctlReader, \ + FAIL_CODE, CONTAINER_CODE + +err_code = (FAIL_CODE, -1, -1, -1) + + +class TestDistroChecker(CiTestCase): + + @mock.patch('cloudinit.util.system_info', return_value={'dist': ('', '', + ''), + 'system': ''}) + @mock.patch('platform.linux_distribution', return_value=('', '', '')) + @mock.patch('cloudinit.util.is_FreeBSD', return_value=False) + def test_blank_distro(self, m_sys_info, m_linux_distribution, m_free_bsd): + self.assertEqual(err_code, dist_check_timestamp()) + + @mock.patch('cloudinit.util.system_info', return_value={'dist': ('', '', + '')}) + @mock.patch('platform.linux_distribution', return_value=('', '', '')) + @mock.patch('cloudinit.util.is_FreeBSD', return_value=True) + def test_freebsd_gentoo_cant_find(self, m_sys_info, + m_linux_distribution, m_is_FreeBSD): + self.assertEqual(err_code, dist_check_timestamp()) + + @mock.patch('cloudinit.util.subp', return_value=(0, 1)) + def test_subp_fails(self, m_subp): + self.assertEqual(err_code, dist_check_timestamp()) + + +class TestSystemCtlReader(CiTestCase): + + def test_systemctl_invalid_property(self): + reader = SystemctlReader('dummyProperty') + with self.assertRaises(RuntimeError): + reader.parse_epoch_as_float() + + def test_systemctl_invalid_parameter(self): + reader = SystemctlReader('dummyProperty', 'dummyParameter') + with self.assertRaises(RuntimeError): + reader.parse_epoch_as_float() + + @mock.patch('cloudinit.util.subp', return_value=('U=1000000', None)) + def test_systemctl_works_correctly_threshold(self, m_subp): + reader = SystemctlReader('dummyProperty', 'dummyParameter') + self.assertEqual(1.0, reader.parse_epoch_as_float()) + thresh = 1.0 - reader.parse_epoch_as_float() + self.assertTrue(thresh < 1e-6) + self.assertTrue(thresh > (-1 * 1e-6)) + + @mock.patch('cloudinit.util.subp', return_value=('U=0', None)) + def test_systemctl_succeed_zero(self, m_subp): + reader = SystemctlReader('dummyProperty', 'dummyParameter') + self.assertEqual(0.0, reader.parse_epoch_as_float()) + + @mock.patch('cloudinit.util.subp', return_value=('U=1', None)) + def test_systemctl_succeed_distinct(self, m_subp): + reader = SystemctlReader('dummyProperty', 'dummyParameter') + val1 = reader.parse_epoch_as_float() + m_subp.return_value = ('U=2', None) + reader2 = SystemctlReader('dummyProperty', 'dummyParameter') + val2 = reader2.parse_epoch_as_float() + self.assertNotEqual(val1, val2) + + @mock.patch('cloudinit.util.subp', return_value=('100', None)) + def test_systemctl_epoch_not_splittable(self, m_subp): + reader = SystemctlReader('dummyProperty', 'dummyParameter') + with self.assertRaises(IndexError): + reader.parse_epoch_as_float() + + @mock.patch('cloudinit.util.subp', return_value=('U=foobar', None)) + def test_systemctl_cannot_convert_epoch_to_float(self, m_subp): + reader = SystemctlReader('dummyProperty', 'dummyParameter') + with self.assertRaises(ValueError): + reader.parse_epoch_as_float() + + +class TestAnalyzeBoot(CiTestCase): + + def set_up_dummy_file_ci(self, path, log_path): + infh = open(path, 'w+') + infh.write('2019-07-08 17:40:49,601 - util.py[DEBUG]: Cloud-init v. ' + '19.1-1-gbaa47854-0ubuntu1~18.04.1 running \'init-local\' ' + 'at Mon, 08 Jul 2019 17:40:49 +0000. Up 18.84 seconds.') + infh.close() + outfh = open(log_path, 'w+') + outfh.close() + + def set_up_dummy_file(self, path, log_path): + infh = open(path, 'w+') + infh.write('dummy data') + infh.close() + outfh = open(log_path, 'w+') + outfh.close() + + def remove_dummy_file(self, path, log_path): + if os.path.isfile(path): + os.remove(path) + if os.path.isfile(log_path): + os.remove(log_path) + + @mock.patch('cloudinit.analyze.show.dist_check_timestamp', + return_value=err_code) + def test_boot_invalid_distro(self, m_dist_check_timestamp): + + path = os.path.dirname(os.path.abspath(__file__)) + log_path = path + '/boot-test.log' + path += '/dummy.log' + self.set_up_dummy_file(path, log_path) + + parser = get_parser() + args = parser.parse_args(args=['boot', '-i', path, '-o', + log_path]) + name_default = '' + analyze_boot(name_default, args) + # now args have been tested, go into outfile and make sure error + # message is in the outfile + outfh = open(args.outfile, 'r') + data = outfh.read() + err_string = 'Your Linux distro or container does not support this ' \ + 'functionality.\nYou must be running a Kernel ' \ + 'Telemetry supported distro.\nPlease check ' \ + 'https://cloudinit.readthedocs.io/en/latest/topics' \ + '/analyze.html for more information on supported ' \ + 'distros.\n' + + self.remove_dummy_file(path, log_path) + self.assertEqual(err_string, data) + + @mock.patch("cloudinit.util.is_container", return_value=True) + @mock.patch('cloudinit.util.subp', return_value=('U=1000000', None)) + def test_container_no_ci_log_line(self, m_is_container, m_subp): + path = os.path.dirname(os.path.abspath(__file__)) + log_path = path + '/boot-test.log' + path += '/dummy.log' + self.set_up_dummy_file(path, log_path) + + parser = get_parser() + args = parser.parse_args(args=['boot', '-i', path, '-o', + log_path]) + name_default = '' + + finish_code = analyze_boot(name_default, args) + + self.remove_dummy_file(path, log_path) + self.assertEqual(FAIL_CODE, finish_code) + + @mock.patch("cloudinit.util.is_container", return_value=True) + @mock.patch('cloudinit.util.subp', return_value=('U=1000000', None)) + @mock.patch('cloudinit.analyze.__main__._get_events', return_value=[{ + 'name': 'init-local', 'description': 'starting search', 'timestamp': + 100000}]) + @mock.patch('cloudinit.analyze.show.dist_check_timestamp', + return_value=(CONTAINER_CODE, 1, 1, 1)) + def test_container_ci_log_line(self, m_is_container, m_subp, m_get, m_g): + path = os.path.dirname(os.path.abspath(__file__)) + log_path = path + '/boot-test.log' + path += '/dummy.log' + self.set_up_dummy_file_ci(path, log_path) + + parser = get_parser() + args = parser.parse_args(args=['boot', '-i', path, '-o', + log_path]) + name_default = '' + finish_code = analyze_boot(name_default, args) + + self.remove_dummy_file(path, log_path) + self.assertEqual(CONTAINER_CODE, finish_code) |