diff options
author | Ćukasz 'sil2100' Zemczak <lukasz.zemczak@ubuntu.com> | 2018-02-05 17:25:14 +0100 |
---|---|---|
committer | usd-importer <ubuntu-server@lists.ubuntu.com> | 2018-02-05 19:15:55 +0000 |
commit | bdfe3a4fa9bd487ec5f40e829742e127476d2c8a (patch) | |
tree | 947ecb04d5e2737883803034ab9885ddab70b816 /azurelinuxagent | |
parent | 6aaeb3237653102b5a376986f0ea73327a7a4379 (diff) | |
download | vyos-walinuxagent-bdfe3a4fa9bd487ec5f40e829742e127476d2c8a.tar.gz vyos-walinuxagent-bdfe3a4fa9bd487ec5f40e829742e127476d2c8a.zip |
Import patches-unapplied version 2.2.21-0ubuntu1 to ubuntu/bionic-proposed
Imported using git-ubuntu import.
Changelog parent: 6aaeb3237653102b5a376986f0ea73327a7a4379
New changelog entries:
* New upstream release (LP: #1746628).
* debian/patches/disable_import_test.patch: refreshed patch.
Diffstat (limited to 'azurelinuxagent')
29 files changed, 1159 insertions, 469 deletions
diff --git a/azurelinuxagent/agent.py b/azurelinuxagent/agent.py index e99f7be..4ab7aa0 100644 --- a/azurelinuxagent/agent.py +++ b/azurelinuxagent/agent.py @@ -63,6 +63,9 @@ class Agent(object): path="/var/log/waagent.log") logger.add_logger_appender(logger.AppenderType.CONSOLE, level, path="/dev/console") + logger.add_logger_appender(logger.AppenderType.TELEMETRY, + logger.LogLevel.WARNING, + path=event.add_log_event) ext_log_dir = conf.get_ext_log_dir() try: @@ -144,7 +147,7 @@ def main(args=[]): if command == "version": version() elif command == "help": - usage() + print(usage()) elif command == "start": start(conf_file_path=conf_file_path) else: @@ -228,15 +231,16 @@ def version(): def usage(): """ - Show agent usage + Return agent usage message """ - print("") - print((("usage: {0} [-verbose] [-force] [-help] " + s = "\n" + s += ("usage: {0} [-verbose] [-force] [-help] " "-configuration-path:<path to configuration file>" "-deprovision[+user]|-register-service|-version|-daemon|-start|" - "-run-exthandlers]" - "").format(sys.argv[0]))) - print("") + "-run-exthandlers|-show-configuration]" + "").format(sys.argv[0]) + s += "\n" + return s def start(conf_file_path=None): """ diff --git a/azurelinuxagent/common/conf.py b/azurelinuxagent/common/conf.py index 75a0248..33f953a 100644 --- a/azurelinuxagent/common/conf.py +++ b/azurelinuxagent/common/conf.py @@ -40,11 +40,11 @@ class ConfigurationProvider(object): raise AgentConfigError("Can't not parse empty configuration") for line in content.split('\n'): if not line.startswith("#") and "=" in line: - parts = line.split('=') + parts = line.split('=', 1) if len(parts) < 2: continue key = parts[0].strip() - value = parts[1].strip("\" ") + value = parts[1].split('#')[0].strip("\" ").strip() self.values[key] = value if value != "None" else None def get(self, key, default_val): @@ -131,12 +131,14 @@ __STRING_OPTIONS__ = { } __INTEGER_OPTIONS__ = { + "OS.SshClientAliveInterval" : 180, "Provisioning.PasswordCryptSaltLength" : 10, "HttpProxy.Port" : None, "ResourceDisk.SwapSizeMB" : 0, "Autoupdate.Frequency" : 3600 } + def get_configuration(conf=__conf__): options = {} for option in __SWITCH_OPTIONS__: @@ -190,6 +192,9 @@ def get_fips_enabled(conf=__conf__): def get_openssl_cmd(conf=__conf__): return conf.get("OS.OpensslPath", "/usr/bin/openssl") +def get_ssh_client_alive_interval(conf=__conf__): + return conf.get("OS.SshClientAliveInterval", 180) + def get_ssh_dir(conf=__conf__): return conf.get("OS.SshDir", "/etc/ssh") diff --git a/azurelinuxagent/common/errorstate.py b/azurelinuxagent/common/errorstate.py new file mode 100644 index 0000000..750aa77 --- /dev/null +++ b/azurelinuxagent/common/errorstate.py @@ -0,0 +1,31 @@ +from datetime import datetime, timedelta + +ERROR_STATE_DELTA = timedelta(minutes=15) + + +class ErrorState(object): + def __init__(self, min_timedelta = ERROR_STATE_DELTA): + self.min_timedelta = min_timedelta + + self.count = 0 + self.timestamp = None + + def incr(self): + if self.count == 0: + self.timestamp = datetime.utcnow() + + self.count += 1 + + def reset(self): + self.count = 0 + self.timestamp = None + + def is_triggered(self): + if self.timestamp is None: + return False + + delta = datetime.utcnow() - self.timestamp + if delta >= self.min_timedelta: + return True + + return False diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index e62a925..84a439f 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -15,51 +15,58 @@ # Requires Python 2.4+ and Openssl 1.0+ # -import os -import sys -import traceback import atexit +import datetime import json +import os +import sys import time -import datetime -import threading -import platform +import traceback -from datetime import datetime, timedelta +from datetime import datetime import azurelinuxagent.common.conf as conf import azurelinuxagent.common.logger as logger -from azurelinuxagent.common.exception import EventError, ProtocolError +from azurelinuxagent.common.exception import EventError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.protocol.restapi import TelemetryEventParam, \ - TelemetryEventList, \ TelemetryEvent, \ - set_properties, get_properties -from azurelinuxagent.common.version import DISTRO_NAME, DISTRO_VERSION, \ - DISTRO_CODE_NAME, AGENT_VERSION, \ - CURRENT_AGENT, CURRENT_VERSION + get_properties +from azurelinuxagent.common.version import CURRENT_VERSION _EVENT_MSG = "Event: name={0}, op={1}, message={2}, duration={3}" + class WALAEventOperation: ActivateResourceDisk = "ActivateResourceDisk" + AgentBlacklisted = "AgentBlacklisted" + AgentEnabled = "AgentEnabled" AutoUpdate = "AutoUpdate" + CustomData = "CustomData" + Deploy = "Deploy" Disable = "Disable" Download = "Download" Enable = "Enable" + ExtensionProcessing = "ExtensionProcessing" Firewall = "Firewall" HealthCheck = "HealthCheck" HeartBeat = "HeartBeat" HostPlugin = "HostPlugin" + HttpErrors = "HttpErrors" Install = "Install" InitializeHostPlugin = "InitializeHostPlugin" + Log = "Log" + Partition = "Partition" ProcessGoalState = "ProcessGoalState" Provision = "Provision" + GuestState = "GuestState" ReportStatus = "ReportStatus" Restart = "Restart" + SkipUpdate = "SkipUpdate" UnhandledError = "UnhandledError" UnInstall = "UnInstall" + Unknown = "Unknown" Upgrade = "Upgrade" Update = "Update" @@ -67,7 +74,7 @@ class WALAEventOperation: class EventStatus(object): EVENT_STATUS_FILE = "event_status.json" - def __init__(self, status_dir=conf.get_lib_dir()): + def __init__(self): self._path = None self._status = {} @@ -82,7 +89,7 @@ class EventStatus(object): event = self._event_name(name, version, op) if event not in self._status: return True - return self._status[event] == True + return self._status[event] is True def initialize(self, status_dir=conf.get_lib_dir()): self._path = os.path.join(status_dir, EventStatus.EVENT_STATUS_FILE) @@ -90,7 +97,7 @@ class EventStatus(object): def mark_event_status(self, name, version, op, status): event = self._event_name(name, version, op) - self._status[event] = (status == True) + self._status[event] = (status is True) self._save() def _event_name(self, name, version, op): @@ -113,6 +120,7 @@ class EventStatus(object): except Exception as e: logger.warn("Exception occurred saving event status: {0}".format(e)) + __event_status__ = EventStatus() __event_status_operations__ = [ WALAEventOperation.AutoUpdate, @@ -171,7 +179,7 @@ class EventLogger(object): (self.periodic_events[h] + delta) <= datetime.now() def add_periodic(self, - delta, name, op="", is_success=True, duration=0, + delta, name, op=WALAEventOperation.Unknown, is_success=True, duration=0, version=CURRENT_VERSION, message="", evt_type="", is_internal=False, log_event=True, force=False): @@ -186,7 +194,7 @@ class EventLogger(object): def add_event(self, name, - op="", + op=WALAEventOperation.Unknown, is_success=True, duration=0, version=CURRENT_VERSION, @@ -215,6 +223,32 @@ class EventLogger(object): except EventError as e: logger.error("{0}", e) + def add_log_event(self, level, message): + # By the time the message has gotten to this point it is formatted as + # + # YYYY/MM/DD HH:mm:ss.fffffff LEVEL <text>. + # + # The timestamp and the level are redundant, and should be stripped. + # The logging library does not schematize this data, so I am forced + # to parse the message. The format is regular, so the burden is low. + + parts = message.split(' ', 3) + msg = parts[3] if len(parts) == 4 \ + else message + + event = TelemetryEvent(7, "FFF0196F-EE4C-4EAF-9AA5-776F622DEB4F") + event.parameters.append(TelemetryEventParam('EventName', WALAEventOperation.Log)) + event.parameters.append(TelemetryEventParam('CapabilityUsed', logger.LogLevel.STRINGS[level])) + event.parameters.append(TelemetryEventParam('Context1', msg)) + event.parameters.append(TelemetryEventParam('Context2', '')) + event.parameters.append(TelemetryEventParam('Context3', '')) + + data = get_properties(event) + try: + self.save_event(json.dumps(data)) + except EventError: + pass + __event_logger__ = EventLogger() @@ -224,6 +258,7 @@ def elapsed_milliseconds(utc_start): return int(((d.days * 24 * 60 * 60 + d.seconds) * 1000) + \ (d.microseconds / 1000.0)) + def report_event(op, is_success=True, message=''): from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION add_event(AGENT_NAME, @@ -232,6 +267,7 @@ def report_event(op, is_success=True, message=''): message=message, op=op) + def report_periodic(delta, op, is_success=True, message=''): from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION add_periodic(delta, AGENT_NAME, @@ -240,7 +276,9 @@ def report_periodic(delta, op, is_success=True, message=''): message=message, op=op) -def add_event(name, op="", is_success=True, duration=0, version=CURRENT_VERSION, + +def add_event(name, op=WALAEventOperation.Unknown, is_success=True, duration=0, + version=CURRENT_VERSION, message="", evt_type="", is_internal=False, log_event=True, reporter=__event_logger__): if reporter.event_dir is None: @@ -255,8 +293,17 @@ def add_event(name, op="", is_success=True, duration=0, version=CURRENT_VERSION, version=str(version), message=message, evt_type=evt_type, is_internal=is_internal, log_event=log_event) + +def add_log_event(level, message, reporter=__event_logger__): + if reporter.event_dir is None: + return + + reporter.add_log_event(level, message) + + def add_periodic( - delta, name, op="", is_success=True, duration=0, version=CURRENT_VERSION, + delta, name, op=WALAEventOperation.Unknown, is_success=True, duration=0, + version=CURRENT_VERSION, message="", evt_type="", is_internal=False, log_event=True, force=False, reporter=__event_logger__): if reporter.event_dir is None: @@ -269,10 +316,12 @@ def add_periodic( version=str(version), message=message, evt_type=evt_type, is_internal=is_internal, log_event=log_event, force=force) + def mark_event_status(name, version, op, status): if op in __event_status_operations__: __event_status__.mark_event_status(name, version, op, status) + def should_emit_event(name, version, op, status): return \ op not in __event_status_operations__ or \ @@ -280,12 +329,15 @@ def should_emit_event(name, version, op, status): not __event_status__.event_marked(name, version, op) or \ __event_status__.event_succeeded(name, version, op) != status + def init_event_logger(event_dir): __event_logger__.event_dir = event_dir + def init_event_status(status_dir): __event_status__.initialize(status_dir) + def dump_unhandled_err(name): if hasattr(sys, 'last_type') and hasattr(sys, 'last_value') and \ hasattr(sys, 'last_traceback'): diff --git a/azurelinuxagent/common/exception.py b/azurelinuxagent/common/exception.py index 17c6ce0..bb8ab10 100644 --- a/azurelinuxagent/common/exception.py +++ b/azurelinuxagent/common/exception.py @@ -26,8 +26,8 @@ class AgentError(Exception): Base class of agent error. """ - def __init__(self, errno, msg, inner=None): - msg = u"[{0}] {1}".format(errno, msg) + def __init__(self, msg, inner=None): + msg = u"[{0}] {1}".format(type(self).__name__, msg) if inner is not None: msg = u"{0}\nInner error: {1}".format(msg, inner) super(AgentError, self).__init__(msg) @@ -39,16 +39,16 @@ class AgentConfigError(AgentError): """ def __init__(self, msg=None, inner=None): - super(AgentConfigError, self).__init__('000001', msg, inner) + super(AgentConfigError, self).__init__(msg, inner) class AgentNetworkError(AgentError): """ - When network is not avaiable. + When network is not available\. """ def __init__(self, msg=None, inner=None): - super(AgentNetworkError, self).__init__('000002', msg, inner) + super(AgentNetworkError, self).__init__(msg, inner) class ExtensionError(AgentError): @@ -57,7 +57,7 @@ class ExtensionError(AgentError): """ def __init__(self, msg=None, inner=None): - super(ExtensionError, self).__init__('000003', msg, inner) + super(ExtensionError, self).__init__(msg, inner) class ProvisionError(AgentError): @@ -66,7 +66,7 @@ class ProvisionError(AgentError): """ def __init__(self, msg=None, inner=None): - super(ProvisionError, self).__init__('000004', msg, inner) + super(ProvisionError, self).__init__(msg, inner) class ResourceDiskError(AgentError): @@ -75,7 +75,7 @@ class ResourceDiskError(AgentError): """ def __init__(self, msg=None, inner=None): - super(ResourceDiskError, self).__init__('000005', msg, inner) + super(ResourceDiskError, self).__init__(msg, inner) class DhcpError(AgentError): @@ -84,7 +84,8 @@ class DhcpError(AgentError): """ def __init__(self, msg=None, inner=None): - super(DhcpError, self).__init__('000006', msg, inner) + super(DhcpError, self).__init__(msg, inner) + class OSUtilError(AgentError): """ @@ -92,7 +93,7 @@ class OSUtilError(AgentError): """ def __init__(self, msg=None, inner=None): - super(OSUtilError, self).__init__('000007', msg, inner) + super(OSUtilError, self).__init__(msg, inner) class ProtocolError(AgentError): @@ -101,7 +102,7 @@ class ProtocolError(AgentError): """ def __init__(self, msg=None, inner=None): - super(ProtocolError, self).__init__('000008', msg, inner) + super(ProtocolError, self).__init__(msg, inner) class ProtocolNotFoundError(ProtocolError): @@ -113,13 +114,22 @@ class ProtocolNotFoundError(ProtocolError): super(ProtocolNotFoundError, self).__init__(msg, inner) +class RestartError(ProtocolError): + """ + Variant of ProtocolError used to restart processing if the GoalState + becomes stale. + """ + + pass + + class HttpError(AgentError): """ Http request failure """ def __init__(self, msg=None, inner=None): - super(HttpError, self).__init__('000009', msg, inner) + super(HttpError, self).__init__(msg, inner) class EventError(AgentError): @@ -128,7 +138,7 @@ class EventError(AgentError): """ def __init__(self, msg=None, inner=None): - super(EventError, self).__init__('000010', msg, inner) + super(EventError, self).__init__(msg, inner) class CryptError(AgentError): @@ -137,7 +147,7 @@ class CryptError(AgentError): """ def __init__(self, msg=None, inner=None): - super(CryptError, self).__init__('000011', msg, inner) + super(CryptError, self).__init__(msg, inner) class UpdateError(AgentError): @@ -146,7 +156,7 @@ class UpdateError(AgentError): """ def __init__(self, msg=None, inner=None): - super(UpdateError, self).__init__('000012', msg, inner) + super(UpdateError, self).__init__(msg, inner) class ResourceGoneError(HttpError): @@ -155,4 +165,6 @@ class ResourceGoneError(HttpError): """ def __init__(self, msg=None, inner=None): + if msg is None: + msg = "Resource is gone" super(ResourceGoneError, self).__init__(msg, inner) diff --git a/azurelinuxagent/common/future.py b/azurelinuxagent/common/future.py index 8d5b70b..8389cd3 100644 --- a/azurelinuxagent/common/future.py +++ b/azurelinuxagent/common/future.py @@ -1,10 +1,10 @@ import sys """ -Add alies for python2 and python3 libs and fucntions. +Add alias for python2 and python3 libs and functions. """ -if sys.version_info[0]== 3: +if sys.version_info[0] == 3: import http.client as httpclient from urllib.parse import urlparse @@ -23,5 +23,4 @@ elif sys.version_info[0] == 2: bytebuffer = buffer else: - raise ImportError("Unknown python version:{0}".format(sys.version_info)) - + raise ImportError("Unknown python version: {0}".format(sys.version_info)) diff --git a/azurelinuxagent/common/logger.py b/azurelinuxagent/common/logger.py index bfdc73a..0a90718 100644 --- a/azurelinuxagent/common/logger.py +++ b/azurelinuxagent/common/logger.py @@ -17,8 +17,8 @@ """ Log utils """ -import os import sys + from azurelinuxagent.common.future import ustr from datetime import datetime, timedelta @@ -28,6 +28,7 @@ EVERY_HOUR = timedelta(hours=1) EVERY_HALF_HOUR = timedelta(minutes=30) EVERY_FIFTEEN_MINUTES = timedelta(minutes=15) + class Logger(object): """ Logger class @@ -92,6 +93,7 @@ class Logger(object): appender = _create_logger_appender(appender_type, level, path) self.appenders.append(appender) + class ConsoleAppender(object): def __init__(self, level, path): self.level = level @@ -105,6 +107,7 @@ class ConsoleAppender(object): except IOError: pass + class FileAppender(object): def __init__(self, level, path): self.level = level @@ -118,6 +121,7 @@ class FileAppender(object): except IOError: pass + class StdoutAppender(object): def __init__(self, level): self.level = level @@ -129,9 +133,24 @@ class StdoutAppender(object): except IOError: pass + +class TelemetryAppender(object): + def __init__(self, level, event_func): + self.level = level + self.event_func = event_func + + def write(self, level, msg): + if self.level <= level: + try: + self.event_func(level, msg) + except IOError: + pass + + #Initialize logger instance DEFAULT_LOGGER = Logger() + class LogLevel(object): VERBOSE = 0 INFO = 1 @@ -144,35 +163,46 @@ class LogLevel(object): "ERROR" ] + class AppenderType(object): FILE = 0 CONSOLE = 1 STDOUT = 2 + TELEMETRY = 3 + def add_logger_appender(appender_type, level=LogLevel.INFO, path=None): DEFAULT_LOGGER.add_appender(appender_type, level, path) + def reset_periodic(): DEFAULT_LOGGER.reset_periodic() + def periodic(delta, msg_format, *args): DEFAULT_LOGGER.periodic(delta, msg_format, *args) + def verbose(msg_format, *args): DEFAULT_LOGGER.verbose(msg_format, *args) + def info(msg_format, *args): DEFAULT_LOGGER.info(msg_format, *args) + def warn(msg_format, *args): DEFAULT_LOGGER.warn(msg_format, *args) + def error(msg_format, *args): DEFAULT_LOGGER.error(msg_format, *args) + def log(level, msg_format, *args): DEFAULT_LOGGER.log(level, msg_format, args) + def _create_logger_appender(appender_type, level=LogLevel.INFO, path=None): if appender_type == AppenderType.CONSOLE: return ConsoleAppender(level, path) @@ -180,6 +210,8 @@ def _create_logger_appender(appender_type, level=LogLevel.INFO, path=None): return FileAppender(level, path) elif appender_type == AppenderType.STDOUT: return StdoutAppender(level) + elif appender_type == AppenderType.TELEMETRY: + return TelemetryAppender(level, path) else: raise ValueError("Unknown appender type") diff --git a/azurelinuxagent/common/osutil/default.py b/azurelinuxagent/common/osutil/default.py index f0b1050..16b7444 100644 --- a/azurelinuxagent/common/osutil/default.py +++ b/azurelinuxagent/common/osutil/default.py @@ -57,10 +57,21 @@ IPTABLES_VERSION = "iptables --version" IPTABLES_LOCKING_VERSION = FlexibleVersion('1.4.21') FIREWALL_ACCEPT = "iptables {0} -t security -{1} OUTPUT -d {2} -p tcp -m owner --uid-owner {3} -j ACCEPT" -FIREWALL_DROP = "iptables {0} -t security -{1} OUTPUT -d {2} -p tcp -j DROP" -FIREWALL_LIST = "iptables {0} -t security -L" +# Note: +# -- Initially "flight" the change to ACCEPT packets and develop a metric baseline +# A subsequent release will convert the ACCEPT to DROP +FIREWALL_DROP = "iptables {0} -t security -{1} OUTPUT -d {2} -p tcp -m conntrack --ctstate INVALID,NEW -j ACCEPT" +# FIREWALL_DROP = "iptables {0} -t security -{1} OUTPUT -d {2} -p tcp -m conntrack --ctstate INVALID,NEW -j DROP" +FIREWALL_LIST = "iptables {0} -t security -L -nxv" +FIREWALL_PACKETS = "iptables {0} -t security -L OUTPUT --zero OUTPUT -nxv" FIREWALL_FLUSH = "iptables {0} -t security --flush" +# Precisely delete the rules created by the agent. +FIREWALL_DELETE_CONNTRACK = "iptables {0} -t security -D OUTPUT -d {1} -p tcp -m conntrack --ctstate INVALID,NEW -j ACCEPT" +FIREWALL_DELETE_OWNER = "iptables {0} -t security -D OUTPUT -d {1} -p tcp -m owner --uid-owner {2} -j ACCEPT" + +PACKET_PATTERN = "^\s*(\d+)\s+(\d+)\s+DROP\s+.*{0}[^\d]*$" + _enable_firewall = True DMIDECODE_CMD = 'dmidecode --string system-uuid' @@ -69,13 +80,45 @@ UUID_PATTERN = re.compile( r'^\s*[A-F0-9]{8}(?:\-[A-F0-9]{4}){3}\-[A-F0-9]{12}\s*$', re.IGNORECASE) -class DefaultOSUtil(object): +class DefaultOSUtil(object): def __init__(self): self.agent_conf_file_path = '/etc/waagent.conf' self.selinux = None self.disable_route_warning = False + def get_firewall_dropped_packets(self, dst_ip=None): + # If a previous attempt failed, do not retry + global _enable_firewall + if not _enable_firewall: + return 0 + + try: + wait = self.get_firewall_will_wait() + + rc, output = shellutil.run_get_output(FIREWALL_PACKETS.format(wait)) + if rc == 3: + # Transient error that we ignore. This code fires every loop + # of the daemon (60m), so we will get the value eventually. + return 0 + + if rc != 0: + return -1 + + pattern = re.compile(PACKET_PATTERN.format(dst_ip)) + for line in output.split('\n'): + m = pattern.match(line) + if m is not None: + return int(m.group(1)) + + return 0 + + except Exception as e: + _enable_firewall = False + logger.warn("Unable to retrieve firewall packets dropped" + "{0}".format(ustr(e))) + return -1 + def get_firewall_will_wait(self): # Determine if iptables will serialize access rc, output = shellutil.run_get_output(IPTABLES_VERSION) @@ -95,31 +138,46 @@ class DefaultOSUtil(object): else "" return wait - def remove_firewall(self): - # If a previous attempt threw an exception, do not retry + def _delete_rule(self, rule): + """ + Continually execute the delete operation until the return + code is non-zero or the limit has been reached. + """ + for i in range(1, 100): + rc = shellutil.run(rule, chk_err=False) + if rc == 1: + return + elif rc == 2: + raise Exception("invalid firewall deletion rule '{0}'".format(rule)) + + def remove_firewall(self, dst_ip=None, uid=None): + # If a previous attempt failed, do not retry global _enable_firewall if not _enable_firewall: return False try: + if dst_ip is None or uid is None: + msg = "Missing arguments to enable_firewall" + logger.warn(msg) + raise Exception(msg) + wait = self.get_firewall_will_wait() - flush_rule = FIREWALL_FLUSH.format(wait) - if shellutil.run(flush_rule, chk_err=False) != 0: - logger.warn("Failed to flush firewall") + self._delete_rule(FIREWALL_DELETE_CONNTRACK.format(wait, dst_ip)) + self._delete_rule(FIREWALL_DELETE_OWNER.format(wait, dst_ip, uid)) return True except Exception as e: _enable_firewall = False - logger.info("Unable to flush firewall -- " + logger.info("Unable to remove firewall -- " "no further attempts will be made: " "{0}".format(ustr(e))) return False def enable_firewall(self, dst_ip=None, uid=None): - - # If a previous attempt threw an exception, do not retry + # If a previous attempt failed, do not retry global _enable_firewall if not _enable_firewall: return False @@ -134,10 +192,15 @@ class DefaultOSUtil(object): # If the DROP rule exists, make no changes drop_rule = FIREWALL_DROP.format(wait, "C", dst_ip) - - if shellutil.run(drop_rule, chk_err=False) == 0: + rc = shellutil.run(drop_rule, chk_err=False) + if rc == 0: logger.verbose("Firewall appears established") return True + elif rc == 2: + self.remove_firewall(dst_ip, uid) + msg = "please upgrade iptables to a version that supports the -C option" + logger.warn(msg) + raise Exception(msg) # Otherwise, append both rules accept_rule = FIREWALL_ACCEPT.format(wait, "A", dst_ip, uid) @@ -309,11 +372,11 @@ class DefaultOSUtil(object): else: sudoer = "{0} ALL=(ALL) ALL".format(username) if not os.path.isfile(sudoers_wagent) or \ - fileutil.findstr_in_file(sudoers_wagent, sudoer) is None: + fileutil.findstr_in_file(sudoers_wagent, sudoer) is False: fileutil.append_file(sudoers_wagent, "{0}\n".format(sudoer)) fileutil.chmod(sudoers_wagent, 0o440) else: - #Remove user from sudoers + # remove user from sudoers if os.path.isfile(sudoers_wagent): try: content = fileutil.read_file(sudoers_wagent) @@ -440,7 +503,7 @@ class DefaultOSUtil(object): conf_file = fileutil.read_file(conf_file_path).split("\n") textutil.set_ssh_config(conf_file, "PasswordAuthentication", option) textutil.set_ssh_config(conf_file, "ChallengeResponseAuthentication", option) - textutil.set_ssh_config(conf_file, "ClientAliveInterval", "180") + textutil.set_ssh_config(conf_file, "ClientAliveInterval", str(conf.get_ssh_client_alive_interval())) fileutil.write_file(conf_file_path, "\n".join(conf_file)) logger.info("{0} SSH password-based authentication methods." .format("Disabled" if disable_password else "Enabled")) @@ -965,7 +1028,7 @@ class DefaultOSUtil(object): if not os.path.exists(hostname_record): # this file is created at provisioning time with agents >= 2.2.3 hostname = socket.gethostname() - logger.warn('Hostname record does not exist, ' + logger.info('Hostname record does not exist, ' 'creating [{0}] with hostname [{1}]', hostname_record, hostname) diff --git a/azurelinuxagent/common/osutil/factory.py b/azurelinuxagent/common/osutil/factory.py index 43aa6a7..1b4e2cb 100644 --- a/azurelinuxagent/common/osutil/factory.py +++ b/azurelinuxagent/common/osutil/factory.py @@ -27,7 +27,8 @@ from .freebsd import FreeBSDOSUtil from .openbsd import OpenBSDOSUtil from .redhat import RedhatOSUtil, Redhat6xOSUtil from .suse import SUSEOSUtil, SUSE11OSUtil -from .ubuntu import UbuntuOSUtil, Ubuntu12OSUtil, Ubuntu14OSUtil, UbuntuSnappyOSUtil +from .ubuntu import UbuntuOSUtil, Ubuntu12OSUtil, Ubuntu14OSUtil, \ + UbuntuSnappyOSUtil, Ubuntu16OSUtil from .alpine import AlpineOSUtil from .bigip import BigIpOSUtil from .gaia import GaiaOSUtil @@ -46,10 +47,12 @@ def get_osutil(distro_name=DISTRO_NAME, return ClearLinuxUtil() if distro_name == "ubuntu": - if Version(distro_version) == Version("12.04") or Version(distro_version) == Version("12.10"): + if Version(distro_version) in [Version("12.04"), Version("12.10")]: return Ubuntu12OSUtil() - elif Version(distro_version) == Version("14.04") or Version(distro_version) == Version("14.10"): + elif Version(distro_version) in [Version("14.04"), Version("14.10")]: return Ubuntu14OSUtil() + elif Version(distro_version) in [Version('16.04'), Version('16.10'), Version('17.04')]: + return Ubuntu16OSUtil() elif distro_full_name == "Snappy Ubuntu Core": return UbuntuSnappyOSUtil() else: diff --git a/azurelinuxagent/common/osutil/ubuntu.py b/azurelinuxagent/common/osutil/ubuntu.py index 3c353cf..8dacc75 100644 --- a/azurelinuxagent/common/osutil/ubuntu.py +++ b/azurelinuxagent/common/osutil/ubuntu.py @@ -16,9 +16,14 @@ # Requires Python 2.4+ and Openssl 1.0+ # +import time + +import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.shellutil as shellutil + from azurelinuxagent.common.osutil.default import DefaultOSUtil + class Ubuntu14OSUtil(DefaultOSUtil): def __init__(self): super(Ubuntu14OSUtil, self).__init__() @@ -41,6 +46,7 @@ class Ubuntu14OSUtil(DefaultOSUtil): def get_dhcp_lease_endpoint(self): return self.get_endpoint_from_leases_path('/var/lib/dhcp/dhclient.*.leases') + class Ubuntu12OSUtil(Ubuntu14OSUtil): def __init__(self): super(Ubuntu12OSUtil, self).__init__() @@ -50,9 +56,13 @@ class Ubuntu12OSUtil(Ubuntu14OSUtil): ret = shellutil.run_get_output("pidof dhclient3", chk_err=False) return ret[1] if ret[0] == 0 else None -class UbuntuOSUtil(Ubuntu14OSUtil): + +class Ubuntu16OSUtil(Ubuntu14OSUtil): + """ + Ubuntu 16.04, 16.10, and 17.04. + """ def __init__(self): - super(UbuntuOSUtil, self).__init__() + super(Ubuntu16OSUtil, self).__init__() def register_agent_service(self): return shellutil.run("systemctl unmask walinuxagent", chk_err=False) @@ -60,6 +70,29 @@ class UbuntuOSUtil(Ubuntu14OSUtil): def unregister_agent_service(self): return shellutil.run("systemctl mask walinuxagent", chk_err=False) + +class UbuntuOSUtil(Ubuntu16OSUtil): + def __init__(self): + super(UbuntuOSUtil, self).__init__() + + def restart_if(self, ifname, retries=3, wait=5): + """ + Restart an interface by bouncing the link. systemd-networkd observes + this event, and forces a renew of DHCP. + """ + retry_limit=retries+1 + for attempt in range(1, retry_limit): + return_code=shellutil.run("ip link set {0} down && ip link set {0} up".format(ifname)) + if return_code == 0: + return + logger.warn("failed to restart {0}: return code {1}".format(ifname, return_code)) + if attempt < retry_limit: + logger.info("retrying in {0} seconds".format(wait)) + time.sleep(wait) + else: + logger.warn("exceeded restart retries") + + class UbuntuSnappyOSUtil(Ubuntu14OSUtil): def __init__(self): super(UbuntuSnappyOSUtil, self).__init__() diff --git a/azurelinuxagent/common/protocol/metadata.py b/azurelinuxagent/common/protocol/metadata.py index 4de7ecf..0cfa5ed 100644 --- a/azurelinuxagent/common/protocol/metadata.py +++ b/azurelinuxagent/common/protocol/metadata.py @@ -173,6 +173,11 @@ class MetadataProtocol(Protocol): return None return self.certs + def get_incarnation(self): + # Always return 0 since Azure Stack does not maintain goal state + # incarnation identifiers + return 0 + def get_vmagent_manifests(self): self.update_goal_state() @@ -235,7 +240,7 @@ class MetadataProtocol(Protocol): set_properties("extensionHandlers", ext_list.extHandlers, data) return ext_list, etag - def get_ext_handler_pkgs(self, ext_handler): + def get_ext_handler_pkgs(self, ext_handler, etag): logger.verbose("Get extension handler packages") pkg_list = ExtHandlerPackageList() diff --git a/azurelinuxagent/common/protocol/restapi.py b/azurelinuxagent/common/protocol/restapi.py index 1ec3e21..5a678d7 100644 --- a/azurelinuxagent/common/protocol/restapi.py +++ b/azurelinuxagent/common/protocol/restapi.py @@ -162,6 +162,7 @@ class ExtHandlerProperties(DataContract): def __init__(self): self.version = None self.upgradePolicy = None + self.upgradeGuid = None self.state = None self.extensions = DataContractList(Extension) @@ -245,11 +246,13 @@ class ExtHandlerStatus(DataContract): def __init__(self, name=None, version=None, + upgradeGuid=None, status=None, code=0, message=None): self.name = name self.version = version + self.upgradeGuid = upgradeGuid self.status = status self.code = code self.message = message @@ -300,6 +303,9 @@ class Protocol(DataContract): def get_certs(self): raise NotImplementedError() + def get_incarnation(self): + raise NotImplementedError() + def get_vmagent_manifests(self): raise NotImplementedError() @@ -309,15 +315,15 @@ class Protocol(DataContract): def get_ext_handlers(self): raise NotImplementedError() - def get_ext_handler_pkgs(self, extension): + def get_ext_handler_pkgs(self, extension, etag): raise NotImplementedError() def get_artifacts_profile(self): raise NotImplementedError() - def download_ext_handler_pkg(self, uri, headers=None): + def download_ext_handler_pkg(self, uri, headers=None, use_proxy=True): try: - resp = restutil.http_get(uri, use_proxy=True, headers=headers) + resp = restutil.http_get(uri, headers=headers, use_proxy=use_proxy) if restutil.request_succeeded(resp): return resp.read() except Exception as e: diff --git a/azurelinuxagent/common/protocol/util.py b/azurelinuxagent/common/protocol/util.py index 3071d7a..b3ec373 100644 --- a/azurelinuxagent/common/protocol/util.py +++ b/azurelinuxagent/common/protocol/util.py @@ -26,16 +26,18 @@ import threading import azurelinuxagent.common.conf as conf import azurelinuxagent.common.logger as logger +import azurelinuxagent.common.utils.fileutil as fileutil + from azurelinuxagent.common.exception import ProtocolError, OSUtilError, \ ProtocolNotFoundError, DhcpError from azurelinuxagent.common.future import ustr -import azurelinuxagent.common.utils.fileutil as fileutil from azurelinuxagent.common.osutil import get_osutil from azurelinuxagent.common.dhcp import get_dhcp_handler from azurelinuxagent.common.protocol.ovfenv import OvfEnv from azurelinuxagent.common.protocol.wire import WireProtocol from azurelinuxagent.common.protocol.metadata import MetadataProtocol, \ METADATA_ENDPOINT +from azurelinuxagent.common.utils.restutil import IOErrorCounter OVF_FILE_NAME = "ovf-env.xml" TAG_FILE_NAME = "useMetadataEndpoint.tag" @@ -175,17 +177,20 @@ class ProtocolUtil(object): self.clear_protocol() for retry in range(0, MAX_RETRY): - for protocol in protocols: + for protocol_name in protocols: try: - if protocol == "WireProtocol": - return self._detect_wire_protocol() - - if protocol == "MetadataProtocol": - return self._detect_metadata_protocol() + protocol = self._detect_wire_protocol() \ + if protocol_name == "WireProtocol" \ + else self._detect_metadata_protocol() + + IOErrorCounter.set_protocol_endpoint( + endpoint=protocol.endpoint) + + return protocol except ProtocolError as e: logger.info("Protocol endpoint not found: {0}, {1}", - protocol, e) + protocol_name, e) if retry < MAX_RETRY -1: logger.info("Retry detect protocols: retry={0}", retry) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 4f3b7e0..963d33c 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -18,6 +18,7 @@ import json import os +import random import re import time import xml.sax.saxutils as saxutils @@ -27,7 +28,7 @@ import azurelinuxagent.common.utils.fileutil as fileutil import azurelinuxagent.common.utils.textutil as textutil from azurelinuxagent.common.exception import ProtocolNotFoundError, \ - ResourceGoneError + ResourceGoneError, RestartError from azurelinuxagent.common.future import httpclient, bytebuffer from azurelinuxagent.common.protocol.hostplugin import HostPluginProtocol from azurelinuxagent.common.protocol.restapi import * @@ -51,6 +52,7 @@ P7M_FILE_NAME = "Certificates.p7m" PEM_FILE_NAME = "Certificates.pem" EXT_CONF_FILE_NAME = "ExtensionsConfig.{0}.xml" MANIFEST_FILE_NAME = "{0}.{1}.manifest.xml" +AGENTS_MANIFEST_FILE_NAME = "{0}.{1}.agentsManifest" TRANSPORT_CERT_FILE_NAME = "TransportCert.pem" TRANSPORT_PRV_FILE_NAME = "TransportPrivate.pem" @@ -58,17 +60,12 @@ PROTOCOL_VERSION = "2012-11-30" ENDPOINT_FINE_NAME = "WireServer" SHORT_WAITING_INTERVAL = 1 # 1 second -LONG_WAITING_INTERVAL = 15 # 15 seconds class UploadError(HttpError): pass -class WireProtocolResourceGone(ProtocolError): - pass - - class WireProtocol(Protocol): """Slim layer to adapt wire protocol data to metadata protocol interface""" @@ -119,6 +116,13 @@ class WireProtocol(Protocol): certificates = self.client.get_certs() return certificates.cert_list + def get_incarnation(self): + path = os.path.join(conf.get_lib_dir(), INCARNATION_FILE_NAME) + if os.path.exists(path): + return fileutil.read_file(path) + else: + return 0 + def get_vmagent_manifests(self): # Update goal state to get latest extensions config self.update_goal_state() @@ -128,8 +132,9 @@ class WireProtocol(Protocol): def get_vmagent_pkgs(self, vmagent_manifest): goal_state = self.client.get_goal_state() - man = self.client.get_gafamily_manifest(vmagent_manifest, goal_state) - return man.pkg_list + ga_manifest = self.client.get_gafamily_manifest(vmagent_manifest, goal_state) + valid_pkg_list = self.client.filter_package_list(vmagent_manifest.family, ga_manifest, goal_state) + return valid_pkg_list def get_ext_handlers(self): logger.verbose("Get extension handler config") @@ -140,10 +145,9 @@ class WireProtocol(Protocol): # In wire protocol, incarnation is equivalent to ETag return ext_conf.ext_handlers, goal_state.incarnation - def get_ext_handler_pkgs(self, ext_handler): + def get_ext_handler_pkgs(self, ext_handler, etag): logger.verbose("Get extension handler package") - goal_state = self.client.get_goal_state() - man = self.client.get_ext_manifest(ext_handler, goal_state) + man = self.client.get_ext_manifest(ext_handler, etag) return man.pkg_list def get_artifacts_profile(self): @@ -156,10 +160,10 @@ class WireProtocol(Protocol): if package is not None: return package else: - logger.warn("Download did not succeed, falling back to host plugin") + logger.verbose("Download did not succeed, falling back to host plugin") host = self.client.get_host_plugin() uri, headers = host.get_artifact_request(uri, host.manifest_uri) - package = super(WireProtocol, self).download_ext_handler_pkg(uri, headers=headers) + package = super(WireProtocol, self).download_ext_handler_pkg(uri, headers=headers, use_proxy=False) return package def report_provision_status(self, provision_status): @@ -248,22 +252,28 @@ Convert VMStatus object to status blob format """ +def ga_status_to_guest_info(ga_status): + v1_ga_guest_info = { + "computerName" : ga_status.hostname, + "osName" : ga_status.osname, + "osVersion" : ga_status.osversion, + "version" : ga_status.version, + } + return v1_ga_guest_info + + def ga_status_to_v1(ga_status): formatted_msg = { 'lang': 'en-US', 'message': ga_status.message } v1_ga_status = { - 'version': ga_status.version, - 'status': ga_status.status, - 'osversion': ga_status.osversion, - 'osname': ga_status.osname, - 'hostname': ga_status.hostname, - 'formattedMessage': formatted_msg + "version" : ga_status.version, + "status" : ga_status.status, + "formattedMessage" : formatted_msg } return v1_ga_status - def ext_substatus_to_v1(sub_status_list): status_list = [] for substatus in sub_status_list: @@ -318,6 +328,9 @@ def ext_handler_status_to_v1(handler_status, ext_statuses, timestamp): "message": handler_status.message } + if handler_status.upgradeGuid is not None: + v1_handler_status["upgradeGuid"] = handler_status.upgradeGuid + if len(handler_status.extensions) > 0: # Currently, no more than one extension per handler ext_name = handler_status.extensions[0] @@ -334,6 +347,7 @@ def ext_handler_status_to_v1(handler_status, ext_statuses, timestamp): def vm_status_to_v1(vm_status, ext_statuses): timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + v1_ga_guest_info = ga_status_to_guest_info(vm_status.vmAgent) v1_ga_status = ga_status_to_v1(vm_status.vmAgent) v1_handler_status_list = [] for handler_status in vm_status.vmAgent.extensionHandlers: @@ -349,7 +363,8 @@ def vm_status_to_v1(vm_status, ext_statuses): v1_vm_status = { 'version': '1.1', 'timestampUTC': timestamp, - 'aggregateStatus': v1_agg_status + 'aggregateStatus': v1_agg_status, + 'guestOSInfo' : v1_ga_guest_info } return v1_vm_status @@ -512,51 +527,29 @@ class WireClient(object): self.shared_conf = None self.certs = None self.ext_conf = None - self.last_request = 0 - self.req_count = 0 self.host_plugin = None self.status_blob = StatusBlob(self) - def prevent_throttling(self): - """ - Try to avoid throttling of wire server - """ - now = time.time() - if now - self.last_request < 1: - logger.verbose("Last request issued less than 1 second ago") - logger.verbose("Sleep {0} second to avoid throttling.", - SHORT_WAITING_INTERVAL) - time.sleep(SHORT_WAITING_INTERVAL) - self.last_request = now - - self.req_count += 1 - if self.req_count % 3 == 0: - logger.verbose("Sleep {0} second to avoid throttling.", - SHORT_WAITING_INTERVAL) - time.sleep(SHORT_WAITING_INTERVAL) - self.req_count = 0 - def call_wireserver(self, http_req, *args, **kwargs): - self.prevent_throttling() - try: # Never use the HTTP proxy for wireserver kwargs['use_proxy'] = False resp = http_req(*args, **kwargs) + + if restutil.request_failed(resp): + msg = "[Wireserver Failed] URI {0} ".format(args[0]) + if resp is not None: + msg += " [HTTP Failed] Status Code {0}".format(resp.status) + raise ProtocolError(msg) + + # If the GoalState is stale, pass along the exception to the caller + except ResourceGoneError: + raise + except Exception as e: raise ProtocolError("[Wireserver Exception] {0}".format( ustr(e))) - if resp is not None and resp.status == httpclient.GONE: - msg = args[0] if len(args) > 0 else "" - raise WireProtocolResourceGone(msg) - - elif restutil.request_failed(resp): - msg = "[Wireserver Failed] URI {0} ".format(args[0]) - if resp is not None: - msg += " [HTTP Failed] Status Code {0}".format(resp.status) - raise ProtocolError(msg) - return resp def decode_config(self, data): @@ -598,7 +591,10 @@ class WireClient(object): def fetch_manifest(self, version_uris): logger.verbose("Fetch manifest") - for version in version_uris: + version_uris_shuffled = version_uris + random.shuffle(version_uris_shuffled) + + for version in version_uris_shuffled: response = None if not HostPluginProtocol.is_default_channel(): response = self.fetch(version.uri) @@ -698,26 +694,28 @@ class WireClient(object): self.ext_conf = ExtensionsConfig(xml_text) def update_goal_state(self, forced=False, max_retry=3): - uri = GOAL_STATE_URI.format(self.endpoint) - xml_text = self.fetch_config(uri, self.get_header()) - goal_state = GoalState(xml_text) - incarnation_file = os.path.join(conf.get_lib_dir(), INCARNATION_FILE_NAME) + uri = GOAL_STATE_URI.format(self.endpoint) - if not forced: - last_incarnation = None - if os.path.isfile(incarnation_file): - last_incarnation = fileutil.read_file(incarnation_file) - new_incarnation = goal_state.incarnation - if last_incarnation is not None and \ - last_incarnation == new_incarnation: - # Goalstate is not updated. - return - - # Start updating goalstate, retry on 410 + goal_state = None for retry in range(0, max_retry): try: + if goal_state is None: + xml_text = self.fetch_config(uri, self.get_header()) + goal_state = GoalState(xml_text) + + if not forced: + last_incarnation = None + if os.path.isfile(incarnation_file): + last_incarnation = fileutil.read_file( + incarnation_file) + new_incarnation = goal_state.incarnation + if last_incarnation is not None and \ + last_incarnation == new_incarnation: + # Goalstate is not updated. + return + self.goal_state = goal_state file_name = GOAL_STATE_FILE_NAME.format(goal_state.incarnation) goal_state_file = os.path.join(conf.get_lib_dir(), file_name) @@ -727,21 +725,29 @@ class WireClient(object): self.update_certs(goal_state) self.update_ext_conf(goal_state) self.save_cache(incarnation_file, goal_state.incarnation) + if self.host_plugin is not None: self.host_plugin.container_id = goal_state.container_id self.host_plugin.role_config_name = goal_state.role_config_name + return - except ProtocolError: + except ResourceGoneError: + logger.info("GoalState is stale -- re-fetching") + goal_state = None + + except Exception as e: + log_method = logger.info \ + if type(e) is ProtocolError \ + else logger.warn + log_method( + "Exception processing GoalState-related files: {0}".format( + ustr(e))) + if retry < max_retry-1: continue raise - except WireProtocolResourceGone: - logger.info("Incarnation is out of date. Update goalstate.") - xml_text = self.fetch_config(uri, self.get_header()) - goal_state = GoalState(xml_text) - raise ProtocolError("Exceeded max retry updating goal state") def get_goal_state(self): @@ -793,25 +799,70 @@ class WireClient(object): self.ext_conf = ExtensionsConfig(xml_text) return self.ext_conf - def get_ext_manifest(self, ext_handler, goal_state): + def get_ext_manifest(self, ext_handler, incarnation): + for update_goal_state in [False, True]: try: if update_goal_state: self.update_goal_state(forced=True) - goal_state = self.get_goal_state() + incarnation = self.get_goal_state().incarnation local_file = MANIFEST_FILE_NAME.format( ext_handler.name, - goal_state.incarnation) + incarnation) local_file = os.path.join(conf.get_lib_dir(), local_file) - xml_text = self.fetch_manifest(ext_handler.versionUris) - self.save_cache(local_file, xml_text) + + xml_text = None + if not update_goal_state: + try: + xml_text = self.fetch_cache(local_file) + except ProtocolError: + pass + + if xml_text is None: + xml_text = self.fetch_manifest(ext_handler.versionUris) + self.save_cache(local_file, xml_text) return ExtensionManifest(xml_text) except ResourceGoneError: continue - raise ProtocolError("Failed to retrieve extension manifest") + raise RestartError("Failed to retrieve extension manifest") + + def filter_package_list(self, family, ga_manifest, goal_state): + complete_list = ga_manifest.pkg_list + agent_manifest = os.path.join(conf.get_lib_dir(), + AGENTS_MANIFEST_FILE_NAME.format( + family, + goal_state.incarnation)) + + if not os.path.exists(agent_manifest): + # clear memory cache + ga_manifest.allowed_versions = None + + # create disk cache + with open(agent_manifest, mode='w') as manifest_fh: + for version in complete_list.versions: + manifest_fh.write('{0}\n'.format(version.version)) + fileutil.chmod(agent_manifest, 0o644) + + return complete_list + + else: + # use allowed versions from cache, otherwise from disk + if ga_manifest.allowed_versions is None: + with open(agent_manifest, mode='r') as manifest_fh: + ga_manifest.allowed_versions = [v.strip('\n') for v + in manifest_fh.readlines()] + + # use the updated manifest urls for allowed versions + allowed_list = ExtHandlerPackageList() + allowed_list.versions = [version for version + in complete_list.versions + if version.version + in ga_manifest.allowed_versions] + + return allowed_list def get_gafamily_manifest(self, vmagent_manifest, goal_state): for update_goal_state in [False, True]: @@ -844,7 +895,7 @@ class WireClient(object): logger.info("Wire protocol version:{0}", PROTOCOL_VERSION) elif PROTOCOL_VERSION in version_info.get_supported(): logger.info("Wire protocol version:{0}", PROTOCOL_VERSION) - logger.warn("Server preferred version:{0}", preferred) + logger.info("Server preferred version:{0}", preferred) else: error = ("Agent supported wire protocol version: {0} was not " "advised by Fabric.").format(PROTOCOL_VERSION) @@ -1360,6 +1411,9 @@ class ExtensionsConfig(object): ext_handler.properties.version = getattrib(plugin, "version") ext_handler.properties.state = getattrib(plugin, "state") + ext_handler.properties.upgradeGuid = getattrib(plugin, "upgradeGuid") + if not ext_handler.properties.upgradeGuid: + ext_handler.properties.upgradeGuid = None auto_upgrade = getattrib(plugin, "autoUpgrade") if auto_upgrade is not None and auto_upgrade.lower() == "true": ext_handler.properties.upgradePolicy = "auto" @@ -1418,6 +1472,7 @@ class ExtensionManifest(object): raise ValueError("ExtensionManifest is None") logger.verbose("Load ExtensionManifest.xml") self.pkg_list = ExtHandlerPackageList() + self.allowed_versions = None self.parse(xml_text) def parse(self, xml_text): diff --git a/azurelinuxagent/common/utils/cryptutil.py b/azurelinuxagent/common/utils/cryptutil.py index 6339eb3..b34c1a5 100644 --- a/azurelinuxagent/common/utils/cryptutil.py +++ b/azurelinuxagent/common/utils/cryptutil.py @@ -19,8 +19,11 @@ import base64 import struct + from azurelinuxagent.common.future import ustr, bytebuffer from azurelinuxagent.common.exception import CryptError + +import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.shellutil as shellutil class CryptUtil(object): @@ -34,7 +37,10 @@ class CryptUtil(object): cmd = ("{0} req -x509 -nodes -subj /CN=LinuxTransport -days 730 " "-newkey rsa:2048 -keyout {1} " "-out {2}").format(self.openssl_cmd, prv_file, crt_file) - shellutil.run(cmd) + rc = shellutil.run(cmd) + if rc != 0: + logger.error("Failed to create {0} and {1} certificates".format( + prv_file, crt_file)) def get_pubkey_from_prv(self, file_name): cmd = "{0} rsa -in {1} -pubout 2>/dev/null".format(self.openssl_cmd, @@ -61,6 +67,9 @@ class CryptUtil(object): "").format(self.openssl_cmd, p7m_file, trans_prv_file, trans_cert_file, self.openssl_cmd, pem_file) shellutil.run(cmd) + rc = shellutil.run(cmd) + if rc != 0: + logger.error("Failed to decrypt {0}".format(p7m_file)) def crt_to_ssh(self, input_file, output_file): shellutil.run("ssh-keygen -i -m PKCS8 -f {0} >> {1}".format(input_file, diff --git a/azurelinuxagent/common/utils/fileutil.py b/azurelinuxagent/common/utils/fileutil.py index 96b5b82..1f0c7ac 100644 --- a/azurelinuxagent/common/utils/fileutil.py +++ b/azurelinuxagent/common/utils/fileutil.py @@ -27,7 +27,6 @@ import os import pwd import re import shutil -import string import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.textutil as textutil @@ -42,9 +41,10 @@ KNOWN_IOERRORS = [ errno.ENOSPC, # Out of space errno.ENAMETOOLONG, # Name too long errno.ELOOP, # Too many symbolic links encountered - errno.EREMOTEIO # Remote I/O error + 121 # Remote I/O error (errno.EREMOTEIO -- not present in all Python 2.7+) ] + def copy_file(from_path, to_path=None, to_dir=None): if to_path is None: to_path = os.path.join(to_dir, os.path.basename(from_path)) @@ -66,11 +66,12 @@ def read_file(filepath, asbin=False, remove_bom=False, encoding='utf-8'): return data if remove_bom: - #Remove bom on bytes data before it is converted into string. + # remove bom on bytes data before it is converted into string. data = textutil.remove_bom(data) data = ustr(data, encoding=encoding) return data + def write_file(filepath, contents, asbin=False, encoding='utf-8', append=False): """ Write 'contents' to 'filepath'. @@ -82,6 +83,7 @@ def write_file(filepath, contents, asbin=False, encoding='utf-8', append=False): with open(filepath, mode) as out_file: out_file.write(data) + def append_file(filepath, contents, asbin=False, encoding='utf-8'): """ Append 'contents' to 'filepath'. @@ -93,6 +95,7 @@ def base_name(path): head, tail = os.path.split(path) return tail + def get_line_startingwith(prefix, filepath): """ Return line from 'filepath' if the line startswith 'prefix' @@ -102,7 +105,6 @@ def get_line_startingwith(prefix, filepath): return line return None -#End File operation util functions def mkdir(dirpath, mode=None, owner=None): if not os.path.isdir(dirpath): @@ -112,6 +114,7 @@ def mkdir(dirpath, mode=None, owner=None): if owner is not None: chowner(dirpath, owner) + def chowner(path, owner): if not os.path.exists(path): logger.error("Path does not exist: {0}".format(path)) @@ -119,19 +122,22 @@ def chowner(path, owner): owner_info = pwd.getpwnam(owner) os.chown(path, owner_info[2], owner_info[3]) + def chmod(path, mode): if not os.path.exists(path): logger.error("Path does not exist: {0}".format(path)) else: os.chmod(path, mode) + def rm_files(*args): for paths in args: - #Find all possible file paths + # find all possible file paths for path in glob.glob(paths): if os.path.isfile(path): os.remove(path) + def rm_dirs(*args): """ Remove the contents of each directry @@ -149,20 +155,24 @@ def rm_dirs(*args): elif os.path.isdir(path): shutil.rmtree(path) + def trim_ext(path, ext): if not ext.startswith("."): ext = "." + ext return path.split(ext)[0] if path.endswith(ext) else path + def update_conf_file(path, line_start, val, chk_err=False): conf = [] if not os.path.isfile(path) and chk_err: raise IOError("Can't find config file:{0}".format(path)) conf = read_file(path).split('\n') - conf = [x for x in conf if x is not None and len(x) > 0 and not x.startswith(line_start)] + conf = [x for x in conf + if x is not None and len(x) > 0 and not x.startswith(line_start)] conf.append(val) write_file(path, '\n'.join(conf) + '\n') + def search_file(target_dir_name, target_file_name): for root, dirs, files in os.walk(target_dir_name): for file_name in files: @@ -170,24 +180,28 @@ def search_file(target_dir_name, target_file_name): return os.path.join(root, file_name) return None + def chmod_tree(path, mode): for root, dirs, files in os.walk(path): for file_name in files: os.chmod(os.path.join(root, file_name), mode) + def findstr_in_file(file_path, line_str): """ Return True if the line is in the file; False otherwise. - (Trailing whitespace is ignore.) + (Trailing whitespace is ignored.) """ try: for line in (open(file_path, 'r')).readlines(): if line_str == line.rstrip(): return True - except Exception as e: + except Exception: + # swallow exception pass return False + def findre_in_file(file_path, line_re): """ Return match object if found in file. @@ -203,6 +217,7 @@ def findre_in_file(file_path, line_re): return None + def get_all_files(root_path): """ Find all files under the given root path @@ -213,6 +228,7 @@ def get_all_files(root_path): return result + def clean_ioerror(e, paths=[]): """ Clean-up possibly bad files and directories after an IO error. @@ -228,5 +244,6 @@ def clean_ioerror(e, paths=[]): shutil.rmtree(path, ignore_errors=True) else: os.remove(path) - except Exception as e: + except Exception: + # swallow exception pass diff --git a/azurelinuxagent/common/utils/flexible_version.py b/azurelinuxagent/common/utils/flexible_version.py index 2fce88d..14c2a73 100644 --- a/azurelinuxagent/common/utils/flexible_version.py +++ b/azurelinuxagent/common/utils/flexible_version.py @@ -37,7 +37,7 @@ class FlexibleVersion(version.Version): self.prerelease = None self.version = () if vstring: - self._parse(vstring) + self._parse(str(vstring)) return _nn_version = 'version' diff --git a/azurelinuxagent/common/utils/restutil.py b/azurelinuxagent/common/utils/restutil.py index ddd930b..807be29 100644 --- a/azurelinuxagent/common/utils/restutil.py +++ b/azurelinuxagent/common/utils/restutil.py @@ -18,6 +18,7 @@ # import os +import threading import time import traceback @@ -32,10 +33,11 @@ from azurelinuxagent.common.version import PY_VERSION_MAJOR SECURE_WARNING_EMITTED = False -DEFAULT_RETRIES = 3 +DEFAULT_RETRIES = 6 +DELAY_IN_SECONDS = 1 -SHORT_DELAY_IN_SECONDS = 5 -LONG_DELAY_IN_SECONDS = 15 +THROTTLE_RETRIES = 25 +THROTTLE_DELAY_IN_SECONDS = 1 RETRY_CODES = [ httpclient.RESET_CONTENT, @@ -63,7 +65,8 @@ OK_CODES = [ THROTTLE_CODES = [ httpclient.FORBIDDEN, - httpclient.SERVICE_UNAVAILABLE + httpclient.SERVICE_UNAVAILABLE, + 429, # Request Rate Limit Exceeded ] RETRY_EXCEPTIONS = [ @@ -76,6 +79,48 @@ RETRY_EXCEPTIONS = [ HTTP_PROXY_ENV = "http_proxy" HTTPS_PROXY_ENV = "https_proxy" +DEFAULT_PROTOCOL_ENDPOINT='168.63.129.16' +HOST_PLUGIN_PORT = 32526 + + +class IOErrorCounter(object): + _lock = threading.RLock() + _protocol_endpoint = DEFAULT_PROTOCOL_ENDPOINT + _counts = {"hostplugin":0, "protocol":0, "other":0} + + @staticmethod + def increment(host=None, port=None): + with IOErrorCounter._lock: + if host == IOErrorCounter._protocol_endpoint: + if port == HOST_PLUGIN_PORT: + IOErrorCounter._counts["hostplugin"] += 1 + else: + IOErrorCounter._counts["protocol"] += 1 + else: + IOErrorCounter._counts["other"] += 1 + + @staticmethod + def get_and_reset(): + with IOErrorCounter._lock: + counts = IOErrorCounter._counts.copy() + IOErrorCounter.reset() + return counts + + @staticmethod + def reset(): + with IOErrorCounter._lock: + IOErrorCounter._counts = {"hostplugin":0, "protocol":0, "other":0} + + @staticmethod + def set_protocol_endpoint(endpoint=DEFAULT_PROTOCOL_ENDPOINT): + IOErrorCounter._protocol_endpoint = endpoint + + +def _compute_delay(retry_attempt=1, delay=DELAY_IN_SECONDS): + fib = (1, 1) + for n in range(retry_attempt): + fib = (fib[1], fib[0]+fib[1]) + return delay*fib[1] def _is_retry_status(status, retry_codes=RETRY_CODES): return status in retry_codes @@ -166,7 +211,7 @@ def http_request(method, use_proxy=False, max_retry=DEFAULT_RETRIES, retry_codes=RETRY_CODES, - retry_delay=SHORT_DELAY_IN_SECONDS): + retry_delay=DELAY_IN_SECONDS): global SECURE_WARNING_EMITTED @@ -208,18 +253,31 @@ def http_request(method, msg = '' attempt = 0 - delay = retry_delay + delay = 0 + was_throttled = False while attempt < max_retry: if attempt > 0: - logger.info("[HTTP Retry] Attempt {0} of {1}: {2}", + # Compute the request delay + # -- Use a fixed delay if the server ever rate-throttles the request + # (with a safe, minimum number of retry attempts) + # -- Otherwise, compute a delay that is the product of the next + # item in the Fibonacci series and the initial delay value + delay = THROTTLE_DELAY_IN_SECONDS \ + if was_throttled \ + else _compute_delay(retry_attempt=attempt, + delay=retry_delay) + + logger.verbose("[HTTP Retry] " + "Attempt {0} of {1} will delay {2} seconds: {3}", attempt+1, max_retry, + delay, msg) + time.sleep(delay) attempt += 1 - delay = retry_delay try: resp = _http_request(method, @@ -235,13 +293,13 @@ def http_request(method, if request_failed(resp): if _is_retry_status(resp.status, retry_codes=retry_codes): - msg = '[HTTP Retry] HTTP {0} Status Code {1}'.format( - method, resp.status) + msg = '[HTTP Retry] {0} {1} -- Status Code {2}'.format( + method, url, resp.status) + # Note if throttled and ensure a safe, minimum number of + # retry attempts if _is_throttle_status(resp.status): - delay = LONG_DELAY_IN_SECONDS - logger.info("[HTTP Delay] Delay {0} seconds for " \ - "Status Code {1}".format( - delay, resp.status)) + was_throttled = True + max_retry = max(max_retry, THROTTLE_RETRIES) continue if resp.status in RESOURCE_GONE_CODES: @@ -250,22 +308,25 @@ def http_request(method, return resp except httpclient.HTTPException as e: - msg = '[HTTP Failed] HTTP {0} HttpException {1}'.format(method, e) + msg = '[HTTP Failed] {0} {1} -- HttpException {2}'.format( + method, url, e) if _is_retry_exception(e): continue break except IOError as e: - msg = '[HTTP Failed] HTTP {0} IOError {1}'.format(method, e) + IOErrorCounter.increment(host=host, port=port) + msg = '[HTTP Failed] {0} {1} -- IOError {2}'.format( + method, url, e) continue - raise HttpError(msg) + raise HttpError("{0} -- {1} attempts made".format(msg,attempt)) def http_get(url, headers=None, use_proxy=False, max_retry=DEFAULT_RETRIES, retry_codes=RETRY_CODES, - retry_delay=SHORT_DELAY_IN_SECONDS): + retry_delay=DELAY_IN_SECONDS): return http_request("GET", url, None, headers=headers, use_proxy=use_proxy, @@ -277,7 +338,7 @@ def http_get(url, headers=None, use_proxy=False, def http_head(url, headers=None, use_proxy=False, max_retry=DEFAULT_RETRIES, retry_codes=RETRY_CODES, - retry_delay=SHORT_DELAY_IN_SECONDS): + retry_delay=DELAY_IN_SECONDS): return http_request("HEAD", url, None, headers=headers, use_proxy=use_proxy, @@ -289,7 +350,7 @@ def http_head(url, headers=None, use_proxy=False, def http_post(url, data, headers=None, use_proxy=False, max_retry=DEFAULT_RETRIES, retry_codes=RETRY_CODES, - retry_delay=SHORT_DELAY_IN_SECONDS): + retry_delay=DELAY_IN_SECONDS): return http_request("POST", url, data, headers=headers, use_proxy=use_proxy, @@ -301,7 +362,7 @@ def http_post(url, data, headers=None, use_proxy=False, def http_put(url, data, headers=None, use_proxy=False, max_retry=DEFAULT_RETRIES, retry_codes=RETRY_CODES, - retry_delay=SHORT_DELAY_IN_SECONDS): + retry_delay=DELAY_IN_SECONDS): return http_request("PUT", url, data, headers=headers, use_proxy=use_proxy, @@ -313,7 +374,7 @@ def http_put(url, data, headers=None, use_proxy=False, def http_delete(url, headers=None, use_proxy=False, max_retry=DEFAULT_RETRIES, retry_codes=RETRY_CODES, - retry_delay=SHORT_DELAY_IN_SECONDS): + retry_delay=DELAY_IN_SECONDS): return http_request("DELETE", url, None, headers=headers, use_proxy=use_proxy, diff --git a/azurelinuxagent/common/utils/shellutil.py b/azurelinuxagent/common/utils/shellutil.py index fff6aa8..7b8e0c9 100644 --- a/azurelinuxagent/common/utils/shellutil.py +++ b/azurelinuxagent/common/utils/shellutil.py @@ -84,16 +84,22 @@ def run_get_output(cmd, chk_err=True, log_cmd=True): output = ustr(output, encoding='utf-8', errors="backslashreplace") - except subprocess.CalledProcessError as e: - output = ustr(e.output, - encoding='utf-8', - errors="backslashreplace") - if chk_err: - if log_cmd: - logger.error(u"Command: '{0}'", e.cmd) - logger.error(u"Return code: {0}", e.returncode) - logger.error(u"Result: {0}", output) - return e.returncode, output + except Exception as e: + if type(e) is subprocess.CalledProcessError: + output = ustr(e.output, + encoding='utf-8', + errors="backslashreplace") + if chk_err: + if log_cmd: + logger.error(u"Command: '{0}'", e.cmd) + logger.error(u"Return code: {0}", e.returncode) + logger.error(u"Result: {0}", output) + return e.returncode, output + else: + logger.error( + u"'{0}' raised unexpected exception: '{1}'".format( + cmd, ustr(e))) + return -1, ustr(e) return 0, output diff --git a/azurelinuxagent/common/version.py b/azurelinuxagent/common/version.py index e1bb828..f743f11 100644 --- a/azurelinuxagent/common/version.py +++ b/azurelinuxagent/common/version.py @@ -113,7 +113,7 @@ def get_distro(): AGENT_NAME = "WALinuxAgent" AGENT_LONG_NAME = "Azure Linux Agent" -AGENT_VERSION = '2.2.17' +AGENT_VERSION = '2.2.21' AGENT_LONG_VERSION = "{0}-{1}".format(AGENT_NAME, AGENT_VERSION) AGENT_DESCRIPTION = """ The Azure Linux Agent supports the provisioning and running of Linux @@ -161,14 +161,17 @@ def set_current_agent(): version = AGENT_VERSION return agent, FlexibleVersion(version) + def is_agent_package(path): path = os.path.basename(path) return not re.match(AGENT_PKG_PATTERN, path) is None + def is_agent_path(path): path = os.path.basename(path) return not re.match(AGENT_NAME_PATTERN, path) is None + CURRENT_AGENT, CURRENT_VERSION = set_current_agent() diff --git a/azurelinuxagent/ga/env.py b/azurelinuxagent/ga/env.py index 45b10bb..d9b7d82 100644 --- a/azurelinuxagent/ga/env.py +++ b/azurelinuxagent/ga/env.py @@ -17,11 +17,16 @@ # Requires Python 2.4+ and Openssl 1.0+ # +import re import os import socket import time import threading +import operator + +import datetime + import azurelinuxagent.common.conf as conf import azurelinuxagent.common.logger as logger @@ -29,15 +34,29 @@ from azurelinuxagent.common.dhcp import get_dhcp_handler from azurelinuxagent.common.event import add_periodic, WALAEventOperation from azurelinuxagent.common.osutil import get_osutil from azurelinuxagent.common.protocol import get_protocol_util +from azurelinuxagent.common.protocol.wire import INCARNATION_FILE_NAME +from azurelinuxagent.common.utils import fileutil from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION +CACHE_PATTERNS = [ + re.compile("^(.*)\.(\d+)\.(agentsManifest)$", re.IGNORECASE), + re.compile("^(.*)\.(\d+)\.(manifest\.xml)$", re.IGNORECASE), + re.compile("^(.*)\.(\d+)\.(xml)$", re.IGNORECASE) +] + +MAXIMUM_CACHED_FILES = 50 + +CACHE_PURGE_INTERVAL = datetime.timedelta(hours=24) + + def get_env_handler(): return EnvHandler() + class EnvHandler(object): """ Monitor changes to dhcp and hostname. - If dhcp clinet process re-start has occurred, reset routes, dhcp with fabric. + If dhcp client process re-start has occurred, reset routes, dhcp with fabric. Monitor scsi disk. If new scsi disk found, set timeout @@ -48,9 +67,10 @@ class EnvHandler(object): self.protocol_util = get_protocol_util() self.stopped = True self.hostname = None - self.dhcpid = None + self.dhcp_id = None self.server_thread = None self.dhcp_warning_enabled = True + self.last_purge = None def run(self): if not self.stopped: @@ -61,7 +81,7 @@ class EnvHandler(object): logger.info("Start env monitor service.") self.dhcp_handler.conf_routes() self.hostname = self.osutil.get_hostname_record() - self.dhcpid = self.osutil.get_dhcp_pid() + self.dhcp_id = self.osutil.get_dhcp_pid() self.server_thread = threading.Thread(target=self.monitor) self.server_thread.setDaemon(True) self.server_thread.start() @@ -70,26 +90,24 @@ class EnvHandler(object): """ Monitor firewall rules Monitor dhcp client pid and hostname. - If dhcp clinet process re-start has occurred, reset routes. + If dhcp client process re-start has occurred, reset routes. + Purge unnecessary files from disk cache. """ protocol = self.protocol_util.get_protocol() while not self.stopped: self.osutil.remove_rules_files() - # Disable setting firewall for now, regardless of configuration switch - # if conf.enable_firewall(): - # success = self.osutil.enable_firewall( - # dst_ip=protocol.endpoint, - # uid=os.getuid()) - # add_periodic( - # logger.EVERY_HOUR, - # AGENT_NAME, - # version=CURRENT_VERSION, - # op=WALAEventOperation.Firewall, - # is_success=success, - # log_event=True) - - self.osutil.remove_firewall() + if conf.enable_firewall(): + success = self.osutil.enable_firewall( + dst_ip=protocol.endpoint, + uid=os.getuid()) + add_periodic( + logger.EVERY_HOUR, + AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.Firewall, + is_success=success, + log_event=False) timeout = conf.get_root_device_scsi_timeout() if timeout is not None: @@ -100,6 +118,8 @@ class EnvHandler(object): self.handle_dhclient_restart() + self.purge_disk_cache() + time.sleep(5) def handle_hostname_update(self): @@ -113,30 +133,95 @@ class EnvHandler(object): self.hostname = curr_hostname def handle_dhclient_restart(self): - if self.dhcpid is None: + if self.dhcp_id is None: if self.dhcp_warning_enabled: logger.warn("Dhcp client is not running. ") - self.dhcpid = self.osutil.get_dhcp_pid() + self.dhcp_id = self.osutil.get_dhcp_pid() # disable subsequent error logging - self.dhcp_warning_enabled = self.dhcpid is not None + self.dhcp_warning_enabled = self.dhcp_id is not None + return + + # the dhcp process has not changed since the last check + if self.osutil.check_pid_alive(self.dhcp_id.strip()): return - #The dhcp process hasn't changed since last check - if self.osutil.check_pid_alive(self.dhcpid.strip()): + new_pid = self.osutil.get_dhcp_pid() + if new_pid is not None and new_pid != self.dhcp_id: + logger.info("EnvMonitor: Detected dhcp client restart. " + "Restoring routing table.") + self.dhcp_handler.conf_routes() + self.dhcp_id = new_pid + + def purge_disk_cache(self): + """ + Ensure the number of cached files does not exceed a maximum count. + Purge only once per interval, and never delete files related to the + current incarnation. + """ + if self.last_purge is not None \ + and datetime.datetime.utcnow() < \ + self.last_purge + CACHE_PURGE_INTERVAL: + return + + current_incarnation = -1 + self.last_purge = datetime.datetime.utcnow() + incarnation_file = os.path.join(conf.get_lib_dir(), + INCARNATION_FILE_NAME) + if os.path.exists(incarnation_file): + last_incarnation = fileutil.read_file(incarnation_file) + if last_incarnation is not None: + current_incarnation = int(last_incarnation) + + logger.info("Purging disk cache, current incarnation is {0}" + .format('not found' + if current_incarnation == -1 + else current_incarnation)) + + # Create tuples: (prefix, suffix, incarnation, name, file_modified) + files = [] + for f in os.listdir(conf.get_lib_dir()): + full_path = os.path.join(conf.get_lib_dir(), f) + for pattern in CACHE_PATTERNS: + m = pattern.match(f) + if m is not None: + prefix = m.group(1) + suffix = m.group(3) + incarnation = int(m.group(2)) + file_modified = os.path.getmtime(full_path) + t = (prefix, suffix, incarnation, f, file_modified) + files.append(t) + break + + if len(files) <= 0: return - newpid = self.osutil.get_dhcp_pid() - if newpid is not None and newpid != self.dhcpid: - logger.info("EnvMonitor: Detected dhcp client restart. " - "Restoring routing table.") - self.dhcp_handler.conf_routes() - self.dhcpid = newpid + # Sort by (prefix, suffix, file_modified) in reverse order + files = sorted(files, key=operator.itemgetter(0, 1, 4), reverse=True) + + # Remove any files in excess of the maximum allowed + # -- Restart then whenever the (prefix, suffix) change + count = 0 + last_match = [None, None] + for f in files: + if last_match != f[0:2]: + last_match = f[0:2] + count = 0 + + if current_incarnation == f[2]: + logger.verbose("Skipping {0}".format(f[3])) + continue + + count += 1 + + if count > MAXIMUM_CACHED_FILES: + full_name = os.path.join(conf.get_lib_dir(), f[3]) + logger.verbose("Deleting {0}".format(full_name)) + os.remove(full_name) def stop(self): """ - Stop server comminucation and join the thread to main thread. + Stop server communication and join the thread to main thread. """ self.stopped = True if self.server_thread is not None: self.server_thread.join() - diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index f0a3b09..cc1796b 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -17,10 +17,12 @@ # Requires Python 2.4+ and Openssl 1.0+ # +import datetime import glob import json import os import os.path +import random import re import shutil import stat @@ -31,25 +33,22 @@ import zipfile import azurelinuxagent.common.conf as conf import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.fileutil as fileutil -import azurelinuxagent.common.utils.restutil as restutil -import azurelinuxagent.common.utils.shellutil as shellutil import azurelinuxagent.common.version as version +from azurelinuxagent.common.errorstate import ErrorState, ERROR_STATE_DELTA -from azurelinuxagent.common.event import add_event, WALAEventOperation -from azurelinuxagent.common.exception import ExtensionError, ProtocolError, HttpError +from azurelinuxagent.common.event import add_event, WALAEventOperation, elapsed_milliseconds +from azurelinuxagent.common.exception import ExtensionError, ProtocolError, RestartError from azurelinuxagent.common.future import ustr -from azurelinuxagent.common.version import AGENT_VERSION from azurelinuxagent.common.protocol.restapi import ExtHandlerStatus, \ ExtensionStatus, \ ExtensionSubStatus, \ - Extension, \ VMStatus, ExtHandler, \ get_properties, \ set_properties from azurelinuxagent.common.utils.flexible_version import FlexibleVersion -from azurelinuxagent.common.utils.textutil import Version from azurelinuxagent.common.protocol import get_protocol_util -from azurelinuxagent.common.version import AGENT_NAME, CURRENT_AGENT, CURRENT_VERSION +from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION + #HandlerEnvironment.json schema version HANDLER_ENVIRONMENT_VERSION = 1.0 @@ -64,7 +63,6 @@ HANDLER_PKG_EXT = ".zip" HANDLER_PKG_PATTERN = re.compile(HANDLER_PATTERN+"\\"+HANDLER_PKG_EXT+"$", re.IGNORECASE) - def validate_has_key(obj, key, fullname): if key not in obj: raise ExtensionError("Missing: {0}".format(fullname)) @@ -176,35 +174,73 @@ class ExtHandlersHandler(object): self.protocol = None self.ext_handlers = None self.last_etag = None + self.last_upgrade_guids = {} self.log_report = False self.log_etag = True + self.log_process = False + + self.report_status_error_state = ErrorState() def run(self): self.ext_handlers, etag = None, None try: self.protocol = self.protocol_util.get_protocol() self.ext_handlers, etag = self.protocol.get_ext_handlers() - except ProtocolError as e: + except Exception as e: msg = u"Exception retrieving extension handlers: {0}".format( ustr(e)) logger.warn(msg) - add_event(AGENT_NAME, version=CURRENT_VERSION, is_success=False, message=msg) + add_event(AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.ExtensionProcessing, + is_success=False, + message=msg) return - msg = u"Handle extensions updates for incarnation {0}".format(etag) - logger.verbose(msg) - # Log status report success on new config - self.log_report = True - self.handle_ext_handlers(etag) - self.last_etag = etag - - self.report_ext_handlers_status() - self.cleanup_outdated_handlers() + try: + msg = u"Handle extensions updates for incarnation {0}".format(etag) + logger.verbose(msg) + # Log status report success on new config + self.log_report = True + self.handle_ext_handlers(etag) + self.last_etag = etag + + self.report_ext_handlers_status() + self.cleanup_outdated_handlers() + except RestartError: + raise + except Exception as e: + msg = u"Exception processing extension handlers: {0}".format( + ustr(e)) + logger.warn(msg) + add_event(AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.ExtensionProcessing, + is_success=False, + message=msg) + return def run_status(self): self.report_ext_handlers_status() return + def get_upgrade_guid(self, name): + return self.last_upgrade_guids.get(name, (None, False))[0] + + def get_log_upgrade_guid(self, ext_handler): + return self.last_upgrade_guids.get(ext_handler.name, (None, False))[1] + + def set_log_upgrade_guid(self, ext_handler, log_val): + guid = self.get_upgrade_guid(ext_handler.name) + if guid is not None: + self.last_upgrade_guids[ext_handler.name] = (guid, log_val) + + def is_new_guid(self, ext_handler): + last_guid = self.get_upgrade_guid(ext_handler.name) + if last_guid is None: + return True + return last_guid != ext_handler.properties.upgradeGuid + def cleanup_outdated_handlers(self): handlers = [] pkgs = [] @@ -288,7 +324,29 @@ class ExtHandlersHandler(object): ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol) try: - ext_handler_i.decide_version() + state = ext_handler.properties.state + # The extension is to be enabled, there is an upgrade GUID + # and the GUID is NOT new + if state == u"enabled" and \ + ext_handler.properties.upgradeGuid is not None and \ + not self.is_new_guid(ext_handler): + ext_handler_i.ext_handler.properties.version = ext_handler_i.get_installed_version() + ext_handler_i.set_logger() + if self.last_etag != etag: + self.set_log_upgrade_guid(ext_handler, True) + + msg = "New GUID is the same as the old GUID. Exiting without upgrading." + if self.get_log_upgrade_guid(ext_handler): + ext_handler_i.logger.info(msg) + self.set_log_upgrade_guid(ext_handler, False) + ext_handler_i.set_handler_state(ExtHandlerState.Enabled) + ext_handler_i.set_handler_status(status="Ready", message="No change") + ext_handler_i.set_operation(WALAEventOperation.SkipUpdate) + ext_handler_i.report_event(message=ustr(msg), is_success=True) + return + + self.set_log_upgrade_guid(ext_handler, True) + ext_handler_i.decide_version(etag=etag, target_state=state) if not ext_handler_i.is_upgrade and self.last_etag == etag: if self.log_etag: ext_handler_i.logger.verbose("Version {0} is current for etag {1}", @@ -299,22 +357,34 @@ class ExtHandlersHandler(object): self.log_etag = True - state = ext_handler.properties.state ext_handler_i.logger.info("Target handler state: {0}", state) if state == u"enabled": self.handle_enable(ext_handler_i) + if ext_handler.properties.upgradeGuid is not None: + ext_handler_i.logger.info("New Upgrade GUID: {0}", ext_handler.properties.upgradeGuid) + self.last_upgrade_guids[ext_handler.name] = (ext_handler.properties.upgradeGuid, True) elif state == u"disabled": self.handle_disable(ext_handler_i) + # Remove the GUID from the dictionary so that it is upgraded upon re-enable + self.last_upgrade_guids.pop(ext_handler.name, None) elif state == u"uninstall": self.handle_uninstall(ext_handler_i) + # Remove the GUID from the dictionary so that it is upgraded upon re-install + self.last_upgrade_guids.pop(ext_handler.name, None) else: message = u"Unknown ext handler state:{0}".format(state) raise ExtensionError(message) - except ExtensionError as e: + except RestartError: + ext_handler_i.logger.info("GoalState became stale during " + "processing. Restarting with new " + "GoalState") + raise + except Exception as e: ext_handler_i.set_handler_status(message=ustr(e), code=-1) ext_handler_i.report_event(message=ustr(e), is_success=False) def handle_enable(self, ext_handler_i): + self.log_process = True old_ext_handler_i = ext_handler_i.get_installed_ext_handler() if old_ext_handler_i is not None and \ old_ext_handler_i.version_gt(ext_handler_i): @@ -341,6 +411,7 @@ class ExtHandlersHandler(object): ext_handler_i.enable() def handle_disable(self, ext_handler_i): + self.log_process = True handler_state = ext_handler_i.get_handler_state() ext_handler_i.logger.info("[Disable] current handler state is: {0}", handler_state.lower()) @@ -348,6 +419,7 @@ class ExtHandlersHandler(object): ext_handler_i.disable() def handle_uninstall(self, ext_handler_i): + self.log_process = True handler_state = ext_handler_i.get_handler_state() ext_handler_i.logger.info("[Uninstall] current handler state is: {0}", handler_state.lower()) @@ -356,7 +428,7 @@ class ExtHandlersHandler(object): ext_handler_i.disable() ext_handler_i.uninstall() ext_handler_i.rm_ext_handler_dir() - + def report_ext_handlers_status(self): """Go through handler_state dir, collect and report status""" vm_status = VMStatus(status="Ready", message="Guest Agent is running") @@ -368,6 +440,7 @@ class ExtHandlersHandler(object): add_event( AGENT_NAME, version=CURRENT_VERSION, + op=WALAEventOperation.ExtensionProcessing, is_success=False, message=ustr(e)) @@ -376,9 +449,27 @@ class ExtHandlersHandler(object): self.protocol.report_vm_status(vm_status) if self.log_report: logger.verbose("Completed vm agent status report") + self.report_status_error_state.reset() except ProtocolError as e: + self.report_status_error_state.incr() message = "Failed to report vm agent status: {0}".format(e) - add_event(AGENT_NAME, version=CURRENT_VERSION, is_success=False, message=message) + add_event(AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.ExtensionProcessing, + is_success=False, + message=message) + + if self.report_status_error_state.is_triggered(): + message = "Failed to report vm agent status for more than {0}"\ + .format(ERROR_STATE_DELTA) + + add_event(AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.ExtensionProcessing, + is_success=False, + message=message) + + self.report_status_error_state.reset() def report_ext_handler_status(self, vm_status, ext_handler): ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol) @@ -386,6 +477,9 @@ class ExtHandlersHandler(object): handler_status = ext_handler_i.get_handler_status() if handler_status is None: return + guid = self.get_upgrade_guid(ext_handler.name) + if guid is not None: + handler_status.upgradeGuid = guid handler_state = ext_handler_i.get_handler_state() if handler_state != ExtHandlerState.NotInstalled: @@ -413,9 +507,7 @@ class ExtHandlerInstance(object): self.pkg = None self.pkg_file = None self.is_upgrade = False - - prefix = "[{0}]".format(self.get_full_name()) - self.logger = logger.Logger(logger.DEFAULT_LOGGER, prefix) + self.set_logger() try: fileutil.mkdir(self.get_log_dir(), mode=0o755) @@ -426,15 +518,13 @@ class ExtHandlerInstance(object): self.logger.add_appender(logger.AppenderType.FILE, logger.LogLevel.INFO, log_file) - def decide_version(self): + def decide_version(self, etag, target_state=None): self.logger.verbose("Decide which version to use") - try: - pkg_list = self.protocol.get_ext_handler_pkgs(self.ext_handler) - except ProtocolError as e: - raise ExtensionError("Failed to get ext handler pkgs", e) + pkg_list = self.protocol.get_ext_handler_pkgs(self.ext_handler, etag) # Determine the desired and installed versions - requested_version = FlexibleVersion(self.ext_handler.properties.version) + requested_version = FlexibleVersion( + str(self.ext_handler.properties.version)) installed_version_string = self.get_installed_version() installed_version = requested_version \ if installed_version_string is None \ @@ -511,13 +601,22 @@ class ExtHandlerInstance(object): # Note: # - A downgrade, which will be bound to the same major version, # is allowed if the installed version is no longer available - if selected_pkg is None \ + if target_state == u"uninstall": + if installed_pkg is None: + msg = "Failed to find installed version of {0} " \ + "to uninstall".format(self.ext_handler.name) + self.logger.warn(msg) + self.pkg = installed_pkg + self.ext_handler.properties.version = str(installed_version) \ + if installed_version is not None else None + elif selected_pkg is None \ or (installed_pkg is not None and selected_version < installed_version): self.pkg = installed_pkg - self.ext_handler.properties.version = installed_version + self.ext_handler.properties.version = str(installed_version) \ + if installed_version is not None else None else: self.pkg = selected_pkg - self.ext_handler.properties.version = selected_pkg.version + self.ext_handler.properties.version = str(selected_pkg.version) # Note if the selected package is greater than that installed if installed_pkg is None \ @@ -528,12 +627,17 @@ class ExtHandlerInstance(object): raise ExtensionError("Failed to find any valid extension package") self.logger.verbose("Use version: {0}", self.pkg.version) + self.set_logger() return + def set_logger(self): + prefix = "[{0}]".format(self.get_full_name()) + self.logger = logger.Logger(logger.DEFAULT_LOGGER, prefix) + def version_gt(self, other): self_version = self.ext_handler.properties.version other_version = other.ext_handler.properties.version - return Version(self_version) > Version(other_version) + return FlexibleVersion(self_version) > FlexibleVersion(other_version) def get_installed_ext_handler(self): lastest_version = self.get_installed_version() @@ -585,23 +689,26 @@ class ExtHandlerInstance(object): status_file = os.path.join(old_ext_status_dir, status_file) if os.path.isfile(status_file): shutil.copy2(status_file, new_ext_status_dir) - + def set_operation(self, op): self.operation = op - def report_event(self, message="", is_success=True): + def report_event(self, message="", is_success=True, duration=0): version = self.ext_handler.properties.version add_event(name=self.ext_handler.name, version=version, message=message, - op=self.operation, is_success=is_success) + op=self.operation, is_success=is_success, duration=duration) def download(self): + begin_utc = datetime.datetime.utcnow() self.logger.verbose("Download extension package") self.set_operation(WALAEventOperation.Download) if self.pkg is None: raise ExtensionError("No package uri found") package = None - for uri in self.pkg.uris: + uris_shuffled = self.pkg.uris + random.shuffle(uris_shuffled) + for uri in uris_shuffled: try: package = self.protocol.download_ext_handler_pkg(uri.uri) if package is not None: @@ -627,7 +734,8 @@ class ExtHandlerInstance(object): for file in fileutil.get_all_files(self.get_base_dir()): fileutil.chmod(file, os.stat(file).st_mode | stat.S_IXUSR) - self.report_event(message="Download succeeded") + duration = elapsed_milliseconds(begin_utc) + self.report_event(message="Download succeeded", duration=duration) self.logger.info("Initialize extension directory") #Save HandlerManifest.json @@ -649,8 +757,25 @@ class ExtHandlerInstance(object): try: status_dir = self.get_status_dir() fileutil.mkdir(status_dir, mode=0o700) + + seq_no, status_path = self.get_status_file_path() + if seq_no > -1: + now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") + status = { + "version": 1.0, + "timestampUTC" : now, + "status" : { + "name" : self.ext_handler.name, + "operation" : "Enabling Handler", + "status" : "transitioning", + "code" : 0 + } + } + fileutil.write_file(json.dumps(status), status_path) + conf_dir = self.get_conf_dir() fileutil.mkdir(conf_dir, mode=0o700) + except IOError as e: fileutil.clean_ioerror(e, paths=[self.get_base_dir(), self.pkg_file]) @@ -740,17 +865,24 @@ class ExtHandlerInstance(object): continue return seq_no + def get_status_file_path(self): + seq_no = self.get_largest_seq_no() + path = None + + if seq_no > -1: + path = os.path.join( + self.get_status_dir(), + "{0}.status".format(seq_no)) + + return seq_no, path + def collect_ext_status(self, ext): self.logger.verbose("Collect extension status") - seq_no = self.get_largest_seq_no() + seq_no, ext_status_file = self.get_status_file_path() if seq_no == -1: return None - status_dir = self.get_status_dir() - ext_status_file = "{0}.status".format(seq_no) - ext_status_file = os.path.join(status_dir, ext_status_file) - ext_status = ExtensionStatus(seq_no=seq_no) try: data_str = fileutil.read_file(ext_status_file) @@ -810,6 +942,7 @@ class ExtHandlerInstance(object): return last_update <= 600 # updated within the last 10 min def launch_command(self, cmd, timeout=300): + begin_utc = datetime.datetime.utcnow() self.logger.verbose("Launch command: [{0}]", cmd) base_dir = self.get_base_dir() try: @@ -835,7 +968,8 @@ class ExtHandlerInstance(object): if ret == None or ret != 0: raise ExtensionError("Non-zero exit code: {0}, {1}".format(ret, cmd)) - self.report_event(message="Launch command succeeded: {0}".format(cmd)) + duration = elapsed_milliseconds(begin_utc) + self.report_event(message="Launch command succeeded: {0}".format(cmd), duration=duration) def load_manifest(self): man_file = self.get_manifest_file() @@ -929,7 +1063,7 @@ class ExtHandlerInstance(object): handler_status = ExtHandlerStatus() handler_status.name = self.ext_handler.name - handler_status.version = self.ext_handler.properties.version + handler_status.version = str(self.ext_handler.properties.version) handler_status.message = message handler_status.code = code handler_status.status = status @@ -980,7 +1114,7 @@ class ExtHandlerInstance(object): def get_log_dir(self): return os.path.join(conf.get_ext_log_dir(), self.ext_handler.name, - self.ext_handler.properties.version) + str(self.ext_handler.properties.version)) class HandlerEnvironment(object): def __init__(self, data): diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 307a514..71ac9b0 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -21,8 +21,10 @@ import os import platform import time import threading +import uuid import azurelinuxagent.common.conf as conf +import azurelinuxagent.common.utils.fileutil as fileutil import azurelinuxagent.common.logger as logger from azurelinuxagent.common.event import add_event, WALAEventOperation @@ -34,6 +36,7 @@ from azurelinuxagent.common.protocol.restapi import TelemetryEventParam, \ TelemetryEventList, \ TelemetryEvent, \ set_properties +from azurelinuxagent.common.utils.restutil import IOErrorCounter from azurelinuxagent.common.utils.textutil import parse_doc, findall, find, getattrib from azurelinuxagent.common.version import DISTRO_NAME, DISTRO_VERSION, \ DISTRO_CODE_NAME, AGENT_LONG_VERSION, \ @@ -179,15 +182,53 @@ class MonitorHandler(object): def daemon(self): period = datetime.timedelta(minutes=30) + protocol = self.protocol_util.get_protocol() last_heartbeat = datetime.datetime.utcnow() - period + + # Create a new identifier on each restart and reset the counter + heartbeat_id = str(uuid.uuid4()).upper() + counter = 0 while True: if datetime.datetime.utcnow() >= (last_heartbeat + period): last_heartbeat = datetime.datetime.utcnow() + incarnation = protocol.get_incarnation() + dropped_packets = self.osutil.get_firewall_dropped_packets( + protocol.endpoint) + + msg = "{0};{1};{2};{3}".format( + incarnation, counter, heartbeat_id, dropped_packets) + add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HeartBeat, - is_success=True) + is_success=True, + message=msg, + log_event=False) + + counter += 1 + + io_errors = IOErrorCounter.get_and_reset() + hostplugin_errors = io_errors.get("hostplugin") + protocol_errors = io_errors.get("protocol") + other_errors = io_errors.get("other") + + if hostplugin_errors > 0 \ + or protocol_errors > 0 \ + or other_errors > 0: + + msg = "hostplugin:{0};protocol:{1};other:{2}"\ + .format(hostplugin_errors, + protocol_errors, + other_errors) + add_event( + name=AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.HttpErrors, + is_success=True, + message=msg, + log_event=False) + try: self.collect_and_send_events() except Exception as e: diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index b7ee96a..2e43031 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -21,9 +21,11 @@ import glob import json import os import platform +import random import re import shutil import signal +import stat import subprocess import sys import time @@ -43,6 +45,7 @@ from azurelinuxagent.common.event import add_event, add_periodic, \ WALAEventOperation from azurelinuxagent.common.exception import ProtocolError, \ ResourceGoneError, \ + RestartError, \ UpdateError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.osutil import get_osutil @@ -60,7 +63,7 @@ from azurelinuxagent.ga.exthandlers import HandlerManifest AGENT_ERROR_FILE = "error.json" # File name for agent error record AGENT_MANIFEST_FILE = "HandlerManifest.json" -AGENT_SUPPORTED_FILE = "supported.json" +AGENT_PARTITION_FILE = "partition" CHILD_HEALTH_INTERVAL = 15 * 60 CHILD_LAUNCH_INTERVAL = 5 * 60 @@ -71,10 +74,18 @@ MAX_FAILURE = 3 # Max failure allowed for agent before blacklisted GOAL_STATE_INTERVAL = 3 -ORPHAN_WAIT_INTERVAL = 15 * 60 * 60 +ORPHAN_WAIT_INTERVAL = 15 * 60 AGENT_SENTINAL_FILE = "current_version" +READONLY_FILE_GLOBS = [ + "*.crt", + "*.p7m", + "*.pem", + "*.prv", + "ovf-env.xml" +] + def get_update_handler(): return UpdateHandler() @@ -101,7 +112,6 @@ class UpdateHandler(object): self.child_process = None self.signal_handler = None - return def run_latest(self, child_args=None): """ @@ -233,36 +243,43 @@ class UpdateHandler(object): This is the main loop which watches for agent and extension updates. """ - logger.info(u"Agent {0} is running as the goal state agent", - CURRENT_AGENT) + try: + logger.info(u"Agent {0} is running as the goal state agent", + CURRENT_AGENT) - # Launch monitoring threads - from azurelinuxagent.ga.monitor import get_monitor_handler - get_monitor_handler().run() + # Launch monitoring threads + from azurelinuxagent.ga.monitor import get_monitor_handler + get_monitor_handler().run() - from azurelinuxagent.ga.env import get_env_handler - get_env_handler().run() + from azurelinuxagent.ga.env import get_env_handler + get_env_handler().run() - from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state - exthandlers_handler = get_exthandlers_handler() - migrate_handler_state() + from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state + exthandlers_handler = get_exthandlers_handler() + migrate_handler_state() - try: self._ensure_no_orphans() self._emit_restart_event() + self._ensure_partition_assigned() + self._ensure_readonly_files() while self.running: if self._is_orphaned: - logger.info("Goal state agent {0} was orphaned -- exiting", + logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT) break if self._upgrade_available(): - if len(self.agents) > 0: + available_agent = self.get_latest_agent() + if available_agent is None: + logger.info( + "Agent {0} is reverting to the installed agent -- exiting", + CURRENT_AGENT) + else: logger.info( - u"Agent {0} discovered {1} as an update and will exit", + u"Agent {0} discovered update {1} -- exiting", CURRENT_AGENT, - self.agents[0].name) + available_agent.name) break utc_start = datetime.utcnow() @@ -271,20 +288,24 @@ class UpdateHandler(object): exthandlers_handler.run() if last_etag != exthandlers_handler.last_etag: + self._ensure_readonly_files() add_event( AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ProcessGoalState, is_success=True, duration=elapsed_milliseconds(utc_start), + message="Incarnation {0}".format( + exthandlers_handler.last_etag), log_event=True) time.sleep(GOAL_STATE_INTERVAL) except Exception as e: - logger.warn(u"Agent {0} failed with exception: {1}", - CURRENT_AGENT, - ustr(e)) + msg = u"Agent {0} failed with exception: {1}".format( + CURRENT_AGENT, ustr(e)) + self._set_sentinal(msg=msg) + logger.warn(msg) logger.warn(traceback.format_exc()) sys.exit(1) # additional return here because sys.exit is mocked in unit tests @@ -338,17 +359,21 @@ class UpdateHandler(object): return available_agents[0] if len(available_agents) >= 1 else None def _emit_restart_event(self): - if not self._is_clean_start: - msg = u"{0} did not terminate cleanly".format(CURRENT_AGENT) - logger.info(msg) - add_event( - AGENT_NAME, - version=CURRENT_VERSION, - op=WALAEventOperation.Restart, - is_success=False, - message=msg) + try: + if not self._is_clean_start: + msg = u"Agent did not terminate cleanly: {0}".format( + fileutil.read_file(self._sentinal_file_path())) + logger.info(msg) + add_event( + AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.Restart, + is_success=False, + message=msg) + except Exception: + pass - self._set_sentinal() + self._set_sentinal(msg="Starting") return def _ensure_no_orphans(self, orphan_wait_interval=ORPHAN_WAIT_INTERVAL): @@ -382,6 +407,26 @@ class UpdateHandler(object): ustr(e)) return + def _ensure_partition_assigned(self): + """ + Assign the VM to a partition (0 - 99). Downloaded updates may be configured + to run on only some VMs; the assigned partition determines eligibility. + """ + if not os.path.exists(self._partition_file): + partition = ustr(int(datetime.utcnow().microsecond / 10000)) + fileutil.write_file(self._partition_file, partition) + add_event( + AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.Partition, + is_success=True, + message=partition) + + def _ensure_readonly_files(self): + for g in READONLY_FILE_GLOBS: + for path in glob.iglob(os.path.join(conf.get_lib_dir(), g)): + os.chmod(path, stat.S_IRUSR) + def _evaluate_agent_health(self, latest_agent): """ Evaluate the health of the selected agent: If it is restarting @@ -412,7 +457,6 @@ class UpdateHandler(object): def _filter_blacklisted_agents(self): self.agents = [agent for agent in self.agents if not agent.is_blacklisted] - return def _find_agents(self): """ @@ -447,19 +491,7 @@ class UpdateHandler(object): @property def _is_clean_start(self): - if not os.path.isfile(self._sentinal_file_path()): - return True - - try: - if fileutil.read_file(self._sentinal_file_path()) != CURRENT_AGENT: - return True - except Exception as e: - logger.warn( - u"Exception reading sentinal file {0}: {1}", - self._sentinal_file_path(), - str(e)) - - return False + return not os.path.isfile(self._sentinal_file_path()) @property def _is_orphaned(self): @@ -472,11 +504,29 @@ class UpdateHandler(object): return fileutil.read_file(conf.get_agent_pid_file_path()) != ustr(parent_pid) + def _is_version_eligible(self, version): + # Ensure the installed version is always eligible + if version == CURRENT_VERSION and is_current_agent_installed(): + return True + + for agent in self.agents: + if agent.version == version: + return agent.is_available + + return False + def _load_agents(self): path = os.path.join(conf.get_lib_dir(), "{0}-*".format(AGENT_NAME)) return [GuestAgent(path=agent_dir) for agent_dir in glob.iglob(path) if os.path.isdir(agent_dir)] + def _partition(self): + return int(fileutil.read_file(self._partition_file)) + + @property + def _partition_file(self): + return os.path.join(conf.get_lib_dir(), AGENT_PARTITION_FILE) + def _purge_agents(self): """ Remove from disk all directories and .zip files of unknown agents @@ -485,8 +535,8 @@ class UpdateHandler(object): path = os.path.join(conf.get_lib_dir(), "{0}-*".format(AGENT_NAME)) known_versions = [agent.version for agent in self.agents] - if not is_current_agent_installed() and CURRENT_VERSION not in known_versions: - logger.warn( + if CURRENT_VERSION not in known_versions: + logger.info( u"Running Agent {0} was not found in the agent manifest - adding to list", CURRENT_VERSION) known_versions.append(CURRENT_VERSION) @@ -511,9 +561,11 @@ class UpdateHandler(object): self.agents.sort(key=lambda agent: agent.version, reverse=True) return - def _set_sentinal(self, agent=CURRENT_AGENT): + def _set_sentinal(self, agent=CURRENT_AGENT, msg="Unknown cause"): try: - fileutil.write_file(self._sentinal_file_path(), agent) + fileutil.write_file( + self._sentinal_file_path(), + "[{0}] [{1}]".format(agent, msg)) except Exception as e: logger.warn( u"Exception writing sentinal file {0}: {1}", @@ -599,16 +651,18 @@ class UpdateHandler(object): self._purge_agents() self._filter_blacklisted_agents() - # Return True if more recent agents are available - return len(self.agents) > 0 and \ - self.agents[0].version > base_version + # Return True if current agent is no longer available or an + # agent with a higher version number is available + return not self._is_version_eligible(base_version) \ + or (len(self.agents) > 0 \ + and self.agents[0].version > base_version) except Exception as e: if isinstance(e, ResourceGoneError): continue msg = u"Exception retrieving agent manifests: {0}".format( - ustr(e)) + ustr(traceback.format_exc())) logger.warn(msg) add_event( AGENT_NAME, @@ -667,8 +721,6 @@ class GuestAgent(object): self.error = GuestAgentError(self.get_agent_error_file()) self.error.load() - self.supported = Supported(self.get_agent_supported_file()) - self.supported.load() try: self._ensure_downloaded() @@ -677,6 +729,13 @@ class GuestAgent(object): if isinstance(e, ResourceGoneError): raise + # The agent was improperly blacklisting versions due to a timeout + # encountered while downloading a later version. Errors of type + # socket.error are IOError, so this should provide sufficient + # protection against a large class of I/O operation failures. + if isinstance(e, IOError): + raise + # Note the failure, blacklist the agent if the package downloaded # - An exception with a downloaded package indicates the package # is corrupt (e.g., missing the HandlerManifest.json file) @@ -691,7 +750,6 @@ class GuestAgent(object): op=WALAEventOperation.Install, is_success=False, message=msg) - return @property def name(self): @@ -712,13 +770,9 @@ class GuestAgent(object): def get_agent_pkg_path(self): return ".".join((os.path.join(conf.get_lib_dir(), self.name), "zip")) - def get_agent_supported_file(self): - return os.path.join(conf.get_lib_dir(), self.name, AGENT_SUPPORTED_FILE) - def clear_error(self): self.error.clear() self.error.save() - return @property def is_available(self): @@ -730,15 +784,8 @@ class GuestAgent(object): @property def is_downloaded(self): - return self.is_blacklisted or os.path.isfile(self.get_agent_manifest_path()) - - @property - def _is_optional(self): - return self.error is not None and self.error.is_sentinel and self.supported.is_supported - - @property - def _in_slice(self): - return self.supported.is_supported and self.supported.in_slice + return self.is_blacklisted or \ + os.path.isfile(self.get_agent_manifest_path()) def mark_failure(self, is_fatal=False): try: @@ -750,21 +797,6 @@ class GuestAgent(object): logger.warn(u"Agent {0} is permanently blacklisted", self.name) except Exception as e: logger.warn(u"Agent {0} failed recording error state: {1}", self.name, ustr(e)) - return - - def _enable(self): - # Enable optional agents if within the "slice" - # - The "slice" is a percentage of the agent to execute - # - Blacklist out-of-slice agents to prevent reconsideration - if self._is_optional: - if self._in_slice: - self.error.clear() - self.error.save() - logger.info(u"Enabled optional Agent {0}", self.name) - else: - self.mark_failure(is_fatal=True) - logger.info(u"Optional Agent {0} not in slice", self.name) - return def _ensure_downloaded(self): logger.verbose(u"Ensuring Agent {0} is downloaded", self.name) @@ -788,18 +820,15 @@ class GuestAgent(object): op=WALAEventOperation.Install, is_success=True, message=msg) - return def _ensure_loaded(self): self._load_manifest() self._load_error() - self._load_supported() - - self._enable() - return def _download(self): - for uri in self.pkg.uris: + uris_shuffled = self.pkg.uris + random.shuffle(uris_shuffled) + for uri in uris_shuffled: if not HostPluginProtocol.is_default_channel() and self._fetch(uri.uri): break @@ -838,8 +867,6 @@ class GuestAgent(object): message=msg) raise UpdateError(msg) - return - def _fetch(self, uri, headers=None, use_proxy=True): package = None try: @@ -871,14 +898,6 @@ class GuestAgent(object): logger.verbose(u"Agent {0} error state: {1}", self.name, ustr(self.error)) except Exception as e: logger.warn(u"Agent {0} failed loading error state: {1}", self.name, ustr(e)) - return - - def _load_supported(self): - try: - self.supported = Supported(self.get_agent_supported_file()) - self.supported.load() - except Exception as e: - self.supported = Supported() def _load_manifest(self): path = self.get_agent_manifest_path() @@ -976,10 +995,6 @@ class GuestAgentError(object): def is_blacklisted(self): return self.was_fatal or self.failure_count >= MAX_FAILURE - @property - def is_sentinel(self): - return self.was_fatal and self.last_failure == 0.0 - def load(self): if self.path is not None and os.path.isfile(self.path): with open(self.path, 'r') as f: @@ -1015,60 +1030,3 @@ class GuestAgentError(object): self.last_failure, self.failure_count, self.was_fatal) - -class Supported(object): - def __init__(self, path): - if path is None: - raise UpdateError(u"Supported requires a path") - self.path = path - self.distributions = {} - return - - @property - def is_supported(self): - return self._supported_distribution is not None - - @property - def in_slice(self): - d = self._supported_distribution - return d is not None and d.in_slice - - def load(self): - self.distributions = {} - try: - if self.path is not None and os.path.isfile(self.path): - j = json.loads(fileutil.read_file(self.path)) - for d in j: - self.distributions[d] = SupportedDistribution(j[d]) - except Exception as e: - logger.warn("Failed JSON parse of {0}: {1}".format(self.path, e)) - return - - @property - def _supported_distribution(self): - for d in self.distributions: - dd = self.distributions[d] - if dd.is_supported: - return dd - return None - -class SupportedDistribution(object): - def __init__(self, s): - if s is None or not isinstance(s, dict): - raise UpdateError(u"SupportedDisribution requires a dictionary") - - self.slice = s['slice'] - self.versions = s['versions'] - - @property - def is_supported(self): - d = ','.join(platform.linux_distribution()) - for v in self.versions: - if re.match(v, d): - return True - return False - - @property - def in_slice(self): - n = int((60 * self.slice) / 100) - return (n - datetime.utcnow().second) > 0 diff --git a/azurelinuxagent/pa/deprovision/default.py b/azurelinuxagent/pa/deprovision/default.py index e2c5613..895264a 100644 --- a/azurelinuxagent/pa/deprovision/default.py +++ b/azurelinuxagent/pa/deprovision/default.py @@ -123,6 +123,7 @@ class DeprovisionHandler(object): known_files = [ 'HostingEnvironmentConfig.xml', 'Incarnation', + 'partition', 'Protocol', 'SharedConfig.xml', 'WireServerEndpoint' diff --git a/azurelinuxagent/pa/provision/cloudinit.py b/azurelinuxagent/pa/provision/cloudinit.py index 5789e9a..fa47799 100644 --- a/azurelinuxagent/pa/provision/cloudinit.py +++ b/azurelinuxagent/pa/provision/cloudinit.py @@ -26,9 +26,9 @@ from datetime import datetime import azurelinuxagent.common.conf as conf import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.fileutil as fileutil -import azurelinuxagent.common.utils.shellutil as shellutil -from azurelinuxagent.common.event import elapsed_milliseconds +from azurelinuxagent.common.event import elapsed_milliseconds, \ + WALAEventOperation from azurelinuxagent.common.exception import ProvisionError, ProtocolError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.protocol import OVF_FILE_NAME @@ -64,9 +64,12 @@ class CloudInitProvisionHandler(ProvisionHandler): logger.info("Finished provisioning") self.report_ready(thumbprint) - self.report_event("Provision succeed", + self.report_event("Provisioning with cloud-init succeeded", is_success=True, duration=elapsed_milliseconds(utc_start)) + self.report_event(self.create_guest_state_telemetry_messsage(), + is_success=True, + operation=WALAEventOperation.GuestState) except ProvisionError as e: logger.error("Provisioning failed: {0}", ustr(e)) @@ -103,7 +106,7 @@ class CloudInitProvisionHandler(ProvisionHandler): "after {1}s".format(ovf_file_path, max_retry * sleep_time)) - def wait_for_ssh_host_key(self, max_retry=360, sleep_time=5): + def wait_for_ssh_host_key(self, max_retry=1800, sleep_time=1): """ Wait for cloud-init to generate ssh host key """ diff --git a/azurelinuxagent/pa/provision/default.py b/azurelinuxagent/pa/provision/default.py index 2f7ec18..44e171b 100644 --- a/azurelinuxagent/pa/provision/default.py +++ b/azurelinuxagent/pa/provision/default.py @@ -22,6 +22,8 @@ Provision handler import os import os.path import re +import socket +import time from datetime import datetime @@ -87,10 +89,14 @@ class ProvisionHandler(object): self.write_provisioned() - self.report_event("Provision succeed", + self.report_event("Provisioning succeeded", is_success=True, duration=elapsed_milliseconds(utc_start)) + self.report_event(self.create_guest_state_telemetry_messsage(), + is_success=True, + operation=WALAEventOperation.GuestState) + self.report_ready(thumbprint) logger.info("Provisioning complete") @@ -244,9 +250,14 @@ class ProvisionHandler(object): fileutil.write_file(customdata_file, customdata) if conf.get_execute_customdata(): + start = time.time() logger.info("Execute custom data") os.chmod(customdata_file, 0o700) shellutil.run(customdata_file) + add_event(name=AGENT_NAME, + duration=int(time.time() - start), + is_success=True, + op=WALAEventOperation.CustomData) def deploy_ssh_pubkeys(self, ovfenv): for pubkey in ovfenv.ssh_pubkeys: @@ -258,12 +269,53 @@ class ProvisionHandler(object): logger.info("Deploy ssh key pairs.") self.osutil.deploy_ssh_keypair(ovfenv.username, keypair) - def report_event(self, message, is_success=False, duration=0): + def report_event(self, message, is_success=False, duration=0, + operation=WALAEventOperation.Provision): add_event(name=AGENT_NAME, message=message, duration=duration, is_success=is_success, - op=WALAEventOperation.Provision) + op=operation) + + def get_cpu_count(self): + try: + count = len([x for x in open('/proc/cpuinfo').readlines() + if x.startswith("processor")]) + return count + except Exception as e: + logger.verbose(u"Failed to determine the CPU count: {0}.", ustr(e)) + pass + return -1 + + def get_mem_size_mb(self): + try: + for line in open('/proc/meminfo').readlines(): + m = re.match('^MemTotal:\s*(\d+) kB$', line) + if m is not None: + return int(int(m.group(1)) / 1024) + except Exception as e: + logger.verbose(u"Failed to determine the memory size: {0}..", ustr(e)) + pass + return -1 + + def create_guest_state_telemetry_messsage(self): + """ + Create a GuestState JSON message that contains the current CPU, Memory + (MB), and hostname of the guest. + + e.g. + + { + "cpu": 1, + "mem": 1024, + "hostname": "server1234" + } + """ + cpu = self.get_cpu_count() + mem = self.get_mem_size_mb() + + return """{{"cpu": {0}, "mem": {1}, "hostname": "{2}"}}"""\ + .format(cpu, mem, socket.gethostname()) def report_not_ready(self, sub_status, description): status = ProvisionStatus(status="NotReady", subStatus=sub_status, diff --git a/azurelinuxagent/pa/provision/factory.py b/azurelinuxagent/pa/provision/factory.py index d87765f..9e88618 100644 --- a/azurelinuxagent/pa/provision/factory.py +++ b/azurelinuxagent/pa/provision/factory.py @@ -16,9 +16,7 @@ # import azurelinuxagent.common.conf as conf -import azurelinuxagent.common.logger as logger -from azurelinuxagent.common.utils.textutil import Version from azurelinuxagent.common.version import DISTRO_NAME, DISTRO_VERSION, \ DISTRO_FULL_NAME diff --git a/azurelinuxagent/pa/rdma/suse.py b/azurelinuxagent/pa/rdma/suse.py index 20f06cd..2b6ae29 100644 --- a/azurelinuxagent/pa/rdma/suse.py +++ b/azurelinuxagent/pa/rdma/suse.py @@ -1,6 +1,6 @@ # Microsoft Azure Linux Agent # -# Copyright 2014 Microsoft Corporation +# Copyright 2017 Microsoft Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,7 +18,6 @@ # import glob -import os import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.shellutil as shellutil from azurelinuxagent.common.rdma import RDMAHandler @@ -37,8 +36,10 @@ class SUSERDMAHandler(RDMAHandler): return zypper_install = 'zypper -n in %s' zypper_install_noref = 'zypper -n --no-refresh in %s' + zypper_lock = 'zypper addlock %s' zypper_remove = 'zypper -n rm %s' zypper_search = 'zypper -n se -s %s' + zypper_unlock = 'zypper removelock %s' package_name = 'msft-rdma-kmp-default' cmd = zypper_search % package_name status, repo_package_info = shellutil.run_get_output(cmd) @@ -54,13 +55,19 @@ class SUSERDMAHandler(RDMAHandler): installed = sections[0].strip() version = sections[3].strip() driver_package_versions.append(version) - if fw_version in version and installed == 'i': + if fw_version in version and installed.startswith('i'): info_msg = 'RDMA: Matching driver package "%s-%s" ' info_msg += 'is already installed, nothing to do.' logger.info(info_msg % (package_name, version)) return True - if installed == 'i': + if installed.startswith('i'): + # A driver with a different version is installed driver_package_installed = True + cmd = zypper_unlock % package_name + result = shellutil.run(cmd) + info_msg = 'Driver with different version installed ' + info_msg += 'unlocked package "%s".' + logger.info(info_msg % (package_name)) # If we get here the driver package is installed but the # version doesn't match or no package is installed @@ -80,11 +87,11 @@ class SUSERDMAHandler(RDMAHandler): logger.info("RDMA: looking for fw version %s in packages" % fw_version) for entry in driver_package_versions: - if not fw_version in version: + if fw_version not in entry: logger.info("Package '%s' is not a match." % entry) else: logger.info("Package '%s' is a match. Installing." % entry) - complete_name = '%s-%s' % (package_name, version) + complete_name = '%s-%s' % (package_name, entry) cmd = zypper_install % complete_name result = shellutil.run(cmd) if result: @@ -94,6 +101,11 @@ class SUSERDMAHandler(RDMAHandler): msg = 'RDMA: Successfully installed "%s" from ' msg += 'configured repositories' logger.info(msg % complete_name) + # Lock the package so it does not accidentally get updated + cmd = zypper_lock % package_name + result = shellutil.run(cmd) + info_msg = 'Applied lock to "%s"' % package_name + logger.info(info_msg) if not self.load_driver_module() or requires_reboot: self.reboot_system() return True @@ -119,6 +131,11 @@ class SUSERDMAHandler(RDMAHandler): msg = 'RDMA: Successfully installed "%s" from ' msg += 'local package cache' logger.info(msg % (local_package)) + # Lock the package so it does not accidentally get updated + cmd = zypper_lock % package_name + result = shellutil.run(cmd) + info_msg = 'Applied lock to "%s"' % package_name + logger.info(info_msg) if not self.load_driver_module() or requires_reboot: self.reboot_system() return True |