summaryrefslogtreecommitdiff
path: root/azurelinuxagent/common/rdma.py
diff options
context:
space:
mode:
Diffstat (limited to 'azurelinuxagent/common/rdma.py')
-rw-r--r--azurelinuxagent/common/rdma.py280
1 files changed, 280 insertions, 0 deletions
diff --git a/azurelinuxagent/common/rdma.py b/azurelinuxagent/common/rdma.py
new file mode 100644
index 0000000..0c17e38
--- /dev/null
+++ b/azurelinuxagent/common/rdma.py
@@ -0,0 +1,280 @@
+# Windows Azure Linux Agent
+#
+# Copyright 2016 Microsoft Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Handle packages and modules to enable RDMA for IB networking
+"""
+
+import os
+import re
+import time
+import threading
+
+import azurelinuxagent.common.conf as conf
+import azurelinuxagent.common.logger as logger
+import azurelinuxagent.common.utils.fileutil as fileutil
+import azurelinuxagent.common.utils.shellutil as shellutil
+from azurelinuxagent.common.utils.textutil import parse_doc, find, getattrib
+
+
+from azurelinuxagent.common.protocol.wire import SHARED_CONF_FILE_NAME
+
+dapl_config_paths = [
+ '/etc/dat.conf',
+ '/etc/rdma/dat.conf',
+ '/usr/local/etc/dat.conf'
+]
+
+def setup_rdma_device():
+ logger.verbose("Parsing SharedConfig XML contents for RDMA details")
+ xml_doc = parse_doc(
+ fileutil.read_file(os.path.join(conf.get_lib_dir(), SHARED_CONF_FILE_NAME)))
+ if xml_doc is None:
+ logger.error("Could not parse SharedConfig XML document")
+ return
+ instance_elem = find(xml_doc, "Instance")
+ if not instance_elem:
+ logger.error("Could not find <Instance> in SharedConfig document")
+ return
+
+ rdma_ipv4_addr = getattrib(instance_elem, "rdmaIPv4Address")
+ if not rdma_ipv4_addr:
+ logger.error(
+ "Could not find rdmaIPv4Address attribute on Instance element of SharedConfig.xml document")
+ return
+
+ rdma_mac_addr = getattrib(instance_elem, "rdmaMacAddress")
+ if not rdma_mac_addr:
+ logger.error(
+ "Could not find rdmaMacAddress attribute on Instance element of SharedConfig.xml document")
+ return
+
+ # add colons to the MAC address (e.g. 00155D33FF1D ->
+ # 00:15:5D:33:FF:1D)
+ rdma_mac_addr = ':'.join([rdma_mac_addr[i:i+2]
+ for i in range(0, len(rdma_mac_addr), 2)])
+ logger.info("Found RDMA details. IPv4={0} MAC={1}".format(
+ rdma_ipv4_addr, rdma_mac_addr))
+
+ # Set up the RDMA device with collected informatino
+ RDMADeviceHandler(rdma_ipv4_addr, rdma_mac_addr).start()
+ logger.info("RDMA: device is set up")
+ return
+
+class RDMAHandler(object):
+
+ driver_module_name = 'hv_network_direct'
+
+ @staticmethod
+ def get_rdma_version():
+ """Retrieve the firmware version information from the system.
+ This depends on information provided by the Linux kernel."""
+
+ driver_info_source = '/var/lib/hyperv/.kvp_pool_0'
+ base_kernel_err_msg = 'Kernel does not provide the necessary '
+ base_kernel_err_msg += 'information or the hv_kvp_daemon is not '
+ base_kernel_err_msg += 'running.'
+ if not os.path.isfile(driver_info_source):
+ error_msg = 'RDMA: Source file "%s" does not exist. '
+ error_msg += base_kernel_err_msg
+ logger.error(error_msg % driver_info_source)
+ return
+
+ lines = open(driver_info_source).read()
+ if not lines:
+ error_msg = 'RDMA: Source file "%s" is empty. '
+ error_msg += base_kernel_err_msg
+ logger.error(error_msg % driver_info_source)
+ return
+
+ r = re.search("NdDriverVersion\0+(\d\d\d\.\d)", lines)
+ if r:
+ NdDriverVersion = r.groups()[0]
+ return NdDriverVersion
+ else:
+ error_msg = 'RDMA: NdDriverVersion not found in "%s"'
+ logger.error(error_msg % driver_info_source)
+ return
+
+ def load_driver_module(self):
+ """Load the kernel driver, this depends on the proper driver
+ to be installed with the install_driver() method"""
+ logger.info("RDMA: probing module '%s'" % self.driver_module_name)
+ result = shellutil.run('modprobe %s' % self.driver_module_name)
+ if result != 0:
+ error_msg = 'Could not load "%s" kernel module. '
+ error_msg += 'Run "modprobe %s" as root for more details'
+ logger.error(
+ error_msg % (self.driver_module_name, self.driver_module_name)
+ )
+ return
+ logger.info('RDMA: Loaded the kernel driver successfully.')
+ return True
+
+ def install_driver(self):
+ """Install the driver. This is distribution specific and must
+ be overwritten in the child implementation."""
+ logger.error('RDMAHandler.install_driver not implemented')
+
+ def is_driver_loaded(self):
+ """Check if the network module is loaded in kernel space"""
+ cmd = 'lsmod | grep ^%s' % self.driver_module_name
+ status, loaded_modules = shellutil.run_get_output(cmd)
+ logger.info('RDMA: Checking if the module loaded.')
+ if loaded_modules:
+ logger.info('RDMA: module loaded.')
+ return True
+ logger.info('RDMA: module not loaded.')
+
+ def reboot_system(self):
+ """Reboot the system. This is required as the kernel module for
+ the rdma driver cannot be unloaded with rmmod"""
+ logger.info('RDMA: Rebooting system.')
+ ret = shellutil.run('shutdown -r now')
+ if ret != 0:
+ logger.error('RDMA: Failed to reboot the system')
+
+
+dapl_config_paths = [
+ '/etc/dat.conf', '/etc/rdma/dat.conf', '/usr/local/etc/dat.conf']
+
+class RDMADeviceHandler(object):
+
+ """
+ Responsible for writing RDMA IP and MAC address to the /dev/hvnd_rdma
+ interface.
+ """
+
+ rdma_dev = '/dev/hvnd_rdma'
+ device_check_timeout_sec = 120
+ device_check_interval_sec = 1
+
+ ipv4_addr = None
+ mac_adr = None
+
+ def __init__(self, ipv4_addr, mac_addr):
+ self.ipv4_addr = ipv4_addr
+ self.mac_addr = mac_addr
+
+ def start(self):
+ """
+ Start a thread in the background to process the RDMA tasks and returns.
+ """
+ logger.info("RDMA: starting device processing in the background.")
+ threading.Thread(target=self.process).start()
+
+ def process(self):
+ RDMADeviceHandler.wait_rdma_device(
+ self.rdma_dev, self.device_check_timeout_sec, self.device_check_interval_sec)
+ RDMADeviceHandler.update_dat_conf(dapl_config_paths, self.ipv4_addr)
+ RDMADeviceHandler.write_rdma_config_to_device(
+ self.rdma_dev, self.ipv4_addr, self.mac_addr)
+ RDMADeviceHandler.update_network_interface(self.mac_addr, self.ipv4_addr)
+
+ @staticmethod
+ def update_dat_conf(paths, ipv4_addr):
+ """
+ Looks at paths for dat.conf file and updates the ip address for the
+ infiniband interface.
+ """
+ logger.info("Updating DAPL configuration file")
+ for f in paths:
+ logger.info("RDMA: trying {0}".format(f))
+ if not os.path.isfile(f):
+ logger.info(
+ "RDMA: DAPL config not found at {0}".format(f))
+ continue
+ logger.info("RDMA: DAPL config is at: {0}".format(f))
+ cfg = fileutil.read_file(f)
+ new_cfg = RDMADeviceHandler.replace_dat_conf_contents(
+ cfg, ipv4_addr)
+ fileutil.write_file(f, new_cfg)
+ logger.info("RDMA: DAPL configuration is updated")
+ return
+
+ raise Exception("RDMA: DAPL configuration file not found at predefined paths")
+
+ @staticmethod
+ def replace_dat_conf_contents(cfg, ipv4_addr):
+ old = "ofa-v2-ib0 u2.0 nonthreadsafe default libdaplofa.so.2 dapl.2.0 \"\S+ 0\""
+ new = "ofa-v2-ib0 u2.0 nonthreadsafe default libdaplofa.so.2 dapl.2.0 \"{0} 0\"".format(
+ ipv4_addr)
+ return re.sub(old, new, cfg)
+
+ @staticmethod
+ def write_rdma_config_to_device(path, ipv4_addr, mac_addr):
+ data = RDMADeviceHandler.generate_rdma_config(ipv4_addr, mac_addr)
+ logger.info(
+ "RDMA: Updating device with configuration: {0}".format(data))
+ with open(path, "w") as f:
+ f.write(data)
+ logger.info("RDMA: Updated device with IPv4/MAC addr successfully")
+
+ @staticmethod
+ def generate_rdma_config(ipv4_addr, mac_addr):
+ return 'rdmaMacAddress="{0}" rdmaIPv4Address="{1}"'.format(mac_addr, ipv4_addr)
+
+ @staticmethod
+ def wait_rdma_device(path, timeout_sec, check_interval_sec):
+ logger.info("RDMA: waiting for device={0} timeout={1}s".format(path, timeout_sec))
+ total_retries = timeout_sec/check_interval_sec
+ n = 0
+ while n < total_retries:
+ if os.path.exists(path):
+ logger.info("RDMA: device ready")
+ return
+ logger.verbose(
+ "RDMA: device not ready, sleep {0}s".format(check_interval_sec))
+ time.sleep(check_interval_sec)
+ n += 1
+ logger.error("RDMA device wait timed out")
+ raise Exception("The device did not show up in {0} seconds ({1} retries)".format(
+ timeout_sec, total_retries))
+
+ @staticmethod
+ def update_network_interface(mac_addr, ipv4_addr):
+ netmask=16
+
+ logger.info("RDMA: will update the network interface with IPv4/MAC")
+
+ if_name=RDMADeviceHandler.get_interface_by_mac(mac_addr)
+ logger.info("RDMA: network interface found: {0}", if_name)
+ logger.info("RDMA: bringing network interface up")
+ if shellutil.run("ifconfig {0} up".format(if_name)) != 0:
+ raise Exception("Could not bring up RMDA interface: {0}".format(if_name))
+
+ logger.info("RDMA: configuring IPv4 addr and netmask on interface")
+ addr = '{0}/{1}'.format(ipv4_addr, netmask)
+ if shellutil.run("ifconfig {0} {1}".format(if_name, addr)) != 0:
+ raise Exception("Could set addr to {1} on {0}".format(if_name, addr))
+ logger.info("RDMA: network address and netmask configured on interface")
+
+ @staticmethod
+ def get_interface_by_mac(mac):
+ ret, output = shellutil.run_get_output("ifconfig -a")
+ if ret != 0:
+ raise Exception("Failed to list network interfaces")
+ output = output.replace('\n', '')
+ match = re.search(r"(eth\d).*(HWaddr|ether) {0}".format(mac),
+ output, re.IGNORECASE)
+ if match is None:
+ raise Exception("Failed to get ifname with mac: {0}".format(mac))
+ output = match.group(0)
+ eths = re.findall(r"eth\d", output)
+ if eths is None or len(eths) == 0:
+ raise Exception("ifname with mac: {0} not found".format(mac))
+ return eths[-1]