diff options
author | Ryan Harper <ryan.harper@canonical.com> | 2017-06-20 17:06:43 -0500 |
---|---|---|
committer | Scott Moser <smoser@brickies.net> | 2017-06-27 17:03:34 -0400 |
commit | ebc9ecbc8a76bdf511a456fb72339a7eb4c20568 (patch) | |
tree | 3b735b43bde5c5d8e96e602f5a7a6c39dd20f34c /cloudinit | |
parent | 10e5195e4a4459d1893801aa9fb486dc2973415d (diff) | |
download | vyos-cloud-init-ebc9ecbc8a76bdf511a456fb72339a7eb4c20568.tar.gz vyos-cloud-init-ebc9ecbc8a76bdf511a456fb72339a7eb4c20568.zip |
Azure: Add network-config, Refactor net layer to handle duplicate macs.
On systems where multiple network devices share the same MAC address,
cloud-init fails to rename the devices according to the specified network
configuration. Refactor the net layer to also match devices by device
driver and device id when available. Azure systems may have duplicate MAC
addresses by design.
Update the Azure datasource to run at init-local time and let it
generate a fallback networking config to handle advanced networking
configurations.
Lastly, add a 'setup' method to the datasources that is called before
userdata/vendordata is processed but after networking is up. The Azure
datasource uses this method to interact with the 'fabric'.
Diffstat (limited to 'cloudinit')
-rw-r--r-- | cloudinit/cmd/main.py | 3 | ||||
-rw-r--r-- | cloudinit/net/__init__.py | 181 | ||||
-rw-r--r-- | cloudinit/net/eni.py | 2 | ||||
-rw-r--r-- | cloudinit/net/renderer.py | 4 | ||||
-rw-r--r-- | cloudinit/net/udev.py | 7 | ||||
-rw-r--r-- | cloudinit/sources/DataSourceAzure.py | 114 | ||||
-rw-r--r-- | cloudinit/sources/__init__.py | 15 | ||||
-rw-r--r-- | cloudinit/stages.py | 5 |
8 files changed, 281 insertions, 50 deletions
diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index ce3c10dd..139e03b3 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -372,6 +372,9 @@ def main_init(name, args): LOG.debug("[%s] %s is in local mode, will apply init modules now.", mode, init.datasource) + # Give the datasource a chance to use network resources. + # This is used on Azure to communicate with the fabric over network. + init.setup_datasource() # update fully realizes user-data (pulling in #include if necessary) init.update() # Stage 7 diff --git a/cloudinit/net/__init__.py b/cloudinit/net/__init__.py index 65accbb0..cba991a5 100644 --- a/cloudinit/net/__init__.py +++ b/cloudinit/net/__init__.py @@ -97,6 +97,10 @@ def is_bridge(devname): return os.path.exists(sys_dev_path(devname, "bridge")) +def is_bond(devname): + return os.path.exists(sys_dev_path(devname, "bonding")) + + def is_vlan(devname): uevent = str(read_sys_net_safe(devname, "uevent")) return 'DEVTYPE=vlan' in uevent.splitlines() @@ -124,6 +128,26 @@ def is_present(devname): return os.path.exists(sys_dev_path(devname)) +def device_driver(devname): + """Return the device driver for net device named 'devname'.""" + driver = None + driver_path = sys_dev_path(devname, "device/driver") + # driver is a symlink to the driver *dir* + if os.path.islink(driver_path): + driver = os.path.basename(os.readlink(driver_path)) + + return driver + + +def device_devid(devname): + """Return the device id string for net device named 'devname'.""" + dev_id = read_sys_net_safe(devname, "device/device") + if dev_id is False: + return None + + return dev_id + + def get_devicelist(): return os.listdir(SYS_CLASS_NET) @@ -138,12 +162,21 @@ def is_disabled_cfg(cfg): return cfg.get('config') == "disabled" -def generate_fallback_config(): +def generate_fallback_config(blacklist_drivers=None, config_driver=None): """Determine which attached net dev is most likely to have a connection and generate network state to run dhcp on that interface""" 
+ + if not config_driver: + config_driver = False + + if not blacklist_drivers: + blacklist_drivers = [] + # get list of interfaces that could have connections invalid_interfaces = set(['lo']) - potential_interfaces = set(get_devicelist()) + potential_interfaces = set([device for device in get_devicelist() + if device_driver(device) not in + blacklist_drivers]) potential_interfaces = potential_interfaces.difference(invalid_interfaces) # sort into interfaces with carrier, interfaces which could have carrier, # and ignore interfaces that are definitely disconnected @@ -155,6 +188,9 @@ def generate_fallback_config(): if is_bridge(interface): # skip any bridges continue + if is_bond(interface): + # skip any bonds + continue carrier = read_sys_net_int(interface, 'carrier') if carrier: connected.append(interface) @@ -194,9 +230,18 @@ def generate_fallback_config(): break if target_mac and target_name: nconf = {'config': [], 'version': 1} - nconf['config'].append( - {'type': 'physical', 'name': target_name, - 'mac_address': target_mac, 'subnets': [{'type': 'dhcp'}]}) + cfg = {'type': 'physical', 'name': target_name, + 'mac_address': target_mac, 'subnets': [{'type': 'dhcp'}]} + # inject the device driver name, dev_id into config if enabled and + # device has a valid device driver value + if config_driver: + driver = device_driver(target_name) + if driver: + cfg['params'] = { + 'driver': driver, + 'device_id': device_devid(target_name), + } + nconf['config'].append(cfg) return nconf else: # can't read any interfaces addresses (or there are none); give up @@ -217,10 +262,16 @@ def apply_network_config_names(netcfg, strict_present=True, strict_busy=True): if ent.get('type') != 'physical': continue mac = ent.get('mac_address') - name = ent.get('name') if not mac: continue - renames.append([mac, name]) + name = ent.get('name') + driver = ent.get('params', {}).get('driver') + device_id = ent.get('params', {}).get('device_id') + if not driver: + driver = device_driver(name) + if 
not device_id: + device_id = device_devid(name) + renames.append([mac, name, driver, device_id]) return _rename_interfaces(renames) @@ -245,15 +296,27 @@ def _get_current_rename_info(check_downable=True): """Collect information necessary for rename_interfaces. returns a dictionary by mac address like: - {mac: - {'name': name - 'up': boolean: is_up(name), + {name: + { 'downable': None or boolean indicating that the - device has only automatically assigned ip addrs.}} + device has only automatically assigned ip addrs. + 'device_id': Device id value (if it has one) + 'driver': Device driver (if it has one) + 'mac': mac address + 'name': name + 'up': boolean: is_up(name) + }} """ - bymac = {} - for mac, name in get_interfaces_by_mac().items(): - bymac[mac] = {'name': name, 'up': is_up(name), 'downable': None} + cur_info = {} + for (name, mac, driver, device_id) in get_interfaces(): + cur_info[name] = { + 'downable': None, + 'device_id': device_id, + 'driver': driver, + 'mac': mac, + 'name': name, + 'up': is_up(name), + } if check_downable: nmatch = re.compile(r"[0-9]+:\s+(\w+)[@:]") @@ -265,11 +328,11 @@ def _get_current_rename_info(check_downable=True): for bytes_out in (ipv6, ipv4): nics_with_addresses.update(nmatch.findall(bytes_out)) - for d in bymac.values(): + for d in cur_info.values(): d['downable'] = (d['up'] is False or d['name'] not in nics_with_addresses) - return bymac + return cur_info def _rename_interfaces(renames, strict_present=True, strict_busy=True, @@ -282,15 +345,15 @@ def _rename_interfaces(renames, strict_present=True, strict_busy=True, if current_info is None: current_info = _get_current_rename_info() - cur_bymac = {} - for mac, data in current_info.items(): + cur_info = {} + for name, data in current_info.items(): cur = data.copy() - cur['mac'] = mac - cur_bymac[mac] = cur + cur['name'] = name + cur_info[name] = cur def update_byname(bymac): return dict((data['name'], data) - for data in bymac.values()) + for data in cur_info.values()) def 
rename(cur, new): util.subp(["ip", "link", "set", cur, "name", new], capture=True) @@ -304,14 +367,48 @@ def _rename_interfaces(renames, strict_present=True, strict_busy=True, ops = [] errors = [] ups = [] - cur_byname = update_byname(cur_bymac) + cur_byname = update_byname(cur_info) tmpname_fmt = "cirename%d" tmpi = -1 - for mac, new_name in renames: - cur = cur_bymac.get(mac, {}) - cur_name = cur.get('name') + def entry_match(data, mac, driver, device_id): + """match if set and in data""" + if mac and driver and device_id: + return (data['mac'] == mac and + data['driver'] == driver and + data['device_id'] == device_id) + elif mac and driver: + return (data['mac'] == mac and + data['driver'] == driver) + elif mac: + return (data['mac'] == mac) + + return False + + def find_entry(mac, driver, device_id): + match = [data for data in cur_info.values() + if entry_match(data, mac, driver, device_id)] + if len(match): + if len(match) > 1: + msg = ('Failed to match a single device. Matched devices "%s"' + ' with search values "(mac:%s driver:%s device_id:%s)"' + % (match, mac, driver, device_id)) + raise ValueError(msg) + return match[0] + + return None + + for mac, new_name, driver, device_id in renames: cur_ops = [] + cur = find_entry(mac, driver, device_id) + if not cur: + if strict_present: + errors.append( + "[nic not present] Cannot rename mac=%s to %s" + ", not available." 
% (mac, new_name)) + continue + + cur_name = cur.get('name') if cur_name == new_name: # nothing to do continue @@ -351,13 +448,13 @@ def _rename_interfaces(renames, strict_present=True, strict_busy=True, cur_ops.append(("rename", mac, new_name, (new_name, tmp_name))) target['name'] = tmp_name - cur_byname = update_byname(cur_bymac) + cur_byname = update_byname(cur_info) if target['up']: ups.append(("up", mac, new_name, (tmp_name,))) cur_ops.append(("rename", mac, new_name, (cur['name'], new_name))) cur['name'] = new_name - cur_byname = update_byname(cur_bymac) + cur_byname = update_byname(cur_info) ops += cur_ops opmap = {'rename': rename, 'down': down, 'up': up} @@ -426,6 +523,36 @@ def get_interfaces_by_mac(): return ret +def get_interfaces(): + """Return list of interface tuples (name, mac, driver, device_id) + + Bridges and any devices that have a 'stolen' mac are excluded.""" + try: + devs = get_devicelist() + except OSError as e: + if e.errno == errno.ENOENT: + devs = [] + else: + raise + ret = [] + empty_mac = '00:00:00:00:00:00' + for name in devs: + if not interface_has_own_mac(name): + continue + if is_bridge(name): + continue + if is_vlan(name): + continue + mac = get_interface_mac(name) + # some devices may not have a mac (tun0) + if not mac: + continue + if mac == empty_mac and name != 'lo': + continue + ret.append((name, mac, device_driver(name), device_devid(name))) + return ret + + class RendererNotFoundError(RuntimeError): pass diff --git a/cloudinit/net/eni.py b/cloudinit/net/eni.py index 98ce01e4..b707146c 100644 --- a/cloudinit/net/eni.py +++ b/cloudinit/net/eni.py @@ -72,6 +72,8 @@ def _iface_add_attrs(iface, index): content = [] ignore_map = [ 'control', + 'device_id', + 'driver', 'index', 'inet', 'mode', diff --git a/cloudinit/net/renderer.py b/cloudinit/net/renderer.py index c68658dc..bba139e5 100644 --- a/cloudinit/net/renderer.py +++ b/cloudinit/net/renderer.py @@ -34,8 +34,10 @@ class Renderer(object): for iface in 
network_state.iter_interfaces(filter_by_physical): # for physical interfaces write out a persist net udev rule if 'name' in iface and iface.get('mac_address'): + driver = iface.get('driver', None) content.write(generate_udev_rule(iface['name'], - iface['mac_address'])) + iface['mac_address'], + driver=driver)) return content.getvalue() @abc.abstractmethod diff --git a/cloudinit/net/udev.py b/cloudinit/net/udev.py index fd2fd8c7..58c0a708 100644 --- a/cloudinit/net/udev.py +++ b/cloudinit/net/udev.py @@ -23,7 +23,7 @@ def compose_udev_setting(key, value): return '%s="%s"' % (key, value) -def generate_udev_rule(interface, mac): +def generate_udev_rule(interface, mac, driver=None): """Return a udev rule to set the name of network interface with `mac`. The rule ends up as a single line looking something like: @@ -31,10 +31,13 @@ def generate_udev_rule(interface, mac): SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", ATTR{address}="ff:ee:dd:cc:bb:aa", NAME="eth0" """ + if not driver: + driver = '?*' + rule = ', '.join([ compose_udev_equality('SUBSYSTEM', 'net'), compose_udev_equality('ACTION', 'add'), - compose_udev_equality('DRIVERS', '?*'), + compose_udev_equality('DRIVERS', driver), compose_udev_attr_equality('address', mac), compose_udev_setting('NAME', interface), ]) diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index 4fe0d635..b5a95a1f 100644 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -16,6 +16,7 @@ from xml.dom import minidom import xml.etree.ElementTree as ET from cloudinit import log as logging +from cloudinit import net from cloudinit import sources from cloudinit.sources.helpers.azure import get_metadata_from_fabric from cloudinit import util @@ -245,7 +246,9 @@ def temporary_hostname(temp_hostname, cfg, hostname_command='hostname'): set_hostname(previous_hostname, hostname_command) -class DataSourceAzureNet(sources.DataSource): +class DataSourceAzure(sources.DataSource): 
+ _negotiated = False + def __init__(self, sys_cfg, distro, paths): sources.DataSource.__init__(self, sys_cfg, distro, paths) self.seed_dir = os.path.join(paths.seed_dir, 'azure') @@ -255,6 +258,7 @@ class DataSourceAzureNet(sources.DataSource): util.get_cfg_by_path(sys_cfg, DS_CFG_PATH, {}), BUILTIN_DS_CONFIG]) self.dhclient_lease_file = self.ds_cfg.get('dhclient_lease_file') + self._network_config = None def __str__(self): root = sources.DataSource.__str__(self) @@ -331,6 +335,7 @@ class DataSourceAzureNet(sources.DataSource): if asset_tag != AZURE_CHASSIS_ASSET_TAG: LOG.debug("Non-Azure DMI asset tag '%s' discovered.", asset_tag) return False + ddir = self.ds_cfg['data_dir'] candidates = [self.seed_dir] @@ -375,13 +380,14 @@ class DataSourceAzureNet(sources.DataSource): LOG.debug("using files cached in %s", ddir) # azure / hyper-v provides random data here + # TODO. find the seed on FreeBSD platform + # now update ds_cfg to reflect contents pass in config if not util.is_FreeBSD(): seed = util.load_file("/sys/firmware/acpi/tables/OEM0", quiet=True, decode=False) if seed: self.metadata['random_seed'] = seed - # TODO. find the seed on FreeBSD platform - # now update ds_cfg to reflect contents pass in config + user_ds_cfg = util.get_cfg_by_path(self.cfg, DS_CFG_PATH, {}) self.ds_cfg = util.mergemanydict([user_ds_cfg, self.ds_cfg]) @@ -389,6 +395,40 @@ class DataSourceAzureNet(sources.DataSource): # the directory to be protected. 
write_files(ddir, files, dirmode=0o700) + self.metadata['instance-id'] = util.read_dmi_data('system-uuid') + + return True + + def device_name_to_device(self, name): + return self.ds_cfg['disk_aliases'].get(name) + + def get_config_obj(self): + return self.cfg + + def check_instance_id(self, sys_cfg): + # quickly (local check only) if self.instance_id is still valid + return sources.instance_id_matches_system_uuid(self.get_instance_id()) + + def setup(self, is_new_instance): + if self._negotiated is False: + LOG.debug("negotiating for %s (new_instance=%s)", + self.get_instance_id(), is_new_instance) + fabric_data = self._negotiate() + LOG.debug("negotiating returned %s", fabric_data) + if fabric_data: + self.metadata.update(fabric_data) + self._negotiated = True + else: + LOG.debug("negotiating already done for %s", + self.get_instance_id()) + + def _negotiate(self): + """Negotiate with fabric and return data from it. + + On success, returns a dictionary including 'public_keys'. + On failure, returns False. + """ + if self.ds_cfg['agent_command'] == AGENT_START_BUILTIN: self.bounce_network_with_azure_hostname() @@ -398,31 +438,64 @@ class DataSourceAzureNet(sources.DataSource): else: metadata_func = self.get_metadata_from_agent + LOG.debug("negotiating with fabric via agent command %s", + self.ds_cfg['agent_command']) try: fabric_data = metadata_func() except Exception as exc: - LOG.info("Error communicating with Azure fabric; assume we aren't" - " on Azure.", exc_info=True) + LOG.warning( + "Error communicating with Azure fabric; You may experience." 
+ "connectivity issues.", exc_info=True) return False - self.metadata['instance-id'] = util.read_dmi_data('system-uuid') - self.metadata.update(fabric_data) - - return True - def device_name_to_device(self, name): - return self.ds_cfg['disk_aliases'].get(name) - - def get_config_obj(self): - return self.cfg - - def check_instance_id(self, sys_cfg): - # quickly (local check only) if self.instance_id is still valid - return sources.instance_id_matches_system_uuid(self.get_instance_id()) + return fabric_data def activate(self, cfg, is_new_instance): address_ephemeral_resize(is_new_instance=is_new_instance) return + @property + def network_config(self): + """Generate a network config like net.generate_fallback_network() with + the following execptions. + + 1. Probe the drivers of the net-devices present and inject them in + the network configuration under params: driver: <driver> value + 2. If the driver value is 'mlx4_core', the control mode should be + set to manual. The device will be later used to build a bond, + for now we want to ensure the device gets named but does not + break any network configuration + """ + blacklist = ['mlx4_core'] + if not self._network_config: + LOG.debug('Azure: generating fallback configuration') + # generate a network config, blacklist picking any mlx4_core devs + netconfig = net.generate_fallback_config( + blacklist_drivers=blacklist, config_driver=True) + + # if we have any blacklisted devices, update the network_config to + # include the device, mac, and driver values, but with no ip + # config; this ensures udev rules are generated but won't affect + # ip configuration + bl_found = 0 + for bl_dev in [dev for dev in net.get_devicelist() + if net.device_driver(dev) in blacklist]: + bl_found += 1 + cfg = { + 'type': 'physical', + 'name': 'vf%d' % bl_found, + 'mac_address': net.get_interface_mac(bl_dev), + 'params': { + 'driver': net.device_driver(bl_dev), + 'device_id': net.device_devid(bl_dev), + }, + } + 
netconfig['config'].append(cfg) + + self._network_config = netconfig + + return self._network_config + def _partitions_on_device(devpath, maxnum=16): # return a list of tuples (ptnum, path) for each part on devpath @@ -849,9 +922,12 @@ class NonAzureDataSource(Exception): pass +# Legacy: Must be present in case we load an old pkl object +DataSourceAzureNet = DataSourceAzure + # Used to match classes to dependencies datasources = [ - (DataSourceAzureNet, (sources.DEP_FILESYSTEM, sources.DEP_NETWORK)), + (DataSourceAzure, (sources.DEP_FILESYSTEM, )), ] diff --git a/cloudinit/sources/__init__.py b/cloudinit/sources/__init__.py index c3ce36d6..952caf35 100644 --- a/cloudinit/sources/__init__.py +++ b/cloudinit/sources/__init__.py @@ -251,10 +251,23 @@ class DataSource(object): def first_instance_boot(self): return + def setup(self, is_new_instance): + """setup(is_new_instance) + + This is called before user-data and vendor-data have been processed. + + Unless the datasource has set mode to 'local', then networking + per 'fallback' or per 'network_config' will have been written and + brought up the OS at this point. + """ + return + def activate(self, cfg, is_new_instance): """activate(cfg, is_new_instance) - This is called before the init_modules will be called. + This is called before the init_modules will be called but after + the user-data and vendor-data have been fully processed. + The cfg is fully up to date config, it contains a merged view of system config, datasource config, user config, vendor config. It should be used rather than the sys_cfg passed to __init__. 
diff --git a/cloudinit/stages.py b/cloudinit/stages.py index ad557827..a1c4a517 100644 --- a/cloudinit/stages.py +++ b/cloudinit/stages.py @@ -362,6 +362,11 @@ class Init(object): self._store_userdata() self._store_vendordata() + def setup_datasource(self): + if self.datasource is None: + raise RuntimeError("Datasource is None, cannot setup.") + self.datasource.setup(is_new_instance=self.is_new_instance()) + def activate_datasource(self): if self.datasource is None: raise RuntimeError("Datasource is None, cannot activate.") |