diff options
author | Yves-Alexis Perez <corsac@debian.org> | 2013-01-02 14:18:20 +0100 |
---|---|---|
committer | Yves-Alexis Perez <corsac@debian.org> | 2013-01-02 14:18:20 +0100 |
commit | c1343b3278cdf99533b7902744d15969f9d6fdc1 (patch) | |
tree | d5ed3dc5677a59260ec41cd39bb284d3e94c91b3 /src/libhydra/plugins/kernel_netlink/kernel_netlink_net.c | |
parent | b34738ed08c2227300d554b139e2495ca5da97d6 (diff) | |
download | vyos-strongswan-c1343b3278cdf99533b7902744d15969f9d6fdc1.tar.gz vyos-strongswan-c1343b3278cdf99533b7902744d15969f9d6fdc1.zip |
Imported Upstream version 5.0.1
Diffstat (limited to 'src/libhydra/plugins/kernel_netlink/kernel_netlink_net.c')
-rw-r--r-- | src/libhydra/plugins/kernel_netlink/kernel_netlink_net.c | 1712 |
1 files changed, 1216 insertions, 496 deletions
diff --git a/src/libhydra/plugins/kernel_netlink/kernel_netlink_net.c b/src/libhydra/plugins/kernel_netlink/kernel_netlink_net.c index cce0ff402..3f63a8496 100644 --- a/src/libhydra/plugins/kernel_netlink/kernel_netlink_net.c +++ b/src/libhydra/plugins/kernel_netlink/kernel_netlink_net.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 Tobias Brunner + * Copyright (C) 2008-2012 Tobias Brunner * Copyright (C) 2005-2008 Martin Willi * Hochschule fuer Technik Rapperswil * @@ -38,6 +38,7 @@ */ #include <sys/socket.h> +#include <sys/utsname.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> #include <unistd.h> @@ -50,32 +51,38 @@ #include <hydra.h> #include <debug.h> #include <threading/thread.h> -#include <threading/condvar.h> #include <threading/mutex.h> +#include <threading/rwlock.h> +#include <threading/rwlock_condvar.h> +#include <threading/spinlock.h> +#include <utils/hashtable.h> #include <utils/linked_list.h> #include <processing/jobs/callback_job.h> /** delay before firing roam events (ms) */ #define ROAM_DELAY 100 +/** delay before reinstalling routes (ms) */ +#define ROUTE_DELAY 100 + typedef struct addr_entry_t addr_entry_t; /** - * IP address in an inface_entry_t + * IP address in an iface_entry_t */ struct addr_entry_t { - /** The ip address */ + /** the ip address */ host_t *ip; - /** virtual IP managed by us */ - bool virtual; - /** scope of the address */ u_char scope; - /** Number of times this IP is used, if virtual */ + /** number of times this IP is used, if virtual (i.e. 
managed by us) */ u_int refcount; + + /** TRUE once it is installed, if virtual */ + bool installed; }; /** @@ -105,6 +112,9 @@ struct iface_entry_t { /** list of addresses as host_t */ linked_list_t *addrs; + + /** TRUE if usable by config */ + bool usable; }; /** @@ -116,6 +126,208 @@ static void iface_entry_destroy(iface_entry_t *this) free(this); } +/** + * find an interface entry by index + */ +static bool iface_entry_by_index(iface_entry_t *this, int *ifindex) +{ + return this->ifindex == *ifindex; +} + +/** + * find an interface entry by name + */ +static bool iface_entry_by_name(iface_entry_t *this, char *ifname) +{ + return streq(this->ifname, ifname); +} + +/** + * check if an interface is up + */ +static inline bool iface_entry_up(iface_entry_t *iface) +{ + return (iface->flags & IFF_UP) == IFF_UP; +} + +/** + * check if an interface is up and usable + */ +static inline bool iface_entry_up_and_usable(iface_entry_t *iface) +{ + return iface->usable && iface_entry_up(iface); +} + +typedef struct addr_map_entry_t addr_map_entry_t; + +/** + * Entry that maps an IP address to an interface entry + */ +struct addr_map_entry_t { + /** The IP address */ + host_t *ip; + + /** The address entry for this IP address */ + addr_entry_t *addr; + + /** The interface this address is installed on */ + iface_entry_t *iface; +}; + +/** + * Hash a addr_map_entry_t object, all entries with the same IP address + * are stored in the same bucket + */ +static u_int addr_map_entry_hash(addr_map_entry_t *this) +{ + return chunk_hash(this->ip->get_address(this->ip)); +} + +/** + * Compare two addr_map_entry_t objects, two entries are equal if they are + * installed on the same interface + */ +static bool addr_map_entry_equals(addr_map_entry_t *a, addr_map_entry_t *b) +{ + return a->iface->ifindex == b->iface->ifindex && + a->ip->ip_equals(a->ip, b->ip); +} + +/** + * Used with get_match this finds an address entry if it is installed on + * an up and usable interface + */ +static bool 
addr_map_entry_match_up_and_usable(addr_map_entry_t *a, + addr_map_entry_t *b) +{ + return iface_entry_up_and_usable(b->iface) && + a->ip->ip_equals(a->ip, b->ip); +} + +/** + * Used with get_match this finds an address entry if it is installed on + * any active local interface + */ +static bool addr_map_entry_match_up(addr_map_entry_t *a, addr_map_entry_t *b) +{ + return iface_entry_up(b->iface) && a->ip->ip_equals(a->ip, b->ip); +} + +/** + * Used with get_match this finds an address entry if it is installed on + * any local interface + */ +static bool addr_map_entry_match(addr_map_entry_t *a, addr_map_entry_t *b) +{ + return a->ip->ip_equals(a->ip, b->ip); +} + +typedef struct route_entry_t route_entry_t; + +/** + * Installed routing entry + */ +struct route_entry_t { + /** Name of the interface the route is bound to */ + char *if_name; + + /** Source ip of the route */ + host_t *src_ip; + + /** Gateway for this route */ + host_t *gateway; + + /** Destination net */ + chunk_t dst_net; + + /** Destination net prefixlen */ + u_int8_t prefixlen; +}; + +/** + * Clone a route_entry_t object. 
+ */ +static route_entry_t *route_entry_clone(route_entry_t *this) +{ + route_entry_t *route; + + INIT(route, + .if_name = strdup(this->if_name), + .src_ip = this->src_ip->clone(this->src_ip), + .gateway = this->gateway->clone(this->gateway), + .dst_net = chunk_clone(this->dst_net), + .prefixlen = this->prefixlen, + ); + return route; +} + +/** + * Destroy a route_entry_t object + */ +static void route_entry_destroy(route_entry_t *this) +{ + free(this->if_name); + DESTROY_IF(this->src_ip); + DESTROY_IF(this->gateway); + chunk_free(&this->dst_net); + free(this); +} + +/** + * Hash a route_entry_t object + */ +static u_int route_entry_hash(route_entry_t *this) +{ + return chunk_hash_inc(chunk_from_thing(this->prefixlen), + chunk_hash(this->dst_net)); +} + +/** + * Compare two route_entry_t objects + */ +static bool route_entry_equals(route_entry_t *a, route_entry_t *b) +{ + return a->if_name && b->if_name && streq(a->if_name, b->if_name) && + a->src_ip->ip_equals(a->src_ip, b->src_ip) && + a->gateway->ip_equals(a->gateway, b->gateway) && + chunk_equals(a->dst_net, b->dst_net) && a->prefixlen == b->prefixlen; +} + +typedef struct net_change_t net_change_t; + +/** + * Queued network changes + */ +struct net_change_t { + /** Name of the interface that got activated (or an IP appeared on) */ + char *if_name; +}; + +/** + * Destroy a net_change_t object + */ +static void net_change_destroy(net_change_t *this) +{ + free(this->if_name); + free(this); +} + +/** + * Hash a net_change_t object + */ +static u_int net_change_hash(net_change_t *this) +{ + return chunk_hash(chunk_create(this->if_name, strlen(this->if_name))); +} + +/** + * Compare two net_change_t objects + */ +static bool net_change_equals(net_change_t *a, net_change_t *b) +{ + return streq(a->if_name, b->if_name); +} + typedef struct private_kernel_netlink_net_t private_kernel_netlink_net_t; /** @@ -128,14 +340,14 @@ struct private_kernel_netlink_net_t { kernel_netlink_net_t public; /** - * mutex to lock access 
to various lists + * lock to access various lists and maps */ - mutex_t *mutex; + rwlock_t *lock; /** * condition variable to signal virtual IP add/removal */ - condvar_t *condvar; + rwlock_condvar_t *condvar; /** * Cached list of interfaces and its addresses (iface_entry_t) @@ -143,9 +355,14 @@ struct private_kernel_netlink_net_t { linked_list_t *ifaces; /** - * job receiving netlink events + * Map for IP addresses to iface_entry_t objects (addr_map_entry_t) */ - callback_job_t *job; + hashtable_t *addrs; + + /** + * Map for virtual IP addresses to iface_entry_t objects (addr_map_entry_t) + */ + hashtable_t *vips; /** * netlink rt socket (routing) @@ -158,9 +375,14 @@ struct private_kernel_netlink_net_t { int socket_events; /** - * time of the last roam event + * earliest time of the next roam event + */ + timeval_t next_roam; + + /** + * lock to check and update roam event time */ - timeval_t last_roam; + spinlock_t *roam_lock; /** * routing table to install routes @@ -173,6 +395,31 @@ struct private_kernel_netlink_net_t { int routing_table_prio; /** + * installed routes + */ + hashtable_t *routes; + + /** + * mutex for routes + */ + mutex_t *routes_lock; + + /** + * interface changes which may trigger route reinstallation + */ + hashtable_t *net_changes; + + /** + * mutex for route reinstallation triggers + */ + mutex_t *net_changes_lock; + + /** + * time of last route reinstallation + */ + timeval_t last_route_reinstall; + + /** * whether to react to RTM_NEWROUTE or RTM_DELROUTE events */ bool process_route; @@ -183,79 +430,253 @@ struct private_kernel_netlink_net_t { bool install_virtual_ip; /** + * the name of the interface virtual IP addresses are installed on + */ + char *install_virtual_ip_on; + + /** + * whether preferred source addresses can be specified for IPv6 routes + */ + bool rta_prefsrc_for_ipv6; + + /** * list with routing tables to be excluded from route lookup */ linked_list_t *rt_exclude; }; /** - * get the refcount of a virtual ip + * Forward 
declaration + */ +static status_t manage_srcroute(private_kernel_netlink_net_t *this, + int nlmsg_type, int flags, chunk_t dst_net, + u_int8_t prefixlen, host_t *gateway, + host_t *src_ip, char *if_name); + +/** + * Clear the queued network changes. */ -static int get_vip_refcount(private_kernel_netlink_net_t *this, host_t* ip) +static void net_changes_clear(private_kernel_netlink_net_t *this) { - enumerator_t *ifaces, *addrs; - iface_entry_t *iface; - addr_entry_t *addr; - int refcount = 0; + enumerator_t *enumerator; + net_change_t *change; - ifaces = this->ifaces->create_enumerator(this->ifaces); - while (ifaces->enumerate(ifaces, (void**)&iface)) + enumerator = this->net_changes->create_enumerator(this->net_changes); + while (enumerator->enumerate(enumerator, NULL, (void**)&change)) { - addrs = iface->addrs->create_enumerator(iface->addrs); - while (addrs->enumerate(addrs, (void**)&addr)) - { - if (addr->virtual && (iface->flags & IFF_UP) && - ip->ip_equals(ip, addr->ip)) + this->net_changes->remove_at(this->net_changes, enumerator); + net_change_destroy(change); + } + enumerator->destroy(enumerator); +} + +/** + * Act upon queued network changes. 
+ */ +static job_requeue_t reinstall_routes(private_kernel_netlink_net_t *this) +{ + enumerator_t *enumerator; + route_entry_t *route; + + this->net_changes_lock->lock(this->net_changes_lock); + this->routes_lock->lock(this->routes_lock); + + enumerator = this->routes->create_enumerator(this->routes); + while (enumerator->enumerate(enumerator, NULL, (void**)&route)) + { + net_change_t *change, lookup = { + .if_name = route->if_name, + }; + /* check if a change for the outgoing interface is queued */ + change = this->net_changes->get(this->net_changes, &lookup); + if (!change) + { /* in case src_ip is not on the outgoing interface */ + if (this->public.interface.get_interface(&this->public.interface, + route->src_ip, &lookup.if_name)) { - refcount = addr->refcount; - break; + if (!streq(lookup.if_name, route->if_name)) + { + change = this->net_changes->get(this->net_changes, &lookup); + } + free(lookup.if_name); } } - addrs->destroy(addrs); - if (refcount) + if (change) { - break; + manage_srcroute(this, RTM_NEWROUTE, NLM_F_CREATE | NLM_F_EXCL, + route->dst_net, route->prefixlen, route->gateway, + route->src_ip, route->if_name); } } - ifaces->destroy(ifaces); + enumerator->destroy(enumerator); + this->routes_lock->unlock(this->routes_lock); + + net_changes_clear(this); + this->net_changes_lock->unlock(this->net_changes_lock); + return JOB_REQUEUE_NONE; +} + +/** + * Queue route reinstallation caused by network changes for a given interface. + * + * The route reinstallation is delayed for a while and only done once for + * several calls during this delay, in order to avoid doing it too often. + * The interface name is freed. 
+ */ +static void queue_route_reinstall(private_kernel_netlink_net_t *this, + char *if_name) +{ + net_change_t *update, *found; + timeval_t now; + job_t *job; + + INIT(update, + .if_name = if_name + ); + + this->net_changes_lock->lock(this->net_changes_lock); + found = this->net_changes->put(this->net_changes, update, update); + if (found) + { + net_change_destroy(found); + } + time_monotonic(&now); + if (timercmp(&now, &this->last_route_reinstall, >)) + { + now.tv_usec += ROUTE_DELAY * 1000; + while (now.tv_usec > 1000000) + { + now.tv_sec++; + now.tv_usec -= 1000000; + } + this->last_route_reinstall = now; + + job = (job_t*)callback_job_create((callback_job_cb_t)reinstall_routes, + this, NULL, NULL); + lib->scheduler->schedule_job_ms(lib->scheduler, job, ROUTE_DELAY); + } + this->net_changes_lock->unlock(this->net_changes_lock); +} - return refcount; +/** + * check if the given IP is known as virtual IP and currently installed + * + * this function will also return TRUE if the virtual IP entry disappeared. + * in that case the returned entry will be NULL. 
+ * + * this->lock must be held when calling this function + */ +static bool is_vip_installed_or_gone(private_kernel_netlink_net_t *this, + host_t *ip, addr_map_entry_t **entry) +{ + addr_map_entry_t lookup = { + .ip = ip, + }; + + *entry = this->vips->get_match(this->vips, &lookup, + (void*)addr_map_entry_match); + if (*entry == NULL) + { /* the virtual IP disappeared */ + return TRUE; + } + return (*entry)->addr->installed; +} + +/** + * check if the given IP is known as virtual IP + * + * this->lock must be held when calling this function + */ +static bool is_known_vip(private_kernel_netlink_net_t *this, host_t *ip) +{ + addr_map_entry_t lookup = { + .ip = ip, + }; + + return this->vips->get_match(this->vips, &lookup, + (void*)addr_map_entry_match) != NULL; +} + +/** + * Add an address map entry + */ +static void addr_map_entry_add(hashtable_t *map, addr_entry_t *addr, + iface_entry_t *iface) +{ + addr_map_entry_t *entry; + + INIT(entry, + .ip = addr->ip, + .addr = addr, + .iface = iface, + ); + entry = map->put(map, entry, entry); + free(entry); +} + +/** + * Remove an address map entry + */ +static void addr_map_entry_remove(hashtable_t *map, addr_entry_t *addr, + iface_entry_t *iface) +{ + addr_map_entry_t *entry, lookup = { + .ip = addr->ip, + .addr = addr, + .iface = iface, + }; + + entry = map->remove(map, &lookup); + free(entry); } /** * get the first non-virtual ip address on the given interface. + * if a candidate address is given, we first search for that address and if not + * found return the address as above. * returned host is a clone, has to be freed by caller. 
+ * + * this->lock must be held when calling this function */ static host_t *get_interface_address(private_kernel_netlink_net_t *this, - int ifindex, int family) + int ifindex, int family, host_t *candidate) { - enumerator_t *ifaces, *addrs; iface_entry_t *iface; + enumerator_t *addrs; addr_entry_t *addr; host_t *ip = NULL; - this->mutex->lock(this->mutex); - ifaces = this->ifaces->create_enumerator(this->ifaces); - while (ifaces->enumerate(ifaces, &iface)) + if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_index, + (void**)&iface, &ifindex) == SUCCESS) { - if (iface->ifindex == ifindex) - { + if (iface->usable) + { /* only use interfaces not excluded by config */ addrs = iface->addrs->create_enumerator(iface->addrs); while (addrs->enumerate(addrs, &addr)) { - if (!addr->virtual && addr->ip->get_family(addr->ip) == family) + if (addr->refcount) + { /* ignore virtual IP addresses */ + continue; + } + if (addr->ip->get_family(addr->ip) == family) { - ip = addr->ip->clone(addr->ip); - break; + if (!candidate || candidate->ip_equals(candidate, addr->ip)) + { /* stop at the first address if we don't search for a + * candidate or if the candidate matches */ + ip = addr->ip; + break; + } + else if (!ip) + { /* store the first address as fallback if candidate is + * not found */ + ip = addr->ip; + } } } addrs->destroy(addrs); - break; } } - ifaces->destroy(ifaces); - this->mutex->unlock(this->mutex); - return ip; + return ip ? 
ip->clone(ip) : NULL; } /** @@ -277,21 +698,60 @@ static void fire_roam_event(private_kernel_netlink_net_t *this, bool address) job_t *job; time_monotonic(&now); - if (timercmp(&now, &this->last_roam, >)) + this->roam_lock->lock(this->roam_lock); + if (!timercmp(&now, &this->next_roam, >)) { - now.tv_usec += ROAM_DELAY * 1000; - while (now.tv_usec > 1000000) - { - now.tv_sec++; - now.tv_usec -= 1000000; - } - this->last_roam = now; + this->roam_lock->unlock(this->roam_lock); + return; + } + now.tv_usec += ROAM_DELAY * 1000; + while (now.tv_usec > 1000000) + { + now.tv_sec++; + now.tv_usec -= 1000000; + } + this->next_roam = now; + this->roam_lock->unlock(this->roam_lock); + + job = (job_t*)callback_job_create((callback_job_cb_t)roam_event, + (void*)(uintptr_t)(address ? 1 : 0), + NULL, NULL); + lib->scheduler->schedule_job_ms(lib->scheduler, job, ROAM_DELAY); +} - job = (job_t*)callback_job_create((callback_job_cb_t)roam_event, - (void*)(uintptr_t)(address ? 1 : 0), - NULL, NULL); - lib->scheduler->schedule_job_ms(lib->scheduler, job, ROAM_DELAY); +/** + * check if an interface with a given index is up and usable + * + * this->lock must be locked when calling this function + */ +static bool is_interface_up_and_usable(private_kernel_netlink_net_t *this, + int index) +{ + iface_entry_t *iface; + + if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_index, + (void**)&iface, &index) == SUCCESS) + { + return iface_entry_up_and_usable(iface); + } + return FALSE; +} + +/** + * unregister the current addr_entry_t from the hashtable it is stored in + * + * this->lock must be locked when calling this function + */ +static void addr_entry_unregister(addr_entry_t *addr, iface_entry_t *iface, + private_kernel_netlink_net_t *this) +{ + if (addr->refcount) + { + addr_map_entry_remove(this->vips, addr, iface); + this->condvar->broadcast(this->condvar); + return; } + addr_map_entry_remove(this->addrs, addr, iface); } /** @@ -306,9 +766,9 @@ static void 
process_link(private_kernel_netlink_net_t *this, enumerator_t *enumerator; iface_entry_t *current, *entry = NULL; char *name = NULL; - bool update = FALSE; + bool update = FALSE, update_routes = FALSE; - while(RTA_OK(rta, rtasize)) + while (RTA_OK(rta, rtasize)) { switch (rta->rta_type) { @@ -323,40 +783,30 @@ static void process_link(private_kernel_netlink_net_t *this, name = "(unknown)"; } - this->mutex->lock(this->mutex); + this->lock->write_lock(this->lock); switch (hdr->nlmsg_type) { case RTM_NEWLINK: { - if (msg->ifi_flags & IFF_LOOPBACK) - { /* ignore loopback interfaces */ - break; - } - enumerator = this->ifaces->create_enumerator(this->ifaces); - while (enumerator->enumerate(enumerator, &current)) - { - if (current->ifindex == msg->ifi_index) - { - entry = current; - break; - } - } - enumerator->destroy(enumerator); - if (!entry) + if (this->ifaces->find_first(this->ifaces, + (void*)iface_entry_by_index, (void**)&entry, + &msg->ifi_index) != SUCCESS) { - entry = malloc_thing(iface_entry_t); - entry->ifindex = msg->ifi_index; - entry->flags = 0; - entry->addrs = linked_list_create(); + INIT(entry, + .ifindex = msg->ifi_index, + .addrs = linked_list_create(), + .usable = hydra->kernel_interface->is_interface_usable( + hydra->kernel_interface, name), + ); this->ifaces->insert_last(this->ifaces, entry); } strncpy(entry->ifname, name, IFNAMSIZ); entry->ifname[IFNAMSIZ-1] = '\0'; - if (event) + if (event && entry->usable) { if (!(entry->flags & IFF_UP) && (msg->ifi_flags & IFF_UP)) { - update = TRUE; + update = update_routes = TRUE; DBG1(DBG_KNL, "interface %s activated", name); } if ((entry->flags & IFF_UP) && !(msg->ifi_flags & IFF_UP)) @@ -375,12 +825,16 @@ static void process_link(private_kernel_netlink_net_t *this, { if (current->ifindex == msg->ifi_index) { - if (event) + if (event && current->usable) { update = TRUE; DBG1(DBG_KNL, "interface %s deleted", current->ifname); } + /* TODO: move virtual IPs installed on this interface to + * another interface? 
*/ this->ifaces->remove_at(this->ifaces, enumerator); + current->addrs->invoke_function(current->addrs, + (void*)addr_entry_unregister, current, this); iface_entry_destroy(current); break; } @@ -389,9 +843,13 @@ static void process_link(private_kernel_netlink_net_t *this, break; } } - this->mutex->unlock(this->mutex); + this->lock->unlock(this->lock); + + if (update_routes && event) + { + queue_route_reinstall(this, strdup(name)); + } - /* send an update to all IKE_SAs */ if (update && event) { fire_roam_event(this, TRUE); @@ -408,13 +866,12 @@ static void process_addr(private_kernel_netlink_net_t *this, struct rtattr *rta = IFA_RTA(msg); size_t rtasize = IFA_PAYLOAD (hdr); host_t *host = NULL; - enumerator_t *ifaces, *addrs; iface_entry_t *iface; - addr_entry_t *addr; chunk_t local = chunk_empty, address = chunk_empty; + char *route_ifname = NULL; bool update = FALSE, found = FALSE, changed = FALSE; - while(RTA_OK(rta, rtasize)) + while (RTA_OK(rta, rtasize)) { switch (rta->rta_type) { @@ -447,65 +904,92 @@ static void process_addr(private_kernel_netlink_net_t *this, return; } - this->mutex->lock(this->mutex); - ifaces = this->ifaces->create_enumerator(this->ifaces); - while (ifaces->enumerate(ifaces, &iface)) + this->lock->write_lock(this->lock); + if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_index, + (void**)&iface, &msg->ifa_index) == SUCCESS) { - if (iface->ifindex == msg->ifa_index) + addr_map_entry_t *entry, lookup = { + .ip = host, + .iface = iface, + }; + addr_entry_t *addr; + + entry = this->vips->get(this->vips, &lookup); + if (entry) { - addrs = iface->addrs->create_enumerator(iface->addrs); - while (addrs->enumerate(addrs, &addr)) + if (hdr->nlmsg_type == RTM_NEWADDR) + { /* mark as installed and signal waiting threads */ + entry->addr->installed = TRUE; + } + else + { /* the address was already marked as uninstalled */ + addr = entry->addr; + iface->addrs->remove(iface->addrs, addr, NULL); + addr_map_entry_remove(this->vips, addr, 
iface); + addr_entry_destroy(addr); + } + /* no roam events etc. for virtual IPs */ + this->condvar->broadcast(this->condvar); + this->lock->unlock(this->lock); + host->destroy(host); + return; + } + entry = this->addrs->get(this->addrs, &lookup); + if (entry) + { + if (hdr->nlmsg_type == RTM_DELADDR) { - if (host->ip_equals(host, addr->ip)) + found = TRUE; + addr = entry->addr; + iface->addrs->remove(iface->addrs, addr, NULL); + if (iface->usable) { - found = TRUE; - if (hdr->nlmsg_type == RTM_DELADDR) - { - iface->addrs->remove_at(iface->addrs, addrs); - if (!addr->virtual) - { - changed = TRUE; - DBG1(DBG_KNL, "%H disappeared from %s", - host, iface->ifname); - } - addr_entry_destroy(addr); - } - else if (hdr->nlmsg_type == RTM_NEWADDR && addr->virtual) - { - addr->refcount = 1; - } + changed = TRUE; + DBG1(DBG_KNL, "%H disappeared from %s", host, + iface->ifname); } + addr_map_entry_remove(this->addrs, addr, iface); + addr_entry_destroy(addr); } - addrs->destroy(addrs); - + } + else + { if (hdr->nlmsg_type == RTM_NEWADDR) { - if (!found) + found = TRUE; + changed = TRUE; + route_ifname = strdup(iface->ifname); + INIT(addr, + .ip = host->clone(host), + .scope = msg->ifa_scope, + ); + iface->addrs->insert_last(iface->addrs, addr); + addr_map_entry_add(this->addrs, addr, iface); + if (event && iface->usable) { - found = TRUE; - changed = TRUE; - addr = malloc_thing(addr_entry_t); - addr->ip = host->clone(host); - addr->virtual = FALSE; - addr->refcount = 1; - addr->scope = msg->ifa_scope; - - iface->addrs->insert_last(iface->addrs, addr); - if (event) - { - DBG1(DBG_KNL, "%H appeared on %s", host, iface->ifname); - } + DBG1(DBG_KNL, "%H appeared on %s", host, iface->ifname); } } - if (found && (iface->flags & IFF_UP)) - { - update = TRUE; - } - break; + } + if (found && (iface->flags & IFF_UP)) + { + update = TRUE; + } + if (!iface->usable) + { /* ignore events for interfaces excluded by config */ + update = changed = FALSE; } } - ifaces->destroy(ifaces); - 
this->mutex->unlock(this->mutex); + this->lock->unlock(this->lock); + + if (update && event && route_ifname) + { + queue_route_reinstall(this, route_ifname); + } + else + { + free(route_ifname); + } host->destroy(host); /* send an update to all IKE_SAs */ @@ -532,6 +1016,10 @@ static void process_route(private_kernel_netlink_net_t *this, struct nlmsghdr *h { return; } + else if (msg->rtm_flags & RTM_F_CLONED) + { /* ignore cached routes, seem to be created a lot for IPv6 */ + return; + } while (RTA_OK(rta, rtasize)) { @@ -551,20 +1039,26 @@ static void process_route(private_kernel_netlink_net_t *this, struct nlmsghdr *h } rta = RTA_NEXT(rta, rtasize); } + this->lock->read_lock(this->lock); + if (rta_oif && !is_interface_up_and_usable(this, rta_oif)) + { /* ignore route changes for interfaces that are ignored or down */ + this->lock->unlock(this->lock); + DESTROY_IF(host); + return; + } if (!host && rta_oif) { - host = get_interface_address(this, rta_oif, msg->rtm_family); + host = get_interface_address(this, rta_oif, msg->rtm_family, NULL); } - if (host) - { - this->mutex->lock(this->mutex); - if (!get_vip_refcount(this, host)) - { /* ignore routes added for virtual IPs */ - fire_roam_event(this, FALSE); - } - this->mutex->unlock(this->mutex); - host->destroy(host); + if (!host || is_known_vip(this, host)) + { /* ignore routes added for virtual IPs */ + this->lock->unlock(this->lock); + DESTROY_IF(host); + return; } + this->lock->unlock(this->lock); + fire_roam_event(this, FALSE); + host->destroy(host); } /** @@ -614,12 +1108,10 @@ static job_requeue_t receive_events(private_kernel_netlink_net_t *this) case RTM_NEWADDR: case RTM_DELADDR: process_addr(this, hdr, TRUE); - this->condvar->broadcast(this->condvar); break; case RTM_NEWLINK: case RTM_DELLINK: process_link(this, hdr, TRUE); - this->condvar->broadcast(this->condvar); break; case RTM_NEWROUTE: case RTM_DELROUTE: @@ -639,10 +1131,8 @@ static job_requeue_t receive_events(private_kernel_netlink_net_t *this) /** 
enumerator over addresses */ typedef struct { private_kernel_netlink_net_t* this; - /** whether to enumerate down interfaces */ - bool include_down_ifaces; - /** whether to enumerate virtual ip addresses */ - bool include_virtual_ips; + /** which addresses to enumerate */ + kernel_address_type_t which; } address_enumerator_t; /** @@ -650,7 +1140,7 @@ typedef struct { */ static void address_enumerator_destroy(address_enumerator_t *data) { - data->this->mutex->unlock(data->this->mutex); + data->this->lock->unlock(data->this->lock); free(data); } @@ -660,7 +1150,7 @@ static void address_enumerator_destroy(address_enumerator_t *data) static bool filter_addresses(address_enumerator_t *data, addr_entry_t** in, host_t** out) { - if (!data->include_virtual_ips && (*in)->virtual) + if (!(data->which & ADDR_TYPE_VIRTUAL) && (*in)->refcount) { /* skip virtual interfaces added by us */ return FALSE; } @@ -689,7 +1179,15 @@ static enumerator_t *create_iface_enumerator(iface_entry_t *iface, static bool filter_interfaces(address_enumerator_t *data, iface_entry_t** in, iface_entry_t** out) { - if (!data->include_down_ifaces && !((*in)->flags & IFF_UP)) + if (!(data->which & ADDR_TYPE_IGNORED) && !(*in)->usable) + { /* skip interfaces excluded by config */ + return FALSE; + } + if (!(data->which & ADDR_TYPE_LOOPBACK) && ((*in)->flags & IFF_LOOPBACK)) + { /* ignore loopback devices */ + return FALSE; + } + if (!(data->which & ADDR_TYPE_DOWN) && !((*in)->flags & IFF_UP)) { /* skip interfaces not up */ return FALSE; } @@ -698,15 +1196,13 @@ static bool filter_interfaces(address_enumerator_t *data, iface_entry_t** in, } METHOD(kernel_net_t, create_address_enumerator, enumerator_t*, - private_kernel_netlink_net_t *this, - bool include_down_ifaces, bool include_virtual_ips) + private_kernel_netlink_net_t *this, kernel_address_type_t which) { address_enumerator_t *data = malloc_thing(address_enumerator_t); data->this = this; - data->include_down_ifaces = include_down_ifaces; - 
data->include_virtual_ips = include_virtual_ips; + data->which = which; - this->mutex->lock(this->mutex); + this->lock->read_lock(this->lock); return enumerator_create_nested( enumerator_create_filter( this->ifaces->create_enumerator(this->ifaces), @@ -715,47 +1211,40 @@ METHOD(kernel_net_t, create_address_enumerator, enumerator_t*, (void*)address_enumerator_destroy); } -METHOD(kernel_net_t, get_interface_name, char*, - private_kernel_netlink_net_t *this, host_t* ip) +METHOD(kernel_net_t, get_interface_name, bool, + private_kernel_netlink_net_t *this, host_t* ip, char **name) { - enumerator_t *ifaces, *addrs; - iface_entry_t *iface; - addr_entry_t *addr; - char *name = NULL; + addr_map_entry_t *entry, lookup = { + .ip = ip, + }; - DBG2(DBG_KNL, "getting interface name for %H", ip); - - this->mutex->lock(this->mutex); - ifaces = this->ifaces->create_enumerator(this->ifaces); - while (ifaces->enumerate(ifaces, &iface)) + if (ip->is_anyaddr(ip)) + { + return FALSE; + } + this->lock->read_lock(this->lock); + /* first try to find it on an up and usable interface */ + entry = this->addrs->get_match(this->addrs, &lookup, + (void*)addr_map_entry_match_up_and_usable); + if (entry) { - addrs = iface->addrs->create_enumerator(iface->addrs); - while (addrs->enumerate(addrs, &addr)) - { - if (ip->ip_equals(ip, addr->ip)) - { - name = strdup(iface->ifname); - break; - } - } - addrs->destroy(addrs); if (name) { - break; + *name = strdup(entry->iface->ifname); + DBG2(DBG_KNL, "%H is on interface %s", ip, *name); } + this->lock->unlock(this->lock); + return TRUE; } - ifaces->destroy(ifaces); - this->mutex->unlock(this->mutex); - - if (name) - { - DBG2(DBG_KNL, "%H is on interface %s", ip, name); - } - else + /* maybe it is installed on an ignored interface */ + entry = this->addrs->get_match(this->addrs, &lookup, + (void*)addr_map_entry_match_up); + if (!entry) { - DBG2(DBG_KNL, "%H is not a local address", ip); + DBG2(DBG_KNL, "%H is not a local address or the interface is down", 
ip); } - return name; + this->lock->unlock(this->lock); + return FALSE; } /** @@ -763,24 +1252,18 @@ METHOD(kernel_net_t, get_interface_name, char*, */ static int get_interface_index(private_kernel_netlink_net_t *this, char* name) { - enumerator_t *ifaces; iface_entry_t *iface; int ifindex = 0; DBG2(DBG_KNL, "getting iface index for %s", name); - this->mutex->lock(this->mutex); - ifaces = this->ifaces->create_enumerator(this->ifaces); - while (ifaces->enumerate(ifaces, &iface)) + this->lock->read_lock(this->lock); + if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_name, + (void**)&iface, name) == SUCCESS) { - if (streq(name, iface->ifname)) - { - ifindex = iface->ifindex; - break; - } + ifindex = iface->ifindex; } - ifaces->destroy(ifaces); - this->mutex->unlock(this->mutex); + this->lock->unlock(this->lock); if (ifindex == 0) { @@ -790,29 +1273,6 @@ static int get_interface_index(private_kernel_netlink_net_t *this, char* name) } /** - * Check if an interface with a given index is up - */ -static bool is_interface_up(private_kernel_netlink_net_t *this, int index) -{ - enumerator_t *ifaces; - iface_entry_t *iface; - /* default to TRUE for interface we do not monitor (e.g. 
lo) */ - bool up = TRUE; - - ifaces = this->ifaces->create_enumerator(this->ifaces); - while (ifaces->enumerate(ifaces, &iface)) - { - if (iface->ifindex == index) - { - up = iface->flags & IFF_UP; - break; - } - } - ifaces->destroy(ifaces); - return up; -} - -/** * check if an address (chunk) addr is in subnet (net with net_len net bits) */ static bool addr_in_subnet(chunk_t addr, chunk_t net, int net_len) @@ -849,6 +1309,94 @@ static bool addr_in_subnet(chunk_t addr, chunk_t net, int net_len) } /** + * Store information about a route retrieved via RTNETLINK + */ +typedef struct { + chunk_t gtw; + chunk_t src; + chunk_t dst; + host_t *src_host; + u_int8_t dst_len; + u_int32_t table; + u_int32_t oif; +} rt_entry_t; + +/** + * Free a route entry + */ +static void rt_entry_destroy(rt_entry_t *this) +{ + DESTROY_IF(this->src_host); + free(this); +} + +/** + * Parse route received with RTM_NEWROUTE. The given rt_entry_t object will be + * reused if not NULL. + * + * Returned chunks point to internal data of the Netlink message. 
+ */ +static rt_entry_t *parse_route(struct nlmsghdr *hdr, rt_entry_t *route) +{ + struct rtattr *rta; + struct rtmsg *msg; + size_t rtasize; + + msg = (struct rtmsg*)(NLMSG_DATA(hdr)); + rta = RTM_RTA(msg); + rtasize = RTM_PAYLOAD(hdr); + + if (route) + { + route->gtw = chunk_empty; + route->src = chunk_empty; + route->dst = chunk_empty; + route->dst_len = msg->rtm_dst_len; + route->table = msg->rtm_table; + route->oif = 0; + } + else + { + INIT(route, + .dst_len = msg->rtm_dst_len, + .table = msg->rtm_table, + ); + } + + while (RTA_OK(rta, rtasize)) + { + switch (rta->rta_type) + { + case RTA_PREFSRC: + route->src = chunk_create(RTA_DATA(rta), RTA_PAYLOAD(rta)); + break; + case RTA_GATEWAY: + route->gtw = chunk_create(RTA_DATA(rta), RTA_PAYLOAD(rta)); + break; + case RTA_DST: + route->dst = chunk_create(RTA_DATA(rta), RTA_PAYLOAD(rta)); + break; + case RTA_OIF: + if (RTA_PAYLOAD(rta) == sizeof(route->oif)) + { + route->oif = *(u_int32_t*)RTA_DATA(rta); + } + break; +#ifdef HAVE_RTA_TABLE + case RTA_TABLE: + if (RTA_PAYLOAD(rta) == sizeof(route->table)) + { + route->table = *(u_int32_t*)RTA_DATA(rta); + } + break; +#endif /* HAVE_RTA_TABLE*/ + } + rta = RTA_NEXT(rta, rtasize); + } + return route; +} + +/** * Get a route: If "nexthop", the nexthop is returned. source addr otherwise. 
*/ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest, @@ -859,22 +1407,21 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest, struct rtmsg *msg; chunk_t chunk; size_t len; - int best = -1; + linked_list_t *routes; + rt_entry_t *route = NULL, *best = NULL; enumerator_t *enumerator; - host_t *src = NULL, *gtw = NULL; - - DBG2(DBG_KNL, "getting address to reach %H", dest); + host_t *addr = NULL; memset(&request, 0, sizeof(request)); hdr = (struct nlmsghdr*)request; hdr->nlmsg_flags = NLM_F_REQUEST; - if (dest->get_family(dest) == AF_INET) - { - /* We dump all addresses for IPv4, as we want to ignore IPsec specific - * routes installed by us. But the kernel does not return source - * addresses in a IPv6 dump, so fall back to get() for v6 routes. */ - hdr->nlmsg_flags |= NLM_F_ROOT | NLM_F_DUMP; + if (dest->get_family(dest) == AF_INET || this->rta_prefsrc_for_ipv6 || + this->routing_table) + { /* kernels prior to 3.0 do not support RTA_PREFSRC for IPv6 routes. + * as we want to ignore routes with virtual IPs we cannot use DUMP + * if these routes are not installed in a separate table */ + hdr->nlmsg_flags |= NLM_F_DUMP; } hdr->nlmsg_type = RTM_GETROUTE; hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); @@ -891,10 +1438,12 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest, if (this->socket->send(this->socket, hdr, &out, &len) != SUCCESS) { - DBG1(DBG_KNL, "getting address to %H failed", dest); + DBG2(DBG_KNL, "getting %s to reach %H failed", + nexthop ? 
"nexthop" : "address", dest); return NULL; } - this->mutex->lock(this->mutex); + routes = linked_list_create(); + this->lock->read_lock(this->lock); for (current = out; NLMSG_OK(current, len); current = NLMSG_NEXT(current, len)) @@ -905,132 +1454,53 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest, break; case RTM_NEWROUTE: { - struct rtattr *rta; - size_t rtasize; - chunk_t rta_gtw, rta_src, rta_dst; - u_int32_t rta_oif = 0, rta_table; - host_t *new_src, *new_gtw; - bool cont = FALSE; + rt_entry_t *other; uintptr_t table; - rta_gtw = rta_src = rta_dst = chunk_empty; - msg = (struct rtmsg*)(NLMSG_DATA(current)); - rta = RTM_RTA(msg); - rtasize = RTM_PAYLOAD(current); - rta_table = msg->rtm_table; - while (RTA_OK(rta, rtasize)) - { - switch (rta->rta_type) - { - case RTA_PREFSRC: - rta_src = chunk_create(RTA_DATA(rta), RTA_PAYLOAD(rta)); - break; - case RTA_GATEWAY: - rta_gtw = chunk_create(RTA_DATA(rta), RTA_PAYLOAD(rta)); - break; - case RTA_DST: - rta_dst = chunk_create(RTA_DATA(rta), RTA_PAYLOAD(rta)); - break; - case RTA_OIF: - if (RTA_PAYLOAD(rta) == sizeof(rta_oif)) - { - rta_oif = *(u_int32_t*)RTA_DATA(rta); - } - break; -#ifdef HAVE_RTA_TABLE - case RTA_TABLE: - if (RTA_PAYLOAD(rta) == sizeof(rta_table)) - { - rta_table = *(u_int32_t*)RTA_DATA(rta); - } - break; -#endif /* HAVE_RTA_TABLE*/ - } - rta = RTA_NEXT(rta, rtasize); - } - if (msg->rtm_dst_len <= best) - { /* not better than a previous one */ - continue; - } - enumerator = this->rt_exclude->create_enumerator(this->rt_exclude); - while (enumerator->enumerate(enumerator, &table)) - { - if (table == rta_table) - { - cont = TRUE; - break; - } - } - enumerator->destroy(enumerator); - if (cont) - { + route = parse_route(current, route); + + table = (uintptr_t)route->table; + if (this->rt_exclude->find_first(this->rt_exclude, NULL, + (void**)&table) == SUCCESS) + { /* route is from an excluded routing table */ continue; } if (this->routing_table != 0 && - rta_table == 
this->routing_table) + route->table == this->routing_table) { /* route is from our own ipsec routing table */ continue; } - if (rta_oif && !is_interface_up(this, rta_oif)) + if (route->oif && !is_interface_up_and_usable(this, route->oif)) { /* interface is down */ continue; } - if (!addr_in_subnet(chunk, rta_dst, msg->rtm_dst_len)) + if (!addr_in_subnet(chunk, route->dst, route->dst_len)) { /* route destination does not contain dest */ continue; } - - if (nexthop) - { - /* nexthop lookup, return gateway if any */ - DESTROY_IF(gtw); - gtw = host_create_from_chunk(msg->rtm_family, rta_gtw, 0); - best = msg->rtm_dst_len; - continue; - } - if (rta_src.ptr) - { /* got a source address */ - new_src = host_create_from_chunk(msg->rtm_family, rta_src, 0); - if (new_src) - { - if (get_vip_refcount(this, new_src)) - { /* skip source address if it is installed by us */ - new_src->destroy(new_src); - } - else - { - DESTROY_IF(src); - src = new_src; - best = msg->rtm_dst_len; - } - } - continue; - } - if (rta_oif) - { /* no src or gtw, but an interface. Get address from it. */ - new_src = get_interface_address(this, rta_oif, - msg->rtm_family); - if (new_src) - { - DESTROY_IF(src); - src = new_src; - best = msg->rtm_dst_len; + if (route->src.ptr) + { /* verify source address, if any */ + host_t *src = host_create_from_chunk(msg->rtm_family, + route->src, 0); + if (src && is_known_vip(this, src)) + { /* ignore routes installed by us */ + src->destroy(src); + continue; } - continue; + route->src_host = src; } - if (rta_gtw.ptr) - { /* no source, but a gateway. Lookup source to reach gtw. 
*/ - new_gtw = host_create_from_chunk(msg->rtm_family, rta_gtw, 0); - new_src = get_route(this, new_gtw, FALSE, candidate); - new_gtw->destroy(new_gtw); - if (new_src) + /* insert route, sorted by decreasing network prefix */ + enumerator = routes->create_enumerator(routes); + while (enumerator->enumerate(enumerator, &other)) + { + if (route->dst_len > other->dst_len) { - DESTROY_IF(src); - src = new_src; - best = msg->rtm_dst_len; + break; } - continue; } + routes->insert_before(routes, enumerator, route); + enumerator->destroy(enumerator); + route = NULL; continue; } default: @@ -1038,18 +1508,111 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest, } break; } - free(out); - this->mutex->unlock(this->mutex); + if (route) + { + rt_entry_destroy(route); + } + + /* now we have a list of routes matching dest, sorted by net prefix. + * we will look for source addresses for these routes and select the one + * with the preferred source address, if possible */ + enumerator = routes->create_enumerator(routes); + while (enumerator->enumerate(enumerator, &route)) + { + if (route->src_host) + { /* got a source address with the route, if no preferred source + * is given or it matches we are done, as this is the best route */ + if (!candidate || candidate->ip_equals(candidate, route->src_host)) + { + best = route; + break; + } + else if (route->oif) + { /* no match yet, maybe it is assigned to the same interface */ + host_t *src = get_interface_address(this, route->oif, + msg->rtm_family, candidate); + if (src && src->ip_equals(src, candidate)) + { + route->src_host->destroy(route->src_host); + route->src_host = src; + best = route; + break; + } + DESTROY_IF(src); + } + /* no luck yet with the source address. 
if this is the best (first) + * route we store it as fallback in case we don't find a route with + * the preferred source */ + best = best ?: route; + continue; + } + if (route->oif) + { /* no src, but an interface - get address from it */ + route->src_host = get_interface_address(this, route->oif, + msg->rtm_family, candidate); + if (route->src_host) + { /* we handle this address the same as the one above */ + if (!candidate || + candidate->ip_equals(candidate, route->src_host)) + { + best = route; + break; + } + best = best ?: route; + continue; + } + } + if (route->gtw.ptr) + { /* no src, no iface, but a gateway - lookup src to reach gtw */ + host_t *gtw; + + gtw = host_create_from_chunk(msg->rtm_family, route->gtw, 0); + route->src_host = get_route(this, gtw, FALSE, candidate); + gtw->destroy(gtw); + if (route->src_host) + { /* more of the same */ + if (!candidate || + candidate->ip_equals(candidate, route->src_host)) + { + best = route; + break; + } + best = best ?: route; + } + } + } + enumerator->destroy(enumerator); if (nexthop) + { /* nexthop lookup, return gateway if any */ + if (best || routes->get_first(routes, (void**)&best) == SUCCESS) + { + addr = host_create_from_chunk(msg->rtm_family, best->gtw, 0); + } + addr = addr ?: dest->clone(dest); + } + else { - if (gtw) + if (best) { - return gtw; + addr = best->src_host->clone(best->src_host); } - return dest->clone(dest); } - return src; + this->lock->unlock(this->lock); + routes->destroy_function(routes, (void*)rt_entry_destroy); + free(out); + + if (addr) + { + DBG2(DBG_KNL, "using %H as %s to reach %H", addr, + nexthop ? "nexthop" : "address", dest); + } + else + { + DBG2(DBG_KNL, "no %s found to reach %H", + nexthop ? 
"nexthop" : "address", dest); + } + return addr; } METHOD(kernel_net_t, get_source_addr, host_t*, @@ -1059,9 +1622,9 @@ METHOD(kernel_net_t, get_source_addr, host_t*, } METHOD(kernel_net_t, get_nexthop, host_t*, - private_kernel_netlink_net_t *this, host_t *dest) + private_kernel_netlink_net_t *this, host_t *dest, host_t *src) { - return get_route(this, dest, TRUE, NULL); + return get_route(this, dest, TRUE, src); } /** @@ -1100,87 +1663,109 @@ static status_t manage_ipaddr(private_kernel_netlink_net_t *this, int nlmsg_type METHOD(kernel_net_t, add_ip, status_t, private_kernel_netlink_net_t *this, host_t *virtual_ip, host_t *iface_ip) { - iface_entry_t *iface; - addr_entry_t *addr; - enumerator_t *addrs, *ifaces; - int ifindex; + addr_map_entry_t *entry, lookup = { + .ip = virtual_ip, + }; + iface_entry_t *iface = NULL; if (!this->install_virtual_ip) { /* disabled by config */ return SUCCESS; } - DBG2(DBG_KNL, "adding virtual IP %H", virtual_ip); - - this->mutex->lock(this->mutex); - ifaces = this->ifaces->create_enumerator(this->ifaces); - while (ifaces->enumerate(ifaces, &iface)) - { - bool iface_found = FALSE; - - addrs = iface->addrs->create_enumerator(iface->addrs); - while (addrs->enumerate(addrs, &addr)) - { - if (iface_ip->ip_equals(iface_ip, addr->ip)) + this->lock->write_lock(this->lock); + /* the virtual IP might actually be installed as regular IP, in which case + * we don't track it as virtual IP */ + entry = this->addrs->get_match(this->addrs, &lookup, + (void*)addr_map_entry_match); + if (!entry) + { /* otherwise it might already be installed as virtual IP */ + entry = this->vips->get_match(this->vips, &lookup, + (void*)addr_map_entry_match); + if (entry) + { /* the vip we found can be in one of three states: 1) installed and + * ready, 2) just added by another thread, but not yet confirmed to + * be installed by the kernel, 3) just deleted, but not yet gone. + * Then while we wait below, several things could happen (as we + * release the lock). 
For instance, the interface could disappear, + * or the IP is finally deleted, and it reappears on a different + * interface. All these cases are handled by the call below. */ + while (!is_vip_installed_or_gone(this, virtual_ip, &entry)) { - iface_found = TRUE; + this->condvar->wait(this->condvar, this->lock); } - else if (virtual_ip->ip_equals(virtual_ip, addr->ip)) + if (entry) { - addr->refcount++; - DBG2(DBG_KNL, "virtual IP %H already installed on %s", - virtual_ip, iface->ifname); - addrs->destroy(addrs); - ifaces->destroy(ifaces); - this->mutex->unlock(this->mutex); - return SUCCESS; + entry->addr->refcount++; } } - addrs->destroy(addrs); - - if (iface_found) + } + if (entry) + { + DBG2(DBG_KNL, "virtual IP %H is already installed on %s", virtual_ip, + entry->iface->ifname); + this->lock->unlock(this->lock); + return SUCCESS; + } + /* try to find the target interface, either by config or via src ip */ + if (!this->install_virtual_ip_on || + this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_name, + (void**)&iface, this->install_virtual_ip_on) != SUCCESS) + { + lookup.ip = iface_ip; + entry = this->addrs->get_match(this->addrs, &lookup, + (void*)addr_map_entry_match); + if (!entry) + { /* if we don't find the requested interface we just use the first */ + this->ifaces->get_first(this->ifaces, (void**)&iface); + } + else { - ifindex = iface->ifindex; - addr = malloc_thing(addr_entry_t); - addr->ip = virtual_ip->clone(virtual_ip); - addr->refcount = 0; - addr->virtual = TRUE; - addr->scope = RT_SCOPE_UNIVERSE; - iface->addrs->insert_last(iface->addrs, addr); - - if (manage_ipaddr(this, RTM_NEWADDR, NLM_F_CREATE | NLM_F_EXCL, - ifindex, virtual_ip) == SUCCESS) - { - while (get_vip_refcount(this, virtual_ip) == 0) - { /* wait until address appears */ - this->condvar->wait(this->condvar, this->mutex); - } - ifaces->destroy(ifaces); - this->mutex->unlock(this->mutex); + iface = entry->iface; + } + } + if (iface) + { + addr_entry_t *addr; + + INIT(addr, + 
.ip = virtual_ip->clone(virtual_ip), + .refcount = 1, + .scope = RT_SCOPE_UNIVERSE, + ); + iface->addrs->insert_last(iface->addrs, addr); + addr_map_entry_add(this->vips, addr, iface); + if (manage_ipaddr(this, RTM_NEWADDR, NLM_F_CREATE | NLM_F_EXCL, + iface->ifindex, virtual_ip) == SUCCESS) + { + while (!is_vip_installed_or_gone(this, virtual_ip, &entry)) + { /* wait until address appears */ + this->condvar->wait(this->condvar, this->lock); + } + if (entry) + { /* we fail if the interface got deleted in the meantime */ + DBG2(DBG_KNL, "virtual IP %H installed on %s", virtual_ip, + entry->iface->ifname); + this->lock->unlock(this->lock); return SUCCESS; } - ifaces->destroy(ifaces); - this->mutex->unlock(this->mutex); - DBG1(DBG_KNL, "adding virtual IP %H failed", virtual_ip); - return FAILED; } + this->lock->unlock(this->lock); + DBG1(DBG_KNL, "adding virtual IP %H failed", virtual_ip); + return FAILED; } - ifaces->destroy(ifaces); - this->mutex->unlock(this->mutex); - - DBG1(DBG_KNL, "interface address %H not found, unable to install" - "virtual IP %H", iface_ip, virtual_ip); + this->lock->unlock(this->lock); + DBG1(DBG_KNL, "no interface available, unable to install virtual IP %H", + virtual_ip); return FAILED; } METHOD(kernel_net_t, del_ip, status_t, private_kernel_netlink_net_t *this, host_t *virtual_ip) { - iface_entry_t *iface; - addr_entry_t *addr; - enumerator_t *addrs, *ifaces; - status_t status; - int ifindex; + addr_map_entry_t *entry, lookup = { + .ip = virtual_ip, + }; if (!this->install_virtual_ip) { /* disabled by config */ @@ -1189,60 +1774,61 @@ METHOD(kernel_net_t, del_ip, status_t, DBG2(DBG_KNL, "deleting virtual IP %H", virtual_ip); - this->mutex->lock(this->mutex); - ifaces = this->ifaces->create_enumerator(this->ifaces); - while (ifaces->enumerate(ifaces, &iface)) - { - addrs = iface->addrs->create_enumerator(iface->addrs); - while (addrs->enumerate(addrs, &addr)) + this->lock->write_lock(this->lock); + entry = 
this->vips->get_match(this->vips, &lookup, + (void*)addr_map_entry_match); + if (!entry) + { /* we didn't install this IP as virtual IP */ + entry = this->addrs->get_match(this->addrs, &lookup, + (void*)addr_map_entry_match); + if (entry) { - if (virtual_ip->ip_equals(virtual_ip, addr->ip)) + DBG2(DBG_KNL, "not deleting existing IP %H on %s", virtual_ip, + entry->iface->ifname); + this->lock->unlock(this->lock); + return SUCCESS; + } + DBG2(DBG_KNL, "virtual IP %H not cached, unable to delete", virtual_ip); + this->lock->unlock(this->lock); + return FAILED; + } + if (entry->addr->refcount == 1) + { + status_t status; + + /* we set this flag so that threads calling add_ip will block and wait + * until the entry is gone, also so we can wait below */ + entry->addr->installed = FALSE; + status = manage_ipaddr(this, RTM_DELADDR, 0, entry->iface->ifindex, + virtual_ip); + if (status == SUCCESS) + { /* wait until the address is really gone */ + while (is_known_vip(this, virtual_ip)) { - ifindex = iface->ifindex; - if (addr->refcount == 1) - { - status = manage_ipaddr(this, RTM_DELADDR, 0, - ifindex, virtual_ip); - if (status == SUCCESS) - { /* wait until the address is really gone */ - while (get_vip_refcount(this, virtual_ip) > 0) - { - this->condvar->wait(this->condvar, this->mutex); - } - } - addrs->destroy(addrs); - ifaces->destroy(ifaces); - this->mutex->unlock(this->mutex); - return status; - } - else - { - addr->refcount--; - } - DBG2(DBG_KNL, "virtual IP %H used by other SAs, not deleting", - virtual_ip); - addrs->destroy(addrs); - ifaces->destroy(ifaces); - this->mutex->unlock(this->mutex); - return SUCCESS; + this->condvar->wait(this->condvar, this->lock); } } - addrs->destroy(addrs); + this->lock->unlock(this->lock); + return status; } - ifaces->destroy(ifaces); - this->mutex->unlock(this->mutex); - - DBG2(DBG_KNL, "virtual IP %H not cached, unable to delete", virtual_ip); - return FAILED; + else + { + entry->addr->refcount--; + } + DBG2(DBG_KNL, "virtual IP %H 
used by other SAs, not deleting", + virtual_ip); + this->lock->unlock(this->lock); + return SUCCESS; } /** * Manages source routes in the routing table. * By setting the appropriate nlmsg_type, the route gets added or removed. */ -static status_t manage_srcroute(private_kernel_netlink_net_t *this, int nlmsg_type, - int flags, chunk_t dst_net, u_int8_t prefixlen, - host_t *gateway, host_t *src_ip, char *if_name) +static status_t manage_srcroute(private_kernel_netlink_net_t *this, + int nlmsg_type, int flags, chunk_t dst_net, + u_int8_t prefixlen, host_t *gateway, + host_t *src_ip, char *if_name) { netlink_buf_t request; struct nlmsghdr *hdr; @@ -1306,16 +1892,56 @@ METHOD(kernel_net_t, add_route, status_t, private_kernel_netlink_net_t *this, chunk_t dst_net, u_int8_t prefixlen, host_t *gateway, host_t *src_ip, char *if_name) { - return manage_srcroute(this, RTM_NEWROUTE, NLM_F_CREATE | NLM_F_EXCL, - dst_net, prefixlen, gateway, src_ip, if_name); + status_t status; + route_entry_t *found, route = { + .dst_net = dst_net, + .prefixlen = prefixlen, + .gateway = gateway, + .src_ip = src_ip, + .if_name = if_name, + }; + + this->routes_lock->lock(this->routes_lock); + found = this->routes->get(this->routes, &route); + if (found) + { + this->routes_lock->unlock(this->routes_lock); + return ALREADY_DONE; + } + found = route_entry_clone(&route); + this->routes->put(this->routes, found, found); + status = manage_srcroute(this, RTM_NEWROUTE, NLM_F_CREATE | NLM_F_EXCL, + dst_net, prefixlen, gateway, src_ip, if_name); + this->routes_lock->unlock(this->routes_lock); + return status; } METHOD(kernel_net_t, del_route, status_t, private_kernel_netlink_net_t *this, chunk_t dst_net, u_int8_t prefixlen, host_t *gateway, host_t *src_ip, char *if_name) { - return manage_srcroute(this, RTM_DELROUTE, 0, dst_net, prefixlen, - gateway, src_ip, if_name); + status_t status; + route_entry_t *found, route = { + .dst_net = dst_net, + .prefixlen = prefixlen, + .gateway = gateway, + .src_ip = 
src_ip, + .if_name = if_name, + }; + + this->routes_lock->lock(this->routes_lock); + found = this->routes->get(this->routes, &route); + if (!found) + { + this->routes_lock->unlock(this->routes_lock); + return NOT_FOUND; + } + this->routes->remove(this->routes, found); + route_entry_destroy(found); + status = manage_srcroute(this, RTM_DELROUTE, 0, dst_net, prefixlen, + gateway, src_ip, if_name); + this->routes_lock->unlock(this->routes_lock); + return status; } /** @@ -1331,7 +1957,7 @@ static status_t init_address_list(private_kernel_netlink_net_t *this) iface_entry_t *iface; addr_entry_t *addr; - DBG1(DBG_KNL, "listening on interfaces:"); + DBG2(DBG_KNL, "known interfaces and IP addresses:"); memset(&request, 0, sizeof(request)); @@ -1389,23 +2015,23 @@ static status_t init_address_list(private_kernel_netlink_net_t *this) } free(out); - this->mutex->lock(this->mutex); + this->lock->read_lock(this->lock); ifaces = this->ifaces->create_enumerator(this->ifaces); while (ifaces->enumerate(ifaces, &iface)) { - if (iface->flags & IFF_UP) + if (iface_entry_up_and_usable(iface)) { - DBG1(DBG_KNL, " %s", iface->ifname); + DBG2(DBG_KNL, " %s", iface->ifname); addrs = iface->addrs->create_enumerator(iface->addrs); while (addrs->enumerate(addrs, (void**)&addr)) { - DBG1(DBG_KNL, " %H", addr->ip); + DBG2(DBG_KNL, " %H", addr->ip); } addrs->destroy(addrs); } } ifaces->destroy(ifaces); - this->mutex->unlock(this->mutex); + this->lock->unlock(this->lock); return SUCCESS; } @@ -1443,9 +2069,59 @@ static status_t manage_rule(private_kernel_netlink_net_t *this, int nlmsg_type, return this->socket->send_ack(this->socket, hdr); } +/** + * check for kernel features (currently only via version number) + */ +static void check_kernel_features(private_kernel_netlink_net_t *this) +{ + struct utsname utsname; + int a, b, c; + + if (uname(&utsname) == 0) + { + switch(sscanf(utsname.release, "%d.%d.%d", &a, &b, &c)) + { + case 3: + if (a == 2) + { + DBG2(DBG_KNL, "detected Linux %d.%d.%d, no 
support for " + "RTA_PREFSRC for IPv6 routes", a, b, c); + break; + } + /* fall-through */ + case 2: + /* only 3.x+ uses two part version numbers */ + this->rta_prefsrc_for_ipv6 = TRUE; + break; + default: + break; + } + } +} + +/** + * Destroy an address to iface map + */ +static void addr_map_destroy(hashtable_t *map) +{ + enumerator_t *enumerator; + addr_map_entry_t *addr; + + enumerator = map->create_enumerator(map); + while (enumerator->enumerate(enumerator, NULL, (void**)&addr)) + { + free(addr); + } + enumerator->destroy(enumerator); + map->destroy(map); +} + METHOD(kernel_net_t, destroy, void, private_kernel_netlink_net_t *this) { + enumerator_t *enumerator; + route_entry_t *route; + if (this->routing_table) { manage_rule(this, RTM_DELRULE, AF_INET, this->routing_table, @@ -1453,19 +2129,34 @@ METHOD(kernel_net_t, destroy, void, manage_rule(this, RTM_DELRULE, AF_INET6, this->routing_table, this->routing_table_prio); } - if (this->job) - { - this->job->cancel(this->job); - } if (this->socket_events > 0) { close(this->socket_events); } + enumerator = this->routes->create_enumerator(this->routes); + while (enumerator->enumerate(enumerator, NULL, (void**)&route)) + { + manage_srcroute(this, RTM_DELROUTE, 0, route->dst_net, route->prefixlen, + route->gateway, route->src_ip, route->if_name); + route_entry_destroy(route); + } + enumerator->destroy(enumerator); + this->routes->destroy(this->routes); + this->routes_lock->destroy(this->routes_lock); DESTROY_IF(this->socket); + + net_changes_clear(this); + this->net_changes->destroy(this->net_changes); + this->net_changes_lock->destroy(this->net_changes_lock); + + addr_map_destroy(this->addrs); + addr_map_destroy(this->vips); + this->ifaces->destroy_function(this->ifaces, (void*)iface_entry_destroy); this->rt_exclude->destroy(this->rt_exclude); + this->roam_lock->destroy(this->roam_lock); this->condvar->destroy(this->condvar); - this->mutex->destroy(this->mutex); + this->lock->destroy(this->lock); free(this); } @@ 
-1475,8 +2166,8 @@ METHOD(kernel_net_t, destroy, void, kernel_netlink_net_t *kernel_netlink_net_create() { private_kernel_netlink_net_t *this; - struct sockaddr_nl addr; enumerator_t *enumerator; + bool register_for_events = TRUE; char *exclude; INIT(this, @@ -1495,9 +2186,22 @@ kernel_netlink_net_t *kernel_netlink_net_create() }, .socket = netlink_socket_create(NETLINK_ROUTE), .rt_exclude = linked_list_create(), + .routes = hashtable_create((hashtable_hash_t)route_entry_hash, + (hashtable_equals_t)route_entry_equals, 16), + .net_changes = hashtable_create( + (hashtable_hash_t)net_change_hash, + (hashtable_equals_t)net_change_equals, 16), + .addrs = hashtable_create( + (hashtable_hash_t)addr_map_entry_hash, + (hashtable_equals_t)addr_map_entry_equals, 16), + .vips = hashtable_create((hashtable_hash_t)addr_map_entry_hash, + (hashtable_equals_t)addr_map_entry_equals, 16), + .routes_lock = mutex_create(MUTEX_TYPE_DEFAULT), + .net_changes_lock = mutex_create(MUTEX_TYPE_DEFAULT), .ifaces = linked_list_create(), - .mutex = mutex_create(MUTEX_TYPE_RECURSIVE), - .condvar = condvar_create(CONDVAR_TYPE_DEFAULT), + .lock = rwlock_create(RWLOCK_TYPE_DEFAULT), + .condvar = rwlock_condvar_create(), + .roam_lock = spinlock_create(), .routing_table = lib->settings->get_int(lib->settings, "%s.routing_table", ROUTING_TABLE, hydra->daemon), .routing_table_prio = lib->settings->get_int(lib->settings, @@ -1506,8 +2210,18 @@ kernel_netlink_net_t *kernel_netlink_net_create() "%s.process_route", TRUE, hydra->daemon), .install_virtual_ip = lib->settings->get_bool(lib->settings, "%s.install_virtual_ip", TRUE, hydra->daemon), + .install_virtual_ip_on = lib->settings->get_str(lib->settings, + "%s.install_virtual_ip_on", NULL, hydra->daemon), ); - timerclear(&this->last_roam); + timerclear(&this->last_route_reinstall); + timerclear(&this->next_roam); + + check_kernel_features(this); + + if (streq(hydra->daemon, "starter")) + { /* starter has no threads, so we do not register for kernel events 
*/ + register_for_events = FALSE; + } exclude = lib->settings->get_str(lib->settings, "%s.ignore_routing_tables", NULL, hydra->daemon); @@ -1530,29 +2244,35 @@ kernel_netlink_net_t *kernel_netlink_net_create() enumerator->destroy(enumerator); } - memset(&addr, 0, sizeof(addr)); - addr.nl_family = AF_NETLINK; - - /* create and bind RT socket for events (address/interface/route changes) */ - this->socket_events = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); - if (this->socket_events < 0) + if (register_for_events) { - DBG1(DBG_KNL, "unable to create RT event socket"); - destroy(this); - return NULL; - } - addr.nl_groups = RTMGRP_IPV4_IFADDR | RTMGRP_IPV6_IFADDR | - RTMGRP_IPV4_ROUTE | RTMGRP_IPV6_ROUTE | RTMGRP_LINK; - if (bind(this->socket_events, (struct sockaddr*)&addr, sizeof(addr))) - { - DBG1(DBG_KNL, "unable to bind RT event socket"); - destroy(this); - return NULL; - } + struct sockaddr_nl addr; + + memset(&addr, 0, sizeof(addr)); + addr.nl_family = AF_NETLINK; - this->job = callback_job_create_with_prio((callback_job_cb_t)receive_events, - this, NULL, NULL, JOB_PRIO_CRITICAL); - lib->processor->queue_job(lib->processor, (job_t*)this->job); + /* create and bind RT socket for events (address/interface/route changes) */ + this->socket_events = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (this->socket_events < 0) + { + DBG1(DBG_KNL, "unable to create RT event socket"); + destroy(this); + return NULL; + } + addr.nl_groups = RTMGRP_IPV4_IFADDR | RTMGRP_IPV6_IFADDR | + RTMGRP_IPV4_ROUTE | RTMGRP_IPV6_ROUTE | RTMGRP_LINK; + if (bind(this->socket_events, (struct sockaddr*)&addr, sizeof(addr))) + { + DBG1(DBG_KNL, "unable to bind RT event socket"); + destroy(this); + return NULL; + } + + lib->processor->queue_job(lib->processor, + (job_t*)callback_job_create_with_prio( + (callback_job_cb_t)receive_events, this, NULL, + (callback_job_cancel_t)return_false, JOB_PRIO_CRITICAL)); + } if (init_address_list(this) != SUCCESS) { |