summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristian Poessinger <christian@poessinger.com>2020-12-17 18:30:16 +0100
committerChristian Poessinger <christian@poessinger.com>2020-12-17 17:37:07 +0000
commitbd3ff678b733964c689b52ff1b0d2c838edeb8b8 (patch)
tree901f7a5ecd69cc692d3112d9c53aaa80c239f2c6
parenta6b35825a78b5fe8c3a91bc4cf6abf0f50a08738 (diff)
downloadvyos-1x-bd3ff678b733964c689b52ff1b0d2c838edeb8b8.tar.gz
vyos-1x-bd3ff678b733964c689b52ff1b0d2c838edeb8b8.zip
xdp: T2666: initial XDP (generic mode) forwarding support
The CLI command 'set interfaces ethernet <interface> offload-options xdp' enables the XDP generic mode on the given interface.

vyos@vyos:~$ show interfaces ethernet eth1
eth1: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 xdpgeneric/id:151 qdisc mq state DOWN group default qlen 1000
    link/ether 00:50:56:bf:ef:aa brd ff:ff:ff:ff:ff:ff
    inet6 fe80::250:56ff:febf:efaa/64 scope link tentative
       valid_lft forever preferred_lft forever
    Description: fooa

The XDP code is gratefully copied from [1]; thank you for this nice tutorial.

NOTE: this is an experimental feature which might break your forwarding/filtering.

[1]: https://medium.com/swlh/building-a-xdp-express-data-path-based-peering-router-20db4995da66
-rw-r--r--Makefile8
-rwxr-xr-xdebian/rules4
-rw-r--r--interface-definitions/interfaces-ethernet.xml.in6
-rw-r--r--python/vyos/ifconfig/ethernet.py21
-rw-r--r--src/ebpf/.gitignore1
-rw-r--r--src/ebpf/Makefile16
-rw-r--r--src/ebpf/xdp_drop_ebpf.c97
-rw-r--r--src/ebpf/xdp_router.c202
8 files changed, 354 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index 1ed463440..8155b231e 100644
--- a/Makefile
+++ b/Makefile
@@ -3,6 +3,7 @@ OP_TMPL_DIR := templates-op
BUILD_DIR := build
DATA_DIR := data
SHIM_DIR := src/shim
+EBPF_DIR := src/ebpf
CC := gcc
LIBS := -lzmq
CFLAGS :=
@@ -96,8 +97,12 @@ component_versions: $(BUILD_DIR) $(obj)
vyshim:
$(MAKE) -C $(SHIM_DIR)
+.PHONY: vyebpf
+vyebpf:
+ $(MAKE) -C $(EBPF_DIR)
+
.PHONY: all
-all: clean interface_definitions op_mode_definitions component_versions vyshim
+all: clean interface_definitions op_mode_definitions component_versions vyshim vyebpf
.PHONY: clean
clean:
@@ -105,6 +110,7 @@ clean:
rm -rf $(TMPL_DIR)
rm -rf $(OP_TMPL_DIR)
$(MAKE) -C $(SHIM_DIR) clean
+ $(MAKE) -C $(EBPF_DIR) clean
.PHONY: test
test:
diff --git a/debian/rules b/debian/rules
index a0cc7a99b..599572358 100755
--- a/debian/rules
+++ b/debian/rules
@@ -78,6 +78,10 @@ override_dh_auto_install:
mkdir -p $(DIR)/$(VYOS_DATA_DIR)
cp -r data/* $(DIR)/$(VYOS_DATA_DIR)
+ # Install eBPF plugins
+ mkdir -p $(DIR)/$(VYOS_DATA_DIR)/ebpf
+ cp -r src/ebpf/*.o $(DIR)/$(VYOS_DATA_DIR)/ebpf
+
# Install etc configuration files
mkdir -p $(DIR)/etc
cp -r src/etc/* $(DIR)/etc
diff --git a/interface-definitions/interfaces-ethernet.xml.in b/interface-definitions/interfaces-ethernet.xml.in
index 0337c629b..8bd9b7010 100644
--- a/interface-definitions/interfaces-ethernet.xml.in
+++ b/interface-definitions/interfaces-ethernet.xml.in
@@ -165,6 +165,12 @@
<constraintErrorMessage>Must be either 'on' or 'off'</constraintErrorMessage>
</properties>
</leafNode>
+ <leafNode name="xdp">
+ <properties>
+ <help>Enable eXpress Data Path</help>
+ <valueless/>
+ </properties>
+ </leafNode>
</children>
</node>
<leafNode name="speed">
diff --git a/python/vyos/ifconfig/ethernet.py b/python/vyos/ifconfig/ethernet.py
index 12d1ec265..1bc63eec2 100644
--- a/python/vyos/ifconfig/ethernet.py
+++ b/python/vyos/ifconfig/ethernet.py
@@ -251,6 +251,23 @@ class EthernetIf(Interface):
"""
return self.set_interface('ufo', state)
+    def set_xdp(self, enabled):
+        """
+        Attach or detach the XDP (eXpress Data Path) forwarding program on
+        this interface.
+
+        enabled: when true, load /usr/share/vyos/ebpf/xdp_router.o in
+                 'xdpgeneric' mode; otherwise run 'ip link set dev <ifname>
+                 xdp off'.
+
+        Returns whatever self._cmd() returns for the executed command.
+        Raises ConfigError if the command fails.
+        """
+        ifname = self.config['ifname']
+        if enabled:
+            # use 'xdpgeneric' for the time being until we can detect supported
+            # drivers or have a lookup table of whatever kind. This then can be
+            # replaced by xdpdrv
+            cmd = f'ip -force link set dev {ifname} xdpgeneric obj /usr/share/vyos/ebpf/xdp_router.o'
+        else:
+            cmd = f'ip link set dev {ifname} xdp off'
+
+        try:
+            return self._cmd(cmd)
+        except Exception:
+            # imported lazily, as in the original code
+            from vyos import ConfigError
+            action = 'enable' if enabled else 'disable'
+            raise ConfigError(f'Error: Unable to {action} XDP on interface {ifname}.')
+
+
def set_ring_buffer(self, b_type, b_size):
"""
Example:
@@ -306,6 +323,10 @@ class EthernetIf(Interface):
value = tmp if (tmp != None) else 'off'
self.set_ufo(value)
+ # Enable/disable XDP (eXpress Data Path) forwarding
+ tmp = dict_search('offload_options.xdp', config)
+ self.set_xdp(tmp != None) # enable or disable
+
# Set physical interface speed and duplex
if {'speed', 'duplex'} <= set(config):
speed = config.get('speed')
diff --git a/src/ebpf/.gitignore b/src/ebpf/.gitignore
new file mode 100644
index 000000000..5761abcfd
--- /dev/null
+++ b/src/ebpf/.gitignore
@@ -0,0 +1 @@
+*.o
diff --git a/src/ebpf/Makefile b/src/ebpf/Makefile
new file mode 100644
index 000000000..5b80c32d7
--- /dev/null
+++ b/src/ebpf/Makefile
@@ -0,0 +1,16 @@
+# Builds every C source in this directory into an eBPF object file.
+# Manual equivalent of the pattern rule below for a single file:
+#   clang -target bpf -O2 -c xdp_drop_ebpf.c -o xdp_drop_ebpf.o
+
+# all *.c files in this directory and the *.o file derived from each
+src = $(wildcard *.c)
+obj = $(src:.c=.o)
+CLANG = clang
+# warnings are fatal (-Werror) except for the three classes disabled here
+CFLAGS = -Wall -Wno-unused-value -Wno-pointer-sign -Wno-compare-distinct-pointer-types -Werror -O2
+
+# compile one source file for the BPF target
+%.o: %.c
+ $(CLANG) -target bpf $(CFLAGS) -o $@ -c $<
+
+# default goal: build all object files
+.PHONY: all
+all: $(obj)
+
+# remove every object file in this directory
+.PHONY: clean
+clean:
+ rm -f *.o
diff --git a/src/ebpf/xdp_drop_ebpf.c b/src/ebpf/xdp_drop_ebpf.c
new file mode 100644
index 000000000..a08edf58d
--- /dev/null
+++ b/src/ebpf/xdp_drop_ebpf.c
@@ -0,0 +1,97 @@
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+
+#include <stdint.h>
+
+/* IP flags. */
+#define IP_CE 0x8000 /* Flag: "Congestion" */
+#define IP_DF 0x4000 /* Flag: "Don't Fragment" */
+#define IP_MF 0x2000 /* Flag: "More Fragments" */
+#define IP_OFFSET 0x1FFF /* "Fragment Offset" part */
+
+#define SEC(NAME) __attribute__((section(NAME), used))
+
+#define htons(x) ((__be16)___constant_swab16((x)))
+#define htonl(x) ((__be32)___constant_swab32((x)))
+
+struct vlan_hdr { /* VLAN tag header; xdp_drop() overlays it on the bytes right after the Ethernet header */
+ __be16 h_vlan_TCI; /* tag control information (not read by this program) */
+ __be16 h_vlan_encapsulated_proto; /* protocol of the encapsulated payload, network byte order */
+};
+
+/*
+ * Return non-zero when the source or destination port of @udph is 53 (DNS).
+ *
+ * htons() is used to convert *to* host order here: this file defines it as
+ * an unconditional 16-bit byte swap, which is its own inverse.
+ */
+static __attribute__((always_inline)) int udp_is_dns(const struct udphdr *udph)
+{
+    uint16_t dport = htons(udph->dest);
+    uint16_t sport = htons(udph->source);
+
+    return dport == 53 || sport == 53;
+}
+
+/*
+ * xdp_drop - XDP program that drops selected IPv4/IPv6 traffic.
+ *
+ * Up to two stacked VLAN tags (802.1Q / 802.1ad) are skipped, then:
+ *   - IPv4: drop if the last octet of the destination address is 0 or 255,
+ *     drop every fragment, drop UDP with source or destination port 53;
+ *   - IPv6: drop UDP with source or destination port 53 when UDP directly
+ *     follows the fixed IPv6 header (extension headers are not walked).
+ *
+ * Everything else, including packets too short to be parsed, is passed on.
+ *
+ * Returns XDP_DROP or XDP_PASS.
+ */
+SEC("prog")
+int xdp_drop(struct xdp_md *ctx) {
+    void *data_end = (void *)(long)ctx->data_end;
+    void *data = (void *)(long)ctx->data;
+    struct ethhdr *eth = data;
+
+    uint64_t nh_off = sizeof(*eth);
+    if (data + nh_off > data_end) {
+        return XDP_PASS;
+    }
+
+    uint16_t h_proto = eth->h_proto;
+    int i;
+
+    /* Handle double VLAN tagged packet. See https://en.wikipedia.org/wiki/IEEE_802.1ad */
+    for (i = 0; i < 2; i++) {
+        if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+            struct vlan_hdr *vhdr;
+
+            vhdr = data + nh_off;
+            nh_off += sizeof(struct vlan_hdr);
+            if (data + nh_off > data_end) {
+                return XDP_PASS;
+            }
+            h_proto = vhdr->h_vlan_encapsulated_proto;
+        }
+    }
+
+    if (h_proto == htons(ETH_P_IP)) {
+        struct iphdr *iph = data + nh_off;
+
+        /* the fixed IPv4 header must be readable before any field access */
+        if ((void *)(iph + 1) > data_end) {
+            return XDP_PASS;
+        }
+
+        /*
+         * daddr is in network byte order; on the little-endian target this
+         * file assumes (see htons/htonl above) the top byte is the last
+         * octet of the address.
+         */
+        uint32_t hostid = iph->daddr >> 24;
+
+        if (hostid == 0 || hostid == 255) {
+            return XDP_DROP;
+        }
+        if (iph->frag_off & htons(IP_MF | IP_OFFSET)) {
+            return XDP_DROP;
+        }
+        if (iph->protocol == IPPROTO_UDP) {
+            /* honour IP options: the L4 header starts after ihl 32-bit words */
+            uint32_t ihl_len = iph->ihl * 4;
+
+            if (ihl_len < sizeof(*iph)) {
+                return XDP_PASS;
+            }
+
+            struct udphdr *udph = (void *)iph + ihl_len;
+
+            if ((void *)(udph + 1) > data_end) {
+                return XDP_PASS;
+            }
+            if (udp_is_dns(udph)) {
+                return XDP_DROP;
+            }
+        }
+    } else if (h_proto == htons(ETH_P_IPV6)) {
+        struct ipv6hdr *ip6h = data + nh_off;
+
+        if ((void *)(ip6h + 1) > data_end) {
+            return XDP_PASS;
+        }
+        if (ip6h->nexthdr == IPPROTO_UDP) {
+            struct udphdr *udph = (void *)(ip6h + 1);
+
+            if ((void *)(udph + 1) > data_end) {
+                return XDP_PASS;
+            }
+            if (udp_is_dns(udph)) {
+                return XDP_DROP;
+            }
+        }
+    }
+
+    return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/src/ebpf/xdp_router.c b/src/ebpf/xdp_router.c
new file mode 100644
index 000000000..4fb5c7cb1
--- /dev/null
+++ b/src/ebpf/xdp_router.c
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+// Code thankfully copied from:
+// https://medium.com/swlh/building-a-xdp-express-data-path-based-peering-router-20db4995da66
+
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+#include <linux/socket.h>
+
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+
+#include <stdint.h>
+
+#ifndef XDP_ACTION_MAX
+#define XDP_ACTION_MAX (XDP_REDIRECT + 1)
+#endif
+
+#ifndef memcpy
+#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n))
+#endif
+
+#ifndef AF_INET
+#define AF_INET 2
+#endif
+
+#ifndef AF_INET6
+#define AF_INET6 10
+#endif
+
+#ifndef IPV6_FLOWINFO_MASK
+#define IPV6_FLOWINFO_MASK bpf_htonl(0x0FFFFFFF)
+#endif
+
+/*
+ * This is the data record stored in the map: the value type of
+ * xdp_stats_map, updated by xdp_stats_record_action().
+ */
+struct datarec {
+ __u64 rx_packets; /* incremented once per recorded packet */
+ __u64 rx_bytes; /* accumulates (data_end - data) of each recorded packet */
+};
+
+/*
+ * Keeps stats per (enum) xdp_action: a per-CPU array with XDP_ACTION_MAX
+ * entries, keyed by the action value, each holding one struct datarec.
+ */
+struct bpf_map_def SEC("maps") xdp_stats_map = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = XDP_ACTION_MAX,
+};
+
+struct bpf_map_def SEC("maps") tx_port = { /* device map passed to bpf_redirect_map() in xdp_router_func(), keyed there by the egress ifindex */
+ .type = BPF_MAP_TYPE_DEVMAP,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 256, /* NOTE(review): nothing in view populates this map - confirm the loader or userspace fills it */
+};
+
+/*
+ * Decrement the TTL of @iph and adjust the header checksum incrementally
+ * instead of recomputing it (from include/net/ip.h).
+ *
+ * Returns the TTL value after the decrement.
+ */
+static __always_inline int ip_decrease_ttl(struct iphdr *iph)
+{
+    __u32 csum = iph->check;
+
+    csum += bpf_htons(0x0100);
+    /* add one more when the sum reached 0xFFFF, then truncate to 16 bit */
+    if (csum >= 0xFFFF)
+        csum += 1;
+    iph->check = (__u16)csum;
+
+    iph->ttl -= 1;
+    return iph->ttl;
+}
+
+/*
+ * Account the current packet under @action in xdp_stats_map.
+ *
+ * Returns @action unchanged on success, or XDP_ABORTED when @action is out
+ * of range or the map lookup yields no record.
+ */
+static __always_inline
+__u32 xdp_stats_record_action(struct xdp_md *ctx, __u32 action)
+{
+    struct datarec *stats;
+
+    if (action < XDP_ACTION_MAX) {
+        /* Lookup in kernel BPF-side returns a pointer to the actual record */
+        stats = bpf_map_lookup_elem(&xdp_stats_map, &action);
+        if (stats) {
+            /* BPF_MAP_TYPE_PERCPU_ARRAY hands out a record specific to the
+             * current CPU and XDP hooks run under softirq, which makes it
+             * safe to update without atomic operations.
+             */
+            stats->rx_packets += 1;
+            stats->rx_bytes += (ctx->data_end - ctx->data);
+            return action;
+        }
+    }
+
+    return XDP_ABORTED;
+}
+
+/*
+ * xdp_router_func - forward IPv4/IPv6 packets based on a kernel FIB lookup.
+ *
+ * For an IPv4 or IPv6 frame the routing table is queried with
+ * bpf_fib_lookup(). On success the TTL / hop limit is decremented, the
+ * Ethernet addresses are rewritten from the lookup result and the packet is
+ * redirected through the tx_port device map. Blackholed, unreachable and
+ * prohibited destinations are dropped. Everything else (other protocols,
+ * TTL / hop limit <= 1, any other lookup result) keeps the default XDP_PASS.
+ * Frames too short for their Ethernet or IP header are dropped.
+ *
+ * The chosen action is accounted in xdp_stats_map and returned.
+ */
+SEC("prog")
+int xdp_router_func(struct xdp_md *ctx)
+{
+    /* this is the packet context */
+    void *data_end = (void *)(long)ctx->data_end;
+    void *data = (void *)(long)ctx->data;
+    struct bpf_fib_lookup fib_params = {};
+    struct ethhdr *eth = data;
+    struct ipv6hdr *ip6h;
+    struct iphdr *iph;
+    __u16 h_proto;
+    __u64 nh_off;
+    int rc;
+    /* default action is to pass */
+    int action = XDP_PASS;
+
+    nh_off = sizeof(*eth);
+    if (data + nh_off > data_end) {
+        action = XDP_DROP;
+        goto out;
+    }
+
+    /* determine if this is IPv4 or IPv6 by looking at the Ethernet protocol field */
+    h_proto = eth->h_proto;
+    if (h_proto == bpf_htons(ETH_P_IP)) {
+        /* IPv4 part of the code */
+        iph = data + nh_off;
+
+        /* compare as void * so both sides have the same pointer type */
+        if ((void *)(iph + 1) > data_end) {
+            action = XDP_DROP;
+            goto out;
+        }
+        /* as a real router, we need to check the TTL to prevent never ending
+         * loops; such packets are not forwarded here but passed on
+         */
+        if (iph->ttl <= 1)
+            goto out;
+
+        /* populate the fib_params fields to prepare for the lookup */
+        fib_params.family = AF_INET;
+        fib_params.tos = iph->tos;
+        fib_params.l4_protocol = iph->protocol;
+        fib_params.sport = 0;
+        fib_params.dport = 0;
+        fib_params.tot_len = bpf_ntohs(iph->tot_len);
+        fib_params.ipv4_src = iph->saddr;
+        fib_params.ipv4_dst = iph->daddr;
+    } else if (h_proto == bpf_htons(ETH_P_IPV6)) {
+        /* IPv6 part of the code */
+        struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src;
+        struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst;
+
+        ip6h = data + nh_off;
+        if ((void *)(ip6h + 1) > data_end) {
+            action = XDP_DROP;
+            goto out;
+        }
+        /* same loop protection as for IPv4, based on the hop limit */
+        if (ip6h->hop_limit <= 1)
+            goto out;
+
+        /* populate the fib_params fields to prepare for the lookup */
+        fib_params.family = AF_INET6;
+        fib_params.flowinfo = *(__be32 *) ip6h & IPV6_FLOWINFO_MASK;
+        fib_params.l4_protocol = ip6h->nexthdr;
+        fib_params.sport = 0;
+        fib_params.dport = 0;
+        fib_params.tot_len = bpf_ntohs(ip6h->payload_len);
+        *src = ip6h->saddr;
+        *dst = ip6h->daddr;
+    } else {
+        /* neither IPv4 nor IPv6: pass */
+        goto out;
+    }
+
+    fib_params.ifindex = ctx->ingress_ifindex;
+
+    /* this is where the FIB lookup happens. If the lookup is successful */
+    /* it will populate the fib_params.ifindex with the egress interface index */
+
+    rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), 0);
+    switch (rc) {
+    case BPF_FIB_LKUP_RET_SUCCESS: /* lookup successful */
+        /* we are a router, so we need to decrease the ttl */
+        if (h_proto == bpf_htons(ETH_P_IP))
+            ip_decrease_ttl(iph);
+        else if (h_proto == bpf_htons(ETH_P_IPV6))
+            ip6h->hop_limit--;
+        /* set the correct new source and destination mac addresses */
+        /* can be found in fib_params.dmac and fib_params.smac */
+        memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
+        memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+        /* and done, now we set the action to bpf_redirect_map with */
+        /* fib_params.ifindex which is the egress port as parameter */
+        action = bpf_redirect_map(&tx_port, fib_params.ifindex, 0);
+        break;
+    case BPF_FIB_LKUP_RET_BLACKHOLE: /* dest is blackholed; can be dropped */
+    case BPF_FIB_LKUP_RET_UNREACHABLE: /* dest is unreachable; can be dropped */
+    case BPF_FIB_LKUP_RET_PROHIBIT: /* dest not allowed; can be dropped */
+        action = XDP_DROP;
+        break;
+    case BPF_FIB_LKUP_RET_NOT_FWDED: /* packet is not forwarded */
+    case BPF_FIB_LKUP_RET_FWD_DISABLED: /* fwding is not enabled on ingress */
+    case BPF_FIB_LKUP_RET_UNSUPP_LWT: /* fwd requires encapsulation */
+    case BPF_FIB_LKUP_RET_NO_NEIGH: /* no neighbor entry for nh */
+    case BPF_FIB_LKUP_RET_FRAG_NEEDED: /* fragmentation required to fwd */
+        /* PASS */
+        break;
+    default:
+        /* any other result, e.g. a negative error code: keep XDP_PASS */
+        break;
+    }
+
+out:
+    /* and done, update stats and return action */
+    return xdp_stats_record_action(ctx, action);
+}
+
+char _license[] SEC("license") = "GPL";