From bd3ff678b733964c689b52ff1b0d2c838edeb8b8 Mon Sep 17 00:00:00 2001 From: Christian Poessinger Date: Thu, 17 Dec 2020 18:30:16 +0100 Subject: xdp: T2666: initial XDP (generic mode) forwarding support The CLI command 'set interfaces ethernet <interface> offload-options xdp' enables the XDP generic mode on the given interface. vyos@vyos:~$ show interfaces ethernet eth1 eth1: mtu 1500 xdpgeneric/id:151 qdisc mq state DOWN group default qlen 1000 link/ether 00:50:56:bf:ef:aa brd ff:ff:ff:ff:ff:ff inet6 fe80::250:56ff:febf:efaa/64 scope link tentative valid_lft forever preferred_lft forever Description: fooa XDP code is gratefully copied from [1], thank you for this nice tutorial. NOTE: this is an experimental feature which might break your forwarding/filtering. [1]: https://medium.com/swlh/building-a-xdp-express-data-path-based-peering-router-20db4995da66 --- src/ebpf/.gitignore | 1 + src/ebpf/Makefile | 16 ++++ src/ebpf/xdp_drop_ebpf.c | 97 +++++++++++++++++++++++ src/ebpf/xdp_router.c | 202 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 316 insertions(+) create mode 100644 src/ebpf/.gitignore create mode 100644 src/ebpf/Makefile create mode 100644 src/ebpf/xdp_drop_ebpf.c create mode 100644 src/ebpf/xdp_router.c (limited to 'src') diff --git a/src/ebpf/.gitignore b/src/ebpf/.gitignore new file mode 100644 index 000000000..5761abcfd --- /dev/null +++ b/src/ebpf/.gitignore @@ -0,0 +1 @@ +*.o diff --git a/src/ebpf/Makefile b/src/ebpf/Makefile new file mode 100644 index 000000000..5b80c32d7 --- /dev/null +++ b/src/ebpf/Makefile @@ -0,0 +1,16 @@ +#clang -target bpf -O2 -c xdp-drop-ebpf.c -o xdp-drop-ebpf.o + +src = $(wildcard *.c) +obj = $(src:.c=.o) +CLANG = clang +CFLAGS = -Wall -Wno-unused-value -Wno-pointer-sign -Wno-compare-distinct-pointer-types -Werror -O2 + +%.o: %.c + $(CLANG) -target bpf $(CFLAGS) -o $@ -c $< + +.PHONY: all +all: $(obj) + +.PHONY: clean +clean: + rm -f *.o diff --git a/src/ebpf/xdp_drop_ebpf.c b/src/ebpf/xdp_drop_ebpf.c new file mode 
100644
index 000000000..a08edf58d
--- /dev/null
+++ b/src/ebpf/xdp_drop_ebpf.c
@@ -0,0 +1,133 @@
+/*
+ * NOTE(review): the header names of the #include directives below are
+ * missing in this copy of the patch (only the bare "#include" survived).
+ * They have to be restored from the original commit before this compiles.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/* IP flags. */
+#define IP_CE		0x8000	/* Flag: "Congestion" */
+#define IP_DF		0x4000	/* Flag: "Don't Fragment" */
+#define IP_MF		0x2000	/* Flag: "More Fragments" */
+#define IP_OFFSET	0x1FFF	/* "Fragment Offset" part */
+
+#define SEC(NAME) __attribute__((section(NAME), used))
+
+/* Byte swaps are unconditional here, i.e. a little-endian target is assumed. */
+#define htons(x) ((__be16)___constant_swab16((x)))
+#define htonl(x) ((__be32)___constant_swab32((x)))
+
+/* VLAN tag header: TCI followed by the encapsulated EtherType. */
+struct vlan_hdr {
+	__be16 h_vlan_TCI;
+	__be16 h_vlan_encapsulated_proto;
+};
+
+/*
+ * xdp_drop - example XDP packet filter.
+ *
+ * Walks Ethernet (plus up to two VLAN tags) and then IPv4 or IPv6, and
+ * returns XDP_DROP for:
+ *   - IPv4 packets whose destination host byte is 0 or 255,
+ *   - IPv4 fragments (MF set or non-zero fragment offset),
+ *   - UDP packets with source or destination port 53.
+ * Every other packet, including one too short to be parsed, gets XDP_PASS.
+ */
+SEC("prog")
+int xdp_drop(struct xdp_md *ctx) {
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	struct ethhdr *eth = data;
+
+	uint64_t nh_off = sizeof(*eth);
+	if (data + nh_off > data_end) {
+		return XDP_PASS;
+	}
+
+	uint16_t h_proto = eth->h_proto;
+	int i;
+
+	/* Handle double VLAN tagged packet. See https://en.wikipedia.org/wiki/IEEE_802.1ad */
+	for (i = 0; i < 2; i++) {
+		if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+			struct vlan_hdr *vhdr;
+
+			vhdr = data + nh_off;
+			nh_off += sizeof(struct vlan_hdr);
+			if (data + nh_off > data_end) {
+				return XDP_PASS;
+			}
+			h_proto = vhdr->h_vlan_encapsulated_proto;
+		}
+	}
+
+	if (h_proto == htons(ETH_P_IP)) {
+		struct iphdr *iph = data + nh_off;
+		struct udphdr *udph;
+		uint32_t ip_hdr_len;
+		uint32_t hostid;
+
+		/* Bounds-check the fixed IPv4 header before any field is read. */
+		if ((void *)(iph + 1) > data_end) {
+			return XDP_PASS;
+		}
+		/* ihl counts 32-bit words; a value below 5 is malformed. */
+		ip_hdr_len = iph->ihl * 4;
+		if (ip_hdr_len < sizeof(*iph)) {
+			return XDP_PASS;
+		}
+		/* Skip IPv4 options: L4 starts ihl words after the header start. */
+		udph = data + nh_off + ip_hdr_len;
+		if ((void *)(udph + 1) > data_end) {
+			return XDP_PASS;
+		}
+
+		/* daddr is in network byte order; given the little-endian
+		 * assumption of the macros above, the top byte of the loaded
+		 * word is the last octet of the address. */
+		hostid = iph->daddr >> 24;
+		if (hostid == 0 || hostid == 255) {
+			return XDP_DROP;
+		}
+		if (iph->frag_off & htons(IP_MF | IP_OFFSET)) {
+			return XDP_DROP;
+		}
+		if (iph->protocol == IPPROTO_UDP) {
+			/* htons() is its own inverse, so it also converts to host order. */
+			uint16_t dport = htons(udph->dest);
+			uint16_t sport = htons(udph->source);
+
+			if (dport == 53 || sport == 53) {
+				return XDP_DROP;
+			}
+		}
+	} else if (h_proto == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ip6h = data + nh_off;
+		struct udphdr *udph = data + nh_off + sizeof(struct ipv6hdr);
+
+		if (udph + 1 > (struct udphdr *)data_end) {
+			return XDP_PASS;
+		}
+		/* Only UDP directly after the fixed IPv6 header is inspected;
+		 * extension headers are not walked. */
+		if (ip6h->nexthdr == IPPROTO_UDP) {
+			uint16_t dport = htons(udph->dest);
+			uint16_t sport = htons(udph->source);
+
+			if (dport == 53 || sport == 53) {
+				return XDP_DROP;
+			}
+		}
+	}
+
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/src/ebpf/xdp_router.c b/src/ebpf/xdp_router.c
new file mode 100644
index 000000000..4fb5c7cb1
--- /dev/null
+++ b/src/ebpf/xdp_router.c
@@ -0,0 +1,230 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+// Code thankfully copied from:
+// https://medium.com/swlh/building-a-xdp-express-data-path-based-peering-router-20db4995da66
+
+/*
+ * NOTE(review): the header names of the #include directives below are
+ * missing in this copy of the patch (only the bare "#include" survived).
+ * They have to be restored from the original commit before this compiles.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#ifndef XDP_ACTION_MAX
+#define XDP_ACTION_MAX (XDP_REDIRECT + 1)
+#endif
+
+#ifndef memcpy
+#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n))
+#endif
+
+#ifndef AF_INET
+#define AF_INET 2
+#endif
+
+#ifndef AF_INET6
+#define AF_INET6 10
+#endif
+
+#ifndef IPV6_FLOWINFO_MASK
+#define IPV6_FLOWINFO_MASK bpf_htonl(0x0FFFFFFF)
+#endif
+
+/* This is the data record stored in the map */
+struct datarec {
+	__u64 rx_packets;
+	__u64 rx_bytes;
+};
+
+/* Keeps stats per (enum) xdp_action */
+struct bpf_map_def SEC("maps") xdp_stats_map = {
+	.type        = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size    = sizeof(__u32),
+	.value_size  = sizeof(struct datarec),
+	.max_entries = XDP_ACTION_MAX,
+};
+
+/* Device map used as the redirect target; indexed with the egress ifindex */
+struct bpf_map_def SEC("maps") tx_port = {
+	.type = BPF_MAP_TYPE_DEVMAP,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 256,
+};
+
+/*
+ * Decrement the IPv4 TTL and patch the header checksum incrementally
+ * instead of recomputing it. Returns the new TTL.
+ *
+ * from include/net/ip.h
+ */
+static __always_inline int ip_decrease_ttl(struct iphdr *iph)
+{
+	__u32 check = iph->check;
+	check += bpf_htons(0x0100);
+	iph->check = (__u16)(check + (check >= 0xFFFF));
+	return --iph->ttl;
+}
+
+/*
+ * Account one packet and its length (data_end - data) under @action in
+ * xdp_stats_map and hand @action back so it can be returned directly.
+ * Returns XDP_ABORTED when @action is out of range or the map lookup fails.
+ */
+static __always_inline
+__u32 xdp_stats_record_action(struct xdp_md *ctx, __u32 action)
+{
+	if (action >= XDP_ACTION_MAX)
+		return XDP_ABORTED;
+
+	/* Lookup in kernel BPF-side return pointer to actual data record */
+	struct datarec *rec = bpf_map_lookup_elem(&xdp_stats_map, &action);
+	if (!rec)
+		return XDP_ABORTED;
+
+	/* BPF_MAP_TYPE_PERCPU_ARRAY returns a data record specific to current
+	 * CPU and XDP hooks runs under Softirq, which makes it safe to update
+	 * without atomic operations.
+	 */
+	rec->rx_packets++;
+	rec->rx_bytes += (ctx->data_end - ctx->data);
+
+	return action;
+}
+
+/*
+ * xdp_router_func - forward IPv4/IPv6 packets using the kernel FIB.
+ *
+ * Frames whose EtherType is IPv4 or IPv6 are looked up with
+ * bpf_fib_lookup(). On a successful lookup the TTL / hop limit is
+ * decremented, the Ethernet addresses are rewritten and the frame is
+ * redirected through tx_port. Blackhole, unreachable and prohibit results
+ * are dropped; every other result, other EtherTypes and packets with
+ * TTL / hop limit <= 1 get XDP_PASS. Truncated headers are dropped.
+ *
+ * The chosen action is recorded in xdp_stats_map before it is returned.
+ */
+SEC("prog")
+int xdp_router_func(struct xdp_md *ctx)
+{
+	/* this is the packet context */
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	struct bpf_fib_lookup fib_params = {};
+	struct ethhdr *eth = data;
+	struct ipv6hdr *ip6h;
+	struct iphdr *iph;
+	__u16 h_proto;
+	__u64 nh_off;
+	int rc;
+	/* default action is to pass */
+	int action = XDP_PASS;
+
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end) {
+		action = XDP_DROP;
+		goto out;
+	}
+
+	/* determine if this is IPv4 or IPv6 by looking at the Ethernet protocol field */
+	h_proto = eth->h_proto;
+	if (h_proto == bpf_htons(ETH_P_IP)) {
+		/* IPv4 part of the code */
+		iph = data + nh_off;
+
+		if (iph + 1 > data_end) {
+			action = XDP_DROP;
+			goto out;
+		}
+		/* as a real router, we need to check the TTL to prevent never ending loops */
+		if (iph->ttl <= 1)
+			goto out;
+
+		/* populate the fib_params fields to prepare for the lookup */
+		fib_params.family	= AF_INET;
+		fib_params.tos		= iph->tos;
+		fib_params.l4_protocol	= iph->protocol;
+		fib_params.sport	= 0;
+		fib_params.dport	= 0;
+		fib_params.tot_len	= bpf_ntohs(iph->tot_len);
+		fib_params.ipv4_src	= iph->saddr;
+		fib_params.ipv4_dst	= iph->daddr;
+	} else if (h_proto == bpf_htons(ETH_P_IPV6)) {
+		/* IPv6 part of the code */
+		struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src;
+		struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst;
+
+		ip6h = data + nh_off;
+		if (ip6h + 1 > data_end) {
+			action = XDP_DROP;
+			goto out;
+		}
+		/* as a real router, we need to check the hop limit to prevent never ending loops */
+		if (ip6h->hop_limit <= 1)
+			goto out;
+
+		/* populate the fib_params fields to prepare for the lookup */
+		fib_params.family	= AF_INET6;
+		fib_params.flowinfo	= *(__be32 *) ip6h & IPV6_FLOWINFO_MASK;
+		fib_params.l4_protocol	= ip6h->nexthdr;
+		fib_params.sport	= 0;
+		fib_params.dport	= 0;
+		fib_params.tot_len	= bpf_ntohs(ip6h->payload_len);
+		*src			= ip6h->saddr;
+		*dst			= ip6h->daddr;
+	} else {
+		goto out;
+	}
+
+	fib_params.ifindex = ctx->ingress_ifindex;
+
+	/* this is where the FIB lookup happens. If the lookup is successful */
+	/* it will populate the fib_params.ifindex with the egress interface index */
+
+	rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), 0);
+	switch (rc) {
+	case BPF_FIB_LKUP_RET_SUCCESS:         /* lookup successful */
+		/* we are a router, so we need to decrease the ttl */
+		if (h_proto == bpf_htons(ETH_P_IP))
+			ip_decrease_ttl(iph);
+		else if (h_proto == bpf_htons(ETH_P_IPV6))
+			ip6h->hop_limit--;
+		/* set the correct new source and destination mac addresses */
+		/* can be found in fib_params.dmac and fib_params.smac */
+		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
+		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+		/* and done, now we set the action to bpf_redirect_map with fib_params.ifindex which is the egress port as parameter */
+		action = bpf_redirect_map(&tx_port, fib_params.ifindex, 0);
+		break;
+	case BPF_FIB_LKUP_RET_BLACKHOLE:       /* dest is blackholed; can be dropped */
+	case BPF_FIB_LKUP_RET_UNREACHABLE:     /* dest is unreachable; can be dropped */
+	case BPF_FIB_LKUP_RET_PROHIBIT:        /* dest not allowed; can be dropped */
+		action = XDP_DROP;
+		break;
+	case BPF_FIB_LKUP_RET_NOT_FWDED:       /* packet is not forwarded */
+	case BPF_FIB_LKUP_RET_FWD_DISABLED:    /* fwding is not enabled on ingress */
+	case BPF_FIB_LKUP_RET_UNSUPP_LWT:      /* fwd requires encapsulation */
+	case BPF_FIB_LKUP_RET_NO_NEIGH:        /* no neighbor entry for nh */
+	case BPF_FIB_LKUP_RET_FRAG_NEEDED:     /* fragmentation required to fwd */
+	default:                               /* negative error code or a result not listed above */
+		/* PASS */
+		break;
+	}
+
+out:
+	/* and done, update stats and return action */
+	return xdp_stats_record_action(ctx, action);
+}
+
+char _license[] SEC("license") = "GPL";
--
cgit v1.2.3