123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569 |
- /*
- * eBPF RSS program
- *
- * Developed by Daynix Computing LTD (http://www.daynix.com)
- *
- * Authors:
- * Andrew Melnychenko <andrew@daynix.com>
- * Yuri Benditovich <yuri.benditovich@daynix.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- * Prepare:
- * Requires llvm, clang, bpftool, linux kernel tree
- *
- * Build rss.bpf.skeleton.h:
- * make -f Makefile.ebpf clean all
- */
- #include <stddef.h>
- #include <stdbool.h>
- #include <linux/bpf.h>
- #include <linux/in.h>
- #include <linux/if_ether.h>
- #include <linux/ip.h>
- #include <linux/ipv6.h>
- #include <linux/udp.h>
- #include <linux/tcp.h>
- #include <bpf/bpf_helpers.h>
- #include <bpf/bpf_endian.h>
- #include <linux/virtio_net.h>
/* Number of entries in the RSS indirection table map. */
#define INDIRECTION_TABLE_SIZE 128

/*
 * Size in bytes of the buffer the Toeplitz hash is computed over; also the
 * number of key bytes that follow the leading 32 bits of the key.
 */
#define HASH_CALCULATION_BUFFER_SIZE 36
- struct rss_config_t {
- __u8 redirect;
- __u8 populate_hash;
- __u32 hash_types;
- __u16 indirections_len;
- __u16 default_queue;
- } __attribute__((packed));
- struct toeplitz_key_data_t {
- __u32 leftmost_32_bits;
- __u8 next_byte[HASH_CALCULATION_BUFFER_SIZE];
- };
- struct packet_hash_info_t {
- __u8 is_ipv4;
- __u8 is_ipv6;
- __u8 is_udp;
- __u8 is_tcp;
- __u8 is_ipv6_ext_src;
- __u8 is_ipv6_ext_dst;
- __u8 is_fragmented;
- __u16 src_port;
- __u16 dst_port;
- union {
- struct {
- __be32 in_src;
- __be32 in_dst;
- };
- struct {
- struct in6_addr in6_src;
- struct in6_addr in6_dst;
- struct in6_addr in6_ext_src;
- struct in6_addr in6_ext_dst;
- };
- };
- };
- struct {
- __uint(type, BPF_MAP_TYPE_ARRAY);
- __uint(key_size, sizeof(__u32));
- __uint(value_size, sizeof(struct rss_config_t));
- __uint(max_entries, 1);
- __uint(map_flags, BPF_F_MMAPABLE);
- } tap_rss_map_configurations SEC(".maps");
- struct {
- __uint(type, BPF_MAP_TYPE_ARRAY);
- __uint(key_size, sizeof(__u32));
- __uint(value_size, sizeof(struct toeplitz_key_data_t));
- __uint(max_entries, 1);
- __uint(map_flags, BPF_F_MMAPABLE);
- } tap_rss_map_toeplitz_key SEC(".maps");
- struct {
- __uint(type, BPF_MAP_TYPE_ARRAY);
- __uint(key_size, sizeof(__u32));
- __uint(value_size, sizeof(__u16));
- __uint(max_entries, INDIRECTION_TABLE_SIZE);
- __uint(map_flags, BPF_F_MMAPABLE);
- } tap_rss_map_indirection_table SEC(".maps");
- static inline void net_rx_rss_add_chunk(__u8 *rss_input, size_t *bytes_written,
- const void *ptr, size_t size) {
- __builtin_memcpy(&rss_input[*bytes_written], ptr, size);
- *bytes_written += size;
- }
- static inline
- void net_toeplitz_add(__u32 *result,
- __u8 *input,
- __u32 len
- , struct toeplitz_key_data_t *key) {
- __u32 accumulator = *result;
- __u32 leftmost_32_bits = key->leftmost_32_bits;
- __u32 byte;
- for (byte = 0; byte < HASH_CALCULATION_BUFFER_SIZE; byte++) {
- __u8 input_byte = input[byte];
- __u8 key_byte = key->next_byte[byte];
- __u8 bit;
- for (bit = 0; bit < 8; bit++) {
- if (input_byte & (1 << 7)) {
- accumulator ^= leftmost_32_bits;
- }
- leftmost_32_bits =
- (leftmost_32_bits << 1) | ((key_byte & (1 << 7)) >> 7);
- input_byte <<= 1;
- key_byte <<= 1;
- }
- }
- *result = accumulator;
- }
- static inline int ip6_extension_header_type(__u8 hdr_type)
- {
- switch (hdr_type) {
- case IPPROTO_HOPOPTS:
- case IPPROTO_ROUTING:
- case IPPROTO_FRAGMENT:
- case IPPROTO_ICMPV6:
- case IPPROTO_NONE:
- case IPPROTO_DSTOPTS:
- case IPPROTO_MH:
- return 1;
- default:
- return 0;
- }
- }
/*
 * According to
 * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml
 * we expect no more than 11 extension headers in an IPv6 packet, and there
 * are 27 TLV options defined for the Destination and Hop-by-hop extensions.
 * These limits bound how many extensions/options are examined while
 * searching for the extension source/destination addresses.
 */
#define IP6_EXTENSIONS_COUNT 11
#define IP6_OPTIONS_COUNT 30
- static inline int parse_ipv6_ext(struct __sk_buff *skb,
- struct packet_hash_info_t *info,
- __u8 *l4_protocol, size_t *l4_offset)
- {
- int err = 0;
- if (!ip6_extension_header_type(*l4_protocol)) {
- return 0;
- }
- struct ipv6_opt_hdr ext_hdr = {};
- for (unsigned int i = 0; i < IP6_EXTENSIONS_COUNT; ++i) {
- err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_hdr,
- sizeof(ext_hdr), BPF_HDR_START_NET);
- if (err) {
- goto error;
- }
- if (*l4_protocol == IPPROTO_ROUTING) {
- struct ipv6_rt_hdr ext_rt = {};
- err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_rt,
- sizeof(ext_rt), BPF_HDR_START_NET);
- if (err) {
- goto error;
- }
- if ((ext_rt.type == IPV6_SRCRT_TYPE_2) &&
- (ext_rt.hdrlen == sizeof(struct in6_addr) / 8) &&
- (ext_rt.segments_left == 1)) {
- err = bpf_skb_load_bytes_relative(skb,
- *l4_offset + offsetof(struct rt2_hdr, addr),
- &info->in6_ext_dst, sizeof(info->in6_ext_dst),
- BPF_HDR_START_NET);
- if (err) {
- goto error;
- }
- info->is_ipv6_ext_dst = 1;
- }
- } else if (*l4_protocol == IPPROTO_DSTOPTS) {
- struct ipv6_opt_t {
- __u8 type;
- __u8 length;
- } __attribute__((packed)) opt = {};
- size_t opt_offset = sizeof(ext_hdr);
- for (unsigned int j = 0; j < IP6_OPTIONS_COUNT; ++j) {
- err = bpf_skb_load_bytes_relative(skb, *l4_offset + opt_offset,
- &opt, sizeof(opt), BPF_HDR_START_NET);
- if (err) {
- goto error;
- }
- if (opt.type == IPV6_TLV_HAO) {
- err = bpf_skb_load_bytes_relative(skb,
- *l4_offset + opt_offset
- + offsetof(struct ipv6_destopt_hao, addr),
- &info->in6_ext_src, sizeof(info->in6_ext_src),
- BPF_HDR_START_NET);
- if (err) {
- goto error;
- }
- info->is_ipv6_ext_src = 1;
- break;
- }
- opt_offset += (opt.type == IPV6_TLV_PAD1) ?
- 1 : opt.length + sizeof(opt);
- if (opt_offset + 1 >= ext_hdr.hdrlen * 8) {
- break;
- }
- }
- } else if (*l4_protocol == IPPROTO_FRAGMENT) {
- info->is_fragmented = true;
- }
- *l4_protocol = ext_hdr.nexthdr;
- *l4_offset += (ext_hdr.hdrlen + 1) * 8;
- if (!ip6_extension_header_type(ext_hdr.nexthdr)) {
- return 0;
- }
- }
- return 0;
- error:
- return err;
- }
- static __be16 parse_eth_type(struct __sk_buff *skb)
- {
- unsigned int offset = 12;
- __be16 ret = 0;
- int err = 0;
- err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret),
- BPF_HDR_START_MAC);
- if (err) {
- return 0;
- }
- switch (bpf_ntohs(ret)) {
- case ETH_P_8021AD:
- offset += 4;
- case ETH_P_8021Q:
- offset += 4;
- err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret),
- BPF_HDR_START_MAC);
- default:
- break;
- }
- if (err) {
- return 0;
- }
- return ret;
- }
- static inline int parse_packet(struct __sk_buff *skb,
- struct packet_hash_info_t *info)
- {
- int err = 0;
- if (!info || !skb) {
- return -1;
- }
- size_t l4_offset = 0;
- __u8 l4_protocol = 0;
- __u16 l3_protocol = bpf_ntohs(parse_eth_type(skb));
- if (l3_protocol == 0) {
- err = -1;
- goto error;
- }
- if (l3_protocol == ETH_P_IP) {
- info->is_ipv4 = 1;
- struct iphdr ip = {};
- err = bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip),
- BPF_HDR_START_NET);
- if (err) {
- goto error;
- }
- info->in_src = ip.saddr;
- info->in_dst = ip.daddr;
- info->is_fragmented = !!(bpf_ntohs(ip.frag_off) & (0x2000 | 0x1fff));
- l4_protocol = ip.protocol;
- l4_offset = ip.ihl * 4;
- } else if (l3_protocol == ETH_P_IPV6) {
- info->is_ipv6 = 1;
- struct ipv6hdr ip6 = {};
- err = bpf_skb_load_bytes_relative(skb, 0, &ip6, sizeof(ip6),
- BPF_HDR_START_NET);
- if (err) {
- goto error;
- }
- info->in6_src = ip6.saddr;
- info->in6_dst = ip6.daddr;
- l4_protocol = ip6.nexthdr;
- l4_offset = sizeof(ip6);
- err = parse_ipv6_ext(skb, info, &l4_protocol, &l4_offset);
- if (err) {
- goto error;
- }
- }
- if (l4_protocol != 0 && !info->is_fragmented) {
- if (l4_protocol == IPPROTO_TCP) {
- info->is_tcp = 1;
- struct tcphdr tcp = {};
- err = bpf_skb_load_bytes_relative(skb, l4_offset, &tcp, sizeof(tcp),
- BPF_HDR_START_NET);
- if (err) {
- goto error;
- }
- info->src_port = tcp.source;
- info->dst_port = tcp.dest;
- } else if (l4_protocol == IPPROTO_UDP) { /* TODO: add udplite? */
- info->is_udp = 1;
- struct udphdr udp = {};
- err = bpf_skb_load_bytes_relative(skb, l4_offset, &udp, sizeof(udp),
- BPF_HDR_START_NET);
- if (err) {
- goto error;
- }
- info->src_port = udp.source;
- info->dst_port = udp.dest;
- }
- }
- return 0;
- error:
- return err;
- }
- static inline bool calculate_rss_hash(struct __sk_buff *skb,
- struct rss_config_t *config,
- struct toeplitz_key_data_t *toe,
- __u32 *result)
- {
- __u8 rss_input[HASH_CALCULATION_BUFFER_SIZE] = {};
- size_t bytes_written = 0;
- int err = 0;
- struct packet_hash_info_t packet_info = {};
- err = parse_packet(skb, &packet_info);
- if (err) {
- return false;
- }
- if (packet_info.is_ipv4) {
- if (packet_info.is_tcp &&
- config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in_src,
- sizeof(packet_info.in_src));
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in_dst,
- sizeof(packet_info.in_dst));
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.src_port,
- sizeof(packet_info.src_port));
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.dst_port,
- sizeof(packet_info.dst_port));
- } else if (packet_info.is_udp &&
- config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in_src,
- sizeof(packet_info.in_src));
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in_dst,
- sizeof(packet_info.in_dst));
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.src_port,
- sizeof(packet_info.src_port));
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.dst_port,
- sizeof(packet_info.dst_port));
- } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in_src,
- sizeof(packet_info.in_src));
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in_dst,
- sizeof(packet_info.in_dst));
- }
- } else if (packet_info.is_ipv6) {
- if (packet_info.is_tcp &&
- config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
- if (packet_info.is_ipv6_ext_src &&
- config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in6_ext_src,
- sizeof(packet_info.in6_ext_src));
- } else {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in6_src,
- sizeof(packet_info.in6_src));
- }
- if (packet_info.is_ipv6_ext_dst &&
- config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in6_ext_dst,
- sizeof(packet_info.in6_ext_dst));
- } else {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in6_dst,
- sizeof(packet_info.in6_dst));
- }
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.src_port,
- sizeof(packet_info.src_port));
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.dst_port,
- sizeof(packet_info.dst_port));
- } else if (packet_info.is_udp &&
- config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
- if (packet_info.is_ipv6_ext_src &&
- config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in6_ext_src,
- sizeof(packet_info.in6_ext_src));
- } else {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in6_src,
- sizeof(packet_info.in6_src));
- }
- if (packet_info.is_ipv6_ext_dst &&
- config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in6_ext_dst,
- sizeof(packet_info.in6_ext_dst));
- } else {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in6_dst,
- sizeof(packet_info.in6_dst));
- }
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.src_port,
- sizeof(packet_info.src_port));
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.dst_port,
- sizeof(packet_info.dst_port));
- } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
- if (packet_info.is_ipv6_ext_src &&
- config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in6_ext_src,
- sizeof(packet_info.in6_ext_src));
- } else {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in6_src,
- sizeof(packet_info.in6_src));
- }
- if (packet_info.is_ipv6_ext_dst &&
- config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in6_ext_dst,
- sizeof(packet_info.in6_ext_dst));
- } else {
- net_rx_rss_add_chunk(rss_input, &bytes_written,
- &packet_info.in6_dst,
- sizeof(packet_info.in6_dst));
- }
- }
- }
- if (!bytes_written) {
- return false;
- }
- net_toeplitz_add(result, rss_input, bytes_written, toe);
- return true;
- }
- SEC("socket")
- int tun_rss_steering_prog(struct __sk_buff *skb)
- {
- struct rss_config_t *config;
- struct toeplitz_key_data_t *toe;
- __u32 key = 0;
- __u32 hash = 0;
- config = bpf_map_lookup_elem(&tap_rss_map_configurations, &key);
- toe = bpf_map_lookup_elem(&tap_rss_map_toeplitz_key, &key);
- if (!config || !toe) {
- return 0;
- }
- if (config->redirect && calculate_rss_hash(skb, config, toe, &hash)) {
- __u32 table_idx = hash % config->indirections_len;
- __u16 *queue = 0;
- queue = bpf_map_lookup_elem(&tap_rss_map_indirection_table,
- &table_idx);
- if (queue) {
- return *queue;
- }
- }
- return config->default_queue;
- }
- char _license[] SEC("license") = "GPL v2";
|