rss.bpf.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569
  1. /*
  2. * eBPF RSS program
  3. *
  4. * Developed by Daynix Computing LTD (http://www.daynix.com)
  5. *
  6. * Authors:
  7. * Andrew Melnychenko <andrew@daynix.com>
  8. * Yuri Benditovich <yuri.benditovich@daynix.com>
  9. *
  10. * This work is licensed under the terms of the GNU GPL, version 2. See
  11. * the COPYING file in the top-level directory.
  12. *
  13. * Prepare:
  14. * Requires llvm, clang, bpftool, linux kernel tree
  15. *
  16. * Build rss.bpf.skeleton.h:
  17. * make -f Makefile.ebpf clean all
  18. */
  19. #include <stddef.h>
  20. #include <stdbool.h>
  21. #include <linux/bpf.h>
  22. #include <linux/in.h>
  23. #include <linux/if_ether.h>
  24. #include <linux/ip.h>
  25. #include <linux/ipv6.h>
  26. #include <linux/udp.h>
  27. #include <linux/tcp.h>
  28. #include <bpf/bpf_helpers.h>
  29. #include <bpf/bpf_endian.h>
  30. #include <linux/virtio_net.h>
  31. #define INDIRECTION_TABLE_SIZE 128
  32. #define HASH_CALCULATION_BUFFER_SIZE 36
  33. struct rss_config_t {
  34. __u8 redirect;
  35. __u8 populate_hash;
  36. __u32 hash_types;
  37. __u16 indirections_len;
  38. __u16 default_queue;
  39. } __attribute__((packed));
  40. struct toeplitz_key_data_t {
  41. __u32 leftmost_32_bits;
  42. __u8 next_byte[HASH_CALCULATION_BUFFER_SIZE];
  43. };
  44. struct packet_hash_info_t {
  45. __u8 is_ipv4;
  46. __u8 is_ipv6;
  47. __u8 is_udp;
  48. __u8 is_tcp;
  49. __u8 is_ipv6_ext_src;
  50. __u8 is_ipv6_ext_dst;
  51. __u8 is_fragmented;
  52. __u16 src_port;
  53. __u16 dst_port;
  54. union {
  55. struct {
  56. __be32 in_src;
  57. __be32 in_dst;
  58. };
  59. struct {
  60. struct in6_addr in6_src;
  61. struct in6_addr in6_dst;
  62. struct in6_addr in6_ext_src;
  63. struct in6_addr in6_ext_dst;
  64. };
  65. };
  66. };
  67. struct {
  68. __uint(type, BPF_MAP_TYPE_ARRAY);
  69. __uint(key_size, sizeof(__u32));
  70. __uint(value_size, sizeof(struct rss_config_t));
  71. __uint(max_entries, 1);
  72. __uint(map_flags, BPF_F_MMAPABLE);
  73. } tap_rss_map_configurations SEC(".maps");
  74. struct {
  75. __uint(type, BPF_MAP_TYPE_ARRAY);
  76. __uint(key_size, sizeof(__u32));
  77. __uint(value_size, sizeof(struct toeplitz_key_data_t));
  78. __uint(max_entries, 1);
  79. __uint(map_flags, BPF_F_MMAPABLE);
  80. } tap_rss_map_toeplitz_key SEC(".maps");
  81. struct {
  82. __uint(type, BPF_MAP_TYPE_ARRAY);
  83. __uint(key_size, sizeof(__u32));
  84. __uint(value_size, sizeof(__u16));
  85. __uint(max_entries, INDIRECTION_TABLE_SIZE);
  86. __uint(map_flags, BPF_F_MMAPABLE);
  87. } tap_rss_map_indirection_table SEC(".maps");
  88. static inline void net_rx_rss_add_chunk(__u8 *rss_input, size_t *bytes_written,
  89. const void *ptr, size_t size) {
  90. __builtin_memcpy(&rss_input[*bytes_written], ptr, size);
  91. *bytes_written += size;
  92. }
  93. static inline
  94. void net_toeplitz_add(__u32 *result,
  95. __u8 *input,
  96. __u32 len
  97. , struct toeplitz_key_data_t *key) {
  98. __u32 accumulator = *result;
  99. __u32 leftmost_32_bits = key->leftmost_32_bits;
  100. __u32 byte;
  101. for (byte = 0; byte < HASH_CALCULATION_BUFFER_SIZE; byte++) {
  102. __u8 input_byte = input[byte];
  103. __u8 key_byte = key->next_byte[byte];
  104. __u8 bit;
  105. for (bit = 0; bit < 8; bit++) {
  106. if (input_byte & (1 << 7)) {
  107. accumulator ^= leftmost_32_bits;
  108. }
  109. leftmost_32_bits =
  110. (leftmost_32_bits << 1) | ((key_byte & (1 << 7)) >> 7);
  111. input_byte <<= 1;
  112. key_byte <<= 1;
  113. }
  114. }
  115. *result = accumulator;
  116. }
  117. static inline int ip6_extension_header_type(__u8 hdr_type)
  118. {
  119. switch (hdr_type) {
  120. case IPPROTO_HOPOPTS:
  121. case IPPROTO_ROUTING:
  122. case IPPROTO_FRAGMENT:
  123. case IPPROTO_ICMPV6:
  124. case IPPROTO_NONE:
  125. case IPPROTO_DSTOPTS:
  126. case IPPROTO_MH:
  127. return 1;
  128. default:
  129. return 0;
  130. }
  131. }
  132. /*
  133. * According to
  134. * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml
  135. * we expect that there are would be no more than 11 extensions in IPv6 header,
  136. * also there is 27 TLV options for Destination and Hop-by-hop extensions.
  137. * Need to choose reasonable amount of maximum extensions/options we may
  138. * check to find ext src/dst.
  139. */
  140. #define IP6_EXTENSIONS_COUNT 11
  141. #define IP6_OPTIONS_COUNT 30
  142. static inline int parse_ipv6_ext(struct __sk_buff *skb,
  143. struct packet_hash_info_t *info,
  144. __u8 *l4_protocol, size_t *l4_offset)
  145. {
  146. int err = 0;
  147. if (!ip6_extension_header_type(*l4_protocol)) {
  148. return 0;
  149. }
  150. struct ipv6_opt_hdr ext_hdr = {};
  151. for (unsigned int i = 0; i < IP6_EXTENSIONS_COUNT; ++i) {
  152. err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_hdr,
  153. sizeof(ext_hdr), BPF_HDR_START_NET);
  154. if (err) {
  155. goto error;
  156. }
  157. if (*l4_protocol == IPPROTO_ROUTING) {
  158. struct ipv6_rt_hdr ext_rt = {};
  159. err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_rt,
  160. sizeof(ext_rt), BPF_HDR_START_NET);
  161. if (err) {
  162. goto error;
  163. }
  164. if ((ext_rt.type == IPV6_SRCRT_TYPE_2) &&
  165. (ext_rt.hdrlen == sizeof(struct in6_addr) / 8) &&
  166. (ext_rt.segments_left == 1)) {
  167. err = bpf_skb_load_bytes_relative(skb,
  168. *l4_offset + offsetof(struct rt2_hdr, addr),
  169. &info->in6_ext_dst, sizeof(info->in6_ext_dst),
  170. BPF_HDR_START_NET);
  171. if (err) {
  172. goto error;
  173. }
  174. info->is_ipv6_ext_dst = 1;
  175. }
  176. } else if (*l4_protocol == IPPROTO_DSTOPTS) {
  177. struct ipv6_opt_t {
  178. __u8 type;
  179. __u8 length;
  180. } __attribute__((packed)) opt = {};
  181. size_t opt_offset = sizeof(ext_hdr);
  182. for (unsigned int j = 0; j < IP6_OPTIONS_COUNT; ++j) {
  183. err = bpf_skb_load_bytes_relative(skb, *l4_offset + opt_offset,
  184. &opt, sizeof(opt), BPF_HDR_START_NET);
  185. if (err) {
  186. goto error;
  187. }
  188. if (opt.type == IPV6_TLV_HAO) {
  189. err = bpf_skb_load_bytes_relative(skb,
  190. *l4_offset + opt_offset
  191. + offsetof(struct ipv6_destopt_hao, addr),
  192. &info->in6_ext_src, sizeof(info->in6_ext_src),
  193. BPF_HDR_START_NET);
  194. if (err) {
  195. goto error;
  196. }
  197. info->is_ipv6_ext_src = 1;
  198. break;
  199. }
  200. opt_offset += (opt.type == IPV6_TLV_PAD1) ?
  201. 1 : opt.length + sizeof(opt);
  202. if (opt_offset + 1 >= ext_hdr.hdrlen * 8) {
  203. break;
  204. }
  205. }
  206. } else if (*l4_protocol == IPPROTO_FRAGMENT) {
  207. info->is_fragmented = true;
  208. }
  209. *l4_protocol = ext_hdr.nexthdr;
  210. *l4_offset += (ext_hdr.hdrlen + 1) * 8;
  211. if (!ip6_extension_header_type(ext_hdr.nexthdr)) {
  212. return 0;
  213. }
  214. }
  215. return 0;
  216. error:
  217. return err;
  218. }
  219. static __be16 parse_eth_type(struct __sk_buff *skb)
  220. {
  221. unsigned int offset = 12;
  222. __be16 ret = 0;
  223. int err = 0;
  224. err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret),
  225. BPF_HDR_START_MAC);
  226. if (err) {
  227. return 0;
  228. }
  229. switch (bpf_ntohs(ret)) {
  230. case ETH_P_8021AD:
  231. offset += 4;
  232. case ETH_P_8021Q:
  233. offset += 4;
  234. err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret),
  235. BPF_HDR_START_MAC);
  236. default:
  237. break;
  238. }
  239. if (err) {
  240. return 0;
  241. }
  242. return ret;
  243. }
  244. static inline int parse_packet(struct __sk_buff *skb,
  245. struct packet_hash_info_t *info)
  246. {
  247. int err = 0;
  248. if (!info || !skb) {
  249. return -1;
  250. }
  251. size_t l4_offset = 0;
  252. __u8 l4_protocol = 0;
  253. __u16 l3_protocol = bpf_ntohs(parse_eth_type(skb));
  254. if (l3_protocol == 0) {
  255. err = -1;
  256. goto error;
  257. }
  258. if (l3_protocol == ETH_P_IP) {
  259. info->is_ipv4 = 1;
  260. struct iphdr ip = {};
  261. err = bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip),
  262. BPF_HDR_START_NET);
  263. if (err) {
  264. goto error;
  265. }
  266. info->in_src = ip.saddr;
  267. info->in_dst = ip.daddr;
  268. info->is_fragmented = !!(bpf_ntohs(ip.frag_off) & (0x2000 | 0x1fff));
  269. l4_protocol = ip.protocol;
  270. l4_offset = ip.ihl * 4;
  271. } else if (l3_protocol == ETH_P_IPV6) {
  272. info->is_ipv6 = 1;
  273. struct ipv6hdr ip6 = {};
  274. err = bpf_skb_load_bytes_relative(skb, 0, &ip6, sizeof(ip6),
  275. BPF_HDR_START_NET);
  276. if (err) {
  277. goto error;
  278. }
  279. info->in6_src = ip6.saddr;
  280. info->in6_dst = ip6.daddr;
  281. l4_protocol = ip6.nexthdr;
  282. l4_offset = sizeof(ip6);
  283. err = parse_ipv6_ext(skb, info, &l4_protocol, &l4_offset);
  284. if (err) {
  285. goto error;
  286. }
  287. }
  288. if (l4_protocol != 0 && !info->is_fragmented) {
  289. if (l4_protocol == IPPROTO_TCP) {
  290. info->is_tcp = 1;
  291. struct tcphdr tcp = {};
  292. err = bpf_skb_load_bytes_relative(skb, l4_offset, &tcp, sizeof(tcp),
  293. BPF_HDR_START_NET);
  294. if (err) {
  295. goto error;
  296. }
  297. info->src_port = tcp.source;
  298. info->dst_port = tcp.dest;
  299. } else if (l4_protocol == IPPROTO_UDP) { /* TODO: add udplite? */
  300. info->is_udp = 1;
  301. struct udphdr udp = {};
  302. err = bpf_skb_load_bytes_relative(skb, l4_offset, &udp, sizeof(udp),
  303. BPF_HDR_START_NET);
  304. if (err) {
  305. goto error;
  306. }
  307. info->src_port = udp.source;
  308. info->dst_port = udp.dest;
  309. }
  310. }
  311. return 0;
  312. error:
  313. return err;
  314. }
  315. static inline bool calculate_rss_hash(struct __sk_buff *skb,
  316. struct rss_config_t *config,
  317. struct toeplitz_key_data_t *toe,
  318. __u32 *result)
  319. {
  320. __u8 rss_input[HASH_CALCULATION_BUFFER_SIZE] = {};
  321. size_t bytes_written = 0;
  322. int err = 0;
  323. struct packet_hash_info_t packet_info = {};
  324. err = parse_packet(skb, &packet_info);
  325. if (err) {
  326. return false;
  327. }
  328. if (packet_info.is_ipv4) {
  329. if (packet_info.is_tcp &&
  330. config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
  331. net_rx_rss_add_chunk(rss_input, &bytes_written,
  332. &packet_info.in_src,
  333. sizeof(packet_info.in_src));
  334. net_rx_rss_add_chunk(rss_input, &bytes_written,
  335. &packet_info.in_dst,
  336. sizeof(packet_info.in_dst));
  337. net_rx_rss_add_chunk(rss_input, &bytes_written,
  338. &packet_info.src_port,
  339. sizeof(packet_info.src_port));
  340. net_rx_rss_add_chunk(rss_input, &bytes_written,
  341. &packet_info.dst_port,
  342. sizeof(packet_info.dst_port));
  343. } else if (packet_info.is_udp &&
  344. config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
  345. net_rx_rss_add_chunk(rss_input, &bytes_written,
  346. &packet_info.in_src,
  347. sizeof(packet_info.in_src));
  348. net_rx_rss_add_chunk(rss_input, &bytes_written,
  349. &packet_info.in_dst,
  350. sizeof(packet_info.in_dst));
  351. net_rx_rss_add_chunk(rss_input, &bytes_written,
  352. &packet_info.src_port,
  353. sizeof(packet_info.src_port));
  354. net_rx_rss_add_chunk(rss_input, &bytes_written,
  355. &packet_info.dst_port,
  356. sizeof(packet_info.dst_port));
  357. } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
  358. net_rx_rss_add_chunk(rss_input, &bytes_written,
  359. &packet_info.in_src,
  360. sizeof(packet_info.in_src));
  361. net_rx_rss_add_chunk(rss_input, &bytes_written,
  362. &packet_info.in_dst,
  363. sizeof(packet_info.in_dst));
  364. }
  365. } else if (packet_info.is_ipv6) {
  366. if (packet_info.is_tcp &&
  367. config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
  368. if (packet_info.is_ipv6_ext_src &&
  369. config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
  370. net_rx_rss_add_chunk(rss_input, &bytes_written,
  371. &packet_info.in6_ext_src,
  372. sizeof(packet_info.in6_ext_src));
  373. } else {
  374. net_rx_rss_add_chunk(rss_input, &bytes_written,
  375. &packet_info.in6_src,
  376. sizeof(packet_info.in6_src));
  377. }
  378. if (packet_info.is_ipv6_ext_dst &&
  379. config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
  380. net_rx_rss_add_chunk(rss_input, &bytes_written,
  381. &packet_info.in6_ext_dst,
  382. sizeof(packet_info.in6_ext_dst));
  383. } else {
  384. net_rx_rss_add_chunk(rss_input, &bytes_written,
  385. &packet_info.in6_dst,
  386. sizeof(packet_info.in6_dst));
  387. }
  388. net_rx_rss_add_chunk(rss_input, &bytes_written,
  389. &packet_info.src_port,
  390. sizeof(packet_info.src_port));
  391. net_rx_rss_add_chunk(rss_input, &bytes_written,
  392. &packet_info.dst_port,
  393. sizeof(packet_info.dst_port));
  394. } else if (packet_info.is_udp &&
  395. config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
  396. if (packet_info.is_ipv6_ext_src &&
  397. config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
  398. net_rx_rss_add_chunk(rss_input, &bytes_written,
  399. &packet_info.in6_ext_src,
  400. sizeof(packet_info.in6_ext_src));
  401. } else {
  402. net_rx_rss_add_chunk(rss_input, &bytes_written,
  403. &packet_info.in6_src,
  404. sizeof(packet_info.in6_src));
  405. }
  406. if (packet_info.is_ipv6_ext_dst &&
  407. config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
  408. net_rx_rss_add_chunk(rss_input, &bytes_written,
  409. &packet_info.in6_ext_dst,
  410. sizeof(packet_info.in6_ext_dst));
  411. } else {
  412. net_rx_rss_add_chunk(rss_input, &bytes_written,
  413. &packet_info.in6_dst,
  414. sizeof(packet_info.in6_dst));
  415. }
  416. net_rx_rss_add_chunk(rss_input, &bytes_written,
  417. &packet_info.src_port,
  418. sizeof(packet_info.src_port));
  419. net_rx_rss_add_chunk(rss_input, &bytes_written,
  420. &packet_info.dst_port,
  421. sizeof(packet_info.dst_port));
  422. } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
  423. if (packet_info.is_ipv6_ext_src &&
  424. config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
  425. net_rx_rss_add_chunk(rss_input, &bytes_written,
  426. &packet_info.in6_ext_src,
  427. sizeof(packet_info.in6_ext_src));
  428. } else {
  429. net_rx_rss_add_chunk(rss_input, &bytes_written,
  430. &packet_info.in6_src,
  431. sizeof(packet_info.in6_src));
  432. }
  433. if (packet_info.is_ipv6_ext_dst &&
  434. config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
  435. net_rx_rss_add_chunk(rss_input, &bytes_written,
  436. &packet_info.in6_ext_dst,
  437. sizeof(packet_info.in6_ext_dst));
  438. } else {
  439. net_rx_rss_add_chunk(rss_input, &bytes_written,
  440. &packet_info.in6_dst,
  441. sizeof(packet_info.in6_dst));
  442. }
  443. }
  444. }
  445. if (!bytes_written) {
  446. return false;
  447. }
  448. net_toeplitz_add(result, rss_input, bytes_written, toe);
  449. return true;
  450. }
  451. SEC("socket")
  452. int tun_rss_steering_prog(struct __sk_buff *skb)
  453. {
  454. struct rss_config_t *config;
  455. struct toeplitz_key_data_t *toe;
  456. __u32 key = 0;
  457. __u32 hash = 0;
  458. config = bpf_map_lookup_elem(&tap_rss_map_configurations, &key);
  459. toe = bpf_map_lookup_elem(&tap_rss_map_toeplitz_key, &key);
  460. if (!config || !toe) {
  461. return 0;
  462. }
  463. if (config->redirect && calculate_rss_hash(skb, config, toe, &hash)) {
  464. __u32 table_idx = hash % config->indirections_len;
  465. __u16 *queue = 0;
  466. queue = bpf_map_lookup_elem(&tap_rss_map_indirection_table,
  467. &table_idx);
  468. if (queue) {
  469. return *queue;
  470. }
  471. }
  472. return config->default_queue;
  473. }
  474. char _license[] SEC("license") = "GPL v2";