filter-rewriter.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441
/*
 * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
 * Copyright (c) 2016 FUJITSU LIMITED
 * Copyright (c) 2016 Intel Corporation
 *
 * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */
  11. #include "qemu/osdep.h"
  12. #include "trace.h"
  13. #include "colo.h"
  14. #include "net/filter.h"
  15. #include "net/net.h"
  16. #include "qemu/error-report.h"
  17. #include "qom/object.h"
  18. #include "qemu/main-loop.h"
  19. #include "qemu/iov.h"
  20. #include "net/checksum.h"
  21. #include "net/colo.h"
  22. #include "migration/colo.h"
  23. #include "util.h"
  24. #define TYPE_FILTER_REWRITER "filter-rewriter"
  25. OBJECT_DECLARE_SIMPLE_TYPE(RewriterState, FILTER_REWRITER)
  26. #define FAILOVER_MODE_ON true
  27. #define FAILOVER_MODE_OFF false
  28. struct RewriterState {
  29. NetFilterState parent_obj;
  30. NetQueue *incoming_queue;
  31. /* hashtable to save connection */
  32. GHashTable *connection_track_table;
  33. bool vnet_hdr;
  34. bool failover_mode;
  35. };
  36. static void filter_rewriter_failover_mode(RewriterState *s)
  37. {
  38. s->failover_mode = FAILOVER_MODE_ON;
  39. }
  40. static void filter_rewriter_flush(NetFilterState *nf)
  41. {
  42. RewriterState *s = FILTER_REWRITER(nf);
  43. if (!qemu_net_queue_flush(s->incoming_queue)) {
  44. /* Unable to empty the queue, purge remaining packets */
  45. qemu_net_queue_purge(s->incoming_queue, nf->netdev);
  46. }
  47. }
  48. /*
  49. * Return 1 on success, if return 0 means the pkt
  50. * is not TCP packet
  51. */
  52. static int is_tcp_packet(Packet *pkt)
  53. {
  54. if (!parse_packet_early(pkt) &&
  55. pkt->ip->ip_p == IPPROTO_TCP) {
  56. return 1;
  57. } else {
  58. return 0;
  59. }
  60. }
  61. /* handle tcp packet from primary guest */
  62. static int handle_primary_tcp_pkt(RewriterState *rf,
  63. Connection *conn,
  64. Packet *pkt, ConnectionKey *key)
  65. {
  66. struct tcp_hdr *tcp_pkt;
  67. tcp_pkt = (struct tcp_hdr *)pkt->transport_header;
  68. if (trace_event_get_state_backends(TRACE_COLO_FILTER_REWRITER_PKT_INFO)) {
  69. trace_colo_filter_rewriter_pkt_info(__func__,
  70. inet_ntoa(pkt->ip->ip_src), inet_ntoa(pkt->ip->ip_dst),
  71. ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
  72. tcp_pkt->th_flags);
  73. }
  74. if (trace_event_get_state_backends(
  75. TRACE_COLO_FILTER_REWRITER_CONN_OFFSET)) {
  76. trace_colo_filter_rewriter_conn_offset(conn->offset);
  77. }
  78. if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN)) &&
  79. conn->tcp_state == TCPS_SYN_SENT) {
  80. conn->tcp_state = TCPS_ESTABLISHED;
  81. }
  82. if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
  83. /*
  84. * we use this flag update offset func
  85. * run once in independent tcp connection
  86. */
  87. conn->tcp_state = TCPS_SYN_RECEIVED;
  88. }
  89. if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK)) {
  90. if (conn->tcp_state == TCPS_SYN_RECEIVED) {
  91. /*
  92. * offset = secondary_seq - primary seq
  93. * ack packet sent by guest from primary node,
  94. * so we use th_ack - 1 get primary_seq
  95. */
  96. conn->offset -= (ntohl(tcp_pkt->th_ack) - 1);
  97. conn->tcp_state = TCPS_ESTABLISHED;
  98. }
  99. if (conn->offset) {
  100. /* handle packets to the secondary from the primary */
  101. tcp_pkt->th_ack = htonl(ntohl(tcp_pkt->th_ack) + conn->offset);
  102. net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len,
  103. pkt->size - pkt->vnet_hdr_len, CSUM_TCP);
  104. }
  105. /*
  106. * Passive close step 3
  107. */
  108. if ((conn->tcp_state == TCPS_LAST_ACK) &&
  109. (ntohl(tcp_pkt->th_ack) == (conn->fin_ack_seq + 1))) {
  110. conn->tcp_state = TCPS_CLOSED;
  111. g_hash_table_remove(rf->connection_track_table, key);
  112. }
  113. }
  114. if ((tcp_pkt->th_flags & TH_FIN) == TH_FIN) {
  115. /*
  116. * Passive close.
  117. * Step 1:
  118. * The *server* side of this connect is VM, *client* tries to close
  119. * the connection. We will into CLOSE_WAIT status.
  120. *
  121. * Step 2:
  122. * In this step we will into LAST_ACK status.
  123. *
  124. * We got 'fin=1, ack=1' packet from server side, we need to
  125. * record the seq of 'fin=1, ack=1' packet.
  126. *
  127. * Step 3:
  128. * We got 'ack=1' packets from client side, it acks 'fin=1, ack=1'
  129. * packet from server side. From this point, we can ensure that there
  130. * will be no packets in the connection, except that, some errors
  131. * happen between the path of 'filter object' and vNIC, if this rare
  132. * case really happen, we can still create a new connection,
  133. * So it is safe to remove the connection from connection_track_table.
  134. *
  135. */
  136. if (conn->tcp_state == TCPS_ESTABLISHED) {
  137. conn->tcp_state = TCPS_CLOSE_WAIT;
  138. }
  139. /*
  140. * Active close step 2.
  141. */
  142. if (conn->tcp_state == TCPS_FIN_WAIT_1) {
  143. /*
  144. * For simplify implementation, we needn't wait 2MSL time
  145. * in filter rewriter. Because guest kernel will track the
  146. * TCP status and wait 2MSL time, if client resend the FIN
  147. * packet, guest will apply the last ACK too.
  148. * So, we skip the TCPS_TIME_WAIT state here and go straight
  149. * to TCPS_CLOSED state.
  150. */
  151. conn->tcp_state = TCPS_CLOSED;
  152. g_hash_table_remove(rf->connection_track_table, key);
  153. }
  154. }
  155. return 0;
  156. }
  157. /* handle tcp packet from secondary guest */
  158. static int handle_secondary_tcp_pkt(RewriterState *rf,
  159. Connection *conn,
  160. Packet *pkt, ConnectionKey *key)
  161. {
  162. struct tcp_hdr *tcp_pkt;
  163. tcp_pkt = (struct tcp_hdr *)pkt->transport_header;
  164. if (trace_event_get_state_backends(TRACE_COLO_FILTER_REWRITER_PKT_INFO)) {
  165. trace_colo_filter_rewriter_pkt_info(__func__,
  166. inet_ntoa(pkt->ip->ip_src), inet_ntoa(pkt->ip->ip_dst),
  167. ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
  168. tcp_pkt->th_flags);
  169. }
  170. if (trace_event_get_state_backends(
  171. TRACE_COLO_FILTER_REWRITER_CONN_OFFSET)) {
  172. trace_colo_filter_rewriter_conn_offset(conn->offset);
  173. }
  174. if (conn->tcp_state == TCPS_SYN_RECEIVED &&
  175. ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
  176. /*
  177. * save offset = secondary_seq and then
  178. * in handle_primary_tcp_pkt make offset
  179. * = secondary_seq - primary_seq
  180. */
  181. conn->offset = ntohl(tcp_pkt->th_seq);
  182. }
  183. /* VM active connect */
  184. if (conn->tcp_state == TCPS_CLOSED &&
  185. ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
  186. conn->tcp_state = TCPS_SYN_SENT;
  187. }
  188. if ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK) {
  189. /* Only need to adjust seq while offset is Non-zero */
  190. if (conn->offset) {
  191. /* handle packets to the primary from the secondary*/
  192. tcp_pkt->th_seq = htonl(ntohl(tcp_pkt->th_seq) - conn->offset);
  193. net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len,
  194. pkt->size - pkt->vnet_hdr_len, CSUM_TCP);
  195. }
  196. }
  197. /*
  198. * Passive close step 2:
  199. */
  200. if (conn->tcp_state == TCPS_CLOSE_WAIT &&
  201. (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) {
  202. conn->fin_ack_seq = ntohl(tcp_pkt->th_seq);
  203. conn->tcp_state = TCPS_LAST_ACK;
  204. }
  205. /*
  206. * Active close
  207. *
  208. * Step 1:
  209. * The *server* side of this connect is VM, *server* tries to close
  210. * the connection.
  211. *
  212. * Step 2:
  213. * We will into CLOSE_WAIT status.
  214. * We simplify the TCPS_FIN_WAIT_2, TCPS_TIME_WAIT and
  215. * CLOSING status.
  216. */
  217. if (conn->tcp_state == TCPS_ESTABLISHED &&
  218. (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == TH_FIN) {
  219. conn->tcp_state = TCPS_FIN_WAIT_1;
  220. }
  221. return 0;
  222. }
  223. static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
  224. NetClientState *sender,
  225. unsigned flags,
  226. const struct iovec *iov,
  227. int iovcnt,
  228. NetPacketSent *sent_cb)
  229. {
  230. RewriterState *s = FILTER_REWRITER(nf);
  231. Connection *conn;
  232. ConnectionKey key;
  233. Packet *pkt;
  234. ssize_t size = iov_size(iov, iovcnt);
  235. ssize_t vnet_hdr_len = 0;
  236. char *buf = g_malloc0(size);
  237. iov_to_buf(iov, iovcnt, 0, buf, size);
  238. if (s->vnet_hdr) {
  239. vnet_hdr_len = nf->netdev->vnet_hdr_len;
  240. }
  241. pkt = packet_new_nocopy(buf, size, vnet_hdr_len);
  242. /*
  243. * if we get tcp packet
  244. * we will rewrite it to make secondary guest's
  245. * connection established successfully
  246. */
  247. if (pkt && is_tcp_packet(pkt)) {
  248. fill_connection_key(pkt, &key, sender == nf->netdev);
  249. /* After failover we needn't change new TCP packet */
  250. if (s->failover_mode &&
  251. !connection_has_tracked(s->connection_track_table, &key)) {
  252. goto out;
  253. }
  254. conn = connection_get(s->connection_track_table,
  255. &key,
  256. NULL);
  257. if (sender == nf->netdev) {
  258. /* NET_FILTER_DIRECTION_TX */
  259. if (!handle_primary_tcp_pkt(s, conn, pkt, &key)) {
  260. qemu_net_queue_send(s->incoming_queue, sender, 0,
  261. (const uint8_t *)pkt->data, pkt->size, NULL);
  262. packet_destroy(pkt, NULL);
  263. pkt = NULL;
  264. /*
  265. * We block the packet here,after rewrite pkt
  266. * and will send it
  267. */
  268. return 1;
  269. }
  270. } else {
  271. /* NET_FILTER_DIRECTION_RX */
  272. if (!handle_secondary_tcp_pkt(s, conn, pkt, &key)) {
  273. qemu_net_queue_send(s->incoming_queue, sender, 0,
  274. (const uint8_t *)pkt->data, pkt->size, NULL);
  275. packet_destroy(pkt, NULL);
  276. pkt = NULL;
  277. /*
  278. * We block the packet here,after rewrite pkt
  279. * and will send it
  280. */
  281. return 1;
  282. }
  283. }
  284. }
  285. out:
  286. packet_destroy(pkt, NULL);
  287. pkt = NULL;
  288. return 0;
  289. }
  290. static void reset_seq_offset(gpointer key, gpointer value, gpointer user_data)
  291. {
  292. Connection *conn = (Connection *)value;
  293. conn->offset = 0;
  294. }
  295. static gboolean offset_is_nonzero(gpointer key,
  296. gpointer value,
  297. gpointer user_data)
  298. {
  299. Connection *conn = (Connection *)value;
  300. return conn->offset ? true : false;
  301. }
  302. static void colo_rewriter_handle_event(NetFilterState *nf, int event,
  303. Error **errp)
  304. {
  305. RewriterState *rs = FILTER_REWRITER(nf);
  306. switch (event) {
  307. case COLO_EVENT_CHECKPOINT:
  308. g_hash_table_foreach(rs->connection_track_table,
  309. reset_seq_offset, NULL);
  310. break;
  311. case COLO_EVENT_FAILOVER:
  312. if (!g_hash_table_find(rs->connection_track_table,
  313. offset_is_nonzero, NULL)) {
  314. filter_rewriter_failover_mode(rs);
  315. }
  316. break;
  317. default:
  318. break;
  319. }
  320. }
  321. static void colo_rewriter_cleanup(NetFilterState *nf)
  322. {
  323. RewriterState *s = FILTER_REWRITER(nf);
  324. /* flush packets */
  325. if (s->incoming_queue) {
  326. filter_rewriter_flush(nf);
  327. g_free(s->incoming_queue);
  328. }
  329. g_hash_table_destroy(s->connection_track_table);
  330. }
  331. static void colo_rewriter_setup(NetFilterState *nf, Error **errp)
  332. {
  333. RewriterState *s = FILTER_REWRITER(nf);
  334. s->connection_track_table = g_hash_table_new_full(connection_key_hash,
  335. connection_key_equal,
  336. g_free,
  337. NULL);
  338. s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
  339. }
  340. static bool filter_rewriter_get_vnet_hdr(Object *obj, Error **errp)
  341. {
  342. RewriterState *s = FILTER_REWRITER(obj);
  343. return s->vnet_hdr;
  344. }
  345. static void filter_rewriter_set_vnet_hdr(Object *obj,
  346. bool value,
  347. Error **errp)
  348. {
  349. RewriterState *s = FILTER_REWRITER(obj);
  350. s->vnet_hdr = value;
  351. }
  352. static void filter_rewriter_init(Object *obj)
  353. {
  354. RewriterState *s = FILTER_REWRITER(obj);
  355. s->vnet_hdr = false;
  356. s->failover_mode = FAILOVER_MODE_OFF;
  357. }
  358. static void colo_rewriter_class_init(ObjectClass *oc, void *data)
  359. {
  360. NetFilterClass *nfc = NETFILTER_CLASS(oc);
  361. object_class_property_add_bool(oc, "vnet_hdr_support",
  362. filter_rewriter_get_vnet_hdr,
  363. filter_rewriter_set_vnet_hdr);
  364. nfc->setup = colo_rewriter_setup;
  365. nfc->cleanup = colo_rewriter_cleanup;
  366. nfc->receive_iov = colo_rewriter_receive_iov;
  367. nfc->handle_event = colo_rewriter_handle_event;
  368. }
  369. static const TypeInfo colo_rewriter_info = {
  370. .name = TYPE_FILTER_REWRITER,
  371. .parent = TYPE_NETFILTER,
  372. .class_init = colo_rewriter_class_init,
  373. .instance_init = filter_rewriter_init,
  374. .instance_size = sizeof(RewriterState),
  375. };
  376. static void register_types(void)
  377. {
  378. type_register_static(&colo_rewriter_info);
  379. }
  380. type_init(register_types);