rdma_backend.c

/*
 * QEMU paravirtual RDMA - Generic RDMA backend
 *
 * Copyright (C) 2018 Oracle
 * Copyright (C) 2018 Red Hat Inc
 *
 * Authors:
 *     Yuval Shaia <yuval.shaia@oracle.com>
 *     Marcel Apfelbaum <marcel@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qapi/qapi-events-rdma.h"

#include <infiniband/verbs.h>

#include "contrib/rdmacm-mux/rdmacm-mux.h"
#include "trace.h"
#include "rdma_utils.h"
#include "rdma_rm.h"
#include "rdma_backend.h"

#define THR_NAME_LEN 16
#define THR_POLL_TO 5000

#define MAD_HDR_SIZE sizeof(struct ibv_grh)

typedef struct BackendCtx {
    void *up_ctx;
    struct ibv_sge sge; /* Used to save MAD recv buffer */
    RdmaBackendQP *backend_qp; /* To maintain recv buffers */
    RdmaBackendSRQ *backend_srq;
} BackendCtx;

struct backend_umad {
    struct ib_user_mad hdr;
    char mad[RDMA_MAX_PRIVATE_DATA];
};

static void (*comp_handler)(void *ctx, struct ibv_wc *wc);

static void dummy_comp_handler(void *ctx, struct ibv_wc *wc)
{
    rdma_error_report("No completion handler is registered");
}

static inline void complete_work(enum ibv_wc_status status, uint32_t vendor_err,
                                 void *ctx)
{
    struct ibv_wc wc = {};

    wc.status = status;
    wc.vendor_err = vendor_err;

    comp_handler(ctx, &wc);
}

static void free_cqe_ctx(gpointer data, gpointer user_data)
{
    BackendCtx *bctx;
    RdmaDeviceResources *rdma_dev_res = user_data;
    unsigned long cqe_ctx_id = GPOINTER_TO_INT(data);

    bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, cqe_ctx_id);
    if (bctx) {
        rdma_rm_dealloc_cqe_ctx(rdma_dev_res, cqe_ctx_id);
        atomic_dec(&rdma_dev_res->stats.missing_cqe);
    }
    g_free(bctx);
}

static void clean_recv_mads(RdmaBackendDev *backend_dev)
{
    unsigned long cqe_ctx_id;

    do {
        cqe_ctx_id = rdma_protected_qlist_pop_int64(&backend_dev->
                                                    recv_mads_list);
        if (cqe_ctx_id != -ENOENT) {
            atomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
            free_cqe_ctx(GINT_TO_POINTER(cqe_ctx_id),
                         backend_dev->rdma_dev_res);
        }
    } while (cqe_ctx_id != -ENOENT);
}
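
/*
 * Drain all available work completions from the given CQ: each completion is
 * handed to the registered completion handler, its CQE context is released,
 * and the missing_cqe/completions counters are adjusted accordingly.
 */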
static int rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
{
    int i, ne, total_ne = 0;
    BackendCtx *bctx;
    struct ibv_wc wc[2];
    RdmaProtectedGSList *cqe_ctx_list;

    qemu_mutex_lock(&rdma_dev_res->lock);
    do {
        ne = ibv_poll_cq(ibcq, ARRAY_SIZE(wc), wc);

        trace_rdma_poll_cq(ne, ibcq);

        for (i = 0; i < ne; i++) {
            bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id);
            if (unlikely(!bctx)) {
                rdma_error_report("No matching ctx for req %"PRId64,
                                  wc[i].wr_id);
                continue;
            }

            comp_handler(bctx->up_ctx, &wc[i]);

            if (bctx->backend_qp) {
                cqe_ctx_list = &bctx->backend_qp->cqe_ctx_list;
            } else {
                cqe_ctx_list = &bctx->backend_srq->cqe_ctx_list;
            }

            rdma_protected_gslist_remove_int32(cqe_ctx_list, wc[i].wr_id);
            rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
            g_free(bctx);
        }
        total_ne += ne;
    } while (ne > 0);
    atomic_sub(&rdma_dev_res->stats.missing_cqe, total_ne);
    qemu_mutex_unlock(&rdma_dev_res->lock);

    if (ne < 0) {
        rdma_error_report("ibv_poll_cq fail, rc=%d, errno=%d", ne, errno);
    }

    rdma_dev_res->stats.completions += total_ne;

    return total_ne;
}
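
/*
 * Completion-event thread: polls the (non-blocking) completion channel with a
 * timeout so it can notice a stop request, then re-arms notification on the
 * CQ that fired and drains it via rdma_poll_cq().
 */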
static void *comp_handler_thread(void *arg)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)arg;
    int rc;
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    int flags;
    GPollFD pfds[1];

    /* Change to non-blocking mode */
    flags = fcntl(backend_dev->channel->fd, F_GETFL);
    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
    if (rc < 0) {
        rdma_error_report("Failed to change backend channel FD to non-blocking");
        return NULL;
    }

    pfds[0].fd = backend_dev->channel->fd;
    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;

    backend_dev->comp_thread.is_running = true;

    while (backend_dev->comp_thread.run) {
        do {
            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
            if (!rc) {
                backend_dev->rdma_dev_res->stats.poll_cq_ppoll_to++;
            }
        } while (!rc && backend_dev->comp_thread.run);

        if (backend_dev->comp_thread.run) {
            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
            if (unlikely(rc)) {
                rdma_error_report("ibv_get_cq_event fail, rc=%d, errno=%d", rc,
                                  errno);
                continue;
            }

            rc = ibv_req_notify_cq(ev_cq, 0);
            if (unlikely(rc)) {
                rdma_error_report("ibv_req_notify_cq fail, rc=%d, errno=%d", rc,
                                  errno);
            }

            backend_dev->rdma_dev_res->stats.poll_cq_from_bk++;
            rdma_poll_cq(backend_dev->rdma_dev_res, ev_cq);

            ibv_ack_cq_events(ev_cq, 1);
        }
    }

    backend_dev->comp_thread.is_running = false;

    qemu_thread_exit(0);

    return NULL;
}

static inline void disable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    atomic_set(&backend_dev->rdmacm_mux.can_receive, 0);
}

static inline void enable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    atomic_set(&backend_dev->rdmacm_mux.can_receive, sizeof(RdmaCmMuxMsg));
}

static inline int rdmacm_mux_can_process_async(RdmaBackendDev *backend_dev)
{
    return atomic_read(&backend_dev->rdmacm_mux.can_receive);
}

static int rdmacm_mux_check_op_status(CharBackend *mad_chr_be)
{
    RdmaCmMuxMsg msg = {};
    int ret;

    ret = qemu_chr_fe_read_all(mad_chr_be, (uint8_t *)&msg, sizeof(msg));
    if (ret != sizeof(msg)) {
        rdma_error_report("Got invalid message from mux: size %d, expecting %d",
                          ret, (int)sizeof(msg));
        return -EIO;
    }

    trace_rdmacm_mux_check_op_status(msg.hdr.msg_type, msg.hdr.op_code,
                                     msg.hdr.err_code);

    if (msg.hdr.msg_type != RDMACM_MUX_MSG_TYPE_RESP) {
        rdma_error_report("Got invalid message type %d", msg.hdr.msg_type);
        return -EIO;
    }

    if (msg.hdr.err_code != RDMACM_MUX_ERR_CODE_OK) {
        rdma_error_report("Operation failed in mux, error code %d",
                          msg.hdr.err_code);
        return -EIO;
    }

    return 0;
}

static int rdmacm_mux_send(RdmaBackendDev *backend_dev, RdmaCmMuxMsg *msg)
{
    int rc = 0;

    msg->hdr.msg_type = RDMACM_MUX_MSG_TYPE_REQ;
    trace_rdmacm_mux("send", msg->hdr.msg_type, msg->hdr.op_code);
    disable_rdmacm_mux_async(backend_dev);
    rc = qemu_chr_fe_write(backend_dev->rdmacm_mux.chr_be,
                           (const uint8_t *)msg, sizeof(*msg));
    if (rc != sizeof(*msg)) {
        enable_rdmacm_mux_async(backend_dev);
        rdma_error_report("Failed to send request to rdmacm_mux (rc=%d)", rc);
        return -EIO;
    }

    rc = rdmacm_mux_check_op_status(backend_dev->rdmacm_mux.chr_be);
    if (rc) {
        rdma_error_report("Failed to execute rdmacm_mux request %d (rc=%d)",
                          msg->hdr.op_code, rc);
    }

    enable_rdmacm_mux_async(backend_dev);

    return 0;
}

static void stop_backend_thread(RdmaBackendThread *thread)
{
    thread->run = false;
    while (thread->is_running) {
        sleep(THR_POLL_TO / SCALE_US / 2);
    }
}

static void start_comp_thread(RdmaBackendDev *backend_dev)
{
    char thread_name[THR_NAME_LEN] = {};

    stop_backend_thread(&backend_dev->comp_thread);

    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
             ibv_get_device_name(backend_dev->ib_dev));
    backend_dev->comp_thread.run = true;
    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
}

void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
                                                        struct ibv_wc *wc))
{
    comp_handler = handler;
}

void rdma_backend_unregister_comp_handler(void)
{
    rdma_backend_register_comp_handler(dummy_comp_handler);
}

int rdma_backend_query_port(RdmaBackendDev *backend_dev,
                            struct ibv_port_attr *port_attr)
{
    int rc;

    rc = ibv_query_port(backend_dev->context, backend_dev->port_num, port_attr);
    if (rc) {
        rdma_error_report("ibv_query_port fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq)
{
    int polled;

    rdma_dev_res->stats.poll_cq_from_guest++;
    polled = rdma_poll_cq(rdma_dev_res, cq->ibcq);
    if (!polled) {
        rdma_dev_res->stats.poll_cq_from_guest_empty++;
    }
}

static GHashTable *ah_hash;
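
/*
 * Address handles are cached in ah_hash, keyed by destination GID, so a
 * repeated send to the same peer reuses the existing ibv_ah.
 */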
static struct ibv_ah *create_ah(RdmaBackendDev *backend_dev, struct ibv_pd *pd,
                                uint8_t sgid_idx, union ibv_gid *dgid)
{
    GBytes *ah_key = g_bytes_new(dgid, sizeof(*dgid));
    struct ibv_ah *ah = g_hash_table_lookup(ah_hash, ah_key);

    if (ah) {
        trace_rdma_create_ah_cache_hit(be64_to_cpu(dgid->global.subnet_prefix),
                                       be64_to_cpu(dgid->global.interface_id));
        g_bytes_unref(ah_key);
    } else {
        struct ibv_ah_attr ah_attr = {
            .is_global = 1,
            .port_num = backend_dev->port_num,
            .grh.hop_limit = 1,
        };

        ah_attr.grh.dgid = *dgid;
        ah_attr.grh.sgid_index = sgid_idx;

        ah = ibv_create_ah(pd, &ah_attr);
        if (ah) {
            g_hash_table_insert(ah_hash, ah_key, ah);
        } else {
            g_bytes_unref(ah_key);
            rdma_error_report("Failed to create AH for gid <0x%" PRIx64", 0x%"PRIx64">",
                              be64_to_cpu(dgid->global.subnet_prefix),
                              be64_to_cpu(dgid->global.interface_id));
        }

        trace_rdma_create_ah_cache_miss(be64_to_cpu(dgid->global.subnet_prefix),
                                        be64_to_cpu(dgid->global.interface_id));
    }

    return ah;
}

static void destroy_ah_hash_key(gpointer data)
{
    g_bytes_unref(data);
}

static void destroy_ah_hast_data(gpointer data)
{
    struct ibv_ah *ah = data;

    ibv_destroy_ah(ah);
}

static void ah_cache_init(void)
{
    ah_hash = g_hash_table_new_full(g_bytes_hash, g_bytes_equal,
                                    destroy_ah_hash_key, destroy_ah_hast_data);
}
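
/*
 * Translate a guest SGE list into a host SGE list: resolve each lkey to its
 * backend MR and rewrite address/lkey so the host HCA can access the memory.
 */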
static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
                                struct ibv_sge *dsge, struct ibv_sge *ssge,
                                uint8_t num_sge, uint64_t *total_length)
{
    RdmaRmMR *mr;
    int ssge_idx;

    for (ssge_idx = 0; ssge_idx < num_sge; ssge_idx++) {
        mr = rdma_rm_get_mr(rdma_dev_res, ssge[ssge_idx].lkey);
        if (unlikely(!mr)) {
            rdma_error_report("Invalid lkey 0x%x", ssge[ssge_idx].lkey);
            return VENDOR_ERR_INVLKEY | ssge[ssge_idx].lkey;
        }

#ifdef LEGACY_RDMA_REG_MR
        dsge->addr = (uintptr_t)mr->virt + ssge[ssge_idx].addr - mr->start;
#else
        dsge->addr = ssge[ssge_idx].addr;
#endif
        dsge->length = ssge[ssge_idx].length;
        dsge->lkey = rdma_backend_mr_lkey(&mr->backend_mr);

        *total_length += dsge->length;

        dsge++;
    }

    return 0;
}

static void trace_mad_message(const char *title, char *buf, int len)
{
    int i;
    char *b = g_malloc0(len * 3 + 1);
    char b1[4];

    for (i = 0; i < len; i++) {
        sprintf(b1, "%.2X ", buf[i] & 0x000000FF);
        strcat(b, b1);
    }

    trace_rdma_mad_message(title, len, b);

    g_free(b);
}
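
/*
 * Forward an outgoing MAD (header + payload SGEs) to the external rdmacm-mux
 * service over the chardev, since the GSI QP is not backed by a real ibv QP.
 */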
static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx,
                    union ibv_gid *sgid, struct ibv_sge *sge, uint32_t num_sge)
{
    RdmaCmMuxMsg msg = {};
    char *hdr, *data;
    int ret;

    if (num_sge != 2) {
        return -EINVAL;
    }

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_MAD;
    memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid));

    msg.umad_len = sge[0].length + sge[1].length;

    if (msg.umad_len > sizeof(msg.umad.mad)) {
        return -ENOMEM;
    }

    msg.umad.hdr.addr.qpn = htobe32(1);
    msg.umad.hdr.addr.grh_present = 1;
    msg.umad.hdr.addr.gid_index = sgid_idx;
    memcpy(msg.umad.hdr.addr.gid, sgid->raw, sizeof(msg.umad.hdr.addr.gid));
    msg.umad.hdr.addr.hop_limit = 0xFF;

    hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
    if (!hdr) {
        return -ENOMEM;
    }
    data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
    if (!data) {
        rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
        return -ENOMEM;
    }

    memcpy(&msg.umad.mad[0], hdr, sge[0].length);
    memcpy(&msg.umad.mad[sge[0].length], data, sge[1].length);

    rdma_pci_dma_unmap(backend_dev->dev, data, sge[1].length);
    rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);

    trace_mad_message("send", msg.umad.mad, msg.umad_len);

    ret = rdmacm_mux_send(backend_dev, &msg);
    if (ret) {
        rdma_error_report("Failed to send MAD to rdma_umadmux (%d)", ret);
        return -EIO;
    }

    return 0;
}
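
/*
 * Post a send work request on the backend QP. QP0 requests are rejected and
 * QP1 (GSI) traffic is diverted to mad_send(); for UD QPs an address handle
 * is created (or taken from the cache) before calling ibv_post_send().
 */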
void rdma_backend_post_send(RdmaBackendDev *backend_dev,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge,
                            uint8_t sgid_idx, union ibv_gid *sgid,
                            union ibv_gid *dgid, uint32_t dqpn, uint32_t dqkey,
                            void *ctx)
{
    BackendCtx *bctx;
    struct ibv_sge new_sge[MAX_SGE];
    uint32_t bctx_id;
    int rc;
    struct ibv_send_wr wr = {}, *bad_wr;

    if (!qp->ibqp) { /* This field is not initialized for QP0 and QP1 */
        if (qp_type == IBV_QPT_SMI) {
            rdma_error_report("Got QP0 request");
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        } else if (qp_type == IBV_QPT_GSI) {
            rc = mad_send(backend_dev, sgid_idx, sgid, sge, num_sge);
            if (rc) {
                complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
                backend_dev->rdma_dev_res->stats.mad_tx_err++;
            } else {
                complete_work(IBV_WC_SUCCESS, 0, ctx);
                backend_dev->rdma_dev_res->stats.mad_tx++;
            }
        }
        return;
    }

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->backend_qp = qp;

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto err_free_bctx;
    }

    rdma_protected_gslist_append_int32(&qp->cqe_ctx_list, bctx_id);

    rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, num_sge,
                              &backend_dev->rdma_dev_res->stats.tx_len);
    if (rc) {
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto err_dealloc_cqe_ctx;
    }

    if (qp_type == IBV_QPT_UD) {
        wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, sgid_idx, dgid);
        if (!wr.wr.ud.ah) {
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
            goto err_dealloc_cqe_ctx;
        }
        wr.wr.ud.remote_qpn = dqpn;
        wr.wr.ud.remote_qkey = dqkey;
    }

    wr.num_sge = num_sge;
    wr.opcode = IBV_WR_SEND;
    wr.send_flags = IBV_SEND_SIGNALED;
    wr.sg_list = new_sge;
    wr.wr_id = bctx_id;

    rc = ibv_post_send(qp->ibqp, &wr, &bad_wr);
    if (rc) {
        rdma_error_report("ibv_post_send fail, qpn=0x%x, rc=%d, errno=%d",
                          qp->ibqp->qp_num, rc, errno);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto err_dealloc_cqe_ctx;
    }

    atomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    backend_dev->rdma_dev_res->stats.tx++;

    return;

err_dealloc_cqe_ctx:
    backend_dev->rdma_dev_res->stats.tx_err++;
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);

err_free_bctx:
    g_free(bctx);
}
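
/*
 * GSI receive buffers are not posted to a real QP; instead the single SGE is
 * stashed on recv_mads_list until a MAD arrives from the rdmacm-mux.
 */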
static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
                                         struct ibv_sge *sge, uint32_t num_sge,
                                         void *ctx)
{
    BackendCtx *bctx;
    int rc;
    uint32_t bctx_id;

    if (num_sge != 1) {
        rdma_error_report("Invalid num_sge (%d), expecting 1", num_sge);
        return VENDOR_ERR_INV_NUM_SGE;
    }

    if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
        rdma_error_report("Too small buffer for MAD");
        return VENDOR_ERR_INV_MAD_BUFF;
    }

    bctx = g_malloc0(sizeof(*bctx));

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        g_free(bctx);
        return VENDOR_ERR_NOMEM;
    }

    bctx->up_ctx = ctx;
    bctx->sge = *sge;

    rdma_protected_qlist_append_int64(&backend_dev->recv_mads_list, bctx_id);

    return 0;
}

void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge, void *ctx)
{
    BackendCtx *bctx;
    struct ibv_sge new_sge[MAX_SGE];
    uint32_t bctx_id;
    int rc;
    struct ibv_recv_wr wr = {}, *bad_wr;

    if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
        if (qp_type == IBV_QPT_SMI) {
            rdma_error_report("Got QP0 request");
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        }
        if (qp_type == IBV_QPT_GSI) {
            rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx);
            if (rc) {
                complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
                backend_dev->rdma_dev_res->stats.mad_rx_bufs_err++;
            } else {
                backend_dev->rdma_dev_res->stats.mad_rx_bufs++;
            }
        }
        return;
    }

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->backend_qp = qp;

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto err_free_bctx;
    }

    rdma_protected_gslist_append_int32(&qp->cqe_ctx_list, bctx_id);

    rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, num_sge,
                              &backend_dev->rdma_dev_res->stats.rx_bufs_len);
    if (rc) {
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto err_dealloc_cqe_ctx;
    }

    wr.num_sge = num_sge;
    wr.sg_list = new_sge;
    wr.wr_id = bctx_id;
    rc = ibv_post_recv(qp->ibqp, &wr, &bad_wr);
    if (rc) {
        rdma_error_report("ibv_post_recv fail, qpn=0x%x, rc=%d, errno=%d",
                          qp->ibqp->qp_num, rc, errno);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto err_dealloc_cqe_ctx;
    }

    atomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    backend_dev->rdma_dev_res->stats.rx_bufs++;

    return;

err_dealloc_cqe_ctx:
    backend_dev->rdma_dev_res->stats.rx_bufs_err++;
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);

err_free_bctx:
    g_free(bctx);
}

void rdma_backend_post_srq_recv(RdmaBackendDev *backend_dev,
                                RdmaBackendSRQ *srq, struct ibv_sge *sge,
                                uint32_t num_sge, void *ctx)
{
    BackendCtx *bctx;
    struct ibv_sge new_sge[MAX_SGE];
    uint32_t bctx_id;
    int rc;
    struct ibv_recv_wr wr = {}, *bad_wr;

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->backend_srq = srq;

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto err_free_bctx;
    }

    rdma_protected_gslist_append_int32(&srq->cqe_ctx_list, bctx_id);

    rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, num_sge,
                              &backend_dev->rdma_dev_res->stats.rx_bufs_len);
    if (rc) {
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto err_dealloc_cqe_ctx;
    }

    wr.num_sge = num_sge;
    wr.sg_list = new_sge;
    wr.wr_id = bctx_id;
    rc = ibv_post_srq_recv(srq->ibsrq, &wr, &bad_wr);
    if (rc) {
        rdma_error_report("ibv_post_srq_recv fail, srqn=0x%x, rc=%d, errno=%d",
                          srq->ibsrq->handle, rc, errno);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto err_dealloc_cqe_ctx;
    }

    atomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    backend_dev->rdma_dev_res->stats.rx_bufs++;
    backend_dev->rdma_dev_res->stats.rx_srq++;

    return;

err_dealloc_cqe_ctx:
    backend_dev->rdma_dev_res->stats.rx_bufs_err++;
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);

err_free_bctx:
    g_free(bctx);
}

int rdma_backend_create_pd(RdmaBackendDev *backend_dev, RdmaBackendPD *pd)
{
    pd->ibpd = ibv_alloc_pd(backend_dev->context);

    if (!pd->ibpd) {
        rdma_error_report("ibv_alloc_pd fail, errno=%d", errno);
        return -EIO;
    }

    return 0;
}

void rdma_backend_destroy_pd(RdmaBackendPD *pd)
{
    if (pd->ibpd) {
        ibv_dealloc_pd(pd->ibpd);
    }
}

#ifdef LEGACY_RDMA_REG_MR
int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, void *addr,
                           size_t length, int access)
#else
int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, void *addr,
                           size_t length, uint64_t guest_start, int access)
#endif
{
#ifdef LEGACY_RDMA_REG_MR
    mr->ibmr = ibv_reg_mr(pd->ibpd, addr, length, access);
#else
    mr->ibmr = ibv_reg_mr_iova(pd->ibpd, addr, length, guest_start, access);
#endif
    if (!mr->ibmr) {
        rdma_error_report("ibv_reg_mr fail, errno=%d", errno);
        return -EIO;
    }

    mr->ibpd = pd->ibpd;

    return 0;
}

void rdma_backend_destroy_mr(RdmaBackendMR *mr)
{
    if (mr->ibmr) {
        ibv_dereg_mr(mr->ibmr);
    }
}

int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
                           int cqe)
{
    int rc;

    cq->ibcq = ibv_create_cq(backend_dev->context, cqe + 1, NULL,
                             backend_dev->channel, 0);
    if (!cq->ibcq) {
        rdma_error_report("ibv_create_cq fail, errno=%d", errno);
        return -EIO;
    }

    rc = ibv_req_notify_cq(cq->ibcq, 0);
    if (rc) {
        rdma_warn_report("ibv_req_notify_cq fail, rc=%d, errno=%d", rc, errno);
    }

    cq->backend_dev = backend_dev;

    return 0;
}

void rdma_backend_destroy_cq(RdmaBackendCQ *cq)
{
    if (cq->ibcq) {
        ibv_destroy_cq(cq->ibcq);
    }
}
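
/*
 * Create the backend ibv QP. GSI QPs are handled entirely in software, so for
 * IBV_QPT_GSI no host QP is created and qp->ibqp stays NULL.
 */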
int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
                           RdmaBackendPD *pd, RdmaBackendCQ *scq,
                           RdmaBackendCQ *rcq, RdmaBackendSRQ *srq,
                           uint32_t max_send_wr, uint32_t max_recv_wr,
                           uint32_t max_send_sge, uint32_t max_recv_sge)
{
    struct ibv_qp_init_attr attr = {};

    qp->ibqp = 0;

    switch (qp_type) {
    case IBV_QPT_GSI:
        return 0;

    case IBV_QPT_RC:
        /* fall through */
    case IBV_QPT_UD:
        /* do nothing */
        break;

    default:
        rdma_error_report("Unsupported QP type %d", qp_type);
        return -EIO;
    }

    attr.qp_type = qp_type;
    attr.send_cq = scq->ibcq;
    attr.recv_cq = rcq->ibcq;
    attr.cap.max_send_wr = max_send_wr;
    attr.cap.max_recv_wr = max_recv_wr;
    attr.cap.max_send_sge = max_send_sge;
    attr.cap.max_recv_sge = max_recv_sge;
    if (srq) {
        attr.srq = srq->ibsrq;
    }

    qp->ibqp = ibv_create_qp(pd->ibpd, &attr);
    if (!qp->ibqp) {
        rdma_error_report("ibv_create_qp fail, errno=%d", errno);
        return -EIO;
    }

    rdma_protected_gslist_init(&qp->cqe_ctx_list);

    qp->ibpd = pd->ibpd;

    /* TODO: Query QP to get max_inline_data and save it to be used in send */

    return 0;
}

int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
                               uint8_t qp_type, uint32_t qkey)
{
    struct ibv_qp_attr attr = {};
    int rc, attr_mask;

    attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
    attr.qp_state = IBV_QPS_INIT;
    attr.pkey_index = 0;
    attr.port_num = backend_dev->port_num;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr_mask |= IBV_QP_ACCESS_FLAGS;
        trace_rdma_backend_rc_qp_state_init(qp->ibqp->qp_num);
        break;

    case IBV_QPT_UD:
        attr.qkey = qkey;
        attr_mask |= IBV_QP_QKEY;
        trace_rdma_backend_ud_qp_state_init(qp->ibqp->qp_num, qkey);
        break;

    default:
        rdma_error_report("Unsupported QP type %d", qp_type);
        return -EIO;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
                              uint8_t qp_type, uint8_t sgid_idx,
                              union ibv_gid *dgid, uint32_t dqpn,
                              uint32_t rq_psn, uint32_t qkey, bool use_qkey)
{
    struct ibv_qp_attr attr = {};
    union ibv_gid ibv_gid = {
        .global.interface_id = dgid->global.interface_id,
        .global.subnet_prefix = dgid->global.subnet_prefix
    };
    int rc, attr_mask;

    attr.qp_state = IBV_QPS_RTR;
    attr_mask = IBV_QP_STATE;

    qp->sgid_idx = sgid_idx;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr.path_mtu = IBV_MTU_1024;
        attr.dest_qp_num = dqpn;
        attr.max_dest_rd_atomic = 1;
        attr.min_rnr_timer = 12;
        attr.ah_attr.port_num = backend_dev->port_num;
        attr.ah_attr.is_global = 1;
        attr.ah_attr.grh.hop_limit = 1;
        attr.ah_attr.grh.dgid = ibv_gid;
        attr.ah_attr.grh.sgid_index = qp->sgid_idx;
        attr.rq_psn = rq_psn;

        attr_mask |= IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
                     IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC |
                     IBV_QP_MIN_RNR_TIMER;

        trace_rdma_backend_rc_qp_state_rtr(qp->ibqp->qp_num,
                                           be64_to_cpu(ibv_gid.global.subnet_prefix),
                                           be64_to_cpu(ibv_gid.global.interface_id),
                                           qp->sgid_idx, dqpn, rq_psn);
        break;

    case IBV_QPT_UD:
        if (use_qkey) {
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        trace_rdma_backend_ud_qp_state_rtr(qp->ibqp->qp_num, use_qkey ? qkey :
                                           0);
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type,
                              uint32_t sq_psn, uint32_t qkey, bool use_qkey)
{
    struct ibv_qp_attr attr = {};
    int rc, attr_mask;

    attr.qp_state = IBV_QPS_RTS;
    attr.sq_psn = sq_psn;
    attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr.timeout = 14;
        attr.retry_cnt = 7;
        attr.rnr_retry = 7;
        attr.max_rd_atomic = 1;

        attr_mask |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
                     IBV_QP_MAX_QP_RD_ATOMIC;
        trace_rdma_backend_rc_qp_state_rts(qp->ibqp->qp_num, sq_psn);
        break;

    case IBV_QPT_UD:
        if (use_qkey) {
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        trace_rdma_backend_ud_qp_state_rts(qp->ibqp->qp_num, sq_psn,
                                           use_qkey ? qkey : 0);
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

int rdma_backend_query_qp(RdmaBackendQP *qp, struct ibv_qp_attr *attr,
                          int attr_mask, struct ibv_qp_init_attr *init_attr)
{
    if (!qp->ibqp) {
        attr->qp_state = IBV_QPS_RTS;
        return 0;
    }

    return ibv_query_qp(qp->ibqp, attr, attr_mask, init_attr);
}

void rdma_backend_destroy_qp(RdmaBackendQP *qp, RdmaDeviceResources *dev_res)
{
    if (qp->ibqp) {
        ibv_destroy_qp(qp->ibqp);
    }
    g_slist_foreach(qp->cqe_ctx_list.list, free_cqe_ctx, dev_res);
    rdma_protected_gslist_destroy(&qp->cqe_ctx_list);
}

int rdma_backend_create_srq(RdmaBackendSRQ *srq, RdmaBackendPD *pd,
                            uint32_t max_wr, uint32_t max_sge,
                            uint32_t srq_limit)
{
    struct ibv_srq_init_attr srq_init_attr = {};

    srq_init_attr.attr.max_wr = max_wr;
    srq_init_attr.attr.max_sge = max_sge;
    srq_init_attr.attr.srq_limit = srq_limit;

    srq->ibsrq = ibv_create_srq(pd->ibpd, &srq_init_attr);
    if (!srq->ibsrq) {
        rdma_error_report("ibv_create_srq failed, errno=%d", errno);
        return -EIO;
    }

    rdma_protected_gslist_init(&srq->cqe_ctx_list);

    return 0;
}

int rdma_backend_query_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr)
{
    if (!srq->ibsrq) {
        return -EINVAL;
    }

    return ibv_query_srq(srq->ibsrq, srq_attr);
}

int rdma_backend_modify_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr,
                            int srq_attr_mask)
{
    if (!srq->ibsrq) {
        return -EINVAL;
    }

    return ibv_modify_srq(srq->ibsrq, srq_attr, srq_attr_mask);
}

void rdma_backend_destroy_srq(RdmaBackendSRQ *srq, RdmaDeviceResources *dev_res)
{
    if (srq->ibsrq) {
        ibv_destroy_srq(srq->ibsrq);
    }
    g_slist_foreach(srq->cqe_ctx_list.list, free_cqe_ctx, dev_res);
    rdma_protected_gslist_destroy(&srq->cqe_ctx_list);
}
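
/*
 * Clamp a requested device attribute to the host device capability, warning
 * when the requested value had to be lowered.
 */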
#define CHK_ATTR(req, dev, member, fmt) ({ \
    trace_rdma_check_dev_attr(#member, dev.member, req->member); \
    if (req->member > dev.member) { \
        rdma_warn_report("%s = "fmt" is higher than host device capability "fmt, \
                         #member, req->member, dev.member); \
        req->member = dev.member; \
    } \
})

static int init_device_caps(RdmaBackendDev *backend_dev,
                            struct ibv_device_attr *dev_attr)
{
    struct ibv_device_attr bk_dev_attr;
    int rc;

    rc = ibv_query_device(backend_dev->context, &bk_dev_attr);
    if (rc) {
        rdma_error_report("ibv_query_device fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    dev_attr->max_sge = MAX_SGE;
    dev_attr->max_srq_sge = MAX_SGE;

    CHK_ATTR(dev_attr, bk_dev_attr, max_mr_size, "%" PRId64);
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_sge, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_cq, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_mr, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_pd, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_rd_atom, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_init_rd_atom, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_ah, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_srq, "%d");

    return 0;
}

static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
                                 union ibv_gid *my_gid, int paylen)
{
    grh->paylen = htons(paylen);
    grh->sgid = *sgid;
    grh->dgid = *my_gid;
}
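
/*
 * Deliver a MAD received from the rdmacm-mux into a guest buffer previously
 * queued on recv_mads_list, prepending a GRH before completing the work.
 */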
static void process_incoming_mad_req(RdmaBackendDev *backend_dev,
                                     RdmaCmMuxMsg *msg)
{
    unsigned long cqe_ctx_id;
    BackendCtx *bctx;
    char *mad;

    trace_mad_message("recv", msg->umad.mad, msg->umad_len);

    cqe_ctx_id = rdma_protected_qlist_pop_int64(&backend_dev->recv_mads_list);
    if (cqe_ctx_id == -ENOENT) {
        rdma_warn_report("No more free MADs buffers, waiting for a while");
        sleep(THR_POLL_TO);
        return;
    }

    bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
    if (unlikely(!bctx)) {
        rdma_error_report("No matching ctx for req %ld", cqe_ctx_id);
        backend_dev->rdma_dev_res->stats.mad_rx_err++;
        return;
    }

    mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr,
                           bctx->sge.length);
    if (!mad || bctx->sge.length < msg->umad_len + MAD_HDR_SIZE) {
        backend_dev->rdma_dev_res->stats.mad_rx_err++;
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF,
                      bctx->up_ctx);
    } else {
        struct ibv_wc wc = {};
        memset(mad, 0, bctx->sge.length);
        build_mad_hdr((struct ibv_grh *)mad,
                      (union ibv_gid *)&msg->umad.hdr.addr.gid, &msg->hdr.sgid,
                      msg->umad_len);
        memcpy(&mad[MAD_HDR_SIZE], msg->umad.mad, msg->umad_len);
        rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);

        wc.byte_len = msg->umad_len;
        wc.status = IBV_WC_SUCCESS;
        wc.wc_flags = IBV_WC_GRH;
        backend_dev->rdma_dev_res->stats.mad_rx++;
        comp_handler(bctx->up_ctx, &wc);
    }

    g_free(bctx);
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
}

static inline int rdmacm_mux_can_receive(void *opaque)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;

    return rdmacm_mux_can_process_async(backend_dev);
}

static void rdmacm_mux_read(void *opaque, const uint8_t *buf, int size)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
    RdmaCmMuxMsg *msg = (RdmaCmMuxMsg *)buf;

    trace_rdmacm_mux("read", msg->hdr.msg_type, msg->hdr.op_code);

    if (msg->hdr.msg_type != RDMACM_MUX_MSG_TYPE_REQ &&
        msg->hdr.op_code != RDMACM_MUX_OP_CODE_MAD) {
        rdma_error_report("Error: Not a MAD request, skipping");
        return;
    }
    process_incoming_mad_req(backend_dev, msg);
}

static int mad_init(RdmaBackendDev *backend_dev, CharBackend *mad_chr_be)
{
    int ret;

    backend_dev->rdmacm_mux.chr_be = mad_chr_be;

    ret = qemu_chr_fe_backend_connected(backend_dev->rdmacm_mux.chr_be);
    if (!ret) {
        rdma_error_report("Missing chardev for MAD multiplexer");
        return -EIO;
    }

    rdma_protected_qlist_init(&backend_dev->recv_mads_list);

    enable_rdmacm_mux_async(backend_dev);

    qemu_chr_fe_set_handlers(backend_dev->rdmacm_mux.chr_be,
                             rdmacm_mux_can_receive, rdmacm_mux_read, NULL,
                             NULL, backend_dev, NULL, true);

    return 0;
}

static void mad_stop(RdmaBackendDev *backend_dev)
{
    clean_recv_mads(backend_dev);
}

static void mad_fini(RdmaBackendDev *backend_dev)
{
    disable_rdmacm_mux_async(backend_dev);
    qemu_chr_fe_disconnect(backend_dev->rdmacm_mux.chr_be);
    rdma_protected_qlist_destroy(&backend_dev->recv_mads_list);
}
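
/* Scan the device's GID table for the given GID and return its index. */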
int rdma_backend_get_gid_index(RdmaBackendDev *backend_dev,
                               union ibv_gid *gid)
{
    union ibv_gid sgid;
    int ret;
    int i = 0;

    do {
        ret = ibv_query_gid(backend_dev->context, backend_dev->port_num, i,
                            &sgid);
        i++;
    } while (!ret && (memcmp(&sgid, gid, sizeof(*gid))));

    trace_rdma_backend_get_gid_index(be64_to_cpu(gid->global.subnet_prefix),
                                     be64_to_cpu(gid->global.interface_id),
                                     i - 1);

    return ret ? ret : i - 1;
}

int rdma_backend_add_gid(RdmaBackendDev *backend_dev, const char *ifname,
                         union ibv_gid *gid)
{
    RdmaCmMuxMsg msg = {};
    int ret;

    trace_rdma_backend_gid_change("add", be64_to_cpu(gid->global.subnet_prefix),
                                  be64_to_cpu(gid->global.interface_id));

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_REG;
    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));

    ret = rdmacm_mux_send(backend_dev, &msg);
    if (ret) {
        rdma_error_report("Failed to register GID to rdma_umadmux (%d)", ret);
        return -EIO;
    }

    qapi_event_send_rdma_gid_status_changed(ifname, true,
                                            gid->global.subnet_prefix,
                                            gid->global.interface_id);

    return ret;
}

int rdma_backend_del_gid(RdmaBackendDev *backend_dev, const char *ifname,
                         union ibv_gid *gid)
{
    RdmaCmMuxMsg msg = {};
    int ret;

    trace_rdma_backend_gid_change("del", be64_to_cpu(gid->global.subnet_prefix),
                                  be64_to_cpu(gid->global.interface_id));

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_UNREG;
    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));

    ret = rdmacm_mux_send(backend_dev, &msg);
    if (ret) {
        rdma_error_report("Failed to unregister GID from rdma_umadmux (%d)",
                          ret);
        return -EIO;
    }

    qapi_event_send_rdma_gid_status_changed(ifname, false,
                                            gid->global.subnet_prefix,
                                            gid->global.interface_id);

    return 0;
}
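
/*
 * Open the backend ibverbs device (by name, or the first one found), create
 * its completion channel, clamp the reported device capabilities and hook up
 * the MAD multiplexer chardev.
 */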
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
                      RdmaDeviceResources *rdma_dev_res,
                      const char *backend_device_name, uint8_t port_num,
                      struct ibv_device_attr *dev_attr, CharBackend *mad_chr_be)
{
    int i;
    int ret = 0;
    int num_ibv_devices;
    struct ibv_device **dev_list;

    memset(backend_dev, 0, sizeof(*backend_dev));

    backend_dev->dev = pdev;
    backend_dev->port_num = port_num;
    backend_dev->rdma_dev_res = rdma_dev_res;

    rdma_backend_register_comp_handler(dummy_comp_handler);

    dev_list = ibv_get_device_list(&num_ibv_devices);
    if (!dev_list) {
        rdma_error_report("Failed to get IB devices list");
        return -EIO;
    }
    if (num_ibv_devices == 0) {
        rdma_error_report("No IB devices were found");
        ret = -ENXIO;
        goto out_free_dev_list;
    }

    if (backend_device_name) {
        for (i = 0; dev_list[i]; ++i) {
            if (!strcmp(ibv_get_device_name(dev_list[i]),
                        backend_device_name)) {
                break;
            }
        }

        backend_dev->ib_dev = dev_list[i];
        if (!backend_dev->ib_dev) {
            rdma_error_report("Failed to find IB device %s",
                              backend_device_name);
            ret = -EIO;
            goto out_free_dev_list;
        }
    } else {
        backend_dev->ib_dev = *dev_list;
    }

    rdma_info_report("uverb device %s", backend_dev->ib_dev->dev_name);

    backend_dev->context = ibv_open_device(backend_dev->ib_dev);
    if (!backend_dev->context) {
        rdma_error_report("Failed to open IB device %s",
                          ibv_get_device_name(backend_dev->ib_dev));
        ret = -EIO;
        goto out;
    }

    backend_dev->channel = ibv_create_comp_channel(backend_dev->context);
    if (!backend_dev->channel) {
        rdma_error_report("Failed to create IB communication channel");
        ret = -EIO;
        goto out_close_device;
    }

    ret = init_device_caps(backend_dev, dev_attr);
    if (ret) {
        rdma_error_report("Failed to initialize device capabilities");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    ret = mad_init(backend_dev, mad_chr_be);
    if (ret) {
        rdma_error_report("Failed to initialize mad");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    backend_dev->comp_thread.run = false;
    backend_dev->comp_thread.is_running = false;

    ah_cache_init();

    goto out_free_dev_list;

out_destroy_comm_channel:
    ibv_destroy_comp_channel(backend_dev->channel);

out_close_device:
    ibv_close_device(backend_dev->context);

out_free_dev_list:
    ibv_free_device_list(dev_list);

out:
    return ret;
}

void rdma_backend_start(RdmaBackendDev *backend_dev)
{
    start_comp_thread(backend_dev);
}

void rdma_backend_stop(RdmaBackendDev *backend_dev)
{
    mad_stop(backend_dev);
    stop_backend_thread(&backend_dev->comp_thread);
}

void rdma_backend_fini(RdmaBackendDev *backend_dev)
{
    mad_fini(backend_dev);
    g_hash_table_destroy(ah_hash);
    ibv_destroy_comp_channel(backend_dev->channel);
    ibv_close_device(backend_dev->context);
}