rdma.c
/*
 * RDMA protocol and interfaces
 *
 * Copyright IBM, Corp. 2010-2013
 * Copyright Red Hat, Inc. 2015-2016
 *
 * Authors:
 *  Michael R. Hines <mrhines@us.ibm.com>
 *  Jiuxing Liu <jl@us.ibm.com>
 *  Daniel P. Berrange <berrange@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "rdma.h"
#include "migration.h"
#include "qemu-file.h"
#include "ram.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
#include "qemu/rcu.h"
#include "qemu/sockets.h"
#include "qemu/bitmap.h"
#include "qemu/coroutine.h"
#include "exec/memory.h"
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <rdma/rdma_cma.h>
#include "trace.h"
#include "qom/object.h"
#include <poll.h>

/*
 * Print an error on both the Monitor and the Log file.
 */
#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if (errp && (*(errp) == NULL)) { \
            error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)

#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Do not merge data if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */

/*
 * This is only for non-live state being migrated.
 * Instead of RDMA_WRITE messages, we use RDMA_SEND
 * messages for that state, which requires a different
 * delivery design than main memory.
 */
#define RDMA_SEND_INCREMENT 32768

/*
 * Maximum size infiniband SEND message
 */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1

/*
 * Capabilities for negotiation.
 */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/*
 * Add the other flags above to this list of known capabilities
 * as they are introduced.
 */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;

#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                error_report("RDMA is in an error state waiting migration" \
                             " to abort!"); \
                rdma->error_reported = 1; \
            } \
            return rdma->error_state; \
        } \
    } while (0)

/*
 * A work request ID is 64-bits and we split up these bits
 * into 3 parts:
 *
 * bits 0-15 : type of control message, 2^16
 * bits 16-29: ram block index, 2^14
 * bits 30-63: ram block chunk number, 2^34
 *
 * The last two bit ranges are only used for RDMA writes,
 * in order to track their completion and potentially
 * also track unregistration status of the message.
 */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
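/*
 * Illustrative example (not used by the code): with the layout above, a
 * work request ID for an RDMA write to ram block 3, chunk 5 would be
 * composed and decomposed as:
 *
 *   wrid  = RDMA_WRID_RDMA_WRITE |
 *           (3UL << RDMA_WRID_BLOCK_SHIFT) |
 *           (5UL << RDMA_WRID_CHUNK_SHIFT);
 *
 *   type  = wrid & RDMA_WRID_TYPE_MASK;                             -> 1
 *   block = (wrid & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT; -> 3
 *   chunk = (wrid & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT; -> 5
 */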
/*
 * RDMA migration protocol:
 * 1. RDMA Writes (data messages, i.e. RAM)
 * 2. IB Send/Recv (control channel messages)
 */
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};

static const char *wrid_desc[] = {
    [RDMA_WRID_NONE] = "NONE",
    [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
    [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
    [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
};

/*
 * Work request IDs for IB SEND messages only (not RDMA writes).
 * This is used by the migration protocol to transmit
 * control messages (such as device state and registration commands)
 *
 * We could use more WRs, but we have enough for now.
 */
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};

/*
 * SEND/RECV IB Control Messages.
 */
enum {
    RDMA_CONTROL_NONE = 0,
    RDMA_CONTROL_ERROR,
    RDMA_CONTROL_READY,               /* ready to receive */
    RDMA_CONTROL_QEMU_FILE,           /* QEMUFile-transmitted bytes */
    RDMA_CONTROL_RAM_BLOCKS_REQUEST,  /* RAMBlock synchronization */
    RDMA_CONTROL_RAM_BLOCKS_RESULT,   /* RAMBlock synchronization */
    RDMA_CONTROL_COMPRESS,            /* page contains repeat values */
    RDMA_CONTROL_REGISTER_REQUEST,    /* dynamic page registration */
    RDMA_CONTROL_REGISTER_RESULT,     /* key to use after registration */
    RDMA_CONTROL_REGISTER_FINISHED,   /* current iteration finished */
    RDMA_CONTROL_UNREGISTER_REQUEST,  /* dynamic UN-registration */
    RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
};

/*
 * Memory and MR structures used to represent an IB Send/Recv work request.
 * This is *not* used for RDMA writes, only IB Send/Recv.
 */
typedef struct {
    uint8_t control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
    struct ibv_mr *control_mr;                /* registration metadata */
    size_t control_len;                       /* length of the message */
    uint8_t *control_curr;                    /* start of unconsumed bytes */
} RDMAWorkRequestData;

/*
 * Negotiate RDMA capabilities during connection-setup time.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
} RDMACapabilities;

static void caps_to_network(RDMACapabilities *cap)
{
    cap->version = htonl(cap->version);
    cap->flags = htonl(cap->flags);
}

static void network_to_caps(RDMACapabilities *cap)
{
    cap->version = ntohl(cap->version);
    cap->flags = ntohl(cap->flags);
}
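/*
 * Illustrative example (not part of the code): a source that wants all
 * memory pinned up front would fill in
 *
 *   cap.version = RDMA_CONTROL_VERSION_CURRENT;
 *   cap.flags   = RDMA_CAPABILITY_PIN_ALL;
 *   caps_to_network(&cap);
 *
 * before sending the structure, and the peer would call network_to_caps()
 * on receipt and compare the flags against known_capabilities.
 */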
/*
 * Representation of a RAMBlock from an RDMA perspective.
 * This is not transmitted, only local.
 * This and subsequent structures cannot be linked lists
 * because we're using a single IB message to transmit
 * the information. It's small anyway, so a list is overkill.
 */
typedef struct RDMALocalBlock {
    char *block_name;
    uint8_t *local_host_addr;          /* local virtual address */
    uint64_t remote_host_addr;         /* remote virtual address */
    uint64_t offset;
    uint64_t length;
    struct ibv_mr **pmr;               /* MRs for chunk-level registration */
    struct ibv_mr *mr;                 /* MR for non-chunk-level registration */
    uint32_t *remote_keys;             /* rkeys for chunk-level registration */
    uint32_t remote_rkey;              /* rkeys for non-chunk-level registration */
    int index;                         /* which block are we */
    unsigned int src_index;            /* (Only used on dest) */
    bool is_ram_block;
    int nb_chunks;
    unsigned long *transit_bitmap;
    unsigned long *unregister_bitmap;
} RDMALocalBlock;

/*
 * Also represents a RAMblock, but only on the dest.
 * This gets transmitted by the dest during connection-time
 * to the source VM and then is used to populate the
 * corresponding RDMALocalBlock with
 * the information needed to perform the actual RDMA.
 */
typedef struct QEMU_PACKED RDMADestBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;
} RDMADestBlock;

static const char *control_desc(unsigned int rdma_control)
{
    static const char *strs[] = {
        [RDMA_CONTROL_NONE] = "NONE",
        [RDMA_CONTROL_ERROR] = "ERROR",
        [RDMA_CONTROL_READY] = "READY",
        [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
        [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
        [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
        [RDMA_CONTROL_COMPRESS] = "COMPRESS",
        [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
        [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
        [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
        [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
        [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
    };

    if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
        return "??BAD CONTROL VALUE??";
    }

    return strs[rdma_control];
}

static uint64_t htonll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.lv[0] = htonl(v >> 32);
    u.lv[1] = htonl(v & 0xFFFFFFFFULL);
    return u.llv;
}

static uint64_t ntohll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.llv = v;
    return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
}
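/*
 * Illustrative note: htonll()/ntohll() above convert 64-bit values to and
 * from network byte order 32 bits at a time.  For example, on a
 * little-endian host, htonll(0x0102030405060708ULL) returns a value whose
 * in-memory bytes are 01 02 03 04 05 06 07 08 (most significant byte
 * first), and ntohll() reverses that conversion on receipt.
 */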
static void dest_block_to_network(RDMADestBlock *db)
{
    db->remote_host_addr = htonll(db->remote_host_addr);
    db->offset = htonll(db->offset);
    db->length = htonll(db->length);
    db->remote_rkey = htonl(db->remote_rkey);
}

static void network_to_dest_block(RDMADestBlock *db)
{
    db->remote_host_addr = ntohll(db->remote_host_addr);
    db->offset = ntohll(db->offset);
    db->length = ntohll(db->length);
    db->remote_rkey = ntohl(db->remote_rkey);
}

/*
 * Virtual address of the above structures used for transmitting
 * the RAMBlock descriptions at connection-time.
 * This structure is *not* transmitted.
 */
typedef struct RDMALocalBlocks {
    int nb_blocks;
    bool init;             /* main memory init complete */
    RDMALocalBlock *block;
} RDMALocalBlocks;

/*
 * Main data structure for RDMA state.
 * While there is only one copy of this structure being allocated right now,
 * this is the place where one would start if you wanted to consider
 * having more than one RDMA connection open at the same time.
 */
typedef struct RDMAContext {
    char *host;
    int port;
    char *host_port;

    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    /*
     * This is used by *_exchange_send() to figure out whether or not
     * the initial "READY" message has already been received or not.
     * This is because other functions may potentially poll() and detect
     * the READY message before send() does, in which case we need to
     * know if it completed.
     */
    int control_ready_expected;

    /* number of outstanding writes */
    int nb_sent;

    /* store info about current buffer so that we can
       merge it with future sends */
    uint64_t current_addr;
    uint64_t current_length;
    /* index of ram block the current buffer belongs to */
    int current_index;
    /* index of the chunk in the current ram block */
    int current_chunk;

    bool pin_all;

    /*
     * infiniband-specific variables for opening the device
     * and maintaining connection state and so forth.
     *
     * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
     * cm_id->verbs, cm_id->channel, and cm_id->qp.
     */
    struct rdma_cm_id *cm_id;               /* connection manager ID */
    struct rdma_cm_id *listen_id;
    bool connected;

    struct ibv_context *verbs;
    struct rdma_event_channel *channel;
    struct ibv_qp *qp;                      /* queue pair */
    struct ibv_comp_channel *recv_comp_channel; /* recv completion channel */
    struct ibv_comp_channel *send_comp_channel; /* send completion channel */
    struct ibv_pd *pd;                      /* protection domain */
    struct ibv_cq *recv_cq;                 /* receive completion queue */
    struct ibv_cq *send_cq;                 /* send completion queue */

    /*
     * If a previous write failed (perhaps because of a failed
     * memory registration), then do not attempt any future work
     * and remember the error state.
     */
    int error_state;
    int error_reported;
    int received_error;

    /*
     * Description of ram blocks used throughout the code.
     */
    RDMALocalBlocks local_ram_blocks;
    RDMADestBlock *dest_blocks;

    /* Index of the next RAMBlock received during block registration */
    unsigned int next_src_index;

    /*
     * Migration on *destination* started.
     * Then use coroutine yield function.
     * Source runs in a thread, so we don't care.
     */
    int migration_started_on_destination;

    int total_registrations;
    int total_writes;

    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    GHashTable *blockmap;

    /* the RDMAContext for return path */
    struct RDMAContext *return_path;
    bool is_return_path;
} RDMAContext;

#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)

struct QIOChannelRDMA {
    QIOChannel parent;
    RDMAContext *rdmain;
    RDMAContext *rdmaout;
    QEMUFile *file;
    bool blocking; /* XXX we don't actually honour this yet */
};

/*
 * Main structure for IB Send/Recv control messages.
 * This gets prepended at the beginning of every Send/Recv.
 */
typedef struct QEMU_PACKED {
    uint32_t len;     /* Total length of data portion */
    uint32_t type;    /* which control command to perform */
    uint32_t repeat;  /* number of commands in data portion of same type */
    uint32_t padding;
} RDMAControlHeader;

static void control_to_network(RDMAControlHeader *control)
{
    control->type = htonl(control->type);
    control->len = htonl(control->len);
    control->repeat = htonl(control->repeat);
}

static void network_to_control(RDMAControlHeader *control)
{
    control->type = ntohl(control->type);
    control->len = ntohl(control->len);
    control->repeat = ntohl(control->repeat);
}

/*
 * Register a single Chunk.
 * Information sent by the source VM to inform the dest
 * to register a single chunk of memory before we can perform
 * the actual RDMA operation.
 */
typedef struct QEMU_PACKED {
    union QEMU_PACKED {
        uint64_t current_addr;  /* offset into the ram_addr_t space */
        uint64_t chunk;         /* chunk to lookup if unregistering */
    } key;
    uint32_t current_index; /* which ramblock the chunk belongs to */
    uint32_t padding;
    uint64_t chunks;        /* how many sequential chunks to register */
} RDMARegister;

static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
{
    RDMALocalBlock *local_block;
    local_block = &rdma->local_ram_blocks.block[reg->current_index];

    if (local_block->is_ram_block) {
        /*
         * current_addr as passed in is an address in the local ram_addr_t
         * space, we need to translate this for the destination
         */
        reg->key.current_addr -= local_block->offset;
        reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
    }
    reg->key.current_addr = htonll(reg->key.current_addr);
    reg->current_index = htonl(reg->current_index);
    reg->chunks = htonll(reg->chunks);
}

static void network_to_register(RDMARegister *reg)
{
    reg->key.current_addr = ntohll(reg->key.current_addr);
    reg->current_index = ntohl(reg->current_index);
    reg->chunks = ntohll(reg->chunks);
}

typedef struct QEMU_PACKED {
    uint32_t value;     /* if zero, we will madvise() */
    uint32_t block_idx; /* which ram block index */
    uint64_t offset;    /* Address in remote ram_addr_t space */
    uint64_t length;    /* length of the chunk */
} RDMACompress;

static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
{
    comp->value = htonl(comp->value);
    /*
     * comp->offset as passed in is an address in the local ram_addr_t
     * space, we need to translate this for the destination
     */
    comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
    comp->offset += rdma->dest_blocks[comp->block_idx].offset;
    comp->block_idx = htonl(comp->block_idx);
    comp->offset = htonll(comp->offset);
    comp->length = htonll(comp->length);
}

static void network_to_compress(RDMACompress *comp)
{
    comp->value = ntohl(comp->value);
    comp->block_idx = ntohl(comp->block_idx);
    comp->offset = ntohll(comp->offset);
    comp->length = ntohll(comp->length);
}

/*
 * The result of the dest's memory registration produces an "rkey"
 * which the source VM must reference in order to perform
 * the RDMA operation.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;

static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
};

static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
};

const char *print_wrid(int wrid);
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma));

static inline uint64_t ram_chunk_index(const uint8_t *start,
                                       const uint8_t *host)
{
    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
}

static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
                                       uint64_t i)
{
    return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
                                  (i << RDMA_REG_CHUNK_SHIFT));
}

static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
                                     uint64_t i)
{
    uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
                      (1UL << RDMA_REG_CHUNK_SHIFT);

    if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
        result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
    }

    return result;
}
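/*
 * Illustrative example (not used by the code): with RDMA_REG_CHUNK_SHIFT
 * of 20, chunks are 1 MB each.  A host address 3 MB + 4 KB past
 * local_host_addr therefore falls in chunk index 3; ram_chunk_start()
 * returns local_host_addr + 3 MB for it and ram_chunk_end() returns
 * local_host_addr + 4 MB, clamped to the end of the block if the block
 * is shorter than that.
 */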
static int rdma_add_block(RDMAContext *rdma, const char *block_name,
                          void *host_addr,
                          ram_addr_t block_offset, uint64_t length)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block;
    RDMALocalBlock *old = local->block;

    local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);

    if (local->nb_blocks) {
        int x;

        if (rdma->blockmap) {
            for (x = 0; x < local->nb_blocks; x++) {
                g_hash_table_remove(rdma->blockmap,
                                    (void *)(uintptr_t)old[x].offset);
                g_hash_table_insert(rdma->blockmap,
                                    (void *)(uintptr_t)old[x].offset,
                                    &local->block[x]);
            }
        }
        memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
        g_free(old);
    }

    block = &local->block[local->nb_blocks];

    block->block_name = g_strdup(block_name);
    block->local_host_addr = host_addr;
    block->offset = block_offset;
    block->length = length;
    block->index = local->nb_blocks;
    block->src_index = ~0U; /* Filled in by the receipt of the block list */
    block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
    block->transit_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
    block->unregister_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
    block->remote_keys = g_new0(uint32_t, block->nb_chunks);

    block->is_ram_block = local->init ? false : true;

    if (rdma->blockmap) {
        g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
    }

    trace_rdma_add_block(block_name, local->nb_blocks,
                         (uintptr_t) block->local_host_addr,
                         block->offset, block->length,
                         (uintptr_t) (block->local_host_addr + block->length),
                         BITS_TO_LONGS(block->nb_chunks) *
                             sizeof(unsigned long) * 8,
                         block->nb_chunks);

    local->nb_blocks++;

    return 0;
}

/*
 * Memory regions need to be registered with the device and queue pairs setup
 * in advance before the migration starts. This tells us where the RAM blocks
 * are so that we can register them individually.
 */
static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t block_offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
}

/*
 * Identify the RAMBlocks and their quantity. They will be used to
 * identify chunk boundaries inside each RAMBlock and also be referenced
 * during dynamic page registration.
 */
static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    int ret;

    assert(rdma->blockmap == NULL);
    memset(local, 0, sizeof *local);
    ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
    if (ret) {
        return ret;
    }
    trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
    rdma->dest_blocks = g_new0(RDMADestBlock,
                               rdma->local_ram_blocks.nb_blocks);
    local->init = true;
    return 0;
}

/*
 * Note: If used outside of cleanup, the caller must ensure that the destination
 * block structures are also updated
 */
static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *old = local->block;
    int x;

    if (rdma->blockmap) {
        g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
    }
    if (block->pmr) {
        int j;

        for (j = 0; j < block->nb_chunks; j++) {
            if (!block->pmr[j]) {
                continue;
            }
            ibv_dereg_mr(block->pmr[j]);
            rdma->total_registrations--;
        }
        g_free(block->pmr);
        block->pmr = NULL;
    }

    if (block->mr) {
        ibv_dereg_mr(block->mr);
        rdma->total_registrations--;
        block->mr = NULL;
    }

    g_free(block->transit_bitmap);
    block->transit_bitmap = NULL;

    g_free(block->unregister_bitmap);
    block->unregister_bitmap = NULL;

    g_free(block->remote_keys);
    block->remote_keys = NULL;

    g_free(block->block_name);
    block->block_name = NULL;

    if (rdma->blockmap) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_remove(rdma->blockmap,
                                (void *)(uintptr_t)old[x].offset);
        }
    }

    if (local->nb_blocks > 1) {

        local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);

        if (block->index) {
            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
        }

        if (block->index < (local->nb_blocks - 1)) {
            memcpy(local->block + block->index, old + (block->index + 1),
                   sizeof(RDMALocalBlock) *
                       (local->nb_blocks - (block->index + 1)));
            for (x = block->index; x < local->nb_blocks - 1; x++) {
                local->block[x].index--;
            }
        }
    } else {
        assert(block == local->block);
        local->block = NULL;
    }

    trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
                            block->offset, block->length,
                            (uintptr_t)(block->local_host_addr + block->length),
                            BITS_TO_LONGS(block->nb_chunks) *
                                sizeof(unsigned long) * 8, block->nb_chunks);

    g_free(old);

    local->nb_blocks--;

    if (local->nb_blocks && rdma->blockmap) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap,
                                (void *)(uintptr_t)local->block[x].offset,
                                &local->block[x]);
        }
    }

    return 0;
}
/*
 * Put in the log file which RDMA device was opened and the details
 * associated with that device.
 */
static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
{
    struct ibv_port_attr port;

    if (ibv_query_port(verbs, 1, &port)) {
        error_report("Failed to query port information");
        return;
    }

    printf("%s RDMA Device opened: kernel name %s "
           "uverbs device name %s, "
           "infiniband_verbs class device path %s, "
           "infiniband class device path %s, "
           "transport: (%d) %s\n",
                who,
                verbs->device->name,
                verbs->device->dev_name,
                verbs->device->dev_path,
                verbs->device->ibdev_path,
                port.link_layer,
                (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
                 ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
                    ? "Ethernet" : "Unknown"));
}

/*
 * Put in the log file the RDMA gid addressing information,
 * useful for folks who have trouble understanding the
 * RDMA device hierarchy in the kernel.
 */
static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
{
    char sgid[33];
    char dgid[33];
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
    trace_qemu_rdma_dump_gid(who, sgid, dgid);
}

/*
 * As of now, IPv6 over RoCE / iWARP is not supported by Linux.
 * We will try the next addrinfo struct, and fail if there are
 * no other valid addresses to bind against.
 *
 * If the user is listening on '[::]', then we will not have opened a device
 * yet and have no way of verifying if the device is RoCE or not.
 *
 * In this case, the source VM will throw an error for ALL types of
 * connections (both IPv4 and IPv6) if the destination machine does not have
 * a regular infiniband network available for use.
 *
 * The only way to guarantee that an error is thrown for broken kernels is
 * for the management software to choose a *specific* interface at bind time
 * and validate what type of hardware it is.
 *
 * Unfortunately, this puts the user in a fix:
 *
 * If the source VM connects with an IPv4 address without knowing that the
 * destination has bound to '[::]' the migration will unconditionally fail
 * unless the management software is explicitly listening on the IPv4
 * address while using a RoCE-based device.
 *
 * If the source VM connects with an IPv6 address, then we're OK because we can
 * throw an error on the source (and similarly on the destination).
 *
 * But in mixed environments, this will be broken for a while until it is fixed
 * inside Linux.
 *
 * We do provide a *tiny* bit of help in this function: We can list all of the
 * devices in the system and check to see if all the devices are RoCE or
 * Infiniband.
 *
 * If we detect that we have a *pure* RoCE environment, then we can safely
 * throw an error even if the management software has specified '[::]' as the
 * bind address.
 *
 * However, if there are multiple heterogeneous devices, then we cannot make
 * this assumption and the user just has to be sure they know what they are
 * doing.
 *
 * Patches are being reviewed on linux-rdma.
 */
static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
{
    /* This bug only exists in Linux, to our knowledge. */
#ifdef CONFIG_LINUX
    struct ibv_port_attr port_attr;

    /*
     * Verbs are only NULL if management has bound to '[::]'.
     *
     * Let's iterate through all the devices and see if there are any pure IB
     * devices (non-ethernet).
     *
     * If not, then we can safely proceed with the migration.
     * Otherwise, there are no guarantees until the bug is fixed in Linux.
     */
    if (!verbs) {
        int num_devices, x;
        struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
        bool roce_found = false;
        bool ib_found = false;

        for (x = 0; x < num_devices; x++) {
            verbs = ibv_open_device(dev_list[x]);
            if (!verbs) {
                if (errno == EPERM) {
                    continue;
                } else {
                    return -EINVAL;
                }
            }

            if (ibv_query_port(verbs, 1, &port_attr)) {
                ibv_close_device(verbs);
                ERROR(errp, "Could not query initial IB port");
                return -EINVAL;
            }

            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
                ib_found = true;
            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
                roce_found = true;
            }

            ibv_close_device(verbs);
        }

        if (roce_found) {
            if (ib_found) {
                fprintf(stderr, "WARN: migrations may fail:"
                                " IPv6 over RoCE / iWARP in linux"
                                " is broken. But since you appear to have a"
                                " mixed RoCE / IB environment, be sure to only"
                                " migrate over the IB fabric until the kernel "
                                " fixes the bug.\n");
            } else {
                ERROR(errp, "You only have RoCE / iWARP devices in your systems"
                            " and your management software has specified '[::]'"
                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
                return -ENONET;
            }
        }

        return 0;
    }

    /*
     * If we have a verbs context, that means that something other than '[::]'
     * was used by the management software for binding. In which case we can
     * actually warn the user about a potentially broken kernel.
     */

    /* IB ports start with 1, not 0 */
    if (ibv_query_port(verbs, 1, &port_attr)) {
        ERROR(errp, "Could not query initial IB port");
        return -EINVAL;
    }

    if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
        ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
                    "(but patches on linux-rdma in progress)");
        return -ENONET;
    }

#endif

    return 0;
}
/*
 * Figure out which RDMA device corresponds to the requested IP hostname.
 * Also create the initial connection manager identifiers for opening
 * the connection.
 */
static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
{
    int ret;
    struct rdma_addrinfo *res;
    char port_str[16];
    struct rdma_cm_event *cm_event;
    char ip[40] = "unknown";
    struct rdma_addrinfo *e;

    if (rdma->host == NULL || !strcmp(rdma->host, "")) {
        ERROR(errp, "RDMA hostname has not been set");
        return -EINVAL;
    }

    /* create CM channel */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create CM channel");
        return -EINVAL;
    }

    /* create CM id */
    ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create channel id");
        goto err_resolve_create_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret < 0) {
        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
        goto err_resolve_get_addr;
    }

    for (e = res; e != NULL; e = e->ai_next) {
        inet_ntop(e->ai_family,
            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
        trace_qemu_rdma_resolve_host_trying(rdma->host, ip);

        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
                RDMA_RESOLVE_TIMEOUT_MS);
        if (!ret) {
            if (e->ai_family == AF_INET6) {
                ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, errp);
                if (ret) {
                    continue;
                }
            }
            goto route;
        }
    }

    rdma_freeaddrinfo(res);
    ERROR(errp, "could not resolve address %s", rdma->host);
    goto err_resolve_get_addr;

route:
    rdma_freeaddrinfo(res);
    qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_addr_resolved");
        goto err_resolve_get_addr;
    }

    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
        ERROR(errp, "result not equal to event_addr_resolved %s",
                rdma_event_str(cm_event->event));
        error_report("rdma_resolve_addr");
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);

    /* resolve route */
    ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
    if (ret) {
        ERROR(errp, "could not resolve rdma route");
        goto err_resolve_get_addr;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_route_resolved");
        goto err_resolve_get_addr;
    }
    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
        ERROR(errp, "result not equal to event_route_resolved: %s",
                rdma_event_str(cm_event->event));
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);
    rdma->verbs = rdma->cm_id->verbs;
    qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
    qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
    return 0;

err_resolve_get_addr:
    rdma_destroy_id(rdma->cm_id);
    rdma->cm_id = NULL;
err_resolve_create_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    return ret;
}

/*
 * Create protection domain and completion queues
 */
static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
{
    /* allocate pd */
    rdma->pd = ibv_alloc_pd(rdma->verbs);
    if (!rdma->pd) {
        error_report("failed to allocate protection domain");
        return -1;
    }

    /* create receive completion channel */
    rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->recv_comp_channel) {
        error_report("failed to allocate receive completion channel");
        goto err_alloc_pd_cq;
    }

    /*
     * Completion queue can be filled by read work requests.
     */
    rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
                                  NULL, rdma->recv_comp_channel, 0);
    if (!rdma->recv_cq) {
        error_report("failed to allocate receive completion queue");
        goto err_alloc_pd_cq;
    }

    /* create send completion channel */
    rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->send_comp_channel) {
        error_report("failed to allocate send completion channel");
        goto err_alloc_pd_cq;
    }

    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
                                  NULL, rdma->send_comp_channel, 0);
    if (!rdma->send_cq) {
        error_report("failed to allocate send completion queue");
        goto err_alloc_pd_cq;
    }

    return 0;

err_alloc_pd_cq:
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
    }
    if (rdma->recv_comp_channel) {
        ibv_destroy_comp_channel(rdma->recv_comp_channel);
    }
    if (rdma->send_comp_channel) {
        ibv_destroy_comp_channel(rdma->send_comp_channel);
    }
    if (rdma->recv_cq) {
        ibv_destroy_cq(rdma->recv_cq);
        rdma->recv_cq = NULL;
    }
    rdma->pd = NULL;
    rdma->recv_comp_channel = NULL;
    rdma->send_comp_channel = NULL;
    return -1;
}

/*
 * Create queue pairs.
 */
static int qemu_rdma_alloc_qp(RDMAContext *rdma)
{
    struct ibv_qp_init_attr attr = { 0 };
    int ret;

    attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
    attr.cap.max_recv_wr = 3;
    attr.cap.max_send_sge = 1;
    attr.cap.max_recv_sge = 1;
    attr.send_cq = rdma->send_cq;
    attr.recv_cq = rdma->recv_cq;
    attr.qp_type = IBV_QPT_RC;

    ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
    if (ret) {
        return -1;
    }

    rdma->qp = rdma->cm_id->qp;
    return 0;
}
  992. /* Check whether On-Demand Paging is supported by RDAM device */
  993. static bool rdma_support_odp(struct ibv_context *dev)
  994. {
  995. struct ibv_device_attr_ex attr = {0};
  996. int ret = ibv_query_device_ex(dev, NULL, &attr);
  997. if (ret) {
  998. return false;
  999. }
  1000. if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
  1001. return true;
  1002. }
  1003. return false;
  1004. }
  1005. /*
  1006. * ibv_advise_mr to avoid RNR NAK error as far as possible.
  1007. * The responder mr registering with ODP will sent RNR NAK back to
  1008. * the requester in the face of the page fault.
  1009. */
  1010. static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
  1011. uint32_t len, uint32_t lkey,
  1012. const char *name, bool wr)
  1013. {
  1014. #ifdef HAVE_IBV_ADVISE_MR
  1015. int ret;
  1016. int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
  1017. IBV_ADVISE_MR_ADVICE_PREFETCH;
  1018. struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len};
  1019. ret = ibv_advise_mr(pd, advice,
  1020. IBV_ADVISE_MR_FLAG_FLUSH, &sg_list, 1);
  1021. /* ignore the error */
  1022. if (ret) {
  1023. trace_qemu_rdma_advise_mr(name, len, addr, strerror(errno));
  1024. } else {
  1025. trace_qemu_rdma_advise_mr(name, len, addr, "successed");
  1026. }
  1027. #endif
  1028. }
  1029. static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
  1030. {
  1031. int i;
  1032. RDMALocalBlocks *local = &rdma->local_ram_blocks;
  1033. for (i = 0; i < local->nb_blocks; i++) {
  1034. int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
  1035. local->block[i].mr =
  1036. ibv_reg_mr(rdma->pd,
  1037. local->block[i].local_host_addr,
  1038. local->block[i].length, access
  1039. );
  1040. if (!local->block[i].mr &&
  1041. errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
  1042. access |= IBV_ACCESS_ON_DEMAND;
  1043. /* register ODP mr */
  1044. local->block[i].mr =
  1045. ibv_reg_mr(rdma->pd,
  1046. local->block[i].local_host_addr,
  1047. local->block[i].length, access);
  1048. trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
  1049. if (local->block[i].mr) {
  1050. qemu_rdma_advise_prefetch_mr(rdma->pd,
  1051. (uintptr_t)local->block[i].local_host_addr,
  1052. local->block[i].length,
  1053. local->block[i].mr->lkey,
  1054. local->block[i].block_name,
  1055. true);
  1056. }
  1057. }
  1058. if (!local->block[i].mr) {
  1059. perror("Failed to register local dest ram block!");
  1060. break;
  1061. }
  1062. rdma->total_registrations++;
  1063. }
  1064. if (i >= local->nb_blocks) {
  1065. return 0;
  1066. }
  1067. for (i--; i >= 0; i--) {
  1068. ibv_dereg_mr(local->block[i].mr);
  1069. local->block[i].mr = NULL;
  1070. rdma->total_registrations--;
  1071. }
  1072. return -1;
  1073. }
  1074. /*
  1075. * Find the ram block that corresponds to the page requested to be
  1076. * transmitted by QEMU.
  1077. *
  1078. * Once the block is found, also identify which 'chunk' within that
  1079. * block that the page belongs to.
  1080. *
  1081. * This search cannot fail or the migration will fail.
  1082. */
  1083. static int qemu_rdma_search_ram_block(RDMAContext *rdma,
  1084. uintptr_t block_offset,
  1085. uint64_t offset,
  1086. uint64_t length,
  1087. uint64_t *block_index,
  1088. uint64_t *chunk_index)
  1089. {
  1090. uint64_t current_addr = block_offset + offset;
  1091. RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
  1092. (void *) block_offset);
  1093. assert(block);
  1094. assert(current_addr >= block->offset);
  1095. assert((current_addr + length) <= (block->offset + block->length));
  1096. *block_index = block->index;
  1097. *chunk_index = ram_chunk_index(block->local_host_addr,
  1098. block->local_host_addr + (current_addr - block->offset));
  1099. return 0;
  1100. }
  1101. /*
  1102. * Register a chunk with IB. If the chunk was already registered
  1103. * previously, then skip.
  1104. *
  1105. * Also return the keys associated with the registration needed
  1106. * to perform the actual RDMA operation.
  1107. */
  1108. static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
  1109. RDMALocalBlock *block, uintptr_t host_addr,
  1110. uint32_t *lkey, uint32_t *rkey, int chunk,
  1111. uint8_t *chunk_start, uint8_t *chunk_end)
  1112. {
  1113. if (block->mr) {
  1114. if (lkey) {
  1115. *lkey = block->mr->lkey;
  1116. }
  1117. if (rkey) {
  1118. *rkey = block->mr->rkey;
  1119. }
  1120. return 0;
  1121. }
  1122. /* allocate memory to store chunk MRs */
  1123. if (!block->pmr) {
  1124. block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
  1125. }
  1126. /*
  1127. * If 'rkey', then we're the destination, so grant access to the source.
  1128. *
  1129. * If 'lkey', then we're the source VM, so grant access only to ourselves.
  1130. */
  1131. if (!block->pmr[chunk]) {
  1132. uint64_t len = chunk_end - chunk_start;
  1133. int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
  1134. 0;
  1135. trace_qemu_rdma_register_and_get_keys(len, chunk_start);
  1136. block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
  1137. if (!block->pmr[chunk] &&
  1138. errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
  1139. access |= IBV_ACCESS_ON_DEMAND;
  1140. /* register ODP mr */
  1141. block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
  1142. trace_qemu_rdma_register_odp_mr(block->block_name);
  1143. if (block->pmr[chunk]) {
  1144. qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
  1145. len, block->pmr[chunk]->lkey,
  1146. block->block_name, rkey);
  1147. }
  1148. }
  1149. }
  1150. if (!block->pmr[chunk]) {
  1151. perror("Failed to register chunk!");
  1152. fprintf(stderr, "Chunk details: block: %d chunk index %d"
  1153. " start %" PRIuPTR " end %" PRIuPTR
  1154. " host %" PRIuPTR
  1155. " local %" PRIuPTR " registrations: %d\n",
  1156. block->index, chunk, (uintptr_t)chunk_start,
  1157. (uintptr_t)chunk_end, host_addr,
  1158. (uintptr_t)block->local_host_addr,
  1159. rdma->total_registrations);
  1160. return -1;
  1161. }
  1162. rdma->total_registrations++;
  1163. if (lkey) {
  1164. *lkey = block->pmr[chunk]->lkey;
  1165. }
  1166. if (rkey) {
  1167. *rkey = block->pmr[chunk]->rkey;
  1168. }
  1169. return 0;
  1170. }
  1171. /*
  1172. * Register (at connection time) the memory used for control
  1173. * channel messages.
  1174. */
  1175. static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
  1176. {
  1177. rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
  1178. rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
  1179. IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
  1180. if (rdma->wr_data[idx].control_mr) {
  1181. rdma->total_registrations++;
  1182. return 0;
  1183. }
  1184. error_report("qemu_rdma_reg_control failed");
  1185. return -1;
  1186. }
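/*
 * Translate a work request ID into a human-readable description for
 * tracing; all control RECV IDs share the generic RECV_CONTROL name.
 */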
  1187. const char *print_wrid(int wrid)
  1188. {
  1189. if (wrid >= RDMA_WRID_RECV_CONTROL) {
  1190. return wrid_desc[RDMA_WRID_RECV_CONTROL];
  1191. }
  1192. return wrid_desc[wrid];
  1193. }
  1194. /*
  1195. * Perform a non-optimized memory unregistration after every transfer
  1196. * for demonstration purposes, only if pin-all is not requested.
  1197. *
  1198. * Potential optimizations:
  1199. * 1. Start a new thread to run this function continuously
1200. *    - for bit clearing
1201. *    - and for receipt of unregister messages
  1202. * 2. Use an LRU.
  1203. * 3. Use workload hints.
  1204. */
  1205. static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
  1206. {
  1207. while (rdma->unregistrations[rdma->unregister_current]) {
  1208. int ret;
  1209. uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
  1210. uint64_t chunk =
  1211. (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
  1212. uint64_t index =
  1213. (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
  1214. RDMALocalBlock *block =
  1215. &(rdma->local_ram_blocks.block[index]);
  1216. RDMARegister reg = { .current_index = index };
  1217. RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
  1218. };
  1219. RDMAControlHeader head = { .len = sizeof(RDMARegister),
  1220. .type = RDMA_CONTROL_UNREGISTER_REQUEST,
  1221. .repeat = 1,
  1222. };
  1223. trace_qemu_rdma_unregister_waiting_proc(chunk,
  1224. rdma->unregister_current);
  1225. rdma->unregistrations[rdma->unregister_current] = 0;
  1226. rdma->unregister_current++;
  1227. if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
  1228. rdma->unregister_current = 0;
  1229. }
  1230. /*
  1231. * Unregistration is speculative (because migration is single-threaded
1232. * and we cannot break the protocol's InfiniBand message ordering).
  1233. * Thus, if the memory is currently being used for transmission,
  1234. * then abort the attempt to unregister and try again
  1235. * later the next time a completion is received for this memory.
  1236. */
  1237. clear_bit(chunk, block->unregister_bitmap);
  1238. if (test_bit(chunk, block->transit_bitmap)) {
  1239. trace_qemu_rdma_unregister_waiting_inflight(chunk);
  1240. continue;
  1241. }
  1242. trace_qemu_rdma_unregister_waiting_send(chunk);
  1243. ret = ibv_dereg_mr(block->pmr[chunk]);
  1244. block->pmr[chunk] = NULL;
  1245. block->remote_keys[chunk] = 0;
  1246. if (ret != 0) {
  1247. perror("unregistration chunk failed");
  1248. return -ret;
  1249. }
  1250. rdma->total_registrations--;
  1251. reg.key.chunk = chunk;
  1252. register_to_network(rdma, &reg);
  1253. ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
  1254. &resp, NULL, NULL);
  1255. if (ret < 0) {
  1256. return ret;
  1257. }
  1258. trace_qemu_rdma_unregister_waiting_complete(chunk);
  1259. }
  1260. return 0;
  1261. }
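/*
 * Pack a work request ID: the low bits carry the request type, while the
 * RAM block index and chunk number are encoded in the upper bit fields so
 * they can be recovered from the completion entry later.
 */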
  1262. static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
  1263. uint64_t chunk)
  1264. {
  1265. uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
  1266. result |= (index << RDMA_WRID_BLOCK_SHIFT);
  1267. result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
  1268. return result;
  1269. }
  1270. /*
1271. * Poll the completion queue to see if a work request
1272. * (of any kind) has completed.
  1273. * Return the work request ID that completed.
  1274. */
  1275. static uint64_t qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
  1276. uint64_t *wr_id_out, uint32_t *byte_len)
  1277. {
  1278. int ret;
  1279. struct ibv_wc wc;
  1280. uint64_t wr_id;
  1281. ret = ibv_poll_cq(cq, 1, &wc);
  1282. if (!ret) {
  1283. *wr_id_out = RDMA_WRID_NONE;
  1284. return 0;
  1285. }
  1286. if (ret < 0) {
  1287. error_report("ibv_poll_cq return %d", ret);
  1288. return ret;
  1289. }
  1290. wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
  1291. if (wc.status != IBV_WC_SUCCESS) {
  1292. fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
  1293. wc.status, ibv_wc_status_str(wc.status));
  1294. fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);
  1295. return -1;
  1296. }
  1297. if (rdma->control_ready_expected &&
  1298. (wr_id >= RDMA_WRID_RECV_CONTROL)) {
  1299. trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL],
  1300. wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
  1301. rdma->control_ready_expected = 0;
  1302. }
  1303. if (wr_id == RDMA_WRID_RDMA_WRITE) {
  1304. uint64_t chunk =
  1305. (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
  1306. uint64_t index =
  1307. (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
  1308. RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
  1309. trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent,
  1310. index, chunk, block->local_host_addr,
  1311. (void *)(uintptr_t)block->remote_host_addr);
  1312. clear_bit(chunk, block->transit_bitmap);
  1313. if (rdma->nb_sent > 0) {
  1314. rdma->nb_sent--;
  1315. }
  1316. } else {
  1317. trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
  1318. }
  1319. *wr_id_out = wc.wr_id;
  1320. if (byte_len) {
  1321. *byte_len = wc.byte_len;
  1322. }
  1323. return 0;
  1324. }
  1325. /* Wait for activity on the completion channel.
1326. * Returns 0 on success, non-zero on error.
  1327. */
  1328. static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
  1329. struct ibv_comp_channel *comp_channel)
  1330. {
  1331. struct rdma_cm_event *cm_event;
  1332. int ret = -1;
  1333. /*
  1334. * Coroutine doesn't start until migration_fd_process_incoming()
  1335. * so don't yield unless we know we're running inside of a coroutine.
  1336. */
  1337. if (rdma->migration_started_on_destination &&
  1338. migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
  1339. yield_until_fd_readable(comp_channel->fd);
  1340. } else {
1341. /* This is the source side (running in a separate thread), or the
1342. * destination prior to migration_fd_process_incoming(); after
1343. * postcopy, the destination also runs in a separate thread.
1344. * We can't yield, so we have to poll the fd.
1345. * But we need to be able to handle 'cancel' or an error
1346. * without hanging forever.
1347. */
  1348. while (!rdma->error_state && !rdma->received_error) {
  1349. GPollFD pfds[2];
  1350. pfds[0].fd = comp_channel->fd;
  1351. pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
  1352. pfds[0].revents = 0;
  1353. pfds[1].fd = rdma->channel->fd;
  1354. pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
  1355. pfds[1].revents = 0;
  1356. /* 0.1s timeout, should be fine for a 'cancel' */
  1357. switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
  1358. case 2:
  1359. case 1: /* fd active */
  1360. if (pfds[0].revents) {
  1361. return 0;
  1362. }
  1363. if (pfds[1].revents) {
  1364. ret = rdma_get_cm_event(rdma->channel, &cm_event);
  1365. if (ret) {
  1366. error_report("failed to get cm event while wait "
  1367. "completion channel");
  1368. return -EPIPE;
  1369. }
  1370. error_report("receive cm event while wait comp channel,"
  1371. "cm event is %d", cm_event->event);
  1372. if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
  1373. cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
  1374. rdma_ack_cm_event(cm_event);
  1375. return -EPIPE;
  1376. }
  1377. rdma_ack_cm_event(cm_event);
  1378. }
  1379. break;
  1380. case 0: /* Timeout, go around again */
  1381. break;
  1382. default: /* Error of some type -
  1383. * I don't trust errno from qemu_poll_ns
  1384. */
  1385. error_report("%s: poll failed", __func__);
  1386. return -EPIPE;
  1387. }
  1388. if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
  1389. /* Bail out and let the cancellation happen */
  1390. return -EPIPE;
  1391. }
  1392. }
  1393. }
  1394. if (rdma->received_error) {
  1395. return -EPIPE;
  1396. }
  1397. return rdma->error_state;
  1398. }
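/*
 * Route a work request ID to the completion channel / completion queue it
 * is signalled on: control RECVs use the receive side, everything else
 * uses the send side.
 */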
  1399. static struct ibv_comp_channel *to_channel(RDMAContext *rdma, int wrid)
  1400. {
  1401. return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
  1402. rdma->recv_comp_channel;
  1403. }
  1404. static struct ibv_cq *to_cq(RDMAContext *rdma, int wrid)
  1405. {
  1406. return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
  1407. }
  1408. /*
  1409. * Block until the next work request has completed.
  1410. *
  1411. * First poll to see if a work request has already completed,
  1412. * otherwise block.
  1413. *
  1414. * If we encounter completed work requests for IDs other than
  1415. * the one we're interested in, then that's generally an error.
  1416. *
  1417. * The only exception is actual RDMA Write completions. These
  1418. * completions only need to be recorded, but do not actually
  1419. * need further processing.
  1420. */
  1421. static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
  1422. uint32_t *byte_len)
  1423. {
  1424. int num_cq_events = 0, ret = 0;
  1425. struct ibv_cq *cq;
  1426. void *cq_ctx;
  1427. uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
  1428. struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
  1429. struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);
  1430. if (ibv_req_notify_cq(poll_cq, 0)) {
  1431. return -1;
  1432. }
  1433. /* poll cq first */
  1434. while (wr_id != wrid_requested) {
  1435. ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
  1436. if (ret < 0) {
  1437. return ret;
  1438. }
  1439. wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
  1440. if (wr_id == RDMA_WRID_NONE) {
  1441. break;
  1442. }
  1443. if (wr_id != wrid_requested) {
  1444. trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
  1445. wrid_requested, print_wrid(wr_id), wr_id);
  1446. }
  1447. }
  1448. if (wr_id == wrid_requested) {
  1449. return 0;
  1450. }
  1451. while (1) {
  1452. ret = qemu_rdma_wait_comp_channel(rdma, ch);
  1453. if (ret) {
  1454. goto err_block_for_wrid;
  1455. }
  1456. ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
  1457. if (ret) {
  1458. perror("ibv_get_cq_event");
  1459. goto err_block_for_wrid;
  1460. }
  1461. num_cq_events++;
  1462. ret = -ibv_req_notify_cq(cq, 0);
  1463. if (ret) {
  1464. goto err_block_for_wrid;
  1465. }
  1466. while (wr_id != wrid_requested) {
  1467. ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
  1468. if (ret < 0) {
  1469. goto err_block_for_wrid;
  1470. }
  1471. wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
  1472. if (wr_id == RDMA_WRID_NONE) {
  1473. break;
  1474. }
  1475. if (wr_id != wrid_requested) {
  1476. trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
  1477. wrid_requested, print_wrid(wr_id), wr_id);
  1478. }
  1479. }
  1480. if (wr_id == wrid_requested) {
  1481. goto success_block_for_wrid;
  1482. }
  1483. }
  1484. success_block_for_wrid:
  1485. if (num_cq_events) {
  1486. ibv_ack_cq_events(cq, num_cq_events);
  1487. }
  1488. return 0;
  1489. err_block_for_wrid:
  1490. if (num_cq_events) {
  1491. ibv_ack_cq_events(cq, num_cq_events);
  1492. }
  1493. rdma->error_state = ret;
  1494. return ret;
  1495. }
  1496. /*
  1497. * Post a SEND message work request for the control channel
  1498. * containing some data and block until the post completes.
  1499. */
  1500. static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
  1501. RDMAControlHeader *head)
  1502. {
  1503. int ret = 0;
  1504. RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
  1505. struct ibv_send_wr *bad_wr;
  1506. struct ibv_sge sge = {
  1507. .addr = (uintptr_t)(wr->control),
  1508. .length = head->len + sizeof(RDMAControlHeader),
  1509. .lkey = wr->control_mr->lkey,
  1510. };
  1511. struct ibv_send_wr send_wr = {
  1512. .wr_id = RDMA_WRID_SEND_CONTROL,
  1513. .opcode = IBV_WR_SEND,
  1514. .send_flags = IBV_SEND_SIGNALED,
  1515. .sg_list = &sge,
  1516. .num_sge = 1,
  1517. };
  1518. trace_qemu_rdma_post_send_control(control_desc(head->type));
  1519. /*
  1520. * We don't actually need to do a memcpy() in here if we used
  1521. * the "sge" properly, but since we're only sending control messages
1522. * (not RAM in a performance-critical path), it's OK for now.
  1523. *
  1524. * The copy makes the RDMAControlHeader simpler to manipulate
  1525. * for the time being.
  1526. */
  1527. assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
  1528. memcpy(wr->control, head, sizeof(RDMAControlHeader));
  1529. control_to_network((void *) wr->control);
  1530. if (buf) {
  1531. memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
  1532. }
  1533. ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
  1534. if (ret > 0) {
  1535. error_report("Failed to use post IB SEND for control");
  1536. return -ret;
  1537. }
  1538. ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
  1539. if (ret < 0) {
  1540. error_report("rdma migration: send polling control error");
  1541. }
  1542. return ret;
  1543. }
  1544. /*
  1545. * Post a RECV work request in anticipation of some future receipt
  1546. * of data on the control channel.
  1547. */
  1548. static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
  1549. {
  1550. struct ibv_recv_wr *bad_wr;
  1551. struct ibv_sge sge = {
  1552. .addr = (uintptr_t)(rdma->wr_data[idx].control),
  1553. .length = RDMA_CONTROL_MAX_BUFFER,
  1554. .lkey = rdma->wr_data[idx].control_mr->lkey,
  1555. };
  1556. struct ibv_recv_wr recv_wr = {
  1557. .wr_id = RDMA_WRID_RECV_CONTROL + idx,
  1558. .sg_list = &sge,
  1559. .num_sge = 1,
  1560. };
  1561. if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
  1562. return -1;
  1563. }
  1564. return 0;
  1565. }
  1566. /*
  1567. * Block and wait for a RECV control channel message to arrive.
  1568. */
  1569. static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
  1570. RDMAControlHeader *head, int expecting, int idx)
  1571. {
  1572. uint32_t byte_len;
  1573. int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
  1574. &byte_len);
  1575. if (ret < 0) {
  1576. error_report("rdma migration: recv polling control error!");
  1577. return ret;
  1578. }
  1579. network_to_control((void *) rdma->wr_data[idx].control);
  1580. memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
  1581. trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
  1582. if (expecting == RDMA_CONTROL_NONE) {
  1583. trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
  1584. head->type);
  1585. } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
  1586. error_report("Was expecting a %s (%d) control message"
  1587. ", but got: %s (%d), length: %d",
  1588. control_desc(expecting), expecting,
  1589. control_desc(head->type), head->type, head->len);
  1590. if (head->type == RDMA_CONTROL_ERROR) {
  1591. rdma->received_error = true;
  1592. }
  1593. return -EIO;
  1594. }
  1595. if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
  1596. error_report("too long length: %d", head->len);
  1597. return -EINVAL;
  1598. }
  1599. if (sizeof(*head) + head->len != byte_len) {
  1600. error_report("Malformed length: %d byte_len %d", head->len, byte_len);
  1601. return -EINVAL;
  1602. }
  1603. return 0;
  1604. }
  1605. /*
  1606. * When a RECV work request has completed, the work request's
  1607. * buffer is pointed at the header.
  1608. *
  1609. * This will advance the pointer to the data portion
  1610. * of the control message of the work request's buffer that
  1611. * was populated after the work request finished.
  1612. */
  1613. static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
  1614. RDMAControlHeader *head)
  1615. {
  1616. rdma->wr_data[idx].control_len = head->len;
  1617. rdma->wr_data[idx].control_curr =
  1618. rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
  1619. }
  1620. /*
  1621. * This is an 'atomic' high-level operation to deliver a single, unified
  1622. * control-channel message.
  1623. *
  1624. * Additionally, if the user is expecting some kind of reply to this message,
  1625. * they can request a 'resp' response message be filled in by posting an
  1626. * additional work request on behalf of the user and waiting for an additional
  1627. * completion.
  1628. *
1629. * The extra (optional) response is used during registration to save us
1630. * from having to perform an *additional* exchange of messages just to
1631. * provide a response; instead, the reply piggy-backs on the acknowledgement.
  1632. */
  1633. static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
  1634. uint8_t *data, RDMAControlHeader *resp,
  1635. int *resp_idx,
  1636. int (*callback)(RDMAContext *rdma))
  1637. {
  1638. int ret = 0;
  1639. /*
  1640. * Wait until the dest is ready before attempting to deliver the message
  1641. * by waiting for a READY message.
  1642. */
  1643. if (rdma->control_ready_expected) {
  1644. RDMAControlHeader resp;
  1645. ret = qemu_rdma_exchange_get_response(rdma,
  1646. &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
  1647. if (ret < 0) {
  1648. return ret;
  1649. }
  1650. }
  1651. /*
  1652. * If the user is expecting a response, post a WR in anticipation of it.
  1653. */
  1654. if (resp) {
  1655. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
  1656. if (ret) {
  1657. error_report("rdma migration: error posting"
  1658. " extra control recv for anticipated result!");
  1659. return ret;
  1660. }
  1661. }
  1662. /*
  1663. * Post a WR to replace the one we just consumed for the READY message.
  1664. */
  1665. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
  1666. if (ret) {
  1667. error_report("rdma migration: error posting first control recv!");
  1668. return ret;
  1669. }
  1670. /*
  1671. * Deliver the control message that was requested.
  1672. */
  1673. ret = qemu_rdma_post_send_control(rdma, data, head);
  1674. if (ret < 0) {
  1675. error_report("Failed to send control buffer!");
  1676. return ret;
  1677. }
  1678. /*
  1679. * If we're expecting a response, block and wait for it.
  1680. */
  1681. if (resp) {
  1682. if (callback) {
  1683. trace_qemu_rdma_exchange_send_issue_callback();
  1684. ret = callback(rdma);
  1685. if (ret < 0) {
  1686. return ret;
  1687. }
  1688. }
  1689. trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
  1690. ret = qemu_rdma_exchange_get_response(rdma, resp,
  1691. resp->type, RDMA_WRID_DATA);
  1692. if (ret < 0) {
  1693. return ret;
  1694. }
  1695. qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
  1696. if (resp_idx) {
  1697. *resp_idx = RDMA_WRID_DATA;
  1698. }
  1699. trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
  1700. }
  1701. rdma->control_ready_expected = 1;
  1702. return 0;
  1703. }
  1704. /*
  1705. * This is an 'atomic' high-level operation to receive a single, unified
  1706. * control-channel message.
  1707. */
  1708. static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
  1709. int expecting)
  1710. {
  1711. RDMAControlHeader ready = {
  1712. .len = 0,
  1713. .type = RDMA_CONTROL_READY,
  1714. .repeat = 1,
  1715. };
  1716. int ret;
  1717. /*
  1718. * Inform the source that we're ready to receive a message.
  1719. */
  1720. ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
  1721. if (ret < 0) {
  1722. error_report("Failed to send control buffer!");
  1723. return ret;
  1724. }
  1725. /*
  1726. * Block and wait for the message.
  1727. */
  1728. ret = qemu_rdma_exchange_get_response(rdma, head,
  1729. expecting, RDMA_WRID_READY);
  1730. if (ret < 0) {
  1731. return ret;
  1732. }
  1733. qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
  1734. /*
  1735. * Post a new RECV work request to replace the one we just consumed.
  1736. */
  1737. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
  1738. if (ret) {
  1739. error_report("rdma migration: error posting second control recv!");
  1740. return ret;
  1741. }
  1742. return 0;
  1743. }
  1744. /*
  1745. * Write an actual chunk of memory using RDMA.
  1746. *
  1747. * If we're using dynamic registration on the dest-side, we have to
  1748. * send a registration command first.
  1749. */
  1750. static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
  1751. int current_index, uint64_t current_addr,
  1752. uint64_t length)
  1753. {
  1754. struct ibv_sge sge;
  1755. struct ibv_send_wr send_wr = { 0 };
  1756. struct ibv_send_wr *bad_wr;
  1757. int reg_result_idx, ret, count = 0;
  1758. uint64_t chunk, chunks;
  1759. uint8_t *chunk_start, *chunk_end;
  1760. RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
  1761. RDMARegister reg;
  1762. RDMARegisterResult *reg_result;
  1763. RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
  1764. RDMAControlHeader head = { .len = sizeof(RDMARegister),
  1765. .type = RDMA_CONTROL_REGISTER_REQUEST,
  1766. .repeat = 1,
  1767. };
  1768. retry:
  1769. sge.addr = (uintptr_t)(block->local_host_addr +
  1770. (current_addr - block->offset));
  1771. sge.length = length;
  1772. chunk = ram_chunk_index(block->local_host_addr,
  1773. (uint8_t *)(uintptr_t)sge.addr);
  1774. chunk_start = ram_chunk_start(block, chunk);
  1775. if (block->is_ram_block) {
  1776. chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
  1777. if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
  1778. chunks--;
  1779. }
  1780. } else {
  1781. chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
  1782. if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
  1783. chunks--;
  1784. }
  1785. }
  1786. trace_qemu_rdma_write_one_top(chunks + 1,
  1787. (chunks + 1) *
  1788. (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
  1789. chunk_end = ram_chunk_end(block, chunk + chunks);
  1790. while (test_bit(chunk, block->transit_bitmap)) {
  1791. (void)count;
  1792. trace_qemu_rdma_write_one_block(count++, current_index, chunk,
  1793. sge.addr, length, rdma->nb_sent, block->nb_chunks);
  1794. ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
  1795. if (ret < 0) {
  1796. error_report("Failed to Wait for previous write to complete "
  1797. "block %d chunk %" PRIu64
  1798. " current %" PRIu64 " len %" PRIu64 " %d",
  1799. current_index, chunk, sge.addr, length, rdma->nb_sent);
  1800. return ret;
  1801. }
  1802. }
  1803. if (!rdma->pin_all || !block->is_ram_block) {
  1804. if (!block->remote_keys[chunk]) {
  1805. /*
  1806. * This chunk has not yet been registered, so first check to see
1807. * if the entire chunk is zero. If so, tell the other side to
  1808. * memset() + madvise() the entire chunk without RDMA.
  1809. */
  1810. if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
  1811. RDMACompress comp = {
  1812. .offset = current_addr,
  1813. .value = 0,
  1814. .block_idx = current_index,
  1815. .length = length,
  1816. };
  1817. head.len = sizeof(comp);
  1818. head.type = RDMA_CONTROL_COMPRESS;
  1819. trace_qemu_rdma_write_one_zero(chunk, sge.length,
  1820. current_index, current_addr);
  1821. compress_to_network(rdma, &comp);
  1822. ret = qemu_rdma_exchange_send(rdma, &head,
  1823. (uint8_t *) &comp, NULL, NULL, NULL);
  1824. if (ret < 0) {
  1825. return -EIO;
  1826. }
  1827. acct_update_position(f, sge.length, true);
  1828. return 1;
  1829. }
  1830. /*
  1831. * Otherwise, tell other side to register.
  1832. */
  1833. reg.current_index = current_index;
  1834. if (block->is_ram_block) {
  1835. reg.key.current_addr = current_addr;
  1836. } else {
  1837. reg.key.chunk = chunk;
  1838. }
  1839. reg.chunks = chunks;
  1840. trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
  1841. current_addr);
  1842. register_to_network(rdma, &reg);
  1843. ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
  1844. &resp, &reg_result_idx, NULL);
  1845. if (ret < 0) {
  1846. return ret;
  1847. }
  1848. /* try to overlap this single registration with the one we sent. */
  1849. if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
  1850. &sge.lkey, NULL, chunk,
  1851. chunk_start, chunk_end)) {
  1852. error_report("cannot get lkey");
  1853. return -EINVAL;
  1854. }
  1855. reg_result = (RDMARegisterResult *)
  1856. rdma->wr_data[reg_result_idx].control_curr;
  1857. network_to_result(reg_result);
  1858. trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
  1859. reg_result->rkey, chunk);
  1860. block->remote_keys[chunk] = reg_result->rkey;
  1861. block->remote_host_addr = reg_result->host_addr;
  1862. } else {
  1863. /* already registered before */
  1864. if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
  1865. &sge.lkey, NULL, chunk,
  1866. chunk_start, chunk_end)) {
  1867. error_report("cannot get lkey!");
  1868. return -EINVAL;
  1869. }
  1870. }
  1871. send_wr.wr.rdma.rkey = block->remote_keys[chunk];
  1872. } else {
  1873. send_wr.wr.rdma.rkey = block->remote_rkey;
  1874. if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
  1875. &sge.lkey, NULL, chunk,
  1876. chunk_start, chunk_end)) {
  1877. error_report("cannot get lkey!");
  1878. return -EINVAL;
  1879. }
  1880. }
  1881. /*
  1882. * Encode the ram block index and chunk within this wrid.
  1883. * We will use this information at the time of completion
  1884. * to figure out which bitmap to check against and then which
  1885. * chunk in the bitmap to look for.
  1886. */
  1887. send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
  1888. current_index, chunk);
  1889. send_wr.opcode = IBV_WR_RDMA_WRITE;
  1890. send_wr.send_flags = IBV_SEND_SIGNALED;
  1891. send_wr.sg_list = &sge;
  1892. send_wr.num_sge = 1;
  1893. send_wr.wr.rdma.remote_addr = block->remote_host_addr +
  1894. (current_addr - block->offset);
  1895. trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
  1896. sge.length);
  1897. /*
  1898. * ibv_post_send() does not return negative error numbers,
  1899. * per the specification they are positive - no idea why.
  1900. */
  1901. ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
  1902. if (ret == ENOMEM) {
  1903. trace_qemu_rdma_write_one_queue_full();
  1904. ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
  1905. if (ret < 0) {
  1906. error_report("rdma migration: failed to make "
  1907. "room in full send queue! %d", ret);
  1908. return ret;
  1909. }
  1910. goto retry;
  1911. } else if (ret > 0) {
  1912. perror("rdma migration: post rdma write failed");
  1913. return -ret;
  1914. }
  1915. set_bit(chunk, block->transit_bitmap);
  1916. acct_update_position(f, sge.length, false);
  1917. rdma->total_writes++;
  1918. return 0;
  1919. }
  1920. /*
  1921. * Push out any unwritten RDMA operations.
  1922. *
  1923. * We support sending out multiple chunks at the same time.
  1924. * Not all of them need to get signaled in the completion queue.
  1925. */
  1926. static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
  1927. {
  1928. int ret;
  1929. if (!rdma->current_length) {
  1930. return 0;
  1931. }
  1932. ret = qemu_rdma_write_one(f, rdma,
  1933. rdma->current_index, rdma->current_addr, rdma->current_length);
  1934. if (ret < 0) {
  1935. return ret;
  1936. }
  1937. if (ret == 0) {
  1938. rdma->nb_sent++;
  1939. trace_qemu_rdma_write_flush(rdma->nb_sent);
  1940. }
  1941. rdma->current_length = 0;
  1942. rdma->current_addr = 0;
  1943. return 0;
  1944. }
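/*
 * Decide whether a new transfer can be merged into the batch currently
 * being built: it must extend the current buffer contiguously, stay within
 * the same RAM block, and not cross the end of the current chunk.
 */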
  1945. static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
  1946. uint64_t offset, uint64_t len)
  1947. {
  1948. RDMALocalBlock *block;
  1949. uint8_t *host_addr;
  1950. uint8_t *chunk_end;
  1951. if (rdma->current_index < 0) {
  1952. return 0;
  1953. }
  1954. if (rdma->current_chunk < 0) {
  1955. return 0;
  1956. }
  1957. block = &(rdma->local_ram_blocks.block[rdma->current_index]);
  1958. host_addr = block->local_host_addr + (offset - block->offset);
  1959. chunk_end = ram_chunk_end(block, rdma->current_chunk);
  1960. if (rdma->current_length == 0) {
  1961. return 0;
  1962. }
  1963. /*
  1964. * Only merge into chunk sequentially.
  1965. */
  1966. if (offset != (rdma->current_addr + rdma->current_length)) {
  1967. return 0;
  1968. }
  1969. if (offset < block->offset) {
  1970. return 0;
  1971. }
  1972. if ((offset + len) > (block->offset + block->length)) {
  1973. return 0;
  1974. }
  1975. if ((host_addr + len) > chunk_end) {
  1976. return 0;
  1977. }
  1978. return 1;
  1979. }
  1980. /*
  1981. * We're not actually writing here, but doing three things:
  1982. *
  1983. * 1. Identify the chunk the buffer belongs to.
  1984. * 2. If the chunk is full or the buffer doesn't belong to the current
  1985. * chunk, then start a new chunk and flush() the old chunk.
  1986. * 3. To keep the hardware busy, we also group chunks into batches
  1987. * and only require that a batch gets acknowledged in the completion
  1988. * queue instead of each individual chunk.
  1989. */
  1990. static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
  1991. uint64_t block_offset, uint64_t offset,
  1992. uint64_t len)
  1993. {
  1994. uint64_t current_addr = block_offset + offset;
  1995. uint64_t index = rdma->current_index;
  1996. uint64_t chunk = rdma->current_chunk;
  1997. int ret;
  1998. /* If we cannot merge it, we flush the current buffer first. */
  1999. if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
  2000. ret = qemu_rdma_write_flush(f, rdma);
  2001. if (ret) {
  2002. return ret;
  2003. }
  2004. rdma->current_length = 0;
  2005. rdma->current_addr = current_addr;
  2006. ret = qemu_rdma_search_ram_block(rdma, block_offset,
  2007. offset, len, &index, &chunk);
  2008. if (ret) {
  2009. error_report("ram block search failed");
  2010. return ret;
  2011. }
  2012. rdma->current_index = index;
  2013. rdma->current_chunk = chunk;
  2014. }
  2015. /* merge it */
  2016. rdma->current_length += len;
  2017. /* flush it if buffer is too large */
  2018. if (rdma->current_length >= RDMA_MERGE_MAX) {
  2019. return qemu_rdma_write_flush(f, rdma);
  2020. }
  2021. return 0;
  2022. }
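/*
 * Tear down the connection and release every resource we own: notify the
 * peer on an early error, deregister the control MRs, delete the local RAM
 * block list, then destroy the QP, CQs, completion channels, PD and
 * connection-manager IDs.
 */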
  2023. static void qemu_rdma_cleanup(RDMAContext *rdma)
  2024. {
  2025. int idx;
  2026. if (rdma->cm_id && rdma->connected) {
  2027. if ((rdma->error_state ||
  2028. migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
  2029. !rdma->received_error) {
  2030. RDMAControlHeader head = { .len = 0,
  2031. .type = RDMA_CONTROL_ERROR,
  2032. .repeat = 1,
  2033. };
  2034. error_report("Early error. Sending error.");
  2035. qemu_rdma_post_send_control(rdma, NULL, &head);
  2036. }
  2037. rdma_disconnect(rdma->cm_id);
  2038. trace_qemu_rdma_cleanup_disconnect();
  2039. rdma->connected = false;
  2040. }
  2041. if (rdma->channel) {
  2042. qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
  2043. }
  2044. g_free(rdma->dest_blocks);
  2045. rdma->dest_blocks = NULL;
  2046. for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
  2047. if (rdma->wr_data[idx].control_mr) {
  2048. rdma->total_registrations--;
  2049. ibv_dereg_mr(rdma->wr_data[idx].control_mr);
  2050. }
  2051. rdma->wr_data[idx].control_mr = NULL;
  2052. }
  2053. if (rdma->local_ram_blocks.block) {
  2054. while (rdma->local_ram_blocks.nb_blocks) {
  2055. rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
  2056. }
  2057. }
  2058. if (rdma->qp) {
  2059. rdma_destroy_qp(rdma->cm_id);
  2060. rdma->qp = NULL;
  2061. }
  2062. if (rdma->recv_cq) {
  2063. ibv_destroy_cq(rdma->recv_cq);
  2064. rdma->recv_cq = NULL;
  2065. }
  2066. if (rdma->send_cq) {
  2067. ibv_destroy_cq(rdma->send_cq);
  2068. rdma->send_cq = NULL;
  2069. }
  2070. if (rdma->recv_comp_channel) {
  2071. ibv_destroy_comp_channel(rdma->recv_comp_channel);
  2072. rdma->recv_comp_channel = NULL;
  2073. }
  2074. if (rdma->send_comp_channel) {
  2075. ibv_destroy_comp_channel(rdma->send_comp_channel);
  2076. rdma->send_comp_channel = NULL;
  2077. }
  2078. if (rdma->pd) {
  2079. ibv_dealloc_pd(rdma->pd);
  2080. rdma->pd = NULL;
  2081. }
  2082. if (rdma->cm_id) {
  2083. rdma_destroy_id(rdma->cm_id);
  2084. rdma->cm_id = NULL;
  2085. }
2086. /* on the destination side, listen_id and channel are shared */
  2087. if (rdma->listen_id) {
  2088. if (!rdma->is_return_path) {
  2089. rdma_destroy_id(rdma->listen_id);
  2090. }
  2091. rdma->listen_id = NULL;
  2092. if (rdma->channel) {
  2093. if (!rdma->is_return_path) {
  2094. rdma_destroy_event_channel(rdma->channel);
  2095. }
  2096. rdma->channel = NULL;
  2097. }
  2098. }
  2099. if (rdma->channel) {
  2100. rdma_destroy_event_channel(rdma->channel);
  2101. rdma->channel = NULL;
  2102. }
  2103. g_free(rdma->host);
  2104. g_free(rdma->host_port);
  2105. rdma->host = NULL;
  2106. rdma->host_port = NULL;
  2107. }
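/*
 * Source-side initialization: resolve the destination, allocate the PD,
 * completion queues and QP, build the local RAM block list and the
 * offset-to-block hash, and register the control-channel buffers.
 */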
  2108. static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
  2109. {
  2110. int ret, idx;
  2111. Error *local_err = NULL, **temp = &local_err;
  2112. /*
  2113. * Will be validated against destination's actual capabilities
  2114. * after the connect() completes.
  2115. */
  2116. rdma->pin_all = pin_all;
  2117. ret = qemu_rdma_resolve_host(rdma, temp);
  2118. if (ret) {
  2119. goto err_rdma_source_init;
  2120. }
  2121. ret = qemu_rdma_alloc_pd_cq(rdma);
  2122. if (ret) {
  2123. ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
  2124. " limits may be too low. Please check $ ulimit -a # and "
  2125. "search for 'ulimit -l' in the output");
  2126. goto err_rdma_source_init;
  2127. }
  2128. ret = qemu_rdma_alloc_qp(rdma);
  2129. if (ret) {
  2130. ERROR(temp, "rdma migration: error allocating qp!");
  2131. goto err_rdma_source_init;
  2132. }
  2133. ret = qemu_rdma_init_ram_blocks(rdma);
  2134. if (ret) {
  2135. ERROR(temp, "rdma migration: error initializing ram blocks!");
  2136. goto err_rdma_source_init;
  2137. }
  2138. /* Build the hash that maps from offset to RAMBlock */
  2139. rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
  2140. for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
  2141. g_hash_table_insert(rdma->blockmap,
  2142. (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
  2143. &rdma->local_ram_blocks.block[idx]);
  2144. }
  2145. for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
  2146. ret = qemu_rdma_reg_control(rdma, idx);
  2147. if (ret) {
  2148. ERROR(temp, "rdma migration: error registering %d control!",
  2149. idx);
  2150. goto err_rdma_source_init;
  2151. }
  2152. }
  2153. return 0;
  2154. err_rdma_source_init:
  2155. error_propagate(errp, local_err);
  2156. qemu_rdma_cleanup(rdma);
  2157. return -1;
  2158. }
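/*
 * Wait up to 'msec' milliseconds for a connection-manager event on the
 * RDMA event channel and fetch it. Returns -1 on timeout or poll failure.
 */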
  2159. static int qemu_get_cm_event_timeout(RDMAContext *rdma,
  2160. struct rdma_cm_event **cm_event,
  2161. long msec, Error **errp)
  2162. {
  2163. int ret;
  2164. struct pollfd poll_fd = {
  2165. .fd = rdma->channel->fd,
  2166. .events = POLLIN,
  2167. .revents = 0
  2168. };
  2169. do {
  2170. ret = poll(&poll_fd, 1, msec);
  2171. } while (ret < 0 && errno == EINTR);
  2172. if (ret == 0) {
  2173. ERROR(errp, "poll cm event timeout");
  2174. return -1;
  2175. } else if (ret < 0) {
  2176. ERROR(errp, "failed to poll cm event, errno=%i", errno);
  2177. return -1;
  2178. } else if (poll_fd.revents & POLLIN) {
  2179. return rdma_get_cm_event(rdma->channel, cm_event);
  2180. } else {
  2181. ERROR(errp, "no POLLIN event, revent=%x", poll_fd.revents);
  2182. return -1;
  2183. }
  2184. }
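/*
 * Establish the connection to the destination, advertising the requested
 * capabilities (e.g. pin-all) in the private data and disabling any
 * capability the destination does not acknowledge.
 */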
  2185. static int qemu_rdma_connect(RDMAContext *rdma, Error **errp, bool return_path)
  2186. {
  2187. RDMACapabilities cap = {
  2188. .version = RDMA_CONTROL_VERSION_CURRENT,
  2189. .flags = 0,
  2190. };
  2191. struct rdma_conn_param conn_param = { .initiator_depth = 2,
  2192. .retry_count = 5,
  2193. .private_data = &cap,
  2194. .private_data_len = sizeof(cap),
  2195. };
  2196. struct rdma_cm_event *cm_event;
  2197. int ret;
  2198. /*
  2199. * Only negotiate the capability with destination if the user
  2200. * on the source first requested the capability.
  2201. */
  2202. if (rdma->pin_all) {
  2203. trace_qemu_rdma_connect_pin_all_requested();
  2204. cap.flags |= RDMA_CAPABILITY_PIN_ALL;
  2205. }
  2206. caps_to_network(&cap);
  2207. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
  2208. if (ret) {
  2209. ERROR(errp, "posting second control recv");
  2210. goto err_rdma_source_connect;
  2211. }
  2212. ret = rdma_connect(rdma->cm_id, &conn_param);
  2213. if (ret) {
  2214. perror("rdma_connect");
  2215. ERROR(errp, "connecting to destination!");
  2216. goto err_rdma_source_connect;
  2217. }
  2218. if (return_path) {
  2219. ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp);
  2220. } else {
  2221. ret = rdma_get_cm_event(rdma->channel, &cm_event);
  2222. }
  2223. if (ret) {
  2224. perror("rdma_get_cm_event after rdma_connect");
  2225. ERROR(errp, "connecting to destination!");
  2226. goto err_rdma_source_connect;
  2227. }
  2228. if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
  2229. error_report("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
  2230. ERROR(errp, "connecting to destination!");
  2231. rdma_ack_cm_event(cm_event);
  2232. goto err_rdma_source_connect;
  2233. }
  2234. rdma->connected = true;
  2235. memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
  2236. network_to_caps(&cap);
  2237. /*
  2238. * Verify that the *requested* capabilities are supported by the destination
  2239. * and disable them otherwise.
  2240. */
  2241. if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
  2242. ERROR(errp, "Server cannot support pinning all memory. "
  2243. "Will register memory dynamically.");
  2244. rdma->pin_all = false;
  2245. }
  2246. trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
  2247. rdma_ack_cm_event(cm_event);
  2248. rdma->control_ready_expected = 1;
  2249. rdma->nb_sent = 0;
  2250. return 0;
  2251. err_rdma_source_connect:
  2252. qemu_rdma_cleanup(rdma);
  2253. return -1;
  2254. }
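/*
 * Destination-side initialization: create the CM event channel and listen
 * id, resolve the local address and bind to it, skipping IPv6 addresses on
 * kernels with broken IPv6 RDMA support.
 */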
  2255. static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
  2256. {
  2257. int ret, idx;
  2258. struct rdma_cm_id *listen_id;
  2259. char ip[40] = "unknown";
  2260. struct rdma_addrinfo *res, *e;
  2261. char port_str[16];
  2262. int reuse = 1;
  2263. for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
  2264. rdma->wr_data[idx].control_len = 0;
  2265. rdma->wr_data[idx].control_curr = NULL;
  2266. }
  2267. if (!rdma->host || !rdma->host[0]) {
  2268. ERROR(errp, "RDMA host is not set!");
  2269. rdma->error_state = -EINVAL;
  2270. return -1;
  2271. }
  2272. /* create CM channel */
  2273. rdma->channel = rdma_create_event_channel();
  2274. if (!rdma->channel) {
  2275. ERROR(errp, "could not create rdma event channel");
  2276. rdma->error_state = -EINVAL;
  2277. return -1;
  2278. }
  2279. /* create CM id */
  2280. ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
  2281. if (ret) {
  2282. ERROR(errp, "could not create cm_id!");
  2283. goto err_dest_init_create_listen_id;
  2284. }
  2285. snprintf(port_str, 16, "%d", rdma->port);
  2286. port_str[15] = '\0';
  2287. ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
  2288. if (ret < 0) {
  2289. ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
  2290. goto err_dest_init_bind_addr;
  2291. }
  2292. ret = rdma_set_option(listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR,
  2293. &reuse, sizeof reuse);
  2294. if (ret) {
  2295. ERROR(errp, "Error: could not set REUSEADDR option");
  2296. goto err_dest_init_bind_addr;
  2297. }
  2298. for (e = res; e != NULL; e = e->ai_next) {
  2299. inet_ntop(e->ai_family,
  2300. &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
  2301. trace_qemu_rdma_dest_init_trying(rdma->host, ip);
  2302. ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
  2303. if (ret) {
  2304. continue;
  2305. }
  2306. if (e->ai_family == AF_INET6) {
  2307. ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, errp);
  2308. if (ret) {
  2309. continue;
  2310. }
  2311. }
  2312. break;
  2313. }
  2314. rdma_freeaddrinfo(res);
  2315. if (!e) {
  2316. ERROR(errp, "Error: could not rdma_bind_addr!");
  2317. goto err_dest_init_bind_addr;
  2318. }
  2319. rdma->listen_id = listen_id;
  2320. qemu_rdma_dump_gid("dest_init", listen_id);
  2321. return 0;
  2322. err_dest_init_bind_addr:
  2323. rdma_destroy_id(listen_id);
  2324. err_dest_init_create_listen_id:
  2325. rdma_destroy_event_channel(rdma->channel);
  2326. rdma->channel = NULL;
  2327. rdma->error_state = ret;
  2328. return ret;
  2329. }
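/*
 * Set up the return-path context on the destination: reset its control
 * buffers, share the CM event channel and listen id with the primary
 * context, and link the two contexts together.
 */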
  2330. static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
  2331. RDMAContext *rdma)
  2332. {
  2333. int idx;
  2334. for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
  2335. rdma_return_path->wr_data[idx].control_len = 0;
  2336. rdma_return_path->wr_data[idx].control_curr = NULL;
  2337. }
2338. /* the CM channel and CM id are shared */
  2339. rdma_return_path->channel = rdma->channel;
  2340. rdma_return_path->listen_id = rdma->listen_id;
  2341. rdma->return_path = rdma_return_path;
  2342. rdma_return_path->return_path = rdma;
  2343. rdma_return_path->is_return_path = true;
  2344. }
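/*
 * Allocate and minimally initialize an RDMAContext from a "host:port"
 * string. Returns NULL (and sets errp) if the address cannot be parsed.
 */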
  2345. static void *qemu_rdma_data_init(const char *host_port, Error **errp)
  2346. {
  2347. RDMAContext *rdma = NULL;
  2348. InetSocketAddress *addr;
  2349. if (host_port) {
  2350. rdma = g_new0(RDMAContext, 1);
  2351. rdma->current_index = -1;
  2352. rdma->current_chunk = -1;
  2353. addr = g_new(InetSocketAddress, 1);
  2354. if (!inet_parse(addr, host_port, NULL)) {
  2355. rdma->port = atoi(addr->port);
  2356. rdma->host = g_strdup(addr->host);
  2357. rdma->host_port = g_strdup(host_port);
  2358. } else {
  2359. ERROR(errp, "bad RDMA migration address '%s'", host_port);
  2360. g_free(rdma);
  2361. rdma = NULL;
  2362. }
  2363. qapi_free_InetSocketAddress(addr);
  2364. }
  2365. return rdma;
  2366. }
  2367. /*
  2368. * QEMUFile interface to the control channel.
  2369. * SEND messages for control only.
  2370. * VM's ram is handled with regular RDMA messages.
  2371. */
  2372. static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
  2373. const struct iovec *iov,
  2374. size_t niov,
  2375. int *fds,
  2376. size_t nfds,
  2377. int flags,
  2378. Error **errp)
  2379. {
  2380. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2381. QEMUFile *f = rioc->file;
  2382. RDMAContext *rdma;
  2383. int ret;
  2384. ssize_t done = 0;
  2385. size_t i;
  2386. size_t len = 0;
  2387. RCU_READ_LOCK_GUARD();
  2388. rdma = qatomic_rcu_read(&rioc->rdmaout);
  2389. if (!rdma) {
  2390. error_setg(errp, "RDMA control channel output is not set");
  2391. return -1;
  2392. }
  2393. CHECK_ERROR_STATE();
  2394. /*
  2395. * Push out any writes that
2396. * we've queued up for the VM's RAM.
  2397. */
  2398. ret = qemu_rdma_write_flush(f, rdma);
  2399. if (ret < 0) {
  2400. rdma->error_state = ret;
  2401. error_setg(errp, "qemu_rdma_write_flush returned %d", ret);
  2402. return -1;
  2403. }
  2404. for (i = 0; i < niov; i++) {
  2405. size_t remaining = iov[i].iov_len;
  2406. uint8_t * data = (void *)iov[i].iov_base;
  2407. while (remaining) {
  2408. RDMAControlHeader head;
  2409. len = MIN(remaining, RDMA_SEND_INCREMENT);
  2410. remaining -= len;
  2411. head.len = len;
  2412. head.type = RDMA_CONTROL_QEMU_FILE;
  2413. ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
  2414. if (ret < 0) {
  2415. rdma->error_state = ret;
  2416. error_setg(errp, "qemu_rdma_exchange_send returned %d", ret);
  2417. return -1;
  2418. }
  2419. data += len;
  2420. done += len;
  2421. }
  2422. }
  2423. return done;
  2424. }
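/*
 * Copy up to 'size' bytes of the buffered control message into 'buf',
 * advancing the cursor into that message. Returns the number of bytes
 * actually copied (0 if nothing is buffered).
 */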
  2425. static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
  2426. size_t size, int idx)
  2427. {
  2428. size_t len = 0;
  2429. if (rdma->wr_data[idx].control_len) {
  2430. trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
  2431. len = MIN(size, rdma->wr_data[idx].control_len);
  2432. memcpy(buf, rdma->wr_data[idx].control_curr, len);
  2433. rdma->wr_data[idx].control_curr += len;
  2434. rdma->wr_data[idx].control_len -= len;
  2435. }
  2436. return len;
  2437. }
  2438. /*
  2439. * QEMUFile interface to the control channel.
  2440. * RDMA links don't use bytestreams, so we have to
  2441. * return bytes to QEMUFile opportunistically.
  2442. */
  2443. static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
  2444. const struct iovec *iov,
  2445. size_t niov,
  2446. int **fds,
  2447. size_t *nfds,
  2448. int flags,
  2449. Error **errp)
  2450. {
  2451. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2452. RDMAContext *rdma;
  2453. RDMAControlHeader head;
  2454. int ret = 0;
  2455. ssize_t i;
  2456. size_t done = 0;
  2457. RCU_READ_LOCK_GUARD();
  2458. rdma = qatomic_rcu_read(&rioc->rdmain);
  2459. if (!rdma) {
  2460. error_setg(errp, "RDMA control channel input is not set");
  2461. return -1;
  2462. }
  2463. CHECK_ERROR_STATE();
  2464. for (i = 0; i < niov; i++) {
  2465. size_t want = iov[i].iov_len;
  2466. uint8_t *data = (void *)iov[i].iov_base;
  2467. /*
  2468. * First, we hold on to the last SEND message we
  2469. * were given and dish out the bytes until we run
  2470. * out of bytes.
  2471. */
  2472. ret = qemu_rdma_fill(rdma, data, want, 0);
  2473. done += ret;
  2474. want -= ret;
  2475. /* Got what we needed, so go to next iovec */
  2476. if (want == 0) {
  2477. continue;
  2478. }
  2479. /* If we got any data so far, then don't wait
  2480. * for more, just return what we have */
  2481. if (done > 0) {
  2482. break;
  2483. }
  2484. /* We've got nothing at all, so lets wait for
  2485. * more to arrive
  2486. */
  2487. ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
  2488. if (ret < 0) {
  2489. rdma->error_state = ret;
  2490. error_setg(errp, "qemu_rdma_exchange_recv returned %d", ret);
  2491. return -1;
  2492. }
  2493. /*
  2494. * SEND was received with new bytes, now try again.
  2495. */
  2496. ret = qemu_rdma_fill(rdma, data, want, 0);
  2497. done += ret;
  2498. want -= ret;
  2499. /* Still didn't get enough, so lets just return */
  2500. if (want) {
  2501. if (done == 0) {
  2502. return QIO_CHANNEL_ERR_BLOCK;
  2503. } else {
  2504. break;
  2505. }
  2506. }
  2507. }
  2508. return done;
  2509. }
  2510. /*
  2511. * Block until all the outstanding chunks have been delivered by the hardware.
  2512. */
  2513. static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
  2514. {
  2515. int ret;
  2516. if (qemu_rdma_write_flush(f, rdma) < 0) {
  2517. return -EIO;
  2518. }
  2519. while (rdma->nb_sent) {
  2520. ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
  2521. if (ret < 0) {
  2522. error_report("rdma migration: complete polling error!");
  2523. return -EIO;
  2524. }
  2525. }
  2526. qemu_rdma_unregister_waiting(rdma);
  2527. return 0;
  2528. }
  2529. static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
  2530. bool blocking,
  2531. Error **errp)
  2532. {
  2533. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2534. /* XXX we should make readv/writev actually honour this :-) */
  2535. rioc->blocking = blocking;
  2536. return 0;
  2537. }
  2538. typedef struct QIOChannelRDMASource QIOChannelRDMASource;
  2539. struct QIOChannelRDMASource {
  2540. GSource parent;
  2541. QIOChannelRDMA *rioc;
  2542. GIOCondition condition;
  2543. };
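/*
 * GSource callbacks: the channel is reported readable whenever a control
 * message is already buffered, and is always reported writable.
 */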
  2544. static gboolean
  2545. qio_channel_rdma_source_prepare(GSource *source,
  2546. gint *timeout)
  2547. {
  2548. QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
  2549. RDMAContext *rdma;
  2550. GIOCondition cond = 0;
  2551. *timeout = -1;
  2552. RCU_READ_LOCK_GUARD();
  2553. if (rsource->condition == G_IO_IN) {
  2554. rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
  2555. } else {
  2556. rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
  2557. }
  2558. if (!rdma) {
  2559. error_report("RDMAContext is NULL when prepare Gsource");
  2560. return FALSE;
  2561. }
  2562. if (rdma->wr_data[0].control_len) {
  2563. cond |= G_IO_IN;
  2564. }
  2565. cond |= G_IO_OUT;
  2566. return cond & rsource->condition;
  2567. }
  2568. static gboolean
  2569. qio_channel_rdma_source_check(GSource *source)
  2570. {
  2571. QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
  2572. RDMAContext *rdma;
  2573. GIOCondition cond = 0;
  2574. RCU_READ_LOCK_GUARD();
  2575. if (rsource->condition == G_IO_IN) {
  2576. rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
  2577. } else {
  2578. rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
  2579. }
  2580. if (!rdma) {
  2581. error_report("RDMAContext is NULL when check Gsource");
  2582. return FALSE;
  2583. }
  2584. if (rdma->wr_data[0].control_len) {
  2585. cond |= G_IO_IN;
  2586. }
  2587. cond |= G_IO_OUT;
  2588. return cond & rsource->condition;
  2589. }
  2590. static gboolean
  2591. qio_channel_rdma_source_dispatch(GSource *source,
  2592. GSourceFunc callback,
  2593. gpointer user_data)
  2594. {
  2595. QIOChannelFunc func = (QIOChannelFunc)callback;
  2596. QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
  2597. RDMAContext *rdma;
  2598. GIOCondition cond = 0;
  2599. RCU_READ_LOCK_GUARD();
  2600. if (rsource->condition == G_IO_IN) {
  2601. rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
  2602. } else {
  2603. rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
  2604. }
  2605. if (!rdma) {
  2606. error_report("RDMAContext is NULL when dispatch Gsource");
  2607. return FALSE;
  2608. }
  2609. if (rdma->wr_data[0].control_len) {
  2610. cond |= G_IO_IN;
  2611. }
  2612. cond |= G_IO_OUT;
  2613. return (*func)(QIO_CHANNEL(rsource->rioc),
  2614. (cond & rsource->condition),
  2615. user_data);
  2616. }
  2617. static void
  2618. qio_channel_rdma_source_finalize(GSource *source)
  2619. {
  2620. QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
  2621. object_unref(OBJECT(ssource->rioc));
  2622. }
  2623. GSourceFuncs qio_channel_rdma_source_funcs = {
  2624. qio_channel_rdma_source_prepare,
  2625. qio_channel_rdma_source_check,
  2626. qio_channel_rdma_source_dispatch,
  2627. qio_channel_rdma_source_finalize
  2628. };
  2629. static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
  2630. GIOCondition condition)
  2631. {
  2632. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2633. QIOChannelRDMASource *ssource;
  2634. GSource *source;
  2635. source = g_source_new(&qio_channel_rdma_source_funcs,
  2636. sizeof(QIOChannelRDMASource));
  2637. ssource = (QIOChannelRDMASource *)source;
  2638. ssource->rioc = rioc;
  2639. object_ref(OBJECT(rioc));
  2640. ssource->condition = condition;
  2641. return source;
  2642. }
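/*
 * Attach the send/recv completion channel fds of the incoming context (for
 * reads) or the outgoing context (for writes) to the given AioContext so
 * that completions wake up the handlers.
 */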
  2643. static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
  2644. AioContext *ctx,
  2645. IOHandler *io_read,
  2646. IOHandler *io_write,
  2647. void *opaque)
  2648. {
  2649. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2650. if (io_read) {
  2651. aio_set_fd_handler(ctx, rioc->rdmain->recv_comp_channel->fd,
  2652. false, io_read, io_write, NULL, NULL, opaque);
  2653. aio_set_fd_handler(ctx, rioc->rdmain->send_comp_channel->fd,
  2654. false, io_read, io_write, NULL, NULL, opaque);
  2655. } else {
  2656. aio_set_fd_handler(ctx, rioc->rdmaout->recv_comp_channel->fd,
  2657. false, io_read, io_write, NULL, NULL, opaque);
  2658. aio_set_fd_handler(ctx, rioc->rdmaout->send_comp_channel->fd,
  2659. false, io_read, io_write, NULL, NULL, opaque);
  2660. }
  2661. }
  2662. struct rdma_close_rcu {
  2663. struct rcu_head rcu;
  2664. RDMAContext *rdmain;
  2665. RDMAContext *rdmaout;
  2666. };
  2667. /* callback from qio_channel_rdma_close via call_rcu */
  2668. static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
  2669. {
  2670. if (rcu->rdmain) {
  2671. qemu_rdma_cleanup(rcu->rdmain);
  2672. }
  2673. if (rcu->rdmaout) {
  2674. qemu_rdma_cleanup(rcu->rdmaout);
  2675. }
  2676. g_free(rcu->rdmain);
  2677. g_free(rcu->rdmaout);
  2678. g_free(rcu);
  2679. }
  2680. static int qio_channel_rdma_close(QIOChannel *ioc,
  2681. Error **errp)
  2682. {
  2683. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2684. RDMAContext *rdmain, *rdmaout;
  2685. struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
  2686. trace_qemu_rdma_close();
  2687. rdmain = rioc->rdmain;
  2688. if (rdmain) {
  2689. qatomic_rcu_set(&rioc->rdmain, NULL);
  2690. }
  2691. rdmaout = rioc->rdmaout;
  2692. if (rdmaout) {
  2693. qatomic_rcu_set(&rioc->rdmaout, NULL);
  2694. }
  2695. rcu->rdmain = rdmain;
  2696. rcu->rdmaout = rdmaout;
  2697. call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
  2698. return 0;
  2699. }
  2700. static int
  2701. qio_channel_rdma_shutdown(QIOChannel *ioc,
  2702. QIOChannelShutdown how,
  2703. Error **errp)
  2704. {
  2705. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2706. RDMAContext *rdmain, *rdmaout;
  2707. RCU_READ_LOCK_GUARD();
  2708. rdmain = qatomic_rcu_read(&rioc->rdmain);
2709. rdmaout = qatomic_rcu_read(&rioc->rdmaout);
  2710. switch (how) {
  2711. case QIO_CHANNEL_SHUTDOWN_READ:
  2712. if (rdmain) {
  2713. rdmain->error_state = -1;
  2714. }
  2715. break;
  2716. case QIO_CHANNEL_SHUTDOWN_WRITE:
  2717. if (rdmaout) {
  2718. rdmaout->error_state = -1;
  2719. }
  2720. break;
  2721. case QIO_CHANNEL_SHUTDOWN_BOTH:
  2722. default:
  2723. if (rdmain) {
  2724. rdmain->error_state = -1;
  2725. }
  2726. if (rdmaout) {
  2727. rdmaout->error_state = -1;
  2728. }
  2729. break;
  2730. }
  2731. return 0;
  2732. }
  2733. /*
  2734. * Parameters:
  2735. * @offset == 0 :
  2736. * This means that 'block_offset' is a full virtual address that does not
  2737. * belong to a RAMBlock of the virtual machine and instead
  2738. * represents a private malloc'd memory area that the caller wishes to
  2739. * transfer.
  2740. *
  2741. * @offset != 0 :
  2742. * Offset is an offset to be added to block_offset and used
  2743. * to also lookup the corresponding RAMBlock.
  2744. *
  2745. * @size : Number of bytes to transfer
  2746. *
2747. * @bytes_sent : User-specified pointer to indicate how many bytes were
  2748. * sent. Usually, this will not be more than a few bytes of
  2749. * the protocol because most transfers are sent asynchronously.
  2750. */
  2751. static size_t qemu_rdma_save_page(QEMUFile *f,
  2752. ram_addr_t block_offset, ram_addr_t offset,
  2753. size_t size, uint64_t *bytes_sent)
  2754. {
  2755. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
  2756. RDMAContext *rdma;
  2757. int ret;
  2758. RCU_READ_LOCK_GUARD();
  2759. rdma = qatomic_rcu_read(&rioc->rdmaout);
  2760. if (!rdma) {
  2761. return -EIO;
  2762. }
  2763. CHECK_ERROR_STATE();
  2764. if (migration_in_postcopy()) {
  2765. return RAM_SAVE_CONTROL_NOT_SUPP;
  2766. }
  2767. qemu_fflush(f);
  2768. /*
  2769. * Add this page to the current 'chunk'. If the chunk
  2770. * is full, or the page doesn't belong to the current chunk,
  2771. * an actual RDMA write will occur and a new chunk will be formed.
  2772. */
  2773. ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
  2774. if (ret < 0) {
  2775. error_report("rdma migration: write error! %d", ret);
  2776. goto err;
  2777. }
  2778. /*
2779. * We always return 1 byte because the RDMA
  2780. * protocol is completely asynchronous. We do not yet know
  2781. * whether an identified chunk is zero or not because we're
  2782. * waiting for other pages to potentially be merged with
  2783. * the current chunk. So, we have to call qemu_update_position()
  2784. * later on when the actual write occurs.
  2785. */
  2786. if (bytes_sent) {
  2787. *bytes_sent = 1;
  2788. }
  2789. /*
  2790. * Drain the Completion Queue if possible, but do not block,
  2791. * just poll.
  2792. *
  2793. * If nothing to poll, the end of the iteration will do this
  2794. * again to make sure we don't overflow the request queue.
  2795. */
  2796. while (1) {
  2797. uint64_t wr_id, wr_id_in;
  2798. int ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
  2799. if (ret < 0) {
  2800. error_report("rdma migration: polling error! %d", ret);
  2801. goto err;
  2802. }
  2803. wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
  2804. if (wr_id == RDMA_WRID_NONE) {
  2805. break;
  2806. }
  2807. }
  2808. while (1) {
  2809. uint64_t wr_id, wr_id_in;
  2810. int ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
  2811. if (ret < 0) {
  2812. error_report("rdma migration: polling error! %d", ret);
  2813. goto err;
  2814. }
  2815. wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
  2816. if (wr_id == RDMA_WRID_NONE) {
  2817. break;
  2818. }
  2819. }
  2820. return RAM_SAVE_CONTROL_DELAYED;
  2821. err:
  2822. rdma->error_state = ret;
  2823. return ret;
  2824. }
  2825. static void rdma_accept_incoming_migration(void *opaque);
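/*
 * Poll the connection-manager event channel on the destination.  A
 * DISCONNECTED or DEVICE_REMOVAL event seen before the migration has
 * completed marks the contexts with -EPIPE and wakes the incoming
 * migration coroutine so it can notice the failure.
 */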
  2826. static void rdma_cm_poll_handler(void *opaque)
  2827. {
  2828. RDMAContext *rdma = opaque;
  2829. int ret;
  2830. struct rdma_cm_event *cm_event;
  2831. MigrationIncomingState *mis = migration_incoming_get_current();
  2832. ret = rdma_get_cm_event(rdma->channel, &cm_event);
  2833. if (ret) {
  2834. error_report("get_cm_event failed %d", errno);
  2835. return;
  2836. }
  2837. if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
  2838. cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
  2839. if (!rdma->error_state &&
  2840. migration_incoming_get_current()->state !=
  2841. MIGRATION_STATUS_COMPLETED) {
  2842. error_report("receive cm event, cm event is %d", cm_event->event);
  2843. rdma->error_state = -EPIPE;
  2844. if (rdma->return_path) {
  2845. rdma->return_path->error_state = -EPIPE;
  2846. }
  2847. }
  2848. rdma_ack_cm_event(cm_event);
  2849. if (mis->migration_incoming_co) {
  2850. qemu_coroutine_enter(mis->migration_incoming_co);
  2851. }
  2852. return;
  2853. }
  2854. rdma_ack_cm_event(cm_event);
  2855. }
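/*
 * Destination side of connection setup: wait for the source's
 * CONNECT_REQUEST, negotiate the capability flags carried in its private
 * data, allocate the protection domain, completion queues and queue pair,
 * register the control buffers, then accept the connection, wait for
 * ESTABLISHED and post the first control receive.  When postcopy or a
 * return path is in use, the listening fd is re-armed so a second
 * connection can be accepted for the return path.
 */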
  2856. static int qemu_rdma_accept(RDMAContext *rdma)
  2857. {
  2858. RDMACapabilities cap;
  2859. struct rdma_conn_param conn_param = {
  2860. .responder_resources = 2,
  2861. .private_data = &cap,
  2862. .private_data_len = sizeof(cap),
  2863. };
  2864. RDMAContext *rdma_return_path = NULL;
  2865. struct rdma_cm_event *cm_event;
  2866. struct ibv_context *verbs;
  2867. int ret = -EINVAL;
  2868. int idx;
  2869. ret = rdma_get_cm_event(rdma->channel, &cm_event);
  2870. if (ret) {
  2871. goto err_rdma_dest_wait;
  2872. }
  2873. if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
  2874. rdma_ack_cm_event(cm_event);
  2875. goto err_rdma_dest_wait;
  2876. }
  2877. /*
2878. * Initialize the RDMAContext for the return path (used by postcopy)
2879. * once the first connection request has arrived.
  2880. */
  2881. if ((migrate_postcopy() || migrate_use_return_path())
  2882. && !rdma->is_return_path) {
  2883. rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL);
  2884. if (rdma_return_path == NULL) {
  2885. rdma_ack_cm_event(cm_event);
  2886. goto err_rdma_dest_wait;
  2887. }
  2888. qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
  2889. }
  2890. memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
  2891. network_to_caps(&cap);
  2892. if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
  2893. error_report("Unknown source RDMA version: %d, bailing...",
  2894. cap.version);
  2895. rdma_ack_cm_event(cm_event);
  2896. goto err_rdma_dest_wait;
  2897. }
  2898. /*
  2899. * Respond with only the capabilities this version of QEMU knows about.
  2900. */
  2901. cap.flags &= known_capabilities;
  2902. /*
  2903. * Enable the ones that we do know about.
  2904. * Add other checks here as new ones are introduced.
  2905. */
  2906. if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
  2907. rdma->pin_all = true;
  2908. }
  2909. rdma->cm_id = cm_event->id;
  2910. verbs = cm_event->id->verbs;
  2911. rdma_ack_cm_event(cm_event);
  2912. trace_qemu_rdma_accept_pin_state(rdma->pin_all);
  2913. caps_to_network(&cap);
  2914. trace_qemu_rdma_accept_pin_verbsc(verbs);
  2915. if (!rdma->verbs) {
  2916. rdma->verbs = verbs;
  2917. } else if (rdma->verbs != verbs) {
  2918. error_report("ibv context not matching %p, %p!", rdma->verbs,
  2919. verbs);
  2920. goto err_rdma_dest_wait;
  2921. }
  2922. qemu_rdma_dump_id("dest_init", verbs);
  2923. ret = qemu_rdma_alloc_pd_cq(rdma);
  2924. if (ret) {
  2925. error_report("rdma migration: error allocating pd and cq!");
  2926. goto err_rdma_dest_wait;
  2927. }
  2928. ret = qemu_rdma_alloc_qp(rdma);
  2929. if (ret) {
  2930. error_report("rdma migration: error allocating qp!");
  2931. goto err_rdma_dest_wait;
  2932. }
  2933. ret = qemu_rdma_init_ram_blocks(rdma);
  2934. if (ret) {
  2935. error_report("rdma migration: error initializing ram blocks!");
  2936. goto err_rdma_dest_wait;
  2937. }
  2938. for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
  2939. ret = qemu_rdma_reg_control(rdma, idx);
  2940. if (ret) {
  2941. error_report("rdma: error registering %d control", idx);
  2942. goto err_rdma_dest_wait;
  2943. }
  2944. }
  2945. /* Accept the second connection request for return path */
  2946. if ((migrate_postcopy() || migrate_use_return_path())
  2947. && !rdma->is_return_path) {
  2948. qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
  2949. NULL,
  2950. (void *)(intptr_t)rdma->return_path);
  2951. } else {
  2952. qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
  2953. NULL, rdma);
  2954. }
  2955. ret = rdma_accept(rdma->cm_id, &conn_param);
  2956. if (ret) {
  2957. error_report("rdma_accept returns %d", ret);
  2958. goto err_rdma_dest_wait;
  2959. }
  2960. ret = rdma_get_cm_event(rdma->channel, &cm_event);
  2961. if (ret) {
  2962. error_report("rdma_accept get_cm_event failed %d", ret);
  2963. goto err_rdma_dest_wait;
  2964. }
  2965. if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
  2966. error_report("rdma_accept not event established");
  2967. rdma_ack_cm_event(cm_event);
  2968. goto err_rdma_dest_wait;
  2969. }
  2970. rdma_ack_cm_event(cm_event);
  2971. rdma->connected = true;
  2972. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
  2973. if (ret) {
  2974. error_report("rdma migration: error posting second control recv");
  2975. goto err_rdma_dest_wait;
  2976. }
  2977. qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
  2978. return 0;
  2979. err_rdma_dest_wait:
  2980. rdma->error_state = ret;
  2981. qemu_rdma_cleanup(rdma);
  2982. g_free(rdma_return_path);
  2983. return ret;
  2984. }
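/*
 * qsort() comparator: order the local RAMBlock list by the src_index
 * assigned from the source's block notifications, so both sides end up
 * indexing blocks identically.
 */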
  2985. static int dest_ram_sort_func(const void *a, const void *b)
  2986. {
  2987. unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
  2988. unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
  2989. return (a_index < b_index) ? -1 : (a_index != b_index);
  2990. }
  2991. /*
  2992. * During each iteration of the migration, we listen for instructions
2993. * from the source VM to perform dynamic page registrations before it
  2994. * can perform RDMA operations.
  2995. *
  2996. * We respond with the 'rkey'.
  2997. *
  2998. * Keep doing this until the source tells us to stop.
  2999. */
  3000. static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
  3001. {
  3002. RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
  3003. .type = RDMA_CONTROL_REGISTER_RESULT,
  3004. .repeat = 0,
  3005. };
  3006. RDMAControlHeader unreg_resp = { .len = 0,
  3007. .type = RDMA_CONTROL_UNREGISTER_FINISHED,
  3008. .repeat = 0,
  3009. };
  3010. RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
  3011. .repeat = 1 };
  3012. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
  3013. RDMAContext *rdma;
  3014. RDMALocalBlocks *local;
  3015. RDMAControlHeader head;
  3016. RDMARegister *reg, *registers;
  3017. RDMACompress *comp;
  3018. RDMARegisterResult *reg_result;
  3019. static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
  3020. RDMALocalBlock *block;
  3021. void *host_addr;
  3022. int ret = 0;
  3023. int idx = 0;
  3024. int count = 0;
  3025. int i = 0;
  3026. RCU_READ_LOCK_GUARD();
  3027. rdma = qatomic_rcu_read(&rioc->rdmain);
  3028. if (!rdma) {
  3029. return -EIO;
  3030. }
  3031. CHECK_ERROR_STATE();
  3032. local = &rdma->local_ram_blocks;
  3033. do {
  3034. trace_qemu_rdma_registration_handle_wait();
  3035. ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
  3036. if (ret < 0) {
  3037. break;
  3038. }
  3039. if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
  3040. error_report("rdma: Too many requests in this message (%d)."
  3041. "Bailing.", head.repeat);
  3042. ret = -EIO;
  3043. break;
  3044. }
  3045. switch (head.type) {
  3046. case RDMA_CONTROL_COMPRESS:
  3047. comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
  3048. network_to_compress(comp);
  3049. trace_qemu_rdma_registration_handle_compress(comp->length,
  3050. comp->block_idx,
  3051. comp->offset);
  3052. if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
  3053. error_report("rdma: 'compress' bad block index %u (vs %d)",
  3054. (unsigned int)comp->block_idx,
  3055. rdma->local_ram_blocks.nb_blocks);
  3056. ret = -EIO;
  3057. goto out;
  3058. }
  3059. block = &(rdma->local_ram_blocks.block[comp->block_idx]);
  3060. host_addr = block->local_host_addr +
  3061. (comp->offset - block->offset);
  3062. ram_handle_compressed(host_addr, comp->value, comp->length);
  3063. break;
  3064. case RDMA_CONTROL_REGISTER_FINISHED:
  3065. trace_qemu_rdma_registration_handle_finished();
  3066. goto out;
  3067. case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
  3068. trace_qemu_rdma_registration_handle_ram_blocks();
3069. /* Sort our local RAM Block list into the same order as the source's;
3070. * we can do this because we filled in each block's src_index as we
3071. * received the RAMBlock list earlier.
3072. */
  3073. qsort(rdma->local_ram_blocks.block,
  3074. rdma->local_ram_blocks.nb_blocks,
  3075. sizeof(RDMALocalBlock), dest_ram_sort_func);
  3076. for (i = 0; i < local->nb_blocks; i++) {
  3077. local->block[i].index = i;
  3078. }
  3079. if (rdma->pin_all) {
  3080. ret = qemu_rdma_reg_whole_ram_blocks(rdma);
  3081. if (ret) {
  3082. error_report("rdma migration: error dest "
  3083. "registering ram blocks");
  3084. goto out;
  3085. }
  3086. }
  3087. /*
  3088. * Dest uses this to prepare to transmit the RAMBlock descriptions
  3089. * to the source VM after connection setup.
  3090. * Both sides use the "remote" structure to communicate and update
  3091. * their "local" descriptions with what was sent.
  3092. */
  3093. for (i = 0; i < local->nb_blocks; i++) {
  3094. rdma->dest_blocks[i].remote_host_addr =
  3095. (uintptr_t)(local->block[i].local_host_addr);
  3096. if (rdma->pin_all) {
  3097. rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
  3098. }
  3099. rdma->dest_blocks[i].offset = local->block[i].offset;
  3100. rdma->dest_blocks[i].length = local->block[i].length;
  3101. dest_block_to_network(&rdma->dest_blocks[i]);
  3102. trace_qemu_rdma_registration_handle_ram_blocks_loop(
  3103. local->block[i].block_name,
  3104. local->block[i].offset,
  3105. local->block[i].length,
  3106. local->block[i].local_host_addr,
  3107. local->block[i].src_index);
  3108. }
  3109. blocks.len = rdma->local_ram_blocks.nb_blocks
  3110. * sizeof(RDMADestBlock);
  3111. ret = qemu_rdma_post_send_control(rdma,
  3112. (uint8_t *) rdma->dest_blocks, &blocks);
  3113. if (ret < 0) {
  3114. error_report("rdma migration: error sending remote info");
  3115. goto out;
  3116. }
  3117. break;
  3118. case RDMA_CONTROL_REGISTER_REQUEST:
  3119. trace_qemu_rdma_registration_handle_register(head.repeat);
  3120. reg_resp.repeat = head.repeat;
  3121. registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
  3122. for (count = 0; count < head.repeat; count++) {
  3123. uint64_t chunk;
  3124. uint8_t *chunk_start, *chunk_end;
  3125. reg = &registers[count];
  3126. network_to_register(reg);
  3127. reg_result = &results[count];
  3128. trace_qemu_rdma_registration_handle_register_loop(count,
  3129. reg->current_index, reg->key.current_addr, reg->chunks);
  3130. if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
  3131. error_report("rdma: 'register' bad block index %u (vs %d)",
  3132. (unsigned int)reg->current_index,
  3133. rdma->local_ram_blocks.nb_blocks);
  3134. ret = -ENOENT;
  3135. goto out;
  3136. }
  3137. block = &(rdma->local_ram_blocks.block[reg->current_index]);
  3138. if (block->is_ram_block) {
  3139. if (block->offset > reg->key.current_addr) {
  3140. error_report("rdma: bad register address for block %s"
  3141. " offset: %" PRIx64 " current_addr: %" PRIx64,
  3142. block->block_name, block->offset,
  3143. reg->key.current_addr);
  3144. ret = -ERANGE;
  3145. goto out;
  3146. }
  3147. host_addr = (block->local_host_addr +
  3148. (reg->key.current_addr - block->offset));
  3149. chunk = ram_chunk_index(block->local_host_addr,
  3150. (uint8_t *) host_addr);
  3151. } else {
  3152. chunk = reg->key.chunk;
  3153. host_addr = block->local_host_addr +
  3154. (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
  3155. /* Check for particularly bad chunk value */
  3156. if (host_addr < (void *)block->local_host_addr) {
  3157. error_report("rdma: bad chunk for block %s"
  3158. " chunk: %" PRIx64,
  3159. block->block_name, reg->key.chunk);
  3160. ret = -ERANGE;
  3161. goto out;
  3162. }
  3163. }
  3164. chunk_start = ram_chunk_start(block, chunk);
  3165. chunk_end = ram_chunk_end(block, chunk + reg->chunks);
  3166. /* avoid "-Waddress-of-packed-member" warning */
  3167. uint32_t tmp_rkey = 0;
  3168. if (qemu_rdma_register_and_get_keys(rdma, block,
  3169. (uintptr_t)host_addr, NULL, &tmp_rkey,
  3170. chunk, chunk_start, chunk_end)) {
  3171. error_report("cannot get rkey");
  3172. ret = -EINVAL;
  3173. goto out;
  3174. }
  3175. reg_result->rkey = tmp_rkey;
  3176. reg_result->host_addr = (uintptr_t)block->local_host_addr;
  3177. trace_qemu_rdma_registration_handle_register_rkey(
  3178. reg_result->rkey);
  3179. result_to_network(reg_result);
  3180. }
  3181. ret = qemu_rdma_post_send_control(rdma,
  3182. (uint8_t *) results, &reg_resp);
  3183. if (ret < 0) {
  3184. error_report("Failed to send control buffer");
  3185. goto out;
  3186. }
  3187. break;
  3188. case RDMA_CONTROL_UNREGISTER_REQUEST:
  3189. trace_qemu_rdma_registration_handle_unregister(head.repeat);
  3190. unreg_resp.repeat = head.repeat;
  3191. registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
  3192. for (count = 0; count < head.repeat; count++) {
  3193. reg = &registers[count];
  3194. network_to_register(reg);
  3195. trace_qemu_rdma_registration_handle_unregister_loop(count,
  3196. reg->current_index, reg->key.chunk);
  3197. block = &(rdma->local_ram_blocks.block[reg->current_index]);
  3198. ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
  3199. block->pmr[reg->key.chunk] = NULL;
  3200. if (ret != 0) {
  3201. perror("rdma unregistration chunk failed");
  3202. ret = -ret;
  3203. goto out;
  3204. }
  3205. rdma->total_registrations--;
  3206. trace_qemu_rdma_registration_handle_unregister_success(
  3207. reg->key.chunk);
  3208. }
  3209. ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
  3210. if (ret < 0) {
  3211. error_report("Failed to send control buffer");
  3212. goto out;
  3213. }
  3214. break;
  3215. case RDMA_CONTROL_REGISTER_RESULT:
  3216. error_report("Invalid RESULT message at dest.");
  3217. ret = -EIO;
  3218. goto out;
  3219. default:
  3220. error_report("Unknown control message %s", control_desc(head.type));
  3221. ret = -EIO;
  3222. goto out;
  3223. }
  3224. } while (1);
  3225. out:
  3226. if (ret < 0) {
  3227. rdma->error_state = ret;
  3228. }
  3229. return ret;
  3230. }
  3231. /* Destination:
  3232. * Called via a ram_control_load_hook during the initial RAM load section which
  3233. * lists the RAMBlocks by name. This lets us know the order of the RAMBlocks
  3234. * on the source.
  3235. * We've already built our local RAMBlock list, but not yet sent the list to
  3236. * the source.
  3237. */
  3238. static int
  3239. rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name)
  3240. {
  3241. RDMAContext *rdma;
  3242. int curr;
  3243. int found = -1;
  3244. RCU_READ_LOCK_GUARD();
  3245. rdma = qatomic_rcu_read(&rioc->rdmain);
  3246. if (!rdma) {
  3247. return -EIO;
  3248. }
  3249. /* Find the matching RAMBlock in our local list */
  3250. for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
  3251. if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
  3252. found = curr;
  3253. break;
  3254. }
  3255. }
  3256. if (found == -1) {
  3257. error_report("RAMBlock '%s' not found on destination", name);
  3258. return -ENOENT;
  3259. }
  3260. rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
  3261. trace_rdma_block_notification_handle(name, rdma->next_src_index);
  3262. rdma->next_src_index++;
  3263. return 0;
  3264. }
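/*
 * Dispatch the ram_control_load_hook flags: block-name notifications go
 * to rdma_block_notification_handle(), and RAM_CONTROL_HOOK enters the
 * registration handler loop above.
 */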
  3265. static int rdma_load_hook(QEMUFile *f, uint64_t flags, void *data)
  3266. {
  3267. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
  3268. switch (flags) {
  3269. case RAM_CONTROL_BLOCK_REG:
  3270. return rdma_block_notification_handle(rioc, data);
  3271. case RAM_CONTROL_HOOK:
  3272. return qemu_rdma_registration_handle(f, rioc);
  3273. default:
  3274. /* Shouldn't be called with any other values */
  3275. abort();
  3276. }
  3277. }
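/*
 * Source side, called before each RAM iteration: write RAM_SAVE_FLAG_HOOK
 * into the stream so that the destination's load hook (see
 * rdma_load_hook() above) enters the registration handler.
 */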
  3278. static int qemu_rdma_registration_start(QEMUFile *f,
  3279. uint64_t flags, void *data)
  3280. {
  3281. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
  3282. RDMAContext *rdma;
  3283. RCU_READ_LOCK_GUARD();
  3284. rdma = qatomic_rcu_read(&rioc->rdmaout);
  3285. if (!rdma) {
  3286. return -EIO;
  3287. }
  3288. CHECK_ERROR_STATE();
  3289. if (migration_in_postcopy()) {
  3290. return 0;
  3291. }
  3292. trace_qemu_rdma_registration_start(flags);
  3293. qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
  3294. qemu_fflush(f);
  3295. return 0;
  3296. }
  3297. /*
  3298. * Inform dest that dynamic registrations are done for now.
  3299. * First, flush writes, if any.
  3300. */
  3301. static int qemu_rdma_registration_stop(QEMUFile *f,
  3302. uint64_t flags, void *data)
  3303. {
  3304. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
  3305. RDMAContext *rdma;
  3306. RDMAControlHeader head = { .len = 0, .repeat = 1 };
  3307. int ret = 0;
  3308. RCU_READ_LOCK_GUARD();
  3309. rdma = qatomic_rcu_read(&rioc->rdmaout);
  3310. if (!rdma) {
  3311. return -EIO;
  3312. }
  3313. CHECK_ERROR_STATE();
  3314. if (migration_in_postcopy()) {
  3315. return 0;
  3316. }
  3317. qemu_fflush(f);
  3318. ret = qemu_rdma_drain_cq(f, rdma);
  3319. if (ret < 0) {
  3320. goto err;
  3321. }
  3322. if (flags == RAM_CONTROL_SETUP) {
  3323. RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
  3324. RDMALocalBlocks *local = &rdma->local_ram_blocks;
  3325. int reg_result_idx, i, nb_dest_blocks;
  3326. head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
  3327. trace_qemu_rdma_registration_stop_ram();
  3328. /*
  3329. * Make sure that we parallelize the pinning on both sides.
  3330. * For very large guests, doing this serially takes a really
  3331. * long time, so we have to 'interleave' the pinning locally
  3332. * with the control messages by performing the pinning on this
  3333. * side before we receive the control response from the other
  3334. * side that the pinning has completed.
  3335. */
  3336. ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
  3337. &reg_result_idx, rdma->pin_all ?
  3338. qemu_rdma_reg_whole_ram_blocks : NULL);
  3339. if (ret < 0) {
  3340. fprintf(stderr, "receiving remote info!");
  3341. return ret;
  3342. }
  3343. nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
  3344. /*
  3345. * The protocol uses two different sets of rkeys (mutually exclusive):
  3346. * 1. One key to represent the virtual address of the entire ram block.
  3347. * (dynamic chunk registration disabled - pin everything with one rkey.)
  3348. * 2. One to represent individual chunks within a ram block.
  3349. * (dynamic chunk registration enabled - pin individual chunks.)
  3350. *
  3351. * Once the capability is successfully negotiated, the destination transmits
  3352. * the keys to use (or sends them later) including the virtual addresses
3353. * and then propagates the remote ram block descriptions to its local copy.
  3354. */
  3355. if (local->nb_blocks != nb_dest_blocks) {
  3356. fprintf(stderr, "ram blocks mismatch (Number of blocks %d vs %d) "
  3357. "Your QEMU command line parameters are probably "
  3358. "not identical on both the source and destination.",
  3359. local->nb_blocks, nb_dest_blocks);
  3360. rdma->error_state = -EINVAL;
  3361. return -EINVAL;
  3362. }
  3363. qemu_rdma_move_header(rdma, reg_result_idx, &resp);
  3364. memcpy(rdma->dest_blocks,
  3365. rdma->wr_data[reg_result_idx].control_curr, resp.len);
  3366. for (i = 0; i < nb_dest_blocks; i++) {
  3367. network_to_dest_block(&rdma->dest_blocks[i]);
  3368. /* We require that the blocks are in the same order */
  3369. if (rdma->dest_blocks[i].length != local->block[i].length) {
  3370. fprintf(stderr, "Block %s/%d has a different length %" PRIu64
  3371. "vs %" PRIu64, local->block[i].block_name, i,
  3372. local->block[i].length,
  3373. rdma->dest_blocks[i].length);
  3374. rdma->error_state = -EINVAL;
  3375. return -EINVAL;
  3376. }
  3377. local->block[i].remote_host_addr =
  3378. rdma->dest_blocks[i].remote_host_addr;
  3379. local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
  3380. }
  3381. }
  3382. trace_qemu_rdma_registration_stop(flags);
  3383. head.type = RDMA_CONTROL_REGISTER_FINISHED;
  3384. ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
  3385. if (ret < 0) {
  3386. goto err;
  3387. }
  3388. return 0;
  3389. err:
  3390. rdma->error_state = ret;
  3391. return ret;
  3392. }
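/*
 * Hook tables wired into the QEMUFile: the incoming (read) side only
 * needs the load hook, while the outgoing (write) side brackets each RAM
 * iteration with registration start/stop and overrides page saving with
 * qemu_rdma_save_page().
 */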
  3393. static const QEMUFileHooks rdma_read_hooks = {
  3394. .hook_ram_load = rdma_load_hook,
  3395. };
  3396. static const QEMUFileHooks rdma_write_hooks = {
  3397. .before_ram_iterate = qemu_rdma_registration_start,
  3398. .after_ram_iterate = qemu_rdma_registration_stop,
  3399. .save_page = qemu_rdma_save_page,
  3400. };
  3401. static void qio_channel_rdma_finalize(Object *obj)
  3402. {
  3403. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
  3404. if (rioc->rdmain) {
  3405. qemu_rdma_cleanup(rioc->rdmain);
  3406. g_free(rioc->rdmain);
  3407. rioc->rdmain = NULL;
  3408. }
  3409. if (rioc->rdmaout) {
  3410. qemu_rdma_cleanup(rioc->rdmaout);
  3411. g_free(rioc->rdmaout);
  3412. rioc->rdmaout = NULL;
  3413. }
  3414. }
  3415. static void qio_channel_rdma_class_init(ObjectClass *klass,
  3416. void *class_data G_GNUC_UNUSED)
  3417. {
  3418. QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
  3419. ioc_klass->io_writev = qio_channel_rdma_writev;
  3420. ioc_klass->io_readv = qio_channel_rdma_readv;
  3421. ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
  3422. ioc_klass->io_close = qio_channel_rdma_close;
  3423. ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
  3424. ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
  3425. ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
  3426. }
  3427. static const TypeInfo qio_channel_rdma_info = {
  3428. .parent = TYPE_QIO_CHANNEL,
  3429. .name = TYPE_QIO_CHANNEL_RDMA,
  3430. .instance_size = sizeof(QIOChannelRDMA),
  3431. .instance_finalize = qio_channel_rdma_finalize,
  3432. .class_init = qio_channel_rdma_class_init,
  3433. };
  3434. static void qio_channel_rdma_register_types(void)
  3435. {
  3436. type_register_static(&qio_channel_rdma_info);
  3437. }
  3438. type_init(qio_channel_rdma_register_types);
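/*
 * Wrap an RDMAContext in a QIOChannelRDMA and hand back its QEMUFile.
 * "w" mode produces the outgoing channel (rdmaout is the primary
 * connection, rdmain the return path); "r" mode is the mirror image for
 * the incoming side.
 */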
  3439. static QEMUFile *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
  3440. {
  3441. QIOChannelRDMA *rioc;
  3442. if (qemu_file_mode_is_not_valid(mode)) {
  3443. return NULL;
  3444. }
  3445. rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
  3446. if (mode[0] == 'w') {
  3447. rioc->file = qemu_file_new_output(QIO_CHANNEL(rioc));
  3448. rioc->rdmaout = rdma;
  3449. rioc->rdmain = rdma->return_path;
  3450. qemu_file_set_hooks(rioc->file, &rdma_write_hooks);
  3451. } else {
  3452. rioc->file = qemu_file_new_input(QIO_CHANNEL(rioc));
  3453. rioc->rdmain = rdma;
  3454. rioc->rdmaout = rdma->return_path;
  3455. qemu_file_set_hooks(rioc->file, &rdma_read_hooks);
  3456. }
  3457. return rioc->file;
  3458. }
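/*
 * fd handler on the listening CM channel: accept the connection and, for
 * the primary (non-return-path) connection, open the incoming QEMUFile
 * and kick off migration_fd_process_incoming().  Return-path connections
 * only need the accept itself.
 */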
  3459. static void rdma_accept_incoming_migration(void *opaque)
  3460. {
  3461. RDMAContext *rdma = opaque;
  3462. int ret;
  3463. QEMUFile *f;
  3464. Error *local_err = NULL;
  3465. trace_qemu_rdma_accept_incoming_migration();
  3466. ret = qemu_rdma_accept(rdma);
  3467. if (ret) {
  3468. fprintf(stderr, "RDMA ERROR: Migration initialization failed\n");
  3469. return;
  3470. }
  3471. trace_qemu_rdma_accept_incoming_migration_accepted();
  3472. if (rdma->is_return_path) {
  3473. return;
  3474. }
  3475. f = qemu_fopen_rdma(rdma, "rb");
  3476. if (f == NULL) {
  3477. fprintf(stderr, "RDMA ERROR: could not qemu_fopen_rdma\n");
  3478. qemu_rdma_cleanup(rdma);
  3479. return;
  3480. }
  3481. rdma->migration_started_on_destination = 1;
  3482. migration_fd_process_incoming(f, &local_err);
  3483. if (local_err) {
  3484. error_reportf_err(local_err, "RDMA ERROR:");
  3485. }
  3486. }
  3487. void rdma_start_incoming_migration(const char *host_port, Error **errp)
  3488. {
  3489. int ret;
  3490. RDMAContext *rdma;
  3491. Error *local_err = NULL;
  3492. trace_rdma_start_incoming_migration();
  3493. /* Avoid ram_block_discard_disable(), cannot change during migration. */
  3494. if (ram_block_discard_is_required()) {
  3495. error_setg(errp, "RDMA: cannot disable RAM discard");
  3496. return;
  3497. }
  3498. rdma = qemu_rdma_data_init(host_port, &local_err);
  3499. if (rdma == NULL) {
  3500. goto err;
  3501. }
  3502. ret = qemu_rdma_dest_init(rdma, &local_err);
  3503. if (ret) {
  3504. goto err;
  3505. }
  3506. trace_rdma_start_incoming_migration_after_dest_init();
  3507. ret = rdma_listen(rdma->listen_id, 5);
  3508. if (ret) {
  3509. ERROR(errp, "listening on socket!");
  3510. goto cleanup_rdma;
  3511. }
  3512. trace_rdma_start_incoming_migration_after_rdma_listen();
  3513. qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
  3514. NULL, (void *)(intptr_t)rdma);
  3515. return;
  3516. cleanup_rdma:
  3517. qemu_rdma_cleanup(rdma);
  3518. err:
  3519. error_propagate(errp, local_err);
  3520. if (rdma) {
  3521. g_free(rdma->host);
  3522. g_free(rdma->host_port);
  3523. }
  3524. g_free(rdma);
  3525. }
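/*
 * Entry point for an outgoing RDMA migration: set up and connect the
 * primary RDMAContext and, when postcopy or a return path is enabled,
 * establish a second, separately connected context for the return path
 * before handing the resulting QEMUFile to migrate_fd_connect().
 */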
  3526. void rdma_start_outgoing_migration(void *opaque,
  3527. const char *host_port, Error **errp)
  3528. {
  3529. MigrationState *s = opaque;
  3530. RDMAContext *rdma_return_path = NULL;
  3531. RDMAContext *rdma;
  3532. int ret = 0;
  3533. /* Avoid ram_block_discard_disable(), cannot change during migration. */
  3534. if (ram_block_discard_is_required()) {
  3535. error_setg(errp, "RDMA: cannot disable RAM discard");
  3536. return;
  3537. }
  3538. rdma = qemu_rdma_data_init(host_port, errp);
  3539. if (rdma == NULL) {
  3540. goto err;
  3541. }
  3542. ret = qemu_rdma_source_init(rdma,
  3543. s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
  3544. if (ret) {
  3545. goto err;
  3546. }
  3547. trace_rdma_start_outgoing_migration_after_rdma_source_init();
  3548. ret = qemu_rdma_connect(rdma, errp, false);
  3549. if (ret) {
  3550. goto err;
  3551. }
  3552. /* RDMA postcopy need a separate queue pair for return path */
  3553. if (migrate_postcopy() || migrate_use_return_path()) {
  3554. rdma_return_path = qemu_rdma_data_init(host_port, errp);
  3555. if (rdma_return_path == NULL) {
  3556. goto return_path_err;
  3557. }
  3558. ret = qemu_rdma_source_init(rdma_return_path,
  3559. s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
  3560. if (ret) {
  3561. goto return_path_err;
  3562. }
  3563. ret = qemu_rdma_connect(rdma_return_path, errp, true);
  3564. if (ret) {
  3565. goto return_path_err;
  3566. }
  3567. rdma->return_path = rdma_return_path;
  3568. rdma_return_path->return_path = rdma;
  3569. rdma_return_path->is_return_path = true;
  3570. }
  3571. trace_rdma_start_outgoing_migration_after_rdma_connect();
  3572. s->to_dst_file = qemu_fopen_rdma(rdma, "wb");
  3573. migrate_fd_connect(s, NULL);
  3574. return;
  3575. return_path_err:
  3576. qemu_rdma_cleanup(rdma);
  3577. err:
  3578. g_free(rdma);
  3579. g_free(rdma_return_path);
  3580. }