2
0

rdma.c 125 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943
294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383
238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140
  1. /*
  2. * RDMA protocol and interfaces
  3. *
  4. * Copyright IBM, Corp. 2010-2013
  5. * Copyright Red Hat, Inc. 2015-2016
  6. *
  7. * Authors:
  8. * Michael R. Hines <mrhines@us.ibm.com>
  9. * Jiuxing Liu <jl@us.ibm.com>
  10. * Daniel P. Berrange <berrange@redhat.com>
  11. *
  12. * This work is licensed under the terms of the GNU GPL, version 2 or
  13. * later. See the COPYING file in the top-level directory.
  14. *
  15. */
  16. #include "qemu/osdep.h"
  17. #include "qapi/error.h"
  18. #include "qemu/cutils.h"
  19. #include "rdma.h"
  20. #include "migration.h"
  21. #include "qemu-file.h"
  22. #include "ram.h"
  23. #include "qemu-file-channel.h"
  24. #include "qemu/error-report.h"
  25. #include "qemu/main-loop.h"
  26. #include "qemu/module.h"
  27. #include "qemu/rcu.h"
  28. #include "qemu/sockets.h"
  29. #include "qemu/bitmap.h"
  30. #include "qemu/coroutine.h"
  31. #include "exec/memory.h"
  32. #include <sys/socket.h>
  33. #include <netdb.h>
  34. #include <arpa/inet.h>
  35. #include <rdma/rdma_cma.h>
  36. #include "trace.h"
  37. #include "qom/object.h"
  38. /*
  39. * Print and error on both the Monitor and the Log file.
  40. */
  41. #define ERROR(errp, fmt, ...) \
  42. do { \
  43. fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
  44. if (errp && (*(errp) == NULL)) { \
  45. error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
  46. } \
  47. } while (0)
  48. #define RDMA_RESOLVE_TIMEOUT_MS 10000
  49. /* Do not merge data if larger than this. */
  50. #define RDMA_MERGE_MAX (2 * 1024 * 1024)
  51. #define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
  52. #define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */
  53. /*
  54. * This is only for non-live state being migrated.
  55. * Instead of RDMA_WRITE messages, we use RDMA_SEND
  56. * messages for that state, which requires a different
  57. * delivery design than main memory.
  58. */
  59. #define RDMA_SEND_INCREMENT 32768
  60. /*
  61. * Maximum size infiniband SEND message
  62. */
  63. #define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
  64. #define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096
  65. #define RDMA_CONTROL_VERSION_CURRENT 1
  66. /*
  67. * Capabilities for negotiation.
  68. */
  69. #define RDMA_CAPABILITY_PIN_ALL 0x01
  70. /*
  71. * Add the other flags above to this list of known capabilities
  72. * as they are introduced.
  73. */
  74. static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
  75. #define CHECK_ERROR_STATE() \
  76. do { \
  77. if (rdma->error_state) { \
  78. if (!rdma->error_reported) { \
  79. error_report("RDMA is in an error state waiting migration" \
  80. " to abort!"); \
  81. rdma->error_reported = 1; \
  82. } \
  83. return rdma->error_state; \
  84. } \
  85. } while (0)
  86. /*
  87. * A work request ID is 64-bits and we split up these bits
  88. * into 3 parts:
  89. *
  90. * bits 0-15 : type of control message, 2^16
  91. * bits 16-29: ram block index, 2^14
  92. * bits 30-63: ram block chunk number, 2^34
  93. *
  94. * The last two bit ranges are only used for RDMA writes,
  95. * in order to track their completion and potentially
  96. * also track unregistration status of the message.
  97. */
  98. #define RDMA_WRID_TYPE_SHIFT 0UL
  99. #define RDMA_WRID_BLOCK_SHIFT 16UL
  100. #define RDMA_WRID_CHUNK_SHIFT 30UL
  101. #define RDMA_WRID_TYPE_MASK \
  102. ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)
  103. #define RDMA_WRID_BLOCK_MASK \
  104. (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))
  105. #define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
  106. /*
  107. * RDMA migration protocol:
  108. * 1. RDMA Writes (data messages, i.e. RAM)
  109. * 2. IB Send/Recv (control channel messages)
  110. */
  111. enum {
  112. RDMA_WRID_NONE = 0,
  113. RDMA_WRID_RDMA_WRITE = 1,
  114. RDMA_WRID_SEND_CONTROL = 2000,
  115. RDMA_WRID_RECV_CONTROL = 4000,
  116. };
  117. static const char *wrid_desc[] = {
  118. [RDMA_WRID_NONE] = "NONE",
  119. [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
  120. [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
  121. [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
  122. };
  123. /*
  124. * Work request IDs for IB SEND messages only (not RDMA writes).
  125. * This is used by the migration protocol to transmit
  126. * control messages (such as device state and registration commands)
  127. *
  128. * We could use more WRs, but we have enough for now.
  129. */
  130. enum {
  131. RDMA_WRID_READY = 0,
  132. RDMA_WRID_DATA,
  133. RDMA_WRID_CONTROL,
  134. RDMA_WRID_MAX,
  135. };
  136. /*
  137. * SEND/RECV IB Control Messages.
  138. */
  139. enum {
  140. RDMA_CONTROL_NONE = 0,
  141. RDMA_CONTROL_ERROR,
  142. RDMA_CONTROL_READY, /* ready to receive */
  143. RDMA_CONTROL_QEMU_FILE, /* QEMUFile-transmitted bytes */
  144. RDMA_CONTROL_RAM_BLOCKS_REQUEST, /* RAMBlock synchronization */
  145. RDMA_CONTROL_RAM_BLOCKS_RESULT, /* RAMBlock synchronization */
  146. RDMA_CONTROL_COMPRESS, /* page contains repeat values */
  147. RDMA_CONTROL_REGISTER_REQUEST, /* dynamic page registration */
  148. RDMA_CONTROL_REGISTER_RESULT, /* key to use after registration */
  149. RDMA_CONTROL_REGISTER_FINISHED, /* current iteration finished */
  150. RDMA_CONTROL_UNREGISTER_REQUEST, /* dynamic UN-registration */
  151. RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
  152. };
  153. /*
  154. * Memory and MR structures used to represent an IB Send/Recv work request.
  155. * This is *not* used for RDMA writes, only IB Send/Recv.
  156. */
  157. typedef struct {
  158. uint8_t control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
  159. struct ibv_mr *control_mr; /* registration metadata */
  160. size_t control_len; /* length of the message */
  161. uint8_t *control_curr; /* start of unconsumed bytes */
  162. } RDMAWorkRequestData;
  163. /*
  164. * Negotiate RDMA capabilities during connection-setup time.
  165. */
  166. typedef struct {
  167. uint32_t version;
  168. uint32_t flags;
  169. } RDMACapabilities;
  170. static void caps_to_network(RDMACapabilities *cap)
  171. {
  172. cap->version = htonl(cap->version);
  173. cap->flags = htonl(cap->flags);
  174. }
  175. static void network_to_caps(RDMACapabilities *cap)
  176. {
  177. cap->version = ntohl(cap->version);
  178. cap->flags = ntohl(cap->flags);
  179. }
  180. /*
  181. * Representation of a RAMBlock from an RDMA perspective.
  182. * This is not transmitted, only local.
  183. * This and subsequent structures cannot be linked lists
  184. * because we're using a single IB message to transmit
  185. * the information. It's small anyway, so a list is overkill.
  186. */
  187. typedef struct RDMALocalBlock {
  188. char *block_name;
  189. uint8_t *local_host_addr; /* local virtual address */
  190. uint64_t remote_host_addr; /* remote virtual address */
  191. uint64_t offset;
  192. uint64_t length;
  193. struct ibv_mr **pmr; /* MRs for chunk-level registration */
  194. struct ibv_mr *mr; /* MR for non-chunk-level registration */
  195. uint32_t *remote_keys; /* rkeys for chunk-level registration */
  196. uint32_t remote_rkey; /* rkeys for non-chunk-level registration */
  197. int index; /* which block are we */
  198. unsigned int src_index; /* (Only used on dest) */
  199. bool is_ram_block;
  200. int nb_chunks;
  201. unsigned long *transit_bitmap;
  202. unsigned long *unregister_bitmap;
  203. } RDMALocalBlock;
  204. /*
  205. * Also represents a RAMblock, but only on the dest.
  206. * This gets transmitted by the dest during connection-time
  207. * to the source VM and then is used to populate the
  208. * corresponding RDMALocalBlock with
  209. * the information needed to perform the actual RDMA.
  210. */
  211. typedef struct QEMU_PACKED RDMADestBlock {
  212. uint64_t remote_host_addr;
  213. uint64_t offset;
  214. uint64_t length;
  215. uint32_t remote_rkey;
  216. uint32_t padding;
  217. } RDMADestBlock;
  218. static const char *control_desc(unsigned int rdma_control)
  219. {
  220. static const char *strs[] = {
  221. [RDMA_CONTROL_NONE] = "NONE",
  222. [RDMA_CONTROL_ERROR] = "ERROR",
  223. [RDMA_CONTROL_READY] = "READY",
  224. [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
  225. [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
  226. [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
  227. [RDMA_CONTROL_COMPRESS] = "COMPRESS",
  228. [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
  229. [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
  230. [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
  231. [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
  232. [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
  233. };
  234. if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
  235. return "??BAD CONTROL VALUE??";
  236. }
  237. return strs[rdma_control];
  238. }
  239. static uint64_t htonll(uint64_t v)
  240. {
  241. union { uint32_t lv[2]; uint64_t llv; } u;
  242. u.lv[0] = htonl(v >> 32);
  243. u.lv[1] = htonl(v & 0xFFFFFFFFULL);
  244. return u.llv;
  245. }
  246. static uint64_t ntohll(uint64_t v) {
  247. union { uint32_t lv[2]; uint64_t llv; } u;
  248. u.llv = v;
  249. return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
  250. }
  251. static void dest_block_to_network(RDMADestBlock *db)
  252. {
  253. db->remote_host_addr = htonll(db->remote_host_addr);
  254. db->offset = htonll(db->offset);
  255. db->length = htonll(db->length);
  256. db->remote_rkey = htonl(db->remote_rkey);
  257. }
  258. static void network_to_dest_block(RDMADestBlock *db)
  259. {
  260. db->remote_host_addr = ntohll(db->remote_host_addr);
  261. db->offset = ntohll(db->offset);
  262. db->length = ntohll(db->length);
  263. db->remote_rkey = ntohl(db->remote_rkey);
  264. }
  265. /*
  266. * Virtual address of the above structures used for transmitting
  267. * the RAMBlock descriptions at connection-time.
  268. * This structure is *not* transmitted.
  269. */
  270. typedef struct RDMALocalBlocks {
  271. int nb_blocks;
  272. bool init; /* main memory init complete */
  273. RDMALocalBlock *block;
  274. } RDMALocalBlocks;
  275. /*
  276. * Main data structure for RDMA state.
  277. * While there is only one copy of this structure being allocated right now,
  278. * this is the place where one would start if you wanted to consider
  279. * having more than one RDMA connection open at the same time.
  280. */
  281. typedef struct RDMAContext {
  282. char *host;
  283. int port;
  284. RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
  285. /*
  286. * This is used by *_exchange_send() to figure out whether or not
  287. * the initial "READY" message has already been received or not.
  288. * This is because other functions may potentially poll() and detect
  289. * the READY message before send() does, in which case we need to
  290. * know if it completed.
  291. */
  292. int control_ready_expected;
  293. /* number of outstanding writes */
  294. int nb_sent;
  295. /* store info about current buffer so that we can
  296. merge it with future sends */
  297. uint64_t current_addr;
  298. uint64_t current_length;
  299. /* index of ram block the current buffer belongs to */
  300. int current_index;
  301. /* index of the chunk in the current ram block */
  302. int current_chunk;
  303. bool pin_all;
  304. /*
  305. * infiniband-specific variables for opening the device
  306. * and maintaining connection state and so forth.
  307. *
  308. * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
  309. * cm_id->verbs, cm_id->channel, and cm_id->qp.
  310. */
  311. struct rdma_cm_id *cm_id; /* connection manager ID */
  312. struct rdma_cm_id *listen_id;
  313. bool connected;
  314. struct ibv_context *verbs;
  315. struct rdma_event_channel *channel;
  316. struct ibv_qp *qp; /* queue pair */
  317. struct ibv_comp_channel *comp_channel; /* completion channel */
  318. struct ibv_pd *pd; /* protection domain */
  319. struct ibv_cq *cq; /* completion queue */
  320. /*
  321. * If a previous write failed (perhaps because of a failed
  322. * memory registration, then do not attempt any future work
  323. * and remember the error state.
  324. */
  325. int error_state;
  326. int error_reported;
  327. int received_error;
  328. /*
  329. * Description of ram blocks used throughout the code.
  330. */
  331. RDMALocalBlocks local_ram_blocks;
  332. RDMADestBlock *dest_blocks;
  333. /* Index of the next RAMBlock received during block registration */
  334. unsigned int next_src_index;
  335. /*
  336. * Migration on *destination* started.
  337. * Then use coroutine yield function.
  338. * Source runs in a thread, so we don't care.
  339. */
  340. int migration_started_on_destination;
  341. int total_registrations;
  342. int total_writes;
  343. int unregister_current, unregister_next;
  344. uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];
  345. GHashTable *blockmap;
  346. /* the RDMAContext for return path */
  347. struct RDMAContext *return_path;
  348. bool is_return_path;
  349. } RDMAContext;
  350. #define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
  351. OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)
  352. struct QIOChannelRDMA {
  353. QIOChannel parent;
  354. RDMAContext *rdmain;
  355. RDMAContext *rdmaout;
  356. QEMUFile *file;
  357. bool blocking; /* XXX we don't actually honour this yet */
  358. };
  359. /*
  360. * Main structure for IB Send/Recv control messages.
  361. * This gets prepended at the beginning of every Send/Recv.
  362. */
  363. typedef struct QEMU_PACKED {
  364. uint32_t len; /* Total length of data portion */
  365. uint32_t type; /* which control command to perform */
  366. uint32_t repeat; /* number of commands in data portion of same type */
  367. uint32_t padding;
  368. } RDMAControlHeader;
  369. static void control_to_network(RDMAControlHeader *control)
  370. {
  371. control->type = htonl(control->type);
  372. control->len = htonl(control->len);
  373. control->repeat = htonl(control->repeat);
  374. }
  375. static void network_to_control(RDMAControlHeader *control)
  376. {
  377. control->type = ntohl(control->type);
  378. control->len = ntohl(control->len);
  379. control->repeat = ntohl(control->repeat);
  380. }
  381. /*
  382. * Register a single Chunk.
  383. * Information sent by the source VM to inform the dest
  384. * to register an single chunk of memory before we can perform
  385. * the actual RDMA operation.
  386. */
  387. typedef struct QEMU_PACKED {
  388. union QEMU_PACKED {
  389. uint64_t current_addr; /* offset into the ram_addr_t space */
  390. uint64_t chunk; /* chunk to lookup if unregistering */
  391. } key;
  392. uint32_t current_index; /* which ramblock the chunk belongs to */
  393. uint32_t padding;
  394. uint64_t chunks; /* how many sequential chunks to register */
  395. } RDMARegister;
  396. static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
  397. {
  398. RDMALocalBlock *local_block;
  399. local_block = &rdma->local_ram_blocks.block[reg->current_index];
  400. if (local_block->is_ram_block) {
  401. /*
  402. * current_addr as passed in is an address in the local ram_addr_t
  403. * space, we need to translate this for the destination
  404. */
  405. reg->key.current_addr -= local_block->offset;
  406. reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
  407. }
  408. reg->key.current_addr = htonll(reg->key.current_addr);
  409. reg->current_index = htonl(reg->current_index);
  410. reg->chunks = htonll(reg->chunks);
  411. }
  412. static void network_to_register(RDMARegister *reg)
  413. {
  414. reg->key.current_addr = ntohll(reg->key.current_addr);
  415. reg->current_index = ntohl(reg->current_index);
  416. reg->chunks = ntohll(reg->chunks);
  417. }
  418. typedef struct QEMU_PACKED {
  419. uint32_t value; /* if zero, we will madvise() */
  420. uint32_t block_idx; /* which ram block index */
  421. uint64_t offset; /* Address in remote ram_addr_t space */
  422. uint64_t length; /* length of the chunk */
  423. } RDMACompress;
  424. static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
  425. {
  426. comp->value = htonl(comp->value);
  427. /*
  428. * comp->offset as passed in is an address in the local ram_addr_t
  429. * space, we need to translate this for the destination
  430. */
  431. comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
  432. comp->offset += rdma->dest_blocks[comp->block_idx].offset;
  433. comp->block_idx = htonl(comp->block_idx);
  434. comp->offset = htonll(comp->offset);
  435. comp->length = htonll(comp->length);
  436. }
  437. static void network_to_compress(RDMACompress *comp)
  438. {
  439. comp->value = ntohl(comp->value);
  440. comp->block_idx = ntohl(comp->block_idx);
  441. comp->offset = ntohll(comp->offset);
  442. comp->length = ntohll(comp->length);
  443. }
  444. /*
  445. * The result of the dest's memory registration produces an "rkey"
  446. * which the source VM must reference in order to perform
  447. * the RDMA operation.
  448. */
  449. typedef struct QEMU_PACKED {
  450. uint32_t rkey;
  451. uint32_t padding;
  452. uint64_t host_addr;
  453. } RDMARegisterResult;
  454. static void result_to_network(RDMARegisterResult *result)
  455. {
  456. result->rkey = htonl(result->rkey);
  457. result->host_addr = htonll(result->host_addr);
  458. };
  459. static void network_to_result(RDMARegisterResult *result)
  460. {
  461. result->rkey = ntohl(result->rkey);
  462. result->host_addr = ntohll(result->host_addr);
  463. };
  464. const char *print_wrid(int wrid);
  465. static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
  466. uint8_t *data, RDMAControlHeader *resp,
  467. int *resp_idx,
  468. int (*callback)(RDMAContext *rdma));
  469. static inline uint64_t ram_chunk_index(const uint8_t *start,
  470. const uint8_t *host)
  471. {
  472. return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
  473. }
  474. static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
  475. uint64_t i)
  476. {
  477. return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
  478. (i << RDMA_REG_CHUNK_SHIFT));
  479. }
  480. static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
  481. uint64_t i)
  482. {
  483. uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
  484. (1UL << RDMA_REG_CHUNK_SHIFT);
  485. if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
  486. result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
  487. }
  488. return result;
  489. }
  490. static int rdma_add_block(RDMAContext *rdma, const char *block_name,
  491. void *host_addr,
  492. ram_addr_t block_offset, uint64_t length)
  493. {
  494. RDMALocalBlocks *local = &rdma->local_ram_blocks;
  495. RDMALocalBlock *block;
  496. RDMALocalBlock *old = local->block;
  497. local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);
  498. if (local->nb_blocks) {
  499. int x;
  500. if (rdma->blockmap) {
  501. for (x = 0; x < local->nb_blocks; x++) {
  502. g_hash_table_remove(rdma->blockmap,
  503. (void *)(uintptr_t)old[x].offset);
  504. g_hash_table_insert(rdma->blockmap,
  505. (void *)(uintptr_t)old[x].offset,
  506. &local->block[x]);
  507. }
  508. }
  509. memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
  510. g_free(old);
  511. }
  512. block = &local->block[local->nb_blocks];
  513. block->block_name = g_strdup(block_name);
  514. block->local_host_addr = host_addr;
  515. block->offset = block_offset;
  516. block->length = length;
  517. block->index = local->nb_blocks;
  518. block->src_index = ~0U; /* Filled in by the receipt of the block list */
  519. block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
  520. block->transit_bitmap = bitmap_new(block->nb_chunks);
  521. bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
  522. block->unregister_bitmap = bitmap_new(block->nb_chunks);
  523. bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
  524. block->remote_keys = g_new0(uint32_t, block->nb_chunks);
  525. block->is_ram_block = local->init ? false : true;
  526. if (rdma->blockmap) {
  527. g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
  528. }
  529. trace_rdma_add_block(block_name, local->nb_blocks,
  530. (uintptr_t) block->local_host_addr,
  531. block->offset, block->length,
  532. (uintptr_t) (block->local_host_addr + block->length),
  533. BITS_TO_LONGS(block->nb_chunks) *
  534. sizeof(unsigned long) * 8,
  535. block->nb_chunks);
  536. local->nb_blocks++;
  537. return 0;
  538. }
  539. /*
  540. * Memory regions need to be registered with the device and queue pairs setup
  541. * in advanced before the migration starts. This tells us where the RAM blocks
  542. * are so that we can register them individually.
  543. */
  544. static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
  545. {
  546. const char *block_name = qemu_ram_get_idstr(rb);
  547. void *host_addr = qemu_ram_get_host_addr(rb);
  548. ram_addr_t block_offset = qemu_ram_get_offset(rb);
  549. ram_addr_t length = qemu_ram_get_used_length(rb);
  550. return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
  551. }
  552. /*
  553. * Identify the RAMBlocks and their quantity. They will be references to
  554. * identify chunk boundaries inside each RAMBlock and also be referenced
  555. * during dynamic page registration.
  556. */
  557. static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
  558. {
  559. RDMALocalBlocks *local = &rdma->local_ram_blocks;
  560. int ret;
  561. assert(rdma->blockmap == NULL);
  562. memset(local, 0, sizeof *local);
  563. ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
  564. if (ret) {
  565. return ret;
  566. }
  567. trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
  568. rdma->dest_blocks = g_new0(RDMADestBlock,
  569. rdma->local_ram_blocks.nb_blocks);
  570. local->init = true;
  571. return 0;
  572. }
  573. /*
  574. * Note: If used outside of cleanup, the caller must ensure that the destination
  575. * block structures are also updated
  576. */
  577. static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
  578. {
  579. RDMALocalBlocks *local = &rdma->local_ram_blocks;
  580. RDMALocalBlock *old = local->block;
  581. int x;
  582. if (rdma->blockmap) {
  583. g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
  584. }
  585. if (block->pmr) {
  586. int j;
  587. for (j = 0; j < block->nb_chunks; j++) {
  588. if (!block->pmr[j]) {
  589. continue;
  590. }
  591. ibv_dereg_mr(block->pmr[j]);
  592. rdma->total_registrations--;
  593. }
  594. g_free(block->pmr);
  595. block->pmr = NULL;
  596. }
  597. if (block->mr) {
  598. ibv_dereg_mr(block->mr);
  599. rdma->total_registrations--;
  600. block->mr = NULL;
  601. }
  602. g_free(block->transit_bitmap);
  603. block->transit_bitmap = NULL;
  604. g_free(block->unregister_bitmap);
  605. block->unregister_bitmap = NULL;
  606. g_free(block->remote_keys);
  607. block->remote_keys = NULL;
  608. g_free(block->block_name);
  609. block->block_name = NULL;
  610. if (rdma->blockmap) {
  611. for (x = 0; x < local->nb_blocks; x++) {
  612. g_hash_table_remove(rdma->blockmap,
  613. (void *)(uintptr_t)old[x].offset);
  614. }
  615. }
  616. if (local->nb_blocks > 1) {
  617. local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);
  618. if (block->index) {
  619. memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
  620. }
  621. if (block->index < (local->nb_blocks - 1)) {
  622. memcpy(local->block + block->index, old + (block->index + 1),
  623. sizeof(RDMALocalBlock) *
  624. (local->nb_blocks - (block->index + 1)));
  625. for (x = block->index; x < local->nb_blocks - 1; x++) {
  626. local->block[x].index--;
  627. }
  628. }
  629. } else {
  630. assert(block == local->block);
  631. local->block = NULL;
  632. }
  633. trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
  634. block->offset, block->length,
  635. (uintptr_t)(block->local_host_addr + block->length),
  636. BITS_TO_LONGS(block->nb_chunks) *
  637. sizeof(unsigned long) * 8, block->nb_chunks);
  638. g_free(old);
  639. local->nb_blocks--;
  640. if (local->nb_blocks && rdma->blockmap) {
  641. for (x = 0; x < local->nb_blocks; x++) {
  642. g_hash_table_insert(rdma->blockmap,
  643. (void *)(uintptr_t)local->block[x].offset,
  644. &local->block[x]);
  645. }
  646. }
  647. return 0;
  648. }
  649. /*
  650. * Put in the log file which RDMA device was opened and the details
  651. * associated with that device.
  652. */
  653. static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
  654. {
  655. struct ibv_port_attr port;
  656. if (ibv_query_port(verbs, 1, &port)) {
  657. error_report("Failed to query port information");
  658. return;
  659. }
  660. printf("%s RDMA Device opened: kernel name %s "
  661. "uverbs device name %s, "
  662. "infiniband_verbs class device path %s, "
  663. "infiniband class device path %s, "
  664. "transport: (%d) %s\n",
  665. who,
  666. verbs->device->name,
  667. verbs->device->dev_name,
  668. verbs->device->dev_path,
  669. verbs->device->ibdev_path,
  670. port.link_layer,
  671. (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
  672. ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
  673. ? "Ethernet" : "Unknown"));
  674. }
  675. /*
  676. * Put in the log file the RDMA gid addressing information,
  677. * useful for folks who have trouble understanding the
  678. * RDMA device hierarchy in the kernel.
  679. */
  680. static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
  681. {
  682. char sgid[33];
  683. char dgid[33];
  684. inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
  685. inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
  686. trace_qemu_rdma_dump_gid(who, sgid, dgid);
  687. }
  688. /*
  689. * As of now, IPv6 over RoCE / iWARP is not supported by linux.
  690. * We will try the next addrinfo struct, and fail if there are
  691. * no other valid addresses to bind against.
  692. *
  693. * If user is listening on '[::]', then we will not have a opened a device
  694. * yet and have no way of verifying if the device is RoCE or not.
  695. *
  696. * In this case, the source VM will throw an error for ALL types of
  697. * connections (both IPv4 and IPv6) if the destination machine does not have
  698. * a regular infiniband network available for use.
  699. *
  700. * The only way to guarantee that an error is thrown for broken kernels is
  701. * for the management software to choose a *specific* interface at bind time
  702. * and validate what time of hardware it is.
  703. *
  704. * Unfortunately, this puts the user in a fix:
  705. *
  706. * If the source VM connects with an IPv4 address without knowing that the
  707. * destination has bound to '[::]' the migration will unconditionally fail
  708. * unless the management software is explicitly listening on the IPv4
  709. * address while using a RoCE-based device.
  710. *
  711. * If the source VM connects with an IPv6 address, then we're OK because we can
  712. * throw an error on the source (and similarly on the destination).
  713. *
  714. * But in mixed environments, this will be broken for a while until it is fixed
  715. * inside linux.
  716. *
  717. * We do provide a *tiny* bit of help in this function: We can list all of the
  718. * devices in the system and check to see if all the devices are RoCE or
  719. * Infiniband.
  720. *
  721. * If we detect that we have a *pure* RoCE environment, then we can safely
  722. * thrown an error even if the management software has specified '[::]' as the
  723. * bind address.
  724. *
  725. * However, if there is are multiple hetergeneous devices, then we cannot make
  726. * this assumption and the user just has to be sure they know what they are
  727. * doing.
  728. *
  729. * Patches are being reviewed on linux-rdma.
  730. */
  731. static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
  732. {
  733. /* This bug only exists in linux, to our knowledge. */
  734. #ifdef CONFIG_LINUX
  735. struct ibv_port_attr port_attr;
  736. /*
  737. * Verbs are only NULL if management has bound to '[::]'.
  738. *
  739. * Let's iterate through all the devices and see if there any pure IB
  740. * devices (non-ethernet).
  741. *
  742. * If not, then we can safely proceed with the migration.
  743. * Otherwise, there are no guarantees until the bug is fixed in linux.
  744. */
  745. if (!verbs) {
  746. int num_devices, x;
  747. struct ibv_device ** dev_list = ibv_get_device_list(&num_devices);
  748. bool roce_found = false;
  749. bool ib_found = false;
  750. for (x = 0; x < num_devices; x++) {
  751. verbs = ibv_open_device(dev_list[x]);
  752. if (!verbs) {
  753. if (errno == EPERM) {
  754. continue;
  755. } else {
  756. return -EINVAL;
  757. }
  758. }
  759. if (ibv_query_port(verbs, 1, &port_attr)) {
  760. ibv_close_device(verbs);
  761. ERROR(errp, "Could not query initial IB port");
  762. return -EINVAL;
  763. }
  764. if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
  765. ib_found = true;
  766. } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
  767. roce_found = true;
  768. }
  769. ibv_close_device(verbs);
  770. }
  771. if (roce_found) {
  772. if (ib_found) {
  773. fprintf(stderr, "WARN: migrations may fail:"
  774. " IPv6 over RoCE / iWARP in linux"
  775. " is broken. But since you appear to have a"
  776. " mixed RoCE / IB environment, be sure to only"
  777. " migrate over the IB fabric until the kernel "
  778. " fixes the bug.\n");
  779. } else {
  780. ERROR(errp, "You only have RoCE / iWARP devices in your systems"
  781. " and your management software has specified '[::]'"
  782. ", but IPv6 over RoCE / iWARP is not supported in Linux.");
  783. return -ENONET;
  784. }
  785. }
  786. return 0;
  787. }
  788. /*
  789. * If we have a verbs context, that means that some other than '[::]' was
  790. * used by the management software for binding. In which case we can
  791. * actually warn the user about a potentially broken kernel.
  792. */
  793. /* IB ports start with 1, not 0 */
  794. if (ibv_query_port(verbs, 1, &port_attr)) {
  795. ERROR(errp, "Could not query initial IB port");
  796. return -EINVAL;
  797. }
  798. if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
  799. ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
  800. "(but patches on linux-rdma in progress)");
  801. return -ENONET;
  802. }
  803. #endif
  804. return 0;
  805. }
  806. /*
  807. * Figure out which RDMA device corresponds to the requested IP hostname
  808. * Also create the initial connection manager identifiers for opening
  809. * the connection.
  810. */
  811. static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
  812. {
  813. int ret;
  814. struct rdma_addrinfo *res;
  815. char port_str[16];
  816. struct rdma_cm_event *cm_event;
  817. char ip[40] = "unknown";
  818. struct rdma_addrinfo *e;
  819. if (rdma->host == NULL || !strcmp(rdma->host, "")) {
  820. ERROR(errp, "RDMA hostname has not been set");
  821. return -EINVAL;
  822. }
  823. /* create CM channel */
  824. rdma->channel = rdma_create_event_channel();
  825. if (!rdma->channel) {
  826. ERROR(errp, "could not create CM channel");
  827. return -EINVAL;
  828. }
  829. /* create CM id */
  830. ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
  831. if (ret) {
  832. ERROR(errp, "could not create channel id");
  833. goto err_resolve_create_id;
  834. }
  835. snprintf(port_str, 16, "%d", rdma->port);
  836. port_str[15] = '\0';
  837. ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
  838. if (ret < 0) {
  839. ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
  840. goto err_resolve_get_addr;
  841. }
  842. for (e = res; e != NULL; e = e->ai_next) {
  843. inet_ntop(e->ai_family,
  844. &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
  845. trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
  846. ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
  847. RDMA_RESOLVE_TIMEOUT_MS);
  848. if (!ret) {
  849. if (e->ai_family == AF_INET6) {
  850. ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, errp);
  851. if (ret) {
  852. continue;
  853. }
  854. }
  855. goto route;
  856. }
  857. }
  858. ERROR(errp, "could not resolve address %s", rdma->host);
  859. goto err_resolve_get_addr;
  860. route:
  861. qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
  862. ret = rdma_get_cm_event(rdma->channel, &cm_event);
  863. if (ret) {
  864. ERROR(errp, "could not perform event_addr_resolved");
  865. goto err_resolve_get_addr;
  866. }
  867. if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
  868. ERROR(errp, "result not equal to event_addr_resolved %s",
  869. rdma_event_str(cm_event->event));
  870. perror("rdma_resolve_addr");
  871. rdma_ack_cm_event(cm_event);
  872. ret = -EINVAL;
  873. goto err_resolve_get_addr;
  874. }
  875. rdma_ack_cm_event(cm_event);
  876. /* resolve route */
  877. ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
  878. if (ret) {
  879. ERROR(errp, "could not resolve rdma route");
  880. goto err_resolve_get_addr;
  881. }
  882. ret = rdma_get_cm_event(rdma->channel, &cm_event);
  883. if (ret) {
  884. ERROR(errp, "could not perform event_route_resolved");
  885. goto err_resolve_get_addr;
  886. }
  887. if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
  888. ERROR(errp, "result not equal to event_route_resolved: %s",
  889. rdma_event_str(cm_event->event));
  890. rdma_ack_cm_event(cm_event);
  891. ret = -EINVAL;
  892. goto err_resolve_get_addr;
  893. }
  894. rdma_ack_cm_event(cm_event);
  895. rdma->verbs = rdma->cm_id->verbs;
  896. qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
  897. qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
  898. return 0;
  899. err_resolve_get_addr:
  900. rdma_destroy_id(rdma->cm_id);
  901. rdma->cm_id = NULL;
  902. err_resolve_create_id:
  903. rdma_destroy_event_channel(rdma->channel);
  904. rdma->channel = NULL;
  905. return ret;
  906. }
  907. /*
  908. * Create protection domain and completion queues
  909. */
  910. static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
  911. {
  912. /* allocate pd */
  913. rdma->pd = ibv_alloc_pd(rdma->verbs);
  914. if (!rdma->pd) {
  915. error_report("failed to allocate protection domain");
  916. return -1;
  917. }
  918. /* create completion channel */
  919. rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
  920. if (!rdma->comp_channel) {
  921. error_report("failed to allocate completion channel");
  922. goto err_alloc_pd_cq;
  923. }
  924. /*
  925. * Completion queue can be filled by both read and write work requests,
  926. * so must reflect the sum of both possible queue sizes.
  927. */
  928. rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
  929. NULL, rdma->comp_channel, 0);
  930. if (!rdma->cq) {
  931. error_report("failed to allocate completion queue");
  932. goto err_alloc_pd_cq;
  933. }
  934. return 0;
  935. err_alloc_pd_cq:
  936. if (rdma->pd) {
  937. ibv_dealloc_pd(rdma->pd);
  938. }
  939. if (rdma->comp_channel) {
  940. ibv_destroy_comp_channel(rdma->comp_channel);
  941. }
  942. rdma->pd = NULL;
  943. rdma->comp_channel = NULL;
  944. return -1;
  945. }
  946. /*
  947. * Create queue pairs.
  948. */
  949. static int qemu_rdma_alloc_qp(RDMAContext *rdma)
  950. {
  951. struct ibv_qp_init_attr attr = { 0 };
  952. int ret;
  953. attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
  954. attr.cap.max_recv_wr = 3;
  955. attr.cap.max_send_sge = 1;
  956. attr.cap.max_recv_sge = 1;
  957. attr.send_cq = rdma->cq;
  958. attr.recv_cq = rdma->cq;
  959. attr.qp_type = IBV_QPT_RC;
  960. ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
  961. if (ret) {
  962. return -1;
  963. }
  964. rdma->qp = rdma->cm_id->qp;
  965. return 0;
  966. }
  967. static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
  968. {
  969. int i;
  970. RDMALocalBlocks *local = &rdma->local_ram_blocks;
  971. for (i = 0; i < local->nb_blocks; i++) {
  972. local->block[i].mr =
  973. ibv_reg_mr(rdma->pd,
  974. local->block[i].local_host_addr,
  975. local->block[i].length,
  976. IBV_ACCESS_LOCAL_WRITE |
  977. IBV_ACCESS_REMOTE_WRITE
  978. );
  979. if (!local->block[i].mr) {
  980. perror("Failed to register local dest ram block!\n");
  981. break;
  982. }
  983. rdma->total_registrations++;
  984. }
  985. if (i >= local->nb_blocks) {
  986. return 0;
  987. }
  988. for (i--; i >= 0; i--) {
  989. ibv_dereg_mr(local->block[i].mr);
  990. rdma->total_registrations--;
  991. }
  992. return -1;
  993. }
  994. /*
  995. * Find the ram block that corresponds to the page requested to be
  996. * transmitted by QEMU.
  997. *
  998. * Once the block is found, also identify which 'chunk' within that
  999. * block that the page belongs to.
  1000. *
  1001. * This search cannot fail or the migration will fail.
  1002. */
  1003. static int qemu_rdma_search_ram_block(RDMAContext *rdma,
  1004. uintptr_t block_offset,
  1005. uint64_t offset,
  1006. uint64_t length,
  1007. uint64_t *block_index,
  1008. uint64_t *chunk_index)
  1009. {
  1010. uint64_t current_addr = block_offset + offset;
  1011. RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
  1012. (void *) block_offset);
  1013. assert(block);
  1014. assert(current_addr >= block->offset);
  1015. assert((current_addr + length) <= (block->offset + block->length));
  1016. *block_index = block->index;
  1017. *chunk_index = ram_chunk_index(block->local_host_addr,
  1018. block->local_host_addr + (current_addr - block->offset));
  1019. return 0;
  1020. }
  1021. /*
  1022. * Register a chunk with IB. If the chunk was already registered
  1023. * previously, then skip.
  1024. *
  1025. * Also return the keys associated with the registration needed
  1026. * to perform the actual RDMA operation.
  1027. */
  1028. static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
  1029. RDMALocalBlock *block, uintptr_t host_addr,
  1030. uint32_t *lkey, uint32_t *rkey, int chunk,
  1031. uint8_t *chunk_start, uint8_t *chunk_end)
  1032. {
  1033. if (block->mr) {
  1034. if (lkey) {
  1035. *lkey = block->mr->lkey;
  1036. }
  1037. if (rkey) {
  1038. *rkey = block->mr->rkey;
  1039. }
  1040. return 0;
  1041. }
  1042. /* allocate memory to store chunk MRs */
  1043. if (!block->pmr) {
  1044. block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
  1045. }
  1046. /*
  1047. * If 'rkey', then we're the destination, so grant access to the source.
  1048. *
  1049. * If 'lkey', then we're the source VM, so grant access only to ourselves.
  1050. */
  1051. if (!block->pmr[chunk]) {
  1052. uint64_t len = chunk_end - chunk_start;
  1053. trace_qemu_rdma_register_and_get_keys(len, chunk_start);
  1054. block->pmr[chunk] = ibv_reg_mr(rdma->pd,
  1055. chunk_start, len,
  1056. (rkey ? (IBV_ACCESS_LOCAL_WRITE |
  1057. IBV_ACCESS_REMOTE_WRITE) : 0));
  1058. if (!block->pmr[chunk]) {
  1059. perror("Failed to register chunk!");
  1060. fprintf(stderr, "Chunk details: block: %d chunk index %d"
  1061. " start %" PRIuPTR " end %" PRIuPTR
  1062. " host %" PRIuPTR
  1063. " local %" PRIuPTR " registrations: %d\n",
  1064. block->index, chunk, (uintptr_t)chunk_start,
  1065. (uintptr_t)chunk_end, host_addr,
  1066. (uintptr_t)block->local_host_addr,
  1067. rdma->total_registrations);
  1068. return -1;
  1069. }
  1070. rdma->total_registrations++;
  1071. }
  1072. if (lkey) {
  1073. *lkey = block->pmr[chunk]->lkey;
  1074. }
  1075. if (rkey) {
  1076. *rkey = block->pmr[chunk]->rkey;
  1077. }
  1078. return 0;
  1079. }
  1080. /*
  1081. * Register (at connection time) the memory used for control
  1082. * channel messages.
  1083. */
  1084. static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
  1085. {
  1086. rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
  1087. rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
  1088. IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
  1089. if (rdma->wr_data[idx].control_mr) {
  1090. rdma->total_registrations++;
  1091. return 0;
  1092. }
  1093. error_report("qemu_rdma_reg_control failed");
  1094. return -1;
  1095. }
  1096. const char *print_wrid(int wrid)
  1097. {
  1098. if (wrid >= RDMA_WRID_RECV_CONTROL) {
  1099. return wrid_desc[RDMA_WRID_RECV_CONTROL];
  1100. }
  1101. return wrid_desc[wrid];
  1102. }
  1103. /*
  1104. * RDMA requires memory registration (mlock/pinning), but this is not good for
  1105. * overcommitment.
  1106. *
* In preparation for the future where LRU information or workload-specific
* writable working set memory access behavior is available to QEMU,
* it would be nice to have in place the ability to UN-register/UN-pin
* particular memory regions from the RDMA hardware when it is determined that
  1111. * those regions of memory will likely not be accessed again in the near future.
  1112. *
  1113. * While we do not yet have such information right now, the following
  1114. * compile-time option allows us to perform a non-optimized version of this
  1115. * behavior.
  1116. *
  1117. * By uncommenting this option, you will cause *all* RDMA transfers to be
  1118. * unregistered immediately after the transfer completes on both sides of the
  1119. * connection. This has no effect in 'rdma-pin-all' mode, only regular mode.
  1120. *
  1121. * This will have a terrible impact on migration performance, so until future
  1122. * workload information or LRU information is available, do not attempt to use
  1123. * this feature except for basic testing.
  1124. */
  1125. //#define RDMA_UNREGISTRATION_EXAMPLE
  1126. /*
  1127. * Perform a non-optimized memory unregistration after every transfer
  1128. * for demonstration purposes, only if pin-all is not requested.
  1129. *
  1130. * Potential optimizations:
  1131. * 1. Start a new thread to run this function continuously
*    - for bit clearing
*    - and for receipt of unregister messages
  1134. * 2. Use an LRU.
  1135. * 3. Use workload hints.
  1136. */
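/*
 * Rough shape of the exchange driven below: the source sends an
 * RDMA_CONTROL_UNREGISTER_REQUEST carrying an RDMARegister with the block
 * index and chunk number, and blocks in qemu_rdma_exchange_send() until
 * the destination replies with RDMA_CONTROL_UNREGISTER_FINISHED. A chunk
 * that is still marked in transit_bitmap is skipped here and retried the
 * next time a completion for it is seen.
 */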
  1137. static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
  1138. {
  1139. while (rdma->unregistrations[rdma->unregister_current]) {
  1140. int ret;
  1141. uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
  1142. uint64_t chunk =
  1143. (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
  1144. uint64_t index =
  1145. (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
  1146. RDMALocalBlock *block =
  1147. &(rdma->local_ram_blocks.block[index]);
  1148. RDMARegister reg = { .current_index = index };
  1149. RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
  1150. };
  1151. RDMAControlHeader head = { .len = sizeof(RDMARegister),
  1152. .type = RDMA_CONTROL_UNREGISTER_REQUEST,
  1153. .repeat = 1,
  1154. };
  1155. trace_qemu_rdma_unregister_waiting_proc(chunk,
  1156. rdma->unregister_current);
  1157. rdma->unregistrations[rdma->unregister_current] = 0;
  1158. rdma->unregister_current++;
  1159. if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
  1160. rdma->unregister_current = 0;
  1161. }
  1162. /*
  1163. * Unregistration is speculative (because migration is single-threaded
* and we cannot break the protocol's InfiniBand message ordering).
  1165. * Thus, if the memory is currently being used for transmission,
  1166. * then abort the attempt to unregister and try again
  1167. * later the next time a completion is received for this memory.
  1168. */
  1169. clear_bit(chunk, block->unregister_bitmap);
  1170. if (test_bit(chunk, block->transit_bitmap)) {
  1171. trace_qemu_rdma_unregister_waiting_inflight(chunk);
  1172. continue;
  1173. }
  1174. trace_qemu_rdma_unregister_waiting_send(chunk);
  1175. ret = ibv_dereg_mr(block->pmr[chunk]);
  1176. block->pmr[chunk] = NULL;
  1177. block->remote_keys[chunk] = 0;
  1178. if (ret != 0) {
  1179. perror("unregistration chunk failed");
  1180. return -ret;
  1181. }
  1182. rdma->total_registrations--;
  1183. reg.key.chunk = chunk;
  1184. register_to_network(rdma, &reg);
  1185. ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
  1186. &resp, NULL, NULL);
  1187. if (ret < 0) {
  1188. return ret;
  1189. }
  1190. trace_qemu_rdma_unregister_waiting_complete(chunk);
  1191. }
  1192. return 0;
  1193. }
  1194. static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
  1195. uint64_t chunk)
  1196. {
  1197. uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
  1198. result |= (index << RDMA_WRID_BLOCK_SHIFT);
  1199. result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
  1200. return result;
  1201. }
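/*
 * Sketch of the wr_id layout produced above (the shift/mask constants are
 * defined earlier in this file):
 *
 *     wr_id = (chunk << RDMA_WRID_CHUNK_SHIFT)
 *           | (index << RDMA_WRID_BLOCK_SHIFT)
 *           | (type  &  RDMA_WRID_TYPE_MASK)
 *
 * qemu_rdma_poll() reverses this encoding to find the block/chunk whose
 * transit_bitmap bit must be cleared when an RDMA WRITE completes.
 */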
  1202. /*
  1203. * Set bit for unregistration in the next iteration.
  1204. * We cannot transmit right here, but will unpin later.
  1205. */
  1206. static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
  1207. uint64_t chunk, uint64_t wr_id)
  1208. {
  1209. if (rdma->unregistrations[rdma->unregister_next] != 0) {
  1210. error_report("rdma migration: queue is full");
  1211. } else {
  1212. RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
  1213. if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
  1214. trace_qemu_rdma_signal_unregister_append(chunk,
  1215. rdma->unregister_next);
  1216. rdma->unregistrations[rdma->unregister_next++] =
  1217. qemu_rdma_make_wrid(wr_id, index, chunk);
  1218. if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
  1219. rdma->unregister_next = 0;
  1220. }
  1221. } else {
  1222. trace_qemu_rdma_signal_unregister_already(chunk);
  1223. }
  1224. }
  1225. }
  1226. /*
* Poll the completion queue to see if a work request
* (of any kind) has completed.
  1229. * Return the work request ID that completed.
  1230. */
  1231. static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
  1232. uint32_t *byte_len)
  1233. {
  1234. int ret;
  1235. struct ibv_wc wc;
  1236. uint64_t wr_id;
  1237. ret = ibv_poll_cq(rdma->cq, 1, &wc);
  1238. if (!ret) {
  1239. *wr_id_out = RDMA_WRID_NONE;
  1240. return 0;
  1241. }
  1242. if (ret < 0) {
  1243. error_report("ibv_poll_cq return %d", ret);
  1244. return ret;
  1245. }
  1246. wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
  1247. if (wc.status != IBV_WC_SUCCESS) {
  1248. fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
  1249. wc.status, ibv_wc_status_str(wc.status));
  1250. fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);
  1251. return -1;
  1252. }
  1253. if (rdma->control_ready_expected &&
  1254. (wr_id >= RDMA_WRID_RECV_CONTROL)) {
  1255. trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL],
  1256. wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
  1257. rdma->control_ready_expected = 0;
  1258. }
  1259. if (wr_id == RDMA_WRID_RDMA_WRITE) {
  1260. uint64_t chunk =
  1261. (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
  1262. uint64_t index =
  1263. (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
  1264. RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
  1265. trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent,
  1266. index, chunk, block->local_host_addr,
  1267. (void *)(uintptr_t)block->remote_host_addr);
  1268. clear_bit(chunk, block->transit_bitmap);
  1269. if (rdma->nb_sent > 0) {
  1270. rdma->nb_sent--;
  1271. }
  1272. if (!rdma->pin_all) {
  1273. /*
  1274. * FYI: If one wanted to signal a specific chunk to be unregistered
  1275. * using LRU or workload-specific information, this is the function
  1276. * you would call to do so. That chunk would then get asynchronously
  1277. * unregistered later.
  1278. */
  1279. #ifdef RDMA_UNREGISTRATION_EXAMPLE
  1280. qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
  1281. #endif
  1282. }
  1283. } else {
  1284. trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
  1285. }
  1286. *wr_id_out = wc.wr_id;
  1287. if (byte_len) {
  1288. *byte_len = wc.byte_len;
  1289. }
  1290. return 0;
  1291. }
  1292. /* Wait for activity on the completion channel.
* Returns 0 on success, non-zero on error.
  1294. */
  1295. static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
  1296. {
  1297. struct rdma_cm_event *cm_event;
  1298. int ret = -1;
  1299. /*
  1300. * Coroutine doesn't start until migration_fd_process_incoming()
  1301. * so don't yield unless we know we're running inside of a coroutine.
  1302. */
  1303. if (rdma->migration_started_on_destination &&
  1304. migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
  1305. yield_until_fd_readable(rdma->comp_channel->fd);
  1306. } else {
/* This is the source side (we're in a separate thread), or the
 * destination prior to migration_fd_process_incoming(); after
 * postcopy the destination is also in a separate thread.
 * Either way we can't yield, so we have to poll the fd.
 * But we need to be able to handle 'cancel' or an error
 * without hanging forever.
 */
  1314. while (!rdma->error_state && !rdma->received_error) {
  1315. GPollFD pfds[2];
  1316. pfds[0].fd = rdma->comp_channel->fd;
  1317. pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
  1318. pfds[0].revents = 0;
  1319. pfds[1].fd = rdma->channel->fd;
  1320. pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
  1321. pfds[1].revents = 0;
  1322. /* 0.1s timeout, should be fine for a 'cancel' */
  1323. switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
  1324. case 2:
  1325. case 1: /* fd active */
  1326. if (pfds[0].revents) {
  1327. return 0;
  1328. }
  1329. if (pfds[1].revents) {
ret = rdma_get_cm_event(rdma->channel, &cm_event);
if (ret) {
    error_report("failed to get cm event while waiting "
                 "on the completion channel");
    return -EPIPE;
}
error_report("receive cm event while wait comp channel,"
             "cm event is %d", cm_event->event);
if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
    cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
    rdma_ack_cm_event(cm_event);
    return -EPIPE;
}
rdma_ack_cm_event(cm_event);
}
  1341. break;
  1342. case 0: /* Timeout, go around again */
  1343. break;
  1344. default: /* Error of some type -
  1345. * I don't trust errno from qemu_poll_ns
  1346. */
  1347. error_report("%s: poll failed", __func__);
  1348. return -EPIPE;
  1349. }
  1350. if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
  1351. /* Bail out and let the cancellation happen */
  1352. return -EPIPE;
  1353. }
  1354. }
  1355. }
  1356. if (rdma->received_error) {
  1357. return -EPIPE;
  1358. }
  1359. return rdma->error_state;
  1360. }
  1361. /*
  1362. * Block until the next work request has completed.
  1363. *
  1364. * First poll to see if a work request has already completed,
  1365. * otherwise block.
  1366. *
  1367. * If we encounter completed work requests for IDs other than
  1368. * the one we're interested in, then that's generally an error.
  1369. *
  1370. * The only exception is actual RDMA Write completions. These
  1371. * completions only need to be recorded, but do not actually
  1372. * need further processing.
  1373. */
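/*
 * The flow below is, roughly: arm the completion queue with
 * ibv_req_notify_cq(), drain it with qemu_rdma_poll() in case the
 * completion has already arrived, and only then sleep in
 * qemu_rdma_wait_comp_channel() / ibv_get_cq_event() until the next
 * notification, re-arming and re-draining after each wakeup.
 */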
  1374. static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
  1375. uint32_t *byte_len)
  1376. {
  1377. int num_cq_events = 0, ret = 0;
  1378. struct ibv_cq *cq;
  1379. void *cq_ctx;
  1380. uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
  1381. if (ibv_req_notify_cq(rdma->cq, 0)) {
  1382. return -1;
  1383. }
  1384. /* poll cq first */
  1385. while (wr_id != wrid_requested) {
  1386. ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
  1387. if (ret < 0) {
  1388. return ret;
  1389. }
  1390. wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
  1391. if (wr_id == RDMA_WRID_NONE) {
  1392. break;
  1393. }
  1394. if (wr_id != wrid_requested) {
  1395. trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
  1396. wrid_requested, print_wrid(wr_id), wr_id);
  1397. }
  1398. }
  1399. if (wr_id == wrid_requested) {
  1400. return 0;
  1401. }
  1402. while (1) {
  1403. ret = qemu_rdma_wait_comp_channel(rdma);
  1404. if (ret) {
  1405. goto err_block_for_wrid;
  1406. }
  1407. ret = ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx);
  1408. if (ret) {
  1409. perror("ibv_get_cq_event");
  1410. goto err_block_for_wrid;
  1411. }
  1412. num_cq_events++;
  1413. ret = -ibv_req_notify_cq(cq, 0);
  1414. if (ret) {
  1415. goto err_block_for_wrid;
  1416. }
  1417. while (wr_id != wrid_requested) {
  1418. ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
  1419. if (ret < 0) {
  1420. goto err_block_for_wrid;
  1421. }
  1422. wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
  1423. if (wr_id == RDMA_WRID_NONE) {
  1424. break;
  1425. }
  1426. if (wr_id != wrid_requested) {
  1427. trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
  1428. wrid_requested, print_wrid(wr_id), wr_id);
  1429. }
  1430. }
  1431. if (wr_id == wrid_requested) {
  1432. goto success_block_for_wrid;
  1433. }
  1434. }
  1435. success_block_for_wrid:
  1436. if (num_cq_events) {
  1437. ibv_ack_cq_events(cq, num_cq_events);
  1438. }
  1439. return 0;
  1440. err_block_for_wrid:
  1441. if (num_cq_events) {
  1442. ibv_ack_cq_events(cq, num_cq_events);
  1443. }
  1444. rdma->error_state = ret;
  1445. return ret;
  1446. }
  1447. /*
  1448. * Post a SEND message work request for the control channel
  1449. * containing some data and block until the post completes.
  1450. */
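/*
 * On the wire the control buffer is simply:
 *
 *     [ RDMAControlHeader (network byte order) ][ head->len bytes of data ]
 *
 * which is why the SGE length below is head->len + sizeof(RDMAControlHeader)
 * and why the payload is memcpy()'d just past the header.
 */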
  1451. static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
  1452. RDMAControlHeader *head)
  1453. {
  1454. int ret = 0;
  1455. RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
  1456. struct ibv_send_wr *bad_wr;
  1457. struct ibv_sge sge = {
  1458. .addr = (uintptr_t)(wr->control),
  1459. .length = head->len + sizeof(RDMAControlHeader),
  1460. .lkey = wr->control_mr->lkey,
  1461. };
  1462. struct ibv_send_wr send_wr = {
  1463. .wr_id = RDMA_WRID_SEND_CONTROL,
  1464. .opcode = IBV_WR_SEND,
  1465. .send_flags = IBV_SEND_SIGNALED,
  1466. .sg_list = &sge,
  1467. .num_sge = 1,
  1468. };
  1469. trace_qemu_rdma_post_send_control(control_desc(head->type));
  1470. /*
  1471. * We don't actually need to do a memcpy() in here if we used
  1472. * the "sge" properly, but since we're only sending control messages
  1473. * (not RAM in a performance-critical path), then its OK for now.
  1474. *
  1475. * The copy makes the RDMAControlHeader simpler to manipulate
  1476. * for the time being.
  1477. */
  1478. assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
  1479. memcpy(wr->control, head, sizeof(RDMAControlHeader));
  1480. control_to_network((void *) wr->control);
  1481. if (buf) {
  1482. memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
  1483. }
  1484. ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
  1485. if (ret > 0) {
  1486. error_report("Failed to use post IB SEND for control");
  1487. return -ret;
  1488. }
  1489. ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
  1490. if (ret < 0) {
  1491. error_report("rdma migration: send polling control error");
  1492. }
  1493. return ret;
  1494. }
  1495. /*
  1496. * Post a RECV work request in anticipation of some future receipt
  1497. * of data on the control channel.
  1498. */
  1499. static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
  1500. {
  1501. struct ibv_recv_wr *bad_wr;
  1502. struct ibv_sge sge = {
  1503. .addr = (uintptr_t)(rdma->wr_data[idx].control),
  1504. .length = RDMA_CONTROL_MAX_BUFFER,
  1505. .lkey = rdma->wr_data[idx].control_mr->lkey,
  1506. };
  1507. struct ibv_recv_wr recv_wr = {
  1508. .wr_id = RDMA_WRID_RECV_CONTROL + idx,
  1509. .sg_list = &sge,
  1510. .num_sge = 1,
  1511. };
  1512. if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
  1513. return -1;
  1514. }
  1515. return 0;
  1516. }
  1517. /*
  1518. * Block and wait for a RECV control channel message to arrive.
  1519. */
  1520. static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
  1521. RDMAControlHeader *head, int expecting, int idx)
  1522. {
  1523. uint32_t byte_len;
  1524. int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
  1525. &byte_len);
  1526. if (ret < 0) {
  1527. error_report("rdma migration: recv polling control error!");
  1528. return ret;
  1529. }
  1530. network_to_control((void *) rdma->wr_data[idx].control);
  1531. memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
  1532. trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
  1533. if (expecting == RDMA_CONTROL_NONE) {
  1534. trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
  1535. head->type);
  1536. } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
  1537. error_report("Was expecting a %s (%d) control message"
  1538. ", but got: %s (%d), length: %d",
  1539. control_desc(expecting), expecting,
  1540. control_desc(head->type), head->type, head->len);
  1541. if (head->type == RDMA_CONTROL_ERROR) {
  1542. rdma->received_error = true;
  1543. }
  1544. return -EIO;
  1545. }
  1546. if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
  1547. error_report("too long length: %d", head->len);
  1548. return -EINVAL;
  1549. }
  1550. if (sizeof(*head) + head->len != byte_len) {
  1551. error_report("Malformed length: %d byte_len %d", head->len, byte_len);
  1552. return -EINVAL;
  1553. }
  1554. return 0;
  1555. }
  1556. /*
  1557. * When a RECV work request has completed, the work request's
  1558. * buffer is pointed at the header.
  1559. *
* This will advance the pointer into the work request's buffer, to the
* data portion of the control message that was populated after the
* work request finished.
  1563. */
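/*
 * In other words, after a RECV completes, wr_data[idx].control holds
 * [ header ][ data ], and control_curr/control_len are set up so that
 * readers such as qemu_rdma_fill() consume only the data portion.
 */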
  1564. static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
  1565. RDMAControlHeader *head)
  1566. {
  1567. rdma->wr_data[idx].control_len = head->len;
  1568. rdma->wr_data[idx].control_curr =
  1569. rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
  1570. }
  1571. /*
  1572. * This is an 'atomic' high-level operation to deliver a single, unified
  1573. * control-channel message.
  1574. *
  1575. * Additionally, if the user is expecting some kind of reply to this message,
  1576. * they can request a 'resp' response message be filled in by posting an
  1577. * additional work request on behalf of the user and waiting for an additional
  1578. * completion.
  1579. *
* The extra (optional) response is used during registration to save us from
* having to perform an *additional* exchange of messages just to provide a
* response, by instead piggy-backing on the acknowledgement.
  1583. */
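/*
 * A typical use, sketched from what qemu_rdma_write_one() does further
 * down when dynamic registration is required:
 *
 *     RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
 *     RDMAControlHeader head = { .len = sizeof(RDMARegister),
 *                                .type = RDMA_CONTROL_REGISTER_REQUEST,
 *                                .repeat = 1 };
 *     ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
 *                                   &resp, &reg_result_idx, NULL);
 *
 * On success the reply payload is then available at
 * rdma->wr_data[reg_result_idx].control_curr.
 */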
  1584. static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
  1585. uint8_t *data, RDMAControlHeader *resp,
  1586. int *resp_idx,
  1587. int (*callback)(RDMAContext *rdma))
  1588. {
  1589. int ret = 0;
  1590. /*
* Wait until the destination is ready before attempting to deliver
* the message, by first waiting for a READY message.
  1593. */
  1594. if (rdma->control_ready_expected) {
  1595. RDMAControlHeader resp;
  1596. ret = qemu_rdma_exchange_get_response(rdma,
  1597. &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
  1598. if (ret < 0) {
  1599. return ret;
  1600. }
  1601. }
  1602. /*
  1603. * If the user is expecting a response, post a WR in anticipation of it.
  1604. */
  1605. if (resp) {
  1606. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
  1607. if (ret) {
  1608. error_report("rdma migration: error posting"
  1609. " extra control recv for anticipated result!");
  1610. return ret;
  1611. }
  1612. }
  1613. /*
  1614. * Post a WR to replace the one we just consumed for the READY message.
  1615. */
  1616. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
  1617. if (ret) {
  1618. error_report("rdma migration: error posting first control recv!");
  1619. return ret;
  1620. }
  1621. /*
  1622. * Deliver the control message that was requested.
  1623. */
  1624. ret = qemu_rdma_post_send_control(rdma, data, head);
  1625. if (ret < 0) {
  1626. error_report("Failed to send control buffer!");
  1627. return ret;
  1628. }
  1629. /*
  1630. * If we're expecting a response, block and wait for it.
  1631. */
  1632. if (resp) {
  1633. if (callback) {
  1634. trace_qemu_rdma_exchange_send_issue_callback();
  1635. ret = callback(rdma);
  1636. if (ret < 0) {
  1637. return ret;
  1638. }
  1639. }
  1640. trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
  1641. ret = qemu_rdma_exchange_get_response(rdma, resp,
  1642. resp->type, RDMA_WRID_DATA);
  1643. if (ret < 0) {
  1644. return ret;
  1645. }
  1646. qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
  1647. if (resp_idx) {
  1648. *resp_idx = RDMA_WRID_DATA;
  1649. }
  1650. trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
  1651. }
  1652. rdma->control_ready_expected = 1;
  1653. return 0;
  1654. }
  1655. /*
  1656. * This is an 'atomic' high-level operation to receive a single, unified
  1657. * control-channel message.
  1658. */
  1659. static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
  1660. int expecting)
  1661. {
  1662. RDMAControlHeader ready = {
  1663. .len = 0,
  1664. .type = RDMA_CONTROL_READY,
  1665. .repeat = 1,
  1666. };
  1667. int ret;
  1668. /*
  1669. * Inform the source that we're ready to receive a message.
  1670. */
  1671. ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
  1672. if (ret < 0) {
  1673. error_report("Failed to send control buffer!");
  1674. return ret;
  1675. }
  1676. /*
  1677. * Block and wait for the message.
  1678. */
  1679. ret = qemu_rdma_exchange_get_response(rdma, head,
  1680. expecting, RDMA_WRID_READY);
  1681. if (ret < 0) {
  1682. return ret;
  1683. }
  1684. qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
  1685. /*
  1686. * Post a new RECV work request to replace the one we just consumed.
  1687. */
  1688. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
  1689. if (ret) {
  1690. error_report("rdma migration: error posting second control recv!");
  1691. return ret;
  1692. }
  1693. return 0;
  1694. }
  1695. /*
  1696. * Write an actual chunk of memory using RDMA.
  1697. *
  1698. * If we're using dynamic registration on the dest-side, we have to
  1699. * send a registration command first.
  1700. */
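/*
 * Outline of what follows: compute the chunk covering this range; if that
 * chunk is still in flight, wait for its completion; if the chunk's data
 * is entirely zero, send RDMA_CONTROL_COMPRESS instead of transferring it;
 * otherwise (in dynamic registration mode) ask the destination to register
 * the chunk, obtain our own lkey, and finally post an IBV_WR_RDMA_WRITE
 * whose wr_id encodes the block index and chunk number.
 */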
  1701. static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
  1702. int current_index, uint64_t current_addr,
  1703. uint64_t length)
  1704. {
  1705. struct ibv_sge sge;
  1706. struct ibv_send_wr send_wr = { 0 };
  1707. struct ibv_send_wr *bad_wr;
  1708. int reg_result_idx, ret, count = 0;
  1709. uint64_t chunk, chunks;
  1710. uint8_t *chunk_start, *chunk_end;
  1711. RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
  1712. RDMARegister reg;
  1713. RDMARegisterResult *reg_result;
  1714. RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
  1715. RDMAControlHeader head = { .len = sizeof(RDMARegister),
  1716. .type = RDMA_CONTROL_REGISTER_REQUEST,
  1717. .repeat = 1,
  1718. };
  1719. retry:
  1720. sge.addr = (uintptr_t)(block->local_host_addr +
  1721. (current_addr - block->offset));
  1722. sge.length = length;
  1723. chunk = ram_chunk_index(block->local_host_addr,
  1724. (uint8_t *)(uintptr_t)sge.addr);
  1725. chunk_start = ram_chunk_start(block, chunk);
  1726. if (block->is_ram_block) {
  1727. chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
  1728. if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
  1729. chunks--;
  1730. }
  1731. } else {
  1732. chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
  1733. if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
  1734. chunks--;
  1735. }
  1736. }
  1737. trace_qemu_rdma_write_one_top(chunks + 1,
  1738. (chunks + 1) *
  1739. (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
  1740. chunk_end = ram_chunk_end(block, chunk + chunks);
  1741. if (!rdma->pin_all) {
  1742. #ifdef RDMA_UNREGISTRATION_EXAMPLE
  1743. qemu_rdma_unregister_waiting(rdma);
  1744. #endif
  1745. }
  1746. while (test_bit(chunk, block->transit_bitmap)) {
  1747. (void)count;
  1748. trace_qemu_rdma_write_one_block(count++, current_index, chunk,
  1749. sge.addr, length, rdma->nb_sent, block->nb_chunks);
  1750. ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
  1751. if (ret < 0) {
  1752. error_report("Failed to Wait for previous write to complete "
  1753. "block %d chunk %" PRIu64
  1754. " current %" PRIu64 " len %" PRIu64 " %d",
  1755. current_index, chunk, sge.addr, length, rdma->nb_sent);
  1756. return ret;
  1757. }
  1758. }
  1759. if (!rdma->pin_all || !block->is_ram_block) {
  1760. if (!block->remote_keys[chunk]) {
  1761. /*
  1762. * This chunk has not yet been registered, so first check to see
* if the entire chunk is zero. If so, tell the other side to
  1764. * memset() + madvise() the entire chunk without RDMA.
  1765. */
  1766. if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
  1767. RDMACompress comp = {
  1768. .offset = current_addr,
  1769. .value = 0,
  1770. .block_idx = current_index,
  1771. .length = length,
  1772. };
  1773. head.len = sizeof(comp);
  1774. head.type = RDMA_CONTROL_COMPRESS;
  1775. trace_qemu_rdma_write_one_zero(chunk, sge.length,
  1776. current_index, current_addr);
  1777. compress_to_network(rdma, &comp);
  1778. ret = qemu_rdma_exchange_send(rdma, &head,
  1779. (uint8_t *) &comp, NULL, NULL, NULL);
  1780. if (ret < 0) {
  1781. return -EIO;
  1782. }
  1783. acct_update_position(f, sge.length, true);
  1784. return 1;
  1785. }
  1786. /*
  1787. * Otherwise, tell other side to register.
  1788. */
  1789. reg.current_index = current_index;
  1790. if (block->is_ram_block) {
  1791. reg.key.current_addr = current_addr;
  1792. } else {
  1793. reg.key.chunk = chunk;
  1794. }
  1795. reg.chunks = chunks;
  1796. trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
  1797. current_addr);
  1798. register_to_network(rdma, &reg);
  1799. ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
  1800. &resp, &reg_result_idx, NULL);
  1801. if (ret < 0) {
  1802. return ret;
  1803. }
  1804. /* try to overlap this single registration with the one we sent. */
  1805. if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
  1806. &sge.lkey, NULL, chunk,
  1807. chunk_start, chunk_end)) {
  1808. error_report("cannot get lkey");
  1809. return -EINVAL;
  1810. }
  1811. reg_result = (RDMARegisterResult *)
  1812. rdma->wr_data[reg_result_idx].control_curr;
  1813. network_to_result(reg_result);
  1814. trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
  1815. reg_result->rkey, chunk);
  1816. block->remote_keys[chunk] = reg_result->rkey;
  1817. block->remote_host_addr = reg_result->host_addr;
  1818. } else {
  1819. /* already registered before */
  1820. if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
  1821. &sge.lkey, NULL, chunk,
  1822. chunk_start, chunk_end)) {
  1823. error_report("cannot get lkey!");
  1824. return -EINVAL;
  1825. }
  1826. }
  1827. send_wr.wr.rdma.rkey = block->remote_keys[chunk];
  1828. } else {
  1829. send_wr.wr.rdma.rkey = block->remote_rkey;
  1830. if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
  1831. &sge.lkey, NULL, chunk,
  1832. chunk_start, chunk_end)) {
  1833. error_report("cannot get lkey!");
  1834. return -EINVAL;
  1835. }
  1836. }
  1837. /*
  1838. * Encode the ram block index and chunk within this wrid.
  1839. * We will use this information at the time of completion
  1840. * to figure out which bitmap to check against and then which
  1841. * chunk in the bitmap to look for.
  1842. */
  1843. send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
  1844. current_index, chunk);
  1845. send_wr.opcode = IBV_WR_RDMA_WRITE;
  1846. send_wr.send_flags = IBV_SEND_SIGNALED;
  1847. send_wr.sg_list = &sge;
  1848. send_wr.num_sge = 1;
  1849. send_wr.wr.rdma.remote_addr = block->remote_host_addr +
  1850. (current_addr - block->offset);
  1851. trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
  1852. sge.length);
  1853. /*
  1854. * ibv_post_send() does not return negative error numbers,
  1855. * per the specification they are positive - no idea why.
  1856. */
  1857. ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
  1858. if (ret == ENOMEM) {
  1859. trace_qemu_rdma_write_one_queue_full();
  1860. ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
  1861. if (ret < 0) {
  1862. error_report("rdma migration: failed to make "
  1863. "room in full send queue! %d", ret);
  1864. return ret;
  1865. }
  1866. goto retry;
  1867. } else if (ret > 0) {
  1868. perror("rdma migration: post rdma write failed");
  1869. return -ret;
  1870. }
  1871. set_bit(chunk, block->transit_bitmap);
  1872. acct_update_position(f, sge.length, false);
  1873. rdma->total_writes++;
  1874. return 0;
  1875. }
  1876. /*
  1877. * Push out any unwritten RDMA operations.
  1878. *
  1879. * We support sending out multiple chunks at the same time.
  1880. * Not all of them need to get signaled in the completion queue.
  1881. */
  1882. static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
  1883. {
  1884. int ret;
  1885. if (!rdma->current_length) {
  1886. return 0;
  1887. }
  1888. ret = qemu_rdma_write_one(f, rdma,
  1889. rdma->current_index, rdma->current_addr, rdma->current_length);
  1890. if (ret < 0) {
  1891. return ret;
  1892. }
  1893. if (ret == 0) {
  1894. rdma->nb_sent++;
  1895. trace_qemu_rdma_write_flush(rdma->nb_sent);
  1896. }
  1897. rdma->current_length = 0;
  1898. rdma->current_addr = 0;
  1899. return 0;
  1900. }
  1901. static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
  1902. uint64_t offset, uint64_t len)
  1903. {
  1904. RDMALocalBlock *block;
  1905. uint8_t *host_addr;
  1906. uint8_t *chunk_end;
  1907. if (rdma->current_index < 0) {
  1908. return 0;
  1909. }
  1910. if (rdma->current_chunk < 0) {
  1911. return 0;
  1912. }
  1913. block = &(rdma->local_ram_blocks.block[rdma->current_index]);
  1914. host_addr = block->local_host_addr + (offset - block->offset);
  1915. chunk_end = ram_chunk_end(block, rdma->current_chunk);
  1916. if (rdma->current_length == 0) {
  1917. return 0;
  1918. }
  1919. /*
  1920. * Only merge into chunk sequentially.
  1921. */
  1922. if (offset != (rdma->current_addr + rdma->current_length)) {
  1923. return 0;
  1924. }
  1925. if (offset < block->offset) {
  1926. return 0;
  1927. }
  1928. if ((offset + len) > (block->offset + block->length)) {
  1929. return 0;
  1930. }
  1931. if ((host_addr + len) > chunk_end) {
  1932. return 0;
  1933. }
  1934. return 1;
  1935. }
  1936. /*
  1937. * We're not actually writing here, but doing three things:
  1938. *
  1939. * 1. Identify the chunk the buffer belongs to.
  1940. * 2. If the chunk is full or the buffer doesn't belong to the current
  1941. * chunk, then start a new chunk and flush() the old chunk.
  1942. * 3. To keep the hardware busy, we also group chunks into batches
  1943. * and only require that a batch gets acknowledged in the completion
  1944. * queue instead of each individual chunk.
  1945. */
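/*
 * For example (illustrative only): two adjacent pages that fall into the
 * same chunk simply extend the [current_addr, current_addr +
 * current_length) range tracked here, as decided by
 * qemu_rdma_buffer_mergable(), and nothing hits the wire until the range
 * stops being mergeable or grows to RDMA_MERGE_MAX.
 */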
  1946. static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
  1947. uint64_t block_offset, uint64_t offset,
  1948. uint64_t len)
  1949. {
  1950. uint64_t current_addr = block_offset + offset;
  1951. uint64_t index = rdma->current_index;
  1952. uint64_t chunk = rdma->current_chunk;
  1953. int ret;
  1954. /* If we cannot merge it, we flush the current buffer first. */
  1955. if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
  1956. ret = qemu_rdma_write_flush(f, rdma);
  1957. if (ret) {
  1958. return ret;
  1959. }
  1960. rdma->current_length = 0;
  1961. rdma->current_addr = current_addr;
  1962. ret = qemu_rdma_search_ram_block(rdma, block_offset,
  1963. offset, len, &index, &chunk);
  1964. if (ret) {
  1965. error_report("ram block search failed");
  1966. return ret;
  1967. }
  1968. rdma->current_index = index;
  1969. rdma->current_chunk = chunk;
  1970. }
  1971. /* merge it */
  1972. rdma->current_length += len;
  1973. /* flush it if buffer is too large */
  1974. if (rdma->current_length >= RDMA_MERGE_MAX) {
  1975. return qemu_rdma_write_flush(f, rdma);
  1976. }
  1977. return 0;
  1978. }
  1979. static void qemu_rdma_cleanup(RDMAContext *rdma)
  1980. {
  1981. int idx;
  1982. if (rdma->cm_id && rdma->connected) {
  1983. if ((rdma->error_state ||
  1984. migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
  1985. !rdma->received_error) {
  1986. RDMAControlHeader head = { .len = 0,
  1987. .type = RDMA_CONTROL_ERROR,
  1988. .repeat = 1,
  1989. };
  1990. error_report("Early error. Sending error.");
  1991. qemu_rdma_post_send_control(rdma, NULL, &head);
  1992. }
  1993. rdma_disconnect(rdma->cm_id);
  1994. trace_qemu_rdma_cleanup_disconnect();
  1995. rdma->connected = false;
  1996. }
  1997. if (rdma->channel) {
  1998. qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
  1999. }
  2000. g_free(rdma->dest_blocks);
  2001. rdma->dest_blocks = NULL;
  2002. for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
  2003. if (rdma->wr_data[idx].control_mr) {
  2004. rdma->total_registrations--;
  2005. ibv_dereg_mr(rdma->wr_data[idx].control_mr);
  2006. }
  2007. rdma->wr_data[idx].control_mr = NULL;
  2008. }
  2009. if (rdma->local_ram_blocks.block) {
  2010. while (rdma->local_ram_blocks.nb_blocks) {
  2011. rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
  2012. }
  2013. }
  2014. if (rdma->qp) {
  2015. rdma_destroy_qp(rdma->cm_id);
  2016. rdma->qp = NULL;
  2017. }
  2018. if (rdma->cq) {
  2019. ibv_destroy_cq(rdma->cq);
  2020. rdma->cq = NULL;
  2021. }
  2022. if (rdma->comp_channel) {
  2023. ibv_destroy_comp_channel(rdma->comp_channel);
  2024. rdma->comp_channel = NULL;
  2025. }
  2026. if (rdma->pd) {
  2027. ibv_dealloc_pd(rdma->pd);
  2028. rdma->pd = NULL;
  2029. }
  2030. if (rdma->cm_id) {
  2031. rdma_destroy_id(rdma->cm_id);
  2032. rdma->cm_id = NULL;
  2033. }
/* On the destination side, listen_id and channel are shared */
  2035. if (rdma->listen_id) {
  2036. if (!rdma->is_return_path) {
  2037. rdma_destroy_id(rdma->listen_id);
  2038. }
  2039. rdma->listen_id = NULL;
  2040. if (rdma->channel) {
  2041. if (!rdma->is_return_path) {
  2042. rdma_destroy_event_channel(rdma->channel);
  2043. }
  2044. rdma->channel = NULL;
  2045. }
  2046. }
  2047. if (rdma->channel) {
  2048. rdma_destroy_event_channel(rdma->channel);
  2049. rdma->channel = NULL;
  2050. }
  2051. g_free(rdma->host);
  2052. rdma->host = NULL;
  2053. }
  2054. static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
  2055. {
  2056. int ret, idx;
  2057. Error *local_err = NULL, **temp = &local_err;
  2058. /*
  2059. * Will be validated against destination's actual capabilities
  2060. * after the connect() completes.
  2061. */
  2062. rdma->pin_all = pin_all;
  2063. ret = qemu_rdma_resolve_host(rdma, temp);
  2064. if (ret) {
  2065. goto err_rdma_source_init;
  2066. }
  2067. ret = qemu_rdma_alloc_pd_cq(rdma);
  2068. if (ret) {
  2069. ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
  2070. " limits may be too low. Please check $ ulimit -a # and "
  2071. "search for 'ulimit -l' in the output");
  2072. goto err_rdma_source_init;
  2073. }
  2074. ret = qemu_rdma_alloc_qp(rdma);
  2075. if (ret) {
  2076. ERROR(temp, "rdma migration: error allocating qp!");
  2077. goto err_rdma_source_init;
  2078. }
  2079. ret = qemu_rdma_init_ram_blocks(rdma);
  2080. if (ret) {
  2081. ERROR(temp, "rdma migration: error initializing ram blocks!");
  2082. goto err_rdma_source_init;
  2083. }
  2084. /* Build the hash that maps from offset to RAMBlock */
  2085. rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
  2086. for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
  2087. g_hash_table_insert(rdma->blockmap,
  2088. (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
  2089. &rdma->local_ram_blocks.block[idx]);
  2090. }
  2091. for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
  2092. ret = qemu_rdma_reg_control(rdma, idx);
  2093. if (ret) {
  2094. ERROR(temp, "rdma migration: error registering %d control!",
  2095. idx);
  2096. goto err_rdma_source_init;
  2097. }
  2098. }
  2099. return 0;
  2100. err_rdma_source_init:
  2101. error_propagate(errp, local_err);
  2102. qemu_rdma_cleanup(rdma);
  2103. return -1;
  2104. }
  2105. static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
  2106. {
  2107. RDMACapabilities cap = {
  2108. .version = RDMA_CONTROL_VERSION_CURRENT,
  2109. .flags = 0,
  2110. };
  2111. struct rdma_conn_param conn_param = { .initiator_depth = 2,
  2112. .retry_count = 5,
  2113. .private_data = &cap,
  2114. .private_data_len = sizeof(cap),
  2115. };
  2116. struct rdma_cm_event *cm_event;
  2117. int ret;
  2118. /*
  2119. * Only negotiate the capability with destination if the user
  2120. * on the source first requested the capability.
  2121. */
  2122. if (rdma->pin_all) {
  2123. trace_qemu_rdma_connect_pin_all_requested();
  2124. cap.flags |= RDMA_CAPABILITY_PIN_ALL;
  2125. }
  2126. caps_to_network(&cap);
  2127. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
  2128. if (ret) {
  2129. ERROR(errp, "posting second control recv");
  2130. goto err_rdma_source_connect;
  2131. }
  2132. ret = rdma_connect(rdma->cm_id, &conn_param);
  2133. if (ret) {
  2134. perror("rdma_connect");
  2135. ERROR(errp, "connecting to destination!");
  2136. goto err_rdma_source_connect;
  2137. }
  2138. ret = rdma_get_cm_event(rdma->channel, &cm_event);
if (ret) {
    perror("rdma_get_cm_event after rdma_connect");
    ERROR(errp, "connecting to destination!");
    goto err_rdma_source_connect;
  2144. }
  2145. if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
  2146. perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
  2147. ERROR(errp, "connecting to destination!");
  2148. rdma_ack_cm_event(cm_event);
  2149. goto err_rdma_source_connect;
  2150. }
  2151. rdma->connected = true;
  2152. memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
  2153. network_to_caps(&cap);
  2154. /*
  2155. * Verify that the *requested* capabilities are supported by the destination
  2156. * and disable them otherwise.
  2157. */
  2158. if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
  2159. ERROR(errp, "Server cannot support pinning all memory. "
  2160. "Will register memory dynamically.");
  2161. rdma->pin_all = false;
  2162. }
  2163. trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
  2164. rdma_ack_cm_event(cm_event);
  2165. rdma->control_ready_expected = 1;
  2166. rdma->nb_sent = 0;
  2167. return 0;
  2168. err_rdma_source_connect:
  2169. qemu_rdma_cleanup(rdma);
  2170. return -1;
  2171. }
  2172. static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
  2173. {
  2174. int ret, idx;
  2175. struct rdma_cm_id *listen_id;
  2176. char ip[40] = "unknown";
  2177. struct rdma_addrinfo *res, *e;
  2178. char port_str[16];
  2179. for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
  2180. rdma->wr_data[idx].control_len = 0;
  2181. rdma->wr_data[idx].control_curr = NULL;
  2182. }
  2183. if (!rdma->host || !rdma->host[0]) {
  2184. ERROR(errp, "RDMA host is not set!");
  2185. rdma->error_state = -EINVAL;
  2186. return -1;
  2187. }
  2188. /* create CM channel */
  2189. rdma->channel = rdma_create_event_channel();
  2190. if (!rdma->channel) {
  2191. ERROR(errp, "could not create rdma event channel");
  2192. rdma->error_state = -EINVAL;
  2193. return -1;
  2194. }
  2195. /* create CM id */
  2196. ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
  2197. if (ret) {
  2198. ERROR(errp, "could not create cm_id!");
  2199. goto err_dest_init_create_listen_id;
  2200. }
  2201. snprintf(port_str, 16, "%d", rdma->port);
  2202. port_str[15] = '\0';
  2203. ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
  2204. if (ret < 0) {
  2205. ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
  2206. goto err_dest_init_bind_addr;
  2207. }
  2208. for (e = res; e != NULL; e = e->ai_next) {
  2209. inet_ntop(e->ai_family,
  2210. &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
  2211. trace_qemu_rdma_dest_init_trying(rdma->host, ip);
  2212. ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
  2213. if (ret) {
  2214. continue;
  2215. }
  2216. if (e->ai_family == AF_INET6) {
  2217. ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, errp);
  2218. if (ret) {
  2219. continue;
  2220. }
  2221. }
  2222. break;
  2223. }
  2224. if (!e) {
  2225. ERROR(errp, "Error: could not rdma_bind_addr!");
  2226. goto err_dest_init_bind_addr;
  2227. }
  2228. rdma->listen_id = listen_id;
  2229. qemu_rdma_dump_gid("dest_init", listen_id);
  2230. return 0;
  2231. err_dest_init_bind_addr:
  2232. rdma_destroy_id(listen_id);
  2233. err_dest_init_create_listen_id:
  2234. rdma_destroy_event_channel(rdma->channel);
  2235. rdma->channel = NULL;
  2236. rdma->error_state = ret;
  2237. return ret;
  2238. }
  2239. static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
  2240. RDMAContext *rdma)
  2241. {
  2242. int idx;
  2243. for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
  2244. rdma_return_path->wr_data[idx].control_len = 0;
  2245. rdma_return_path->wr_data[idx].control_curr = NULL;
  2246. }
/* the CM channel and the listen id are shared */
  2248. rdma_return_path->channel = rdma->channel;
  2249. rdma_return_path->listen_id = rdma->listen_id;
  2250. rdma->return_path = rdma_return_path;
  2251. rdma_return_path->return_path = rdma;
  2252. rdma_return_path->is_return_path = true;
  2253. }
  2254. static void *qemu_rdma_data_init(const char *host_port, Error **errp)
  2255. {
  2256. RDMAContext *rdma = NULL;
  2257. InetSocketAddress *addr;
  2258. if (host_port) {
  2259. rdma = g_new0(RDMAContext, 1);
  2260. rdma->current_index = -1;
  2261. rdma->current_chunk = -1;
  2262. addr = g_new(InetSocketAddress, 1);
  2263. if (!inet_parse(addr, host_port, NULL)) {
  2264. rdma->port = atoi(addr->port);
  2265. rdma->host = g_strdup(addr->host);
  2266. } else {
  2267. ERROR(errp, "bad RDMA migration address '%s'", host_port);
  2268. g_free(rdma);
  2269. rdma = NULL;
  2270. }
  2271. qapi_free_InetSocketAddress(addr);
  2272. }
  2273. return rdma;
  2274. }
  2275. /*
  2276. * QEMUFile interface to the control channel.
  2277. * SEND messages for control only.
  2278. * VM's ram is handled with regular RDMA messages.
  2279. */
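/*
 * Sketch of the loop below: each iovec is chopped into pieces of at most
 * RDMA_SEND_INCREMENT bytes, and every piece is sent as its own
 * RDMA_CONTROL_QEMU_FILE control message via qemu_rdma_exchange_send().
 */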
  2280. static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
  2281. const struct iovec *iov,
  2282. size_t niov,
  2283. int *fds,
  2284. size_t nfds,
  2285. Error **errp)
  2286. {
  2287. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2288. QEMUFile *f = rioc->file;
  2289. RDMAContext *rdma;
  2290. int ret;
  2291. ssize_t done = 0;
  2292. size_t i;
  2293. size_t len = 0;
  2294. RCU_READ_LOCK_GUARD();
  2295. rdma = qatomic_rcu_read(&rioc->rdmaout);
  2296. if (!rdma) {
  2297. return -EIO;
  2298. }
  2299. CHECK_ERROR_STATE();
  2300. /*
  2301. * Push out any writes that
  2302. * we're queued up for VM's ram.
  2303. */
  2304. ret = qemu_rdma_write_flush(f, rdma);
  2305. if (ret < 0) {
  2306. rdma->error_state = ret;
  2307. return ret;
  2308. }
  2309. for (i = 0; i < niov; i++) {
  2310. size_t remaining = iov[i].iov_len;
  2311. uint8_t * data = (void *)iov[i].iov_base;
  2312. while (remaining) {
  2313. RDMAControlHeader head;
  2314. len = MIN(remaining, RDMA_SEND_INCREMENT);
  2315. remaining -= len;
  2316. head.len = len;
  2317. head.type = RDMA_CONTROL_QEMU_FILE;
  2318. ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
  2319. if (ret < 0) {
  2320. rdma->error_state = ret;
  2321. return ret;
  2322. }
  2323. data += len;
  2324. done += len;
  2325. }
  2326. }
  2327. return done;
  2328. }
  2329. static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
  2330. size_t size, int idx)
  2331. {
  2332. size_t len = 0;
  2333. if (rdma->wr_data[idx].control_len) {
  2334. trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
  2335. len = MIN(size, rdma->wr_data[idx].control_len);
  2336. memcpy(buf, rdma->wr_data[idx].control_curr, len);
  2337. rdma->wr_data[idx].control_curr += len;
  2338. rdma->wr_data[idx].control_len -= len;
  2339. }
  2340. return len;
  2341. }
  2342. /*
  2343. * QEMUFile interface to the control channel.
  2344. * RDMA links don't use bytestreams, so we have to
  2345. * return bytes to QEMUFile opportunistically.
  2346. */
  2347. static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
  2348. const struct iovec *iov,
  2349. size_t niov,
  2350. int **fds,
  2351. size_t *nfds,
  2352. Error **errp)
  2353. {
  2354. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2355. RDMAContext *rdma;
  2356. RDMAControlHeader head;
  2357. int ret = 0;
  2358. ssize_t i;
  2359. size_t done = 0;
  2360. RCU_READ_LOCK_GUARD();
  2361. rdma = qatomic_rcu_read(&rioc->rdmain);
  2362. if (!rdma) {
  2363. return -EIO;
  2364. }
  2365. CHECK_ERROR_STATE();
  2366. for (i = 0; i < niov; i++) {
  2367. size_t want = iov[i].iov_len;
  2368. uint8_t *data = (void *)iov[i].iov_base;
  2369. /*
  2370. * First, we hold on to the last SEND message we
  2371. * were given and dish out the bytes until we run
  2372. * out of bytes.
  2373. */
  2374. ret = qemu_rdma_fill(rdma, data, want, 0);
  2375. done += ret;
  2376. want -= ret;
  2377. /* Got what we needed, so go to next iovec */
  2378. if (want == 0) {
  2379. continue;
  2380. }
  2381. /* If we got any data so far, then don't wait
  2382. * for more, just return what we have */
  2383. if (done > 0) {
  2384. break;
  2385. }
  2386. /* We've got nothing at all, so lets wait for
  2387. * more to arrive
  2388. */
  2389. ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
  2390. if (ret < 0) {
  2391. rdma->error_state = ret;
  2392. return ret;
  2393. }
  2394. /*
  2395. * SEND was received with new bytes, now try again.
  2396. */
  2397. ret = qemu_rdma_fill(rdma, data, want, 0);
  2398. done += ret;
  2399. want -= ret;
  2400. /* Still didn't get enough, so lets just return */
  2401. if (want) {
  2402. if (done == 0) {
  2403. return QIO_CHANNEL_ERR_BLOCK;
  2404. } else {
  2405. break;
  2406. }
  2407. }
  2408. }
  2409. return done;
  2410. }
  2411. /*
  2412. * Block until all the outstanding chunks have been delivered by the hardware.
  2413. */
  2414. static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
  2415. {
  2416. int ret;
  2417. if (qemu_rdma_write_flush(f, rdma) < 0) {
  2418. return -EIO;
  2419. }
  2420. while (rdma->nb_sent) {
  2421. ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
  2422. if (ret < 0) {
  2423. error_report("rdma migration: complete polling error!");
  2424. return -EIO;
  2425. }
  2426. }
  2427. qemu_rdma_unregister_waiting(rdma);
  2428. return 0;
  2429. }
  2430. static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
  2431. bool blocking,
  2432. Error **errp)
  2433. {
  2434. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2435. /* XXX we should make readv/writev actually honour this :-) */
  2436. rioc->blocking = blocking;
  2437. return 0;
  2438. }
  2439. typedef struct QIOChannelRDMASource QIOChannelRDMASource;
  2440. struct QIOChannelRDMASource {
  2441. GSource parent;
  2442. QIOChannelRDMA *rioc;
  2443. GIOCondition condition;
  2444. };
  2445. static gboolean
  2446. qio_channel_rdma_source_prepare(GSource *source,
  2447. gint *timeout)
  2448. {
  2449. QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
  2450. RDMAContext *rdma;
  2451. GIOCondition cond = 0;
  2452. *timeout = -1;
  2453. RCU_READ_LOCK_GUARD();
  2454. if (rsource->condition == G_IO_IN) {
  2455. rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
  2456. } else {
  2457. rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
  2458. }
  2459. if (!rdma) {
  2460. error_report("RDMAContext is NULL when prepare Gsource");
  2461. return FALSE;
  2462. }
  2463. if (rdma->wr_data[0].control_len) {
  2464. cond |= G_IO_IN;
  2465. }
  2466. cond |= G_IO_OUT;
  2467. return cond & rsource->condition;
  2468. }
  2469. static gboolean
  2470. qio_channel_rdma_source_check(GSource *source)
  2471. {
  2472. QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
  2473. RDMAContext *rdma;
  2474. GIOCondition cond = 0;
  2475. RCU_READ_LOCK_GUARD();
  2476. if (rsource->condition == G_IO_IN) {
  2477. rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
  2478. } else {
  2479. rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
  2480. }
  2481. if (!rdma) {
  2482. error_report("RDMAContext is NULL when check Gsource");
  2483. return FALSE;
  2484. }
  2485. if (rdma->wr_data[0].control_len) {
  2486. cond |= G_IO_IN;
  2487. }
  2488. cond |= G_IO_OUT;
  2489. return cond & rsource->condition;
  2490. }
  2491. static gboolean
  2492. qio_channel_rdma_source_dispatch(GSource *source,
  2493. GSourceFunc callback,
  2494. gpointer user_data)
  2495. {
  2496. QIOChannelFunc func = (QIOChannelFunc)callback;
  2497. QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
  2498. RDMAContext *rdma;
  2499. GIOCondition cond = 0;
  2500. RCU_READ_LOCK_GUARD();
  2501. if (rsource->condition == G_IO_IN) {
  2502. rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
  2503. } else {
  2504. rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
  2505. }
  2506. if (!rdma) {
  2507. error_report("RDMAContext is NULL when dispatch Gsource");
  2508. return FALSE;
  2509. }
  2510. if (rdma->wr_data[0].control_len) {
  2511. cond |= G_IO_IN;
  2512. }
  2513. cond |= G_IO_OUT;
  2514. return (*func)(QIO_CHANNEL(rsource->rioc),
  2515. (cond & rsource->condition),
  2516. user_data);
  2517. }
  2518. static void
  2519. qio_channel_rdma_source_finalize(GSource *source)
  2520. {
  2521. QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
  2522. object_unref(OBJECT(ssource->rioc));
  2523. }
  2524. GSourceFuncs qio_channel_rdma_source_funcs = {
  2525. qio_channel_rdma_source_prepare,
  2526. qio_channel_rdma_source_check,
  2527. qio_channel_rdma_source_dispatch,
  2528. qio_channel_rdma_source_finalize
  2529. };
  2530. static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
  2531. GIOCondition condition)
  2532. {
  2533. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2534. QIOChannelRDMASource *ssource;
  2535. GSource *source;
  2536. source = g_source_new(&qio_channel_rdma_source_funcs,
  2537. sizeof(QIOChannelRDMASource));
  2538. ssource = (QIOChannelRDMASource *)source;
  2539. ssource->rioc = rioc;
  2540. object_ref(OBJECT(rioc));
  2541. ssource->condition = condition;
  2542. return source;
  2543. }
  2544. static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
  2545. AioContext *ctx,
  2546. IOHandler *io_read,
  2547. IOHandler *io_write,
  2548. void *opaque)
  2549. {
  2550. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2551. if (io_read) {
  2552. aio_set_fd_handler(ctx, rioc->rdmain->comp_channel->fd,
  2553. false, io_read, io_write, NULL, opaque);
  2554. } else {
  2555. aio_set_fd_handler(ctx, rioc->rdmaout->comp_channel->fd,
  2556. false, io_read, io_write, NULL, opaque);
  2557. }
  2558. }
  2559. struct rdma_close_rcu {
  2560. struct rcu_head rcu;
  2561. RDMAContext *rdmain;
  2562. RDMAContext *rdmaout;
  2563. };
  2564. /* callback from qio_channel_rdma_close via call_rcu */
  2565. static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
  2566. {
  2567. if (rcu->rdmain) {
  2568. qemu_rdma_cleanup(rcu->rdmain);
  2569. }
  2570. if (rcu->rdmaout) {
  2571. qemu_rdma_cleanup(rcu->rdmaout);
  2572. }
  2573. g_free(rcu->rdmain);
  2574. g_free(rcu->rdmaout);
  2575. g_free(rcu);
  2576. }
  2577. static int qio_channel_rdma_close(QIOChannel *ioc,
  2578. Error **errp)
  2579. {
  2580. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2581. RDMAContext *rdmain, *rdmaout;
  2582. struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
  2583. trace_qemu_rdma_close();
  2584. rdmain = rioc->rdmain;
  2585. if (rdmain) {
  2586. qatomic_rcu_set(&rioc->rdmain, NULL);
  2587. }
  2588. rdmaout = rioc->rdmaout;
  2589. if (rdmaout) {
  2590. qatomic_rcu_set(&rioc->rdmaout, NULL);
  2591. }
  2592. rcu->rdmain = rdmain;
  2593. rcu->rdmaout = rdmaout;
  2594. call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
  2595. return 0;
  2596. }
  2597. static int
  2598. qio_channel_rdma_shutdown(QIOChannel *ioc,
  2599. QIOChannelShutdown how,
  2600. Error **errp)
  2601. {
  2602. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2603. RDMAContext *rdmain, *rdmaout;
  2604. RCU_READ_LOCK_GUARD();
  2605. rdmain = qatomic_rcu_read(&rioc->rdmain);
rdmaout = qatomic_rcu_read(&rioc->rdmaout);
  2607. switch (how) {
  2608. case QIO_CHANNEL_SHUTDOWN_READ:
  2609. if (rdmain) {
  2610. rdmain->error_state = -1;
  2611. }
  2612. break;
  2613. case QIO_CHANNEL_SHUTDOWN_WRITE:
  2614. if (rdmaout) {
  2615. rdmaout->error_state = -1;
  2616. }
  2617. break;
  2618. case QIO_CHANNEL_SHUTDOWN_BOTH:
  2619. default:
  2620. if (rdmain) {
  2621. rdmain->error_state = -1;
  2622. }
  2623. if (rdmaout) {
  2624. rdmaout->error_state = -1;
  2625. }
  2626. break;
  2627. }
  2628. return 0;
  2629. }
  2630. /*
  2631. * Parameters:
  2632. * @offset == 0 :
  2633. * This means that 'block_offset' is a full virtual address that does not
  2634. * belong to a RAMBlock of the virtual machine and instead
  2635. * represents a private malloc'd memory area that the caller wishes to
  2636. * transfer.
  2637. *
  2638. * @offset != 0 :
  2639. * Offset is an offset to be added to block_offset and used
  2640. * to also lookup the corresponding RAMBlock.
  2641. *
  2642. * @size > 0 :
* Initiate a transfer of this size.
  2644. *
  2645. * @size == 0 :
  2646. * A 'hint' or 'advice' that means that we wish to speculatively
  2647. * and asynchronously unregister this memory. In this case, there is no
  2648. * guarantee that the unregister will actually happen, for example,
  2649. * if the memory is being actively transmitted. Additionally, the memory
  2650. * may be re-registered at any future time if a write within the same
  2651. * chunk was requested again, even if you attempted to unregister it
  2652. * here.
  2653. *
  2654. * @size < 0 : TODO, not yet supported
  2655. * Unregister the memory NOW. This means that the caller does not
  2656. * expect there to be any future RDMA transfers and we just want to clean
  2657. * things up. This is used in case the upper layer owns the memory and
  2658. * cannot wait for qemu_fclose() to occur.
  2659. *
* @bytes_sent : User-specified pointer to indicate how many bytes were
  2661. * sent. Usually, this will not be more than a few bytes of
  2662. * the protocol because most transfers are sent asynchronously.
  2663. */
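/*
 * Note on the code below: in postcopy we report RAM_SAVE_CONTROL_NOT_SUPP,
 * and on the normal path we report RAM_SAVE_CONTROL_DELAYED with
 * *bytes_sent set to 1, since the actual transfer happens asynchronously
 * and is accounted for later via acct_update_position().
 */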
  2664. static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
  2665. ram_addr_t block_offset, ram_addr_t offset,
  2666. size_t size, uint64_t *bytes_sent)
  2667. {
  2668. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
  2669. RDMAContext *rdma;
  2670. int ret;
  2671. RCU_READ_LOCK_GUARD();
  2672. rdma = qatomic_rcu_read(&rioc->rdmaout);
  2673. if (!rdma) {
  2674. return -EIO;
  2675. }
  2676. CHECK_ERROR_STATE();
  2677. if (migration_in_postcopy()) {
  2678. return RAM_SAVE_CONTROL_NOT_SUPP;
  2679. }
  2680. qemu_fflush(f);
  2681. if (size > 0) {
  2682. /*
  2683. * Add this page to the current 'chunk'. If the chunk
  2684. * is full, or the page doesn't belong to the current chunk,
  2685. * an actual RDMA write will occur and a new chunk will be formed.
  2686. */
  2687. ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
  2688. if (ret < 0) {
  2689. error_report("rdma migration: write error! %d", ret);
  2690. goto err;
  2691. }
  2692. /*
* We always return 1 byte because the RDMA
  2694. * protocol is completely asynchronous. We do not yet know
  2695. * whether an identified chunk is zero or not because we're
  2696. * waiting for other pages to potentially be merged with
  2697. * the current chunk. So, we have to call qemu_update_position()
  2698. * later on when the actual write occurs.
  2699. */
  2700. if (bytes_sent) {
  2701. *bytes_sent = 1;
  2702. }
  2703. } else {
  2704. uint64_t index, chunk;
  2705. /* TODO: Change QEMUFileOps prototype to be signed: size_t => long
  2706. if (size < 0) {
  2707. ret = qemu_rdma_drain_cq(f, rdma);
  2708. if (ret < 0) {
  2709. fprintf(stderr, "rdma: failed to synchronously drain"
  2710. " completion queue before unregistration.\n");
  2711. goto err;
  2712. }
  2713. }
  2714. */
  2715. ret = qemu_rdma_search_ram_block(rdma, block_offset,
  2716. offset, size, &index, &chunk);
  2717. if (ret) {
  2718. error_report("ram block search failed");
  2719. goto err;
  2720. }
  2721. qemu_rdma_signal_unregister(rdma, index, chunk, 0);
  2722. /*
  2723. * TODO: Synchronous, guaranteed unregistration (should not occur during
  2724. * fast-path). Otherwise, unregisters will process on the next call to
  2725. * qemu_rdma_drain_cq()
  2726. if (size < 0) {
  2727. qemu_rdma_unregister_waiting(rdma);
  2728. }
  2729. */
  2730. }
  2731. /*
  2732. * Drain the Completion Queue if possible, but do not block,
  2733. * just poll.
  2734. *
  2735. * If nothing to poll, the end of the iteration will do this
  2736. * again to make sure we don't overflow the request queue.
  2737. */
  2738. while (1) {
  2739. uint64_t wr_id, wr_id_in;
  2740. int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
  2741. if (ret < 0) {
  2742. error_report("rdma migration: polling error! %d", ret);
  2743. goto err;
  2744. }
  2745. wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
  2746. if (wr_id == RDMA_WRID_NONE) {
  2747. break;
  2748. }
  2749. }
  2750. return RAM_SAVE_CONTROL_DELAYED;
  2751. err:
  2752. rdma->error_state = ret;
  2753. return ret;
  2754. }
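
/*
 * Illustrative caller-side sketch (not part of this file; the exact wrapper
 * and variable names are assumptions): the generic migration code is
 * expected to drive the .save_page hook through its QEMUFileHooks table
 * roughly like this, treating RAM_SAVE_CONTROL_DELAYED as "bytes accounted
 * for later, once the asynchronous write completes" and
 * RAM_SAVE_CONTROL_NOT_SUPP as "fall back to the normal stream path":
 *
 *     ret = f->hooks->save_page(f, f->opaque, block_offset, offset,
 *                               size, &bytes_sent);
 *     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
 *         // write the page into the QEMUFile stream instead
 *     } else if (ret == RAM_SAVE_CONTROL_DELAYED) {
 *         // completion is observed later via the completion queue
 *     }
 */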
static void rdma_accept_incoming_migration(void *opaque);

static void rdma_cm_poll_handler(void *opaque)
{
    RDMAContext *rdma = opaque;
    int ret;
    struct rdma_cm_event *cm_event;
    MigrationIncomingState *mis = migration_incoming_get_current();

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        error_report("get_cm_event failed %d", errno);
        return;
    }
    rdma_ack_cm_event(cm_event);

    if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
        cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
        if (!rdma->error_state &&
            migration_incoming_get_current()->state !=
              MIGRATION_STATUS_COMPLETED) {
            error_report("receive cm event, cm event is %d", cm_event->event);
            rdma->error_state = -EPIPE;
            if (rdma->return_path) {
                rdma->return_path->error_state = -EPIPE;
            }
        }

        if (mis->migration_incoming_co) {
            qemu_coroutine_enter(mis->migration_incoming_co);
        }
        return;
    }
}
static int qemu_rdma_accept(RDMAContext *rdma)
{
    RDMACapabilities cap;
    struct rdma_conn_param conn_param = {
        .responder_resources = 2,
        .private_data = &cap,
        .private_data_len = sizeof(cap),
    };
    struct rdma_cm_event *cm_event;
    struct ibv_context *verbs;
    int ret = -EINVAL;
    int idx;

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));

    network_to_caps(&cap);

    if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
        error_report("Unknown source RDMA version: %d, bailing...",
                     cap.version);
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    /*
     * Respond with only the capabilities this version of QEMU knows about.
     */
    cap.flags &= known_capabilities;

    /*
     * Enable the ones that we do know about.
     * Add other checks here as new ones are introduced.
     */
    if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
        rdma->pin_all = true;
    }

    rdma->cm_id = cm_event->id;
    verbs = cm_event->id->verbs;

    rdma_ack_cm_event(cm_event);

    trace_qemu_rdma_accept_pin_state(rdma->pin_all);

    caps_to_network(&cap);

    trace_qemu_rdma_accept_pin_verbsc(verbs);

    if (!rdma->verbs) {
        rdma->verbs = verbs;
    } else if (rdma->verbs != verbs) {
        error_report("ibv context not matching %p, %p!", rdma->verbs,
                     verbs);
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_id("dest_init", verbs);

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        error_report("rdma migration: error allocating pd and cq!");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        error_report("rdma migration: error allocating qp!");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        error_report("rdma migration: error initializing ram blocks!");
        goto err_rdma_dest_wait;
    }

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            error_report("rdma: error registering %d control", idx);
            goto err_rdma_dest_wait;
        }
    }

    /* Accept the second connection request for return path */
    if (migrate_postcopy() && !rdma->is_return_path) {
        qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
                            NULL,
                            (void *)(intptr_t)rdma->return_path);
    } else {
        qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
                            NULL, rdma);
    }

    ret = rdma_accept(rdma->cm_id, &conn_param);
    if (ret) {
        error_report("rdma_accept returns %d", ret);
        goto err_rdma_dest_wait;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        error_report("rdma_accept get_cm_event failed %d", ret);
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        error_report("rdma_accept not event established");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    rdma_ack_cm_event(cm_event);
    rdma->connected = true;

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        error_report("rdma migration: error posting second control recv");
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_gid("dest_connect", rdma->cm_id);

    return 0;

err_rdma_dest_wait:
    rdma->error_state = ret;
    qemu_rdma_cleanup(rdma);
    return ret;
}
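
/*
 * Summary of the accept path above: wait for CONNECT_REQUEST, validate the
 * negotiated RDMACapabilities version and flags, allocate the protection
 * domain, completion queue and queue pair, register the control buffers,
 * then rdma_accept() and wait for ESTABLISHED before posting the first
 * control receive. Every failure funnels through err_rdma_dest_wait, which
 * records the error and tears the connection down.
 */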
static int dest_ram_sort_func(const void *a, const void *b)
{
    unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
    unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;

    return (a_index < b_index) ? -1 : (a_index != b_index);
}
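
/*
 * Example of the comparator above: if the local blocks carry src_index
 * values {2, 0, 1}, the qsort() call in the RAM_BLOCKS_REQUEST handler
 * below reorders the array into src_index order {0, 1, 2}, so both sides
 * index blocks identically from then on.
 */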
/*
 * During each iteration of the migration, we listen for instructions
 * from the source VM to perform dynamic page registrations before they
 * can perform RDMA operations.
 *
 * We respond with the 'rkey'.
 *
 * Keep doing this until the source tells us to stop.
 */
static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
{
    RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
                                   .type = RDMA_CONTROL_REGISTER_RESULT,
                                   .repeat = 0,
                                 };
    RDMAControlHeader unreg_resp = { .len = 0,
                                     .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                     .repeat = 0,
                                   };
    RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
                                 .repeat = 1 };
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
    RDMAContext *rdma;
    RDMALocalBlocks *local;
    RDMAControlHeader head;
    RDMARegister *reg, *registers;
    RDMACompress *comp;
    RDMARegisterResult *reg_result;
    static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
    RDMALocalBlock *block;
    void *host_addr;
    int ret = 0;
    int idx = 0;
    int count = 0;
    int i = 0;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmain);

    if (!rdma) {
        return -EIO;
    }

    CHECK_ERROR_STATE();

    local = &rdma->local_ram_blocks;
    do {
        trace_qemu_rdma_registration_handle_wait();

        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);

        if (ret < 0) {
            break;
        }

        if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
            error_report("rdma: Too many requests in this message (%d)."
                         " Bailing.", head.repeat);
            ret = -EIO;
            break;
        }

        switch (head.type) {
        case RDMA_CONTROL_COMPRESS:
            comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
            network_to_compress(comp);

            trace_qemu_rdma_registration_handle_compress(comp->length,
                                                         comp->block_idx,
                                                         comp->offset);
            if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
                error_report("rdma: 'compress' bad block index %u (vs %d)",
                             (unsigned int)comp->block_idx,
                             rdma->local_ram_blocks.nb_blocks);
                ret = -EIO;
                goto out;
            }
            block = &(rdma->local_ram_blocks.block[comp->block_idx]);

            host_addr = block->local_host_addr +
                            (comp->offset - block->offset);

            ram_handle_compressed(host_addr, comp->value, comp->length);
            break;

        case RDMA_CONTROL_REGISTER_FINISHED:
            trace_qemu_rdma_registration_handle_finished();
            goto out;

        case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
            trace_qemu_rdma_registration_handle_ram_blocks();

            /* Sort our local RAM Block list so it's the same as the source,
             * we can do this since we've filled in a src_index in the list
             * as we received the RAMBlock list earlier.
             */
            qsort(rdma->local_ram_blocks.block,
                  rdma->local_ram_blocks.nb_blocks,
                  sizeof(RDMALocalBlock), dest_ram_sort_func);
            for (i = 0; i < local->nb_blocks; i++) {
                local->block[i].index = i;
            }

            if (rdma->pin_all) {
                ret = qemu_rdma_reg_whole_ram_blocks(rdma);
                if (ret) {
                    error_report("rdma migration: error dest "
                                 "registering ram blocks");
                    goto out;
                }
            }

            /*
             * Dest uses this to prepare to transmit the RAMBlock descriptions
             * to the source VM after connection setup.
             * Both sides use the "remote" structure to communicate and update
             * their "local" descriptions with what was sent.
             */
            for (i = 0; i < local->nb_blocks; i++) {
                rdma->dest_blocks[i].remote_host_addr =
                    (uintptr_t)(local->block[i].local_host_addr);

                if (rdma->pin_all) {
                    rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
                }

                rdma->dest_blocks[i].offset = local->block[i].offset;
                rdma->dest_blocks[i].length = local->block[i].length;

                dest_block_to_network(&rdma->dest_blocks[i]);
                trace_qemu_rdma_registration_handle_ram_blocks_loop(
                    local->block[i].block_name,
                    local->block[i].offset,
                    local->block[i].length,
                    local->block[i].local_host_addr,
                    local->block[i].src_index);
            }

            blocks.len = rdma->local_ram_blocks.nb_blocks
                                                * sizeof(RDMADestBlock);

            ret = qemu_rdma_post_send_control(rdma,
                                    (uint8_t *) rdma->dest_blocks, &blocks);

            if (ret < 0) {
                error_report("rdma migration: error sending remote info");
                goto out;
            }

            break;
        case RDMA_CONTROL_REGISTER_REQUEST:
            trace_qemu_rdma_registration_handle_register(head.repeat);

            reg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                uint64_t chunk;
                uint8_t *chunk_start, *chunk_end;

                reg = &registers[count];
                network_to_register(reg);

                reg_result = &results[count];

                trace_qemu_rdma_registration_handle_register_loop(count,
                         reg->current_index, reg->key.current_addr, reg->chunks);

                if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
                    error_report("rdma: 'register' bad block index %u (vs %d)",
                                 (unsigned int)reg->current_index,
                                 rdma->local_ram_blocks.nb_blocks);
                    ret = -ENOENT;
                    goto out;
                }
                block = &(rdma->local_ram_blocks.block[reg->current_index]);
                if (block->is_ram_block) {
                    if (block->offset > reg->key.current_addr) {
                        error_report("rdma: bad register address for block %s"
                            " offset: %" PRIx64 " current_addr: %" PRIx64,
                            block->block_name, block->offset,
                            reg->key.current_addr);
                        ret = -ERANGE;
                        goto out;
                    }
                    host_addr = (block->local_host_addr +
                                (reg->key.current_addr - block->offset));
                    chunk = ram_chunk_index(block->local_host_addr,
                                            (uint8_t *) host_addr);
                } else {
                    chunk = reg->key.chunk;
                    host_addr = block->local_host_addr +
                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
                    /* Check for particularly bad chunk value */
                    if (host_addr < (void *)block->local_host_addr) {
                        error_report("rdma: bad chunk for block %s"
                                     " chunk: %" PRIx64,
                                     block->block_name, reg->key.chunk);
                        ret = -ERANGE;
                        goto out;
                    }
                }
                chunk_start = ram_chunk_start(block, chunk);
                chunk_end = ram_chunk_end(block, chunk + reg->chunks);
                /* avoid "-Waddress-of-packed-member" warning */
                uint32_t tmp_rkey = 0;
                if (qemu_rdma_register_and_get_keys(rdma, block,
                            (uintptr_t)host_addr, NULL, &tmp_rkey,
                            chunk, chunk_start, chunk_end)) {
                    error_report("cannot get rkey");
                    ret = -EINVAL;
                    goto out;
                }
                reg_result->rkey = tmp_rkey;

                reg_result->host_addr = (uintptr_t)block->local_host_addr;

                trace_qemu_rdma_registration_handle_register_rkey(
                                                           reg_result->rkey);

                result_to_network(reg_result);
            }

            ret = qemu_rdma_post_send_control(rdma,
                            (uint8_t *) results, &reg_resp);

            if (ret < 0) {
                error_report("Failed to send control buffer");
                goto out;
            }
            break;
        case RDMA_CONTROL_UNREGISTER_REQUEST:
            trace_qemu_rdma_registration_handle_unregister(head.repeat);
            unreg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                reg = &registers[count];
                network_to_register(reg);

                trace_qemu_rdma_registration_handle_unregister_loop(count,
                           reg->current_index, reg->key.chunk);

                block = &(rdma->local_ram_blocks.block[reg->current_index]);

                ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
                block->pmr[reg->key.chunk] = NULL;

                if (ret != 0) {
                    perror("rdma unregistration chunk failed");
                    ret = -ret;
                    goto out;
                }

                rdma->total_registrations--;

                trace_qemu_rdma_registration_handle_unregister_success(
                                                       reg->key.chunk);
            }

            ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);

            if (ret < 0) {
                error_report("Failed to send control buffer");
                goto out;
            }
            break;
        case RDMA_CONTROL_REGISTER_RESULT:
            error_report("Invalid RESULT message at dest.");
            ret = -EIO;
            goto out;
        default:
            error_report("Unknown control message %s", control_desc(head.type));
            ret = -EIO;
            goto out;
        }
    } while (1);
out:
    if (ret < 0) {
        rdma->error_state = ret;
    }
    return ret;
}
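
/*
 * The handler above runs once per RAM iteration on the destination: it
 * loops on incoming control messages (COMPRESS, RAM_BLOCKS_REQUEST,
 * REGISTER_REQUEST, UNREGISTER_REQUEST) and only returns when the source
 * sends REGISTER_FINISHED, which qemu_rdma_registration_stop() below emits
 * at the end of each iteration.
 */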
/* Destination:
 * Called via a ram_control_load_hook during the initial RAM load section which
 * lists the RAMBlocks by name. This lets us know the order of the RAMBlocks
 * on the source.
 * We've already built our local RAMBlock list, but not yet sent the list to
 * the source.
 */
static int
rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name)
{
    RDMAContext *rdma;
    int curr;
    int found = -1;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmain);

    if (!rdma) {
        return -EIO;
    }

    /* Find the matching RAMBlock in our local list */
    for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
        if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
            found = curr;
            break;
        }
    }

    if (found == -1) {
        error_report("RAMBlock '%s' not found on destination", name);
        return -ENOENT;
    }

    rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
    trace_rdma_block_notification_handle(name, rdma->next_src_index);
    rdma->next_src_index++;

    return 0;
}

static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data)
{
    switch (flags) {
    case RAM_CONTROL_BLOCK_REG:
        return rdma_block_notification_handle(opaque, data);

    case RAM_CONTROL_HOOK:
        return qemu_rdma_registration_handle(f, opaque);

    default:
        /* Shouldn't be called with any other values */
        abort();
    }
}
static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
                                        uint64_t flags, void *data)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
    RDMAContext *rdma;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmaout);
    if (!rdma) {
        return -EIO;
    }

    CHECK_ERROR_STATE();

    if (migration_in_postcopy()) {
        return 0;
    }

    trace_qemu_rdma_registration_start(flags);
    qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
    qemu_fflush(f);

    return 0;
}
/*
 * Inform dest that dynamic registrations are done for now.
 * First, flush writes, if any.
 */
static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
                                       uint64_t flags, void *data)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
    RDMAContext *rdma;
    RDMAControlHeader head = { .len = 0, .repeat = 1 };
    int ret = 0;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmaout);
    if (!rdma) {
        return -EIO;
    }

    CHECK_ERROR_STATE();

    if (migration_in_postcopy()) {
        return 0;
    }

    qemu_fflush(f);
    ret = qemu_rdma_drain_cq(f, rdma);

    if (ret < 0) {
        goto err;
    }

    if (flags == RAM_CONTROL_SETUP) {
        RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
        RDMALocalBlocks *local = &rdma->local_ram_blocks;
        int reg_result_idx, i, nb_dest_blocks;

        head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
        trace_qemu_rdma_registration_stop_ram();

        /*
         * Make sure that we parallelize the pinning on both sides.
         * For very large guests, doing this serially takes a really
         * long time, so we have to 'interleave' the pinning locally
         * with the control messages by performing the pinning on this
         * side before we receive the control response from the other
         * side that the pinning has completed.
         */
        ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
                    &reg_result_idx, rdma->pin_all ?
                    qemu_rdma_reg_whole_ram_blocks : NULL);
        if (ret < 0) {
            fprintf(stderr, "receiving remote info!");
            return ret;
        }

        nb_dest_blocks = resp.len / sizeof(RDMADestBlock);

        /*
         * The protocol uses two different sets of rkeys (mutually exclusive):
         * 1. One key to represent the virtual address of the entire ram block.
         *    (dynamic chunk registration disabled - pin everything with one rkey.)
         * 2. One to represent individual chunks within a ram block.
         *    (dynamic chunk registration enabled - pin individual chunks.)
         *
         * Once the capability is successfully negotiated, the destination transmits
         * the keys to use (or sends them later) including the virtual addresses
         * and then propagates the remote ram block descriptions to its local copy.
         */

        if (local->nb_blocks != nb_dest_blocks) {
            fprintf(stderr, "ram blocks mismatch (Number of blocks %d vs %d) "
                            "Your QEMU command line parameters are probably "
                            "not identical on both the source and destination.",
                            local->nb_blocks, nb_dest_blocks);
            rdma->error_state = -EINVAL;
            return -EINVAL;
        }

        qemu_rdma_move_header(rdma, reg_result_idx, &resp);
        memcpy(rdma->dest_blocks,
            rdma->wr_data[reg_result_idx].control_curr, resp.len);
        for (i = 0; i < nb_dest_blocks; i++) {
            network_to_dest_block(&rdma->dest_blocks[i]);

            /* We require that the blocks are in the same order */
            if (rdma->dest_blocks[i].length != local->block[i].length) {
                fprintf(stderr, "Block %s/%d has a different length %" PRIu64
                        " vs %" PRIu64, local->block[i].block_name, i,
                        local->block[i].length,
                        rdma->dest_blocks[i].length);
                rdma->error_state = -EINVAL;
                return -EINVAL;
            }
            local->block[i].remote_host_addr =
                    rdma->dest_blocks[i].remote_host_addr;
            local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
        }
    }

    trace_qemu_rdma_registration_stop(flags);

    head.type = RDMA_CONTROL_REGISTER_FINISHED;
    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);

    if (ret < 0) {
        goto err;
    }

    return 0;
err:
    rdma->error_state = ret;
    return ret;
}
static const QEMUFileHooks rdma_read_hooks = {
    .hook_ram_load = rdma_load_hook,
};

static const QEMUFileHooks rdma_write_hooks = {
    .before_ram_iterate = qemu_rdma_registration_start,
    .after_ram_iterate = qemu_rdma_registration_stop,
    .save_page = qemu_rdma_save_page,
};
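
/*
 * These hook tables are attached to the QEMUFile in qemu_fopen_rdma()
 * below: the write hooks for the outgoing ("w") channel and the read
 * hooks for the incoming channel.
 */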
static void qio_channel_rdma_finalize(Object *obj)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
    if (rioc->rdmain) {
        qemu_rdma_cleanup(rioc->rdmain);
        g_free(rioc->rdmain);
        rioc->rdmain = NULL;
    }
    if (rioc->rdmaout) {
        qemu_rdma_cleanup(rioc->rdmaout);
        g_free(rioc->rdmaout);
        rioc->rdmaout = NULL;
    }
}

static void qio_channel_rdma_class_init(ObjectClass *klass,
                                        void *class_data G_GNUC_UNUSED)
{
    QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);

    ioc_klass->io_writev = qio_channel_rdma_writev;
    ioc_klass->io_readv = qio_channel_rdma_readv;
    ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
    ioc_klass->io_close = qio_channel_rdma_close;
    ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
    ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
    ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
}

static const TypeInfo qio_channel_rdma_info = {
    .parent = TYPE_QIO_CHANNEL,
    .name = TYPE_QIO_CHANNEL_RDMA,
    .instance_size = sizeof(QIOChannelRDMA),
    .instance_finalize = qio_channel_rdma_finalize,
    .class_init = qio_channel_rdma_class_init,
};

static void qio_channel_rdma_register_types(void)
{
    type_register_static(&qio_channel_rdma_info);
}

type_init(qio_channel_rdma_register_types);
static QEMUFile *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
{
    QIOChannelRDMA *rioc;

    if (qemu_file_mode_is_not_valid(mode)) {
        return NULL;
    }

    rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));

    if (mode[0] == 'w') {
        rioc->file = qemu_fopen_channel_output(QIO_CHANNEL(rioc));
        rioc->rdmaout = rdma;
        rioc->rdmain = rdma->return_path;
        qemu_file_set_hooks(rioc->file, &rdma_write_hooks);
    } else {
        rioc->file = qemu_fopen_channel_input(QIO_CHANNEL(rioc));
        rioc->rdmain = rdma;
        rioc->rdmaout = rdma->return_path;
        qemu_file_set_hooks(rioc->file, &rdma_read_hooks);
    }

    return rioc->file;
}
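
/*
 * Note the deliberate cross-wiring above: on the outgoing side the primary
 * RDMAContext becomes rdmaout and its return path becomes rdmain, while the
 * incoming side does the opposite, so postcopy traffic in each direction
 * always finds a context oriented the right way.
 */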
static void rdma_accept_incoming_migration(void *opaque)
{
    RDMAContext *rdma = opaque;
    int ret;
    QEMUFile *f;
    Error *local_err = NULL;

    trace_qemu_rdma_accept_incoming_migration();
    ret = qemu_rdma_accept(rdma);

    if (ret) {
        fprintf(stderr, "RDMA ERROR: Migration initialization failed\n");
        return;
    }

    trace_qemu_rdma_accept_incoming_migration_accepted();

    if (rdma->is_return_path) {
        return;
    }

    f = qemu_fopen_rdma(rdma, "rb");
    if (f == NULL) {
        fprintf(stderr, "RDMA ERROR: could not qemu_fopen_rdma\n");
        qemu_rdma_cleanup(rdma);
        return;
    }

    rdma->migration_started_on_destination = 1;
    migration_fd_process_incoming(f, &local_err);
    if (local_err) {
        error_reportf_err(local_err, "RDMA ERROR:");
    }
}
void rdma_start_incoming_migration(const char *host_port, Error **errp)
{
    int ret;
    RDMAContext *rdma, *rdma_return_path = NULL;
    Error *local_err = NULL;

    trace_rdma_start_incoming_migration();

    /* Avoid ram_block_discard_disable(), cannot change during migration. */
    if (ram_block_discard_is_required()) {
        error_setg(errp, "RDMA: cannot disable RAM discard");
        return;
    }

    rdma = qemu_rdma_data_init(host_port, &local_err);
    if (rdma == NULL) {
        goto err;
    }

    ret = qemu_rdma_dest_init(rdma, &local_err);

    if (ret) {
        goto err;
    }

    trace_rdma_start_incoming_migration_after_dest_init();

    ret = rdma_listen(rdma->listen_id, 5);

    if (ret) {
        ERROR(errp, "listening on socket!");
        goto err;
    }

    trace_rdma_start_incoming_migration_after_rdma_listen();

    /* initialize the RDMAContext for return path */
    if (migrate_postcopy()) {
        rdma_return_path = qemu_rdma_data_init(host_port, &local_err);

        if (rdma_return_path == NULL) {
            goto err;
        }

        qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
    }

    qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
                        NULL, (void *)(intptr_t)rdma);
    return;
err:
    error_propagate(errp, local_err);
    if (rdma) {
        g_free(rdma->host);
    }
    g_free(rdma);
    g_free(rdma_return_path);
}
void rdma_start_outgoing_migration(void *opaque,
                                   const char *host_port, Error **errp)
{
    MigrationState *s = opaque;
    RDMAContext *rdma_return_path = NULL;
    RDMAContext *rdma;
    int ret = 0;

    /* Avoid ram_block_discard_disable(), cannot change during migration. */
    if (ram_block_discard_is_required()) {
        error_setg(errp, "RDMA: cannot disable RAM discard");
        return;
    }

    rdma = qemu_rdma_data_init(host_port, errp);
    if (rdma == NULL) {
        goto err;
    }

    ret = qemu_rdma_source_init(rdma,
        s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);

    if (ret) {
        goto err;
    }

    trace_rdma_start_outgoing_migration_after_rdma_source_init();
    ret = qemu_rdma_connect(rdma, errp);

    if (ret) {
        goto err;
    }

    /* RDMA postcopy needs a separate queue pair for the return path */
    if (migrate_postcopy()) {
        rdma_return_path = qemu_rdma_data_init(host_port, errp);

        if (rdma_return_path == NULL) {
            goto return_path_err;
        }

        ret = qemu_rdma_source_init(rdma_return_path,
            s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);

        if (ret) {
            goto return_path_err;
        }

        ret = qemu_rdma_connect(rdma_return_path, errp);

        if (ret) {
            goto return_path_err;
        }

        rdma->return_path = rdma_return_path;
        rdma_return_path->return_path = rdma;
        rdma_return_path->is_return_path = true;
    }

    trace_rdma_start_outgoing_migration_after_rdma_connect();

    s->to_dst_file = qemu_fopen_rdma(rdma, "wb");
    migrate_fd_connect(s, NULL);
    return;
return_path_err:
    qemu_rdma_cleanup(rdma);
err:
    g_free(rdma);
    g_free(rdma_return_path);
}
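
/*
 * Usage sketch (the exact user-facing syntax is an assumption, not defined
 * in this file): the two entry points above are reached when migration is
 * given an "rdma:" URI, e.g. starting the destination with
 * "-incoming rdma:host:port" and then issuing "migrate rdma:host:port" in
 * the source monitor.
 */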