rdma.c

  1. /*
  2. * RDMA protocol and interfaces
  3. *
  4. * Copyright IBM, Corp. 2010-2013
  5. * Copyright Red Hat, Inc. 2015-2016
  6. *
  7. * Authors:
  8. * Michael R. Hines <mrhines@us.ibm.com>
  9. * Jiuxing Liu <jl@us.ibm.com>
  10. * Daniel P. Berrange <berrange@redhat.com>
  11. *
  12. * This work is licensed under the terms of the GNU GPL, version 2 or
  13. * later. See the COPYING file in the top-level directory.
  14. *
  15. */
  16. #include "qemu/osdep.h"
  17. #include "qapi/error.h"
  18. #include "qemu/cutils.h"
  19. #include "exec/target_page.h"
  20. #include "rdma.h"
  21. #include "migration.h"
  22. #include "migration-stats.h"
  23. #include "qemu-file.h"
  24. #include "ram.h"
  25. #include "qemu/error-report.h"
  26. #include "qemu/main-loop.h"
  27. #include "qemu/module.h"
  28. #include "qemu/rcu.h"
  29. #include "qemu/sockets.h"
  30. #include "qemu/bitmap.h"
  31. #include "qemu/coroutine.h"
  32. #include "exec/memory.h"
  33. #include <sys/socket.h>
  34. #include <netdb.h>
  35. #include <arpa/inet.h>
  36. #include <rdma/rdma_cma.h>
  37. #include "trace.h"
  38. #include "qom/object.h"
  39. #include "options.h"
  40. #include <poll.h>
  41. #define RDMA_RESOLVE_TIMEOUT_MS 10000
  42. /* Do not merge data if larger than this. */
  43. #define RDMA_MERGE_MAX (2 * 1024 * 1024)
  44. #define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
  45. #define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */
  46. /*
  47. * This is only for non-live state being migrated.
  48. * Instead of RDMA_WRITE messages, we use RDMA_SEND
  49. * messages for that state, which requires a different
  50. * delivery design than main memory.
  51. */
  52. #define RDMA_SEND_INCREMENT 32768
  53. /*
  54. * Maximum size infiniband SEND message
  55. */
  56. #define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
  57. #define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096
  58. #define RDMA_CONTROL_VERSION_CURRENT 1
  59. /*
  60. * Capabilities for negotiation.
  61. */
  62. #define RDMA_CAPABILITY_PIN_ALL 0x01
  63. /*
  64. * Add the other flags above to this list of known capabilities
  65. * as they are introduced.
  66. */
  67. static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
  68. /*
  69. * A work request ID is 64-bits and we split up these bits
  70. * into 3 parts:
  71. *
  72. * bits 0-15 : type of control message, 2^16
  73. * bits 16-29: ram block index, 2^14
  74. * bits 30-63: ram block chunk number, 2^34
  75. *
  76. * The last two bit ranges are only used for RDMA writes,
  77. * in order to track their completion and potentially
  78. * also track unregistration status of the message.
  79. */
  80. #define RDMA_WRID_TYPE_SHIFT 0UL
  81. #define RDMA_WRID_BLOCK_SHIFT 16UL
  82. #define RDMA_WRID_CHUNK_SHIFT 30UL
  83. #define RDMA_WRID_TYPE_MASK \
  84. ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)
  85. #define RDMA_WRID_BLOCK_MASK \
  86. (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))
  87. #define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
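/*
 * Illustrative encoding/decoding with the masks above: an RDMA write of
 * chunk C in RAM block B carries
 *     wr_id = (C << RDMA_WRID_CHUNK_SHIFT) |
 *             (B << RDMA_WRID_BLOCK_SHIFT) | RDMA_WRID_RDMA_WRITE
 * and on completion the fields are recovered as
 *     type  = wr_id & RDMA_WRID_TYPE_MASK
 *     block = (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT
 *     chunk = (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT
 */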
  88. /*
  89. * RDMA migration protocol:
  90. * 1. RDMA Writes (data messages, i.e. RAM)
  91. * 2. IB Send/Recv (control channel messages)
  92. */
  93. enum {
  94. RDMA_WRID_NONE = 0,
  95. RDMA_WRID_RDMA_WRITE = 1,
  96. RDMA_WRID_SEND_CONTROL = 2000,
  97. RDMA_WRID_RECV_CONTROL = 4000,
  98. };
  99. /*
  100. * Work request IDs for IB SEND messages only (not RDMA writes).
  101. * This is used by the migration protocol to transmit
  102. * control messages (such as device state and registration commands)
  103. *
  104. * We could use more WRs, but we have enough for now.
  105. */
  106. enum {
  107. RDMA_WRID_READY = 0,
  108. RDMA_WRID_DATA,
  109. RDMA_WRID_CONTROL,
  110. RDMA_WRID_MAX,
  111. };
  112. /*
  113. * SEND/RECV IB Control Messages.
  114. */
  115. enum {
  116. RDMA_CONTROL_NONE = 0,
  117. RDMA_CONTROL_ERROR,
  118. RDMA_CONTROL_READY, /* ready to receive */
  119. RDMA_CONTROL_QEMU_FILE, /* QEMUFile-transmitted bytes */
  120. RDMA_CONTROL_RAM_BLOCKS_REQUEST, /* RAMBlock synchronization */
  121. RDMA_CONTROL_RAM_BLOCKS_RESULT, /* RAMBlock synchronization */
  122. RDMA_CONTROL_COMPRESS, /* page contains repeat values */
  123. RDMA_CONTROL_REGISTER_REQUEST, /* dynamic page registration */
  124. RDMA_CONTROL_REGISTER_RESULT, /* key to use after registration */
  125. RDMA_CONTROL_REGISTER_FINISHED, /* current iteration finished */
  126. RDMA_CONTROL_UNREGISTER_REQUEST, /* dynamic UN-registration */
  127. RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
  128. };
  129. /*
  130. * Memory and MR structures used to represent an IB Send/Recv work request.
  131. * This is *not* used for RDMA writes, only IB Send/Recv.
  132. */
  133. typedef struct {
  134. uint8_t control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
  135. struct ibv_mr *control_mr; /* registration metadata */
  136. size_t control_len; /* length of the message */
  137. uint8_t *control_curr; /* start of unconsumed bytes */
  138. } RDMAWorkRequestData;
  139. /*
  140. * Negotiate RDMA capabilities during connection-setup time.
  141. */
  142. typedef struct {
  143. uint32_t version;
  144. uint32_t flags;
  145. } RDMACapabilities;
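/*
 * Capabilities are exchanged during connection setup in network byte
 * order; convert in place before sending / after receiving.
 */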
  146. static void caps_to_network(RDMACapabilities *cap)
  147. {
  148. cap->version = htonl(cap->version);
  149. cap->flags = htonl(cap->flags);
  150. }
  151. static void network_to_caps(RDMACapabilities *cap)
  152. {
  153. cap->version = ntohl(cap->version);
  154. cap->flags = ntohl(cap->flags);
  155. }
  156. /*
  157. * Representation of a RAMBlock from an RDMA perspective.
  158. * This is not transmitted, only local.
  159. * This and subsequent structures cannot be linked lists
  160. * because we're using a single IB message to transmit
  161. * the information. It's small anyway, so a list is overkill.
  162. */
  163. typedef struct RDMALocalBlock {
  164. char *block_name;
  165. uint8_t *local_host_addr; /* local virtual address */
  166. uint64_t remote_host_addr; /* remote virtual address */
  167. uint64_t offset;
  168. uint64_t length;
  169. struct ibv_mr **pmr; /* MRs for chunk-level registration */
  170. struct ibv_mr *mr; /* MR for non-chunk-level registration */
  171. uint32_t *remote_keys; /* rkeys for chunk-level registration */
  172. uint32_t remote_rkey; /* rkey for non-chunk-level registration */
  173. int index; /* which block are we */
  174. unsigned int src_index; /* (Only used on dest) */
  175. bool is_ram_block;
  176. int nb_chunks;
  177. unsigned long *transit_bitmap;
  178. unsigned long *unregister_bitmap;
  179. } RDMALocalBlock;
  180. /*
  181. * Also represents a RAMblock, but only on the dest.
  182. * This gets transmitted by the dest during connection-time
  183. * to the source VM and then is used to populate the
  184. * corresponding RDMALocalBlock with
  185. * the information needed to perform the actual RDMA.
  186. */
  187. typedef struct QEMU_PACKED RDMADestBlock {
  188. uint64_t remote_host_addr;
  189. uint64_t offset;
  190. uint64_t length;
  191. uint32_t remote_rkey;
  192. uint32_t padding;
  193. } RDMADestBlock;
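/* Map a control message type to a human-readable name for tracing. */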
  194. static const char *control_desc(unsigned int rdma_control)
  195. {
  196. static const char *strs[] = {
  197. [RDMA_CONTROL_NONE] = "NONE",
  198. [RDMA_CONTROL_ERROR] = "ERROR",
  199. [RDMA_CONTROL_READY] = "READY",
  200. [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
  201. [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
  202. [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
  203. [RDMA_CONTROL_COMPRESS] = "COMPRESS",
  204. [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
  205. [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
  206. [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
  207. [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
  208. [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
  209. };
  210. if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
  211. return "??BAD CONTROL VALUE??";
  212. }
  213. return strs[rdma_control];
  214. }
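/* 64-bit byte-order helpers, defined only when the system headers lack them. */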
  215. #if !defined(htonll)
  216. static uint64_t htonll(uint64_t v)
  217. {
  218. union { uint32_t lv[2]; uint64_t llv; } u;
  219. u.lv[0] = htonl(v >> 32);
  220. u.lv[1] = htonl(v & 0xFFFFFFFFULL);
  221. return u.llv;
  222. }
  223. #endif
  224. #if !defined(ntohll)
  225. static uint64_t ntohll(uint64_t v)
  226. {
  227. union { uint32_t lv[2]; uint64_t llv; } u;
  228. u.llv = v;
  229. return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
  230. }
  231. #endif
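/*
 * RDMADestBlock is transmitted over the control channel, so it too is
 * converted to and from network byte order.
 */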
  232. static void dest_block_to_network(RDMADestBlock *db)
  233. {
  234. db->remote_host_addr = htonll(db->remote_host_addr);
  235. db->offset = htonll(db->offset);
  236. db->length = htonll(db->length);
  237. db->remote_rkey = htonl(db->remote_rkey);
  238. }
  239. static void network_to_dest_block(RDMADestBlock *db)
  240. {
  241. db->remote_host_addr = ntohll(db->remote_host_addr);
  242. db->offset = ntohll(db->offset);
  243. db->length = ntohll(db->length);
  244. db->remote_rkey = ntohl(db->remote_rkey);
  245. }
  246. /*
  247. * Virtual address of the above structures used for transmitting
  248. * the RAMBlock descriptions at connection-time.
  249. * This structure is *not* transmitted.
  250. */
  251. typedef struct RDMALocalBlocks {
  252. int nb_blocks;
  253. bool init; /* main memory init complete */
  254. RDMALocalBlock *block;
  255. } RDMALocalBlocks;
  256. /*
  257. * Main data structure for RDMA state.
  258. * While there is only one copy of this structure being allocated right now,
  259. * this is the place to start if you want to consider
  260. * having more than one RDMA connection open at the same time.
  261. */
  262. typedef struct RDMAContext {
  263. char *host;
  264. int port;
  265. RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
  266. /*
  267. * This is used by *_exchange_send() to figure out whether
  268. * the initial "READY" message has already been received.
  269. * This is because other functions may potentially poll() and detect
  270. * the READY message before send() does, in which case we need to
  271. * know if it completed.
  272. */
  273. int control_ready_expected;
  274. /* number of outstanding writes */
  275. int nb_sent;
  276. /* store info about current buffer so that we can
  277. merge it with future sends */
  278. uint64_t current_addr;
  279. uint64_t current_length;
  280. /* index of ram block the current buffer belongs to */
  281. int current_index;
  282. /* index of the chunk in the current ram block */
  283. int current_chunk;
  284. bool pin_all;
  285. /*
  286. * infiniband-specific variables for opening the device
  287. * and maintaining connection state and so forth.
  288. *
  289. * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
  290. * cm_id->verbs, cm_id->channel, and cm_id->qp.
  291. */
  292. struct rdma_cm_id *cm_id; /* connection manager ID */
  293. struct rdma_cm_id *listen_id;
  294. bool connected;
  295. struct ibv_context *verbs;
  296. struct rdma_event_channel *channel;
  297. struct ibv_qp *qp; /* queue pair */
  298. struct ibv_comp_channel *recv_comp_channel; /* recv completion channel */
  299. struct ibv_comp_channel *send_comp_channel; /* send completion channel */
  300. struct ibv_pd *pd; /* protection domain */
  301. struct ibv_cq *recv_cq; /* receive completion queue */
  302. struct ibv_cq *send_cq; /* send completion queue */
  303. /*
  304. * If a previous write failed (perhaps because of a failed
  305. * memory registration), then do not attempt any future work
  306. * and remember the error state.
  307. */
  308. bool errored;
  309. bool error_reported;
  310. bool received_error;
  311. /*
  312. * Description of ram blocks used throughout the code.
  313. */
  314. RDMALocalBlocks local_ram_blocks;
  315. RDMADestBlock *dest_blocks;
  316. /* Index of the next RAMBlock received during block registration */
  317. unsigned int next_src_index;
  318. /*
  319. * Migration on the *destination* has started; once it has,
  320. * use the coroutine yield function when waiting.
  321. * The source runs in a thread, so we don't care there.
  322. */
  323. int migration_started_on_destination;
  324. int total_registrations;
  325. int total_writes;
  326. int unregister_current, unregister_next;
  327. uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];
  328. GHashTable *blockmap;
  329. /* the RDMAContext for return path */
  330. struct RDMAContext *return_path;
  331. bool is_return_path;
  332. } RDMAContext;
  333. #define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
  334. OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)
  335. struct QIOChannelRDMA {
  336. QIOChannel parent;
  337. RDMAContext *rdmain;
  338. RDMAContext *rdmaout;
  339. QEMUFile *file;
  340. bool blocking; /* XXX we don't actually honour this yet */
  341. };
  342. /*
  343. * Main structure for IB Send/Recv control messages.
  344. * This gets prepended at the beginning of every Send/Recv.
  345. */
  346. typedef struct QEMU_PACKED {
  347. uint32_t len; /* Total length of data portion */
  348. uint32_t type; /* which control command to perform */
  349. uint32_t repeat; /* number of commands in data portion of same type */
  350. uint32_t padding;
  351. } RDMAControlHeader;
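/* The control header itself travels in network byte order as well. */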
  352. static void control_to_network(RDMAControlHeader *control)
  353. {
  354. control->type = htonl(control->type);
  355. control->len = htonl(control->len);
  356. control->repeat = htonl(control->repeat);
  357. }
  358. static void network_to_control(RDMAControlHeader *control)
  359. {
  360. control->type = ntohl(control->type);
  361. control->len = ntohl(control->len);
  362. control->repeat = ntohl(control->repeat);
  363. }
  364. /*
  365. * Register a single Chunk.
  366. * Information sent by the source VM to inform the dest
  367. * to register a single chunk of memory before we can perform
  368. * the actual RDMA operation.
  369. */
  370. typedef struct QEMU_PACKED {
  371. union QEMU_PACKED {
  372. uint64_t current_addr; /* offset into the ram_addr_t space */
  373. uint64_t chunk; /* chunk to lookup if unregistering */
  374. } key;
  375. uint32_t current_index; /* which ramblock the chunk belongs to */
  376. uint32_t padding;
  377. uint64_t chunks; /* how many sequential chunks to register */
  378. } RDMARegister;
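/*
 * Return the sticky error state of the context, reporting it the first
 * time it is observed.
 */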
  379. static bool rdma_errored(RDMAContext *rdma)
  380. {
  381. if (rdma->errored && !rdma->error_reported) {
  382. error_report("RDMA is in an error state waiting migration"
  383. " to abort!");
  384. rdma->error_reported = true;
  385. }
  386. return rdma->errored;
  387. }
  388. static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
  389. {
  390. RDMALocalBlock *local_block;
  391. local_block = &rdma->local_ram_blocks.block[reg->current_index];
  392. if (local_block->is_ram_block) {
  393. /*
  394. * current_addr as passed in is an address in the local ram_addr_t
  395. * space, we need to translate this for the destination
  396. */
  397. reg->key.current_addr -= local_block->offset;
  398. reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
  399. }
  400. reg->key.current_addr = htonll(reg->key.current_addr);
  401. reg->current_index = htonl(reg->current_index);
  402. reg->chunks = htonll(reg->chunks);
  403. }
  404. static void network_to_register(RDMARegister *reg)
  405. {
  406. reg->key.current_addr = ntohll(reg->key.current_addr);
  407. reg->current_index = ntohl(reg->current_index);
  408. reg->chunks = ntohll(reg->chunks);
  409. }
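/*
 * Describes a chunk whose pages all hold the same repeated byte value,
 * so no RDMA write is needed; the destination can recreate (or, when
 * the value is zero, madvise away) the contents locally.
 */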
  410. typedef struct QEMU_PACKED {
  411. uint32_t value; /* if zero, we will madvise() */
  412. uint32_t block_idx; /* which ram block index */
  413. uint64_t offset; /* Address in remote ram_addr_t space */
  414. uint64_t length; /* length of the chunk */
  415. } RDMACompress;
  416. static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
  417. {
  418. comp->value = htonl(comp->value);
  419. /*
  420. * comp->offset as passed in is an address in the local ram_addr_t
  421. * space, we need to translate this for the destination
  422. */
  423. comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
  424. comp->offset += rdma->dest_blocks[comp->block_idx].offset;
  425. comp->block_idx = htonl(comp->block_idx);
  426. comp->offset = htonll(comp->offset);
  427. comp->length = htonll(comp->length);
  428. }
  429. static void network_to_compress(RDMACompress *comp)
  430. {
  431. comp->value = ntohl(comp->value);
  432. comp->block_idx = ntohl(comp->block_idx);
  433. comp->offset = ntohll(comp->offset);
  434. comp->length = ntohll(comp->length);
  435. }
  436. /*
  437. * The result of the dest's memory registration produces an "rkey"
  438. * which the source VM must reference in order to perform
  439. * the RDMA operation.
  440. */
  441. typedef struct QEMU_PACKED {
  442. uint32_t rkey;
  443. uint32_t padding;
  444. uint64_t host_addr;
  445. } RDMARegisterResult;
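/* Registration results are likewise exchanged in network byte order. */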
  446. static void result_to_network(RDMARegisterResult *result)
  447. {
  448. result->rkey = htonl(result->rkey);
  449. result->host_addr = htonll(result->host_addr);
  450. };
  451. static void network_to_result(RDMARegisterResult *result)
  452. {
  453. result->rkey = ntohl(result->rkey);
  454. result->host_addr = ntohll(result->host_addr);
  455. };
  456. static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
  457. uint8_t *data, RDMAControlHeader *resp,
  458. int *resp_idx,
  459. int (*callback)(RDMAContext *rdma,
  460. Error **errp),
  461. Error **errp);
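/*
 * Chunk arithmetic: translate a host address within a block into its
 * chunk index, and compute a chunk's start and end addresses (the end
 * is clamped to the block length).
 */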
  462. static inline uint64_t ram_chunk_index(const uint8_t *start,
  463. const uint8_t *host)
  464. {
  465. return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
  466. }
  467. static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
  468. uint64_t i)
  469. {
  470. return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
  471. (i << RDMA_REG_CHUNK_SHIFT));
  472. }
  473. static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
  474. uint64_t i)
  475. {
  476. uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
  477. (1UL << RDMA_REG_CHUNK_SHIFT);
  478. if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
  479. result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
  480. }
  481. return result;
  482. }
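/*
 * Append a new RDMALocalBlock for @block_name: grow the block array,
 * fill in the chunk bookkeeping, and keep the offset->block hash map
 * (if present) pointing at the reallocated entries.
 */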
  483. static void rdma_add_block(RDMAContext *rdma, const char *block_name,
  484. void *host_addr,
  485. ram_addr_t block_offset, uint64_t length)
  486. {
  487. RDMALocalBlocks *local = &rdma->local_ram_blocks;
  488. RDMALocalBlock *block;
  489. RDMALocalBlock *old = local->block;
  490. local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);
  491. if (local->nb_blocks) {
  492. if (rdma->blockmap) {
  493. for (int x = 0; x < local->nb_blocks; x++) {
  494. g_hash_table_remove(rdma->blockmap,
  495. (void *)(uintptr_t)old[x].offset);
  496. g_hash_table_insert(rdma->blockmap,
  497. (void *)(uintptr_t)old[x].offset,
  498. &local->block[x]);
  499. }
  500. }
  501. memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
  502. g_free(old);
  503. }
  504. block = &local->block[local->nb_blocks];
  505. block->block_name = g_strdup(block_name);
  506. block->local_host_addr = host_addr;
  507. block->offset = block_offset;
  508. block->length = length;
  509. block->index = local->nb_blocks;
  510. block->src_index = ~0U; /* Filled in by the receipt of the block list */
  511. block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
  512. block->transit_bitmap = bitmap_new(block->nb_chunks);
  513. bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
  514. block->unregister_bitmap = bitmap_new(block->nb_chunks);
  515. bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
  516. block->remote_keys = g_new0(uint32_t, block->nb_chunks);
  517. block->is_ram_block = local->init ? false : true;
  518. if (rdma->blockmap) {
  519. g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
  520. }
  521. trace_rdma_add_block(block_name, local->nb_blocks,
  522. (uintptr_t) block->local_host_addr,
  523. block->offset, block->length,
  524. (uintptr_t) (block->local_host_addr + block->length),
  525. BITS_TO_LONGS(block->nb_chunks) *
  526. sizeof(unsigned long) * 8,
  527. block->nb_chunks);
  528. local->nb_blocks++;
  529. }
  530. /*
  531. * Memory regions need to be registered with the device and queue pairs set up
  532. * in advance before the migration starts. This tells us where the RAM blocks
  533. * are so that we can register them individually.
  534. */
  535. static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
  536. {
  537. const char *block_name = qemu_ram_get_idstr(rb);
  538. void *host_addr = qemu_ram_get_host_addr(rb);
  539. ram_addr_t block_offset = qemu_ram_get_offset(rb);
  540. ram_addr_t length = qemu_ram_get_used_length(rb);
  541. rdma_add_block(opaque, block_name, host_addr, block_offset, length);
  542. return 0;
  543. }
  544. /*
  545. * Identify the RAMBlocks and their quantity. They will be used to
  546. * identify chunk boundaries inside each RAMBlock and also be referenced
  547. * during dynamic page registration.
  548. */
  549. static void qemu_rdma_init_ram_blocks(RDMAContext *rdma)
  550. {
  551. RDMALocalBlocks *local = &rdma->local_ram_blocks;
  552. int ret;
  553. assert(rdma->blockmap == NULL);
  554. memset(local, 0, sizeof *local);
  555. ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
  556. assert(!ret);
  557. trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
  558. rdma->dest_blocks = g_new0(RDMADestBlock,
  559. rdma->local_ram_blocks.nb_blocks);
  560. local->init = true;
  561. }
  562. /*
  563. * Note: If used outside of cleanup, the caller must ensure that the destination
  564. * block structures are also updated.
  565. */
  566. static void rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
  567. {
  568. RDMALocalBlocks *local = &rdma->local_ram_blocks;
  569. RDMALocalBlock *old = local->block;
  570. if (rdma->blockmap) {
  571. g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
  572. }
  573. if (block->pmr) {
  574. for (int j = 0; j < block->nb_chunks; j++) {
  575. if (!block->pmr[j]) {
  576. continue;
  577. }
  578. ibv_dereg_mr(block->pmr[j]);
  579. rdma->total_registrations--;
  580. }
  581. g_free(block->pmr);
  582. block->pmr = NULL;
  583. }
  584. if (block->mr) {
  585. ibv_dereg_mr(block->mr);
  586. rdma->total_registrations--;
  587. block->mr = NULL;
  588. }
  589. g_free(block->transit_bitmap);
  590. block->transit_bitmap = NULL;
  591. g_free(block->unregister_bitmap);
  592. block->unregister_bitmap = NULL;
  593. g_free(block->remote_keys);
  594. block->remote_keys = NULL;
  595. g_free(block->block_name);
  596. block->block_name = NULL;
  597. if (rdma->blockmap) {
  598. for (int x = 0; x < local->nb_blocks; x++) {
  599. g_hash_table_remove(rdma->blockmap,
  600. (void *)(uintptr_t)old[x].offset);
  601. }
  602. }
  603. if (local->nb_blocks > 1) {
  604. local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);
  605. if (block->index) {
  606. memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
  607. }
  608. if (block->index < (local->nb_blocks - 1)) {
  609. memcpy(local->block + block->index, old + (block->index + 1),
  610. sizeof(RDMALocalBlock) *
  611. (local->nb_blocks - (block->index + 1)));
  612. for (int x = block->index; x < local->nb_blocks - 1; x++) {
  613. local->block[x].index--;
  614. }
  615. }
  616. } else {
  617. assert(block == local->block);
  618. local->block = NULL;
  619. }
  620. trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
  621. block->offset, block->length,
  622. (uintptr_t)(block->local_host_addr + block->length),
  623. BITS_TO_LONGS(block->nb_chunks) *
  624. sizeof(unsigned long) * 8, block->nb_chunks);
  625. g_free(old);
  626. local->nb_blocks--;
  627. if (local->nb_blocks && rdma->blockmap) {
  628. for (int x = 0; x < local->nb_blocks; x++) {
  629. g_hash_table_insert(rdma->blockmap,
  630. (void *)(uintptr_t)local->block[x].offset,
  631. &local->block[x]);
  632. }
  633. }
  634. }
  635. /*
  636. * Trace RDMA device open, with device details.
  637. */
  638. static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
  639. {
  640. struct ibv_port_attr port;
  641. if (ibv_query_port(verbs, 1, &port)) {
  642. trace_qemu_rdma_dump_id_failed(who);
  643. return;
  644. }
  645. trace_qemu_rdma_dump_id(who,
  646. verbs->device->name,
  647. verbs->device->dev_name,
  648. verbs->device->dev_path,
  649. verbs->device->ibdev_path,
  650. port.link_layer,
  651. port.link_layer == IBV_LINK_LAYER_INFINIBAND ? "Infiniband"
  652. : port.link_layer == IBV_LINK_LAYER_ETHERNET ? "Ethernet"
  653. : "Unknown");
  654. }
  655. /*
  656. * Trace RDMA gid addressing information.
  657. * Useful for understanding the RDMA device hierarchy in the kernel.
  658. */
  659. static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
  660. {
  661. char sgid[33];
  662. char dgid[33];
  663. inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
  664. inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
  665. trace_qemu_rdma_dump_gid(who, sgid, dgid);
  666. }
  667. /*
  668. * As of now, IPv6 over RoCE / iWARP is not supported by linux.
  669. * We will try the next addrinfo struct, and fail if there are
  670. * no other valid addresses to bind against.
  671. *
  672. * If the user is listening on '[::]', then we will not have opened a device
  673. * yet and have no way of verifying whether the device is RoCE or not.
  674. *
  675. * In this case, the source VM will throw an error for ALL types of
  676. * connections (both IPv4 and IPv6) if the destination machine does not have
  677. * a regular infiniband network available for use.
  678. *
  679. * The only way to guarantee that an error is thrown for broken kernels is
  680. * for the management software to choose a *specific* interface at bind time
  681. * and validate what type of hardware it is.
  682. *
  683. * Unfortunately, this puts the user in a fix:
  684. *
  685. * If the source VM connects with an IPv4 address without knowing that the
  686. * destination has bound to '[::]' the migration will unconditionally fail
  687. * unless the management software is explicitly listening on the IPv4
  688. * address while using a RoCE-based device.
  689. *
  690. * If the source VM connects with an IPv6 address, then we're OK because we can
  691. * throw an error on the source (and similarly on the destination).
  692. *
  693. * But in mixed environments, this will be broken for a while until it is fixed
  694. * inside linux.
  695. *
  696. * We do provide a *tiny* bit of help in this function: We can list all of the
  697. * devices in the system and check to see if all the devices are RoCE or
  698. * Infiniband.
  699. *
  700. * If we detect that we have a *pure* RoCE environment, then we can safely
  701. * throw an error even if the management software has specified '[::]' as the
  702. * bind address.
  703. *
  704. * However, if there are multiple heterogeneous devices, then we cannot make
  705. * this assumption and the user just has to be sure they know what they are
  706. * doing.
  707. *
  708. * Patches are being reviewed on linux-rdma.
  709. */
  710. static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
  711. {
  712. /* This bug only exists in linux, to our knowledge. */
  713. #ifdef CONFIG_LINUX
  714. struct ibv_port_attr port_attr;
  715. /*
  716. * Verbs are only NULL if management has bound to '[::]'.
  717. *
  718. * Let's iterate through all the devices and see if there are any pure IB
  719. * devices (non-ethernet).
  720. *
  721. * If not, then we can safely proceed with the migration.
  722. * Otherwise, there are no guarantees until the bug is fixed in linux.
  723. */
  724. if (!verbs) {
  725. int num_devices;
  726. struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
  727. bool roce_found = false;
  728. bool ib_found = false;
  729. for (int x = 0; x < num_devices; x++) {
  730. verbs = ibv_open_device(dev_list[x]);
  731. /*
  732. * ibv_open_device() is not documented to set errno. If
  733. * it does, it's somebody else's doc bug. If it doesn't,
  734. * the use of errno below is wrong.
  735. * TODO Find out whether ibv_open_device() sets errno.
  736. */
  737. if (!verbs) {
  738. if (errno == EPERM) {
  739. continue;
  740. } else {
  741. error_setg_errno(errp, errno,
  742. "could not open RDMA device context");
  743. return -1;
  744. }
  745. }
  746. if (ibv_query_port(verbs, 1, &port_attr)) {
  747. ibv_close_device(verbs);
  748. error_setg(errp,
  749. "RDMA ERROR: Could not query initial IB port");
  750. return -1;
  751. }
  752. if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
  753. ib_found = true;
  754. } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
  755. roce_found = true;
  756. }
  757. ibv_close_device(verbs);
  758. }
  759. if (roce_found) {
  760. if (ib_found) {
  761. warn_report("migrations may fail:"
  762. " IPv6 over RoCE / iWARP in linux"
  763. " is broken. But since you appear to have a"
  764. " mixed RoCE / IB environment, be sure to only"
  765. " migrate over the IB fabric until the kernel "
  766. " fixes the bug.");
  767. } else {
  768. error_setg(errp, "RDMA ERROR: "
  769. "You only have RoCE / iWARP devices in your systems"
  770. " and your management software has specified '[::]'"
  771. ", but IPv6 over RoCE / iWARP is not supported in Linux.");
  772. return -1;
  773. }
  774. }
  775. return 0;
  776. }
  777. /*
  778. * If we have a verbs context, that means that something other than '[::]' was
  779. * used by the management software for binding. In which case we can
  780. * actually warn the user about a potentially broken kernel.
  781. */
  782. /* IB ports start with 1, not 0 */
  783. if (ibv_query_port(verbs, 1, &port_attr)) {
  784. error_setg(errp, "RDMA ERROR: Could not query initial IB port");
  785. return -1;
  786. }
  787. if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
  788. error_setg(errp, "RDMA ERROR: "
  789. "Linux kernel's RoCE / iWARP does not support IPv6 "
  790. "(but patches on linux-rdma in progress)");
  791. return -1;
  792. }
  793. #endif
  794. return 0;
  795. }
  796. /*
  797. * Figure out which RDMA device corresponds to the requested IP hostname
  798. * Also create the initial connection manager identifiers for opening
  799. * the connection.
  800. */
  801. static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
  802. {
  803. Error *err = NULL;
  804. int ret;
  805. struct rdma_addrinfo *res;
  806. char port_str[16];
  807. struct rdma_cm_event *cm_event;
  808. char ip[40] = "unknown";
  809. if (rdma->host == NULL || !strcmp(rdma->host, "")) {
  810. error_setg(errp, "RDMA ERROR: RDMA hostname has not been set");
  811. return -1;
  812. }
  813. /* create CM channel */
  814. rdma->channel = rdma_create_event_channel();
  815. if (!rdma->channel) {
  816. error_setg(errp, "RDMA ERROR: could not create CM channel");
  817. return -1;
  818. }
  819. /* create CM id */
  820. ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
  821. if (ret < 0) {
  822. error_setg(errp, "RDMA ERROR: could not create channel id");
  823. goto err_resolve_create_id;
  824. }
  825. snprintf(port_str, 16, "%d", rdma->port);
  826. port_str[15] = '\0';
  827. ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
  828. if (ret) {
  829. error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s",
  830. rdma->host);
  831. goto err_resolve_get_addr;
  832. }
  833. /* Try all addresses, saving the first error in @err */
  834. for (struct rdma_addrinfo *e = res; e != NULL; e = e->ai_next) {
  835. Error **local_errp = err ? NULL : &err;
  836. inet_ntop(e->ai_family,
  837. &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
  838. trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
  839. ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
  840. RDMA_RESOLVE_TIMEOUT_MS);
  841. if (ret >= 0) {
  842. if (e->ai_family == AF_INET6) {
  843. ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs,
  844. local_errp);
  845. if (ret < 0) {
  846. continue;
  847. }
  848. }
  849. error_free(err);
  850. goto route;
  851. }
  852. }
  853. rdma_freeaddrinfo(res);
  854. if (err) {
  855. error_propagate(errp, err);
  856. } else {
  857. error_setg(errp, "RDMA ERROR: could not resolve address %s",
  858. rdma->host);
  859. }
  860. goto err_resolve_get_addr;
  861. route:
  862. rdma_freeaddrinfo(res);
  863. qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
  864. ret = rdma_get_cm_event(rdma->channel, &cm_event);
  865. if (ret < 0) {
  866. error_setg(errp, "RDMA ERROR: could not perform event_addr_resolved");
  867. goto err_resolve_get_addr;
  868. }
  869. if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
  870. error_setg(errp,
  871. "RDMA ERROR: result not equal to event_addr_resolved %s",
  872. rdma_event_str(cm_event->event));
  873. rdma_ack_cm_event(cm_event);
  874. goto err_resolve_get_addr;
  875. }
  876. rdma_ack_cm_event(cm_event);
  877. /* resolve route */
  878. ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
  879. if (ret < 0) {
  880. error_setg(errp, "RDMA ERROR: could not resolve rdma route");
  881. goto err_resolve_get_addr;
  882. }
  883. ret = rdma_get_cm_event(rdma->channel, &cm_event);
  884. if (ret < 0) {
  885. error_setg(errp, "RDMA ERROR: could not perform event_route_resolved");
  886. goto err_resolve_get_addr;
  887. }
  888. if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
  889. error_setg(errp, "RDMA ERROR: "
  890. "result not equal to event_route_resolved: %s",
  891. rdma_event_str(cm_event->event));
  892. rdma_ack_cm_event(cm_event);
  893. goto err_resolve_get_addr;
  894. }
  895. rdma_ack_cm_event(cm_event);
  896. rdma->verbs = rdma->cm_id->verbs;
  897. qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
  898. qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
  899. return 0;
  900. err_resolve_get_addr:
  901. rdma_destroy_id(rdma->cm_id);
  902. rdma->cm_id = NULL;
  903. err_resolve_create_id:
  904. rdma_destroy_event_channel(rdma->channel);
  905. rdma->channel = NULL;
  906. return -1;
  907. }
  908. /*
  909. * Create protection domain and completion queues
  910. */
  911. static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma, Error **errp)
  912. {
  913. /* allocate pd */
  914. rdma->pd = ibv_alloc_pd(rdma->verbs);
  915. if (!rdma->pd) {
  916. error_setg(errp, "failed to allocate protection domain");
  917. return -1;
  918. }
  919. /* create receive completion channel */
  920. rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
  921. if (!rdma->recv_comp_channel) {
  922. error_setg(errp, "failed to allocate receive completion channel");
  923. goto err_alloc_pd_cq;
  924. }
  925. /*
  926. * Completion queue can be filled by read work requests.
  927. */
  928. rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
  929. NULL, rdma->recv_comp_channel, 0);
  930. if (!rdma->recv_cq) {
  931. error_setg(errp, "failed to allocate receive completion queue");
  932. goto err_alloc_pd_cq;
  933. }
  934. /* create send completion channel */
  935. rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
  936. if (!rdma->send_comp_channel) {
  937. error_setg(errp, "failed to allocate send completion channel");
  938. goto err_alloc_pd_cq;
  939. }
  940. rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
  941. NULL, rdma->send_comp_channel, 0);
  942. if (!rdma->send_cq) {
  943. error_setg(errp, "failed to allocate send completion queue");
  944. goto err_alloc_pd_cq;
  945. }
  946. return 0;
  947. err_alloc_pd_cq:
  948. if (rdma->pd) {
  949. ibv_dealloc_pd(rdma->pd);
  950. }
  951. if (rdma->recv_comp_channel) {
  952. ibv_destroy_comp_channel(rdma->recv_comp_channel);
  953. }
  954. if (rdma->send_comp_channel) {
  955. ibv_destroy_comp_channel(rdma->send_comp_channel);
  956. }
  957. if (rdma->recv_cq) {
  958. ibv_destroy_cq(rdma->recv_cq);
  959. rdma->recv_cq = NULL;
  960. }
  961. rdma->pd = NULL;
  962. rdma->recv_comp_channel = NULL;
  963. rdma->send_comp_channel = NULL;
  964. return -1;
  965. }
  966. /*
  967. * Create queue pairs.
  968. */
  969. static int qemu_rdma_alloc_qp(RDMAContext *rdma)
  970. {
  971. struct ibv_qp_init_attr attr = { 0 };
  972. attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
  973. attr.cap.max_recv_wr = 3;
  974. attr.cap.max_send_sge = 1;
  975. attr.cap.max_recv_sge = 1;
  976. attr.send_cq = rdma->send_cq;
  977. attr.recv_cq = rdma->recv_cq;
  978. attr.qp_type = IBV_QPT_RC;
  979. if (rdma_create_qp(rdma->cm_id, rdma->pd, &attr) < 0) {
  980. return -1;
  981. }
  982. rdma->qp = rdma->cm_id->qp;
  983. return 0;
  984. }
  985. /* Check whether On-Demand Paging is supported by the RDMA device */
  986. static bool rdma_support_odp(struct ibv_context *dev)
  987. {
  988. struct ibv_device_attr_ex attr = {0};
  989. if (ibv_query_device_ex(dev, NULL, &attr)) {
  990. return false;
  991. }
  992. if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
  993. return true;
  994. }
  995. return false;
  996. }
  997. /*
  998. * ibv_advise_mr to avoid RNR NAK error as far as possible.
  999. * An MR registered with ODP on the responder will send an RNR NAK back to
  1000. * the requester when it hits a page fault.
  1001. */
  1002. static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
  1003. uint32_t len, uint32_t lkey,
  1004. const char *name, bool wr)
  1005. {
  1006. #ifdef HAVE_IBV_ADVISE_MR
  1007. int ret;
  1008. int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
  1009. IBV_ADVISE_MR_ADVICE_PREFETCH;
  1010. struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len};
  1011. ret = ibv_advise_mr(pd, advice,
  1012. IBV_ADVISE_MR_FLAG_FLUSH, &sg_list, 1);
  1013. /* ignore the error */
  1014. trace_qemu_rdma_advise_mr(name, len, addr, strerror(ret));
  1015. #endif
  1016. }
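/*
 * Register every local RAM block in full up front, falling back to an
 * On-Demand Paging registration when the device reports that a plain
 * registration is unsupported.
 */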
  1017. static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma, Error **errp)
  1018. {
  1019. int i;
  1020. RDMALocalBlocks *local = &rdma->local_ram_blocks;
  1021. for (i = 0; i < local->nb_blocks; i++) {
  1022. int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
  1023. local->block[i].mr =
  1024. ibv_reg_mr(rdma->pd,
  1025. local->block[i].local_host_addr,
  1026. local->block[i].length, access
  1027. );
  1028. /*
  1029. * ibv_reg_mr() is not documented to set errno. If it does,
  1030. * it's somebody else's doc bug. If it doesn't, the use of
  1031. * errno below is wrong.
  1032. * TODO Find out whether ibv_reg_mr() sets errno.
  1033. */
  1034. if (!local->block[i].mr &&
  1035. errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
  1036. access |= IBV_ACCESS_ON_DEMAND;
  1037. /* register ODP mr */
  1038. local->block[i].mr =
  1039. ibv_reg_mr(rdma->pd,
  1040. local->block[i].local_host_addr,
  1041. local->block[i].length, access);
  1042. trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
  1043. if (local->block[i].mr) {
  1044. qemu_rdma_advise_prefetch_mr(rdma->pd,
  1045. (uintptr_t)local->block[i].local_host_addr,
  1046. local->block[i].length,
  1047. local->block[i].mr->lkey,
  1048. local->block[i].block_name,
  1049. true);
  1050. }
  1051. }
  1052. if (!local->block[i].mr) {
  1053. error_setg_errno(errp, errno,
  1054. "Failed to register local dest ram block!");
  1055. goto err;
  1056. }
  1057. rdma->total_registrations++;
  1058. }
  1059. return 0;
  1060. err:
  1061. for (i--; i >= 0; i--) {
  1062. ibv_dereg_mr(local->block[i].mr);
  1063. local->block[i].mr = NULL;
  1064. rdma->total_registrations--;
  1065. }
  1066. return -1;
  1067. }
  1068. /*
  1069. * Find the ram block that corresponds to the page requested to be
  1070. * transmitted by QEMU.
  1071. *
  1072. * Once the block is found, also identify which 'chunk' within that
  1073. * block the page belongs to.
  1074. */
  1075. static void qemu_rdma_search_ram_block(RDMAContext *rdma,
  1076. uintptr_t block_offset,
  1077. uint64_t offset,
  1078. uint64_t length,
  1079. uint64_t *block_index,
  1080. uint64_t *chunk_index)
  1081. {
  1082. uint64_t current_addr = block_offset + offset;
  1083. RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
  1084. (void *) block_offset);
  1085. assert(block);
  1086. assert(current_addr >= block->offset);
  1087. assert((current_addr + length) <= (block->offset + block->length));
  1088. *block_index = block->index;
  1089. *chunk_index = ram_chunk_index(block->local_host_addr,
  1090. block->local_host_addr + (current_addr - block->offset));
  1091. }
  1092. /*
1093. * Register a chunk with IB. If the chunk was registered
1094. * previously, then skip.
  1095. *
  1096. * Also return the keys associated with the registration needed
  1097. * to perform the actual RDMA operation.
  1098. */
  1099. static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
  1100. RDMALocalBlock *block, uintptr_t host_addr,
  1101. uint32_t *lkey, uint32_t *rkey, int chunk,
  1102. uint8_t *chunk_start, uint8_t *chunk_end)
  1103. {
  1104. if (block->mr) {
  1105. if (lkey) {
  1106. *lkey = block->mr->lkey;
  1107. }
  1108. if (rkey) {
  1109. *rkey = block->mr->rkey;
  1110. }
  1111. return 0;
  1112. }
  1113. /* allocate memory to store chunk MRs */
  1114. if (!block->pmr) {
  1115. block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
  1116. }
  1117. /*
  1118. * If 'rkey', then we're the destination, so grant access to the source.
  1119. *
  1120. * If 'lkey', then we're the source VM, so grant access only to ourselves.
  1121. */
  1122. if (!block->pmr[chunk]) {
  1123. uint64_t len = chunk_end - chunk_start;
  1124. int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
  1125. 0;
  1126. trace_qemu_rdma_register_and_get_keys(len, chunk_start);
  1127. block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
  1128. /*
  1129. * ibv_reg_mr() is not documented to set errno. If it does,
  1130. * it's somebody else's doc bug. If it doesn't, the use of
  1131. * errno below is wrong.
  1132. * TODO Find out whether ibv_reg_mr() sets errno.
  1133. */
  1134. if (!block->pmr[chunk] &&
  1135. errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
  1136. access |= IBV_ACCESS_ON_DEMAND;
  1137. /* register ODP mr */
  1138. block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
  1139. trace_qemu_rdma_register_odp_mr(block->block_name);
  1140. if (block->pmr[chunk]) {
  1141. qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
  1142. len, block->pmr[chunk]->lkey,
  1143. block->block_name, rkey);
  1144. }
  1145. }
  1146. }
  1147. if (!block->pmr[chunk]) {
  1148. return -1;
  1149. }
  1150. rdma->total_registrations++;
  1151. if (lkey) {
  1152. *lkey = block->pmr[chunk]->lkey;
  1153. }
  1154. if (rkey) {
  1155. *rkey = block->pmr[chunk]->rkey;
  1156. }
  1157. return 0;
  1158. }
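/*
 * Illustrative sketch (not compiled in): a sender-side caller of
 * qemu_rdma_register_and_get_keys() only needs the local key for its
 * scatter/gather entry, so it passes NULL for rkey, mirroring how
 * qemu_rdma_write_one() uses it further below. The helper name is made
 * up for illustration.
 */
#if 0
static int example_get_lkey_for_chunk(RDMAContext *rdma, RDMALocalBlock *block,
                                       struct ibv_sge *sge, int chunk,
                                       uint8_t *chunk_start, uint8_t *chunk_end)
{
    /* Registers the chunk on first use, or reuses the cached MR */
    if (qemu_rdma_register_and_get_keys(rdma, block, sge->addr,
                                        &sge->lkey, NULL, chunk,
                                        chunk_start, chunk_end)) {
        return -1; /* registration failed, no lkey available */
    }
    return 0;
}
#endif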
  1159. /*
  1160. * Register (at connection time) the memory used for control
  1161. * channel messages.
  1162. */
  1163. static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
  1164. {
  1165. rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
  1166. rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
  1167. IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
  1168. if (rdma->wr_data[idx].control_mr) {
  1169. rdma->total_registrations++;
  1170. return 0;
  1171. }
  1172. return -1;
  1173. }
  1174. /*
  1175. * Perform a non-optimized memory unregistration after every transfer
  1176. * for demonstration purposes, only if pin-all is not requested.
  1177. *
  1178. * Potential optimizations:
  1179. * 1. Start a new thread to run this function continuously
1180. *    - for bit clearing
1181. *    - and for receipt of unregister messages
  1182. * 2. Use an LRU.
  1183. * 3. Use workload hints.
  1184. */
  1185. static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
  1186. {
  1187. Error *err = NULL;
  1188. while (rdma->unregistrations[rdma->unregister_current]) {
  1189. int ret;
  1190. uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
  1191. uint64_t chunk =
  1192. (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
  1193. uint64_t index =
  1194. (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
  1195. RDMALocalBlock *block =
  1196. &(rdma->local_ram_blocks.block[index]);
  1197. RDMARegister reg = { .current_index = index };
  1198. RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
  1199. };
  1200. RDMAControlHeader head = { .len = sizeof(RDMARegister),
  1201. .type = RDMA_CONTROL_UNREGISTER_REQUEST,
  1202. .repeat = 1,
  1203. };
  1204. trace_qemu_rdma_unregister_waiting_proc(chunk,
  1205. rdma->unregister_current);
  1206. rdma->unregistrations[rdma->unregister_current] = 0;
  1207. rdma->unregister_current++;
  1208. if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
  1209. rdma->unregister_current = 0;
  1210. }
  1211. /*
  1212. * Unregistration is speculative (because migration is single-threaded
1213. * and we cannot break the protocol's InfiniBand message ordering).
  1214. * Thus, if the memory is currently being used for transmission,
  1215. * then abort the attempt to unregister and try again
  1216. * later the next time a completion is received for this memory.
  1217. */
  1218. clear_bit(chunk, block->unregister_bitmap);
  1219. if (test_bit(chunk, block->transit_bitmap)) {
  1220. trace_qemu_rdma_unregister_waiting_inflight(chunk);
  1221. continue;
  1222. }
  1223. trace_qemu_rdma_unregister_waiting_send(chunk);
  1224. ret = ibv_dereg_mr(block->pmr[chunk]);
  1225. block->pmr[chunk] = NULL;
  1226. block->remote_keys[chunk] = 0;
  1227. if (ret != 0) {
  1228. error_report("unregistration chunk failed: %s",
  1229. strerror(ret));
  1230. return -1;
  1231. }
  1232. rdma->total_registrations--;
  1233. reg.key.chunk = chunk;
  1234. register_to_network(rdma, &reg);
  1235. ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
  1236. &resp, NULL, NULL, &err);
  1237. if (ret < 0) {
  1238. error_report_err(err);
  1239. return -1;
  1240. }
  1241. trace_qemu_rdma_unregister_waiting_complete(chunk);
  1242. }
  1243. return 0;
  1244. }
  1245. static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
  1246. uint64_t chunk)
  1247. {
  1248. uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
  1249. result |= (index << RDMA_WRID_BLOCK_SHIFT);
  1250. result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
  1251. return result;
  1252. }
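/*
 * Illustrative sketch (not compiled in): a wrid produced by
 * qemu_rdma_make_wrid() is decoded on completion with the same masks and
 * shifts used by qemu_rdma_poll() and qemu_rdma_unregister_waiting(),
 * assuming 'index' and 'chunk' fit within their respective bit fields.
 */
#if 0
static void example_wrid_round_trip(uint64_t index, uint64_t chunk)
{
    uint64_t wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE, index, chunk);

    /* Decode, as the completion path does */
    uint64_t type = wr_id & RDMA_WRID_TYPE_MASK;
    uint64_t decoded_chunk =
        (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
    uint64_t decoded_index =
        (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;

    assert(type == RDMA_WRID_RDMA_WRITE);
    assert(decoded_index == index);
    assert(decoded_chunk == chunk);
}
#endif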
  1253. /*
1254. * Consult the connection manager to see if a work request
1255. * (of any kind) has completed.
  1256. * Return the work request ID that completed.
  1257. */
  1258. static int qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
  1259. uint64_t *wr_id_out, uint32_t *byte_len)
  1260. {
  1261. int ret;
  1262. struct ibv_wc wc;
  1263. uint64_t wr_id;
  1264. ret = ibv_poll_cq(cq, 1, &wc);
  1265. if (!ret) {
  1266. *wr_id_out = RDMA_WRID_NONE;
  1267. return 0;
  1268. }
  1269. if (ret < 0) {
  1270. return -1;
  1271. }
  1272. wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
  1273. if (wc.status != IBV_WC_SUCCESS) {
  1274. return -1;
  1275. }
  1276. if (rdma->control_ready_expected &&
  1277. (wr_id >= RDMA_WRID_RECV_CONTROL)) {
  1278. trace_qemu_rdma_poll_recv(wr_id - RDMA_WRID_RECV_CONTROL, wr_id,
  1279. rdma->nb_sent);
  1280. rdma->control_ready_expected = 0;
  1281. }
  1282. if (wr_id == RDMA_WRID_RDMA_WRITE) {
  1283. uint64_t chunk =
  1284. (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
  1285. uint64_t index =
  1286. (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
  1287. RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
  1288. trace_qemu_rdma_poll_write(wr_id, rdma->nb_sent,
  1289. index, chunk, block->local_host_addr,
  1290. (void *)(uintptr_t)block->remote_host_addr);
  1291. clear_bit(chunk, block->transit_bitmap);
  1292. if (rdma->nb_sent > 0) {
  1293. rdma->nb_sent--;
  1294. }
  1295. } else {
  1296. trace_qemu_rdma_poll_other(wr_id, rdma->nb_sent);
  1297. }
  1298. *wr_id_out = wc.wr_id;
  1299. if (byte_len) {
  1300. *byte_len = wc.byte_len;
  1301. }
  1302. return 0;
  1303. }
  1304. /* Wait for activity on the completion channel.
1305. * Returns 0 on success, non-zero on error.
  1306. */
  1307. static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
  1308. struct ibv_comp_channel *comp_channel)
  1309. {
  1310. struct rdma_cm_event *cm_event;
  1311. /*
  1312. * Coroutine doesn't start until migration_fd_process_incoming()
  1313. * so don't yield unless we know we're running inside of a coroutine.
  1314. */
  1315. if (rdma->migration_started_on_destination &&
  1316. migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
  1317. yield_until_fd_readable(comp_channel->fd);
  1318. } else {
1319. /* This is the source side (we're in a separate thread), or the
1320. * destination prior to migration_fd_process_incoming(); after
1321. * postcopy the destination is also in a separate thread.
1322. * Either way we can't yield, so we have to poll the fd.
  1323. * But we need to be able to handle 'cancel' or an error
  1324. * without hanging forever.
  1325. */
  1326. while (!rdma->errored && !rdma->received_error) {
  1327. GPollFD pfds[2];
  1328. pfds[0].fd = comp_channel->fd;
  1329. pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
  1330. pfds[0].revents = 0;
  1331. pfds[1].fd = rdma->channel->fd;
  1332. pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
  1333. pfds[1].revents = 0;
  1334. /* 0.1s timeout, should be fine for a 'cancel' */
  1335. switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
  1336. case 2:
  1337. case 1: /* fd active */
  1338. if (pfds[0].revents) {
  1339. return 0;
  1340. }
  1341. if (pfds[1].revents) {
  1342. if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
  1343. return -1;
  1344. }
  1345. if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
  1346. cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
  1347. rdma_ack_cm_event(cm_event);
  1348. return -1;
  1349. }
  1350. rdma_ack_cm_event(cm_event);
  1351. }
  1352. break;
  1353. case 0: /* Timeout, go around again */
  1354. break;
  1355. default: /* Error of some type -
  1356. * I don't trust errno from qemu_poll_ns
  1357. */
  1358. return -1;
  1359. }
  1360. if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
  1361. /* Bail out and let the cancellation happen */
  1362. return -1;
  1363. }
  1364. }
  1365. }
  1366. if (rdma->received_error) {
  1367. return -1;
  1368. }
  1369. return -rdma->errored;
  1370. }
  1371. static struct ibv_comp_channel *to_channel(RDMAContext *rdma, uint64_t wrid)
  1372. {
  1373. return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
  1374. rdma->recv_comp_channel;
  1375. }
  1376. static struct ibv_cq *to_cq(RDMAContext *rdma, uint64_t wrid)
  1377. {
  1378. return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
  1379. }
  1380. /*
  1381. * Block until the next work request has completed.
  1382. *
  1383. * First poll to see if a work request has already completed,
  1384. * otherwise block.
  1385. *
  1386. * If we encounter completed work requests for IDs other than
  1387. * the one we're interested in, then that's generally an error.
  1388. *
  1389. * The only exception is actual RDMA Write completions. These
  1390. * completions only need to be recorded, but do not actually
  1391. * need further processing.
  1392. */
  1393. static int qemu_rdma_block_for_wrid(RDMAContext *rdma,
  1394. uint64_t wrid_requested,
  1395. uint32_t *byte_len)
  1396. {
  1397. int num_cq_events = 0, ret;
  1398. struct ibv_cq *cq;
  1399. void *cq_ctx;
  1400. uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
  1401. struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
  1402. struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);
  1403. if (ibv_req_notify_cq(poll_cq, 0)) {
  1404. return -1;
  1405. }
  1406. /* poll cq first */
  1407. while (wr_id != wrid_requested) {
  1408. ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
  1409. if (ret < 0) {
  1410. return -1;
  1411. }
  1412. wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
  1413. if (wr_id == RDMA_WRID_NONE) {
  1414. break;
  1415. }
  1416. if (wr_id != wrid_requested) {
  1417. trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id);
  1418. }
  1419. }
  1420. if (wr_id == wrid_requested) {
  1421. return 0;
  1422. }
  1423. while (1) {
  1424. ret = qemu_rdma_wait_comp_channel(rdma, ch);
  1425. if (ret < 0) {
  1426. goto err_block_for_wrid;
  1427. }
  1428. ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
  1429. if (ret < 0) {
  1430. goto err_block_for_wrid;
  1431. }
  1432. num_cq_events++;
  1433. if (ibv_req_notify_cq(cq, 0)) {
  1434. goto err_block_for_wrid;
  1435. }
  1436. while (wr_id != wrid_requested) {
  1437. ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
  1438. if (ret < 0) {
  1439. goto err_block_for_wrid;
  1440. }
  1441. wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
  1442. if (wr_id == RDMA_WRID_NONE) {
  1443. break;
  1444. }
  1445. if (wr_id != wrid_requested) {
  1446. trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id);
  1447. }
  1448. }
  1449. if (wr_id == wrid_requested) {
  1450. goto success_block_for_wrid;
  1451. }
  1452. }
  1453. success_block_for_wrid:
  1454. if (num_cq_events) {
  1455. ibv_ack_cq_events(cq, num_cq_events);
  1456. }
  1457. return 0;
  1458. err_block_for_wrid:
  1459. if (num_cq_events) {
  1460. ibv_ack_cq_events(cq, num_cq_events);
  1461. }
  1462. rdma->errored = true;
  1463. return -1;
  1464. }
  1465. /*
  1466. * Post a SEND message work request for the control channel
  1467. * containing some data and block until the post completes.
  1468. */
  1469. static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
  1470. RDMAControlHeader *head,
  1471. Error **errp)
  1472. {
  1473. int ret;
  1474. RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
  1475. struct ibv_send_wr *bad_wr;
  1476. struct ibv_sge sge = {
  1477. .addr = (uintptr_t)(wr->control),
  1478. .length = head->len + sizeof(RDMAControlHeader),
  1479. .lkey = wr->control_mr->lkey,
  1480. };
  1481. struct ibv_send_wr send_wr = {
  1482. .wr_id = RDMA_WRID_SEND_CONTROL,
  1483. .opcode = IBV_WR_SEND,
  1484. .send_flags = IBV_SEND_SIGNALED,
  1485. .sg_list = &sge,
  1486. .num_sge = 1,
  1487. };
  1488. trace_qemu_rdma_post_send_control(control_desc(head->type));
  1489. /*
  1490. * We don't actually need to do a memcpy() in here if we used
  1491. * the "sge" properly, but since we're only sending control messages
1492. * (not RAM in a performance-critical path), it's OK for now.
  1493. *
  1494. * The copy makes the RDMAControlHeader simpler to manipulate
  1495. * for the time being.
  1496. */
  1497. assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
  1498. memcpy(wr->control, head, sizeof(RDMAControlHeader));
  1499. control_to_network((void *) wr->control);
  1500. if (buf) {
  1501. memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
  1502. }
  1503. ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
  1504. if (ret > 0) {
  1505. error_setg(errp, "Failed to use post IB SEND for control");
  1506. return -1;
  1507. }
  1508. ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
  1509. if (ret < 0) {
  1510. error_setg(errp, "rdma migration: send polling control error");
  1511. return -1;
  1512. }
  1513. return 0;
  1514. }
  1515. /*
  1516. * Post a RECV work request in anticipation of some future receipt
  1517. * of data on the control channel.
  1518. */
  1519. static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx,
  1520. Error **errp)
  1521. {
  1522. struct ibv_recv_wr *bad_wr;
  1523. struct ibv_sge sge = {
  1524. .addr = (uintptr_t)(rdma->wr_data[idx].control),
  1525. .length = RDMA_CONTROL_MAX_BUFFER,
  1526. .lkey = rdma->wr_data[idx].control_mr->lkey,
  1527. };
  1528. struct ibv_recv_wr recv_wr = {
  1529. .wr_id = RDMA_WRID_RECV_CONTROL + idx,
  1530. .sg_list = &sge,
  1531. .num_sge = 1,
  1532. };
  1533. if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
  1534. error_setg(errp, "error posting control recv");
  1535. return -1;
  1536. }
  1537. return 0;
  1538. }
  1539. /*
  1540. * Block and wait for a RECV control channel message to arrive.
  1541. */
  1542. static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
  1543. RDMAControlHeader *head, uint32_t expecting, int idx,
  1544. Error **errp)
  1545. {
  1546. uint32_t byte_len;
  1547. int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
  1548. &byte_len);
  1549. if (ret < 0) {
  1550. error_setg(errp, "rdma migration: recv polling control error!");
  1551. return -1;
  1552. }
  1553. network_to_control((void *) rdma->wr_data[idx].control);
  1554. memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
  1555. trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
  1556. if (expecting == RDMA_CONTROL_NONE) {
  1557. trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
  1558. head->type);
  1559. } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
  1560. error_setg(errp, "Was expecting a %s (%d) control message"
  1561. ", but got: %s (%d), length: %d",
  1562. control_desc(expecting), expecting,
  1563. control_desc(head->type), head->type, head->len);
  1564. if (head->type == RDMA_CONTROL_ERROR) {
  1565. rdma->received_error = true;
  1566. }
  1567. return -1;
  1568. }
  1569. if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
  1570. error_setg(errp, "too long length: %d", head->len);
  1571. return -1;
  1572. }
  1573. if (sizeof(*head) + head->len != byte_len) {
  1574. error_setg(errp, "Malformed length: %d byte_len %d",
  1575. head->len, byte_len);
  1576. return -1;
  1577. }
  1578. return 0;
  1579. }
  1580. /*
  1581. * When a RECV work request has completed, the work request's
  1582. * buffer is pointed at the header.
  1583. *
1584. * This advances the pointer past the header to the data
1585. * portion of the control message that was populated when
1586. * the work request finished.
  1587. */
  1588. static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
  1589. RDMAControlHeader *head)
  1590. {
  1591. rdma->wr_data[idx].control_len = head->len;
  1592. rdma->wr_data[idx].control_curr =
  1593. rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
  1594. }
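/*
 * Illustrative layout of wr_data[idx].control after a RECV completes
 * (a sketch, not normative):
 *
 *   control              control + sizeof(RDMAControlHeader)
 *   |                    |
 *   [ RDMAControlHeader ][ head->len bytes of payload ........... ]
 *                         ^
 *                         control_curr after qemu_rdma_move_header()
 *
 * control_len is set to head->len; qemu_rdma_fill() later in this file
 * consumes the payload by advancing control_curr and decrementing
 * control_len.
 */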
  1595. /*
  1596. * This is an 'atomic' high-level operation to deliver a single, unified
  1597. * control-channel message.
  1598. *
  1599. * Additionally, if the user is expecting some kind of reply to this message,
  1600. * they can request a 'resp' response message be filled in by posting an
  1601. * additional work request on behalf of the user and waiting for an additional
  1602. * completion.
  1603. *
1604. * The extra (optional) response is used during registration to save us from
1605. * having to perform an *additional* exchange of messages just to provide a
1606. * response, by instead piggy-backing on the acknowledgement.
  1607. */
  1608. static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
  1609. uint8_t *data, RDMAControlHeader *resp,
  1610. int *resp_idx,
  1611. int (*callback)(RDMAContext *rdma,
  1612. Error **errp),
  1613. Error **errp)
  1614. {
  1615. int ret;
  1616. /*
1617. * Wait for a READY message from the destination before attempting
1618. * to deliver the next message.
  1619. */
  1620. if (rdma->control_ready_expected) {
  1621. RDMAControlHeader resp_ignored;
  1622. ret = qemu_rdma_exchange_get_response(rdma, &resp_ignored,
  1623. RDMA_CONTROL_READY,
  1624. RDMA_WRID_READY, errp);
  1625. if (ret < 0) {
  1626. return -1;
  1627. }
  1628. }
  1629. /*
  1630. * If the user is expecting a response, post a WR in anticipation of it.
  1631. */
  1632. if (resp) {
  1633. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA, errp);
  1634. if (ret < 0) {
  1635. return -1;
  1636. }
  1637. }
  1638. /*
  1639. * Post a WR to replace the one we just consumed for the READY message.
  1640. */
  1641. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
  1642. if (ret < 0) {
  1643. return -1;
  1644. }
  1645. /*
  1646. * Deliver the control message that was requested.
  1647. */
  1648. ret = qemu_rdma_post_send_control(rdma, data, head, errp);
  1649. if (ret < 0) {
  1650. return -1;
  1651. }
  1652. /*
  1653. * If we're expecting a response, block and wait for it.
  1654. */
  1655. if (resp) {
  1656. if (callback) {
  1657. trace_qemu_rdma_exchange_send_issue_callback();
  1658. ret = callback(rdma, errp);
  1659. if (ret < 0) {
  1660. return -1;
  1661. }
  1662. }
  1663. trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
  1664. ret = qemu_rdma_exchange_get_response(rdma, resp,
  1665. resp->type, RDMA_WRID_DATA,
  1666. errp);
  1667. if (ret < 0) {
  1668. return -1;
  1669. }
  1670. qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
  1671. if (resp_idx) {
  1672. *resp_idx = RDMA_WRID_DATA;
  1673. }
  1674. trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
  1675. }
  1676. rdma->control_ready_expected = 1;
  1677. return 0;
  1678. }
  1679. /*
  1680. * This is an 'atomic' high-level operation to receive a single, unified
  1681. * control-channel message.
  1682. */
  1683. static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
  1684. uint32_t expecting, Error **errp)
  1685. {
  1686. RDMAControlHeader ready = {
  1687. .len = 0,
  1688. .type = RDMA_CONTROL_READY,
  1689. .repeat = 1,
  1690. };
  1691. int ret;
  1692. /*
  1693. * Inform the source that we're ready to receive a message.
  1694. */
  1695. ret = qemu_rdma_post_send_control(rdma, NULL, &ready, errp);
  1696. if (ret < 0) {
  1697. return -1;
  1698. }
  1699. /*
  1700. * Block and wait for the message.
  1701. */
  1702. ret = qemu_rdma_exchange_get_response(rdma, head,
  1703. expecting, RDMA_WRID_READY, errp);
  1704. if (ret < 0) {
  1705. return -1;
  1706. }
  1707. qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
  1708. /*
  1709. * Post a new RECV work request to replace the one we just consumed.
  1710. */
  1711. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
  1712. if (ret < 0) {
  1713. return -1;
  1714. }
  1715. return 0;
  1716. }
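/*
 * Sketch of the control-channel handshake implemented by
 * qemu_rdma_exchange_send()/qemu_rdma_exchange_recv() above
 * (illustrative, not normative):
 *
 *   source (exchange_send)              destination (exchange_recv)
 *   ----------------------              ---------------------------
 *   wait for READY            <-------- SEND READY
 *   post replacement RECVs
 *   SEND <head, data>         --------> wait for <head, data>
 *   [wait for piggy-backed    <-------- [SEND response, e.g. a
 *    response on the DATA                RDMA_CONTROL_REGISTER_RESULT]
 *    slot, if one was
 *    requested]
 *
 * The optional response lets e.g. registration results ride on the
 * acknowledgement instead of costing another full exchange.
 */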
  1717. /*
  1718. * Write an actual chunk of memory using RDMA.
  1719. *
  1720. * If we're using dynamic registration on the dest-side, we have to
  1721. * send a registration command first.
  1722. */
  1723. static int qemu_rdma_write_one(RDMAContext *rdma,
  1724. int current_index, uint64_t current_addr,
  1725. uint64_t length, Error **errp)
  1726. {
  1727. struct ibv_sge sge;
  1728. struct ibv_send_wr send_wr = { 0 };
  1729. struct ibv_send_wr *bad_wr;
  1730. int reg_result_idx, ret, count = 0;
  1731. uint64_t chunk, chunks;
  1732. uint8_t *chunk_start, *chunk_end;
  1733. RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
  1734. RDMARegister reg;
  1735. RDMARegisterResult *reg_result;
  1736. RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
  1737. RDMAControlHeader head = { .len = sizeof(RDMARegister),
  1738. .type = RDMA_CONTROL_REGISTER_REQUEST,
  1739. .repeat = 1,
  1740. };
  1741. retry:
  1742. sge.addr = (uintptr_t)(block->local_host_addr +
  1743. (current_addr - block->offset));
  1744. sge.length = length;
  1745. chunk = ram_chunk_index(block->local_host_addr,
  1746. (uint8_t *)(uintptr_t)sge.addr);
  1747. chunk_start = ram_chunk_start(block, chunk);
  1748. if (block->is_ram_block) {
  1749. chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
  1750. if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
  1751. chunks--;
  1752. }
  1753. } else {
  1754. chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
  1755. if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
  1756. chunks--;
  1757. }
  1758. }
  1759. trace_qemu_rdma_write_one_top(chunks + 1,
  1760. (chunks + 1) *
  1761. (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
  1762. chunk_end = ram_chunk_end(block, chunk + chunks);
  1763. while (test_bit(chunk, block->transit_bitmap)) {
  1764. (void)count;
  1765. trace_qemu_rdma_write_one_block(count++, current_index, chunk,
  1766. sge.addr, length, rdma->nb_sent, block->nb_chunks);
  1767. ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
  1768. if (ret < 0) {
  1769. error_setg(errp, "Failed to Wait for previous write to complete "
  1770. "block %d chunk %" PRIu64
  1771. " current %" PRIu64 " len %" PRIu64 " %d",
  1772. current_index, chunk, sge.addr, length, rdma->nb_sent);
  1773. return -1;
  1774. }
  1775. }
  1776. if (!rdma->pin_all || !block->is_ram_block) {
  1777. if (!block->remote_keys[chunk]) {
  1778. /*
  1779. * This chunk has not yet been registered, so first check to see
1780. * if the entire chunk is zero. If so, tell the other side to
  1781. * memset() + madvise() the entire chunk without RDMA.
  1782. */
  1783. if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
  1784. RDMACompress comp = {
  1785. .offset = current_addr,
  1786. .value = 0,
  1787. .block_idx = current_index,
  1788. .length = length,
  1789. };
  1790. head.len = sizeof(comp);
  1791. head.type = RDMA_CONTROL_COMPRESS;
  1792. trace_qemu_rdma_write_one_zero(chunk, sge.length,
  1793. current_index, current_addr);
  1794. compress_to_network(rdma, &comp);
  1795. ret = qemu_rdma_exchange_send(rdma, &head,
  1796. (uint8_t *) &comp, NULL, NULL, NULL, errp);
  1797. if (ret < 0) {
  1798. return -1;
  1799. }
  1800. /*
  1801. * TODO: Here we are sending something, but we are not
  1802. * accounting for anything transferred. The following is wrong:
  1803. *
  1804. * stat64_add(&mig_stats.rdma_bytes, sge.length);
  1805. *
  1806. * because we are using some kind of compression. I
1807. * would think that head.len would be closer to a
1808. * correct value.
  1809. */
  1810. stat64_add(&mig_stats.zero_pages,
  1811. sge.length / qemu_target_page_size());
  1812. return 1;
  1813. }
  1814. /*
1815. * Otherwise, tell the other side to register.
  1816. */
  1817. reg.current_index = current_index;
  1818. if (block->is_ram_block) {
  1819. reg.key.current_addr = current_addr;
  1820. } else {
  1821. reg.key.chunk = chunk;
  1822. }
  1823. reg.chunks = chunks;
  1824. trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
  1825. current_addr);
  1826. register_to_network(rdma, &reg);
  1827. ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
  1828. &resp, &reg_result_idx, NULL, errp);
  1829. if (ret < 0) {
  1830. return -1;
  1831. }
  1832. /* try to overlap this single registration with the one we sent. */
  1833. if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
  1834. &sge.lkey, NULL, chunk,
  1835. chunk_start, chunk_end)) {
  1836. error_setg(errp, "cannot get lkey");
  1837. return -1;
  1838. }
  1839. reg_result = (RDMARegisterResult *)
  1840. rdma->wr_data[reg_result_idx].control_curr;
  1841. network_to_result(reg_result);
  1842. trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
  1843. reg_result->rkey, chunk);
  1844. block->remote_keys[chunk] = reg_result->rkey;
  1845. block->remote_host_addr = reg_result->host_addr;
  1846. } else {
  1847. /* already registered before */
  1848. if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
  1849. &sge.lkey, NULL, chunk,
  1850. chunk_start, chunk_end)) {
  1851. error_setg(errp, "cannot get lkey!");
  1852. return -1;
  1853. }
  1854. }
  1855. send_wr.wr.rdma.rkey = block->remote_keys[chunk];
  1856. } else {
  1857. send_wr.wr.rdma.rkey = block->remote_rkey;
  1858. if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
  1859. &sge.lkey, NULL, chunk,
  1860. chunk_start, chunk_end)) {
  1861. error_setg(errp, "cannot get lkey!");
  1862. return -1;
  1863. }
  1864. }
  1865. /*
  1866. * Encode the ram block index and chunk within this wrid.
  1867. * We will use this information at the time of completion
  1868. * to figure out which bitmap to check against and then which
  1869. * chunk in the bitmap to look for.
  1870. */
  1871. send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
  1872. current_index, chunk);
  1873. send_wr.opcode = IBV_WR_RDMA_WRITE;
  1874. send_wr.send_flags = IBV_SEND_SIGNALED;
  1875. send_wr.sg_list = &sge;
  1876. send_wr.num_sge = 1;
  1877. send_wr.wr.rdma.remote_addr = block->remote_host_addr +
  1878. (current_addr - block->offset);
  1879. trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
  1880. sge.length);
  1881. /*
  1882. * ibv_post_send() does not return negative error numbers,
  1883. * per the specification they are positive - no idea why.
  1884. */
  1885. ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
  1886. if (ret == ENOMEM) {
  1887. trace_qemu_rdma_write_one_queue_full();
  1888. ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
  1889. if (ret < 0) {
  1890. error_setg(errp, "rdma migration: failed to make "
  1891. "room in full send queue!");
  1892. return -1;
  1893. }
  1894. goto retry;
  1895. } else if (ret > 0) {
  1896. error_setg_errno(errp, ret,
  1897. "rdma migration: post rdma write failed");
  1898. return -1;
  1899. }
  1900. set_bit(chunk, block->transit_bitmap);
  1901. stat64_add(&mig_stats.normal_pages, sge.length / qemu_target_page_size());
  1902. /*
1903. * We are adding the amount of data written to the transferred
1904. * total, but no overhead at all. I will assume that RDMA is
1905. * magical and doesn't need to transfer (at least) the addresses
1906. * where it wants to write the pages. Here it looks like it should be something
  1907. * like:
  1908. * sizeof(send_wr) + sge.length
  1909. * but this being RDMA, who knows.
  1910. */
  1911. stat64_add(&mig_stats.rdma_bytes, sge.length);
  1912. ram_transferred_add(sge.length);
  1913. rdma->total_writes++;
  1914. return 0;
  1915. }
  1916. /*
  1917. * Push out any unwritten RDMA operations.
  1918. *
  1919. * We support sending out multiple chunks at the same time.
  1920. * Not all of them need to get signaled in the completion queue.
  1921. */
  1922. static int qemu_rdma_write_flush(RDMAContext *rdma, Error **errp)
  1923. {
  1924. int ret;
  1925. if (!rdma->current_length) {
  1926. return 0;
  1927. }
  1928. ret = qemu_rdma_write_one(rdma, rdma->current_index, rdma->current_addr,
  1929. rdma->current_length, errp);
  1930. if (ret < 0) {
  1931. return -1;
  1932. }
  1933. if (ret == 0) {
  1934. rdma->nb_sent++;
  1935. trace_qemu_rdma_write_flush(rdma->nb_sent);
  1936. }
  1937. rdma->current_length = 0;
  1938. rdma->current_addr = 0;
  1939. return 0;
  1940. }
  1941. static inline bool qemu_rdma_buffer_mergeable(RDMAContext *rdma,
  1942. uint64_t offset, uint64_t len)
  1943. {
  1944. RDMALocalBlock *block;
  1945. uint8_t *host_addr;
  1946. uint8_t *chunk_end;
  1947. if (rdma->current_index < 0) {
  1948. return false;
  1949. }
  1950. if (rdma->current_chunk < 0) {
  1951. return false;
  1952. }
  1953. block = &(rdma->local_ram_blocks.block[rdma->current_index]);
  1954. host_addr = block->local_host_addr + (offset - block->offset);
  1955. chunk_end = ram_chunk_end(block, rdma->current_chunk);
  1956. if (rdma->current_length == 0) {
  1957. return false;
  1958. }
  1959. /*
  1960. * Only merge into chunk sequentially.
  1961. */
  1962. if (offset != (rdma->current_addr + rdma->current_length)) {
  1963. return false;
  1964. }
  1965. if (offset < block->offset) {
  1966. return false;
  1967. }
  1968. if ((offset + len) > (block->offset + block->length)) {
  1969. return false;
  1970. }
  1971. if ((host_addr + len) > chunk_end) {
  1972. return false;
  1973. }
  1974. return true;
  1975. }
  1976. /*
  1977. * We're not actually writing here, but doing three things:
  1978. *
  1979. * 1. Identify the chunk the buffer belongs to.
  1980. * 2. If the chunk is full or the buffer doesn't belong to the current
  1981. * chunk, then start a new chunk and flush() the old chunk.
  1982. * 3. To keep the hardware busy, we also group chunks into batches
  1983. * and only require that a batch gets acknowledged in the completion
  1984. * queue instead of each individual chunk.
  1985. */
  1986. static int qemu_rdma_write(RDMAContext *rdma,
  1987. uint64_t block_offset, uint64_t offset,
  1988. uint64_t len, Error **errp)
  1989. {
  1990. uint64_t current_addr = block_offset + offset;
  1991. uint64_t index = rdma->current_index;
  1992. uint64_t chunk = rdma->current_chunk;
  1993. /* If we cannot merge it, we flush the current buffer first. */
  1994. if (!qemu_rdma_buffer_mergeable(rdma, current_addr, len)) {
  1995. if (qemu_rdma_write_flush(rdma, errp) < 0) {
  1996. return -1;
  1997. }
  1998. rdma->current_length = 0;
  1999. rdma->current_addr = current_addr;
  2000. qemu_rdma_search_ram_block(rdma, block_offset,
  2001. offset, len, &index, &chunk);
  2002. rdma->current_index = index;
  2003. rdma->current_chunk = chunk;
  2004. }
  2005. /* merge it */
  2006. rdma->current_length += len;
  2007. /* flush it if buffer is too large */
  2008. if (rdma->current_length >= RDMA_MERGE_MAX) {
  2009. return qemu_rdma_write_flush(rdma, errp);
  2010. }
  2011. return 0;
  2012. }
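/*
 * Illustrative sketch (not compiled in) of the merge behaviour of
 * qemu_rdma_write() above, assuming page-sized writes that stay inside a
 * single chunk of one RAM block; the function name in the sketch is made
 * up for illustration.
 */
#if 0
static void example_rdma_write_merging(RDMAContext *rdma,
                                        uint64_t block_offset, Error **errp)
{
    size_t page = qemu_target_page_size();

    /* First page starts a new "current" merge buffer */
    qemu_rdma_write(rdma, block_offset, 0 * page, page, errp);
    /* Sequentially adjacent page: merged, no RDMA write posted yet */
    qemu_rdma_write(rdma, block_offset, 1 * page, page, errp);
    /*
     * Non-adjacent page: not mergeable, so the pending buffer is flushed
     * via qemu_rdma_write_flush() and a new merge buffer is started.
     */
    qemu_rdma_write(rdma, block_offset, 8 * page, page, errp);
}
#endif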
  2013. static void qemu_rdma_cleanup(RDMAContext *rdma)
  2014. {
  2015. Error *err = NULL;
  2016. if (rdma->cm_id && rdma->connected) {
  2017. if ((rdma->errored ||
  2018. migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
  2019. !rdma->received_error) {
  2020. RDMAControlHeader head = { .len = 0,
  2021. .type = RDMA_CONTROL_ERROR,
  2022. .repeat = 1,
  2023. };
  2024. warn_report("Early error. Sending error.");
  2025. if (qemu_rdma_post_send_control(rdma, NULL, &head, &err) < 0) {
  2026. warn_report_err(err);
  2027. }
  2028. }
  2029. rdma_disconnect(rdma->cm_id);
  2030. trace_qemu_rdma_cleanup_disconnect();
  2031. rdma->connected = false;
  2032. }
  2033. if (rdma->channel) {
  2034. qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
  2035. }
  2036. g_free(rdma->dest_blocks);
  2037. rdma->dest_blocks = NULL;
  2038. for (int i = 0; i < RDMA_WRID_MAX; i++) {
  2039. if (rdma->wr_data[i].control_mr) {
  2040. rdma->total_registrations--;
  2041. ibv_dereg_mr(rdma->wr_data[i].control_mr);
  2042. }
  2043. rdma->wr_data[i].control_mr = NULL;
  2044. }
  2045. if (rdma->local_ram_blocks.block) {
  2046. while (rdma->local_ram_blocks.nb_blocks) {
  2047. rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
  2048. }
  2049. }
  2050. if (rdma->qp) {
  2051. rdma_destroy_qp(rdma->cm_id);
  2052. rdma->qp = NULL;
  2053. }
  2054. if (rdma->recv_cq) {
  2055. ibv_destroy_cq(rdma->recv_cq);
  2056. rdma->recv_cq = NULL;
  2057. }
  2058. if (rdma->send_cq) {
  2059. ibv_destroy_cq(rdma->send_cq);
  2060. rdma->send_cq = NULL;
  2061. }
  2062. if (rdma->recv_comp_channel) {
  2063. ibv_destroy_comp_channel(rdma->recv_comp_channel);
  2064. rdma->recv_comp_channel = NULL;
  2065. }
  2066. if (rdma->send_comp_channel) {
  2067. ibv_destroy_comp_channel(rdma->send_comp_channel);
  2068. rdma->send_comp_channel = NULL;
  2069. }
  2070. if (rdma->pd) {
  2071. ibv_dealloc_pd(rdma->pd);
  2072. rdma->pd = NULL;
  2073. }
  2074. if (rdma->cm_id) {
  2075. rdma_destroy_id(rdma->cm_id);
  2076. rdma->cm_id = NULL;
  2077. }
2078. /* On the destination side, listen_id and channel are shared */
  2079. if (rdma->listen_id) {
  2080. if (!rdma->is_return_path) {
  2081. rdma_destroy_id(rdma->listen_id);
  2082. }
  2083. rdma->listen_id = NULL;
  2084. if (rdma->channel) {
  2085. if (!rdma->is_return_path) {
  2086. rdma_destroy_event_channel(rdma->channel);
  2087. }
  2088. rdma->channel = NULL;
  2089. }
  2090. }
  2091. if (rdma->channel) {
  2092. rdma_destroy_event_channel(rdma->channel);
  2093. rdma->channel = NULL;
  2094. }
  2095. g_free(rdma->host);
  2096. rdma->host = NULL;
  2097. }
  2098. static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
  2099. {
  2100. int ret;
  2101. /*
  2102. * Will be validated against destination's actual capabilities
  2103. * after the connect() completes.
  2104. */
  2105. rdma->pin_all = pin_all;
  2106. ret = qemu_rdma_resolve_host(rdma, errp);
  2107. if (ret < 0) {
  2108. goto err_rdma_source_init;
  2109. }
  2110. ret = qemu_rdma_alloc_pd_cq(rdma, errp);
  2111. if (ret < 0) {
  2112. goto err_rdma_source_init;
  2113. }
  2114. ret = qemu_rdma_alloc_qp(rdma);
  2115. if (ret < 0) {
  2116. error_setg(errp, "RDMA ERROR: rdma migration: error allocating qp!");
  2117. goto err_rdma_source_init;
  2118. }
  2119. qemu_rdma_init_ram_blocks(rdma);
  2120. /* Build the hash that maps from offset to RAMBlock */
  2121. rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
  2122. for (int i = 0; i < rdma->local_ram_blocks.nb_blocks; i++) {
  2123. g_hash_table_insert(rdma->blockmap,
  2124. (void *)(uintptr_t)rdma->local_ram_blocks.block[i].offset,
  2125. &rdma->local_ram_blocks.block[i]);
  2126. }
  2127. for (int i = 0; i < RDMA_WRID_MAX; i++) {
  2128. ret = qemu_rdma_reg_control(rdma, i);
  2129. if (ret < 0) {
  2130. error_setg(errp, "RDMA ERROR: rdma migration: error "
  2131. "registering %d control!", i);
  2132. goto err_rdma_source_init;
  2133. }
  2134. }
  2135. return 0;
  2136. err_rdma_source_init:
  2137. qemu_rdma_cleanup(rdma);
  2138. return -1;
  2139. }
  2140. static int qemu_get_cm_event_timeout(RDMAContext *rdma,
  2141. struct rdma_cm_event **cm_event,
  2142. long msec, Error **errp)
  2143. {
  2144. int ret;
  2145. struct pollfd poll_fd = {
  2146. .fd = rdma->channel->fd,
  2147. .events = POLLIN,
  2148. .revents = 0
  2149. };
  2150. do {
  2151. ret = poll(&poll_fd, 1, msec);
  2152. } while (ret < 0 && errno == EINTR);
  2153. if (ret == 0) {
  2154. error_setg(errp, "RDMA ERROR: poll cm event timeout");
  2155. return -1;
  2156. } else if (ret < 0) {
  2157. error_setg(errp, "RDMA ERROR: failed to poll cm event, errno=%i",
  2158. errno);
  2159. return -1;
  2160. } else if (poll_fd.revents & POLLIN) {
  2161. if (rdma_get_cm_event(rdma->channel, cm_event) < 0) {
  2162. error_setg(errp, "RDMA ERROR: failed to get cm event");
  2163. return -1;
  2164. }
  2165. return 0;
  2166. } else {
  2167. error_setg(errp, "RDMA ERROR: no POLLIN event, revent=%x",
  2168. poll_fd.revents);
  2169. return -1;
  2170. }
  2171. }
  2172. static int qemu_rdma_connect(RDMAContext *rdma, bool return_path,
  2173. Error **errp)
  2174. {
  2175. RDMACapabilities cap = {
  2176. .version = RDMA_CONTROL_VERSION_CURRENT,
  2177. .flags = 0,
  2178. };
  2179. struct rdma_conn_param conn_param = { .initiator_depth = 2,
  2180. .retry_count = 5,
  2181. .private_data = &cap,
  2182. .private_data_len = sizeof(cap),
  2183. };
  2184. struct rdma_cm_event *cm_event;
  2185. int ret;
  2186. /*
2187. * Only negotiate the capability with the destination if the user
  2188. * on the source first requested the capability.
  2189. */
  2190. if (rdma->pin_all) {
  2191. trace_qemu_rdma_connect_pin_all_requested();
  2192. cap.flags |= RDMA_CAPABILITY_PIN_ALL;
  2193. }
  2194. caps_to_network(&cap);
  2195. ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
  2196. if (ret < 0) {
  2197. goto err_rdma_source_connect;
  2198. }
  2199. ret = rdma_connect(rdma->cm_id, &conn_param);
  2200. if (ret < 0) {
  2201. error_setg_errno(errp, errno,
  2202. "RDMA ERROR: connecting to destination!");
  2203. goto err_rdma_source_connect;
  2204. }
  2205. if (return_path) {
  2206. ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp);
  2207. } else {
  2208. ret = rdma_get_cm_event(rdma->channel, &cm_event);
  2209. if (ret < 0) {
  2210. error_setg_errno(errp, errno,
  2211. "RDMA ERROR: failed to get cm event");
  2212. }
  2213. }
  2214. if (ret < 0) {
  2215. goto err_rdma_source_connect;
  2216. }
  2217. if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
  2218. error_setg(errp, "RDMA ERROR: connecting to destination!");
  2219. rdma_ack_cm_event(cm_event);
  2220. goto err_rdma_source_connect;
  2221. }
  2222. rdma->connected = true;
  2223. memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
  2224. network_to_caps(&cap);
  2225. /*
  2226. * Verify that the *requested* capabilities are supported by the destination
  2227. * and disable them otherwise.
  2228. */
  2229. if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
  2230. warn_report("RDMA: Server cannot support pinning all memory. "
  2231. "Will register memory dynamically.");
  2232. rdma->pin_all = false;
  2233. }
  2234. trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
  2235. rdma_ack_cm_event(cm_event);
  2236. rdma->control_ready_expected = 1;
  2237. rdma->nb_sent = 0;
  2238. return 0;
  2239. err_rdma_source_connect:
  2240. qemu_rdma_cleanup(rdma);
  2241. return -1;
  2242. }
  2243. static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
  2244. {
  2245. Error *err = NULL;
  2246. int ret;
  2247. struct rdma_cm_id *listen_id;
  2248. char ip[40] = "unknown";
  2249. struct rdma_addrinfo *res, *e;
  2250. char port_str[16];
  2251. int reuse = 1;
  2252. for (int i = 0; i < RDMA_WRID_MAX; i++) {
  2253. rdma->wr_data[i].control_len = 0;
  2254. rdma->wr_data[i].control_curr = NULL;
  2255. }
  2256. if (!rdma->host || !rdma->host[0]) {
  2257. error_setg(errp, "RDMA ERROR: RDMA host is not set!");
  2258. rdma->errored = true;
  2259. return -1;
  2260. }
  2261. /* create CM channel */
  2262. rdma->channel = rdma_create_event_channel();
  2263. if (!rdma->channel) {
  2264. error_setg(errp, "RDMA ERROR: could not create rdma event channel");
  2265. rdma->errored = true;
  2266. return -1;
  2267. }
  2268. /* create CM id */
  2269. ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
  2270. if (ret < 0) {
  2271. error_setg(errp, "RDMA ERROR: could not create cm_id!");
  2272. goto err_dest_init_create_listen_id;
  2273. }
  2274. snprintf(port_str, 16, "%d", rdma->port);
  2275. port_str[15] = '\0';
  2276. ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
  2277. if (ret) {
  2278. error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s",
  2279. rdma->host);
  2280. goto err_dest_init_bind_addr;
  2281. }
  2282. ret = rdma_set_option(listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR,
  2283. &reuse, sizeof reuse);
  2284. if (ret < 0) {
  2285. error_setg(errp, "RDMA ERROR: Error: could not set REUSEADDR option");
  2286. goto err_dest_init_bind_addr;
  2287. }
  2288. /* Try all addresses, saving the first error in @err */
  2289. for (e = res; e != NULL; e = e->ai_next) {
  2290. Error **local_errp = err ? NULL : &err;
  2291. inet_ntop(e->ai_family,
  2292. &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
  2293. trace_qemu_rdma_dest_init_trying(rdma->host, ip);
  2294. ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
  2295. if (ret < 0) {
  2296. continue;
  2297. }
  2298. if (e->ai_family == AF_INET6) {
  2299. ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs,
  2300. local_errp);
  2301. if (ret < 0) {
  2302. continue;
  2303. }
  2304. }
  2305. error_free(err);
  2306. break;
  2307. }
  2308. rdma_freeaddrinfo(res);
  2309. if (!e) {
  2310. if (err) {
  2311. error_propagate(errp, err);
  2312. } else {
  2313. error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!");
  2314. }
  2315. goto err_dest_init_bind_addr;
  2316. }
  2317. rdma->listen_id = listen_id;
  2318. qemu_rdma_dump_gid("dest_init", listen_id);
  2319. return 0;
  2320. err_dest_init_bind_addr:
  2321. rdma_destroy_id(listen_id);
  2322. err_dest_init_create_listen_id:
  2323. rdma_destroy_event_channel(rdma->channel);
  2324. rdma->channel = NULL;
  2325. rdma->errored = true;
  2326. return -1;
  2327. }
  2328. static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
  2329. RDMAContext *rdma)
  2330. {
  2331. for (int i = 0; i < RDMA_WRID_MAX; i++) {
  2332. rdma_return_path->wr_data[i].control_len = 0;
  2333. rdma_return_path->wr_data[i].control_curr = NULL;
  2334. }
2335. /* the CM channel and CM id are shared */
  2336. rdma_return_path->channel = rdma->channel;
  2337. rdma_return_path->listen_id = rdma->listen_id;
  2338. rdma->return_path = rdma_return_path;
  2339. rdma_return_path->return_path = rdma;
  2340. rdma_return_path->is_return_path = true;
  2341. }
  2342. static RDMAContext *qemu_rdma_data_init(InetSocketAddress *saddr, Error **errp)
  2343. {
  2344. RDMAContext *rdma = NULL;
  2345. rdma = g_new0(RDMAContext, 1);
  2346. rdma->current_index = -1;
  2347. rdma->current_chunk = -1;
  2348. rdma->host = g_strdup(saddr->host);
  2349. rdma->port = atoi(saddr->port);
  2350. return rdma;
  2351. }
  2352. /*
  2353. * QEMUFile interface to the control channel.
  2354. * SEND messages for control only.
  2355. * VM's ram is handled with regular RDMA messages.
  2356. */
  2357. static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
  2358. const struct iovec *iov,
  2359. size_t niov,
  2360. int *fds,
  2361. size_t nfds,
  2362. int flags,
  2363. Error **errp)
  2364. {
  2365. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2366. RDMAContext *rdma;
  2367. int ret;
  2368. ssize_t done = 0;
  2369. size_t len;
  2370. RCU_READ_LOCK_GUARD();
  2371. rdma = qatomic_rcu_read(&rioc->rdmaout);
  2372. if (!rdma) {
  2373. error_setg(errp, "RDMA control channel output is not set");
  2374. return -1;
  2375. }
  2376. if (rdma->errored) {
  2377. error_setg(errp,
  2378. "RDMA is in an error state waiting migration to abort!");
  2379. return -1;
  2380. }
  2381. /*
  2382. * Push out any writes that
2383. * we've queued up for the VM's RAM.
  2384. */
  2385. ret = qemu_rdma_write_flush(rdma, errp);
  2386. if (ret < 0) {
  2387. rdma->errored = true;
  2388. return -1;
  2389. }
  2390. for (int i = 0; i < niov; i++) {
  2391. size_t remaining = iov[i].iov_len;
  2392. uint8_t * data = (void *)iov[i].iov_base;
  2393. while (remaining) {
  2394. RDMAControlHeader head = {};
  2395. len = MIN(remaining, RDMA_SEND_INCREMENT);
  2396. remaining -= len;
  2397. head.len = len;
  2398. head.type = RDMA_CONTROL_QEMU_FILE;
  2399. ret = qemu_rdma_exchange_send(rdma, &head,
  2400. data, NULL, NULL, NULL, errp);
  2401. if (ret < 0) {
  2402. rdma->errored = true;
  2403. return -1;
  2404. }
  2405. data += len;
  2406. done += len;
  2407. }
  2408. }
  2409. return done;
  2410. }
  2411. static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
  2412. size_t size, int idx)
  2413. {
  2414. size_t len = 0;
  2415. if (rdma->wr_data[idx].control_len) {
  2416. trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
  2417. len = MIN(size, rdma->wr_data[idx].control_len);
  2418. memcpy(buf, rdma->wr_data[idx].control_curr, len);
  2419. rdma->wr_data[idx].control_curr += len;
  2420. rdma->wr_data[idx].control_len -= len;
  2421. }
  2422. return len;
  2423. }
  2424. /*
  2425. * QEMUFile interface to the control channel.
  2426. * RDMA links don't use bytestreams, so we have to
  2427. * return bytes to QEMUFile opportunistically.
  2428. */
  2429. static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
  2430. const struct iovec *iov,
  2431. size_t niov,
  2432. int **fds,
  2433. size_t *nfds,
  2434. int flags,
  2435. Error **errp)
  2436. {
  2437. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2438. RDMAContext *rdma;
  2439. RDMAControlHeader head;
  2440. int ret;
  2441. ssize_t done = 0;
  2442. size_t len;
  2443. RCU_READ_LOCK_GUARD();
  2444. rdma = qatomic_rcu_read(&rioc->rdmain);
  2445. if (!rdma) {
  2446. error_setg(errp, "RDMA control channel input is not set");
  2447. return -1;
  2448. }
  2449. if (rdma->errored) {
  2450. error_setg(errp,
  2451. "RDMA is in an error state waiting migration to abort!");
  2452. return -1;
  2453. }
  2454. for (int i = 0; i < niov; i++) {
  2455. size_t want = iov[i].iov_len;
  2456. uint8_t *data = (void *)iov[i].iov_base;
  2457. /*
  2458. * First, we hold on to the last SEND message we
  2459. * were given and dish out the bytes until we run
  2460. * out of bytes.
  2461. */
  2462. len = qemu_rdma_fill(rdma, data, want, 0);
  2463. done += len;
  2464. want -= len;
  2465. /* Got what we needed, so go to next iovec */
  2466. if (want == 0) {
  2467. continue;
  2468. }
  2469. /* If we got any data so far, then don't wait
  2470. * for more, just return what we have */
  2471. if (done > 0) {
  2472. break;
  2473. }
2474. /* We've got nothing at all, so let's wait for
  2475. * more to arrive
  2476. */
  2477. ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE,
  2478. errp);
  2479. if (ret < 0) {
  2480. rdma->errored = true;
  2481. return -1;
  2482. }
  2483. /*
  2484. * SEND was received with new bytes, now try again.
  2485. */
  2486. len = qemu_rdma_fill(rdma, data, want, 0);
  2487. done += len;
  2488. want -= len;
2489. /* Still didn't get enough, so let's just return */
  2490. if (want) {
  2491. if (done == 0) {
  2492. return QIO_CHANNEL_ERR_BLOCK;
  2493. } else {
  2494. break;
  2495. }
  2496. }
  2497. }
  2498. return done;
  2499. }
  2500. /*
  2501. * Block until all the outstanding chunks have been delivered by the hardware.
  2502. */
  2503. static int qemu_rdma_drain_cq(RDMAContext *rdma)
  2504. {
  2505. Error *err = NULL;
  2506. if (qemu_rdma_write_flush(rdma, &err) < 0) {
  2507. error_report_err(err);
  2508. return -1;
  2509. }
  2510. while (rdma->nb_sent) {
  2511. if (qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL) < 0) {
  2512. error_report("rdma migration: complete polling error!");
  2513. return -1;
  2514. }
  2515. }
  2516. qemu_rdma_unregister_waiting(rdma);
  2517. return 0;
  2518. }
  2519. static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
  2520. bool blocking,
  2521. Error **errp)
  2522. {
  2523. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2524. /* XXX we should make readv/writev actually honour this :-) */
  2525. rioc->blocking = blocking;
  2526. return 0;
  2527. }
  2528. typedef struct QIOChannelRDMASource QIOChannelRDMASource;
  2529. struct QIOChannelRDMASource {
  2530. GSource parent;
  2531. QIOChannelRDMA *rioc;
  2532. GIOCondition condition;
  2533. };
  2534. static gboolean
  2535. qio_channel_rdma_source_prepare(GSource *source,
  2536. gint *timeout)
  2537. {
  2538. QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
  2539. RDMAContext *rdma;
  2540. GIOCondition cond = 0;
  2541. *timeout = -1;
  2542. RCU_READ_LOCK_GUARD();
  2543. if (rsource->condition == G_IO_IN) {
  2544. rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
  2545. } else {
  2546. rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
  2547. }
  2548. if (!rdma) {
  2549. error_report("RDMAContext is NULL when prepare Gsource");
  2550. return FALSE;
  2551. }
  2552. if (rdma->wr_data[0].control_len) {
  2553. cond |= G_IO_IN;
  2554. }
  2555. cond |= G_IO_OUT;
  2556. return cond & rsource->condition;
  2557. }
  2558. static gboolean
  2559. qio_channel_rdma_source_check(GSource *source)
  2560. {
  2561. QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
  2562. RDMAContext *rdma;
  2563. GIOCondition cond = 0;
  2564. RCU_READ_LOCK_GUARD();
  2565. if (rsource->condition == G_IO_IN) {
  2566. rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
  2567. } else {
  2568. rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
  2569. }
  2570. if (!rdma) {
  2571. error_report("RDMAContext is NULL when check Gsource");
  2572. return FALSE;
  2573. }
  2574. if (rdma->wr_data[0].control_len) {
  2575. cond |= G_IO_IN;
  2576. }
  2577. cond |= G_IO_OUT;
  2578. return cond & rsource->condition;
  2579. }
  2580. static gboolean
  2581. qio_channel_rdma_source_dispatch(GSource *source,
  2582. GSourceFunc callback,
  2583. gpointer user_data)
  2584. {
  2585. QIOChannelFunc func = (QIOChannelFunc)callback;
  2586. QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
  2587. RDMAContext *rdma;
  2588. GIOCondition cond = 0;
  2589. RCU_READ_LOCK_GUARD();
  2590. if (rsource->condition == G_IO_IN) {
  2591. rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
  2592. } else {
  2593. rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
  2594. }
  2595. if (!rdma) {
  2596. error_report("RDMAContext is NULL when dispatch Gsource");
  2597. return FALSE;
  2598. }
  2599. if (rdma->wr_data[0].control_len) {
  2600. cond |= G_IO_IN;
  2601. }
  2602. cond |= G_IO_OUT;
  2603. return (*func)(QIO_CHANNEL(rsource->rioc),
  2604. (cond & rsource->condition),
  2605. user_data);
  2606. }
  2607. static void
  2608. qio_channel_rdma_source_finalize(GSource *source)
  2609. {
  2610. QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
  2611. object_unref(OBJECT(ssource->rioc));
  2612. }
  2613. static GSourceFuncs qio_channel_rdma_source_funcs = {
  2614. qio_channel_rdma_source_prepare,
  2615. qio_channel_rdma_source_check,
  2616. qio_channel_rdma_source_dispatch,
  2617. qio_channel_rdma_source_finalize
  2618. };
  2619. static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
  2620. GIOCondition condition)
  2621. {
  2622. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2623. QIOChannelRDMASource *ssource;
  2624. GSource *source;
  2625. source = g_source_new(&qio_channel_rdma_source_funcs,
  2626. sizeof(QIOChannelRDMASource));
  2627. ssource = (QIOChannelRDMASource *)source;
  2628. ssource->rioc = rioc;
  2629. object_ref(OBJECT(rioc));
  2630. ssource->condition = condition;
  2631. return source;
  2632. }
  2633. static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
  2634. AioContext *read_ctx,
  2635. IOHandler *io_read,
  2636. AioContext *write_ctx,
  2637. IOHandler *io_write,
  2638. void *opaque)
  2639. {
  2640. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2641. if (io_read) {
  2642. aio_set_fd_handler(read_ctx, rioc->rdmain->recv_comp_channel->fd,
  2643. io_read, io_write, NULL, NULL, opaque);
  2644. aio_set_fd_handler(read_ctx, rioc->rdmain->send_comp_channel->fd,
  2645. io_read, io_write, NULL, NULL, opaque);
  2646. } else {
  2647. aio_set_fd_handler(write_ctx, rioc->rdmaout->recv_comp_channel->fd,
  2648. io_read, io_write, NULL, NULL, opaque);
  2649. aio_set_fd_handler(write_ctx, rioc->rdmaout->send_comp_channel->fd,
  2650. io_read, io_write, NULL, NULL, opaque);
  2651. }
  2652. }
  2653. struct rdma_close_rcu {
  2654. struct rcu_head rcu;
  2655. RDMAContext *rdmain;
  2656. RDMAContext *rdmaout;
  2657. };
  2658. /* callback from qio_channel_rdma_close via call_rcu */
  2659. static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
  2660. {
  2661. if (rcu->rdmain) {
  2662. qemu_rdma_cleanup(rcu->rdmain);
  2663. }
  2664. if (rcu->rdmaout) {
  2665. qemu_rdma_cleanup(rcu->rdmaout);
  2666. }
  2667. g_free(rcu->rdmain);
  2668. g_free(rcu->rdmaout);
  2669. g_free(rcu);
  2670. }
  2671. static int qio_channel_rdma_close(QIOChannel *ioc,
  2672. Error **errp)
  2673. {
  2674. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2675. RDMAContext *rdmain, *rdmaout;
  2676. struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
  2677. trace_qemu_rdma_close();
  2678. rdmain = rioc->rdmain;
  2679. if (rdmain) {
  2680. qatomic_rcu_set(&rioc->rdmain, NULL);
  2681. }
  2682. rdmaout = rioc->rdmaout;
  2683. if (rdmaout) {
  2684. qatomic_rcu_set(&rioc->rdmaout, NULL);
  2685. }
  2686. rcu->rdmain = rdmain;
  2687. rcu->rdmaout = rdmaout;
  2688. call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
  2689. return 0;
  2690. }
  2691. static int
  2692. qio_channel_rdma_shutdown(QIOChannel *ioc,
  2693. QIOChannelShutdown how,
  2694. Error **errp)
  2695. {
  2696. QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
  2697. RDMAContext *rdmain, *rdmaout;
  2698. RCU_READ_LOCK_GUARD();
  2699. rdmain = qatomic_rcu_read(&rioc->rdmain);
  2700. rdmaout = qatomic_rcu_read(&rioc->rdmain);
  2701. switch (how) {
  2702. case QIO_CHANNEL_SHUTDOWN_READ:
  2703. if (rdmain) {
  2704. rdmain->errored = true;
  2705. }
  2706. break;
  2707. case QIO_CHANNEL_SHUTDOWN_WRITE:
  2708. if (rdmaout) {
  2709. rdmaout->errored = true;
  2710. }
  2711. break;
  2712. case QIO_CHANNEL_SHUTDOWN_BOTH:
  2713. default:
  2714. if (rdmain) {
  2715. rdmain->errored = true;
  2716. }
  2717. if (rdmaout) {
  2718. rdmaout->errored = true;
  2719. }
  2720. break;
  2721. }
  2722. return 0;
  2723. }

/*
 * Parameters:
 *    @offset == 0 :
 *        This means that 'block_offset' is a full virtual address that does not
 *        belong to a RAMBlock of the virtual machine and instead
 *        represents a private malloc'd memory area that the caller wishes to
 *        transfer.
 *
 *    @offset != 0 :
 *        Offset is an offset to be added to block_offset and used
 *        to also lookup the corresponding RAMBlock.
 *
 *    @size : Number of bytes to transfer
 *
 *    @pages_sent : User-specified pointer to indicate how many pages were
 *                  sent. Usually, this will not be more than a few bytes of
 *                  the protocol because most transfers are sent asynchronously.
 */
static int qemu_rdma_save_page(QEMUFile *f, ram_addr_t block_offset,
                               ram_addr_t offset, size_t size)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
    Error *err = NULL;
    RDMAContext *rdma;
    int ret;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmaout);

    if (!rdma) {
        return -1;
    }

    if (rdma_errored(rdma)) {
        return -1;
    }

    qemu_fflush(f);

    /*
     * Add this page to the current 'chunk'. If the chunk
     * is full, or the page doesn't belong to the current chunk,
     * an actual RDMA write will occur and a new chunk will be formed.
     */
    ret = qemu_rdma_write(rdma, block_offset, offset, size, &err);
    if (ret < 0) {
        error_report_err(err);
        goto err;
    }

    /*
     * Drain the Completion Queue if possible, but do not block,
     * just poll.
     *
     * If nothing to poll, the end of the iteration will do this
     * again to make sure we don't overflow the request queue.
     */
    while (1) {
        uint64_t wr_id, wr_id_in;

        ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
        if (ret < 0) {
            error_report("rdma migration: polling error");
            goto err;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
    }

    while (1) {
        uint64_t wr_id, wr_id_in;

        ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
        if (ret < 0) {
            error_report("rdma migration: polling error");
            goto err;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
    }

    return RAM_SAVE_CONTROL_DELAYED;

err:
    rdma->errored = true;
    return -1;
}
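
/*
 * Externally visible wrapper around qemu_rdma_save_page(): skip when RDMA
 * migration is not active or we are in postcopy, and propagate failures
 * into the QEMUFile error state.
 */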
int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
                           ram_addr_t offset, size_t size)
{
    if (!migrate_rdma() || migration_in_postcopy()) {
        return RAM_SAVE_CONTROL_NOT_SUPP;
    }

    int ret = qemu_rdma_save_page(f, block_offset, offset, size);

    if (ret != RAM_SAVE_CONTROL_DELAYED &&
        ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret < 0) {
            qemu_file_set_error(f, ret);
        }
    }
    return ret;
}

static void rdma_accept_incoming_migration(void *opaque);
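
/*
 * fd handler for the destination's rdma_cm event channel once a
 * connection has been accepted: report unexpected DISCONNECTED /
 * DEVICE_REMOVAL events, mark the contexts as errored, and wake up the
 * load coroutine if it is waiting.
 */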
static void rdma_cm_poll_handler(void *opaque)
{
    RDMAContext *rdma = opaque;
    struct rdma_cm_event *cm_event;
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
        error_report("get_cm_event failed %d", errno);
        return;
    }

    if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
        cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
        if (!rdma->errored &&
            migration_incoming_get_current()->state !=
            MIGRATION_STATUS_COMPLETED) {
            error_report("receive cm event, cm event is %d", cm_event->event);
            rdma->errored = true;
            if (rdma->return_path) {
                rdma->return_path->errored = true;
            }
        }
        rdma_ack_cm_event(cm_event);
        if (mis->loadvm_co) {
            qemu_coroutine_enter(mis->loadvm_co);
        }
        return;
    }
    rdma_ack_cm_event(cm_event);
}
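
/*
 * Destination side of connection setup: take the CONNECT_REQUEST event
 * off the cm channel, negotiate capabilities with the source, create the
 * protection domain, completion queues and queue pair, register the
 * control buffers and finally accept the connection.
 */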
static int qemu_rdma_accept(RDMAContext *rdma)
{
    Error *err = NULL;
    RDMACapabilities cap;
    struct rdma_conn_param conn_param = {
        .responder_resources = 2,
        .private_data = &cap,
        .private_data_len = sizeof(cap),
    };
    RDMAContext *rdma_return_path = NULL;
    g_autoptr(InetSocketAddress) isock = g_new0(InetSocketAddress, 1);
    struct rdma_cm_event *cm_event;
    struct ibv_context *verbs;
    int ret;

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret < 0) {
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    isock->host = g_strdup(rdma->host);
    isock->port = g_strdup_printf("%d", rdma->port);

    /*
     * Initialize the RDMAContext for the return path (used by postcopy)
     * once the first connection request has arrived.
     */
    if ((migrate_postcopy() || migrate_return_path())
        && !rdma->is_return_path) {
        rdma_return_path = qemu_rdma_data_init(isock, NULL);
        if (rdma_return_path == NULL) {
            rdma_ack_cm_event(cm_event);
            goto err_rdma_dest_wait;
        }

        qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
    }

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));

    network_to_caps(&cap);

    if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
        error_report("Unknown source RDMA version: %d, bailing...",
                     cap.version);
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    /*
     * Respond with only the capabilities this version of QEMU knows about.
     */
    cap.flags &= known_capabilities;

    /*
     * Enable the ones that we do know about.
     * Add other checks here as new ones are introduced.
     */
    if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
        rdma->pin_all = true;
    }

    rdma->cm_id = cm_event->id;
    verbs = cm_event->id->verbs;

    rdma_ack_cm_event(cm_event);

    trace_qemu_rdma_accept_pin_state(rdma->pin_all);

    caps_to_network(&cap);

    trace_qemu_rdma_accept_pin_verbsc(verbs);

    if (!rdma->verbs) {
        rdma->verbs = verbs;
    } else if (rdma->verbs != verbs) {
        error_report("ibv context not matching %p, %p!", rdma->verbs,
                     verbs);
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_id("dest_init", verbs);

    ret = qemu_rdma_alloc_pd_cq(rdma, &err);
    if (ret < 0) {
        error_report_err(err);
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret < 0) {
        error_report("rdma migration: error allocating qp!");
        goto err_rdma_dest_wait;
    }

    qemu_rdma_init_ram_blocks(rdma);

    for (int i = 0; i < RDMA_WRID_MAX; i++) {
        ret = qemu_rdma_reg_control(rdma, i);
        if (ret < 0) {
            error_report("rdma: error registering %d control", i);
            goto err_rdma_dest_wait;
        }
    }

    /* Accept the second connection request for return path */
    if ((migrate_postcopy() || migrate_return_path())
        && !rdma->is_return_path) {
        qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
                            NULL,
                            (void *)(intptr_t)rdma->return_path);
    } else {
        qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
                            NULL, rdma);
    }

    ret = rdma_accept(rdma->cm_id, &conn_param);
    if (ret < 0) {
        error_report("rdma_accept failed");
        goto err_rdma_dest_wait;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret < 0) {
        error_report("rdma_accept get_cm_event failed");
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        error_report("rdma_accept: event not established");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    rdma_ack_cm_event(cm_event);
    rdma->connected = true;

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, &err);
    if (ret < 0) {
        error_report_err(err);
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_gid("dest_connect", rdma->cm_id);

    return 0;

err_rdma_dest_wait:
    rdma->errored = true;
    qemu_rdma_cleanup(rdma);
    g_free(rdma_return_path);
    return -1;
}
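
/*
 * qsort() comparator: order the destination's local RAM blocks by the
 * index they have on the source, so both sides agree on block numbering.
 */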
static int dest_ram_sort_func(const void *a, const void *b)
{
    unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
    unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;

    return (a_index < b_index) ? -1 : (a_index != b_index);
}

/*
 * During each iteration of the migration, we listen for instructions
 * from the source VM to perform dynamic page registrations before it
 * can perform RDMA operations.
 *
 * We respond with the 'rkey'.
 *
 * Keep doing this until the source tells us to stop.
 */
int rdma_registration_handle(QEMUFile *f)
{
    RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
                                   .type = RDMA_CONTROL_REGISTER_RESULT,
                                   .repeat = 0,
                                 };
    RDMAControlHeader unreg_resp = { .len = 0,
                                     .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                     .repeat = 0,
                                   };
    RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
                                 .repeat = 1 };
    QIOChannelRDMA *rioc;
    Error *err = NULL;
    RDMAContext *rdma;
    RDMALocalBlocks *local;
    RDMAControlHeader head;
    RDMARegister *reg, *registers;
    RDMACompress *comp;
    RDMARegisterResult *reg_result;
    static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
    RDMALocalBlock *block;
    void *host_addr;
    int ret;
    int idx = 0;

    if (!migrate_rdma()) {
        return 0;
    }

    RCU_READ_LOCK_GUARD();
    rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
    rdma = qatomic_rcu_read(&rioc->rdmain);

    if (!rdma) {
        return -1;
    }

    if (rdma_errored(rdma)) {
        return -1;
    }

    local = &rdma->local_ram_blocks;
    do {
        trace_rdma_registration_handle_wait();

        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE, &err);
        if (ret < 0) {
            error_report_err(err);
            break;
        }

        if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
            error_report("rdma: Too many requests in this message (%d). "
                         "Bailing.", head.repeat);
            break;
        }

        switch (head.type) {
        case RDMA_CONTROL_COMPRESS:
            comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
            network_to_compress(comp);

            trace_rdma_registration_handle_compress(comp->length,
                                                    comp->block_idx,
                                                    comp->offset);
            if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
                error_report("rdma: 'compress' bad block index %u (vs %d)",
                             (unsigned int)comp->block_idx,
                             rdma->local_ram_blocks.nb_blocks);
                goto err;
            }
            block = &(rdma->local_ram_blocks.block[comp->block_idx]);

            host_addr = block->local_host_addr +
                            (comp->offset - block->offset);
            if (comp->value) {
                error_report("rdma: Zero page with non-zero (%d) value",
                             comp->value);
                goto err;
            }
            ram_handle_zero(host_addr, comp->length);
            break;

        case RDMA_CONTROL_REGISTER_FINISHED:
            trace_rdma_registration_handle_finished();
            return 0;

        case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
            trace_rdma_registration_handle_ram_blocks();

            /* Sort our local RAM Block list so it's the same as the source,
             * we can do this since we've filled in a src_index in the list
             * as we received the RAMBlock list earlier.
             */
            qsort(rdma->local_ram_blocks.block,
                  rdma->local_ram_blocks.nb_blocks,
                  sizeof(RDMALocalBlock), dest_ram_sort_func);
            for (int i = 0; i < local->nb_blocks; i++) {
                local->block[i].index = i;
            }

            if (rdma->pin_all) {
                ret = qemu_rdma_reg_whole_ram_blocks(rdma, &err);
                if (ret < 0) {
                    error_report_err(err);
                    goto err;
                }
            }

            /*
             * Dest uses this to prepare to transmit the RAMBlock descriptions
             * to the source VM after connection setup.
             * Both sides use the "remote" structure to communicate and update
             * their "local" descriptions with what was sent.
             */
            for (int i = 0; i < local->nb_blocks; i++) {
                rdma->dest_blocks[i].remote_host_addr =
                    (uintptr_t)(local->block[i].local_host_addr);

                if (rdma->pin_all) {
                    rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
                }

                rdma->dest_blocks[i].offset = local->block[i].offset;
                rdma->dest_blocks[i].length = local->block[i].length;

                dest_block_to_network(&rdma->dest_blocks[i]);
                trace_rdma_registration_handle_ram_blocks_loop(
                    local->block[i].block_name,
                    local->block[i].offset,
                    local->block[i].length,
                    local->block[i].local_host_addr,
                    local->block[i].src_index);
            }

            blocks.len = rdma->local_ram_blocks.nb_blocks
                                                * sizeof(RDMADestBlock);

            ret = qemu_rdma_post_send_control(rdma,
                                    (uint8_t *) rdma->dest_blocks, &blocks,
                                    &err);
            if (ret < 0) {
                error_report_err(err);
                goto err;
            }

            break;
        case RDMA_CONTROL_REGISTER_REQUEST:
            trace_rdma_registration_handle_register(head.repeat);

            reg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (int count = 0; count < head.repeat; count++) {
                uint64_t chunk;
                uint8_t *chunk_start, *chunk_end;

                reg = &registers[count];
                network_to_register(reg);

                reg_result = &results[count];

                trace_rdma_registration_handle_register_loop(count,
                         reg->current_index, reg->key.current_addr, reg->chunks);

                if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
                    error_report("rdma: 'register' bad block index %u (vs %d)",
                                 (unsigned int)reg->current_index,
                                 rdma->local_ram_blocks.nb_blocks);
                    goto err;
                }
                block = &(rdma->local_ram_blocks.block[reg->current_index]);
                if (block->is_ram_block) {
                    if (block->offset > reg->key.current_addr) {
                        error_report("rdma: bad register address for block %s"
                            " offset: %" PRIx64 " current_addr: %" PRIx64,
                            block->block_name, block->offset,
                            reg->key.current_addr);
                        goto err;
                    }
                    host_addr = (block->local_host_addr +
                                (reg->key.current_addr - block->offset));
                    chunk = ram_chunk_index(block->local_host_addr,
                                            (uint8_t *) host_addr);
                } else {
                    chunk = reg->key.chunk;
                    host_addr = block->local_host_addr +
                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
                    /* Check for particularly bad chunk value */
                    if (host_addr < (void *)block->local_host_addr) {
                        error_report("rdma: bad chunk for block %s"
                                     " chunk: %" PRIx64,
                                     block->block_name, reg->key.chunk);
                        goto err;
                    }
                }
                chunk_start = ram_chunk_start(block, chunk);
                chunk_end = ram_chunk_end(block, chunk + reg->chunks);
                /* avoid "-Waddress-of-packed-member" warning */
                uint32_t tmp_rkey = 0;
                if (qemu_rdma_register_and_get_keys(rdma, block,
                            (uintptr_t)host_addr, NULL, &tmp_rkey,
                            chunk, chunk_start, chunk_end)) {
                    error_report("cannot get rkey");
                    goto err;
                }
                reg_result->rkey = tmp_rkey;

                reg_result->host_addr = (uintptr_t)block->local_host_addr;

                trace_rdma_registration_handle_register_rkey(reg_result->rkey);

                result_to_network(reg_result);
            }

            ret = qemu_rdma_post_send_control(rdma,
                            (uint8_t *) results, &reg_resp, &err);
            if (ret < 0) {
                error_report_err(err);
                goto err;
            }
            break;
        case RDMA_CONTROL_UNREGISTER_REQUEST:
            trace_rdma_registration_handle_unregister(head.repeat);
            unreg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (int count = 0; count < head.repeat; count++) {
                reg = &registers[count];
                network_to_register(reg);

                trace_rdma_registration_handle_unregister_loop(count,
                           reg->current_index, reg->key.chunk);

                block = &(rdma->local_ram_blocks.block[reg->current_index]);

                ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
                block->pmr[reg->key.chunk] = NULL;

                if (ret != 0) {
                    error_report("rdma unregistration chunk failed: %s",
                                 strerror(errno));
                    goto err;
                }

                rdma->total_registrations--;

                trace_rdma_registration_handle_unregister_success(reg->key.chunk);
            }

            ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp, &err);
            if (ret < 0) {
                error_report_err(err);
                goto err;
            }
            break;
        case RDMA_CONTROL_REGISTER_RESULT:
            error_report("Invalid RESULT message at dest.");
            goto err;
        default:
            error_report("Unknown control message %s", control_desc(head.type));
            goto err;
        }
    } while (1);

err:
    rdma->errored = true;
    return -1;
}

/* Destination:
 * Called during the initial RAM load section which lists the
 * RAMBlocks by name.  This lets us know the order of the RAMBlocks on
 * the source.  We've already built our local RAMBlock list, but not
 * yet sent the list to the source.
 */
int rdma_block_notification_handle(QEMUFile *f, const char *name)
{
    int curr;
    int found = -1;

    if (!migrate_rdma()) {
        return 0;
    }

    RCU_READ_LOCK_GUARD();
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
    RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmain);

    if (!rdma) {
        return -1;
    }

    /* Find the matching RAMBlock in our local list */
    for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
        if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
            found = curr;
            break;
        }
    }

    if (found == -1) {
        error_report("RAMBlock '%s' not found on destination", name);
        return -1;
    }

    rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
    trace_rdma_block_notification_handle(name, rdma->next_src_index);
    rdma->next_src_index++;

    return 0;
}
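
/*
 * Source side: emit the RAM_SAVE_FLAG_HOOK marker (and flush) at the
 * start of a round of RAM transfer, which the destination interprets as
 * its cue to enter the registration handler above.
 */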
int rdma_registration_start(QEMUFile *f, uint64_t flags)
{
    if (!migrate_rdma() || migration_in_postcopy()) {
        return 0;
    }

    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
    RCU_READ_LOCK_GUARD();
    RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmaout);

    if (!rdma) {
        return -1;
    }

    if (rdma_errored(rdma)) {
        return -1;
    }

    trace_rdma_registration_start(flags);
    qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
    return qemu_fflush(f);
}

/*
 * Inform dest that dynamic registrations are done for now.
 * First, flush writes, if any.
 */
int rdma_registration_stop(QEMUFile *f, uint64_t flags)
{
    QIOChannelRDMA *rioc;
    Error *err = NULL;
    RDMAContext *rdma;
    RDMAControlHeader head = { .len = 0, .repeat = 1 };
    int ret;

    if (!migrate_rdma() || migration_in_postcopy()) {
        return 0;
    }

    RCU_READ_LOCK_GUARD();
    rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
    rdma = qatomic_rcu_read(&rioc->rdmaout);

    if (!rdma) {
        return -1;
    }

    if (rdma_errored(rdma)) {
        return -1;
    }

    qemu_fflush(f);
    ret = qemu_rdma_drain_cq(rdma);

    if (ret < 0) {
        goto err;
    }

    if (flags == RAM_CONTROL_SETUP) {
        RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
        RDMALocalBlocks *local = &rdma->local_ram_blocks;
        int reg_result_idx, nb_dest_blocks;

        head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
        trace_rdma_registration_stop_ram();

        /*
         * Make sure that we parallelize the pinning on both sides.
         * For very large guests, doing this serially takes a really
         * long time, so we have to 'interleave' the pinning locally
         * with the control messages by performing the pinning on this
         * side before we receive the control response from the other
         * side that the pinning has completed.
         */
        ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
                    &reg_result_idx, rdma->pin_all ?
                    qemu_rdma_reg_whole_ram_blocks : NULL,
                    &err);
        if (ret < 0) {
            error_report_err(err);
            return -1;
        }

        nb_dest_blocks = resp.len / sizeof(RDMADestBlock);

        /*
         * The protocol uses two different sets of rkeys (mutually exclusive):
         * 1. One key to represent the virtual address of the entire ram block.
         *    (dynamic chunk registration disabled - pin everything with one rkey.)
         * 2. One to represent individual chunks within a ram block.
         *    (dynamic chunk registration enabled - pin individual chunks.)
         *
         * Once the capability is successfully negotiated, the destination transmits
         * the keys to use (or sends them later) including the virtual addresses
         * and then propagates the remote ram block descriptions to its local copy.
         */
        if (local->nb_blocks != nb_dest_blocks) {
            error_report("ram blocks mismatch (Number of blocks %d vs %d)",
                         local->nb_blocks, nb_dest_blocks);
            error_printf("Your QEMU command line parameters are probably "
                         "not identical on both the source and destination.");
            rdma->errored = true;
            return -1;
        }

        qemu_rdma_move_header(rdma, reg_result_idx, &resp);
        memcpy(rdma->dest_blocks,
            rdma->wr_data[reg_result_idx].control_curr, resp.len);
        for (int i = 0; i < nb_dest_blocks; i++) {
            network_to_dest_block(&rdma->dest_blocks[i]);

            /* We require that the blocks are in the same order */
            if (rdma->dest_blocks[i].length != local->block[i].length) {
                error_report("Block %s/%d has a different length %" PRIu64
                             " vs %" PRIu64,
                             local->block[i].block_name, i,
                             local->block[i].length,
                             rdma->dest_blocks[i].length);
                rdma->errored = true;
                return -1;
            }
            local->block[i].remote_host_addr =
                    rdma->dest_blocks[i].remote_host_addr;
            local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
        }
    }

    trace_rdma_registration_stop(flags);

    head.type = RDMA_CONTROL_REGISTER_FINISHED;
    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL, &err);

    if (ret < 0) {
        error_report_err(err);
        goto err;
    }

    return 0;

err:
    rdma->errored = true;
    return -1;
}
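
/*
 * Object finalizer: release any RDMA contexts still attached to the
 * channel (i.e. if the channel was never closed explicitly).
 */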
static void qio_channel_rdma_finalize(Object *obj)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);

    if (rioc->rdmain) {
        qemu_rdma_cleanup(rioc->rdmain);
        g_free(rioc->rdmain);
        rioc->rdmain = NULL;
    }
    if (rioc->rdmaout) {
        qemu_rdma_cleanup(rioc->rdmaout);
        g_free(rioc->rdmaout);
        rioc->rdmaout = NULL;
    }
}

static void qio_channel_rdma_class_init(ObjectClass *klass,
                                        void *class_data G_GNUC_UNUSED)
{
    QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);

    ioc_klass->io_writev = qio_channel_rdma_writev;
    ioc_klass->io_readv = qio_channel_rdma_readv;
    ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
    ioc_klass->io_close = qio_channel_rdma_close;
    ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
    ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
    ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
}

static const TypeInfo qio_channel_rdma_info = {
    .parent = TYPE_QIO_CHANNEL,
    .name = TYPE_QIO_CHANNEL_RDMA,
    .instance_size = sizeof(QIOChannelRDMA),
    .instance_finalize = qio_channel_rdma_finalize,
    .class_init = qio_channel_rdma_class_init,
};

static void qio_channel_rdma_register_types(void)
{
    type_register_static(&qio_channel_rdma_info);
}

type_init(qio_channel_rdma_register_types);
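
/*
 * Wrap an RDMAContext in a QIOChannelRDMA and a QEMUFile.  The primary
 * context serves the file's main direction; its return path (if any)
 * serves the opposite direction.
 */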
static QEMUFile *rdma_new_input(RDMAContext *rdma)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));

    rioc->file = qemu_file_new_input(QIO_CHANNEL(rioc));
    rioc->rdmain = rdma;
    rioc->rdmaout = rdma->return_path;

    return rioc->file;
}

static QEMUFile *rdma_new_output(RDMAContext *rdma)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));

    rioc->file = qemu_file_new_output(QIO_CHANNEL(rioc));
    rioc->rdmaout = rdma;
    rioc->rdmain = rdma->return_path;

    return rioc->file;
}
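
/*
 * fd handler on the listening cm channel: accept the incoming connection
 * and, unless this is the return-path context, hand a freshly created
 * QEMUFile to the generic incoming-migration code.
 */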
static void rdma_accept_incoming_migration(void *opaque)
{
    RDMAContext *rdma = opaque;
    QEMUFile *f;

    trace_qemu_rdma_accept_incoming_migration();
    if (qemu_rdma_accept(rdma) < 0) {
        error_report("RDMA ERROR: Migration initialization failed");
        return;
    }

    trace_qemu_rdma_accept_incoming_migration_accepted();

    if (rdma->is_return_path) {
        return;
    }

    f = rdma_new_input(rdma);
    if (f == NULL) {
        error_report("RDMA ERROR: could not open RDMA for input");
        qemu_rdma_cleanup(rdma);
        return;
    }

    rdma->migration_started_on_destination = 1;
    migration_fd_process_incoming(f);
}
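
/*
 * Destination entry point: set up the destination RDMAContext, listen for
 * connection requests and install rdma_accept_incoming_migration() as the
 * handler for them.
 */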
void rdma_start_incoming_migration(InetSocketAddress *host_port,
                                   Error **errp)
{
    MigrationState *s = migrate_get_current();
    int ret;
    RDMAContext *rdma;

    trace_rdma_start_incoming_migration();

    /* Avoid ram_block_discard_disable(), cannot change during migration. */
    if (ram_block_discard_is_required()) {
        error_setg(errp, "RDMA: cannot disable RAM discard");
        return;
    }

    rdma = qemu_rdma_data_init(host_port, errp);
    if (rdma == NULL) {
        goto err;
    }

    ret = qemu_rdma_dest_init(rdma, errp);
    if (ret < 0) {
        goto err;
    }

    trace_rdma_start_incoming_migration_after_dest_init();

    ret = rdma_listen(rdma->listen_id, 5);
    if (ret < 0) {
        error_setg(errp, "RDMA ERROR: listening on socket!");
        goto cleanup_rdma;
    }

    trace_rdma_start_incoming_migration_after_rdma_listen();
    s->rdma_migration = true;
    qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
                        NULL, (void *)(intptr_t)rdma);
    return;

cleanup_rdma:
    qemu_rdma_cleanup(rdma);
err:
    if (rdma) {
        g_free(rdma->host);
    }
    g_free(rdma);
}
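
/*
 * Source entry point: initialize and connect the primary RDMA context
 * (plus a second one for the return path when postcopy or a return path
 * is enabled), then hand the resulting QEMUFile to the migration core.
 */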
void rdma_start_outgoing_migration(void *opaque,
                                   InetSocketAddress *host_port, Error **errp)
{
    MigrationState *s = opaque;
    RDMAContext *rdma_return_path = NULL;
    RDMAContext *rdma;
    int ret;

    /* Avoid ram_block_discard_disable(), cannot change during migration. */
    if (ram_block_discard_is_required()) {
        error_setg(errp, "RDMA: cannot disable RAM discard");
        return;
    }

    rdma = qemu_rdma_data_init(host_port, errp);
    if (rdma == NULL) {
        goto err;
    }

    ret = qemu_rdma_source_init(rdma, migrate_rdma_pin_all(), errp);
    if (ret < 0) {
        goto err;
    }

    trace_rdma_start_outgoing_migration_after_rdma_source_init();
    ret = qemu_rdma_connect(rdma, false, errp);
    if (ret < 0) {
        goto err;
    }

    /* RDMA postcopy needs a separate queue pair for the return path */
    if (migrate_postcopy() || migrate_return_path()) {
        rdma_return_path = qemu_rdma_data_init(host_port, errp);
        if (rdma_return_path == NULL) {
            goto return_path_err;
        }

        ret = qemu_rdma_source_init(rdma_return_path,
                                    migrate_rdma_pin_all(), errp);
        if (ret < 0) {
            goto return_path_err;
        }

        ret = qemu_rdma_connect(rdma_return_path, true, errp);
        if (ret < 0) {
            goto return_path_err;
        }

        rdma->return_path = rdma_return_path;
        rdma_return_path->return_path = rdma;
        rdma_return_path->is_return_path = true;
    }

    trace_rdma_start_outgoing_migration_after_rdma_connect();

    s->to_dst_file = rdma_new_output(rdma);
    s->rdma_migration = true;
    migration_connect(s, NULL);
    return;

return_path_err:
    qemu_rdma_cleanup(rdma);
err:
    g_free(rdma);
    g_free(rdma_return_path);
}