ram.c
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration-stats.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "system/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "system/runstate.h"
#include "rdma.h"
#include "options.h"
#include "system/dirtylimit.h"
#include "system/kvm.h"
#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */
/***********************************************************/
/* ram save/restore */

/*
 * mapped-ram migration supports O_DIRECT, so we need to make sure the
 * userspace buffer, the IO operation size and the file offset are
 * aligned according to the underlying device's block size. The first
 * two are already aligned to page size, but we need to add padding to
 * the file to align the offset. We cannot read the block size
 * dynamically because the migration file can be moved between
 * different systems, so use 1M to cover most block sizes and to keep
 * the file offset aligned at page size as well.
 */
#define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000

/*
 * When doing mapped-ram migration, this is the amount we read from
 * the pages region in the migration file at a time.
 */
#define MAPPED_RAM_LOAD_BUF_SIZE 0x100000
XBZRLECacheStats xbzrle_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether we're sending a host page */
    bool host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;
/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
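
/*
 * The helpers below take/release XBZRLE.lock only when the xbzrle
 * capability is enabled, so they are cheap no-ops otherwise.
 */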
static void XBZRLE_cache_lock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}
/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in the main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock.
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}
static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();
}

bool migrate_ram_is_ignored(RAMBlock *block)
{
    MigMode mode = migrate_mode();
    return !qemu_ram_is_migratable(block) ||
           mode == MIG_MODE_CPR_TRANSFER ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block)
            && qemu_ram_is_named_file(block));
}

#undef RAMBLOCK_FOREACH
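
/*
 * Call func() for every RAMBlock that is not ignored for migration;
 * iteration stops at the first non-zero return value, which is then
 * returned to the caller.
 */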
int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }

    return ret;
}
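
/*
 * Allocate the per-RAMBlock "receivedmap" bitmaps used on the destination
 * side to track which target pages have already been received.
 */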
static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset)
{
    set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}
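
/*
 * Sentinel appended after the bitmap stream so the receiver can detect a
 * truncated or corrupted transfer (see ramblock_recv_bitmap_send() below).
 */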
#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required so that the bitmap is interpreted correctly even when
     * the source and destination VMs are not using the same
     * endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    g_free(le_bitmap);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    int ret = qemu_fflush(file);
    if (ret) {
        return ret;
    }

    return size + sizeof(size);
}
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr offset;
    hwaddr len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Are we really using XBZRLE (e.g., after the first round). */
    bool xbzrle_started;
    /* Are we on the last stage of migration */
    bool last_stage;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;

    /*
     * This is only used when postcopy is in recovery phase, to communicate
     * between the migration thread and the return path thread on dirty
     * bitmap synchronizations. This field is unused in other stages of
     * RAM migration.
     */
    unsigned int postcopy_bmap_sync_requested;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd, errp);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        stat64_add(&mig_stats.precopy_bytes, bytes);
    } else if (migration_in_postcopy()) {
        stat64_add(&mig_stats.postcopy_bytes, bytes);
    } else {
        stat64_add(&mig_stats.downtime_bytes, bytes);
    }
}
static int ram_save_host_page_urgent(PageSearchStatus *pss);

/* NOTE: page is the PFN not real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page. Return true
 * if it is, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
           (pss1->host_page_start == pss2->host_page_start);
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @pss: current PSS channel status
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
                               RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;
    bool same_block = (block == pss->last_sent_block);

    if (same_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!same_block) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        pss->last_sent_block = block;
    }
    return size;
}
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    uint64_t pct_initial = migrate_cpu_throttle_initial();
    uint64_t pct_increment = migrate_cpu_throttle_increment();
    bool pct_tailslow = migrate_cpu_throttle_tailslow();
    int pct_max = migrate_max_cpu_throttle();

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
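            /*
             * Worked example (illustrative numbers): with a current
             * throttle of 30% the guest gets cpu_now = 70%.  If it
             * dirties twice as fast as we can send
             * (bytes_dirty_threshold / bytes_dirty_period = 0.5),
             * cpu_ideal = 70 * 0.5 = 35, so the throttle is raised by
             * min(70 - 35, pct_increment).
             */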
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                        bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = migration_transferred_bytes();
}
/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 stat64_get(&mig_stats.dirty_sync_count));
}

#define ENCODING_FLAG_XBZRLE 0x1
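
/*
 * On the wire an XBZRLE page follows the usual page header with one byte
 * of ENCODING_FLAG_XBZRLE, a 16-bit big-endian encoded length and then
 * the encoded bytes themselves (see save_xbzrle_page() below).
 */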
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;
    uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);

    if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             generation) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;

    /*
     * The xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}
/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing
 * found. Note that when pss->host_page_sending==true it means we're
 * in the middle of sending a host page, so we won't look for dirty pages
 * outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (migrate_ram_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If we're in the middle of sending a host page, only look for dirty
     * pages within the current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}
static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}
/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (migrate_ram_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}
static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time. So as long as we are going to send any page
     * in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}
/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}
/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return stat64_get(&mig_stats.normal_pages) +
        stat64_get(&mig_stats.zero_pages) +
        xbzrle_counters.pages;
}
static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;

    /* calculate period counters */
    stat64_set(&mig_stats.dirty_pages_rate,
               rs->num_dirty_pages_period * 1000 /
               (end_time - rs->time_last_bitmap_sync));

    if (!page_count) {
        return;
    }

    if (migrate_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }
}
/*
 * Enable dirty-limit to throttle down the guest
 */
static void migration_dirty_limit_guest(void)
{
    /*
     * dirty page rate quota for all vCPUs fetched from
     * migration parameter 'vcpu_dirty_limit'
     */
    static int64_t quota_dirtyrate;
    MigrationState *s = migrate_get_current();

    /*
     * If the dirty limit is already enabled and the migration parameter
     * vcpu-dirty-limit is untouched, there is nothing to do.
     */
    if (dirtylimit_in_service() &&
        quota_dirtyrate == s->parameters.vcpu_dirty_limit) {
        return;
    }

    quota_dirtyrate = s->parameters.vcpu_dirty_limit;

    /*
     * Set a quota dirty rate for all vCPUs; note that the second
     * parameter is ignored when setting the limit for the whole VM.
     */
    qmp_set_vcpu_dirty_limit(false, -1, quota_dirtyrate, NULL);
    trace_migration_dirty_limit_guest(quota_dirtyrate);
}
static void migration_trigger_throttle(RAMState *rs)
{
    uint64_t threshold = migrate_throttle_trigger_threshold();
    uint64_t bytes_xfer_period =
        migration_transferred_bytes() - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /*
     * The following detection logic can be refined later. For now:
     * Check to see if the ratio between dirtied bytes and the approx.
     * amount of bytes that just got transferred since the last time
     * we were in this routine reaches the threshold. If that happens
     * twice, start or increase throttling.
     */
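    /*
     * Illustrative example, assuming the default threshold of 50: if 1 GiB
     * was transferred in the last period, bytes_dirty_threshold is 512 MiB,
     * so a guest that dirtied 600 MiB in the same period counts as "too
     * dirty"; two such periods in a row trigger (or increase) throttling.
     */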
    if ((bytes_dirty_period > bytes_dirty_threshold) &&
        (++rs->dirty_rate_high_cnt >= 2)) {
        rs->dirty_rate_high_cnt = 0;

        if (migrate_auto_converge()) {
            trace_migration_throttle();
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        } else if (migrate_dirty_limit()) {
            migration_dirty_limit_guest();
        }
    }
}
static void migration_bitmap_sync(RAMState *rs, bool last_stage)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&mig_stats.dirty_sync_count, 1);

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync(last_stage);

    WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) {
        WITH_RCU_READ_LOCK_GUARD() {
            RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                ramblock_sync_dirty_bitmap(rs, block);
            }
            stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
        }
    }

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second (= 1000 milliseconds) */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = migration_transferred_bytes();
    }
    if (migrate_events()) {
        uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}
void migration_bitmap_sync_precopy(bool last_stage)
{
    Error *local_err = NULL;

    assert(ram_state);

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(ram_state, last_stage);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}
/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
                          ram_addr_t offset)
{
    uint8_t *p = pss->block->host + offset;
    QEMUFile *file = pss->pss_channel;
    int len = 0;

    if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) {
        return 0;
    }

    if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        return 0;
    }

    stat64_add(&mig_stats.zero_pages, 1);

    if (migrate_mapped_ram()) {
        /* zero pages are not transferred with mapped-ram */
        clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap);
        return 1;
    }

    len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
    qemu_put_byte(file, 0);
    len += 1;
    ram_release_page(pss->block->idstr, offset);
    ram_transferred_add(len);

    /*
     * Must let xbzrle know, otherwise a previous (now 0'd) cached
     * page would be stale.
     */
    if (rs->xbzrle_started) {
        XBZRLE_cache_lock();
        xbzrle_cache_zero_page(pss->block->offset + offset);
        XBZRLE_cache_unlock();
    }

    return len;
}
/*
 * @pages: the number of pages written by the control path,
 *         < 0 - error
 *         > 0 - number of pages written
 *
 * Returns true if the page has been saved, otherwise false.
 */
static bool control_save_page(PageSearchStatus *pss,
                              ram_addr_t offset, int *pages)
{
    int ret;

    ret = rdma_control_save_page(pss->pss_channel, pss->block->offset, offset,
                                 TARGET_PAGE_SIZE);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        *pages = 1;
        return true;
    }
    *pages = ret;
    return true;
}
/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    if (migrate_mapped_ram()) {
        qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
                           block->pages_offset + offset);
        set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap);
    } else {
        ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
                                             offset | RAM_SAVE_FLAG_PAGE));
        if (async) {
            qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &&
                                  migration_in_postcopy());
        } else {
            qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
        }
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&mig_stats.normal_pages, 1);
    return 1;
}
  1059. /**
  1060. * ram_save_page: send the given page to the stream
  1061. *
  1062. * Returns the number of pages written.
  1063. * < 0 - error
  1064. * >=0 - Number of pages written - this might legally be 0
  1065. * if xbzrle noticed the page was the same.
  1066. *
  1067. * @rs: current RAM state
  1068. * @block: block that contains the page we want to send
  1069. * @offset: offset inside the block for the page
  1070. */
  1071. static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
  1072. {
  1073. int pages = -1;
  1074. uint8_t *p;
  1075. bool send_async = true;
  1076. RAMBlock *block = pss->block;
  1077. ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
  1078. ram_addr_t current_addr = block->offset + offset;
  1079. p = block->host + offset;
  1080. trace_ram_save_page(block->idstr, (uint64_t)offset, p);
  1081. XBZRLE_cache_lock();
  1082. if (rs->xbzrle_started && !migration_in_postcopy()) {
  1083. pages = save_xbzrle_page(rs, pss, &p, current_addr,
  1084. block, offset);
  1085. if (!rs->last_stage) {
  1086. /* Can't send this cached data async, since the cache page
  1087. * might get updated before it gets to the wire
  1088. */
  1089. send_async = false;
  1090. }
  1091. }
  1092. /* XBZRLE overflow or normal page */
  1093. if (pages == -1) {
  1094. pages = save_normal_page(pss, block, offset, p, send_async);
  1095. }
  1096. XBZRLE_cache_unlock();
  1097. return pages;
  1098. }
  1099. static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset)
  1100. {
  1101. if (!multifd_queue_page(block, offset)) {
  1102. return -1;
  1103. }
  1104. return 1;
  1105. }
  1106. #define PAGE_ALL_CLEAN 0
  1107. #define PAGE_TRY_AGAIN 1
  1108. #define PAGE_DIRTY_FOUND 2
  1109. /**
  1110. * find_dirty_block: find the next dirty page and update any state
  1111. * associated with the search process.
  1112. *
  1113. * Returns:
  1114. * <0: An error happened
  1115. * PAGE_ALL_CLEAN: no dirty page found, give up
  1116. * PAGE_TRY_AGAIN: no dirty page found, retry for next block
  1117. * PAGE_DIRTY_FOUND: dirty page found
  1118. *
  1119. * @rs: current RAM state
  1120. * @pss: data about the state of the current dirty page scan
  1122. */
  1123. static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
  1124. {
  1125. /* Update pss->page for the next dirty bit in ramblock */
  1126. pss_find_next_dirty(pss);
  1127. if (pss->complete_round && pss->block == rs->last_seen_block &&
  1128. pss->page >= rs->last_page) {
  1129. /*
  1130. * We've been once around the RAM and haven't found anything.
  1131. * Give up.
  1132. */
  1133. return PAGE_ALL_CLEAN;
  1134. }
  1135. if (!offset_in_ramblock(pss->block,
  1136. ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
  1137. /* Didn't find anything in this RAM Block */
  1138. pss->page = 0;
  1139. pss->block = QLIST_NEXT_RCU(pss->block, next);
  1140. if (!pss->block) {
  1141. if (multifd_ram_sync_per_round()) {
  1142. QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
  1143. int ret = multifd_ram_flush_and_sync(f);
  1144. if (ret < 0) {
  1145. return ret;
  1146. }
  1147. }
  1148. /* Hit the end of the list */
  1149. pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
  1150. /* Flag that we've looped */
  1151. pss->complete_round = true;
  1152. /* After the first round, enable XBZRLE. */
  1153. if (migrate_xbzrle()) {
  1154. rs->xbzrle_started = true;
  1155. }
  1156. }
  1157. /* Didn't find anything this time, but try again on the new block */
  1158. return PAGE_TRY_AGAIN;
  1159. } else {
  1160. /* We've found something */
  1161. return PAGE_DIRTY_FOUND;
  1162. }
  1163. }
  1164. /**
  1165. * unqueue_page: gets a page of the queue
  1166. *
  1167. * Helper for 'get_queued_page' - gets a page off the queue
  1168. *
  1169. * Returns the block of the page (or NULL if none available)
  1170. *
  1171. * @rs: current RAM state
  1172. * @offset: used to return the offset within the RAMBlock
  1173. */
  1174. static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
  1175. {
  1176. struct RAMSrcPageRequest *entry;
  1177. RAMBlock *block = NULL;
  1178. if (!postcopy_has_request(rs)) {
  1179. return NULL;
  1180. }
  1181. QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
  1182. /*
  1183. * This should _never_ change even after we take the lock, because no one
  1184. * should be taking anything off the request list other than us.
  1185. */
  1186. assert(postcopy_has_request(rs));
  1187. entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
  1188. block = entry->rb;
  1189. *offset = entry->offset;
  1190. if (entry->len > TARGET_PAGE_SIZE) {
  1191. entry->len -= TARGET_PAGE_SIZE;
  1192. entry->offset += TARGET_PAGE_SIZE;
  1193. } else {
  1194. memory_region_unref(block->mr);
  1195. QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
  1196. g_free(entry);
  1197. migration_consume_urgent_request();
  1198. }
  1199. return block;
  1200. }
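/*
 * Note: a queued request longer than one target page is consumed one
 * target page per call; the remainder (with offset/len adjusted) stays at
 * the head of the queue for the next unqueue_page() call.
 */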
  1201. #if defined(__linux__)
  1202. /**
  1203. * poll_fault_page: try to get next UFFD write fault page and, if pending fault
  1204. * is found, return RAM block pointer and page offset
  1205. *
  1206. * Returns pointer to the RAMBlock containing faulting page,
  1207. * NULL if no write faults are pending
  1208. *
  1209. * @rs: current RAM state
  1210. * @offset: page offset from the beginning of the block
  1211. */
  1212. static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
  1213. {
  1214. struct uffd_msg uffd_msg;
  1215. void *page_address;
  1216. RAMBlock *block;
  1217. int res;
  1218. if (!migrate_background_snapshot()) {
  1219. return NULL;
  1220. }
  1221. res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
  1222. if (res <= 0) {
  1223. return NULL;
  1224. }
  1225. page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
  1226. block = qemu_ram_block_from_host(page_address, false, offset);
  1227. assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
  1228. return block;
  1229. }
  1230. /**
  1231. * ram_save_release_protection: release UFFD write protection after
  1232. * a range of pages has been saved
  1233. *
  1234. * @rs: current RAM state
  1235. * @pss: page-search-status structure
  1236. * @start_page: index of the first page in the range relative to pss->block
  1237. *
  1238. * Returns 0 on success, negative value in case of an error
  1239. */
  1240. static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
  1241. unsigned long start_page)
  1242. {
  1243. int res = 0;
  1244. /* Check if page is from UFFD-managed region. */
  1245. if (pss->block->flags & RAM_UF_WRITEPROTECT) {
  1246. void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
  1247. uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
  1248. /* Flush async buffers before un-protect. */
  1249. qemu_fflush(pss->pss_channel);
  1250. /* Un-protect memory range. */
  1251. res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
  1252. false, false);
  1253. }
  1254. return res;
  1255. }
  1256. /* ram_write_tracking_available: check if kernel supports required UFFD features
  1257. *
  1258. * Returns true if supported, false otherwise
  1259. */
  1260. bool ram_write_tracking_available(void)
  1261. {
  1262. uint64_t uffd_features;
  1263. int res;
  1264. res = uffd_query_features(&uffd_features);
  1265. return (res == 0 &&
  1266. (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
  1267. }
  1268. /* ram_write_tracking_compatible: check if guest configuration is
  1269. * compatible with 'write-tracking'
  1270. *
  1271. * Returns true if compatible, false otherwise
  1272. */
  1273. bool ram_write_tracking_compatible(void)
  1274. {
  1275. const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
  1276. int uffd_fd;
  1277. RAMBlock *block;
  1278. bool ret = false;
  1279. /* Open UFFD file descriptor */
  1280. uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
  1281. if (uffd_fd < 0) {
  1282. return false;
  1283. }
  1284. RCU_READ_LOCK_GUARD();
  1285. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  1286. uint64_t uffd_ioctls;
  1287. /* Nothing to do with read-only and MMIO-writable regions */
  1288. if (block->mr->readonly || block->mr->rom_device) {
  1289. continue;
  1290. }
  1291. /* Try to register block memory via UFFD-IO to track writes */
  1292. if (uffd_register_memory(uffd_fd, block->host, block->max_length,
  1293. UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
  1294. goto out;
  1295. }
  1296. if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
  1297. goto out;
  1298. }
  1299. }
  1300. ret = true;
  1301. out:
  1302. uffd_close_fd(uffd_fd);
  1303. return ret;
  1304. }
  1305. static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
  1306. ram_addr_t size)
  1307. {
  1308. const ram_addr_t end = offset + size;
  1309. /*
  1310. * We read one byte of each page; this will preallocate page tables if
  1311. * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
  1312. * where no page was populated yet. This might require adaptation when
  1313. * supporting other mappings, like shmem.
  1314. */
  1315. for (; offset < end; offset += block->page_size) {
  1316. char tmp = *((char *)block->host + offset);
  1317. /* Don't optimize the read out */
  1318. asm volatile("" : "+r" (tmp));
  1319. }
  1320. }
  1321. static inline int populate_read_section(MemoryRegionSection *section,
  1322. void *opaque)
  1323. {
  1324. const hwaddr size = int128_get64(section->size);
  1325. hwaddr offset = section->offset_within_region;
  1326. RAMBlock *block = section->mr->ram_block;
  1327. populate_read_range(block, offset, size);
  1328. return 0;
  1329. }
  1330. /*
  1331. * ram_block_populate_read: preallocate page tables and populate pages in the
  1332. * RAM block by reading a byte of each page.
  1333. *
  1334. * Since it's solely used for the userfault_fd WP feature, we simply
  1335. * iterate using the RAMBlock's page size.
  1336. *
  1337. * @rb: RAM block to populate
  1338. */
  1339. static void ram_block_populate_read(RAMBlock *rb)
  1340. {
  1341. /*
  1342. * Skip populating all pages that fall into a discarded range as managed by
  1343. * a RamDiscardManager responsible for the mapped memory region of the
  1344. * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
  1345. * must not get populated automatically. We don't have to track
  1346. * modifications via userfaultfd WP reliably, because these pages will
  1347. * not be part of the migration stream either way -- see
  1348. * ramblock_dirty_bitmap_exclude_discarded_pages().
  1349. *
  1350. * Note: The result is only stable while migrating (precopy/postcopy).
  1351. */
  1352. if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
  1353. RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
  1354. MemoryRegionSection section = {
  1355. .mr = rb->mr,
  1356. .offset_within_region = 0,
  1357. .size = rb->mr->size,
  1358. };
  1359. ram_discard_manager_replay_populated(rdm, &section,
  1360. populate_read_section, NULL);
  1361. } else {
  1362. populate_read_range(rb, 0, rb->used_length);
  1363. }
  1364. }
  1365. /*
  1366. * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
  1367. */
  1368. void ram_write_tracking_prepare(void)
  1369. {
  1370. RAMBlock *block;
  1371. RCU_READ_LOCK_GUARD();
  1372. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  1373. /* Nothing to do with read-only and MMIO-writable regions */
  1374. if (block->mr->readonly || block->mr->rom_device) {
  1375. continue;
  1376. }
  1377. /*
  1378. * Populate pages of the RAM block before enabling userfault_fd
  1379. * write protection.
  1380. *
  1381. * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
  1382. * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
  1383. * pages with pte_none() entries in page table.
  1384. */
  1385. ram_block_populate_read(block);
  1386. }
  1387. }
  1388. static inline int uffd_protect_section(MemoryRegionSection *section,
  1389. void *opaque)
  1390. {
  1391. const hwaddr size = int128_get64(section->size);
  1392. const hwaddr offset = section->offset_within_region;
  1393. RAMBlock *rb = section->mr->ram_block;
  1394. int uffd_fd = (uintptr_t)opaque;
  1395. return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
  1396. false);
  1397. }
  1398. static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
  1399. {
  1400. assert(rb->flags & RAM_UF_WRITEPROTECT);
  1401. /* See ram_block_populate_read() */
  1402. if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
  1403. RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
  1404. MemoryRegionSection section = {
  1405. .mr = rb->mr,
  1406. .offset_within_region = 0,
  1407. .size = rb->mr->size,
  1408. };
  1409. return ram_discard_manager_replay_populated(rdm, &section,
  1410. uffd_protect_section,
  1411. (void *)(uintptr_t)uffd_fd);
  1412. }
  1413. return uffd_change_protection(uffd_fd, rb->host,
  1414. rb->used_length, true, false);
  1415. }
  1416. /*
  1417. * ram_write_tracking_start: start UFFD-WP memory tracking
  1418. *
  1419. * Returns 0 for success or negative value in case of error
  1420. */
  1421. int ram_write_tracking_start(void)
  1422. {
  1423. int uffd_fd;
  1424. RAMState *rs = ram_state;
  1425. RAMBlock *block;
  1426. /* Open UFFD file descriptor */
  1427. uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
  1428. if (uffd_fd < 0) {
  1429. return uffd_fd;
  1430. }
  1431. rs->uffdio_fd = uffd_fd;
  1432. RCU_READ_LOCK_GUARD();
  1433. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  1434. /* Nothing to do with read-only and MMIO-writable regions */
  1435. if (block->mr->readonly || block->mr->rom_device) {
  1436. continue;
  1437. }
  1438. /* Register block memory with UFFD to track writes */
  1439. if (uffd_register_memory(rs->uffdio_fd, block->host,
  1440. block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
  1441. goto fail;
  1442. }
  1443. block->flags |= RAM_UF_WRITEPROTECT;
  1444. memory_region_ref(block->mr);
  1445. /* Apply UFFD write protection to the block memory range */
  1446. if (ram_block_uffd_protect(block, uffd_fd)) {
  1447. goto fail;
  1448. }
  1449. trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
  1450. block->host, block->max_length);
  1451. }
  1452. return 0;
  1453. fail:
  1454. error_report("ram_write_tracking_start() failed: restoring initial memory state");
  1455. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  1456. if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
  1457. continue;
  1458. }
  1459. uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
  1460. /* Cleanup flags and remove reference */
  1461. block->flags &= ~RAM_UF_WRITEPROTECT;
  1462. memory_region_unref(block->mr);
  1463. }
  1464. uffd_close_fd(uffd_fd);
  1465. rs->uffdio_fd = -1;
  1466. return -1;
  1467. }
  1468. /**
  1469. * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
  1470. */
  1471. void ram_write_tracking_stop(void)
  1472. {
  1473. RAMState *rs = ram_state;
  1474. RAMBlock *block;
  1475. RCU_READ_LOCK_GUARD();
  1476. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  1477. if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
  1478. continue;
  1479. }
  1480. uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
  1481. trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
  1482. block->host, block->max_length);
  1483. /* Cleanup flags and remove reference */
  1484. block->flags &= ~RAM_UF_WRITEPROTECT;
  1485. memory_region_unref(block->mr);
  1486. }
  1487. /* Finally close UFFD file descriptor */
  1488. uffd_close_fd(rs->uffdio_fd);
  1489. rs->uffdio_fd = -1;
  1490. }
  1491. #else
  1492. /* No target OS support, stubs just fail or ignore */
  1493. static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
  1494. {
  1495. (void) rs;
  1496. (void) offset;
  1497. return NULL;
  1498. }
  1499. static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
  1500. unsigned long start_page)
  1501. {
  1502. (void) rs;
  1503. (void) pss;
  1504. (void) start_page;
  1505. return 0;
  1506. }
  1507. bool ram_write_tracking_available(void)
  1508. {
  1509. return false;
  1510. }
  1511. bool ram_write_tracking_compatible(void)
  1512. {
  1513. g_assert_not_reached();
  1514. }
  1515. int ram_write_tracking_start(void)
  1516. {
  1517. g_assert_not_reached();
  1518. }
  1519. void ram_write_tracking_stop(void)
  1520. {
  1521. g_assert_not_reached();
  1522. }
  1523. #endif /* defined(__linux__) */
  1524. /**
  1525. * get_queued_page: unqueue a page from the postcopy requests
  1526. *
  1527. * Skips pages that are already sent (!dirty)
  1528. *
  1529. * Returns true if a queued page is found
  1530. *
  1531. * @rs: current RAM state
  1532. * @pss: data about the state of the current dirty page scan
  1533. */
  1534. static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
  1535. {
  1536. RAMBlock *block;
  1537. ram_addr_t offset;
  1538. bool dirty = false;
  1539. do {
  1540. block = unqueue_page(rs, &offset);
  1541. /*
  1542. * We're sending this page, and since it's postcopy nothing else
  1543. * will dirty it, and we must make sure it doesn't get sent again
  1544. * even if this queue request was received after the background
  1545. * search already sent it.
  1546. */
  1547. if (block) {
  1548. unsigned long page;
  1549. page = offset >> TARGET_PAGE_BITS;
  1550. dirty = test_bit(page, block->bmap);
  1551. if (!dirty) {
  1552. trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
  1553. page);
  1554. } else {
  1555. trace_get_queued_page(block->idstr, (uint64_t)offset, page);
  1556. }
  1557. }
  1558. } while (block && !dirty);
  1559. if (!block) {
  1560. /*
  1561. * Poll write faults too if background snapshot is enabled; that's
  1562. * when vCPUs may be blocked on write-protected pages.
  1563. */
  1564. block = poll_fault_page(rs, &offset);
  1565. }
  1566. if (block) {
  1567. /*
  1568. * We want the background search to continue from the queued page
  1569. * since the guest is likely to want other pages near to the page
  1570. * it just requested.
  1571. */
  1572. pss->block = block;
  1573. pss->page = offset >> TARGET_PAGE_BITS;
  1574. /*
  1575. * This unqueued page would break the "one round" check, even if
  1576. * it is really rare.
  1577. */
  1578. pss->complete_round = false;
  1579. }
  1580. return !!block;
  1581. }
  1582. /**
  1583. * migration_page_queue_free: drop any remaining pages in the ram
  1584. * request queue
  1585. *
  1586. * It should be empty at the end anyway, but in error cases there may
  1587. * be some left; if any pages are left, we drop them.
  1588. *
  1589. */
  1590. static void migration_page_queue_free(RAMState *rs)
  1591. {
  1592. struct RAMSrcPageRequest *mspr, *next_mspr;
  1593. /* This queue generally should be empty - but in the case of a failed
  1594. * migration might have some droppings in.
  1595. */
  1596. RCU_READ_LOCK_GUARD();
  1597. QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
  1598. memory_region_unref(mspr->rb->mr);
  1599. QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
  1600. g_free(mspr);
  1601. }
  1602. }
  1603. /**
  1604. * ram_save_queue_pages: queue the page for transmission
  1605. *
  1606. * A request from postcopy destination for example.
  1607. *
  1608. * Returns zero on success or negative on error
  1609. *
  1610. * @rbname: Name of the RAMBlock of the request. NULL means the
  1611. * same as the last one.
  1612. * @start: starting address from the start of the RAMBlock
  1613. * @len: length (in bytes) to send
  1614. */
  1615. int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len,
  1616. Error **errp)
  1617. {
  1618. RAMBlock *ramblock;
  1619. RAMState *rs = ram_state;
  1620. stat64_add(&mig_stats.postcopy_requests, 1);
  1621. RCU_READ_LOCK_GUARD();
  1622. if (!rbname) {
  1623. /* Reuse last RAMBlock */
  1624. ramblock = rs->last_req_rb;
  1625. if (!ramblock) {
  1626. /*
  1627. * Shouldn't happen, we can't reuse the last RAMBlock if
  1628. * it's the 1st request.
  1629. */
  1630. error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no previous block");
  1631. return -1;
  1632. }
  1633. } else {
  1634. ramblock = qemu_ram_block_by_name(rbname);
  1635. if (!ramblock) {
  1636. /* We shouldn't be asked for a non-existent RAMBlock */
  1637. error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no block '%s'", rbname);
  1638. return -1;
  1639. }
  1640. rs->last_req_rb = ramblock;
  1641. }
  1642. trace_ram_save_queue_pages(ramblock->idstr, start, len);
  1643. if (!offset_in_ramblock(ramblock, start + len - 1)) {
  1644. error_setg(errp, "MIG_RP_MSG_REQ_PAGES request overrun, "
  1645. "start=" RAM_ADDR_FMT " len="
  1646. RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
  1647. start, len, ramblock->used_length);
  1648. return -1;
  1649. }
  1650. /*
  1651. * When with postcopy preempt, we send back the page directly in the
  1652. * rp-return thread.
  1653. */
  1654. if (postcopy_preempt_active()) {
  1655. ram_addr_t page_start = start >> TARGET_PAGE_BITS;
  1656. size_t page_size = qemu_ram_pagesize(ramblock);
  1657. PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
  1658. int ret = 0;
  1659. qemu_mutex_lock(&rs->bitmap_mutex);
  1660. pss_init(pss, ramblock, page_start);
  1661. /*
  1662. * Always use the preempt channel, and make sure it's there. It's
  1663. * safe to access without the lock, because while the rp-thread is
  1664. * running we are the only ones operating on this qemufile
  1665. */
  1666. pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
  1667. assert(pss->pss_channel);
  1668. /*
  1669. * The length must be a multiple of the host page size. Just
  1670. * assert; if something is wrong we're mostly split-brain anyway.
  1671. */
  1672. assert(len % page_size == 0);
  1673. while (len) {
  1674. if (ram_save_host_page_urgent(pss)) {
  1675. error_setg(errp, "ram_save_host_page_urgent() failed: "
  1676. "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
  1677. ramblock->idstr, start);
  1678. ret = -1;
  1679. break;
  1680. }
  1681. /*
  1682. * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
  1683. * will automatically be moved and point to the next host page
  1684. * we're going to send, so no need to update here.
  1685. *
  1686. * Normally QEMU never sends more than one host page per request, so
  1687. * logically the loop should only run once and this isn't strictly
  1688. * needed, but keep it for consistency.
  1689. */
  1690. len -= page_size;
  1691. }
  1692. qemu_mutex_unlock(&rs->bitmap_mutex);
  1693. return ret;
  1694. }
  1695. struct RAMSrcPageRequest *new_entry =
  1696. g_new0(struct RAMSrcPageRequest, 1);
  1697. new_entry->rb = ramblock;
  1698. new_entry->offset = start;
  1699. new_entry->len = len;
  1700. memory_region_ref(ramblock->mr);
  1701. qemu_mutex_lock(&rs->src_page_req_mutex);
  1702. QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
  1703. migration_make_urgent_request();
  1704. qemu_mutex_unlock(&rs->src_page_req_mutex);
  1705. return 0;
  1706. }
  1707. /**
  1708. * ram_save_target_page: save one target page to the precopy thread
  1709. * OR to multifd workers.
  1710. *
  1711. * @rs: current RAM state
  1712. * @pss: data about the page we want to send
  1713. */
  1714. static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
  1715. {
  1716. ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
  1717. int res;
  1718. if (!migrate_multifd()
  1719. || migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) {
  1720. if (save_zero_page(rs, pss, offset)) {
  1721. return 1;
  1722. }
  1723. }
  1724. if (migrate_multifd()) {
  1725. RAMBlock *block = pss->block;
  1726. return ram_save_multifd_page(block, offset);
  1727. }
  1728. if (control_save_page(pss, offset, &res)) {
  1729. return res;
  1730. }
  1731. return ram_save_page(rs, pss);
  1732. }
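/*
 * The function above dispatches a dirty target page in this order:
 *   1) zero-page detection (only when multifd is off, or when multifd is on
 *      with the legacy zero-page-detection setting),
 *   2) the multifd workers, if multifd is enabled,
 *   3) the RDMA control path, if it claims the page,
 *   4) the regular stream via ram_save_page() (XBZRLE or a full copy).
 */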
  1733. /* Should be called before sending a host page */
  1734. static void pss_host_page_prepare(PageSearchStatus *pss)
  1735. {
  1736. /* How many guest pages are there in one host page? */
  1737. size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
  1738. pss->host_page_sending = true;
  1739. if (guest_pfns <= 1) {
  1740. /*
  1741. * This covers both the case when guest psize == host psize and the
  1742. * case when the guest has a larger psize than the host (guest_pfns==0).
  1743. *
  1744. * For the latter, we always send one whole guest page per
  1745. * iteration of the host page (example: an Alpha VM on x86 host
  1746. * will have guest psize 8K while host psize 4K).
  1747. */
  1748. pss->host_page_start = pss->page;
  1749. pss->host_page_end = pss->page + 1;
  1750. } else {
  1751. /*
  1752. * The host page spans over multiple guest pages, we send them
  1753. * within the same host page iteration.
  1754. */
  1755. pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
  1756. pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
  1757. }
  1758. }
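/*
 * Worked example (assuming a 4KiB TARGET_PAGE_SIZE): for a RAMBlock backed
 * by 2MiB huge pages, guest_pfns == 512.  With pss->page == 1000, the host
 * page boundaries become host_page_start == 512 and host_page_end == 1024,
 * so target pages 512..1023 are sent within this host-page iteration.
 */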
  1759. /*
  1760. * Whether the page pointed by PSS is within the host page being sent.
  1761. * Must be called after a previous pss_host_page_prepare().
  1762. */
  1763. static bool pss_within_range(PageSearchStatus *pss)
  1764. {
  1765. ram_addr_t ram_addr;
  1766. assert(pss->host_page_sending);
  1767. /* Over host-page boundary? */
  1768. if (pss->page >= pss->host_page_end) {
  1769. return false;
  1770. }
  1771. ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
  1772. return offset_in_ramblock(pss->block, ram_addr);
  1773. }
  1774. static void pss_host_page_finish(PageSearchStatus *pss)
  1775. {
  1776. pss->host_page_sending = false;
  1777. /* This is not needed, but just to reset it */
  1778. pss->host_page_start = pss->host_page_end = 0;
  1779. }
  1780. /*
  1781. * Send an urgent host page specified by `pss'. Must be called with
  1782. * bitmap_mutex held.
  1783. *
  1784. * Returns 0 if saving the host page succeeded, negative otherwise.
  1785. */
  1786. static int ram_save_host_page_urgent(PageSearchStatus *pss)
  1787. {
  1788. bool page_dirty, sent = false;
  1789. RAMState *rs = ram_state;
  1790. int ret = 0;
  1791. trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
  1792. pss_host_page_prepare(pss);
  1793. /*
  1794. * If precopy is sending the same page, let it be done in precopy, or
  1795. * we could send the same page in two channels and none of them will
  1796. * receive the whole page.
  1797. */
  1798. if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
  1799. trace_postcopy_preempt_hit(pss->block->idstr,
  1800. pss->page << TARGET_PAGE_BITS);
  1801. return 0;
  1802. }
  1803. do {
  1804. page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
  1805. if (page_dirty) {
  1806. /* Be strict about the return code; it must be exactly 1 (one page saved) */
  1807. if (ram_save_target_page(rs, pss) != 1) {
  1808. error_report_once("%s: ram_save_target_page failed", __func__);
  1809. ret = -1;
  1810. goto out;
  1811. }
  1812. sent = true;
  1813. }
  1814. pss_find_next_dirty(pss);
  1815. } while (pss_within_range(pss));
  1816. out:
  1817. pss_host_page_finish(pss);
  1818. /* For urgent requests, flush immediately if sent */
  1819. if (sent) {
  1820. qemu_fflush(pss->pss_channel);
  1821. }
  1822. return ret;
  1823. }
  1824. /**
  1825. * ram_save_host_page: save a whole host page
  1826. *
  1827. * Starting at *offset send pages up to the end of the current host
  1828. * page. It's valid for the initial offset to point into the middle of
  1829. * a host page in which case the remainder of the hostpage is sent.
  1830. * Only dirty target pages are sent. Note that the host page size may
  1831. * be a huge page for this block.
  1832. *
  1833. * The saving stops at the boundary of the used_length of the block
  1834. * if the RAMBlock isn't a multiple of the host page size.
  1835. *
  1836. * The caller must hold ram_state.bitmap_mutex when calling this
  1837. * function. Note that this function can temporarily release the lock, but
  1838. * it makes sure the lock is held again before it returns.
  1839. *
  1840. * Returns the number of pages written or negative on error
  1841. *
  1842. * @rs: current RAM state
  1843. * @pss: data about the page we want to send
  1844. */
  1845. static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
  1846. {
  1847. bool page_dirty, preempt_active = postcopy_preempt_active();
  1848. int tmppages, pages = 0;
  1849. size_t pagesize_bits =
  1850. qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
  1851. unsigned long start_page = pss->page;
  1852. int res;
  1853. if (migrate_ram_is_ignored(pss->block)) {
  1854. error_report("block %s should not be migrated !", pss->block->idstr);
  1855. return 0;
  1856. }
  1857. /* Update host page boundary information */
  1858. pss_host_page_prepare(pss);
  1859. do {
  1860. page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
  1861. /* Check whether the page is dirty and, if so, send it */
  1862. if (page_dirty) {
  1863. /*
  1864. * Properly yield the lock only in postcopy preempt mode
  1865. * because both migration thread and rp-return thread can
  1866. * operate on the bitmaps.
  1867. */
  1868. if (preempt_active) {
  1869. qemu_mutex_unlock(&rs->bitmap_mutex);
  1870. }
  1871. tmppages = ram_save_target_page(rs, pss);
  1872. if (tmppages >= 0) {
  1873. pages += tmppages;
  1874. /*
  1875. * Allow rate limiting to happen in the middle of huge pages if
  1876. * something is sent in the current iteration.
  1877. */
  1878. if (pagesize_bits > 1 && tmppages > 0) {
  1879. migration_rate_limit();
  1880. }
  1881. }
  1882. if (preempt_active) {
  1883. qemu_mutex_lock(&rs->bitmap_mutex);
  1884. }
  1885. } else {
  1886. tmppages = 0;
  1887. }
  1888. if (tmppages < 0) {
  1889. pss_host_page_finish(pss);
  1890. return tmppages;
  1891. }
  1892. pss_find_next_dirty(pss);
  1893. } while (pss_within_range(pss));
  1894. pss_host_page_finish(pss);
  1895. res = ram_save_release_protection(rs, pss, start_page);
  1896. return (res < 0 ? res : pages);
  1897. }
  1898. /**
  1899. * ram_find_and_save_block: finds a dirty page and sends it to the stream
  1900. *
  1901. * Called within an RCU critical section.
  1902. *
  1903. * Returns the number of pages written where zero means no dirty pages,
  1904. * or negative on error
  1905. *
  1906. * @rs: current RAM state
  1907. *
  1908. * On systems where host-page-size > target-page-size it will send all the
  1909. * pages in a host page that are dirty.
  1910. */
  1911. static int ram_find_and_save_block(RAMState *rs)
  1912. {
  1913. PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
  1914. int pages = 0;
  1915. /* No dirty page as there is zero RAM */
  1916. if (!rs->ram_bytes_total) {
  1917. return pages;
  1918. }
  1919. /*
  1920. * Always keep last_seen_block/last_page valid during this procedure,
  1921. * because find_dirty_block() relies on these values (e.g., we compare
  1922. * last_seen_block with pss.block to see whether we searched all the
  1923. * ramblocks) to detect the completion of migration. Having a NULL value
  1924. * of last_seen_block can conditionally cause the loop below to run forever.
  1925. */
  1926. if (!rs->last_seen_block) {
  1927. rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
  1928. rs->last_page = 0;
  1929. }
  1930. pss_init(pss, rs->last_seen_block, rs->last_page);
  1931. while (true) {
  1932. if (!get_queued_page(rs, pss)) {
  1933. /* priority queue empty, so just search for something dirty */
  1934. int res = find_dirty_block(rs, pss);
  1935. if (res != PAGE_DIRTY_FOUND) {
  1936. if (res == PAGE_ALL_CLEAN) {
  1937. break;
  1938. } else if (res == PAGE_TRY_AGAIN) {
  1939. continue;
  1940. } else if (res < 0) {
  1941. pages = res;
  1942. break;
  1943. }
  1944. }
  1945. }
  1946. pages = ram_save_host_page(rs, pss);
  1947. if (pages) {
  1948. break;
  1949. }
  1950. }
  1951. rs->last_seen_block = pss->block;
  1952. rs->last_page = pss->page;
  1953. return pages;
  1954. }
  1955. static uint64_t ram_bytes_total_with_ignored(void)
  1956. {
  1957. RAMBlock *block;
  1958. uint64_t total = 0;
  1959. RCU_READ_LOCK_GUARD();
  1960. RAMBLOCK_FOREACH_MIGRATABLE(block) {
  1961. total += block->used_length;
  1962. }
  1963. return total;
  1964. }
  1965. uint64_t ram_bytes_total(void)
  1966. {
  1967. RAMBlock *block;
  1968. uint64_t total = 0;
  1969. RCU_READ_LOCK_GUARD();
  1970. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  1971. total += block->used_length;
  1972. }
  1973. return total;
  1974. }
  1975. static void xbzrle_load_setup(void)
  1976. {
  1977. XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
  1978. }
  1979. static void xbzrle_load_cleanup(void)
  1980. {
  1981. g_free(XBZRLE.decoded_buf);
  1982. XBZRLE.decoded_buf = NULL;
  1983. }
  1984. static void ram_state_cleanup(RAMState **rsp)
  1985. {
  1986. if (*rsp) {
  1987. migration_page_queue_free(*rsp);
  1988. qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
  1989. qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
  1990. g_free(*rsp);
  1991. *rsp = NULL;
  1992. }
  1993. }
  1994. static void xbzrle_cleanup(void)
  1995. {
  1996. XBZRLE_cache_lock();
  1997. if (XBZRLE.cache) {
  1998. cache_fini(XBZRLE.cache);
  1999. g_free(XBZRLE.encoded_buf);
  2000. g_free(XBZRLE.current_buf);
  2001. g_free(XBZRLE.zero_target_page);
  2002. XBZRLE.cache = NULL;
  2003. XBZRLE.encoded_buf = NULL;
  2004. XBZRLE.current_buf = NULL;
  2005. XBZRLE.zero_target_page = NULL;
  2006. }
  2007. XBZRLE_cache_unlock();
  2008. }
  2009. static void ram_bitmaps_destroy(void)
  2010. {
  2011. RAMBlock *block;
  2012. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  2013. g_free(block->clear_bmap);
  2014. block->clear_bmap = NULL;
  2015. g_free(block->bmap);
  2016. block->bmap = NULL;
  2017. g_free(block->file_bmap);
  2018. block->file_bmap = NULL;
  2019. }
  2020. }
  2021. static void ram_save_cleanup(void *opaque)
  2022. {
  2023. RAMState **rsp = opaque;
  2024. /* We don't use dirty log with background snapshots */
  2025. if (!migrate_background_snapshot()) {
  2026. /* The caller holds the BQL or runs in a BH, so there is
  2027. * no write race against the migration bitmap
  2028. */
  2029. if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
  2030. /*
  2031. * do not stop dirty log without starting it, since
  2032. * memory_global_dirty_log_stop will assert that
  2033. * memory_global_dirty_log_start/stop are used in pairs
  2034. */
  2035. memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
  2036. }
  2037. }
  2038. ram_bitmaps_destroy();
  2039. xbzrle_cleanup();
  2040. multifd_ram_save_cleanup();
  2041. ram_state_cleanup(rsp);
  2042. }
  2043. static void ram_state_reset(RAMState *rs)
  2044. {
  2045. int i;
  2046. for (i = 0; i < RAM_CHANNEL_MAX; i++) {
  2047. rs->pss[i].last_sent_block = NULL;
  2048. }
  2049. rs->last_seen_block = NULL;
  2050. rs->last_page = 0;
  2051. rs->last_version = ram_list.version;
  2052. rs->xbzrle_started = false;
  2053. }
  2054. #define MAX_WAIT 50 /* ms, half buffered_file limit */
  2055. /* **** functions for postcopy ***** */
  2056. void ram_postcopy_migrated_memory_release(MigrationState *ms)
  2057. {
  2058. struct RAMBlock *block;
  2059. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  2060. unsigned long *bitmap = block->bmap;
  2061. unsigned long range = block->used_length >> TARGET_PAGE_BITS;
  2062. unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
  2063. while (run_start < range) {
  2064. unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
  2065. ram_discard_range(block->idstr,
  2066. ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
  2067. ((ram_addr_t)(run_end - run_start))
  2068. << TARGET_PAGE_BITS);
  2069. run_start = find_next_zero_bit(bitmap, range, run_end + 1);
  2070. }
  2071. }
  2072. }
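/*
 * The loop above walks each block's dirty bitmap and discards every run of
 * clear bits, i.e. pages that are no longer dirty and hence already
 * transferred, so the source can release that memory during postcopy.
 */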
  2073. /**
  2074. * postcopy_send_discard_bm_ram: discard a RAMBlock
  2075. *
  2076. * Callback from postcopy_each_ram_send_discard for each RAMBlock
  2077. *
  2078. * @ms: current migration state
  2079. * @block: RAMBlock to discard
  2080. */
  2081. static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
  2082. {
  2083. unsigned long end = block->used_length >> TARGET_PAGE_BITS;
  2084. unsigned long current;
  2085. unsigned long *bitmap = block->bmap;
  2086. for (current = 0; current < end; ) {
  2087. unsigned long one = find_next_bit(bitmap, end, current);
  2088. unsigned long zero, discard_length;
  2089. if (one >= end) {
  2090. break;
  2091. }
  2092. zero = find_next_zero_bit(bitmap, end, one + 1);
  2093. if (zero >= end) {
  2094. discard_length = end - one;
  2095. } else {
  2096. discard_length = zero - one;
  2097. }
  2098. postcopy_discard_send_range(ms, one, discard_length);
  2099. current = one + discard_length;
  2100. }
  2101. }
  2102. static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
  2103. /**
  2104. * postcopy_each_ram_send_discard: discard all RAMBlocks
  2105. *
  2106. * Utility for the outgoing postcopy code.
  2107. * Calls postcopy_send_discard_bm_ram for each RAMBlock
  2108. * passing it bitmap indexes and name.
  2109. * (qemu_ram_foreach_block ends up passing unscaled lengths
  2110. * which would mean postcopy code would have to deal with target page)
  2111. *
  2112. * @ms: current migration state
  2113. */
  2114. static void postcopy_each_ram_send_discard(MigrationState *ms)
  2115. {
  2116. struct RAMBlock *block;
  2117. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  2118. postcopy_discard_send_init(ms, block->idstr);
  2119. /*
  2120. * Deal with TPS != HPS and huge pages. It discards any partially sent
  2121. * host-page size chunks and marks any partially dirty host-page size
  2122. * chunks as all dirty. In this case the host-page is the host-page
  2123. * for the particular RAMBlock, i.e. it might be a huge page.
  2124. */
  2125. postcopy_chunk_hostpages_pass(ms, block);
  2126. /*
  2127. * Postcopy sends chunks of bitmap over the wire, but it
  2128. * just needs indexes at this point, avoids it having
  2129. * target page specific code.
  2130. */
  2131. postcopy_send_discard_bm_ram(ms, block);
  2132. postcopy_discard_send_finish(ms);
  2133. }
  2134. }
  2135. /**
  2136. * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
  2137. *
  2138. * Helper called from postcopy_each_ram_send_discard() for each
  2139. * RAMBlock to canonicalize its dirty bitmap.
  2141. *
  2142. * Postcopy requires that all target pages in a hostpage are dirty or
  2143. * clean, not a mix. This function canonicalizes the bitmap.
  2144. *
  2145. * @ms: current migration state
  2146. * @block: block that contains the page we want to canonicalize
  2147. */
  2148. static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
  2149. {
  2150. RAMState *rs = ram_state;
  2151. unsigned long *bitmap = block->bmap;
  2152. unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
  2153. unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
  2154. unsigned long run_start;
  2155. if (block->page_size == TARGET_PAGE_SIZE) {
  2156. /* Easy case - TPS==HPS for a non-huge page RAMBlock */
  2157. return;
  2158. }
  2159. /* Find a dirty page */
  2160. run_start = find_next_bit(bitmap, pages, 0);
  2161. while (run_start < pages) {
  2162. /*
  2163. * If the start of this run of pages is in the middle of a host
  2164. * page, then we need to fixup this host page.
  2165. */
  2166. if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
  2167. /* Find the end of this run */
  2168. run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
  2169. /*
  2170. * If the end isn't at the start of a host page, then the
  2171. * run doesn't finish at the end of a host page
  2172. * and we need to discard.
  2173. */
  2174. }
  2175. if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
  2176. unsigned long page;
  2177. unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
  2178. host_ratio);
  2179. run_start = QEMU_ALIGN_UP(run_start, host_ratio);
  2180. /* Clean up the bitmap */
  2181. for (page = fixup_start_addr;
  2182. page < fixup_start_addr + host_ratio; page++) {
  2183. /*
  2184. * Remark them as dirty, updating the count for any pages
  2185. * that weren't previously dirty.
  2186. */
  2187. rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
  2188. }
  2189. }
  2190. /* Find the next dirty page for the next iteration */
  2191. run_start = find_next_bit(bitmap, pages, run_start);
  2192. }
  2193. }
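/*
 * Worked example (assuming a 4KiB target page and a 16KiB host page, so
 * host_ratio == 4): a dirty run covering target pages 5..9 is expanded so
 * that target pages 4..11 all end up dirty, i.e. the two host pages the
 * run touches are sent in full.
 */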
  2194. /**
  2195. * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
  2196. *
  2197. * Transmit the set of pages to be discarded after precopy to the target;
  2198. * these are pages that:
  2199. * a) have been previously transmitted but are now dirty again
  2200. * b) have never been transmitted; this ensures that any pages on the
  2201. * destination that have been mapped by background tasks get discarded
  2202. * (transparent huge pages are the specific concern)
  2203. * Hopefully this is pretty sparse.
  2204. *
  2205. * @ms: current migration state
  2206. */
  2207. void ram_postcopy_send_discard_bitmap(MigrationState *ms)
  2208. {
  2209. RAMState *rs = ram_state;
  2210. RCU_READ_LOCK_GUARD();
  2211. /* This should be our last sync, the src is now paused */
  2212. migration_bitmap_sync(rs, false);
  2213. /* Easiest way to make sure we don't resume in the middle of a host-page */
  2214. rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
  2215. rs->last_seen_block = NULL;
  2216. rs->last_page = 0;
  2217. postcopy_each_ram_send_discard(ms);
  2218. trace_ram_postcopy_send_discard_bitmap();
  2219. }
  2220. /**
  2221. * ram_discard_range: discard dirtied pages at the beginning of postcopy
  2222. *
  2223. * Returns zero on success
  2224. *
  2225. * @rbname: name of the RAMBlock of the request. NULL means the
  2226. * same as the last one.
  2227. * @start: byte offset within the RAMBlock
  2228. * @length: length in bytes to discard
  2229. */
  2230. int ram_discard_range(const char *rbname, uint64_t start, size_t length)
  2231. {
  2232. trace_ram_discard_range(rbname, start, length);
  2233. RCU_READ_LOCK_GUARD();
  2234. RAMBlock *rb = qemu_ram_block_by_name(rbname);
  2235. if (!rb) {
  2236. error_report("ram_discard_range: Failed to find block '%s'", rbname);
  2237. return -1;
  2238. }
  2239. /*
  2240. * On source VM, we don't need to update the received bitmap since
  2241. * we don't even have one.
  2242. */
  2243. if (rb->receivedmap) {
  2244. bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
  2245. length >> qemu_target_page_bits());
  2246. }
  2247. return ram_block_discard_range(rb, start, length);
  2248. }
  2249. /*
  2250. * For every allocation, we try not to crash the VM if the
  2251. * allocation fails.
  2252. */
  2253. static bool xbzrle_init(Error **errp)
  2254. {
  2255. if (!migrate_xbzrle()) {
  2256. return true;
  2257. }
  2258. XBZRLE_cache_lock();
  2259. XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
  2260. if (!XBZRLE.zero_target_page) {
  2261. error_setg(errp, "%s: Error allocating zero page", __func__);
  2262. goto err_out;
  2263. }
  2264. XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
  2265. TARGET_PAGE_SIZE, errp);
  2266. if (!XBZRLE.cache) {
  2267. goto free_zero_page;
  2268. }
  2269. XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
  2270. if (!XBZRLE.encoded_buf) {
  2271. error_setg(errp, "%s: Error allocating encoded_buf", __func__);
  2272. goto free_cache;
  2273. }
  2274. XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
  2275. if (!XBZRLE.current_buf) {
  2276. error_setg(errp, "%s: Error allocating current_buf", __func__);
  2277. goto free_encoded_buf;
  2278. }
  2279. /* We are all good */
  2280. XBZRLE_cache_unlock();
  2281. return true;
  2282. free_encoded_buf:
  2283. g_free(XBZRLE.encoded_buf);
  2284. XBZRLE.encoded_buf = NULL;
  2285. free_cache:
  2286. cache_fini(XBZRLE.cache);
  2287. XBZRLE.cache = NULL;
  2288. free_zero_page:
  2289. g_free(XBZRLE.zero_target_page);
  2290. XBZRLE.zero_target_page = NULL;
  2291. err_out:
  2292. XBZRLE_cache_unlock();
  2293. return false;
  2294. }
  2295. static bool ram_state_init(RAMState **rsp, Error **errp)
  2296. {
  2297. *rsp = g_try_new0(RAMState, 1);
  2298. if (!*rsp) {
  2299. error_setg(errp, "%s: Init ramstate fail", __func__);
  2300. return false;
  2301. }
  2302. qemu_mutex_init(&(*rsp)->bitmap_mutex);
  2303. qemu_mutex_init(&(*rsp)->src_page_req_mutex);
  2304. QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
  2305. (*rsp)->ram_bytes_total = ram_bytes_total();
  2306. /*
  2307. * Count the total number of pages used by ram blocks not including any
  2308. * gaps due to alignment or unplugs.
  2309. * This must match the initial value of the dirty bitmap.
  2310. */
  2311. (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
  2312. ram_state_reset(*rsp);
  2313. return true;
  2314. }
  2315. static void ram_list_init_bitmaps(void)
  2316. {
  2317. MigrationState *ms = migrate_get_current();
  2318. RAMBlock *block;
  2319. unsigned long pages;
  2320. uint8_t shift;
  2321. /* Skip setting bitmap if there is no RAM */
  2322. if (ram_bytes_total()) {
  2323. shift = ms->clear_bitmap_shift;
  2324. if (shift > CLEAR_BITMAP_SHIFT_MAX) {
  2325. error_report("clear_bitmap_shift (%u) too big, using "
  2326. "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
  2327. shift = CLEAR_BITMAP_SHIFT_MAX;
  2328. } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
  2329. error_report("clear_bitmap_shift (%u) too small, using "
  2330. "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
  2331. shift = CLEAR_BITMAP_SHIFT_MIN;
  2332. }
  2333. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  2334. pages = block->max_length >> TARGET_PAGE_BITS;
  2335. /*
  2336. * The initial dirty bitmap for migration must be set with all
  2337. * ones to make sure we'll migrate every guest RAM page to
  2338. * destination.
  2339. * Here we set RAMBlock.bmap all to 1 because when restarting a
  2340. * new migration after a failed one, ram_list.
  2341. * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
  2342. * guest memory.
  2343. */
  2344. block->bmap = bitmap_new(pages);
  2345. bitmap_set(block->bmap, 0, pages);
  2346. if (migrate_mapped_ram()) {
  2347. block->file_bmap = bitmap_new(pages);
  2348. }
  2349. block->clear_bmap_shift = shift;
  2350. block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
  2351. }
  2352. }
  2353. }
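/*
 * For scale: with a clear_bitmap_shift of 18 (the usual default is assumed
 * here) and a 4KiB target page, each clear_bmap bit covers 2^18 target
 * pages, i.e. 1GiB of guest memory per lazy dirty-log-clear chunk.
 */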
  2354. static void migration_bitmap_clear_discarded_pages(RAMState *rs)
  2355. {
  2356. unsigned long pages;
  2357. RAMBlock *rb;
  2358. RCU_READ_LOCK_GUARD();
  2359. RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
  2360. pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
  2361. rs->migration_dirty_pages -= pages;
  2362. }
  2363. }
  2364. static bool ram_init_bitmaps(RAMState *rs, Error **errp)
  2365. {
  2366. bool ret = true;
  2367. qemu_mutex_lock_ramlist();
  2368. WITH_RCU_READ_LOCK_GUARD() {
  2369. ram_list_init_bitmaps();
  2370. /* We don't use dirty log with background snapshots */
  2371. if (!migrate_background_snapshot()) {
  2372. ret = memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, errp);
  2373. if (!ret) {
  2374. goto out_unlock;
  2375. }
  2376. migration_bitmap_sync_precopy(false);
  2377. }
  2378. }
  2379. out_unlock:
  2380. qemu_mutex_unlock_ramlist();
  2381. if (!ret) {
  2382. ram_bitmaps_destroy();
  2383. return false;
  2384. }
  2385. /*
  2386. * After an eventual first bitmap sync, fixup the initial bitmap
  2387. * containing all 1s to exclude any discarded pages from migration.
  2388. */
  2389. migration_bitmap_clear_discarded_pages(rs);
  2390. return true;
  2391. }
  2392. static int ram_init_all(RAMState **rsp, Error **errp)
  2393. {
  2394. if (!ram_state_init(rsp, errp)) {
  2395. return -1;
  2396. }
  2397. if (!xbzrle_init(errp)) {
  2398. ram_state_cleanup(rsp);
  2399. return -1;
  2400. }
  2401. if (!ram_init_bitmaps(*rsp, errp)) {
  2402. return -1;
  2403. }
  2404. return 0;
  2405. }
  2406. static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
  2407. {
  2408. RAMBlock *block;
  2409. uint64_t pages = 0;
  2410. /*
  2411. * Postcopy is not using xbzrle/compression, so no need for that.
  2412. * Also, since the source is already halted, we don't need to care
  2413. * about dirty page logging either.
  2414. */
  2415. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  2416. pages += bitmap_count_one(block->bmap,
  2417. block->used_length >> TARGET_PAGE_BITS);
  2418. }
  2419. /* This may not be aligned with current bitmaps. Recalculate. */
  2420. rs->migration_dirty_pages = pages;
  2421. ram_state_reset(rs);
  2422. /* Update RAMState cache of output QEMUFile */
  2423. rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
  2424. trace_ram_state_resume_prepare(pages);
  2425. }
  2426. /*
  2427. * This function clears bits of the free pages reported by the caller from the
  2428. * migration dirty bitmap. @addr is the host address corresponding to the
  2429. * start of the continuous guest free pages, and @len is the total bytes of
  2430. * those pages.
  2431. */
  2432. void qemu_guest_free_page_hint(void *addr, size_t len)
  2433. {
  2434. RAMBlock *block;
  2435. ram_addr_t offset;
  2436. size_t used_len, start, npages;
  2437. /* This function is currently expected to be used during live migration */
  2438. if (!migration_is_running()) {
  2439. return;
  2440. }
  2441. for (; len > 0; len -= used_len, addr += used_len) {
  2442. block = qemu_ram_block_from_host(addr, false, &offset);
  2443. if (unlikely(!block || offset >= block->used_length)) {
  2444. /*
  2445. * The implementation might not support RAMBlock resize during
  2446. * live migration, but it could happen in theory with future
  2447. * updates. So we add a check here to capture that case.
  2448. */
  2449. error_report_once("%s unexpected error", __func__);
  2450. return;
  2451. }
  2452. if (len <= block->used_length - offset) {
  2453. used_len = len;
  2454. } else {
  2455. used_len = block->used_length - offset;
  2456. }
  2457. start = offset >> TARGET_PAGE_BITS;
  2458. npages = used_len >> TARGET_PAGE_BITS;
  2459. qemu_mutex_lock(&ram_state->bitmap_mutex);
  2460. /*
  2461. * The skipped free pages are equivalent to having been sent from clear_bmap's
  2462. * perspective, so clear the bits from the memory region bitmap which
  2463. * are initially set. Otherwise those skipped pages will be sent in
  2464. * the next round after syncing from the memory region bitmap.
  2465. */
  2466. migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
  2467. ram_state->migration_dirty_pages -=
  2468. bitmap_count_one_with_offset(block->bmap, start, npages);
  2469. bitmap_clear(block->bmap, start, npages);
  2470. qemu_mutex_unlock(&ram_state->bitmap_mutex);
  2471. }
  2472. }
  2473. #define MAPPED_RAM_HDR_VERSION 1
  2474. struct MappedRamHeader {
  2475. uint32_t version;
  2476. /*
  2477. * The target's page size, so we know how many pages are in the
  2478. * bitmap.
  2479. */
  2480. uint64_t page_size;
  2481. /*
  2482. * The offset in the migration file where the pages bitmap is
  2483. * stored.
  2484. */
  2485. uint64_t bitmap_offset;
  2486. /*
  2487. * The offset in the migration file where the actual pages (data)
  2488. * are stored.
  2489. */
  2490. uint64_t pages_offset;
  2491. } QEMU_PACKED;
  2492. typedef struct MappedRamHeader MappedRamHeader;
  2493. static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block)
  2494. {
  2495. g_autofree MappedRamHeader *header = NULL;
  2496. size_t header_size, bitmap_size;
  2497. long num_pages;
  2498. header = g_new0(MappedRamHeader, 1);
  2499. header_size = sizeof(MappedRamHeader);
  2500. num_pages = block->used_length >> TARGET_PAGE_BITS;
  2501. bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
  2502. /*
  2503. * Save the file offsets of where the bitmap and the pages should
  2504. * go as they are written at the end of migration and during the
  2505. * iterative phase, respectively.
  2506. */
  2507. block->bitmap_offset = qemu_get_offset(file) + header_size;
  2508. block->pages_offset = ROUND_UP(block->bitmap_offset +
  2509. bitmap_size,
  2510. MAPPED_RAM_FILE_OFFSET_ALIGNMENT);
  2511. header->version = cpu_to_be32(MAPPED_RAM_HDR_VERSION);
  2512. header->page_size = cpu_to_be64(TARGET_PAGE_SIZE);
  2513. header->bitmap_offset = cpu_to_be64(block->bitmap_offset);
  2514. header->pages_offset = cpu_to_be64(block->pages_offset);
  2515. qemu_put_buffer(file, (uint8_t *) header, header_size);
  2516. /* prepare offset for next ramblock */
  2517. qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET);
  2518. }
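/*
 * Resulting per-RAMBlock layout in the migration file (offsets as computed
 * above):
 *
 *   +-----------------+---------------+---- padding ----+----------------+
 *   | MappedRamHeader | dirty bitmap  | up to alignment | page data      |
 *   | (header_size)   | (bitmap_size) |                 | (used_length)  |
 *   +-----------------+---------------+-----------------+----------------+
 *
 * The next RAMBlock's header starts right after the page data, thanks to
 * the qemu_set_offset() call above.
 */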
  2519. static bool mapped_ram_read_header(QEMUFile *file, MappedRamHeader *header,
  2520. Error **errp)
  2521. {
  2522. size_t ret, header_size = sizeof(MappedRamHeader);
  2523. ret = qemu_get_buffer(file, (uint8_t *)header, header_size);
  2524. if (ret != header_size) {
  2525. error_setg(errp, "Could not read whole mapped-ram migration header "
  2526. "(expected %zd, got %zd bytes)", header_size, ret);
  2527. return false;
  2528. }
  2529. /* migration stream is big-endian */
  2530. header->version = be32_to_cpu(header->version);
  2531. if (header->version > MAPPED_RAM_HDR_VERSION) {
  2532. error_setg(errp, "Migration mapped-ram capability version not "
  2533. "supported (expected <= %d, got %d)", MAPPED_RAM_HDR_VERSION,
  2534. header->version);
  2535. return false;
  2536. }
  2537. header->page_size = be64_to_cpu(header->page_size);
  2538. header->bitmap_offset = be64_to_cpu(header->bitmap_offset);
  2539. header->pages_offset = be64_to_cpu(header->pages_offset);
  2540. return true;
  2541. }
  2542. /*
  2543. * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
  2544. * long-running RCU critical section. When rcu-reclaims in the code
  2545. * start to become numerous it will be necessary to reduce the
  2546. * granularity of these critical sections.
  2547. */
  2548. /**
  2549. * ram_save_setup: Setup RAM for migration
  2550. *
  2551. * Returns zero to indicate success and negative for error
  2552. *
  2553. * @f: QEMUFile where to send the data
  2554. * @opaque: RAMState pointer
  2555. * @errp: pointer to Error*, to store an error if it happens.
  2556. */
  2557. static int ram_save_setup(QEMUFile *f, void *opaque, Error **errp)
  2558. {
  2559. RAMState **rsp = opaque;
  2560. RAMBlock *block;
  2561. int ret, max_hg_page_size;
  2562. /* migration has already setup the bitmap, reuse it. */
  2563. if (!migration_in_colo_state()) {
  2564. if (ram_init_all(rsp, errp) != 0) {
  2565. return -1;
  2566. }
  2567. }
  2568. (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
  2569. /*
  2570. * ??? Mirrors the previous value of qemu_host_page_size,
  2571. * but is this really what was intended for the migration?
  2572. */
  2573. max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE);
  2574. WITH_RCU_READ_LOCK_GUARD() {
  2575. qemu_put_be64(f, ram_bytes_total_with_ignored()
  2576. | RAM_SAVE_FLAG_MEM_SIZE);
  2577. RAMBLOCK_FOREACH_MIGRATABLE(block) {
  2578. qemu_put_byte(f, strlen(block->idstr));
  2579. qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
  2580. qemu_put_be64(f, block->used_length);
  2581. if (migrate_postcopy_ram() &&
  2582. block->page_size != max_hg_page_size) {
  2583. qemu_put_be64(f, block->page_size);
  2584. }
  2585. if (migrate_ignore_shared()) {
  2586. qemu_put_be64(f, block->mr->addr);
  2587. }
  2588. if (migrate_mapped_ram()) {
  2589. mapped_ram_setup_ramblock(f, block);
  2590. }
  2591. }
  2592. }
  2593. ret = rdma_registration_start(f, RAM_CONTROL_SETUP);
  2594. if (ret < 0) {
  2595. error_setg(errp, "%s: failed to start RDMA registration", __func__);
  2596. qemu_file_set_error(f, ret);
  2597. return ret;
  2598. }
  2599. ret = rdma_registration_stop(f, RAM_CONTROL_SETUP);
  2600. if (ret < 0) {
  2601. error_setg(errp, "%s: failed to stop RDMA registration", __func__);
  2602. qemu_file_set_error(f, ret);
  2603. return ret;
  2604. }
  2605. if (migrate_multifd()) {
  2606. multifd_ram_save_setup();
  2607. }
  2608. /*
2609. * This operation is unfortunate...
  2610. *
  2611. * For legacy QEMUs using per-section sync
  2612. * =======================================
  2613. *
  2614. * This must exist because the EOS below requires the SYNC messages
  2615. * per-channel to work.
  2616. *
  2617. * For modern QEMUs using per-round sync
  2618. * =====================================
  2619. *
  2620. * Logically such sync is not needed, and recv threads should not run
2621. * until setup is ready (using things like channels_ready on src). Then
  2622. * we should be all fine.
  2623. *
  2624. * However even if we add channels_ready to recv side in new QEMUs, old
  2625. * QEMU won't have them so this sync will still be needed to make sure
  2626. * multifd recv threads won't start processing guest pages early before
  2627. * ram_load_setup() is properly done.
  2628. *
  2629. * Let's stick with this. Fortunately the overhead is low to sync
  2630. * during setup because the VM is running, so at least it's not
  2631. * accounted as part of downtime.
  2632. */
  2633. bql_unlock();
  2634. ret = multifd_ram_flush_and_sync(f);
  2635. bql_lock();
  2636. if (ret < 0) {
  2637. error_setg(errp, "%s: multifd synchronization failed", __func__);
  2638. return ret;
  2639. }
  2640. qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
  2641. ret = qemu_fflush(f);
  2642. if (ret < 0) {
  2643. error_setg_errno(errp, -ret, "%s failed", __func__);
  2644. }
  2645. return ret;
  2646. }
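/*
 * For reference, a sketch of what ram_save_setup() above emits on the
 * stream in the non-mapped-ram case (field widths as written by the
 * qemu_put_* calls; optional fields depend on the negotiated capabilities):
 *
 *   be64  ram_bytes_total_with_ignored() | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable ramblock:
 *       u8    strlen(idstr)
 *       bytes idstr
 *       be64  used_length
 *       be64  page_size            (only if postcopy-ram and page_size
 *                                    differs from max_hg_page_size)
 *       be64  block->mr->addr      (only if ignore-shared)
 *   be64  RAM_SAVE_FLAG_EOS
 *
 * RDMA registration and the multifd flush/sync happen between the ramblock
 * list and the final EOS.
 */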
  2647. static void ram_save_file_bmap(QEMUFile *f)
  2648. {
  2649. RAMBlock *block;
  2650. RAMBLOCK_FOREACH_MIGRATABLE(block) {
  2651. long num_pages = block->used_length >> TARGET_PAGE_BITS;
  2652. long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
  2653. qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size,
  2654. block->bitmap_offset);
  2655. ram_transferred_add(bitmap_size);
  2656. /*
  2657. * Free the bitmap here to catch any synchronization issues
  2658. * with multifd channels. No channels should be sending pages
  2659. * after we've written the bitmap to file.
  2660. */
  2661. g_free(block->file_bmap);
  2662. block->file_bmap = NULL;
  2663. }
  2664. }
  2665. void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, bool set)
  2666. {
  2667. if (set) {
  2668. set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap);
  2669. } else {
  2670. clear_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap);
  2671. }
  2672. }
  2673. /**
  2674. * ram_save_iterate: iterative stage for migration
  2675. *
  2676. * Returns zero to indicate success and negative for error
  2677. *
  2678. * @f: QEMUFile where to send the data
  2679. * @opaque: RAMState pointer
  2680. */
  2681. static int ram_save_iterate(QEMUFile *f, void *opaque)
  2682. {
  2683. RAMState **temp = opaque;
  2684. RAMState *rs = *temp;
  2685. int ret = 0;
  2686. int i;
  2687. int64_t t0;
  2688. int done = 0;
2689. /*
2690. * We'll hold this lock for a while, but that's okay for two reasons.
2691. * Firstly, the only other thread that may take it is the one calling
2692. * qemu_guest_free_page_hint(), which should be rare; secondly, see
2693. * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2694. * guarantees that we'll release it on a regular basis.
2695. */
  2696. WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) {
  2697. WITH_RCU_READ_LOCK_GUARD() {
  2698. if (ram_list.version != rs->last_version) {
  2699. ram_state_reset(rs);
  2700. }
  2701. /* Read version before ram_list.blocks */
  2702. smp_rmb();
  2703. ret = rdma_registration_start(f, RAM_CONTROL_ROUND);
  2704. if (ret < 0) {
  2705. qemu_file_set_error(f, ret);
  2706. goto out;
  2707. }
  2708. t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
  2709. i = 0;
  2710. while ((ret = migration_rate_exceeded(f)) == 0 ||
  2711. postcopy_has_request(rs)) {
  2712. int pages;
  2713. if (qemu_file_get_error(f)) {
  2714. break;
  2715. }
  2716. pages = ram_find_and_save_block(rs);
2717. /* no more pages to send */
  2718. if (pages == 0) {
  2719. done = 1;
  2720. break;
  2721. }
  2722. if (pages < 0) {
  2723. qemu_file_set_error(f, pages);
  2724. break;
  2725. }
  2726. rs->target_page_count += pages;
2727. /*
2728. * We want to check on the 1st loop iteration, just in case it was
2729. * the 1st time and we had to sync the dirty bitmap.
2730. * qemu_clock_get_ns() is a bit expensive, so we only check once
2731. * every few iterations.
2732. */
  2733. if ((i & 63) == 0) {
  2734. uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
  2735. 1000000;
  2736. if (t1 > MAX_WAIT) {
  2737. trace_ram_save_iterate_big_wait(t1, i);
  2738. break;
  2739. }
  2740. }
  2741. i++;
  2742. }
  2743. }
  2744. }
  2745. /*
  2746. * Must occur before EOS (or any QEMUFile operation)
  2747. * because of RDMA protocol.
  2748. */
  2749. ret = rdma_registration_stop(f, RAM_CONTROL_ROUND);
  2750. if (ret < 0) {
  2751. qemu_file_set_error(f, ret);
  2752. }
  2753. out:
  2754. if (ret >= 0 && migration_is_running()) {
  2755. if (multifd_ram_sync_per_section()) {
  2756. ret = multifd_ram_flush_and_sync(f);
  2757. if (ret < 0) {
  2758. return ret;
  2759. }
  2760. }
  2761. qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
  2762. ram_transferred_add(8);
  2763. ret = qemu_fflush(f);
  2764. }
  2765. if (ret < 0) {
  2766. return ret;
  2767. }
  2768. return done;
  2769. }
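/*
 * A worked example of the throttling in ram_save_iterate() above (numbers
 * are illustrative): the elapsed time is only sampled on every 64th
 * iteration ((i & 63) == 0) because qemu_clock_get_ns() is relatively
 * expensive, and the nanosecond delta is divided by 1000000 to get
 * milliseconds before being compared against MAX_WAIT. With MAX_WAIT of
 * e.g. 50 ms, the loop may therefore overshoot slightly: it keeps sending
 * pages until the next multiple-of-64 check notices that more than 50 ms
 * have passed since t0.
 */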
  2770. /**
  2771. * ram_save_complete: function called to send the remaining amount of ram
  2772. *
  2773. * Returns zero to indicate success or negative on error
  2774. *
  2775. * Called with the BQL
  2776. *
  2777. * @f: QEMUFile where to send the data
  2778. * @opaque: RAMState pointer
  2779. */
  2780. static int ram_save_complete(QEMUFile *f, void *opaque)
  2781. {
  2782. RAMState **temp = opaque;
  2783. RAMState *rs = *temp;
  2784. int ret = 0;
  2785. rs->last_stage = !migration_in_colo_state();
  2786. WITH_RCU_READ_LOCK_GUARD() {
  2787. if (!migration_in_postcopy()) {
  2788. migration_bitmap_sync_precopy(true);
  2789. }
  2790. ret = rdma_registration_start(f, RAM_CONTROL_FINISH);
  2791. if (ret < 0) {
  2792. qemu_file_set_error(f, ret);
  2793. return ret;
  2794. }
  2795. /* try transferring iterative blocks of memory */
  2796. /* flush all remaining blocks regardless of rate limiting */
  2797. qemu_mutex_lock(&rs->bitmap_mutex);
  2798. while (true) {
  2799. int pages;
  2800. pages = ram_find_and_save_block(rs);
2801. /* no more blocks to send */
  2802. if (pages == 0) {
  2803. break;
  2804. }
  2805. if (pages < 0) {
  2806. qemu_mutex_unlock(&rs->bitmap_mutex);
  2807. return pages;
  2808. }
  2809. }
  2810. qemu_mutex_unlock(&rs->bitmap_mutex);
  2811. ret = rdma_registration_stop(f, RAM_CONTROL_FINISH);
  2812. if (ret < 0) {
  2813. qemu_file_set_error(f, ret);
  2814. return ret;
  2815. }
  2816. }
  2817. if (multifd_ram_sync_per_section()) {
  2818. /*
  2819. * Only the old dest QEMU will need this sync, because each EOS
  2820. * will require one SYNC message on each channel.
  2821. */
  2822. ret = multifd_ram_flush_and_sync(f);
  2823. if (ret < 0) {
  2824. return ret;
  2825. }
  2826. }
  2827. if (migrate_mapped_ram()) {
  2828. ram_save_file_bmap(f);
  2829. if (qemu_file_get_error(f)) {
  2830. Error *local_err = NULL;
  2831. int err = qemu_file_get_error_obj(f, &local_err);
  2832. error_reportf_err(local_err, "Failed to write bitmap to file: ");
  2833. return -err;
  2834. }
  2835. }
  2836. qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
  2837. return qemu_fflush(f);
  2838. }
  2839. static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
  2840. uint64_t *can_postcopy)
  2841. {
  2842. RAMState **temp = opaque;
  2843. RAMState *rs = *temp;
  2844. uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
  2845. if (migrate_postcopy_ram()) {
  2846. /* We can do postcopy, and all the data is postcopiable */
  2847. *can_postcopy += remaining_size;
  2848. } else {
  2849. *must_precopy += remaining_size;
  2850. }
  2851. }
  2852. static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
  2853. uint64_t *can_postcopy)
  2854. {
  2855. RAMState **temp = opaque;
  2856. RAMState *rs = *temp;
  2857. uint64_t remaining_size;
  2858. if (!migration_in_postcopy()) {
  2859. bql_lock();
  2860. WITH_RCU_READ_LOCK_GUARD() {
  2861. migration_bitmap_sync_precopy(false);
  2862. }
  2863. bql_unlock();
  2864. }
  2865. remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
  2866. if (migrate_postcopy_ram()) {
  2867. /* We can do postcopy, and all the data is postcopiable */
  2868. *can_postcopy += remaining_size;
  2869. } else {
  2870. *must_precopy += remaining_size;
  2871. }
  2872. }
  2873. static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
  2874. {
  2875. unsigned int xh_len;
  2876. int xh_flags;
  2877. uint8_t *loaded_data;
  2878. /* extract RLE header */
  2879. xh_flags = qemu_get_byte(f);
  2880. xh_len = qemu_get_be16(f);
  2881. if (xh_flags != ENCODING_FLAG_XBZRLE) {
  2882. error_report("Failed to load XBZRLE page - wrong compression!");
  2883. return -1;
  2884. }
  2885. if (xh_len > TARGET_PAGE_SIZE) {
  2886. error_report("Failed to load XBZRLE page - len overflow!");
  2887. return -1;
  2888. }
  2889. loaded_data = XBZRLE.decoded_buf;
  2890. /* load data and decode */
  2891. /* it can change loaded_data to point to an internal buffer */
  2892. qemu_get_buffer_in_place(f, &loaded_data, xh_len);
  2893. /* decode RLE */
  2894. if (xbzrle_decode_buffer(loaded_data, xh_len, host,
  2895. TARGET_PAGE_SIZE) == -1) {
  2896. error_report("Failed to load XBZRLE page - decode error!");
  2897. return -1;
  2898. }
  2899. return 0;
  2900. }
  2901. /**
  2902. * ram_block_from_stream: read a RAMBlock id from the migration stream
  2903. *
  2904. * Must be called from within a rcu critical section.
  2905. *
  2906. * Returns a pointer from within the RCU-protected ram_list.
  2907. *
  2908. * @mis: the migration incoming state pointer
  2909. * @f: QEMUFile where to read the data from
  2910. * @flags: Page flags (mostly to see if it's a continuation of previous block)
  2911. * @channel: the channel we're using
  2912. */
  2913. static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
  2914. QEMUFile *f, int flags,
  2915. int channel)
  2916. {
  2917. RAMBlock *block = mis->last_recv_block[channel];
  2918. char id[256];
  2919. uint8_t len;
  2920. if (flags & RAM_SAVE_FLAG_CONTINUE) {
  2921. if (!block) {
  2922. error_report("Ack, bad migration stream!");
  2923. return NULL;
  2924. }
  2925. return block;
  2926. }
  2927. len = qemu_get_byte(f);
  2928. qemu_get_buffer(f, (uint8_t *)id, len);
  2929. id[len] = 0;
  2930. block = qemu_ram_block_by_name(id);
  2931. if (!block) {
  2932. error_report("Can't find block %s", id);
  2933. return NULL;
  2934. }
  2935. if (migrate_ram_is_ignored(block)) {
  2936. error_report("block %s should not be migrated !", id);
  2937. return NULL;
  2938. }
  2939. mis->last_recv_block[channel] = block;
  2940. return block;
  2941. }
  2942. static inline void *host_from_ram_block_offset(RAMBlock *block,
  2943. ram_addr_t offset)
  2944. {
  2945. if (!offset_in_ramblock(block, offset)) {
  2946. return NULL;
  2947. }
  2948. return block->host + offset;
  2949. }
  2950. static void *host_page_from_ram_block_offset(RAMBlock *block,
  2951. ram_addr_t offset)
  2952. {
  2953. /* Note: Explicitly no check against offset_in_ramblock(). */
  2954. return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
  2955. block->page_size);
  2956. }
  2957. static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
  2958. ram_addr_t offset)
  2959. {
  2960. return ((uintptr_t)block->host + offset) & (block->page_size - 1);
  2961. }
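/*
 * Worked example for the two helpers above, assuming a hugetlbfs-backed
 * block with a 2 MiB page_size and a (hypothetical) 2 MiB-aligned
 * block->host of 0x7f0000000000:
 *
 *   offset = 0x123456
 *   host_page_from_ram_block_offset()        -> 0x7f0000000000
 *       (block->host + offset aligned down to the 2 MiB boundary)
 *   host_page_offset_from_ram_block_offset() -> 0x123456
 *       ((block->host + offset) & (page_size - 1), i.e. the byte offset of
 *        the target page within its host page)
 */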
  2962. void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages)
  2963. {
  2964. qemu_mutex_lock(&ram_state->bitmap_mutex);
  2965. for (int i = 0; i < pages; i++) {
  2966. ram_addr_t offset = normal[i];
  2967. ram_state->migration_dirty_pages += !test_and_set_bit(
  2968. offset >> TARGET_PAGE_BITS,
  2969. block->bmap);
  2970. }
  2971. qemu_mutex_unlock(&ram_state->bitmap_mutex);
  2972. }
  2973. static inline void *colo_cache_from_block_offset(RAMBlock *block,
  2974. ram_addr_t offset, bool record_bitmap)
  2975. {
  2976. if (!offset_in_ramblock(block, offset)) {
  2977. return NULL;
  2978. }
  2979. if (!block->colo_cache) {
  2980. error_report("%s: colo_cache is NULL in block :%s",
  2981. __func__, block->idstr);
  2982. return NULL;
  2983. }
2984. /*
2985. * During a COLO checkpoint, we need a bitmap of these migrated pages.
2986. * It helps us decide which pages in the RAM cache should be flushed
2987. * into the VM's RAM later.
2988. */
  2989. if (record_bitmap) {
  2990. colo_record_bitmap(block, &offset, 1);
  2991. }
  2992. return block->colo_cache + offset;
  2993. }
  2994. /**
  2995. * ram_handle_zero: handle the zero page case
  2996. *
  2997. * If a page (or a whole RDMA chunk) has been
  2998. * determined to be zero, then zap it.
  2999. *
  3000. * @host: host address for the zero page
  3002. * @size: size of the zero page
  3003. */
  3004. void ram_handle_zero(void *host, uint64_t size)
  3005. {
  3006. if (!buffer_is_zero(host, size)) {
  3007. memset(host, 0, size);
  3008. }
  3009. }
  3010. static void colo_init_ram_state(void)
  3011. {
  3012. Error *local_err = NULL;
  3013. if (!ram_state_init(&ram_state, &local_err)) {
  3014. error_report_err(local_err);
  3015. }
  3016. }
3017. /*
3018. * colo cache: this is for the secondary VM, we cache the whole
3019. * memory of the secondary VM. The global lock must be held
3020. * to call this helper.
3021. */
  3022. int colo_init_ram_cache(void)
  3023. {
  3024. RAMBlock *block;
  3025. WITH_RCU_READ_LOCK_GUARD() {
  3026. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  3027. block->colo_cache = qemu_anon_ram_alloc(block->used_length,
  3028. NULL, false, false);
  3029. if (!block->colo_cache) {
  3030. error_report("%s: Can't alloc memory for COLO cache of block %s,"
  3031. "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
  3032. block->used_length);
  3033. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  3034. if (block->colo_cache) {
  3035. qemu_anon_ram_free(block->colo_cache, block->used_length);
  3036. block->colo_cache = NULL;
  3037. }
  3038. }
  3039. return -errno;
  3040. }
  3041. if (!machine_dump_guest_core(current_machine)) {
  3042. qemu_madvise(block->colo_cache, block->used_length,
  3043. QEMU_MADV_DONTDUMP);
  3044. }
  3045. }
  3046. }
3047. /*
3048. * Record the dirty pages that are sent by the PVM; we use this dirty bitmap
3049. * to decide which pages in the cache should be flushed into the SVM's RAM.
3050. * Here we use the same name 'ram_bitmap' as for migration.
3051. */
  3052. if (ram_bytes_total()) {
  3053. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  3054. unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
  3055. block->bmap = bitmap_new(pages);
  3056. }
  3057. }
  3058. colo_init_ram_state();
  3059. return 0;
  3060. }
  3061. /* TODO: duplicated with ram_init_bitmaps */
  3062. void colo_incoming_start_dirty_log(void)
  3063. {
  3064. RAMBlock *block = NULL;
  3065. Error *local_err = NULL;
  3066. /* For memory_global_dirty_log_start below. */
  3067. bql_lock();
  3068. qemu_mutex_lock_ramlist();
  3069. memory_global_dirty_log_sync(false);
  3070. WITH_RCU_READ_LOCK_GUARD() {
  3071. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  3072. ramblock_sync_dirty_bitmap(ram_state, block);
  3073. /* Discard this dirty bitmap record */
  3074. bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
  3075. }
  3076. if (!memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION,
  3077. &local_err)) {
  3078. error_report_err(local_err);
  3079. }
  3080. }
  3081. ram_state->migration_dirty_pages = 0;
  3082. qemu_mutex_unlock_ramlist();
  3083. bql_unlock();
  3084. }
3085. /* The global lock must be held to call this helper */
  3086. void colo_release_ram_cache(void)
  3087. {
  3088. RAMBlock *block;
  3089. memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
  3090. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  3091. g_free(block->bmap);
  3092. block->bmap = NULL;
  3093. }
  3094. WITH_RCU_READ_LOCK_GUARD() {
  3095. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  3096. if (block->colo_cache) {
  3097. qemu_anon_ram_free(block->colo_cache, block->used_length);
  3098. block->colo_cache = NULL;
  3099. }
  3100. }
  3101. }
  3102. ram_state_cleanup(&ram_state);
  3103. }
  3104. /**
  3105. * ram_load_setup: Setup RAM for migration incoming side
  3106. *
  3107. * Returns zero to indicate success and negative for error
  3108. *
  3109. * @f: QEMUFile where to receive the data
  3110. * @opaque: RAMState pointer
  3111. * @errp: pointer to Error*, to store an error if it happens.
  3112. */
  3113. static int ram_load_setup(QEMUFile *f, void *opaque, Error **errp)
  3114. {
  3115. xbzrle_load_setup();
  3116. ramblock_recv_map_init();
  3117. return 0;
  3118. }
  3119. static int ram_load_cleanup(void *opaque)
  3120. {
  3121. RAMBlock *rb;
  3122. RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
  3123. qemu_ram_block_writeback(rb);
  3124. }
  3125. xbzrle_load_cleanup();
  3126. RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
  3127. g_free(rb->receivedmap);
  3128. rb->receivedmap = NULL;
  3129. }
  3130. return 0;
  3131. }
  3132. /**
  3133. * ram_postcopy_incoming_init: allocate postcopy data structures
  3134. *
  3135. * Returns 0 for success and negative if there was one error
  3136. *
  3137. * @mis: current migration incoming state
  3138. *
3139. * Allocate data structures etc. needed by incoming migration with
3140. * postcopy-ram. postcopy-ram's similarly named
3141. * postcopy_ram_incoming_init does the work.
  3142. */
  3143. int ram_postcopy_incoming_init(MigrationIncomingState *mis)
  3144. {
  3145. return postcopy_ram_incoming_init(mis);
  3146. }
  3147. /**
  3148. * ram_load_postcopy: load a page in postcopy case
  3149. *
  3150. * Returns 0 for success or -errno in case of error
  3151. *
  3152. * Called in postcopy mode by ram_load().
  3153. * rcu_read_lock is taken prior to this being called.
  3154. *
3155. * @f: QEMUFile where to receive the data
  3156. * @channel: the channel to use for loading
  3157. */
  3158. int ram_load_postcopy(QEMUFile *f, int channel)
  3159. {
  3160. int flags = 0, ret = 0;
  3161. bool place_needed = false;
  3162. bool matches_target_page_size = false;
  3163. MigrationIncomingState *mis = migration_incoming_get_current();
  3164. PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
  3165. while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
  3166. ram_addr_t addr;
  3167. void *page_buffer = NULL;
  3168. void *place_source = NULL;
  3169. RAMBlock *block = NULL;
  3170. uint8_t ch;
  3171. addr = qemu_get_be64(f);
3172. /*
3173. * If there is a qemu file error, we should stop here; "addr"
3174. * may be invalid.
3175. */
  3176. ret = qemu_file_get_error(f);
  3177. if (ret) {
  3178. break;
  3179. }
  3180. flags = addr & ~TARGET_PAGE_MASK;
  3181. addr &= TARGET_PAGE_MASK;
  3182. trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
  3183. if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
  3184. block = ram_block_from_stream(mis, f, flags, channel);
  3185. if (!block) {
  3186. ret = -EINVAL;
  3187. break;
  3188. }
  3189. /*
  3190. * Relying on used_length is racy and can result in false positives.
  3191. * We might place pages beyond used_length in case RAM was shrunk
  3192. * while in postcopy, which is fine - trying to place via
  3193. * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
  3194. */
  3195. if (!block->host || addr >= block->postcopy_length) {
  3196. error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
  3197. ret = -EINVAL;
  3198. break;
  3199. }
  3200. tmp_page->target_pages++;
  3201. matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
  3202. /*
  3203. * Postcopy requires that we place whole host pages atomically;
  3204. * these may be huge pages for RAMBlocks that are backed by
  3205. * hugetlbfs.
  3206. * To make it atomic, the data is read into a temporary page
  3207. * that's moved into place later.
  3208. * The migration protocol uses, possibly smaller, target-pages
  3209. * however the source ensures it always sends all the components
  3210. * of a host page in one chunk.
  3211. */
  3212. page_buffer = tmp_page->tmp_huge_page +
  3213. host_page_offset_from_ram_block_offset(block, addr);
  3214. /* If all TP are zero then we can optimise the place */
  3215. if (tmp_page->target_pages == 1) {
  3216. tmp_page->host_addr =
  3217. host_page_from_ram_block_offset(block, addr);
  3218. } else if (tmp_page->host_addr !=
  3219. host_page_from_ram_block_offset(block, addr)) {
  3220. /* not the 1st TP within the HP */
  3221. error_report("Non-same host page detected on channel %d: "
  3222. "Target host page %p, received host page %p "
  3223. "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
  3224. channel, tmp_page->host_addr,
  3225. host_page_from_ram_block_offset(block, addr),
  3226. block->idstr, addr, tmp_page->target_pages);
  3227. ret = -EINVAL;
  3228. break;
  3229. }
  3230. /*
  3231. * If it's the last part of a host page then we place the host
  3232. * page
  3233. */
  3234. if (tmp_page->target_pages ==
  3235. (block->page_size / TARGET_PAGE_SIZE)) {
  3236. place_needed = true;
  3237. }
  3238. place_source = tmp_page->tmp_huge_page;
  3239. }
  3240. switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
  3241. case RAM_SAVE_FLAG_ZERO:
  3242. ch = qemu_get_byte(f);
  3243. if (ch != 0) {
  3244. error_report("Found a zero page with value %d", ch);
  3245. ret = -EINVAL;
  3246. break;
  3247. }
3248. /*
3249. * We can skip setting page_buffer when
3250. * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3251. */
  3252. if (!matches_target_page_size) {
  3253. memset(page_buffer, ch, TARGET_PAGE_SIZE);
  3254. }
  3255. break;
  3256. case RAM_SAVE_FLAG_PAGE:
  3257. tmp_page->all_zero = false;
  3258. if (!matches_target_page_size) {
  3259. /* For huge pages, we always use temporary buffer */
  3260. qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
  3261. } else {
  3262. /*
3263. * For small pages that match the target page size, we
  3264. * avoid the qemu_file copy. Instead we directly use
  3265. * the buffer of QEMUFile to place the page. Note: we
  3266. * cannot do any QEMUFile operation before using that
  3267. * buffer to make sure the buffer is valid when
  3268. * placing the page.
  3269. */
  3270. qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
  3271. TARGET_PAGE_SIZE);
  3272. }
  3273. break;
  3274. case RAM_SAVE_FLAG_EOS:
  3275. break;
  3276. default:
  3277. error_report("Unknown combination of migration flags: 0x%x"
  3278. " (postcopy mode)", flags);
  3279. ret = -EINVAL;
  3280. break;
  3281. }
  3282. /* Detect for any possible file errors */
  3283. if (!ret && qemu_file_get_error(f)) {
  3284. ret = qemu_file_get_error(f);
  3285. }
  3286. if (!ret && place_needed) {
  3287. if (tmp_page->all_zero) {
  3288. ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
  3289. } else {
  3290. ret = postcopy_place_page(mis, tmp_page->host_addr,
  3291. place_source, block);
  3292. }
  3293. place_needed = false;
  3294. postcopy_temp_page_reset(tmp_page);
  3295. }
  3296. }
  3297. return ret;
  3298. }
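/*
 * To illustrate the host-page assembly in ram_load_postcopy() above with
 * hypothetical sizes: for a hugetlbfs block with a 2 MiB page_size and
 * 4 KiB target pages, each host page is made up of 512 target pages. They
 * accumulate in tmp_page->tmp_huge_page at their respective offsets, and
 * only when target_pages reaches 512 (block->page_size / TARGET_PAGE_SIZE)
 * is the whole host page placed atomically with postcopy_place_page(), or
 * postcopy_place_page_zero() if every target page in it was zero. For
 * blocks whose page_size equals TARGET_PAGE_SIZE, the ratio is 1 and every
 * received target page is placed immediately.
 */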
  3299. static bool postcopy_is_running(void)
  3300. {
  3301. PostcopyState ps = postcopy_state_get();
  3302. return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
  3303. }
3304. /*
3305. * Flush the content of the RAM cache into the SVM's memory.
3306. * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3307. */
  3308. void colo_flush_ram_cache(void)
  3309. {
  3310. RAMBlock *block = NULL;
  3311. void *dst_host;
  3312. void *src_host;
  3313. unsigned long offset = 0;
  3314. memory_global_dirty_log_sync(false);
  3315. qemu_mutex_lock(&ram_state->bitmap_mutex);
  3316. WITH_RCU_READ_LOCK_GUARD() {
  3317. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  3318. ramblock_sync_dirty_bitmap(ram_state, block);
  3319. }
  3320. }
  3321. trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
  3322. WITH_RCU_READ_LOCK_GUARD() {
  3323. block = QLIST_FIRST_RCU(&ram_list.blocks);
  3324. while (block) {
  3325. unsigned long num = 0;
  3326. offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
  3327. if (!offset_in_ramblock(block,
  3328. ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
  3329. offset = 0;
  3330. num = 0;
  3331. block = QLIST_NEXT_RCU(block, next);
  3332. } else {
  3333. unsigned long i = 0;
  3334. for (i = 0; i < num; i++) {
  3335. migration_bitmap_clear_dirty(ram_state, block, offset + i);
  3336. }
  3337. dst_host = block->host
  3338. + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
  3339. src_host = block->colo_cache
  3340. + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
  3341. memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
  3342. offset += num;
  3343. }
  3344. }
  3345. }
  3346. qemu_mutex_unlock(&ram_state->bitmap_mutex);
  3347. trace_colo_flush_ram_cache_end();
  3348. }
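/*
 * Example of the flush loop above: if colo_bitmap_find_dirty() returns
 * offset = 100 with num = 3, pages 100..102 are cleared in the dirty bitmap
 * and 3 * TARGET_PAGE_SIZE bytes are copied from block->colo_cache to
 * block->host starting at byte offset 100 << TARGET_PAGE_BITS, after which
 * the search continues from page 103.
 */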
  3349. static size_t ram_load_multifd_pages(void *host_addr, size_t size,
  3350. uint64_t offset)
  3351. {
  3352. MultiFDRecvData *data = multifd_get_recv_data();
  3353. data->opaque = host_addr;
  3354. data->file_offset = offset;
  3355. data->size = size;
  3356. if (!multifd_recv()) {
  3357. return 0;
  3358. }
  3359. return size;
  3360. }
  3361. static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
  3362. long num_pages, unsigned long *bitmap,
  3363. Error **errp)
  3364. {
  3365. ERRP_GUARD();
  3366. unsigned long set_bit_idx, clear_bit_idx;
  3367. ram_addr_t offset;
  3368. void *host;
  3369. size_t read, unread, size;
  3370. for (set_bit_idx = find_first_bit(bitmap, num_pages);
  3371. set_bit_idx < num_pages;
  3372. set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) {
  3373. clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);
  3374. unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx);
  3375. offset = set_bit_idx << TARGET_PAGE_BITS;
  3376. while (unread > 0) {
  3377. host = host_from_ram_block_offset(block, offset);
  3378. if (!host) {
  3379. error_setg(errp, "page outside of ramblock %s range",
  3380. block->idstr);
  3381. return false;
  3382. }
  3383. size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE);
  3384. if (migrate_multifd()) {
  3385. read = ram_load_multifd_pages(host, size,
  3386. block->pages_offset + offset);
  3387. } else {
  3388. read = qemu_get_buffer_at(f, host, size,
  3389. block->pages_offset + offset);
  3390. }
  3391. if (!read) {
  3392. goto err;
  3393. }
  3394. offset += read;
  3395. unread -= read;
  3396. }
  3397. }
  3398. return true;
  3399. err:
  3400. qemu_file_get_error_obj(f, errp);
  3401. error_prepend(errp, "(%s) failed to read page " RAM_ADDR_FMT
  3402. "from file offset %" PRIx64 ": ", block->idstr, offset,
  3403. block->pages_offset + offset);
  3404. return false;
  3405. }
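/*
 * Sketch of the run-based reads in read_ramblock_mapped_ram() above: the
 * bitmap marks which target pages were written to the file, and each
 * contiguous run of set bits is read back in one or more chunks. E.g. if
 * bits 3..6 are set and bit 7 is clear, then set_bit_idx = 3,
 * clear_bit_idx = 7, offset = 3 * TARGET_PAGE_SIZE and unread =
 * 4 * TARGET_PAGE_SIZE; the data is fetched from
 * block->pages_offset + offset in pieces of at most
 * MAPPED_RAM_LOAD_BUF_SIZE bytes, either directly or via multifd.
 */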
  3406. static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
  3407. ram_addr_t length, Error **errp)
  3408. {
  3409. g_autofree unsigned long *bitmap = NULL;
  3410. MappedRamHeader header;
  3411. size_t bitmap_size;
  3412. long num_pages;
  3413. if (!mapped_ram_read_header(f, &header, errp)) {
  3414. return;
  3415. }
  3416. block->pages_offset = header.pages_offset;
  3417. /*
  3418. * Check the alignment of the file region that contains pages. We
  3419. * don't enforce MAPPED_RAM_FILE_OFFSET_ALIGNMENT to allow that
  3420. * value to change in the future. Do only a sanity check with page
  3421. * size alignment.
  3422. */
  3423. if (!QEMU_IS_ALIGNED(block->pages_offset, TARGET_PAGE_SIZE)) {
  3424. error_setg(errp,
  3425. "Error reading ramblock %s pages, region has bad alignment",
  3426. block->idstr);
  3427. return;
  3428. }
  3429. num_pages = length / header.page_size;
  3430. bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
  3431. bitmap = g_malloc0(bitmap_size);
  3432. if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size,
  3433. header.bitmap_offset) != bitmap_size) {
  3434. error_setg(errp, "Error reading dirty bitmap");
  3435. return;
  3436. }
  3437. if (!read_ramblock_mapped_ram(f, block, num_pages, bitmap, errp)) {
  3438. return;
  3439. }
  3440. /* Skip pages array */
  3441. qemu_set_offset(f, block->pages_offset + length, SEEK_SET);
  3442. return;
  3443. }
  3444. static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
  3445. {
  3446. int ret = 0;
3447. /* ADVISE came earlier; it shows the source has the postcopy capability turned on */
  3448. bool postcopy_advised = migration_incoming_postcopy_advised();
  3449. int max_hg_page_size;
  3450. Error *local_err = NULL;
  3451. assert(block);
  3452. if (migrate_mapped_ram()) {
  3453. parse_ramblock_mapped_ram(f, block, length, &local_err);
  3454. if (local_err) {
  3455. error_report_err(local_err);
  3456. return -EINVAL;
  3457. }
  3458. return 0;
  3459. }
  3460. if (!qemu_ram_is_migratable(block)) {
  3461. error_report("block %s should not be migrated !", block->idstr);
  3462. return -EINVAL;
  3463. }
  3464. if (length != block->used_length) {
  3465. ret = qemu_ram_resize(block, length, &local_err);
  3466. if (local_err) {
  3467. error_report_err(local_err);
  3468. return ret;
  3469. }
  3470. }
  3471. /*
  3472. * ??? Mirrors the previous value of qemu_host_page_size,
  3473. * but is this really what was intended for the migration?
  3474. */
  3475. max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE);
  3476. /* For postcopy we need to check hugepage sizes match */
  3477. if (postcopy_advised && migrate_postcopy_ram() &&
  3478. block->page_size != max_hg_page_size) {
  3479. uint64_t remote_page_size = qemu_get_be64(f);
  3480. if (remote_page_size != block->page_size) {
  3481. error_report("Mismatched RAM page size %s "
  3482. "(local) %zd != %" PRId64, block->idstr,
  3483. block->page_size, remote_page_size);
  3484. return -EINVAL;
  3485. }
  3486. }
  3487. if (migrate_ignore_shared()) {
  3488. hwaddr addr = qemu_get_be64(f);
  3489. if (migrate_ram_is_ignored(block) &&
  3490. block->mr->addr != addr) {
  3491. error_report("Mismatched GPAs for block %s "
  3492. "%" PRId64 "!= %" PRId64, block->idstr,
  3493. (uint64_t)addr, (uint64_t)block->mr->addr);
  3494. return -EINVAL;
  3495. }
  3496. }
  3497. ret = rdma_block_notification_handle(f, block->idstr);
  3498. if (ret < 0) {
  3499. qemu_file_set_error(f, ret);
  3500. }
  3501. return ret;
  3502. }
  3503. static int parse_ramblocks(QEMUFile *f, ram_addr_t total_ram_bytes)
  3504. {
  3505. int ret = 0;
  3506. /* Synchronize RAM block list */
  3507. while (!ret && total_ram_bytes) {
  3508. RAMBlock *block;
  3509. char id[256];
  3510. ram_addr_t length;
  3511. int len = qemu_get_byte(f);
  3512. qemu_get_buffer(f, (uint8_t *)id, len);
  3513. id[len] = 0;
  3514. length = qemu_get_be64(f);
  3515. block = qemu_ram_block_by_name(id);
  3516. if (block) {
  3517. ret = parse_ramblock(f, block, length);
  3518. } else {
  3519. error_report("Unknown ramblock \"%s\", cannot accept "
  3520. "migration", id);
  3521. ret = -EINVAL;
  3522. }
  3523. total_ram_bytes -= length;
  3524. }
  3525. return ret;
  3526. }
  3527. /**
  3528. * ram_load_precopy: load pages in precopy case
  3529. *
  3530. * Returns 0 for success or -errno in case of error
  3531. *
  3532. * Called in precopy mode by ram_load().
  3533. * rcu_read_lock is taken prior to this being called.
  3534. *
3535. * @f: QEMUFile where to receive the data
  3536. */
  3537. static int ram_load_precopy(QEMUFile *f)
  3538. {
  3539. MigrationIncomingState *mis = migration_incoming_get_current();
  3540. int flags = 0, ret = 0, invalid_flags = 0, i = 0;
  3541. if (migrate_mapped_ram()) {
  3542. invalid_flags |= (RAM_SAVE_FLAG_HOOK | RAM_SAVE_FLAG_MULTIFD_FLUSH |
  3543. RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_XBZRLE |
  3544. RAM_SAVE_FLAG_ZERO);
  3545. }
  3546. while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
  3547. ram_addr_t addr;
  3548. void *host = NULL, *host_bak = NULL;
  3549. uint8_t ch;
3550. /*
3551. * Yield periodically to let the main loop run, but an iteration of
3552. * the main loop is expensive, so only do it once every many iterations.
3553. */
  3554. if ((i & 32767) == 0 && qemu_in_coroutine()) {
  3555. aio_co_schedule(qemu_get_current_aio_context(),
  3556. qemu_coroutine_self());
  3557. qemu_coroutine_yield();
  3558. }
  3559. i++;
  3560. addr = qemu_get_be64(f);
  3561. ret = qemu_file_get_error(f);
  3562. if (ret) {
  3563. error_report("Getting RAM address failed");
  3564. break;
  3565. }
  3566. flags = addr & ~TARGET_PAGE_MASK;
  3567. addr &= TARGET_PAGE_MASK;
  3568. if (flags & invalid_flags) {
  3569. error_report("Unexpected RAM flags: %d", flags & invalid_flags);
  3570. ret = -EINVAL;
  3571. break;
  3572. }
  3573. if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
  3574. RAM_SAVE_FLAG_XBZRLE)) {
  3575. RAMBlock *block = ram_block_from_stream(mis, f, flags,
  3576. RAM_CHANNEL_PRECOPY);
  3577. host = host_from_ram_block_offset(block, addr);
3578. /*
3579. * After entering the COLO stage, we should not load pages
3580. * into the SVM's memory directly; we put them into colo_cache first.
3581. * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3582. * Previously, we copied all this memory during the COLO preparation
3583. * stage, for which we needed to stop the VM, a time-consuming process.
3584. * Here we optimize it with a trick: back up every page while the
3585. * migration is in progress and COLO is enabled. Although this affects
3586. * the speed of the migration, it clearly reduces the downtime of
3587. * backing up all of the SVM's memory in the COLO preparation stage.
3588. */
  3589. if (migration_incoming_colo_enabled()) {
  3590. if (migration_incoming_in_colo_state()) {
  3591. /* In COLO stage, put all pages into cache temporarily */
  3592. host = colo_cache_from_block_offset(block, addr, true);
  3593. } else {
  3594. /*
  3595. * In migration stage but before COLO stage,
  3596. * Put all pages into both cache and SVM's memory.
  3597. */
  3598. host_bak = colo_cache_from_block_offset(block, addr, false);
  3599. }
  3600. }
  3601. if (!host) {
  3602. error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
  3603. ret = -EINVAL;
  3604. break;
  3605. }
  3606. if (!migration_incoming_in_colo_state()) {
  3607. ramblock_recv_bitmap_set(block, host);
  3608. }
  3609. trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
  3610. }
  3611. switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
  3612. case RAM_SAVE_FLAG_MEM_SIZE:
  3613. ret = parse_ramblocks(f, addr);
  3614. /*
  3615. * For mapped-ram migration (to a file) using multifd, we sync
  3616. * once and for all here to make sure all tasks we queued to
  3617. * multifd threads are completed, so that all the ramblocks
  3618. * (including all the guest memory pages within) are fully
  3619. * loaded after this sync returns.
  3620. */
  3621. if (migrate_mapped_ram()) {
  3622. multifd_recv_sync_main();
  3623. }
  3624. break;
  3625. case RAM_SAVE_FLAG_ZERO:
  3626. ch = qemu_get_byte(f);
  3627. if (ch != 0) {
  3628. error_report("Found a zero page with value %d", ch);
  3629. ret = -EINVAL;
  3630. break;
  3631. }
  3632. ram_handle_zero(host, TARGET_PAGE_SIZE);
  3633. break;
  3634. case RAM_SAVE_FLAG_PAGE:
  3635. qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
  3636. break;
  3637. case RAM_SAVE_FLAG_XBZRLE:
  3638. if (load_xbzrle(f, addr, host) < 0) {
  3639. error_report("Failed to decompress XBZRLE page at "
  3640. RAM_ADDR_FMT, addr);
  3641. ret = -EINVAL;
  3642. break;
  3643. }
  3644. break;
  3645. case RAM_SAVE_FLAG_MULTIFD_FLUSH:
  3646. multifd_recv_sync_main();
  3647. break;
  3648. case RAM_SAVE_FLAG_EOS:
  3649. /* normal exit */
  3650. if (migrate_multifd() &&
  3651. migrate_multifd_flush_after_each_section() &&
  3652. /*
  3653. * Mapped-ram migration flushes once and for all after
  3654. * parsing ramblocks. Always ignore EOS for it.
  3655. */
  3656. !migrate_mapped_ram()) {
  3657. multifd_recv_sync_main();
  3658. }
  3659. break;
  3660. case RAM_SAVE_FLAG_HOOK:
  3661. ret = rdma_registration_handle(f);
  3662. if (ret < 0) {
  3663. qemu_file_set_error(f, ret);
  3664. }
  3665. break;
  3666. default:
  3667. error_report("Unknown combination of migration flags: 0x%x", flags);
  3668. ret = -EINVAL;
  3669. }
  3670. if (!ret) {
  3671. ret = qemu_file_get_error(f);
  3672. }
  3673. if (!ret && host_bak) {
  3674. memcpy(host_bak, host, TARGET_PAGE_SIZE);
  3675. }
  3676. }
  3677. return ret;
  3678. }
  3679. static int ram_load(QEMUFile *f, void *opaque, int version_id)
  3680. {
  3681. int ret = 0;
  3682. static uint64_t seq_iter;
3683. /*
3684. * If the system is running in postcopy mode, page inserts into host memory
3685. * must be atomic.
3686. */
  3687. bool postcopy_running = postcopy_is_running();
  3688. seq_iter++;
  3689. if (version_id != 4) {
  3690. return -EINVAL;
  3691. }
  3692. /*
  3693. * This RCU critical section can be very long running.
  3694. * When RCU reclaims in the code start to become numerous,
  3695. * it will be necessary to reduce the granularity of this
  3696. * critical section.
  3697. */
  3698. trace_ram_load_start();
  3699. WITH_RCU_READ_LOCK_GUARD() {
  3700. if (postcopy_running) {
  3701. /*
  3702. * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
  3703. * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
  3704. * service fast page faults.
  3705. */
  3706. ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
  3707. } else {
  3708. ret = ram_load_precopy(f);
  3709. }
  3710. }
  3711. trace_ram_load_complete(ret, seq_iter);
  3712. return ret;
  3713. }
  3714. static bool ram_has_postcopy(void *opaque)
  3715. {
  3716. RAMBlock *rb;
  3717. RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
  3718. if (ramblock_is_pmem(rb)) {
3719. info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3720. "is not supported now!", rb->idstr, rb->host);
  3721. return false;
  3722. }
  3723. }
  3724. return migrate_postcopy_ram();
  3725. }
  3726. /* Sync all the dirty bitmap with destination VM. */
  3727. static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
  3728. {
  3729. RAMBlock *block;
  3730. QEMUFile *file = s->to_dst_file;
  3731. trace_ram_dirty_bitmap_sync_start();
  3732. qatomic_set(&rs->postcopy_bmap_sync_requested, 0);
  3733. RAMBLOCK_FOREACH_NOT_IGNORED(block) {
  3734. qemu_savevm_send_recv_bitmap(file, block->idstr);
  3735. trace_ram_dirty_bitmap_request(block->idstr);
  3736. qatomic_inc(&rs->postcopy_bmap_sync_requested);
  3737. }
  3738. trace_ram_dirty_bitmap_sync_wait();
3739. /* Wait until all the ramblocks' dirty bitmaps are synced */
  3740. while (qatomic_read(&rs->postcopy_bmap_sync_requested)) {
  3741. if (migration_rp_wait(s)) {
  3742. return -1;
  3743. }
  3744. }
  3745. trace_ram_dirty_bitmap_sync_complete();
  3746. return 0;
  3747. }
  3748. /*
  3749. * Read the received bitmap, revert it as the initial dirty bitmap.
  3750. * This is only used when the postcopy migration is paused but wants
  3751. * to resume from a middle point.
  3752. *
  3753. * Returns true if succeeded, false for errors.
  3754. */
  3755. bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block, Error **errp)
  3756. {
  3757. /* from_dst_file is always valid because we're within rp_thread */
  3758. QEMUFile *file = s->rp_state.from_dst_file;
  3759. g_autofree unsigned long *le_bitmap = NULL;
  3760. unsigned long nbits = block->used_length >> TARGET_PAGE_BITS;
  3761. uint64_t local_size = DIV_ROUND_UP(nbits, 8);
  3762. uint64_t size, end_mark;
  3763. RAMState *rs = ram_state;
  3764. trace_ram_dirty_bitmap_reload_begin(block->idstr);
  3765. if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
  3766. error_setg(errp, "Reload bitmap in incorrect state %s",
  3767. MigrationStatus_str(s->state));
  3768. return false;
  3769. }
  3770. /*
  3771. * Note: see comments in ramblock_recv_bitmap_send() on why we
  3772. * need the endianness conversion, and the paddings.
  3773. */
  3774. local_size = ROUND_UP(local_size, 8);
  3775. /* Add paddings */
  3776. le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
  3777. size = qemu_get_be64(file);
  3778. /* The size of the bitmap should match with our ramblock */
  3779. if (size != local_size) {
  3780. error_setg(errp, "ramblock '%s' bitmap size mismatch (0x%"PRIx64
  3781. " != 0x%"PRIx64")", block->idstr, size, local_size);
  3782. return false;
  3783. }
  3784. size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
  3785. end_mark = qemu_get_be64(file);
  3786. if (qemu_file_get_error(file) || size != local_size) {
  3787. error_setg(errp, "read bitmap failed for ramblock '%s': "
  3788. "(size 0x%"PRIx64", got: 0x%"PRIx64")",
  3789. block->idstr, local_size, size);
  3790. return false;
  3791. }
  3792. if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
  3793. error_setg(errp, "ramblock '%s' end mark incorrect: 0x%"PRIx64,
  3794. block->idstr, end_mark);
  3795. return false;
  3796. }
  3797. /*
  3798. * Endianness conversion. We are during postcopy (though paused).
  3799. * The dirty bitmap won't change. We can directly modify it.
  3800. */
  3801. bitmap_from_le(block->bmap, le_bitmap, nbits);
  3802. /*
  3803. * What we received is "received bitmap". Revert it as the initial
  3804. * dirty bitmap for this ramblock.
  3805. */
  3806. bitmap_complement(block->bmap, block->bmap, nbits);
  3807. /* Clear dirty bits of discarded ranges that we don't want to migrate. */
  3808. ramblock_dirty_bitmap_clear_discarded_pages(block);
  3809. /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
  3810. trace_ram_dirty_bitmap_reload_complete(block->idstr);
  3811. qatomic_dec(&rs->postcopy_bmap_sync_requested);
3812. /*
3813. * We succeeded in syncing the bitmap for the current ramblock. Always
3814. * kick the migration thread to check whether all requested bitmaps are
3815. * reloaded. NOTE: it's racy to only kick when requested==0, because
3816. * we don't know whether the migration thread may still be increasing
3817. * it.
3818. */
  3819. migration_rp_kick(s);
  3820. return true;
  3821. }
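/*
 * Size arithmetic in ram_dirty_bitmap_reload() above, with illustrative
 * numbers: a 1 GiB used_length and 4 KiB target pages give nbits = 262144,
 * so local_size = DIV_ROUND_UP(262144, 8) = 32768 bytes (already a
 * multiple of 8, so the ROUND_UP changes nothing), and that is the size
 * the destination must have sent. le_bitmap is allocated with an extra
 * BITS_PER_LONG bits of room so the 8-byte-aligned transfer and the
 * long-granularity endianness conversion both stay within the buffer.
 */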
  3822. static int ram_resume_prepare(MigrationState *s, void *opaque)
  3823. {
  3824. RAMState *rs = *(RAMState **)opaque;
  3825. int ret;
  3826. ret = ram_dirty_bitmap_sync_all(s, rs);
  3827. if (ret) {
  3828. return ret;
  3829. }
  3830. ram_state_resume_prepare(rs, s->to_dst_file);
  3831. return 0;
  3832. }
  3833. void postcopy_preempt_shutdown_file(MigrationState *s)
  3834. {
  3835. qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
  3836. qemu_fflush(s->postcopy_qemufile_src);
  3837. }
  3838. static SaveVMHandlers savevm_ram_handlers = {
  3839. .save_setup = ram_save_setup,
  3840. .save_live_iterate = ram_save_iterate,
  3841. .save_live_complete_postcopy = ram_save_complete,
  3842. .save_live_complete_precopy = ram_save_complete,
  3843. .has_postcopy = ram_has_postcopy,
  3844. .state_pending_exact = ram_state_pending_exact,
  3845. .state_pending_estimate = ram_state_pending_estimate,
  3846. .load_state = ram_load,
  3847. .save_cleanup = ram_save_cleanup,
  3848. .load_setup = ram_load_setup,
  3849. .load_cleanup = ram_load_cleanup,
  3850. .resume_prepare = ram_resume_prepare,
  3851. };
  3852. static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
  3853. size_t old_size, size_t new_size)
  3854. {
  3855. PostcopyState ps = postcopy_state_get();
  3856. ram_addr_t offset;
  3857. RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
  3858. Error *err = NULL;
  3859. if (!rb) {
  3860. error_report("RAM block not found");
  3861. return;
  3862. }
  3863. if (migrate_ram_is_ignored(rb)) {
  3864. return;
  3865. }
  3866. if (migration_is_running()) {
  3867. /*
  3868. * Precopy code on the source cannot deal with the size of RAM blocks
  3869. * changing at random points in time - especially after sending the
  3870. * RAM block sizes in the migration stream, they must no longer change.
  3871. * Abort and indicate a proper reason.
  3872. */
  3873. error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
  3874. migrate_set_error(migrate_get_current(), err);
  3875. error_free(err);
  3876. migration_cancel();
  3877. }
  3878. switch (ps) {
  3879. case POSTCOPY_INCOMING_ADVISE:
  3880. /*
  3881. * Update what ram_postcopy_incoming_init()->init_range() does at the
  3882. * time postcopy was advised. Syncing RAM blocks with the source will
  3883. * result in RAM resizes.
  3884. */
  3885. if (old_size < new_size) {
  3886. if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
  3887. error_report("RAM block '%s' discard of resized RAM failed",
  3888. rb->idstr);
  3889. }
  3890. }
  3891. rb->postcopy_length = new_size;
  3892. break;
  3893. case POSTCOPY_INCOMING_NONE:
  3894. case POSTCOPY_INCOMING_RUNNING:
  3895. case POSTCOPY_INCOMING_END:
3896. /*
3897. * Once our guest is running, postcopy no longer cares about
3898. * resizes. When growing, the new memory was not available on the
3899. * source, so no handler is needed.
3900. */
  3901. break;
  3902. default:
  3903. error_report("RAM block '%s' resized during postcopy state: %d",
  3904. rb->idstr, ps);
  3905. exit(-1);
  3906. }
  3907. }
  3908. static RAMBlockNotifier ram_mig_ram_notifier = {
  3909. .ram_block_resized = ram_mig_ram_block_resized,
  3910. };
  3911. void ram_mig_init(void)
  3912. {
  3913. qemu_mutex_init(&XBZRLE.lock);
  3914. register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
  3915. ram_block_notifier_add(&ram_mig_ram_notifier);
  3916. }