postcopy-ram.c
/*
 * Postcopy migration for RAM
 *
 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Dave Gilbert <dgilbert@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

/*
 * Postcopy is a migration technique where the execution flips from the
 * source to the destination before all the data has been copied.
 */

#include "qemu/osdep.h"
#include "exec/target_page.h"
#include "migration.h"
#include "qemu-file.h"
#include "savevm.h"
#include "postcopy-ram.h"
#include "ram.h"
#include "qapi/error.h"
#include "qemu/notify.h"
#include "qemu/rcu.h"
#include "sysemu/sysemu.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/boards.h"

/* Arbitrary limit on size of each discard command,
 * keeps them around ~200 bytes
 */
#define MAX_DISCARDS_PER_COMMAND 12
struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;
    unsigned int nsentcmds;
};

static NotifierWithReturnList postcopy_notifier_list;

void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}

void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}

void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
{
    struct PostcopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&postcopy_notifier_list,
                                            &pnd);
}
/* Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and efficiently map new pages in, the techniques for doing this
 * are target OS specific.
 */
#if defined(__linux__)

#include <poll.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <asm/types.h> /* for __u64 */
#endif

#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>

typedef struct PostcopyBlocktimeContext {
    /* time when page fault initiated per vCPU */
    uint32_t *page_fault_vcpu_time;
    /* page address per vCPU */
    uintptr_t *vcpu_addr;
    uint32_t total_blocktime;
    /* blocktime per vCPU */
    uint32_t *vcpu_blocktime;
    /* point in time when last page fault was initiated */
    uint32_t last_begin;
    /* number of vCPUs currently suspended on a page fault */
    int smp_cpus_down;
    uint64_t start_time;

    /*
     * Handler for exit event, necessary for
     * releasing whole blocktime_ctx
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;
static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}

static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}

static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);

    ctx->exit_notifier.notify = migration_exit_cb;
    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    qemu_add_exit_notifier(&ctx->exit_notifier);
    return ctx;
}

static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    uint32List *list = NULL, *entry = NULL;
    int i;

    for (i = ms->smp.cpus - 1; i >= 0; i--) {
        entry = g_new0(uint32List, 1);
        entry->value = ctx->vcpu_blocktime[i];
        entry->next = list;
        list = entry;
    }

    return list;
}
/*
 * This function just populates MigrationInfo from postcopy's
 * blocktime context. It will not populate MigrationInfo
 * unless the postcopy-blocktime capability was set.
 *
 * @info: pointer to MigrationInfo to populate
 */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return;
    }

    info->has_postcopy_blocktime = true;
    info->postcopy_blocktime = bc->total_blocktime;
    info->has_postcopy_vcpu_blocktime = true;
    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
}

static uint32_t get_postcopy_total_blocktime(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return 0;
    }

    return bc->total_blocktime;
}
/**
 * receive_ufd_features: check userfault fd features, to request only supported
 * features in the future.
 *
 * Returns: true on success
 *
 * __NR_userfaultfd - should be checked before
 * @features: out parameter will contain uffdio_api.features provided by kernel
 *            in case of success
 */
static bool receive_ufd_features(uint64_t *features)
{
    struct uffdio_api api_struct = {0};
    int ufd;
    bool ret = true;

    /* if we are here __NR_userfaultfd should exist */
    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* ask features */
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        ret = false;
        goto release_ufd;
    }

    *features = api_struct.features;

release_ufd:
    close(ufd);
    return ret;
}

/**
 * request_ufd_features: this function should be called only once on a newly
 * opened ufd, subsequent calls will lead to error.
 *
 * Returns: true on success
 *
 * @ufd: fd obtained from userfaultfd syscall
 * @features: bit mask see UFFD_API_FEATURES
 */
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}
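
/*
 * ufd_check_and_apply: query the kernel's userfaultfd features, request the
 * ones postcopy wants on @ufd (THREAD_ID when the postcopy-blocktime
 * capability is enabled), and check hugepage fault support when any RAMBlock
 * uses a page size other than the host page size.
 *
 * Returns: true if this userfaultfd is usable for postcopy.
 */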
static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * it's not possible to request UFFD_API twice per fd -
     * userfault fd features are persistent
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (migrate_postcopy_blocktime() && mis &&
        UFFD_FEATURE_THREAD_ID & supported_features) {
        /* kernel supports that feature */
        /* don't create blocktime_context if it exists */
        if (!mis->blocktime_ctx) {
            mis->blocktime_ctx = blocktime_context_new();
        }

        asked_features |= UFFD_FEATURE_THREAD_ID;
    }
#endif

    /*
     * Request features, even if asked_features is 0, because the
     * kernel expects UFFD_API before UFFDIO_REGISTER, per
     * userfault file descriptor
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    if (qemu_real_host_page_size != ram_pagesize_summary()) {
        bool have_hp = false;
        /* We've got a huge page */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}
/* Callback from postcopy_ram_supported_by_host block iterator.
 */
static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    size_t pagesize = qemu_ram_pagesize(rb);

    if (length % pagesize) {
        error_report("Postcopy requires RAM blocks to be a page size multiple,"
                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                     "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }
    return 0;
}
/*
 * Note: This has the side effect of munlock'ing all of RAM, that's
 * normally fine since if the postcopy succeeds it gets turned back on at the
 * end.
 */
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    long pagesize = qemu_real_host_page_size;
    int ufd = -1;
    bool ret = false; /* Error unless we change it */
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;
    Error *local_err = NULL;

    if (qemu_target_page_size() > pagesize) {
        error_report("Target page size bigger than host page size");
        goto out;
    }

    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: userfaultfd not available: %s", __func__,
                     strerror(errno));
        goto out;
    }

    /* Give devices a chance to object */
    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
        error_report_err(local_err);
        goto out;
    }

    /* Version and features check */
    if (!ufd_check_and_apply(ufd, mis)) {
        goto out;
    }

    /* We don't support postcopy with shared RAM yet */
    if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
        goto out;
    }

    /*
     * userfault and mlock don't go together; we'll put it back later if
     * it was enabled.
     */
    if (munlockall()) {
        error_report("%s: munlockall: %s", __func__, strerror(errno));
        goto out;
    }

    /*
     * We need to check that the ops we need are supported on anon memory
     * To do that we need to register a chunk and see the flags that
     * are returned.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_report("%s: Failed to map test area: %s", __func__,
                     strerror(errno));
        goto out;
    }
    g_assert(((size_t)testarea & (pagesize - 1)) == 0);

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s userfault unregister: %s", __func__, strerror(errno));
        goto out;
    }

    feature_mask = (__u64)1 << _UFFDIO_WAKE |
                   (__u64)1 << _UFFDIO_COPY |
                   (__u64)1 << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_report("Missing userfault map features: %" PRIx64,
                     (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}
/*
 * Setup an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 * opaque should be the MIS.
 */
static int init_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd
     * - we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard)
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}

/*
 * At the end of migration, undo the effects of init_range
 * opaque should be the MIS.
 */
static int cleanup_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled
     * we can turn it back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages.  It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));
        return -1;
    }

    return 0;
}

/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 * postcopy later; must be called prior to any precopy.
 * called from arch_init's similarly named ram_postcopy_incoming_init
 */
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(init_range, NULL)) {
        return -1;
    }

    return 0;
}
/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Let the fault thread quit */
        atomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (foreach_not_ignored_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point, we have a valid
             * VM state.
             */
        }
    }

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_blocktime(
            get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}
/*
 * Disable huge pages on an area
 */
static int nhp_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}

/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard
 * however leaving it until after precopy means that most of the precopy
 * data is still THPd
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}
/*
 * Mark the given area of RAM as requiring notification to unwritten areas
 * Used as a callback on foreach_not_ignored_block.
 *   rb: the RAMBlock to register with the userfault_fd
 *   opaque: MigrationIncomingState pointer
 * Returns 0 on success
 */
static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
    reg_struct.range.len = qemu_ram_get_used_length(rb);
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }
    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
        error_report("%s userfault: Region doesn't support COPY", __func__);
        return -1;
    }
    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
        qemu_ram_set_uf_zeroable(rb);
    }

    return 0;
}
int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    range.start = client_addr & ~(pagesize - 1);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}

/*
 * Callback from shared fault handlers to ask for a page,
 * the page must be specified by a RAMBlock and an offset in that rb
 * Note: Only for use by shared fault handlers (in fault thread)
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    if (rb != mis->last_rb) {
        mis->last_rb = rb;
        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                  aligned_rbo, pagesize);
    } else {
        /* Save some space */
        migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
    }
    return 0;
}
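
/*
 * get_mem_fault_cpu_index: map the thread id reported in a userfault
 * message to the index of the faulting vCPU.
 *
 * Returns: the vCPU index, or -1 if the thread id does not belong to a vCPU.
 */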
static int get_mem_fault_cpu_index(uint32_t pid)
{
    CPUState *cpu_iter;

    CPU_FOREACH(cpu_iter) {
        if (cpu_iter->thread_id == pid) {
            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
            return cpu_iter->cpu_index;
        }
    }
    trace_get_mem_fault_cpu_index(-1, pid);
    return -1;
}
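
/*
 * Milliseconds elapsed since the blocktime context was created, clamped to
 * at least 1 and truncated to 32 bits so it fits the uint32_t fields used
 * for blocktime accounting.
 */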
static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
{
    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                                    dc->start_time;
    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
}
/*
 * This function is called when a page fault occurs. It records how long
 * the faulting vCPU is blocked.
 *
 * @addr: faulted host virtual address
 * @ptid: faulted process thread id
 * @rb: ramblock appropriate to addr
 */
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    if (!dc || ptid == 0) {
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    if (dc->vcpu_addr[cpu] == 0) {
        atomic_inc(&dc->smp_cpus_down);
    }

    atomic_xchg(&dc->last_begin, low_time_offset);
    atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    atomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * check it here, not at the beginning of the function,
     * because the check could occur earlier than the bitmap_set in
     * qemu_ufd_copy_ioctl
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        atomic_xchg(&dc->vcpu_addr[cpu], 0);
        atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        atomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}
/*
 * This function just provides the calculated blocktime per cpu and traces it.
 * Total blocktime is calculated in mark_postcopy_blocktime_end.
 *
 * Assume we have 3 CPUs:
 *
 *      S1        E1           S1               E1
 * -----***********------------xxx***************------------------------> CPU1
 *
 *             S2                E2
 * ------------****************xxx---------------------------------------> CPU2
 *
 *                         S3            E3
 * ------------------------****xxx********-------------------------------> CPU3
 *
 * We have the sequence S1,S2,E1,S3,S1,E2,E3,E1
 * S2,E1 - doesn't match the condition, because the sequence S1,S2,E1
 * doesn't include CPU3
 * S3,S1,E2 - sequence includes all CPUs, in this case overlap will be S1,E2 -
 * it's a part of total blocktime.
 * S1 - here is last_begin
 * The legend of the picture is the following:
 *              * - means blocktime per vCPU
 *              x - means overlapped blocktime (total blocktime)
 *
 * @addr: host virtual address
 */
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /* lookup cpu, to clear it,
     * that algorithm looks straightforward, but it's not
     * optimal, more optimal algorithm is keeping tree or hash
     * where key is address value is a list of  */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        atomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;
        /* we need to know whether this end is for a genuinely faulted
         * page; the other possible case is a prefetched page, and in
         * that case we shouldn't be here */
        if (!vcpu_total_blocktime &&
            atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }
        /* continue cycle, due to one page could affect several vCPUs */
        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    atomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - atomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}
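
/*
 * Park the fault thread on the pause semaphore until it is kicked again
 * (normally once the migration channel has been re-established); always
 * returns true.
 */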
static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();

    qemu_sem_wait(&mis->postcopy_pause_sem_fault);

    trace_postcopy_pause_fault_thread_continued();

    return true;
}
/*
 * Handle faults detected by the USERFAULT markings
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    rcu_register_thread();
    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    qemu_sem_post(&mis->fault_thread_sem);

    struct pollfd *pfd;
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA,
         * however we can be told to quit via userfault_quit_fd which is
         * an eventfd
         */

        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (!mis->to_src_file) {
            /*
             * Possibly someone tells us that the return path is
             * broken already using the event. We should hold until
             * the channel is rebuilt.
             */
            if (postcopy_pause_fault_thread(mis)) {
                mis->last_rb = NULL;
                /* Continue to read the userfaultfd */
            } else {
                error_report("%s: paused but don't allow to continue",
                             __func__);
                break;
            }
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the signal */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error. */
                error_report("%s: read() failed", __func__);
            }

            if (atomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * if a wake up happens on the other thread just after
                     * the poll, there is nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zd",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment, don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %ud from userfaultfd",
                             __func__, msg.event);
                continue; /* It's not a page fault, shouldn't happen */
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                    msg.arg.pagefault.feat.ptid, rb);

retry:
            /*
             * Send the request to the source - we want to request one
             * of our host page sizes (which is >= TPS)
             */
            if (rb != mis->last_rb) {
                mis->last_rb = rb;
                ret = migrate_send_rp_req_pages(mis,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            } else {
                /* Save some space */
                ret = migrate_send_rp_req_pages(mis,
                                                NULL,
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            }

            if (ret) {
                /* May be network failure, try to wait for recovery */
                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
                    /* We got reconnected somehow, try to continue */
                    mis->last_rb = NULL;
                    goto retry;
                } else {
                    /* This is an unavoidable fault */
                    error_report("%s: migrate_send_rp_req_pages() get %d",
                                 __func__, ret);
                    break;
                }
            }
        }

        /* Now handle any requests from external processes on shared memory */
        /* TODO: May need to handle devices deregistering during postcopy */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zd fd=%d",
                                 __func__, index, pcfd->fd);
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * if a wake up happens on the other thread just after
                         * the poll, there is nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        /* TODO: Could just disable this sharer */
                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zd (shared)",
                                     __func__, ret, sizeof(msg));
                        /* TODO: Could just disable this sharer */
                        break; /* Lost alignment, don't know what we'd read next */
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %ud "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue; /* It's not a page fault, shouldn't happen */
                }
                /* Call the device handler registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zd/%s",
                                 __func__, index, pcfd->idstr);
                    /* TODO: Fail? Disable this sharer? */
                }
            }
        }
    }
    rcu_unregister_thread();
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}
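
/*
 * postcopy_ram_incoming_setup: prepare the incoming side for handling page
 * faults: open the userfaultfd, redo the API handshake on it, start the
 * fault thread, register every RAMBlock with the userfaultfd, and map the
 * temporary pages used when placing incoming (and zero) pages.
 *
 * Returns 0 on success, negative on failure.
 */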
int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the check again as an ABI handshake on the new fd.
     */
    if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_event_fd == -1) {
        error_report("%s: Opening userfault_event_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

    qemu_sem_init(&mis->fault_thread_sem, 0);
    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
    qemu_sem_wait(&mis->fault_thread_sem);
    qemu_sem_destroy(&mis->fault_thread_sem);
    mis->have_fault_thread = true;

    /* Mark so that we get notified of accesses to unwritten areas */
    if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
        error_report("ram_block_enable_notify failed");
        return -1;
    }

    mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
                                  PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                  MAP_ANONYMOUS, -1, 0);
    if (mis->postcopy_tmp_page == MAP_FAILED) {
        mis->postcopy_tmp_page = NULL;
        error_report("%s: Failed to map postcopy_tmp_page %s",
                     __func__, strerror(errno));
        return -1;
    }

    /*
     * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages
     */
    mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                       PROT_READ | PROT_WRITE,
                                       MAP_PRIVATE | MAP_ANONYMOUS,
                                       -1, 0);
    if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
        int e = errno;
        mis->postcopy_tmp_zero_page = NULL;
        error_report("%s: Failed to map large zero page %s",
                     __func__, strerror(e));
        return -e;
    }
    memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);

    trace_postcopy_ram_enable_notify();

    return 0;
}
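
/*
 * Atomically place a page at host_addr: UFFDIO_COPY from from_addr, or
 * UFFDIO_ZEROPAGE when from_addr is NULL.  On success, mark the page as
 * received and finish any blocktime accounting for it.
 */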
static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
                               void *from_addr, uint64_t pagesize, RAMBlock *rb)
{
    int ret;

    if (from_addr) {
        struct uffdio_copy copy_struct;
        copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
        copy_struct.src = (uint64_t)(uintptr_t)from_addr;
        copy_struct.len = pagesize;
        copy_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
    } else {
        struct uffdio_zeropage zero_struct;
        zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
        zero_struct.range.len = pagesize;
        zero_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
    }
    if (!ret) {
        ramblock_recv_bitmap_set_range(rb, host_addr,
                                       pagesize / qemu_target_page_size());
        mark_postcopy_blocktime_end((uintptr_t)host_addr);
    }
    return ret;
}
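
/*
 * Notify every registered shared-memory userfault handler that the page at
 * @offset in @rb has been placed, so their waiters can be woken.
 *
 * Returns the first non-zero waker result, or 0 if all wakers succeed.
 */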
int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
{
    int i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        int ret = cur->waker(cur, rb, offset);
        if (ret) {
            return ret;
        }
    }
    return 0;
}

/*
 * Place a host page (from) at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /* copy also acks to the kernel waking the stalled thread up
     * TODO: We can inhibit that ack and only do it if it was requested
     * which would be slightly cheaper, but we'd have to be careful
     * of the order of updating our page state.
     */
    if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}

/*
 * Place a zero page at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
     * but it's not available for everything (e.g. hugetlbpages)
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
    }
}
#else
/* No target OS support, stubs just fail */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}

bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}

int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
#endif
/* ------------------------------------------------------------------------- */

void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * Wakeup the fault_thread.  It's an eventfd that should currently
     * be at 0, we're going to increment it to 1
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}
/*
 * A single discard state is shared by all RAMBlocks; each block's discards
 * are fully flushed by postcopy_discard_send_finish() before the next
 * postcopy_discard_send_init().
 */
static PostcopyDiscardState pds = {0};

/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 *   asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @name: RAMBlock that discards will operate on.
 */
void postcopy_discard_send_init(MigrationState *ms, const char *name)
{
    pds.ramblock_name = name;
    pds.cur_entry = 0;
    pds.nsentwords = 0;
    pds.nsentcmds = 0;
}

/**
 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 *   discard. May send a discard message, may just leave it queued to
 *   be sent later.
 *
 * @ms: Current migration state.
 * @start,@length: a range of pages in the migration bitmap in the
 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 */
void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
                                 unsigned long length)
{
    size_t tp_size = qemu_target_page_size();
    /* Convert to byte offsets within the RAM block */
    pds.start_list[pds.cur_entry] = start * tp_size;
    pds.length_list[pds.cur_entry] = length * tp_size;
    trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
    pds.cur_entry++;
    pds.nsentwords++;

    if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, ship it! */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
        pds.cur_entry = 0;
    }
}

/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 *   bitmap code. Sends any outstanding discard messages.
 *
 * @ms: Current migration state.
 */
void postcopy_discard_send_finish(MigrationState *ms)
{
    /* Anything unsent? */
    if (pds.cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
                                       pds.nsentcmds);
}
/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since its state is used during cleanup
 * at the end as MIS is being freed.
 */
static PostcopyState incoming_postcopy_state;

PostcopyState postcopy_state_get(void)
{
    return atomic_mb_read(&incoming_postcopy_state);
}

/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return atomic_xchg(&incoming_postcopy_state, new_state);
}

/* Register a handler for external shared memory postcopy
 * called on the destination.
 */
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}

/* Unregister a handler for external shared memory postcopy
 */
void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
{
    guint i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        if (cur->fd == pcfd->fd) {
            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
            return;
        }
    }
}