postcopy-ram.c

  1. /*
  2. * Postcopy migration for RAM
  3. *
  4. * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
  5. *
  6. * Authors:
  7. * Dave Gilbert <dgilbert@redhat.com>
  8. *
  9. * This work is licensed under the terms of the GNU GPL, version 2 or later.
  10. * See the COPYING file in the top-level directory.
  11. *
  12. */
  13. /*
  14. * Postcopy is a migration technique where the execution flips from the
  15. * source to the destination before all the data has been copied.
  16. */
  17. #include "qemu/osdep.h"
  18. #include "exec/target_page.h"
  19. #include "migration.h"
  20. #include "qemu-file.h"
  21. #include "savevm.h"
  22. #include "postcopy-ram.h"
  23. #include "ram.h"
  24. #include "qapi/error.h"
  25. #include "qemu/notify.h"
  26. #include "qemu/rcu.h"
  27. #include "sysemu/sysemu.h"
  28. #include "sysemu/balloon.h"
  29. #include "qemu/error-report.h"
  30. #include "trace.h"
  31. #include "hw/boards.h"
  32. /* Arbitrary limit on size of each discard command,
  33. * keeps them around ~200 bytes
  34. */
  35. #define MAX_DISCARDS_PER_COMMAND 12
  36. struct PostcopyDiscardState {
  37. const char *ramblock_name;
  38. uint16_t cur_entry;
  39. /*
  40. * Start and length of a discard range (bytes)
  41. */
  42. uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
  43. uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
  44. unsigned int nsentwords;
  45. unsigned int nsentcmds;
  46. };
  47. static NotifierWithReturnList postcopy_notifier_list;
  48. void postcopy_infrastructure_init(void)
  49. {
  50. notifier_with_return_list_init(&postcopy_notifier_list);
  51. }
  52. void postcopy_add_notifier(NotifierWithReturn *nn)
  53. {
  54. notifier_with_return_list_add(&postcopy_notifier_list, nn);
  55. }
  56. void postcopy_remove_notifier(NotifierWithReturn *n)
  57. {
  58. notifier_with_return_remove(n);
  59. }
  60. int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
  61. {
  62. struct PostcopyNotifyData pnd;
  63. pnd.reason = reason;
  64. pnd.errp = errp;
  65. return notifier_with_return_list_notify(&postcopy_notifier_list,
  66. &pnd);
  67. }
  68. /* Postcopy needs to detect accesses to pages that haven't yet been copied
  69. * across, and efficiently map new pages in; the techniques for doing this
  70. * are target-OS specific.
  71. */
  72. #if defined(__linux__)
  73. #include <poll.h>
  74. #include <sys/ioctl.h>
  75. #include <sys/syscall.h>
  76. #include <asm/types.h> /* for __u64 */
  77. #endif
  78. #if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
  79. #include <sys/eventfd.h>
  80. #include <linux/userfaultfd.h>
  81. typedef struct PostcopyBlocktimeContext {
  82. /* time when page fault initiated per vCPU */
  83. uint32_t *page_fault_vcpu_time;
  84. /* page address per vCPU */
  85. uintptr_t *vcpu_addr;
  86. uint32_t total_blocktime;
  87. /* blocktime per vCPU */
  88. uint32_t *vcpu_blocktime;
  89. /* point in time when last page fault was initiated */
  90. uint32_t last_begin;
  91. /* number of vCPUs currently suspended */
  92. int smp_cpus_down;
  93. uint64_t start_time;
  94. /*
  95. * Handler for exit event, necessary for
  96. * releasing whole blocktime_ctx
  97. */
  98. Notifier exit_notifier;
  99. } PostcopyBlocktimeContext;
  100. static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
  101. {
  102. g_free(ctx->page_fault_vcpu_time);
  103. g_free(ctx->vcpu_addr);
  104. g_free(ctx->vcpu_blocktime);
  105. g_free(ctx);
  106. }
  107. static void migration_exit_cb(Notifier *n, void *data)
  108. {
  109. PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
  110. exit_notifier);
  111. destroy_blocktime_context(ctx);
  112. }
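/* Allocate a blocktime context sized for the current machine's vCPU count;
 * it is freed automatically via the registered exit notifier. */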
  113. static struct PostcopyBlocktimeContext *blocktime_context_new(void)
  114. {
  115. MachineState *ms = MACHINE(qdev_get_machine());
  116. unsigned int smp_cpus = ms->smp.cpus;
  117. PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
  118. ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
  119. ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
  120. ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
  121. ctx->exit_notifier.notify = migration_exit_cb;
  122. ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
  123. qemu_add_exit_notifier(&ctx->exit_notifier);
  124. return ctx;
  125. }
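/* Build a uint32List of per-vCPU blocktimes, ordered by vCPU index */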
  126. static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
  127. {
  128. MachineState *ms = MACHINE(qdev_get_machine());
  129. uint32List *list = NULL, *entry = NULL;
  130. int i;
  131. for (i = ms->smp.cpus - 1; i >= 0; i--) {
  132. entry = g_new0(uint32List, 1);
  133. entry->value = ctx->vcpu_blocktime[i];
  134. entry->next = list;
  135. list = entry;
  136. }
  137. return list;
  138. }
  139. /*
  140. * Populate MigrationInfo from postcopy's blocktime context.
  141. * It does nothing unless the postcopy-blocktime capability
  142. * was set.
  143. *
  144. * @info: pointer to MigrationInfo to populate
  145. */
  146. void fill_destination_postcopy_migration_info(MigrationInfo *info)
  147. {
  148. MigrationIncomingState *mis = migration_incoming_get_current();
  149. PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
  150. if (!bc) {
  151. return;
  152. }
  153. info->has_postcopy_blocktime = true;
  154. info->postcopy_blocktime = bc->total_blocktime;
  155. info->has_postcopy_vcpu_blocktime = true;
  156. info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
  157. }
  158. static uint32_t get_postcopy_total_blocktime(void)
  159. {
  160. MigrationIncomingState *mis = migration_incoming_get_current();
  161. PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
  162. if (!bc) {
  163. return 0;
  164. }
  165. return bc->total_blocktime;
  166. }
  167. /**
  168. * receive_ufd_features: query the userfaultfd features so that only
  169. * supported features are requested later.
  170. *
  171. * Returns: true on success
  172. *
  173. * The caller must have already checked that __NR_userfaultfd is available.
  174. * @features: out parameter; on success it contains the uffdio_api.features
  175. * reported by the kernel
  176. */
  177. static bool receive_ufd_features(uint64_t *features)
  178. {
  179. struct uffdio_api api_struct = {0};
  180. int ufd;
  181. bool ret = true;
  182. /* if we got here, __NR_userfaultfd should exist */
  183. ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
  184. if (ufd == -1) {
  185. error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
  186. strerror(errno));
  187. return false;
  188. }
  189. /* ask features */
  190. api_struct.api = UFFD_API;
  191. api_struct.features = 0;
  192. if (ioctl(ufd, UFFDIO_API, &api_struct)) {
  193. error_report("%s: UFFDIO_API failed: %s", __func__,
  194. strerror(errno));
  195. ret = false;
  196. goto release_ufd;
  197. }
  198. *features = api_struct.features;
  199. release_ufd:
  200. close(ufd);
  201. return ret;
  202. }
  203. /**
  204. * request_ufd_features: request the given features on a userfault fd.
  205. * Must be called only once on a newly opened ufd; subsequent calls fail.
  206. *
  207. * Returns: true on success
  208. *
  209. * @ufd: fd obtained from the userfaultfd syscall
  210. * @features: feature bit mask, see UFFD_API_FEATURES
  211. */
  212. static bool request_ufd_features(int ufd, uint64_t features)
  213. {
  214. struct uffdio_api api_struct = {0};
  215. uint64_t ioctl_mask;
  216. api_struct.api = UFFD_API;
  217. api_struct.features = features;
  218. if (ioctl(ufd, UFFDIO_API, &api_struct)) {
  219. error_report("%s failed: UFFDIO_API failed: %s", __func__,
  220. strerror(errno));
  221. return false;
  222. }
  223. ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
  224. (__u64)1 << _UFFDIO_UNREGISTER;
  225. if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
  226. error_report("Missing userfault features: %" PRIx64,
  227. (uint64_t)(~api_struct.ioctls & ioctl_mask));
  228. return false;
  229. }
  230. return true;
  231. }
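/* Check which userfaultfd features the host supports and request the ones
 * postcopy needs on the given ufd (including UFFD_FEATURE_THREAD_ID when
 * the postcopy-blocktime capability is enabled). Returns false if the host
 * cannot support this postcopy configuration. */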
  232. static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
  233. {
  234. uint64_t asked_features = 0;
  235. static uint64_t supported_features;
  236. /*
  237. * UFFD_API can be requested only once per fd;
  238. * the userfault fd features reported by the kernel
  239. * are persistent for the life of the fd
  240. */
  241. if (!supported_features) {
  242. if (!receive_ufd_features(&supported_features)) {
  243. error_report("%s failed", __func__);
  244. return false;
  245. }
  246. }
  247. #ifdef UFFD_FEATURE_THREAD_ID
  248. if (migrate_postcopy_blocktime() && mis &&
  249. UFFD_FEATURE_THREAD_ID & supported_features) {
  250. /* kernel supports that feature */
  251. /* don't create blocktime_context if it exists */
  252. if (!mis->blocktime_ctx) {
  253. mis->blocktime_ctx = blocktime_context_new();
  254. }
  255. asked_features |= UFFD_FEATURE_THREAD_ID;
  256. }
  257. #endif
  258. /*
  259. * Request the features even if asked_features is 0, because
  260. * the kernel expects UFFD_API before UFFDIO_REGISTER on every
  261. * userfault file descriptor
  262. */
  263. if (!request_ufd_features(ufd, asked_features)) {
  264. error_report("%s failed: features %" PRIu64, __func__,
  265. asked_features);
  266. return false;
  267. }
  268. if (qemu_real_host_page_size != ram_pagesize_summary()) {
  269. bool have_hp = false;
  270. /* We've got a huge page */
  271. #ifdef UFFD_FEATURE_MISSING_HUGETLBFS
  272. have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
  273. #endif
  274. if (!have_hp) {
  275. error_report("Userfault on this host does not support huge pages");
  276. return false;
  277. }
  278. }
  279. return true;
  280. }
  281. /* Callback from postcopy_ram_supported_by_host block iterator.
  282. */
  283. static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
  284. {
  285. const char *block_name = qemu_ram_get_idstr(rb);
  286. ram_addr_t length = qemu_ram_get_used_length(rb);
  287. size_t pagesize = qemu_ram_pagesize(rb);
  288. if (length % pagesize) {
  289. error_report("Postcopy requires RAM blocks to be a page size multiple,"
  290. " block %s is 0x" RAM_ADDR_FMT " bytes with a "
  291. "page size of 0x%zx", block_name, length, pagesize);
  292. return 1;
  293. }
  294. return 0;
  295. }
  296. /*
  297. * Note: This has the side effect of munlock'ing all of RAM; that's
  298. * normally fine, since mlock is re-enabled at the end if the postcopy
  299. * succeeds.
  300. */
  301. bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
  302. {
  303. long pagesize = qemu_real_host_page_size;
  304. int ufd = -1;
  305. bool ret = false; /* Error unless we change it */
  306. void *testarea = NULL;
  307. struct uffdio_register reg_struct;
  308. struct uffdio_range range_struct;
  309. uint64_t feature_mask;
  310. Error *local_err = NULL;
  311. if (qemu_target_page_size() > pagesize) {
  312. error_report("Target page size bigger than host page size");
  313. goto out;
  314. }
  315. ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
  316. if (ufd == -1) {
  317. error_report("%s: userfaultfd not available: %s", __func__,
  318. strerror(errno));
  319. goto out;
  320. }
  321. /* Give devices a chance to object */
  322. if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
  323. error_report_err(local_err);
  324. goto out;
  325. }
  326. /* Version and features check */
  327. if (!ufd_check_and_apply(ufd, mis)) {
  328. goto out;
  329. }
  330. /* We don't support postcopy with shared RAM yet */
  331. if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
  332. goto out;
  333. }
  334. /*
  335. * userfault and mlock don't go together; we'll put it back later if
  336. * it was enabled.
  337. */
  338. if (munlockall()) {
  339. error_report("%s: munlockall: %s", __func__, strerror(errno));
  340. goto out;
  341. }
  342. /*
  343. * We need to check that the ops we need are supported on anon memory
  344. * To do that we need to register a chunk and see the flags that
  345. * are returned.
  346. */
  347. testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
  348. MAP_ANONYMOUS, -1, 0);
  349. if (testarea == MAP_FAILED) {
  350. error_report("%s: Failed to map test area: %s", __func__,
  351. strerror(errno));
  352. goto out;
  353. }
  354. g_assert(((size_t)testarea & (pagesize-1)) == 0);
  355. reg_struct.range.start = (uintptr_t)testarea;
  356. reg_struct.range.len = pagesize;
  357. reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
  358. if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
  359. error_report("%s userfault register: %s", __func__, strerror(errno));
  360. goto out;
  361. }
  362. range_struct.start = (uintptr_t)testarea;
  363. range_struct.len = pagesize;
  364. if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
  365. error_report("%s userfault unregister: %s", __func__, strerror(errno));
  366. goto out;
  367. }
  368. feature_mask = (__u64)1 << _UFFDIO_WAKE |
  369. (__u64)1 << _UFFDIO_COPY |
  370. (__u64)1 << _UFFDIO_ZEROPAGE;
  371. if ((reg_struct.ioctls & feature_mask) != feature_mask) {
  372. error_report("Missing userfault map features: %" PRIx64,
  373. (uint64_t)(~reg_struct.ioctls & feature_mask));
  374. goto out;
  375. }
  376. /* Success! */
  377. ret = true;
  378. out:
  379. if (testarea) {
  380. munmap(testarea, pagesize);
  381. }
  382. if (ufd != -1) {
  383. close(ufd);
  384. }
  385. return ret;
  386. }
  387. /*
  388. * Setup an area of RAM so that it *can* be used for postcopy later; this
  389. * must be done right at the start prior to pre-copy.
  390. * opaque should be the MIS.
  391. */
  392. static int init_range(RAMBlock *rb, void *opaque)
  393. {
  394. const char *block_name = qemu_ram_get_idstr(rb);
  395. void *host_addr = qemu_ram_get_host_addr(rb);
  396. ram_addr_t offset = qemu_ram_get_offset(rb);
  397. ram_addr_t length = qemu_ram_get_used_length(rb);
  398. trace_postcopy_init_range(block_name, host_addr, offset, length);
  399. /*
  400. * We need the whole of RAM to be truly empty for postcopy, so things
  401. * like ROMs and any data tables built during init must be zero'd
  402. * - we're going to get the copy from the source anyway.
  403. * (Precopy will just overwrite this data, so doesn't need the discard)
  404. */
  405. if (ram_discard_range(block_name, 0, length)) {
  406. return -1;
  407. }
  408. return 0;
  409. }
  410. /*
  411. * At the end of migration, undo the effects of init_range
  412. * opaque should be the MIS.
  413. */
  414. static int cleanup_range(RAMBlock *rb, void *opaque)
  415. {
  416. const char *block_name = qemu_ram_get_idstr(rb);
  417. void *host_addr = qemu_ram_get_host_addr(rb);
  418. ram_addr_t offset = qemu_ram_get_offset(rb);
  419. ram_addr_t length = qemu_ram_get_used_length(rb);
  420. MigrationIncomingState *mis = opaque;
  421. struct uffdio_range range_struct;
  422. trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
  423. /*
  424. * We turned off hugepage for the precopy stage with postcopy enabled
  425. * we can turn it back on now.
  426. */
  427. qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
  428. /*
  429. * We can also turn off userfault now since we should have all the
  430. * pages. It can be useful to leave it on to debug postcopy
  431. * if you're not sure it's always getting every page.
  432. */
  433. range_struct.start = (uintptr_t)host_addr;
  434. range_struct.len = length;
  435. if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
  436. error_report("%s: userfault unregister %s", __func__, strerror(errno));
  437. return -1;
  438. }
  439. return 0;
  440. }
  441. /*
  442. * Initialise postcopy-ram, setting the RAM to a state where we can go into
  443. * postcopy later; must be called prior to any precopy.
  444. * called from arch_init's similarly named ram_postcopy_incoming_init
  445. */
  446. int postcopy_ram_incoming_init(MigrationIncomingState *mis)
  447. {
  448. if (foreach_not_ignored_block(init_range, NULL)) {
  449. return -1;
  450. }
  451. return 0;
  452. }
  453. /*
  454. * Manage a single vote to the QEMU balloon inhibitor for all postcopy usage,
  455. * last caller wins.
  456. */
  457. static void postcopy_balloon_inhibit(bool state)
  458. {
  459. static bool cur_state = false;
  460. if (state != cur_state) {
  461. qemu_balloon_inhibit(state);
  462. cur_state = state;
  463. }
  464. }
  465. /*
  466. * At the end of a migration where postcopy_ram_incoming_init was called.
  467. */
  468. int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
  469. {
  470. trace_postcopy_ram_incoming_cleanup_entry();
  471. if (mis->have_fault_thread) {
  472. Error *local_err = NULL;
  473. /* Let the fault thread quit */
  474. atomic_set(&mis->fault_thread_quit, 1);
  475. postcopy_fault_thread_notify(mis);
  476. trace_postcopy_ram_incoming_cleanup_join();
  477. qemu_thread_join(&mis->fault_thread);
  478. if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
  479. error_report_err(local_err);
  480. return -1;
  481. }
  482. if (foreach_not_ignored_block(cleanup_range, mis)) {
  483. return -1;
  484. }
  485. trace_postcopy_ram_incoming_cleanup_closeuf();
  486. close(mis->userfault_fd);
  487. close(mis->userfault_event_fd);
  488. mis->have_fault_thread = false;
  489. }
  490. postcopy_balloon_inhibit(false);
  491. if (enable_mlock) {
  492. if (os_mlock() < 0) {
  493. error_report("mlock: %s", strerror(errno));
  494. /*
  495. * It doesn't feel right to fail at this point, we have a valid
  496. * VM state.
  497. */
  498. }
  499. }
  500. if (mis->postcopy_tmp_page) {
  501. munmap(mis->postcopy_tmp_page, mis->largest_page_size);
  502. mis->postcopy_tmp_page = NULL;
  503. }
  504. if (mis->postcopy_tmp_zero_page) {
  505. munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
  506. mis->postcopy_tmp_zero_page = NULL;
  507. }
  508. trace_postcopy_ram_incoming_cleanup_blocktime(
  509. get_postcopy_total_blocktime());
  510. trace_postcopy_ram_incoming_cleanup_exit();
  511. return 0;
  512. }
  513. /*
  514. * Disable huge pages on an area
  515. */
  516. static int nhp_range(RAMBlock *rb, void *opaque)
  517. {
  518. const char *block_name = qemu_ram_get_idstr(rb);
  519. void *host_addr = qemu_ram_get_host_addr(rb);
  520. ram_addr_t offset = qemu_ram_get_offset(rb);
  521. ram_addr_t length = qemu_ram_get_used_length(rb);
  522. trace_postcopy_nhp_range(block_name, host_addr, offset, length);
  523. /*
  524. * Before we do discards we need to ensure those discards really
  525. * do delete areas of the page, even if THP thinks a hugepage would
  526. * be a good idea, so force hugepages off.
  527. */
  528. qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
  529. return 0;
  530. }
  531. /*
  532. * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard
  533. * however leaving it until after precopy means that most of the precopy
  534. * data is still THPd
  535. */
  536. int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
  537. {
  538. if (foreach_not_ignored_block(nhp_range, mis)) {
  539. return -1;
  540. }
  541. postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
  542. return 0;
  543. }
  544. /*
  545. * Mark the given RAMBlock so that accesses to pages that have not yet
  546. * been received raise userfault notifications.
  547. * Used as a callback on foreach_not_ignored_block.
  548. *
  549. * rb: the RAMBlock to register
  550. * opaque: MigrationIncomingState pointer
  551. * Returns 0 on success
  552. */
  553. static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
  554. {
  555. MigrationIncomingState *mis = opaque;
  556. struct uffdio_register reg_struct;
  557. reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
  558. reg_struct.range.len = qemu_ram_get_used_length(rb);
  559. reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
  560. /* Now tell our userfault_fd that it's responsible for this area */
  561. if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
  562. error_report("%s userfault register: %s", __func__, strerror(errno));
  563. return -1;
  564. }
  565. if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
  566. error_report("%s userfault: Region doesn't support COPY", __func__);
  567. return -1;
  568. }
  569. if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
  570. qemu_ram_set_uf_zeroable(rb);
  571. }
  572. return 0;
  573. }
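/* Wake any thread waiting on the page containing client_addr on the given
 * shared userfaultfd. Returns the UFFDIO_WAKE ioctl result (0 on success). */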
  574. int postcopy_wake_shared(struct PostCopyFD *pcfd,
  575. uint64_t client_addr,
  576. RAMBlock *rb)
  577. {
  578. size_t pagesize = qemu_ram_pagesize(rb);
  579. struct uffdio_range range;
  580. int ret;
  581. trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
  582. range.start = client_addr & ~(pagesize - 1);
  583. range.len = pagesize;
  584. ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
  585. if (ret) {
  586. error_report("%s: Failed to wake: %zx in %s (%s)",
  587. __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
  588. strerror(errno));
  589. }
  590. return ret;
  591. }
  592. /*
  593. * Callback from shared fault handlers to ask for a page,
  594. * the page must be specified by a RAMBlock and an offset in that rb
  595. * Note: Only for use by shared fault handlers (in fault thread)
  596. */
  597. int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
  598. uint64_t client_addr, uint64_t rb_offset)
  599. {
  600. size_t pagesize = qemu_ram_pagesize(rb);
  601. uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
  602. MigrationIncomingState *mis = migration_incoming_get_current();
  603. trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
  604. rb_offset);
  605. if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
  606. trace_postcopy_request_shared_page_present(pcfd->idstr,
  607. qemu_ram_get_idstr(rb), rb_offset);
  608. return postcopy_wake_shared(pcfd, client_addr, rb);
  609. }
  610. if (rb != mis->last_rb) {
  611. mis->last_rb = rb;
  612. migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
  613. aligned_rbo, pagesize);
  614. } else {
  615. /* Save some space */
  616. migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
  617. }
  618. return 0;
  619. }
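/* Map a faulting thread id to its vCPU index, or -1 if it isn't a vCPU thread */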
  620. static int get_mem_fault_cpu_index(uint32_t pid)
  621. {
  622. CPUState *cpu_iter;
  623. CPU_FOREACH(cpu_iter) {
  624. if (cpu_iter->thread_id == pid) {
  625. trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
  626. return cpu_iter->cpu_index;
  627. }
  628. }
  629. trace_get_mem_fault_cpu_index(-1, pid);
  630. return -1;
  631. }
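/* Milliseconds since the blocktime context was created, clamped to a minimum
 * of 1 and truncated to 32 bits */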
  632. static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
  633. {
  634. int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
  635. dc->start_time;
  636. return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
  637. }
  638. /*
  639. * This function is being called when pagefault occurs. It
  640. * tracks down vCPU blocking time.
  641. *
  642. * @addr: faulted host virtual address
  643. * @ptid: faulted process thread id
  644. * @rb: ramblock appropriate to addr
  645. */
  646. static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
  647. RAMBlock *rb)
  648. {
  649. int cpu, already_received;
  650. MigrationIncomingState *mis = migration_incoming_get_current();
  651. PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
  652. uint32_t low_time_offset;
  653. if (!dc || ptid == 0) {
  654. return;
  655. }
  656. cpu = get_mem_fault_cpu_index(ptid);
  657. if (cpu < 0) {
  658. return;
  659. }
  660. low_time_offset = get_low_time_offset(dc);
  661. if (dc->vcpu_addr[cpu] == 0) {
  662. atomic_inc(&dc->smp_cpus_down);
  663. }
  664. atomic_xchg(&dc->last_begin, low_time_offset);
  665. atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
  666. atomic_xchg(&dc->vcpu_addr[cpu], addr);
  667. /*
  668. * Check this here rather than at the beginning of the function,
  669. * because the check could otherwise run earlier than the
  670. * bitmap_set in qemu_ufd_copy_ioctl
  671. */
  672. already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
  673. if (already_received) {
  674. atomic_xchg(&dc->vcpu_addr[cpu], 0);
  675. atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
  676. atomic_dec(&dc->smp_cpus_down);
  677. }
  678. trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
  679. cpu, already_received);
  680. }
  681. /*
  682. * This function provides the calculated blocktime per vCPU and traces it.
  683. * The total blocktime is also calculated here, in mark_postcopy_blocktime_end.
  684. *
  685. *
  686. * Assume we have 3 CPUs
  687. *
  688. * S1 E1 S1 E1
  689. * -----***********------------xxx***************------------------------> CPU1
  690. *
  691. * S2 E2
  692. * ------------****************xxx---------------------------------------> CPU2
  693. *
  694. * S3 E3
  695. * ------------------------****xxx********-------------------------------> CPU3
  696. *
  697. * We have the sequence S1,S2,E1,S3,S1,E2,E3,E1
  698. * S2,E1 - doesn't satisfy the condition, since the sequence S1,S2,E1 does
  699. * not include CPU3
  700. * S3,S1,E2 - this sequence includes all CPUs, so the overlap S1,E2 counts
  701. * as part of the total blocktime. S1 here is last_begin.
  702. * Legend:
  703. * * - means blocktime per vCPU
  704. * x - means overlapped blocktime (total blocktime)
  705. *
  706. * @addr: host virtual address
  707. */
  708. static void mark_postcopy_blocktime_end(uintptr_t addr)
  709. {
  710. MigrationIncomingState *mis = migration_incoming_get_current();
  711. PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
  712. MachineState *ms = MACHINE(qdev_get_machine());
  713. unsigned int smp_cpus = ms->smp.cpus;
  714. int i, affected_cpu = 0;
  715. bool vcpu_total_blocktime = false;
  716. uint32_t read_vcpu_time, low_time_offset;
  717. if (!dc) {
  718. return;
  719. }
  720. low_time_offset = get_low_time_offset(dc);
  721. /* Look up the vCPU(s) blocked on this address, to clear them.
  722. * This linear scan is straightforward but not optimal; a better
  723. * algorithm would keep a tree or hash keyed by address, with a
  724. * list of vCPUs as the value. */
  725. for (i = 0; i < smp_cpus; i++) {
  726. uint32_t vcpu_blocktime = 0;
  727. read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
  728. if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
  729. read_vcpu_time == 0) {
  730. continue;
  731. }
  732. atomic_xchg(&dc->vcpu_addr[i], 0);
  733. vcpu_blocktime = low_time_offset - read_vcpu_time;
  734. affected_cpu += 1;
  735. /* We need to know whether this end-mark is for a page that was
  736. * actually faulted on; the other possibility is a prefetched
  737. * page, in which case we shouldn't be here at all. */
  738. if (!vcpu_total_blocktime &&
  739. atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
  740. vcpu_total_blocktime = true;
  741. }
  742. /* continue the loop, since one page can affect several vCPUs */
  743. dc->vcpu_blocktime[i] += vcpu_blocktime;
  744. }
  745. atomic_sub(&dc->smp_cpus_down, affected_cpu);
  746. if (vcpu_total_blocktime) {
  747. dc->total_blocktime += low_time_offset - atomic_fetch_add(
  748. &dc->last_begin, 0);
  749. }
  750. trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
  751. affected_cpu);
  752. }
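/* Park the fault thread on postcopy_pause_sem_fault until it is kicked
 * (once the connection is re-established); returns true when woken. */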
  753. static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
  754. {
  755. trace_postcopy_pause_fault_thread();
  756. qemu_sem_wait(&mis->postcopy_pause_sem_fault);
  757. trace_postcopy_pause_fault_thread_continued();
  758. return true;
  759. }
  760. /*
  761. * Handle faults detected by the USERFAULT markings
  762. */
  763. static void *postcopy_ram_fault_thread(void *opaque)
  764. {
  765. MigrationIncomingState *mis = opaque;
  766. struct uffd_msg msg;
  767. int ret;
  768. size_t index;
  769. RAMBlock *rb = NULL;
  770. trace_postcopy_ram_fault_thread_entry();
  771. rcu_register_thread();
  772. mis->last_rb = NULL; /* last RAMBlock we sent part of */
  773. qemu_sem_post(&mis->fault_thread_sem);
  774. struct pollfd *pfd;
  775. size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
  776. pfd = g_new0(struct pollfd, pfd_len);
  777. pfd[0].fd = mis->userfault_fd;
  778. pfd[0].events = POLLIN;
  779. pfd[1].fd = mis->userfault_event_fd;
  780. pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
  781. trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
  782. for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
  783. struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
  784. struct PostCopyFD, index);
  785. pfd[2 + index].fd = pcfd->fd;
  786. pfd[2 + index].events = POLLIN;
  787. trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
  788. pcfd->fd);
  789. }
  790. while (true) {
  791. ram_addr_t rb_offset;
  792. int poll_result;
  793. /*
  794. * We're mainly waiting for the kernel to give us a faulting HVA,
  795. * however we can be told to quit via userfault_event_fd, which is
  796. * an eventfd
  797. */
  798. poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
  799. if (poll_result == -1) {
  800. error_report("%s: userfault poll: %s", __func__, strerror(errno));
  801. break;
  802. }
  803. if (!mis->to_src_file) {
  804. /*
  805. * Someone may already have told us via the event that the
  806. * return path is broken. Hold here until the channel is
  807. * rebuilt.
  808. */
  809. if (postcopy_pause_fault_thread(mis)) {
  810. mis->last_rb = NULL;
  811. /* Continue to read the userfaultfd */
  812. } else {
  813. error_report("%s: paused but don't allow to continue",
  814. __func__);
  815. break;
  816. }
  817. }
  818. if (pfd[1].revents) {
  819. uint64_t tmp64 = 0;
  820. /* Consume the signal */
  821. if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
  822. /* Nothing obviously nicer than posting this error. */
  823. error_report("%s: read() failed", __func__);
  824. }
  825. if (atomic_read(&mis->fault_thread_quit)) {
  826. trace_postcopy_ram_fault_thread_quit();
  827. break;
  828. }
  829. }
  830. if (pfd[0].revents) {
  831. poll_result--;
  832. ret = read(mis->userfault_fd, &msg, sizeof(msg));
  833. if (ret != sizeof(msg)) {
  834. if (errno == EAGAIN) {
  835. /*
  836. * if a wake up happens on the other thread just after
  837. * the poll, there is nothing to read.
  838. */
  839. continue;
  840. }
  841. if (ret < 0) {
  842. error_report("%s: Failed to read full userfault "
  843. "message: %s",
  844. __func__, strerror(errno));
  845. break;
  846. } else {
  847. error_report("%s: Read %d bytes from userfaultfd "
  848. "expected %zd",
  849. __func__, ret, sizeof(msg));
  850. break; /* Lost alignment, don't know what we'd read next */
  851. }
  852. }
  853. if (msg.event != UFFD_EVENT_PAGEFAULT) {
  854. error_report("%s: Read unexpected event %ud from userfaultfd",
  855. __func__, msg.event);
  856. continue; /* It's not a page fault, shouldn't happen */
  857. }
  858. rb = qemu_ram_block_from_host(
  859. (void *)(uintptr_t)msg.arg.pagefault.address,
  860. true, &rb_offset);
  861. if (!rb) {
  862. error_report("postcopy_ram_fault_thread: Fault outside guest: %"
  863. PRIx64, (uint64_t)msg.arg.pagefault.address);
  864. break;
  865. }
  866. rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
  867. trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
  868. qemu_ram_get_idstr(rb),
  869. rb_offset,
  870. msg.arg.pagefault.feat.ptid);
  871. mark_postcopy_blocktime_begin(
  872. (uintptr_t)(msg.arg.pagefault.address),
  873. msg.arg.pagefault.feat.ptid, rb);
  874. retry:
  875. /*
  876. * Send the request to the source - we want to request one
  877. * of our host page sizes (which is >= TPS)
  878. */
  879. if (rb != mis->last_rb) {
  880. mis->last_rb = rb;
  881. ret = migrate_send_rp_req_pages(mis,
  882. qemu_ram_get_idstr(rb),
  883. rb_offset,
  884. qemu_ram_pagesize(rb));
  885. } else {
  886. /* Save some space */
  887. ret = migrate_send_rp_req_pages(mis,
  888. NULL,
  889. rb_offset,
  890. qemu_ram_pagesize(rb));
  891. }
  892. if (ret) {
  893. /* May be network failure, try to wait for recovery */
  894. if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
  895. /* We got reconnected somehow, try to continue */
  896. mis->last_rb = NULL;
  897. goto retry;
  898. } else {
  899. /* This is an unavoidable fault */
  900. error_report("%s: migrate_send_rp_req_pages() get %d",
  901. __func__, ret);
  902. break;
  903. }
  904. }
  905. }
  906. /* Now handle any requests from external processes on shared memory */
  907. /* TODO: May need to handle devices deregistering during postcopy */
  908. for (index = 2; index < pfd_len && poll_result; index++) {
  909. if (pfd[index].revents) {
  910. struct PostCopyFD *pcfd =
  911. &g_array_index(mis->postcopy_remote_fds,
  912. struct PostCopyFD, index - 2);
  913. poll_result--;
  914. if (pfd[index].revents & POLLERR) {
  915. error_report("%s: POLLERR on poll %zd fd=%d",
  916. __func__, index, pcfd->fd);
  917. pfd[index].events = 0;
  918. continue;
  919. }
  920. ret = read(pcfd->fd, &msg, sizeof(msg));
  921. if (ret != sizeof(msg)) {
  922. if (errno == EAGAIN) {
  923. /*
  924. * if a wake up happens on the other thread just after
  925. * the poll, there is nothing to read.
  926. */
  927. continue;
  928. }
  929. if (ret < 0) {
  930. error_report("%s: Failed to read full userfault "
  931. "message: %s (shared) revents=%d",
  932. __func__, strerror(errno),
  933. pfd[index].revents);
  934. /*TODO: Could just disable this sharer */
  935. break;
  936. } else {
  937. error_report("%s: Read %d bytes from userfaultfd "
  938. "expected %zd (shared)",
  939. __func__, ret, sizeof(msg));
  940. /*TODO: Could just disable this sharer */
  941. break; /* Lost alignment, don't know what we'd read next */
  942. }
  943. }
  944. if (msg.event != UFFD_EVENT_PAGEFAULT) {
  945. error_report("%s: Read unexpected event %ud "
  946. "from userfaultfd (shared)",
  947. __func__, msg.event);
  948. continue; /* It's not a page fault, shouldn't happen */
  949. }
  950. /* Call the device handler registered with us */
  951. ret = pcfd->handler(pcfd, &msg);
  952. if (ret) {
  953. error_report("%s: Failed to resolve shared fault on %zd/%s",
  954. __func__, index, pcfd->idstr);
  955. /* TODO: Fail? Disable this sharer? */
  956. }
  957. }
  958. }
  959. }
  960. rcu_unregister_thread();
  961. trace_postcopy_ram_fault_thread_exit();
  962. g_free(pfd);
  963. return NULL;
  964. }
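/* Open the userfaultfd, start the fault thread, register all RAM blocks for
 * userfault notification and allocate the temporary pages used to place
 * incoming data. Returns 0 on success, negative on error. */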
  965. int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
  966. {
  967. /* Open the fd for the kernel to give us userfaults */
  968. mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
  969. if (mis->userfault_fd == -1) {
  970. error_report("%s: Failed to open userfault fd: %s", __func__,
  971. strerror(errno));
  972. return -1;
  973. }
  974. /*
  975. * Although the host check already tested the API, we need to
  976. * do the check again as an ABI handshake on the new fd.
  977. */
  978. if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
  979. return -1;
  980. }
  981. /* Now an eventfd we use to tell the fault-thread to quit */
  982. mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
  983. if (mis->userfault_event_fd == -1) {
  984. error_report("%s: Opening userfault_event_fd: %s", __func__,
  985. strerror(errno));
  986. close(mis->userfault_fd);
  987. return -1;
  988. }
  989. qemu_sem_init(&mis->fault_thread_sem, 0);
  990. qemu_thread_create(&mis->fault_thread, "postcopy/fault",
  991. postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
  992. qemu_sem_wait(&mis->fault_thread_sem);
  993. qemu_sem_destroy(&mis->fault_thread_sem);
  994. mis->have_fault_thread = true;
  995. /* Mark so that we get notified of accesses to unwritten areas */
  996. if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
  997. error_report("ram_block_enable_notify failed");
  998. return -1;
  999. }
  1000. mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
  1001. PROT_READ | PROT_WRITE, MAP_PRIVATE |
  1002. MAP_ANONYMOUS, -1, 0);
  1003. if (mis->postcopy_tmp_page == MAP_FAILED) {
  1004. mis->postcopy_tmp_page = NULL;
  1005. error_report("%s: Failed to map postcopy_tmp_page %s",
  1006. __func__, strerror(errno));
  1007. return -1;
  1008. }
  1009. /*
  1010. * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages
  1011. */
  1012. mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
  1013. PROT_READ | PROT_WRITE,
  1014. MAP_PRIVATE | MAP_ANONYMOUS,
  1015. -1, 0);
  1016. if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
  1017. int e = errno;
  1018. mis->postcopy_tmp_zero_page = NULL;
  1019. error_report("%s: Failed to map large zero page %s",
  1020. __func__, strerror(e));
  1021. return -e;
  1022. }
  1023. memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
  1024. /*
  1025. * Ballooning can mark pages as absent while we're postcopying,
  1026. * which would cause false userfaults.
  1027. */
  1028. postcopy_balloon_inhibit(true);
  1029. trace_postcopy_ram_enable_notify();
  1030. return 0;
  1031. }
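/* Atomically place a page with UFFDIO_COPY, or with UFFDIO_ZEROPAGE when
 * from_addr is NULL; on success, update the receive bitmap and the blocktime
 * accounting. Returns the ioctl result. */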
  1032. static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
  1033. void *from_addr, uint64_t pagesize, RAMBlock *rb)
  1034. {
  1035. int ret;
  1036. if (from_addr) {
  1037. struct uffdio_copy copy_struct;
  1038. copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
  1039. copy_struct.src = (uint64_t)(uintptr_t)from_addr;
  1040. copy_struct.len = pagesize;
  1041. copy_struct.mode = 0;
  1042. ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
  1043. } else {
  1044. struct uffdio_zeropage zero_struct;
  1045. zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
  1046. zero_struct.range.len = pagesize;
  1047. zero_struct.mode = 0;
  1048. ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
  1049. }
  1050. if (!ret) {
  1051. ramblock_recv_bitmap_set_range(rb, host_addr,
  1052. pagesize / qemu_target_page_size());
  1053. mark_postcopy_blocktime_end((uintptr_t)host_addr);
  1054. }
  1055. return ret;
  1056. }
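/* Notify every registered shared-memory user that the page at 'offset' in
 * 'rb' has arrived, so they can wake their clients. Returns 0 on success or
 * the first waker's error. */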
  1057. int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
  1058. {
  1059. int i;
  1060. MigrationIncomingState *mis = migration_incoming_get_current();
  1061. GArray *pcrfds = mis->postcopy_remote_fds;
  1062. for (i = 0; i < pcrfds->len; i++) {
  1063. struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
  1064. int ret = cur->waker(cur, rb, offset);
  1065. if (ret) {
  1066. return ret;
  1067. }
  1068. }
  1069. return 0;
  1070. }
  1071. /*
  1072. * Place a host page (from) at (host) atomically
  1073. * returns 0 on success
  1074. */
  1075. int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
  1076. RAMBlock *rb)
  1077. {
  1078. size_t pagesize = qemu_ram_pagesize(rb);
  1079. /* The copy also acts as an ack to the kernel, waking up the stalled thread.
  1080. * TODO: We can inhibit that ack and only do it if it was requested
  1081. * which would be slightly cheaper, but we'd have to be careful
  1082. * of the order of updating our page state.
  1083. */
  1084. if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
  1085. int e = errno;
  1086. error_report("%s: %s copy host: %p from: %p (size: %zd)",
  1087. __func__, strerror(e), host, from, pagesize);
  1088. return -e;
  1089. }
  1090. trace_postcopy_place_page(host);
  1091. return postcopy_notify_shared_wake(rb,
  1092. qemu_ram_block_host_offset(rb, host));
  1093. }
  1094. /*
  1095. * Place a zero page at (host) atomically
  1096. * returns 0 on success
  1097. */
  1098. int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
  1099. RAMBlock *rb)
  1100. {
  1101. size_t pagesize = qemu_ram_pagesize(rb);
  1102. trace_postcopy_place_page_zero(host);
  1103. /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
  1104. * but it's not available for everything (e.g. hugetlbpages)
  1105. */
  1106. if (qemu_ram_is_uf_zeroable(rb)) {
  1107. if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
  1108. int e = errno;
  1109. error_report("%s: %s zero host: %p",
  1110. __func__, strerror(e), host);
  1111. return -e;
  1112. }
  1113. return postcopy_notify_shared_wake(rb,
  1114. qemu_ram_block_host_offset(rb,
  1115. host));
  1116. } else {
  1117. return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
  1118. }
  1119. }
  1120. #else
  1121. /* No target OS support, stubs just fail */
  1122. void fill_destination_postcopy_migration_info(MigrationInfo *info)
  1123. {
  1124. }
  1125. bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
  1126. {
  1127. error_report("%s: No OS support", __func__);
  1128. return false;
  1129. }
  1130. int postcopy_ram_incoming_init(MigrationIncomingState *mis)
  1131. {
  1132. error_report("postcopy_ram_incoming_init: No OS support");
  1133. return -1;
  1134. }
  1135. int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
  1136. {
  1137. assert(0);
  1138. return -1;
  1139. }
  1140. int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
  1141. {
  1142. assert(0);
  1143. return -1;
  1144. }
  1145. int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
  1146. uint64_t client_addr, uint64_t rb_offset)
  1147. {
  1148. assert(0);
  1149. return -1;
  1150. }
  1151. int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
  1152. {
  1153. assert(0);
  1154. return -1;
  1155. }
  1156. int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
  1157. RAMBlock *rb)
  1158. {
  1159. assert(0);
  1160. return -1;
  1161. }
  1162. int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
  1163. RAMBlock *rb)
  1164. {
  1165. assert(0);
  1166. return -1;
  1167. }
  1168. int postcopy_wake_shared(struct PostCopyFD *pcfd,
  1169. uint64_t client_addr,
  1170. RAMBlock *rb)
  1171. {
  1172. assert(0);
  1173. return -1;
  1174. }
  1175. #endif
  1176. /* ------------------------------------------------------------------------- */
  1177. void postcopy_fault_thread_notify(MigrationIncomingState *mis)
  1178. {
  1179. uint64_t tmp64 = 1;
  1180. /*
  1181. * Wakeup the fault_thread. It's an eventfd that should currently
  1182. * be at 0; we're going to increment it to 1
  1183. */
  1184. if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
  1185. /* Not much we can do here, but may as well report it */
  1186. error_report("%s: incrementing failed: %s", __func__,
  1187. strerror(errno));
  1188. }
  1189. }
  1190. /**
  1191. * postcopy_discard_send_init: Called at the start of each RAMBlock before
  1192. * asking to discard individual ranges.
  1193. *
  1194. * @ms: The current migration state.
  1196. * @name: RAMBlock that discards will operate on.
  1197. */
  1198. static PostcopyDiscardState pds = {0};
  1199. void postcopy_discard_send_init(MigrationState *ms, const char *name)
  1200. {
  1201. pds.ramblock_name = name;
  1202. pds.cur_entry = 0;
  1203. pds.nsentwords = 0;
  1204. pds.nsentcmds = 0;
  1205. }
  1206. /**
  1207. * postcopy_discard_send_range: Called by the bitmap code for each chunk to
  1208. * discard. May send a discard message, may just leave it queued to
  1209. * be sent later.
  1210. *
  1211. * @ms: Current migration state.
  1212. * @start,@length: a range of pages in the migration bitmap in the
  1213. * RAM block passed to postcopy_discard_send_init() (length=1 is one page)
  1214. */
  1215. void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
  1216. unsigned long length)
  1217. {
  1218. size_t tp_size = qemu_target_page_size();
  1219. /* Convert to byte offsets within the RAM block */
  1220. pds.start_list[pds.cur_entry] = start * tp_size;
  1221. pds.length_list[pds.cur_entry] = length * tp_size;
  1222. trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
  1223. pds.cur_entry++;
  1224. pds.nsentwords++;
  1225. if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
  1226. /* Full set, ship it! */
  1227. qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
  1228. pds.ramblock_name,
  1229. pds.cur_entry,
  1230. pds.start_list,
  1231. pds.length_list);
  1232. pds.nsentcmds++;
  1233. pds.cur_entry = 0;
  1234. }
  1235. }
  1236. /**
  1237. * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
  1238. * bitmap code. Sends any outstanding discard messages.
  1239. *
  1240. * @ms: Current migration state.
  1241. */
  1242. void postcopy_discard_send_finish(MigrationState *ms)
  1243. {
  1244. /* Anything unsent? */
  1245. if (pds.cur_entry) {
  1246. qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
  1247. pds.ramblock_name,
  1248. pds.cur_entry,
  1249. pds.start_list,
  1250. pds.length_list);
  1251. pds.nsentcmds++;
  1252. }
  1253. trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
  1254. pds.nsentcmds);
  1255. }
  1256. /*
  1257. * Current state of incoming postcopy; note this is not part of
  1258. * MigrationIncomingState since its state is used during cleanup
  1259. * at the end, as the MIS is being freed.
  1260. */
  1261. static PostcopyState incoming_postcopy_state;
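/* Fetch the current incoming postcopy state (with a memory barrier) */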
  1262. PostcopyState postcopy_state_get(void)
  1263. {
  1264. return atomic_mb_read(&incoming_postcopy_state);
  1265. }
  1266. /* Set the state and return the old state */
  1267. PostcopyState postcopy_state_set(PostcopyState new_state)
  1268. {
  1269. return atomic_xchg(&incoming_postcopy_state, new_state);
  1270. }
  1271. /* Register a handler for external shared memory postcopy
  1272. * called on the destination.
  1273. */
  1274. void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
  1275. {
  1276. MigrationIncomingState *mis = migration_incoming_get_current();
  1277. mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
  1278. *pcfd);
  1279. }
  1280. /* Unregister a handler for external shared memory postcopy
  1281. */
  1282. void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
  1283. {
  1284. guint i;
  1285. MigrationIncomingState *mis = migration_incoming_get_current();
  1286. GArray *pcrfds = mis->postcopy_remote_fds;
  1287. for (i = 0; i < pcrfds->len; i++) {
  1288. struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
  1289. if (cur->fd == pcfd->fd) {
  1290. mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
  1291. return;
  1292. }
  1293. }
  1294. }