postcopy-ram.c
  1. /*
  2. * Postcopy migration for RAM
  3. *
  4. * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
  5. *
  6. * Authors:
  7. * Dave Gilbert <dgilbert@redhat.com>
  8. *
  9. * This work is licensed under the terms of the GNU GPL, version 2 or later.
  10. * See the COPYING file in the top-level directory.
  11. *
  12. */
  13. /*
  14. * Postcopy is a migration technique where the execution flips from the
  15. * source to the destination before all the data has been copied.
  16. */
  17. #include "qemu/osdep.h"
  18. #include "qemu/madvise.h"
  19. #include "exec/target_page.h"
  20. #include "migration.h"
  21. #include "qemu-file.h"
  22. #include "savevm.h"
  23. #include "postcopy-ram.h"
  24. #include "ram.h"
  25. #include "qapi/error.h"
  26. #include "qemu/notify.h"
  27. #include "qemu/rcu.h"
  28. #include "system/system.h"
  29. #include "qemu/error-report.h"
  30. #include "trace.h"
  31. #include "hw/boards.h"
  32. #include "exec/ramblock.h"
  33. #include "socket.h"
  34. #include "yank_functions.h"
  35. #include "tls.h"
  36. #include "qemu/userfaultfd.h"
  37. #include "qemu/mmap-alloc.h"
  38. #include "options.h"
  39. /* Arbitrary limit on the number of entries in each discard command;
  40. * keeps each command around ~200 bytes
  41. */
  42. #define MAX_DISCARDS_PER_COMMAND 12
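/*
 * (Editor's note, illustrative arithmetic, not in the original file.)
 * Each entry carries a 64-bit start and a 64-bit length, so the payload of a
 * full command is 12 * 16 = 192 bytes; with the command header on top, that
 * is presumably where the "~200 bytes" figure above comes from.
 */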
  43. typedef struct PostcopyDiscardState {
  44. const char *ramblock_name;
  45. uint16_t cur_entry;
  46. /*
  47. * Start and length of a discard range (bytes)
  48. */
  49. uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
  50. uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
  51. unsigned int nsentwords;
  52. unsigned int nsentcmds;
  53. } PostcopyDiscardState;
  54. static NotifierWithReturnList postcopy_notifier_list;
  55. void postcopy_infrastructure_init(void)
  56. {
  57. notifier_with_return_list_init(&postcopy_notifier_list);
  58. }
  59. void postcopy_add_notifier(NotifierWithReturn *nn)
  60. {
  61. notifier_with_return_list_add(&postcopy_notifier_list, nn);
  62. }
  63. void postcopy_remove_notifier(NotifierWithReturn *n)
  64. {
  65. notifier_with_return_remove(n);
  66. }
  67. int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
  68. {
  69. struct PostcopyNotifyData pnd;
  70. pnd.reason = reason;
  71. return notifier_with_return_list_notify(&postcopy_notifier_list,
  72. &pnd, errp);
  73. }
  74. /*
  75. * NOTE: this routine is not thread safe, we can't call it concurrently. But it
  76. * should be good enough for migration's purposes.
  77. */
  78. void postcopy_thread_create(MigrationIncomingState *mis,
  79. QemuThread *thread, const char *name,
  80. void *(*fn)(void *), int joinable)
  81. {
  82. qemu_sem_init(&mis->thread_sync_sem, 0);
  83. qemu_thread_create(thread, name, fn, mis, joinable);
  84. qemu_sem_wait(&mis->thread_sync_sem);
  85. qemu_sem_destroy(&mis->thread_sync_sem);
  86. }
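/*
 * (Editor's note: illustrative sketch, not part of the original file.)
 * The implicit contract of postcopy_thread_create() is that the thread
 * function must post mis->thread_sync_sem once its early setup is done, so
 * the creator above can safely destroy the semaphore.  A minimal thread body
 * following that contract, mirroring postcopy_ram_fault_thread() and
 * postcopy_preempt_thread() below:
 */
#if 0 /* sketch only, never compiled */
static void *example_postcopy_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;

    rcu_register_thread();
    /* ... per-thread setup goes here ... */
    qemu_sem_post(&mis->thread_sync_sem);   /* unblocks postcopy_thread_create() */

    /* ... main loop ... */

    rcu_unregister_thread();
    return NULL;
}
#endif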
  87. /* Postcopy needs to detect accesses to pages that haven't yet been copied
  88. * across, and to map new pages in efficiently; the techniques for doing
  89. * this are target-OS specific.
  90. */
  91. #if defined(__linux__)
  92. #include <poll.h>
  93. #include <sys/ioctl.h>
  94. #include <sys/syscall.h>
  95. #endif
  96. #if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
  97. #include <sys/eventfd.h>
  98. #include <linux/userfaultfd.h>
  99. typedef struct PostcopyBlocktimeContext {
  100. /* time when page fault initiated per vCPU */
  101. uint32_t *page_fault_vcpu_time;
  102. /* page address per vCPU */
  103. uintptr_t *vcpu_addr;
  104. uint32_t total_blocktime;
  105. /* blocktime per vCPU */
  106. uint32_t *vcpu_blocktime;
  107. /* point in time when last page fault was initiated */
  108. uint32_t last_begin;
  109. /* number of vCPUs currently suspended on a page fault */
  110. int smp_cpus_down;
  111. uint64_t start_time;
  112. /*
  113. * Handler for exit event, necessary for
  114. * releasing whole blocktime_ctx
  115. */
  116. Notifier exit_notifier;
  117. } PostcopyBlocktimeContext;
  118. static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
  119. {
  120. g_free(ctx->page_fault_vcpu_time);
  121. g_free(ctx->vcpu_addr);
  122. g_free(ctx->vcpu_blocktime);
  123. g_free(ctx);
  124. }
  125. static void migration_exit_cb(Notifier *n, void *data)
  126. {
  127. PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
  128. exit_notifier);
  129. destroy_blocktime_context(ctx);
  130. }
  131. static struct PostcopyBlocktimeContext *blocktime_context_new(void)
  132. {
  133. MachineState *ms = MACHINE(qdev_get_machine());
  134. unsigned int smp_cpus = ms->smp.cpus;
  135. PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
  136. ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
  137. ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
  138. ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
  139. ctx->exit_notifier.notify = migration_exit_cb;
  140. ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
  141. qemu_add_exit_notifier(&ctx->exit_notifier);
  142. return ctx;
  143. }
  144. static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
  145. {
  146. MachineState *ms = MACHINE(qdev_get_machine());
  147. uint32List *list = NULL;
  148. int i;
  149. for (i = ms->smp.cpus - 1; i >= 0; i--) {
  150. QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]);
  151. }
  152. return list;
  153. }
  154. /*
  155. * Populate MigrationInfo from postcopy's blocktime context.
  156. * Does nothing unless the postcopy-blocktime capability was set
  157. * (i.e. unless a blocktime context exists).
  158. *
  159. * @info: pointer to MigrationInfo to populate
  160. */
  161. void fill_destination_postcopy_migration_info(MigrationInfo *info)
  162. {
  163. MigrationIncomingState *mis = migration_incoming_get_current();
  164. PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
  165. if (!bc) {
  166. return;
  167. }
  168. info->has_postcopy_blocktime = true;
  169. info->postcopy_blocktime = bc->total_blocktime;
  170. info->has_postcopy_vcpu_blocktime = true;
  171. info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
  172. }
  173. static uint32_t get_postcopy_total_blocktime(void)
  174. {
  175. MigrationIncomingState *mis = migration_incoming_get_current();
  176. PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
  177. if (!bc) {
  178. return 0;
  179. }
  180. return bc->total_blocktime;
  181. }
  182. /**
  183. * receive_ufd_features: query the userfault fd features, so that only
  184. * supported features are requested later on.
  185. *
  186. * Returns: true on success
  187. *
  188. * __NR_userfaultfd availability must have been checked beforehand
  189. * @features: out parameter; on success it contains the uffdio_api.features
  190. * reported by the kernel
  191. */
  192. static bool receive_ufd_features(uint64_t *features)
  193. {
  194. struct uffdio_api api_struct = {0};
  195. int ufd;
  196. bool ret = true;
  197. ufd = uffd_open(O_CLOEXEC);
  198. if (ufd == -1) {
  199. error_report("%s: uffd_open() failed: %s", __func__, strerror(errno));
  200. return false;
  201. }
  202. /* ask features */
  203. api_struct.api = UFFD_API;
  204. api_struct.features = 0;
  205. if (ioctl(ufd, UFFDIO_API, &api_struct)) {
  206. error_report("%s: UFFDIO_API failed: %s", __func__,
  207. strerror(errno));
  208. ret = false;
  209. goto release_ufd;
  210. }
  211. *features = api_struct.features;
  212. release_ufd:
  213. close(ufd);
  214. return ret;
  215. }
  216. /**
  217. * request_ufd_features: must be called only once on a newly opened ufd;
  218. * subsequent calls will fail with an error.
  219. *
  220. * Returns: true on success
  221. *
  222. * @ufd: fd obtained from the userfaultfd syscall
  223. * @features: feature bit mask, see UFFD_API_FEATURES
  224. */
  225. static bool request_ufd_features(int ufd, uint64_t features)
  226. {
  227. struct uffdio_api api_struct = {0};
  228. uint64_t ioctl_mask;
  229. api_struct.api = UFFD_API;
  230. api_struct.features = features;
  231. if (ioctl(ufd, UFFDIO_API, &api_struct)) {
  232. error_report("%s failed: UFFDIO_API failed: %s", __func__,
  233. strerror(errno));
  234. return false;
  235. }
  236. ioctl_mask = 1ULL << _UFFDIO_REGISTER |
  237. 1ULL << _UFFDIO_UNREGISTER;
  238. if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
  239. error_report("Missing userfault features: %" PRIx64,
  240. (uint64_t)(~api_struct.ioctls & ioctl_mask));
  241. return false;
  242. }
  243. return true;
  244. }
  245. static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis,
  246. Error **errp)
  247. {
  248. ERRP_GUARD();
  249. uint64_t asked_features = 0;
  250. static uint64_t supported_features;
  251. /*
  252. * It is not possible to request UFFD_API twice on the same fd,
  253. * and the set of userfault fd features is persistent, so query
  254. * the supported features only once.
  255. */
  256. if (!supported_features) {
  257. if (!receive_ufd_features(&supported_features)) {
  258. error_setg(errp, "Userfault feature detection failed");
  259. return false;
  260. }
  261. }
  262. #ifdef UFFD_FEATURE_THREAD_ID
  263. if (UFFD_FEATURE_THREAD_ID & supported_features) {
  264. asked_features |= UFFD_FEATURE_THREAD_ID;
  265. if (migrate_postcopy_blocktime()) {
  266. if (!mis->blocktime_ctx) {
  267. mis->blocktime_ctx = blocktime_context_new();
  268. }
  269. }
  270. }
  271. #endif
  272. /*
  273. * Request the features even if asked_features is 0: the kernel
  274. * expects a UFFD_API handshake before UFFDIO_REGISTER on each
  275. * userfault file descriptor.
  276. */
  277. if (!request_ufd_features(ufd, asked_features)) {
  278. error_setg(errp, "Failed features %" PRIu64, asked_features);
  279. return false;
  280. }
  281. if (qemu_real_host_page_size() != ram_pagesize_summary()) {
  282. bool have_hp = false;
  283. /* We've got a huge page */
  284. #ifdef UFFD_FEATURE_MISSING_HUGETLBFS
  285. have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
  286. #endif
  287. if (!have_hp) {
  288. error_setg(errp,
  289. "Userfault on this host does not support huge pages");
  290. return false;
  291. }
  292. }
  293. return true;
  294. }
  295. /* Callback from postcopy_ram_supported_by_host block iterator.
  296. */
  297. static int test_ramblock_postcopiable(RAMBlock *rb, Error **errp)
  298. {
  299. const char *block_name = qemu_ram_get_idstr(rb);
  300. ram_addr_t length = qemu_ram_get_used_length(rb);
  301. size_t pagesize = qemu_ram_pagesize(rb);
  302. QemuFsType fs;
  303. if (length % pagesize) {
  304. error_setg(errp,
  305. "Postcopy requires RAM blocks to be a page size multiple,"
  306. " block %s is 0x" RAM_ADDR_FMT " bytes with a "
  307. "page size of 0x%zx", block_name, length, pagesize);
  308. return 1;
  309. }
  310. if (rb->fd >= 0) {
  311. fs = qemu_fd_getfs(rb->fd);
  312. if (fs != QEMU_FS_TYPE_TMPFS && fs != QEMU_FS_TYPE_HUGETLBFS) {
  313. error_setg(errp,
  314. "Host backend files need to be TMPFS or HUGETLBFS only");
  315. return 1;
  316. }
  317. }
  318. return 0;
  319. }
  320. /*
  321. * Note: this has the side effect of munlock'ing all of RAM; that is
  322. * normally fine, since if the postcopy succeeds mlock gets turned back
  323. * on at the end.
  324. */
  325. bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp)
  326. {
  327. ERRP_GUARD();
  328. long pagesize = qemu_real_host_page_size();
  329. int ufd = -1;
  330. bool ret = false; /* Error unless we change it */
  331. void *testarea = NULL;
  332. struct uffdio_register reg_struct;
  333. struct uffdio_range range_struct;
  334. uint64_t feature_mask;
  335. RAMBlock *block;
  336. if (qemu_target_page_size() > pagesize) {
  337. error_setg(errp, "Target page size bigger than host page size");
  338. goto out;
  339. }
  340. ufd = uffd_open(O_CLOEXEC);
  341. if (ufd == -1) {
  342. error_setg(errp, "Userfaultfd not available: %s", strerror(errno));
  343. goto out;
  344. }
  345. /* Give devices a chance to object */
  346. if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, errp)) {
  347. goto out;
  348. }
  349. /* Version and features check */
  350. if (!ufd_check_and_apply(ufd, mis, errp)) {
  351. goto out;
  352. }
  353. /*
  354. * We don't support postcopy with some types of ramblocks.
  355. *
  356. * NOTE: we deliberately ignore migrate_ram_is_ignored() and instead check
  357. * all possible ramblocks. This is because this function can be called
  358. * while the migration object is being created, a phase during which
  359. * RAM_MIGRATABLE is not yet properly set for all the ramblocks.
  360. *
  361. * A side effect of this is that we also check RAM_SHARED ramblocks
  362. * even if migrate_ignore_shared() is set (in which case we never
  363. * migrate RAM_SHARED at all); normally this shouldn't matter in
  364. * practice, or we can revisit it.
  365. */
  366. RAMBLOCK_FOREACH(block) {
  367. if (test_ramblock_postcopiable(block, errp)) {
  368. goto out;
  369. }
  370. }
  371. /*
  372. * userfault and mlock don't go together; we'll put it back later if
  373. * it was enabled.
  374. */
  375. if (munlockall()) {
  376. error_setg(errp, "munlockall() failed: %s", strerror(errno));
  377. goto out;
  378. }
  379. /*
  380. * We need to check that the ops we need are supported on anon memory
  381. * To do that we need to register a chunk and see the flags that
  382. * are returned.
  383. */
  384. testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
  385. MAP_ANONYMOUS, -1, 0);
  386. if (testarea == MAP_FAILED) {
  387. error_setg(errp, "Failed to map test area: %s", strerror(errno));
  388. goto out;
  389. }
  390. g_assert(QEMU_PTR_IS_ALIGNED(testarea, pagesize));
  391. reg_struct.range.start = (uintptr_t)testarea;
  392. reg_struct.range.len = pagesize;
  393. reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
  394. if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
  395. error_setg(errp, "UFFDIO_REGISTER failed: %s", strerror(errno));
  396. goto out;
  397. }
  398. range_struct.start = (uintptr_t)testarea;
  399. range_struct.len = pagesize;
  400. if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
  401. error_setg(errp, "UFFDIO_UNREGISTER failed: %s", strerror(errno));
  402. goto out;
  403. }
  404. feature_mask = 1ULL << _UFFDIO_WAKE |
  405. 1ULL << _UFFDIO_COPY |
  406. 1ULL << _UFFDIO_ZEROPAGE;
  407. if ((reg_struct.ioctls & feature_mask) != feature_mask) {
  408. error_setg(errp, "Missing userfault map features: %" PRIx64,
  409. (uint64_t)(~reg_struct.ioctls & feature_mask));
  410. goto out;
  411. }
  412. /* Success! */
  413. ret = true;
  414. out:
  415. if (testarea) {
  416. munmap(testarea, pagesize);
  417. }
  418. if (ufd != -1) {
  419. close(ufd);
  420. }
  421. return ret;
  422. }
  423. /*
  424. * Setup an area of RAM so that it *can* be used for postcopy later; this
  425. * must be done right at the start prior to pre-copy.
  426. * opaque should be the MIS.
  427. */
  428. static int init_range(RAMBlock *rb, void *opaque)
  429. {
  430. const char *block_name = qemu_ram_get_idstr(rb);
  431. void *host_addr = qemu_ram_get_host_addr(rb);
  432. ram_addr_t offset = qemu_ram_get_offset(rb);
  433. ram_addr_t length = qemu_ram_get_used_length(rb);
  434. trace_postcopy_init_range(block_name, host_addr, offset, length);
  435. /*
  436. * Save the used_length before running the guest. In case we have to
  437. * resize RAM blocks when syncing RAM block sizes from the source during
  438. * precopy, we'll update it manually via the ram block notifier.
  439. */
  440. rb->postcopy_length = length;
  441. /*
  442. * We need the whole of RAM to be truly empty for postcopy, so things
  443. * like ROMs and any data tables built during init must be zero'd
  444. * - we're going to get the copy from the source anyway.
  445. * (Precopy will just overwrite this data, so doesn't need the discard)
  446. */
  447. if (ram_discard_range(block_name, 0, length)) {
  448. return -1;
  449. }
  450. return 0;
  451. }
  452. /*
  453. * At the end of migration, undo the effects of init_range
  454. * opaque should be the MIS.
  455. */
  456. static int cleanup_range(RAMBlock *rb, void *opaque)
  457. {
  458. const char *block_name = qemu_ram_get_idstr(rb);
  459. void *host_addr = qemu_ram_get_host_addr(rb);
  460. ram_addr_t offset = qemu_ram_get_offset(rb);
  461. ram_addr_t length = rb->postcopy_length;
  462. MigrationIncomingState *mis = opaque;
  463. struct uffdio_range range_struct;
  464. trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
  465. /*
  466. * We turned off hugepage for the precopy stage with postcopy enabled
  467. * we can turn it back on now.
  468. */
  469. qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
  470. /*
  471. * We can also turn off userfault now since we should have all the
  472. * pages. It can be useful to leave it on to debug postcopy
  473. * if you're not sure it's always getting every page.
  474. */
  475. range_struct.start = (uintptr_t)host_addr;
  476. range_struct.len = length;
  477. if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
  478. error_report("%s: userfault unregister %s", __func__, strerror(errno));
  479. return -1;
  480. }
  481. return 0;
  482. }
  483. /*
  484. * Initialise postcopy-ram, setting the RAM to a state where we can go into
  485. * postcopy later; must be called prior to any precopy.
  486. * called from arch_init's similarly named ram_postcopy_incoming_init
  487. */
  488. int postcopy_ram_incoming_init(MigrationIncomingState *mis)
  489. {
  490. if (foreach_not_ignored_block(init_range, NULL)) {
  491. return -1;
  492. }
  493. return 0;
  494. }
  495. static void postcopy_temp_pages_cleanup(MigrationIncomingState *mis)
  496. {
  497. int i;
  498. if (mis->postcopy_tmp_pages) {
  499. for (i = 0; i < mis->postcopy_channels; i++) {
  500. if (mis->postcopy_tmp_pages[i].tmp_huge_page) {
  501. munmap(mis->postcopy_tmp_pages[i].tmp_huge_page,
  502. mis->largest_page_size);
  503. mis->postcopy_tmp_pages[i].tmp_huge_page = NULL;
  504. }
  505. }
  506. g_free(mis->postcopy_tmp_pages);
  507. mis->postcopy_tmp_pages = NULL;
  508. }
  509. if (mis->postcopy_tmp_zero_page) {
  510. munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
  511. mis->postcopy_tmp_zero_page = NULL;
  512. }
  513. }
  514. /*
  515. * At the end of a migration where postcopy_ram_incoming_init was called.
  516. */
  517. int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
  518. {
  519. trace_postcopy_ram_incoming_cleanup_entry();
  520. if (mis->preempt_thread_status == PREEMPT_THREAD_CREATED) {
  521. /* Notify the fast load thread to quit */
  522. mis->preempt_thread_status = PREEMPT_THREAD_QUIT;
  523. /*
  524. * Update preempt_thread_status before reading the count. Note: taking
  525. * the mutex only provides ACQUIRE semantics, which doesn't stop this
  526. * write from being reordered after the read of the count.
  527. */
  528. smp_mb();
  529. /*
  530. * It's possible that the preempt thread is still handling the last
  531. * pages to arrive which were requested by guest page faults.
  532. * Make sure nothing is left behind by waiting on the condvar if
  533. * that unlikely case happens.
  534. */
  535. WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
  536. if (qatomic_read(&mis->page_requested_count)) {
  537. /*
  538. * It is guaranteed to receive a signal later, because the
  539. * count>0 now, so it's destined to be decreased to zero
  540. * very soon by the preempt thread.
  541. */
  542. qemu_cond_wait(&mis->page_request_cond,
  543. &mis->page_request_mutex);
  544. }
  545. }
  546. /* Notify the fast load thread to quit */
  547. if (mis->postcopy_qemufile_dst) {
  548. qemu_file_shutdown(mis->postcopy_qemufile_dst);
  549. }
  550. qemu_thread_join(&mis->postcopy_prio_thread);
  551. mis->preempt_thread_status = PREEMPT_THREAD_NONE;
  552. }
  553. if (mis->have_fault_thread) {
  554. Error *local_err = NULL;
  555. /* Let the fault thread quit */
  556. qatomic_set(&mis->fault_thread_quit, 1);
  557. postcopy_fault_thread_notify(mis);
  558. trace_postcopy_ram_incoming_cleanup_join();
  559. qemu_thread_join(&mis->fault_thread);
  560. if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
  561. error_report_err(local_err);
  562. return -1;
  563. }
  564. if (foreach_not_ignored_block(cleanup_range, mis)) {
  565. return -1;
  566. }
  567. trace_postcopy_ram_incoming_cleanup_closeuf();
  568. close(mis->userfault_fd);
  569. close(mis->userfault_event_fd);
  570. mis->have_fault_thread = false;
  571. }
  572. if (should_mlock(mlock_state)) {
  573. if (os_mlock(is_mlock_on_fault(mlock_state)) < 0) {
  574. error_report("mlock: %s", strerror(errno));
  575. /*
  576. * It doesn't feel right to fail at this point, we have a valid
  577. * VM state.
  578. */
  579. }
  580. }
  581. postcopy_temp_pages_cleanup(mis);
  582. trace_postcopy_ram_incoming_cleanup_blocktime(
  583. get_postcopy_total_blocktime());
  584. trace_postcopy_ram_incoming_cleanup_exit();
  585. return 0;
  586. }
  587. /*
  588. * Disable huge pages on an area
  589. */
  590. static int nhp_range(RAMBlock *rb, void *opaque)
  591. {
  592. const char *block_name = qemu_ram_get_idstr(rb);
  593. void *host_addr = qemu_ram_get_host_addr(rb);
  594. ram_addr_t offset = qemu_ram_get_offset(rb);
  595. ram_addr_t length = rb->postcopy_length;
  596. trace_postcopy_nhp_range(block_name, host_addr, offset, length);
  597. /*
  598. * Before we do discards we need to ensure those discards really
  599. * do delete areas of the page, even if THP thinks a hugepage would
  600. * be a good idea, so force hugepages off.
  601. */
  602. qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
  603. return 0;
  604. }
  605. /*
  606. * Userfault requires us to mark RAM as NOHUGEPAGE prior to discarding;
  607. * however, leaving that until after precopy means that most of the
  608. * precopy data is still THP-backed.
  609. */
  610. int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
  611. {
  612. if (foreach_not_ignored_block(nhp_range, mis)) {
  613. return -1;
  614. }
  615. postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
  616. return 0;
  617. }
  618. /*
  619. * Mark the given RAMBlock so that accesses to not-yet-received pages
  620. * trigger userfault notifications (i.e. register it with userfaultfd).
  621. * Used as a callback on foreach_not_ignored_block.
  622. *
  623. * rb: the RAMBlock to mark
  624. * opaque: MigrationIncomingState pointer
  625. * Returns 0 on success
  626. */
  627. static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
  628. {
  629. MigrationIncomingState *mis = opaque;
  630. struct uffdio_register reg_struct;
  631. reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
  632. reg_struct.range.len = rb->postcopy_length;
  633. reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
  634. /* Now tell our userfault_fd that it's responsible for this area */
  635. if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
  636. error_report("%s userfault register: %s", __func__, strerror(errno));
  637. return -1;
  638. }
  639. if (!(reg_struct.ioctls & (1ULL << _UFFDIO_COPY))) {
  640. error_report("%s userfault: Region doesn't support COPY", __func__);
  641. return -1;
  642. }
  643. if (reg_struct.ioctls & (1ULL << _UFFDIO_ZEROPAGE)) {
  644. qemu_ram_set_uf_zeroable(rb);
  645. }
  646. return 0;
  647. }
  648. int postcopy_wake_shared(struct PostCopyFD *pcfd,
  649. uint64_t client_addr,
  650. RAMBlock *rb)
  651. {
  652. size_t pagesize = qemu_ram_pagesize(rb);
  653. trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
  654. return uffd_wakeup(pcfd->fd,
  655. (void *)(uintptr_t)ROUND_DOWN(client_addr, pagesize),
  656. pagesize);
  657. }
  658. static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
  659. ram_addr_t start, uint64_t haddr)
  660. {
  661. void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
  662. /*
  663. * Discarded pages (via RamDiscardManager) are never migrated. On unlikely
  664. * access, place a zeropage, which will also set the relevant bits in the
  665. * recv_bitmap accordingly, so we won't try placing a zeropage twice.
  666. *
  667. * Checking a single bit is sufficient to handle pagesize > TPS as either
  668. * all relevant bits are set or not.
  669. */
  670. assert(QEMU_IS_ALIGNED(start, qemu_ram_pagesize(rb)));
  671. if (ramblock_page_is_discarded(rb, start)) {
  672. bool received = ramblock_recv_bitmap_test_byte_offset(rb, start);
  673. return received ? 0 : postcopy_place_page_zero(mis, aligned, rb);
  674. }
  675. return migrate_send_rp_req_pages(mis, rb, start, haddr);
  676. }
  677. /*
  678. * Callback from shared fault handlers to ask for a page,
  679. * the page must be specified by a RAMBlock and an offset in that rb
  680. * Note: Only for use by shared fault handlers (in fault thread)
  681. */
  682. int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
  683. uint64_t client_addr, uint64_t rb_offset)
  684. {
  685. uint64_t aligned_rbo = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
  686. MigrationIncomingState *mis = migration_incoming_get_current();
  687. trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
  688. rb_offset);
  689. if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
  690. trace_postcopy_request_shared_page_present(pcfd->idstr,
  691. qemu_ram_get_idstr(rb), rb_offset);
  692. return postcopy_wake_shared(pcfd, client_addr, rb);
  693. }
  694. postcopy_request_page(mis, rb, aligned_rbo, client_addr);
  695. return 0;
  696. }
  697. static int get_mem_fault_cpu_index(uint32_t pid)
  698. {
  699. CPUState *cpu_iter;
  700. CPU_FOREACH(cpu_iter) {
  701. if (cpu_iter->thread_id == pid) {
  702. trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
  703. return cpu_iter->cpu_index;
  704. }
  705. }
  706. trace_get_mem_fault_cpu_index(-1, pid);
  707. return -1;
  708. }
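/*
 * (Editor's note.)  get_low_time_offset() below returns the milliseconds
 * elapsed since blocktime tracking started, truncated to 32 bits and clamped
 * to a minimum of 1, so that a stored value of 0 can be used to mean
 * "no fault recorded" for a vCPU.
 */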
  709. static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
  710. {
  711. int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
  712. dc->start_time;
  713. return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
  714. }
  715. /*
  716. * This function is called when a page fault occurs. It tracks
  717. * how long vCPUs are blocked waiting for pages.
  718. *
  719. * @addr: faulted host virtual address
  720. * @ptid: faulted process thread id
  721. * @rb: ramblock appropriate to addr
  722. */
  723. static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
  724. RAMBlock *rb)
  725. {
  726. int cpu, already_received;
  727. MigrationIncomingState *mis = migration_incoming_get_current();
  728. PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
  729. uint32_t low_time_offset;
  730. if (!dc || ptid == 0) {
  731. return;
  732. }
  733. cpu = get_mem_fault_cpu_index(ptid);
  734. if (cpu < 0) {
  735. return;
  736. }
  737. low_time_offset = get_low_time_offset(dc);
  738. if (dc->vcpu_addr[cpu] == 0) {
  739. qatomic_inc(&dc->smp_cpus_down);
  740. }
  741. qatomic_xchg(&dc->last_begin, low_time_offset);
  742. qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
  743. qatomic_xchg(&dc->vcpu_addr[cpu], addr);
  744. /*
  745. * Check this here rather than at the beginning of the function,
  746. * because otherwise the check could happen earlier than the
  747. * bitmap_set in qemu_ufd_copy_ioctl
  748. */
  749. already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
  750. if (already_received) {
  751. qatomic_xchg(&dc->vcpu_addr[cpu], 0);
  752. qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
  753. qatomic_dec(&dc->smp_cpus_down);
  754. }
  755. trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
  756. cpu, already_received);
  757. }
  758. /*
  759. * This function just provides the calculated per-vCPU blocktime and traces it.
  760. * Total blocktime is calculated in mark_postcopy_blocktime_end.
  761. *
  762. *
  763. * Assume we have 3 CPUs
  764. *
  765. * S1 E1 S1 E1
  766. * -----***********------------xxx***************------------------------> CPU1
  767. *
  768. * S2 E2
  769. * ------------****************xxx---------------------------------------> CPU2
  770. *
  771. * S3 E3
  772. * ------------------------****xxx********-------------------------------> CPU3
  773. *
  774. * We have the sequence S1,S2,E1,S3,S1,E2,E3,E1
  775. * S2,E1 - doesn't match the condition, because the sequence S1,S2,E1 doesn't include CPU3
  776. * S3,S1,E2 - the sequence includes all CPUs, so in this case the overlap is S1,E2 -
  777. * it's part of the total blocktime.
  778. * S1 - here is last_begin
  779. * Legend of the picture:
  780. * * - blocktime per vCPU
  781. * x - overlapped blocktime (total blocktime)
  782. *
  783. * @addr: host virtual address
  784. */
  785. static void mark_postcopy_blocktime_end(uintptr_t addr)
  786. {
  787. MigrationIncomingState *mis = migration_incoming_get_current();
  788. PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
  789. MachineState *ms = MACHINE(qdev_get_machine());
  790. unsigned int smp_cpus = ms->smp.cpus;
  791. int i, affected_cpu = 0;
  792. bool vcpu_total_blocktime = false;
  793. uint32_t read_vcpu_time, low_time_offset;
  794. if (!dc) {
  795. return;
  796. }
  797. low_time_offset = get_low_time_offset(dc);
  798. /* Look up the vCPU(s) blocked on this address so we can clear them.
  799. * This linear scan is straightforward but not optimal; a better
  800. * algorithm would keep a tree or hash keyed by address, with a
  801. * list of vCPUs as the value. */
  802. for (i = 0; i < smp_cpus; i++) {
  803. uint32_t vcpu_blocktime = 0;
  804. read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
  805. if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
  806. read_vcpu_time == 0) {
  807. continue;
  808. }
  809. qatomic_xchg(&dc->vcpu_addr[i], 0);
  810. vcpu_blocktime = low_time_offset - read_vcpu_time;
  811. affected_cpu += 1;
  812. /* We need to know whether this call was due to a faulted page;
  813. * the other possibility is a prefetched page, in which case
  814. * we shouldn't get here */
  815. if (!vcpu_total_blocktime &&
  816. qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
  817. vcpu_total_blocktime = true;
  818. }
  819. /* continue the loop, since one page can affect several vCPUs */
  820. dc->vcpu_blocktime[i] += vcpu_blocktime;
  821. }
  822. qatomic_sub(&dc->smp_cpus_down, affected_cpu);
  823. if (vcpu_total_blocktime) {
  824. dc->total_blocktime += low_time_offset - qatomic_fetch_add(
  825. &dc->last_begin, 0);
  826. }
  827. trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
  828. affected_cpu);
  829. }
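/*
 * (Editor's note, illustrative worked example of the accounting above.)
 * With 2 vCPUs: vCPU0 faults at t=10ms and vCPU1 at t=20ms, so last_begin
 * becomes 20 and smp_cpus_down becomes 2.  The page unblocking vCPU1 arrives
 * at t=50ms: vcpu_blocktime[1] += 50 - 20 = 30, and because every vCPU was
 * down, total_blocktime += 50 - last_begin = 30.  The page for vCPU0 arrives
 * at t=60ms: vcpu_blocktime[0] += 60 - 10 = 50, but total_blocktime is not
 * bumped again, since not all vCPUs were blocked by then.
 */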
  830. static void postcopy_pause_fault_thread(MigrationIncomingState *mis)
  831. {
  832. trace_postcopy_pause_fault_thread();
  833. qemu_sem_wait(&mis->postcopy_pause_sem_fault);
  834. trace_postcopy_pause_fault_thread_continued();
  835. }
  836. /*
  837. * Handle faults detected by the USERFAULT markings
  838. */
  839. static void *postcopy_ram_fault_thread(void *opaque)
  840. {
  841. MigrationIncomingState *mis = opaque;
  842. struct uffd_msg msg;
  843. int ret;
  844. size_t index;
  845. RAMBlock *rb = NULL;
  846. trace_postcopy_ram_fault_thread_entry();
  847. rcu_register_thread();
  848. mis->last_rb = NULL; /* last RAMBlock we sent part of */
  849. qemu_sem_post(&mis->thread_sync_sem);
  850. struct pollfd *pfd;
  851. size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
  852. pfd = g_new0(struct pollfd, pfd_len);
  853. pfd[0].fd = mis->userfault_fd;
  854. pfd[0].events = POLLIN;
  855. pfd[1].fd = mis->userfault_event_fd;
  856. pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
  857. trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
  858. for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
  859. struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
  860. struct PostCopyFD, index);
  861. pfd[2 + index].fd = pcfd->fd;
  862. pfd[2 + index].events = POLLIN;
  863. trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
  864. pcfd->fd);
  865. }
  866. while (true) {
  867. ram_addr_t rb_offset;
  868. int poll_result;
  869. /*
  870. * We're mainly waiting for the kernel to give us a faulting HVA;
  871. * however, we can be told to quit via userfault_event_fd, which is
  872. * an eventfd
  873. */
  874. poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
  875. if (poll_result == -1) {
  876. error_report("%s: userfault poll: %s", __func__, strerror(errno));
  877. break;
  878. }
  879. if (!mis->to_src_file) {
  880. /*
  881. * Possibly someone has told us via the event that the return
  882. * path is already broken. We should hold here until the
  883. * channel is rebuilt.
  884. */
  885. postcopy_pause_fault_thread(mis);
  886. }
  887. if (pfd[1].revents) {
  888. uint64_t tmp64 = 0;
  889. /* Consume the signal */
  890. if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
  891. /* Nothing obviously nicer than posting this error. */
  892. error_report("%s: read() failed", __func__);
  893. }
  894. if (qatomic_read(&mis->fault_thread_quit)) {
  895. trace_postcopy_ram_fault_thread_quit();
  896. break;
  897. }
  898. }
  899. if (pfd[0].revents) {
  900. poll_result--;
  901. ret = read(mis->userfault_fd, &msg, sizeof(msg));
  902. if (ret != sizeof(msg)) {
  903. if (errno == EAGAIN) {
  904. /*
  905. * if a wake up happens on the other thread just after
  906. * the poll, there is nothing to read.
  907. */
  908. continue;
  909. }
  910. if (ret < 0) {
  911. error_report("%s: Failed to read full userfault "
  912. "message: %s",
  913. __func__, strerror(errno));
  914. break;
  915. } else {
  916. error_report("%s: Read %d bytes from userfaultfd "
  917. "expected %zd",
  918. __func__, ret, sizeof(msg));
  919. break; /* Lost alignment, don't know what we'd read next */
  920. }
  921. }
  922. if (msg.event != UFFD_EVENT_PAGEFAULT) {
  923. error_report("%s: Read unexpected event %ud from userfaultfd",
  924. __func__, msg.event);
  925. continue; /* It's not a page fault, shouldn't happen */
  926. }
  927. rb = qemu_ram_block_from_host(
  928. (void *)(uintptr_t)msg.arg.pagefault.address,
  929. true, &rb_offset);
  930. if (!rb) {
  931. error_report("postcopy_ram_fault_thread: Fault outside guest: %"
  932. PRIx64, (uint64_t)msg.arg.pagefault.address);
  933. break;
  934. }
  935. rb_offset = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
  936. trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
  937. qemu_ram_get_idstr(rb),
  938. rb_offset,
  939. msg.arg.pagefault.feat.ptid);
  940. mark_postcopy_blocktime_begin(
  941. (uintptr_t)(msg.arg.pagefault.address),
  942. msg.arg.pagefault.feat.ptid, rb);
  943. retry:
  944. /*
  945. * Send the request to the source - we want to request one
  946. * of our host page sizes (which is >= TPS)
  947. */
  948. ret = postcopy_request_page(mis, rb, rb_offset,
  949. msg.arg.pagefault.address);
  950. if (ret) {
  951. /* May be network failure, try to wait for recovery */
  952. postcopy_pause_fault_thread(mis);
  953. goto retry;
  954. }
  955. }
  956. /* Now handle any requests from external processes on shared memory */
  957. /* TODO: May need to handle devices deregistering during postcopy */
  958. for (index = 2; index < pfd_len && poll_result; index++) {
  959. if (pfd[index].revents) {
  960. struct PostCopyFD *pcfd =
  961. &g_array_index(mis->postcopy_remote_fds,
  962. struct PostCopyFD, index - 2);
  963. poll_result--;
  964. if (pfd[index].revents & POLLERR) {
  965. error_report("%s: POLLERR on poll %zd fd=%d",
  966. __func__, index, pcfd->fd);
  967. pfd[index].events = 0;
  968. continue;
  969. }
  970. ret = read(pcfd->fd, &msg, sizeof(msg));
  971. if (ret != sizeof(msg)) {
  972. if (errno == EAGAIN) {
  973. /*
  974. * if a wake up happens on the other thread just after
  975. * the poll, there is nothing to read.
  976. */
  977. continue;
  978. }
  979. if (ret < 0) {
  980. error_report("%s: Failed to read full userfault "
  981. "message: %s (shared) revents=%d",
  982. __func__, strerror(errno),
  983. pfd[index].revents);
  984. /*TODO: Could just disable this sharer */
  985. break;
  986. } else {
  987. error_report("%s: Read %d bytes from userfaultfd "
  988. "expected %zd (shared)",
  989. __func__, ret, sizeof(msg));
  990. /*TODO: Could just disable this sharer */
  991. break; /*Lost alignment,don't know what we'd read next*/
  992. }
  993. }
  994. if (msg.event != UFFD_EVENT_PAGEFAULT) {
  995. error_report("%s: Read unexpected event %ud "
  996. "from userfaultfd (shared)",
  997. __func__, msg.event);
  998. continue; /* It's not a page fault, shouldn't happen */
  999. }
  1000. /* Call the device handler registered with us */
  1001. ret = pcfd->handler(pcfd, &msg);
  1002. if (ret) {
  1003. error_report("%s: Failed to resolve shared fault on %zd/%s",
  1004. __func__, index, pcfd->idstr);
  1005. /* TODO: Fail? Disable this sharer? */
  1006. }
  1007. }
  1008. }
  1009. }
  1010. rcu_unregister_thread();
  1011. trace_postcopy_ram_fault_thread_exit();
  1012. g_free(pfd);
  1013. return NULL;
  1014. }
  1015. static int postcopy_temp_pages_setup(MigrationIncomingState *mis)
  1016. {
  1017. PostcopyTmpPage *tmp_page;
  1018. int err, i, channels;
  1019. void *temp_page;
  1020. if (migrate_postcopy_preempt()) {
  1021. /* If preemption enabled, need extra channel for urgent requests */
  1022. mis->postcopy_channels = RAM_CHANNEL_MAX;
  1023. } else {
  1024. /* Both precopy/postcopy on the same channel */
  1025. mis->postcopy_channels = 1;
  1026. }
  1027. channels = mis->postcopy_channels;
  1028. mis->postcopy_tmp_pages = g_malloc0_n(sizeof(PostcopyTmpPage), channels);
  1029. for (i = 0; i < channels; i++) {
  1030. tmp_page = &mis->postcopy_tmp_pages[i];
  1031. temp_page = mmap(NULL, mis->largest_page_size, PROT_READ | PROT_WRITE,
  1032. MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  1033. if (temp_page == MAP_FAILED) {
  1034. err = errno;
  1035. error_report("%s: Failed to map postcopy_tmp_pages[%d]: %s",
  1036. __func__, i, strerror(err));
  1037. /* Clean up will be done later */
  1038. return -err;
  1039. }
  1040. tmp_page->tmp_huge_page = temp_page;
  1041. /* Initialize default states for each tmp page */
  1042. postcopy_temp_page_reset(tmp_page);
  1043. }
  1044. /*
  1045. * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages
  1046. */
  1047. mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
  1048. PROT_READ | PROT_WRITE,
  1049. MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  1050. if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
  1051. err = errno;
  1052. mis->postcopy_tmp_zero_page = NULL;
  1053. error_report("%s: Failed to map large zero page %s",
  1054. __func__, strerror(err));
  1055. return -err;
  1056. }
  1057. memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
  1058. return 0;
  1059. }
  1060. int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
  1061. {
  1062. Error *local_err = NULL;
  1063. /* Open the fd for the kernel to give us userfaults */
  1064. mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK);
  1065. if (mis->userfault_fd == -1) {
  1066. error_report("%s: Failed to open userfault fd: %s", __func__,
  1067. strerror(errno));
  1068. return -1;
  1069. }
  1070. /*
  1071. * Although the host check already tested the API, we need to
  1072. * do the check again as an ABI handshake on the new fd.
  1073. */
  1074. if (!ufd_check_and_apply(mis->userfault_fd, mis, &local_err)) {
  1075. error_report_err(local_err);
  1076. return -1;
  1077. }
  1078. /* Now an eventfd we use to tell the fault-thread to quit */
  1079. mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
  1080. if (mis->userfault_event_fd == -1) {
  1081. error_report("%s: Opening userfault_event_fd: %s", __func__,
  1082. strerror(errno));
  1083. close(mis->userfault_fd);
  1084. return -1;
  1085. }
  1086. postcopy_thread_create(mis, &mis->fault_thread,
  1087. MIGRATION_THREAD_DST_FAULT,
  1088. postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE);
  1089. mis->have_fault_thread = true;
  1090. /* Mark so that we get notified of accesses to unwritten areas */
  1091. if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
  1092. error_report("ram_block_enable_notify failed");
  1093. return -1;
  1094. }
  1095. if (postcopy_temp_pages_setup(mis)) {
  1096. /* Error dumped in the sub-function */
  1097. return -1;
  1098. }
  1099. if (migrate_postcopy_preempt()) {
  1100. /*
  1101. * This thread needs to be created after the temp pages because
  1102. * it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately.
  1103. */
  1104. postcopy_thread_create(mis, &mis->postcopy_prio_thread,
  1105. MIGRATION_THREAD_DST_PREEMPT,
  1106. postcopy_preempt_thread, QEMU_THREAD_JOINABLE);
  1107. mis->preempt_thread_status = PREEMPT_THREAD_CREATED;
  1108. }
  1109. trace_postcopy_ram_enable_notify();
  1110. return 0;
  1111. }
  1112. static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
  1113. void *from_addr, uint64_t pagesize, RAMBlock *rb)
  1114. {
  1115. int userfault_fd = mis->userfault_fd;
  1116. int ret;
  1117. if (from_addr) {
  1118. ret = uffd_copy_page(userfault_fd, host_addr, from_addr, pagesize,
  1119. false);
  1120. } else {
  1121. ret = uffd_zero_page(userfault_fd, host_addr, pagesize, false);
  1122. }
  1123. if (!ret) {
  1124. qemu_mutex_lock(&mis->page_request_mutex);
  1125. ramblock_recv_bitmap_set_range(rb, host_addr,
  1126. pagesize / qemu_target_page_size());
  1127. /*
  1128. * If this page resolves a page fault for a previous recorded faulted
  1129. * address, take a special note to maintain the requested page list.
  1130. */
  1131. if (g_tree_lookup(mis->page_requested, host_addr)) {
  1132. g_tree_remove(mis->page_requested, host_addr);
  1133. int left_pages = qatomic_dec_fetch(&mis->page_requested_count);
  1134. trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
  1135. /* Order the update of count and read of preempt status */
  1136. smp_mb();
  1137. if (mis->preempt_thread_status == PREEMPT_THREAD_QUIT &&
  1138. left_pages == 0) {
  1139. /*
  1140. * This probably means the main thread is waiting for us.
  1141. * Notify that we've finished receiving the last requested
  1142. * page.
  1143. */
  1144. qemu_cond_signal(&mis->page_request_cond);
  1145. }
  1146. }
  1147. qemu_mutex_unlock(&mis->page_request_mutex);
  1148. mark_postcopy_blocktime_end((uintptr_t)host_addr);
  1149. }
  1150. return ret;
  1151. }
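/*
 * (Editor's note: illustrative sketch, not part of the original file.)
 * uffd_copy_page()/uffd_zero_page() used above are QEMU wrappers (cf. the
 * qemu/userfaultfd.h include at the top); at the raw Linux ABI level the
 * equivalent operations look roughly like this, using the structures from
 * <linux/userfaultfd.h> and ioctl() from <sys/ioctl.h>, both already
 * included in this file:
 */
#if 0 /* sketch only, never compiled */
static int example_raw_uffdio_copy(int uffd, void *dst, void *src, size_t len)
{
    struct uffdio_copy copy = {
        .dst = (uintptr_t)dst,  /* page-aligned destination in guest RAM */
        .src = (uintptr_t)src,  /* staging buffer holding the page data */
        .len = len,             /* host (huge)page size */
        .mode = 0,              /* 0 == also wake any thread faulting on the range */
    };

    return ioctl(uffd, UFFDIO_COPY, &copy) ? -errno : 0;
}

static int example_raw_uffdio_zeropage(int uffd, void *dst, size_t len)
{
    struct uffdio_zeropage zero = {
        .range = { .start = (uintptr_t)dst, .len = len },
        .mode = 0,
    };

    return ioctl(uffd, UFFDIO_ZEROPAGE, &zero) ? -errno : 0;
}
#endif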
  1152. int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
  1153. {
  1154. int i;
  1155. MigrationIncomingState *mis = migration_incoming_get_current();
  1156. GArray *pcrfds = mis->postcopy_remote_fds;
  1157. for (i = 0; i < pcrfds->len; i++) {
  1158. struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
  1159. int ret = cur->waker(cur, rb, offset);
  1160. if (ret) {
  1161. return ret;
  1162. }
  1163. }
  1164. return 0;
  1165. }
  1166. /*
  1167. * Place a host page (from) at (host) atomically
  1168. * returns 0 on success
  1169. */
  1170. int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
  1171. RAMBlock *rb)
  1172. {
  1173. size_t pagesize = qemu_ram_pagesize(rb);
  1174. int e;
  1175. /* The copy also acks to the kernel, waking up the stalled thread.
  1176. * TODO: we could inhibit that ack and only do it when it was requested,
  1177. * which would be slightly cheaper, but we'd have to be careful
  1178. * about the order of updating our page state.
  1179. */
  1180. e = qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb);
  1181. if (e) {
  1182. return e;
  1183. }
  1184. trace_postcopy_place_page(host);
  1185. return postcopy_notify_shared_wake(rb,
  1186. qemu_ram_block_host_offset(rb, host));
  1187. }
  1188. /*
  1189. * Place a zero page at (host) atomically
  1190. * returns 0 on success
  1191. */
  1192. int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
  1193. RAMBlock *rb)
  1194. {
  1195. size_t pagesize = qemu_ram_pagesize(rb);
  1196. trace_postcopy_place_page_zero(host);
  1197. /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
  1198. * but it's not available for everything (e.g. hugetlbpages)
  1199. */
  1200. if (qemu_ram_is_uf_zeroable(rb)) {
  1201. int e;
  1202. e = qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb);
  1203. if (e) {
  1204. return e;
  1205. }
  1206. return postcopy_notify_shared_wake(rb,
  1207. qemu_ram_block_host_offset(rb,
  1208. host));
  1209. } else {
  1210. return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
  1211. }
  1212. }
  1213. #else
  1214. /* No target OS support, stubs just fail */
  1215. void fill_destination_postcopy_migration_info(MigrationInfo *info)
  1216. {
  1217. }
  1218. bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp)
  1219. {
  1220. error_report("%s: No OS support", __func__);
  1221. return false;
  1222. }
  1223. int postcopy_ram_incoming_init(MigrationIncomingState *mis)
  1224. {
  1225. error_report("postcopy_ram_incoming_init: No OS support");
  1226. return -1;
  1227. }
  1228. int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
  1229. {
  1230. g_assert_not_reached();
  1231. }
  1232. int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
  1233. {
  1234. g_assert_not_reached();
  1235. }
  1236. int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
  1237. uint64_t client_addr, uint64_t rb_offset)
  1238. {
  1239. g_assert_not_reached();
  1240. }
  1241. int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
  1242. {
  1243. g_assert_not_reached();
  1244. }
  1245. int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
  1246. RAMBlock *rb)
  1247. {
  1248. g_assert_not_reached();
  1249. }
  1250. int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
  1251. RAMBlock *rb)
  1252. {
  1253. g_assert_not_reached();
  1254. }
  1255. int postcopy_wake_shared(struct PostCopyFD *pcfd,
  1256. uint64_t client_addr,
  1257. RAMBlock *rb)
  1258. {
  1259. g_assert_not_reached();
  1260. }
  1261. #endif
  1262. /* ------------------------------------------------------------------------- */
  1263. void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page)
  1264. {
  1265. tmp_page->target_pages = 0;
  1266. tmp_page->host_addr = NULL;
  1267. /*
  1268. * This is set to true on reset, and cleared as soon as we receive any
  1269. * non-zero small page within this huge page.
  1270. */
  1271. tmp_page->all_zero = true;
  1272. }
  1273. void postcopy_fault_thread_notify(MigrationIncomingState *mis)
  1274. {
  1275. uint64_t tmp64 = 1;
  1276. /*
  1277. * Wakeup the fault_thread. It's an eventfd that should currently
  1278. * be at 0, we're going to increment it to 1
  1279. */
  1280. if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
  1281. /* Not much we can do here, but may as well report it */
  1282. error_report("%s: incrementing failed: %s", __func__,
  1283. strerror(errno));
  1284. }
  1285. }
  1286. /**
  1287. * postcopy_discard_send_init: Called at the start of each RAMBlock before
  1288. * asking to discard individual ranges.
  1289. *
  1290. * @ms: The current migration state.
  1292. * @name: RAMBlock that discards will operate on.
  1293. */
  1294. static PostcopyDiscardState pds = {0};
  1295. void postcopy_discard_send_init(MigrationState *ms, const char *name)
  1296. {
  1297. pds.ramblock_name = name;
  1298. pds.cur_entry = 0;
  1299. pds.nsentwords = 0;
  1300. pds.nsentcmds = 0;
  1301. }
  1302. /**
  1303. * postcopy_discard_send_range: Called by the bitmap code for each chunk to
  1304. * discard. May send a discard message, may just leave it queued to
  1305. * be sent later.
  1306. *
  1307. * @ms: Current migration state.
  1308. * @start,@length: a range of pages in the migration bitmap in the
  1309. * RAM block passed to postcopy_discard_send_init() (length=1 is one page)
  1310. */
  1311. void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
  1312. unsigned long length)
  1313. {
  1314. size_t tp_size = qemu_target_page_size();
  1315. /* Convert to byte offsets within the RAM block */
  1316. pds.start_list[pds.cur_entry] = start * tp_size;
  1317. pds.length_list[pds.cur_entry] = length * tp_size;
  1318. trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
  1319. pds.cur_entry++;
  1320. pds.nsentwords++;
  1321. if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
  1322. /* Full set, ship it! */
  1323. qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
  1324. pds.ramblock_name,
  1325. pds.cur_entry,
  1326. pds.start_list,
  1327. pds.length_list);
  1328. pds.nsentcmds++;
  1329. pds.cur_entry = 0;
  1330. }
  1331. }
  1332. /**
  1333. * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
  1334. * bitmap code. Sends any outstanding discard messages, frees the PDS
  1335. *
  1336. * @ms: Current migration state.
  1337. */
  1338. void postcopy_discard_send_finish(MigrationState *ms)
  1339. {
  1340. /* Anything unsent? */
  1341. if (pds.cur_entry) {
  1342. qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
  1343. pds.ramblock_name,
  1344. pds.cur_entry,
  1345. pds.start_list,
  1346. pds.length_list);
  1347. pds.nsentcmds++;
  1348. }
  1349. trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
  1350. pds.nsentcmds);
  1351. }
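/*
 * (Editor's note, illustrative usage of the three helpers above.)
 * The expected calling sequence from the bitmap code is, per RAMBlock:
 *
 *     postcopy_discard_send_init(ms, block_name);
 *     postcopy_discard_send_range(ms, start, length);   // repeated per run of pages
 *     ...
 *     postcopy_discard_send_finish(ms);                 // flushes anything still queued
 *
 * Ranges are batched up to MAX_DISCARDS_PER_COMMAND per discard command;
 * _finish() sends whatever remains queued.
 */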
  1352. /*
  1353. * Current state of incoming postcopy; note this is not part of
  1354. * MigrationIncomingState since its state is used during cleanup
  1355. * at the end, while the MIS is being freed.
  1356. */
  1357. static PostcopyState incoming_postcopy_state;
  1358. PostcopyState postcopy_state_get(void)
  1359. {
  1360. return qatomic_load_acquire(&incoming_postcopy_state);
  1361. }
  1362. /* Set the state and return the old state */
  1363. PostcopyState postcopy_state_set(PostcopyState new_state)
  1364. {
  1365. return qatomic_xchg(&incoming_postcopy_state, new_state);
  1366. }
  1367. /* Register a handler for external shared memory postcopy
  1368. * called on the destination.
  1369. */
  1370. void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
  1371. {
  1372. MigrationIncomingState *mis = migration_incoming_get_current();
  1373. mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
  1374. *pcfd);
  1375. }
  1376. /* Unregister a handler for external shared memory postcopy
  1377. */
  1378. void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
  1379. {
  1380. guint i;
  1381. MigrationIncomingState *mis = migration_incoming_get_current();
  1382. GArray *pcrfds = mis->postcopy_remote_fds;
  1383. if (!pcrfds) {
  1384. /* migration has already finished and freed the array */
  1385. return;
  1386. }
  1387. for (i = 0; i < pcrfds->len; i++) {
  1388. struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
  1389. if (cur->fd == pcfd->fd) {
  1390. mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
  1391. return;
  1392. }
  1393. }
  1394. }
  1395. void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
  1396. {
  1397. /*
  1398. * The new loading channel has its own threads, so it needs to be
  1399. * set to blocking mode too. That's already the default, but be explicit.
  1400. */
  1401. qemu_file_set_blocking(file, true);
  1402. mis->postcopy_qemufile_dst = file;
  1403. qemu_sem_post(&mis->postcopy_qemufile_dst_done);
  1404. trace_postcopy_preempt_new_channel();
  1405. }
  1406. /*
  1407. * Set up the postcopy preempt channel with the IOC. If ERROR is specified,
  1408. * record the error instead. This helper will free the ERROR if specified.
  1409. */
  1410. static void
  1411. postcopy_preempt_send_channel_done(MigrationState *s,
  1412. QIOChannel *ioc, Error *local_err)
  1413. {
  1414. if (local_err) {
  1415. migrate_set_error(s, local_err);
  1416. error_free(local_err);
  1417. } else {
  1418. migration_ioc_register_yank(ioc);
  1419. s->postcopy_qemufile_src = qemu_file_new_output(ioc);
  1420. trace_postcopy_preempt_new_channel();
  1421. }
  1422. /*
  1423. * Kick the waiter in all cases. The waiter should check upon
  1424. * postcopy_qemufile_src to know whether it failed or not.
  1425. */
  1426. qemu_sem_post(&s->postcopy_qemufile_src_sem);
  1427. }
  1428. static void
  1429. postcopy_preempt_tls_handshake(QIOTask *task, gpointer opaque)
  1430. {
  1431. g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
  1432. MigrationState *s = opaque;
  1433. Error *local_err = NULL;
  1434. qio_task_propagate_error(task, &local_err);
  1435. postcopy_preempt_send_channel_done(s, ioc, local_err);
  1436. }
  1437. static void
  1438. postcopy_preempt_send_channel_new(QIOTask *task, gpointer opaque)
  1439. {
  1440. g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
  1441. MigrationState *s = opaque;
  1442. QIOChannelTLS *tioc;
  1443. Error *local_err = NULL;
  1444. if (qio_task_propagate_error(task, &local_err)) {
  1445. goto out;
  1446. }
  1447. if (migrate_channel_requires_tls_upgrade(ioc)) {
  1448. tioc = migration_tls_client_create(ioc, s->hostname, &local_err);
  1449. if (!tioc) {
  1450. goto out;
  1451. }
  1452. trace_postcopy_preempt_tls_handshake();
  1453. qio_channel_set_name(QIO_CHANNEL(tioc), "migration-tls-preempt");
  1454. qio_channel_tls_handshake(tioc, postcopy_preempt_tls_handshake,
  1455. s, NULL, NULL);
  1456. /* Setup the channel until TLS handshake finished */
  1457. return;
  1458. }
  1459. out:
  1460. /* This handles both good and error cases */
  1461. postcopy_preempt_send_channel_done(s, ioc, local_err);
  1462. }
  1463. /*
  1464. * This function will kick off an async task to establish the preempt
  1465. * channel, and wait until the connection setup completed. Returns 0 if
  1466. * channel established, -1 for error.
  1467. */
  1468. int postcopy_preempt_establish_channel(MigrationState *s)
  1469. {
  1470. /* If preempt not enabled, no need to wait */
  1471. if (!migrate_postcopy_preempt()) {
  1472. return 0;
  1473. }
  1474. /*
  1475. * Kick off async task to establish preempt channel. Only do so with
  1476. * 8.0+ machines, because 7.1/7.2 require the channel to be created in
  1477. * setup phase of migration (even if racy in an unreliable network).
  1478. */
  1479. if (!s->preempt_pre_7_2) {
  1480. postcopy_preempt_setup(s);
  1481. }
  1482. /*
  1483. * We need the postcopy preempt channel to be established before
  1484. * starting doing anything.
  1485. */
  1486. qemu_sem_wait(&s->postcopy_qemufile_src_sem);
  1487. return s->postcopy_qemufile_src ? 0 : -1;
  1488. }
  1489. void postcopy_preempt_setup(MigrationState *s)
  1490. {
  1491. /* Kick an async task to connect */
  1492. socket_send_channel_create(postcopy_preempt_send_channel_new, s);
  1493. }
  1494. static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis)
  1495. {
  1496. trace_postcopy_pause_fast_load();
  1497. qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
  1498. qemu_sem_wait(&mis->postcopy_pause_sem_fast_load);
  1499. qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
  1500. trace_postcopy_pause_fast_load_continued();
  1501. }
  1502. static bool preempt_thread_should_run(MigrationIncomingState *mis)
  1503. {
  1504. return mis->preempt_thread_status != PREEMPT_THREAD_QUIT;
  1505. }
  1506. void *postcopy_preempt_thread(void *opaque)
  1507. {
  1508. MigrationIncomingState *mis = opaque;
  1509. int ret;
  1510. trace_postcopy_preempt_thread_entry();
  1511. rcu_register_thread();
  1512. qemu_sem_post(&mis->thread_sync_sem);
  1513. /*
  1514. * The preempt channel is established in asynchronous way. Wait
  1515. * for its completion.
  1516. */
  1517. qemu_sem_wait(&mis->postcopy_qemufile_dst_done);
  1518. /* Sending RAM_SAVE_FLAG_EOS to terminate this thread */
  1519. qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
  1520. while (preempt_thread_should_run(mis)) {
  1521. ret = ram_load_postcopy(mis->postcopy_qemufile_dst,
  1522. RAM_CHANNEL_POSTCOPY);
  1523. /* If error happened, go into recovery routine */
  1524. if (ret && preempt_thread_should_run(mis)) {
  1525. postcopy_pause_ram_fast_load(mis);
  1526. } else {
  1527. /* We're done */
  1528. break;
  1529. }
  1530. }
  1531. qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
  1532. rcu_unregister_thread();
  1533. trace_postcopy_preempt_thread_exit();
  1534. return NULL;
  1535. }
  1536. bool postcopy_is_paused(MigrationStatus status)
  1537. {
  1538. return status == MIGRATION_STATUS_POSTCOPY_PAUSED ||
  1539. status == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP;
  1540. }