hostmem.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600
  1. /*
  2. * QEMU Host Memory Backend
  3. *
  4. * Copyright (C) 2013-2014 Red Hat Inc
  5. *
  6. * Authors:
  7. * Igor Mammedov <imammedo@redhat.com>
  8. *
  9. * This work is licensed under the terms of the GNU GPL, version 2 or later.
  10. * See the COPYING file in the top-level directory.
  11. */
  12. #include "qemu/osdep.h"
  13. #include "system/hostmem.h"
  14. #include "hw/boards.h"
  15. #include "qapi/error.h"
  16. #include "qapi/qapi-builtin-visit.h"
  17. #include "qapi/visitor.h"
  18. #include "qemu/config-file.h"
  19. #include "qom/object_interfaces.h"
  20. #include "qemu/mmap-alloc.h"
  21. #include "qemu/madvise.h"
  22. #include "qemu/cutils.h"
  23. #include "hw/qdev-core.h"
  24. #ifdef CONFIG_NUMA
  25. #include <numaif.h>
  26. #include <numa.h>
  27. QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_DEFAULT != MPOL_DEFAULT);
  28. /*
  29. * HOST_MEM_POLICY_PREFERRED may either translate to MPOL_PREFERRED or
  30. * MPOL_PREFERRED_MANY, see comments further below.
  31. */
  32. QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_PREFERRED != MPOL_PREFERRED);
  33. QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_BIND != MPOL_BIND);
  34. QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE);
  35. #endif
  36. char *
  37. host_memory_backend_get_name(HostMemoryBackend *backend)
  38. {
  39. if (!backend->use_canonical_path) {
  40. return g_strdup(object_get_canonical_path_component(OBJECT(backend)));
  41. }
  42. return object_get_canonical_path(OBJECT(backend));
  43. }
  44. static void
  45. host_memory_backend_get_size(Object *obj, Visitor *v, const char *name,
  46. void *opaque, Error **errp)
  47. {
  48. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  49. uint64_t value = backend->size;
  50. visit_type_size(v, name, &value, errp);
  51. }
  52. static void
  53. host_memory_backend_set_size(Object *obj, Visitor *v, const char *name,
  54. void *opaque, Error **errp)
  55. {
  56. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  57. uint64_t value;
  58. if (host_memory_backend_mr_inited(backend)) {
  59. error_setg(errp, "cannot change property %s of %s ", name,
  60. object_get_typename(obj));
  61. return;
  62. }
  63. if (!visit_type_size(v, name, &value, errp)) {
  64. return;
  65. }
  66. if (!value) {
  67. error_setg(errp,
  68. "property '%s' of %s doesn't take value '%" PRIu64 "'",
  69. name, object_get_typename(obj), value);
  70. return;
  71. }
  72. backend->size = value;
  73. }
  74. static void
  75. host_memory_backend_get_host_nodes(Object *obj, Visitor *v, const char *name,
  76. void *opaque, Error **errp)
  77. {
  78. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  79. uint16List *host_nodes = NULL;
  80. uint16List **tail = &host_nodes;
  81. unsigned long value;
  82. value = find_first_bit(backend->host_nodes, MAX_NODES);
  83. if (value == MAX_NODES) {
  84. goto ret;
  85. }
  86. QAPI_LIST_APPEND(tail, value);
  87. do {
  88. value = find_next_bit(backend->host_nodes, MAX_NODES, value + 1);
  89. if (value == MAX_NODES) {
  90. break;
  91. }
  92. QAPI_LIST_APPEND(tail, value);
  93. } while (true);
  94. ret:
  95. visit_type_uint16List(v, name, &host_nodes, errp);
  96. qapi_free_uint16List(host_nodes);
  97. }
  98. static void
  99. host_memory_backend_set_host_nodes(Object *obj, Visitor *v, const char *name,
  100. void *opaque, Error **errp)
  101. {
  102. #ifdef CONFIG_NUMA
  103. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  104. uint16List *l, *host_nodes = NULL;
  105. visit_type_uint16List(v, name, &host_nodes, errp);
  106. for (l = host_nodes; l; l = l->next) {
  107. if (l->value >= MAX_NODES) {
  108. error_setg(errp, "Invalid host-nodes value: %d", l->value);
  109. goto out;
  110. }
  111. }
  112. for (l = host_nodes; l; l = l->next) {
  113. bitmap_set(backend->host_nodes, l->value, 1);
  114. }
  115. out:
  116. qapi_free_uint16List(host_nodes);
  117. #else
  118. error_setg(errp, "NUMA node binding are not supported by this QEMU");
  119. #endif
  120. }
  121. static int
  122. host_memory_backend_get_policy(Object *obj, Error **errp G_GNUC_UNUSED)
  123. {
  124. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  125. return backend->policy;
  126. }
  127. static void
  128. host_memory_backend_set_policy(Object *obj, int policy, Error **errp)
  129. {
  130. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  131. backend->policy = policy;
  132. #ifndef CONFIG_NUMA
  133. if (policy != HOST_MEM_POLICY_DEFAULT) {
  134. error_setg(errp, "NUMA policies are not supported by this QEMU");
  135. }
  136. #endif
  137. }
  138. static bool host_memory_backend_get_merge(Object *obj, Error **errp)
  139. {
  140. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  141. return backend->merge;
  142. }
  143. static void host_memory_backend_set_merge(Object *obj, bool value, Error **errp)
  144. {
  145. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  146. if (QEMU_MADV_MERGEABLE == QEMU_MADV_INVALID) {
  147. if (value) {
  148. error_setg(errp, "Memory merging is not supported on this host");
  149. }
  150. assert(!backend->merge);
  151. return;
  152. }
  153. if (host_memory_backend_mr_inited(backend) &&
  154. value != backend->merge) {
  155. void *ptr = memory_region_get_ram_ptr(&backend->mr);
  156. uint64_t sz = memory_region_size(&backend->mr);
  157. qemu_madvise(ptr, sz,
  158. value ? QEMU_MADV_MERGEABLE : QEMU_MADV_UNMERGEABLE);
  159. }
  160. backend->merge = value;
  161. }
  162. static bool host_memory_backend_get_dump(Object *obj, Error **errp)
  163. {
  164. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  165. return backend->dump;
  166. }
  167. static void host_memory_backend_set_dump(Object *obj, bool value, Error **errp)
  168. {
  169. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  170. if (QEMU_MADV_DONTDUMP == QEMU_MADV_INVALID) {
  171. if (!value) {
  172. error_setg(errp, "Dumping guest memory cannot be disabled on this host");
  173. }
  174. assert(backend->dump);
  175. return;
  176. }
  177. if (host_memory_backend_mr_inited(backend) &&
  178. value != backend->dump) {
  179. void *ptr = memory_region_get_ram_ptr(&backend->mr);
  180. uint64_t sz = memory_region_size(&backend->mr);
  181. qemu_madvise(ptr, sz,
  182. value ? QEMU_MADV_DODUMP : QEMU_MADV_DONTDUMP);
  183. }
  184. backend->dump = value;
  185. }
  186. static bool host_memory_backend_get_prealloc(Object *obj, Error **errp)
  187. {
  188. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  189. return backend->prealloc;
  190. }
  191. static void host_memory_backend_set_prealloc(Object *obj, bool value,
  192. Error **errp)
  193. {
  194. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  195. if (!backend->reserve && value) {
  196. error_setg(errp, "'prealloc=on' and 'reserve=off' are incompatible");
  197. return;
  198. }
  199. if (!host_memory_backend_mr_inited(backend)) {
  200. backend->prealloc = value;
  201. return;
  202. }
  203. if (value && !backend->prealloc) {
  204. int fd = memory_region_get_fd(&backend->mr);
  205. void *ptr = memory_region_get_ram_ptr(&backend->mr);
  206. uint64_t sz = memory_region_size(&backend->mr);
  207. if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
  208. backend->prealloc_context, false, errp)) {
  209. return;
  210. }
  211. backend->prealloc = true;
  212. }
  213. }
  214. static void host_memory_backend_get_prealloc_threads(Object *obj, Visitor *v,
  215. const char *name, void *opaque, Error **errp)
  216. {
  217. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  218. visit_type_uint32(v, name, &backend->prealloc_threads, errp);
  219. }
  220. static void host_memory_backend_set_prealloc_threads(Object *obj, Visitor *v,
  221. const char *name, void *opaque, Error **errp)
  222. {
  223. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  224. uint32_t value;
  225. if (!visit_type_uint32(v, name, &value, errp)) {
  226. return;
  227. }
  228. if (value <= 0) {
  229. error_setg(errp, "property '%s' of %s doesn't take value '%d'", name,
  230. object_get_typename(obj), value);
  231. return;
  232. }
  233. backend->prealloc_threads = value;
  234. }
/*
 * Instance init: seed per-backend defaults from machine-wide settings
 * (mem-merge, dump-guest-core, guest_memfd requirement) and default the
 * prealloc thread count to the number of vCPUs.
 */
static void host_memory_backend_init(Object *obj)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    MachineState *machine = MACHINE(qdev_get_machine());

    /* TODO: convert access to globals to compat properties */
    backend->merge = machine_mem_merge(machine);
    backend->dump = machine_dump_guest_core(machine);
    backend->guest_memfd = machine_require_guest_memfd(machine);
    backend->reserve = true;
    backend->prealloc_threads = machine->smp.cpus;
}
/* Instance post-init: apply machine-type compat properties to the object. */
static void host_memory_backend_post_init(Object *obj)
{
    object_apply_compat_props(obj);
}
  250. bool host_memory_backend_mr_inited(HostMemoryBackend *backend)
  251. {
  252. /*
  253. * NOTE: We forbid zero-length memory backend, so here zero means
  254. * "we haven't inited the backend memory region yet".
  255. */
  256. return memory_region_size(&backend->mr) != 0;
  257. }
  258. MemoryRegion *host_memory_backend_get_memory(HostMemoryBackend *backend)
  259. {
  260. return host_memory_backend_mr_inited(backend) ? &backend->mr : NULL;
  261. }
/* Record whether the backend is currently in use by a device. */
void host_memory_backend_set_mapped(HostMemoryBackend *backend, bool mapped)
{
    backend->is_mapped = mapped;
}
/* True if some device currently uses this backend (see set_mapped()). */
bool host_memory_backend_is_mapped(HostMemoryBackend *backend)
{
    return backend->is_mapped;
}
  270. size_t host_memory_backend_pagesize(HostMemoryBackend *memdev)
  271. {
  272. size_t pagesize = qemu_ram_pagesize(memdev->mr.ram_block);
  273. g_assert(pagesize >= qemu_real_host_page_size());
  274. return pagesize;
  275. }
/*
 * UserCreatable::complete handler: allocate the backing memory and apply
 * all configured properties (merge/dump advice, NUMA binding via mbind(),
 * then preallocation).  Order matters: the NUMA policy must be installed
 * before preallocation so pages are faulted in on the right nodes.
 */
static void
host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(uc);
    HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
    void *ptr;
    uint64_t sz;
    size_t pagesize;
    /* Preallocate asynchronously unless late backends are already done. */
    bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);

    if (!bc->alloc) {
        /* Abstract/parent class without an allocator: nothing to do. */
        return;
    }
    if (!bc->alloc(backend, errp)) {
        return;
    }

    ptr = memory_region_get_ram_ptr(&backend->mr);
    sz = memory_region_size(&backend->mr);
    pagesize = qemu_ram_pagesize(backend->mr.ram_block);

    /* Backends that demand alignment need sz to be page-size multiple. */
    if (backend->aligned && !QEMU_IS_ALIGNED(sz, pagesize)) {
        g_autofree char *pagesize_str = size_to_str(pagesize);
        error_setg(errp, "backend '%s' memory size must be multiple of %s",
                   object_get_typename(OBJECT(uc)), pagesize_str);
        return;
    }

    if (backend->merge) {
        qemu_madvise(ptr, sz, QEMU_MADV_MERGEABLE);
    }
    if (!backend->dump) {
        qemu_madvise(ptr, sz, QEMU_MADV_DONTDUMP);
    }
#ifdef CONFIG_NUMA
    unsigned long lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
    /* lastbit == MAX_NODES means maxnode = 0 */
    unsigned long maxnode = (lastbit + 1) % (MAX_NODES + 1);
    /*
     * Ensure policy won't be ignored in case memory is preallocated
     * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so
     * this doesn't catch hugepage case.
     */
    unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
    int mode = backend->policy;

    /* check for invalid host-nodes and policies and give more verbose
     * error messages than mbind(). */
    if (maxnode && backend->policy == MPOL_DEFAULT) {
        error_setg(errp, "host-nodes must be empty for policy default,"
                   " or you should explicitly specify a policy other"
                   " than default");
        return;
    } else if (maxnode == 0 && backend->policy != MPOL_DEFAULT) {
        error_setg(errp, "host-nodes must be set for policy %s",
                   HostMemPolicy_str(backend->policy));
        return;
    }

    /*
     * We can have up to MAX_NODES nodes, but we need to pass maxnode+1
     * as argument to mbind() due to an old Linux bug (feature?) which
     * cuts off the last specified node. This means backend->host_nodes
     * must have MAX_NODES+1 bits available.
     */
    assert(sizeof(backend->host_nodes) >=
           BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
    assert(maxnode <= MAX_NODES);

#ifdef HAVE_NUMA_HAS_PREFERRED_MANY
    if (mode == MPOL_PREFERRED && numa_has_preferred_many() > 0) {
        /*
         * Replace with MPOL_PREFERRED_MANY otherwise the mbind() below
         * silently picks the first node.
         */
        mode = MPOL_PREFERRED_MANY;
    }
#endif

    if (maxnode &&
        mbind(ptr, sz, mode, backend->host_nodes, maxnode + 1, flags)) {
        /* ENOSYS with the default policy is tolerated (no NUMA support). */
        if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
            error_setg_errno(errp, errno,
                             "cannot bind memory to host NUMA nodes");
            return;
        }
    }
#endif
    /*
     * Preallocate memory after the NUMA policy has been instantiated.
     * This is necessary to guarantee memory is allocated with
     * specified NUMA policy in place.
     */
    if (backend->prealloc && !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
                                                ptr, sz,
                                                backend->prealloc_threads,
                                                backend->prealloc_context,
                                                async, errp)) {
        return;
    }
}
  369. static bool
  370. host_memory_backend_can_be_deleted(UserCreatable *uc)
  371. {
  372. if (host_memory_backend_is_mapped(MEMORY_BACKEND(uc))) {
  373. return false;
  374. } else {
  375. return true;
  376. }
  377. }
  378. static bool host_memory_backend_get_share(Object *o, Error **errp)
  379. {
  380. HostMemoryBackend *backend = MEMORY_BACKEND(o);
  381. return backend->share;
  382. }
  383. static void host_memory_backend_set_share(Object *o, bool value, Error **errp)
  384. {
  385. HostMemoryBackend *backend = MEMORY_BACKEND(o);
  386. if (host_memory_backend_mr_inited(backend)) {
  387. error_setg(errp, "cannot change property value");
  388. return;
  389. }
  390. backend->share = value;
  391. }
  392. #ifdef CONFIG_LINUX
  393. static bool host_memory_backend_get_reserve(Object *o, Error **errp)
  394. {
  395. HostMemoryBackend *backend = MEMORY_BACKEND(o);
  396. return backend->reserve;
  397. }
  398. static void host_memory_backend_set_reserve(Object *o, bool value, Error **errp)
  399. {
  400. HostMemoryBackend *backend = MEMORY_BACKEND(o);
  401. if (host_memory_backend_mr_inited(backend)) {
  402. error_setg(errp, "cannot change property value");
  403. return;
  404. }
  405. if (backend->prealloc && !value) {
  406. error_setg(errp, "'prealloc=on' and 'reserve=off' are incompatible");
  407. return;
  408. }
  409. backend->reserve = value;
  410. }
  411. #endif /* CONFIG_LINUX */
  412. static bool
  413. host_memory_backend_get_use_canonical_path(Object *obj, Error **errp)
  414. {
  415. HostMemoryBackend *backend = MEMORY_BACKEND(obj);
  416. return backend->use_canonical_path;
  417. }
/* QOM setter for "x-use-canonical-path-for-ramblock-id"; see class_init. */
static void
host_memory_backend_set_use_canonical_path(Object *obj, bool value,
                                           Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    backend->use_canonical_path = value;
}
/*
 * Class init: wire up the UserCreatable hooks and register every QOM
 * property of the abstract memory-backend base class together with its
 * description.  Getter/setter pairs are defined above in this file.
 */
static void
host_memory_backend_class_init(ObjectClass *oc, void *data)
{
    UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);

    /* Called by object_complete(); performs the actual allocation. */
    ucc->complete = host_memory_backend_memory_complete;
    ucc->can_be_deleted = host_memory_backend_can_be_deleted;

    object_class_property_add_bool(oc, "merge",
        host_memory_backend_get_merge,
        host_memory_backend_set_merge);
    object_class_property_set_description(oc, "merge",
        "Mark memory as mergeable");
    object_class_property_add_bool(oc, "dump",
        host_memory_backend_get_dump,
        host_memory_backend_set_dump);
    object_class_property_set_description(oc, "dump",
        "Set to 'off' to exclude from core dump");
    object_class_property_add_bool(oc, "prealloc",
        host_memory_backend_get_prealloc,
        host_memory_backend_set_prealloc);
    object_class_property_set_description(oc, "prealloc",
        "Preallocate memory");
    object_class_property_add(oc, "prealloc-threads", "int",
        host_memory_backend_get_prealloc_threads,
        host_memory_backend_set_prealloc_threads,
        NULL, NULL);
    object_class_property_set_description(oc, "prealloc-threads",
        "Number of CPU threads to use for prealloc");
    object_class_property_add_link(oc, "prealloc-context",
        TYPE_THREAD_CONTEXT, offsetof(HostMemoryBackend, prealloc_context),
        object_property_allow_set_link, OBJ_PROP_LINK_STRONG);
    object_class_property_set_description(oc, "prealloc-context",
        "Context to use for creating CPU threads for preallocation");
    object_class_property_add(oc, "size", "int",
        host_memory_backend_get_size,
        host_memory_backend_set_size,
        NULL, NULL);
    object_class_property_set_description(oc, "size",
        "Size of the memory region (ex: 500M)");
    object_class_property_add(oc, "host-nodes", "int",
        host_memory_backend_get_host_nodes,
        host_memory_backend_set_host_nodes,
        NULL, NULL);
    object_class_property_set_description(oc, "host-nodes",
        "Binds memory to the list of NUMA host nodes");
    object_class_property_add_enum(oc, "policy", "HostMemPolicy",
        &HostMemPolicy_lookup,
        host_memory_backend_get_policy,
        host_memory_backend_set_policy);
    object_class_property_set_description(oc, "policy",
        "Set the NUMA policy");
    object_class_property_add_bool(oc, "share",
        host_memory_backend_get_share, host_memory_backend_set_share);
    object_class_property_set_description(oc, "share",
        "Mark the memory as private to QEMU or shared");
#ifdef CONFIG_LINUX
    object_class_property_add_bool(oc, "reserve",
        host_memory_backend_get_reserve, host_memory_backend_set_reserve);
    object_class_property_set_description(oc, "reserve",
        "Reserve swap space (or huge pages) if applicable");
#endif /* CONFIG_LINUX */
    /*
     * Do not delete/rename option. This option must be considered stable
     * (as if it didn't have the 'x-' prefix including deprecation period) as
     * long as 4.0 and older machine types exists.
     * Option will be used by upper layers to override (disable) canonical path
     * for ramblock-id set by compat properties on old machine types ( <= 4.0),
     * to keep migration working when backend is used for main RAM with
     * -machine memory-backend= option (main RAM historically used prefix-less
     * ramblock-id).
     */
    object_class_property_add_bool(oc, "x-use-canonical-path-for-ramblock-id",
        host_memory_backend_get_use_canonical_path,
        host_memory_backend_set_use_canonical_path);
}
/*
 * Abstract base type for all host memory backends (memory-backend-ram,
 * -file, -memfd, ...).  Concrete subclasses provide HostMemoryBackendClass
 * ::alloc; instances are user-creatable via -object / object-add.
 */
static const TypeInfo host_memory_backend_info = {
    .name = TYPE_MEMORY_BACKEND,
    .parent = TYPE_OBJECT,
    .abstract = true,
    .class_size = sizeof(HostMemoryBackendClass),
    .class_init = host_memory_backend_class_init,
    .instance_size = sizeof(HostMemoryBackend),
    .instance_init = host_memory_backend_init,
    .instance_post_init = host_memory_backend_post_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_USER_CREATABLE },
        { }
    }
};
/* Register the abstract memory-backend type with the QOM type system. */
static void register_types(void)
{
    type_register_static(&host_memory_backend_info);
}

type_init(register_types);