/*
 * QEMU Host Memory Backend
 *
 * Copyright (C) 2013-2014 Red Hat Inc
 *
 * Authors:
 *   Igor Mammedov <imammedo@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "sysemu/hostmem.h"
#include "hw/boards.h"
#include "qapi/error.h"
#include "qapi/qapi-builtin-visit.h"
#include "qapi/visitor.h"
#include "qemu/config-file.h"
#include "qom/object_interfaces.h"
#include "qemu/mmap-alloc.h"
#include "qemu/madvise.h"

#ifdef CONFIG_NUMA
#include <numaif.h>
#include <numa.h>
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_DEFAULT != MPOL_DEFAULT);
/*
 * HOST_MEM_POLICY_PREFERRED may either translate to MPOL_PREFERRED or
 * MPOL_PREFERRED_MANY, see comments further below.
 */
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_PREFERRED != MPOL_PREFERRED);
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_BIND != MPOL_BIND);
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE);
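/*
 * These build-time checks are what allow backend->policy to be handed to
 * mbind() unmodified in host_memory_backend_memory_complete() below
 * (MPOL_PREFERRED is the one exception, see the comment above and the
 * MPOL_PREFERRED_MANY handling in that function).
 */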
#endif

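/*
 * Return a newly allocated name for @backend: the full canonical QOM path
 * when x-use-canonical-path-for-ramblock-id is set, otherwise just the path
 * component (the object id).  The concrete backends use this when naming
 * their memory regions; callers are expected to g_free() the result.
 */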
char *
host_memory_backend_get_name(HostMemoryBackend *backend)
{
    if (!backend->use_canonical_path) {
        return g_strdup(object_get_canonical_path_component(OBJECT(backend)));
    }

    return object_get_canonical_path(OBJECT(backend));
}

static void
host_memory_backend_get_size(Object *obj, Visitor *v, const char *name,
                             void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint64_t value = backend->size;

    visit_type_size(v, name, &value, errp);
}

static void
host_memory_backend_set_size(Object *obj, Visitor *v, const char *name,
                             void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint64_t value;

    if (host_memory_backend_mr_inited(backend)) {
        error_setg(errp, "cannot change property %s of %s ", name,
                   object_get_typename(obj));
        return;
    }

    if (!visit_type_size(v, name, &value, errp)) {
        return;
    }
    if (!value) {
        error_setg(errp,
                   "property '%s' of %s doesn't take value '%" PRIu64 "'",
                   name, object_get_typename(obj), value);
        return;
    }
    backend->size = value;
}

static void
host_memory_backend_get_host_nodes(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint16List *host_nodes = NULL;
    uint16List **tail = &host_nodes;
    unsigned long value;

    value = find_first_bit(backend->host_nodes, MAX_NODES);
    if (value == MAX_NODES) {
        goto ret;
    }

    QAPI_LIST_APPEND(tail, value);

    do {
        value = find_next_bit(backend->host_nodes, MAX_NODES, value + 1);
        if (value == MAX_NODES) {
            break;
        }

        QAPI_LIST_APPEND(tail, value);
    } while (true);

ret:
    visit_type_uint16List(v, name, &host_nodes, errp);
    qapi_free_uint16List(host_nodes);
}

static void
host_memory_backend_set_host_nodes(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
#ifdef CONFIG_NUMA
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint16List *l, *host_nodes = NULL;

    visit_type_uint16List(v, name, &host_nodes, errp);

    for (l = host_nodes; l; l = l->next) {
        if (l->value >= MAX_NODES) {
            error_setg(errp, "Invalid host-nodes value: %d", l->value);
            goto out;
        }
    }

    for (l = host_nodes; l; l = l->next) {
        bitmap_set(backend->host_nodes, l->value, 1);
    }

out:
    qapi_free_uint16List(host_nodes);
#else
  116. error_setg(errp, "NUMA node binding are not supported by this QEMU");
#endif
}

static int
host_memory_backend_get_policy(Object *obj, Error **errp G_GNUC_UNUSED)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    return backend->policy;
}

static void
host_memory_backend_set_policy(Object *obj, int policy, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    backend->policy = policy;

#ifndef CONFIG_NUMA
    if (policy != HOST_MEM_POLICY_DEFAULT) {
        error_setg(errp, "NUMA policies are not supported by this QEMU");
    }
#endif
}

static bool host_memory_backend_get_merge(Object *obj, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    return backend->merge;
}

static void host_memory_backend_set_merge(Object *obj, bool value, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    if (!host_memory_backend_mr_inited(backend)) {
        backend->merge = value;
        return;
    }

    if (value != backend->merge) {
        void *ptr = memory_region_get_ram_ptr(&backend->mr);
        uint64_t sz = memory_region_size(&backend->mr);

        qemu_madvise(ptr, sz,
                     value ? QEMU_MADV_MERGEABLE : QEMU_MADV_UNMERGEABLE);
        backend->merge = value;
    }
}

static bool host_memory_backend_get_dump(Object *obj, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    return backend->dump;
}

static void host_memory_backend_set_dump(Object *obj, bool value, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    if (!host_memory_backend_mr_inited(backend)) {
        backend->dump = value;
        return;
    }

    if (value != backend->dump) {
        void *ptr = memory_region_get_ram_ptr(&backend->mr);
        uint64_t sz = memory_region_size(&backend->mr);

        qemu_madvise(ptr, sz,
                     value ? QEMU_MADV_DODUMP : QEMU_MADV_DONTDUMP);
        backend->dump = value;
    }
}

static bool host_memory_backend_get_prealloc(Object *obj, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    return backend->prealloc;
}

static void host_memory_backend_set_prealloc(Object *obj, bool value,
                                             Error **errp)
{
    Error *local_err = NULL;
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    if (!backend->reserve && value) {
        error_setg(errp, "'prealloc=on' and 'reserve=off' are incompatible");
        return;
    }

    if (!host_memory_backend_mr_inited(backend)) {
        backend->prealloc = value;
        return;
    }

    if (value && !backend->prealloc) {
        int fd = memory_region_get_fd(&backend->mr);
        void *ptr = memory_region_get_ram_ptr(&backend->mr);
        uint64_t sz = memory_region_size(&backend->mr);

        qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
                          backend->prealloc_context, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        backend->prealloc = true;
    }
}

static void host_memory_backend_get_prealloc_threads(Object *obj, Visitor *v,
    const char *name, void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    visit_type_uint32(v, name, &backend->prealloc_threads, errp);
}

static void host_memory_backend_set_prealloc_threads(Object *obj, Visitor *v,
    const char *name, void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint32_t value;

    if (!visit_type_uint32(v, name, &value, errp)) {
        return;
    }
    if (value <= 0) {
        error_setg(errp, "property '%s' of %s doesn't take value '%d'", name,
                   object_get_typename(obj), value);
        return;
    }
    backend->prealloc_threads = value;
}

static void host_memory_backend_init(Object *obj)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    MachineState *machine = MACHINE(qdev_get_machine());

    /* TODO: convert access to globals to compat properties */
    backend->merge = machine_mem_merge(machine);
    backend->dump = machine_dump_guest_core(machine);
    backend->reserve = true;
    backend->prealloc_threads = machine->smp.cpus;
}

static void host_memory_backend_post_init(Object *obj)
{
    object_apply_compat_props(obj);
}

bool host_memory_backend_mr_inited(HostMemoryBackend *backend)
{
    /*
     * NOTE: We forbid zero-length memory backend, so here zero means
     * "we haven't inited the backend memory region yet".
     */
    return memory_region_size(&backend->mr) != 0;
}

MemoryRegion *host_memory_backend_get_memory(HostMemoryBackend *backend)
{
    return host_memory_backend_mr_inited(backend) ? &backend->mr : NULL;
}

void host_memory_backend_set_mapped(HostMemoryBackend *backend, bool mapped)
{
    backend->is_mapped = mapped;
}

bool host_memory_backend_is_mapped(HostMemoryBackend *backend)
{
    return backend->is_mapped;
}

size_t host_memory_backend_pagesize(HostMemoryBackend *memdev)
{
    size_t pagesize = qemu_ram_pagesize(memdev->mr.ram_block);
    g_assert(pagesize >= qemu_real_host_page_size());
    return pagesize;
}

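/*
 * UserCreatable "complete" hook: asks the concrete backend class to allocate
 * the memory region, then applies the merge/dump madvise hints, the NUMA
 * binding policy (if any), and finally optional preallocation so that pages
 * are faulted in with the chosen policy already in place.
 */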
static void
host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(uc);
    HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
    Error *local_err = NULL;
    void *ptr;
    uint64_t sz;

    if (bc->alloc) {
        bc->alloc(backend, &local_err);
        if (local_err) {
            goto out;
        }

        ptr = memory_region_get_ram_ptr(&backend->mr);
        sz = memory_region_size(&backend->mr);

        if (backend->merge) {
            qemu_madvise(ptr, sz, QEMU_MADV_MERGEABLE);
        }
        if (!backend->dump) {
            qemu_madvise(ptr, sz, QEMU_MADV_DONTDUMP);
        }
#ifdef CONFIG_NUMA
        unsigned long lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
        /* lastbit == MAX_NODES means maxnode = 0 */
        unsigned long maxnode = (lastbit + 1) % (MAX_NODES + 1);
        /* ensure policy won't be ignored in case memory is preallocated
         * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so
         * this doesn't catch hugepage case. */
        unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
        int mode = backend->policy;

        /* check for invalid host-nodes and policies and give more verbose
         * error messages than mbind(). */
        if (maxnode && backend->policy == MPOL_DEFAULT) {
            error_setg(errp, "host-nodes must be empty for policy default,"
                       " or you should explicitly specify a policy other"
                       " than default");
            return;
        } else if (maxnode == 0 && backend->policy != MPOL_DEFAULT) {
            error_setg(errp, "host-nodes must be set for policy %s",
                       HostMemPolicy_str(backend->policy));
            return;
        }

        /* We can have up to MAX_NODES nodes, but we need to pass maxnode+1
         * as argument to mbind() due to an old Linux bug (feature?) which
         * cuts off the last specified node. This means backend->host_nodes
         * must have MAX_NODES+1 bits available.
         */
        assert(sizeof(backend->host_nodes) >=
               BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
        assert(maxnode <= MAX_NODES);

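        /*
         * Illustrative example: with host-nodes=0-1, lastbit is 1 and
         * maxnode is 2, so the mbind() call below passes a nodemask length
         * of maxnode + 1 == 3 and node 1 is not cut off.
         */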
#ifdef HAVE_NUMA_HAS_PREFERRED_MANY
        if (mode == MPOL_PREFERRED && numa_has_preferred_many() > 0) {
            /*
             * Replace with MPOL_PREFERRED_MANY otherwise the mbind() below
             * silently picks the first node.
             */
            mode = MPOL_PREFERRED_MANY;
        }
#endif

        if (maxnode &&
            mbind(ptr, sz, mode, backend->host_nodes, maxnode + 1, flags)) {
            if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
                error_setg_errno(errp, errno,
                                 "cannot bind memory to host NUMA nodes");
                return;
            }
        }
#endif
        /* Preallocate memory after the NUMA policy has been instantiated.
         * This is necessary to guarantee memory is allocated with
         * specified NUMA policy in place.
         */
        if (backend->prealloc) {
            qemu_prealloc_mem(memory_region_get_fd(&backend->mr), ptr, sz,
                              backend->prealloc_threads,
                              backend->prealloc_context, &local_err);
            if (local_err) {
                goto out;
            }
        }
    }
out:
    error_propagate(errp, local_err);
}

static bool
host_memory_backend_can_be_deleted(UserCreatable *uc)
{
    if (host_memory_backend_is_mapped(MEMORY_BACKEND(uc))) {
        return false;
    } else {
        return true;
    }
}

static bool host_memory_backend_get_share(Object *o, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(o);

    return backend->share;
}

static void host_memory_backend_set_share(Object *o, bool value, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(o);

    if (host_memory_backend_mr_inited(backend)) {
        error_setg(errp, "cannot change property value");
        return;
    }
    backend->share = value;
}

#ifdef CONFIG_LINUX
static bool host_memory_backend_get_reserve(Object *o, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(o);

    return backend->reserve;
}

static void host_memory_backend_set_reserve(Object *o, bool value, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(o);

    if (host_memory_backend_mr_inited(backend)) {
        error_setg(errp, "cannot change property value");
        return;
    }
    if (backend->prealloc && !value) {
        error_setg(errp, "'prealloc=on' and 'reserve=off' are incompatible");
        return;
    }
    backend->reserve = value;
}
#endif /* CONFIG_LINUX */

static bool
host_memory_backend_get_use_canonical_path(Object *obj, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    return backend->use_canonical_path;
}

static void
host_memory_backend_set_use_canonical_path(Object *obj, bool value,
                                           Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    backend->use_canonical_path = value;
}

static void
host_memory_backend_class_init(ObjectClass *oc, void *data)
{
    UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);

    ucc->complete = host_memory_backend_memory_complete;
    ucc->can_be_deleted = host_memory_backend_can_be_deleted;

    object_class_property_add_bool(oc, "merge",
        host_memory_backend_get_merge,
        host_memory_backend_set_merge);
    object_class_property_set_description(oc, "merge",
        "Mark memory as mergeable");
    object_class_property_add_bool(oc, "dump",
        host_memory_backend_get_dump,
        host_memory_backend_set_dump);
    object_class_property_set_description(oc, "dump",
        "Set to 'off' to exclude from core dump");
    object_class_property_add_bool(oc, "prealloc",
        host_memory_backend_get_prealloc,
        host_memory_backend_set_prealloc);
    object_class_property_set_description(oc, "prealloc",
        "Preallocate memory");
    object_class_property_add(oc, "prealloc-threads", "int",
        host_memory_backend_get_prealloc_threads,
        host_memory_backend_set_prealloc_threads,
        NULL, NULL);
    object_class_property_set_description(oc, "prealloc-threads",
        "Number of CPU threads to use for prealloc");
    object_class_property_add_link(oc, "prealloc-context",
        TYPE_THREAD_CONTEXT, offsetof(HostMemoryBackend, prealloc_context),
        object_property_allow_set_link, OBJ_PROP_LINK_STRONG);
    object_class_property_set_description(oc, "prealloc-context",
        "Context to use for creating CPU threads for preallocation");
    object_class_property_add(oc, "size", "int",
        host_memory_backend_get_size,
        host_memory_backend_set_size,
        NULL, NULL);
    object_class_property_set_description(oc, "size",
        "Size of the memory region (ex: 500M)");
    object_class_property_add(oc, "host-nodes", "int",
        host_memory_backend_get_host_nodes,
        host_memory_backend_set_host_nodes,
        NULL, NULL);
    object_class_property_set_description(oc, "host-nodes",
        "Binds memory to the list of NUMA host nodes");
    object_class_property_add_enum(oc, "policy", "HostMemPolicy",
        &HostMemPolicy_lookup,
        host_memory_backend_get_policy,
        host_memory_backend_set_policy);
    object_class_property_set_description(oc, "policy",
        "Set the NUMA policy");
    object_class_property_add_bool(oc, "share",
        host_memory_backend_get_share, host_memory_backend_set_share);
    object_class_property_set_description(oc, "share",
        "Mark the memory as private to QEMU or shared");
#ifdef CONFIG_LINUX
    object_class_property_add_bool(oc, "reserve",
        host_memory_backend_get_reserve, host_memory_backend_set_reserve);
    object_class_property_set_description(oc, "reserve",
        "Reserve swap space (or huge pages) if applicable");
#endif /* CONFIG_LINUX */

    /*
     * Do not delete/rename this option.  It must be considered stable
     * (as if it didn't have the 'x-' prefix, including a deprecation period)
     * as long as 4.0 and older machine types exist.
     * The option is used by upper layers to override (disable) the canonical
     * path for the ramblock-id set by compat properties on old machine types
     * (<= 4.0), to keep migration working when the backend is used for main
     * RAM with the -machine memory-backend= option (main RAM historically
     * used a prefix-less ramblock-id).
     */
    object_class_property_add_bool(oc, "x-use-canonical-path-for-ramblock-id",
        host_memory_backend_get_use_canonical_path,
        host_memory_backend_set_use_canonical_path);
}

static const TypeInfo host_memory_backend_info = {
    .name = TYPE_MEMORY_BACKEND,
    .parent = TYPE_OBJECT,
    .abstract = true,
    .class_size = sizeof(HostMemoryBackendClass),
    .class_init = host_memory_backend_class_init,
    .instance_size = sizeof(HostMemoryBackend),
    .instance_init = host_memory_backend_init,
    .instance_post_init = host_memory_backend_post_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_USER_CREATABLE },
        { }
    }
};

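/*
 * TYPE_MEMORY_BACKEND is abstract: concrete backends (e.g. the ram- and
 * file-based ones in the sibling hostmem-*.c files) subclass it and supply
 * bc->alloc.  An illustrative command line exercising the properties
 * registered above:
 *
 *   -object memory-backend-ram,id=mem0,size=4G,policy=bind,host-nodes=0,prealloc=on
 */
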
static void register_types(void)
{
    type_register_static(&host_memory_backend_info);
}

type_init(register_types);