2
0

memory-device.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561
  1. /*
  2. * Memory Device Interface
  3. *
  4. * Copyright ProfitBricks GmbH 2012
  5. * Copyright (C) 2014 Red Hat Inc
  6. * Copyright (c) 2018 Red Hat Inc
  7. *
  8. * This work is licensed under the terms of the GNU GPL, version 2 or later.
  9. * See the COPYING file in the top-level directory.
  10. */
  11. #include "qemu/osdep.h"
  12. #include "qemu/error-report.h"
  13. #include "hw/mem/memory-device.h"
  14. #include "qapi/error.h"
  15. #include "hw/boards.h"
  16. #include "qemu/range.h"
  17. #include "hw/virtio/vhost.h"
  18. #include "system/kvm.h"
  19. #include "exec/address-spaces.h"
  20. #include "trace.h"
  21. static bool memory_device_is_empty(const MemoryDeviceState *md)
  22. {
  23. const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
  24. Error *local_err = NULL;
  25. MemoryRegion *mr;
  26. /* dropping const here is fine as we don't touch the memory region */
  27. mr = mdc->get_memory_region((MemoryDeviceState *)md, &local_err);
  28. if (local_err) {
  29. /* Not empty, we'll report errors later when containing the MR again. */
  30. error_free(local_err);
  31. return false;
  32. }
  33. return !mr;
  34. }
  35. static gint memory_device_addr_sort(gconstpointer a, gconstpointer b)
  36. {
  37. const MemoryDeviceState *md_a = MEMORY_DEVICE(a);
  38. const MemoryDeviceState *md_b = MEMORY_DEVICE(b);
  39. const MemoryDeviceClass *mdc_a = MEMORY_DEVICE_GET_CLASS(a);
  40. const MemoryDeviceClass *mdc_b = MEMORY_DEVICE_GET_CLASS(b);
  41. const uint64_t addr_a = mdc_a->get_addr(md_a);
  42. const uint64_t addr_b = mdc_b->get_addr(md_b);
  43. if (addr_a > addr_b) {
  44. return 1;
  45. } else if (addr_a < addr_b) {
  46. return -1;
  47. }
  48. return 0;
  49. }
  50. static int memory_device_build_list(Object *obj, void *opaque)
  51. {
  52. GSList **list = opaque;
  53. if (object_dynamic_cast(obj, TYPE_MEMORY_DEVICE)) {
  54. DeviceState *dev = DEVICE(obj);
  55. if (dev->realized) { /* only realized memory devices matter */
  56. *list = g_slist_insert_sorted(*list, dev, memory_device_addr_sort);
  57. }
  58. }
  59. object_child_foreach(obj, memory_device_build_list, opaque);
  60. return 0;
  61. }
  62. static unsigned int memory_device_get_memslots(MemoryDeviceState *md)
  63. {
  64. const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
  65. if (mdc->get_memslots) {
  66. return mdc->get_memslots(md);
  67. }
  68. return 1;
  69. }
  70. /*
  71. * Memslots that are reserved by memory devices (required but still reported
  72. * as free from KVM / vhost).
  73. */
  74. static unsigned int get_reserved_memslots(MachineState *ms)
  75. {
  76. if (ms->device_memory->used_memslots >
  77. ms->device_memory->required_memslots) {
  78. /* This is unexpected, and we warned already in the memory notifier. */
  79. return 0;
  80. }
  81. return ms->device_memory->required_memslots -
  82. ms->device_memory->used_memslots;
  83. }
  84. unsigned int memory_devices_get_reserved_memslots(void)
  85. {
  86. if (!current_machine->device_memory) {
  87. return 0;
  88. }
  89. return get_reserved_memslots(current_machine);
  90. }
  91. bool memory_devices_memslot_auto_decision_active(void)
  92. {
  93. if (!current_machine->device_memory) {
  94. return false;
  95. }
  96. return current_machine->device_memory->memslot_auto_decision_active;
  97. }
  98. static unsigned int memory_device_memslot_decision_limit(MachineState *ms,
  99. MemoryRegion *mr)
  100. {
  101. const unsigned int reserved = get_reserved_memslots(ms);
  102. const uint64_t size = memory_region_size(mr);
  103. unsigned int max = vhost_get_max_memslots();
  104. unsigned int free = vhost_get_free_memslots();
  105. uint64_t available_space;
  106. unsigned int memslots;
  107. if (kvm_enabled()) {
  108. max = MIN(max, kvm_get_max_memslots());
  109. free = MIN(free, kvm_get_free_memslots());
  110. }
  111. /*
  112. * If we only have less overall memslots than what we consider reasonable,
  113. * just keep it to a minimum.
  114. */
  115. if (max < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS) {
  116. return 1;
  117. }
  118. /*
  119. * Consider our soft-limit across all memory devices. We don't really
  120. * expect to exceed this limit in reasonable configurations.
  121. */
  122. if (MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT <=
  123. ms->device_memory->required_memslots) {
  124. return 1;
  125. }
  126. memslots = MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT -
  127. ms->device_memory->required_memslots;
  128. /*
  129. * Consider the actually still free memslots. This is only relevant if
  130. * other memslot consumers would consume *significantly* more memslots than
  131. * what we prepared for (> 253). Unlikely, but let's just handle it
  132. * cleanly.
  133. */
  134. memslots = MIN(memslots, free - reserved);
  135. if (memslots < 1 || unlikely(free < reserved)) {
  136. return 1;
  137. }
  138. /* We cannot have any other memory devices? So give all to this device. */
  139. if (size == ms->maxram_size - ms->ram_size) {
  140. return memslots;
  141. }
  142. /*
  143. * Simple heuristic: equally distribute the memslots over the space
  144. * still available for memory devices.
  145. */
  146. available_space = ms->maxram_size - ms->ram_size -
  147. ms->device_memory->used_region_size;
  148. memslots = (double)memslots * size / available_space;
  149. return memslots < 1 ? 1 : memslots;
  150. }
/*
 * Check whether the memory region @mr of memory device @md can still be
 * plugged into @ms: enough free memslots in KVM and the vhost backends, and
 * enough space left in the device memory area. Sets @errp on failure.
 *
 * Side effect: if the device implements decide_memslots(), this instructs it
 * to pick its memslot count (must happen before querying required memslots).
 */
static void memory_device_check_addable(MachineState *ms, MemoryDeviceState *md,
                                        MemoryRegion *mr, Error **errp)
{
    const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
    const uint64_t used_region_size = ms->device_memory->used_region_size;
    const uint64_t size = memory_region_size(mr);
    const unsigned int reserved_memslots = get_reserved_memslots(ms);
    unsigned int required_memslots, memslot_limit;

    /*
     * Instruct the device to decide how many memslots to use, if applicable,
     * before we query the number of required memslots the first time.
     */
    if (mdc->decide_memslots) {
        memslot_limit = memory_device_memslot_decision_limit(ms, mr);
        mdc->decide_memslots(md, memslot_limit);
    }
    required_memslots = memory_device_get_memslots(md);

    /* we will need memory slots for kvm and vhost */
    if (kvm_enabled() &&
        kvm_get_free_memslots() < required_memslots + reserved_memslots) {
        error_setg(errp, "hypervisor has not enough free memory slots left");
        return;
    }
    if (vhost_get_free_memslots() < required_memslots + reserved_memslots) {
        error_setg(errp, "a used vhost backend has not enough free memory slots left");
        return;
    }

    /* will we exceed the total amount of memory specified */
    /* The first comparison catches uint64_t overflow of the addition. */
    if (used_region_size + size < used_region_size ||
        used_region_size + size > ms->maxram_size - ms->ram_size) {
        error_setg(errp, "not enough space, currently 0x%" PRIx64
                   " in use of total space for memory devices 0x" RAM_ADDR_FMT,
                   used_region_size, ms->maxram_size - ms->ram_size);
        return;
    }
}
/*
 * Find a free address range of @size bytes within the device memory region,
 * aligned to @align. If @hint is non-NULL, only that exact address is
 * considered; otherwise the first sufficiently large gap between already
 * plugged memory devices is chosen.
 *
 * Returns the chosen start address on success; on failure sets @errp and
 * the return value is meaningless.
 */
static uint64_t memory_device_get_free_addr(MachineState *ms,
                                            const uint64_t *hint,
                                            uint64_t align, uint64_t size,
                                            Error **errp)
{
    GSList *list = NULL, *item;
    Range as, new = range_empty;

    /* The usable address space spans the whole device memory region. */
    range_init_nofail(&as, ms->device_memory->base,
                      memory_region_size(&ms->device_memory->mr));

    /* start of address space indicates the maximum alignment we expect */
    if (!QEMU_IS_ALIGNED(range_lob(&as), align)) {
        warn_report("the alignment (0x%" PRIx64 ") exceeds the expected"
                    " maximum alignment, memory will get fragmented and not"
                    " all 'maxmem' might be usable for memory devices.",
                    align);
    }

    if (hint && !QEMU_IS_ALIGNED(*hint, align)) {
        error_setg(errp, "address must be aligned to 0x%" PRIx64 " bytes",
                   align);
        return 0;
    }

    if (hint) {
        /* range_init() fails when [*hint, *hint + size) overflows uint64_t. */
        if (range_init(&new, *hint, size) || !range_contains_range(&as, &new)) {
            error_setg(errp, "can't add memory device [0x%" PRIx64 ":0x%" PRIx64
                       "], usable range for memory devices [0x%" PRIx64 ":0x%"
                       PRIx64 "]", *hint, size, range_lob(&as),
                       range_size(&as));
            return 0;
        }
    } else {
        /* Start the search at the (aligned) bottom of the address space. */
        if (range_init(&new, QEMU_ALIGN_UP(range_lob(&as), align), size)) {
            error_setg(errp, "can't add memory device, device too big");
            return 0;
        }
    }

    /* find address range that will fit new memory device */
    object_child_foreach(OBJECT(ms), memory_device_build_list, &list);
    for (item = list; item; item = g_slist_next(item)) {
        const MemoryDeviceState *md = item->data;
        const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(OBJECT(md));
        uint64_t next_addr;
        Range tmp;

        /* Empty devices occupy no address space; skip them. */
        if (memory_device_is_empty(md)) {
            continue;
        }

        range_init_nofail(&tmp, mdc->get_addr(md),
                          memory_device_get_region_size(md, &error_abort));
        if (range_overlaps_range(&tmp, &new)) {
            if (hint) {
                /* A fixed address was requested but is already occupied. */
                const DeviceState *d = DEVICE(md);
                error_setg(errp, "address range conflicts with memory device"
                           " id='%s'", d->id ? d->id : "(unnamed)");
                goto out;
            }

            /*
             * Retry right after this device (aligned up); next_addr == 0
             * or a failing range_init() means we wrapped around the end.
             */
            next_addr = QEMU_ALIGN_UP(range_upb(&tmp) + 1, align);
            if (!next_addr || range_init(&new, next_addr, range_size(&new))) {
                range_make_empty(&new);
                break;
            }
        } else if (range_lob(&tmp) > range_upb(&new)) {
            /* The list is address-sorted: no later device can overlap. */
            break;
        }
    }

    if (!range_contains_range(&as, &new)) {
        error_setg(errp, "could not find position in guest address space for "
                   "memory device - memory fragmented due to alignments");
    }
out:
    g_slist_free(list);
    return range_lob(&new);
}
/*
 * QMP handler: return information about all realized memory devices,
 * sorted by address. The caller owns the returned list.
 */
MemoryDeviceInfoList *qmp_memory_device_list(void)
{
    GSList *devices = NULL, *item;
    MemoryDeviceInfoList *list = NULL, **tail = &list;

    object_child_foreach(qdev_get_machine(), memory_device_build_list,
                         &devices);

    for (item = devices; item; item = g_slist_next(item)) {
        const MemoryDeviceState *md = MEMORY_DEVICE(item->data);
        const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(item->data);
        MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1);

        /* Let's query information even for empty memory devices. */
        mdc->fill_device_info(md, info);

        QAPI_LIST_APPEND(tail, info);
    }

    g_slist_free(devices);
    return list;
}
  275. static int memory_device_plugged_size(Object *obj, void *opaque)
  276. {
  277. uint64_t *size = opaque;
  278. if (object_dynamic_cast(obj, TYPE_MEMORY_DEVICE)) {
  279. const DeviceState *dev = DEVICE(obj);
  280. const MemoryDeviceState *md = MEMORY_DEVICE(obj);
  281. const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(obj);
  282. if (dev->realized && !memory_device_is_empty(md)) {
  283. *size += mdc->get_plugged_size(md, &error_abort);
  284. }
  285. }
  286. object_child_foreach(obj, memory_device_plugged_size, opaque);
  287. return 0;
  288. }
  289. uint64_t get_plugged_memory_size(void)
  290. {
  291. uint64_t size = 0;
  292. memory_device_plugged_size(qdev_get_machine(), &size);
  293. return size;
  294. }
/*
 * Pre-plug handler for memory devices: verify the device fits into the
 * machine's device memory area and assign it an address.
 *
 * Empty memory devices are always accepted. For all others, this checks
 * memslot and size constraints, validates alignment, and finally calls the
 * device's set_addr() with either the requested address (get_addr() != 0)
 * or an automatically chosen free one. Sets @errp on failure.
 */
void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms,
                            Error **errp)
{
    const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
    Error *local_err = NULL;
    uint64_t addr, align = 0;
    MemoryRegion *mr;

    /* We support empty memory devices even without device memory. */
    if (memory_device_is_empty(md)) {
        return;
    }

    if (!ms->device_memory) {
        error_setg(errp, "the configuration is not prepared for memory devices"
                   " (e.g., for memory hotplug), consider specifying the"
                   " maxmem option");
        return;
    }

    mr = mdc->get_memory_region(md, &local_err);
    if (local_err) {
        goto out;
    }

    memory_device_check_addable(ms, md, mr, &local_err);
    if (local_err) {
        goto out;
    }

    /*
     * We always want the memory region size to be multiples of the memory
     * region alignment: for example, DIMMs with 1G+1byte size don't make
     * any sense. Note that we don't check that the size is multiples
     * of any additional alignment requirements the memory device might
     * have when it comes to the address in physical address space.
     */
    if (!QEMU_IS_ALIGNED(memory_region_size(mr),
                         memory_region_get_alignment(mr))) {
        error_setg(errp, "backend memory size must be multiple of 0x%"
                   PRIx64, memory_region_get_alignment(mr));
        return;
    }

    /* The device may impose a stricter minimum alignment than the region. */
    if (mdc->get_min_alignment) {
        align = mdc->get_min_alignment(md);
    }
    align = MAX(align, memory_region_get_alignment(mr));
    /* An address of 0 means "no hint, pick a free address automatically". */
    addr = mdc->get_addr(md);
    addr = memory_device_get_free_addr(ms, !addr ? NULL : &addr, align,
                                       memory_region_size(mr), &local_err);
    if (local_err) {
        goto out;
    }
    mdc->set_addr(md, addr, &local_err);
    if (!local_err) {
        trace_memory_device_pre_plug(DEVICE(md)->id ? DEVICE(md)->id : "",
                                     addr);
    }
out:
    error_propagate(errp, local_err);
}
  351. void memory_device_plug(MemoryDeviceState *md, MachineState *ms)
  352. {
  353. const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
  354. unsigned int memslots;
  355. uint64_t addr;
  356. MemoryRegion *mr;
  357. if (memory_device_is_empty(md)) {
  358. return;
  359. }
  360. memslots = memory_device_get_memslots(md);
  361. addr = mdc->get_addr(md);
  362. /*
  363. * We expect that a previous call to memory_device_pre_plug() succeeded, so
  364. * it can't fail at this point.
  365. */
  366. mr = mdc->get_memory_region(md, &error_abort);
  367. g_assert(ms->device_memory);
  368. ms->device_memory->used_region_size += memory_region_size(mr);
  369. ms->device_memory->required_memslots += memslots;
  370. if (mdc->decide_memslots && memslots > 1) {
  371. ms->device_memory->memslot_auto_decision_active++;
  372. }
  373. memory_region_add_subregion(&ms->device_memory->mr,
  374. addr - ms->device_memory->base, mr);
  375. trace_memory_device_plug(DEVICE(md)->id ? DEVICE(md)->id : "", addr);
  376. }
  377. void memory_device_unplug(MemoryDeviceState *md, MachineState *ms)
  378. {
  379. const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
  380. const unsigned int memslots = memory_device_get_memslots(md);
  381. MemoryRegion *mr;
  382. if (memory_device_is_empty(md)) {
  383. return;
  384. }
  385. /*
  386. * We expect that a previous call to memory_device_pre_plug() succeeded, so
  387. * it can't fail at this point.
  388. */
  389. mr = mdc->get_memory_region(md, &error_abort);
  390. g_assert(ms->device_memory);
  391. memory_region_del_subregion(&ms->device_memory->mr, mr);
  392. if (mdc->decide_memslots && memslots > 1) {
  393. ms->device_memory->memslot_auto_decision_active--;
  394. }
  395. ms->device_memory->used_region_size -= memory_region_size(mr);
  396. ms->device_memory->required_memslots -= memslots;
  397. trace_memory_device_unplug(DEVICE(md)->id ? DEVICE(md)->id : "",
  398. mdc->get_addr(md));
  399. }
  400. uint64_t memory_device_get_region_size(const MemoryDeviceState *md,
  401. Error **errp)
  402. {
  403. const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
  404. MemoryRegion *mr;
  405. /* dropping const here is fine as we don't touch the memory region */
  406. mr = mdc->get_memory_region((MemoryDeviceState *)md, errp);
  407. if (!mr) {
  408. return 0;
  409. }
  410. return memory_region_size(mr);
  411. }
  412. static void memory_devices_region_mod(MemoryListener *listener,
  413. MemoryRegionSection *mrs, bool add)
  414. {
  415. DeviceMemoryState *dms = container_of(listener, DeviceMemoryState,
  416. listener);
  417. if (!memory_region_is_ram(mrs->mr)) {
  418. warn_report("Unexpected memory region mapped into device memory region.");
  419. return;
  420. }
  421. /*
  422. * The expectation is that each distinct RAM memory region section in
  423. * our region for memory devices consumes exactly one memslot in KVM
  424. * and in vhost. For vhost, this is true, except:
  425. * * ROM memory regions don't consume a memslot. These get used very
  426. * rarely for memory devices (R/O NVDIMMs).
  427. * * Memslots without a fd (memory-backend-ram) don't necessarily
  428. * consume a memslot. Such setups are quite rare and possibly bogus:
  429. * the memory would be inaccessible by such vhost devices.
  430. *
  431. * So for vhost, in corner cases we might over-estimate the number of
  432. * memslots that are currently used or that might still be reserved
  433. * (required - used).
  434. */
  435. dms->used_memslots += add ? 1 : -1;
  436. if (dms->used_memslots > dms->required_memslots) {
  437. warn_report("Memory devices use more memory slots than indicated as required.");
  438. }
  439. }
  440. static void memory_devices_region_add(MemoryListener *listener,
  441. MemoryRegionSection *mrs)
  442. {
  443. return memory_devices_region_mod(listener, mrs, true);
  444. }
  445. static void memory_devices_region_del(MemoryListener *listener,
  446. MemoryRegionSection *mrs)
  447. {
  448. return memory_devices_region_mod(listener, mrs, false);
  449. }
/*
 * Prepare @ms for memory devices: create the device memory region of @size
 * bytes at guest physical address @base, map it into system memory, and
 * register a listener that tracks memslot usage.
 *
 * Must be called at most once per machine, with a non-zero @size.
 */
void machine_memory_devices_init(MachineState *ms, hwaddr base, uint64_t size)
{
    g_assert(size);
    g_assert(!ms->device_memory);
    ms->device_memory = g_new0(DeviceMemoryState, 1);
    ms->device_memory->base = base;

    memory_region_init(&ms->device_memory->mr, OBJECT(ms), "device-memory",
                       size);
    address_space_init(&ms->device_memory->as, &ms->device_memory->mr,
                       "device-memory");
    memory_region_add_subregion(get_system_memory(), ms->device_memory->base,
                                &ms->device_memory->mr);

    /* Track the number of memslots used by memory devices. */
    ms->device_memory->listener.region_add = memory_devices_region_add;
    ms->device_memory->listener.region_del = memory_devices_region_del;
    memory_listener_register(&ms->device_memory->listener,
                             &ms->device_memory->as);
}
/* TYPE_MEMORY_DEVICE is a QOM interface; devices implement MemoryDeviceClass. */
static const TypeInfo memory_device_info = {
    .name = TYPE_MEMORY_DEVICE,
    .parent = TYPE_INTERFACE,
    .class_size = sizeof(MemoryDeviceClass),
};
/* Register the memory device interface with the QOM type system. */
static void memory_device_register_types(void)
{
    type_register_static(&memory_device_info);
}

type_init(memory_device_register_types)