common.c

  1. /*
  2. * generic functions used by VFIO devices
  3. *
  4. * Copyright Red Hat, Inc. 2012
  5. *
  6. * Authors:
  7. * Alex Williamson <alex.williamson@redhat.com>
  8. *
  9. * This work is licensed under the terms of the GNU GPL, version 2. See
  10. * the COPYING file in the top-level directory.
  11. *
  12. * Based on qemu-kvm device-assignment:
  13. * Adapted for KVM by Qumranet.
  14. * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
  15. * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
  16. * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
  17. * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
  18. * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
  19. */
  20. #include "qemu/osdep.h"
  21. #include <sys/ioctl.h>
  22. #ifdef CONFIG_KVM
  23. #include <linux/kvm.h>
  24. #endif
  25. #include <linux/vfio.h>
  26. #include "hw/vfio/vfio-common.h"
  27. #include "hw/vfio/pci.h"
  28. #include "exec/address-spaces.h"
  29. #include "exec/memory.h"
  30. #include "exec/ram_addr.h"
  31. #include "hw/hw.h"
  32. #include "qemu/error-report.h"
  33. #include "qemu/main-loop.h"
  34. #include "qemu/range.h"
  35. #include "system/kvm.h"
  36. #include "system/reset.h"
  37. #include "system/runstate.h"
  38. #include "trace.h"
  39. #include "qapi/error.h"
  40. #include "migration/misc.h"
  41. #include "migration/blocker.h"
  42. #include "migration/qemu-file.h"
  43. #include "system/tpm.h"
  44. VFIODeviceList vfio_device_list =
  45. QLIST_HEAD_INITIALIZER(vfio_device_list);
  46. static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
  47. QLIST_HEAD_INITIALIZER(vfio_address_spaces);
  48. #ifdef CONFIG_KVM
  49. /*
  50. * We have a single VFIO pseudo device per KVM VM. Once created it lives
  51. * for the life of the VM. Closing the file descriptor only drops our
  52. * reference to it and the device's reference to kvm. Therefore once
  53. * initialized, this file descriptor is only released on QEMU exit and
  54. * we'll re-use it should another vfio device be attached before then.
  55. */
  56. int vfio_kvm_device_fd = -1;
  57. #endif
  58. /*
  59. * Device state interfaces
  60. */
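/*
 * Return true only when at least one VFIO device is present and none of
 * them carries a migration blocker.
 */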
  61. bool vfio_mig_active(void)
  62. {
  63. VFIODevice *vbasedev;
  64. if (QLIST_EMPTY(&vfio_device_list)) {
  65. return false;
  66. }
  67. QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
  68. if (vbasedev->migration_blocker) {
  69. return false;
  70. }
  71. }
  72. return true;
  73. }
  74. static Error *multiple_devices_migration_blocker;
  75. /*
  76. * Multiple devices migration is allowed only if all devices support P2P
  77. * migration. Single device migration is allowed regardless of P2P migration
  78. * support.
  79. */
  80. static bool vfio_multiple_devices_migration_is_supported(void)
  81. {
  82. VFIODevice *vbasedev;
  83. unsigned int device_num = 0;
  84. bool all_support_p2p = true;
  85. QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
  86. if (vbasedev->migration) {
  87. device_num++;
  88. if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) {
  89. all_support_p2p = false;
  90. }
  91. }
  92. }
  93. return all_support_p2p || device_num <= 1;
  94. }
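/*
 * Install a global migration blocker when several VFIO devices are present
 * and not all of them support P2P migration. If migration was explicitly
 * enabled for this device, fail with -EINVAL instead.
 */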
  95. int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
  96. {
  97. int ret;
  98. if (vfio_multiple_devices_migration_is_supported()) {
  99. return 0;
  100. }
  101. if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
  102. error_setg(errp, "Multiple VFIO devices migration is supported only if "
  103. "all of them support P2P migration");
  104. return -EINVAL;
  105. }
  106. if (multiple_devices_migration_blocker) {
  107. return 0;
  108. }
  109. error_setg(&multiple_devices_migration_blocker,
  110. "Multiple VFIO devices migration is supported only if all of "
  111. "them support P2P migration");
  112. ret = migrate_add_blocker_normal(&multiple_devices_migration_blocker, errp);
  113. return ret;
  114. }
  115. void vfio_unblock_multiple_devices_migration(void)
  116. {
  117. if (!multiple_devices_migration_blocker ||
  118. !vfio_multiple_devices_migration_is_supported()) {
  119. return;
  120. }
  121. migrate_del_blocker(&multiple_devices_migration_blocker);
  122. }
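/*
 * A vIOMMU is in use when the device's container is attached to an address
 * space other than the default system memory address space.
 */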
  123. bool vfio_viommu_preset(VFIODevice *vbasedev)
  124. {
  125. return vbasedev->bcontainer->space->as != &address_space_memory;
  126. }
  127. static void vfio_set_migration_error(int ret)
  128. {
  129. if (migration_is_running()) {
  130. migration_file_set_error(ret, NULL);
  131. }
  132. }
  133. bool vfio_device_state_is_running(VFIODevice *vbasedev)
  134. {
  135. VFIOMigration *migration = vbasedev->migration;
  136. return migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
  137. migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P;
  138. }
  139. bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
  140. {
  141. VFIOMigration *migration = vbasedev->migration;
  142. return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ||
  143. migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P;
  144. }
  145. static bool vfio_devices_all_device_dirty_tracking_started(
  146. const VFIOContainerBase *bcontainer)
  147. {
  148. VFIODevice *vbasedev;
  149. QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
  150. if (!vbasedev->dirty_tracking) {
  151. return false;
  152. }
  153. }
  154. return true;
  155. }
  156. bool vfio_devices_all_dirty_tracking_started(
  157. const VFIOContainerBase *bcontainer)
  158. {
  159. return vfio_devices_all_device_dirty_tracking_started(bcontainer) ||
  160. bcontainer->dirty_pages_started;
  161. }
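/*
 * Log sync is needed only when dirty tracking has been started for the
 * container and all devices are migratable; a device that disabled pre-copy
 * dirty page tracking suppresses sync while it is still running or in
 * pre-copy.
 */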
  162. static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer)
  163. {
  164. VFIODevice *vbasedev;
  165. if (!vfio_devices_all_dirty_tracking_started(bcontainer)) {
  166. return false;
  167. }
  168. QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
  169. VFIOMigration *migration = vbasedev->migration;
  170. if (!migration) {
  171. return false;
  172. }
  173. if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
  174. (vfio_device_state_is_running(vbasedev) ||
  175. vfio_device_state_is_precopy(vbasedev))) {
  176. return false;
  177. }
  178. }
  179. return true;
  180. }
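/*
 * Return true when every device in the container supports device dirty page
 * tracking and none has it disabled.
 */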
  181. bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer)
  182. {
  183. VFIODevice *vbasedev;
  184. QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
  185. if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) {
  186. return false;
  187. }
  188. if (!vbasedev->dirty_pages_supported) {
  189. return false;
  190. }
  191. }
  192. return true;
  193. }
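/*
 * Sections we never map through VFIO: regions that are neither RAM nor an
 * IOMMU, protected memory, and the spurious high mappings explained below.
 */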
  194. static bool vfio_listener_skipped_section(MemoryRegionSection *section)
  195. {
  196. return (!memory_region_is_ram(section->mr) &&
  197. !memory_region_is_iommu(section->mr)) ||
  198. memory_region_is_protected(section->mr) ||
  199. /*
  200. * Sizing an enabled 64-bit BAR can cause spurious mappings to
  201. * addresses in the upper part of the 64-bit address space. These
  202. * are never accessed by the CPU and beyond the address width of
  203. * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
  204. */
  205. section->offset_within_address_space & (1ULL << 63);
  206. }
  207. /* Called with rcu_read_lock held. */
  208. static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
  209. ram_addr_t *ram_addr, bool *read_only,
  210. Error **errp)
  211. {
  212. bool ret, mr_has_discard_manager;
  213. ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
  214. &mr_has_discard_manager, errp);
  215. if (ret && mr_has_discard_manager) {
  216. /*
  217. * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
  218. * pages will remain pinned inside vfio until unmapped, resulting in a
  219. * higher memory consumption than expected. If memory would get
  220. * populated again later, there would be an inconsistency between pages
  221. * pinned by vfio and pages seen by QEMU. This is the case until
  222. * unmapped from the IOMMU (e.g., during device reset).
  223. *
  224. * With malicious guests, we really only care about pinning more memory
  225. * than expected. RLIMIT_MEMLOCK set for the user/process can never be
  226. * exceeded and can be used to mitigate this problem.
  227. */
  228. warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
  229. " RAM (e.g., virtio-mem) works, however, malicious"
  230. " guests can trigger pinning of more memory than"
  231. " intended via an IOMMU. It's possible to mitigate "
  232. " by setting/adjusting RLIMIT_MEMLOCK.");
  233. }
  234. return ret;
  235. }
  236. static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
  237. {
  238. VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
  239. VFIOContainerBase *bcontainer = giommu->bcontainer;
  240. hwaddr iova = iotlb->iova + giommu->iommu_offset;
  241. void *vaddr;
  242. int ret;
  243. Error *local_err = NULL;
  244. trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
  245. iova, iova + iotlb->addr_mask);
  246. if (iotlb->target_as != &address_space_memory) {
  247. error_report("Wrong target AS \"%s\", only system memory is allowed",
  248. iotlb->target_as->name ? iotlb->target_as->name : "none");
  249. vfio_set_migration_error(-EINVAL);
  250. return;
  251. }
  252. rcu_read_lock();
  253. if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
  254. bool read_only;
  255. if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) {
  256. error_report_err(local_err);
  257. goto out;
  258. }
  259. /*
  260. * vaddr is only valid until rcu_read_unlock(). But after
  261. * vfio_dma_map has set up the mapping the pages will be
  262. * pinned by the kernel. This makes sure that the RAM backend
  263. * of vaddr will always be there, even if the memory object is
  264. * destroyed and its backing memory munmap-ed.
  265. */
  266. ret = vfio_container_dma_map(bcontainer, iova,
  267. iotlb->addr_mask + 1, vaddr,
  268. read_only);
  269. if (ret) {
  270. error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
  271. "0x%"HWADDR_PRIx", %p) = %d (%s)",
  272. bcontainer, iova,
  273. iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
  274. }
  275. } else {
  276. ret = vfio_container_dma_unmap(bcontainer, iova,
  277. iotlb->addr_mask + 1, iotlb);
  278. if (ret) {
  279. error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
  280. "0x%"HWADDR_PRIx") = %d (%s)",
  281. bcontainer, iova,
  282. iotlb->addr_mask + 1, ret, strerror(-ret));
  283. vfio_set_migration_error(ret);
  284. }
  285. }
  286. out:
  287. rcu_read_unlock();
  288. }
  289. static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
  290. MemoryRegionSection *section)
  291. {
  292. VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
  293. listener);
  294. VFIOContainerBase *bcontainer = vrdl->bcontainer;
  295. const hwaddr size = int128_get64(section->size);
  296. const hwaddr iova = section->offset_within_address_space;
  297. int ret;
  298. /* Unmap with a single call. */
  299. ret = vfio_container_dma_unmap(bcontainer, iova, size, NULL);
  300. if (ret) {
  301. error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
  302. strerror(-ret));
  303. }
  304. }
  305. static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
  306. MemoryRegionSection *section)
  307. {
  308. VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
  309. listener);
  310. VFIOContainerBase *bcontainer = vrdl->bcontainer;
  311. const hwaddr end = section->offset_within_region +
  312. int128_get64(section->size);
  313. hwaddr start, next, iova;
  314. void *vaddr;
  315. int ret;
  316. /*
  317. * Map in (aligned within memory region) minimum granularity, so we can
  318. * unmap in minimum granularity later.
  319. */
  320. for (start = section->offset_within_region; start < end; start = next) {
  321. next = ROUND_UP(start + 1, vrdl->granularity);
  322. next = MIN(next, end);
  323. iova = start - section->offset_within_region +
  324. section->offset_within_address_space;
  325. vaddr = memory_region_get_ram_ptr(section->mr) + start;
  326. ret = vfio_container_dma_map(bcontainer, iova, next - start,
  327. vaddr, section->readonly);
  328. if (ret) {
  329. /* Rollback */
  330. vfio_ram_discard_notify_discard(rdl, section);
  331. return ret;
  332. }
  333. }
  334. return 0;
  335. }
  336. static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer,
  337. MemoryRegionSection *section)
  338. {
  339. RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
  340. VFIORamDiscardListener *vrdl;
  341. /* Ignore some corner cases not relevant in practice. */
  342. g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
  343. g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
  344. TARGET_PAGE_SIZE));
  345. g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
  346. vrdl = g_new0(VFIORamDiscardListener, 1);
  347. vrdl->bcontainer = bcontainer;
  348. vrdl->mr = section->mr;
  349. vrdl->offset_within_address_space = section->offset_within_address_space;
  350. vrdl->size = int128_get64(section->size);
  351. vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
  352. section->mr);
  353. g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
  354. g_assert(bcontainer->pgsizes &&
  355. vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes));
  356. ram_discard_listener_init(&vrdl->listener,
  357. vfio_ram_discard_notify_populate,
  358. vfio_ram_discard_notify_discard, true);
  359. ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
  360. QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next);
  361. /*
  362. * Sanity-check if we have a theoretically problematic setup where we could
  363. * exceed the maximum number of possible DMA mappings over time. We assume
  364. * that each mapped section in the same address space as a RamDiscardManager
  365. * section consumes exactly one DMA mapping, with the exception of
  366. * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
  367. * in the same address space as RamDiscardManager sections.
  368. *
  369. * We assume that each section in the address space consumes one memslot.
  370. * We take the number of KVM memory slots as a best guess for the maximum
  371. * number of sections in the address space we could have over time,
  372. * also consuming DMA mappings.
  373. */
  374. if (bcontainer->dma_max_mappings) {
  375. unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
  376. #ifdef CONFIG_KVM
  377. if (kvm_enabled()) {
  378. max_memslots = kvm_get_max_memslots();
  379. }
  380. #endif
  381. QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
  382. hwaddr start, end;
  383. start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
  384. vrdl->granularity);
  385. end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
  386. vrdl->granularity);
  387. vrdl_mappings += (end - start) / vrdl->granularity;
  388. vrdl_count++;
  389. }
  390. if (vrdl_mappings + max_memslots - vrdl_count >
  391. bcontainer->dma_max_mappings) {
  392. warn_report("%s: possibly running out of DMA mappings. E.g., try"
  393. " increasing the 'block-size' of virtio-mem devies."
  394. " Maximum possible DMA mappings: %d, Maximum possible"
  395. " memslots: %d", __func__, bcontainer->dma_max_mappings,
  396. max_memslots);
  397. }
  398. }
  399. }
  400. static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer,
  401. MemoryRegionSection *section)
  402. {
  403. RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
  404. VFIORamDiscardListener *vrdl = NULL;
  405. QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
  406. if (vrdl->mr == section->mr &&
  407. vrdl->offset_within_address_space ==
  408. section->offset_within_address_space) {
  409. break;
  410. }
  411. }
  412. if (!vrdl) {
  413. hw_error("vfio: Trying to unregister missing RAM discard listener");
  414. }
  415. ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
  416. QLIST_REMOVE(vrdl, next);
  417. g_free(vrdl);
  418. }
  419. static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
  420. {
  421. MemoryRegion *mr = section->mr;
  422. if (!TPM_IS_CRB(mr->owner)) {
  423. return false;
  424. }
  425. /* this is a known safe misaligned region, just trace for debug purpose */
  426. trace_vfio_known_safe_misalignment(memory_region_name(mr),
  427. section->offset_within_address_space,
  428. section->offset_within_region,
  429. qemu_real_host_page_size());
  430. return true;
  431. }
  432. static bool vfio_listener_valid_section(MemoryRegionSection *section,
  433. const char *name)
  434. {
  435. if (vfio_listener_skipped_section(section)) {
  436. trace_vfio_listener_region_skip(name,
  437. section->offset_within_address_space,
  438. section->offset_within_address_space +
  439. int128_get64(int128_sub(section->size, int128_one())));
  440. return false;
  441. }
  442. if (unlikely((section->offset_within_address_space &
  443. ~qemu_real_host_page_mask()) !=
  444. (section->offset_within_region & ~qemu_real_host_page_mask()))) {
  445. if (!vfio_known_safe_misalignment(section)) {
  446. error_report("%s received unaligned region %s iova=0x%"PRIx64
  447. " offset_within_region=0x%"PRIx64
  448. " qemu_real_host_page_size=0x%"PRIxPTR,
  449. __func__, memory_region_name(section->mr),
  450. section->offset_within_address_space,
  451. section->offset_within_region,
  452. qemu_real_host_page_size());
  453. }
  454. return false;
  455. }
  456. return true;
  457. }
  458. static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
  459. MemoryRegionSection *section,
  460. hwaddr *out_iova, hwaddr *out_end,
  461. Int128 *out_llend)
  462. {
  463. Int128 llend;
  464. hwaddr iova;
  465. iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
  466. llend = int128_make64(section->offset_within_address_space);
  467. llend = int128_add(llend, section->size);
  468. llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
  469. if (int128_ge(int128_make64(iova), llend)) {
  470. return false;
  471. }
  472. *out_iova = iova;
  473. *out_end = int128_get64(int128_sub(llend, int128_one()));
  474. if (out_llend) {
  475. *out_llend = llend;
  476. }
  477. return true;
  478. }
  479. static void vfio_listener_region_add(MemoryListener *listener,
  480. MemoryRegionSection *section)
  481. {
  482. VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
  483. listener);
  484. hwaddr iova, end;
  485. Int128 llend, llsize;
  486. void *vaddr;
  487. int ret;
  488. Error *err = NULL;
  489. if (!vfio_listener_valid_section(section, "region_add")) {
  490. return;
  491. }
  492. if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
  493. &llend)) {
  494. if (memory_region_is_ram_device(section->mr)) {
  495. trace_vfio_listener_region_add_no_dma_map(
  496. memory_region_name(section->mr),
  497. section->offset_within_address_space,
  498. int128_getlo(section->size),
  499. qemu_real_host_page_size());
  500. }
  501. return;
  502. }
  503. if (!vfio_container_add_section_window(bcontainer, section, &err)) {
  504. goto fail;
  505. }
  506. memory_region_ref(section->mr);
  507. if (memory_region_is_iommu(section->mr)) {
  508. VFIOGuestIOMMU *giommu;
  509. IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
  510. int iommu_idx;
  511. trace_vfio_listener_region_add_iommu(section->mr->name, iova, end);
  512. /*
  513. * FIXME: For VFIO iommu types which have KVM acceleration to
  514. * avoid bouncing all map/unmaps through qemu this way, this
  515. * would be the right place to wire that up (tell the KVM
  516. * device emulation the VFIO iommu handles to use).
  517. */
  518. giommu = g_malloc0(sizeof(*giommu));
  519. giommu->iommu_mr = iommu_mr;
  520. giommu->iommu_offset = section->offset_within_address_space -
  521. section->offset_within_region;
  522. giommu->bcontainer = bcontainer;
  523. llend = int128_add(int128_make64(section->offset_within_region),
  524. section->size);
  525. llend = int128_sub(llend, int128_one());
  526. iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
  527. MEMTXATTRS_UNSPECIFIED);
  528. iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
  529. IOMMU_NOTIFIER_IOTLB_EVENTS,
  530. section->offset_within_region,
  531. int128_get64(llend),
  532. iommu_idx);
  533. ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
  534. &err);
  535. if (ret) {
  536. g_free(giommu);
  537. goto fail;
  538. }
  539. QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next);
  540. memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
  541. return;
  542. }
  543. /* Here we assume that memory_region_is_ram(section->mr)==true */
  544. /*
  545. * For RAM memory regions with a RamDiscardManager, we only want to map the
  546. * actually populated parts - and update the mapping whenever we're notified
  547. * about changes.
  548. */
  549. if (memory_region_has_ram_discard_manager(section->mr)) {
  550. vfio_register_ram_discard_listener(bcontainer, section);
  551. return;
  552. }
  553. vaddr = memory_region_get_ram_ptr(section->mr) +
  554. section->offset_within_region +
  555. (iova - section->offset_within_address_space);
  556. trace_vfio_listener_region_add_ram(iova, end, vaddr);
  557. llsize = int128_sub(llend, int128_make64(iova));
  558. if (memory_region_is_ram_device(section->mr)) {
  559. hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
  560. if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
  561. trace_vfio_listener_region_add_no_dma_map(
  562. memory_region_name(section->mr),
  563. section->offset_within_address_space,
  564. int128_getlo(section->size),
  565. pgmask + 1);
  566. return;
  567. }
  568. }
  569. ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
  570. vaddr, section->readonly);
  571. if (ret) {
  572. error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
  573. "0x%"HWADDR_PRIx", %p) = %d (%s)",
  574. bcontainer, iova, int128_get64(llsize), vaddr, ret,
  575. strerror(-ret));
  576. if (memory_region_is_ram_device(section->mr)) {
  577. /* Allow unexpected mappings not to be fatal for RAM devices */
  578. error_report_err(err);
  579. return;
  580. }
  581. goto fail;
  582. }
  583. return;
  584. fail:
  585. if (memory_region_is_ram_device(section->mr)) {
  586. error_reportf_err(err, "PCI p2p may not work: ");
  587. return;
  588. }
  589. /*
  590. * On the initfn path, store the first error in the container so we
  591. * can gracefully fail. Runtime, there's not much we can do other
  592. * than throw a hardware error.
  593. */
  594. if (!bcontainer->initialized) {
  595. if (!bcontainer->error) {
  596. error_propagate_prepend(&bcontainer->error, err,
  597. "Region %s: ",
  598. memory_region_name(section->mr));
  599. } else {
  600. error_free(err);
  601. }
  602. } else {
  603. error_report_err(err);
  604. hw_error("vfio: DMA mapping failed, unable to continue");
  605. }
  606. }
  607. static void vfio_listener_region_del(MemoryListener *listener,
  608. MemoryRegionSection *section)
  609. {
  610. VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
  611. listener);
  612. hwaddr iova, end;
  613. Int128 llend, llsize;
  614. int ret;
  615. bool try_unmap = true;
  616. if (!vfio_listener_valid_section(section, "region_del")) {
  617. return;
  618. }
  619. if (memory_region_is_iommu(section->mr)) {
  620. VFIOGuestIOMMU *giommu;
  621. trace_vfio_listener_region_del_iommu(section->mr->name);
  622. QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
  623. if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
  624. giommu->n.start == section->offset_within_region) {
  625. memory_region_unregister_iommu_notifier(section->mr,
  626. &giommu->n);
  627. QLIST_REMOVE(giommu, giommu_next);
  628. g_free(giommu);
  629. break;
  630. }
  631. }
  632. /*
  633. * FIXME: We assume the one big unmap below is adequate to
  634. * remove any individual page mappings in the IOMMU which
  635. * might have been copied into VFIO. This works for a page table
  636. * based IOMMU where a big unmap flattens a large range of IO-PTEs.
  637. * That may not be true for all IOMMU types.
  638. */
  639. }
  640. if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
  641. &llend)) {
  642. return;
  643. }
  644. llsize = int128_sub(llend, int128_make64(iova));
  645. trace_vfio_listener_region_del(iova, end);
  646. if (memory_region_is_ram_device(section->mr)) {
  647. hwaddr pgmask;
  648. pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
  649. try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
  650. } else if (memory_region_has_ram_discard_manager(section->mr)) {
  651. vfio_unregister_ram_discard_listener(bcontainer, section);
  652. /* Unregistering will trigger an unmap. */
  653. try_unmap = false;
  654. }
  655. if (try_unmap) {
  656. if (int128_eq(llsize, int128_2_64())) {
  657. /* The unmap ioctl doesn't accept a full 64-bit span. */
  658. llsize = int128_rshift(llsize, 1);
  659. ret = vfio_container_dma_unmap(bcontainer, iova,
  660. int128_get64(llsize), NULL);
  661. if (ret) {
  662. error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
  663. "0x%"HWADDR_PRIx") = %d (%s)",
  664. bcontainer, iova, int128_get64(llsize), ret,
  665. strerror(-ret));
  666. }
  667. iova += int128_get64(llsize);
  668. }
  669. ret = vfio_container_dma_unmap(bcontainer, iova,
  670. int128_get64(llsize), NULL);
  671. if (ret) {
  672. error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
  673. "0x%"HWADDR_PRIx") = %d (%s)",
  674. bcontainer, iova, int128_get64(llsize), ret,
  675. strerror(-ret));
  676. }
  677. }
  678. memory_region_unref(section->mr);
  679. vfio_container_del_section_window(bcontainer, section);
  680. }
  681. typedef struct VFIODirtyRanges {
  682. hwaddr min32;
  683. hwaddr max32;
  684. hwaddr min64;
  685. hwaddr max64;
  686. hwaddr minpci64;
  687. hwaddr maxpci64;
  688. } VFIODirtyRanges;
  689. typedef struct VFIODirtyRangesListener {
  690. VFIOContainerBase *bcontainer;
  691. VFIODirtyRanges ranges;
  692. MemoryListener listener;
  693. } VFIODirtyRangesListener;
  694. static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
  695. VFIOContainerBase *bcontainer)
  696. {
  697. VFIOPCIDevice *pcidev;
  698. VFIODevice *vbasedev;
  699. Object *owner;
  700. owner = memory_region_owner(section->mr);
  701. QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
  702. if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
  703. continue;
  704. }
  705. pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
  706. if (OBJECT(pcidev) == owner) {
  707. return true;
  708. }
  709. }
  710. return false;
  711. }
  712. static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range,
  713. hwaddr iova, hwaddr end,
  714. bool update_pci)
  715. {
  716. hwaddr *min, *max;
  717. /*
  718. * The address space passed to the dirty tracker is reduced to three ranges:
  719. * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
  720. * PCI 64-bit hole.
  721. *
  722. * The underlying reports of dirty will query a sub-interval of each of
  723. * these ranges.
  724. *
  725. * The purpose of the three range handling is to handle known cases of big
  726. * holes in the address space, like the x86 AMD 1T hole, and firmware (like
  727. * OVMF) which may relocate the pci-hole64 to the end of the address space.
  728. * The latter would otherwise generate large ranges for tracking, stressing
  729. * the limits of supported hardware. The pci-hole32 will always be below 4G
  730. * (overlapping or not) so it doesn't need special handling and is part of
  731. * the 32-bit range.
  732. *
  733. * The alternative would be an IOVATree but that has a much bigger runtime
  734. * overhead and unnecessary complexity.
  735. */
  736. if (update_pci && iova >= UINT32_MAX) {
  737. min = &range->minpci64;
  738. max = &range->maxpci64;
  739. } else {
  740. min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
  741. max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
  742. }
  743. if (*min > iova) {
  744. *min = iova;
  745. }
  746. if (*max < end) {
  747. *max = end;
  748. }
  749. trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
  750. }
  751. static void vfio_dirty_tracking_update(MemoryListener *listener,
  752. MemoryRegionSection *section)
  753. {
  754. VFIODirtyRangesListener *dirty =
  755. container_of(listener, VFIODirtyRangesListener, listener);
  756. hwaddr iova, end;
  757. if (!vfio_listener_valid_section(section, "tracking_update") ||
  758. !vfio_get_section_iova_range(dirty->bcontainer, section,
  759. &iova, &end, NULL)) {
  760. return;
  761. }
  762. vfio_dirty_tracking_update_range(&dirty->ranges, iova, end,
  763. vfio_section_is_vfio_pci(section, dirty->bcontainer));
  764. }
  765. static const MemoryListener vfio_dirty_tracking_listener = {
  766. .name = "vfio-tracking",
  767. .region_add = vfio_dirty_tracking_update,
  768. };
  769. static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer,
  770. VFIODirtyRanges *ranges)
  771. {
  772. VFIODirtyRangesListener dirty;
  773. memset(&dirty, 0, sizeof(dirty));
  774. dirty.ranges.min32 = UINT32_MAX;
  775. dirty.ranges.min64 = UINT64_MAX;
  776. dirty.ranges.minpci64 = UINT64_MAX;
  777. dirty.listener = vfio_dirty_tracking_listener;
  778. dirty.bcontainer = bcontainer;
  779. memory_listener_register(&dirty.listener,
  780. bcontainer->space->as);
  781. *ranges = dirty.ranges;
  782. /*
  783. * The memory listener is synchronous, and used to calculate the range
  784. * to dirty tracking. Unregister it after we are done as we are not
  785. * interested in any follow-up updates.
  786. */
  787. memory_listener_unregister(&dirty.listener);
  788. }
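/*
 * Stop DMA logging (device dirty tracking) on every device in the container
 * that currently has it enabled, via VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP.
 */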
  789. static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
  790. {
  791. uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
  792. sizeof(uint64_t))] = {};
  793. struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
  794. VFIODevice *vbasedev;
  795. feature->argsz = sizeof(buf);
  796. feature->flags = VFIO_DEVICE_FEATURE_SET |
  797. VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
  798. QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
  799. if (!vbasedev->dirty_tracking) {
  800. continue;
  801. }
  802. if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
  803. warn_report("%s: Failed to stop DMA logging, err %d (%s)",
  804. vbasedev->name, -errno, strerror(errno));
  805. }
  806. vbasedev->dirty_tracking = false;
  807. }
  808. }
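/*
 * Build the variable-sized VFIO_DEVICE_FEATURE_DMA_LOGGING_START payload
 * covering the 32-bit, 64-bit and PCI 64-bit hole ranges computed by
 * vfio_dirty_tracking_init(). Returns NULL with errno set on allocation
 * failure; release with vfio_device_feature_dma_logging_start_destroy().
 */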
  809. static struct vfio_device_feature *
  810. vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer,
  811. VFIODirtyRanges *tracking)
  812. {
  813. struct vfio_device_feature *feature;
  814. size_t feature_size;
  815. struct vfio_device_feature_dma_logging_control *control;
  816. struct vfio_device_feature_dma_logging_range *ranges;
  817. feature_size = sizeof(struct vfio_device_feature) +
  818. sizeof(struct vfio_device_feature_dma_logging_control);
  819. feature = g_try_malloc0(feature_size);
  820. if (!feature) {
  821. errno = ENOMEM;
  822. return NULL;
  823. }
  824. feature->argsz = feature_size;
  825. feature->flags = VFIO_DEVICE_FEATURE_SET |
  826. VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
  827. control = (struct vfio_device_feature_dma_logging_control *)feature->data;
  828. control->page_size = qemu_real_host_page_size();
  829. /*
  830. * DMA logging uAPI guarantees to support at least a number of ranges that
  831. * fits into a single host kernel base page.
  832. */
  833. control->num_ranges = !!tracking->max32 + !!tracking->max64 +
  834. !!tracking->maxpci64;
  835. ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
  836. control->num_ranges);
  837. if (!ranges) {
  838. g_free(feature);
  839. errno = ENOMEM;
  840. return NULL;
  841. }
  842. control->ranges = (uintptr_t)ranges;
  843. if (tracking->max32) {
  844. ranges->iova = tracking->min32;
  845. ranges->length = (tracking->max32 - tracking->min32) + 1;
  846. ranges++;
  847. }
  848. if (tracking->max64) {
  849. ranges->iova = tracking->min64;
  850. ranges->length = (tracking->max64 - tracking->min64) + 1;
  851. ranges++;
  852. }
  853. if (tracking->maxpci64) {
  854. ranges->iova = tracking->minpci64;
  855. ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1;
  856. }
  857. trace_vfio_device_dirty_tracking_start(control->num_ranges,
  858. tracking->min32, tracking->max32,
  859. tracking->min64, tracking->max64,
  860. tracking->minpci64, tracking->maxpci64);
  861. return feature;
  862. }
  863. static void vfio_device_feature_dma_logging_start_destroy(
  864. struct vfio_device_feature *feature)
  865. {
  866. struct vfio_device_feature_dma_logging_control *control =
  867. (struct vfio_device_feature_dma_logging_control *)feature->data;
  868. struct vfio_device_feature_dma_logging_range *ranges =
  869. (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;
  870. g_free(ranges);
  871. g_free(feature);
  872. }
  873. static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer,
  874. Error **errp)
  875. {
  876. struct vfio_device_feature *feature;
  877. VFIODirtyRanges ranges;
  878. VFIODevice *vbasedev;
  879. int ret = 0;
  880. vfio_dirty_tracking_init(bcontainer, &ranges);
  881. feature = vfio_device_feature_dma_logging_start_create(bcontainer,
  882. &ranges);
  883. if (!feature) {
  884. error_setg_errno(errp, errno, "Failed to prepare DMA logging");
  885. return false;
  886. }
  887. QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
  888. if (vbasedev->dirty_tracking) {
  889. continue;
  890. }
  891. ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
  892. if (ret) {
  893. ret = -errno;
  894. error_setg_errno(errp, errno, "%s: Failed to start DMA logging",
  895. vbasedev->name);
  896. goto out;
  897. }
  898. vbasedev->dirty_tracking = true;
  899. }
  900. out:
  901. if (ret) {
  902. vfio_devices_dma_logging_stop(bcontainer);
  903. }
  904. vfio_device_feature_dma_logging_start_destroy(feature);
  905. return ret == 0;
  906. }
  907. static bool vfio_listener_log_global_start(MemoryListener *listener,
  908. Error **errp)
  909. {
  910. ERRP_GUARD();
  911. VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
  912. listener);
  913. bool ret;
  914. if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
  915. ret = vfio_devices_dma_logging_start(bcontainer, errp);
  916. } else {
  917. ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0;
  918. }
  919. if (!ret) {
  920. error_prepend(errp, "vfio: Could not start dirty page tracking - ");
  921. }
  922. return ret;
  923. }
  924. static void vfio_listener_log_global_stop(MemoryListener *listener)
  925. {
  926. VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
  927. listener);
  928. Error *local_err = NULL;
  929. int ret = 0;
  930. if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
  931. vfio_devices_dma_logging_stop(bcontainer);
  932. } else {
  933. ret = vfio_container_set_dirty_page_tracking(bcontainer, false,
  934. &local_err);
  935. }
  936. if (ret) {
  937. error_prepend(&local_err,
  938. "vfio: Could not stop dirty page tracking - ");
  939. error_report_err(local_err);
  940. vfio_set_migration_error(ret);
  941. }
  942. }
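/*
 * Read the device dirty bitmap for [iova, iova + size) into 'bitmap' using
 * VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT. Returns 0 on success or -errno.
 */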
  943. static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
  944. hwaddr size, void *bitmap)
  945. {
  946. uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
  947. sizeof(struct vfio_device_feature_dma_logging_report),
  948. sizeof(uint64_t))] = {};
  949. struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
  950. struct vfio_device_feature_dma_logging_report *report =
  951. (struct vfio_device_feature_dma_logging_report *)feature->data;
  952. report->iova = iova;
  953. report->length = size;
  954. report->page_size = qemu_real_host_page_size();
  955. report->bitmap = (uintptr_t)bitmap;
  956. feature->argsz = sizeof(buf);
  957. feature->flags = VFIO_DEVICE_FEATURE_GET |
  958. VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
  959. if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
  960. return -errno;
  961. }
  962. return 0;
  963. }
  964. int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
  965. VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp)
  966. {
  967. VFIODevice *vbasedev;
  968. int ret;
  969. QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
  970. ret = vfio_device_dma_logging_report(vbasedev, iova, size,
  971. vbmap->bitmap);
  972. if (ret) {
  973. error_setg_errno(errp, -ret,
  974. "%s: Failed to get DMA logging report, iova: "
  975. "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx,
  976. vbasedev->name, iova, size);
  977. return ret;
  978. }
  979. }
  980. return 0;
  981. }
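/*
 * Query the dirty bitmap for [iova, iova + size) from the devices or the
 * container and fold it into QEMU's RAM dirty bitmap at ram_addr. Without
 * any dirty tracking support, the whole range is conservatively marked
 * dirty.
 */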
  982. int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
  983. uint64_t size, ram_addr_t ram_addr, Error **errp)
  984. {
  985. bool all_device_dirty_tracking =
  986. vfio_devices_all_device_dirty_tracking(bcontainer);
  987. uint64_t dirty_pages;
  988. VFIOBitmap vbmap;
  989. int ret;
  990. if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) {
  991. cpu_physical_memory_set_dirty_range(ram_addr, size,
  992. tcg_enabled() ? DIRTY_CLIENTS_ALL :
  993. DIRTY_CLIENTS_NOCODE);
  994. return 0;
  995. }
  996. ret = vfio_bitmap_alloc(&vbmap, size);
  997. if (ret) {
  998. error_setg_errno(errp, -ret,
  999. "Failed to allocate dirty tracking bitmap");
  1000. return ret;
  1001. }
  1002. if (all_device_dirty_tracking) {
  1003. ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size,
  1004. errp);
  1005. } else {
  1006. ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size,
  1007. errp);
  1008. }
  1009. if (ret) {
  1010. goto out;
  1011. }
  1012. dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
  1013. vbmap.pages);
  1014. trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages);
  1015. out:
  1016. g_free(vbmap.bitmap);
  1017. return ret;
  1018. }
  1019. typedef struct {
  1020. IOMMUNotifier n;
  1021. VFIOGuestIOMMU *giommu;
  1022. } vfio_giommu_dirty_notifier;
  1023. static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
  1024. {
  1025. vfio_giommu_dirty_notifier *gdn = container_of(n,
  1026. vfio_giommu_dirty_notifier, n);
  1027. VFIOGuestIOMMU *giommu = gdn->giommu;
  1028. VFIOContainerBase *bcontainer = giommu->bcontainer;
  1029. hwaddr iova = iotlb->iova + giommu->iommu_offset;
  1030. ram_addr_t translated_addr;
  1031. Error *local_err = NULL;
  1032. int ret = -EINVAL;
  1033. trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
  1034. if (iotlb->target_as != &address_space_memory) {
  1035. error_report("Wrong target AS \"%s\", only system memory is allowed",
  1036. iotlb->target_as->name ? iotlb->target_as->name : "none");
  1037. goto out;
  1038. }
  1039. rcu_read_lock();
  1040. if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) {
  1041. error_report_err(local_err);
  1042. goto out_unlock;
  1043. }
  1044. ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
  1045. translated_addr, &local_err);
  1046. if (ret) {
  1047. error_prepend(&local_err,
  1048. "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
  1049. "0x%"HWADDR_PRIx") failed - ", bcontainer, iova,
  1050. iotlb->addr_mask + 1);
  1051. error_report_err(local_err);
  1052. }
  1053. out_unlock:
  1054. rcu_read_unlock();
  1055. out:
  1056. if (ret) {
  1057. vfio_set_migration_error(ret);
  1058. }
  1059. }
  1060. static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
  1061. void *opaque)
  1062. {
  1063. const hwaddr size = int128_get64(section->size);
  1064. const hwaddr iova = section->offset_within_address_space;
  1065. const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
  1066. section->offset_within_region;
  1067. VFIORamDiscardListener *vrdl = opaque;
  1068. Error *local_err = NULL;
  1069. int ret;
  1070. /*
  1071. * Sync the whole mapped region (spanning multiple individual mappings)
  1072. * in one go.
  1073. */
  1074. ret = vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr,
  1075. &local_err);
  1076. if (ret) {
  1077. error_report_err(local_err);
  1078. }
  1079. return ret;
  1080. }
  1081. static int
  1082. vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer,
  1083. MemoryRegionSection *section)
  1084. {
  1085. RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
  1086. VFIORamDiscardListener *vrdl = NULL;
  1087. QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
  1088. if (vrdl->mr == section->mr &&
  1089. vrdl->offset_within_address_space ==
  1090. section->offset_within_address_space) {
  1091. break;
  1092. }
  1093. }
  1094. if (!vrdl) {
  1095. hw_error("vfio: Trying to sync missing RAM discard listener");
  1096. }
  1097. /*
  1098. * We only want/can synchronize the bitmap for actually mapped parts -
  1099. * which correspond to populated parts. Replay all populated parts.
  1100. */
  1101. return ram_discard_manager_replay_populated(rdm, section,
  1102. vfio_ram_discard_get_dirty_bitmap,
  1103. &vrdl);
  1104. }
  1105. static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer,
  1106. MemoryRegionSection *section)
  1107. {
  1108. VFIOGuestIOMMU *giommu;
  1109. bool found = false;
  1110. Int128 llend;
  1111. vfio_giommu_dirty_notifier gdn;
  1112. int idx;
  1113. QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
  1114. if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
  1115. giommu->n.start == section->offset_within_region) {
  1116. found = true;
  1117. break;
  1118. }
  1119. }
  1120. if (!found) {
  1121. return 0;
  1122. }
  1123. gdn.giommu = giommu;
  1124. idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
  1125. MEMTXATTRS_UNSPECIFIED);
  1126. llend = int128_add(int128_make64(section->offset_within_region),
  1127. section->size);
  1128. llend = int128_sub(llend, int128_one());
  1129. iommu_notifier_init(&gdn.n, vfio_iommu_map_dirty_notify, IOMMU_NOTIFIER_MAP,
  1130. section->offset_within_region, int128_get64(llend),
  1131. idx);
  1132. memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
  1133. return 0;
  1134. }
  1135. static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer,
  1136. MemoryRegionSection *section, Error **errp)
  1137. {
  1138. ram_addr_t ram_addr;
  1139. if (memory_region_is_iommu(section->mr)) {
  1140. return vfio_sync_iommu_dirty_bitmap(bcontainer, section);
  1141. } else if (memory_region_has_ram_discard_manager(section->mr)) {
  1142. int ret;
  1143. ret = vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section);
  1144. if (ret) {
  1145. error_setg(errp,
  1146. "Failed to sync dirty bitmap with RAM discard listener");
  1147. }
  1148. return ret;
  1149. }
  1150. ram_addr = memory_region_get_ram_addr(section->mr) +
  1151. section->offset_within_region;
  1152. return vfio_get_dirty_bitmap(bcontainer,
  1153. REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
  1154. int128_get64(section->size), ram_addr, errp);
  1155. }
  1156. static void vfio_listener_log_sync(MemoryListener *listener,
  1157. MemoryRegionSection *section)
  1158. {
  1159. VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
  1160. listener);
  1161. int ret;
  1162. Error *local_err = NULL;
  1163. if (vfio_listener_skipped_section(section)) {
  1164. return;
  1165. }
  1166. if (vfio_log_sync_needed(bcontainer)) {
  1167. ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err);
  1168. if (ret) {
  1169. error_report_err(local_err);
  1170. vfio_set_migration_error(ret);
  1171. }
  1172. }
  1173. }
  1174. const MemoryListener vfio_memory_listener = {
  1175. .name = "vfio",
  1176. .region_add = vfio_listener_region_add,
  1177. .region_del = vfio_listener_region_del,
  1178. .log_global_start = vfio_listener_log_global_start,
  1179. .log_global_stop = vfio_listener_log_global_stop,
  1180. .log_sync = vfio_listener_log_sync,
  1181. };
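/*
 * System reset handler: let every realized device compute whether it needs
 * a reset, then issue hot resets for those that do.
 */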
  1182. void vfio_reset_handler(void *opaque)
  1183. {
  1184. VFIODevice *vbasedev;
  1185. QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
  1186. if (vbasedev->dev->realized) {
  1187. vbasedev->ops->vfio_compute_needs_reset(vbasedev);
  1188. }
  1189. }
  1190. QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
  1191. if (vbasedev->dev->realized && vbasedev->needs_reset) {
  1192. vbasedev->ops->vfio_hot_reset_multi(vbasedev);
  1193. }
  1194. }
  1195. }
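/*
 * Add a VFIO file descriptor to the KVM VFIO pseudo device, creating the
 * pseudo device on first use (see vfio_kvm_device_fd above). No-op when KVM
 * is not enabled.
 */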
  1196. int vfio_kvm_device_add_fd(int fd, Error **errp)
  1197. {
  1198. #ifdef CONFIG_KVM
  1199. struct kvm_device_attr attr = {
  1200. .group = KVM_DEV_VFIO_FILE,
  1201. .attr = KVM_DEV_VFIO_FILE_ADD,
  1202. .addr = (uint64_t)(unsigned long)&fd,
  1203. };
  1204. if (!kvm_enabled()) {
  1205. return 0;
  1206. }
  1207. if (vfio_kvm_device_fd < 0) {
  1208. struct kvm_create_device cd = {
  1209. .type = KVM_DEV_TYPE_VFIO,
  1210. };
  1211. if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
  1212. error_setg_errno(errp, errno, "Failed to create KVM VFIO device");
  1213. return -errno;
  1214. }
  1215. vfio_kvm_device_fd = cd.fd;
  1216. }
  1217. if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
  1218. error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device",
  1219. fd);
  1220. return -errno;
  1221. }
  1222. #endif
  1223. return 0;
  1224. }
  1225. int vfio_kvm_device_del_fd(int fd, Error **errp)
  1226. {
  1227. #ifdef CONFIG_KVM
  1228. struct kvm_device_attr attr = {
  1229. .group = KVM_DEV_VFIO_FILE,
  1230. .attr = KVM_DEV_VFIO_FILE_DEL,
  1231. .addr = (uint64_t)(unsigned long)&fd,
  1232. };
  1233. if (vfio_kvm_device_fd < 0) {
  1234. error_setg(errp, "KVM VFIO device isn't created yet");
  1235. return -EINVAL;
  1236. }
  1237. if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
  1238. error_setg_errno(errp, errno,
  1239. "Failed to remove fd %d from KVM VFIO device", fd);
  1240. return -errno;
  1241. }
  1242. #endif
  1243. return 0;
  1244. }
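/*
 * Return the VFIOAddressSpace for 'as', creating it on first use; creating
 * the first one also registers the VFIO reset handler.
 */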
  1245. VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
  1246. {
  1247. VFIOAddressSpace *space;
  1248. QLIST_FOREACH(space, &vfio_address_spaces, list) {
  1249. if (space->as == as) {
  1250. return space;
  1251. }
  1252. }
  1253. /* No suitable VFIOAddressSpace, create a new one */
  1254. space = g_malloc0(sizeof(*space));
  1255. space->as = as;
  1256. QLIST_INIT(&space->containers);
  1257. if (QLIST_EMPTY(&vfio_address_spaces)) {
  1258. qemu_register_reset(vfio_reset_handler, NULL);
  1259. }
  1260. QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
  1261. return space;
  1262. }
  1263. void vfio_put_address_space(VFIOAddressSpace *space)
  1264. {
  1265. if (!QLIST_EMPTY(&space->containers)) {
  1266. return;
  1267. }
  1268. QLIST_REMOVE(space, list);
  1269. g_free(space);
  1270. if (QLIST_EMPTY(&vfio_address_spaces)) {
  1271. qemu_unregister_reset(vfio_reset_handler, NULL);
  1272. }
  1273. }
  1274. void vfio_address_space_insert(VFIOAddressSpace *space,
  1275. VFIOContainerBase *bcontainer)
  1276. {
  1277. QLIST_INSERT_HEAD(&space->containers, bcontainer, next);
  1278. bcontainer->space = space;
  1279. }
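/*
 * Issue VFIO_DEVICE_GET_INFO, growing the buffer and retrying whenever the
 * kernel reports a larger argsz than supplied. Returns NULL on ioctl
 * failure; the caller frees the result with g_free().
 */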
  1280. struct vfio_device_info *vfio_get_device_info(int fd)
  1281. {
  1282. struct vfio_device_info *info;
  1283. uint32_t argsz = sizeof(*info);
  1284. info = g_malloc0(argsz);
  1285. retry:
  1286. info->argsz = argsz;
  1287. if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
  1288. g_free(info);
  1289. return NULL;
  1290. }
  1291. if (info->argsz > argsz) {
  1292. argsz = info->argsz;
  1293. info = g_realloc(info, argsz);
  1294. goto retry;
  1295. }
  1296. return info;
  1297. }
  1298. bool vfio_attach_device(char *name, VFIODevice *vbasedev,
  1299. AddressSpace *as, Error **errp)
  1300. {
  1301. const VFIOIOMMUClass *ops =
  1302. VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
  1303. HostIOMMUDevice *hiod = NULL;
  1304. if (vbasedev->iommufd) {
  1305. ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
  1306. }
  1307. assert(ops);
  1308. if (!vbasedev->mdev) {
  1309. hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename));
  1310. vbasedev->hiod = hiod;
  1311. }
  1312. if (!ops->attach_device(name, vbasedev, as, errp)) {
  1313. object_unref(hiod);
  1314. vbasedev->hiod = NULL;
  1315. return false;
  1316. }
  1317. return true;
  1318. }
  1319. void vfio_detach_device(VFIODevice *vbasedev)
  1320. {
  1321. if (!vbasedev->bcontainer) {
  1322. return;
  1323. }
  1324. object_unref(vbasedev->hiod);
  1325. VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev);
  1326. }