/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "hw/vfio/pci.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/range.h"
#include "system/kvm.h"
#include "system/reset.h"
#include "system/runstate.h"
#include "trace.h"
#include "qapi/error.h"
#include "migration/misc.h"
#include "migration/blocker.h"
#include "migration/qemu-file.h"
#include "system/tcg.h"
#include "system/tpm.h"

VFIODeviceList vfio_device_list =
    QLIST_HEAD_INITIALIZER(vfio_device_list);
static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
    QLIST_HEAD_INITIALIZER(vfio_address_spaces);

#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM. Once created it lives
 * for the life of the VM. Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm. Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 */
int vfio_kvm_device_fd = -1;
#endif

/*
 * Device state interfaces
 */
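/*
 * Return true if VFIO migration is active: at least one VFIO device is
 * present and none of them carries a migration blocker.
 */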
bool vfio_mig_active(void)
{
    VFIODevice *vbasedev;

    if (QLIST_EMPTY(&vfio_device_list)) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
        if (vbasedev->migration_blocker) {
            return false;
        }
    }
    return true;
}

static Error *multiple_devices_migration_blocker;

/*
 * Multiple devices migration is allowed only if all devices support P2P
 * migration. Single device migration is allowed regardless of P2P migration
 * support.
 */
static bool vfio_multiple_devices_migration_is_supported(void)
{
    VFIODevice *vbasedev;
    unsigned int device_num = 0;
    bool all_support_p2p = true;

    QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
        if (vbasedev->migration) {
            device_num++;

            if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) {
                all_support_p2p = false;
            }
        }
    }

    return all_support_p2p || device_num <= 1;
}
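/*
 * Install a migration blocker when multiple VFIO devices are in use and not
 * all of them support P2P migration. With enable_migration=on this is a hard
 * error instead of a blocker.
 */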
int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
{
    int ret;

    if (vfio_multiple_devices_migration_is_supported()) {
        return 0;
    }

    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_setg(errp, "Multiple VFIO devices migration is supported only if "
                   "all of them support P2P migration");
        return -EINVAL;
    }

    if (multiple_devices_migration_blocker) {
        return 0;
    }

    error_setg(&multiple_devices_migration_blocker,
               "Multiple VFIO devices migration is supported only if all of "
               "them support P2P migration");
    ret = migrate_add_blocker_normal(&multiple_devices_migration_blocker, errp);

    return ret;
}

void vfio_unblock_multiple_devices_migration(void)
{
    if (!multiple_devices_migration_blocker ||
        !vfio_multiple_devices_migration_is_supported()) {
        return;
    }

    migrate_del_blocker(&multiple_devices_migration_blocker);
}

bool vfio_viommu_preset(VFIODevice *vbasedev)
{
    return vbasedev->bcontainer->space->as != &address_space_memory;
}

static void vfio_set_migration_error(int ret)
{
    if (migration_is_running()) {
        migration_file_set_error(ret, NULL);
    }
}

bool vfio_device_state_is_running(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
           migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P;
}

bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ||
           migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P;
}

static bool vfio_devices_all_device_dirty_tracking_started(
    const VFIOContainerBase *bcontainer)
{
    VFIODevice *vbasedev;

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (!vbasedev->dirty_tracking) {
            return false;
        }
    }

    return true;
}

bool vfio_devices_all_dirty_tracking_started(
    const VFIOContainerBase *bcontainer)
{
    return vfio_devices_all_device_dirty_tracking_started(bcontainer) ||
           bcontainer->dirty_pages_started;
}

static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer)
{
    VFIODevice *vbasedev;

    if (!vfio_devices_all_dirty_tracking_started(bcontainer)) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        VFIOMigration *migration = vbasedev->migration;

        if (!migration) {
            return false;
        }

        if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
            (vfio_device_state_is_running(vbasedev) ||
             vfio_device_state_is_precopy(vbasedev))) {
            return false;
        }
    }
    return true;
}

bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer)
{
    VFIODevice *vbasedev;

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) {
            return false;
        }
        if (!vbasedev->dirty_pages_supported) {
            return false;
        }
    }

    return true;
}
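/*
 * Return true if the memory listener should ignore this section: it is
 * neither RAM nor an IOMMU region, it is protected, or it lives in the
 * upper half of the 64-bit address space.
 */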
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return (!memory_region_is_ram(section->mr) &&
            !memory_region_is_iommu(section->mr)) ||
           memory_region_is_protected(section->mr) ||
           /*
            * Sizing an enabled 64-bit BAR can cause spurious mappings to
            * addresses in the upper part of the 64-bit address space. These
            * are never accessed by the CPU and beyond the address width of
            * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
            */
           section->offset_within_address_space & (1ULL << 63);
}

/* Called with rcu_read_lock held. */
static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
                               ram_addr_t *ram_addr, bool *read_only,
                               Error **errp)
{
    bool ret, mr_has_discard_manager;

    ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
                               &mr_has_discard_manager, errp);
    if (ret && mr_has_discard_manager) {
        /*
         * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
         * pages will remain pinned inside vfio until unmapped, resulting in a
         * higher memory consumption than expected. If memory would get
         * populated again later, there would be an inconsistency between pages
         * pinned by vfio and pages seen by QEMU. This is the case until
         * unmapped from the IOMMU (e.g., during device reset).
         *
         * With malicious guests, we really only care about pinning more memory
         * than expected. RLIMIT_MEMLOCK set for the user/process can never be
         * exceeded and can be used to mitigate this problem.
         */
        warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
                         " RAM (e.g., virtio-mem) works, however, malicious"
                         " guests can trigger pinning of more memory than"
                         " intended via an IOMMU. It's possible to mitigate this"
                         " by setting/adjusting RLIMIT_MEMLOCK.");
    }
    return ret;
}
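/*
 * IOMMU notifier for a guest IOMMU region: translate the IOTLB entry and
 * propagate the resulting map or unmap to the VFIO container.
 */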
static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainerBase *bcontainer = giommu->bcontainer;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    void *vaddr;
    int ret;
    Error *local_err = NULL;

    trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
                                iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        vfio_set_migration_error(-EINVAL);
        return;
    }

    rcu_read_lock();

    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        bool read_only;

        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) {
            error_report_err(local_err);
            goto out;
        }
        /*
         * vaddr is only valid until rcu_read_unlock(). But after
         * vfio_dma_map has set up the mapping the pages will be
         * pinned by the kernel. This makes sure that the RAM backend
         * of vaddr will always be there, even if the memory object is
         * destroyed and its backing memory munmap-ed.
         */
        ret = vfio_container_dma_map(bcontainer, iova,
                                     iotlb->addr_mask + 1, vaddr,
                                     read_only);
        if (ret) {
            error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%s)",
                         bcontainer, iova,
                         iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
        }
    } else {
        ret = vfio_container_dma_unmap(bcontainer, iova,
                                       iotlb->addr_mask + 1, iotlb);
        if (ret) {
            error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%s)",
                         bcontainer, iova,
                         iotlb->addr_mask + 1, ret, strerror(-ret));
            vfio_set_migration_error(ret);
        }
    }
out:
    rcu_read_unlock();
}

static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    VFIOContainerBase *bcontainer = vrdl->bcontainer;
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    int ret;

    /* Unmap with a single call. */
    ret = vfio_container_dma_unmap(bcontainer, iova, size, NULL);
    if (ret) {
        error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
                     strerror(-ret));
    }
}

static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    VFIOContainerBase *bcontainer = vrdl->bcontainer;
    const hwaddr end = section->offset_within_region +
                       int128_get64(section->size);
    hwaddr start, next, iova;
    void *vaddr;
    int ret;

    /*
     * Map in (aligned within memory region) minimum granularity, so we can
     * unmap in minimum granularity later.
     */
    for (start = section->offset_within_region; start < end; start = next) {
        next = ROUND_UP(start + 1, vrdl->granularity);
        next = MIN(next, end);

        iova = start - section->offset_within_region +
               section->offset_within_address_space;
        vaddr = memory_region_get_ram_ptr(section->mr) + start;

        ret = vfio_container_dma_map(bcontainer, iova, next - start,
                                     vaddr, section->readonly);
        if (ret) {
            /* Rollback */
            vfio_ram_discard_notify_discard(rdl, section);
            return ret;
        }
    }
    return 0;
}
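/*
 * Register a RamDiscardListener for a RAM section managed by a
 * RamDiscardManager (e.g. virtio-mem), so that only populated parts get
 * DMA-mapped and mappings are updated on populate/discard notifications.
 */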
static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer,
                                               MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    int target_page_size = qemu_target_page_size();
    VFIORamDiscardListener *vrdl;

    /* Ignore some corner cases not relevant in practice. */
    g_assert(QEMU_IS_ALIGNED(section->offset_within_region, target_page_size));
    g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
                             target_page_size));
    g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), target_page_size));

    vrdl = g_new0(VFIORamDiscardListener, 1);
    vrdl->bcontainer = bcontainer;
    vrdl->mr = section->mr;
    vrdl->offset_within_address_space = section->offset_within_address_space;
    vrdl->size = int128_get64(section->size);
    vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
                                                                section->mr);

    g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
    g_assert(bcontainer->pgsizes &&
             vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes));

    ram_discard_listener_init(&vrdl->listener,
                              vfio_ram_discard_notify_populate,
                              vfio_ram_discard_notify_discard, true);
    ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
    QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next);

    /*
     * Sanity-check if we have a theoretically problematic setup where we could
     * exceed the maximum number of possible DMA mappings over time. We assume
     * that each mapped section in the same address space as a RamDiscardManager
     * section consumes exactly one DMA mapping, with the exception of
     * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
     * in the same address space as RamDiscardManager sections.
     *
     * We assume that each section in the address space consumes one memslot.
     * We take the number of KVM memory slots as a best guess for the maximum
     * number of sections in the address space we could have over time,
     * also consuming DMA mappings.
     */
    if (bcontainer->dma_max_mappings) {
        unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;

#ifdef CONFIG_KVM
        if (kvm_enabled()) {
            max_memslots = kvm_get_max_memslots();
        }
#endif

        QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
            hwaddr start, end;

            start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
                                    vrdl->granularity);
            end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
                           vrdl->granularity);
            vrdl_mappings += (end - start) / vrdl->granularity;
            vrdl_count++;
        }

        if (vrdl_mappings + max_memslots - vrdl_count >
            bcontainer->dma_max_mappings) {
            warn_report("%s: possibly running out of DMA mappings. E.g., try"
                        " increasing the 'block-size' of virtio-mem devices."
                        " Maximum possible DMA mappings: %d, Maximum possible"
                        " memslots: %d", __func__, bcontainer->dma_max_mappings,
                        max_memslots);
        }
    }
}

static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer,
                                                 MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to unregister missing RAM discard listener");
    }

    ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
    QLIST_REMOVE(vrdl, next);
    g_free(vrdl);
}
static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!TPM_IS_CRB(mr->owner)) {
        return false;
    }

    /* this is a known safe misaligned region, just trace for debug purpose */
    trace_vfio_known_safe_misalignment(memory_region_name(mr),
                                       section->offset_within_address_space,
                                       section->offset_within_region,
                                       qemu_real_host_page_size());
    return true;
}

static bool vfio_listener_valid_section(MemoryRegionSection *section,
                                        const char *name)
{
    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_skip(name,
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return false;
    }

    if (unlikely((section->offset_within_address_space &
                  ~qemu_real_host_page_mask()) !=
                 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
        if (!vfio_known_safe_misalignment(section)) {
            error_report("%s received unaligned region %s iova=0x%"PRIx64
                         " offset_within_region=0x%"PRIx64
                         " qemu_real_host_page_size=0x%"PRIxPTR,
                         __func__, memory_region_name(section->mr),
                         section->offset_within_address_space,
                         section->offset_within_region,
                         qemu_real_host_page_size());
        }
        return false;
    }

    return true;
}
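/*
 * Compute the host-page-aligned IOVA range covered by a section. Returns
 * false when the aligned range is empty.
 */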
static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section,
                                        hwaddr *out_iova, hwaddr *out_end,
                                        Int128 *out_llend)
{
    Int128 llend;
    hwaddr iova;

    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));

    if (int128_ge(int128_make64(iova), llend)) {
        return false;
    }

    *out_iova = iova;
    *out_end = int128_get64(int128_sub(llend, int128_one()));
    if (out_llend) {
        *out_llend = llend;
    }
    return true;
}

static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp)
{
    /*
     * MMIO region mapping failures are not fatal but in this case PCI
     * peer-to-peer transactions are broken.
     */
    if (vbasedev && vbasedev->type == VFIO_DEVICE_TYPE_PCI) {
        error_append_hint(errp, "%s: PCI peer-to-peer transactions "
                          "on BARs are not supported.\n", vbasedev->name);
    }
}
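/*
 * MemoryListener .region_add callback: DMA-map newly visible guest memory
 * into the container. IOMMU regions get an IOMMU notifier instead, and
 * RamDiscardManager-backed RAM is handled via a RamDiscardListener.
 */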
static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    void *vaddr;
    int ret;
    Error *err = NULL;

    if (!vfio_listener_valid_section(section, "region_add")) {
        return;
    }

    if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
                                     &llend)) {
        if (memory_region_is_ram_device(section->mr)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                qemu_real_host_page_size());
        }
        return;
    }

    /* PPC64/pseries machine only */
    if (!vfio_container_add_section_window(bcontainer, section, &err)) {
        goto mmio_dma_error;
    }

    memory_region_ref(section->mr);

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        int iommu_idx;

        trace_vfio_listener_region_add_iommu(section->mr->name, iova, end);
        /*
         * FIXME: For VFIO iommu types which have KVM acceleration to
         * avoid bouncing all map/unmaps through qemu this way, this
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu_mr = iommu_mr;
        giommu->iommu_offset = section->offset_within_address_space -
                               section->offset_within_region;
        giommu->bcontainer = bcontainer;
        llend = int128_add(int128_make64(section->offset_within_region),
                           section->size);
        llend = int128_sub(llend, int128_one());
        iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                       MEMTXATTRS_UNSPECIFIED);
        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
                            IOMMU_NOTIFIER_IOTLB_EVENTS,
                            section->offset_within_region,
                            int128_get64(llend),
                            iommu_idx);

        ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
                                                    &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }
        QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next);
        memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);

        return;
    }

    /* Here we assume that memory_region_is_ram(section->mr)==true */

    /*
     * For RAM memory regions with a RamDiscardManager, we only want to map the
     * actually populated parts - and update the mapping whenever we're notified
     * about changes.
     */
    if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_register_ram_discard_listener(bcontainer, section);
        return;
    }

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vfio_listener_region_add_ram(iova, end, vaddr);

    llsize = int128_sub(llend, int128_make64(iova));

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;

        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                pgmask + 1);
            return;
        }
    }

    ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
                                 vaddr, section->readonly);
    if (ret) {
        error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
                   "0x%"HWADDR_PRIx", %p) = %d (%s)",
                   bcontainer, iova, int128_get64(llsize), vaddr, ret,
                   strerror(-ret));
    mmio_dma_error:
        if (memory_region_is_ram_device(section->mr)) {
            /* Allow unexpected mappings not to be fatal for RAM devices */
            VFIODevice *vbasedev =
                vfio_get_vfio_device(memory_region_owner(section->mr));
            vfio_device_error_append(vbasedev, &err);
            warn_report_err_once(err);
            return;
        }
        goto fail;
    }

    return;

fail:
    if (!bcontainer->initialized) {
        /*
         * At machine init time or when the device is attached to the
         * VM, store the first error in the container so we can
         * gracefully fail the device realize routine.
         */
        if (!bcontainer->error) {
            error_propagate_prepend(&bcontainer->error, err,
                                    "Region %s: ",
                                    memory_region_name(section->mr));
        } else {
            error_free(err);
        }
    } else {
        /*
         * At runtime, there's not much we can do other than throw a
         * hardware error.
         */
        error_report_err(err);
        hw_error("vfio: DMA mapping failed, unable to continue");
    }
}
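/*
 * MemoryListener .region_del callback: tear down the DMA mappings, IOMMU
 * notifiers or RamDiscardListeners that were set up in region_add.
 */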
static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    int ret;
    bool try_unmap = true;

    if (!vfio_listener_valid_section(section, "region_del")) {
        return;
    }

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        trace_vfio_listener_region_del_iommu(section->mr->name);
        QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                memory_region_unregister_iommu_notifier(section->mr,
                                                        &giommu->n);
                QLIST_REMOVE(giommu, giommu_next);
                g_free(giommu);
                break;
            }
        }

        /*
         * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
         * might have been copied into VFIO. This works for a page table
         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
         * That may not be true for all IOMMU types.
         */
    }

    if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
                                     &llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    trace_vfio_listener_region_del(iova, end);

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask;

        pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_unregister_ram_discard_listener(bcontainer, section);
        /* Unregistering will trigger an unmap. */
        try_unmap = false;
    }

    if (try_unmap) {
        if (int128_eq(llsize, int128_2_64())) {
            /* The unmap ioctl doesn't accept a full 64-bit span. */
            llsize = int128_rshift(llsize, 1);
            ret = vfio_container_dma_unmap(bcontainer, iova,
                                           int128_get64(llsize), NULL);
            if (ret) {
                error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                             "0x%"HWADDR_PRIx") = %d (%s)",
                             bcontainer, iova, int128_get64(llsize), ret,
                             strerror(-ret));
            }
            iova += int128_get64(llsize);
        }
        ret = vfio_container_dma_unmap(bcontainer, iova,
                                       int128_get64(llsize), NULL);
        if (ret) {
            error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%s)",
                         bcontainer, iova, int128_get64(llsize), ret,
                         strerror(-ret));
        }
    }

    memory_region_unref(section->mr);

    /* PPC64/pseries machine only */
    vfio_container_del_section_window(bcontainer, section);
}

typedef struct VFIODirtyRanges {
    hwaddr min32;
    hwaddr max32;
    hwaddr min64;
    hwaddr max64;
    hwaddr minpci64;
    hwaddr maxpci64;
} VFIODirtyRanges;

typedef struct VFIODirtyRangesListener {
    VFIOContainerBase *bcontainer;
    VFIODirtyRanges ranges;
    MemoryListener listener;
} VFIODirtyRangesListener;

static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
                                     VFIOContainerBase *bcontainer)
{
    VFIOPCIDevice *pcidev;
    VFIODevice *vbasedev;
    Object *owner;

    owner = memory_region_owner(section->mr);

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
            continue;
        }
        pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
        if (OBJECT(pcidev) == owner) {
            return true;
        }
    }

    return false;
}

static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range,
                                             hwaddr iova, hwaddr end,
                                             bool update_pci)
{
    hwaddr *min, *max;

    /*
     * The address space passed to the dirty tracker is reduced to three ranges:
     * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
     * PCI 64-bit hole.
     *
     * The underlying reports of dirty will query a sub-interval of each of
     * these ranges.
     *
     * The purpose of the three range handling is to handle known cases of big
     * holes in the address space, like the x86 AMD 1T hole, and firmware (like
     * OVMF) which may relocate the pci-hole64 to the end of the address space.
     * The latter would otherwise generate large ranges for tracking, stressing
     * the limits of supported hardware. The pci-hole32 will always be below 4G
     * (overlapping or not) so it doesn't need special handling and is part of
     * the 32-bit range.
     *
     * The alternative would be an IOVATree but that has a much bigger runtime
     * overhead and unnecessary complexity.
     */
    if (update_pci && iova >= UINT32_MAX) {
        min = &range->minpci64;
        max = &range->maxpci64;
    } else {
        min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
        max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
    }
    if (*min > iova) {
        *min = iova;
    }
    if (*max < end) {
        *max = end;
    }

    trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
}

static void vfio_dirty_tracking_update(MemoryListener *listener,
                                       MemoryRegionSection *section)
{
    VFIODirtyRangesListener *dirty =
        container_of(listener, VFIODirtyRangesListener, listener);
    hwaddr iova, end;

    if (!vfio_listener_valid_section(section, "tracking_update") ||
        !vfio_get_section_iova_range(dirty->bcontainer, section,
                                     &iova, &end, NULL)) {
        return;
    }

    vfio_dirty_tracking_update_range(&dirty->ranges, iova, end,
                      vfio_section_is_vfio_pci(section, dirty->bcontainer));
}

static const MemoryListener vfio_dirty_tracking_listener = {
    .name = "vfio-tracking",
    .region_add = vfio_dirty_tracking_update,
};
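/*
 * Walk the container's address space with a temporary listener to collect
 * the IOVA ranges that device dirty tracking must cover.
 */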
static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer,
                                     VFIODirtyRanges *ranges)
{
    VFIODirtyRangesListener dirty;

    memset(&dirty, 0, sizeof(dirty));
    dirty.ranges.min32 = UINT32_MAX;
    dirty.ranges.min64 = UINT64_MAX;
    dirty.ranges.minpci64 = UINT64_MAX;
    dirty.listener = vfio_dirty_tracking_listener;
    dirty.bcontainer = bcontainer;

    memory_listener_register(&dirty.listener,
                             bcontainer->space->as);

    *ranges = dirty.ranges;

    /*
     * The memory listener is synchronous, and used to calculate the range
     * to dirty tracking. Unregister it after we are done as we are not
     * interested in any follow-up updates.
     */
    memory_listener_unregister(&dirty.listener);
}

static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    VFIODevice *vbasedev;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (!vbasedev->dirty_tracking) {
            continue;
        }

        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
            warn_report("%s: Failed to stop DMA logging, err %d (%s)",
                        vbasedev->name, -errno, strerror(errno));
        }
        vbasedev->dirty_tracking = false;
    }
}

static struct vfio_device_feature *
vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer,
                                             VFIODirtyRanges *tracking)
{
    struct vfio_device_feature *feature;
    size_t feature_size;
    struct vfio_device_feature_dma_logging_control *control;
    struct vfio_device_feature_dma_logging_range *ranges;

    feature_size = sizeof(struct vfio_device_feature) +
                   sizeof(struct vfio_device_feature_dma_logging_control);
    feature = g_try_malloc0(feature_size);
    if (!feature) {
        errno = ENOMEM;
        return NULL;
    }
    feature->argsz = feature_size;
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    control = (struct vfio_device_feature_dma_logging_control *)feature->data;
    control->page_size = qemu_real_host_page_size();

    /*
     * DMA logging uAPI guarantees to support at least a number of ranges that
     * fits into a single host kernel base page.
     */
    control->num_ranges = !!tracking->max32 + !!tracking->max64 +
                          !!tracking->maxpci64;
    ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
                        control->num_ranges);
    if (!ranges) {
        g_free(feature);
        errno = ENOMEM;
        return NULL;
    }

    control->ranges = (uintptr_t)ranges;
    if (tracking->max32) {
        ranges->iova = tracking->min32;
        ranges->length = (tracking->max32 - tracking->min32) + 1;
        ranges++;
    }
    if (tracking->max64) {
        ranges->iova = tracking->min64;
        ranges->length = (tracking->max64 - tracking->min64) + 1;
        ranges++;
    }
    if (tracking->maxpci64) {
        ranges->iova = tracking->minpci64;
        ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1;
    }

    trace_vfio_device_dirty_tracking_start(control->num_ranges,
                                           tracking->min32, tracking->max32,
                                           tracking->min64, tracking->max64,
                                           tracking->minpci64, tracking->maxpci64);

    return feature;
}

static void vfio_device_feature_dma_logging_start_destroy(
    struct vfio_device_feature *feature)
{
    struct vfio_device_feature_dma_logging_control *control =
        (struct vfio_device_feature_dma_logging_control *)feature->data;
    struct vfio_device_feature_dma_logging_range *ranges =
        (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;

    g_free(ranges);
    g_free(feature);
}
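/*
 * Enable device DMA logging on every device in the container via the
 * VFIO_DEVICE_FEATURE_DMA_LOGGING_START feature; on failure, logging is
 * stopped again on the devices that were already started.
 */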
static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer,
                                           Error **errp)
{
    struct vfio_device_feature *feature;
    VFIODirtyRanges ranges;
    VFIODevice *vbasedev;
    int ret = 0;

    vfio_dirty_tracking_init(bcontainer, &ranges);
    feature = vfio_device_feature_dma_logging_start_create(bcontainer,
                                                           &ranges);
    if (!feature) {
        error_setg_errno(errp, errno, "Failed to prepare DMA logging");
        return false;
    }

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (vbasedev->dirty_tracking) {
            continue;
        }

        ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
        if (ret) {
            ret = -errno;
            error_setg_errno(errp, errno, "%s: Failed to start DMA logging",
                             vbasedev->name);
            goto out;
        }
        vbasedev->dirty_tracking = true;
    }

out:
    if (ret) {
        vfio_devices_dma_logging_stop(bcontainer);
    }

    vfio_device_feature_dma_logging_start_destroy(feature);

    return ret == 0;
}

static bool vfio_listener_log_global_start(MemoryListener *listener,
                                           Error **errp)
{
    ERRP_GUARD();
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    bool ret;

    if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
        ret = vfio_devices_dma_logging_start(bcontainer, errp);
    } else {
        ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0;
    }

    if (!ret) {
        error_prepend(errp, "vfio: Could not start dirty page tracking - ");
    }
    return ret;
}

static void vfio_listener_log_global_stop(MemoryListener *listener)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    Error *local_err = NULL;
    int ret = 0;

    if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
        vfio_devices_dma_logging_stop(bcontainer);
    } else {
        ret = vfio_container_set_dirty_page_tracking(bcontainer, false,
                                                     &local_err);
    }

    if (ret) {
        error_prepend(&local_err,
                      "vfio: Could not stop dirty page tracking - ");
        error_report_err(local_err);
        vfio_set_migration_error(ret);
    }
}
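/*
 * Fetch the dirty bitmap for one IOVA range from a single device using the
 * VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT feature.
 */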
static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
                                          hwaddr size, void *bitmap)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                        sizeof(struct vfio_device_feature_dma_logging_report),
                        sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_dma_logging_report *report =
        (struct vfio_device_feature_dma_logging_report *)feature->data;

    report->iova = iova;
    report->length = size;
    report->page_size = qemu_real_host_page_size();
    report->bitmap = (uintptr_t)bitmap;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    return 0;
}

int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
                 VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp)
{
    VFIODevice *vbasedev;
    int ret;

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        ret = vfio_device_dma_logging_report(vbasedev, iova, size,
                                             vbmap->bitmap);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "%s: Failed to get DMA logging report, iova: "
                             "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx,
                             vbasedev->name, iova, size);
            return ret;
        }
    }

    return 0;
}
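/*
 * Query the dirty bitmap for an IOVA range, either per device or from the
 * container, and mark the corresponding guest RAM pages dirty. Without any
 * dirty tracking support, conservatively dirty the whole range.
 */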
int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
                          uint64_t size, ram_addr_t ram_addr, Error **errp)
{
    bool all_device_dirty_tracking =
        vfio_devices_all_device_dirty_tracking(bcontainer);
    uint64_t dirty_pages;
    VFIOBitmap vbmap;
    int ret;

    if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) {
        cpu_physical_memory_set_dirty_range(ram_addr, size,
                                            tcg_enabled() ? DIRTY_CLIENTS_ALL :
                                            DIRTY_CLIENTS_NOCODE);
        return 0;
    }

    ret = vfio_bitmap_alloc(&vbmap, size);
    if (ret) {
        error_setg_errno(errp, -ret,
                         "Failed to allocate dirty tracking bitmap");
        return ret;
    }

    if (all_device_dirty_tracking) {
        ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size,
                                              errp);
    } else {
        ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size,
                                                errp);
    }

    if (ret) {
        goto out;
    }

    dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
                                                         vbmap.pages);

    trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages);
out:
    g_free(vbmap.bitmap);

    return ret;
}

typedef struct {
    IOMMUNotifier n;
    VFIOGuestIOMMU *giommu;
} vfio_giommu_dirty_notifier;

static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    vfio_giommu_dirty_notifier *gdn = container_of(n,
                                                   vfio_giommu_dirty_notifier, n);
    VFIOGuestIOMMU *giommu = gdn->giommu;
    VFIOContainerBase *bcontainer = giommu->bcontainer;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    ram_addr_t translated_addr;
    Error *local_err = NULL;
    int ret = -EINVAL;

    trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        goto out;
    }

    rcu_read_lock();
    if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) {
        error_report_err(local_err);
        goto out_unlock;
    }

    ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
                                translated_addr, &local_err);
    if (ret) {
        error_prepend(&local_err,
                      "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
                      "0x%"HWADDR_PRIx") failed - ", bcontainer, iova,
                      iotlb->addr_mask + 1);
        error_report_err(local_err);
    }

out_unlock:
    rcu_read_unlock();

out:
    if (ret) {
        vfio_set_migration_error(ret);
    }
}

static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
                                             void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
                                section->offset_within_region;
    VFIORamDiscardListener *vrdl = opaque;
    Error *local_err = NULL;
    int ret;

    /*
     * Sync the whole mapped region (spanning multiple individual mappings)
     * in one go.
     */
    ret = vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr,
                                &local_err);
    if (ret) {
        error_report_err(local_err);
    }
    return ret;
}

static int
vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer,
                                            MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to sync missing RAM discard listener");
    }

    /*
     * We only want/can synchronize the bitmap for actually mapped parts -
     * which correspond to populated parts. Replay all populated parts.
     * The listener itself is the opaque value consumed by the replay
     * callback above.
     */
    return ram_discard_manager_replay_populated(rdm, section,
                                                vfio_ram_discard_get_dirty_bitmap,
                                                vrdl);
}

static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section)
{
    VFIOGuestIOMMU *giommu;
    bool found = false;
    Int128 llend;
    vfio_giommu_dirty_notifier gdn;
    int idx;

    QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
        if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
            giommu->n.start == section->offset_within_region) {
            found = true;
            break;
        }
    }

    if (!found) {
        return 0;
    }

    gdn.giommu = giommu;
    idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
                                             MEMTXATTRS_UNSPECIFIED);

    llend = int128_add(int128_make64(section->offset_within_region),
                       section->size);
    llend = int128_sub(llend, int128_one());

    iommu_notifier_init(&gdn.n, vfio_iommu_map_dirty_notify, IOMMU_NOTIFIER_MAP,
                        section->offset_within_region, int128_get64(llend),
                        idx);
    memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);

    return 0;
}
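/*
 * Sync the dirty bitmap for one section, dispatching to the IOMMU or
 * RamDiscardManager variants where applicable.
 */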
static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer,
                                  MemoryRegionSection *section, Error **errp)
{
    ram_addr_t ram_addr;

    if (memory_region_is_iommu(section->mr)) {
        return vfio_sync_iommu_dirty_bitmap(bcontainer, section);
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        int ret;

        ret = vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section);
        if (ret) {
            error_setg(errp,
                       "Failed to sync dirty bitmap with RAM discard listener");
        }
        return ret;
    }

    ram_addr = memory_region_get_ram_addr(section->mr) +
               section->offset_within_region;

    return vfio_get_dirty_bitmap(bcontainer,
                   REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
                   int128_get64(section->size), ram_addr, errp);
}

static void vfio_listener_log_sync(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    int ret;
    Error *local_err = NULL;

    if (vfio_listener_skipped_section(section)) {
        return;
    }

    if (vfio_log_sync_needed(bcontainer)) {
        ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err);
        if (ret) {
            error_report_err(local_err);
            vfio_set_migration_error(ret);
        }
    }
}

const MemoryListener vfio_memory_listener = {
    .name = "vfio",
    .region_add = vfio_listener_region_add,
    .region_del = vfio_listener_region_del,
    .log_global_start = vfio_listener_log_global_start,
    .log_global_stop = vfio_listener_log_global_stop,
    .log_sync = vfio_listener_log_sync,
};

void vfio_reset_handler(void *opaque)
{
    VFIODevice *vbasedev;

    trace_vfio_reset_handler();
    QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
        if (vbasedev->dev->realized) {
            vbasedev->ops->vfio_compute_needs_reset(vbasedev);
        }
    }

    QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
        if (vbasedev->dev->realized && vbasedev->needs_reset) {
            vbasedev->ops->vfio_hot_reset_multi(vbasedev);
        }
    }
}
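/*
 * Add a VFIO file descriptor to the per-VM KVM VFIO pseudo device, creating
 * the pseudo device on first use. A no-op when KVM is not enabled.
 */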
int vfio_kvm_device_add_fd(int fd, Error **errp)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_FILE,
        .attr = KVM_DEV_VFIO_FILE_ADD,
        .addr = (uint64_t)(unsigned long)&fd,
    };

    if (!kvm_enabled()) {
        return 0;
    }

    if (vfio_kvm_device_fd < 0) {
        struct kvm_create_device cd = {
            .type = KVM_DEV_TYPE_VFIO,
        };

        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
            error_setg_errno(errp, errno, "Failed to create KVM VFIO device");
            return -errno;
        }

        vfio_kvm_device_fd = cd.fd;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device",
                         fd);
        return -errno;
    }
#endif
    return 0;
}

int vfio_kvm_device_del_fd(int fd, Error **errp)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_FILE,
        .attr = KVM_DEV_VFIO_FILE_DEL,
        .addr = (uint64_t)(unsigned long)&fd,
    };

    if (vfio_kvm_device_fd < 0) {
        error_setg(errp, "KVM VFIO device isn't created yet");
        return -EINVAL;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_setg_errno(errp, errno,
                         "Failed to remove fd %d from KVM VFIO device", fd);
        return -errno;
    }
#endif
    return 0;
}
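/*
 * Look up the VFIOAddressSpace for an AddressSpace, creating it (and
 * registering the VFIO reset handler on first use) if it doesn't exist yet.
 */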
VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
{
    VFIOAddressSpace *space;

    QLIST_FOREACH(space, &vfio_address_spaces, list) {
        if (space->as == as) {
            return space;
        }
    }

    /* No suitable VFIOAddressSpace, create a new one */
    space = g_malloc0(sizeof(*space));
    space->as = as;
    QLIST_INIT(&space->containers);

    if (QLIST_EMPTY(&vfio_address_spaces)) {
        qemu_register_reset(vfio_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);

    return space;
}

void vfio_put_address_space(VFIOAddressSpace *space)
{
    if (!QLIST_EMPTY(&space->containers)) {
        return;
    }

    QLIST_REMOVE(space, list);
    g_free(space);

    if (QLIST_EMPTY(&vfio_address_spaces)) {
        qemu_unregister_reset(vfio_reset_handler, NULL);
    }
}

void vfio_address_space_insert(VFIOAddressSpace *space,
                               VFIOContainerBase *bcontainer)
{
    QLIST_INSERT_HEAD(&space->containers, bcontainer, next);
    bcontainer->space = space;
}

struct vfio_device_info *vfio_get_device_info(int fd)
{
    struct vfio_device_info *info;
    uint32_t argsz = sizeof(*info);

    info = g_malloc0(argsz);

retry:
    info->argsz = argsz;

    if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
        g_free(info);
        return NULL;
    }

    if (info->argsz > argsz) {
        argsz = info->argsz;
        info = g_realloc(info, argsz);
        goto retry;
    }

    return info;
}
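/*
 * Attach a device to a container in the given address space, selecting the
 * legacy or iommufd VFIO IOMMU backend and creating the HostIOMMUDevice for
 * non-mdev devices.
 */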
bool vfio_attach_device(char *name, VFIODevice *vbasedev,
                        AddressSpace *as, Error **errp)
{
    const VFIOIOMMUClass *ops =
        VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
    HostIOMMUDevice *hiod = NULL;

    if (vbasedev->iommufd) {
        ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
    }

    assert(ops);

    if (!vbasedev->mdev) {
        hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename));
        vbasedev->hiod = hiod;
    }

    if (!ops->attach_device(name, vbasedev, as, errp)) {
        object_unref(hiod);
        vbasedev->hiod = NULL;
        return false;
    }

    return true;
}

void vfio_detach_device(VFIODevice *vbasedev)
{
    if (!vbasedev->bcontainer) {
        return;
    }
    object_unref(vbasedev->hiod);
    VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev);
}
  1347. }