/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "qemu/error-report.h"
#include "qemu/range.h"
#include "sysemu/reset.h"
#include "trace.h"
#include "qapi/error.h"
#include "pci.h"

VFIOGroupList vfio_group_list =
    QLIST_HEAD_INITIALIZER(vfio_group_list);

static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
{
    switch (container->iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
        /*
         * We support coordinated discarding of RAM via the RamDiscardManager.
         */
        return ram_block_uncoordinated_discard_disable(state);
    default:
        /*
         * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
         * RamDiscardManager, however, it is completely untested.
         *
         * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
         * completely the opposite of managing mapping/pinning dynamically as
         * required by RamDiscardManager. We would have to special-case sections
         * with a RamDiscardManager.
         */
        return ram_block_discard_disable(state);
    }
}
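
/*
 * Unmap a range and collect its dirty bitmap in the same
 * VFIO_IOMMU_UNMAP_DMA call, using the GET_DIRTY_BITMAP flag, so that
 * pages dirtied by the device are reported to the memory API via
 * cpu_physical_memory_set_dirty_lebitmap().
 */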
static int vfio_dma_unmap_bitmap(const VFIOContainer *container,
                                 hwaddr iova, ram_addr_t size,
                                 IOMMUTLBEntry *iotlb)
{
    const VFIOContainerBase *bcontainer = &container->bcontainer;
    struct vfio_iommu_type1_dma_unmap *unmap;
    struct vfio_bitmap *bitmap;
    VFIOBitmap vbmap;
    int ret;

    ret = vfio_bitmap_alloc(&vbmap, size);
    if (ret) {
        return ret;
    }

    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));

    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
    unmap->iova = iova;
    unmap->size = size;
    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
    bitmap = (struct vfio_bitmap *)&unmap->data;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
     * to qemu_real_host_page_size.
     */
    bitmap->pgsize = qemu_real_host_page_size();
    bitmap->size = vbmap.size;
    bitmap->data = (__u64 *)vbmap.bitmap;

    if (vbmap.size > bcontainer->max_dirty_bitmap_size) {
        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
        ret = -E2BIG;
        goto unmap_exit;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
    if (!ret) {
        cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
                iotlb->translated_addr, vbmap.pages);
    } else {
        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
    }

unmap_exit:
    g_free(unmap);
    g_free(vbmap.bitmap);

    return ret;
}

/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
                                 hwaddr iova, ram_addr_t size,
                                 IOMMUTLBEntry *iotlb)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };
    bool need_dirty_sync = false;
    int ret;
    Error *local_err = NULL;

    if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) {
        if (!vfio_devices_all_device_dirty_tracking(bcontainer) &&
            bcontainer->dirty_pages_supported) {
            return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
        }

        need_dirty_sync = true;
    }

    while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        /*
         * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
         * v4.15) where an overflow in its wrap-around check prevents us from
         * unmapping the last page of the address space. Test for the error
         * condition and re-try the unmap excluding the last page. The
         * expectation is that we've never mapped the last page anyway and this
         * unmap request comes via vIOMMU support which also makes it unlikely
         * that this page is used. This bug was introduced well after type1 v2
         * support was introduced, so we shouldn't need to test for v1. A fix
         * is queued for kernel v5.0 so this workaround can be removed once
         * affected kernels are sufficiently deprecated.
         */
        if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
            container->iommu_type == VFIO_TYPE1v2_IOMMU) {
            trace_vfio_legacy_dma_unmap_overflow_workaround();
            unmap.size -= 1ULL << ctz64(bcontainer->pgsizes);
            continue;
        }
        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }

    if (need_dirty_sync) {
        ret = vfio_get_dirty_bitmap(bcontainer, iova, size,
                                    iotlb->translated_addr, &local_err);
        if (ret) {
            error_report_err(local_err);
            return ret;
        }
    }

    return 0;
}

static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
                               ram_addr_t size, void *vaddr, bool readonly)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * Try the mapping, if it fails with EBUSY, unmap the region and try
     * again. This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY &&
         vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
    return -errno;
}

static int
vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer,
                                    bool start, Error **errp)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    int ret;
    struct vfio_iommu_type1_dirty_bitmap dirty = {
        .argsz = sizeof(dirty),
    };

    if (start) {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
    } else {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
    if (ret) {
        ret = -errno;
        error_setg_errno(errp, errno, "Failed to set dirty tracking flag 0x%x",
                         dirty.flags);
    }

    return ret;
}

static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
                     VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
    struct vfio_iommu_type1_dirty_bitmap_get *range;
    int ret;

    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));

    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
    range->iova = iova;
    range->size = size;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
     * to qemu_real_host_page_size.
     */
    range->bitmap.pgsize = qemu_real_host_page_size();
    range->bitmap.size = vbmap->size;
    range->bitmap.data = (__u64 *)vbmap->bitmap;

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
    if (ret) {
        ret = -errno;
        error_setg_errno(errp, errno,
                         "Failed to get dirty bitmap for iova: 0x%"PRIx64
                         " size: 0x%"PRIx64, (uint64_t)range->iova,
                         (uint64_t)range->size);
    }

    g_free(dbitmap);

    return ret;
}

static struct vfio_info_cap_header *
vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
                             unsigned int *avail)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_dma_avail *cap;

    /* If the capability cannot be found, assume no DMA limiting */
    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
    if (!hdr) {
        return false;
    }

    if (avail != NULL) {
        cap = (void *) hdr;
        *avail = cap->avail;
    }

    return true;
}

static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
                                     VFIOContainerBase *bcontainer)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_iova_range *cap;

    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
    if (!hdr) {
        return false;
    }

    cap = (void *)hdr;

    for (int i = 0; i < cap->nr_iovas; i++) {
        Range *range = g_new(Range, 1);

        range_set_bounds(range, cap->iova_ranges[i].start,
                         cap->iova_ranges[i].end);
        bcontainer->iova_ranges =
            range_list_insert(bcontainer->iova_ranges, range);
    }

    return true;
}

static void vfio_kvm_device_add_group(VFIOGroup *group)
{
    Error *err = NULL;

    if (vfio_kvm_device_add_fd(group->fd, &err)) {
        error_reportf_err(err, "group ID %d: ", group->groupid);
    }
}

static void vfio_kvm_device_del_group(VFIOGroup *group)
{
    Error *err = NULL;

    if (vfio_kvm_device_del_fd(group->fd, &err)) {
        error_reportf_err(err, "group ID %d: ", group->groupid);
    }
}

/*
 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
 */
static int vfio_get_iommu_type(int container_fd,
                               Error **errp)
{
    int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
                          VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
    int i;

    for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
        if (ioctl(container_fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
            return iommu_types[i];
        }
    }
    error_setg(errp, "No available IOMMU models");
    return -EINVAL;
}

/*
 * vfio_get_iommu_ops - get a VFIOIOMMUClass associated with a type
 */
static const char *vfio_get_iommu_class_name(int iommu_type)
{
    switch (iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
        return TYPE_VFIO_IOMMU_LEGACY;
        break;
    case VFIO_SPAPR_TCE_v2_IOMMU:
    case VFIO_SPAPR_TCE_IOMMU:
        return TYPE_VFIO_IOMMU_SPAPR;
        break;
    default:
        g_assert_not_reached();
    };
}

static bool vfio_set_iommu(int container_fd, int group_fd,
                           int *iommu_type, Error **errp)
{
    if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd)) {
        error_setg_errno(errp, errno, "Failed to set group container");
        return false;
    }

    while (ioctl(container_fd, VFIO_SET_IOMMU, *iommu_type)) {
        if (*iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
            /*
             * On sPAPR, despite the IOMMU subdriver always advertises v1 and
             * v2, the running platform may not support v2 and there is no
             * way to guess it until an IOMMU group gets added to the container.
             * So in case it fails with v2, try v1 as a fallback.
             */
            *iommu_type = VFIO_SPAPR_TCE_IOMMU;
            continue;
        }
        error_setg_errno(errp, errno, "Failed to set iommu for container");
        return false;
    }

    return true;
}

static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group,
                                            Error **errp)
{
    int iommu_type;
    const char *vioc_name;
    VFIOContainer *container;

    iommu_type = vfio_get_iommu_type(fd, errp);
    if (iommu_type < 0) {
        return NULL;
    }

    if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) {
        return NULL;
    }

    vioc_name = vfio_get_iommu_class_name(iommu_type);

    container = VFIO_IOMMU_LEGACY(object_new(vioc_name));
    container->fd = fd;
    container->iommu_type = iommu_type;
    return container;
}
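
/*
 * Query VFIO_IOMMU_GET_INFO.  The kernel reports the argsz it needs for
 * the full reply (including the capability chain), so grow the buffer
 * and retry until everything fits.
 */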
static int vfio_get_iommu_info(VFIOContainer *container,
                               struct vfio_iommu_type1_info **info)
{
    size_t argsz = sizeof(struct vfio_iommu_type1_info);

    *info = g_new0(struct vfio_iommu_type1_info, 1);
again:
    (*info)->argsz = argsz;

    if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if (((*info)->argsz > argsz)) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);
        goto again;
    }

    return 0;
}

static struct vfio_info_cap_header *
vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    struct vfio_info_cap_header *hdr;
    void *ptr = info;

    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
        if (hdr->id == id) {
            return hdr;
        }
    }

    return NULL;
}

static void vfio_get_iommu_info_migration(VFIOContainer *container,
                                          struct vfio_iommu_type1_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_migration *cap_mig;
    VFIOContainerBase *bcontainer = &container->bcontainer;

    hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
    if (!hdr) {
        return;
    }

    cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
                           header);

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty.
     */
    if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
        bcontainer->dirty_pages_supported = true;
        bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
        bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap;
    }
}

static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    g_autofree struct vfio_iommu_type1_info *info = NULL;
    int ret;

    ret = vfio_get_iommu_info(container, &info);
    if (ret) {
        error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
        return false;
    }

    if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
        bcontainer->pgsizes = info->iova_pgsizes;
    } else {
        bcontainer->pgsizes = qemu_real_host_page_size();
    }

    if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) {
        bcontainer->dma_max_mappings = 65535;
    }

    vfio_get_info_iova_range(info, bcontainer);

    vfio_get_iommu_info_migration(container, info);
    return true;
}

static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as,
                                   Error **errp)
{
    VFIOContainer *container;
    VFIOContainerBase *bcontainer;
    int ret, fd;
    VFIOAddressSpace *space;
    VFIOIOMMUClass *vioc;

    space = vfio_get_address_space(as);

    /*
     * VFIO is currently incompatible with discarding of RAM insofar as the
     * madvise to purge (zap) the page from QEMU's address space does not
     * interact with the memory API and therefore leaves stale virtual to
     * physical mappings in the IOMMU if the page was previously pinned.  We
     * therefore set discarding broken for each group added to a container,
     * whether the container is used individually or shared.  This provides
     * us with options to allow devices within a group to opt-in and allow
     * discarding, so long as it is done consistently for a group (for instance
     * if the device is an mdev device where it is known that the host vendor
     * driver will never pin pages outside of the working set of the guest
     * driver, which would thus not be discarding candidates).
     *
     * The first opportunity to induce pinning occurs here where we attempt to
     * attach the group to existing containers within the AddressSpace.  If any
     * pages are already zapped from the virtual address space, such as from
     * previous discards, new pinning will cause valid mappings to be
     * re-established.  Likewise, when the overall MemoryListener for a new
     * container is registered, a replay of mappings within the AddressSpace
     * will occur, re-establishing any previously zapped pages as well.
     *
     * Especially virtio-balloon is currently only prevented from discarding
     * new memory, it will not yet set ram_block_discard_set_required() and
     * therefore, neither stops us here or deals with the sudden memory
     * consumption of inflated memory.
     *
     * We do support discarding of memory coordinated via the RamDiscardManager
     * with some IOMMU types. vfio_ram_block_discard_disable() handles the
     * details once we know which type of IOMMU we are using.
     */

    QLIST_FOREACH(bcontainer, &space->containers, next) {
        container = container_of(bcontainer, VFIOContainer, bcontainer);
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            ret = vfio_ram_block_discard_disable(container, true);
            if (ret) {
                error_setg_errno(errp, -ret,
                                 "Cannot set discarding of RAM broken");
                if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
                          &container->fd)) {
                    error_report("vfio: error disconnecting group %d from"
                                 " container", group->groupid);
                }
                return false;
            }
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            vfio_kvm_device_add_group(group);
            return true;
        }
    }

    fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
    if (fd < 0) {
        error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
        goto put_space_exit;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_setg(errp, "supported vfio version: %d, "
                   "reported version: %d", VFIO_API_VERSION, ret);
        goto close_fd_exit;
    }

    container = vfio_create_container(fd, group, errp);
    if (!container) {
        goto close_fd_exit;
    }
    bcontainer = &container->bcontainer;

    if (!vfio_cpr_register_container(bcontainer, errp)) {
        goto free_container_exit;
    }

    ret = vfio_ram_block_discard_disable(container, true);
    if (ret) {
        error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
        goto unregister_container_exit;
    }

    vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
    assert(vioc->setup);

    if (!vioc->setup(bcontainer, errp)) {
        goto enable_discards_exit;
    }

    vfio_kvm_device_add_group(group);

    vfio_address_space_insert(space, bcontainer);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    bcontainer->listener = vfio_memory_listener;
    memory_listener_register(&bcontainer->listener, bcontainer->space->as);

    if (bcontainer->error) {
        error_propagate_prepend(errp, bcontainer->error,
            "memory listener initialization failed: ");
        goto listener_release_exit;
    }

    bcontainer->initialized = true;

    return true;
listener_release_exit:
    QLIST_REMOVE(group, container_next);
    QLIST_REMOVE(bcontainer, next);
    vfio_kvm_device_del_group(group);
    memory_listener_unregister(&bcontainer->listener);
    if (vioc->release) {
        vioc->release(bcontainer);
    }

enable_discards_exit:
    vfio_ram_block_discard_disable(container, false);

unregister_container_exit:
    vfio_cpr_unregister_container(bcontainer);

free_container_exit:
    object_unref(container);

close_fd_exit:
    close(fd);

put_space_exit:
    vfio_put_address_space(space);

    return false;
}

static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    /*
     * Explicitly release the listener first before unset container,
     * since unset may destroy the backend container if it's the last
     * group.
     */
    if (QLIST_EMPTY(&container->group_list)) {
        memory_listener_unregister(&bcontainer->listener);
        if (vioc->release) {
            vioc->release(bcontainer);
        }
    }

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    if (QLIST_EMPTY(&container->group_list)) {
        VFIOAddressSpace *space = bcontainer->space;

        trace_vfio_disconnect_container(container->fd);
        vfio_cpr_unregister_container(bcontainer);
        close(container->fd);
        object_unref(container);

        vfio_put_address_space(space);
    }
}

static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
{
    ERRP_GUARD();
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid == groupid) {
            /* Found it.  Now is it already in the right context? */
            if (group->container->bcontainer.space->as == as) {
                return group;
            } else {
                error_setg(errp, "group %d used in multiple address spaces",
                           group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open_old(path, O_RDWR);
    if (group->fd < 0) {
        error_setg_errno(errp, errno, "failed to open %s", path);
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_setg_errno(errp, errno, "failed to get group %d status", groupid);
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "group %d is not viable", groupid);
        error_append_hint(errp,
                          "Please ensure all devices within the iommu_group "
                          "are bound to their vfio bus driver.\n");
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (!vfio_connect_container(group, as, errp)) {
        error_prepend(errp, "failed to setup container for group %d: ",
                      groupid);
        goto close_fd_exit;
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}

static void vfio_put_group(VFIOGroup *group)
{
    if (!group || !QLIST_EMPTY(&group->device_list)) {
        return;
    }

    if (!group->ram_block_discard_allowed) {
        vfio_ram_block_discard_disable(group->container, false);
    }
    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);
}

static bool vfio_get_device(VFIOGroup *group, const char *name,
                            VFIODevice *vbasedev, Error **errp)
{
    g_autofree struct vfio_device_info *info = NULL;
    int fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
        error_setg_errno(errp, errno, "error getting device from group %d",
                         group->groupid);
        error_append_hint(errp,
                      "Verify all devices in group %d are bound to vfio-<bus> "
                      "or pci-stub and not already in use\n", group->groupid);
        return false;
    }

    info = vfio_get_device_info(fd);
    if (!info) {
        error_setg_errno(errp, errno, "error getting device info");
        close(fd);
        return false;
    }

    /*
     * Set discarding of RAM as not broken for this group if the driver knows
     * the device operates compatibly with discarding.  Setting must be
     * consistent per group, but since compatibility is really only possible
     * with mdev currently, we expect singleton groups.
     */
    if (vbasedev->ram_block_discard_allowed !=
        group->ram_block_discard_allowed) {
        if (!QLIST_EMPTY(&group->device_list)) {
            error_setg(errp, "Inconsistent setting of support for discarding "
                       "RAM (e.g., balloon) within group");
            close(fd);
            return false;
        }

        if (!group->ram_block_discard_allowed) {
            group->ram_block_discard_allowed = true;
            vfio_ram_block_discard_disable(group->container, false);
        }
    }

    vbasedev->fd = fd;
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

    vbasedev->num_irqs = info->num_irqs;
    vbasedev->num_regions = info->num_regions;
    vbasedev->flags = info->flags;

    trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);

    vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);

    return true;
}

static void vfio_put_base_device(VFIODevice *vbasedev)
{
    if (!vbasedev->group) {
        return;
    }
    QLIST_REMOVE(vbasedev, next);
    vbasedev->group = NULL;
    trace_vfio_put_base_device(vbasedev->fd);
    close(vbasedev->fd);
}

static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
{
    char *tmp, group_path[PATH_MAX];
    g_autofree char *group_name = NULL;
    int ret, groupid;
    ssize_t len;

    tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
    len = readlink(tmp, group_path, sizeof(group_path));
    g_free(tmp);

    if (len <= 0 || len >= sizeof(group_path)) {
        ret = len < 0 ? -errno : -ENAMETOOLONG;
        error_setg_errno(errp, -ret, "no iommu_group found");
        return ret;
    }

    group_path[len] = 0;

    group_name = g_path_get_basename(group_path);
    if (sscanf(group_name, "%d", &groupid) != 1) {
        error_setg_errno(errp, errno, "failed to read %s", group_path);
        return -errno;
    }
    return groupid;
}

/*
 * vfio_attach_device: attach a device to a security context
 * @name and @vbasedev->name are likely to be different depending
 * on the type of the device, hence the need for passing @name
 */
static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
                                      AddressSpace *as, Error **errp)
{
    int groupid = vfio_device_groupid(vbasedev, errp);
    VFIODevice *vbasedev_iter;
    VFIOGroup *group;
    VFIOContainerBase *bcontainer;

    if (groupid < 0) {
        return false;
    }

    trace_vfio_attach_device(vbasedev->name, groupid);

    group = vfio_get_group(groupid, as, errp);
    if (!group) {
        return false;
    }

    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
            error_setg(errp, "device is already attached");
            vfio_put_group(group);
            return false;
        }
    }
    if (!vfio_get_device(group, name, vbasedev, errp)) {
        vfio_put_group(group);
        return false;
    }

    bcontainer = &group->container->bcontainer;
    vbasedev->bcontainer = bcontainer;
    QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
    QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);

    return true;
}

static void vfio_legacy_detach_device(VFIODevice *vbasedev)
{
    VFIOGroup *group = vbasedev->group;

    QLIST_REMOVE(vbasedev, global_next);
    QLIST_REMOVE(vbasedev, container_next);
    vbasedev->bcontainer = NULL;
    trace_vfio_detach_device(vbasedev->name, group->groupid);
    vfio_put_base_device(vbasedev);
    vfio_put_group(group);
}
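
/*
 * A PCI hot reset can affect every function on the bus, so every dependent
 * device must belong to a group we own.  Collect the group fds and pass
 * them to VFIO_DEVICE_PCI_HOT_RESET; when @single is set, the reset is
 * refused if other devices would be affected.
 */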
static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
    VFIOGroup *group;
    struct vfio_pci_hot_reset_info *info = NULL;
    struct vfio_pci_dependent_device *devices;
    struct vfio_pci_hot_reset *reset;
    int32_t *fds;
    int ret, i, count;
    bool multi = false;

    trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");

    if (!single) {
        vfio_pci_pre_reset(vdev);
    }
    vdev->vbasedev.needs_reset = false;

    ret = vfio_pci_get_pci_hot_reset_info(vdev, &info);

    if (ret) {
        goto out_single;
    }
    devices = &info->devices[0];

    trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);

    /* Verify that we have all the groups required */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIOPCIDevice *tmp;
        VFIODevice *vbasedev_iter;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        trace_vfio_pci_hot_reset_dep_devices(host.domain,
                host.bus, host.slot, host.function, devices[i].group_id);

        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
            continue;
        }

        QLIST_FOREACH(group, &vfio_group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            if (!vdev->has_pm_reset) {
                error_report("vfio: Cannot reset device %s, "
                             "depends on group %d which is not owned.",
                             vdev->vbasedev.name, devices[i].group_id);
            }
            ret = -EPERM;
            goto out;
        }

        /* Prep dependent devices for reset and clear our marker. */
        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
            if (!vbasedev_iter->dev->realized ||
                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
                continue;
            }
            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
                if (single) {
                    ret = -EINVAL;
                    goto out_single;
                }
                vfio_pci_pre_reset(tmp);
                tmp->vbasedev.needs_reset = false;
                multi = true;
                break;
            }
        }
    }

    if (!single && !multi) {
        ret = -EINVAL;
        goto out_single;
    }

    /* Determine how many group fds need to be passed */
    count = 0;
    QLIST_FOREACH(group, &vfio_group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                count++;
                break;
            }
        }
    }

    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
    fds = &reset->group_fds[0];

    /* Fill in group fds */
    QLIST_FOREACH(group, &vfio_group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                fds[reset->count++] = group->fd;
                break;
            }
        }
    }

    /* Bus reset! */
    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
    g_free(reset);
    if (ret) {
        ret = -errno;
    }

    trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
                                    ret ? strerror(errno) : "Success");

out:
    /* Re-enable INTx on affected devices */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIOPCIDevice *tmp;
        VFIODevice *vbasedev_iter;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
            continue;
        }

        QLIST_FOREACH(group, &vfio_group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            break;
        }

        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
            if (!vbasedev_iter->dev->realized ||
                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
                continue;
            }
            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
                vfio_pci_post_reset(tmp);
                break;
            }
        }
    }
out_single:
    if (!single) {
        vfio_pci_post_reset(vdev);
    }
    g_free(info);

    return ret;
}

static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data)
{
    VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);

    vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO;

    vioc->setup = vfio_legacy_setup;
    vioc->dma_map = vfio_legacy_dma_map;
    vioc->dma_unmap = vfio_legacy_dma_unmap;
    vioc->attach_device = vfio_legacy_attach_device;
    vioc->detach_device = vfio_legacy_detach_device;
    vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking;
    vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap;
    vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
};

static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
                                     Error **errp)
{
    VFIODevice *vdev = opaque;

    hiod->name = g_strdup(vdev->name);
    hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev);
    hiod->agent = opaque;

    return true;
}

static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap,
                                    Error **errp)
{
    HostIOMMUDeviceCaps *caps = &hiod->caps;

    switch (cap) {
    case HOST_IOMMU_DEVICE_CAP_AW_BITS:
        return caps->aw_bits;
    default:
        error_setg(errp, "%s: unsupported capability %x", hiod->name, cap);
        return -EINVAL;
    }
}

static GList *
hiod_legacy_vfio_get_iova_ranges(HostIOMMUDevice *hiod, Error **errp)
{
    VFIODevice *vdev = hiod->agent;
    GList *l = NULL;

    g_assert(vdev);

    if (vdev->bcontainer) {
        l = g_list_copy(vdev->bcontainer->iova_ranges);
    }

    return l;
}

static void vfio_iommu_legacy_instance_init(Object *obj)
{
    VFIOContainer *container = VFIO_IOMMU_LEGACY(obj);

    QLIST_INIT(&container->group_list);
}

static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
{
    HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);

    hioc->realize = hiod_legacy_vfio_realize;
    hioc->get_cap = hiod_legacy_vfio_get_cap;
    hioc->get_iova_ranges = hiod_legacy_vfio_get_iova_ranges;
};

static const TypeInfo types[] = {
    {
        .name = TYPE_VFIO_IOMMU_LEGACY,
        .parent = TYPE_VFIO_IOMMU,
        .instance_init = vfio_iommu_legacy_instance_init,
        .instance_size = sizeof(VFIOContainer),
        .class_init = vfio_iommu_legacy_class_init,
    }, {
        .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
        .parent = TYPE_HOST_IOMMU_DEVICE,
        .class_init = hiod_legacy_vfio_class_init,
    }
};

DEFINE_TYPES(types)