/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "qemu/error-report.h"
#include "qemu/range.h"
#include "system/reset.h"
#include "trace.h"
#include "qapi/error.h"
#include "pci.h"

VFIOGroupList vfio_group_list =
    QLIST_HEAD_INITIALIZER(vfio_group_list);

static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
{
    switch (container->iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
        /*
         * We support coordinated discarding of RAM via the RamDiscardManager.
         */
        return ram_block_uncoordinated_discard_disable(state);
    default:
        /*
         * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
         * RamDiscardManager, however, it is completely untested.
         *
         * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
         * completely the opposite of managing mapping/pinning dynamically as
         * required by RamDiscardManager. We would have to special-case sections
         * with a RamDiscardManager.
         */
        return ram_block_discard_disable(state);
    }
}

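/*
 * Unmap an IOVA range and, in the same VFIO_IOMMU_UNMAP_DMA call, ask the
 * kernel which pages in that range were dirtied, so the bitmap can be
 * folded into QEMU's dirty log before the mapping disappears.
 */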
static int vfio_dma_unmap_bitmap(const VFIOContainer *container,
                                 hwaddr iova, ram_addr_t size,
                                 IOMMUTLBEntry *iotlb)
{
    const VFIOContainerBase *bcontainer = &container->bcontainer;
    struct vfio_iommu_type1_dma_unmap *unmap;
    struct vfio_bitmap *bitmap;
    VFIOBitmap vbmap;
    int ret;

    ret = vfio_bitmap_alloc(&vbmap, size);
    if (ret) {
        return ret;
    }

    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));

    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
    unmap->iova = iova;
    unmap->size = size;
    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
    bitmap = (struct vfio_bitmap *)&unmap->data;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
     * to qemu_real_host_page_size.
     */
    bitmap->pgsize = qemu_real_host_page_size();
    bitmap->size = vbmap.size;
    bitmap->data = (__u64 *)vbmap.bitmap;

    if (vbmap.size > bcontainer->max_dirty_bitmap_size) {
        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
        ret = -E2BIG;
        goto unmap_exit;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
    if (!ret) {
        cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
                iotlb->translated_addr, vbmap.pages);
    } else {
        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
    }

unmap_exit:
    g_free(unmap);
    g_free(vbmap.bitmap);

    return ret;
}

/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
                                 hwaddr iova, ram_addr_t size,
                                 IOMMUTLBEntry *iotlb)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };
    bool need_dirty_sync = false;
    int ret;
    Error *local_err = NULL;

    if (iotlb && vfio_devices_all_dirty_tracking_started(bcontainer)) {
        if (!vfio_devices_all_device_dirty_tracking(bcontainer) &&
            bcontainer->dirty_pages_supported) {
            return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
        }

        need_dirty_sync = true;
    }

    while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        /*
         * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
         * v4.15) where an overflow in its wrap-around check prevents us from
         * unmapping the last page of the address space.  Test for the error
         * condition and re-try the unmap excluding the last page.  The
         * expectation is that we've never mapped the last page anyway and this
         * unmap request comes via vIOMMU support which also makes it unlikely
         * that this page is used.  This bug was introduced well after type1 v2
         * support was introduced, so we shouldn't need to test for v1.  A fix
         * is queued for kernel v5.0 so this workaround can be removed once
         * affected kernels are sufficiently deprecated.
         */
        if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
            container->iommu_type == VFIO_TYPE1v2_IOMMU) {
            trace_vfio_legacy_dma_unmap_overflow_workaround();
            unmap.size -= 1ULL << ctz64(bcontainer->pgsizes);
            continue;
        }

        return -errno;
    }

    if (need_dirty_sync) {
        ret = vfio_get_dirty_bitmap(bcontainer, iova, size,
                                    iotlb->translated_addr, &local_err);
        if (ret) {
            error_report_err(local_err);
            return ret;
        }
    }

    return 0;
}

static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
                               ram_addr_t size, void *vaddr, bool readonly)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * Try the mapping, if it fails with EBUSY, unmap the region and try
     * again.  This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY &&
         vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    return -errno;
}

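/*
 * Start or stop the IOMMU driver's dirty page tracking for the whole
 * container via the VFIO_IOMMU_DIRTY_PAGES ioctl.
 */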
static int
vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer,
                                    bool start, Error **errp)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    int ret;
    struct vfio_iommu_type1_dirty_bitmap dirty = {
        .argsz = sizeof(dirty),
    };

    if (start) {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
    } else {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
    if (ret) {
        ret = -errno;
        error_setg_errno(errp, errno, "Failed to set dirty tracking flag 0x%x",
                         dirty.flags);
    }

    return ret;
}

static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
                      VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
    struct vfio_iommu_type1_dirty_bitmap_get *range;
    int ret;

    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));

    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
    range->iova = iova;
    range->size = size;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
     * to qemu_real_host_page_size.
     */
    range->bitmap.pgsize = qemu_real_host_page_size();
    range->bitmap.size = vbmap->size;
    range->bitmap.data = (__u64 *)vbmap->bitmap;

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
    if (ret) {
        ret = -errno;
        error_setg_errno(errp, errno,
                         "Failed to get dirty bitmap for iova: 0x%"PRIx64
                         " size: 0x%"PRIx64, (uint64_t)range->iova,
                         (uint64_t)range->size);
    }

    g_free(dbitmap);

    return ret;
}

static struct vfio_info_cap_header *
vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
                             unsigned int *avail)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_dma_avail *cap;

    /* If the capability cannot be found, assume no DMA limiting */
    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
    if (!hdr) {
        return false;
    }

    if (avail != NULL) {
        cap = (void *) hdr;
        *avail = cap->avail;
    }

    return true;
}

static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
                                     VFIOContainerBase *bcontainer)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_iova_range *cap;

    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
    if (!hdr) {
        return false;
    }

    cap = (void *)hdr;

    for (int i = 0; i < cap->nr_iovas; i++) {
        Range *range = g_new(Range, 1);

        range_set_bounds(range, cap->iova_ranges[i].start,
                         cap->iova_ranges[i].end);
        bcontainer->iova_ranges =
            range_list_insert(bcontainer->iova_ranges, range);
    }

    return true;
}

static void vfio_kvm_device_add_group(VFIOGroup *group)
{
    Error *err = NULL;

    if (vfio_kvm_device_add_fd(group->fd, &err)) {
        error_reportf_err(err, "group ID %d: ", group->groupid);
    }
}

static void vfio_kvm_device_del_group(VFIOGroup *group)
{
    Error *err = NULL;

    if (vfio_kvm_device_del_fd(group->fd, &err)) {
        error_reportf_err(err, "group ID %d: ", group->groupid);
    }
}

/*
 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
 */
static int vfio_get_iommu_type(int container_fd,
                               Error **errp)
{
    int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
                          VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
    int i;

    for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
        if (ioctl(container_fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
            return iommu_types[i];
        }
    }
    error_setg(errp, "No available IOMMU models");
    return -EINVAL;
}

/*
 * vfio_get_iommu_ops - get a VFIOIOMMUClass associated with a type
 */
static const char *vfio_get_iommu_class_name(int iommu_type)
{
    switch (iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
        return TYPE_VFIO_IOMMU_LEGACY;
        break;
    case VFIO_SPAPR_TCE_v2_IOMMU:
    case VFIO_SPAPR_TCE_IOMMU:
        return TYPE_VFIO_IOMMU_SPAPR;
        break;
    default:
        g_assert_not_reached();
    };
}

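/*
 * Bind @group_fd to @container_fd and select the IOMMU backend.  If the
 * platform rejects VFIO_SPAPR_TCE_v2_IOMMU, fall back to the v1 sPAPR
 * interface.
 */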
static bool vfio_set_iommu(int container_fd, int group_fd,
                           int *iommu_type, Error **errp)
{
    if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd)) {
        error_setg_errno(errp, errno, "Failed to set group container");
        return false;
    }

    while (ioctl(container_fd, VFIO_SET_IOMMU, *iommu_type)) {
        if (*iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
            /*
             * On sPAPR, although the IOMMU subdriver always advertises v1 and
             * v2, the running platform may not support v2 and there is no
             * way to guess it until an IOMMU group gets added to the container.
             * So in case it fails with v2, try v1 as a fallback.
             */
            *iommu_type = VFIO_SPAPR_TCE_IOMMU;
            continue;
        }
        error_setg_errno(errp, errno, "Failed to set iommu for container");
        return false;
    }

    return true;
}

static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group,
                                            Error **errp)
{
    int iommu_type;
    const char *vioc_name;
    VFIOContainer *container;

    iommu_type = vfio_get_iommu_type(fd, errp);
    if (iommu_type < 0) {
        return NULL;
    }

    if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) {
        return NULL;
    }

    vioc_name = vfio_get_iommu_class_name(iommu_type);

    container = VFIO_IOMMU_LEGACY(object_new(vioc_name));
    container->fd = fd;
    container->iommu_type = iommu_type;
    return container;
}

static int vfio_get_iommu_info(VFIOContainer *container,
                               struct vfio_iommu_type1_info **info)
{
    size_t argsz = sizeof(struct vfio_iommu_type1_info);

    *info = g_new0(struct vfio_iommu_type1_info, 1);
again:
    (*info)->argsz = argsz;

    if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if (((*info)->argsz > argsz)) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);
        goto again;
    }

    return 0;
}

static struct vfio_info_cap_header *
vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    struct vfio_info_cap_header *hdr;
    void *ptr = info;

    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
        if (hdr->id == id) {
            return hdr;
        }
    }

    return NULL;
}

static void vfio_get_iommu_info_migration(VFIOContainer *container,
                                          struct vfio_iommu_type1_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_migration *cap_mig;
    VFIOContainerBase *bcontainer = &container->bcontainer;

    hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
    if (!hdr) {
        return;
    }

    cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
                            header);

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty.
     */
    if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
        bcontainer->dirty_pages_supported = true;
        bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
        bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap;
    }
}

static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    g_autofree struct vfio_iommu_type1_info *info = NULL;
    int ret;

    ret = vfio_get_iommu_info(container, &info);
    if (ret) {
        error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
        return false;
    }

    if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
        bcontainer->pgsizes = info->iova_pgsizes;
    } else {
        bcontainer->pgsizes = qemu_real_host_page_size();
    }

    if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) {
        bcontainer->dma_max_mappings = 65535;
    }

    vfio_get_info_iova_range(info, bcontainer);

    vfio_get_iommu_info_migration(container, info);
    return true;
}

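/*
 * Attach @group to a container in @as: reuse an existing container in the
 * address space if the kernel accepts the group, otherwise open a new
 * /dev/vfio/vfio container, select an IOMMU type and register the memory
 * listener for it.
 */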
static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as,
                                   Error **errp)
{
    VFIOContainer *container;
    VFIOContainerBase *bcontainer;
    int ret, fd;
    VFIOAddressSpace *space;
    VFIOIOMMUClass *vioc;

    space = vfio_get_address_space(as);

    /*
     * VFIO is currently incompatible with discarding of RAM insofar as the
     * madvise to purge (zap) the page from QEMU's address space does not
     * interact with the memory API and therefore leaves stale virtual to
     * physical mappings in the IOMMU if the page was previously pinned.  We
     * therefore set discarding broken for each group added to a container,
     * whether the container is used individually or shared.  This provides
     * us with options to allow devices within a group to opt-in and allow
     * discarding, so long as it is done consistently for a group (for instance
     * if the device is an mdev device where it is known that the host vendor
     * driver will never pin pages outside of the working set of the guest
     * driver, which would thus not be discarding candidates).
     *
     * The first opportunity to induce pinning occurs here where we attempt to
     * attach the group to existing containers within the AddressSpace.  If any
     * pages are already zapped from the virtual address space, such as from
     * previous discards, new pinning will cause valid mappings to be
     * re-established.  Likewise, when the overall MemoryListener for a new
     * container is registered, a replay of mappings within the AddressSpace
     * will occur, re-establishing any previously zapped pages as well.
     *
     * In particular, virtio-balloon is currently only prevented from
     * discarding new memory; it will not yet set
     * ram_block_discard_set_required() and therefore neither stops us here
     * nor deals with the sudden memory consumption of inflated memory.
     *
     * We do support discarding of memory coordinated via the RamDiscardManager
     * with some IOMMU types. vfio_ram_block_discard_disable() handles the
     * details once we know which type of IOMMU we are using.
     */
    QLIST_FOREACH(bcontainer, &space->containers, next) {
        container = container_of(bcontainer, VFIOContainer, bcontainer);
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            ret = vfio_ram_block_discard_disable(container, true);
            if (ret) {
                error_setg_errno(errp, -ret,
                                 "Cannot set discarding of RAM broken");
                if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
                          &container->fd)) {
                    error_report("vfio: error disconnecting group %d from"
                                 " container", group->groupid);
                }
                return false;
            }
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            vfio_kvm_device_add_group(group);
            return true;
        }
    }

    fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
    if (fd < 0) {
        goto put_space_exit;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_setg(errp, "supported vfio version: %d, "
                   "reported version: %d", VFIO_API_VERSION, ret);
        goto close_fd_exit;
    }

    container = vfio_create_container(fd, group, errp);
    if (!container) {
        goto close_fd_exit;
    }
    bcontainer = &container->bcontainer;

    if (!vfio_cpr_register_container(bcontainer, errp)) {
        goto free_container_exit;
    }

    ret = vfio_ram_block_discard_disable(container, true);
    if (ret) {
        error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
        goto unregister_container_exit;
    }

    vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
    assert(vioc->setup);

    if (!vioc->setup(bcontainer, errp)) {
        goto enable_discards_exit;
    }

    vfio_kvm_device_add_group(group);

    vfio_address_space_insert(space, bcontainer);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    bcontainer->listener = vfio_memory_listener;
    memory_listener_register(&bcontainer->listener, bcontainer->space->as);

    if (bcontainer->error) {
        error_propagate_prepend(errp, bcontainer->error,
            "memory listener initialization failed: ");
        goto listener_release_exit;
    }

    bcontainer->initialized = true;

    return true;
listener_release_exit:
    QLIST_REMOVE(group, container_next);
    vfio_kvm_device_del_group(group);
    memory_listener_unregister(&bcontainer->listener);
    if (vioc->release) {
        vioc->release(bcontainer);
    }

enable_discards_exit:
    vfio_ram_block_discard_disable(container, false);

unregister_container_exit:
    vfio_cpr_unregister_container(bcontainer);

free_container_exit:
    object_unref(container);

close_fd_exit:
    close(fd);

put_space_exit:
    vfio_put_address_space(space);

    return false;
}

static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    /*
     * Explicitly release the listener before unsetting the container, since
     * unsetting may destroy the backend container if it's the last group.
     */
    if (QLIST_EMPTY(&container->group_list)) {
        memory_listener_unregister(&bcontainer->listener);
        if (vioc->release) {
            vioc->release(bcontainer);
        }
    }

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    if (QLIST_EMPTY(&container->group_list)) {
        VFIOAddressSpace *space = bcontainer->space;

        trace_vfio_disconnect_container(container->fd);
        vfio_cpr_unregister_container(bcontainer);
        close(container->fd);
        object_unref(container);

        vfio_put_address_space(space);
    }
}

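/*
 * Look up an existing VFIOGroup for @groupid or open /dev/vfio/<groupid>,
 * check that the group is viable, and connect it to a container in @as.
 */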
static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
{
    ERRP_GUARD();
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid == groupid) {
            /* Found it.  Now is it already in the right context? */
            if (group->container->bcontainer.space->as == as) {
                return group;
            } else {
                error_setg(errp, "group %d used in multiple address spaces",
                           group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open(path, O_RDWR, errp);
    if (group->fd < 0) {
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_setg_errno(errp, errno, "failed to get group %d status", groupid);
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "group %d is not viable", groupid);
        error_append_hint(errp,
                          "Please ensure all devices within the iommu_group "
                          "are bound to their vfio bus driver.\n");
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (!vfio_connect_container(group, as, errp)) {
        error_prepend(errp, "failed to setup container for group %d: ",
                      groupid);
        goto close_fd_exit;
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}

static void vfio_put_group(VFIOGroup *group)
{
    if (!group || !QLIST_EMPTY(&group->device_list)) {
        return;
    }

    if (!group->ram_block_discard_allowed) {
        vfio_ram_block_discard_disable(group->container, false);
    }
    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);
}

static bool vfio_get_device(VFIOGroup *group, const char *name,
                            VFIODevice *vbasedev, Error **errp)
{
    g_autofree struct vfio_device_info *info = NULL;
    int fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
        error_setg_errno(errp, errno, "error getting device from group %d",
                         group->groupid);
        error_append_hint(errp,
                      "Verify all devices in group %d are bound to vfio-<bus> "
                      "or pci-stub and not already in use\n", group->groupid);
        return false;
    }

    info = vfio_get_device_info(fd);
    if (!info) {
        error_setg_errno(errp, errno, "error getting device info");
        close(fd);
        return false;
    }

    /*
     * Set discarding of RAM as not broken for this group if the driver knows
     * the device operates compatibly with discarding.  Setting must be
     * consistent per group, but since compatibility is really only possible
     * with mdev currently, we expect singleton groups.
     */
    if (vbasedev->ram_block_discard_allowed !=
        group->ram_block_discard_allowed) {
        if (!QLIST_EMPTY(&group->device_list)) {
            error_setg(errp, "Inconsistent setting of support for discarding "
                       "RAM (e.g., balloon) within group");
            close(fd);
            return false;
        }

        if (!group->ram_block_discard_allowed) {
            group->ram_block_discard_allowed = true;
            vfio_ram_block_discard_disable(group->container, false);
        }
    }

    vbasedev->fd = fd;
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

    vbasedev->num_irqs = info->num_irqs;
    vbasedev->num_regions = info->num_regions;
    vbasedev->flags = info->flags;

    trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);

    vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);

    return true;
}

static void vfio_put_base_device(VFIODevice *vbasedev)
{
    if (!vbasedev->group) {
        return;
    }
    QLIST_REMOVE(vbasedev, next);
    vbasedev->group = NULL;
    trace_vfio_put_base_device(vbasedev->fd);
    close(vbasedev->fd);
}

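/*
 * Resolve the IOMMU group number for @vbasedev by following the
 * <sysfsdev>/iommu_group symlink and parsing its basename.
 */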
static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
{
    char *tmp, group_path[PATH_MAX];
    g_autofree char *group_name = NULL;
    int ret, groupid;
    ssize_t len;

    tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
    len = readlink(tmp, group_path, sizeof(group_path));
    g_free(tmp);

    if (len <= 0 || len >= sizeof(group_path)) {
        ret = len < 0 ? -errno : -ENAMETOOLONG;
        error_setg_errno(errp, -ret, "no iommu_group found");
        return ret;
    }

    group_path[len] = 0;

    group_name = g_path_get_basename(group_path);
    if (sscanf(group_name, "%d", &groupid) != 1) {
        error_setg_errno(errp, errno, "failed to read %s", group_path);
        return -errno;
    }
    return groupid;
}

/*
 * vfio_attach_device: attach a device to a security context
 * @name and @vbasedev->name are likely to be different depending
 * on the type of the device, hence the need for passing @name
 */
static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
                                      AddressSpace *as, Error **errp)
{
    int groupid = vfio_device_groupid(vbasedev, errp);
    VFIODevice *vbasedev_iter;
    VFIOGroup *group;
    VFIOContainerBase *bcontainer;

    if (groupid < 0) {
        return false;
    }

    trace_vfio_attach_device(vbasedev->name, groupid);

    if (!vfio_device_hiod_realize(vbasedev, errp)) {
        return false;
    }

    group = vfio_get_group(groupid, as, errp);
    if (!group) {
        return false;
    }

    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
            error_setg(errp, "device is already attached");
            vfio_put_group(group);
            return false;
        }
    }
    if (!vfio_get_device(group, name, vbasedev, errp)) {
        vfio_put_group(group);
        return false;
    }

    bcontainer = &group->container->bcontainer;
    vbasedev->bcontainer = bcontainer;
    QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
    QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);

    return true;
}

static void vfio_legacy_detach_device(VFIODevice *vbasedev)
{
    VFIOGroup *group = vbasedev->group;

    QLIST_REMOVE(vbasedev, global_next);
    QLIST_REMOVE(vbasedev, container_next);
    vbasedev->bcontainer = NULL;
    trace_vfio_detach_device(vbasedev->name, group->groupid);
    vfio_put_base_device(vbasedev);
    vfio_put_group(group);
}

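/*
 * Perform a PCI hot reset: collect the dependent devices reported by the
 * kernel, verify that every affected IOMMU group is owned by this QEMU,
 * pass the group fds to VFIO_DEVICE_PCI_HOT_RESET, and re-enable INTx on
 * the affected devices afterwards.
 */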
static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
    VFIOGroup *group;
    struct vfio_pci_hot_reset_info *info = NULL;
    struct vfio_pci_dependent_device *devices;
    struct vfio_pci_hot_reset *reset;
    int32_t *fds;
    int ret, i, count;
    bool multi = false;

    trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");

    if (!single) {
        vfio_pci_pre_reset(vdev);
    }
    vdev->vbasedev.needs_reset = false;

    ret = vfio_pci_get_pci_hot_reset_info(vdev, &info);

    if (ret) {
        goto out_single;
    }
    devices = &info->devices[0];

    trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);

    /* Verify that we have all the groups required */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIOPCIDevice *tmp;
        VFIODevice *vbasedev_iter;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        trace_vfio_pci_hot_reset_dep_devices(host.domain,
                host.bus, host.slot, host.function, devices[i].group_id);

        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
            continue;
        }

        QLIST_FOREACH(group, &vfio_group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            if (!vdev->has_pm_reset) {
                error_report("vfio: Cannot reset device %s, "
                             "depends on group %d which is not owned.",
                             vdev->vbasedev.name, devices[i].group_id);
            }
            ret = -EPERM;
            goto out;
        }

        /* Prep dependent devices for reset and clear our marker. */
        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
            if (!vbasedev_iter->dev->realized ||
                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
                continue;
            }
            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
                if (single) {
                    ret = -EINVAL;
                    goto out_single;
                }
                vfio_pci_pre_reset(tmp);
                tmp->vbasedev.needs_reset = false;
                multi = true;
                break;
            }
        }
    }

    if (!single && !multi) {
        ret = -EINVAL;
        goto out_single;
    }

    /* Determine how many group fds need to be passed */
    count = 0;
    QLIST_FOREACH(group, &vfio_group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                count++;
                break;
            }
        }
    }

    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
    fds = &reset->group_fds[0];

    /* Fill in group fds */
    QLIST_FOREACH(group, &vfio_group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                fds[reset->count++] = group->fd;
                break;
            }
        }
    }

    /* Bus reset! */
    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
    g_free(reset);
    if (ret) {
        ret = -errno;
    }

    trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
                                    ret ? strerror(errno) : "Success");

out:
    /* Re-enable INTx on affected devices */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIOPCIDevice *tmp;
        VFIODevice *vbasedev_iter;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
            continue;
        }

        QLIST_FOREACH(group, &vfio_group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            break;
        }

        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
            if (!vbasedev_iter->dev->realized ||
                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
                continue;
            }
            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
                vfio_pci_post_reset(tmp);
                break;
            }
        }
    }
out_single:
    if (!single) {
        vfio_pci_post_reset(vdev);
    }
    g_free(info);

    return ret;
}

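/* QOM glue: hook the legacy container callbacks into the VFIOIOMMUClass. */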
static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data)
{
    VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);

    vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO;

    vioc->setup = vfio_legacy_setup;
    vioc->dma_map = vfio_legacy_dma_map;
    vioc->dma_unmap = vfio_legacy_dma_unmap;
    vioc->attach_device = vfio_legacy_attach_device;
    vioc->detach_device = vfio_legacy_detach_device;
    vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking;
    vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap;
    vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
};

static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
                                     Error **errp)
{
    VFIODevice *vdev = opaque;

    hiod->name = g_strdup(vdev->name);
    hiod->agent = opaque;

    return true;
}

static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap,
                                    Error **errp)
{
    switch (cap) {
    case HOST_IOMMU_DEVICE_CAP_AW_BITS:
        return vfio_device_get_aw_bits(hiod->agent);
    default:
        error_setg(errp, "%s: unsupported capability %x", hiod->name, cap);
        return -EINVAL;
    }
}

static GList *
hiod_legacy_vfio_get_iova_ranges(HostIOMMUDevice *hiod)
{
    VFIODevice *vdev = hiod->agent;

    g_assert(vdev);
    return vfio_container_get_iova_ranges(vdev->bcontainer);
}

static uint64_t
hiod_legacy_vfio_get_page_size_mask(HostIOMMUDevice *hiod)
{
    VFIODevice *vdev = hiod->agent;

    g_assert(vdev);
    return vfio_container_get_page_size_mask(vdev->bcontainer);
}

static void vfio_iommu_legacy_instance_init(Object *obj)
{
    VFIOContainer *container = VFIO_IOMMU_LEGACY(obj);

    QLIST_INIT(&container->group_list);
}

static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
{
    HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);

    hioc->realize = hiod_legacy_vfio_realize;
    hioc->get_cap = hiod_legacy_vfio_get_cap;
    hioc->get_iova_ranges = hiod_legacy_vfio_get_iova_ranges;
    hioc->get_page_size_mask = hiod_legacy_vfio_get_page_size_mask;
};

static const TypeInfo types[] = {
    {
        .name = TYPE_VFIO_IOMMU_LEGACY,
        .parent = TYPE_VFIO_IOMMU,
        .instance_init = vfio_iommu_legacy_instance_init,
        .instance_size = sizeof(VFIOContainer),
        .class_init = vfio_iommu_legacy_class_init,
    }, {
        .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
        .parent = TYPE_HOST_IOMMU_DEVICE,
        .class_init = hiod_legacy_vfio_class_init,
    }
};

DEFINE_TYPES(types)