/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "qemu/error-report.h"
#include "qemu/range.h"
#include "system/reset.h"
#include "trace.h"
#include "qapi/error.h"
#include "pci.h"

VFIOGroupList vfio_group_list =
    QLIST_HEAD_INITIALIZER(vfio_group_list);

static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
{
    switch (container->iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
        /*
         * We support coordinated discarding of RAM via the RamDiscardManager.
         */
        return ram_block_uncoordinated_discard_disable(state);
    default:
        /*
         * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
         * RamDiscardManager, however, it is completely untested.
         *
         * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
         * completely the opposite of managing mapping/pinning dynamically as
         * required by RamDiscardManager. We would have to special-case
         * sections with a RamDiscardManager.
         */
        return ram_block_discard_disable(state);
    }
}

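/*
 * Unmap a DMA range and collect the dirty bitmap the kernel kept for it in
 * the same VFIO_IOMMU_UNMAP_DMA call, so no page dirtied before the unmap is
 * lost between unmapping and querying the bitmap.
 */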
static int vfio_dma_unmap_bitmap(const VFIOContainer *container,
                                 hwaddr iova, ram_addr_t size,
                                 IOMMUTLBEntry *iotlb)
{
    const VFIOContainerBase *bcontainer = &container->bcontainer;
    struct vfio_iommu_type1_dma_unmap *unmap;
    struct vfio_bitmap *bitmap;
    VFIOBitmap vbmap;
    int ret;

    ret = vfio_bitmap_alloc(&vbmap, size);
    if (ret) {
        return ret;
    }

    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));

    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
    unmap->iova = iova;
    unmap->size = size;
    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
    bitmap = (struct vfio_bitmap *)&unmap->data;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
     * to qemu_real_host_page_size.
     */
    bitmap->pgsize = qemu_real_host_page_size();
    bitmap->size = vbmap.size;
    bitmap->data = (__u64 *)vbmap.bitmap;

    if (vbmap.size > bcontainer->max_dirty_bitmap_size) {
        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
        ret = -E2BIG;
        goto unmap_exit;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
    if (!ret) {
        cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
                                               iotlb->translated_addr,
                                               vbmap.pages);
    } else {
        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
    }

unmap_exit:
    g_free(unmap);
    g_free(vbmap.bitmap);

    return ret;
}

/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
                                 hwaddr iova, ram_addr_t size,
                                 IOMMUTLBEntry *iotlb)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };
    bool need_dirty_sync = false;
    int ret;
    Error *local_err = NULL;

    if (iotlb && vfio_devices_all_dirty_tracking_started(bcontainer)) {
        if (!vfio_devices_all_device_dirty_tracking(bcontainer) &&
            bcontainer->dirty_pages_supported) {
            return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
        }

        need_dirty_sync = true;
    }

    while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        /*
         * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
         * v4.15) where an overflow in its wrap-around check prevents us from
         * unmapping the last page of the address space. Test for the error
         * condition and re-try the unmap excluding the last page. The
         * expectation is that we've never mapped the last page anyway and this
         * unmap request comes via vIOMMU support which also makes it unlikely
         * that this page is used. This bug was introduced well after type1 v2
         * support was introduced, so we shouldn't need to test for v1. A fix
         * is queued for kernel v5.0 so this workaround can be removed once
         * affected kernels are sufficiently deprecated.
         */
        if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
            container->iommu_type == VFIO_TYPE1v2_IOMMU) {
            trace_vfio_legacy_dma_unmap_overflow_workaround();
            unmap.size -= 1ULL << ctz64(bcontainer->pgsizes);
            continue;
        }

        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }

    if (need_dirty_sync) {
        ret = vfio_get_dirty_bitmap(bcontainer, iova, size,
                                    iotlb->translated_addr, &local_err);
        if (ret) {
            error_report_err(local_err);
            return ret;
        }
    }

    return 0;
}

static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
                               ram_addr_t size, void *vaddr, bool readonly)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * Try the mapping, if it fails with EBUSY, unmap the region and try
     * again. This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY &&
         vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
    return -errno;
}

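/*
 * Start or stop kernel-side dirty page tracking for the whole container via
 * the VFIO_IOMMU_DIRTY_PAGES ioctl.
 */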
static int
vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer,
                                    bool start, Error **errp)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    int ret;
    struct vfio_iommu_type1_dirty_bitmap dirty = {
        .argsz = sizeof(dirty),
    };

    if (start) {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
    } else {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
    if (ret) {
        ret = -errno;
        error_setg_errno(errp, errno, "Failed to set dirty tracking flag 0x%x",
                         dirty.flags);
    }

    return ret;
}

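/*
 * Fetch the dirty bitmap for an IOVA range into the caller-provided
 * VFIOBitmap, using VFIO_IOMMU_DIRTY_PAGES with the GET_BITMAP flag.
 */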
static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
                                          VFIOBitmap *vbmap, hwaddr iova,
                                          hwaddr size, Error **errp)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
    struct vfio_iommu_type1_dirty_bitmap_get *range;
    int ret;

    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));

    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
    range->iova = iova;
    range->size = size;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
     * to qemu_real_host_page_size.
     */
    range->bitmap.pgsize = qemu_real_host_page_size();
    range->bitmap.size = vbmap->size;
    range->bitmap.data = (__u64 *)vbmap->bitmap;

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
    if (ret) {
        ret = -errno;
        error_setg_errno(errp, errno,
                         "Failed to get dirty bitmap for iova: 0x%"PRIx64
                         " size: 0x%"PRIx64, (uint64_t)range->iova,
                         (uint64_t)range->size);
    }

    g_free(dbitmap);

    return ret;
}

static struct vfio_info_cap_header *
vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
                             unsigned int *avail)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_dma_avail *cap;

    /* If the capability cannot be found, assume no DMA limiting */
    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
    if (!hdr) {
        return false;
    }

    if (avail != NULL) {
        cap = (void *) hdr;
        *avail = cap->avail;
    }

    return true;
}

static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
                                     VFIOContainerBase *bcontainer)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_iova_range *cap;

    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
    if (!hdr) {
        return false;
    }

    cap = (void *)hdr;

    for (int i = 0; i < cap->nr_iovas; i++) {
        Range *range = g_new(Range, 1);

        range_set_bounds(range, cap->iova_ranges[i].start,
                         cap->iova_ranges[i].end);
        bcontainer->iova_ranges =
            range_list_insert(bcontainer->iova_ranges, range);
    }

    return true;
}

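/*
 * Register/unregister the group fd with the KVM VFIO pseudo-device; failures
 * are reported but do not abort the caller.
 */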
static void vfio_kvm_device_add_group(VFIOGroup *group)
{
    Error *err = NULL;

    if (vfio_kvm_device_add_fd(group->fd, &err)) {
        error_reportf_err(err, "group ID %d: ", group->groupid);
    }
}

static void vfio_kvm_device_del_group(VFIOGroup *group)
{
    Error *err = NULL;

    if (vfio_kvm_device_del_fd(group->fd, &err)) {
        error_reportf_err(err, "group ID %d: ", group->groupid);
    }
}

/*
 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
 */
static int vfio_get_iommu_type(int container_fd,
                               Error **errp)
{
    int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
                          VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
    int i;

    for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
        if (ioctl(container_fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
            return iommu_types[i];
        }
    }
    error_setg(errp, "No available IOMMU models");
    return -EINVAL;
}

/*
 * vfio_get_iommu_class_name - get the QOM class name of the VFIOIOMMUClass
 * associated with an iommu_type
 */
static const char *vfio_get_iommu_class_name(int iommu_type)
{
    switch (iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
        return TYPE_VFIO_IOMMU_LEGACY;
    case VFIO_SPAPR_TCE_v2_IOMMU:
    case VFIO_SPAPR_TCE_IOMMU:
        return TYPE_VFIO_IOMMU_SPAPR;
    default:
        g_assert_not_reached();
    }
}

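/*
 * Attach the group to the container fd and negotiate the IOMMU model,
 * falling back from sPAPR TCE v2 to v1 when the platform rejects v2.
 */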
static bool vfio_set_iommu(int container_fd, int group_fd,
                           int *iommu_type, Error **errp)
{
    if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd)) {
        error_setg_errno(errp, errno, "Failed to set group container");
        return false;
    }

    while (ioctl(container_fd, VFIO_SET_IOMMU, *iommu_type)) {
        if (*iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
            /*
             * On sPAPR, even though the IOMMU subdriver always advertises
             * both v1 and v2, the running platform may not support v2, and
             * there is no way to tell until an IOMMU group gets added to the
             * container. So if setting v2 fails, try v1 as a fallback.
             */
            *iommu_type = VFIO_SPAPR_TCE_IOMMU;
            continue;
        }
        error_setg_errno(errp, errno, "Failed to set iommu for container");
        return false;
    }

    return true;
}

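/*
 * Allocate a VFIOContainer object for the negotiated IOMMU type and bind it
 * to the open container fd.
 */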
static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group,
                                            Error **errp)
{
    int iommu_type;
    const char *vioc_name;
    VFIOContainer *container;

    iommu_type = vfio_get_iommu_type(fd, errp);
    if (iommu_type < 0) {
        return NULL;
    }

    if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) {
        return NULL;
    }

    vioc_name = vfio_get_iommu_class_name(iommu_type);

    container = VFIO_IOMMU_LEGACY(object_new(vioc_name));
    container->fd = fd;
    container->iommu_type = iommu_type;
    return container;
}

static int vfio_get_iommu_info(VFIOContainer *container,
                               struct vfio_iommu_type1_info **info)
{
    size_t argsz = sizeof(struct vfio_iommu_type1_info);

    *info = g_new0(struct vfio_iommu_type1_info, 1);
again:
    (*info)->argsz = argsz;

    if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if ((*info)->argsz > argsz) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);
        goto again;
    }

    return 0;
}

static struct vfio_info_cap_header *
vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    struct vfio_info_cap_header *hdr;
    void *ptr = info;

    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
        if (hdr->id == id) {
            return hdr;
        }
    }

    return NULL;
}

static void vfio_get_iommu_info_migration(VFIOContainer *container,
                                          struct vfio_iommu_type1_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_migration *cap_mig;
    VFIOContainerBase *bcontainer = &container->bcontainer;

    hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
    if (!hdr) {
        return;
    }

    cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
                           header);

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty.
     */
    if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
        bcontainer->dirty_pages_supported = true;
        bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
        bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap;
    }
}

static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    g_autofree struct vfio_iommu_type1_info *info = NULL;
    int ret;

    ret = vfio_get_iommu_info(container, &info);
    if (ret) {
        error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
        return false;
    }

    if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
        bcontainer->pgsizes = info->iova_pgsizes;
    } else {
        bcontainer->pgsizes = qemu_real_host_page_size();
    }

    if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) {
        bcontainer->dma_max_mappings = 65535;
    }

    vfio_get_info_iova_range(info, bcontainer);

    vfio_get_iommu_info_migration(container, info);
    return true;
}

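/*
 * Attach a group to a container within the given AddressSpace: reuse an
 * existing container when the kernel accepts the group, otherwise open
 * /dev/vfio/vfio, create a new container and register its memory listener.
 */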
static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as,
                                   Error **errp)
{
    VFIOContainer *container;
    VFIOContainerBase *bcontainer;
    int ret, fd;
    VFIOAddressSpace *space;
    VFIOIOMMUClass *vioc;

    space = vfio_get_address_space(as);

    /*
     * VFIO is currently incompatible with discarding of RAM insofar as the
     * madvise to purge (zap) the page from QEMU's address space does not
     * interact with the memory API and therefore leaves stale virtual to
     * physical mappings in the IOMMU if the page was previously pinned. We
     * therefore set discarding broken for each group added to a container,
     * whether the container is used individually or shared. This provides
     * us with options to allow devices within a group to opt-in and allow
     * discarding, so long as it is done consistently for a group (for instance
     * if the device is an mdev device where it is known that the host vendor
     * driver will never pin pages outside of the working set of the guest
     * driver, which would thus not be discarding candidates).
     *
     * The first opportunity to induce pinning occurs here where we attempt to
     * attach the group to existing containers within the AddressSpace. If any
     * pages are already zapped from the virtual address space, such as from
     * previous discards, new pinning will cause valid mappings to be
     * re-established. Likewise, when the overall MemoryListener for a new
     * container is registered, a replay of mappings within the AddressSpace
     * will occur, re-establishing any previously zapped pages as well.
     *
     * In particular, virtio-balloon is currently only prevented from
     * discarding new memory; it does not yet set
     * ram_block_discard_set_required() and therefore neither stops us here
     * nor deals with the sudden memory consumption of inflated memory.
     *
     * We do support discarding of memory coordinated via the RamDiscardManager
     * with some IOMMU types. vfio_ram_block_discard_disable() handles the
     * details once we know which type of IOMMU we are using.
     */

    QLIST_FOREACH(bcontainer, &space->containers, next) {
        container = container_of(bcontainer, VFIOContainer, bcontainer);
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            ret = vfio_ram_block_discard_disable(container, true);
            if (ret) {
                error_setg_errno(errp, -ret,
                                 "Cannot set discarding of RAM broken");
                if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
                          &container->fd)) {
                    error_report("vfio: error disconnecting group %d from"
                                 " container", group->groupid);
                }
                return false;
            }
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            vfio_kvm_device_add_group(group);
            return true;
        }
    }

    fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
    if (fd < 0) {
        goto put_space_exit;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_setg(errp, "supported vfio version: %d, "
                   "reported version: %d", VFIO_API_VERSION, ret);
        goto close_fd_exit;
    }

    container = vfio_create_container(fd, group, errp);
    if (!container) {
        goto close_fd_exit;
    }
    bcontainer = &container->bcontainer;

    if (!vfio_cpr_register_container(bcontainer, errp)) {
        goto free_container_exit;
    }

    ret = vfio_ram_block_discard_disable(container, true);
    if (ret) {
        error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
        goto unregister_container_exit;
    }

    vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
    assert(vioc->setup);

    if (!vioc->setup(bcontainer, errp)) {
        goto enable_discards_exit;
    }

    vfio_kvm_device_add_group(group);

    vfio_address_space_insert(space, bcontainer);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    bcontainer->listener = vfio_memory_listener;
    memory_listener_register(&bcontainer->listener, bcontainer->space->as);

    if (bcontainer->error) {
        error_propagate_prepend(errp, bcontainer->error,
                                "memory listener initialization failed: ");
        goto listener_release_exit;
    }

    bcontainer->initialized = true;

    return true;

listener_release_exit:
    QLIST_REMOVE(group, container_next);
    vfio_kvm_device_del_group(group);
    memory_listener_unregister(&bcontainer->listener);
    if (vioc->release) {
        vioc->release(bcontainer);
    }

enable_discards_exit:
    vfio_ram_block_discard_disable(container, false);

unregister_container_exit:
    vfio_cpr_unregister_container(bcontainer);

free_container_exit:
    object_unref(container);

close_fd_exit:
    close(fd);

put_space_exit:
    vfio_put_address_space(space);

    return false;
}

static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    /*
     * Explicitly release the listener before unsetting the container, since
     * unsetting may destroy the backend container if this is the last group.
     */
    if (QLIST_EMPTY(&container->group_list)) {
        memory_listener_unregister(&bcontainer->listener);
        if (vioc->release) {
            vioc->release(bcontainer);
        }
    }

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    if (QLIST_EMPTY(&container->group_list)) {
        VFIOAddressSpace *space = bcontainer->space;

        trace_vfio_disconnect_container(container->fd);
        vfio_cpr_unregister_container(bcontainer);
        close(container->fd);
        object_unref(container);

        vfio_put_address_space(space);
    }
}

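/*
 * Look up or open the VFIO group for @groupid, check that it is viable and
 * connect it to a container in the given AddressSpace.
 */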
static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
{
    ERRP_GUARD();
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid == groupid) {
            /* Found it. Now is it already in the right context? */
            if (group->container->bcontainer.space->as == as) {
                return group;
            } else {
                error_setg(errp, "group %d used in multiple address spaces",
                           group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open(path, O_RDWR, errp);
    if (group->fd < 0) {
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_setg_errno(errp, errno, "failed to get group %d status", groupid);
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "group %d is not viable", groupid);
        error_append_hint(errp,
                          "Please ensure all devices within the iommu_group "
                          "are bound to their vfio bus driver.\n");
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (!vfio_connect_container(group, as, errp)) {
        error_prepend(errp, "failed to setup container for group %d: ",
                      groupid);
        goto close_fd_exit;
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}

static void vfio_put_group(VFIOGroup *group)
{
    if (!group || !QLIST_EMPTY(&group->device_list)) {
        return;
    }

    if (!group->ram_block_discard_allowed) {
        vfio_ram_block_discard_disable(group->container, false);
    }
    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);
}

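/*
 * Get a device fd from the group and fill in the basic VFIODevice fields
 * (regions, irqs, flags) from the device info returned by the kernel.
 */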
static bool vfio_get_device(VFIOGroup *group, const char *name,
                            VFIODevice *vbasedev, Error **errp)
{
    g_autofree struct vfio_device_info *info = NULL;
    int fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
        error_setg_errno(errp, errno, "error getting device from group %d",
                         group->groupid);
        error_append_hint(errp,
                      "Verify all devices in group %d are bound to vfio-<bus> "
                      "or pci-stub and not already in use\n", group->groupid);
        return false;
    }

    info = vfio_get_device_info(fd);
    if (!info) {
        error_setg_errno(errp, errno, "error getting device info");
        close(fd);
        return false;
    }

    /*
     * Set discarding of RAM as not broken for this group if the driver knows
     * the device operates compatibly with discarding. Setting must be
     * consistent per group, but since compatibility is really only possible
     * with mdev currently, we expect singleton groups.
     */
    if (vbasedev->ram_block_discard_allowed !=
        group->ram_block_discard_allowed) {
        if (!QLIST_EMPTY(&group->device_list)) {
            error_setg(errp, "Inconsistent setting of support for discarding "
                       "RAM (e.g., balloon) within group");
            close(fd);
            return false;
        }

        if (!group->ram_block_discard_allowed) {
            group->ram_block_discard_allowed = true;
            vfio_ram_block_discard_disable(group->container, false);
        }
    }

    vbasedev->fd = fd;
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

    vbasedev->num_irqs = info->num_irqs;
    vbasedev->num_regions = info->num_regions;
    vbasedev->flags = info->flags;

    trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);

    vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);

    return true;
}

static void vfio_put_base_device(VFIODevice *vbasedev)
{
    if (!vbasedev->group) {
        return;
    }
    QLIST_REMOVE(vbasedev, next);
    vbasedev->group = NULL;
    trace_vfio_put_base_device(vbasedev->fd);
    close(vbasedev->fd);
}

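/*
 * Resolve the IOMMU group number of a device by reading the target of its
 * sysfs iommu_group symlink.
 */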
static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
{
    char *tmp, group_path[PATH_MAX];
    g_autofree char *group_name = NULL;
    int ret, groupid;
    ssize_t len;

    tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
    len = readlink(tmp, group_path, sizeof(group_path));
    g_free(tmp);

    if (len <= 0 || len >= sizeof(group_path)) {
        ret = len < 0 ? -errno : -ENAMETOOLONG;
        error_setg_errno(errp, -ret, "no iommu_group found");
        return ret;
    }

    group_path[len] = 0;

    group_name = g_path_get_basename(group_path);
    if (sscanf(group_name, "%d", &groupid) != 1) {
        error_setg_errno(errp, errno, "failed to read %s", group_path);
        return -errno;
    }
    return groupid;
}

/*
 * vfio_attach_device: attach a device to a security context
 * @name and @vbasedev->name are likely to be different depending
 * on the type of the device, hence the need for passing @name
 */
static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
                                      AddressSpace *as, Error **errp)
{
    int groupid = vfio_device_groupid(vbasedev, errp);
    VFIODevice *vbasedev_iter;
    VFIOGroup *group;
    VFIOContainerBase *bcontainer;

    if (groupid < 0) {
        return false;
    }

    trace_vfio_attach_device(vbasedev->name, groupid);

    if (!vfio_device_hiod_realize(vbasedev, errp)) {
        return false;
    }

    group = vfio_get_group(groupid, as, errp);
    if (!group) {
        return false;
    }

    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
            error_setg(errp, "device is already attached");
            vfio_put_group(group);
            return false;
        }
    }
    if (!vfio_get_device(group, name, vbasedev, errp)) {
        vfio_put_group(group);
        return false;
    }

    bcontainer = &group->container->bcontainer;
    vbasedev->bcontainer = bcontainer;
    QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
    QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);

    return true;
}

static void vfio_legacy_detach_device(VFIODevice *vbasedev)
{
    VFIOGroup *group = vbasedev->group;

    QLIST_REMOVE(vbasedev, global_next);
    QLIST_REMOVE(vbasedev, container_next);
    vbasedev->bcontainer = NULL;
    trace_vfio_detach_device(vbasedev->name, group->groupid);
    vfio_put_base_device(vbasedev);
    vfio_put_group(group);
}

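/*
 * Perform a PCI hot reset for the device. All dependent devices reported by
 * the kernel must belong to groups we own; when @single is set, the reset is
 * only attempted if no other realized device is affected.
 */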
static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
    VFIOGroup *group;
    struct vfio_pci_hot_reset_info *info = NULL;
    struct vfio_pci_dependent_device *devices;
    struct vfio_pci_hot_reset *reset;
    int32_t *fds;
    int ret, i, count;
    bool multi = false;

    trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");

    if (!single) {
        vfio_pci_pre_reset(vdev);
    }
    vdev->vbasedev.needs_reset = false;

    ret = vfio_pci_get_pci_hot_reset_info(vdev, &info);

    if (ret) {
        goto out_single;
    }
    devices = &info->devices[0];

    trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);

    /* Verify that we have all the groups required */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIOPCIDevice *tmp;
        VFIODevice *vbasedev_iter;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        trace_vfio_pci_hot_reset_dep_devices(host.domain,
                host.bus, host.slot, host.function, devices[i].group_id);

        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
            continue;
        }

        QLIST_FOREACH(group, &vfio_group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            if (!vdev->has_pm_reset) {
                error_report("vfio: Cannot reset device %s, "
                             "depends on group %d which is not owned.",
                             vdev->vbasedev.name, devices[i].group_id);
            }
            ret = -EPERM;
            goto out;
        }

        /* Prep dependent devices for reset and clear our marker. */
        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
            if (!vbasedev_iter->dev->realized ||
                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
                continue;
            }
            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
                if (single) {
                    ret = -EINVAL;
                    goto out_single;
                }
                vfio_pci_pre_reset(tmp);
                tmp->vbasedev.needs_reset = false;
                multi = true;
                break;
            }
        }
    }

    if (!single && !multi) {
        ret = -EINVAL;
        goto out_single;
    }

    /* Determine how many group fds need to be passed */
    count = 0;
    QLIST_FOREACH(group, &vfio_group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                count++;
                break;
            }
        }
    }

    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
    fds = &reset->group_fds[0];

    /* Fill in group fds */
    QLIST_FOREACH(group, &vfio_group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                fds[reset->count++] = group->fd;
                break;
            }
        }
    }

    /* Bus reset! */
    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
    g_free(reset);
    if (ret) {
        ret = -errno;
    }

    trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
                                    ret ? strerror(errno) : "Success");

out:
    /* Re-enable INTx on affected devices */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIOPCIDevice *tmp;
        VFIODevice *vbasedev_iter;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
            continue;
        }

        QLIST_FOREACH(group, &vfio_group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            break;
        }

        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
            if (!vbasedev_iter->dev->realized ||
                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
                continue;
            }
            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
                vfio_pci_post_reset(tmp);
                break;
            }
        }
    }
out_single:
    if (!single) {
        vfio_pci_post_reset(vdev);
    }
    g_free(info);

    return ret;
}

static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data)
{
    VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);

    vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO;

    vioc->setup = vfio_legacy_setup;
    vioc->dma_map = vfio_legacy_dma_map;
    vioc->dma_unmap = vfio_legacy_dma_unmap;
    vioc->attach_device = vfio_legacy_attach_device;
    vioc->detach_device = vfio_legacy_detach_device;
    vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking;
    vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap;
    vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
}

static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
                                     Error **errp)
{
    VFIODevice *vdev = opaque;

    hiod->name = g_strdup(vdev->name);
    hiod->agent = opaque;

    return true;
}

static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap,
                                    Error **errp)
{
    switch (cap) {
    case HOST_IOMMU_DEVICE_CAP_AW_BITS:
        return vfio_device_get_aw_bits(hiod->agent);
    default:
        error_setg(errp, "%s: unsupported capability %x", hiod->name, cap);
        return -EINVAL;
    }
}

static GList *
hiod_legacy_vfio_get_iova_ranges(HostIOMMUDevice *hiod)
{
    VFIODevice *vdev = hiod->agent;

    g_assert(vdev);
    return vfio_container_get_iova_ranges(vdev->bcontainer);
}

static uint64_t
hiod_legacy_vfio_get_page_size_mask(HostIOMMUDevice *hiod)
{
    VFIODevice *vdev = hiod->agent;

    g_assert(vdev);
    return vfio_container_get_page_size_mask(vdev->bcontainer);
}

static void vfio_iommu_legacy_instance_init(Object *obj)
{
    VFIOContainer *container = VFIO_IOMMU_LEGACY(obj);

    QLIST_INIT(&container->group_list);
}

static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
{
    HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);

    hioc->realize = hiod_legacy_vfio_realize;
    hioc->get_cap = hiod_legacy_vfio_get_cap;
    hioc->get_iova_ranges = hiod_legacy_vfio_get_iova_ranges;
    hioc->get_page_size_mask = hiod_legacy_vfio_get_page_size_mask;
}

static const TypeInfo types[] = {
    {
        .name = TYPE_VFIO_IOMMU_LEGACY,
        .parent = TYPE_VFIO_IOMMU,
        .instance_init = vfio_iommu_legacy_instance_init,
        .instance_size = sizeof(VFIOContainer),
        .class_init = vfio_iommu_legacy_class_init,
    }, {
        .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
        .parent = TYPE_HOST_IOMMU_DEVICE,
        .class_init = hiod_legacy_vfio_class_init,
    }
};

DEFINE_TYPES(types)