spapr.c

/*
 * DMA memory preregistration
 *
 * Authors:
 *  Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "cpu.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "hw/hw.h"
#include "exec/ram_addr.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "trace.h"
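
/*
 * IOMMU regions cannot be preregistered (hence the hw_error() below);
 * anything that is not plain RAM, or that is a RAM device (e.g. another
 * device's MMIO mapped into the guest), is skipped as well.
 */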
static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
{
    if (memory_region_is_iommu(section->mr)) {
        hw_error("Cannot possibly preregister IOMMU memory");
    }

    return !memory_region_is_ram(section->mr) ||
            memory_region_is_ram_device(section->mr);
}
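
/*
 * Translate a guest physical address within @section into the QEMU
 * userspace virtual address backing it; the SPAPR register/unregister
 * ioctls below operate on userspace virtual addresses.
 */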
static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
{
    return memory_region_get_ram_ptr(section->mr) +
        section->offset_within_region +
        (gpa - section->offset_within_address_space);
}
static void vfio_prereg_listener_region_add(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            prereg_listener);
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask;
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    memory_region_ref(section->mr);

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;
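
    /*
     * VFIO_IOMMU_SPAPR_REGISTER_MEMORY asks the host kernel to pin and
     * account this memory up front, so that later DMA map/unmap paths do
     * not have to pin pages (the SPAPR TCE IOMMU v2 model).
     */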
    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
    if (ret) {
        /*
         * On the initfn path, store the first error in the container so we
         * can fail gracefully. At runtime, there is not much we can do
         * other than throw a hardware error.
         */
        if (!container->initialized) {
            if (!container->error) {
                error_setg_errno(&container->error, -ret,
                                 "Memory registering failed");
            }
        } else {
            hw_error("vfio: Memory registering failed, unable to continue");
        }
    }
}
static void vfio_prereg_listener_region_del(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            prereg_listener);
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask;
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;
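
    /*
     * Unlike the _add path, a failure here is only traced: the region is
     * going away regardless, so there is no sensible recovery action.
     */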
    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}

const MemoryListener vfio_prereg_listener = {
    .region_add = vfio_prereg_listener_region_add,
    .region_del = vfio_prereg_listener_region_del,
};
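
/*
 * This listener is registered against &address_space_memory by the
 * container setup code in hw/vfio/common.c (memory_listener_register())
 * when the container uses the SPAPR TCE IOMMU v2 interface, which
 * requires memory preregistration.
 */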

int vfio_spapr_create_window(VFIOContainer *container,
                             MemoryRegionSection *section,
                             hwaddr *pgsize)
{
    int ret = 0;
    IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
    uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr);
    unsigned entries, bits_total, bits_per_level, max_levels;
    struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
    long rampagesize = qemu_minrampagesize();

    /*
     * The host might not support the IOMMU page size the guest supports,
     * in which case smaller physical IOMMU pages are used to back the
     * guest IOMMU pages.
     */
    if (pagesize > rampagesize) {
        pagesize = rampagesize;
    }
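
    /*
     * Pick the largest host-supported IOMMU page size that does not exceed
     * pagesize: (pagesize | (pagesize - 1)) masks off all bigger page sizes
     * in container->pgsizes, and 63 - clz64() finds the highest bit left.
     * For example, pagesize = 64K with pgsizes = 4K|64K|16M gives
     * 0x1011000 & 0x1ffff = 0x11000, whose top bit is 64K.
     */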
    pagesize = 1ULL << (63 - clz64(container->pgsizes &
                                   (pagesize | (pagesize - 1))));
    if (!pagesize) {
        error_report("Host doesn't support page size 0x%"PRIx64
                     ", the supported mask is 0x%lx",
                     memory_region_iommu_get_min_page_size(iommu_mr),
                     container->pgsizes);
        return -EINVAL;
    }

    /*
     * FIXME: For VFIO iommu types which have KVM acceleration to
     * avoid bouncing all map/unmaps through qemu this way, this
     * would be the right place to wire that up (tell the KVM
     * device emulation the VFIO iommu handles to use).
     */
    create.window_size = int128_get64(section->size);
    create.page_shift = ctz64(pagesize);
    /*
     * The SPAPR host supports multilevel TCE tables. We try to guess the
     * optimal number of levels, and if creation fails (for example due to
     * host memory fragmentation), we retry with more levels. The DMA
     * address structure is:
     * rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii
     * where:
     *   r = reserved (bits >= 55 are reserved in the existing hardware)
     *   i = IOMMU page offset (64K in this example)
     *   x = bits to index a TCE, which can be split into equal chunks to
     *       index within a level.
     * The aim is to split "x" across the smallest possible number of levels.
     */
    entries = create.window_size >> create.page_shift;
    /* bits_total is the number of "x" bits needed */
    bits_total = ctz64(entries * sizeof(uint64_t));
    /*
     * bits_per_level is a safe guess of how much we can allocate per level:
     * 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER, and MAX_ORDER
     * is usually bigger than that.
     * qemu_real_host_page_size is used below because TCEs are allocated from
     * system pages.
     */
    bits_per_level = ctz64(qemu_real_host_page_size) + 8;
    create.levels = bits_total / bits_per_level;
    if (bits_total % bits_per_level) {
        ++create.levels;
    }
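
    /*
     * Worked example: a 2GB window with 64K IOMMU pages on a 64K-page host
     * gives entries = 2^31 >> 16 = 2^15, so the table needs 2^15 * 8 = 2^18
     * bytes and bits_total = 18; bits_per_level = 16 + 8 = 24, hence
     * create.levels = 1, and max_levels below is (64 - 16) / 16 = 3.
     */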
    max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size);
    for ( ; create.levels <= max_levels; ++create.levels) {
        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
        if (!ret) {
            break;
        }
    }
    if (ret) {
        error_report("Failed to create a window, ret = %d (%m)", ret);
        return -errno;
    }
    if (create.start_addr != section->offset_within_address_space) {
        vfio_spapr_remove_window(container, create.start_addr);

        error_report("Host doesn't support DMA window at %"HWADDR_PRIx
                     ", must be %"PRIx64,
                     section->offset_within_address_space,
                     (uint64_t)create.start_addr);
        return -EINVAL;
    }
    trace_vfio_spapr_create_window(create.page_shift,
                                   create.levels,
                                   create.window_size,
                                   create.start_addr);
    *pgsize = pagesize;

    return 0;
}
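
/*
 * Teardown counterpart of vfio_spapr_create_window(): invoked above when a
 * window lands at an unexpected address, and from the VFIO region_del path
 * in hw/vfio/common.c when the guest removes a DMA window.
 */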
int vfio_spapr_remove_window(VFIOContainer *container,
                             hwaddr offset_within_address_space)
{
    struct vfio_iommu_spapr_tce_remove remove = {
        .argsz = sizeof(remove),
        .start_addr = offset_within_address_space,
    };
    int ret;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
    if (ret) {
        error_report("Failed to remove window at %"PRIx64,
                     (uint64_t)remove.start_addr);
        return -errno;
    }

    trace_vfio_spapr_remove_window(offset_within_address_space);

    return 0;
}