2
0

userfaultfd.c 10 KB


  1. /*
  2. * Linux UFFD-WP support
  3. *
  4. * Copyright Virtuozzo GmbH, 2020
  5. *
  6. * Authors:
  7. * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
  8. *
  9. * This work is licensed under the terms of the GNU GPL, version 2 or
  10. * later. See the COPYING file in the top-level directory.
  11. */
  12. #include "qemu/osdep.h"
  13. #include "qemu/bitops.h"
  14. #include "qemu/error-report.h"
  15. #include "qemu/userfaultfd.h"
  16. #include "trace.h"
  17. #include <poll.h>
  18. #include <sys/syscall.h>
  19. #include <sys/ioctl.h>
  20. #include <fcntl.h>
  /* Strategy used by uffd_open() to obtain a userfaultfd descriptor. */
  typedef enum {
      UFFD_UNINITIALIZED = 0, /* not probed yet (zero-initialized statics) */
      UFFD_USE_DEV_PATH = 1,  /* USERFAULTFD_IOC_NEW on /dev/userfaultfd */
      UFFD_USE_SYSCALL = 2,   /* userfaultfd system call */
  } uffd_open_mode;
  26. int uffd_open(int flags)
  27. {
  28. #if defined(__NR_userfaultfd)
  29. static uffd_open_mode open_mode;
  30. static int uffd_dev;
  31. /* Detect how to generate uffd desc when run the 1st time */
  32. if (open_mode == UFFD_UNINITIALIZED) {
  33. /*
  34. * Make /dev/userfaultfd the default approach because it has better
  35. * permission controls, meanwhile allows kernel faults without any
  36. * privilege requirement (e.g. SYS_CAP_PTRACE).
  37. */
  38. uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
  39. if (uffd_dev >= 0) {
  40. open_mode = UFFD_USE_DEV_PATH;
  41. } else {
  42. /* Fallback to the system call */
  43. open_mode = UFFD_USE_SYSCALL;
  44. }
  45. trace_uffd_detect_open_mode(open_mode);
  46. }
  47. if (open_mode == UFFD_USE_DEV_PATH) {
  48. assert(uffd_dev >= 0);
  49. return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
  50. }
  51. return syscall(__NR_userfaultfd, flags);
  52. #else
  53. return -EINVAL;
  54. #endif
  55. }
  56. /**
  57. * uffd_query_features: query UFFD features
  58. *
  59. * Returns: 0 on success, negative value in case of an error
  60. *
  61. * @features: parameter to receive 'uffdio_api.features'
  62. */
  63. int uffd_query_features(uint64_t *features)
  64. {
  65. int uffd_fd;
  66. struct uffdio_api api_struct = { 0 };
  67. int ret = -1;
  68. uffd_fd = uffd_open(O_CLOEXEC);
  69. if (uffd_fd < 0) {
  70. trace_uffd_query_features_nosys(errno);
  71. return -1;
  72. }
  73. api_struct.api = UFFD_API;
  74. api_struct.features = 0;
  75. if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
  76. trace_uffd_query_features_api_failed(errno);
  77. goto out;
  78. }
  79. *features = api_struct.features;
  80. ret = 0;
  81. out:
  82. close(uffd_fd);
  83. return ret;
  84. }
  85. /**
  86. * uffd_create_fd: create UFFD file descriptor
  87. *
  88. * Returns non-negative file descriptor or negative value in case of an error
  89. *
  90. * @features: UFFD features to request
  91. * @non_blocking: create UFFD file descriptor for non-blocking operation
  92. */
  93. int uffd_create_fd(uint64_t features, bool non_blocking)
  94. {
  95. int uffd_fd;
  96. int flags;
  97. struct uffdio_api api_struct = { 0 };
  98. uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
  99. flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
  100. uffd_fd = uffd_open(flags);
  101. if (uffd_fd < 0) {
  102. trace_uffd_create_fd_nosys(errno);
  103. return -1;
  104. }
  105. api_struct.api = UFFD_API;
  106. api_struct.features = features;
  107. if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
  108. trace_uffd_create_fd_api_failed(errno);
  109. goto fail;
  110. }
  111. if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
  112. trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
  113. goto fail;
  114. }
  115. return uffd_fd;
  116. fail:
  117. close(uffd_fd);
  118. return -1;
  119. }
  /**
   * uffd_close_fd: close a UFFD file descriptor
   *
   * The result of close() is not reported to the caller.
   *
   * @uffd_fd: UFFD file descriptor; asserted to be non-negative
   */
  void uffd_close_fd(int uffd_fd)
  {
      assert(uffd_fd >= 0);
      (void) close(uffd_fd);
  }
  /**
   * uffd_register_memory: register a memory range via UFFDIO_REGISTER
   *
   * Returns: 0 in case of success, negative value in case of an error
   *
   * @uffd_fd: UFFD file descriptor
   * @addr: base address of memory range
   * @length: length of memory range
   * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
   * @ioctls: optional (may be NULL); on success receives the 'ioctls'
   *          field of the uffdio_register structure
   */
  int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
                           uint64_t mode, uint64_t *ioctls)
  {
      struct uffdio_register reg = {
          .range = { .start = (uintptr_t) addr, .len = length },
          .mode = mode,
      };

      if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) != 0) {
          trace_uffd_register_memory_failed(addr, length, mode, errno);
          return -1;
      }

      if (ioctls != NULL) {
          *ioctls = reg.ioctls;
      }
      return 0;
  }
  /**
   * uffd_unregister_memory: un-register a memory range via UFFDIO_UNREGISTER
   *
   * Returns: 0 in case of success, negative value in case of an error
   *
   * @uffd_fd: UFFD file descriptor
   * @addr: base address of memory range
   * @length: length of memory range
   */
  int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
  {
      struct uffdio_range range = {
          .start = (uintptr_t) addr,
          .len = length,
      };

      if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &range) != 0) {
          trace_uffd_unregister_memory_failed(addr, length, errno);
          return -1;
      }
      return 0;
  }
  /**
   * uffd_change_protection: write-protect or un-protect a memory range
   * via UFFDIO_WRITEPROTECT
   *
   * Returns: 0 on success, negative value in case of error
   *
   * @uffd_fd: UFFD file descriptor
   * @addr: base address of memory range
   * @length: length of memory range
   * @wp: true to write-protect the range, false to remove protection
   * @dont_wake: do not wake threads waiting on wr-protected page;
   *             ignored when @wp is true
   */
  int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
                             bool wp, bool dont_wake)
  {
      struct uffdio_writeprotect wp_args = {
          .range = { .start = (uintptr_t) addr, .len = length },
          .mode = 0,
      };

      if (wp) {
          wp_args.mode = UFFDIO_WRITEPROTECT_MODE_WP;
      } else if (dont_wake) {
          /* DONTWAKE is meaningful only on protection release */
          wp_args.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
      }

      if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &wp_args) != 0) {
          error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
                       " mode=%" PRIx64 " errno=%i", addr, length,
                       (uint64_t) wp_args.mode, errno);
          return -1;
      }
      return 0;
  }
  /**
   * uffd_copy_page: copy a range of pages to the destination via UFFDIO_COPY
   *
   * Copies source pages into the destination range to resolve a missing
   * page fault somewhere in that range.
   *
   * Returns: 0 on success, negative value in case of an error
   *
   * @uffd_fd: UFFD file descriptor
   * @dst_addr: destination base address
   * @src_addr: source base address
   * @length: length of the range to copy
   * @dont_wake: do not wake threads waiting on missing page
   */
  int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
                     uint64_t length, bool dont_wake)
  {
      struct uffdio_copy copy_args = {
          .dst = (uintptr_t) dst_addr,
          .src = (uintptr_t) src_addr,
          .len = length,
          .mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0,
      };

      if (ioctl(uffd_fd, UFFDIO_COPY, &copy_args) != 0) {
          error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
                       " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
                       length, (uint64_t) copy_args.mode, errno);
          return -1;
      }
      return 0;
  }
  /**
   * uffd_zero_page: fill a range of pages with zeroes via UFFDIO_ZEROPAGE
   *
   * Zero-fills the range to resolve a missing page fault within it.
   *
   * Returns: 0 on success, negative value in case of an error
   *
   * @uffd_fd: UFFD file descriptor
   * @addr: base address
   * @length: length of the range to fill with zeroes
   * @dont_wake: do not wake threads waiting on missing page
   */
  int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
  {
      struct uffdio_zeropage zero_args = {
          .range = { .start = (uintptr_t) addr, .len = length },
          .mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0,
      };

      if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &zero_args) != 0) {
          error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
                       " mode=%" PRIx64 " errno=%i", addr, length,
                       (uint64_t) zero_args.mode, errno);
          return -1;
      }
      return 0;
  }
  /**
   * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
   *
   * Wakes threads waiting on any page of the designated range.  The main
   * use case: for some period page faults are resolved through UFFD-IO
   * ioctls with the MODE_DONTWAKE flag set, and afterwards all waits for
   * the whole memory range are satisfied by a single uffd_wakeup() call.
   *
   * Returns: 0 on success, negative value in case of an error
   *
   * @uffd_fd: UFFD file descriptor
   * @addr: base address
   * @length: length of the range
   */
  int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
  {
      struct uffdio_range range = {
          .start = (uintptr_t) addr,
          .len = length,
      };

      if (ioctl(uffd_fd, UFFDIO_WAKE, &range) != 0) {
          error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
                       addr, length, errno);
          return -1;
      }
      return 0;
  }
  /**
   * uffd_read_events: read pending UFFD events
   *
   * The read is retried for as long as it fails with EINTR.
   *
   * Returns: number of fetched messages, 0 if none is available (read
   * failed with EAGAIN), or negative value in case of an error
   *
   * @uffd_fd: UFFD file descriptor
   * @msgs: pointer to message buffer
   * @count: number of messages that can fit in the buffer
   */
  int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
  {
      ssize_t nread;

      for (;;) {
          nread = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
          if (nread >= 0) {
              /* Convert the byte count into a count of whole messages */
              return (int) (nread / sizeof(struct uffd_msg));
          }
          if (errno != EINTR) {
              break;
          }
      }

      if (errno == EAGAIN) {
          return 0;
      }

      error_report("uffd_read_events() failed: errno=%i", errno);
      return -1;
  }
  /**
   * uffd_poll_events: poll UFFD file descriptor for read
   *
   * The poll is retried for as long as it fails with EINTR.
   *
   * Returns: true if events are available for read (POLLIN reported),
   * false on timeout, on a poll error, or otherwise
   *
   * @uffd_fd: UFFD file descriptor
   * @tmo: timeout value, passed unchanged to poll()
   */
  bool uffd_poll_events(int uffd_fd, int tmo)
  {
      struct pollfd pfd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
      int nready;

      while ((nready = poll(&pfd, 1, tmo)) < 0 && errno == EINTR) {
          /* interrupted by a signal: retry */
      }

      if (nready < 0) {
          error_report("uffd_poll_events() failed: errno=%i", errno);
          return false;
      }

      return nready > 0 && (pfd.revents & POLLIN) != 0;
  }