userfaultfd.c 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
  1. /*
  2. * Linux UFFD-WP support
  3. *
  4. * Copyright Virtuozzo GmbH, 2020
  5. *
  6. * Authors:
  7. * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
  8. *
  9. * This work is licensed under the terms of the GNU GPL, version 2 or
  10. * later. See the COPYING file in the top-level directory.
  11. */
  12. #include "qemu/osdep.h"
  13. #include "qemu/bitops.h"
  14. #include "qemu/error-report.h"
  15. #include "qemu/userfaultfd.h"
  16. #include "trace.h"
  17. #include <poll.h>
  18. #include <sys/syscall.h>
  19. #include <sys/ioctl.h>
  /*
   * Strategy used by uffd_open() to create new userfaultfd descriptors.
   * The zero value doubles as the "not probed yet" state of a
   * zero-initialized static variable.
   */
  typedef enum {
      UFFD_UNINITIALIZED = 0, /* creation method has not been detected yet */
      UFFD_USE_DEV_PATH = 1,  /* create descriptors via /dev/userfaultfd */
      UFFD_USE_SYSCALL = 2,   /* create descriptors via userfaultfd() syscall */
  } uffd_open_mode;
  25. int uffd_open(int flags)
  26. {
  27. #if defined(__NR_userfaultfd)
  28. static uffd_open_mode open_mode;
  29. static int uffd_dev;
  30. /* Detect how to generate uffd desc when run the 1st time */
  31. if (open_mode == UFFD_UNINITIALIZED) {
  32. /*
  33. * Make /dev/userfaultfd the default approach because it has better
  34. * permission controls, meanwhile allows kernel faults without any
  35. * privilege requirement (e.g. SYS_CAP_PTRACE).
  36. */
  37. uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
  38. if (uffd_dev >= 0) {
  39. open_mode = UFFD_USE_DEV_PATH;
  40. } else {
  41. /* Fallback to the system call */
  42. open_mode = UFFD_USE_SYSCALL;
  43. }
  44. trace_uffd_detect_open_mode(open_mode);
  45. }
  46. if (open_mode == UFFD_USE_DEV_PATH) {
  47. assert(uffd_dev >= 0);
  48. return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
  49. }
  50. return syscall(__NR_userfaultfd, flags);
  51. #else
  52. return -EINVAL;
  53. #endif
  54. }
  55. /**
  56. * uffd_query_features: query UFFD features
  57. *
  58. * Returns: 0 on success, negative value in case of an error
  59. *
  60. * @features: parameter to receive 'uffdio_api.features'
  61. */
  62. int uffd_query_features(uint64_t *features)
  63. {
  64. int uffd_fd;
  65. struct uffdio_api api_struct = { 0 };
  66. int ret = -1;
  67. uffd_fd = uffd_open(O_CLOEXEC);
  68. if (uffd_fd < 0) {
  69. trace_uffd_query_features_nosys(errno);
  70. return -1;
  71. }
  72. api_struct.api = UFFD_API;
  73. api_struct.features = 0;
  74. if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
  75. trace_uffd_query_features_api_failed(errno);
  76. goto out;
  77. }
  78. *features = api_struct.features;
  79. ret = 0;
  80. out:
  81. close(uffd_fd);
  82. return ret;
  83. }
  84. /**
  85. * uffd_create_fd: create UFFD file descriptor
  86. *
  87. * Returns non-negative file descriptor or negative value in case of an error
  88. *
  89. * @features: UFFD features to request
  90. * @non_blocking: create UFFD file descriptor for non-blocking operation
  91. */
  92. int uffd_create_fd(uint64_t features, bool non_blocking)
  93. {
  94. int uffd_fd;
  95. int flags;
  96. struct uffdio_api api_struct = { 0 };
  97. uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
  98. flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
  99. uffd_fd = uffd_open(flags);
  100. if (uffd_fd < 0) {
  101. trace_uffd_create_fd_nosys(errno);
  102. return -1;
  103. }
  104. api_struct.api = UFFD_API;
  105. api_struct.features = features;
  106. if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
  107. trace_uffd_create_fd_api_failed(errno);
  108. goto fail;
  109. }
  110. if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
  111. trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
  112. goto fail;
  113. }
  114. return uffd_fd;
  115. fail:
  116. close(uffd_fd);
  117. return -1;
  118. }
  /**
   * uffd_close_fd: release a UFFD file descriptor
   *
   * The descriptor must be valid (non-negative); this is enforced with an
   * assertion. The result of close() is not reported to the caller.
   *
   * @uffd_fd: UFFD file descriptor to close
   */
  void uffd_close_fd(int uffd_fd)
  {
      assert(uffd_fd >= 0);

      (void) close(uffd_fd);
  }
  /**
   * uffd_register_memory: register a memory range with UFFD
   *
   * Issues UFFDIO_REGISTER for [@addr, @addr + @length) in the given mode.
   *
   * Returns: 0 on success, -1 if the ioctl failed
   *
   * @uffd_fd: UFFD file descriptor
   * @addr: base address of the memory range
   * @length: length of the memory range
   * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
   * @ioctls: optional (may be NULL); on success receives the ioctl mask
   *          reported by the kernel for the range
   */
  int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
                           uint64_t mode, uint64_t *ioctls)
  {
      struct uffdio_register reg = {
          .range = {
              .start = (uintptr_t) addr,
              .len = length,
          },
          .mode = mode,
      };

      if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) != 0) {
          trace_uffd_register_memory_failed(addr, length, mode, errno);
          return -1;
      }

      if (ioctls != NULL) {
          *ioctls = reg.ioctls;
      }

      return 0;
  }
  /**
   * uffd_unregister_memory: remove a memory range from UFFD
   *
   * Issues UFFDIO_UNREGISTER for [@addr, @addr + @length).
   *
   * Returns: 0 on success, -1 if the ioctl failed
   *
   * @uffd_fd: UFFD file descriptor
   * @addr: base address of the memory range
   * @length: length of the memory range
   */
  int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
  {
      struct uffdio_range range = {
          .start = (uintptr_t) addr,
          .len = length,
      };

      if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &range) == 0) {
          return 0;
      }

      trace_uffd_unregister_memory_failed(addr, length, errno);
      return -1;
  }
  /**
   * uffd_change_protection: set or clear write protection on a memory range
   *
   * Issues UFFDIO_WRITEPROTECT for [@addr, @addr + @length). When protection
   * is being set, @dont_wake is ignored: the DONTWAKE mode is only passed to
   * the kernel when protection is being released.
   *
   * Returns: 0 on success, -1 if the ioctl failed (the failure is also
   * reported through error_report())
   *
   * @uffd_fd: UFFD file descriptor
   * @addr: base address of the memory range
   * @length: length of the memory range
   * @wp: true to write-protect the range, false to un-protect it
   * @dont_wake: do not wake threads waiting on a write-protected page
   */
  int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
                             bool wp, bool dont_wake)
  {
      uint64_t prot_mode = 0;
      struct uffdio_writeprotect wp_req;

      if (wp) {
          prot_mode = UFFDIO_WRITEPROTECT_MODE_WP;
      } else if (dont_wake) {
          /* DONTWAKE is meaningful only on protection release */
          prot_mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
      }

      wp_req.range.start = (uintptr_t) addr;
      wp_req.range.len = length;
      wp_req.mode = prot_mode;

      if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &wp_req) == 0) {
          return 0;
      }

      error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
                   " mode=%" PRIx64 " errno=%i", addr, length,
                   (uint64_t) wp_req.mode, errno);
      return -1;
  }
  /**
   * uffd_copy_page: copy a range of pages into the destination via UFFD-IO
   *
   * Issues UFFDIO_COPY to copy @length bytes from @src_addr to @dst_addr in
   * order to resolve a missing page fault somewhere in the destination
   * range.
   *
   * Returns: 0 on success, -errno if the ioctl failed (the failure is also
   * reported through error_report())
   *
   * @uffd_fd: UFFD file descriptor
   * @dst_addr: destination base address
   * @src_addr: source base address
   * @length: length of the range to copy
   * @dont_wake: do not wake threads waiting on the missing page
   */
  int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
                     uint64_t length, bool dont_wake)
  {
      struct uffdio_copy copy_req = {
          .dst = (uintptr_t) dst_addr,
          .src = (uintptr_t) src_addr,
          .len = length,
          .mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0,
      };
      int saved_errno;

      if (ioctl(uffd_fd, UFFDIO_COPY, &copy_req) == 0) {
          return 0;
      }

      /* Capture errno before error_report() gets a chance to change it */
      saved_errno = errno;
      error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
                   " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
                   length, (uint64_t) copy_req.mode, saved_errno);
      return -saved_errno;
  }
  /**
   * uffd_zero_page: zero-fill a range of pages via UFFD-IO
   *
   * Issues UFFDIO_ZEROPAGE for [@addr, @addr + @length) in order to resolve
   * a missing page fault within the range.
   *
   * Returns: 0 on success, -errno if the ioctl failed (the failure is also
   * reported through error_report())
   *
   * @uffd_fd: UFFD file descriptor
   * @addr: base address
   * @length: length of the range to fill with zeroes
   * @dont_wake: do not wake threads waiting on the missing page
   */
  int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
  {
      struct uffdio_zeropage zero_req = {
          .range = {
              .start = (uintptr_t) addr,
              .len = length,
          },
          .mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0,
      };
      int saved_errno;

      if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &zero_req) == 0) {
          return 0;
      }

      /* Capture errno before error_report() gets a chance to change it */
      saved_errno = errno;
      error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
                   " mode=%" PRIx64 " errno=%i", addr, length,
                   (uint64_t) zero_req.mode, saved_errno);
      return -saved_errno;
  }
  /**
   * uffd_wakeup: wake threads blocked on UFFD-managed page faults
   *
   * Issues UFFDIO_WAKE for [@addr, @addr + @length), waking threads that
   * wait on any page of that range. The typical use is to resolve a batch
   * of faults with the MODE_DONTWAKE flag set on the individual UFFD-IO
   * ioctls and then satisfy all waiters of the whole range with a single
   * uffd_wakeup() call.
   *
   * Returns: 0 on success, -errno if the ioctl failed (the failure is also
   * reported through error_report())
   *
   * @uffd_fd: UFFD file descriptor
   * @addr: base address
   * @length: length of the range
   */
  int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
  {
      struct uffdio_range wake_range = {
          .start = (uintptr_t) addr,
          .len = length,
      };
      int saved_errno;

      if (ioctl(uffd_fd, UFFDIO_WAKE, &wake_range) == 0) {
          return 0;
      }

      /* Capture errno before error_report() gets a chance to change it */
      saved_errno = errno;
      error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
                   addr, length, saved_errno);
      return -saved_errno;
  }
  /**
   * uffd_read_events: fetch pending UFFD event messages
   *
   * Reads from the descriptor into @msgs, restarting the read whenever it
   * is interrupted (EINTR). A read failing with EAGAIN is not treated as an
   * error: it is reported as "no messages".
   *
   * Returns: number of messages fetched, 0 if none is available, or -1 on
   * any other read error (also reported through error_report())
   *
   * @uffd_fd: UFFD file descriptor
   * @msgs: pointer to the message buffer
   * @count: number of messages that fit in the buffer
   */
  int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
  {
      ssize_t nread;

      for (;;) {
          nread = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
          if (nread >= 0) {
              /* Convert the byte count into a whole number of messages */
              return (int) (nread / sizeof(struct uffd_msg));
          }
          if (errno != EINTR) {
              break;
          }
      }

      if (errno == EAGAIN) {
          return 0;
      }

      error_report("uffd_read_events() failed: errno=%i", errno);
      return -1;
  }