/*
 * os-posix-lib.c
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2010 Red Hat, Inc.
 *
 * QEMU library functions on POSIX which are shared between QEMU and
 * the QEMU tools.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include <termios.h>
#include <glib/gprintf.h>

#include "sysemu/sysemu.h"
#include "trace.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/madvise.h"
#include "qemu/sockets.h"
#include "qemu/thread.h"
#include <libgen.h>
#include "qemu/cutils.h"
#include "qemu/units.h"
#include "qemu/thread-context.h"
#include "qemu/main-loop.h"

#ifdef CONFIG_LINUX
#include <sys/syscall.h>
#endif

#ifdef __FreeBSD__
#include <sys/thr.h>
#include <sys/user.h>
#include <libutil.h>
#endif

#ifdef __NetBSD__
#include <lwp.h>
#endif

#include "qemu/mmap-alloc.h"

#define MAX_MEM_PREALLOC_THREAD_COUNT 16

struct MemsetThread;

static QLIST_HEAD(, MemsetContext) memset_contexts =
    QLIST_HEAD_INITIALIZER(memset_contexts);

typedef struct MemsetContext {
    bool all_threads_created;
    bool any_thread_failed;
    struct MemsetThread *threads;
    int num_threads;
    QLIST_ENTRY(MemsetContext) next;
} MemsetContext;

struct MemsetThread {
    char *addr;
    size_t numpages;
    size_t hpagesize;
    QemuThread pgthread;
    sigjmp_buf env;
    MemsetContext *context;
};
typedef struct MemsetThread MemsetThread;

/* used by sigbus_handler() */
static MemsetContext *sigbus_memset_context;
struct sigaction sigbus_oldact;
static QemuMutex sigbus_mutex;

static QemuMutex page_mutex;
static QemuCond page_cond;

int qemu_get_thread_id(void)
{
#if defined(__linux__)
    return syscall(SYS_gettid);
#elif defined(__FreeBSD__)
    /* thread id is up to INT_MAX */
    long tid;
    thr_self(&tid);
    return (int)tid;
#elif defined(__NetBSD__)
    return _lwp_self();
#elif defined(__OpenBSD__)
    return getthrid();
#else
    return getpid();
#endif
}

int qemu_daemon(int nochdir, int noclose)
{
    return daemon(nochdir, noclose);
}

bool qemu_write_pidfile(const char *path, Error **errp)
{
    int fd;
    char pidstr[32];

    while (1) {
        struct stat a, b;
        struct flock lock = {
            .l_type = F_WRLCK,
            .l_whence = SEEK_SET,
            .l_len = 0,
        };

        fd = qemu_create(path, O_WRONLY, S_IRUSR | S_IWUSR, errp);
        if (fd == -1) {
            return false;
        }

        if (fstat(fd, &b) < 0) {
            error_setg_errno(errp, errno, "Cannot stat file");
            goto fail_close;
        }

        if (fcntl(fd, F_SETLK, &lock)) {
            error_setg_errno(errp, errno, "Cannot lock pid file");
            goto fail_close;
        }

        /*
         * Now make sure the path we locked is the same one that now
         * exists on the filesystem.
         */
        if (stat(path, &a) < 0) {
            /*
             * PID file disappeared, someone else must be racing with
             * us, so try again.
             */
            close(fd);
            continue;
        }

        if (a.st_ino == b.st_ino) {
            break;
        }

        /*
         * PID file was recreated, someone else must be racing with
         * us, so try again.
         */
        close(fd);
    }

    if (ftruncate(fd, 0) < 0) {
        error_setg_errno(errp, errno, "Failed to truncate pid file");
        goto fail_unlink;
    }

    snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
    if (qemu_write_full(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
        error_setg(errp, "Failed to write pid file");
        goto fail_unlink;
    }

    return true;

fail_unlink:
    unlink(path);
fail_close:
    close(fd);
    return false;
}
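
/*
 * Note on the loop above: locking the fd and then re-stat()ing the path
 * defends against another process unlinking and recreating the pid file
 * between our open() and fcntl(F_SETLK); only when the locked inode still
 * matches the one on disk do we own the canonical pid file.
 *
 * A minimal caller sketch (hypothetical, not part of this file):
 *
 *     Error *err = NULL;
 *     if (!qemu_write_pidfile("/run/myvm.pid", &err)) {
 *         error_report_err(err);
 *         exit(EXIT_FAILURE);
 *     }
 */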

/* alloc shared memory pages */
void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared,
                          bool noreserve)
{
    const uint32_t qemu_map_flags = (shared ? QEMU_MAP_SHARED : 0) |
                                    (noreserve ? QEMU_MAP_NORESERVE : 0);
    size_t align = QEMU_VMALLOC_ALIGN;
    void *ptr = qemu_ram_mmap(-1, size, align, qemu_map_flags, 0);

    if (ptr == MAP_FAILED) {
        return NULL;
    }

    if (alignment) {
        *alignment = align;
    }

    trace_qemu_anon_ram_alloc(size, ptr);
    return ptr;
}

void qemu_anon_ram_free(void *ptr, size_t size)
{
    trace_qemu_anon_ram_free(ptr, size);
    qemu_ram_munmap(-1, ptr, size);
}

void qemu_socket_set_block(int fd)
{
    g_unix_set_fd_nonblocking(fd, false, NULL);
}

int qemu_socket_try_set_nonblock(int fd)
{
    return g_unix_set_fd_nonblocking(fd, true, NULL) ? 0 : -errno;
}

void qemu_socket_set_nonblock(int fd)
{
    int f;
    f = qemu_socket_try_set_nonblock(fd);
    assert(f == 0);
}

int socket_set_fast_reuse(int fd)
{
    int val = 1, ret;

    ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
                     (const char *)&val, sizeof(val));

    assert(ret == 0);

    return ret;
}

void qemu_set_cloexec(int fd)
{
    int f;
    f = fcntl(fd, F_GETFD);
    assert(f != -1);
    f = fcntl(fd, F_SETFD, f | FD_CLOEXEC);
    assert(f != -1);
}

int qemu_socketpair(int domain, int type, int protocol, int sv[2])
{
    int ret;

#ifdef SOCK_CLOEXEC
    ret = socketpair(domain, type | SOCK_CLOEXEC, protocol, sv);
    if (ret != -1 || errno != EINVAL) {
        return ret;
    }
#endif
    ret = socketpair(domain, type, protocol, sv);
    if (ret == 0) {
        qemu_set_cloexec(sv[0]);
        qemu_set_cloexec(sv[1]);
    }

    return ret;
}
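
/*
 * Note: the SOCK_CLOEXEC path above is preferred because it is atomic;
 * the fallback of socketpair() followed by qemu_set_cloexec() leaves a
 * window in which a concurrent fork()/exec() could inherit the pair. The
 * EINVAL check lets us fall back on kernels that predate SOCK_CLOEXEC.
 */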

char *
qemu_get_local_state_dir(void)
{
    return get_relocated_path(CONFIG_QEMU_LOCALSTATEDIR);
}

void qemu_set_tty_echo(int fd, bool echo)
{
    struct termios tty;

    tcgetattr(fd, &tty);

    if (echo) {
        tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN;
    } else {
        tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN);
    }

    tcsetattr(fd, TCSANOW, &tty);
}

#ifdef CONFIG_LINUX
static void sigbus_handler(int signal, siginfo_t *siginfo, void *ctx)
#else /* CONFIG_LINUX */
static void sigbus_handler(int signal)
#endif /* CONFIG_LINUX */
{
    int i;

    if (sigbus_memset_context) {
        for (i = 0; i < sigbus_memset_context->num_threads; i++) {
            MemsetThread *thread = &sigbus_memset_context->threads[i];

            if (qemu_thread_is_self(&thread->pgthread)) {
                siglongjmp(thread->env, 1);
            }
        }
    }

#ifdef CONFIG_LINUX
    /*
     * We assume that the MCE SIGBUS handler could have been registered. We
     * should never receive BUS_MCEERR_AO on any of our threads, but only on
     * the main thread registered for PR_MCE_KILL_EARLY. Further, we should not
     * receive BUS_MCEERR_AR triggered by action of other threads on one of
     * our threads. So, no need to check for unrelated SIGBUS when seeing one
     * for our threads.
     *
     * We will forward to the MCE handler, which will either handle the SIGBUS
     * or reinstall the default SIGBUS handler and reraise the SIGBUS. The
     * default SIGBUS handler will crash the process, so we don't care.
     */
    if (sigbus_oldact.sa_flags & SA_SIGINFO) {
        sigbus_oldact.sa_sigaction(signal, siginfo, ctx);
        return;
    }
#endif /* CONFIG_LINUX */

    warn_report("qemu_prealloc_mem: unrelated SIGBUS detected and ignored");
}

static void *do_touch_pages(void *arg)
{
    MemsetThread *memset_args = (MemsetThread *)arg;
    sigset_t set, oldset;
    int ret = 0;

    /*
     * On Linux, the page faults from the loop below can cause mmap_sem
     * contention with allocation of the thread stacks. Do not start
     * clearing until all threads have been created.
     */
    qemu_mutex_lock(&page_mutex);
    while (!memset_args->context->all_threads_created) {
        qemu_cond_wait(&page_cond, &page_mutex);
    }
    qemu_mutex_unlock(&page_mutex);

    /* unblock SIGBUS */
    sigemptyset(&set);
    sigaddset(&set, SIGBUS);
    pthread_sigmask(SIG_UNBLOCK, &set, &oldset);

    if (sigsetjmp(memset_args->env, 1)) {
        ret = -EFAULT;
    } else {
        char *addr = memset_args->addr;
        size_t numpages = memset_args->numpages;
        size_t hpagesize = memset_args->hpagesize;
        size_t i;

        for (i = 0; i < numpages; i++) {
            /*
             * Read & write back the same value, so we don't
             * corrupt existing user/app data that might be
             * stored.
             *
             * 'volatile' to stop compiler optimizing this away
             * to a no-op
             */
            *(volatile char *)addr = *addr;
            addr += hpagesize;
        }
    }

    pthread_sigmask(SIG_SETMASK, &oldset, NULL);
    return (void *)(uintptr_t)ret;
}
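
/*
 * Note: do_touch_pages() arms a sigsetjmp() target before touching each
 * page so that a SIGBUS raised while faulting pages in (e.g. when the
 * backing hugepage pool is exhausted) is caught by sigbus_handler(), which
 * siglongjmp()s back here; the failure then surfaces as -EFAULT instead of
 * crashing the process.
 */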

static void *do_madv_populate_write_pages(void *arg)
{
    MemsetThread *memset_args = (MemsetThread *)arg;
    const size_t size = memset_args->numpages * memset_args->hpagesize;
    char * const addr = memset_args->addr;
    int ret = 0;

    /* See do_touch_pages(). */
    qemu_mutex_lock(&page_mutex);
    while (!memset_args->context->all_threads_created) {
        qemu_cond_wait(&page_cond, &page_mutex);
    }
    qemu_mutex_unlock(&page_mutex);

    if (size && qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE)) {
        ret = -errno;
    }
    return (void *)(uintptr_t)ret;
}

static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
                                         int max_threads)
{
    long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
    int ret = 1;

    if (host_procs > 0) {
        ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), max_threads);
    }

    /* Especially with gigantic pages, don't create more threads than pages. */
    ret = MIN(ret, numpages);
    /* Don't start threads to prealloc comparatively little memory. */
    ret = MIN(ret, MAX(1, hpagesize * numpages / (64 * MiB)));

    /* In case sysconf() fails, we fall back to single threaded */
    return ret;
}
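
/*
 * Worked example (hypothetical numbers): preallocating 2 GiB backed by
 * 2 MiB hugepages (numpages = 1024) on an 8-CPU host with max_threads = 16
 * gives MIN(MIN(8, 16), 16) = 8, then MIN(8, 1024) = 8, and the small-job
 * clamp MIN(8, MAX(1, 2048 MiB / 64 MiB)) = MIN(8, 32) still leaves 8
 * threads; with only 64 MiB to preallocate, the same clamp would reduce
 * the count to a single thread.
 */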

static int wait_and_free_mem_prealloc_context(MemsetContext *context)
{
    int i, ret = 0, tmp;

    for (i = 0; i < context->num_threads; i++) {
        tmp = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread);

        if (tmp) {
            ret = tmp;
        }
    }
    g_free(context->threads);
    g_free(context);
    return ret;
}

static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
                           int max_threads, ThreadContext *tc, bool async,
                           bool use_madv_populate_write)
{
    static gsize initialized = 0;
    MemsetContext *context = g_malloc0(sizeof(MemsetContext));
    size_t numpages_per_thread, leftover;
    void *(*touch_fn)(void *);
    int ret, i = 0;
    char *addr = area;

    /*
     * Asynchronous preallocation is only allowed when using MADV_POPULATE_WRITE
     * and prealloc context for thread placement.
     */
    if (!use_madv_populate_write || !tc) {
        async = false;
    }

    context->num_threads =
        get_memset_num_threads(hpagesize, numpages, max_threads);

    if (g_once_init_enter(&initialized)) {
        qemu_mutex_init(&page_mutex);
        qemu_cond_init(&page_cond);
        g_once_init_leave(&initialized, 1);
    }

    if (use_madv_populate_write) {
        /*
         * Avoid creating a single thread for MADV_POPULATE_WRITE when
         * preallocating synchronously.
         */
        if (context->num_threads == 1 && !async) {
            ret = 0;
            if (qemu_madvise(area, hpagesize * numpages,
                             QEMU_MADV_POPULATE_WRITE)) {
                ret = -errno;
            }
            g_free(context);
            return ret;
        }
        touch_fn = do_madv_populate_write_pages;
    } else {
        touch_fn = do_touch_pages;
    }

    context->threads = g_new0(MemsetThread, context->num_threads);
    numpages_per_thread = numpages / context->num_threads;
    leftover = numpages % context->num_threads;
    for (i = 0; i < context->num_threads; i++) {
        context->threads[i].addr = addr;
        context->threads[i].numpages = numpages_per_thread + (i < leftover);
        context->threads[i].hpagesize = hpagesize;
        context->threads[i].context = context;
        if (tc) {
            thread_context_create_thread(tc, &context->threads[i].pgthread,
                                         "touch_pages",
                                         touch_fn, &context->threads[i],
                                         QEMU_THREAD_JOINABLE);
        } else {
            qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
                               touch_fn, &context->threads[i],
                               QEMU_THREAD_JOINABLE);
        }
        addr += context->threads[i].numpages * hpagesize;
    }

    if (async) {
        /*
         * async requests currently require the BQL. Add it to the list and kick
         * preallocation off during qemu_finish_async_prealloc_mem().
         */
        assert(bql_locked());
        QLIST_INSERT_HEAD(&memset_contexts, context, next);
        return 0;
    }

    if (!use_madv_populate_write) {
        sigbus_memset_context = context;
    }

    qemu_mutex_lock(&page_mutex);
    context->all_threads_created = true;
    qemu_cond_broadcast(&page_cond);
    qemu_mutex_unlock(&page_mutex);

    ret = wait_and_free_mem_prealloc_context(context);

    if (!use_madv_populate_write) {
        sigbus_memset_context = NULL;
    }
    return ret;
}
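
/*
 * Note on the split above: every thread gets numpages / num_threads pages,
 * and the first (numpages % num_threads) threads take one extra page via
 * the "+ (i < leftover)" term, so e.g. 10 pages across 4 threads split as
 * 3/3/2/2 while the per-thread start addresses remain contiguous.
 */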

bool qemu_finish_async_prealloc_mem(Error **errp)
{
    int ret = 0, tmp;
    MemsetContext *context, *next_context;

    /* Waiting for preallocation requires the BQL. */
    assert(bql_locked());
    if (QLIST_EMPTY(&memset_contexts)) {
        return true;
    }

    qemu_mutex_lock(&page_mutex);
    QLIST_FOREACH(context, &memset_contexts, next) {
        context->all_threads_created = true;
    }
    qemu_cond_broadcast(&page_cond);
    qemu_mutex_unlock(&page_mutex);

    QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
        QLIST_REMOVE(context, next);
        tmp = wait_and_free_mem_prealloc_context(context);
        if (tmp) {
            ret = tmp;
        }
    }

    if (ret) {
        error_setg_errno(errp, -ret,
                         "qemu_prealloc_mem: preallocating memory failed");
        return false;
    }
    return true;
}
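
/*
 * Note: for async requests, touch_all_pages() only queues a context on
 * memset_contexts with its worker threads parked on page_cond. The
 * function above releases all of them with one broadcast, then joins and
 * frees every queued context, reporting the last nonzero thread result if
 * any preallocation failed.
 */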

static bool madv_populate_write_possible(char *area, size_t pagesize)
{
    return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
           errno != EINVAL;
}

bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
                       ThreadContext *tc, bool async, Error **errp)
{
    static gsize initialized;
    int ret;
    size_t hpagesize = qemu_fd_getpagesize(fd);
    size_t numpages = DIV_ROUND_UP(sz, hpagesize);
    bool use_madv_populate_write;
    struct sigaction act;
    bool rv = true;

    /*
     * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
     * some special mappings, such as mapping /dev/mem.
     */
    use_madv_populate_write = madv_populate_write_possible(area, hpagesize);

    if (!use_madv_populate_write) {
        if (g_once_init_enter(&initialized)) {
            qemu_mutex_init(&sigbus_mutex);
            g_once_init_leave(&initialized, 1);
        }

        qemu_mutex_lock(&sigbus_mutex);
        memset(&act, 0, sizeof(act));
#ifdef CONFIG_LINUX
        act.sa_sigaction = &sigbus_handler;
        act.sa_flags = SA_SIGINFO;
#else /* CONFIG_LINUX */
        act.sa_handler = &sigbus_handler;
        act.sa_flags = 0;
#endif /* CONFIG_LINUX */

        ret = sigaction(SIGBUS, &act, &sigbus_oldact);
        if (ret) {
            qemu_mutex_unlock(&sigbus_mutex);
            error_setg_errno(errp, errno,
                "qemu_prealloc_mem: failed to install signal handler");
            return false;
        }
    }

    /* touch pages simultaneously */
    ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async,
                          use_madv_populate_write);
    if (ret) {
        error_setg_errno(errp, -ret,
                         "qemu_prealloc_mem: preallocating memory failed");
        rv = false;
    }

    if (!use_madv_populate_write) {
        ret = sigaction(SIGBUS, &sigbus_oldact, NULL);
        if (ret) {
            /* Terminate QEMU since it can't recover from error */
            perror("qemu_prealloc_mem: failed to reinstall signal handler");
            exit(1);
        }
        qemu_mutex_unlock(&sigbus_mutex);
    }
    return rv;
}
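
/*
 * A minimal caller sketch (hypothetical values, synchronous mode):
 *
 *     Error *err = NULL;
 *     if (!qemu_prealloc_mem(fd, area, size, 4, NULL, false, &err)) {
 *         error_report_err(err);
 *     }
 *
 * With tc == NULL the workers are plain qemu_thread_create() threads.
 * Because asynchronous preallocation requires both MADV_POPULATE_WRITE
 * support and a ThreadContext, passing async = true in this sketch would
 * silently degrade to synchronous preallocation.
 */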

char *qemu_get_pid_name(pid_t pid)
{
    char *name = NULL;

#if defined(__FreeBSD__)
    /* BSDs don't have /proc, but they provide a nice substitute */
    struct kinfo_proc *proc = kinfo_getproc(pid);

    if (proc) {
        name = g_strdup(proc->ki_comm);
        free(proc);
    }
#else
    /* Assume a system with reasonable procfs */
    char *pid_path;
    size_t len;

    pid_path = g_strdup_printf("/proc/%d/cmdline", pid);
    g_file_get_contents(pid_path, &name, &len, NULL);
    g_free(pid_path);
#endif

    return name;
}

void *qemu_alloc_stack(size_t *sz)
{
    void *ptr;
    int flags;
#ifdef CONFIG_DEBUG_STACK_USAGE
    void *ptr2;
#endif
    size_t pagesz = qemu_real_host_page_size();
#ifdef _SC_THREAD_STACK_MIN
    /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
    long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
    *sz = MAX(MAX(min_stack_sz, 0), *sz);
#endif
    /* adjust stack size to a multiple of the page size */
    *sz = ROUND_UP(*sz, pagesz);
    /* allocate one extra page for the guard page */
    *sz += pagesz;

    flags = MAP_PRIVATE | MAP_ANONYMOUS;
#if defined(MAP_STACK) && defined(__OpenBSD__)
    /* Only enable MAP_STACK on OpenBSD. Other OSes such as
     * Linux/FreeBSD/NetBSD have a flag with the same name
     * but with differing functionality. OpenBSD will SEGV
     * if it spots execution with a stack pointer pointing
     * at memory that was not allocated with MAP_STACK.
     */
    flags |= MAP_STACK;
#endif

    ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
    if (ptr == MAP_FAILED) {
        perror("failed to allocate memory for stack");
        abort();
    }

    /* Stack grows down -- guard page at the bottom. */
    if (mprotect(ptr, pagesz, PROT_NONE) != 0) {
        perror("failed to set up stack guard page");
        abort();
    }

#ifdef CONFIG_DEBUG_STACK_USAGE
    for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
        *(uint32_t *)ptr2 = 0xdeadbeaf;
    }
#endif

    return ptr;
}
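
/*
 * Resulting layout (stacks grow down on all supported hosts):
 *
 *     ptr              ptr + pagesz                    ptr + *sz
 *      | guard (PROT_NONE) | usable stack ...              |
 *
 * Callers should start the stack pointer at ptr + *sz; an overflow of up
 * to one page then faults on the guard page instead of silently corrupting
 * the neighbouring mapping.
 */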

#ifdef CONFIG_DEBUG_STACK_USAGE
static __thread unsigned int max_stack_usage;
#endif

void qemu_free_stack(void *stack, size_t sz)
{
#ifdef CONFIG_DEBUG_STACK_USAGE
    unsigned int usage;
    void *ptr;

    for (ptr = stack + qemu_real_host_page_size(); ptr < stack + sz;
         ptr += sizeof(uint32_t)) {
        if (*(uint32_t *)ptr != 0xdeadbeaf) {
            break;
        }
    }
    usage = sz - (uintptr_t) (ptr - stack);
    if (usage > max_stack_usage) {
        error_report("thread %d max stack usage increased from %u to %u",
                     qemu_get_thread_id(), max_stack_usage, usage);
        max_stack_usage = usage;
    }
#endif

    munmap(stack, sz);
}

/*
 * Disable CFI checks.
 * We are going to call a signal handler directly. Such handler may or may not
 * have been defined in our binary, so there's no guarantee that the pointer
 * used to set the handler is a cfi-valid pointer. Since the handlers are
 * stored in kernel memory, changing the handler to an attacker-defined
 * function requires being able to call a sigaction() syscall,
 * which is not as easy as overwriting a pointer in memory.
 */
QEMU_DISABLE_CFI
void sigaction_invoke(struct sigaction *action,
                      struct qemu_signalfd_siginfo *info)
{
    siginfo_t si = {};
    si.si_signo = info->ssi_signo;
    si.si_errno = info->ssi_errno;
    si.si_code = info->ssi_code;

    /* Convert the minimal set of fields defined by POSIX.
     * Positive si_code values are reserved for kernel-generated
     * signals, where the valid siginfo fields are determined by
     * the signal number. But according to POSIX, it is unspecified
     * whether SI_USER and SI_QUEUE have values less than or equal to
     * zero.
     */
    if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
        info->ssi_code <= 0) {
        /* SIGTERM, etc. */
        si.si_pid = info->ssi_pid;
        si.si_uid = info->ssi_uid;
    } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
               info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
        si.si_addr = (void *)(uintptr_t)info->ssi_addr;
    } else if (info->ssi_signo == SIGCHLD) {
        si.si_pid = info->ssi_pid;
        si.si_status = info->ssi_status;
        si.si_uid = info->ssi_uid;
    }
    action->sa_sigaction(info->ssi_signo, &si, NULL);
}

size_t qemu_get_host_physmem(void)
{
#ifdef _SC_PHYS_PAGES
    long pages = sysconf(_SC_PHYS_PAGES);

    if (pages > 0) {
        if (pages > SIZE_MAX / qemu_real_host_page_size()) {
            return SIZE_MAX;
        } else {
            return pages * qemu_real_host_page_size();
        }
    }
#endif
    return 0;
}

int qemu_msync(void *addr, size_t length, int fd)
{
    size_t align_mask = ~(qemu_real_host_page_size() - 1);

    /**
     * There are no strict requirements on the length of the mapping to
     * be synced, but the length must absorb the offset lost when the
     * address is aligned down below. Additionally, round the size up to
     * a multiple of the host page size.
     */
    length += ((uintptr_t)addr & (qemu_real_host_page_size() - 1));
    length = (length + ~align_mask) & align_mask;

    addr = (void *)((uintptr_t)addr & align_mask);

    return msync(addr, length, MS_SYNC);
}
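
/*
 * Worked example (hypothetical addresses, 4 KiB host pages): for
 * addr = 0x1234 and length = 0x100, the code grows length to
 * 0x100 + 0x234 = 0x334, rounds it up to 0x1000, and rounds addr down to
 * 0x1000, so the msync() covers the entire page containing the original
 * range.
 */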

static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
{
    struct dirent *de;
    int fd, dfd;
    DIR *dir;
    unsigned int skip_start = 0, skip_end = nskip;

    dir = opendir("/proc/self/fd");
    if (!dir) {
        /* If /proc is not mounted, there is nothing that can be done. */
        return false;
    }
    /* Avoid closing the directory. */
    dfd = dirfd(dir);

    for (de = readdir(dir); de; de = readdir(dir)) {
        bool close_fd = true;

        if (de->d_name[0] == '.') {
            continue;
        }
        fd = atoi(de->d_name);
        if (fd == dfd) {
            continue;
        }

        for (unsigned int i = skip_start; i < skip_end; i++) {
            if (fd < skip[i]) {
                /* We are below the next skipped fd, break */
                break;
            } else if (fd == skip[i]) {
                close_fd = false;
                /*
                 * Restrict the range as we found fds matching start/end.
                 * (Note: the loop condition guarantees i < skip_end, so the
                 * end-of-window test must compare against skip_end - 1.)
                 */
                if (i == skip_start) {
                    skip_start++;
                } else if (i == skip_end - 1) {
                    skip_end--;
                }
                break;
            }
        }

        if (close_fd) {
            close(fd);
        }
    }
    closedir(dir);

    return true;
}
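
/*
 * Note: the skip list must be sorted in ascending order for the early
 * "fd < skip[i]" break to be valid. The [skip_start, skip_end) window only
 * ever shrinks from its ends, so fds already matched at a boundary are not
 * compared again for later directory entries.
 */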

static bool qemu_close_all_open_fd_close_range(const int *skip,
                                               unsigned int nskip,
                                               int open_max)
{
#ifdef CONFIG_CLOSE_RANGE
    int max_fd = open_max - 1;
    int first = 0, last;
    unsigned int cur_skip = 0;
    int ret;

    do {
        /* Find the start boundary of the range to close */
        while (cur_skip < nskip && first == skip[cur_skip]) {
            cur_skip++;
            first++;
        }

        /* Find the upper boundary of the range to close */
        last = max_fd;
        if (cur_skip < nskip) {
            last = skip[cur_skip] - 1;
            last = MIN(last, max_fd);
        }

        /* With the adjustments to the range, we might be done. */
        if (first > last) {
            break;
        }

        ret = close_range(first, last, 0);
        if (ret < 0) {
            return false;
        }

        first = last + 1;
    } while (last < max_fd);

    return true;
#else
    return false;
#endif
}
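
/*
 * Worked example (hypothetical): with open_max = 1024 and skip = {3, 7},
 * the loop issues close_range(0, 2, 0), close_range(4, 6, 0) and
 * close_range(8, 1023, 0), i.e. it closes the gaps between the sorted
 * skipped fds in at most nskip + 1 syscalls.
 */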

static void qemu_close_all_open_fd_fallback(const int *skip, unsigned int nskip,
                                            int open_max)
{
    unsigned int cur_skip = 0;

    /* Fallback */
    for (int i = 0; i < open_max; i++) {
        if (cur_skip < nskip && i == skip[cur_skip]) {
            cur_skip++;
            continue;
        }
        close(i);
    }
}

/*
 * Close all open file descriptors.
 */
void qemu_close_all_open_fd(const int *skip, unsigned int nskip)
{
    int open_max = sysconf(_SC_OPEN_MAX);

    assert(skip != NULL || nskip == 0);

    if (!qemu_close_all_open_fd_close_range(skip, nskip, open_max) &&
        !qemu_close_all_open_fd_proc(skip, nskip)) {
        qemu_close_all_open_fd_fallback(skip, nskip, open_max);
    }
}