libvhost-user.c

  1. /*
  2. * Vhost User library
  3. *
  4. * Copyright IBM, Corp. 2007
  5. * Copyright (c) 2016 Red Hat, Inc.
  6. *
  7. * Authors:
  8. * Anthony Liguori <aliguori@us.ibm.com>
  9. * Marc-André Lureau <mlureau@redhat.com>
  10. * Victor Kaplansky <victork@redhat.com>
  11. *
  12. * This work is licensed under the terms of the GNU GPL, version 2 or
  13. * later. See the COPYING file in the top-level directory.
  14. */
  15. /* this code avoids GLib dependency */
  16. #include <stdlib.h>
  17. #include <stdio.h>
  18. #include <unistd.h>
  19. #include <stdarg.h>
  20. #include <errno.h>
  21. #include <string.h>
  22. #include <assert.h>
  23. #include <inttypes.h>
  24. #include <sys/types.h>
  25. #include <sys/socket.h>
  26. #include <sys/eventfd.h>
  27. #include <sys/mman.h>
  28. #include <endian.h>
  29. #if defined(__linux__)
  30. #include <sys/syscall.h>
  31. #include <fcntl.h>
  32. #include <sys/ioctl.h>
  33. #include <linux/vhost.h>
  34. #ifdef __NR_userfaultfd
  35. #include <linux/userfaultfd.h>
  36. #endif
  37. #endif
  38. #include "include/atomic.h"
  39. #include "libvhost-user.h"
  40. /* usually provided by GLib */
  41. #ifndef MIN
  42. #define MIN(x, y) ({ \
  43. typeof(x) _min1 = (x); \
  44. typeof(y) _min2 = (y); \
  45. (void) (&_min1 == &_min2); \
  46. _min1 < _min2 ? _min1 : _min2; })
  47. #endif
  48. /* Round number down to multiple */
  49. #define ALIGN_DOWN(n, m) ((n) / (m) * (m))
  50. /* Round number up to multiple */
  51. #define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
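/*
 * Worked example (editorial sketch, not from the original source): for
 * n = 10 and m = 4, ALIGN_DOWN(10, 4) == 8 and ALIGN_UP(10, 4) == 12.
 * The checks below are compile-time only and generate no code.
 */
_Static_assert(ALIGN_DOWN(10, 4) == 8, "rounds down to a multiple of m");
_Static_assert(ALIGN_UP(10, 4) == 12, "rounds up to a multiple of m");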
  52. #ifndef unlikely
  53. #define unlikely(x) __builtin_expect(!!(x), 0)
  54. #endif
  56. /* Align each region in the inflight buffer to the cache line size */
  56. #define INFLIGHT_ALIGNMENT 64
  57. /* The version of inflight buffer */
  58. #define INFLIGHT_VERSION 1
  59. /* The version of the protocol we support */
  60. #define VHOST_USER_VERSION 1
  61. #define LIBVHOST_USER_DEBUG 0
  62. #define DPRINT(...) \
  63. do { \
  64. if (LIBVHOST_USER_DEBUG) { \
  65. fprintf(stderr, __VA_ARGS__); \
  66. } \
  67. } while (0)
  68. static inline
  69. bool has_feature(uint64_t features, unsigned int fbit)
  70. {
  71. assert(fbit < 64);
  72. return !!(features & (1ULL << fbit));
  73. }
  74. static inline
  75. bool vu_has_feature(VuDev *dev,
  76. unsigned int fbit)
  77. {
  78. return has_feature(dev->features, fbit);
  79. }
  80. static inline bool vu_has_protocol_feature(VuDev *dev, unsigned int fbit)
  81. {
  82. return has_feature(dev->protocol_features, fbit);
  83. }
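/*
 * Usage sketch (assumed backend code, not from this file): feature bits are
 * passed as plain bit indices; the 1ULL shift in has_feature() keeps bit
 * indices above 31 well-defined.
 */
static inline bool
example_uses_event_idx(VuDev *dev)
{
    return vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX);
}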
  84. static const char *
  85. vu_request_to_string(unsigned int req)
  86. {
  87. #define REQ(req) [req] = #req
  88. static const char *vu_request_str[] = {
  89. REQ(VHOST_USER_NONE),
  90. REQ(VHOST_USER_GET_FEATURES),
  91. REQ(VHOST_USER_SET_FEATURES),
  92. REQ(VHOST_USER_SET_OWNER),
  93. REQ(VHOST_USER_RESET_OWNER),
  94. REQ(VHOST_USER_SET_MEM_TABLE),
  95. REQ(VHOST_USER_SET_LOG_BASE),
  96. REQ(VHOST_USER_SET_LOG_FD),
  97. REQ(VHOST_USER_SET_VRING_NUM),
  98. REQ(VHOST_USER_SET_VRING_ADDR),
  99. REQ(VHOST_USER_SET_VRING_BASE),
  100. REQ(VHOST_USER_GET_VRING_BASE),
  101. REQ(VHOST_USER_SET_VRING_KICK),
  102. REQ(VHOST_USER_SET_VRING_CALL),
  103. REQ(VHOST_USER_SET_VRING_ERR),
  104. REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
  105. REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
  106. REQ(VHOST_USER_GET_QUEUE_NUM),
  107. REQ(VHOST_USER_SET_VRING_ENABLE),
  108. REQ(VHOST_USER_SEND_RARP),
  109. REQ(VHOST_USER_NET_SET_MTU),
  110. REQ(VHOST_USER_SET_SLAVE_REQ_FD),
  111. REQ(VHOST_USER_IOTLB_MSG),
  112. REQ(VHOST_USER_SET_VRING_ENDIAN),
  113. REQ(VHOST_USER_GET_CONFIG),
  114. REQ(VHOST_USER_SET_CONFIG),
  115. REQ(VHOST_USER_POSTCOPY_ADVISE),
  116. REQ(VHOST_USER_POSTCOPY_LISTEN),
  117. REQ(VHOST_USER_POSTCOPY_END),
  118. REQ(VHOST_USER_GET_INFLIGHT_FD),
  119. REQ(VHOST_USER_SET_INFLIGHT_FD),
  120. REQ(VHOST_USER_GPU_SET_SOCKET),
  121. REQ(VHOST_USER_VRING_KICK),
  122. REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
  123. REQ(VHOST_USER_ADD_MEM_REG),
  124. REQ(VHOST_USER_REM_MEM_REG),
  125. REQ(VHOST_USER_MAX),
  126. };
  127. #undef REQ
  128. if (req < VHOST_USER_MAX) {
  129. return vu_request_str[req];
  130. } else {
  131. return "unknown";
  132. }
  133. }
  134. static void
  135. vu_panic(VuDev *dev, const char *msg, ...)
  136. {
  137. char *buf = NULL;
  138. va_list ap;
  139. va_start(ap, msg);
  140. if (vasprintf(&buf, msg, ap) < 0) {
  141. buf = NULL;
  142. }
  143. va_end(ap);
  144. dev->broken = true;
  145. dev->panic(dev, buf);
  146. free(buf);
  147. /*
  148. * FIXME:
  149. * find a way to call virtio_error, or perhaps close the connection?
  150. */
  151. }
  152. /* Translate guest physical address to our virtual address. */
  153. void *
  154. vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
  155. {
  156. int i;
  157. if (*plen == 0) {
  158. return NULL;
  159. }
  160. /* Find matching memory region. */
  161. for (i = 0; i < dev->nregions; i++) {
  162. VuDevRegion *r = &dev->regions[i];
  163. if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
  164. if ((guest_addr + *plen) > (r->gpa + r->size)) {
  165. *plen = r->gpa + r->size - guest_addr;
  166. }
  167. return (void *)(uintptr_t)
  168. guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
  169. }
  170. }
  171. return NULL;
  172. }
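/*
 * Usage sketch (assumed caller, not from this file): copy bytes out of guest
 * memory, looping because vu_gpa_to_va() clamps *plen at a region boundary.
 */
static void
example_copy_from_guest(VuDev *dev, void *dst, uint64_t guest_addr,
                        uint64_t len)
{
    uint8_t *out = dst;

    while (len) {
        uint64_t chunk = len;
        void *src = vu_gpa_to_va(dev, &chunk, guest_addr);

        if (!src) {
            vu_panic(dev, "bad guest address 0x%" PRIx64, guest_addr);
            return;
        }
        memcpy(out, src, chunk);
        out += chunk;
        guest_addr += chunk;
        len -= chunk;
    }
}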
  173. /* Translate qemu virtual address to our virtual address. */
  174. static void *
  175. qva_to_va(VuDev *dev, uint64_t qemu_addr)
  176. {
  177. int i;
  178. /* Find matching memory region. */
  179. for (i = 0; i < dev->nregions; i++) {
  180. VuDevRegion *r = &dev->regions[i];
  181. if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
  182. return (void *)(uintptr_t)
  183. qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
  184. }
  185. }
  186. return NULL;
  187. }
  188. static void
  189. vmsg_close_fds(VhostUserMsg *vmsg)
  190. {
  191. int i;
  192. for (i = 0; i < vmsg->fd_num; i++) {
  193. close(vmsg->fds[i]);
  194. }
  195. }
  196. /* Set reply payload.u64 and clear request flags and fd_num */
  197. static void vmsg_set_reply_u64(VhostUserMsg *vmsg, uint64_t val)
  198. {
  199. vmsg->flags = 0; /* defaults will be set by vu_send_reply() */
  200. vmsg->size = sizeof(vmsg->payload.u64);
  201. vmsg->payload.u64 = val;
  202. vmsg->fd_num = 0;
  203. }
  204. /* A test to see if we have userfault available */
  205. static bool
  206. have_userfault(void)
  207. {
  208. #if defined(__linux__) && defined(__NR_userfaultfd) &&\
  209. defined(UFFD_FEATURE_MISSING_SHMEM) &&\
  210. defined(UFFD_FEATURE_MISSING_HUGETLBFS)
  211. /* Now test that the kernel we're running on really has the features */
  212. int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
  213. struct uffdio_api api_struct;
  214. if (ufd < 0) {
  215. return false;
  216. }
  217. api_struct.api = UFFD_API;
  218. api_struct.features = UFFD_FEATURE_MISSING_SHMEM |
  219. UFFD_FEATURE_MISSING_HUGETLBFS;
  220. if (ioctl(ufd, UFFDIO_API, &api_struct)) {
  221. close(ufd);
  222. return false;
  223. }
  224. close(ufd);
  225. return true;
  226. #else
  227. return false;
  228. #endif
  229. }
  230. static bool
  231. vu_message_read_default(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
  232. {
  233. char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
  234. struct iovec iov = {
  235. .iov_base = (char *)vmsg,
  236. .iov_len = VHOST_USER_HDR_SIZE,
  237. };
  238. struct msghdr msg = {
  239. .msg_iov = &iov,
  240. .msg_iovlen = 1,
  241. .msg_control = control,
  242. .msg_controllen = sizeof(control),
  243. };
  244. size_t fd_size;
  245. struct cmsghdr *cmsg;
  246. int rc;
  247. do {
  248. rc = recvmsg(conn_fd, &msg, 0);
  249. } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
  250. if (rc < 0) {
  251. vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
  252. return false;
  253. }
  254. vmsg->fd_num = 0;
  255. for (cmsg = CMSG_FIRSTHDR(&msg);
  256. cmsg != NULL;
  257. cmsg = CMSG_NXTHDR(&msg, cmsg))
  258. {
  259. if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
  260. fd_size = cmsg->cmsg_len - CMSG_LEN(0);
  261. vmsg->fd_num = fd_size / sizeof(int);
  262. memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
  263. break;
  264. }
  265. }
  266. if (vmsg->size > sizeof(vmsg->payload)) {
  267. vu_panic(dev,
  268. "Error: too big message request: %d, size: vmsg->size: %u, "
  269. "while sizeof(vmsg->payload) = %zu\n",
  270. vmsg->request, vmsg->size, sizeof(vmsg->payload));
  271. goto fail;
  272. }
  273. if (vmsg->size) {
  274. do {
  275. rc = read(conn_fd, &vmsg->payload, vmsg->size);
  276. } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
  277. if (rc <= 0) {
  278. vu_panic(dev, "Error while reading: %s", strerror(errno));
  279. goto fail;
  280. }
  281. assert(rc == vmsg->size);
  282. }
  283. return true;
  284. fail:
  285. vmsg_close_fds(vmsg);
  286. return false;
  287. }
  288. static bool
  289. vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
  290. {
  291. int rc;
  292. uint8_t *p = (uint8_t *)vmsg;
  293. char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
  294. struct iovec iov = {
  295. .iov_base = (char *)vmsg,
  296. .iov_len = VHOST_USER_HDR_SIZE,
  297. };
  298. struct msghdr msg = {
  299. .msg_iov = &iov,
  300. .msg_iovlen = 1,
  301. .msg_control = control,
  302. };
  303. struct cmsghdr *cmsg;
  304. memset(control, 0, sizeof(control));
  305. assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS);
  306. if (vmsg->fd_num > 0) {
  307. size_t fdsize = vmsg->fd_num * sizeof(int);
  308. msg.msg_controllen = CMSG_SPACE(fdsize);
  309. cmsg = CMSG_FIRSTHDR(&msg);
  310. cmsg->cmsg_len = CMSG_LEN(fdsize);
  311. cmsg->cmsg_level = SOL_SOCKET;
  312. cmsg->cmsg_type = SCM_RIGHTS;
  313. memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
  314. } else {
  315. msg.msg_controllen = 0;
  316. }
  317. do {
  318. rc = sendmsg(conn_fd, &msg, 0);
  319. } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
  320. if (vmsg->size) {
  321. do {
  322. if (vmsg->data) {
  323. rc = write(conn_fd, vmsg->data, vmsg->size);
  324. } else {
  325. rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
  326. }
  327. } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
  328. }
  329. if (rc <= 0) {
  330. vu_panic(dev, "Error while writing: %s", strerror(errno));
  331. return false;
  332. }
  333. return true;
  334. }
  335. static bool
  336. vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
  337. {
  338. /* Set the version in the flags when sending the reply */
  339. vmsg->flags &= ~VHOST_USER_VERSION_MASK;
  340. vmsg->flags |= VHOST_USER_VERSION;
  341. vmsg->flags |= VHOST_USER_REPLY_MASK;
  342. return vu_message_write(dev, conn_fd, vmsg);
  343. }
  344. /*
  345. * Processes a reply on the slave channel.
  346. * Entered with slave_mutex held and releases it before exit.
  347. * Returns true on success.
  348. */
  349. static bool
  350. vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg)
  351. {
  352. VhostUserMsg msg_reply;
  353. bool result = false;
  354. if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
  355. result = true;
  356. goto out;
  357. }
  358. if (!vu_message_read_default(dev, dev->slave_fd, &msg_reply)) {
  359. goto out;
  360. }
  361. if (msg_reply.request != vmsg->request) {
  362. DPRINT("Received unexpected msg type. Expected %d received %d",
  363. vmsg->request, msg_reply.request);
  364. goto out;
  365. }
  366. result = msg_reply.payload.u64 == 0;
  367. out:
  368. pthread_mutex_unlock(&dev->slave_mutex);
  369. return result;
  370. }
  371. /* Kick the log_call_fd if required. */
  372. static void
  373. vu_log_kick(VuDev *dev)
  374. {
  375. if (dev->log_call_fd != -1) {
  376. DPRINT("Kicking the QEMU's log...\n");
  377. if (eventfd_write(dev->log_call_fd, 1) < 0) {
  378. vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
  379. }
  380. }
  381. }
  382. static void
  383. vu_log_page(uint8_t *log_table, uint64_t page)
  384. {
  385. DPRINT("Logged dirty guest page: %"PRId64"\n", page);
  386. qatomic_or(&log_table[page / 8], 1 << (page % 8));
  387. }
  388. static void
  389. vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
  390. {
  391. uint64_t page;
  392. if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
  393. !dev->log_table || !length) {
  394. return;
  395. }
  396. assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));
  397. page = address / VHOST_LOG_PAGE;
  398. while (page * VHOST_LOG_PAGE < address + length) {
  399. vu_log_page(dev->log_table, page);
  400. page += 1;
  401. }
  402. vu_log_kick(dev);
  403. }
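/*
 * Worked example (assumed values, not from this file): with the usual
 * 4 KiB VHOST_LOG_PAGE, a 0x2000-byte write at guest physical address
 * 0x5400 covers pages 5, 6 and 7, so bits 5..7 of log_table[0] are set and
 * the log eventfd is kicked once (provided VHOST_F_LOG_ALL was negotiated
 * and a log table is mapped).
 */
static void
example_log_guest_write(VuDev *dev)
{
    vu_log_write(dev, 0x5400, 0x2000);
}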
  404. static void
  405. vu_kick_cb(VuDev *dev, int condition, void *data)
  406. {
  407. int index = (intptr_t)data;
  408. VuVirtq *vq = &dev->vq[index];
  409. int sock = vq->kick_fd;
  410. eventfd_t kick_data;
  411. ssize_t rc;
  412. rc = eventfd_read(sock, &kick_data);
  413. if (rc == -1) {
  414. vu_panic(dev, "kick eventfd_read(): %s", strerror(errno));
  415. dev->remove_watch(dev, dev->vq[index].kick_fd);
  416. } else {
  417. DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n",
  418. kick_data, vq->handler, index);
  419. if (vq->handler) {
  420. vq->handler(dev, index);
  421. }
  422. }
  423. }
  424. static bool
  425. vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
  426. {
  427. vmsg->payload.u64 =
  428. /*
  429. * The following VIRTIO feature bits are supported by our virtqueue
  430. * implementation:
  431. */
  432. 1ULL << VIRTIO_F_NOTIFY_ON_EMPTY |
  433. 1ULL << VIRTIO_RING_F_INDIRECT_DESC |
  434. 1ULL << VIRTIO_RING_F_EVENT_IDX |
  435. 1ULL << VIRTIO_F_VERSION_1 |
  436. /* vhost-user feature bits */
  437. 1ULL << VHOST_F_LOG_ALL |
  438. 1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
  439. if (dev->iface->get_features) {
  440. vmsg->payload.u64 |= dev->iface->get_features(dev);
  441. }
  442. vmsg->size = sizeof(vmsg->payload.u64);
  443. vmsg->fd_num = 0;
  444. DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
  445. return true;
  446. }
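/*
 * Device-side sketch (assumed example, not from this file): a backend adds
 * its own feature bits via the get_features callback; the bit used here is
 * only a placeholder for a device-specific feature.
 */
static uint64_t
example_iface_get_features(VuDev *dev)
{
    /* EXAMPLE_DEVICE_FEATURE_BIT is a hypothetical device-specific bit */
    enum { EXAMPLE_DEVICE_FEATURE_BIT = 0 };

    (void)dev;
    return 1ULL << EXAMPLE_DEVICE_FEATURE_BIT;
}
/*
 * wired up as:  static const VuDevIface example_iface = {
 *                   .get_features = example_iface_get_features,
 *                   ...
 *               };
 */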
  447. static void
  448. vu_set_enable_all_rings(VuDev *dev, bool enabled)
  449. {
  450. uint16_t i;
  451. for (i = 0; i < dev->max_queues; i++) {
  452. dev->vq[i].enable = enabled;
  453. }
  454. }
  455. static bool
  456. vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg)
  457. {
  458. DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
  459. dev->features = vmsg->payload.u64;
  460. if (!vu_has_feature(dev, VIRTIO_F_VERSION_1)) {
  461. /*
  462. * We only support devices conforming to VIRTIO 1.0 or
  463. * later
  464. */
  465. vu_panic(dev, "virtio legacy devices aren't supported by libvhost-user");
  466. return false;
  467. }
  468. if (!vu_has_feature(dev, VHOST_USER_F_PROTOCOL_FEATURES)) {
  469. vu_set_enable_all_rings(dev, true);
  470. }
  471. if (dev->iface->set_features) {
  472. dev->iface->set_features(dev, dev->features);
  473. }
  474. return false;
  475. }
  476. static bool
  477. vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg)
  478. {
  479. return false;
  480. }
  481. static void
  482. vu_close_log(VuDev *dev)
  483. {
  484. if (dev->log_table) {
  485. if (munmap(dev->log_table, dev->log_size) != 0) {
  486. perror("close log munmap() error");
  487. }
  488. dev->log_table = NULL;
  489. }
  490. if (dev->log_call_fd != -1) {
  491. close(dev->log_call_fd);
  492. dev->log_call_fd = -1;
  493. }
  494. }
  495. static bool
  496. vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
  497. {
  498. vu_set_enable_all_rings(dev, false);
  499. return false;
  500. }
  501. static bool
  502. map_ring(VuDev *dev, VuVirtq *vq)
  503. {
  504. vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr);
  505. vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr);
  506. vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr);
  507. DPRINT("Setting virtq addresses:\n");
  508. DPRINT(" vring_desc at %p\n", vq->vring.desc);
  509. DPRINT(" vring_used at %p\n", vq->vring.used);
  510. DPRINT(" vring_avail at %p\n", vq->vring.avail);
  511. return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
  512. }
  513. static bool
  514. generate_faults(VuDev *dev) {
  515. int i;
  516. for (i = 0; i < dev->nregions; i++) {
  517. VuDevRegion *dev_region = &dev->regions[i];
  518. int ret;
  519. #ifdef UFFDIO_REGISTER
  520. /*
  521. * We should already have an open ufd; register each memory
  522. * range with it.
  523. * Discard any mapping we have here; note I can't use MADV_REMOVE
  524. * or fallocate to make the hole since I don't want to lose
  525. * data that's already arrived in the shared process.
  526. * TODO: How to handle hugepages
  527. */
  528. ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
  529. dev_region->size + dev_region->mmap_offset,
  530. MADV_DONTNEED);
  531. if (ret) {
  532. fprintf(stderr,
  533. "%s: Failed to madvise(DONTNEED) region %d: %s\n",
  534. __func__, i, strerror(errno));
  535. }
  536. /*
  537. * Turn off transparent hugepages so we don't get lost wakeups
  538. * in neighbouring pages.
  539. * TODO: Turn this back on later.
  540. */
  541. ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
  542. dev_region->size + dev_region->mmap_offset,
  543. MADV_NOHUGEPAGE);
  544. if (ret) {
  545. /*
  546. * Note: This can happen legally on kernels that are configured
  547. * without madvise'able hugepages
  548. */
  549. fprintf(stderr,
  550. "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
  551. __func__, i, strerror(errno));
  552. }
  553. struct uffdio_register reg_struct;
  554. reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
  555. reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
  556. reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
  557. if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
  558. vu_panic(dev, "%s: Failed to userfault region %d "
  559. "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
  560. __func__, i,
  561. dev_region->mmap_addr,
  562. dev_region->size, dev_region->mmap_offset,
  563. dev->postcopy_ufd, strerror(errno));
  564. return false;
  565. }
  566. if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
  567. vu_panic(dev, "%s Region (%d) doesn't support COPY",
  568. __func__, i);
  569. return false;
  570. }
  571. DPRINT("%s: region %d: Registered userfault for %"
  572. PRIx64 " + %" PRIx64 "\n", __func__, i,
  573. (uint64_t)reg_struct.range.start,
  574. (uint64_t)reg_struct.range.len);
  575. /* Now that it's registered we can let the client at it */
  576. if (mprotect((void *)(uintptr_t)dev_region->mmap_addr,
  577. dev_region->size + dev_region->mmap_offset,
  578. PROT_READ | PROT_WRITE)) {
  579. vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
  580. i, strerror(errno));
  581. return false;
  582. }
  583. /* TODO: Stash 'zero' support flags somewhere */
  584. #endif
  585. }
  586. return true;
  587. }
  588. static bool
  589. vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
  590. int i;
  591. bool track_ramblocks = dev->postcopy_listening;
  592. VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
  593. VuDevRegion *dev_region = &dev->regions[dev->nregions];
  594. void *mmap_addr;
  595. /*
  596. * If we are in postcopy mode and we receive a u64 payload with a 0 value
  597. * we know all the postcopy client bases have been received, and we
  598. * should start generating faults.
  599. */
  600. if (track_ramblocks &&
  601. vmsg->size == sizeof(vmsg->payload.u64) &&
  602. vmsg->payload.u64 == 0) {
  603. (void)generate_faults(dev);
  604. return false;
  605. }
  606. DPRINT("Adding region: %u\n", dev->nregions);
  607. DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
  608. msg_region->guest_phys_addr);
  609. DPRINT(" memory_size: 0x%016"PRIx64"\n",
  610. msg_region->memory_size);
  611. DPRINT(" userspace_addr 0x%016"PRIx64"\n",
  612. msg_region->userspace_addr);
  613. DPRINT(" mmap_offset 0x%016"PRIx64"\n",
  614. msg_region->mmap_offset);
  615. dev_region->gpa = msg_region->guest_phys_addr;
  616. dev_region->size = msg_region->memory_size;
  617. dev_region->qva = msg_region->userspace_addr;
  618. dev_region->mmap_offset = msg_region->mmap_offset;
  619. /*
  620. * We don't use the offset argument of mmap() since the
  621. * mapped address has to be page aligned, and we use huge
  622. * pages.
  623. */
  624. if (track_ramblocks) {
  625. /*
  626. * In postcopy we're using PROT_NONE here to catch anyone
  627. * accessing it before we userfault.
  628. */
  629. mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
  630. PROT_NONE, MAP_SHARED,
  631. vmsg->fds[0], 0);
  632. } else {
  633. mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
  634. PROT_READ | PROT_WRITE, MAP_SHARED, vmsg->fds[0],
  635. 0);
  636. }
  637. if (mmap_addr == MAP_FAILED) {
  638. vu_panic(dev, "region mmap error: %s", strerror(errno));
  639. } else {
  640. dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
  641. DPRINT(" mmap_addr: 0x%016"PRIx64"\n",
  642. dev_region->mmap_addr);
  643. }
  644. close(vmsg->fds[0]);
  645. if (track_ramblocks) {
  646. /*
  647. * Return the address to QEMU so that it can translate the ufd
  648. * fault addresses back.
  649. */
  650. msg_region->userspace_addr = (uintptr_t)(mmap_addr +
  651. dev_region->mmap_offset);
  652. /* Send the message back to qemu with the addresses filled in. */
  653. vmsg->fd_num = 0;
  654. if (!vu_send_reply(dev, dev->sock, vmsg)) {
  655. vu_panic(dev, "failed to respond to add-mem-region for postcopy");
  656. return false;
  657. }
  658. DPRINT("Successfully added new region in postcopy\n");
  659. dev->nregions++;
  660. return false;
  661. } else {
  662. for (i = 0; i < dev->max_queues; i++) {
  663. if (dev->vq[i].vring.desc) {
  664. if (map_ring(dev, &dev->vq[i])) {
  665. vu_panic(dev, "remapping queue %d for new memory region",
  666. i);
  667. }
  668. }
  669. }
  670. DPRINT("Successfully added new region\n");
  671. dev->nregions++;
  672. vmsg_set_reply_u64(vmsg, 0);
  673. return true;
  674. }
  675. }
  676. static inline bool reg_equal(VuDevRegion *vudev_reg,
  677. VhostUserMemoryRegion *msg_reg)
  678. {
  679. if (vudev_reg->gpa == msg_reg->guest_phys_addr &&
  680. vudev_reg->qva == msg_reg->userspace_addr &&
  681. vudev_reg->size == msg_reg->memory_size) {
  682. return true;
  683. }
  684. return false;
  685. }
  686. static bool
  687. vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
  688. int i, j;
  689. bool found = false;
  690. VuDevRegion shadow_regions[VHOST_USER_MAX_RAM_SLOTS] = {};
  691. VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
  692. DPRINT("Removing region:\n");
  693. DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
  694. msg_region->guest_phys_addr);
  695. DPRINT(" memory_size: 0x%016"PRIx64"\n",
  696. msg_region->memory_size);
  697. DPRINT(" userspace_addr 0x%016"PRIx64"\n",
  698. msg_region->userspace_addr);
  699. DPRINT(" mmap_offset 0x%016"PRIx64"\n",
  700. msg_region->mmap_offset);
  701. for (i = 0, j = 0; i < dev->nregions; i++) {
  702. if (!reg_equal(&dev->regions[i], msg_region)) {
  703. shadow_regions[j].gpa = dev->regions[i].gpa;
  704. shadow_regions[j].size = dev->regions[i].size;
  705. shadow_regions[j].qva = dev->regions[i].qva;
  706. shadow_regions[j].mmap_offset = dev->regions[i].mmap_offset;
  707. j++;
  708. } else {
  709. found = true;
  710. VuDevRegion *r = &dev->regions[i];
  711. void *m = (void *) (uintptr_t) r->mmap_addr;
  712. if (m) {
  713. munmap(m, r->size + r->mmap_offset);
  714. }
  715. }
  716. }
  717. if (found) {
  718. memcpy(dev->regions, shadow_regions,
  719. sizeof(VuDevRegion) * VHOST_USER_MAX_RAM_SLOTS);
  720. DPRINT("Successfully removed a region\n");
  721. dev->nregions--;
  722. vmsg_set_reply_u64(vmsg, 0);
  723. } else {
  724. vu_panic(dev, "Specified region not found\n");
  725. }
  726. return true;
  727. }
  728. static bool
  729. vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
  730. {
  731. int i;
  732. VhostUserMemory m = vmsg->payload.memory, *memory = &m;
  733. dev->nregions = memory->nregions;
  734. DPRINT("Nregions: %u\n", memory->nregions);
  735. for (i = 0; i < dev->nregions; i++) {
  736. void *mmap_addr;
  737. VhostUserMemoryRegion *msg_region = &memory->regions[i];
  738. VuDevRegion *dev_region = &dev->regions[i];
  739. DPRINT("Region %d\n", i);
  740. DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
  741. msg_region->guest_phys_addr);
  742. DPRINT(" memory_size: 0x%016"PRIx64"\n",
  743. msg_region->memory_size);
  744. DPRINT(" userspace_addr 0x%016"PRIx64"\n",
  745. msg_region->userspace_addr);
  746. DPRINT(" mmap_offset 0x%016"PRIx64"\n",
  747. msg_region->mmap_offset);
  748. dev_region->gpa = msg_region->guest_phys_addr;
  749. dev_region->size = msg_region->memory_size;
  750. dev_region->qva = msg_region->userspace_addr;
  751. dev_region->mmap_offset = msg_region->mmap_offset;
  752. /* We don't use the offset argument of mmap() since the
  753. * mapped address has to be page aligned, and we use huge
  754. * pages.
  755. * In postcopy we're using PROT_NONE here to catch anyone
  756. * accessing it before we userfault
  757. */
  758. mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
  759. PROT_NONE, MAP_SHARED,
  760. vmsg->fds[i], 0);
  761. if (mmap_addr == MAP_FAILED) {
  762. vu_panic(dev, "region mmap error: %s", strerror(errno));
  763. } else {
  764. dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
  765. DPRINT(" mmap_addr: 0x%016"PRIx64"\n",
  766. dev_region->mmap_addr);
  767. }
  768. /* Return the address to QEMU so that it can translate the ufd
  769. * fault addresses back.
  770. */
  771. msg_region->userspace_addr = (uintptr_t)(mmap_addr +
  772. dev_region->mmap_offset);
  773. close(vmsg->fds[i]);
  774. }
  775. /* Send the message back to qemu with the addresses filled in */
  776. vmsg->fd_num = 0;
  777. if (!vu_send_reply(dev, dev->sock, vmsg)) {
  778. vu_panic(dev, "failed to respond to set-mem-table for postcopy");
  779. return false;
  780. }
  781. /* Wait for QEMU to confirm that it's registered the handler for the
  782. * faults.
  783. */
  784. if (!dev->read_msg(dev, dev->sock, vmsg) ||
  785. vmsg->size != sizeof(vmsg->payload.u64) ||
  786. vmsg->payload.u64 != 0) {
  787. vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
  788. return false;
  789. }
  790. /* OK, now we can go and register the memory and generate faults */
  791. (void)generate_faults(dev);
  792. return false;
  793. }
  794. static bool
  795. vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
  796. {
  797. int i;
  798. VhostUserMemory m = vmsg->payload.memory, *memory = &m;
  799. for (i = 0; i < dev->nregions; i++) {
  800. VuDevRegion *r = &dev->regions[i];
  801. void *m = (void *) (uintptr_t) r->mmap_addr;
  802. if (m) {
  803. munmap(m, r->size + r->mmap_offset);
  804. }
  805. }
  806. dev->nregions = memory->nregions;
  807. if (dev->postcopy_listening) {
  808. return vu_set_mem_table_exec_postcopy(dev, vmsg);
  809. }
  810. DPRINT("Nregions: %u\n", memory->nregions);
  811. for (i = 0; i < dev->nregions; i++) {
  812. void *mmap_addr;
  813. VhostUserMemoryRegion *msg_region = &memory->regions[i];
  814. VuDevRegion *dev_region = &dev->regions[i];
  815. DPRINT("Region %d\n", i);
  816. DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
  817. msg_region->guest_phys_addr);
  818. DPRINT(" memory_size: 0x%016"PRIx64"\n",
  819. msg_region->memory_size);
  820. DPRINT(" userspace_addr 0x%016"PRIx64"\n",
  821. msg_region->userspace_addr);
  822. DPRINT(" mmap_offset 0x%016"PRIx64"\n",
  823. msg_region->mmap_offset);
  824. dev_region->gpa = msg_region->guest_phys_addr;
  825. dev_region->size = msg_region->memory_size;
  826. dev_region->qva = msg_region->userspace_addr;
  827. dev_region->mmap_offset = msg_region->mmap_offset;
  828. /* We don't use the offset argument of mmap() since the
  829. * mapped address has to be page aligned, and we use huge
  830. * pages. */
  831. mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
  832. PROT_READ | PROT_WRITE, MAP_SHARED,
  833. vmsg->fds[i], 0);
  834. if (mmap_addr == MAP_FAILED) {
  835. vu_panic(dev, "region mmap error: %s", strerror(errno));
  836. } else {
  837. dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
  838. DPRINT(" mmap_addr: 0x%016"PRIx64"\n",
  839. dev_region->mmap_addr);
  840. }
  841. close(vmsg->fds[i]);
  842. }
  843. for (i = 0; i < dev->max_queues; i++) {
  844. if (dev->vq[i].vring.desc) {
  845. if (map_ring(dev, &dev->vq[i])) {
  846. vu_panic(dev, "remapping queue %d during setmemtable", i);
  847. }
  848. }
  849. }
  850. return false;
  851. }
  852. static bool
  853. vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg)
  854. {
  855. int fd;
  856. uint64_t log_mmap_size, log_mmap_offset;
  857. void *rc;
  858. if (vmsg->fd_num != 1 ||
  859. vmsg->size != sizeof(vmsg->payload.log)) {
  860. vu_panic(dev, "Invalid log_base message");
  861. return true;
  862. }
  863. fd = vmsg->fds[0];
  864. log_mmap_offset = vmsg->payload.log.mmap_offset;
  865. log_mmap_size = vmsg->payload.log.mmap_size;
  866. DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
  867. DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size);
  868. rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
  869. log_mmap_offset);
  870. close(fd);
  871. if (rc == MAP_FAILED) {
  872. perror("log mmap error");
  873. }
  874. if (dev->log_table) {
  875. munmap(dev->log_table, dev->log_size);
  876. }
  877. dev->log_table = rc;
  878. dev->log_size = log_mmap_size;
  879. vmsg->size = sizeof(vmsg->payload.u64);
  880. vmsg->fd_num = 0;
  881. return true;
  882. }
  883. static bool
  884. vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg)
  885. {
  886. if (vmsg->fd_num != 1) {
  887. vu_panic(dev, "Invalid log_fd message");
  888. return false;
  889. }
  890. if (dev->log_call_fd != -1) {
  891. close(dev->log_call_fd);
  892. }
  893. dev->log_call_fd = vmsg->fds[0];
  894. DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);
  895. return false;
  896. }
  897. static bool
  898. vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg)
  899. {
  900. unsigned int index = vmsg->payload.state.index;
  901. unsigned int num = vmsg->payload.state.num;
  902. DPRINT("State.index: %u\n", index);
  903. DPRINT("State.num: %u\n", num);
  904. dev->vq[index].vring.num = num;
  905. return false;
  906. }
  907. static bool
  908. vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
  909. {
  910. struct vhost_vring_addr addr = vmsg->payload.addr, *vra = &addr;
  911. unsigned int index = vra->index;
  912. VuVirtq *vq = &dev->vq[index];
  913. DPRINT("vhost_vring_addr:\n");
  914. DPRINT(" index: %d\n", vra->index);
  915. DPRINT(" flags: %d\n", vra->flags);
  916. DPRINT(" desc_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->desc_user_addr);
  917. DPRINT(" used_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->used_user_addr);
  918. DPRINT(" avail_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->avail_user_addr);
  919. DPRINT(" log_guest_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->log_guest_addr);
  920. vq->vra = *vra;
  921. vq->vring.flags = vra->flags;
  922. vq->vring.log_guest_addr = vra->log_guest_addr;
  923. if (map_ring(dev, vq)) {
  924. vu_panic(dev, "Invalid vring_addr message");
  925. return false;
  926. }
  927. vq->used_idx = le16toh(vq->vring.used->idx);
  928. if (vq->last_avail_idx != vq->used_idx) {
  929. bool resume = dev->iface->queue_is_processed_in_order &&
  930. dev->iface->queue_is_processed_in_order(dev, index);
  931. DPRINT("Last avail index != used index: %u != %u%s\n",
  932. vq->last_avail_idx, vq->used_idx,
  933. resume ? ", resuming" : "");
  934. if (resume) {
  935. vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx;
  936. }
  937. }
  938. return false;
  939. }
  940. static bool
  941. vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
  942. {
  943. unsigned int index = vmsg->payload.state.index;
  944. unsigned int num = vmsg->payload.state.num;
  945. DPRINT("State.index: %u\n", index);
  946. DPRINT("State.num: %u\n", num);
  947. dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num;
  948. return false;
  949. }
  950. static bool
  951. vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
  952. {
  953. unsigned int index = vmsg->payload.state.index;
  954. DPRINT("State.index: %u\n", index);
  955. vmsg->payload.state.num = dev->vq[index].last_avail_idx;
  956. vmsg->size = sizeof(vmsg->payload.state);
  957. dev->vq[index].started = false;
  958. if (dev->iface->queue_set_started) {
  959. dev->iface->queue_set_started(dev, index, false);
  960. }
  961. if (dev->vq[index].call_fd != -1) {
  962. close(dev->vq[index].call_fd);
  963. dev->vq[index].call_fd = -1;
  964. }
  965. if (dev->vq[index].kick_fd != -1) {
  966. dev->remove_watch(dev, dev->vq[index].kick_fd);
  967. close(dev->vq[index].kick_fd);
  968. dev->vq[index].kick_fd = -1;
  969. }
  970. return true;
  971. }
  972. static bool
  973. vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
  974. {
  975. int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
  976. bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
  977. if (index >= dev->max_queues) {
  978. vmsg_close_fds(vmsg);
  979. vu_panic(dev, "Invalid queue index: %u", index);
  980. return false;
  981. }
  982. if (nofd) {
  983. vmsg_close_fds(vmsg);
  984. return true;
  985. }
  986. if (vmsg->fd_num != 1) {
  987. vmsg_close_fds(vmsg);
  988. vu_panic(dev, "Invalid fds in request: %d", vmsg->request);
  989. return false;
  990. }
  991. return true;
  992. }
  993. static int
  994. inflight_desc_compare(const void *a, const void *b)
  995. {
  996. VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a,
  997. *desc1 = (VuVirtqInflightDesc *)b;
  998. if (desc1->counter > desc0->counter &&
  999. (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
  1000. return 1;
  1001. }
  1002. return -1;
  1003. }
  1004. static int
  1005. vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
  1006. {
  1007. int i = 0;
  1008. if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
  1009. return 0;
  1010. }
  1011. if (unlikely(!vq->inflight)) {
  1012. return -1;
  1013. }
  1014. if (unlikely(!vq->inflight->version)) {
  1015. /* initialize the buffer */
  1016. vq->inflight->version = INFLIGHT_VERSION;
  1017. return 0;
  1018. }
  1019. vq->used_idx = le16toh(vq->vring.used->idx);
  1020. vq->resubmit_num = 0;
  1021. vq->resubmit_list = NULL;
  1022. vq->counter = 0;
  1023. if (unlikely(vq->inflight->used_idx != vq->used_idx)) {
  1024. vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0;
  1025. barrier();
  1026. vq->inflight->used_idx = vq->used_idx;
  1027. }
  1028. for (i = 0; i < vq->inflight->desc_num; i++) {
  1029. if (vq->inflight->desc[i].inflight == 1) {
  1030. vq->inuse++;
  1031. }
  1032. }
  1033. vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
  1034. if (vq->inuse) {
  1035. vq->resubmit_list = calloc(vq->inuse, sizeof(VuVirtqInflightDesc));
  1036. if (!vq->resubmit_list) {
  1037. return -1;
  1038. }
  1039. for (i = 0; i < vq->inflight->desc_num; i++) {
  1040. if (vq->inflight->desc[i].inflight) {
  1041. vq->resubmit_list[vq->resubmit_num].index = i;
  1042. vq->resubmit_list[vq->resubmit_num].counter =
  1043. vq->inflight->desc[i].counter;
  1044. vq->resubmit_num++;
  1045. }
  1046. }
  1047. if (vq->resubmit_num > 1) {
  1048. qsort(vq->resubmit_list, vq->resubmit_num,
  1049. sizeof(VuVirtqInflightDesc), inflight_desc_compare);
  1050. }
  1051. vq->counter = vq->resubmit_list[0].counter + 1;
  1052. }
  1053. /* in case of I/O hang after reconnecting */
  1054. if (eventfd_write(vq->kick_fd, 1)) {
  1055. return -1;
  1056. }
  1057. return 0;
  1058. }
  1059. static bool
  1060. vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
  1061. {
  1062. int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
  1063. bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
  1064. DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
  1065. if (!vu_check_queue_msg_file(dev, vmsg)) {
  1066. return false;
  1067. }
  1068. if (dev->vq[index].kick_fd != -1) {
  1069. dev->remove_watch(dev, dev->vq[index].kick_fd);
  1070. close(dev->vq[index].kick_fd);
  1071. dev->vq[index].kick_fd = -1;
  1072. }
  1073. dev->vq[index].kick_fd = nofd ? -1 : vmsg->fds[0];
  1074. DPRINT("Got kick_fd: %d for vq: %d\n", dev->vq[index].kick_fd, index);
  1075. dev->vq[index].started = true;
  1076. if (dev->iface->queue_set_started) {
  1077. dev->iface->queue_set_started(dev, index, true);
  1078. }
  1079. if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) {
  1080. dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
  1081. vu_kick_cb, (void *)(long)index);
  1082. DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
  1083. dev->vq[index].kick_fd, index);
  1084. }
  1085. if (vu_check_queue_inflights(dev, &dev->vq[index])) {
  1086. vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
  1087. }
  1088. return false;
  1089. }
  1090. void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
  1091. vu_queue_handler_cb handler)
  1092. {
  1093. int qidx = vq - dev->vq;
  1094. vq->handler = handler;
  1095. if (vq->kick_fd >= 0) {
  1096. if (handler) {
  1097. dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
  1098. vu_kick_cb, (void *)(long)qidx);
  1099. } else {
  1100. dev->remove_watch(dev, vq->kick_fd);
  1101. }
  1102. }
  1103. }
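/*
 * Usage sketch (assumed backend code, not from this file): a trivial handler
 * that drains the ring and completes each element with a zero-length
 * response, registered from the device's queue_set_started callback.
 */
static void
example_queue_handler(VuDev *dev, int qidx)
{
    VuVirtq *vq = vu_get_queue(dev, qidx);

    for (;;) {
        VuVirtqElement *elem = vu_queue_pop(dev, vq, sizeof(*elem));

        if (!elem) {
            break;
        }
        /* a real device would process elem->out_sg / elem->in_sg here */
        vu_queue_push(dev, vq, elem, 0);
        free(elem);
    }
    vu_queue_notify(dev, vq);
}
/*
 * e.g.:  vu_set_queue_handler(dev, vu_get_queue(dev, qidx),
 *                             started ? example_queue_handler : NULL);
 */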
  1104. bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd,
  1105. int size, int offset)
  1106. {
  1107. int qidx = vq - dev->vq;
  1108. int fd_num = 0;
  1109. VhostUserMsg vmsg = {
  1110. .request = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG,
  1111. .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
  1112. .size = sizeof(vmsg.payload.area),
  1113. .payload.area = {
  1114. .u64 = qidx & VHOST_USER_VRING_IDX_MASK,
  1115. .size = size,
  1116. .offset = offset,
  1117. },
  1118. };
  1119. if (fd == -1) {
  1120. vmsg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
  1121. } else {
  1122. vmsg.fds[fd_num++] = fd;
  1123. }
  1124. vmsg.fd_num = fd_num;
  1125. if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) {
  1126. return false;
  1127. }
  1128. pthread_mutex_lock(&dev->slave_mutex);
  1129. if (!vu_message_write(dev, dev->slave_fd, &vmsg)) {
  1130. pthread_mutex_unlock(&dev->slave_mutex);
  1131. return false;
  1132. }
  1133. /* Also unlocks the slave_mutex */
  1134. return vu_process_message_reply(dev, &vmsg);
  1135. }
  1136. static bool
  1137. vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
  1138. {
  1139. int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
  1140. bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
  1141. DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
  1142. if (!vu_check_queue_msg_file(dev, vmsg)) {
  1143. return false;
  1144. }
  1145. if (dev->vq[index].call_fd != -1) {
  1146. close(dev->vq[index].call_fd);
  1147. dev->vq[index].call_fd = -1;
  1148. }
  1149. dev->vq[index].call_fd = nofd ? -1 : vmsg->fds[0];
  1150. /* in case of I/O hang after reconnecting */
  1151. if (dev->vq[index].call_fd != -1 && eventfd_write(vmsg->fds[0], 1)) {
  1152. return -1;
  1153. }
  1154. DPRINT("Got call_fd: %d for vq: %d\n", dev->vq[index].call_fd, index);
  1155. return false;
  1156. }
  1157. static bool
  1158. vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
  1159. {
  1160. int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
  1161. bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
  1162. DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
  1163. if (!vu_check_queue_msg_file(dev, vmsg)) {
  1164. return false;
  1165. }
  1166. if (dev->vq[index].err_fd != -1) {
  1167. close(dev->vq[index].err_fd);
  1168. dev->vq[index].err_fd = -1;
  1169. }
  1170. dev->vq[index].err_fd = nofd ? -1 : vmsg->fds[0];
  1171. return false;
  1172. }
  1173. static bool
  1174. vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
  1175. {
  1176. /*
  1177. * Note that we support, but intentionally do not set,
  1178. * VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. This means that
  1179. * a device implementation can return it in its callback
  1180. * (get_protocol_features) if it wants to use this for
  1181. * simulation, but it is otherwise not desirable (if even
  1182. * implemented by the master.)
  1183. */
  1184. uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_MQ |
  1185. 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
  1186. 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ |
  1187. 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER |
  1188. 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD |
  1189. 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
  1190. 1ULL << VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS;
  1191. if (have_userfault()) {
  1192. features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT;
  1193. }
  1194. if (dev->iface->get_config && dev->iface->set_config) {
  1195. features |= 1ULL << VHOST_USER_PROTOCOL_F_CONFIG;
  1196. }
  1197. if (dev->iface->get_protocol_features) {
  1198. features |= dev->iface->get_protocol_features(dev);
  1199. }
  1200. vmsg_set_reply_u64(vmsg, features);
  1201. return true;
  1202. }
  1203. static bool
  1204. vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
  1205. {
  1206. uint64_t features = vmsg->payload.u64;
  1207. DPRINT("u64: 0x%016"PRIx64"\n", features);
  1208. dev->protocol_features = vmsg->payload.u64;
  1209. if (vu_has_protocol_feature(dev,
  1210. VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
  1211. (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ) ||
  1212. !vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) {
  1213. /*
  1214. * The use case for using messages for kick/call is simulation, to make
  1215. * the kick and call synchronous. To actually get that behaviour, both
  1216. * of the other features are required.
  1217. * Theoretically, one could use only kick messages, or use them without
  1218. * F_REPLY_ACK, but too many (possibly pending) messages on the
  1219. * socket will eventually cause the master to hang. To avoid this in
  1220. * scenarios where it is not desired, enforce settings that actually
  1221. * enable the simulation case.
         */
        vu_panic(dev,
                 "F_IN_BAND_NOTIFICATIONS requires F_SLAVE_REQ && F_REPLY_ACK");
        return false;
    }

    if (dev->iface->set_protocol_features) {
        dev->iface->set_protocol_features(dev, features);
    }

    return false;
}

static bool
vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vmsg_set_reply_u64(vmsg, dev->max_queues);
    return true;
}

static bool
vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;
    unsigned int enable = vmsg->payload.state.num;

    DPRINT("State.index: %u\n", index);
    DPRINT("State.enable: %u\n", enable);

    if (index >= dev->max_queues) {
        vu_panic(dev, "Invalid vring_enable index: %u", index);
        return false;
    }

    dev->vq[index].enable = enable;
    return false;
}

static bool
vu_set_slave_req_fd(VuDev *dev, VhostUserMsg *vmsg)
{
    if (vmsg->fd_num != 1) {
        vu_panic(dev, "Invalid slave_req_fd message (%d fd's)", vmsg->fd_num);
        return false;
    }

    if (dev->slave_fd != -1) {
        close(dev->slave_fd);
    }
    dev->slave_fd = vmsg->fds[0];
    DPRINT("Got slave_fd: %d\n", vmsg->fds[0]);

    return false;
}

static bool
vu_get_config(VuDev *dev, VhostUserMsg *vmsg)
{
    int ret = -1;

    if (dev->iface->get_config) {
        ret = dev->iface->get_config(dev, vmsg->payload.config.region,
                                     vmsg->payload.config.size);
    }

    if (ret) {
        /* resize to zero to indicate an error to master */
        vmsg->size = 0;
    }

    return true;
}

static bool
vu_set_config(VuDev *dev, VhostUserMsg *vmsg)
{
    int ret = -1;

    if (dev->iface->set_config) {
        ret = dev->iface->set_config(dev, vmsg->payload.config.region,
                                     vmsg->payload.config.offset,
                                     vmsg->payload.config.size,
                                     vmsg->payload.config.flags);
        if (ret) {
            vu_panic(dev, "Set virtio configuration space failed");
        }
    }

    return false;
}

static bool
vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg)
{
    dev->postcopy_ufd = -1;
#ifdef UFFDIO_API
    struct uffdio_api api_struct;

    dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    vmsg->size = 0;
#endif

    if (dev->postcopy_ufd == -1) {
        vu_panic(dev, "Userfaultfd not available: %s", strerror(errno));
        goto out;
    }

#ifdef UFFDIO_API
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
        vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno));
        close(dev->postcopy_ufd);
        dev->postcopy_ufd = -1;
        goto out;
    }
    /* TODO: Stash feature flags somewhere */
#endif

out:
    /* Return a ufd to the QEMU */
    vmsg->fd_num = 1;
    vmsg->fds[0] = dev->postcopy_ufd;
    return true; /* = send a reply */
}

static bool
vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg)
{
    if (dev->nregions) {
        vu_panic(dev, "Regions already registered at postcopy-listen");
        vmsg_set_reply_u64(vmsg, -1);
        return true;
    }
    dev->postcopy_listening = true;
    vmsg_set_reply_u64(vmsg, 0);
    return true;
}

static bool
vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
{
    DPRINT("%s: Entry\n", __func__);
    dev->postcopy_listening = false;
    if (dev->postcopy_ufd > 0) {
        close(dev->postcopy_ufd);
        dev->postcopy_ufd = -1;
        DPRINT("%s: Done close\n", __func__);
    }

    vmsg_set_reply_u64(vmsg, 0);
    DPRINT("%s: exit\n", __func__);
    return true;
}
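/*
 * Bytes reserved per queue in the shared inflight buffer: the
 * per-descriptor state array plus a 16-bit index, rounded up to
 * INFLIGHT_ALIGNMENT so that each queue's slice starts aligned.
 */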
static inline uint64_t
vu_inflight_queue_size(uint16_t queue_size)
{
    return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size +
           sizeof(uint16_t), INFLIGHT_ALIGNMENT);
}
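/*
 * Allocate an anonymous memfd of 'size' bytes, apply the given F_SEAL_*
 * flags and map it read/write.  On success *fd holds the descriptor to
 * share with the master; on failure NULL is returned and the descriptor,
 * if it was created, is closed.
 */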
#ifdef MFD_ALLOW_SEALING
static void *
memfd_alloc(const char *name, size_t size, unsigned int flags, int *fd)
{
    void *ptr;
    int ret;

    *fd = memfd_create(name, MFD_ALLOW_SEALING);
    if (*fd < 0) {
        return NULL;
    }

    ret = ftruncate(*fd, size);
    if (ret < 0) {
        close(*fd);
        return NULL;
    }

    ret = fcntl(*fd, F_ADD_SEALS, flags);
    if (ret < 0) {
        close(*fd);
        return NULL;
    }

    ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, *fd, 0);
    if (ptr == MAP_FAILED) {
        close(*fd);
        return NULL;
    }

    return ptr;
}
#endif
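/*
 * VHOST_USER_GET_INFLIGHT_FD: allocate a zeroed, sealed shared-memory
 * region large enough for num_queues * vu_inflight_queue_size(queue_size)
 * and return it to the master as an fd plus mmap size and offset.
 */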
static bool
vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
{
    int fd = -1;
    void *addr = NULL;
    uint64_t mmap_size;
    uint16_t num_queues, queue_size;

    if (vmsg->size != sizeof(vmsg->payload.inflight)) {
        vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
        vmsg->payload.inflight.mmap_size = 0;
        return true;
    }

    num_queues = vmsg->payload.inflight.num_queues;
    queue_size = vmsg->payload.inflight.queue_size;

    DPRINT("get_inflight_fd num_queues: %"PRId16"\n", num_queues);
    DPRINT("get_inflight_fd queue_size: %"PRId16"\n", queue_size);
    mmap_size = vu_inflight_queue_size(queue_size) * num_queues;

#ifdef MFD_ALLOW_SEALING
    addr = memfd_alloc("vhost-inflight", mmap_size,
                       F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                       &fd);
#else
    vu_panic(dev, "Not implemented: memfd support is missing");
#endif

    if (!addr) {
        vu_panic(dev, "Failed to alloc vhost inflight area");
        vmsg->payload.inflight.mmap_size = 0;
        return true;
    }

    memset(addr, 0, mmap_size);

    dev->inflight_info.addr = addr;
    dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
    dev->inflight_info.fd = vmsg->fds[0] = fd;
    vmsg->fd_num = 1;
    vmsg->payload.inflight.mmap_offset = 0;

    DPRINT("send inflight mmap_size: %"PRId64"\n",
           vmsg->payload.inflight.mmap_size);
    DPRINT("send inflight mmap offset: %"PRId64"\n",
           vmsg->payload.inflight.mmap_offset);

    return true;
}
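/*
 * VHOST_USER_SET_INFLIGHT_FD: map the inflight region handed back by the
 * master (for example after a reconnect) and point each virtqueue at its
 * slice, replacing any region we were tracking before.
 */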
static bool
vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
{
    int fd, i;
    uint64_t mmap_size, mmap_offset;
    uint16_t num_queues, queue_size;
    void *rc;

    if (vmsg->fd_num != 1 ||
        vmsg->size != sizeof(vmsg->payload.inflight)) {
        vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
                 vmsg->size, vmsg->fd_num);
        return false;
    }

    fd = vmsg->fds[0];
    mmap_size = vmsg->payload.inflight.mmap_size;
    mmap_offset = vmsg->payload.inflight.mmap_offset;
    num_queues = vmsg->payload.inflight.num_queues;
    queue_size = vmsg->payload.inflight.queue_size;

    DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
    DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
    DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);

    rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
              fd, mmap_offset);

    if (rc == MAP_FAILED) {
        vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
        return false;
    }

    if (dev->inflight_info.fd) {
        close(dev->inflight_info.fd);
    }

    if (dev->inflight_info.addr) {
        munmap(dev->inflight_info.addr, dev->inflight_info.size);
    }

    dev->inflight_info.fd = fd;
    dev->inflight_info.addr = rc;
    dev->inflight_info.size = mmap_size;

    for (i = 0; i < num_queues; i++) {
        dev->vq[i].inflight = (VuVirtqInflight *)rc;
        dev->vq[i].inflight->desc_num = queue_size;
        rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size));
    }

    return false;
}
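/*
 * VHOST_USER_VRING_KICK: in-band replacement for the kick eventfd; mark
 * the ring started if needed and run its handler.
 */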
static bool
vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;

    if (index >= dev->max_queues) {
        vu_panic(dev, "Invalid queue index: %u", index);
        return false;
    }

    DPRINT("Got kick message: handler:%p idx:%u\n",
           dev->vq[index].handler, index);

    if (!dev->vq[index].started) {
        dev->vq[index].started = true;

        if (dev->iface->queue_set_started) {
            dev->iface->queue_set_started(dev, index, true);
        }
    }

    if (dev->vq[index].handler) {
        dev->vq[index].handler(dev, index);
    }

    return false;
}

static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg)
{
    vmsg->flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->payload.u64 = VHOST_USER_MAX_RAM_SLOTS;
    vmsg->fd_num = 0;

    if (!vu_message_write(dev, dev->sock, vmsg)) {
        vu_panic(dev, "Failed to send max ram slots: %s\n", strerror(errno));
    }

    DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_USER_MAX_RAM_SLOTS);

    return false;
}
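/*
 * Dispatch a single vhost-user request.  The device's process_msg callback
 * gets first refusal; anything it does not consume is handled by the
 * built-in handlers below.  The return value tells the caller whether a
 * reply has been prepared in vmsg and must be sent.
 */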
static bool
vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
{
    int do_reply = 0;

    /* Print out generic part of the request. */
    DPRINT("================ Vhost user message ================\n");
    DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request),
           vmsg->request);
    DPRINT("Flags: 0x%x\n", vmsg->flags);
    DPRINT("Size: %u\n", vmsg->size);

    if (vmsg->fd_num) {
        int i;
        DPRINT("Fds:");
        for (i = 0; i < vmsg->fd_num; i++) {
            DPRINT(" %d", vmsg->fds[i]);
        }
        DPRINT("\n");
    }

    if (dev->iface->process_msg &&
        dev->iface->process_msg(dev, vmsg, &do_reply)) {
        return do_reply;
    }

    switch (vmsg->request) {
    case VHOST_USER_GET_FEATURES:
        return vu_get_features_exec(dev, vmsg);
    case VHOST_USER_SET_FEATURES:
        return vu_set_features_exec(dev, vmsg);
    case VHOST_USER_GET_PROTOCOL_FEATURES:
        return vu_get_protocol_features_exec(dev, vmsg);
    case VHOST_USER_SET_PROTOCOL_FEATURES:
        return vu_set_protocol_features_exec(dev, vmsg);
    case VHOST_USER_SET_OWNER:
        return vu_set_owner_exec(dev, vmsg);
    case VHOST_USER_RESET_OWNER:
        return vu_reset_device_exec(dev, vmsg);
    case VHOST_USER_SET_MEM_TABLE:
        return vu_set_mem_table_exec(dev, vmsg);
    case VHOST_USER_SET_LOG_BASE:
        return vu_set_log_base_exec(dev, vmsg);
    case VHOST_USER_SET_LOG_FD:
        return vu_set_log_fd_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_NUM:
        return vu_set_vring_num_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ADDR:
        return vu_set_vring_addr_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_BASE:
        return vu_set_vring_base_exec(dev, vmsg);
    case VHOST_USER_GET_VRING_BASE:
        return vu_get_vring_base_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_KICK:
        return vu_set_vring_kick_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_CALL:
        return vu_set_vring_call_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ERR:
        return vu_set_vring_err_exec(dev, vmsg);
    case VHOST_USER_GET_QUEUE_NUM:
        return vu_get_queue_num_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ENABLE:
        return vu_set_vring_enable_exec(dev, vmsg);
    case VHOST_USER_SET_SLAVE_REQ_FD:
        return vu_set_slave_req_fd(dev, vmsg);
    case VHOST_USER_GET_CONFIG:
        return vu_get_config(dev, vmsg);
    case VHOST_USER_SET_CONFIG:
        return vu_set_config(dev, vmsg);
    case VHOST_USER_NONE:
        /* if you need processing before exit, override iface->process_msg */
        exit(0);
    case VHOST_USER_POSTCOPY_ADVISE:
        return vu_set_postcopy_advise(dev, vmsg);
    case VHOST_USER_POSTCOPY_LISTEN:
        return vu_set_postcopy_listen(dev, vmsg);
    case VHOST_USER_POSTCOPY_END:
        return vu_set_postcopy_end(dev, vmsg);
    case VHOST_USER_GET_INFLIGHT_FD:
        return vu_get_inflight_fd(dev, vmsg);
    case VHOST_USER_SET_INFLIGHT_FD:
        return vu_set_inflight_fd(dev, vmsg);
    case VHOST_USER_VRING_KICK:
        return vu_handle_vring_kick(dev, vmsg);
    case VHOST_USER_GET_MAX_MEM_SLOTS:
        return vu_handle_get_max_memslots(dev, vmsg);
    case VHOST_USER_ADD_MEM_REG:
        return vu_add_mem_reg(dev, vmsg);
    case VHOST_USER_REM_MEM_REG:
        return vu_rem_mem_reg(dev, vmsg);
    default:
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Unhandled request: %d", vmsg->request);
    }

    return false;
}
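/*
 * Read and process one message from the master.  A reply is sent when the
 * handler requests one, or when the master set VHOST_USER_NEED_REPLY_MASK,
 * in which case a default zero reply is generated.  Returns false on a
 * read or write failure.
 */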
bool
vu_dispatch(VuDev *dev)
{
    VhostUserMsg vmsg = { 0, };
    int reply_requested;
    bool need_reply, success = false;

    if (!dev->read_msg(dev, dev->sock, &vmsg)) {
        goto end;
    }

    need_reply = vmsg.flags & VHOST_USER_NEED_REPLY_MASK;

    reply_requested = vu_process_message(dev, &vmsg);
    if (!reply_requested && need_reply) {
        vmsg_set_reply_u64(&vmsg, 0);
        reply_requested = 1;
    }

    if (!reply_requested) {
        success = true;
        goto end;
    }

    if (!vu_send_reply(dev, dev->sock, &vmsg)) {
        goto end;
    }

    success = true;

end:
    free(vmsg.data);
    return success;
}
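/*
 * Release everything vu_init() and the message handlers acquired: memory
 * region mappings, per-queue eventfds and watches, the inflight region,
 * the dirty log, the slave channel and the socket.  The VuDev itself is
 * not freed and may be re-initialised with vu_init().
 */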
void
vu_deinit(VuDev *dev)
{
    int i;

    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];
        void *m = (void *) (uintptr_t) r->mmap_addr;
        if (m != MAP_FAILED) {
            munmap(m, r->size + r->mmap_offset);
        }
    }
    dev->nregions = 0;

    for (i = 0; i < dev->max_queues; i++) {
        VuVirtq *vq = &dev->vq[i];

        if (vq->call_fd != -1) {
            close(vq->call_fd);
            vq->call_fd = -1;
        }

        if (vq->kick_fd != -1) {
            dev->remove_watch(dev, vq->kick_fd);
            close(vq->kick_fd);
            vq->kick_fd = -1;
        }

        if (vq->err_fd != -1) {
            close(vq->err_fd);
            vq->err_fd = -1;
        }

        if (vq->resubmit_list) {
            free(vq->resubmit_list);
            vq->resubmit_list = NULL;
        }

        vq->inflight = NULL;
    }

    if (dev->inflight_info.addr) {
        munmap(dev->inflight_info.addr, dev->inflight_info.size);
        dev->inflight_info.addr = NULL;
    }

    if (dev->inflight_info.fd > 0) {
        close(dev->inflight_info.fd);
        dev->inflight_info.fd = -1;
    }

    vu_close_log(dev);
    if (dev->slave_fd != -1) {
        close(dev->slave_fd);
        dev->slave_fd = -1;
    }
    pthread_mutex_destroy(&dev->slave_mutex);

    if (dev->sock != -1) {
        close(dev->sock);
    }

    free(dev->vq);
    dev->vq = NULL;
}
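/*
 * Initialise a VuDev for an already-connected vhost-user slave socket.
 * read_msg may be NULL to use the default blocking reader; set_watch and
 * remove_watch let the application plug kick fds into its own event loop.
 *
 * A minimal usage sketch; the panic/watch callbacks, the device interface
 * and the accepted socket fd are application-provided and only assumed
 * here, not defined in this file:
 *
 *     VuDev dev;
 *
 *     if (!vu_init(&dev, 1, conn_fd, my_panic, NULL,
 *                  my_set_watch, my_remove_watch, &my_iface)) {
 *         return -1;
 *     }
 *     while (vu_dispatch(&dev)) {
 *         ;
 *     }
 *     vu_deinit(&dev);
 *
 * Returns false if the virtqueue array cannot be allocated.
 */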
bool
vu_init(VuDev *dev,
        uint16_t max_queues,
        int socket,
        vu_panic_cb panic,
        vu_read_msg_cb read_msg,
        vu_set_watch_cb set_watch,
        vu_remove_watch_cb remove_watch,
        const VuDevIface *iface)
{
    uint16_t i;

    assert(max_queues > 0);
    assert(socket >= 0);
    assert(set_watch);
    assert(remove_watch);
    assert(iface);
    assert(panic);

    memset(dev, 0, sizeof(*dev));

    dev->sock = socket;
    dev->panic = panic;
    dev->read_msg = read_msg ? read_msg : vu_message_read_default;
    dev->set_watch = set_watch;
    dev->remove_watch = remove_watch;
    dev->iface = iface;
    dev->log_call_fd = -1;
    pthread_mutex_init(&dev->slave_mutex, NULL);
    dev->slave_fd = -1;
    dev->max_queues = max_queues;

    dev->vq = malloc(max_queues * sizeof(dev->vq[0]));
    if (!dev->vq) {
        DPRINT("%s: failed to malloc virtqueues\n", __func__);
        return false;
    }

    for (i = 0; i < max_queues; i++) {
        dev->vq[i] = (VuVirtq) {
            .call_fd = -1, .kick_fd = -1, .err_fd = -1,
            .notification = true,
        };
    }

    return true;
}

VuVirtq *
vu_get_queue(VuDev *dev, int qidx)
{
    assert(qidx < dev->max_queues);
    return &dev->vq[qidx];
}

bool
vu_queue_enabled(VuDev *dev, VuVirtq *vq)
{
    return vq->enable;
}

bool
vu_queue_started(const VuDev *dev, const VuVirtq *vq)
{
    return vq->started;
}

static inline uint16_t
vring_avail_flags(VuVirtq *vq)
{
    return le16toh(vq->vring.avail->flags);
}

static inline uint16_t
vring_avail_idx(VuVirtq *vq)
{
    vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);

    return vq->shadow_avail_idx;
}

static inline uint16_t
vring_avail_ring(VuVirtq *vq, int i)
{
    return le16toh(vq->vring.avail->ring[i]);
}

static inline uint16_t
vring_get_used_event(VuVirtq *vq)
{
    return vring_avail_ring(vq, vq->vring.num);
}
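/*
 * Number of buffers the guest has made available beyond 'idx'.  Returns -1
 * (and panics the device) if the distance is larger than the ring, which
 * means the avail index is corrupted.
 */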
static int
virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx)
{
    uint16_t num_heads = vring_avail_idx(vq) - idx;

    /* Check it isn't doing very strange things with descriptor numbers. */
    if (num_heads > vq->vring.num) {
        vu_panic(dev, "Guest moved used index from %u to %u",
                 idx, vq->shadow_avail_idx);
        return -1;
    }
    if (num_heads) {
        /* On success, callers read a descriptor at vq->last_avail_idx.
         * Make sure descriptor read does not bypass avail index read. */
        smp_rmb();
    }

    return num_heads;
}

static bool
virtqueue_get_head(VuDev *dev, VuVirtq *vq,
                   unsigned int idx, unsigned int *head)
{
    /* Grab the next descriptor number they're advertising, and increment
     * the index we've seen. */
    *head = vring_avail_ring(vq, idx % vq->vring.num);

    /* If their number is silly, that's a fatal mistake. */
    if (*head >= vq->vring.num) {
        vu_panic(dev, "Guest says index %u is available", *head);
        return false;
    }

    return true;
}

static int
virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc,
                             uint64_t addr, size_t len)
{
    struct vring_desc *ori_desc;
    uint64_t read_len;

    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
        return -1;
    }

    if (len == 0) {
        return -1;
    }

    while (len) {
        read_len = len;
        ori_desc = vu_gpa_to_va(dev, &read_len, addr);
        if (!ori_desc) {
            return -1;
        }

        memcpy(desc, ori_desc, read_len);
        len -= read_len;
        addr += read_len;
        desc += read_len;
    }

    return 0;
}

enum {
    VIRTQUEUE_READ_DESC_ERROR = -1,
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};

static int
virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
                         int i, unsigned int max, unsigned int *next)
{
    /* If this descriptor says it doesn't chain, we're done. */
    if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
        return VIRTQUEUE_READ_DESC_DONE;
    }

    /* Check they're not leading us off end of descriptors. */
    *next = le16toh(desc[i].next);
    /* Make sure compiler knows to grab that: we don't want it changing! */
    smp_wmb();

    if (*next >= max) {
        vu_panic(dev, "Desc next is %u", *next);
        return VIRTQUEUE_READ_DESC_ERROR;
    }

    return VIRTQUEUE_READ_DESC_MORE;
}
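/*
 * Walk the available ring (following indirect tables) and add up how many
 * device-writable (*in_bytes) and device-readable (*out_bytes) bytes are
 * queued, stopping early once both max_in_bytes and max_out_bytes are
 * reached.  Both totals are reported as 0 if the rings look corrupted.
 */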
void
vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
                         unsigned int *out_bytes,
                         unsigned max_in_bytes, unsigned max_out_bytes)
{
    unsigned int idx;
    unsigned int total_bufs, in_total, out_total;
    int rc;

    idx = vq->last_avail_idx;

    total_bufs = in_total = out_total = 0;
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        goto done;
    }

    while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
        unsigned int max, desc_len, num_bufs, indirect = 0;
        uint64_t desc_addr, read_len;
        struct vring_desc *desc;
        struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
        unsigned int i;

        max = vq->vring.num;
        num_bufs = total_bufs;
        if (!virtqueue_get_head(dev, vq, idx++, &i)) {
            goto err;
        }
        desc = vq->vring.desc;

        if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
            if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
                vu_panic(dev, "Invalid size for indirect buffer table");
                goto err;
            }

            /* If we've got too many, that implies a descriptor loop. */
            if (num_bufs >= max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            /* loop over the indirect descriptor table */
            indirect = 1;
            desc_addr = le64toh(desc[i].addr);
            desc_len = le32toh(desc[i].len);
            max = desc_len / sizeof(struct vring_desc);
            read_len = desc_len;
            desc = vu_gpa_to_va(dev, &read_len, desc_addr);
            if (unlikely(desc && read_len != desc_len)) {
                /* Failed to use zero copy */
                desc = NULL;
                if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                                  desc_addr,
                                                  desc_len)) {
                    desc = desc_buf;
                }
            }
            if (!desc) {
                vu_panic(dev, "Invalid indirect buffer table");
                goto err;
            }
            num_bufs = i = 0;
        }

        do {
            /* If we've got too many, that implies a descriptor loop. */
            if (++num_bufs > max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
                in_total += le32toh(desc[i].len);
            } else {
                out_total += le32toh(desc[i].len);
            }
            if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
                goto done;
            }
            rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
        } while (rc == VIRTQUEUE_READ_DESC_MORE);

        if (rc == VIRTQUEUE_READ_DESC_ERROR) {
            goto err;
        }

        if (!indirect) {
            total_bufs = num_bufs;
        } else {
            total_bufs++;
        }
    }
    if (rc < 0) {
        goto err;
    }
done:
    if (in_bytes) {
        *in_bytes = in_total;
    }
    if (out_bytes) {
        *out_bytes = out_total;
    }
    return;

err:
    in_total = out_total = 0;
    goto done;
}

bool
vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
                     unsigned int out_bytes)
{
    unsigned int in_total, out_total;

    vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total,
                             in_bytes, out_bytes);

    return in_bytes <= in_total && out_bytes <= out_total;
}

/* Fetch avail_idx from VQ memory only when we really need to know if
 * guest has added some buffers. */
bool
vu_queue_empty(VuDev *dev, VuVirtq *vq)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return true;
    }

    if (vq->shadow_avail_idx != vq->last_avail_idx) {
        return false;
    }

    return vring_avail_idx(vq) == vq->last_avail_idx;
}
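/*
 * Decide whether the driver actually needs a notification for this queue:
 * honour VIRTIO_F_NOTIFY_ON_EMPTY, the NO_INTERRUPT flag when event_idx is
 * not negotiated, and the used_event index when it is.
 */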
static bool
vring_notify(VuDev *dev, VuVirtq *vq)
{
    uint16_t old, new;
    bool v;

    /* We need to expose used array entries before checking used event. */
    smp_mb();

    /* Always notify when queue is empty (when feature acknowledge) */
    if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
        !vq->inuse && vu_queue_empty(dev, vq)) {
        return true;
    }

    if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
    }

    v = vq->signalled_used_valid;
    vq->signalled_used_valid = true;
    old = vq->signalled_used;
    new = vq->signalled_used = vq->used_idx;
    return !v || vring_need_event(vring_get_used_event(vq), new, old);
}

static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    if (!vring_notify(dev, vq)) {
        DPRINT("skipped notify...\n");
        return;
    }

    if (vq->call_fd < 0 &&
        vu_has_protocol_feature(dev,
                                VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
        vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ)) {
        VhostUserMsg vmsg = {
            .request = VHOST_USER_SLAVE_VRING_CALL,
            .flags = VHOST_USER_VERSION,
            .size = sizeof(vmsg.payload.state),
            .payload.state = {
                .index = vq - dev->vq,
            },
        };
        bool ack = sync &&
                   vu_has_protocol_feature(dev,
                                           VHOST_USER_PROTOCOL_F_REPLY_ACK);

        if (ack) {
            vmsg.flags |= VHOST_USER_NEED_REPLY_MASK;
        }

        vu_message_write(dev, dev->slave_fd, &vmsg);
        if (ack) {
            vu_message_read_default(dev, dev->slave_fd, &vmsg);
        }
        return;
    }

    if (eventfd_write(vq->call_fd, 1) < 0) {
        vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
    }
}

void vu_queue_notify(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, false);
}

void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, true);
}

static inline void
vring_used_flags_set_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char *)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) | mask);
}

static inline void
vring_used_flags_unset_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char *)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) & ~mask);
}

static inline void
vring_set_avail_event(VuVirtq *vq, uint16_t val)
{
    uint16_t *avail;

    if (!vq->notification) {
        return;
    }

    avail = (uint16_t *)&vq->vring.used->ring[vq->vring.num];
    *avail = htole16(val);
}

void
vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
{
    vq->notification = enable;
    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vring_avail_idx(vq));
    } else if (enable) {
        vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
    } else {
        vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
    }
    if (enable) {
        /* Expose avail event/used flags before caller checks the avail idx. */
        smp_mb();
    }
}

static bool
virtqueue_map_desc(VuDev *dev,
                   unsigned int *p_num_sg, struct iovec *iov,
                   unsigned int max_num_sg, bool is_write,
                   uint64_t pa, size_t sz)
{
    unsigned num_sg = *p_num_sg;

    assert(num_sg <= max_num_sg);

    if (!sz) {
        vu_panic(dev, "virtio: zero sized buffers are not allowed");
        return false;
    }

    while (sz) {
        uint64_t len = sz;

        if (num_sg == max_num_sg) {
            vu_panic(dev, "virtio: too many descriptors in indirect table");
            return false;
        }

        iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
        if (iov[num_sg].iov_base == NULL) {
            vu_panic(dev, "virtio: invalid address for buffers");
            return false;
        }
        iov[num_sg].iov_len = len;
        num_sg++;
        sz -= len;
        pa += len;
    }

    *p_num_sg = num_sg;
    return true;
}
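/*
 * Allocate a VuVirtqElement (or the caller's larger wrapper of 'sz' bytes)
 * with the in/out scatter-gather arrays placed in the same allocation, so
 * the whole element can be released with a single free().
 */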
static void *
virtqueue_alloc_element(size_t sz,
                        unsigned out_num, unsigned in_num)
{
    VuVirtqElement *elem;
    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);

    assert(sz >= sizeof(VuVirtqElement));
    elem = malloc(out_sg_end);
    elem->out_num = out_num;
    elem->in_num = in_num;
    elem->in_sg = (void *)elem + in_sg_ofs;
    elem->out_sg = (void *)elem + out_sg_ofs;
    return elem;
}
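/*
 * Translate the descriptor chain starting at 'idx' into a VuVirtqElement,
 * mapping each guest-physical buffer into iovecs (device-readable buffers
 * first, then device-writable ones).  Returns NULL and panics the device
 * on malformed chains.
 */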
static void *
vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
{
    struct vring_desc *desc = vq->vring.desc;
    uint64_t desc_addr, read_len;
    unsigned int desc_len;
    unsigned int max = vq->vring.num;
    unsigned int i = idx;
    VuVirtqElement *elem;
    unsigned int out_num = 0, in_num = 0;
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    int rc;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return NULL;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return NULL;
        }
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            if (!virtqueue_map_desc(dev, &in_num, iov + out_num,
                                    VIRTQUEUE_MAX_SIZE - out_num, true,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        } else {
            if (in_num) {
                vu_panic(dev, "Incorrect order for descriptors");
                return NULL;
            }
            if (!virtqueue_map_desc(dev, &out_num, iov,
                                    VIRTQUEUE_MAX_SIZE, false,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        }

        /* If we've got too many, that implies a descriptor loop. */
        if ((in_num + out_num) > max) {
            vu_panic(dev, "Looped descriptor");
            return NULL;
        }
        rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        vu_panic(dev, "read descriptor error");
        return NULL;
    }

    /* Now copy what we have collected and mapped */
    elem = virtqueue_alloc_element(sz, out_num, in_num);
    elem->index = idx;
    for (i = 0; i < out_num; i++) {
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_sg[i] = iov[out_num + i];
    }

    return elem;
}

static int
vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->desc[desc_idx].counter = vq->counter++;
    vq->inflight->desc[desc_idx].inflight = 1;

    return 0;
}

static int
vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->last_batch_head = desc_idx;

    return 0;
}

static int
vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    barrier();

    vq->inflight->desc[desc_idx].inflight = 0;

    barrier();

    vq->inflight->used_idx = vq->used_idx;

    return 0;
}
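/*
 * Pop the next available element, resubmitting any entries recovered from
 * the inflight region first.  'sz' is the size of the caller's element
 * structure (at least sizeof(VuVirtqElement)); the returned element is
 * heap-allocated and is freed by the caller.
 */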
void *
vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
{
    int i;
    unsigned int head;
    VuVirtqElement *elem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return NULL;
    }

    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
        i = (--vq->resubmit_num);
        elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);

        if (!vq->resubmit_num) {
            free(vq->resubmit_list);
            vq->resubmit_list = NULL;
        }

        return elem;
    }

    if (vu_queue_empty(dev, vq)) {
        return NULL;
    }
    /*
     * Needed after virtio_queue_empty(), see comment in
     * virtqueue_num_heads().
     */
    smp_rmb();

    if (vq->inuse >= vq->vring.num) {
        vu_panic(dev, "Virtqueue size exceeded");
        return NULL;
    }

    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
        return NULL;
    }

    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vq->last_avail_idx);
    }

    elem = vu_queue_map_desc(dev, vq, head, sz);

    if (!elem) {
        return NULL;
    }

    vq->inuse++;

    vu_queue_inflight_get(dev, vq, head);

    return elem;
}

static void
vu_queue_detach_element(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
                        size_t len)
{
    vq->inuse--;
    /* unmap, when DMA support is added */
}

void
vu_queue_unpop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
               size_t len)
{
    vq->last_avail_idx--;
    vu_queue_detach_element(dev, vq, elem, len);
}

bool
vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
{
    if (num > vq->inuse) {
        return false;
    }
    vq->last_avail_idx -= num;
    vq->inuse -= num;
    return true;
}

static inline
void vring_used_write(VuDev *dev, VuVirtq *vq,
                      struct vring_used_elem *uelem, int i)
{
    struct vring_used *used = vq->vring.used;

    used->ring[i] = *uelem;
    vu_log_write(dev, vq->vring.log_guest_addr +
                 offsetof(struct vring_used, ring[i]),
                 sizeof(used->ring[i]));
}

static void
vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
                  const VuVirtqElement *elem,
                  unsigned int len)
{
    struct vring_desc *desc = vq->vring.desc;
    unsigned int i, max, min, desc_len;
    uint64_t desc_addr, read_len;
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    unsigned num_bufs = 0;

    max = vq->vring.num;
    i = elem->index;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return;
        }
        i = 0;
    }

    do {
        if (++num_bufs > max) {
            vu_panic(dev, "Looped descriptor");
            return;
        }

        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            min = MIN(le32toh(desc[i].len), len);
            vu_log_write(dev, le64toh(desc[i].addr), min);
            len -= min;
        }

    } while (len > 0 &&
             (virtqueue_read_next_desc(dev, desc, i, max, &i)
              == VIRTQUEUE_READ_DESC_MORE));
}

void
vu_queue_fill(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem,
              unsigned int len, unsigned int idx)
{
    struct vring_used_elem uelem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    vu_log_queue_fill(dev, vq, elem, len);

    idx = (idx + vq->used_idx) % vq->vring.num;

    uelem.id = htole32(elem->index);
    uelem.len = htole32(len);
    vring_used_write(dev, vq, &uelem, idx);
}

static inline
void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
{
    vq->vring.used->idx = htole16(val);
    vu_log_write(dev,
                 vq->vring.log_guest_addr + offsetof(struct vring_used, idx),
                 sizeof(vq->vring.used->idx));

    vq->used_idx = val;
}

void
vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
{
    uint16_t old, new;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    old = vq->used_idx;
    new = old + count;
    vring_used_idx_set(dev, vq, new);
    vq->inuse -= count;
    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
        vq->signalled_used_valid = false;
    }
}
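/*
 * Return an element to the guest: fill one used entry, publish the new
 * used index, and keep the inflight bookkeeping consistent around the
 * flush so the entry is not replayed after a crash.
 */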
void
vu_queue_push(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem, unsigned int len)
{
    vu_queue_fill(dev, vq, elem, len, 0);
    vu_queue_inflight_pre_put(dev, vq, elem->index);
    vu_queue_flush(dev, vq, 1);
    vu_queue_inflight_post_put(dev, vq, elem->index);
}