libvhost-user.c
  1. /*
  2. * Vhost User library
  3. *
  4. * Copyright IBM, Corp. 2007
  5. * Copyright (c) 2016 Red Hat, Inc.
  6. *
  7. * Authors:
  8. * Anthony Liguori <aliguori@us.ibm.com>
  9. * Marc-André Lureau <mlureau@redhat.com>
  10. * Victor Kaplansky <victork@redhat.com>
  11. *
  12. * This work is licensed under the terms of the GNU GPL, version 2 or
  13. * later. See the COPYING file in the top-level directory.
  14. */
  15. #ifndef _GNU_SOURCE
  16. #define _GNU_SOURCE
  17. #endif
  18. /* this code avoids GLib dependency */
  19. #include <stdlib.h>
  20. #include <stdio.h>
  21. #include <unistd.h>
  22. #include <stdarg.h>
  23. #include <errno.h>
  24. #include <string.h>
  25. #include <assert.h>
  26. #include <inttypes.h>
  27. #include <sys/types.h>
  28. #include <sys/socket.h>
  29. #include <sys/eventfd.h>
  30. #include <sys/mman.h>
  31. #include <endian.h>
  32. /* Necessary to provide VIRTIO_F_VERSION_1 on system
  33. * with older linux headers. Must appear before
  34. * <linux/vhost.h> below.
  35. */
  36. #include "standard-headers/linux/virtio_config.h"
  37. #if defined(__linux__)
  38. #include <sys/syscall.h>
  39. #include <fcntl.h>
  40. #include <sys/ioctl.h>
  41. #include <linux/vhost.h>
  42. #include <sys/vfs.h>
  43. #include <linux/magic.h>
  44. #ifdef __NR_userfaultfd
  45. #include <linux/userfaultfd.h>
  46. #endif
  47. #endif
  48. #include "include/atomic.h"
  49. #include "libvhost-user.h"
  50. /* usually provided by GLib */
  51. #if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
  52. #if !defined(__clang__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 4)
  53. #define G_GNUC_PRINTF(format_idx, arg_idx) \
  54. __attribute__((__format__(gnu_printf, format_idx, arg_idx)))
  55. #else
  56. #define G_GNUC_PRINTF(format_idx, arg_idx) \
  57. __attribute__((__format__(__printf__, format_idx, arg_idx)))
  58. #endif
  59. #else /* !__GNUC__ */
  60. #define G_GNUC_PRINTF(format_idx, arg_idx)
  61. #endif /* !__GNUC__ */
  62. #ifndef MIN
  63. #define MIN(x, y) ({ \
  64. __typeof__(x) _min1 = (x); \
  65. __typeof__(y) _min2 = (y); \
  66. (void) (&_min1 == &_min2); \
  67. _min1 < _min2 ? _min1 : _min2; })
  68. #endif
  69. /* Round number down to multiple */
  70. #define ALIGN_DOWN(n, m) ((n) / (m) * (m))
  71. /* Round number up to multiple */
  72. #define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
  73. #ifndef unlikely
  74. #define unlikely(x) __builtin_expect(!!(x), 0)
  75. #endif
  76. /* Align each region to cache line size in inflight buffer */
  77. #define INFLIGHT_ALIGNMENT 64
  78. /* The version of inflight buffer */
  79. #define INFLIGHT_VERSION 1
  80. /* The version of the protocol we support */
  81. #define VHOST_USER_VERSION 1
  82. #define LIBVHOST_USER_DEBUG 0
  83. #define DPRINT(...) \
  84. do { \
  85. if (LIBVHOST_USER_DEBUG) { \
  86. fprintf(stderr, __VA_ARGS__); \
  87. } \
  88. } while (0)
  89. static inline
  90. bool has_feature(uint64_t features, unsigned int fbit)
  91. {
  92. assert(fbit < 64);
  93. return !!(features & (1ULL << fbit));
  94. }
  95. static inline
  96. bool vu_has_feature(VuDev *dev,
  97. unsigned int fbit)
  98. {
  99. return has_feature(dev->features, fbit);
  100. }
  101. static inline bool vu_has_protocol_feature(VuDev *dev, unsigned int fbit)
  102. {
  103. return has_feature(dev->protocol_features, fbit);
  104. }
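/*
 * Illustrative sketch, not part of the original file: how a backend might
 * consult the negotiated feature bits with the helpers above. The helper
 * name "my_device_uses_event_idx" is hypothetical; the block is excluded
 * from the build on purpose.
 */
#if 0
static bool
my_device_uses_event_idx(VuDev *dev)
{
    /* Only meaningful after VHOST_USER_SET_FEATURES has been handled. */
    return vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX);
}
#endif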
  105. const char *
  106. vu_request_to_string(unsigned int req)
  107. {
  108. #define REQ(req) [req] = #req
  109. static const char *vu_request_str[] = {
  110. REQ(VHOST_USER_NONE),
  111. REQ(VHOST_USER_GET_FEATURES),
  112. REQ(VHOST_USER_SET_FEATURES),
  113. REQ(VHOST_USER_SET_OWNER),
  114. REQ(VHOST_USER_RESET_OWNER),
  115. REQ(VHOST_USER_SET_MEM_TABLE),
  116. REQ(VHOST_USER_SET_LOG_BASE),
  117. REQ(VHOST_USER_SET_LOG_FD),
  118. REQ(VHOST_USER_SET_VRING_NUM),
  119. REQ(VHOST_USER_SET_VRING_ADDR),
  120. REQ(VHOST_USER_SET_VRING_BASE),
  121. REQ(VHOST_USER_GET_VRING_BASE),
  122. REQ(VHOST_USER_SET_VRING_KICK),
  123. REQ(VHOST_USER_SET_VRING_CALL),
  124. REQ(VHOST_USER_SET_VRING_ERR),
  125. REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
  126. REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
  127. REQ(VHOST_USER_GET_QUEUE_NUM),
  128. REQ(VHOST_USER_SET_VRING_ENABLE),
  129. REQ(VHOST_USER_SEND_RARP),
  130. REQ(VHOST_USER_NET_SET_MTU),
  131. REQ(VHOST_USER_SET_BACKEND_REQ_FD),
  132. REQ(VHOST_USER_IOTLB_MSG),
  133. REQ(VHOST_USER_SET_VRING_ENDIAN),
  134. REQ(VHOST_USER_GET_CONFIG),
  135. REQ(VHOST_USER_SET_CONFIG),
  136. REQ(VHOST_USER_POSTCOPY_ADVISE),
  137. REQ(VHOST_USER_POSTCOPY_LISTEN),
  138. REQ(VHOST_USER_POSTCOPY_END),
  139. REQ(VHOST_USER_GET_INFLIGHT_FD),
  140. REQ(VHOST_USER_SET_INFLIGHT_FD),
  141. REQ(VHOST_USER_GPU_SET_SOCKET),
  142. REQ(VHOST_USER_VRING_KICK),
  143. REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
  144. REQ(VHOST_USER_ADD_MEM_REG),
  145. REQ(VHOST_USER_REM_MEM_REG),
  146. REQ(VHOST_USER_GET_SHARED_OBJECT),
  147. REQ(VHOST_USER_MAX),
  148. };
  149. #undef REQ
  150. if (req < VHOST_USER_MAX) {
  151. return vu_request_str[req];
  152. } else {
  153. return "unknown";
  154. }
  155. }
  156. static void G_GNUC_PRINTF(2, 3)
  157. vu_panic(VuDev *dev, const char *msg, ...)
  158. {
  159. char *buf = NULL;
  160. va_list ap;
  161. va_start(ap, msg);
  162. if (vasprintf(&buf, msg, ap) < 0) {
  163. buf = NULL;
  164. }
  165. va_end(ap);
  166. dev->broken = true;
  167. dev->panic(dev, buf);
  168. free(buf);
  169. /*
  170. * FIXME:
  171. * find a way to call virtio_error, or perhaps close the connection?
  172. */
  173. }
  174. /* Search for a memory region that covers this guest physical address. */
  175. static VuDevRegion *
  176. vu_gpa_to_mem_region(VuDev *dev, uint64_t guest_addr)
  177. {
  178. int low = 0;
  179. int high = dev->nregions - 1;
  180. /*
  181. * Memory regions cannot overlap in guest physical address space. Each
  182. * GPA belongs to exactly one memory region, so there can only be one
  183. * match.
  184. *
  185. * We store our memory regions ordered by GPA and can simply perform a
  186. * binary search.
  187. */
  188. while (low <= high) {
  189. unsigned int mid = low + (high - low) / 2;
  190. VuDevRegion *cur = &dev->regions[mid];
  191. if (guest_addr >= cur->gpa && guest_addr < cur->gpa + cur->size) {
  192. return cur;
  193. }
  194. if (guest_addr >= cur->gpa + cur->size) {
  195. low = mid + 1;
  196. }
  197. if (guest_addr < cur->gpa) {
  198. high = mid - 1;
  199. }
  200. }
  201. return NULL;
  202. }
  203. /* Translate guest physical address to our virtual address. */
  204. void *
  205. vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
  206. {
  207. VuDevRegion *r;
  208. if (*plen == 0) {
  209. return NULL;
  210. }
  211. r = vu_gpa_to_mem_region(dev, guest_addr);
  212. if (!r) {
  213. return NULL;
  214. }
  215. if ((guest_addr + *plen) > (r->gpa + r->size)) {
  216. *plen = r->gpa + r->size - guest_addr;
  217. }
  218. return (void *)(uintptr_t)guest_addr - r->gpa + r->mmap_addr +
  219. r->mmap_offset;
  220. }
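/*
 * Illustrative sketch, not part of the original file: because *plen is
 * clamped to the end of the containing region, callers whose buffers may
 * cross a region boundary should translate in a loop. The helper name
 * "my_copy_from_guest" is hypothetical and only demonstrates the contract
 * of vu_gpa_to_va(); the block is excluded from the build.
 */
#if 0
static bool
my_copy_from_guest(VuDev *dev, void *dst, uint64_t guest_addr, uint64_t len)
{
    while (len) {
        uint64_t chunk = len;
        void *src = vu_gpa_to_va(dev, &chunk, guest_addr);

        if (!src || !chunk) {
            return false;          /* address not covered by any region */
        }
        memcpy(dst, src, chunk);   /* chunk may be smaller than len */
        dst = (uint8_t *)dst + chunk;
        guest_addr += chunk;
        len -= chunk;
    }
    return true;
}
#endif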
  221. /* Translate qemu virtual address to our virtual address. */
  222. static void *
  223. qva_to_va(VuDev *dev, uint64_t qemu_addr)
  224. {
  225. unsigned int i;
  226. /* Find matching memory region. */
  227. for (i = 0; i < dev->nregions; i++) {
  228. VuDevRegion *r = &dev->regions[i];
  229. if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
  230. return (void *)(uintptr_t)
  231. qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
  232. }
  233. }
  234. return NULL;
  235. }
  236. static void
  237. vu_remove_all_mem_regs(VuDev *dev)
  238. {
  239. unsigned int i;
  240. for (i = 0; i < dev->nregions; i++) {
  241. VuDevRegion *r = &dev->regions[i];
  242. munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset);
  243. }
  244. dev->nregions = 0;
  245. }
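/*
 * Map the descriptor, used and available rings of @vq from the QEMU virtual
 * addresses recorded in vq->vra. Returns true on failure, i.e. when any of
 * the three ring addresses could not be translated.
 */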
  246. static bool
  247. map_ring(VuDev *dev, VuVirtq *vq)
  248. {
  249. vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr);
  250. vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr);
  251. vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr);
  252. DPRINT("Setting virtq addresses:\n");
  253. DPRINT(" vring_desc at %p\n", vq->vring.desc);
  254. DPRINT(" vring_used at %p\n", vq->vring.used);
  255. DPRINT(" vring_avail at %p\n", vq->vring.avail);
  256. return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
  257. }
  258. static bool
  259. vu_is_vq_usable(VuDev *dev, VuVirtq *vq)
  260. {
  261. if (unlikely(dev->broken)) {
  262. return false;
  263. }
  264. if (likely(vq->vring.avail)) {
  265. return true;
  266. }
  267. /*
  268. * In corner cases, we might temporarily remove a memory region that
  269. * mapped a ring. When removing a memory region we make sure to
  270. * unmap any rings that would be impacted. Let's try to remap if we
  271. * already succeeded mapping this ring once.
  272. */
  273. if (!vq->vra.desc_user_addr || !vq->vra.used_user_addr ||
  274. !vq->vra.avail_user_addr) {
  275. return false;
  276. }
  277. if (map_ring(dev, vq)) {
  278. vu_panic(dev, "remapping queue on access");
  279. return false;
  280. }
  281. return true;
  282. }
  283. static void
  284. unmap_rings(VuDev *dev, VuDevRegion *r)
  285. {
  286. int i;
  287. for (i = 0; i < dev->max_queues; i++) {
  288. VuVirtq *vq = &dev->vq[i];
  289. const uintptr_t desc = (uintptr_t)vq->vring.desc;
  290. const uintptr_t used = (uintptr_t)vq->vring.used;
  291. const uintptr_t avail = (uintptr_t)vq->vring.avail;
  292. if (desc < r->mmap_addr || desc >= r->mmap_addr + r->size) {
  293. continue;
  294. }
  295. if (used < r->mmap_addr || used >= r->mmap_addr + r->size) {
  296. continue;
  297. }
  298. if (avail < r->mmap_addr || avail >= r->mmap_addr + r->size) {
  299. continue;
  300. }
  301. DPRINT("Unmapping rings of queue %d\n", i);
  302. vq->vring.desc = NULL;
  303. vq->vring.used = NULL;
  304. vq->vring.avail = NULL;
  305. }
  306. }
  307. static size_t
  308. get_fd_hugepagesize(int fd)
  309. {
  310. #if defined(__linux__)
  311. struct statfs fs;
  312. int ret;
  313. do {
  314. ret = fstatfs(fd, &fs);
  315. } while (ret != 0 && errno == EINTR);
  316. if (!ret && (unsigned int)fs.f_type == HUGETLBFS_MAGIC) {
  317. return fs.f_bsize;
  318. }
  319. #endif
  320. return 0;
  321. }
  322. static void
  323. _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd)
  324. {
  325. const uint64_t start_gpa = msg_region->guest_phys_addr;
  326. const uint64_t end_gpa = start_gpa + msg_region->memory_size;
  327. int prot = PROT_READ | PROT_WRITE;
  328. uint64_t mmap_offset, fd_offset;
  329. size_t hugepagesize;
  330. VuDevRegion *r;
  331. void *mmap_addr;
  332. int low = 0;
  333. int high = dev->nregions - 1;
  334. unsigned int idx;
  335. DPRINT("Adding region %d\n", dev->nregions);
  336. DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
  337. msg_region->guest_phys_addr);
  338. DPRINT(" memory_size: 0x%016"PRIx64"\n",
  339. msg_region->memory_size);
  340. DPRINT(" userspace_addr: 0x%016"PRIx64"\n",
  341. msg_region->userspace_addr);
  342. DPRINT(" old mmap_offset: 0x%016"PRIx64"\n",
  343. msg_region->mmap_offset);
  344. if (dev->postcopy_listening) {
  345. /*
  346. * In postcopy we're using PROT_NONE here to catch anyone
  347. * accessing it before we userfault
  348. */
  349. prot = PROT_NONE;
  350. }
  351. /*
  352. * We will add memory regions into the array sorted by GPA. Perform a
  353. * binary search to locate the insertion point: it will be at the low
  354. * index.
  355. */
  356. while (low <= high) {
  357. unsigned int mid = low + (high - low) / 2;
  358. VuDevRegion *cur = &dev->regions[mid];
  359. /* Overlap of GPA addresses. */
  360. if (start_gpa < cur->gpa + cur->size && cur->gpa < end_gpa) {
  361. vu_panic(dev, "regions with overlapping guest physical addresses");
  362. return;
  363. }
  364. if (start_gpa >= cur->gpa + cur->size) {
  365. low = mid + 1;
  366. }
  367. if (start_gpa < cur->gpa) {
  368. high = mid - 1;
  369. }
  370. }
  371. idx = low;
  372. /*
  373. * Convert most of msg_region->mmap_offset to fd_offset. In almost all
  374. * cases, this will leave us with mmap_offset == 0, mmap()'ing only
  375. * what we really need. Only if a memory region would partially cover
  376. * hugetlb pages, we'd get mmap_offset != 0, which usually doesn't happen
  377. * anymore (i.e., modern QEMU).
  378. *
  379. * Note that mmap() with hugetlb would fail if the offset into the file
  380. * is not aligned to the huge page size.
  381. */
  382. hugepagesize = get_fd_hugepagesize(fd);
  383. if (hugepagesize) {
  384. fd_offset = ALIGN_DOWN(msg_region->mmap_offset, hugepagesize);
  385. mmap_offset = msg_region->mmap_offset - fd_offset;
  386. } else {
  387. fd_offset = msg_region->mmap_offset;
  388. mmap_offset = 0;
  389. }
  390. DPRINT(" fd_offset: 0x%016"PRIx64"\n",
  391. fd_offset);
  392. DPRINT(" new mmap_offset: 0x%016"PRIx64"\n",
  393. mmap_offset);
  394. mmap_addr = mmap(0, msg_region->memory_size + mmap_offset,
  395. prot, MAP_SHARED | MAP_NORESERVE, fd, fd_offset);
  396. if (mmap_addr == MAP_FAILED) {
  397. vu_panic(dev, "region mmap error: %s", strerror(errno));
  398. return;
  399. }
  400. DPRINT(" mmap_addr: 0x%016"PRIx64"\n",
  401. (uint64_t)(uintptr_t)mmap_addr);
  402. #if defined(__linux__)
  403. /* Don't include all guest memory in a coredump. */
  404. madvise(mmap_addr, msg_region->memory_size + mmap_offset,
  405. MADV_DONTDUMP);
  406. #endif
  407. /* Shift all affected entries by 1 to open a hole at idx. */
  408. r = &dev->regions[idx];
  409. memmove(r + 1, r, sizeof(VuDevRegion) * (dev->nregions - idx));
  410. r->gpa = msg_region->guest_phys_addr;
  411. r->size = msg_region->memory_size;
  412. r->qva = msg_region->userspace_addr;
  413. r->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
  414. r->mmap_offset = mmap_offset;
  415. dev->nregions++;
  416. if (dev->postcopy_listening) {
  417. /*
  418. * Return the address to QEMU so that it can translate the ufd
  419. * fault addresses back.
  420. */
  421. msg_region->userspace_addr = r->mmap_addr + r->mmap_offset;
  422. }
  423. }
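/*
 * Worked example of the fd_offset/mmap_offset split above (illustrative
 * numbers only): with a 2 MiB hugetlb page size and a requested
 * mmap_offset of 0x2345000, ALIGN_DOWN(0x2345000, 0x200000) yields
 * fd_offset = 0x2200000 and mmap_offset = 0x145000, so the mapping starts
 * at a huge-page-aligned file offset and the region's data begins
 * mmap_offset bytes into the mapping. For regular files the whole requested
 * offset is passed to mmap() and mmap_offset stays 0.
 */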
  424. static void
  425. vmsg_close_fds(VhostUserMsg *vmsg)
  426. {
  427. int i;
  428. for (i = 0; i < vmsg->fd_num; i++) {
  429. close(vmsg->fds[i]);
  430. }
  431. }
  432. /* Set reply payload.u64 and clear request flags and fd_num */
  433. static void vmsg_set_reply_u64(VhostUserMsg *vmsg, uint64_t val)
  434. {
  435. vmsg->flags = 0; /* defaults will be set by vu_send_reply() */
  436. vmsg->size = sizeof(vmsg->payload.u64);
  437. vmsg->payload.u64 = val;
  438. vmsg->fd_num = 0;
  439. }
  440. /* A test to see if we have userfault available */
  441. static bool
  442. have_userfault(void)
  443. {
  444. #if defined(__linux__) && defined(__NR_userfaultfd) &&\
  445. defined(UFFD_FEATURE_MISSING_SHMEM) &&\
  446. defined(UFFD_FEATURE_MISSING_HUGETLBFS)
  447. /* Now test that the kernel we're running on really has the features */
  448. int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
  449. struct uffdio_api api_struct;
  450. if (ufd < 0) {
  451. return false;
  452. }
  453. api_struct.api = UFFD_API;
  454. api_struct.features = UFFD_FEATURE_MISSING_SHMEM |
  455. UFFD_FEATURE_MISSING_HUGETLBFS;
  456. if (ioctl(ufd, UFFDIO_API, &api_struct)) {
  457. close(ufd);
  458. return false;
  459. }
  460. close(ufd);
  461. return true;
  462. #else
  463. return false;
  464. #endif
  465. }
  466. static bool
  467. vu_message_read_default(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
  468. {
  469. char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
  470. struct iovec iov = {
  471. .iov_base = (char *)vmsg,
  472. .iov_len = VHOST_USER_HDR_SIZE,
  473. };
  474. struct msghdr msg = {
  475. .msg_iov = &iov,
  476. .msg_iovlen = 1,
  477. .msg_control = control,
  478. .msg_controllen = sizeof(control),
  479. };
  480. size_t fd_size;
  481. struct cmsghdr *cmsg;
  482. int rc;
  483. do {
  484. rc = recvmsg(conn_fd, &msg, 0);
  485. } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
  486. if (rc < 0) {
  487. vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
  488. return false;
  489. }
  490. vmsg->fd_num = 0;
  491. for (cmsg = CMSG_FIRSTHDR(&msg);
  492. cmsg != NULL;
  493. cmsg = CMSG_NXTHDR(&msg, cmsg))
  494. {
  495. if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
  496. fd_size = cmsg->cmsg_len - CMSG_LEN(0);
  497. vmsg->fd_num = fd_size / sizeof(int);
  498. assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS);
  499. memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
  500. break;
  501. }
  502. }
  503. if (vmsg->size > sizeof(vmsg->payload)) {
  504. vu_panic(dev,
  505. "Error: too big message request: %d, size: vmsg->size: %u, "
  506. "while sizeof(vmsg->payload) = %zu\n",
  507. vmsg->request, vmsg->size, sizeof(vmsg->payload));
  508. goto fail;
  509. }
  510. if (vmsg->size) {
  511. do {
  512. rc = read(conn_fd, &vmsg->payload, vmsg->size);
  513. } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
  514. if (rc <= 0) {
  515. vu_panic(dev, "Error while reading: %s", strerror(errno));
  516. goto fail;
  517. }
  518. assert((uint32_t)rc == vmsg->size);
  519. }
  520. return true;
  521. fail:
  522. vmsg_close_fds(vmsg);
  523. return false;
  524. }
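/*
 * Wire format handled above: a fixed-size header (request, flags, size,
 * i.e. VHOST_USER_HDR_SIZE bytes) is received first, with any file
 * descriptors attached as SCM_RIGHTS ancillary data on that same recvmsg();
 * the variable-length payload of vmsg->size bytes then follows on the
 * stream.
 */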
  525. static bool
  526. vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
  527. {
  528. int rc;
  529. uint8_t *p = (uint8_t *)vmsg;
  530. char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
  531. struct iovec iov = {
  532. .iov_base = (char *)vmsg,
  533. .iov_len = VHOST_USER_HDR_SIZE,
  534. };
  535. struct msghdr msg = {
  536. .msg_iov = &iov,
  537. .msg_iovlen = 1,
  538. .msg_control = control,
  539. };
  540. struct cmsghdr *cmsg;
  541. memset(control, 0, sizeof(control));
  542. assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS);
  543. if (vmsg->fd_num > 0) {
  544. size_t fdsize = vmsg->fd_num * sizeof(int);
  545. msg.msg_controllen = CMSG_SPACE(fdsize);
  546. cmsg = CMSG_FIRSTHDR(&msg);
  547. cmsg->cmsg_len = CMSG_LEN(fdsize);
  548. cmsg->cmsg_level = SOL_SOCKET;
  549. cmsg->cmsg_type = SCM_RIGHTS;
  550. memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
  551. } else {
  552. msg.msg_controllen = 0;
  553. msg.msg_control = NULL;
  554. }
  555. do {
  556. rc = sendmsg(conn_fd, &msg, 0);
  557. } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
  558. if (rc <= 0) {
  559. vu_panic(dev, "Error while writing: %s", strerror(errno));
  560. return false;
  561. }
  562. if (vmsg->size) {
  563. do {
  564. if (vmsg->data) {
  565. rc = write(conn_fd, vmsg->data, vmsg->size);
  566. } else {
  567. rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
  568. }
  569. } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
  570. }
  571. if (rc <= 0) {
  572. vu_panic(dev, "Error while writing: %s", strerror(errno));
  573. return false;
  574. }
  575. return true;
  576. }
  577. static bool
  578. vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
  579. {
  580. /* Set the version in the flags when sending the reply */
  581. vmsg->flags &= ~VHOST_USER_VERSION_MASK;
  582. vmsg->flags |= VHOST_USER_VERSION;
  583. vmsg->flags |= VHOST_USER_REPLY_MASK;
  584. return vu_message_write(dev, conn_fd, vmsg);
  585. }
  586. /*
  587. * Processes a reply on the backend channel.
  588. * Entered with backend_mutex held and releases it before exit.
  589. * Returns true on success.
  590. */
  591. static bool
  592. vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg)
  593. {
  594. VhostUserMsg msg_reply;
  595. bool result = false;
  596. if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
  597. result = true;
  598. goto out;
  599. }
  600. if (!vu_message_read_default(dev, dev->backend_fd, &msg_reply)) {
  601. goto out;
  602. }
  603. if (msg_reply.request != vmsg->request) {
  604. DPRINT("Received unexpected msg type. Expected %d received %d",
  605. vmsg->request, msg_reply.request);
  606. goto out;
  607. }
  608. result = msg_reply.payload.u64 == 0;
  609. out:
  610. pthread_mutex_unlock(&dev->backend_mutex);
  611. return result;
  612. }
  613. /* Kick the log_call_fd if required. */
  614. static void
  615. vu_log_kick(VuDev *dev)
  616. {
  617. if (dev->log_call_fd != -1) {
  618. DPRINT("Kicking the QEMU's log...\n");
  619. if (eventfd_write(dev->log_call_fd, 1) < 0) {
  620. vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
  621. }
  622. }
  623. }
  624. static void
  625. vu_log_page(uint8_t *log_table, uint64_t page)
  626. {
  627. DPRINT("Logged dirty guest page: %"PRId64"\n", page);
  628. qatomic_or(&log_table[page / 8], 1 << (page % 8));
  629. }
  630. static void
  631. vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
  632. {
  633. uint64_t page;
  634. if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
  635. !dev->log_table || !length) {
  636. return;
  637. }
  638. assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));
  639. page = address / VHOST_LOG_PAGE;
  640. while (page * VHOST_LOG_PAGE < address + length) {
  641. vu_log_page(dev->log_table, page);
  642. page += 1;
  643. }
  644. vu_log_kick(dev);
  645. }
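/*
 * Worked example of the dirty-log indexing above (illustrative numbers
 * only, assuming the usual VHOST_LOG_PAGE of 0x1000): a write to guest
 * physical address 0x12345 touches page 0x12, which is recorded by OR-ing
 * bit (0x12 % 8) = 2 into log_table[0x12 / 8] = log_table[2]. Writes that
 * span several pages set one bit per touched page.
 */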
  646. static void
  647. vu_kick_cb(VuDev *dev, int condition, void *data)
  648. {
  649. int index = (intptr_t)data;
  650. VuVirtq *vq = &dev->vq[index];
  651. int sock = vq->kick_fd;
  652. eventfd_t kick_data;
  653. ssize_t rc;
  654. rc = eventfd_read(sock, &kick_data);
  655. if (rc == -1) {
  656. vu_panic(dev, "kick eventfd_read(): %s", strerror(errno));
  657. dev->remove_watch(dev, dev->vq[index].kick_fd);
  658. } else {
  659. DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n",
  660. kick_data, vq->handler, index);
  661. if (vq->handler) {
  662. vq->handler(dev, index);
  663. }
  664. }
  665. }
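/*
 * Convention for the vu_*_exec() message handlers below: they return true
 * when vmsg has been rewritten in place and should be sent back to the
 * frontend as a reply, and false when no reply is needed.
 */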
  666. static bool
  667. vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
  668. {
  669. vmsg->payload.u64 =
  670. /*
  671. * The following VIRTIO feature bits are supported by our virtqueue
  672. * implementation:
  673. */
  674. 1ULL << VIRTIO_F_NOTIFY_ON_EMPTY |
  675. 1ULL << VIRTIO_RING_F_INDIRECT_DESC |
  676. 1ULL << VIRTIO_RING_F_EVENT_IDX |
  677. 1ULL << VIRTIO_F_VERSION_1 |
  678. /* vhost-user feature bits */
  679. 1ULL << VHOST_F_LOG_ALL |
  680. 1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
  681. if (dev->iface->get_features) {
  682. vmsg->payload.u64 |= dev->iface->get_features(dev);
  683. }
  684. vmsg->size = sizeof(vmsg->payload.u64);
  685. vmsg->fd_num = 0;
  686. DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
  687. return true;
  688. }
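/*
 * Illustrative sketch, not part of the original file: a device
 * implementation contributes its device-specific feature bits through the
 * VuDevIface get_features callback, which the code above ORs into the
 * reply. "my_get_features" is a hypothetical name; the block is excluded
 * from the build.
 */
#if 0
static uint64_t
my_get_features(VuDev *dev)
{
    (void)dev;
    /* Device-class feature bits (e.g. VIRTIO_NET_F_*) would be ORed in here. */
    return 0;
}
#endif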
  689. static void
  690. vu_set_enable_all_rings(VuDev *dev, bool enabled)
  691. {
  692. uint16_t i;
  693. for (i = 0; i < dev->max_queues; i++) {
  694. dev->vq[i].enable = enabled;
  695. }
  696. }
  697. static bool
  698. vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg)
  699. {
  700. DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
  701. dev->features = vmsg->payload.u64;
  702. if (!vu_has_feature(dev, VIRTIO_F_VERSION_1)) {
  703. /*
  704. * We only support devices conforming to VIRTIO 1.0 or
  705. * later
  706. */
  707. vu_panic(dev, "virtio legacy devices aren't supported by libvhost-user");
  708. return false;
  709. }
  710. if (!vu_has_feature(dev, VHOST_USER_F_PROTOCOL_FEATURES)) {
  711. vu_set_enable_all_rings(dev, true);
  712. }
  713. if (dev->iface->set_features) {
  714. dev->iface->set_features(dev, dev->features);
  715. }
  716. return false;
  717. }
  718. static bool
  719. vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg)
  720. {
  721. return false;
  722. }
  723. static void
  724. vu_close_log(VuDev *dev)
  725. {
  726. if (dev->log_table) {
  727. if (munmap(dev->log_table, dev->log_size) != 0) {
  728. perror("close log munmap() error");
  729. }
  730. dev->log_table = NULL;
  731. }
  732. if (dev->log_call_fd != -1) {
  733. close(dev->log_call_fd);
  734. dev->log_call_fd = -1;
  735. }
  736. }
  737. static bool
  738. vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
  739. {
  740. vu_set_enable_all_rings(dev, false);
  741. return false;
  742. }
  743. static bool
  744. generate_faults(VuDev *dev) {
  745. unsigned int i;
  746. for (i = 0; i < dev->nregions; i++) {
  747. #ifdef UFFDIO_REGISTER
  748. VuDevRegion *dev_region = &dev->regions[i];
  749. int ret;
  750. struct uffdio_register reg_struct;
  751. /*
  752. * We should already have an open userfaultfd. Register each memory
  753. * range with it.
  754. * Discard any mapping we have here; note I can't use MADV_REMOVE
  755. * or fallocate to make the hole since I don't want to lose
  756. * data that's already arrived in the shared process.
  757. * TODO: How to handle hugepages.
  758. */
  759. ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
  760. dev_region->size + dev_region->mmap_offset,
  761. MADV_DONTNEED);
  762. if (ret) {
  763. fprintf(stderr,
  764. "%s: Failed to madvise(DONTNEED) region %d: %s\n",
  765. __func__, i, strerror(errno));
  766. }
  767. /*
  768. * Turn off transparent hugepages so we don't lose wakeups
  769. * in neighbouring pages.
  770. * TODO: Turn this back on later.
  771. */
  772. ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
  773. dev_region->size + dev_region->mmap_offset,
  774. MADV_NOHUGEPAGE);
  775. if (ret) {
  776. /*
  777. * Note: This can happen legally on kernels that are configured
  778. * without madvise'able hugepages
  779. */
  780. fprintf(stderr,
  781. "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
  782. __func__, i, strerror(errno));
  783. }
  784. reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
  785. reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
  786. reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
  787. if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
  788. vu_panic(dev, "%s: Failed to userfault region %d "
  789. "@%" PRIx64 " + size:%" PRIx64 " offset: %" PRIx64
  790. ": (ufd=%d)%s\n",
  791. __func__, i,
  792. dev_region->mmap_addr,
  793. dev_region->size, dev_region->mmap_offset,
  794. dev->postcopy_ufd, strerror(errno));
  795. return false;
  796. }
  797. if (!(reg_struct.ioctls & (1ULL << _UFFDIO_COPY))) {
  798. vu_panic(dev, "%s Region (%d) doesn't support COPY",
  799. __func__, i);
  800. return false;
  801. }
  802. DPRINT("%s: region %d: Registered userfault for %"
  803. PRIx64 " + %" PRIx64 "\n", __func__, i,
  804. (uint64_t)reg_struct.range.start,
  805. (uint64_t)reg_struct.range.len);
  806. /* Now it's registered we can let the client at it */
  807. if (mprotect((void *)(uintptr_t)dev_region->mmap_addr,
  808. dev_region->size + dev_region->mmap_offset,
  809. PROT_READ | PROT_WRITE)) {
  810. vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
  811. i, strerror(errno));
  812. return false;
  813. }
  814. /* TODO: Stash 'zero' support flags somewhere */
  815. #endif
  816. }
  817. return true;
  818. }
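/*
 * Summary of the postcopy registration sequence above: each region is
 * madvise(MADV_DONTNEED)-ed so no pages are resident, transparent huge
 * pages are disabled for it, the range is registered with the userfaultfd
 * in MISSING mode, and only then is it mprotect-ed back to read/write so
 * guest-visible accesses start triggering faults that QEMU can serve.
 */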
  819. static bool
  820. vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
  821. VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
  822. if (vmsg->fd_num != 1) {
  823. vmsg_close_fds(vmsg);
  824. vu_panic(dev, "VHOST_USER_ADD_MEM_REG received %d fds - only 1 fd "
  825. "should be sent for this message type", vmsg->fd_num);
  826. return false;
  827. }
  828. if (vmsg->size < VHOST_USER_MEM_REG_SIZE) {
  829. close(vmsg->fds[0]);
  830. vu_panic(dev, "VHOST_USER_ADD_MEM_REG requires a message size of at "
  831. "least %zu bytes and only %d bytes were received",
  832. VHOST_USER_MEM_REG_SIZE, vmsg->size);
  833. return false;
  834. }
  835. if (dev->nregions == VHOST_USER_MAX_RAM_SLOTS) {
  836. close(vmsg->fds[0]);
  837. vu_panic(dev, "failing attempt to hot add memory via "
  838. "VHOST_USER_ADD_MEM_REG message because the backend has "
  839. "no free ram slots available");
  840. return false;
  841. }
  842. /*
  843. * If we are in postcopy mode and we receive a u64 payload with a 0 value
  844. * we know all the postcopy client bases have been received, and we
  845. * should start generating faults.
  846. */
  847. if (dev->postcopy_listening &&
  848. vmsg->size == sizeof(vmsg->payload.u64) &&
  849. vmsg->payload.u64 == 0) {
  850. (void)generate_faults(dev);
  851. return false;
  852. }
  853. _vu_add_mem_reg(dev, msg_region, vmsg->fds[0]);
  854. close(vmsg->fds[0]);
  855. if (dev->postcopy_listening) {
  856. /* Send the message back to qemu with the addresses filled in. */
  857. vmsg->fd_num = 0;
  858. DPRINT("Successfully added new region in postcopy\n");
  859. return true;
  860. }
  861. DPRINT("Successfully added new region\n");
  862. return false;
  863. }
  864. static inline bool reg_equal(VuDevRegion *vudev_reg,
  865. VhostUserMemoryRegion *msg_reg)
  866. {
  867. if (vudev_reg->gpa == msg_reg->guest_phys_addr &&
  868. vudev_reg->qva == msg_reg->userspace_addr &&
  869. vudev_reg->size == msg_reg->memory_size) {
  870. return true;
  871. }
  872. return false;
  873. }
  874. static bool
  875. vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
  876. VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
  877. unsigned int idx;
  878. VuDevRegion *r;
  879. if (vmsg->fd_num > 1) {
  880. vmsg_close_fds(vmsg);
  881. vu_panic(dev, "VHOST_USER_REM_MEM_REG received %d fds - at most 1 fd "
  882. "should be sent for this message type", vmsg->fd_num);
  883. return false;
  884. }
  885. if (vmsg->size < VHOST_USER_MEM_REG_SIZE) {
  886. vmsg_close_fds(vmsg);
  887. vu_panic(dev, "VHOST_USER_REM_MEM_REG requires a message size of at "
  888. "least %zu bytes and only %d bytes were received",
  889. VHOST_USER_MEM_REG_SIZE, vmsg->size);
  890. return false;
  891. }
  892. DPRINT("Removing region:\n");
  893. DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
  894. msg_region->guest_phys_addr);
  895. DPRINT(" memory_size: 0x%016"PRIx64"\n",
  896. msg_region->memory_size);
  897. DPRINT(" userspace_addr 0x%016"PRIx64"\n",
  898. msg_region->userspace_addr);
  899. DPRINT(" mmap_offset 0x%016"PRIx64"\n",
  900. msg_region->mmap_offset);
  901. r = vu_gpa_to_mem_region(dev, msg_region->guest_phys_addr);
  902. if (!r || !reg_equal(r, msg_region)) {
  903. vmsg_close_fds(vmsg);
  904. vu_panic(dev, "Specified region not found\n");
  905. return false;
  906. }
  907. /*
  908. * There might be valid cases where we temporarily remove memory regions
  909. * to re-add them later, or remove memory regions and don't use the rings
  910. * anymore before we set the ring addresses and restart the device.
  911. *
  912. * Unmap all affected rings, remapping them on demand later. This should
  913. * be a corner case.
  914. */
  915. unmap_rings(dev, r);
  916. munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset);
  917. idx = r - dev->regions;
  918. assert(idx < dev->nregions);
  919. /* Shift all affected entries by 1 to close the hole. */
  920. memmove(r, r + 1, sizeof(VuDevRegion) * (dev->nregions - idx - 1));
  921. DPRINT("Successfully removed a region\n");
  922. dev->nregions--;
  923. vmsg_close_fds(vmsg);
  924. return false;
  925. }
  926. static bool
  927. vu_get_shared_object(VuDev *dev, VhostUserMsg *vmsg)
  928. {
  929. int fd_num = 0;
  930. int dmabuf_fd = -1;
  931. if (dev->iface->get_shared_object) {
  932. dmabuf_fd = dev->iface->get_shared_object(
  933. dev, &vmsg->payload.object.uuid[0]);
  934. }
  935. if (dmabuf_fd != -1) {
  936. DPRINT("dmabuf_fd found for requested UUID\n");
  937. vmsg->fds[fd_num++] = dmabuf_fd;
  938. }
  939. vmsg->fd_num = fd_num;
  940. return true;
  941. }
  942. static bool
  943. vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
  944. {
  945. VhostUserMemory m = vmsg->payload.memory, *memory = &m;
  946. unsigned int i;
  947. vu_remove_all_mem_regs(dev);
  948. DPRINT("Nregions: %u\n", memory->nregions);
  949. for (i = 0; i < memory->nregions; i++) {
  950. _vu_add_mem_reg(dev, &memory->regions[i], vmsg->fds[i]);
  951. close(vmsg->fds[i]);
  952. }
  953. if (dev->postcopy_listening) {
  954. /* Send the message back to qemu with the addresses filled in */
  955. vmsg->fd_num = 0;
  956. if (!vu_send_reply(dev, dev->sock, vmsg)) {
  957. vu_panic(dev, "failed to respond to set-mem-table for postcopy");
  958. return false;
  959. }
  960. /*
  961. * Wait for QEMU to confirm that it's registered the handler for the
  962. * faults.
  963. */
  964. if (!dev->read_msg(dev, dev->sock, vmsg) ||
  965. vmsg->size != sizeof(vmsg->payload.u64) ||
  966. vmsg->payload.u64 != 0) {
  967. vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
  968. return false;
  969. }
  970. /* OK, now we can go and register the memory and generate faults */
  971. (void)generate_faults(dev);
  972. return false;
  973. }
  974. for (i = 0; i < dev->max_queues; i++) {
  975. if (dev->vq[i].vring.desc) {
  976. if (map_ring(dev, &dev->vq[i])) {
  977. vu_panic(dev, "remapping queue %d during setmemtable", i);
  978. }
  979. }
  980. }
  981. return false;
  982. }
  983. static bool
  984. vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg)
  985. {
  986. int fd;
  987. uint64_t log_mmap_size, log_mmap_offset;
  988. void *rc;
  989. if (vmsg->fd_num != 1 ||
  990. vmsg->size != sizeof(vmsg->payload.log)) {
  991. vu_panic(dev, "Invalid log_base message");
  992. return true;
  993. }
  994. fd = vmsg->fds[0];
  995. log_mmap_offset = vmsg->payload.log.mmap_offset;
  996. log_mmap_size = vmsg->payload.log.mmap_size;
  997. DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
  998. DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size);
  999. rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
  1000. log_mmap_offset);
  1001. close(fd);
  1002. if (rc == MAP_FAILED) {
  1003. perror("log mmap error");
  1004. }
  1005. if (dev->log_table) {
  1006. munmap(dev->log_table, dev->log_size);
  1007. }
  1008. dev->log_table = rc;
  1009. dev->log_size = log_mmap_size;
  1010. vmsg->size = sizeof(vmsg->payload.u64);
  1011. vmsg->fd_num = 0;
  1012. return true;
  1013. }
  1014. static bool
  1015. vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg)
  1016. {
  1017. if (vmsg->fd_num != 1) {
  1018. vu_panic(dev, "Invalid log_fd message");
  1019. return false;
  1020. }
  1021. if (dev->log_call_fd != -1) {
  1022. close(dev->log_call_fd);
  1023. }
  1024. dev->log_call_fd = vmsg->fds[0];
  1025. DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);
  1026. return false;
  1027. }
  1028. static bool
  1029. vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg)
  1030. {
  1031. unsigned int index = vmsg->payload.state.index;
  1032. unsigned int num = vmsg->payload.state.num;
  1033. DPRINT("State.index: %u\n", index);
  1034. DPRINT("State.num: %u\n", num);
  1035. dev->vq[index].vring.num = num;
  1036. return false;
  1037. }
  1038. static bool
  1039. vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
  1040. {
  1041. struct vhost_vring_addr addr = vmsg->payload.addr, *vra = &addr;
  1042. unsigned int index = vra->index;
  1043. VuVirtq *vq = &dev->vq[index];
  1044. DPRINT("vhost_vring_addr:\n");
  1045. DPRINT(" index: %d\n", vra->index);
  1046. DPRINT(" flags: %d\n", vra->flags);
  1047. DPRINT(" desc_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->desc_user_addr);
  1048. DPRINT(" used_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->used_user_addr);
  1049. DPRINT(" avail_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->avail_user_addr);
  1050. DPRINT(" log_guest_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->log_guest_addr);
  1051. vq->vra = *vra;
  1052. vq->vring.flags = vra->flags;
  1053. vq->vring.log_guest_addr = vra->log_guest_addr;
  1054. if (map_ring(dev, vq)) {
  1055. vu_panic(dev, "Invalid vring_addr message");
  1056. return false;
  1057. }
  1058. vq->used_idx = le16toh(vq->vring.used->idx);
  1059. if (vq->last_avail_idx != vq->used_idx) {
  1060. bool resume = dev->iface->queue_is_processed_in_order &&
  1061. dev->iface->queue_is_processed_in_order(dev, index);
  1062. DPRINT("Last avail index != used index: %u != %u%s\n",
  1063. vq->last_avail_idx, vq->used_idx,
  1064. resume ? ", resuming" : "");
  1065. if (resume) {
  1066. vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx;
  1067. }
  1068. }
  1069. return false;
  1070. }
  1071. static bool
  1072. vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
  1073. {
  1074. unsigned int index = vmsg->payload.state.index;
  1075. unsigned int num = vmsg->payload.state.num;
  1076. DPRINT("State.index: %u\n", index);
  1077. DPRINT("State.num: %u\n", num);
  1078. dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num;
  1079. return false;
  1080. }
  1081. static bool
  1082. vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
  1083. {
  1084. unsigned int index = vmsg->payload.state.index;
  1085. DPRINT("State.index: %u\n", index);
  1086. vmsg->payload.state.num = dev->vq[index].last_avail_idx;
  1087. vmsg->size = sizeof(vmsg->payload.state);
  1088. dev->vq[index].started = false;
  1089. if (dev->iface->queue_set_started) {
  1090. dev->iface->queue_set_started(dev, index, false);
  1091. }
  1092. if (dev->vq[index].call_fd != -1) {
  1093. close(dev->vq[index].call_fd);
  1094. dev->vq[index].call_fd = -1;
  1095. }
  1096. if (dev->vq[index].kick_fd != -1) {
  1097. dev->remove_watch(dev, dev->vq[index].kick_fd);
  1098. close(dev->vq[index].kick_fd);
  1099. dev->vq[index].kick_fd = -1;
  1100. }
  1101. return true;
  1102. }
  1103. static bool
  1104. vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
  1105. {
  1106. int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
  1107. bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
  1108. if (index >= dev->max_queues) {
  1109. vmsg_close_fds(vmsg);
  1110. vu_panic(dev, "Invalid queue index: %u", index);
  1111. return false;
  1112. }
  1113. if (nofd) {
  1114. vmsg_close_fds(vmsg);
  1115. return true;
  1116. }
  1117. if (vmsg->fd_num != 1) {
  1118. vmsg_close_fds(vmsg);
  1119. vu_panic(dev, "Invalid fds in request: %d", vmsg->request);
  1120. return false;
  1121. }
  1122. return true;
  1123. }
  1124. static int
  1125. inflight_desc_compare(const void *a, const void *b)
  1126. {
  1127. VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a,
  1128. *desc1 = (VuVirtqInflightDesc *)b;
  1129. if (desc1->counter > desc0->counter &&
  1130. (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
  1131. return 1;
  1132. }
  1133. return -1;
  1134. }
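/*
 * Note on the comparator above: it orders inflight descriptors by their
 * submission counter, newest first, treating a difference smaller than
 * 2 * VIRTQUEUE_MAX_SIZE as "newer" so that counter wrap-around is
 * tolerated; vu_check_queue_inflights() then derives the next counter value
 * from the first (most recent) entry of the sorted resubmit list.
 */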
  1135. static int
  1136. vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
  1137. {
  1138. int i = 0;
  1139. if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
  1140. return 0;
  1141. }
  1142. if (unlikely(!vq->inflight)) {
  1143. return -1;
  1144. }
  1145. if (unlikely(!vq->inflight->version)) {
  1146. /* initialize the buffer */
  1147. vq->inflight->version = INFLIGHT_VERSION;
  1148. return 0;
  1149. }
  1150. vq->used_idx = le16toh(vq->vring.used->idx);
  1151. vq->resubmit_num = 0;
  1152. vq->resubmit_list = NULL;
  1153. vq->counter = 0;
  1154. if (unlikely(vq->inflight->used_idx != vq->used_idx)) {
  1155. vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0;
  1156. barrier();
  1157. vq->inflight->used_idx = vq->used_idx;
  1158. }
  1159. for (i = 0; i < vq->inflight->desc_num; i++) {
  1160. if (vq->inflight->desc[i].inflight == 1) {
  1161. vq->inuse++;
  1162. }
  1163. }
  1164. vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
  1165. if (vq->inuse) {
  1166. vq->resubmit_list = calloc(vq->inuse, sizeof(VuVirtqInflightDesc));
  1167. if (!vq->resubmit_list) {
  1168. return -1;
  1169. }
  1170. for (i = 0; i < vq->inflight->desc_num; i++) {
  1171. if (vq->inflight->desc[i].inflight) {
  1172. vq->resubmit_list[vq->resubmit_num].index = i;
  1173. vq->resubmit_list[vq->resubmit_num].counter =
  1174. vq->inflight->desc[i].counter;
  1175. vq->resubmit_num++;
  1176. }
  1177. }
  1178. if (vq->resubmit_num > 1) {
  1179. qsort(vq->resubmit_list, vq->resubmit_num,
  1180. sizeof(VuVirtqInflightDesc), inflight_desc_compare);
  1181. }
  1182. vq->counter = vq->resubmit_list[0].counter + 1;
  1183. }
  1184. /* in case of I/O hang after reconnecting */
  1185. if (eventfd_write(vq->kick_fd, 1)) {
  1186. return -1;
  1187. }
  1188. return 0;
  1189. }
  1190. static bool
  1191. vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
  1192. {
  1193. int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
  1194. bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
  1195. DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
  1196. if (!vu_check_queue_msg_file(dev, vmsg)) {
  1197. return false;
  1198. }
  1199. if (dev->vq[index].kick_fd != -1) {
  1200. dev->remove_watch(dev, dev->vq[index].kick_fd);
  1201. close(dev->vq[index].kick_fd);
  1202. dev->vq[index].kick_fd = -1;
  1203. }
  1204. dev->vq[index].kick_fd = nofd ? -1 : vmsg->fds[0];
  1205. DPRINT("Got kick_fd: %d for vq: %d\n", dev->vq[index].kick_fd, index);
  1206. dev->vq[index].started = true;
  1207. if (dev->iface->queue_set_started) {
  1208. dev->iface->queue_set_started(dev, index, true);
  1209. }
  1210. if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) {
  1211. dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
  1212. vu_kick_cb, (void *)(long)index);
  1213. DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
  1214. dev->vq[index].kick_fd, index);
  1215. }
  1216. if (vu_check_queue_inflights(dev, &dev->vq[index])) {
  1217. vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
  1218. }
  1219. return false;
  1220. }
  1221. void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
  1222. vu_queue_handler_cb handler)
  1223. {
  1224. int qidx = vq - dev->vq;
  1225. vq->handler = handler;
  1226. if (vq->kick_fd >= 0) {
  1227. if (handler) {
  1228. dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
  1229. vu_kick_cb, (void *)(long)qidx);
  1230. } else {
  1231. dev->remove_watch(dev, vq->kick_fd);
  1232. }
  1233. }
  1234. }
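/*
 * Illustrative sketch, not part of the original file: a typical place to
 * (un)register a queue handler is the VuDevIface queue_set_started
 * callback. "my_queue_set_started" and "my_handle_queue" are hypothetical
 * names; vu_get_queue() is the accessor declared in libvhost-user.h. The
 * block is excluded from the build.
 */
#if 0
static void
my_handle_queue(VuDev *dev, int qidx)
{
    /* Pop available elements, process them, push used, notify the guest. */
}

static void
my_queue_set_started(VuDev *dev, int qidx, bool started)
{
    VuVirtq *vq = vu_get_queue(dev, qidx);

    vu_set_queue_handler(dev, vq, started ? my_handle_queue : NULL);
}
#endif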
  1235. bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd,
  1236. int size, int offset)
  1237. {
  1238. int qidx = vq - dev->vq;
  1239. int fd_num = 0;
  1240. VhostUserMsg vmsg = {
  1241. .request = VHOST_USER_BACKEND_VRING_HOST_NOTIFIER_MSG,
  1242. .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
  1243. .size = sizeof(vmsg.payload.area),
  1244. .payload.area = {
  1245. .u64 = qidx & VHOST_USER_VRING_IDX_MASK,
  1246. .size = size,
  1247. .offset = offset,
  1248. },
  1249. };
  1250. if (fd == -1) {
  1251. vmsg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
  1252. } else {
  1253. vmsg.fds[fd_num++] = fd;
  1254. }
  1255. vmsg.fd_num = fd_num;
  1256. if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD)) {
  1257. return false;
  1258. }
  1259. pthread_mutex_lock(&dev->backend_mutex);
  1260. if (!vu_message_write(dev, dev->backend_fd, &vmsg)) {
  1261. pthread_mutex_unlock(&dev->backend_mutex);
  1262. return false;
  1263. }
  1264. /* Also unlocks the backend_mutex */
  1265. return vu_process_message_reply(dev, &vmsg);
  1266. }
  1267. bool
  1268. vu_lookup_shared_object(VuDev *dev, unsigned char uuid[UUID_LEN],
  1269. int *dmabuf_fd)
  1270. {
  1271. bool result = false;
  1272. VhostUserMsg msg_reply;
  1273. VhostUserMsg msg = {
  1274. .request = VHOST_USER_BACKEND_SHARED_OBJECT_LOOKUP,
  1275. .size = sizeof(msg.payload.object),
  1276. .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
  1277. };
  1278. memcpy(msg.payload.object.uuid, uuid, sizeof(uuid[0]) * UUID_LEN);
  1279. if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SHARED_OBJECT)) {
  1280. return false;
  1281. }
  1282. pthread_mutex_lock(&dev->backend_mutex);
  1283. if (!vu_message_write(dev, dev->backend_fd, &msg)) {
  1284. goto out;
  1285. }
  1286. if (!vu_message_read_default(dev, dev->backend_fd, &msg_reply)) {
  1287. goto out;
  1288. }
  1289. if (msg_reply.request != msg.request) {
  1290. DPRINT("Received unexpected msg type. Expected %d, received %d",
  1291. msg.request, msg_reply.request);
  1292. goto out;
  1293. }
  1294. if (msg_reply.fd_num != 1) {
  1295. DPRINT("Received unexpected number of fds. Expected 1, received %d",
  1296. msg_reply.fd_num);
  1297. goto out;
  1298. }
  1299. *dmabuf_fd = msg_reply.fds[0];
  1300. result = *dmabuf_fd > 0 && msg_reply.payload.u64 == 0;
  1301. out:
  1302. pthread_mutex_unlock(&dev->backend_mutex);
  1303. return result;
  1304. }
  1305. static bool
  1306. vu_send_message(VuDev *dev, VhostUserMsg *vmsg)
  1307. {
  1308. bool result = false;
  1309. pthread_mutex_lock(&dev->backend_mutex);
  1310. if (!vu_message_write(dev, dev->backend_fd, vmsg)) {
  1311. goto out;
  1312. }
  1313. result = true;
  1314. out:
  1315. pthread_mutex_unlock(&dev->backend_mutex);
  1316. return result;
  1317. }

bool
vu_add_shared_object(VuDev *dev, unsigned char uuid[UUID_LEN])
{
    VhostUserMsg msg = {
        .request = VHOST_USER_BACKEND_SHARED_OBJECT_ADD,
        .size = sizeof(msg.payload.object),
        .flags = VHOST_USER_VERSION,
    };

    memcpy(msg.payload.object.uuid, uuid, sizeof(uuid[0]) * UUID_LEN);

    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SHARED_OBJECT)) {
        return false;
    }

    return vu_send_message(dev, &msg);
}

bool
vu_rm_shared_object(VuDev *dev, unsigned char uuid[UUID_LEN])
{
    VhostUserMsg msg = {
        .request = VHOST_USER_BACKEND_SHARED_OBJECT_REMOVE,
        .size = sizeof(msg.payload.object),
        .flags = VHOST_USER_VERSION,
    };

    memcpy(msg.payload.object.uuid, uuid, sizeof(uuid[0]) * UUID_LEN);

    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SHARED_OBJECT)) {
        return false;
    }

    return vu_send_message(dev, &msg);
}

static bool
vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
    bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;

    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].call_fd != -1) {
        close(dev->vq[index].call_fd);
        dev->vq[index].call_fd = -1;
    }

    dev->vq[index].call_fd = nofd ? -1 : vmsg->fds[0];

    /* in case of I/O hang after reconnecting */
    if (dev->vq[index].call_fd != -1 && eventfd_write(vmsg->fds[0], 1)) {
        return false;
    }

    DPRINT("Got call_fd: %d for vq: %d\n", dev->vq[index].call_fd, index);

    return false;
}

static bool
vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
    bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;

    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].err_fd != -1) {
        close(dev->vq[index].err_fd);
        dev->vq[index].err_fd = -1;
    }

    dev->vq[index].err_fd = nofd ? -1 : vmsg->fds[0];

    return false;
}

static bool
vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    /*
     * Note that we support, but intentionally do not set,
     * VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. This means that
     * a device implementation can return it in its callback
     * (get_protocol_features) if it wants to use this for
     * simulation, but it is otherwise not desirable (if it is even
     * implemented by the frontend).
     */
    uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_MQ |
                        1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
                        1ULL << VHOST_USER_PROTOCOL_F_BACKEND_REQ |
                        1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER |
                        1ULL << VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD |
                        1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
                        1ULL << VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS;

    if (have_userfault()) {
        features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT;
    }

    if (dev->iface->get_config && dev->iface->set_config) {
        features |= 1ULL << VHOST_USER_PROTOCOL_F_CONFIG;
    }

    if (dev->iface->get_protocol_features) {
        features |= dev->iface->get_protocol_features(dev);
    }

#ifndef MFD_ALLOW_SEALING
    /*
     * If MFD_ALLOW_SEALING is not defined, we are not able to handle
     * VHOST_USER_GET_INFLIGHT_FD messages, since we can't create a memfd.
     * Those messages are used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
     * is negotiated. A device implementation can enable it, so let's mask
     * it to avoid a runtime panic.
     */
    features &= ~(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD);
#endif

    vmsg_set_reply_u64(vmsg, features);
    return true;
}
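
/*
 * Illustrative sketch (hypothetical device code): per the note above, a
 * device that wants synchronous kick/call for simulation can advertise
 * the in-band notification feature from its own callback.
 *
 *     static uint64_t my_get_protocol_features(VuDev *dev)
 *     {
 *         return 1ULL << VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS;
 *     }
 */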

static bool
vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    uint64_t features = vmsg->payload.u64;

    DPRINT("u64: 0x%016"PRIx64"\n", features);

    dev->protocol_features = vmsg->payload.u64;

    if (vu_has_protocol_feature(dev,
                                VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
        (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_BACKEND_REQ) ||
         !vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) {
        /*
         * The use case for using messages for kick/call is simulation, to make
         * the kick and call synchronous. To actually get that behaviour, both
         * of the other features are required.
         * Theoretically, one could use only kick messages, or do them without
         * having F_REPLY_ACK, but too many (possibly pending) messages on the
         * socket will eventually cause the frontend to hang. To avoid this in
         * scenarios where it is not desired, enforce a feature combination
         * that actually enables the simulation case.
         */
        vu_panic(dev,
                 "F_IN_BAND_NOTIFICATIONS requires F_BACKEND_REQ && F_REPLY_ACK");
        return false;
    }

    if (dev->iface->set_protocol_features) {
        dev->iface->set_protocol_features(dev, features);
    }

    return false;
}

static bool
vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vmsg_set_reply_u64(vmsg, dev->max_queues);
    return true;
}

static bool
vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;
    unsigned int enable = vmsg->payload.state.num;

    DPRINT("State.index: %u\n", index);
    DPRINT("State.enable: %u\n", enable);

    if (index >= dev->max_queues) {
        vu_panic(dev, "Invalid vring_enable index: %u", index);
        return false;
    }

    dev->vq[index].enable = enable;
    return false;
}

static bool
vu_set_backend_req_fd(VuDev *dev, VhostUserMsg *vmsg)
{
    if (vmsg->fd_num != 1) {
        vu_panic(dev, "Invalid backend_req_fd message (%d fd's)", vmsg->fd_num);
        return false;
    }

    if (dev->backend_fd != -1) {
        close(dev->backend_fd);
    }
    dev->backend_fd = vmsg->fds[0];
    DPRINT("Got backend_fd: %d\n", vmsg->fds[0]);

    return false;
}

static bool
vu_get_config(VuDev *dev, VhostUserMsg *vmsg)
{
    int ret = -1;

    if (dev->iface->get_config) {
        ret = dev->iface->get_config(dev, vmsg->payload.config.region,
                                     vmsg->payload.config.size);
    }

    if (ret) {
        /* resize to zero to indicate an error to frontend */
        vmsg->size = 0;
    }

    return true;
}

static bool
vu_set_config(VuDev *dev, VhostUserMsg *vmsg)
{
    int ret = -1;

    if (dev->iface->set_config) {
        ret = dev->iface->set_config(dev, vmsg->payload.config.region,
                                     vmsg->payload.config.offset,
                                     vmsg->payload.config.size,
                                     vmsg->payload.config.flags);
        if (ret) {
            vu_panic(dev, "Set virtio configuration space failed");
        }
    }

    return false;
}

static bool
vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg)
{
#ifdef UFFDIO_API
    struct uffdio_api api_struct;

    dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    vmsg->size = 0;
#else
    dev->postcopy_ufd = -1;
#endif

    if (dev->postcopy_ufd == -1) {
        vu_panic(dev, "Userfaultfd not available: %s", strerror(errno));
        goto out;
    }

#ifdef UFFDIO_API
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
        vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno));
        close(dev->postcopy_ufd);
        dev->postcopy_ufd = -1;
        goto out;
    }
    /* TODO: Stash feature flags somewhere */
#endif

out:
    /* Return a ufd to QEMU */
    vmsg->fd_num = 1;
    vmsg->fds[0] = dev->postcopy_ufd;
    return true; /* = send a reply */
}

static bool
vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg)
{
    if (dev->nregions) {
        vu_panic(dev, "Regions already registered at postcopy-listen");
        vmsg_set_reply_u64(vmsg, -1);
        return true;
    }
    dev->postcopy_listening = true;

    vmsg_set_reply_u64(vmsg, 0);
    return true;
}

static bool
vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
{
    DPRINT("%s: Entry\n", __func__);
    dev->postcopy_listening = false;
    if (dev->postcopy_ufd > 0) {
        close(dev->postcopy_ufd);
        dev->postcopy_ufd = -1;
        DPRINT("%s: Done close\n", __func__);
    }

    vmsg_set_reply_u64(vmsg, 0);
    DPRINT("%s: exit\n", __func__);
    return true;
}

static inline uint64_t
vu_inflight_queue_size(uint16_t queue_size)
{
    return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size +
           sizeof(uint16_t), INFLIGHT_ALIGNMENT);
}
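
/*
 * Worked example (assuming a 16-byte VuDescStateSplit and a 64-byte
 * INFLIGHT_ALIGNMENT, as defined in libvhost-user.h): for a 256-entry
 * queue this is ALIGN_UP(16 * 256 + 2, 64) = ALIGN_UP(4098, 64) = 4160
 * bytes of inflight tracking per queue.
 */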

#ifdef MFD_ALLOW_SEALING
static void *
memfd_alloc(const char *name, size_t size, unsigned int flags, int *fd)
{
    void *ptr;
    int ret;

    *fd = memfd_create(name, MFD_ALLOW_SEALING);
    if (*fd < 0) {
        return NULL;
    }

    ret = ftruncate(*fd, size);
    if (ret < 0) {
        close(*fd);
        return NULL;
    }

    ret = fcntl(*fd, F_ADD_SEALS, flags);
    if (ret < 0) {
        close(*fd);
        return NULL;
    }

    ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, *fd, 0);
    if (ptr == MAP_FAILED) {
        close(*fd);
        return NULL;
    }

    return ptr;
}
#endif

static bool
vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
{
    int fd = -1;
    void *addr = NULL;
    uint64_t mmap_size;
    uint16_t num_queues, queue_size;

    if (vmsg->size != sizeof(vmsg->payload.inflight)) {
        vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
        vmsg->payload.inflight.mmap_size = 0;
        return true;
    }

    num_queues = vmsg->payload.inflight.num_queues;
    queue_size = vmsg->payload.inflight.queue_size;

    DPRINT("get_inflight_fd num_queues: %"PRId16"\n", num_queues);
    DPRINT("get_inflight_fd queue_size: %"PRId16"\n", queue_size);

    mmap_size = vu_inflight_queue_size(queue_size) * num_queues;

#ifdef MFD_ALLOW_SEALING
    addr = memfd_alloc("vhost-inflight", mmap_size,
                       F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                       &fd);
#else
    vu_panic(dev, "Not implemented: memfd support is missing");
#endif

    if (!addr) {
        vu_panic(dev, "Failed to alloc vhost inflight area");
        vmsg->payload.inflight.mmap_size = 0;
        return true;
    }

    memset(addr, 0, mmap_size);

    dev->inflight_info.addr = addr;
    dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
    dev->inflight_info.fd = vmsg->fds[0] = fd;
    vmsg->fd_num = 1;
    vmsg->payload.inflight.mmap_offset = 0;

    DPRINT("send inflight mmap_size: %"PRId64"\n",
           vmsg->payload.inflight.mmap_size);
    DPRINT("send inflight mmap offset: %"PRId64"\n",
           vmsg->payload.inflight.mmap_offset);

    return true;
}

static bool
vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
{
    int fd, i;
    uint64_t mmap_size, mmap_offset;
    uint16_t num_queues, queue_size;
    void *rc;

    if (vmsg->fd_num != 1 ||
        vmsg->size != sizeof(vmsg->payload.inflight)) {
        vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
                 vmsg->size, vmsg->fd_num);
        return false;
    }

    fd = vmsg->fds[0];
    mmap_size = vmsg->payload.inflight.mmap_size;
    mmap_offset = vmsg->payload.inflight.mmap_offset;
    num_queues = vmsg->payload.inflight.num_queues;
    queue_size = vmsg->payload.inflight.queue_size;

    DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
    DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
    DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);

    rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
              fd, mmap_offset);

    if (rc == MAP_FAILED) {
        vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
        return false;
    }

    if (dev->inflight_info.fd) {
        close(dev->inflight_info.fd);
    }

    if (dev->inflight_info.addr) {
        munmap(dev->inflight_info.addr, dev->inflight_info.size);
    }

    dev->inflight_info.fd = fd;
    dev->inflight_info.addr = rc;
    dev->inflight_info.size = mmap_size;

    for (i = 0; i < num_queues; i++) {
        dev->vq[i].inflight = (VuVirtqInflight *)rc;
        dev->vq[i].inflight->desc_num = queue_size;
        rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size));
    }

    return false;
}

static bool
vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;

    if (index >= dev->max_queues) {
        vu_panic(dev, "Invalid queue index: %u", index);
        return false;
    }

    DPRINT("Got kick message: handler:%p idx:%u\n",
           dev->vq[index].handler, index);

    if (!dev->vq[index].started) {
        dev->vq[index].started = true;

        if (dev->iface->queue_set_started) {
            dev->iface->queue_set_started(dev, index, true);
        }
    }

    if (dev->vq[index].handler) {
        dev->vq[index].handler(dev, index);
    }

    return false;
}

static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg)
{
    vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_RAM_SLOTS);
    DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_USER_MAX_RAM_SLOTS);
    return true;
}

static bool
vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
{
    int do_reply = 0;

    /* Print out generic part of the request. */
    DPRINT("================ Vhost user message ================\n");
    DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request),
           vmsg->request);
    DPRINT("Flags: 0x%x\n", vmsg->flags);
    DPRINT("Size: %u\n", vmsg->size);

    if (vmsg->fd_num) {
        int i;
        DPRINT("Fds:");
        for (i = 0; i < vmsg->fd_num; i++) {
            DPRINT(" %d", vmsg->fds[i]);
        }
        DPRINT("\n");
    }

    if (dev->iface->process_msg &&
        dev->iface->process_msg(dev, vmsg, &do_reply)) {
        return do_reply;
    }

    switch (vmsg->request) {
    case VHOST_USER_GET_FEATURES:
        return vu_get_features_exec(dev, vmsg);
    case VHOST_USER_SET_FEATURES:
        return vu_set_features_exec(dev, vmsg);
    case VHOST_USER_GET_PROTOCOL_FEATURES:
        return vu_get_protocol_features_exec(dev, vmsg);
    case VHOST_USER_SET_PROTOCOL_FEATURES:
        return vu_set_protocol_features_exec(dev, vmsg);
    case VHOST_USER_SET_OWNER:
        return vu_set_owner_exec(dev, vmsg);
    case VHOST_USER_RESET_OWNER:
        return vu_reset_device_exec(dev, vmsg);
    case VHOST_USER_SET_MEM_TABLE:
        return vu_set_mem_table_exec(dev, vmsg);
    case VHOST_USER_SET_LOG_BASE:
        return vu_set_log_base_exec(dev, vmsg);
    case VHOST_USER_SET_LOG_FD:
        return vu_set_log_fd_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_NUM:
        return vu_set_vring_num_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ADDR:
        return vu_set_vring_addr_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_BASE:
        return vu_set_vring_base_exec(dev, vmsg);
    case VHOST_USER_GET_VRING_BASE:
        return vu_get_vring_base_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_KICK:
        return vu_set_vring_kick_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_CALL:
        return vu_set_vring_call_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ERR:
        return vu_set_vring_err_exec(dev, vmsg);
    case VHOST_USER_GET_QUEUE_NUM:
        return vu_get_queue_num_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ENABLE:
        return vu_set_vring_enable_exec(dev, vmsg);
    case VHOST_USER_SET_BACKEND_REQ_FD:
        return vu_set_backend_req_fd(dev, vmsg);
    case VHOST_USER_GET_CONFIG:
        return vu_get_config(dev, vmsg);
    case VHOST_USER_SET_CONFIG:
        return vu_set_config(dev, vmsg);
    case VHOST_USER_NONE:
        /* if you need processing before exit, override iface->process_msg */
        exit(0);
    case VHOST_USER_POSTCOPY_ADVISE:
        return vu_set_postcopy_advise(dev, vmsg);
    case VHOST_USER_POSTCOPY_LISTEN:
        return vu_set_postcopy_listen(dev, vmsg);
    case VHOST_USER_POSTCOPY_END:
        return vu_set_postcopy_end(dev, vmsg);
    case VHOST_USER_GET_INFLIGHT_FD:
        return vu_get_inflight_fd(dev, vmsg);
    case VHOST_USER_SET_INFLIGHT_FD:
        return vu_set_inflight_fd(dev, vmsg);
    case VHOST_USER_VRING_KICK:
        return vu_handle_vring_kick(dev, vmsg);
    case VHOST_USER_GET_MAX_MEM_SLOTS:
        return vu_handle_get_max_memslots(dev, vmsg);
    case VHOST_USER_ADD_MEM_REG:
        return vu_add_mem_reg(dev, vmsg);
    case VHOST_USER_REM_MEM_REG:
        return vu_rem_mem_reg(dev, vmsg);
    case VHOST_USER_GET_SHARED_OBJECT:
        return vu_get_shared_object(dev, vmsg);
    default:
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Unhandled request: %d", vmsg->request);
    }

    return false;
}

bool
vu_dispatch(VuDev *dev)
{
    VhostUserMsg vmsg = { 0, };
    int reply_requested;
    bool need_reply, success = false;

    if (!dev->read_msg(dev, dev->sock, &vmsg)) {
        goto end;
    }

    need_reply = vmsg.flags & VHOST_USER_NEED_REPLY_MASK;

    reply_requested = vu_process_message(dev, &vmsg);
    if (!reply_requested && need_reply) {
        vmsg_set_reply_u64(&vmsg, 0);
        reply_requested = 1;
    }

    if (!reply_requested) {
        success = true;
        goto end;
    }

    if (!vu_send_reply(dev, dev->sock, &vmsg)) {
        goto end;
    }

    success = true;

end:
    free(vmsg.data);
    return success;
}
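
/*
 * Illustrative sketch (hypothetical event loop): the owner of the
 * vhost-user socket typically calls vu_dispatch() whenever dev->sock
 * becomes readable, and tears the device down when dispatch fails.
 *
 *     static void my_socket_readable(VuDev *dev)
 *     {
 *         if (!vu_dispatch(dev)) {
 *             vu_deinit(dev);
 *             // e.g. close the connection or wait for a reconnect
 *         }
 *     }
 */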

void
vu_deinit(VuDev *dev)
{
    unsigned int i;

    vu_remove_all_mem_regs(dev);

    for (i = 0; i < dev->max_queues; i++) {
        VuVirtq *vq = &dev->vq[i];

        if (vq->call_fd != -1) {
            close(vq->call_fd);
            vq->call_fd = -1;
        }

        if (vq->kick_fd != -1) {
            dev->remove_watch(dev, vq->kick_fd);
            close(vq->kick_fd);
            vq->kick_fd = -1;
        }

        if (vq->err_fd != -1) {
            close(vq->err_fd);
            vq->err_fd = -1;
        }

        if (vq->resubmit_list) {
            free(vq->resubmit_list);
            vq->resubmit_list = NULL;
        }

        vq->inflight = NULL;
    }

    if (dev->inflight_info.addr) {
        munmap(dev->inflight_info.addr, dev->inflight_info.size);
        dev->inflight_info.addr = NULL;
    }

    if (dev->inflight_info.fd > 0) {
        close(dev->inflight_info.fd);
        dev->inflight_info.fd = -1;
    }

    vu_close_log(dev);
    if (dev->backend_fd != -1) {
        close(dev->backend_fd);
        dev->backend_fd = -1;
    }
    pthread_mutex_destroy(&dev->backend_mutex);

    if (dev->sock != -1) {
        close(dev->sock);
    }

    free(dev->vq);
    dev->vq = NULL;
    free(dev->regions);
    dev->regions = NULL;
}

bool
vu_init(VuDev *dev,
        uint16_t max_queues,
        int socket,
        vu_panic_cb panic,
        vu_read_msg_cb read_msg,
        vu_set_watch_cb set_watch,
        vu_remove_watch_cb remove_watch,
        const VuDevIface *iface)
{
    uint16_t i;

    assert(max_queues > 0);
    assert(socket >= 0);
    assert(set_watch);
    assert(remove_watch);
    assert(iface);
    assert(panic);

    memset(dev, 0, sizeof(*dev));

    dev->sock = socket;
    dev->panic = panic;
    dev->read_msg = read_msg ? read_msg : vu_message_read_default;
    dev->set_watch = set_watch;
    dev->remove_watch = remove_watch;
    dev->iface = iface;
    dev->log_call_fd = -1;
    pthread_mutex_init(&dev->backend_mutex, NULL);
    dev->backend_fd = -1;
    dev->max_queues = max_queues;

    dev->regions = malloc(VHOST_USER_MAX_RAM_SLOTS * sizeof(dev->regions[0]));
    if (!dev->regions) {
        DPRINT("%s: failed to malloc mem regions\n", __func__);
        return false;
    }

    dev->vq = malloc(max_queues * sizeof(dev->vq[0]));
    if (!dev->vq) {
        DPRINT("%s: failed to malloc virtqueues\n", __func__);
        free(dev->regions);
        dev->regions = NULL;
        return false;
    }

    for (i = 0; i < max_queues; i++) {
        dev->vq[i] = (VuVirtq) {
            .call_fd = -1, .kick_fd = -1, .err_fd = -1,
            .notification = true,
        };
    }

    return true;
}
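
/*
 * Illustrative sketch (hypothetical callbacks and fd names): a minimal
 * backend wires vu_init() to an accepted vhost-user socket and to its own
 * event loop.  Passing NULL for read_msg selects the built-in
 * vu_message_read_default.
 *
 *     static VuDev dev;
 *
 *     if (!vu_init(&dev, 1, conn_fd, my_panic, NULL,
 *                  my_set_watch, my_remove_watch, &my_iface)) {
 *         // allocation failed; conn_fd is still owned by the caller
 *     }
 */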

VuVirtq *
vu_get_queue(VuDev *dev, int qidx)
{
    assert(qidx < dev->max_queues);
    return &dev->vq[qidx];
}

bool
vu_queue_enabled(VuDev *dev, VuVirtq *vq)
{
    return vq->enable;
}

bool
vu_queue_started(const VuDev *dev, const VuVirtq *vq)
{
    return vq->started;
}

static inline uint16_t
vring_avail_flags(VuVirtq *vq)
{
    return le16toh(vq->vring.avail->flags);
}

static inline uint16_t
vring_avail_idx(VuVirtq *vq)
{
    vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);

    return vq->shadow_avail_idx;
}

static inline uint16_t
vring_avail_ring(VuVirtq *vq, int i)
{
    return le16toh(vq->vring.avail->ring[i]);
}

static inline uint16_t
vring_get_used_event(VuVirtq *vq)
{
    return vring_avail_ring(vq, vq->vring.num);
}

static int
virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx)
{
    uint16_t num_heads = vring_avail_idx(vq) - idx;

    /* Check it isn't doing very strange things with descriptor numbers. */
    if (num_heads > vq->vring.num) {
        vu_panic(dev, "Guest moved used index from %u to %u",
                 idx, vq->shadow_avail_idx);
        return -1;
    }
    if (num_heads) {
        /* On success, callers read a descriptor at vq->last_avail_idx.
         * Make sure descriptor read does not bypass avail index read. */
        smp_rmb();
    }

    return num_heads;
}

static bool
virtqueue_get_head(VuDev *dev, VuVirtq *vq,
                   unsigned int idx, unsigned int *head)
{
    /* Grab the next descriptor number they're advertising, and increment
     * the index we've seen. */
    *head = vring_avail_ring(vq, idx % vq->vring.num);

    /* If their number is silly, that's a fatal mistake. */
    if (*head >= vq->vring.num) {
        vu_panic(dev, "Guest says index %u is available", *head);
        return false;
    }

    return true;
}

static int
virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc,
                             uint64_t addr, size_t len)
{
    struct vring_desc *ori_desc;
    uint64_t read_len;

    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
        return -1;
    }

    if (len == 0) {
        return -1;
    }

    while (len) {
        read_len = len;
        ori_desc = vu_gpa_to_va(dev, &read_len, addr);
        if (!ori_desc) {
            return -1;
        }

        memcpy(desc, ori_desc, read_len);
        len -= read_len;
        addr += read_len;
        desc += read_len;
    }

    return 0;
}

enum {
    VIRTQUEUE_READ_DESC_ERROR = -1,
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};

static int
virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
                         int i, unsigned int max, unsigned int *next)
{
    /* If this descriptor says it doesn't chain, we're done. */
    if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
        return VIRTQUEUE_READ_DESC_DONE;
    }

    /* Check they're not leading us off end of descriptors. */
    *next = le16toh(desc[i].next);
    /* Make sure compiler knows to grab that: we don't want it changing! */
    smp_wmb();

    if (*next >= max) {
        vu_panic(dev, "Desc next is %u", *next);
        return VIRTQUEUE_READ_DESC_ERROR;
    }

    return VIRTQUEUE_READ_DESC_MORE;
}

void
vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
                         unsigned int *out_bytes,
                         unsigned max_in_bytes, unsigned max_out_bytes)
{
    unsigned int idx;
    unsigned int total_bufs, in_total, out_total;
    int rc;

    idx = vq->last_avail_idx;

    total_bufs = in_total = out_total = 0;
    if (!vu_is_vq_usable(dev, vq)) {
        goto done;
    }

    while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
        unsigned int max, desc_len, num_bufs, indirect = 0;
        uint64_t desc_addr, read_len;
        struct vring_desc *desc;
        struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
        unsigned int i;

        max = vq->vring.num;
        num_bufs = total_bufs;
        if (!virtqueue_get_head(dev, vq, idx++, &i)) {
            goto err;
        }
        desc = vq->vring.desc;

        if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
            if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
                vu_panic(dev, "Invalid size for indirect buffer table");
                goto err;
            }

            /* If we've got too many, that implies a descriptor loop. */
            if (num_bufs >= max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            /* loop over the indirect descriptor table */
            indirect = 1;
            desc_addr = le64toh(desc[i].addr);
            desc_len = le32toh(desc[i].len);
            max = desc_len / sizeof(struct vring_desc);
            read_len = desc_len;
            desc = vu_gpa_to_va(dev, &read_len, desc_addr);
            if (unlikely(desc && read_len != desc_len)) {
                /* Failed to use zero copy */
                desc = NULL;
                if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                                  desc_addr,
                                                  desc_len)) {
                    desc = desc_buf;
                }
            }
            if (!desc) {
                vu_panic(dev, "Invalid indirect buffer table");
                goto err;
            }
            num_bufs = i = 0;
        }

        do {
            /* If we've got too many, that implies a descriptor loop. */
            if (++num_bufs > max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
                in_total += le32toh(desc[i].len);
            } else {
                out_total += le32toh(desc[i].len);
            }
            if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
                goto done;
            }
            rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
        } while (rc == VIRTQUEUE_READ_DESC_MORE);

        if (rc == VIRTQUEUE_READ_DESC_ERROR) {
            goto err;
        }

        if (!indirect) {
            total_bufs = num_bufs;
        } else {
            total_bufs++;
        }
    }
    if (rc < 0) {
        goto err;
    }
done:
    if (in_bytes) {
        *in_bytes = in_total;
    }
    if (out_bytes) {
        *out_bytes = out_total;
    }
    return;

err:
    in_total = out_total = 0;
    goto done;
}

bool
vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
                     unsigned int out_bytes)
{
    unsigned int in_total, out_total;

    vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total,
                             in_bytes, out_bytes);

    return in_bytes <= in_total && out_bytes <= out_total;
}

/* Fetch avail_idx from VQ memory only when we really need to know if
 * guest has added some buffers. */
bool
vu_queue_empty(VuDev *dev, VuVirtq *vq)
{
    if (!vu_is_vq_usable(dev, vq)) {
        return true;
    }

    if (vq->shadow_avail_idx != vq->last_avail_idx) {
        return false;
    }

    return vring_avail_idx(vq) == vq->last_avail_idx;
}

static bool
vring_notify(VuDev *dev, VuVirtq *vq)
{
    uint16_t old, new;
    bool v;

    /* We need to expose used array entries before checking used event. */
    smp_mb();

    /* Always notify when queue is empty (when feature acknowledge) */
    if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
        !vq->inuse && vu_queue_empty(dev, vq)) {
        return true;
    }

    if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
    }

    v = vq->signalled_used_valid;
    vq->signalled_used_valid = true;
    old = vq->signalled_used;
    new = vq->signalled_used = vq->used_idx;
    return !v || vring_need_event(vring_get_used_event(vq), new, old);
}

static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync)
{
    if (!vu_is_vq_usable(dev, vq)) {
        return;
    }

    if (!vring_notify(dev, vq)) {
        DPRINT("skipped notify...\n");
        return;
    }

    if (vq->call_fd < 0 &&
        vu_has_protocol_feature(dev,
                                VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
        vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_BACKEND_REQ)) {
        VhostUserMsg vmsg = {
            .request = VHOST_USER_BACKEND_VRING_CALL,
            .flags = VHOST_USER_VERSION,
            .size = sizeof(vmsg.payload.state),
            .payload.state = {
                .index = vq - dev->vq,
            },
        };
        bool ack = sync &&
                   vu_has_protocol_feature(dev,
                                           VHOST_USER_PROTOCOL_F_REPLY_ACK);

        if (ack) {
            vmsg.flags |= VHOST_USER_NEED_REPLY_MASK;
        }

        vu_message_write(dev, dev->backend_fd, &vmsg);
        if (ack) {
            vu_message_read_default(dev, dev->backend_fd, &vmsg);
        }
        return;
    }

    if (eventfd_write(vq->call_fd, 1) < 0) {
        vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
    }
}

void vu_queue_notify(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, false);
}

void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, true);
}

void vu_config_change_msg(VuDev *dev)
{
    VhostUserMsg vmsg = {
        .request = VHOST_USER_BACKEND_CONFIG_CHANGE_MSG,
        .flags = VHOST_USER_VERSION,
    };

    vu_message_write(dev, dev->backend_fd, &vmsg);
}

static inline void
vring_used_flags_set_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char*)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) | mask);
}

static inline void
vring_used_flags_unset_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char*)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) & ~mask);
}

static inline void
vring_set_avail_event(VuVirtq *vq, uint16_t val)
{
    uint16_t val_le = htole16(val);

    if (!vq->notification) {
        return;
    }

    memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t));
}

void
vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
{
    vq->notification = enable;
    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vring_avail_idx(vq));
    } else if (enable) {
        vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
    } else {
        vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
    }
    if (enable) {
        /* Expose avail event/used flags before caller checks the avail idx. */
        smp_mb();
    }
}

static bool
virtqueue_map_desc(VuDev *dev,
                   unsigned int *p_num_sg, struct iovec *iov,
                   unsigned int max_num_sg, bool is_write,
                   uint64_t pa, size_t sz)
{
    unsigned num_sg = *p_num_sg;

    assert(num_sg <= max_num_sg);

    if (!sz) {
        vu_panic(dev, "virtio: zero sized buffers are not allowed");
        return false;
    }

    while (sz) {
        uint64_t len = sz;

        if (num_sg == max_num_sg) {
            vu_panic(dev, "virtio: too many descriptors in indirect table");
            return false;
        }

        iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
        if (iov[num_sg].iov_base == NULL) {
            vu_panic(dev, "virtio: invalid address for buffers");
            return false;
        }
        iov[num_sg].iov_len = len;
        num_sg++;
        sz -= len;
        pa += len;
    }

    *p_num_sg = num_sg;
    return true;
}

static void *
virtqueue_alloc_element(size_t sz,
                        unsigned out_num, unsigned in_num)
{
    VuVirtqElement *elem;
    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);

    assert(sz >= sizeof(VuVirtqElement));
    elem = malloc(out_sg_end);
    if (!elem) {
        DPRINT("%s: failed to malloc virtqueue element\n", __func__);
        return NULL;
    }
    elem->out_num = out_num;
    elem->in_num = in_num;
    elem->in_sg = (void *)elem + in_sg_ofs;
    elem->out_sg = (void *)elem + out_sg_ofs;
    return elem;
}

static void *
vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
{
    struct vring_desc *desc = vq->vring.desc;
    uint64_t desc_addr, read_len;
    unsigned int desc_len;
    unsigned int max = vq->vring.num;
    unsigned int i = idx;
    VuVirtqElement *elem;
    unsigned int out_num = 0, in_num = 0;
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    int rc;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return NULL;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return NULL;
        }
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            if (!virtqueue_map_desc(dev, &in_num, iov + out_num,
                                    VIRTQUEUE_MAX_SIZE - out_num, true,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        } else {
            if (in_num) {
                vu_panic(dev, "Incorrect order for descriptors");
                return NULL;
            }
            if (!virtqueue_map_desc(dev, &out_num, iov,
                                    VIRTQUEUE_MAX_SIZE, false,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        }

        /* If we've got too many, that implies a descriptor loop. */
        if ((in_num + out_num) > max) {
            vu_panic(dev, "Looped descriptor");
            return NULL;
        }
        rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        vu_panic(dev, "read descriptor error");
        return NULL;
    }

    /* Now copy what we have collected and mapped */
    elem = virtqueue_alloc_element(sz, out_num, in_num);
    if (!elem) {
        return NULL;
    }
    elem->index = idx;
    for (i = 0; i < out_num; i++) {
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_sg[i] = iov[out_num + i];
    }

    return elem;
}

static int
vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->desc[desc_idx].counter = vq->counter++;
    vq->inflight->desc[desc_idx].inflight = 1;

    return 0;
}

static int
vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->last_batch_head = desc_idx;

    return 0;
}

static int
vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    barrier();

    vq->inflight->desc[desc_idx].inflight = 0;

    barrier();

    vq->inflight->used_idx = vq->used_idx;

    return 0;
}

void *
vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
{
    int i;
    unsigned int head;
    VuVirtqElement *elem;

    if (!vu_is_vq_usable(dev, vq)) {
        return NULL;
    }

    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
        i = (--vq->resubmit_num);
        elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);

        if (!vq->resubmit_num) {
            free(vq->resubmit_list);
            vq->resubmit_list = NULL;
        }

        return elem;
    }

    if (vu_queue_empty(dev, vq)) {
        return NULL;
    }
    /*
     * Needed after virtio_queue_empty(), see comment in
     * virtqueue_num_heads().
     */
    smp_rmb();

    if (vq->inuse >= vq->vring.num) {
        vu_panic(dev, "Virtqueue size exceeded");
        return NULL;
    }

    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
        return NULL;
    }

    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vq->last_avail_idx);
    }

    elem = vu_queue_map_desc(dev, vq, head, sz);

    if (!elem) {
        return NULL;
    }

    vq->inuse++;

    vu_queue_inflight_get(dev, vq, head);

    return elem;
}
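
/*
 * Illustrative sketch (names here are hypothetical): a typical queue
 * handler drains the ring with vu_queue_pop(), completes each element
 * with vu_queue_push(), and then notifies the guest once.  Elements are
 * malloc()ed by virtqueue_alloc_element(), so the caller frees them.
 *
 *     static void my_handle_rx(VuDev *dev, int qidx)
 *     {
 *         VuVirtq *vq = vu_get_queue(dev, qidx);
 *         VuVirtqElement *elem;
 *
 *         while ((elem = vu_queue_pop(dev, vq, sizeof(*elem)))) {
 *             // read from elem->out_sg[], write into elem->in_sg[],
 *             // tracking how many bytes were written
 *             vu_queue_push(dev, vq, elem, bytes_written);
 *             free(elem);
 *         }
 *         vu_queue_notify(dev, vq);
 *     }
 */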

static void
vu_queue_detach_element(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
                        size_t len)
{
    vq->inuse--;
    /* unmap, when DMA support is added */
}

void
vu_queue_unpop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
               size_t len)
{
    vq->last_avail_idx--;
    vu_queue_detach_element(dev, vq, elem, len);
}

bool
vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
{
    if (num > vq->inuse) {
        return false;
    }
    vq->last_avail_idx -= num;
    vq->inuse -= num;
    return true;
}

static inline
void vring_used_write(VuDev *dev, VuVirtq *vq,
                      struct vring_used_elem *uelem, int i)
{
    struct vring_used *used = vq->vring.used;

    used->ring[i] = *uelem;
    vu_log_write(dev, vq->vring.log_guest_addr +
                 offsetof(struct vring_used, ring[i]),
                 sizeof(used->ring[i]));
}

static void
vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
                  const VuVirtqElement *elem,
                  unsigned int len)
{
    struct vring_desc *desc = vq->vring.desc;
    unsigned int i, max, min, desc_len;
    uint64_t desc_addr, read_len;
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    unsigned num_bufs = 0;

    max = vq->vring.num;
    i = elem->index;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return;
        }
        i = 0;
    }

    do {
        if (++num_bufs > max) {
            vu_panic(dev, "Looped descriptor");
            return;
        }

        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            min = MIN(le32toh(desc[i].len), len);
            vu_log_write(dev, le64toh(desc[i].addr), min);
            len -= min;
        }

    } while (len > 0 &&
             (virtqueue_read_next_desc(dev, desc, i, max, &i)
              == VIRTQUEUE_READ_DESC_MORE));
}

void
vu_queue_fill(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem,
              unsigned int len, unsigned int idx)
{
    struct vring_used_elem uelem;

    if (!vu_is_vq_usable(dev, vq)) {
        return;
    }

    vu_log_queue_fill(dev, vq, elem, len);

    idx = (idx + vq->used_idx) % vq->vring.num;

    uelem.id = htole32(elem->index);
    uelem.len = htole32(len);
    vring_used_write(dev, vq, &uelem, idx);
}

static inline
void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
{
    vq->vring.used->idx = htole16(val);
    vu_log_write(dev,
                 vq->vring.log_guest_addr + offsetof(struct vring_used, idx),
                 sizeof(vq->vring.used->idx));

    vq->used_idx = val;
}

void
vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
{
    uint16_t old, new;

    if (!vu_is_vq_usable(dev, vq)) {
        return;
    }

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    old = vq->used_idx;
    new = old + count;
    vring_used_idx_set(dev, vq, new);
    vq->inuse -= count;
    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
        vq->signalled_used_valid = false;
    }
}

void
vu_queue_push(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem, unsigned int len)
{
    vu_queue_fill(dev, vq, elem, len, 0);
    vu_queue_inflight_pre_put(dev, vq, elem->index);
    vu_queue_flush(dev, vq, 1);
    vu_queue_inflight_post_put(dev, vq, elem->index);
}
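
/*
 * Illustrative sketch (hypothetical batching loop): vu_queue_push() is a
 * fill at offset 0 followed by a flush of one element.  A backend that
 * completes several elements at once can instead call vu_queue_fill()
 * with increasing idx values and flush them together:
 *
 *     for (i = 0; i < n; i++) {
 *         vu_queue_fill(dev, vq, elems[i], lens[i], i);
 *     }
 *     vu_queue_flush(dev, vq, n);
 *     vu_queue_notify(dev, vq);
 *
 * Note that, unlike vu_queue_push(), this path does not update the
 * per-element inflight bookkeeping.
 */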