/*
 * vhost-vdpa.c
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "clients.h"
#include "hw/virtio/virtio-net.h"
#include "net/vhost_net.h"
#include "net/vhost-vdpa.h"
#include "hw/virtio/vhost-vdpa.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "qemu/option.h"
#include "qapi/error.h"
#include <linux/vhost.h>
#include <sys/ioctl.h>
#include <err.h>
#include "standard-headers/linux/virtio_net.h"
#include "monitor/monitor.h"
#include "migration/misc.h"
#include "hw/virtio/vhost.h"
#include "trace.h"
/* TODO: need to add the multiqueue support here */
typedef struct VhostVDPAState {
    NetClientState nc;
    struct vhost_vdpa vhost_vdpa;
    NotifierWithReturn migration_state;
    VHostNetState *vhost_net;

    /* Control commands shadow buffers */
    void *cvq_cmd_out_buffer;
    virtio_net_ctrl_ack *status;

    /* The device always has SVQ enabled */
    bool always_svq;

    /* The device can isolate CVQ in its own ASID */
    bool cvq_isolated;

    bool started;
} VhostVDPAState;

/*
 * The array is sorted alphabetically in ascending order,
 * with the exception of VHOST_INVALID_FEATURE_BIT,
 * which should always be the last entry.
 */
const int vdpa_feature_bits[] = {
    VIRTIO_F_ANY_LAYOUT,
    VIRTIO_F_IOMMU_PLATFORM,
    VIRTIO_F_NOTIFY_ON_EMPTY,
    VIRTIO_F_RING_PACKED,
    VIRTIO_F_RING_RESET,
    VIRTIO_F_VERSION_1,
    VIRTIO_F_IN_ORDER,
    VIRTIO_F_NOTIFICATION_DATA,
    VIRTIO_NET_F_CSUM,
    VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,
    VIRTIO_NET_F_CTRL_MAC_ADDR,
    VIRTIO_NET_F_CTRL_RX,
    VIRTIO_NET_F_CTRL_RX_EXTRA,
    VIRTIO_NET_F_CTRL_VLAN,
    VIRTIO_NET_F_CTRL_VQ,
    VIRTIO_NET_F_GSO,
    VIRTIO_NET_F_GUEST_CSUM,
    VIRTIO_NET_F_GUEST_ECN,
    VIRTIO_NET_F_GUEST_TSO4,
    VIRTIO_NET_F_GUEST_TSO6,
    VIRTIO_NET_F_GUEST_UFO,
    VIRTIO_NET_F_GUEST_USO4,
    VIRTIO_NET_F_GUEST_USO6,
    VIRTIO_NET_F_HASH_REPORT,
    VIRTIO_NET_F_HOST_ECN,
    VIRTIO_NET_F_HOST_TSO4,
    VIRTIO_NET_F_HOST_TSO6,
    VIRTIO_NET_F_HOST_UFO,
    VIRTIO_NET_F_HOST_USO,
    VIRTIO_NET_F_MQ,
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_NET_F_MTU,
    VIRTIO_NET_F_RSC_EXT,
    VIRTIO_NET_F_RSS,
    VIRTIO_NET_F_STATUS,
    VIRTIO_RING_F_EVENT_IDX,
    VIRTIO_RING_F_INDIRECT_DESC,

    /* VHOST_INVALID_FEATURE_BIT should always be the last entry */
    VHOST_INVALID_FEATURE_BIT
};

/** Supported device specific feature bits with SVQ */
static const uint64_t vdpa_svq_device_features =
    BIT_ULL(VIRTIO_NET_F_CSUM) |
    BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
    BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) |
    BIT_ULL(VIRTIO_NET_F_MTU) |
    BIT_ULL(VIRTIO_NET_F_MAC) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
    BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
    BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
    BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
    BIT_ULL(VIRTIO_NET_F_STATUS) |
    BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
    BIT_ULL(VIRTIO_NET_F_CTRL_RX) |
    BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |
    BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) |
    BIT_ULL(VIRTIO_NET_F_MQ) |
    BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
    BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
    /* VHOST_F_LOG_ALL is exposed by SVQ */
    BIT_ULL(VHOST_F_LOG_ALL) |
    BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |
    BIT_ULL(VIRTIO_NET_F_RSS) |
    BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
    BIT_ULL(VIRTIO_NET_F_STANDBY) |
    BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX);

#define VHOST_VDPA_NET_CVQ_ASID 1

VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    return s->vhost_net;
}

static size_t vhost_vdpa_net_cvq_cmd_len(void)
{
    /*
     * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
     * The in buffer is always 1 byte, so it also fits here.
     */
    return sizeof(struct virtio_net_ctrl_hdr) +
           2 * sizeof(struct virtio_net_ctrl_mac) +
           MAC_TABLE_ENTRIES * ETH_ALEN;
}

static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
{
    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
}

static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
{
    uint64_t invalid_dev_features =
        features & ~vdpa_svq_device_features &
        /* Transport features are all accepted at this point */
        ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
                         VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);

    if (invalid_dev_features) {
        error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
                   invalid_dev_features);
        return false;
    }

    return vhost_svq_valid_features(features, errp);
}

static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
{
    uint32_t device_id;
    int ret;
    struct vhost_dev *hdev;

    hdev = (struct vhost_dev *)&net->dev;
    ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
    if (device_id != VIRTIO_ID_NET) {
        return -ENOTSUP;
    }
    return ret;
}
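
/*
 * Create the vhost_net instance backing this net client and verify that the
 * underlying vdpa device is a network device.
 */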
static int vhost_vdpa_add(NetClientState *ncs, void *be,
                          int queue_pair_index, int nvqs)
{
    VhostNetOptions options;
    struct vhost_net *net = NULL;
    VhostVDPAState *s;
    int ret;

    options.backend_type = VHOST_BACKEND_TYPE_VDPA;
    assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, ncs);
    options.net_backend = ncs;
    options.opaque = be;
    options.busyloop_timeout = 0;
    options.nvqs = nvqs;

    net = vhost_net_init(&options);
    if (!net) {
        error_report("failed to init vhost_net for queue");
        goto err_init;
    }
    s->vhost_net = net;
    ret = vhost_vdpa_net_check_device_id(net);
    if (ret) {
        goto err_check;
    }
    return 0;
err_check:
    vhost_net_cleanup(net);
    g_free(net);
err_init:
    return -1;
}
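
/*
 * Per-client cleanup: unmap the CVQ shadow buffers and release the vhost_net
 * instance.  The shared state (including the vdpa device fd) is released only
 * by the client with index 0.
 */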
static void vhost_vdpa_cleanup(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    munmap(s->cvq_cmd_out_buffer, vhost_vdpa_net_cvq_cmd_page_len());
    munmap(s->status, vhost_vdpa_net_cvq_cmd_page_len());
    if (s->vhost_net) {
        vhost_net_cleanup(s->vhost_net);
        g_free(s->vhost_net);
        s->vhost_net = NULL;
    }
    if (s->vhost_vdpa.index != 0) {
        return;
    }
    qemu_close(s->vhost_vdpa.shared->device_fd);
    g_free(s->vhost_vdpa.shared);
}

/** Dummy SetSteeringEBPF to support RSS for vhost-vdpa backend */
static bool vhost_vdpa_set_steering_ebpf(NetClientState *nc, int prog_fd)
{
    return true;
}

static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    return true;
}

static bool vhost_vdpa_has_ufo(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    uint64_t features = 0;
    features |= (1ULL << VIRTIO_NET_F_HOST_UFO);
    features = vhost_net_get_features(s->vhost_net, features);
    return !!(features & (1ULL << VIRTIO_NET_F_HOST_UFO));
}

/*
 * FIXME: vhost_vdpa doesn't have an API to "set h/w endianness". But it's
 * reasonable to assume that h/w is LE by default, because LE is what
 * virtio 1.0 and later ask for. So, this function just says "yes, the h/w is
 * LE". Otherwise, on a BE machine, higher-level code would mistakenly think
 * the h/w is BE and can't support VDPA for a virtio 1.0 client.
 */
static int vhost_vdpa_set_vnet_le(NetClientState *nc, bool enable)
{
    return 0;
}

static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
                                       Error **errp)
{
    const char *driver = object_class_get_name(oc);

    if (!g_str_has_prefix(driver, "virtio-net-")) {
        error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
        return false;
    }

    return true;
}

/** Dummy receive in case qemu falls back to userland tap networking */
static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
                                  size_t size)
{
    return size;
}

/** From any vdpa net client, get the netclient of the i-th queue pair */
static VhostVDPAState *vhost_vdpa_net_get_nc_vdpa(VhostVDPAState *s, int i)
{
    NICState *nic = qemu_get_nic(s->nc.peer);
    NetClientState *nc_i = qemu_get_peer(nic->ncs, i);

    return DO_UPCAST(VhostVDPAState, nc, nc_i);
}

static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
{
    return vhost_vdpa_net_get_nc_vdpa(s, 0);
}

static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
{
    struct vhost_vdpa *v = &s->vhost_vdpa;
    VirtIONet *n;
    VirtIODevice *vdev;
    int data_queue_pairs, cvq, r;

    /* We are only called on the first data vqs and only if x-svq is not set */
    if (s->vhost_vdpa.shadow_vqs_enabled == enable) {
        return;
    }

    vdev = v->dev->vdev;
    n = VIRTIO_NET(vdev);
    if (!n->vhost_started) {
        return;
    }

    data_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
    cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
          n->max_ncs - n->max_queue_pairs : 0;
    v->shared->svq_switching = enable ?
        SVQ_TSTATE_ENABLING : SVQ_TSTATE_DISABLING;
    /*
     * TODO: vhost_net_stop does suspend, get_base and reset. We can be smarter
     * in the future and resume the device if read-only operations between
     * suspend and reset go wrong.
     */
    vhost_net_stop(vdev, n->nic->ncs, data_queue_pairs, cvq);

    /* Start will check migration setup_or_active to configure SVQ or not */
    r = vhost_net_start(vdev, n->nic->ncs, data_queue_pairs, cvq);
    if (unlikely(r < 0)) {
        error_report("unable to start vhost net: %s(%d)", g_strerror(-r), -r);
    }
    v->shared->svq_switching = SVQ_TSTATE_DONE;
}

static int vdpa_net_migration_state_notifier(NotifierWithReturn *notifier,
                                             MigrationEvent *e, Error **errp)
{
    VhostVDPAState *s = container_of(notifier, VhostVDPAState, migration_state);

    if (e->type == MIG_EVENT_PRECOPY_SETUP) {
        vhost_vdpa_net_log_global_enable(s, true);
    } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
        vhost_vdpa_net_log_global_enable(s, false);
    }
    return 0;
}

static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
{
    struct vhost_vdpa *v = &s->vhost_vdpa;

    migration_add_notifier(&s->migration_state,
                           vdpa_net_migration_state_notifier);

    if (v->shadow_vqs_enabled) {
        v->shared->iova_tree = vhost_iova_tree_new(v->shared->iova_range.first,
                                                   v->shared->iova_range.last);
    }
}

static int vhost_vdpa_net_data_start(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->always_svq || migration_is_running()) {
        v->shadow_vqs_enabled = true;
    } else {
        v->shadow_vqs_enabled = false;
    }

    if (v->index == 0) {
        v->shared->shadow_data = v->shadow_vqs_enabled;
        vhost_vdpa_net_data_start_first(s);
        return 0;
    }

    return 0;
}

static int vhost_vdpa_net_data_load(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;
    bool has_cvq = v->dev->vq_index_end % 2;

    if (has_cvq) {
        return 0;
    }

    for (int i = 0; i < v->dev->nvqs; ++i) {
        int ret = vhost_vdpa_set_vring_ready(v, i + v->dev->vq_index);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

static void vhost_vdpa_net_client_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_dev *dev;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.index == 0) {
        migration_remove_notifier(&s->migration_state);
    }

    dev = s->vhost_vdpa.dev;
    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
        g_clear_pointer(&s->vhost_vdpa.shared->iova_tree,
                        vhost_iova_tree_delete);
    }
}

static NetClientInfo net_vhost_vdpa_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .start = vhost_vdpa_net_data_start,
    .load = vhost_vdpa_net_data_load,
    .stop = vhost_vdpa_net_client_stop,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .set_vnet_le = vhost_vdpa_set_vnet_le,
    .check_peer_type = vhost_vdpa_check_peer_type,
    .set_steering_ebpf = vhost_vdpa_set_steering_ebpf,
};
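
/*
 * Return the virtqueue group that virtqueue @vq_index belongs to, as reported
 * by the vdpa device, or a negative errno on failure.
 */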
static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index,
                                          Error **errp)
{
    struct vhost_vring_state state = {
        .index = vq_index,
    };
    int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state);

    if (unlikely(r < 0)) {
        r = -errno;
        error_setg_errno(errp, errno, "Cannot get VQ %u group", vq_index);
        return r;
    }

    return state.num;
}

static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
                                           unsigned vq_group,
                                           unsigned asid_num)
{
    struct vhost_vring_state asid = {
        .index = vq_group,
        .num = asid_num,
    };
    int r;

    trace_vhost_vdpa_set_address_space_id(v, vq_group, asid_num);

    r = ioctl(v->shared->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
    if (unlikely(r < 0)) {
        error_report("Can't set vq group %u asid %u, errno=%d (%s)",
                     asid.index, asid.num, errno, g_strerror(errno));
    }
    return r;
}

static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
{
    VhostIOVATree *tree = v->shared->iova_tree;
    DMAMap needle = {
        /*
         * No need to specify size or to look for more translations since
         * this contiguous chunk was allocated by us.
         */
        .translated_addr = (hwaddr)(uintptr_t)addr,
    };
    const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
    int r;

    if (unlikely(!map)) {
        error_report("Cannot locate expected map");
        return;
    }

    r = vhost_vdpa_dma_unmap(v->shared, v->address_space_id, map->iova,
                             map->size + 1);
    if (unlikely(r != 0)) {
        error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
    }

    vhost_iova_tree_remove(tree, *map);
}

/** Map CVQ buffer. */
static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size,
                                  bool write)
{
    DMAMap map = {};
    hwaddr taddr = (hwaddr)(uintptr_t)buf;
    int r;

    map.size = size - 1;
    map.perm = write ? IOMMU_RW : IOMMU_RO;
    r = vhost_iova_tree_map_alloc(v->shared->iova_tree, &map, taddr);
    if (unlikely(r != IOVA_OK)) {
        error_report("Cannot map injected element");

        if (map.translated_addr == taddr) {
            error_report("Insertion to IOVA->HVA tree failed");
            /* Remove the mapping from the IOVA-only tree */
            goto dma_map_err;
        }
        return r;
    }

    r = vhost_vdpa_dma_map(v->shared, v->address_space_id, map.iova,
                           vhost_vdpa_net_cvq_cmd_page_len(), buf, !write);
    if (unlikely(r < 0)) {
        goto dma_map_err;
    }

    return 0;

dma_map_err:
    vhost_iova_tree_remove(v->shared->iova_tree, map);
    return r;
}

static int vhost_vdpa_net_cvq_start(NetClientState *nc)
{
    VhostVDPAState *s, *s0;
    struct vhost_vdpa *v;
    int64_t cvq_group;
    int r;
    Error *err = NULL;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    s = DO_UPCAST(VhostVDPAState, nc, nc);
    v = &s->vhost_vdpa;

    s0 = vhost_vdpa_net_first_nc_vdpa(s);
    v->shadow_vqs_enabled = s0->vhost_vdpa.shadow_vqs_enabled;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;

    if (v->shared->shadow_data) {
        /* SVQ is already configured for all virtqueues */
        goto out;
    }

    /*
     * If we early return in these cases SVQ will not be enabled. The migration
     * will be blocked as long as vhost-vdpa backends do not offer _F_LOG.
     */
    if (!vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
        return 0;
    }

    if (!s->cvq_isolated) {
        return 0;
    }

    cvq_group = vhost_vdpa_get_vring_group(v->shared->device_fd,
                                           v->dev->vq_index_end - 1,
                                           &err);
    if (unlikely(cvq_group < 0)) {
        error_report_err(err);
        return cvq_group;
    }

    r = vhost_vdpa_set_address_space_id(v, cvq_group, VHOST_VDPA_NET_CVQ_ASID);
    if (unlikely(r < 0)) {
        return r;
    }

    v->shadow_vqs_enabled = true;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_NET_CVQ_ASID;

out:
    if (!s->vhost_vdpa.shadow_vqs_enabled) {
        return 0;
    }

    /*
     * If other vhost_vdpa clients already have an iova_tree, reuse it for
     * simplicity, whether CVQ shares ASID with guest or not, because:
     * - The memory listener needs access to guest's memory addresses allocated
     *   in the IOVA tree.
     * - There should be plenty of IOVA address space for both ASIDs not to
     *   worry about collisions between them.  Guest's translations are still
     *   validated with virtio virtqueue_pop so there is no risk for the guest
     *   to access memory that it shouldn't.
     *
     * To allocate an iova tree per ASID is doable but it complicates the code
     * and it is not worth it for the moment.
     */
    if (!v->shared->iova_tree) {
        v->shared->iova_tree = vhost_iova_tree_new(v->shared->iova_range.first,
                                                   v->shared->iova_range.last);
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
                               vhost_vdpa_net_cvq_cmd_page_len(), false);
    if (unlikely(r < 0)) {
        return r;
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->status,
                               vhost_vdpa_net_cvq_cmd_page_len(), true);
    if (unlikely(r < 0)) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
    }

    return r;
}

static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.shadow_vqs_enabled) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
    }

    vhost_vdpa_net_client_stop(nc);
}
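
/*
 * Add a control command to the shadow CVQ.  The command is only queued here;
 * the caller polls the SVQ later to collect the device's answer.
 */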
static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s,
                                      const struct iovec *out_sg, size_t out_num,
                                      const struct iovec *in_sg, size_t in_num)
{
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
    int r;

    r = vhost_svq_add(svq, out_sg, out_num, NULL, in_sg, in_num, NULL, NULL);
    if (unlikely(r != 0)) {
        if (unlikely(r == -ENOSPC)) {
            qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
                          __func__);
        }
    }

    return r;
}

/*
 * Convenience wrapper to poll SVQ for multiple control commands.
 *
 * Caller should hold the BQL when invoking this function, and should consume
 * the answer before the BQL is released and SVQ starts polling on its own.
 */
static ssize_t vhost_vdpa_net_svq_poll(VhostVDPAState *s, size_t cmds_in_flight)
{
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);

    return vhost_svq_poll(svq, cmds_in_flight);
}

static void vhost_vdpa_net_load_cursor_reset(VhostVDPAState *s,
                                             struct iovec *out_cursor,
                                             struct iovec *in_cursor)
{
    /* reset the cursor of the output buffer for the device */
    out_cursor->iov_base = s->cvq_cmd_out_buffer;
    out_cursor->iov_len = vhost_vdpa_net_cvq_cmd_page_len();

    /* reset the cursor of the in buffer for the device */
    in_cursor->iov_base = s->status;
    in_cursor->iov_len = vhost_vdpa_net_cvq_cmd_page_len();
}

/*
 * Poll SVQ for multiple pending control commands and check the device's ack.
 *
 * Caller should hold the BQL when invoking this function.
 *
 * @s: The VhostVDPAState
 * @len: The length of the pending status shadow buffer
 */
static ssize_t vhost_vdpa_net_svq_flush(VhostVDPAState *s, size_t len)
{
    /* device uses a one-byte length ack for each control command */
    ssize_t dev_written = vhost_vdpa_net_svq_poll(s, len);

    if (unlikely(dev_written != len)) {
        return -EIO;
    }

    /* check the device's ack */
    for (int i = 0; i < len; ++i) {
        if (s->status[i] != VIRTIO_NET_OK) {
            return -EIO;
        }
    }
    return 0;
}
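
/*
 * Compose one CVQ command at the current cursors and add it to the shadow
 * CVQ.  If the SVQ or the shadow buffers are full, the pending commands are
 * flushed first and the cursors are reset.
 */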
static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s,
                                       struct iovec *out_cursor,
                                       struct iovec *in_cursor, uint8_t class,
                                       uint8_t cmd, const struct iovec *data_sg,
                                       size_t data_num)
{
    const struct virtio_net_ctrl_hdr ctrl = {
        .class = class,
        .cmd = cmd,
    };
    size_t data_size = iov_size(data_sg, data_num), cmd_size;
    struct iovec out, in;
    ssize_t r;
    unsigned dummy_cursor_iov_cnt;
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);

    assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));
    cmd_size = sizeof(ctrl) + data_size;
    trace_vhost_vdpa_net_load_cmd(s, class, cmd, data_num, data_size);
    if (vhost_svq_available_slots(svq) < 2 ||
        iov_size(out_cursor, 1) < cmd_size) {
        /*
         * It is time to flush all pending control commands if SVQ is full
         * or control commands shadow buffers are full.
         *
         * We can poll here since we've had BQL from the time
         * we sent the descriptor.
         */
        r = vhost_vdpa_net_svq_flush(s, in_cursor->iov_base -
                                     (void *)s->status);
        if (unlikely(r < 0)) {
            return r;
        }

        vhost_vdpa_net_load_cursor_reset(s, out_cursor, in_cursor);
    }

    /* pack the CVQ command header */
    iov_from_buf(out_cursor, 1, 0, &ctrl, sizeof(ctrl));
    /* pack the CVQ command's command-specific data */
    iov_to_buf(data_sg, data_num, 0,
               out_cursor->iov_base + sizeof(ctrl), data_size);

    /* extract the required buffer from the cursor for output */
    iov_copy(&out, 1, out_cursor, 1, 0, cmd_size);
    /* extract the required buffer from the cursor for input */
    iov_copy(&in, 1, in_cursor, 1, 0, sizeof(*s->status));

    r = vhost_vdpa_net_cvq_add(s, &out, 1, &in, 1);
    if (unlikely(r < 0)) {
        trace_vhost_vdpa_net_load_cmd_retval(s, class, cmd, r);
        return r;
    }

    /* iterate the cursors */
    dummy_cursor_iov_cnt = 1;
    iov_discard_front(&out_cursor, &dummy_cursor_iov_cnt, cmd_size);
    dummy_cursor_iov_cnt = 1;
    iov_discard_front(&in_cursor, &dummy_cursor_iov_cnt, sizeof(*s->status));

    return 0;
}

static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n,
                                   struct iovec *out_cursor,
                                   struct iovec *in_cursor)
{
    if (virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
        const struct iovec data = {
            .iov_base = (void *)n->mac,
            .iov_len = sizeof(n->mac),
        };
        ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                            VIRTIO_NET_CTRL_MAC,
                                            VIRTIO_NET_CTRL_MAC_ADDR_SET,
                                            &data, 1);
        if (unlikely(r < 0)) {
            return r;
        }
    }

    /*
     * According to VirtIO standard, "The device MUST have an
     * empty MAC filtering table on reset.".
     *
     * Therefore, there is no need to send this CVQ command if the
     * driver also sets an empty MAC filter table, which aligns with
     * the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX) ||
        n->mac_table.in_use == 0) {
        return 0;
    }

    uint32_t uni_entries = n->mac_table.first_multi,
             uni_macs_size = uni_entries * ETH_ALEN,
             mul_entries = n->mac_table.in_use - uni_entries,
             mul_macs_size = mul_entries * ETH_ALEN;
    struct virtio_net_ctrl_mac uni = {
        .entries = cpu_to_le32(uni_entries),
    };
    struct virtio_net_ctrl_mac mul = {
        .entries = cpu_to_le32(mul_entries),
    };
    const struct iovec data[] = {
        {
            .iov_base = &uni,
            .iov_len = sizeof(uni),
        }, {
            .iov_base = n->mac_table.macs,
            .iov_len = uni_macs_size,
        }, {
            .iov_base = &mul,
            .iov_len = sizeof(mul),
        }, {
            .iov_base = &n->mac_table.macs[uni_macs_size],
            .iov_len = mul_macs_size,
        },
    };
    ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_MAC,
                                        VIRTIO_NET_CTRL_MAC_TABLE_SET,
                                        data, ARRAY_SIZE(data));
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}

static int vhost_vdpa_net_load_rss(VhostVDPAState *s, const VirtIONet *n,
                                   struct iovec *out_cursor,
                                   struct iovec *in_cursor, bool do_rss)
{
    struct virtio_net_rss_config cfg = {};
    ssize_t r;
    g_autofree uint16_t *table = NULL;

    /*
     * According to VirtIO standard, "Initially the device has all hash
     * types disabled and reports only VIRTIO_NET_HASH_REPORT_NONE.".
     *
     * Therefore, there is no need to send this CVQ command if the
     * driver disables all hash types, which aligns with
     * the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (!n->rss_data.enabled ||
        n->rss_data.hash_types == VIRTIO_NET_HASH_REPORT_NONE) {
        return 0;
    }

    table = g_malloc_n(n->rss_data.indirections_len,
                       sizeof(n->rss_data.indirections_table[0]));
    cfg.hash_types = cpu_to_le32(n->rss_data.hash_types);

    if (do_rss) {
        /*
         * According to VirtIO standard, "Number of entries in indirection_table
         * is (indirection_table_mask + 1)".
         */
        cfg.indirection_table_mask = cpu_to_le16(n->rss_data.indirections_len -
                                                 1);
        cfg.unclassified_queue = cpu_to_le16(n->rss_data.default_queue);
        for (int i = 0; i < n->rss_data.indirections_len; ++i) {
            table[i] = cpu_to_le16(n->rss_data.indirections_table[i]);
        }
        cfg.max_tx_vq = cpu_to_le16(n->curr_queue_pairs);
    } else {
        /*
         * According to VirtIO standard, "Field reserved MUST contain zeroes.
         * It is defined to make the structure to match the layout of
         * virtio_net_rss_config structure, defined in 5.1.6.5.7.".
         *
         * Therefore, we need to zero the fields in
         * struct virtio_net_rss_config, which corresponds to the
         * `reserved` field in struct virtio_net_hash_config.
         *
         * Note that all other fields are zeroed at their definitions,
         * except for the `indirection_table` field, where the actual data
         * is stored in the `table` variable to ensure compatibility
         * with RSS case. Therefore, we need to zero the `table` variable here.
         */
        table[0] = 0;
    }

    /*
     * Considering that virtio_net_handle_rss() currently does not restore
     * the hash key length parsed from the CVQ command sent from the guest
     * into n->rss_data and uses the maximum key length in other code, we
     * also employ the maximum key length here.
     */
    cfg.hash_key_length = sizeof(n->rss_data.key);

    const struct iovec data[] = {
        {
            .iov_base = &cfg,
            .iov_len = offsetof(struct virtio_net_rss_config,
                                indirection_table),
        }, {
            .iov_base = table,
            .iov_len = n->rss_data.indirections_len *
                       sizeof(n->rss_data.indirections_table[0]),
        }, {
            .iov_base = &cfg.max_tx_vq,
            .iov_len = offsetof(struct virtio_net_rss_config, hash_key_data) -
                       offsetof(struct virtio_net_rss_config, max_tx_vq),
        }, {
            .iov_base = (void *)n->rss_data.key,
            .iov_len = sizeof(n->rss_data.key),
        }
    };

    r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                VIRTIO_NET_CTRL_MQ,
                                do_rss ? VIRTIO_NET_CTRL_MQ_RSS_CONFIG :
                                VIRTIO_NET_CTRL_MQ_HASH_CONFIG,
                                data, ARRAY_SIZE(data));
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}
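
/*
 * Restore the number of enabled queue pairs and, when the corresponding
 * feature is negotiated, the RSS or hash-report configuration.
 */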
static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
                                  const VirtIONet *n,
                                  struct iovec *out_cursor,
                                  struct iovec *in_cursor)
{
    struct virtio_net_ctrl_mq mq;
    ssize_t r;

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_MQ)) {
        return 0;
    }

    trace_vhost_vdpa_net_load_mq(s, n->curr_queue_pairs);

    mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
    const struct iovec data = {
        .iov_base = &mq,
        .iov_len = sizeof(mq),
    };
    r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                VIRTIO_NET_CTRL_MQ,
                                VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET,
                                &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    if (virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_RSS)) {
        /* load the receive-side scaling state */
        r = vhost_vdpa_net_load_rss(s, n, out_cursor, in_cursor, true);
        if (unlikely(r < 0)) {
            return r;
        }
    } else if (virtio_vdev_has_feature(&n->parent_obj,
                                       VIRTIO_NET_F_HASH_REPORT)) {
        /* load the hash calculation state */
        r = vhost_vdpa_net_load_rss(s, n, out_cursor, in_cursor, false);
        if (unlikely(r < 0)) {
            return r;
        }
    }

    return 0;
}

static int vhost_vdpa_net_load_offloads(VhostVDPAState *s,
                                        const VirtIONet *n,
                                        struct iovec *out_cursor,
                                        struct iovec *in_cursor)
{
    uint64_t offloads;
    ssize_t r;

    if (!virtio_vdev_has_feature(&n->parent_obj,
                                 VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
        return 0;
    }

    if (n->curr_guest_offloads == virtio_net_supported_guest_offloads(n)) {
        /*
         * According to VirtIO standard, "Upon feature negotiation
         * corresponding offload gets enabled to preserve
         * backward compatibility.".
         *
         * Therefore, there is no need to send this CVQ command if the
         * driver also enables all supported offloads, which aligns with
         * the device's defaults.
         *
         * Note that the device's defaults can mismatch the driver's
         * configuration only at live migration.
         */
        return 0;
    }

    offloads = cpu_to_le64(n->curr_guest_offloads);
    const struct iovec data = {
        .iov_base = &offloads,
        .iov_len = sizeof(offloads),
    };
    r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                VIRTIO_NET_CTRL_GUEST_OFFLOADS,
                                VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
                                &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}
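
/*
 * Send a single VIRTIO_NET_CTRL_RX command (@cmd) with the one-byte @on
 * argument as its payload.
 */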
static int vhost_vdpa_net_load_rx_mode(VhostVDPAState *s,
                                       struct iovec *out_cursor,
                                       struct iovec *in_cursor,
                                       uint8_t cmd,
                                       uint8_t on)
{
    const struct iovec data = {
        .iov_base = &on,
        .iov_len = sizeof(on),
    };
    ssize_t r;

    r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                VIRTIO_NET_CTRL_RX, cmd, &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}

static int vhost_vdpa_net_load_rx(VhostVDPAState *s,
                                  const VirtIONet *n,
                                  struct iovec *out_cursor,
                                  struct iovec *in_cursor)
{
    ssize_t r;

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX)) {
        return 0;
    }

    /*
     * According to virtio_net_reset(), device turns promiscuous mode
     * on by default.
     *
     * Additionally, according to VirtIO standard, "Since there are
     * no guarantees, it can use a hash filter or silently switch to
     * allmulti or promiscuous mode if it is given too many addresses.".
     * QEMU marks `n->mac_table.uni_overflow` if guest sets too many
     * non-multicast MAC addresses, indicating that promiscuous mode
     * should be enabled.
     *
     * Therefore, QEMU should only send this CVQ command if the
     * `n->mac_table.uni_overflow` is not marked and `n->promisc` is off,
     * which sets promiscuous mode off, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (!n->mac_table.uni_overflow && !n->promisc) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_PROMISC, 0);
        if (unlikely(r < 0)) {
            return r;
        }
    }

    /*
     * According to virtio_net_reset(), device turns all-multicast mode
     * off by default.
     *
     * According to VirtIO standard, "Since there are no guarantees,
     * it can use a hash filter or silently switch to allmulti or
     * promiscuous mode if it is given too many addresses.". QEMU marks
     * `n->mac_table.multi_overflow` if guest sets too many
     * multicast MAC addresses.
     *
     * Therefore, QEMU should only send this CVQ command if the
     * `n->mac_table.multi_overflow` is marked or `n->allmulti` is on,
     * which sets all-multicast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->mac_table.multi_overflow || n->allmulti) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_ALLMULTI, 1);
        if (unlikely(r < 0)) {
            return r;
        }
    }

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX_EXTRA)) {
        return 0;
    }

    /*
     * According to virtio_net_reset(), device turns all-unicast mode
     * off by default.
     *
     * Therefore, QEMU should only send this CVQ command if the driver
     * sets all-unicast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->alluni) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_ALLUNI, 1);
        if (r < 0) {
            return r;
        }
    }

    /*
     * According to virtio_net_reset(), device turns non-multicast mode
     * off by default.
     *
     * Therefore, QEMU should only send this CVQ command if the driver
     * sets non-multicast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->nomulti) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_NOMULTI, 1);
        if (r < 0) {
            return r;
        }
    }

    /*
     * According to virtio_net_reset(), device turns non-unicast mode
     * off by default.
     *
     * Therefore, QEMU should only send this CVQ command if the driver
     * sets non-unicast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->nouni) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_NOUNI, 1);
        if (r < 0) {
            return r;
        }
    }

    /*
     * According to virtio_net_reset(), device turns non-broadcast mode
     * off by default.
     *
     * Therefore, QEMU should only send this CVQ command if the driver
     * sets non-broadcast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->nobcast) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_NOBCAST, 1);
        if (r < 0) {
            return r;
        }
    }

    return 0;
}

static int vhost_vdpa_net_load_single_vlan(VhostVDPAState *s,
                                           const VirtIONet *n,
                                           struct iovec *out_cursor,
                                           struct iovec *in_cursor,
                                           uint16_t vid)
{
    const struct iovec data = {
        .iov_base = &vid,
        .iov_len = sizeof(vid),
    };
    ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_VLAN,
                                        VIRTIO_NET_CTRL_VLAN_ADD,
                                        &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}

static int vhost_vdpa_net_load_vlan(VhostVDPAState *s,
                                    const VirtIONet *n,
                                    struct iovec *out_cursor,
                                    struct iovec *in_cursor)
{
    int r;

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_VLAN)) {
        return 0;
    }

    for (int i = 0; i < MAX_VLAN >> 5; i++) {
        for (int j = 0; n->vlans[i] && j <= 0x1f; j++) {
            if (n->vlans[i] & (1U << j)) {
                r = vhost_vdpa_net_load_single_vlan(s, n, out_cursor,
                                                    in_cursor, (i << 5) + j);
                if (unlikely(r != 0)) {
                    return r;
                }
            }
        }
    }

    return 0;
}

static int vhost_vdpa_net_cvq_load(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;
    const VirtIONet *n;
    int r;
    struct iovec out_cursor, in_cursor;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    r = vhost_vdpa_set_vring_ready(v, v->dev->vq_index);
    if (unlikely(r < 0)) {
        return r;
    }

    if (v->shadow_vqs_enabled) {
        n = VIRTIO_NET(v->dev->vdev);
        vhost_vdpa_net_load_cursor_reset(s, &out_cursor, &in_cursor);
        r = vhost_vdpa_net_load_mac(s, n, &out_cursor, &in_cursor);
        if (unlikely(r < 0)) {
            return r;
        }
        r = vhost_vdpa_net_load_mq(s, n, &out_cursor, &in_cursor);
        if (unlikely(r)) {
            return r;
        }
        r = vhost_vdpa_net_load_offloads(s, n, &out_cursor, &in_cursor);
        if (unlikely(r)) {
            return r;
        }
        r = vhost_vdpa_net_load_rx(s, n, &out_cursor, &in_cursor);
        if (unlikely(r)) {
            return r;
        }
        r = vhost_vdpa_net_load_vlan(s, n, &out_cursor, &in_cursor);
        if (unlikely(r)) {
            return r;
        }

        /*
         * We need to poll and check all pending device's used buffers.
         *
         * We can poll here since we've had BQL from the time
         * we sent the descriptor.
         */
        r = vhost_vdpa_net_svq_flush(s, in_cursor.iov_base - (void *)s->status);
        if (unlikely(r)) {
            return r;
        }
    }

    for (int i = 0; i < v->dev->vq_index; ++i) {
        r = vhost_vdpa_set_vring_ready(v, i);
        if (unlikely(r < 0)) {
            return r;
        }
    }

    return 0;
}

static NetClientInfo net_vhost_vdpa_cvq_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .start = vhost_vdpa_net_cvq_start,
    .load = vhost_vdpa_net_cvq_load,
    .stop = vhost_vdpa_net_cvq_stop,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .check_peer_type = vhost_vdpa_check_peer_type,
    .set_steering_ebpf = vhost_vdpa_set_steering_ebpf,
};

/*
 * Forward the excessive VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command to
 * the vdpa device.
 *
 * Considering that QEMU cannot send the entire filter table to the
 * vdpa device, it should send the VIRTIO_NET_CTRL_RX_PROMISC CVQ
 * command to enable promiscuous mode to receive all packets,
 * according to VirtIO standard, "Since there are no guarantees,
 * it can use a hash filter or silently switch to allmulti or
 * promiscuous mode if it is given too many addresses.".
 *
 * Since QEMU ignores MAC addresses beyond `MAC_TABLE_ENTRIES` and
 * marks `n->mac_table.x_overflow` accordingly, it should have
 * the same effect on the device model to receive
 * (`MAC_TABLE_ENTRIES` + 1) or more non-multicast MAC addresses.
 * The same applies to multicast MAC addresses.
 *
 * Therefore, QEMU can provide the device model with a fake
 * VIRTIO_NET_CTRL_MAC_TABLE_SET command with (`MAC_TABLE_ENTRIES` + 1)
 * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1) multicast
 * MAC addresses. This ensures that the device model marks
 * `n->mac_table.uni_overflow` and `n->mac_table.multi_overflow`,
 * allowing all packets to be received, which aligns with the
 * state of the vdpa device.
 */
static int vhost_vdpa_net_excessive_mac_filter_cvq_add(VhostVDPAState *s,
                                                       VirtQueueElement *elem,
                                                       struct iovec *out,
                                                       const struct iovec *in)
{
    struct virtio_net_ctrl_mac mac_data, *mac_ptr;
    struct virtio_net_ctrl_hdr *hdr_ptr;
    uint32_t cursor;
    ssize_t r;
    uint8_t on = 1;

    /* parse the non-multicast MAC address entries from CVQ command */
    cursor = sizeof(*hdr_ptr);
    r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
                   &mac_data, sizeof(mac_data));
    if (unlikely(r != sizeof(mac_data))) {
        /*
         * If the CVQ command is invalid, we should simulate the vdpa device
         * to reject the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
         */
        *s->status = VIRTIO_NET_ERR;
        return sizeof(*s->status);
    }
    cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;

    /* parse the multicast MAC address entries from CVQ command */
    r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
                   &mac_data, sizeof(mac_data));
    if (r != sizeof(mac_data)) {
        /*
         * If the CVQ command is invalid, we should simulate the vdpa device
         * to reject the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
         */
        *s->status = VIRTIO_NET_ERR;
        return sizeof(*s->status);
    }
    cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;

    /* validate the CVQ command */
    if (iov_size(elem->out_sg, elem->out_num) != cursor) {
        /*
         * If the CVQ command is invalid, we should simulate the vdpa device
         * to reject the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
         */
        *s->status = VIRTIO_NET_ERR;
        return sizeof(*s->status);
    }

    /*
     * According to VirtIO standard, "Since there are no guarantees,
     * it can use a hash filter or silently switch to allmulti or
     * promiscuous mode if it is given too many addresses.".
     *
     * Therefore, considering that QEMU is unable to send the entire
     * filter table to the vdpa device, it should send the
     * VIRTIO_NET_CTRL_RX_PROMISC CVQ command to enable promiscuous mode
     */
    hdr_ptr = out->iov_base;
    out->iov_len = sizeof(*hdr_ptr) + sizeof(on);

    hdr_ptr->class = VIRTIO_NET_CTRL_RX;
    hdr_ptr->cmd = VIRTIO_NET_CTRL_RX_PROMISC;
    iov_from_buf(out, 1, sizeof(*hdr_ptr), &on, sizeof(on));
    r = vhost_vdpa_net_cvq_add(s, out, 1, in, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    /*
     * We can poll here since we've had BQL from the time
     * we sent the descriptor.
     */
    r = vhost_vdpa_net_svq_poll(s, 1);
    if (unlikely(r < sizeof(*s->status))) {
        return r;
    }
    if (*s->status != VIRTIO_NET_OK) {
        return sizeof(*s->status);
    }

    /*
     * QEMU should also send a fake VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ
     * command to the device model, including (`MAC_TABLE_ENTRIES` + 1)
     * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1)
     * multicast MAC addresses.
     *
     * By doing so, the device model can mark `n->mac_table.uni_overflow`
     * and `n->mac_table.multi_overflow`, enabling all packets to be
     * received, which aligns with the state of the vdpa device.
     */
    cursor = 0;
    uint32_t fake_uni_entries = MAC_TABLE_ENTRIES + 1,
             fake_mul_entries = MAC_TABLE_ENTRIES + 1,
             fake_cvq_size = sizeof(struct virtio_net_ctrl_hdr) +
                             sizeof(mac_data) + fake_uni_entries * ETH_ALEN +
                             sizeof(mac_data) + fake_mul_entries * ETH_ALEN;

    assert(fake_cvq_size < vhost_vdpa_net_cvq_cmd_page_len());
    out->iov_len = fake_cvq_size;

    /* pack the header for fake CVQ command */
    hdr_ptr = out->iov_base + cursor;
    hdr_ptr->class = VIRTIO_NET_CTRL_MAC;
    hdr_ptr->cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
    cursor += sizeof(*hdr_ptr);

    /*
     * Pack the non-multicast MAC addresses part for fake CVQ command.
     *
     * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
     * addresses provided in CVQ command. Therefore, only the entries
     * field needs to be prepared in the CVQ command.
     */
    mac_ptr = out->iov_base + cursor;
    mac_ptr->entries = cpu_to_le32(fake_uni_entries);
    cursor += sizeof(*mac_ptr) + fake_uni_entries * ETH_ALEN;

    /*
     * Pack the multicast MAC addresses part for fake CVQ command.
     *
     * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
     * addresses provided in CVQ command. Therefore, only the entries
     * field needs to be prepared in the CVQ command.
     */
    mac_ptr = out->iov_base + cursor;
    mac_ptr->entries = cpu_to_le32(fake_mul_entries);

    /*
     * Simulate that QEMU has polled a vdpa device used buffer
     * for the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command.
     */
    return sizeof(*s->status);
}
  1307. /**
  1308. * Validate and copy control virtqueue commands.
  1309. *
  1310. * Following QEMU guidelines, we offer a copy of the buffers to the device to
  1311. * prevent TOCTOU bugs.
  1312. */
  1313. static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
  1314. VirtQueueElement *elem,
  1315. void *opaque)
  1316. {
  1317. VhostVDPAState *s = opaque;
  1318. size_t in_len;
  1319. const struct virtio_net_ctrl_hdr *ctrl;
  1320. virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
  1321. /* Out buffer sent to both the vdpa device and the device model */
  1322. struct iovec out = {
  1323. .iov_base = s->cvq_cmd_out_buffer,
  1324. };
  1325. /* in buffer used for device model */
  1326. const struct iovec model_in = {
  1327. .iov_base = &status,
  1328. .iov_len = sizeof(status),
  1329. };
  1330. /* in buffer used for vdpa device */
  1331. const struct iovec vdpa_in = {
  1332. .iov_base = s->status,
  1333. .iov_len = sizeof(*s->status),
  1334. };
  1335. ssize_t dev_written = -EINVAL;
    out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0,
                             s->cvq_cmd_out_buffer,
                             vhost_vdpa_net_cvq_cmd_page_len());

    ctrl = s->cvq_cmd_out_buffer;
    if (ctrl->class == VIRTIO_NET_CTRL_ANNOUNCE) {
        /*
         * Guest announce capability is emulated by qemu, so don't forward to
         * the device.
         */
        dev_written = sizeof(status);
        *s->status = VIRTIO_NET_OK;
    } else if (unlikely(ctrl->class == VIRTIO_NET_CTRL_MAC &&
                        ctrl->cmd == VIRTIO_NET_CTRL_MAC_TABLE_SET &&
                        iov_size(elem->out_sg, elem->out_num) > out.iov_len)) {
        /*
         * Due to the size limitation of the out buffer sent to the vdpa device,
         * which is determined by vhost_vdpa_net_cvq_cmd_page_len(), excessive
         * MAC addresses set by the driver for the filter table can cause
         * truncation of the CVQ command in QEMU. As a result, the vdpa device
         * rejects the flawed CVQ command.
         *
         * Therefore, QEMU must handle this situation instead of sending
         * the CVQ command directly.
         */
        dev_written = vhost_vdpa_net_excessive_mac_filter_cvq_add(s, elem,
                                                                  &out,
                                                                  &vdpa_in);
        if (unlikely(dev_written < 0)) {
            goto out;
        }
    } else {
        ssize_t r;
        r = vhost_vdpa_net_cvq_add(s, &out, 1, &vdpa_in, 1);
        if (unlikely(r < 0)) {
            dev_written = r;
            goto out;
        }

        /*
         * We can poll here since we've had BQL from the time
         * we sent the descriptor.
         */
        dev_written = vhost_vdpa_net_svq_poll(s, 1);
    }

    if (unlikely(dev_written < sizeof(status))) {
        error_report("Insufficient written data (%zu)", dev_written);
        goto out;
    }

    if (*s->status != VIRTIO_NET_OK) {
        goto out;
    }
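
    /*
     * The device accepted the command; now run it through QEMU's network
     * device model as well, so the emulated NIC state (filters, queues,
     * announce, ...) stays in sync with what the vdpa device applied.
     */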
    status = VIRTIO_NET_ERR;
    virtio_net_handle_ctrl_iov(svq->vdev, &model_in, 1, &out, 1);
    if (status != VIRTIO_NET_OK) {
        error_report("Bad CVQ processing in model");
    }

out:
    in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
                          sizeof(status));
    if (unlikely(in_len < sizeof(status))) {
        error_report("Bad device CVQ written length");
    }
    vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
    /*
     * `elem` belongs to vhost_vdpa_net_handle_ctrl_avail() only when
     * the function successfully forwards the CVQ command, indicated
     * by a non-negative value of `dev_written`. Otherwise, it still
     * belongs to SVQ.
     * This function should only free `elem` when it owns it.
     */
    if (dev_written >= 0) {
        g_free(elem);
    }
    return dev_written < 0 ? dev_written : 0;
}

static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
    .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
};

/**
 * Probe if CVQ is isolated
 *
 * @device_fd         The vdpa device fd
 * @features          Features offered by the device.
 * @cvq_index         The control vq pair index
 *
 * Returns <0 in case of failure, 0 if false and 1 if true.
 */
static int vhost_vdpa_probe_cvq_isolation(int device_fd, uint64_t features,
                                          int cvq_index, Error **errp)
{
    ERRP_GUARD();
    uint64_t backend_features;
    int64_t cvq_group;
    uint8_t status = VIRTIO_CONFIG_S_ACKNOWLEDGE |
                     VIRTIO_CONFIG_S_DRIVER;
    int r;

    r = ioctl(device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
    if (unlikely(r < 0)) {
        error_setg_errno(errp, errno, "Cannot get vdpa backend_features");
        return r;
    }

    if (!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID))) {
        return 0;
    }
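
    /*
     * Bring the device up to FEATURES_OK so that the vring group layout
     * reflects the negotiated features; the status is reset to 0 again at
     * the "out" label below.
     */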
    r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set device status");
        goto out;
    }

    r = ioctl(device_fd, VHOST_SET_FEATURES, &features);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set features");
        goto out;
    }

    status |= VIRTIO_CONFIG_S_FEATURES_OK;
    r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set device status");
        goto out;
    }

    cvq_group = vhost_vdpa_get_vring_group(device_fd, cvq_index, errp);
    if (unlikely(cvq_group < 0)) {
        if (cvq_group != -ENOTSUP) {
            r = cvq_group;
            goto out;
        }

        /*
         * The kernel reports VHOST_BACKEND_F_IOTLB_ASID if the vdpa frontend
         * supports ASID even if the parent driver does not. The CVQ cannot be
         * isolated in this case.
         */
        error_free(*errp);
        *errp = NULL;
        r = 0;
        goto out;
    }
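
    /*
     * The CVQ is isolated only if no data vq shares its ASID group:
     * compare every data vq's group against the CVQ's group.
     */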
    for (int i = 0; i < cvq_index; ++i) {
        int64_t group = vhost_vdpa_get_vring_group(device_fd, i, errp);
        if (unlikely(group < 0)) {
            r = group;
            goto out;
        }

        if (group == (int64_t)cvq_group) {
            r = 0;
            goto out;
        }
    }

    r = 1;

out:
    status = 0;
    ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
    return r;
}

static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                           const char *device,
                                           const char *name,
                                           int vdpa_device_fd,
                                           int queue_pair_index,
                                           int nvqs,
                                           bool is_datapath,
                                           bool svq,
                                           struct vhost_vdpa_iova_range iova_range,
                                           uint64_t features,
                                           VhostVDPAShared *shared,
                                           Error **errp)
{
    NetClientState *nc = NULL;
    VhostVDPAState *s;
    int ret = 0;
    assert(name);
    int cvq_isolated = 0;

    if (is_datapath) {
        nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
                                 name);
    } else {
        cvq_isolated = vhost_vdpa_probe_cvq_isolation(vdpa_device_fd, features,
                                                      queue_pair_index * 2,
                                                      errp);
        if (unlikely(cvq_isolated < 0)) {
            return NULL;
        }

        nc = qemu_new_net_control_client(&net_vhost_vdpa_cvq_info, peer,
                                         device, name);
    }
    qemu_set_info_str(nc, TYPE_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, nc);

    s->vhost_vdpa.index = queue_pair_index;
    s->always_svq = svq;
    s->migration_state.notify = NULL;
    s->vhost_vdpa.shadow_vqs_enabled = svq;
    if (queue_pair_index == 0) {
        vhost_vdpa_net_valid_svq_features(features,
                                          &s->vhost_vdpa.migration_blocker);
        s->vhost_vdpa.shared = g_new0(VhostVDPAShared, 1);
        s->vhost_vdpa.shared->device_fd = vdpa_device_fd;
        s->vhost_vdpa.shared->iova_range = iova_range;
        s->vhost_vdpa.shared->shadow_data = svq;
    } else if (!is_datapath) {
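        /*
         * The CVQ NetClientState gets its own page-sized, page-aligned
         * bounce buffers for the command and the status reply; mmap() is
         * used so they can be DMA-mapped into the device independently.
         */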
        s->cvq_cmd_out_buffer = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
                                     PROT_READ | PROT_WRITE,
                                     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        s->status = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
                         -1, 0);

        s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
        s->vhost_vdpa.shadow_vq_ops_opaque = s;
        s->cvq_isolated = cvq_isolated;
    }
    if (queue_pair_index != 0) {
        s->vhost_vdpa.shared = shared;
    }

    ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
    if (ret) {
        qemu_del_net_client(nc);
        return NULL;
    }

    return nc;
}

static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
{
    int ret = ioctl(fd, VHOST_GET_FEATURES, features);
    if (unlikely(ret < 0)) {
        error_setg_errno(errp, errno,
                         "Fail to query features from vhost-vDPA device");
    }
    return ret;
}

static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
                                          int *has_cvq, Error **errp)
{
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    g_autofree struct vhost_vdpa_config *config = NULL;
    __virtio16 *max_queue_pairs;
    int ret;

    if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
        *has_cvq = 1;
    } else {
        *has_cvq = 0;
    }
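
    /*
     * Without VIRTIO_NET_F_MQ the device has exactly one queue pair;
     * otherwise read max_virtqueue_pairs from the device config space.
     */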
    if (features & (1 << VIRTIO_NET_F_MQ)) {
        config = g_malloc0(config_size + sizeof(*max_queue_pairs));
        config->off = offsetof(struct virtio_net_config, max_virtqueue_pairs);
        config->len = sizeof(*max_queue_pairs);

        ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, config);
        if (ret) {
            error_setg(errp, "Fail to get config from vhost-vDPA device");
            return -ret;
        }

        max_queue_pairs = (__virtio16 *)&config->buf;

        return lduw_le_p(max_queue_pairs);
    }

    return 1;
}

int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
                        NetClientState *peer, Error **errp)
{
    ERRP_GUARD();
    const NetdevVhostVDPAOptions *opts;
    uint64_t features;
    int vdpa_device_fd;
    g_autofree NetClientState **ncs = NULL;
    struct vhost_vdpa_iova_range iova_range;
    NetClientState *nc;
    int queue_pairs, r, i = 0, has_cvq = 0;

    assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    opts = &netdev->u.vhost_vdpa;
    if (!opts->vhostdev && !opts->vhostfd) {
        error_setg(errp,
                   "vhost-vdpa: neither vhostdev= nor vhostfd= was specified");
        return -1;
    }

    if (opts->vhostdev && opts->vhostfd) {
        error_setg(errp,
                   "vhost-vdpa: vhostdev= and vhostfd= are mutually exclusive");
        return -1;
    }

    if (opts->vhostdev) {
        vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
        if (vdpa_device_fd == -1) {
            return -errno;
        }
    } else {
        /* has_vhostfd */
        vdpa_device_fd = monitor_fd_param(monitor_cur(), opts->vhostfd, errp);
        if (vdpa_device_fd == -1) {
            error_prepend(errp, "vhost-vdpa: unable to parse vhostfd: ");
            return -1;
        }
    }

    r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
    if (unlikely(r < 0)) {
        goto err;
    }

    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
                                                 &has_cvq, errp);
    if (queue_pairs < 0) {
        qemu_close(vdpa_device_fd);
        return queue_pairs;
    }

    r = vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
    if (unlikely(r < 0)) {
        error_setg(errp, "vhost-vdpa: get iova range failed: %s",
                   strerror(-r));
        goto err;
    }

    if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) {
        goto err;
    }

    ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
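
    /*
     * Create one NetClientState per data queue pair; every pair after the
     * first reuses the VhostVDPAShared state allocated for queue pair 0.
     */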
    for (i = 0; i < queue_pairs; i++) {
        VhostVDPAShared *shared = NULL;

        if (i) {
            shared = DO_UPCAST(VhostVDPAState, nc, ncs[0])->vhost_vdpa.shared;
        }
        ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                     vdpa_device_fd, i, 2, true, opts->x_svq,
                                     iova_range, features, shared, errp);
        if (!ncs[i]) {
            goto err;
        }
    }
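
    /*
     * If the device offers VIRTIO_NET_F_CTRL_VQ, add one more NetClientState
     * for the control virtqueue (a single vq, hence nvqs == 1).
     */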
    if (has_cvq) {
        VhostVDPAState *s0 = DO_UPCAST(VhostVDPAState, nc, ncs[0]);
        VhostVDPAShared *shared = s0->vhost_vdpa.shared;

        nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                 vdpa_device_fd, i, 1, false,
                                 opts->x_svq, iova_range, features, shared,
                                 errp);
        if (!nc) {
            goto err;
        }
    }

    return 0;

err:
    if (i) {
        for (i--; i >= 0; i--) {
            qemu_del_net_client(ncs[i]);
        }
    }

    qemu_close(vdpa_device_fd);
    return -1;
}