vhost.c 52 KB

  1. /*
  2. * vhost support
  3. *
  4. * Copyright Red Hat, Inc. 2010
  5. *
  6. * Authors:
  7. * Michael S. Tsirkin <mst@redhat.com>
  8. *
  9. * This work is licensed under the terms of the GNU GPL, version 2. See
  10. * the COPYING file in the top-level directory.
  11. *
  12. * Contributions after 2012-01-13 are licensed under the terms of the
  13. * GNU GPL, version 2 or (at your option) any later version.
  14. */
  15. #include "qemu/osdep.h"
  16. #include "qapi/error.h"
  17. #include "hw/virtio/vhost.h"
  18. #include "qemu/atomic.h"
  19. #include "qemu/range.h"
  20. #include "qemu/error-report.h"
  21. #include "qemu/memfd.h"
  22. #include "standard-headers/linux/vhost_types.h"
  23. #include "exec/address-spaces.h"
  24. #include "hw/virtio/virtio-bus.h"
  25. #include "hw/virtio/virtio-access.h"
  26. #include "migration/blocker.h"
  27. #include "migration/qemu-file-types.h"
  28. #include "sysemu/dma.h"
  29. #include "trace.h"
  30. /* enabled until disconnected backend stabilizes */
  31. #define _VHOST_DEBUG 1
  32. #ifdef _VHOST_DEBUG
  33. #define VHOST_OPS_DEBUG(fmt, ...) \
  34. do { error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
  35. strerror(errno), errno); } while (0)
  36. #else
  37. #define VHOST_OPS_DEBUG(fmt, ...) \
  38. do { } while (0)
  39. #endif
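/* Example: with _VHOST_DEBUG defined, a call such as
 *     VHOST_OPS_DEBUG("vhost_set_owner failed");
 * expands to roughly
 *     error_report("vhost_set_owner failed" ": %s (%d)", strerror(errno), errno);
 * so every backend failure message carries the current errno.  With
 * _VHOST_DEBUG undefined, the macro compiles away to an empty statement.
 */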
  40. static struct vhost_log *vhost_log;
  41. static struct vhost_log *vhost_log_shm;
  42. static unsigned int used_memslots;
  43. static QLIST_HEAD(, vhost_dev) vhost_devices =
  44. QLIST_HEAD_INITIALIZER(vhost_devices);
  45. bool vhost_has_free_slot(void)
  46. {
  47. unsigned int slots_limit = ~0U;
  48. struct vhost_dev *hdev;
  49. QLIST_FOREACH(hdev, &vhost_devices, entry) {
  50. unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
  51. slots_limit = MIN(slots_limit, r);
  52. }
  53. return slots_limit > used_memslots;
  54. }
  55. static void vhost_dev_sync_region(struct vhost_dev *dev,
  56. MemoryRegionSection *section,
  57. uint64_t mfirst, uint64_t mlast,
  58. uint64_t rfirst, uint64_t rlast)
  59. {
  60. vhost_log_chunk_t *log = dev->log->log;
  61. uint64_t start = MAX(mfirst, rfirst);
  62. uint64_t end = MIN(mlast, rlast);
  63. vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
  64. vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
  65. uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
  66. if (end < start) {
  67. return;
  68. }
  69. assert(end / VHOST_LOG_CHUNK < dev->log_size);
  70. assert(start / VHOST_LOG_CHUNK < dev->log_size);
  71. for (;from < to; ++from) {
  72. vhost_log_chunk_t log;
  73. /* We first check with non-atomic: much cheaper,
  74. * and we expect non-dirty to be the common case. */
  75. if (!*from) {
  76. addr += VHOST_LOG_CHUNK;
  77. continue;
  78. }
  79. /* Data must be read atomically. We don't really need barrier semantics
  80. * but it's easier to use atomic_* than roll our own. */
  81. log = atomic_xchg(from, 0);
  82. while (log) {
  83. int bit = ctzl(log);
  84. hwaddr page_addr;
  85. hwaddr section_offset;
  86. hwaddr mr_offset;
  87. page_addr = addr + bit * VHOST_LOG_PAGE;
  88. section_offset = page_addr - section->offset_within_address_space;
  89. mr_offset = section_offset + section->offset_within_region;
  90. memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
  91. log &= ~(0x1ull << bit);
  92. }
  93. addr += VHOST_LOG_CHUNK;
  94. }
  95. }
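/* Layout note: dev->log is an array of vhost_log_chunk_t bitmaps.  Each set
 * bit marks one VHOST_LOG_PAGE-sized page as dirty, so one chunk covers
 * VHOST_LOG_CHUNK bytes of guest memory (with the usual 64-bit chunk type
 * and 4 KiB log page, that is 256 KiB per chunk).  The loop above clears
 * each non-zero chunk with an atomic exchange and feeds every dirty page
 * back into QEMU's dirty bitmap via memory_region_set_dirty().
 */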
  96. static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
  97. MemoryRegionSection *section,
  98. hwaddr first,
  99. hwaddr last)
  100. {
  101. int i;
  102. hwaddr start_addr;
  103. hwaddr end_addr;
  104. if (!dev->log_enabled || !dev->started) {
  105. return 0;
  106. }
  107. start_addr = section->offset_within_address_space;
  108. end_addr = range_get_last(start_addr, int128_get64(section->size));
  109. start_addr = MAX(first, start_addr);
  110. end_addr = MIN(last, end_addr);
  111. for (i = 0; i < dev->mem->nregions; ++i) {
  112. struct vhost_memory_region *reg = dev->mem->regions + i;
  113. vhost_dev_sync_region(dev, section, start_addr, end_addr,
  114. reg->guest_phys_addr,
  115. range_get_last(reg->guest_phys_addr,
  116. reg->memory_size));
  117. }
  118. for (i = 0; i < dev->nvqs; ++i) {
  119. struct vhost_virtqueue *vq = dev->vqs + i;
  120. if (!vq->used_phys && !vq->used_size) {
  121. continue;
  122. }
  123. vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
  124. range_get_last(vq->used_phys, vq->used_size));
  125. }
  126. return 0;
  127. }
  128. static void vhost_log_sync(MemoryListener *listener,
  129. MemoryRegionSection *section)
  130. {
  131. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  132. memory_listener);
  133. vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
  134. }
  135. static void vhost_log_sync_range(struct vhost_dev *dev,
  136. hwaddr first, hwaddr last)
  137. {
  138. int i;
  139. /* FIXME: this is N^2 in number of sections */
  140. for (i = 0; i < dev->n_mem_sections; ++i) {
  141. MemoryRegionSection *section = &dev->mem_sections[i];
  142. vhost_sync_dirty_bitmap(dev, section, first, last);
  143. }
  144. }
  145. static uint64_t vhost_get_log_size(struct vhost_dev *dev)
  146. {
  147. uint64_t log_size = 0;
  148. int i;
  149. for (i = 0; i < dev->mem->nregions; ++i) {
  150. struct vhost_memory_region *reg = dev->mem->regions + i;
  151. uint64_t last = range_get_last(reg->guest_phys_addr,
  152. reg->memory_size);
  153. log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
  154. }
  155. for (i = 0; i < dev->nvqs; ++i) {
  156. struct vhost_virtqueue *vq = dev->vqs + i;
  157. if (!vq->used_phys && !vq->used_size) {
  158. continue;
  159. }
  160. uint64_t last = vq->used_phys + vq->used_size - 1;
  161. log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
  162. }
  163. return log_size;
  164. }
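/* Sizing example, assuming the usual definitions in vhost.h
 * (VHOST_LOG_PAGE = 0x1000 and a 64-bit vhost_log_chunk_t, so
 * VHOST_LOG_CHUNK = 64 * 4 KiB = 256 KiB of guest memory per chunk):
 * a single RAM region ending at GPA 0xffffffff needs
 * 0xffffffff / 0x40000 + 1 = 16384 chunks, i.e. a 128 KiB dirty log.
 * The value returned here is a chunk count, not a byte count; callers
 * multiply by sizeof(*log->log) when allocating (see vhost_log_alloc).
 */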
  165. static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
  166. {
  167. Error *err = NULL;
  168. struct vhost_log *log;
  169. uint64_t logsize = size * sizeof(*(log->log));
  170. int fd = -1;
  171. log = g_new0(struct vhost_log, 1);
  172. if (share) {
  173. log->log = qemu_memfd_alloc("vhost-log", logsize,
  174. F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
  175. &fd, &err);
  176. if (err) {
  177. error_report_err(err);
  178. g_free(log);
  179. return NULL;
  180. }
  181. memset(log->log, 0, logsize);
  182. } else {
  183. log->log = g_malloc0(logsize);
  184. }
  185. log->size = size;
  186. log->refcnt = 1;
  187. log->fd = fd;
  188. return log;
  189. }
  190. static struct vhost_log *vhost_log_get(uint64_t size, bool share)
  191. {
  192. struct vhost_log *log = share ? vhost_log_shm : vhost_log;
  193. if (!log || log->size != size) {
  194. log = vhost_log_alloc(size, share);
  195. if (share) {
  196. vhost_log_shm = log;
  197. } else {
  198. vhost_log = log;
  199. }
  200. } else {
  201. ++log->refcnt;
  202. }
  203. return log;
  204. }
  205. static void vhost_log_put(struct vhost_dev *dev, bool sync)
  206. {
  207. struct vhost_log *log = dev->log;
  208. if (!log) {
  209. return;
  210. }
  211. --log->refcnt;
  212. if (log->refcnt == 0) {
  213. /* Sync only the range covered by the old log */
  214. if (dev->log_size && sync) {
  215. vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
  216. }
  217. if (vhost_log == log) {
  218. g_free(log->log);
  219. vhost_log = NULL;
  220. } else if (vhost_log_shm == log) {
  221. qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
  222. log->fd);
  223. vhost_log_shm = NULL;
  224. }
  225. g_free(log);
  226. }
  227. dev->log = NULL;
  228. dev->log_size = 0;
  229. }
  230. static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
  231. {
  232. return dev->vhost_ops->vhost_requires_shm_log &&
  233. dev->vhost_ops->vhost_requires_shm_log(dev);
  234. }
  235. static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
  236. {
  237. struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
  238. uint64_t log_base = (uintptr_t)log->log;
  239. int r;
  240. /* inform the backend of the log switch; this must be done before
  241. releasing the current log, to ensure no logging is lost */
  242. r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
  243. if (r < 0) {
  244. VHOST_OPS_DEBUG("vhost_set_log_base failed");
  245. }
  246. vhost_log_put(dev, true);
  247. dev->log = log;
  248. dev->log_size = size;
  249. }
  250. static int vhost_dev_has_iommu(struct vhost_dev *dev)
  251. {
  252. VirtIODevice *vdev = dev->vdev;
  253. return virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
  254. }
  255. static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
  256. hwaddr *plen, int is_write)
  257. {
  258. if (!vhost_dev_has_iommu(dev)) {
  259. return cpu_physical_memory_map(addr, plen, is_write);
  260. } else {
  261. return (void *)(uintptr_t)addr;
  262. }
  263. }
  264. static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
  265. hwaddr len, int is_write,
  266. hwaddr access_len)
  267. {
  268. if (!vhost_dev_has_iommu(dev)) {
  269. cpu_physical_memory_unmap(buffer, len, is_write, access_len);
  270. }
  271. }
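/* When VIRTIO_F_IOMMU_PLATFORM is negotiated, the ring addresses handed to
 * the backend are guest IOVAs rather than host pointers: the backend
 * translates them itself through the IOTLB (see vhost_device_iotlb_miss),
 * which is why vhost_memory_map() simply returns the address unchanged and
 * vhost_memory_unmap() has nothing to undo in that case.
 */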
  272. static int vhost_verify_ring_part_mapping(void *ring_hva,
  273. uint64_t ring_gpa,
  274. uint64_t ring_size,
  275. void *reg_hva,
  276. uint64_t reg_gpa,
  277. uint64_t reg_size)
  278. {
  279. uint64_t hva_ring_offset;
  280. uint64_t ring_last = range_get_last(ring_gpa, ring_size);
  281. uint64_t reg_last = range_get_last(reg_gpa, reg_size);
  282. if (ring_last < reg_gpa || ring_gpa > reg_last) {
  283. return 0;
  284. }
  285. /* check that the whole ring is mapped */
  286. if (ring_last > reg_last) {
  287. return -ENOMEM;
  288. }
  289. /* check that the ring's MemoryRegion wasn't replaced */
  290. hva_ring_offset = ring_gpa - reg_gpa;
  291. if (ring_hva != reg_hva + hva_ring_offset) {
  292. return -EBUSY;
  293. }
  294. return 0;
  295. }
  296. static int vhost_verify_ring_mappings(struct vhost_dev *dev,
  297. void *reg_hva,
  298. uint64_t reg_gpa,
  299. uint64_t reg_size)
  300. {
  301. int i, j;
  302. int r = 0;
  303. const char *part_name[] = {
  304. "descriptor table",
  305. "available ring",
  306. "used ring"
  307. };
  308. if (vhost_dev_has_iommu(dev)) {
  309. return 0;
  310. }
  311. for (i = 0; i < dev->nvqs; ++i) {
  312. struct vhost_virtqueue *vq = dev->vqs + i;
  313. if (vq->desc_phys == 0) {
  314. continue;
  315. }
  316. j = 0;
  317. r = vhost_verify_ring_part_mapping(
  318. vq->desc, vq->desc_phys, vq->desc_size,
  319. reg_hva, reg_gpa, reg_size);
  320. if (r) {
  321. break;
  322. }
  323. j++;
  324. r = vhost_verify_ring_part_mapping(
  325. vq->avail, vq->avail_phys, vq->avail_size,
  326. reg_hva, reg_gpa, reg_size);
  327. if (r) {
  328. break;
  329. }
  330. j++;
  331. r = vhost_verify_ring_part_mapping(
  332. vq->used, vq->used_phys, vq->used_size,
  333. reg_hva, reg_gpa, reg_size);
  334. if (r) {
  335. break;
  336. }
  337. }
  338. if (r == -ENOMEM) {
  339. error_report("Unable to map %s for ring %d", part_name[j], i);
  340. } else if (r == -EBUSY) {
  341. error_report("%s relocated for ring %d", part_name[j], i);
  342. }
  343. return r;
  344. }
  345. static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
  346. {
  347. bool result;
  348. bool log_dirty = memory_region_get_dirty_log_mask(section->mr) &
  349. ~(1 << DIRTY_MEMORY_MIGRATION);
  350. result = memory_region_is_ram(section->mr) &&
  351. !memory_region_is_rom(section->mr);
  352. /* Vhost doesn't handle any block which is doing dirty-tracking other
  353. * than migration; this typically fires on VGA areas.
  354. */
  355. result &= !log_dirty;
  356. if (result && dev->vhost_ops->vhost_backend_mem_section_filter) {
  357. result &=
  358. dev->vhost_ops->vhost_backend_mem_section_filter(dev, section);
  359. }
  360. trace_vhost_section(section->mr->name, result);
  361. return result;
  362. }
  363. static void vhost_begin(MemoryListener *listener)
  364. {
  365. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  366. memory_listener);
  367. dev->tmp_sections = NULL;
  368. dev->n_tmp_sections = 0;
  369. }
  370. static void vhost_commit(MemoryListener *listener)
  371. {
  372. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  373. memory_listener);
  374. MemoryRegionSection *old_sections;
  375. int n_old_sections;
  376. uint64_t log_size;
  377. size_t regions_size;
  378. int r;
  379. int i;
  380. bool changed = false;
  381. /* Note we can be called before the device is started, but then
  382. * starting the device calls set_mem_table, so we need to have
  383. * built the data structures.
  384. */
  385. old_sections = dev->mem_sections;
  386. n_old_sections = dev->n_mem_sections;
  387. dev->mem_sections = dev->tmp_sections;
  388. dev->n_mem_sections = dev->n_tmp_sections;
  389. if (dev->n_mem_sections != n_old_sections) {
  390. changed = true;
  391. } else {
  392. /* Same size, let's check the contents */
  393. for (int i = 0; i < n_old_sections; i++) {
  394. if (!MemoryRegionSection_eq(&old_sections[i],
  395. &dev->mem_sections[i])) {
  396. changed = true;
  397. break;
  398. }
  399. }
  400. }
  401. trace_vhost_commit(dev->started, changed);
  402. if (!changed) {
  403. goto out;
  404. }
  405. /* Rebuild the regions list from the new sections list */
  406. regions_size = offsetof(struct vhost_memory, regions) +
  407. dev->n_mem_sections * sizeof dev->mem->regions[0];
  408. dev->mem = g_realloc(dev->mem, regions_size);
  409. dev->mem->nregions = dev->n_mem_sections;
  410. used_memslots = dev->mem->nregions;
  411. for (i = 0; i < dev->n_mem_sections; i++) {
  412. struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
  413. struct MemoryRegionSection *mrs = dev->mem_sections + i;
  414. cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
  415. cur_vmr->memory_size = int128_get64(mrs->size);
  416. cur_vmr->userspace_addr =
  417. (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
  418. mrs->offset_within_region;
  419. cur_vmr->flags_padding = 0;
  420. }
  421. if (!dev->started) {
  422. goto out;
  423. }
  424. for (i = 0; i < dev->mem->nregions; i++) {
  425. if (vhost_verify_ring_mappings(dev,
  426. (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
  427. dev->mem->regions[i].guest_phys_addr,
  428. dev->mem->regions[i].memory_size)) {
  429. error_report("Verify ring failure on region %d", i);
  430. abort();
  431. }
  432. }
  433. if (!dev->log_enabled) {
  434. r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
  435. if (r < 0) {
  436. VHOST_OPS_DEBUG("vhost_set_mem_table failed");
  437. }
  438. goto out;
  439. }
  440. log_size = vhost_get_log_size(dev);
  441. /* We allocate an extra 4K bytes to log,
  442. * to reduce the number of reallocations. */
  443. #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
  444. /* To log more, must increase log size before table update. */
  445. if (dev->log_size < log_size) {
  446. vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
  447. }
  448. r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
  449. if (r < 0) {
  450. VHOST_OPS_DEBUG("vhost_set_mem_table failed");
  451. }
  452. /* To log less, can only decrease log size after table update. */
  453. if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
  454. vhost_dev_log_resize(dev, log_size);
  455. }
  456. out:
  457. /* Deref the old list of sections; this must happen _after_ the
  458. * vhost_set_mem_table to ensure the client isn't still using the
  459. * section we're about to unref.
  460. */
  461. while (n_old_sections--) {
  462. memory_region_unref(old_sections[n_old_sections].mr);
  463. }
  464. g_free(old_sections);
  465. return;
  466. }
  467. /* Adds the section data to the tmp_sections structure.
  468. * It relies on the listener calling us in memory-address order,
  469. * and on being called for each region (via the _add and _nop methods),
  470. * so that neighbouring sections can be joined.
  471. */
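/* Rough example of the merging below: if flatview sections
 *     [A, A + S1) and [A + S1, A + S1 + S2)
 * come from the same RAMBlock and are contiguous in host memory as well,
 * they are collapsed into a single entry of dev->tmp_sections, which keeps
 * the memory table sent to the backend within its memslot limit.
 */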
  472. static void vhost_region_add_section(struct vhost_dev *dev,
  473. MemoryRegionSection *section)
  474. {
  475. bool need_add = true;
  476. uint64_t mrs_size = int128_get64(section->size);
  477. uint64_t mrs_gpa = section->offset_within_address_space;
  478. uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
  479. section->offset_within_region;
  480. RAMBlock *mrs_rb = section->mr->ram_block;
  481. size_t mrs_page = qemu_ram_pagesize(mrs_rb);
  482. trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
  483. mrs_host);
  484. /* Round the section to its page size */
  485. /* First align the start down to a page boundary */
  486. uint64_t alignage = mrs_host & (mrs_page - 1);
  487. if (alignage) {
  488. mrs_host -= alignage;
  489. mrs_size += alignage;
  490. mrs_gpa -= alignage;
  491. }
  492. /* Now align the size up to a page boundary */
  493. alignage = mrs_size & (mrs_page - 1);
  494. if (alignage) {
  495. mrs_size += mrs_page - alignage;
  496. }
  497. trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size,
  498. mrs_host);
  499. if (dev->n_tmp_sections) {
  500. /* Since we already have at least one section, let's see if
  501. * this extends it; since we're scanning in order, we only
  502. * have to look at the last one, and the FlatView that calls
  503. * us shouldn't have overlaps.
  504. */
  505. MemoryRegionSection *prev_sec = dev->tmp_sections +
  506. (dev->n_tmp_sections - 1);
  507. uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
  508. uint64_t prev_size = int128_get64(prev_sec->size);
  509. uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size);
  510. uint64_t prev_host_start =
  511. (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
  512. prev_sec->offset_within_region;
  513. uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);
  514. if (mrs_gpa <= (prev_gpa_end + 1)) {
  515. /* OK, looks like overlapping/intersecting - it's possible that
  516. * the rounding to page sizes has made them overlap, but they should
  517. * match up in the same RAMBlock if they do.
  518. */
  519. if (mrs_gpa < prev_gpa_start) {
  520. error_report("%s:Section rounded to %"PRIx64
  521. " prior to previous %"PRIx64,
  522. __func__, mrs_gpa, prev_gpa_start);
  523. /* A way to cleanly fail here would be better */
  524. return;
  525. }
  526. /* Offset from the start of the previous GPA to this GPA */
  527. size_t offset = mrs_gpa - prev_gpa_start;
  528. if (prev_host_start + offset == mrs_host &&
  529. section->mr == prev_sec->mr &&
  530. (!dev->vhost_ops->vhost_backend_can_merge ||
  531. dev->vhost_ops->vhost_backend_can_merge(dev,
  532. mrs_host, mrs_size,
  533. prev_host_start, prev_size))) {
  534. uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
  535. need_add = false;
  536. prev_sec->offset_within_address_space =
  537. MIN(prev_gpa_start, mrs_gpa);
  538. prev_sec->offset_within_region =
  539. MIN(prev_host_start, mrs_host) -
  540. (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
  541. prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
  542. mrs_host));
  543. trace_vhost_region_add_section_merge(section->mr->name,
  544. int128_get64(prev_sec->size),
  545. prev_sec->offset_within_address_space,
  546. prev_sec->offset_within_region);
  547. } else {
  548. /* adjoining regions are fine, but overlapping ones with
  549. * different blocks/offsets shouldn't happen
  550. */
  551. if (mrs_gpa != prev_gpa_end + 1) {
  552. error_report("%s: Overlapping but not coherent sections "
  553. "at %"PRIx64,
  554. __func__, mrs_gpa);
  555. return;
  556. }
  557. }
  558. }
  559. }
  560. if (need_add) {
  561. ++dev->n_tmp_sections;
  562. dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
  563. dev->n_tmp_sections);
  564. dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
  565. /* The flatview isn't stable and we don't use it; making it NULL
  566. * means we can memcmp the list.
  567. */
  568. dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
  569. memory_region_ref(section->mr);
  570. }
  571. }
  572. /* Used for both add and nop callbacks */
  573. static void vhost_region_addnop(MemoryListener *listener,
  574. MemoryRegionSection *section)
  575. {
  576. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  577. memory_listener);
  578. if (!vhost_section(dev, section)) {
  579. return;
  580. }
  581. vhost_region_add_section(dev, section);
  582. }
  583. static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
  584. {
  585. struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
  586. struct vhost_dev *hdev = iommu->hdev;
  587. hwaddr iova = iotlb->iova + iommu->iommu_offset;
  588. if (vhost_backend_invalidate_device_iotlb(hdev, iova,
  589. iotlb->addr_mask + 1)) {
  590. error_report("Fail to invalidate device iotlb");
  591. }
  592. }
  593. static void vhost_iommu_region_add(MemoryListener *listener,
  594. MemoryRegionSection *section)
  595. {
  596. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  597. iommu_listener);
  598. struct vhost_iommu *iommu;
  599. Int128 end;
  600. int iommu_idx, ret;
  601. IOMMUMemoryRegion *iommu_mr;
  602. Error *err = NULL;
  603. if (!memory_region_is_iommu(section->mr)) {
  604. return;
  605. }
  606. iommu_mr = IOMMU_MEMORY_REGION(section->mr);
  607. iommu = g_malloc0(sizeof(*iommu));
  608. end = int128_add(int128_make64(section->offset_within_region),
  609. section->size);
  610. end = int128_sub(end, int128_one());
  611. iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
  612. MEMTXATTRS_UNSPECIFIED);
  613. iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
  614. IOMMU_NOTIFIER_UNMAP,
  615. section->offset_within_region,
  616. int128_get64(end),
  617. iommu_idx);
  618. iommu->mr = section->mr;
  619. iommu->iommu_offset = section->offset_within_address_space -
  620. section->offset_within_region;
  621. iommu->hdev = dev;
  622. ret = memory_region_register_iommu_notifier(section->mr, &iommu->n, &err);
  623. if (ret) {
  624. error_report_err(err);
  625. exit(1);
  626. }
  627. QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
  628. /* TODO: can replay help performance here? */
  629. }
  630. static void vhost_iommu_region_del(MemoryListener *listener,
  631. MemoryRegionSection *section)
  632. {
  633. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  634. iommu_listener);
  635. struct vhost_iommu *iommu;
  636. if (!memory_region_is_iommu(section->mr)) {
  637. return;
  638. }
  639. QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
  640. if (iommu->mr == section->mr &&
  641. iommu->n.start == section->offset_within_region) {
  642. memory_region_unregister_iommu_notifier(iommu->mr,
  643. &iommu->n);
  644. QLIST_REMOVE(iommu, iommu_next);
  645. g_free(iommu);
  646. break;
  647. }
  648. }
  649. }
  650. static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
  651. struct vhost_virtqueue *vq,
  652. unsigned idx, bool enable_log)
  653. {
  654. struct vhost_vring_addr addr = {
  655. .index = idx,
  656. .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
  657. .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
  658. .used_user_addr = (uint64_t)(unsigned long)vq->used,
  659. .log_guest_addr = vq->used_phys,
  660. .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
  661. };
  662. int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
  663. if (r < 0) {
  664. VHOST_OPS_DEBUG("vhost_set_vring_addr failed");
  665. return -errno;
  666. }
  667. return 0;
  668. }
  669. static int vhost_dev_set_features(struct vhost_dev *dev,
  670. bool enable_log)
  671. {
  672. uint64_t features = dev->acked_features;
  673. int r;
  674. if (enable_log) {
  675. features |= 0x1ULL << VHOST_F_LOG_ALL;
  676. }
  677. r = dev->vhost_ops->vhost_set_features(dev, features);
  678. if (r < 0) {
  679. VHOST_OPS_DEBUG("vhost_set_features failed");
  680. }
  681. return r < 0 ? -errno : 0;
  682. }
  683. static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
  684. {
  685. int r, i, idx;
  686. r = vhost_dev_set_features(dev, enable_log);
  687. if (r < 0) {
  688. goto err_features;
  689. }
  690. for (i = 0; i < dev->nvqs; ++i) {
  691. idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
  692. r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
  693. enable_log);
  694. if (r < 0) {
  695. goto err_vq;
  696. }
  697. }
  698. return 0;
  699. err_vq:
  700. for (; i >= 0; --i) {
  701. idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
  702. vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
  703. dev->log_enabled);
  704. }
  705. vhost_dev_set_features(dev, dev->log_enabled);
  706. err_features:
  707. return r;
  708. }
  709. static int vhost_migration_log(MemoryListener *listener, int enable)
  710. {
  711. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  712. memory_listener);
  713. int r;
  714. if (!!enable == dev->log_enabled) {
  715. return 0;
  716. }
  717. if (!dev->started) {
  718. dev->log_enabled = enable;
  719. return 0;
  720. }
  721. if (!enable) {
  722. r = vhost_dev_set_log(dev, false);
  723. if (r < 0) {
  724. return r;
  725. }
  726. vhost_log_put(dev, false);
  727. } else {
  728. vhost_dev_log_resize(dev, vhost_get_log_size(dev));
  729. r = vhost_dev_set_log(dev, true);
  730. if (r < 0) {
  731. return r;
  732. }
  733. }
  734. dev->log_enabled = enable;
  735. return 0;
  736. }
  737. static void vhost_log_global_start(MemoryListener *listener)
  738. {
  739. int r;
  740. r = vhost_migration_log(listener, true);
  741. if (r < 0) {
  742. abort();
  743. }
  744. }
  745. static void vhost_log_global_stop(MemoryListener *listener)
  746. {
  747. int r;
  748. r = vhost_migration_log(listener, false);
  749. if (r < 0) {
  750. abort();
  751. }
  752. }
  753. static void vhost_log_start(MemoryListener *listener,
  754. MemoryRegionSection *section,
  755. int old, int new)
  756. {
  757. /* FIXME: implement */
  758. }
  759. static void vhost_log_stop(MemoryListener *listener,
  760. MemoryRegionSection *section,
  761. int old, int new)
  762. {
  763. /* FIXME: implement */
  764. }
  765. /* The vhost driver natively knows how to handle the vrings of
  766. * non-cross-endian legacy devices and modern devices. Only legacy devices
  767. * exposed to a bi-endian guest may require the vhost driver to use a
  768. * specific endianness.
  769. */
  770. static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
  771. {
  772. if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
  773. return false;
  774. }
  775. #ifdef HOST_WORDS_BIGENDIAN
  776. return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
  777. #else
  778. return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
  779. #endif
  780. }
  781. static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
  782. bool is_big_endian,
  783. int vhost_vq_index)
  784. {
  785. struct vhost_vring_state s = {
  786. .index = vhost_vq_index,
  787. .num = is_big_endian
  788. };
  789. if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
  790. return 0;
  791. }
  792. VHOST_OPS_DEBUG("vhost_set_vring_endian failed");
  793. if (errno == ENOTTY) {
  794. error_report("vhost does not support cross-endian");
  795. return -ENOSYS;
  796. }
  797. return -errno;
  798. }
  799. static int vhost_memory_region_lookup(struct vhost_dev *hdev,
  800. uint64_t gpa, uint64_t *uaddr,
  801. uint64_t *len)
  802. {
  803. int i;
  804. for (i = 0; i < hdev->mem->nregions; i++) {
  805. struct vhost_memory_region *reg = hdev->mem->regions + i;
  806. if (gpa >= reg->guest_phys_addr &&
  807. reg->guest_phys_addr + reg->memory_size > gpa) {
  808. *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
  809. *len = reg->guest_phys_addr + reg->memory_size - gpa;
  810. return 0;
  811. }
  812. }
  813. return -EFAULT;
  814. }
  815. int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
  816. {
  817. IOMMUTLBEntry iotlb;
  818. uint64_t uaddr, len;
  819. int ret = -EFAULT;
  820. RCU_READ_LOCK_GUARD();
  821. trace_vhost_iotlb_miss(dev, 1);
  822. iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
  823. iova, write,
  824. MEMTXATTRS_UNSPECIFIED);
  825. if (iotlb.target_as != NULL) {
  826. ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
  827. &uaddr, &len);
  828. if (ret) {
  829. trace_vhost_iotlb_miss(dev, 3);
  830. error_report("Fail to lookup the translated address "
  831. "%"PRIx64, iotlb.translated_addr);
  832. goto out;
  833. }
  834. len = MIN(iotlb.addr_mask + 1, len);
  835. iova = iova & ~iotlb.addr_mask;
  836. ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
  837. len, iotlb.perm);
  838. if (ret) {
  839. trace_vhost_iotlb_miss(dev, 4);
  840. error_report("Fail to update device iotlb");
  841. goto out;
  842. }
  843. }
  844. trace_vhost_iotlb_miss(dev, 2);
  845. out:
  846. return ret;
  847. }
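/* Rough flow of an IOTLB miss, as implemented above: the backend hits an
 * untranslated IOVA and reports a miss; QEMU resolves it against the
 * device's DMA address space with address_space_get_iotlb_entry(), turns
 * the translated GPA into a backend userspace address via
 * vhost_memory_region_lookup(), and pushes the mapping back with
 * vhost_backend_update_device_iotlb().  Invalidations travel the other way
 * through vhost_iommu_unmap_notify().
 */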
  848. static int vhost_virtqueue_start(struct vhost_dev *dev,
  849. struct VirtIODevice *vdev,
  850. struct vhost_virtqueue *vq,
  851. unsigned idx)
  852. {
  853. BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
  854. VirtioBusState *vbus = VIRTIO_BUS(qbus);
  855. VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
  856. hwaddr s, l, a;
  857. int r;
  858. int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
  859. struct vhost_vring_file file = {
  860. .index = vhost_vq_index
  861. };
  862. struct vhost_vring_state state = {
  863. .index = vhost_vq_index
  864. };
  865. struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
  866. a = virtio_queue_get_desc_addr(vdev, idx);
  867. if (a == 0) {
  868. /* Queue might not be ready for start */
  869. return 0;
  870. }
  871. vq->num = state.num = virtio_queue_get_num(vdev, idx);
  872. r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
  873. if (r) {
  874. VHOST_OPS_DEBUG("vhost_set_vring_num failed");
  875. return -errno;
  876. }
  877. state.num = virtio_queue_get_last_avail_idx(vdev, idx);
  878. r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
  879. if (r) {
  880. VHOST_OPS_DEBUG("vhost_set_vring_base failed");
  881. return -errno;
  882. }
  883. if (vhost_needs_vring_endian(vdev)) {
  884. r = vhost_virtqueue_set_vring_endian_legacy(dev,
  885. virtio_is_big_endian(vdev),
  886. vhost_vq_index);
  887. if (r) {
  888. return -errno;
  889. }
  890. }
  891. vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
  892. vq->desc_phys = a;
  893. vq->desc = vhost_memory_map(dev, a, &l, 0);
  894. if (!vq->desc || l != s) {
  895. r = -ENOMEM;
  896. goto fail_alloc_desc;
  897. }
  898. vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
  899. vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
  900. vq->avail = vhost_memory_map(dev, a, &l, 0);
  901. if (!vq->avail || l != s) {
  902. r = -ENOMEM;
  903. goto fail_alloc_avail;
  904. }
  905. vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
  906. vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
  907. vq->used = vhost_memory_map(dev, a, &l, 1);
  908. if (!vq->used || l != s) {
  909. r = -ENOMEM;
  910. goto fail_alloc_used;
  911. }
  912. r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
  913. if (r < 0) {
  914. r = -errno;
  915. goto fail_alloc;
  916. }
  917. file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
  918. r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
  919. if (r) {
  920. VHOST_OPS_DEBUG("vhost_set_vring_kick failed");
  921. r = -errno;
  922. goto fail_kick;
  923. }
  924. /* Clear and discard previous events if any. */
  925. event_notifier_test_and_clear(&vq->masked_notifier);
  926. /* Init vring in unmasked state, unless guest_notifier_mask
  927. * will do it later.
  928. */
  929. if (!vdev->use_guest_notifier_mask) {
  930. /* TODO: check and handle errors. */
  931. vhost_virtqueue_mask(dev, vdev, idx, false);
  932. }
  933. if (k->query_guest_notifiers &&
  934. k->query_guest_notifiers(qbus->parent) &&
  935. virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
  936. file.fd = -1;
  937. r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
  938. if (r) {
  939. goto fail_vector;
  940. }
  941. }
  942. return 0;
  943. fail_vector:
  944. fail_kick:
  945. fail_alloc:
  946. vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
  947. 0, 0);
  948. fail_alloc_used:
  949. vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
  950. 0, 0);
  951. fail_alloc_avail:
  952. vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
  953. 0, 0);
  954. fail_alloc_desc:
  955. return r;
  956. }
  957. static void vhost_virtqueue_stop(struct vhost_dev *dev,
  958. struct VirtIODevice *vdev,
  959. struct vhost_virtqueue *vq,
  960. unsigned idx)
  961. {
  962. int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
  963. struct vhost_vring_state state = {
  964. .index = vhost_vq_index,
  965. };
  966. int r;
  967. if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
  968. /* Don't stop a virtqueue that might not have been started */
  969. return;
  970. }
  971. r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
  972. if (r < 0) {
  973. VHOST_OPS_DEBUG("vhost VQ %u ring restore failed: %d", idx, r);
  974. /* Connection to the backend is broken, so let's sync internal
  975. * last avail idx to the device used idx.
  976. */
  977. virtio_queue_restore_last_avail_idx(vdev, idx);
  978. } else {
  979. virtio_queue_set_last_avail_idx(vdev, idx, state.num);
  980. }
  981. virtio_queue_invalidate_signalled_used(vdev, idx);
  982. virtio_queue_update_used_idx(vdev, idx);
  983. /* In the cross-endian case, we need to reset the vring endianness to
  984. * native, which is what legacy devices expect by default.
  985. */
  986. if (vhost_needs_vring_endian(vdev)) {
  987. vhost_virtqueue_set_vring_endian_legacy(dev,
  988. !virtio_is_big_endian(vdev),
  989. vhost_vq_index);
  990. }
  991. vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
  992. 1, virtio_queue_get_used_size(vdev, idx));
  993. vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
  994. 0, virtio_queue_get_avail_size(vdev, idx));
  995. vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
  996. 0, virtio_queue_get_desc_size(vdev, idx));
  997. }
  998. static void vhost_eventfd_add(MemoryListener *listener,
  999. MemoryRegionSection *section,
  1000. bool match_data, uint64_t data, EventNotifier *e)
  1001. {
  1002. }
  1003. static void vhost_eventfd_del(MemoryListener *listener,
  1004. MemoryRegionSection *section,
  1005. bool match_data, uint64_t data, EventNotifier *e)
  1006. {
  1007. }
  1008. static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
  1009. int n, uint32_t timeout)
  1010. {
  1011. int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
  1012. struct vhost_vring_state state = {
  1013. .index = vhost_vq_index,
  1014. .num = timeout,
  1015. };
  1016. int r;
  1017. if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
  1018. return -EINVAL;
  1019. }
  1020. r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
  1021. if (r) {
  1022. VHOST_OPS_DEBUG("vhost_set_vring_busyloop_timeout failed");
  1023. return r;
  1024. }
  1025. return 0;
  1026. }
  1027. static int vhost_virtqueue_init(struct vhost_dev *dev,
  1028. struct vhost_virtqueue *vq, int n)
  1029. {
  1030. int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
  1031. struct vhost_vring_file file = {
  1032. .index = vhost_vq_index,
  1033. };
  1034. int r = event_notifier_init(&vq->masked_notifier, 0);
  1035. if (r < 0) {
  1036. return r;
  1037. }
  1038. file.fd = event_notifier_get_fd(&vq->masked_notifier);
  1039. r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
  1040. if (r) {
  1041. VHOST_OPS_DEBUG("vhost_set_vring_call failed");
  1042. r = -errno;
  1043. goto fail_call;
  1044. }
  1045. vq->dev = dev;
  1046. return 0;
  1047. fail_call:
  1048. event_notifier_cleanup(&vq->masked_notifier);
  1049. return r;
  1050. }
  1051. static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
  1052. {
  1053. event_notifier_cleanup(&vq->masked_notifier);
  1054. }
  1055. int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
  1056. VhostBackendType backend_type, uint32_t busyloop_timeout)
  1057. {
  1058. uint64_t features;
  1059. int i, r, n_initialized_vqs = 0;
  1060. Error *local_err = NULL;
  1061. hdev->vdev = NULL;
  1062. hdev->migration_blocker = NULL;
  1063. r = vhost_set_backend_type(hdev, backend_type);
  1064. assert(r >= 0);
  1065. r = hdev->vhost_ops->vhost_backend_init(hdev, opaque);
  1066. if (r < 0) {
  1067. goto fail;
  1068. }
  1069. r = hdev->vhost_ops->vhost_set_owner(hdev);
  1070. if (r < 0) {
  1071. VHOST_OPS_DEBUG("vhost_set_owner failed");
  1072. goto fail;
  1073. }
  1074. r = hdev->vhost_ops->vhost_get_features(hdev, &features);
  1075. if (r < 0) {
  1076. VHOST_OPS_DEBUG("vhost_get_features failed");
  1077. goto fail;
  1078. }
  1079. for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
  1080. r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
  1081. if (r < 0) {
  1082. goto fail;
  1083. }
  1084. }
  1085. if (busyloop_timeout) {
  1086. for (i = 0; i < hdev->nvqs; ++i) {
  1087. r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
  1088. busyloop_timeout);
  1089. if (r < 0) {
  1090. goto fail_busyloop;
  1091. }
  1092. }
  1093. }
  1094. hdev->features = features;
  1095. hdev->memory_listener = (MemoryListener) {
  1096. .begin = vhost_begin,
  1097. .commit = vhost_commit,
  1098. .region_add = vhost_region_addnop,
  1099. .region_nop = vhost_region_addnop,
  1100. .log_start = vhost_log_start,
  1101. .log_stop = vhost_log_stop,
  1102. .log_sync = vhost_log_sync,
  1103. .log_global_start = vhost_log_global_start,
  1104. .log_global_stop = vhost_log_global_stop,
  1105. .eventfd_add = vhost_eventfd_add,
  1106. .eventfd_del = vhost_eventfd_del,
  1107. .priority = 10
  1108. };
  1109. hdev->iommu_listener = (MemoryListener) {
  1110. .region_add = vhost_iommu_region_add,
  1111. .region_del = vhost_iommu_region_del,
  1112. };
  1113. if (hdev->migration_blocker == NULL) {
  1114. if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
  1115. error_setg(&hdev->migration_blocker,
  1116. "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
  1117. } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
  1118. error_setg(&hdev->migration_blocker,
  1119. "Migration disabled: failed to allocate shared memory");
  1120. }
  1121. }
  1122. if (hdev->migration_blocker != NULL) {
  1123. r = migrate_add_blocker(hdev->migration_blocker, &local_err);
  1124. if (local_err) {
  1125. error_report_err(local_err);
  1126. error_free(hdev->migration_blocker);
  1127. goto fail_busyloop;
  1128. }
  1129. }
  1130. hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
  1131. hdev->n_mem_sections = 0;
  1132. hdev->mem_sections = NULL;
  1133. hdev->log = NULL;
  1134. hdev->log_size = 0;
  1135. hdev->log_enabled = false;
  1136. hdev->started = false;
  1137. memory_listener_register(&hdev->memory_listener, &address_space_memory);
  1138. QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
  1139. if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
  1140. error_report("vhost backend memory slots limit is less"
  1141. " than current number of present memory slots");
  1142. r = -1;
  1143. if (busyloop_timeout) {
  1144. goto fail_busyloop;
  1145. } else {
  1146. goto fail;
  1147. }
  1148. }
  1149. return 0;
  1150. fail_busyloop:
  1151. while (--i >= 0) {
  1152. vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
  1153. }
  1154. fail:
  1155. hdev->nvqs = n_initialized_vqs;
  1156. vhost_dev_cleanup(hdev);
  1157. return r;
  1158. }
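/* Typical call order for a vhost device, sketched with error handling
 * omitted (the surrounding device code, e.g. a vhost-net style wrapper,
 * provides hdev, opaque and vdev):
 *
 *     vhost_dev_init(hdev, opaque, backend_type, busyloop_timeout);
 *     ...
 *     vhost_dev_enable_notifiers(hdev, vdev);
 *     vhost_dev_start(hdev, vdev);
 *     ...
 *     vhost_dev_stop(hdev, vdev);
 *     vhost_dev_disable_notifiers(hdev, vdev);
 *     ...
 *     vhost_dev_cleanup(hdev);
 */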
  1159. void vhost_dev_cleanup(struct vhost_dev *hdev)
  1160. {
  1161. int i;
  1162. for (i = 0; i < hdev->nvqs; ++i) {
  1163. vhost_virtqueue_cleanup(hdev->vqs + i);
  1164. }
  1165. if (hdev->mem) {
  1166. /* those are only safe after successful init */
  1167. memory_listener_unregister(&hdev->memory_listener);
  1168. QLIST_REMOVE(hdev, entry);
  1169. }
  1170. if (hdev->migration_blocker) {
  1171. migrate_del_blocker(hdev->migration_blocker);
  1172. error_free(hdev->migration_blocker);
  1173. }
  1174. g_free(hdev->mem);
  1175. g_free(hdev->mem_sections);
  1176. if (hdev->vhost_ops) {
  1177. hdev->vhost_ops->vhost_backend_cleanup(hdev);
  1178. }
  1179. assert(!hdev->log);
  1180. memset(hdev, 0, sizeof(struct vhost_dev));
  1181. }
  1182. /* Stop processing guest IO notifications in qemu.
  1183. * Start processing them in vhost in the kernel.
  1184. */
  1185. int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
  1186. {
  1187. BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
  1188. int i, r, e;
  1189. /* We will pass the notifiers to the kernel, make sure that QEMU
  1190. * doesn't interfere.
  1191. */
  1192. r = virtio_device_grab_ioeventfd(vdev);
  1193. if (r < 0) {
  1194. error_report("binding does not support host notifiers");
  1195. goto fail;
  1196. }
  1197. for (i = 0; i < hdev->nvqs; ++i) {
  1198. r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  1199. true);
  1200. if (r < 0) {
  1201. error_report("vhost VQ %d notifier binding failed: %d", i, -r);
  1202. goto fail_vq;
  1203. }
  1204. }
  1205. return 0;
  1206. fail_vq:
  1207. while (--i >= 0) {
  1208. e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  1209. false);
  1210. if (e < 0) {
  1211. error_report("vhost VQ %d notifier cleanup error: %d", i, -e);
  1212. }
  1213. assert (e >= 0);
  1214. virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
  1215. }
  1216. virtio_device_release_ioeventfd(vdev);
  1217. fail:
  1218. return r;
  1219. }
  1220. /* Stop processing guest IO notifications in vhost.
  1221. * Start processing them in qemu.
  1222. * This might actually run the qemu handlers right away,
  1223. * so virtio in qemu must be completely setup when this is called.
  1224. */
  1225. void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
  1226. {
  1227. BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
  1228. int i, r;
  1229. for (i = 0; i < hdev->nvqs; ++i) {
  1230. r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  1231. false);
  1232. if (r < 0) {
  1233. error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
  1234. }
  1235. assert (r >= 0);
  1236. virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
  1237. }
  1238. virtio_device_release_ioeventfd(vdev);
  1239. }
  1240. /* Test and clear event pending status.
  1241. * Should be called after unmask to avoid losing events.
  1242. */
  1243. bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
  1244. {
  1245. struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
  1246. assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
  1247. return event_notifier_test_and_clear(&vq->masked_notifier);
  1248. }
  1249. /* Mask/unmask events from this vq. */
  1250. void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
  1251. bool mask)
  1252. {
  1253. struct VirtQueue *vvq = virtio_get_queue(vdev, n);
  1254. int r, index = n - hdev->vq_index;
  1255. struct vhost_vring_file file;
  1256. /* should only be called after backend is connected */
  1257. assert(hdev->vhost_ops);
  1258. if (mask) {
  1259. assert(vdev->use_guest_notifier_mask);
  1260. file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
  1261. } else {
  1262. file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
  1263. }
  1264. file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
  1265. r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
  1266. if (r < 0) {
  1267. VHOST_OPS_DEBUG("vhost_set_vring_call failed");
  1268. }
  1269. }
  1270. uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
  1271. uint64_t features)
  1272. {
  1273. const int *bit = feature_bits;
  1274. while (*bit != VHOST_INVALID_FEATURE_BIT) {
  1275. uint64_t bit_mask = (1ULL << *bit);
  1276. if (!(hdev->features & bit_mask)) {
  1277. features &= ~bit_mask;
  1278. }
  1279. bit++;
  1280. }
  1281. return features;
  1282. }
  1283. void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
  1284. uint64_t features)
  1285. {
  1286. const int *bit = feature_bits;
  1287. while (*bit != VHOST_INVALID_FEATURE_BIT) {
  1288. uint64_t bit_mask = (1ULL << *bit);
  1289. if (features & bit_mask) {
  1290. hdev->acked_features |= bit_mask;
  1291. }
  1292. bit++;
  1293. }
  1294. }
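/* Usage sketch for the two helpers above (the feature list is illustrative;
 * real devices define their own table, terminated by
 * VHOST_INVALID_FEATURE_BIT):
 *
 *     static const int example_feature_bits[] = {
 *         VIRTIO_F_VERSION_1,
 *         VIRTIO_RING_F_INDIRECT_DESC,
 *         VHOST_INVALID_FEATURE_BIT
 *     };
 *
 *     // strip bits the backend cannot handle before offering them to the guest
 *     host_features = vhost_get_features(hdev, example_feature_bits,
 *                                        host_features);
 *     // after the guest acks, record what may be passed to the backend
 *     vhost_ack_features(hdev, example_feature_bits, guest_features);
 *
 * vhost_dev_start() later sends hdev->acked_features to the backend via
 * vhost_set_features().
 */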
  1295. int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
  1296. uint32_t config_len)
  1297. {
  1298. assert(hdev->vhost_ops);
  1299. if (hdev->vhost_ops->vhost_get_config) {
  1300. return hdev->vhost_ops->vhost_get_config(hdev, config, config_len);
  1301. }
  1302. return -1;
  1303. }
  1304. int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
  1305. uint32_t offset, uint32_t size, uint32_t flags)
  1306. {
  1307. assert(hdev->vhost_ops);
  1308. if (hdev->vhost_ops->vhost_set_config) {
  1309. return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
  1310. size, flags);
  1311. }
  1312. return -1;
  1313. }
  1314. void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
  1315. const VhostDevConfigOps *ops)
  1316. {
  1317. hdev->config_ops = ops;
  1318. }
  1319. void vhost_dev_free_inflight(struct vhost_inflight *inflight)
  1320. {
  1321. if (inflight->addr) {
  1322. qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
  1323. inflight->addr = NULL;
  1324. inflight->fd = -1;
  1325. }
  1326. }
  1327. static int vhost_dev_resize_inflight(struct vhost_inflight *inflight,
  1328. uint64_t new_size)
  1329. {
  1330. Error *err = NULL;
  1331. int fd = -1;
  1332. void *addr = qemu_memfd_alloc("vhost-inflight", new_size,
  1333. F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
  1334. &fd, &err);
  1335. if (err) {
  1336. error_report_err(err);
  1337. return -1;
  1338. }
  1339. vhost_dev_free_inflight(inflight);
  1340. inflight->offset = 0;
  1341. inflight->addr = addr;
  1342. inflight->fd = fd;
  1343. inflight->size = new_size;
  1344. return 0;
  1345. }
  1346. void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f)
  1347. {
  1348. if (inflight->addr) {
  1349. qemu_put_be64(f, inflight->size);
  1350. qemu_put_be16(f, inflight->queue_size);
  1351. qemu_put_buffer(f, inflight->addr, inflight->size);
  1352. } else {
  1353. qemu_put_be64(f, 0);
  1354. }
  1355. }
  1356. int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f)
  1357. {
  1358. uint64_t size;
  1359. size = qemu_get_be64(f);
  1360. if (!size) {
  1361. return 0;
  1362. }
  1363. if (inflight->size != size) {
  1364. if (vhost_dev_resize_inflight(inflight, size)) {
  1365. return -1;
  1366. }
  1367. }
  1368. inflight->queue_size = qemu_get_be16(f);
  1369. qemu_get_buffer(f, inflight->addr, size);
  1370. return 0;
  1371. }
  1372. int vhost_dev_set_inflight(struct vhost_dev *dev,
  1373. struct vhost_inflight *inflight)
  1374. {
  1375. int r;
  1376. if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
  1377. r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
  1378. if (r) {
  1379. VHOST_OPS_DEBUG("vhost_set_inflight_fd failed");
  1380. return -errno;
  1381. }
  1382. }
  1383. return 0;
  1384. }
  1385. int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
  1386. struct vhost_inflight *inflight)
  1387. {
  1388. int r;
  1389. if (dev->vhost_ops->vhost_get_inflight_fd) {
  1390. r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
  1391. if (r) {
  1392. VHOST_OPS_DEBUG("vhost_get_inflight_fd failed");
  1393. return -errno;
  1394. }
  1395. }
  1396. return 0;
  1397. }
  1398. /* Host notifiers must be enabled at this point. */
  1399. int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
  1400. {
  1401. int i, r;
  1402. /* should only be called after backend is connected */
  1403. assert(hdev->vhost_ops);
  1404. hdev->started = true;
  1405. hdev->vdev = vdev;
  1406. r = vhost_dev_set_features(hdev, hdev->log_enabled);
  1407. if (r < 0) {
  1408. goto fail_features;
  1409. }
  1410. if (vhost_dev_has_iommu(hdev)) {
  1411. memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
  1412. }
  1413. r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
  1414. if (r < 0) {
  1415. VHOST_OPS_DEBUG("vhost_set_mem_table failed");
  1416. r = -errno;
  1417. goto fail_mem;
  1418. }
  1419. for (i = 0; i < hdev->nvqs; ++i) {
  1420. r = vhost_virtqueue_start(hdev,
  1421. vdev,
  1422. hdev->vqs + i,
  1423. hdev->vq_index + i);
  1424. if (r < 0) {
  1425. goto fail_vq;
  1426. }
  1427. }
  1428. if (hdev->log_enabled) {
  1429. uint64_t log_base;
  1430. hdev->log_size = vhost_get_log_size(hdev);
  1431. hdev->log = vhost_log_get(hdev->log_size,
  1432. vhost_dev_log_is_shared(hdev));
  1433. log_base = (uintptr_t)hdev->log->log;
  1434. r = hdev->vhost_ops->vhost_set_log_base(hdev,
  1435. hdev->log_size ? log_base : 0,
  1436. hdev->log);
  1437. if (r < 0) {
  1438. VHOST_OPS_DEBUG("vhost_set_log_base failed");
  1439. r = -errno;
  1440. goto fail_log;
  1441. }
  1442. }
  1443. if (vhost_dev_has_iommu(hdev)) {
  1444. hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
  1445. /* Update used ring information for the IOTLB to work correctly;
  1446. * the vhost-kernel code requires this. */
  1447. for (i = 0; i < hdev->nvqs; ++i) {
  1448. struct vhost_virtqueue *vq = hdev->vqs + i;
  1449. vhost_device_iotlb_miss(hdev, vq->used_phys, true);
  1450. }
  1451. }
  1452. return 0;
  1453. fail_log:
  1454. vhost_log_put(hdev, false);
  1455. fail_vq:
  1456. while (--i >= 0) {
  1457. vhost_virtqueue_stop(hdev,
  1458. vdev,
  1459. hdev->vqs + i,
  1460. hdev->vq_index + i);
  1461. }
  1462. fail_mem:
  1463. fail_features:
  1464. hdev->started = false;
  1465. return r;
  1466. }
  1467. /* Host notifiers must be enabled at this point. */
  1468. void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
  1469. {
  1470. int i;
  1471. /* should only be called after backend is connected */
  1472. assert(hdev->vhost_ops);
  1473. for (i = 0; i < hdev->nvqs; ++i) {
  1474. vhost_virtqueue_stop(hdev,
  1475. vdev,
  1476. hdev->vqs + i,
  1477. hdev->vq_index + i);
  1478. }
  1479. if (vhost_dev_has_iommu(hdev)) {
  1480. hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
  1481. memory_listener_unregister(&hdev->iommu_listener);
  1482. }
  1483. vhost_log_put(hdev, true);
  1484. hdev->started = false;
  1485. hdev->vdev = NULL;
  1486. }
  1487. int vhost_net_set_backend(struct vhost_dev *hdev,
  1488. struct vhost_vring_file *file)
  1489. {
  1490. if (hdev->vhost_ops->vhost_net_set_backend) {
  1491. return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
  1492. }
  1493. return -1;
  1494. }