vhost.c

  1. /*
  2. * vhost support
  3. *
  4. * Copyright Red Hat, Inc. 2010
  5. *
  6. * Authors:
  7. * Michael S. Tsirkin <mst@redhat.com>
  8. *
  9. * This work is licensed under the terms of the GNU GPL, version 2. See
  10. * the COPYING file in the top-level directory.
  11. *
  12. * Contributions after 2012-01-13 are licensed under the terms of the
  13. * GNU GPL, version 2 or (at your option) any later version.
  14. */
  15. #include "qemu/osdep.h"
  16. #include "qapi/error.h"
  17. #include "hw/virtio/vhost.h"
  18. #include "qemu/atomic.h"
  19. #include "qemu/range.h"
  20. #include "qemu/error-report.h"
  21. #include "qemu/memfd.h"
  22. #include "standard-headers/linux/vhost_types.h"
  23. #include "exec/address-spaces.h"
  24. #include "hw/virtio/virtio-bus.h"
  25. #include "hw/virtio/virtio-access.h"
  26. #include "migration/blocker.h"
  27. #include "migration/qemu-file-types.h"
  28. #include "sysemu/dma.h"
  29. #include "sysemu/tcg.h"
  30. #include "trace.h"
  31. /* enabled until disconnected backend stabilizes */
  32. #define _VHOST_DEBUG 1
  33. #ifdef _VHOST_DEBUG
  34. #define VHOST_OPS_DEBUG(fmt, ...) \
  35. do { error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
  36. strerror(errno), errno); } while (0)
  37. #else
  38. #define VHOST_OPS_DEBUG(fmt, ...) \
  39. do { } while (0)
  40. #endif
  41. static struct vhost_log *vhost_log;
  42. static struct vhost_log *vhost_log_shm;
  43. static unsigned int used_memslots;
  44. static QLIST_HEAD(, vhost_dev) vhost_devices =
  45. QLIST_HEAD_INITIALIZER(vhost_devices);
  46. bool vhost_has_free_slot(void)
  47. {
  48. unsigned int slots_limit = ~0U;
  49. struct vhost_dev *hdev;
  50. QLIST_FOREACH(hdev, &vhost_devices, entry) {
  51. unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
  52. slots_limit = MIN(slots_limit, r);
  53. }
  54. return slots_limit > used_memslots;
  55. }
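/*
 * Scan the part of the dirty log covering the intersection of the listener
 * range [mfirst, mlast] and the region range [rfirst, rlast]. Each log chunk
 * is fetched and cleared atomically; every set bit marks one VHOST_LOG_PAGE
 * sized page, which is forwarded to memory_region_set_dirty().
 */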
  56. static void vhost_dev_sync_region(struct vhost_dev *dev,
  57. MemoryRegionSection *section,
  58. uint64_t mfirst, uint64_t mlast,
  59. uint64_t rfirst, uint64_t rlast)
  60. {
  61. vhost_log_chunk_t *log = dev->log->log;
  62. uint64_t start = MAX(mfirst, rfirst);
  63. uint64_t end = MIN(mlast, rlast);
  64. vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
  65. vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
  66. uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
  67. if (end < start) {
  68. return;
  69. }
  70. assert(end / VHOST_LOG_CHUNK < dev->log_size);
  71. assert(start / VHOST_LOG_CHUNK < dev->log_size);
  72. for (;from < to; ++from) {
  73. vhost_log_chunk_t log;
  74. /* We first check with non-atomic: much cheaper,
  75. * and we expect non-dirty to be the common case. */
  76. if (!*from) {
  77. addr += VHOST_LOG_CHUNK;
  78. continue;
  79. }
  80. /* Data must be read atomically. We don't really need barrier semantics
  81. * but it's easier to use atomic_* than roll our own. */
  82. log = atomic_xchg(from, 0);
  83. while (log) {
  84. int bit = ctzl(log);
  85. hwaddr page_addr;
  86. hwaddr section_offset;
  87. hwaddr mr_offset;
  88. page_addr = addr + bit * VHOST_LOG_PAGE;
  89. section_offset = page_addr - section->offset_within_address_space;
  90. mr_offset = section_offset + section->offset_within_region;
  91. memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
  92. log &= ~(0x1ull << bit);
  93. }
  94. addr += VHOST_LOG_CHUNK;
  95. }
  96. }
  97. static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
  98. MemoryRegionSection *section,
  99. hwaddr first,
  100. hwaddr last)
  101. {
  102. int i;
  103. hwaddr start_addr;
  104. hwaddr end_addr;
  105. if (!dev->log_enabled || !dev->started) {
  106. return 0;
  107. }
  108. start_addr = section->offset_within_address_space;
  109. end_addr = range_get_last(start_addr, int128_get64(section->size));
  110. start_addr = MAX(first, start_addr);
  111. end_addr = MIN(last, end_addr);
  112. for (i = 0; i < dev->mem->nregions; ++i) {
  113. struct vhost_memory_region *reg = dev->mem->regions + i;
  114. vhost_dev_sync_region(dev, section, start_addr, end_addr,
  115. reg->guest_phys_addr,
  116. range_get_last(reg->guest_phys_addr,
  117. reg->memory_size));
  118. }
  119. for (i = 0; i < dev->nvqs; ++i) {
  120. struct vhost_virtqueue *vq = dev->vqs + i;
  121. if (!vq->used_phys && !vq->used_size) {
  122. continue;
  123. }
  124. vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
  125. range_get_last(vq->used_phys, vq->used_size));
  126. }
  127. return 0;
  128. }
  129. static void vhost_log_sync(MemoryListener *listener,
  130. MemoryRegionSection *section)
  131. {
  132. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  133. memory_listener);
  134. vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
  135. }
  136. static void vhost_log_sync_range(struct vhost_dev *dev,
  137. hwaddr first, hwaddr last)
  138. {
  139. int i;
  140. /* FIXME: this is N^2 in number of sections */
  141. for (i = 0; i < dev->n_mem_sections; ++i) {
  142. MemoryRegionSection *section = &dev->mem_sections[i];
  143. vhost_sync_dirty_bitmap(dev, section, first, last);
  144. }
  145. }
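/*
 * Compute the dirty log size, in vhost_log_chunk_t units, needed to cover
 * the highest guest physical address of any memory region and of any
 * virtqueue's used ring.
 */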
  146. static uint64_t vhost_get_log_size(struct vhost_dev *dev)
  147. {
  148. uint64_t log_size = 0;
  149. int i;
  150. for (i = 0; i < dev->mem->nregions; ++i) {
  151. struct vhost_memory_region *reg = dev->mem->regions + i;
  152. uint64_t last = range_get_last(reg->guest_phys_addr,
  153. reg->memory_size);
  154. log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
  155. }
  156. for (i = 0; i < dev->nvqs; ++i) {
  157. struct vhost_virtqueue *vq = dev->vqs + i;
  158. if (!vq->used_phys && !vq->used_size) {
  159. continue;
  160. }
  161. uint64_t last = vq->used_phys + vq->used_size - 1;
  162. log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
  163. }
  164. return log_size;
  165. }
  166. static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
  167. {
  168. Error *err = NULL;
  169. struct vhost_log *log;
  170. uint64_t logsize = size * sizeof(*(log->log));
  171. int fd = -1;
  172. log = g_new0(struct vhost_log, 1);
  173. if (share) {
  174. log->log = qemu_memfd_alloc("vhost-log", logsize,
  175. F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
  176. &fd, &err);
  177. if (err) {
  178. error_report_err(err);
  179. g_free(log);
  180. return NULL;
  181. }
  182. memset(log->log, 0, logsize);
  183. } else {
  184. log->log = g_malloc0(logsize);
  185. }
  186. log->size = size;
  187. log->refcnt = 1;
  188. log->fd = fd;
  189. return log;
  190. }
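/*
 * Take a reference on the global dirty log (memfd-backed and shareable, or
 * plain malloc'ed, depending on the backend), allocating a new one when the
 * current log is missing or has a different size.
 */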
  191. static struct vhost_log *vhost_log_get(uint64_t size, bool share)
  192. {
  193. struct vhost_log *log = share ? vhost_log_shm : vhost_log;
  194. if (!log || log->size != size) {
  195. log = vhost_log_alloc(size, share);
  196. if (share) {
  197. vhost_log_shm = log;
  198. } else {
  199. vhost_log = log;
  200. }
  201. } else {
  202. ++log->refcnt;
  203. }
  204. return log;
  205. }
  206. static void vhost_log_put(struct vhost_dev *dev, bool sync)
  207. {
  208. struct vhost_log *log = dev->log;
  209. if (!log) {
  210. return;
  211. }
  212. --log->refcnt;
  213. if (log->refcnt == 0) {
  214. /* Sync only the range covered by the old log */
  215. if (dev->log_size && sync) {
  216. vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
  217. }
  218. if (vhost_log == log) {
  219. g_free(log->log);
  220. vhost_log = NULL;
  221. } else if (vhost_log_shm == log) {
  222. qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
  223. log->fd);
  224. vhost_log_shm = NULL;
  225. }
  226. g_free(log);
  227. }
  228. dev->log = NULL;
  229. dev->log_size = 0;
  230. }
  231. static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
  232. {
  233. return dev->vhost_ops->vhost_requires_shm_log &&
  234. dev->vhost_ops->vhost_requires_shm_log(dev);
  235. }
  236. static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
  237. {
  238. struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
  239. uint64_t log_base = (uintptr_t)log->log;
  240. int r;
  241. /* inform the backend of log switching; this must be done before
  242. releasing the current log, to ensure no logging is lost */
  243. r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
  244. if (r < 0) {
  245. VHOST_OPS_DEBUG("vhost_set_log_base failed");
  246. }
  247. vhost_log_put(dev, true);
  248. dev->log = log;
  249. dev->log_size = size;
  250. }
  251. static int vhost_dev_has_iommu(struct vhost_dev *dev)
  252. {
  253. VirtIODevice *vdev = dev->vdev;
  254. /*
  255. * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend supports the
  256. * incremental memory mapping API via the IOTLB API. For platforms that
  257. * do not have an IOMMU, there's no need to enable this feature, which
  258. * may cause unnecessary IOTLB miss/update transactions.
  259. */
  260. return vdev->dma_as != &address_space_memory &&
  261. virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
  262. }
  263. static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
  264. hwaddr *plen, bool is_write)
  265. {
  266. if (!vhost_dev_has_iommu(dev)) {
  267. return cpu_physical_memory_map(addr, plen, is_write);
  268. } else {
  269. return (void *)(uintptr_t)addr;
  270. }
  271. }
  272. static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
  273. hwaddr len, int is_write,
  274. hwaddr access_len)
  275. {
  276. if (!vhost_dev_has_iommu(dev)) {
  277. cpu_physical_memory_unmap(buffer, len, is_write, access_len);
  278. }
  279. }
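/*
 * Check one ring part (descriptor table, avail ring or used ring) against an
 * updated memory region: if the part overlaps the region, it must not extend
 * past the region's end (-ENOMEM) and must still be mapped at the same host
 * address (-EBUSY); otherwise 0 is returned.
 */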
  280. static int vhost_verify_ring_part_mapping(void *ring_hva,
  281. uint64_t ring_gpa,
  282. uint64_t ring_size,
  283. void *reg_hva,
  284. uint64_t reg_gpa,
  285. uint64_t reg_size)
  286. {
  287. uint64_t hva_ring_offset;
  288. uint64_t ring_last = range_get_last(ring_gpa, ring_size);
  289. uint64_t reg_last = range_get_last(reg_gpa, reg_size);
  290. if (ring_last < reg_gpa || ring_gpa > reg_last) {
  291. return 0;
  292. }
  293. /* check that the whole ring is mapped */
  294. if (ring_last > reg_last) {
  295. return -ENOMEM;
  296. }
  297. /* check that ring's MemoryRegion wasn't replaced */
  298. hva_ring_offset = ring_gpa - reg_gpa;
  299. if (ring_hva != reg_hva + hva_ring_offset) {
  300. return -EBUSY;
  301. }
  302. return 0;
  303. }
  304. static int vhost_verify_ring_mappings(struct vhost_dev *dev,
  305. void *reg_hva,
  306. uint64_t reg_gpa,
  307. uint64_t reg_size)
  308. {
  309. int i, j;
  310. int r = 0;
  311. const char *part_name[] = {
  312. "descriptor table",
  313. "available ring",
  314. "used ring"
  315. };
  316. if (vhost_dev_has_iommu(dev)) {
  317. return 0;
  318. }
  319. for (i = 0; i < dev->nvqs; ++i) {
  320. struct vhost_virtqueue *vq = dev->vqs + i;
  321. if (vq->desc_phys == 0) {
  322. continue;
  323. }
  324. j = 0;
  325. r = vhost_verify_ring_part_mapping(
  326. vq->desc, vq->desc_phys, vq->desc_size,
  327. reg_hva, reg_gpa, reg_size);
  328. if (r) {
  329. break;
  330. }
  331. j++;
  332. r = vhost_verify_ring_part_mapping(
  333. vq->avail, vq->avail_phys, vq->avail_size,
  334. reg_hva, reg_gpa, reg_size);
  335. if (r) {
  336. break;
  337. }
  338. j++;
  339. r = vhost_verify_ring_part_mapping(
  340. vq->used, vq->used_phys, vq->used_size,
  341. reg_hva, reg_gpa, reg_size);
  342. if (r) {
  343. break;
  344. }
  345. }
  346. if (r == -ENOMEM) {
  347. error_report("Unable to map %s for ring %d", part_name[j], i);
  348. } else if (r == -EBUSY) {
  349. error_report("%s relocated for ring %d", part_name[j], i);
  350. }
  351. return r;
  352. }
  353. /*
  354. * vhost_section: identify sections needed for vhost access
  355. *
  356. * We only care about RAM sections here (where virtqueue and guest
  357. * internals accessed by virtio might live). If we find one we still
  358. * allow the backend to potentially filter it out of our list.
  359. */
  360. static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
  361. {
  362. MemoryRegion *mr = section->mr;
  363. if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
  364. uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
  365. uint8_t handled_dirty;
  366. /*
  367. * Kernel-based vhost doesn't handle any block that is doing
  368. * dirty tracking other than migration, for which it has
  369. * specific logging support. However, for TCG the kernel never
  370. * gets involved anyway, so we can also ignore its
  371. * self-modifying code detection flags. A vhost-user
  372. * client could still confuse a TCG guest if it rewrites
  373. * executable memory that has already been translated.
  374. */
  375. handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
  376. (1 << DIRTY_MEMORY_CODE);
  377. if (dirty_mask & ~handled_dirty) {
  378. trace_vhost_reject_section(mr->name, 1);
  379. return false;
  380. }
  381. if (dev->vhost_ops->vhost_backend_mem_section_filter &&
  382. !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) {
  383. trace_vhost_reject_section(mr->name, 2);
  384. return false;
  385. }
  386. trace_vhost_section(mr->name);
  387. return true;
  388. } else {
  389. trace_vhost_reject_section(mr->name, 3);
  390. return false;
  391. }
  392. }
  393. static void vhost_begin(MemoryListener *listener)
  394. {
  395. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  396. memory_listener);
  397. dev->tmp_sections = NULL;
  398. dev->n_tmp_sections = 0;
  399. }
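/*
 * Called at the end of a memory transaction: compare the freshly collected
 * section list against the previous one and, if anything changed, rebuild the
 * vhost_memory region table and hand it to the backend. When dirty logging is
 * enabled, the log is grown before the table update and only shrunk after it,
 * so no write can ever be logged out of range.
 */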
  400. static void vhost_commit(MemoryListener *listener)
  401. {
  402. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  403. memory_listener);
  404. MemoryRegionSection *old_sections;
  405. int n_old_sections;
  406. uint64_t log_size;
  407. size_t regions_size;
  408. int r;
  409. int i;
  410. bool changed = false;
  411. /* Note we can be called before the device is started, but then
  412. * starting the device calls set_mem_table, so we need to have
  413. * built the data structures.
  414. */
  415. old_sections = dev->mem_sections;
  416. n_old_sections = dev->n_mem_sections;
  417. dev->mem_sections = dev->tmp_sections;
  418. dev->n_mem_sections = dev->n_tmp_sections;
  419. if (dev->n_mem_sections != n_old_sections) {
  420. changed = true;
  421. } else {
  422. /* Same size, let's check the contents */
  423. for (int i = 0; i < n_old_sections; i++) {
  424. if (!MemoryRegionSection_eq(&old_sections[i],
  425. &dev->mem_sections[i])) {
  426. changed = true;
  427. break;
  428. }
  429. }
  430. }
  431. trace_vhost_commit(dev->started, changed);
  432. if (!changed) {
  433. goto out;
  434. }
  435. /* Rebuild the regions list from the new sections list */
  436. regions_size = offsetof(struct vhost_memory, regions) +
  437. dev->n_mem_sections * sizeof dev->mem->regions[0];
  438. dev->mem = g_realloc(dev->mem, regions_size);
  439. dev->mem->nregions = dev->n_mem_sections;
  440. used_memslots = dev->mem->nregions;
  441. for (i = 0; i < dev->n_mem_sections; i++) {
  442. struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
  443. struct MemoryRegionSection *mrs = dev->mem_sections + i;
  444. cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
  445. cur_vmr->memory_size = int128_get64(mrs->size);
  446. cur_vmr->userspace_addr =
  447. (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
  448. mrs->offset_within_region;
  449. cur_vmr->flags_padding = 0;
  450. }
  451. if (!dev->started) {
  452. goto out;
  453. }
  454. for (i = 0; i < dev->mem->nregions; i++) {
  455. if (vhost_verify_ring_mappings(dev,
  456. (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
  457. dev->mem->regions[i].guest_phys_addr,
  458. dev->mem->regions[i].memory_size)) {
  459. error_report("Verify ring failure on region %d", i);
  460. abort();
  461. }
  462. }
  463. if (!dev->log_enabled) {
  464. r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
  465. if (r < 0) {
  466. VHOST_OPS_DEBUG("vhost_set_mem_table failed");
  467. }
  468. goto out;
  469. }
  470. log_size = vhost_get_log_size(dev);
  471. /* We allocate an extra 4K bytes to the log
  472. * to reduce the number of reallocations. */
  473. #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
  474. /* To log more, must increase log size before table update. */
  475. if (dev->log_size < log_size) {
  476. vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
  477. }
  478. r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
  479. if (r < 0) {
  480. VHOST_OPS_DEBUG("vhost_set_mem_table failed");
  481. }
  482. /* To log less, can only decrease log size after table update. */
  483. if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
  484. vhost_dev_log_resize(dev, log_size);
  485. }
  486. out:
  487. /* Deref the old list of sections; this must happen _after_ the
  488. * vhost_set_mem_table to ensure the client isn't still using the
  489. * section we're about to unref.
  490. */
  491. while (n_old_sections--) {
  492. memory_region_unref(old_sections[n_old_sections].mr);
  493. }
  494. g_free(old_sections);
  495. return;
  496. }
  497. /* Adds the section data to the tmp_sections structure.
  498. * It relies on the listener calling us in memory address order
  499. * (via the _add and _nop methods) so that neighbouring regions
  500. * can be joined.
  501. */
  502. static void vhost_region_add_section(struct vhost_dev *dev,
  503. MemoryRegionSection *section)
  504. {
  505. bool need_add = true;
  506. uint64_t mrs_size = int128_get64(section->size);
  507. uint64_t mrs_gpa = section->offset_within_address_space;
  508. uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
  509. section->offset_within_region;
  510. RAMBlock *mrs_rb = section->mr->ram_block;
  511. trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
  512. mrs_host);
  513. if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
  514. /* Round the section to its page size */
  515. /* First align the start down to a page boundary */
  516. size_t mrs_page = qemu_ram_pagesize(mrs_rb);
  517. uint64_t alignage = mrs_host & (mrs_page - 1);
  518. if (alignage) {
  519. mrs_host -= alignage;
  520. mrs_size += alignage;
  521. mrs_gpa -= alignage;
  522. }
  523. /* Now align the size up to a page boundary */
  524. alignage = mrs_size & (mrs_page - 1);
  525. if (alignage) {
  526. mrs_size += mrs_page - alignage;
  527. }
  528. trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
  529. mrs_size, mrs_host);
  530. }
  531. if (dev->n_tmp_sections) {
  532. /* Since we already have at least one section, let's see if
  533. * this extends it; since we're scanning in order, we only
  534. * have to look at the last one, and the FlatView that calls
  535. * us shouldn't have overlaps.
  536. */
  537. MemoryRegionSection *prev_sec = dev->tmp_sections +
  538. (dev->n_tmp_sections - 1);
  539. uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
  540. uint64_t prev_size = int128_get64(prev_sec->size);
  541. uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size);
  542. uint64_t prev_host_start =
  543. (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
  544. prev_sec->offset_within_region;
  545. uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);
  546. if (mrs_gpa <= (prev_gpa_end + 1)) {
  547. /* OK, looks like overlapping/intersecting - it's possible that
  548. * the rounding to page sizes has made them overlap, but they should
  549. * match up in the same RAMBlock if they do.
  550. */
  551. if (mrs_gpa < prev_gpa_start) {
  552. error_report("%s:Section '%s' rounded to %"PRIx64
  553. " prior to previous '%s' %"PRIx64,
  554. __func__, section->mr->name, mrs_gpa,
  555. prev_sec->mr->name, prev_gpa_start);
  556. /* A way to cleanly fail here would be better */
  557. return;
  558. }
  559. /* Offset from the start of the previous GPA to this GPA */
  560. size_t offset = mrs_gpa - prev_gpa_start;
  561. if (prev_host_start + offset == mrs_host &&
  562. section->mr == prev_sec->mr &&
  563. (!dev->vhost_ops->vhost_backend_can_merge ||
  564. dev->vhost_ops->vhost_backend_can_merge(dev,
  565. mrs_host, mrs_size,
  566. prev_host_start, prev_size))) {
  567. uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
  568. need_add = false;
  569. prev_sec->offset_within_address_space =
  570. MIN(prev_gpa_start, mrs_gpa);
  571. prev_sec->offset_within_region =
  572. MIN(prev_host_start, mrs_host) -
  573. (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
  574. prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
  575. mrs_host));
  576. trace_vhost_region_add_section_merge(section->mr->name,
  577. int128_get64(prev_sec->size),
  578. prev_sec->offset_within_address_space,
  579. prev_sec->offset_within_region);
  580. } else {
  581. /* adjoining regions are fine, but overlapping ones with
  582. * different blocks/offsets shouldn't happen
  583. */
  584. if (mrs_gpa != prev_gpa_end + 1) {
  585. error_report("%s: Overlapping but not coherent sections "
  586. "at %"PRIx64,
  587. __func__, mrs_gpa);
  588. return;
  589. }
  590. }
  591. }
  592. }
  593. if (need_add) {
  594. ++dev->n_tmp_sections;
  595. dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
  596. dev->n_tmp_sections);
  597. dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
  598. /* The flatview isn't stable and we don't use it; making it NULL
  599. * means we can memcmp the list.
  600. */
  601. dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
  602. memory_region_ref(section->mr);
  603. }
  604. }
  605. /* Used for both add and nop callbacks */
  606. static void vhost_region_addnop(MemoryListener *listener,
  607. MemoryRegionSection *section)
  608. {
  609. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  610. memory_listener);
  611. if (!vhost_section(dev, section)) {
  612. return;
  613. }
  614. vhost_region_add_section(dev, section);
  615. }
  616. static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
  617. {
  618. struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
  619. struct vhost_dev *hdev = iommu->hdev;
  620. hwaddr iova = iotlb->iova + iommu->iommu_offset;
  621. if (vhost_backend_invalidate_device_iotlb(hdev, iova,
  622. iotlb->addr_mask + 1)) {
  623. error_report("Fail to invalidate device iotlb");
  624. }
  625. }
  626. static void vhost_iommu_region_add(MemoryListener *listener,
  627. MemoryRegionSection *section)
  628. {
  629. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  630. iommu_listener);
  631. struct vhost_iommu *iommu;
  632. Int128 end;
  633. int iommu_idx, ret;
  634. IOMMUMemoryRegion *iommu_mr;
  635. Error *err = NULL;
  636. if (!memory_region_is_iommu(section->mr)) {
  637. return;
  638. }
  639. iommu_mr = IOMMU_MEMORY_REGION(section->mr);
  640. iommu = g_malloc0(sizeof(*iommu));
  641. end = int128_add(int128_make64(section->offset_within_region),
  642. section->size);
  643. end = int128_sub(end, int128_one());
  644. iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
  645. MEMTXATTRS_UNSPECIFIED);
  646. iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
  647. IOMMU_NOTIFIER_UNMAP,
  648. section->offset_within_region,
  649. int128_get64(end),
  650. iommu_idx);
  651. iommu->mr = section->mr;
  652. iommu->iommu_offset = section->offset_within_address_space -
  653. section->offset_within_region;
  654. iommu->hdev = dev;
  655. ret = memory_region_register_iommu_notifier(section->mr, &iommu->n, &err);
  656. if (ret) {
  657. error_report_err(err);
  658. exit(1);
  659. }
  660. QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
  661. /* TODO: can replay help performance here? */
  662. }
  663. static void vhost_iommu_region_del(MemoryListener *listener,
  664. MemoryRegionSection *section)
  665. {
  666. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  667. iommu_listener);
  668. struct vhost_iommu *iommu;
  669. if (!memory_region_is_iommu(section->mr)) {
  670. return;
  671. }
  672. QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
  673. if (iommu->mr == section->mr &&
  674. iommu->n.start == section->offset_within_region) {
  675. memory_region_unregister_iommu_notifier(iommu->mr,
  676. &iommu->n);
  677. QLIST_REMOVE(iommu, iommu_next);
  678. g_free(iommu);
  679. break;
  680. }
  681. }
  682. }
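/*
 * Program one virtqueue's descriptor/avail/used addresses into the backend,
 * together with the guest physical address used for dirty logging of the
 * used ring and the per-ring logging flag.
 */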
  683. static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
  684. struct vhost_virtqueue *vq,
  685. unsigned idx, bool enable_log)
  686. {
  687. struct vhost_vring_addr addr;
  688. int r;
  689. memset(&addr, 0, sizeof(struct vhost_vring_addr));
  690. if (dev->vhost_ops->vhost_vq_get_addr) {
  691. r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
  692. if (r < 0) {
  693. VHOST_OPS_DEBUG("vhost_vq_get_addr failed");
  694. return -errno;
  695. }
  696. } else {
  697. addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
  698. addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
  699. addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
  700. }
  701. addr.index = idx;
  702. addr.log_guest_addr = vq->used_phys;
  703. addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
  704. r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
  705. if (r < 0) {
  706. VHOST_OPS_DEBUG("vhost_set_vring_addr failed");
  707. return -errno;
  708. }
  709. return 0;
  710. }
  711. static int vhost_dev_set_features(struct vhost_dev *dev,
  712. bool enable_log)
  713. {
  714. uint64_t features = dev->acked_features;
  715. int r;
  716. if (enable_log) {
  717. features |= 0x1ULL << VHOST_F_LOG_ALL;
  718. }
  719. if (!vhost_dev_has_iommu(dev)) {
  720. features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
  721. }
  722. if (dev->vhost_ops->vhost_force_iommu) {
  723. if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
  724. features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
  725. }
  726. }
  727. r = dev->vhost_ops->vhost_set_features(dev, features);
  728. if (r < 0) {
  729. VHOST_OPS_DEBUG("vhost_set_features failed");
  730. }
  731. return r < 0 ? -errno : 0;
  732. }
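/*
 * Switch dirty logging on or off: renegotiate features with or without
 * VHOST_F_LOG_ALL and reprogram every ring's addresses with the new logging
 * flag, rolling back to the previous state on failure.
 */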
  733. static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
  734. {
  735. int r, i, idx;
  736. r = vhost_dev_set_features(dev, enable_log);
  737. if (r < 0) {
  738. goto err_features;
  739. }
  740. for (i = 0; i < dev->nvqs; ++i) {
  741. idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
  742. r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
  743. enable_log);
  744. if (r < 0) {
  745. goto err_vq;
  746. }
  747. }
  748. return 0;
  749. err_vq:
  750. for (; i >= 0; --i) {
  751. idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
  752. vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
  753. dev->log_enabled);
  754. }
  755. vhost_dev_set_features(dev, dev->log_enabled);
  756. err_features:
  757. return r;
  758. }
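/*
 * Enable or disable dirty logging for migration. If the device is not
 * started, only the flag is recorded; otherwise the log is resized (or
 * released) and the backend is reconfigured accordingly.
 */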
  759. static int vhost_migration_log(MemoryListener *listener, bool enable)
  760. {
  761. struct vhost_dev *dev = container_of(listener, struct vhost_dev,
  762. memory_listener);
  763. int r;
  764. if (enable == dev->log_enabled) {
  765. return 0;
  766. }
  767. if (!dev->started) {
  768. dev->log_enabled = enable;
  769. return 0;
  770. }
  771. if (!enable) {
  772. r = vhost_dev_set_log(dev, false);
  773. if (r < 0) {
  774. return r;
  775. }
  776. vhost_log_put(dev, false);
  777. } else {
  778. vhost_dev_log_resize(dev, vhost_get_log_size(dev));
  779. r = vhost_dev_set_log(dev, true);
  780. if (r < 0) {
  781. return r;
  782. }
  783. }
  784. dev->log_enabled = enable;
  785. return 0;
  786. }
  787. static void vhost_log_global_start(MemoryListener *listener)
  788. {
  789. int r;
  790. r = vhost_migration_log(listener, true);
  791. if (r < 0) {
  792. abort();
  793. }
  794. }
  795. static void vhost_log_global_stop(MemoryListener *listener)
  796. {
  797. int r;
  798. r = vhost_migration_log(listener, false);
  799. if (r < 0) {
  800. abort();
  801. }
  802. }
  803. static void vhost_log_start(MemoryListener *listener,
  804. MemoryRegionSection *section,
  805. int old, int new)
  806. {
  807. /* FIXME: implement */
  808. }
  809. static void vhost_log_stop(MemoryListener *listener,
  810. MemoryRegionSection *section,
  811. int old, int new)
  812. {
  813. /* FIXME: implement */
  814. }
  815. /* The vhost driver natively knows how to handle the vrings of non
  816. * cross-endian legacy devices and modern devices. Only legacy devices
  817. * exposed to a bi-endian guest may require the vhost driver to use a
  818. * specific endianness.
  819. */
  820. static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
  821. {
  822. if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
  823. return false;
  824. }
  825. #ifdef HOST_WORDS_BIGENDIAN
  826. return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
  827. #else
  828. return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
  829. #endif
  830. }
  831. static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
  832. bool is_big_endian,
  833. int vhost_vq_index)
  834. {
  835. struct vhost_vring_state s = {
  836. .index = vhost_vq_index,
  837. .num = is_big_endian
  838. };
  839. if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
  840. return 0;
  841. }
  842. VHOST_OPS_DEBUG("vhost_set_vring_endian failed");
  843. if (errno == ENOTTY) {
  844. error_report("vhost does not support cross-endian");
  845. return -ENOSYS;
  846. }
  847. return -errno;
  848. }
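/*
 * Translate a guest physical address into the backend's user-space address
 * using the current region table, also returning how many bytes remain in
 * the containing region.
 */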
  849. static int vhost_memory_region_lookup(struct vhost_dev *hdev,
  850. uint64_t gpa, uint64_t *uaddr,
  851. uint64_t *len)
  852. {
  853. int i;
  854. for (i = 0; i < hdev->mem->nregions; i++) {
  855. struct vhost_memory_region *reg = hdev->mem->regions + i;
  856. if (gpa >= reg->guest_phys_addr &&
  857. reg->guest_phys_addr + reg->memory_size > gpa) {
  858. *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
  859. *len = reg->guest_phys_addr + reg->memory_size - gpa;
  860. return 0;
  861. }
  862. }
  863. return -EFAULT;
  864. }
  865. int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
  866. {
  867. IOMMUTLBEntry iotlb;
  868. uint64_t uaddr, len;
  869. int ret = -EFAULT;
  870. RCU_READ_LOCK_GUARD();
  871. trace_vhost_iotlb_miss(dev, 1);
  872. iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
  873. iova, write,
  874. MEMTXATTRS_UNSPECIFIED);
  875. if (iotlb.target_as != NULL) {
  876. ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
  877. &uaddr, &len);
  878. if (ret) {
  879. trace_vhost_iotlb_miss(dev, 3);
  880. error_report("Fail to lookup the translated address "
  881. "%"PRIx64, iotlb.translated_addr);
  882. goto out;
  883. }
  884. len = MIN(iotlb.addr_mask + 1, len);
  885. iova = iova & ~iotlb.addr_mask;
  886. ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
  887. len, iotlb.perm);
  888. if (ret) {
  889. trace_vhost_iotlb_miss(dev, 4);
  890. error_report("Fail to update device iotlb");
  891. goto out;
  892. }
  893. }
  894. trace_vhost_iotlb_miss(dev, 2);
  895. out:
  896. return ret;
  897. }
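/*
 * Set up one virtqueue in the backend: ring size and base index, legacy
 * cross-endian handling, host mappings and addresses of the descriptor,
 * avail and used rings, and the kick/call eventfds.
 */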
  898. static int vhost_virtqueue_start(struct vhost_dev *dev,
  899. struct VirtIODevice *vdev,
  900. struct vhost_virtqueue *vq,
  901. unsigned idx)
  902. {
  903. BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
  904. VirtioBusState *vbus = VIRTIO_BUS(qbus);
  905. VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
  906. hwaddr s, l, a;
  907. int r;
  908. int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
  909. struct vhost_vring_file file = {
  910. .index = vhost_vq_index
  911. };
  912. struct vhost_vring_state state = {
  913. .index = vhost_vq_index
  914. };
  915. struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
  916. a = virtio_queue_get_desc_addr(vdev, idx);
  917. if (a == 0) {
  918. /* Queue might not be ready for start */
  919. return 0;
  920. }
  921. vq->num = state.num = virtio_queue_get_num(vdev, idx);
  922. r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
  923. if (r) {
  924. VHOST_OPS_DEBUG("vhost_set_vring_num failed");
  925. return -errno;
  926. }
  927. state.num = virtio_queue_get_last_avail_idx(vdev, idx);
  928. r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
  929. if (r) {
  930. VHOST_OPS_DEBUG("vhost_set_vring_base failed");
  931. return -errno;
  932. }
  933. if (vhost_needs_vring_endian(vdev)) {
  934. r = vhost_virtqueue_set_vring_endian_legacy(dev,
  935. virtio_is_big_endian(vdev),
  936. vhost_vq_index);
  937. if (r) {
  938. return -errno;
  939. }
  940. }
  941. vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
  942. vq->desc_phys = a;
  943. vq->desc = vhost_memory_map(dev, a, &l, false);
  944. if (!vq->desc || l != s) {
  945. r = -ENOMEM;
  946. goto fail_alloc_desc;
  947. }
  948. vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
  949. vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
  950. vq->avail = vhost_memory_map(dev, a, &l, false);
  951. if (!vq->avail || l != s) {
  952. r = -ENOMEM;
  953. goto fail_alloc_avail;
  954. }
  955. vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
  956. vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
  957. vq->used = vhost_memory_map(dev, a, &l, true);
  958. if (!vq->used || l != s) {
  959. r = -ENOMEM;
  960. goto fail_alloc_used;
  961. }
  962. r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
  963. if (r < 0) {
  964. r = -errno;
  965. goto fail_alloc;
  966. }
  967. file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
  968. r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
  969. if (r) {
  970. VHOST_OPS_DEBUG("vhost_set_vring_kick failed");
  971. r = -errno;
  972. goto fail_kick;
  973. }
  974. /* Clear and discard previous events if any. */
  975. event_notifier_test_and_clear(&vq->masked_notifier);
  976. /* Init vring in unmasked state, unless guest_notifier_mask
  977. * will do it later.
  978. */
  979. if (!vdev->use_guest_notifier_mask) {
  980. /* TODO: check and handle errors. */
  981. vhost_virtqueue_mask(dev, vdev, idx, false);
  982. }
  983. if (k->query_guest_notifiers &&
  984. k->query_guest_notifiers(qbus->parent) &&
  985. virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
  986. file.fd = -1;
  987. r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
  988. if (r) {
  989. goto fail_vector;
  990. }
  991. }
  992. return 0;
  993. fail_vector:
  994. fail_kick:
  995. fail_alloc:
  996. vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
  997. 0, 0);
  998. fail_alloc_used:
  999. vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
  1000. 0, 0);
  1001. fail_alloc_avail:
  1002. vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
  1003. 0, 0);
  1004. fail_alloc_desc:
  1005. return r;
  1006. }
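/*
 * Tear down one virtqueue: read the last available index back from the
 * backend (falling back to a local restore if the backend is unreachable),
 * reset legacy endianness if needed and unmap the rings.
 */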
  1007. static void vhost_virtqueue_stop(struct vhost_dev *dev,
  1008. struct VirtIODevice *vdev,
  1009. struct vhost_virtqueue *vq,
  1010. unsigned idx)
  1011. {
  1012. int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
  1013. struct vhost_vring_state state = {
  1014. .index = vhost_vq_index,
  1015. };
  1016. int r;
  1017. if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
  1018. /* Don't stop a virtqueue that might not have been started */
  1019. return;
  1020. }
  1021. r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
  1022. if (r < 0) {
  1023. VHOST_OPS_DEBUG("vhost VQ %u ring restore failed: %d", idx, r);
  1024. /* Connection to the backend is broken, so let's sync internal
  1025. * last avail idx to the device used idx.
  1026. */
  1027. virtio_queue_restore_last_avail_idx(vdev, idx);
  1028. } else {
  1029. virtio_queue_set_last_avail_idx(vdev, idx, state.num);
  1030. }
  1031. virtio_queue_invalidate_signalled_used(vdev, idx);
  1032. virtio_queue_update_used_idx(vdev, idx);
  1033. /* In the cross-endian case, we need to reset the vring endianness to
  1034. * native, as legacy devices expect it by default.
  1035. */
  1036. if (vhost_needs_vring_endian(vdev)) {
  1037. vhost_virtqueue_set_vring_endian_legacy(dev,
  1038. !virtio_is_big_endian(vdev),
  1039. vhost_vq_index);
  1040. }
  1041. vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
  1042. 1, virtio_queue_get_used_size(vdev, idx));
  1043. vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
  1044. 0, virtio_queue_get_avail_size(vdev, idx));
  1045. vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
  1046. 0, virtio_queue_get_desc_size(vdev, idx));
  1047. }
  1048. static void vhost_eventfd_add(MemoryListener *listener,
  1049. MemoryRegionSection *section,
  1050. bool match_data, uint64_t data, EventNotifier *e)
  1051. {
  1052. }
  1053. static void vhost_eventfd_del(MemoryListener *listener,
  1054. MemoryRegionSection *section,
  1055. bool match_data, uint64_t data, EventNotifier *e)
  1056. {
  1057. }
  1058. static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
  1059. int n, uint32_t timeout)
  1060. {
  1061. int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
  1062. struct vhost_vring_state state = {
  1063. .index = vhost_vq_index,
  1064. .num = timeout,
  1065. };
  1066. int r;
  1067. if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
  1068. return -EINVAL;
  1069. }
  1070. r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
  1071. if (r) {
  1072. VHOST_OPS_DEBUG("vhost_set_vring_busyloop_timeout failed");
  1073. return r;
  1074. }
  1075. return 0;
  1076. }
  1077. static int vhost_virtqueue_init(struct vhost_dev *dev,
  1078. struct vhost_virtqueue *vq, int n)
  1079. {
  1080. int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
  1081. struct vhost_vring_file file = {
  1082. .index = vhost_vq_index,
  1083. };
  1084. int r = event_notifier_init(&vq->masked_notifier, 0);
  1085. if (r < 0) {
  1086. return r;
  1087. }
  1088. file.fd = event_notifier_get_fd(&vq->masked_notifier);
  1089. r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
  1090. if (r) {
  1091. VHOST_OPS_DEBUG("vhost_set_vring_call failed");
  1092. r = -errno;
  1093. goto fail_call;
  1094. }
  1095. vq->dev = dev;
  1096. return 0;
  1097. fail_call:
  1098. event_notifier_cleanup(&vq->masked_notifier);
  1099. return r;
  1100. }
  1101. static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
  1102. {
  1103. event_notifier_cleanup(&vq->masked_notifier);
  1104. }
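/*
 * Initialize a vhost device: bind the backend, take ownership, query the
 * supported features, initialize every virtqueue and register the memory
 * listeners. On failure, the partially initialized state is torn down via
 * vhost_dev_cleanup().
 */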
  1105. int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
  1106. VhostBackendType backend_type, uint32_t busyloop_timeout)
  1107. {
  1108. uint64_t features;
  1109. int i, r, n_initialized_vqs = 0;
  1110. Error *local_err = NULL;
  1111. hdev->vdev = NULL;
  1112. hdev->migration_blocker = NULL;
  1113. r = vhost_set_backend_type(hdev, backend_type);
  1114. assert(r >= 0);
  1115. r = hdev->vhost_ops->vhost_backend_init(hdev, opaque);
  1116. if (r < 0) {
  1117. goto fail;
  1118. }
  1119. r = hdev->vhost_ops->vhost_set_owner(hdev);
  1120. if (r < 0) {
  1121. VHOST_OPS_DEBUG("vhost_set_owner failed");
  1122. goto fail;
  1123. }
  1124. r = hdev->vhost_ops->vhost_get_features(hdev, &features);
  1125. if (r < 0) {
  1126. VHOST_OPS_DEBUG("vhost_get_features failed");
  1127. goto fail;
  1128. }
  1129. for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
  1130. r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
  1131. if (r < 0) {
  1132. goto fail;
  1133. }
  1134. }
  1135. if (busyloop_timeout) {
  1136. for (i = 0; i < hdev->nvqs; ++i) {
  1137. r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
  1138. busyloop_timeout);
  1139. if (r < 0) {
  1140. goto fail_busyloop;
  1141. }
  1142. }
  1143. }
  1144. hdev->features = features;
  1145. hdev->memory_listener = (MemoryListener) {
  1146. .begin = vhost_begin,
  1147. .commit = vhost_commit,
  1148. .region_add = vhost_region_addnop,
  1149. .region_nop = vhost_region_addnop,
  1150. .log_start = vhost_log_start,
  1151. .log_stop = vhost_log_stop,
  1152. .log_sync = vhost_log_sync,
  1153. .log_global_start = vhost_log_global_start,
  1154. .log_global_stop = vhost_log_global_stop,
  1155. .eventfd_add = vhost_eventfd_add,
  1156. .eventfd_del = vhost_eventfd_del,
  1157. .priority = 10
  1158. };
  1159. hdev->iommu_listener = (MemoryListener) {
  1160. .region_add = vhost_iommu_region_add,
  1161. .region_del = vhost_iommu_region_del,
  1162. };
  1163. if (hdev->migration_blocker == NULL) {
  1164. if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
  1165. error_setg(&hdev->migration_blocker,
  1166. "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
  1167. } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
  1168. error_setg(&hdev->migration_blocker,
  1169. "Migration disabled: failed to allocate shared memory");
  1170. }
  1171. }
  1172. if (hdev->migration_blocker != NULL) {
  1173. r = migrate_add_blocker(hdev->migration_blocker, &local_err);
  1174. if (local_err) {
  1175. error_report_err(local_err);
  1176. error_free(hdev->migration_blocker);
  1177. goto fail_busyloop;
  1178. }
  1179. }
  1180. hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
  1181. hdev->n_mem_sections = 0;
  1182. hdev->mem_sections = NULL;
  1183. hdev->log = NULL;
  1184. hdev->log_size = 0;
  1185. hdev->log_enabled = false;
  1186. hdev->started = false;
  1187. memory_listener_register(&hdev->memory_listener, &address_space_memory);
  1188. QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
  1189. if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
  1190. error_report("vhost backend memory slots limit is less"
  1191. " than current number of present memory slots");
  1192. r = -1;
  1193. if (busyloop_timeout) {
  1194. goto fail_busyloop;
  1195. } else {
  1196. goto fail;
  1197. }
  1198. }
  1199. return 0;
  1200. fail_busyloop:
  1201. while (--i >= 0) {
  1202. vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
  1203. }
  1204. fail:
  1205. hdev->nvqs = n_initialized_vqs;
  1206. vhost_dev_cleanup(hdev);
  1207. return r;
  1208. }
  1209. void vhost_dev_cleanup(struct vhost_dev *hdev)
  1210. {
  1211. int i;
  1212. for (i = 0; i < hdev->nvqs; ++i) {
  1213. vhost_virtqueue_cleanup(hdev->vqs + i);
  1214. }
  1215. if (hdev->mem) {
  1216. /* those are only safe after successful init */
  1217. memory_listener_unregister(&hdev->memory_listener);
  1218. QLIST_REMOVE(hdev, entry);
  1219. }
  1220. if (hdev->migration_blocker) {
  1221. migrate_del_blocker(hdev->migration_blocker);
  1222. error_free(hdev->migration_blocker);
  1223. }
  1224. g_free(hdev->mem);
  1225. g_free(hdev->mem_sections);
  1226. if (hdev->vhost_ops) {
  1227. hdev->vhost_ops->vhost_backend_cleanup(hdev);
  1228. }
  1229. assert(!hdev->log);
  1230. memset(hdev, 0, sizeof(struct vhost_dev));
  1231. }
  1232. /* Stop processing guest IO notifications in qemu.
  1233. * Start processing them in vhost in kernel.
  1234. */
  1235. int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
  1236. {
  1237. BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
  1238. int i, r, e;
  1239. /* We will pass the notifiers to the kernel, so make sure that QEMU
  1240. * doesn't interfere.
  1241. */
  1242. r = virtio_device_grab_ioeventfd(vdev);
  1243. if (r < 0) {
  1244. error_report("binding does not support host notifiers");
  1245. goto fail;
  1246. }
  1247. for (i = 0; i < hdev->nvqs; ++i) {
  1248. r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  1249. true);
  1250. if (r < 0) {
  1251. error_report("vhost VQ %d notifier binding failed: %d", i, -r);
  1252. goto fail_vq;
  1253. }
  1254. }
  1255. return 0;
  1256. fail_vq:
  1257. while (--i >= 0) {
  1258. e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  1259. false);
  1260. if (e < 0) {
  1261. error_report("vhost VQ %d notifier cleanup error: %d", i, -e);
  1262. }
  1263. assert (e >= 0);
  1264. virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
  1265. }
  1266. virtio_device_release_ioeventfd(vdev);
  1267. fail:
  1268. return r;
  1269. }
  1270. /* Stop processing guest IO notifications in vhost.
  1271. * Start processing them in qemu.
  1272. * This might actually run the qemu handlers right away,
  1273. * so virtio in qemu must be completely setup when this is called.
  1274. */
  1275. void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
  1276. {
  1277. BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
  1278. int i, r;
  1279. for (i = 0; i < hdev->nvqs; ++i) {
  1280. r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  1281. false);
  1282. if (r < 0) {
  1283. error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
  1284. }
  1285. assert (r >= 0);
  1286. virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
  1287. }
  1288. virtio_device_release_ioeventfd(vdev);
  1289. }
  1290. /* Test and clear event pending status.
  1291. * Should be called after unmask to avoid losing events.
  1292. */
  1293. bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
  1294. {
  1295. struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
  1296. assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
  1297. return event_notifier_test_and_clear(&vq->masked_notifier);
  1298. }
  1299. /* Mask/unmask events from this vq. */
  1300. void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
  1301. bool mask)
  1302. {
  1303. struct VirtQueue *vvq = virtio_get_queue(vdev, n);
  1304. int r, index = n - hdev->vq_index;
  1305. struct vhost_vring_file file;
  1306. /* should only be called after backend is connected */
  1307. assert(hdev->vhost_ops);
  1308. if (mask) {
  1309. assert(vdev->use_guest_notifier_mask);
  1310. file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
  1311. } else {
  1312. file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
  1313. }
  1314. file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
  1315. r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
  1316. if (r < 0) {
  1317. VHOST_OPS_DEBUG("vhost_set_vring_call failed");
  1318. }
  1319. }
  1320. uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
  1321. uint64_t features)
  1322. {
  1323. const int *bit = feature_bits;
  1324. while (*bit != VHOST_INVALID_FEATURE_BIT) {
  1325. uint64_t bit_mask = (1ULL << *bit);
  1326. if (!(hdev->features & bit_mask)) {
  1327. features &= ~bit_mask;
  1328. }
  1329. bit++;
  1330. }
  1331. return features;
  1332. }
  1333. void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
  1334. uint64_t features)
  1335. {
  1336. const int *bit = feature_bits;
  1337. while (*bit != VHOST_INVALID_FEATURE_BIT) {
  1338. uint64_t bit_mask = (1ULL << *bit);
  1339. if (features & bit_mask) {
  1340. hdev->acked_features |= bit_mask;
  1341. }
  1342. bit++;
  1343. }
  1344. }
  1345. int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
  1346. uint32_t config_len)
  1347. {
  1348. assert(hdev->vhost_ops);
  1349. if (hdev->vhost_ops->vhost_get_config) {
  1350. return hdev->vhost_ops->vhost_get_config(hdev, config, config_len);
  1351. }
  1352. return -1;
  1353. }
  1354. int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
  1355. uint32_t offset, uint32_t size, uint32_t flags)
  1356. {
  1357. assert(hdev->vhost_ops);
  1358. if (hdev->vhost_ops->vhost_set_config) {
  1359. return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
  1360. size, flags);
  1361. }
  1362. return -1;
  1363. }
  1364. void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
  1365. const VhostDevConfigOps *ops)
  1366. {
  1367. hdev->config_ops = ops;
  1368. }
  1369. void vhost_dev_free_inflight(struct vhost_inflight *inflight)
  1370. {
  1371. if (inflight && inflight->addr) {
  1372. qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
  1373. inflight->addr = NULL;
  1374. inflight->fd = -1;
  1375. }
  1376. }
  1377. static int vhost_dev_resize_inflight(struct vhost_inflight *inflight,
  1378. uint64_t new_size)
  1379. {
  1380. Error *err = NULL;
  1381. int fd = -1;
  1382. void *addr = qemu_memfd_alloc("vhost-inflight", new_size,
  1383. F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
  1384. &fd, &err);
  1385. if (err) {
  1386. error_report_err(err);
  1387. return -1;
  1388. }
  1389. vhost_dev_free_inflight(inflight);
  1390. inflight->offset = 0;
  1391. inflight->addr = addr;
  1392. inflight->fd = fd;
  1393. inflight->size = new_size;
  1394. return 0;
  1395. }
  1396. void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f)
  1397. {
  1398. if (inflight->addr) {
  1399. qemu_put_be64(f, inflight->size);
  1400. qemu_put_be16(f, inflight->queue_size);
  1401. qemu_put_buffer(f, inflight->addr, inflight->size);
  1402. } else {
  1403. qemu_put_be64(f, 0);
  1404. }
  1405. }
  1406. int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f)
  1407. {
  1408. uint64_t size;
  1409. size = qemu_get_be64(f);
  1410. if (!size) {
  1411. return 0;
  1412. }
  1413. if (inflight->size != size) {
  1414. if (vhost_dev_resize_inflight(inflight, size)) {
  1415. return -1;
  1416. }
  1417. }
  1418. inflight->queue_size = qemu_get_be16(f);
  1419. qemu_get_buffer(f, inflight->addr, size);
  1420. return 0;
  1421. }
  1422. int vhost_dev_set_inflight(struct vhost_dev *dev,
  1423. struct vhost_inflight *inflight)
  1424. {
  1425. int r;
  1426. if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
  1427. r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
  1428. if (r) {
  1429. VHOST_OPS_DEBUG("vhost_set_inflight_fd failed");
  1430. return -errno;
  1431. }
  1432. }
  1433. return 0;
  1434. }
  1435. int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
  1436. struct vhost_inflight *inflight)
  1437. {
  1438. int r;
  1439. if (dev->vhost_ops->vhost_get_inflight_fd) {
  1440. r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
  1441. if (r) {
  1442. VHOST_OPS_DEBUG("vhost_get_inflight_fd failed");
  1443. return -errno;
  1444. }
  1445. }
  1446. return 0;
  1447. }
  1448. /* Host notifiers must be enabled at this point. */
  1449. int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
  1450. {
  1451. int i, r;
  1452. /* should only be called after backend is connected */
  1453. assert(hdev->vhost_ops);
  1454. hdev->started = true;
  1455. hdev->vdev = vdev;
  1456. r = vhost_dev_set_features(hdev, hdev->log_enabled);
  1457. if (r < 0) {
  1458. goto fail_features;
  1459. }
  1460. if (vhost_dev_has_iommu(hdev)) {
  1461. memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
  1462. }
  1463. r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
  1464. if (r < 0) {
  1465. VHOST_OPS_DEBUG("vhost_set_mem_table failed");
  1466. r = -errno;
  1467. goto fail_mem;
  1468. }
  1469. for (i = 0; i < hdev->nvqs; ++i) {
  1470. r = vhost_virtqueue_start(hdev,
  1471. vdev,
  1472. hdev->vqs + i,
  1473. hdev->vq_index + i);
  1474. if (r < 0) {
  1475. goto fail_vq;
  1476. }
  1477. }
  1478. if (hdev->log_enabled) {
  1479. uint64_t log_base;
  1480. hdev->log_size = vhost_get_log_size(hdev);
  1481. hdev->log = vhost_log_get(hdev->log_size,
  1482. vhost_dev_log_is_shared(hdev));
  1483. log_base = (uintptr_t)hdev->log->log;
  1484. r = hdev->vhost_ops->vhost_set_log_base(hdev,
  1485. hdev->log_size ? log_base : 0,
  1486. hdev->log);
  1487. if (r < 0) {
  1488. VHOST_OPS_DEBUG("vhost_set_log_base failed");
  1489. r = -errno;
  1490. goto fail_log;
  1491. }
  1492. }
  1493. if (hdev->vhost_ops->vhost_dev_start) {
  1494. r = hdev->vhost_ops->vhost_dev_start(hdev, true);
  1495. if (r) {
  1496. goto fail_log;
  1497. }
  1498. }
  1499. if (vhost_dev_has_iommu(hdev) &&
  1500. hdev->vhost_ops->vhost_set_iotlb_callback) {
  1501. hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
  1502. /* Update used ring information for IOTLB to work correctly;
  1503. * the vhost-kernel code requires this. */
  1504. for (i = 0; i < hdev->nvqs; ++i) {
  1505. struct vhost_virtqueue *vq = hdev->vqs + i;
  1506. vhost_device_iotlb_miss(hdev, vq->used_phys, true);
  1507. }
  1508. }
  1509. return 0;
  1510. fail_log:
  1511. vhost_log_put(hdev, false);
  1512. fail_vq:
  1513. while (--i >= 0) {
  1514. vhost_virtqueue_stop(hdev,
  1515. vdev,
  1516. hdev->vqs + i,
  1517. hdev->vq_index + i);
  1518. }
  1519. fail_mem:
  1520. fail_features:
  1521. hdev->started = false;
  1522. return r;
  1523. }
  1524. /* Host notifiers must be enabled at this point. */
  1525. void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
  1526. {
  1527. int i;
  1528. /* should only be called after backend is connected */
  1529. assert(hdev->vhost_ops);
  1530. if (hdev->vhost_ops->vhost_dev_start) {
  1531. hdev->vhost_ops->vhost_dev_start(hdev, false);
  1532. }
  1533. for (i = 0; i < hdev->nvqs; ++i) {
  1534. vhost_virtqueue_stop(hdev,
  1535. vdev,
  1536. hdev->vqs + i,
  1537. hdev->vq_index + i);
  1538. }
  1539. if (vhost_dev_has_iommu(hdev)) {
  1540. if (hdev->vhost_ops->vhost_set_iotlb_callback) {
  1541. hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
  1542. }
  1543. memory_listener_unregister(&hdev->iommu_listener);
  1544. }
  1545. vhost_log_put(hdev, true);
  1546. hdev->started = false;
  1547. hdev->vdev = NULL;
  1548. }
  1549. int vhost_net_set_backend(struct vhost_dev *hdev,
  1550. struct vhost_vring_file *file)
  1551. {
  1552. if (hdev->vhost_ops->vhost_net_set_backend) {
  1553. return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
  1554. }
  1555. return -1;
  1556. }