kvm-all.c
/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */
#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <poll.h>

#include <linux/kvm.h>

#include "qemu/atomic.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/s390x/adapter.h"
#include "gdbstub/enums.h"
#include "system/kvm_int.h"
#include "system/runstate.h"
#include "system/cpus.h"
#include "system/accel-blocker.h"
#include "qemu/bswap.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "qemu/event_notifier.h"
#include "qemu/main-loop.h"
#include "trace.h"
#include "hw/irq.h"
#include "qapi/visitor.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "system/reset.h"
#include "qemu/guest-random.h"
#include "system/hw_accel.h"
#include "kvm-cpus.h"
#include "system/dirtylimit.h"
#include "qemu/range.h"

#include "hw/boards.h"
#include "system/stats.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif
/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
 * need to use the real host PAGE_SIZE, as that's what KVM will use.
 */
#ifdef PAGE_SIZE
#undef PAGE_SIZE
#endif
#define PAGE_SIZE qemu_real_host_page_size()

#ifndef KVM_GUESTDBG_BLOCKIRQ
#define KVM_GUESTDBG_BLOCKIRQ 0
#endif

/* Default num of memslots to be allocated when VM starts */
#define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16
/* Default max allowed memslots if kernel reported nothing */
#define KVM_MEMSLOTS_NR_MAX_DEFAULT 32
struct KVMParkedVcpu {
    unsigned long vcpu_id;
    int kvm_fd;
    QLIST_ENTRY(KVMParkedVcpu) node;
};

KVMState *kvm_state;
bool kvm_kernel_irqchip;
bool kvm_split_irqchip;
bool kvm_async_interrupts_allowed;
bool kvm_halt_in_kernel_allowed;
bool kvm_resamplefds_allowed;
bool kvm_msi_via_irqfd_allowed;
bool kvm_gsi_routing_allowed;
bool kvm_gsi_direct_mapping;
bool kvm_allowed;
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_msi_use_devid;
static bool kvm_has_guest_debug;
static int kvm_sstep_flags;
static bool kvm_immediate_exit;
static uint64_t kvm_supported_memory_attributes;
static bool kvm_guest_memfd_supported;
static hwaddr kvm_max_slot_size = ~0;

static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
    KVM_CAP_INFO(INTERNAL_ERROR_DATA),
    KVM_CAP_INFO(IOEVENTFD),
    KVM_CAP_INFO(IOEVENTFD_ANY_LENGTH),
    KVM_CAP_LAST_INFO
};

static NotifierList kvm_irqchip_change_notifiers =
    NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);

struct KVMResampleFd {
    int gsi;
    EventNotifier *resample_event;
    QLIST_ENTRY(KVMResampleFd) node;
};
typedef struct KVMResampleFd KVMResampleFd;

/*
 * Only used with split irqchip where we need to do the resample fd
 * kick for the kernel from userspace.
 */
static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
    QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);

static QemuMutex kml_slots_lock;

#define kvm_slots_lock()    qemu_mutex_lock(&kml_slots_lock)
#define kvm_slots_unlock()  qemu_mutex_unlock(&kml_slots_lock)
static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);

static inline void kvm_resample_fd_remove(int gsi)
{
    KVMResampleFd *rfd;

    QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
        if (rfd->gsi == gsi) {
            QLIST_REMOVE(rfd, node);
            g_free(rfd);
            break;
        }
    }
}

static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
{
    KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);

    rfd->gsi = gsi;
    rfd->resample_event = event;

    QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
}
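
/*
 * With the split irqchip the EOI is observed in userspace, so kick the
 * resample EventNotifier registered for @gsi (if any) from here rather
 * than from the kernel (see the list comment above).
 */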
void kvm_resample_fd_notify(int gsi)
{
    KVMResampleFd *rfd;

    QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
        if (rfd->gsi == gsi) {
            event_notifier_set(rfd->resample_event);
            trace_kvm_resample_fd_notify(gsi);
            return;
        }
    }
}
/**
 * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener
 *
 * @kml: The KVMMemoryListener* to grow the slots[] array
 * @nr_slots_new: The new size of slots[] array
 *
 * Returns: True if the array grows larger, false otherwise.
 */
static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new)
{
    unsigned int i, cur = kml->nr_slots_allocated;
    KVMSlot *slots;

    if (nr_slots_new > kvm_state->nr_slots_max) {
        nr_slots_new = kvm_state->nr_slots_max;
    }

    if (cur >= nr_slots_new) {
        /* Big enough, no need to grow, or we reached max */
        return false;
    }

    if (cur == 0) {
        slots = g_new0(KVMSlot, nr_slots_new);
    } else {
        assert(kml->slots);
        slots = g_renew(KVMSlot, kml->slots, nr_slots_new);
        /*
         * g_renew() doesn't initialize extended buffers, however kvm
         * memslots require fields to be zero-initialized. E.g. pointers,
         * memory_size field, etc.
         */
        memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur));
    }

    for (i = cur; i < nr_slots_new; i++) {
        slots[i].slot = i;
    }

    kml->slots = slots;
    kml->nr_slots_allocated = nr_slots_new;
    trace_kvm_slots_grow(cur, nr_slots_new);

    return true;
}

static bool kvm_slots_double(KVMMemoryListener *kml)
{
    return kvm_slots_grow(kml, kml->nr_slots_allocated * 2);
}

unsigned int kvm_get_max_memslots(void)
{
    KVMState *s = KVM_STATE(current_accel());

    return s->nr_slots_max;
}
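
/*
 * Free memslots are computed against the address space that currently
 * uses the most slots, since every KVMMemoryListener draws from the
 * same per-VM limit (nr_slots_max).
 */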
unsigned int kvm_get_free_memslots(void)
{
    unsigned int used_slots = 0;
    KVMState *s = kvm_state;
    int i;

    kvm_slots_lock();
    for (i = 0; i < s->nr_as; i++) {
        if (!s->as[i].ml) {
            continue;
        }
        used_slots = MAX(used_slots, s->as[i].ml->nr_slots_used);
    }
    kvm_slots_unlock();

    return s->nr_slots_max - used_slots;
}

/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
    unsigned int n;
    int i;

    for (i = 0; i < kml->nr_slots_allocated; i++) {
        if (kml->slots[i].memory_size == 0) {
            return &kml->slots[i];
        }
    }

    /*
     * If no free slots, try to grow first by doubling.  Cache the old size
     * here to avoid another round of search: if the grow succeeded, it
     * means slots[] now must have the existing "n" slots occupied,
     * followed by one or more free slots starting from slots[n].
     */
    n = kml->nr_slots_allocated;
    if (kvm_slots_double(kml)) {
        return &kml->slots[n];
    }

    return NULL;
}

/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
{
    KVMSlot *slot = kvm_get_free_slot(kml);

    if (slot) {
        return slot;
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
                                         hwaddr start_addr,
                                         hwaddr size)
{
    int i;

    for (i = 0; i < kml->nr_slots_allocated; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (start_addr == mem->start_addr && size == mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}
/*
 * Calculate and align the start address and the size of the section.
 * Return the size. If the size is 0, the aligned section is empty.
 */
static hwaddr kvm_align_section(MemoryRegionSection *section,
                                hwaddr *start)
{
    hwaddr size = int128_get64(section->size);
    hwaddr delta, aligned;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. Pad the start
       address to next and truncate size to previous page boundary. */
    aligned = ROUND_UP(section->offset_within_address_space,
                       qemu_real_host_page_size());
    delta = aligned - section->offset_within_address_space;
    *start = aligned;
    if (delta > size) {
        return 0;
    }

    return (size - delta) & qemu_real_host_page_mask();
}
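
/*
 * Translate a host virtual address that backs guest RAM back into a guest
 * physical address by scanning the memslots.  Returns 1 and fills in
 * *phys_addr on success, 0 if the pointer is not covered by any slot.
 */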
int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
                                       hwaddr *phys_addr)
{
    KVMMemoryListener *kml = &s->memory_listener;
    int i, ret = 0;

    kvm_slots_lock();
    for (i = 0; i < kml->nr_slots_allocated; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram - mem->ram);
            ret = 1;
            break;
        }
    }
    kvm_slots_unlock();

    return ret;
}
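
/*
 * Program one memslot into KVM.  KVM_SET_USER_MEMORY_REGION2 is used when
 * guest_memfd is supported so the guest_memfd fields are honoured; toggling
 * KVM_MEM_READONLY on a live slot requires deleting it first (see below).
 * Returns 0 on success, a negative errno from the ioctl otherwise.
 */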
static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
{
    KVMState *s = kvm_state;
    struct kvm_userspace_memory_region2 mem;
    int ret;

    mem.slot = slot->slot | (kml->as_id << 16);
    mem.guest_phys_addr = slot->start_addr;
    mem.userspace_addr = (unsigned long)slot->ram;
    mem.flags = slot->flags;
    mem.guest_memfd = slot->guest_memfd;
    mem.guest_memfd_offset = slot->guest_memfd_offset;

    if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
        /* Set the slot size to 0 before setting the slot to the desired
         * value. This is needed based on KVM commit 75d61fbc. */
        mem.memory_size = 0;

        if (kvm_guest_memfd_supported) {
            ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
        } else {
            ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
        }
        if (ret < 0) {
            goto err;
        }
    }
    mem.memory_size = slot->memory_size;
    if (kvm_guest_memfd_supported) {
        ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
    } else {
        ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    }
    slot->old_flags = mem.flags;
err:
    trace_kvm_set_user_memory(mem.slot >> 16, (uint16_t)mem.slot, mem.flags,
                              mem.guest_phys_addr, mem.memory_size,
                              mem.userspace_addr, mem.guest_memfd,
                              mem.guest_memfd_offset, ret);
    if (ret < 0) {
        if (kvm_guest_memfd_supported) {
            error_report("%s: KVM_SET_USER_MEMORY_REGION2 failed, slot=%d,"
                         " start=0x%" PRIx64 ", size=0x%" PRIx64 ","
                         " flags=0x%" PRIx32 ", guest_memfd=%" PRId32 ","
                         " guest_memfd_offset=0x%" PRIx64 ": %s",
                         __func__, mem.slot, slot->start_addr,
                         (uint64_t)mem.memory_size, mem.flags,
                         mem.guest_memfd, (uint64_t)mem.guest_memfd_offset,
                         strerror(errno));
        } else {
            error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
                         " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
                         __func__, mem.slot, slot->start_addr,
                         (uint64_t)mem.memory_size, strerror(errno));
        }
    }
    return ret;
}
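
/*
 * Stash the vCPU's kvm_fd on the parked list instead of closing it, so
 * kvm_unpark_vcpu() can hand the same fd back if a vCPU with this id is
 * created again later.
 */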
void kvm_park_vcpu(CPUState *cpu)
{
    struct KVMParkedVcpu *vcpu;

    trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));

    vcpu = g_malloc0(sizeof(*vcpu));
    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
    vcpu->kvm_fd = cpu->kvm_fd;
    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
}
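
/*
 * Returns the parked kvm_fd for @vcpu_id and drops it from the list,
 * or -ENOENT if no such vCPU is currently parked.
 */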
int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id)
{
    struct KVMParkedVcpu *cpu;
    int kvm_fd = -ENOENT;

    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
        if (cpu->vcpu_id == vcpu_id) {
            QLIST_REMOVE(cpu, node);
            kvm_fd = cpu->kvm_fd;
            g_free(cpu);
            break;
        }
    }

    trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? "unparked" : "!found parked");

    return kvm_fd;
}

int kvm_create_vcpu(CPUState *cpu)
{
    unsigned long vcpu_id = kvm_arch_vcpu_id(cpu);
    KVMState *s = kvm_state;
    int kvm_fd;

    /* check if the KVM vCPU already exist but is parked */
    kvm_fd = kvm_unpark_vcpu(s, vcpu_id);
    if (kvm_fd < 0) {
        /* vCPU not parked: create a new KVM vCPU */
        kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id);
        if (kvm_fd < 0) {
            error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id);
            return kvm_fd;
        }
    }

    cpu->kvm_fd = kvm_fd;
    cpu->kvm_state = s;
    cpu->vcpu_dirty = true;
    cpu->dirty_pages = 0;
    cpu->throttle_us_per_full = 0;

    trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd);

    return 0;
}

int kvm_create_and_park_vcpu(CPUState *cpu)
{
    int ret = 0;

    ret = kvm_create_vcpu(cpu);
    if (!ret) {
        kvm_park_vcpu(cpu);
    }

    return ret;
}
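
/*
 * Tear down the userspace side of a vCPU: arch-specific cleanup, unmap
 * kvm_run and the dirty ring (if any), then park the fd for later reuse.
 */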
static int do_kvm_destroy_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    int mmap_size;
    int ret = 0;

    trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));

    ret = kvm_arch_destroy_vcpu(cpu);
    if (ret < 0) {
        goto err;
    }

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        trace_kvm_failed_get_vcpu_mmap_size();
        goto err;
    }

    ret = munmap(cpu->kvm_run, mmap_size);
    if (ret < 0) {
        goto err;
    }

    if (cpu->kvm_dirty_gfns) {
        ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
        if (ret < 0) {
            goto err;
        }
    }

    kvm_park_vcpu(cpu);

err:
    return ret;
}

void kvm_destroy_vcpu(CPUState *cpu)
{
    if (do_kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}
int kvm_init_vcpu(CPUState *cpu, Error **errp)
{
    KVMState *s = kvm_state;
    int mmap_size;
    int ret;

    trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));

    ret = kvm_create_vcpu(cpu);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "kvm_init_vcpu: kvm_create_vcpu failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
        goto err;
    }

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        error_setg_errno(errp, -mmap_size,
                         "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
        goto err;
    }

    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cpu->kvm_fd, 0);
    if (cpu->kvm_run == MAP_FAILED) {
        ret = -errno;
        error_setg_errno(errp, ret,
                         "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
        goto err;
    }

    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    if (s->kvm_dirty_ring_size) {
        /* Use MAP_SHARED to share pages with the kernel */
        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
                                   PROT_READ | PROT_WRITE, MAP_SHARED,
                                   cpu->kvm_fd,
                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
        if (cpu->kvm_dirty_gfns == MAP_FAILED) {
            ret = -errno;
            goto err;
        }
    }

    ret = kvm_arch_init_vcpu(cpu);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
    }
    cpu->kvm_vcpu_stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);

err:
    return ret;
}
/*
 * dirty pages logging control
 */
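
/* Translate a MemoryRegion's properties into KVM memslot flags. */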
static int kvm_mem_flags(MemoryRegion *mr)
{
    bool readonly = mr->readonly || memory_region_is_romd(mr);
    int flags = 0;

    if (memory_region_get_dirty_log_mask(mr) != 0) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (readonly && kvm_readonly_mem_allowed) {
        flags |= KVM_MEM_READONLY;
    }
    if (memory_region_has_guest_memfd(mr)) {
        assert(kvm_guest_memfd_supported);
        flags |= KVM_MEM_GUEST_MEMFD;
    }
    return flags;
}
/* Called with KVMMemoryListener.slots_lock held */
static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
                                 MemoryRegion *mr)
{
    mem->flags = kvm_mem_flags(mr);

    /* If nothing changed effectively, no need to issue ioctl */
    if (mem->flags == mem->old_flags) {
        return 0;
    }

    kvm_slot_init_dirty_bitmap(mem);
    return kvm_set_user_memory_region(kml, mem, false);
}

static int kvm_section_update_flags(KVMMemoryListener *kml,
                                    MemoryRegionSection *section)
{
    hwaddr start_addr, size, slot_size;
    KVMSlot *mem;
    int ret = 0;

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return 0;
    }

    kvm_slots_lock();

    while (size && !ret) {
        slot_size = MIN(kvm_max_slot_size, size);
        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
        if (!mem) {
            /* We don't have a slot if we want to trap every access. */
            goto out;
        }

        ret = kvm_slot_update_flags(kml, mem, section->mr);
        start_addr += slot_size;
        size -= slot_size;
    }

out:
    kvm_slots_unlock();
    return ret;
}
static void kvm_log_start(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (old != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

static void kvm_log_stop(MemoryListener *listener,
                         MemoryRegionSection *section,
                         int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (new != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}
/* get kvm's dirty pages bitmap and update qemu's */
static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
{
    ram_addr_t start = slot->ram_start_offset;
    ram_addr_t pages = slot->memory_size / qemu_real_host_page_size();

    cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
}

static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
{
    memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

/* Allocate the dirty bitmap for a slot */
static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
{
    if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
        return;
    }

    /*
     * XXX bad kernel interface alert
     * For dirty bitmap, kernel allocates array of size aligned to
     * bits-per-long.  But for case when the kernel is 64bits and
     * the userspace is 32bits, userspace can't align to the same
     * bits-per-long, since sizeof(long) is different between kernel
     * and user space.  This way, userspace will provide buffer which
     * may be 4 bytes less than the kernel will use, resulting in
     * userspace memory corruption (which is not detectable by valgrind
     * too, in most cases).
     * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
     * a hope that sizeof(long) won't become >8 any time soon.
     *
     * Note: the granule of kvm dirty log is qemu_real_host_page_size.
     * And mem->memory_size is aligned to it (otherwise this mem can't
     * be registered to KVM).
     */
    hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(),
                               /*HOST_LONG_BITS*/ 64) / 8;
    mem->dirty_bmap = g_malloc0(bitmap_size);
    mem->dirty_bmap_size = bitmap_size;
}
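
/*
 * Worked example for the sizing above (assuming 4 KiB host pages): a
 * 1 GiB slot covers 262144 pages, so ALIGN(262144, 64) / 8 = 32768
 * bytes of bitmap, i.e. one bit per page rounded up to a 64-bit word.
 */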
/*
 * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
 * succeeded, false otherwise
 */
static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
{
    struct kvm_dirty_log d = {};
    int ret;

    d.dirty_bitmap = slot->dirty_bmap;
    d.slot = slot->slot | (slot->as_id << 16);
    ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);

    if (ret == -ENOENT) {
        /* kernel does not have dirty bitmap in this slot */
        ret = 0;
    }
    if (ret) {
        error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
                          __func__, ret);
    }
    return ret == 0;
}
/* Should be with all slots_lock held for the address spaces. */
static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
                                     uint32_t slot_id, uint64_t offset)
{
    KVMMemoryListener *kml;
    KVMSlot *mem;

    if (as_id >= s->nr_as) {
        return;
    }

    kml = s->as[as_id].ml;
    mem = &kml->slots[slot_id];

    if (!mem->memory_size || offset >=
        (mem->memory_size / qemu_real_host_page_size())) {
        return;
    }

    set_bit(offset, mem->dirty_bmap);
}

static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
{
    /*
     * Read the flags before the value.  Pairs with barrier in
     * KVM's kvm_dirty_ring_push() function.
     */
    return qatomic_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
}

static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
{
    /*
     * Use a store-release so that the CPU that executes KVM_RESET_DIRTY_RINGS
     * sees the full content of the ring:
     *
     * CPU0                     CPU1                         CPU2
     * ------------------------------------------------------------------------------
     *                                                       fill gfn0
     *                                                       store-rel flags for gfn0
     * load-acq flags for gfn0
     * store-rel RESET for gfn0
     *                          ioctl(RESET_RINGS)
     *                          load-acq flags for gfn0
     *                          check if flags have RESET
     *
     * The synchronization goes from CPU2 to CPU0 to CPU1.
     */
    qatomic_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
}
/*
 * Should be with all slots_lock held for the address spaces.  It returns the
 * dirty page we've collected on this dirty ring.
 */
static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
{
    struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
    uint32_t ring_size = s->kvm_dirty_ring_size;
    uint32_t count = 0, fetch = cpu->kvm_fetch_index;

    /*
     * It's possible that we race with vcpu creation code where the vcpu is
     * put onto the vcpus list but not yet initialized the dirty ring
     * structures.  If so, skip it.
     */
    if (!cpu->created) {
        return 0;
    }

    assert(dirty_gfns && ring_size);
    trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);

    while (true) {
        cur = &dirty_gfns[fetch % ring_size];
        if (!dirty_gfn_is_dirtied(cur)) {
            break;
        }
        kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
                                 cur->offset);
        dirty_gfn_set_collected(cur);
        trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
        fetch++;
        count++;
    }
    cpu->kvm_fetch_index = fetch;
    cpu->dirty_pages += count;

    return count;
}
/* Must be with slots_lock held */
static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState* cpu)
{
    int ret;
    uint64_t total = 0;
    int64_t stamp;

    stamp = get_clock();

    if (cpu) {
        total = kvm_dirty_ring_reap_one(s, cpu);
    } else {
        CPU_FOREACH(cpu) {
            total += kvm_dirty_ring_reap_one(s, cpu);
        }
    }

    if (total) {
        ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
        assert(ret == total);
    }

    stamp = get_clock() - stamp;

    if (total) {
        trace_kvm_dirty_ring_reap(total, stamp / 1000);
    }

    return total;
}

/*
 * Currently for simplicity, we must hold BQL before calling this.  We can
 * consider to drop the BQL if we're clear with all the race conditions.
 */
static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
{
    uint64_t total;

    /*
     * We need to lock all kvm slots for all address spaces here,
     * because:
     *
     * (1) We need to mark dirty for dirty bitmaps in multiple slots
     *     and for tons of pages, so it's better to take the lock here
     *     once rather than once per page.  And more importantly,
     *
     * (2) We must _NOT_ publish dirty bits to the other threads
     *     (e.g., the migration thread) via the kvm memory slot dirty
     *     bitmaps before correctly re-protect those dirtied pages.
     *     Otherwise we can have potential risk of data corruption if
     *     the page data is read in the other thread before we do
     *     reset below.
     */
    kvm_slots_lock();
    total = kvm_dirty_ring_reap_locked(s, cpu);
    kvm_slots_unlock();

    return total;
}
static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
{
    /* No need to do anything */
}

/*
 * Kick all vcpus out in a synchronized way.  When returned, we
 * guarantee that every vcpu has been kicked and at least returned to
 * userspace once.
 */
static void kvm_cpu_synchronize_kick_all(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
    }
}

/*
 * Flush all the existing dirty pages to the KVM slot buffers.  When
 * this call returns, we guarantee that all the touched dirty pages
 * before calling this function have been put into the per-kvmslot
 * dirty bitmap.
 *
 * This function must be called with BQL held.
 */
static void kvm_dirty_ring_flush(void)
{
    trace_kvm_dirty_ring_flush(0);
    /*
     * The function needs to be serialized.  Since this function
     * should always be with BQL held, serialization is guaranteed.
     * However, let's be sure of it.
     */
    assert(bql_locked());
    /*
     * First make sure to flush the hardware buffers by kicking all
     * vcpus out in a synchronous way.
     */
    kvm_cpu_synchronize_kick_all();
    kvm_dirty_ring_reap(kvm_state, NULL);
    trace_kvm_dirty_ring_flush(1);
}
/**
 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
 *
 * This function will first try to fetch dirty bitmap from the kernel,
 * and then updates qemu's dirty bitmap.
 *
 * NOTE: caller must be with kml->slots_lock held.
 *
 * @kml: the KVM memory listener object
 * @section: the memory section to sync the dirty bitmap with
 */
static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
                                           MemoryRegionSection *section)
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    hwaddr start_addr, size;
    hwaddr slot_size;

    size = kvm_align_section(section, &start_addr);
    while (size) {
        slot_size = MIN(kvm_max_slot_size, size);
        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
        if (!mem) {
            /* We don't have a slot if we want to trap every access. */
            return;
        }
        if (kvm_slot_get_dirty_log(s, mem)) {
            kvm_slot_sync_dirty_pages(mem);
        }
        start_addr += slot_size;
        size -= slot_size;
    }
}
/* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
#define KVM_CLEAR_LOG_SHIFT  6
#define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT)
#define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
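
/*
 * E.g. with 4 KiB host pages KVM_CLEAR_LOG_ALIGN is 256 KiB, so dirty log
 * clearing is always requested in 64-page aligned chunks.
 */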
static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
                                  uint64_t size)
{
    KVMState *s = kvm_state;
    uint64_t end, bmap_start, start_delta, bmap_npages;
    struct kvm_clear_dirty_log d;
    unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size();
    int ret;

    /*
     * We need to extend either the start or the size or both to
     * satisfy the KVM interface requirement.  Firstly, do the start
     * page alignment on 64 host pages
     */
    bmap_start = start & KVM_CLEAR_LOG_MASK;
    start_delta = start - bmap_start;
    bmap_start /= psize;

    /*
     * The kernel interface has restriction on the size too, that either:
     *
     * (1) the size is 64 host pages aligned (just like the start), or
     * (2) the size fills up until the end of the KVM memslot.
     */
    bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
        << KVM_CLEAR_LOG_SHIFT;
    end = mem->memory_size / psize;
    if (bmap_npages > end - bmap_start) {
        bmap_npages = end - bmap_start;
    }
    start_delta /= psize;

    /*
     * Prepare the bitmap to clear dirty bits.  Here we must guarantee
     * that we won't clear any unknown dirty bits otherwise we might
     * accidentally clear some set bits which are not yet synced from
     * the kernel into QEMU's bitmap, then we'll lose track of the
     * guest modifications upon those pages (which can directly lead
     * to guest data loss or panic after migration).
     *
     * Layout of the KVMSlot.dirty_bmap:
     *
     *                   |<-------- bmap_npages -----------..>|
     *                                                     [1]
     *                     start_delta         size
     *  |----------------|-------------|------------------|------------|
     *  ^                ^             ^                               ^
     *  |                |             |                               |
     * start          bmap_start     (start)                         end
     * of memslot                                             of memslot
     *
     * [1] bmap_npages can be aligned to either 64 pages or the end of slot
     */
    assert(bmap_start % BITS_PER_LONG == 0);
    /* We should never do log_clear before log_sync */
    assert(mem->dirty_bmap);
    if (start_delta || bmap_npages - size / psize) {
        /* Slow path - we need to manipulate a temp bitmap */
        bmap_clear = bitmap_new(bmap_npages);
        bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
                                    bmap_start, start_delta + size / psize);
        /*
         * We need to fill the holes at start because that was not
         * specified by the caller and we extended the bitmap only for
         * 64 pages alignment
         */
        bitmap_clear(bmap_clear, 0, start_delta);
        d.dirty_bitmap = bmap_clear;
    } else {
        /*
         * Fast path - both start and size align well with BITS_PER_LONG
         * (or the end of memory slot)
         */
        d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
    }

    d.first_page = bmap_start;
    /* It should never overflow.  If it happens, say something */
    assert(bmap_npages <= UINT32_MAX);
    d.num_pages = bmap_npages;
    d.slot = mem->slot | (as_id << 16);

    ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);

    if (ret < 0 && ret != -ENOENT) {
        error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
                     "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
                     __func__, d.slot, (uint64_t)d.first_page,
                     (uint32_t)d.num_pages, ret);
    } else {
        ret = 0;
        trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
    }

    /*
     * After we have updated the remote dirty bitmap, we update the
     * cached bitmap as well for the memslot, then if another user
     * clears the same region we know we shouldn't clear it again on
     * the remote otherwise it's data loss as well.
     */
    bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
                 size / psize);
    /* This handles the NULL case well */
    g_free(bmap_clear);

    return ret;
}
/**
 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
 *
 * NOTE: this will be a no-op if we haven't enabled manual dirty log
 * protection in the host kernel because in that case this operation
 * will be done within log_sync().
 *
 * @kml: the kvm memory listener
 * @section: the memory range to clear dirty bitmap
 */
static int kvm_physical_log_clear(KVMMemoryListener *kml,
                                  MemoryRegionSection *section)
{
    KVMState *s = kvm_state;
    uint64_t start, size, offset, count;
    KVMSlot *mem;
    int ret = 0, i;

    if (!s->manual_dirty_log_protect) {
        /* No need to do explicit clear */
        return ret;
    }

    start = section->offset_within_address_space;
    size = int128_get64(section->size);

    if (!size) {
        /* Nothing more we can do... */
        return ret;
    }

    kvm_slots_lock();

    for (i = 0; i < kml->nr_slots_allocated; i++) {
        mem = &kml->slots[i];
        /* Discard slots that are empty or do not overlap the section */
        if (!mem->memory_size ||
            mem->start_addr > start + size - 1 ||
            start > mem->start_addr + mem->memory_size - 1) {
            continue;
        }

        if (start >= mem->start_addr) {
            /* The slot starts before section or is aligned to it. */
            offset = start - mem->start_addr;
            count = MIN(mem->memory_size - offset, size);
        } else {
            /* The slot starts after section. */
            offset = 0;
            count = MIN(mem->memory_size, size - (mem->start_addr - start));
        }
        ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
        if (ret < 0) {
            break;
        }
    }

    kvm_slots_unlock();

    return ret;
}
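
/*
 * Coalesced MMIO/PIO: zones registered below are buffered by KVM in the
 * coalesced-MMIO ring (mapped in kvm_init_vcpu() above) instead of
 * forcing a userspace exit for every write.
 */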
static void kvm_coalesce_mmio_region(MemoryListener *listener,
                                     MemoryRegionSection *section,
                                     hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
                                       MemoryRegionSection *section,
                                       hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_coalesce_pio_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_pio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pio = 1;

        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_coalesce_pio_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_pio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pio = 1;

        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
}
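
/*
 * Returns the (non-negative) value reported by KVM_CHECK_EXTENSION, or 0
 * when the query itself fails, so callers can treat the result as a plain
 * "supported?" answer.
 */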
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

int kvm_vm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        /* VM wide version not implemented, use global one instead */
        ret = kvm_check_extension(s, extension);
    }

    return ret;
}
/*
 * We track the poisoned pages to be able to:
 * - replace them on VM reset
 * - block a migration for a VM with a poisoned page
 */
typedef struct HWPoisonPage {
    ram_addr_t ram_addr;
    QLIST_ENTRY(HWPoisonPage) list;
} HWPoisonPage;

static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
    QLIST_HEAD_INITIALIZER(hwpoison_page_list);

static void kvm_unpoison_all(void *param)
{
    HWPoisonPage *page, *next_page;

    QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
        QLIST_REMOVE(page, list);
        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
        g_free(page);
    }
}

void kvm_hwpoison_page_add(ram_addr_t ram_addr)
{
    HWPoisonPage *page;

    QLIST_FOREACH(page, &hwpoison_page_list, list) {
        if (page->ram_addr == ram_addr) {
            return;
        }
    }
    page = g_new(HWPoisonPage, 1);
    page->ram_addr = ram_addr;
    QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
}

bool kvm_hwpoisoned_mem(void)
{
    return !QLIST_EMPTY(&hwpoison_page_list);
}
static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
#if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
    /* The kernel expects ioeventfd values in HOST_BIG_ENDIAN
     * endianness, but the memory core hands them in target endianness.
     * For example, PPC is always treated as big-endian even if running
     * on KVM and on PPC64LE.  Correct here.
     */
    switch (size) {
    case 2:
        val = bswap16(val);
        break;
    case 4:
        val = bswap32(val);
        break;
    }
#endif
    return val;
}
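
/*
 * Bind (or unbind, when !assign) an eventfd to a guest-physical MMIO
 * address: writes of @size bytes at @addr (optionally matching @val)
 * make KVM signal @fd instead of returning to userspace.
 */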
static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
                                  bool assign, uint32_t size, bool datamatch)
{
    int ret;
    struct kvm_ioeventfd iofd = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .len = size,
        .flags = 0,
        .fd = fd,
    };

    trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
                                 datamatch);
    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (datamatch) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return -errno;
    }

    return 0;
}

static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
                                 bool assign, uint32_t size, bool datamatch)
{
    struct kvm_ioeventfd kick = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .flags = KVM_IOEVENTFD_FLAG_PIO,
        .len = size,
        .fd = fd,
    };
    int r;

    trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (datamatch) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
}
  1184. static const KVMCapabilityInfo *
  1185. kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
  1186. {
  1187. while (list->name) {
  1188. if (!kvm_check_extension(s, list->value)) {
  1189. return list;
  1190. }
  1191. list++;
  1192. }
  1193. return NULL;
  1194. }
  1195. void kvm_set_max_memslot_size(hwaddr max_slot_size)
  1196. {
  1197. g_assert(
  1198. ROUND_UP(max_slot_size, qemu_real_host_page_size()) == max_slot_size
  1199. );
  1200. kvm_max_slot_size = max_slot_size;
  1201. }
  1202. static int kvm_set_memory_attributes(hwaddr start, uint64_t size, uint64_t attr)
  1203. {
  1204. struct kvm_memory_attributes attrs;
  1205. int r;
  1206. assert((attr & kvm_supported_memory_attributes) == attr);
  1207. attrs.attributes = attr;
  1208. attrs.address = start;
  1209. attrs.size = size;
  1210. attrs.flags = 0;
  1211. r = kvm_vm_ioctl(kvm_state, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
  1212. if (r) {
  1213. error_report("failed to set memory (0x%" HWADDR_PRIx "+0x%" PRIx64 ") "
  1214. "with attr 0x%" PRIx64 " error '%s'",
  1215. start, size, attr, strerror(errno));
  1216. }
  1217. return r;
  1218. }
  1219. int kvm_set_memory_attributes_private(hwaddr start, uint64_t size)
  1220. {
  1221. return kvm_set_memory_attributes(start, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
  1222. }
  1223. int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size)
  1224. {
  1225. return kvm_set_memory_attributes(start, size, 0);
  1226. }
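/*
 * kvm_set_phys_mem() translates a MemoryRegionSection into one or more KVM
 * memslots.  Sections larger than kvm_max_slot_size are split into multiple
 * slots.  On removal, dirty state is synced on a best-effort basis before
 * the slot is dropped; on addition, guest_memfd-backed regions start out
 * with the private memory attribute set.
 */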
  1227. /* Called with KVMMemoryListener.slots_lock held */
  1228. static void kvm_set_phys_mem(KVMMemoryListener *kml,
  1229. MemoryRegionSection *section, bool add)
  1230. {
  1231. KVMSlot *mem;
  1232. int err;
  1233. MemoryRegion *mr = section->mr;
  1234. bool writable = !mr->readonly && !mr->rom_device;
  1235. hwaddr start_addr, size, slot_size, mr_offset;
  1236. ram_addr_t ram_start_offset;
  1237. void *ram;
  1238. if (!memory_region_is_ram(mr)) {
  1239. if (writable || !kvm_readonly_mem_allowed) {
  1240. return;
  1241. } else if (!mr->romd_mode) {
  1242. /* If the memory device is not in romd_mode, then we actually want
  1243. * to remove the kvm memory slot so all accesses will trap. */
  1244. add = false;
  1245. }
  1246. }
  1247. size = kvm_align_section(section, &start_addr);
  1248. if (!size) {
  1249. return;
  1250. }
  1251. /* The offset of the kvmslot within the memory region */
  1252. mr_offset = section->offset_within_region + start_addr -
  1253. section->offset_within_address_space;
  1254. /* use aligned delta to align the ram address and offset */
  1255. ram = memory_region_get_ram_ptr(mr) + mr_offset;
  1256. ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
  1257. if (!add) {
  1258. do {
  1259. slot_size = MIN(kvm_max_slot_size, size);
  1260. mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
  1261. if (!mem) {
  1262. return;
  1263. }
  1264. if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
  1265. /*
1266. * NOTE: Here we only make a best-effort attempt to sync the
1267. * dirty bits.  Whether we're using the dirty log or the dirty
1268. * ring, we ignore two facts:
  1269. *
  1270. * (1) dirty bits can reside in hardware buffers (PML)
  1271. *
  1272. * (2) after we collected dirty bits here, pages can be dirtied
  1273. * again before we do the final KVM_SET_USER_MEMORY_REGION to
  1274. * remove the slot.
  1275. *
1276. * Not easy.  Let's cross our fingers until it's fixed.
  1277. */
  1278. if (kvm_state->kvm_dirty_ring_size) {
  1279. kvm_dirty_ring_reap_locked(kvm_state, NULL);
  1280. if (kvm_state->kvm_dirty_ring_with_bitmap) {
  1281. kvm_slot_sync_dirty_pages(mem);
  1282. kvm_slot_get_dirty_log(kvm_state, mem);
  1283. }
  1284. } else {
  1285. kvm_slot_get_dirty_log(kvm_state, mem);
  1286. }
  1287. kvm_slot_sync_dirty_pages(mem);
  1288. }
  1289. /* unregister the slot */
  1290. g_free(mem->dirty_bmap);
  1291. mem->dirty_bmap = NULL;
  1292. mem->memory_size = 0;
  1293. mem->flags = 0;
  1294. err = kvm_set_user_memory_region(kml, mem, false);
  1295. if (err) {
  1296. fprintf(stderr, "%s: error unregistering slot: %s\n",
  1297. __func__, strerror(-err));
  1298. abort();
  1299. }
  1300. start_addr += slot_size;
  1301. size -= slot_size;
  1302. kml->nr_slots_used--;
  1303. } while (size);
  1304. return;
  1305. }
  1306. /* register the new slot */
  1307. do {
  1308. slot_size = MIN(kvm_max_slot_size, size);
  1309. mem = kvm_alloc_slot(kml);
  1310. mem->as_id = kml->as_id;
  1311. mem->memory_size = slot_size;
  1312. mem->start_addr = start_addr;
  1313. mem->ram_start_offset = ram_start_offset;
  1314. mem->ram = ram;
  1315. mem->flags = kvm_mem_flags(mr);
  1316. mem->guest_memfd = mr->ram_block->guest_memfd;
  1317. mem->guest_memfd_offset = (uint8_t*)ram - mr->ram_block->host;
  1318. kvm_slot_init_dirty_bitmap(mem);
  1319. err = kvm_set_user_memory_region(kml, mem, true);
  1320. if (err) {
  1321. fprintf(stderr, "%s: error registering slot: %s\n", __func__,
  1322. strerror(-err));
  1323. abort();
  1324. }
  1325. if (memory_region_has_guest_memfd(mr)) {
  1326. err = kvm_set_memory_attributes_private(start_addr, slot_size);
  1327. if (err) {
  1328. error_report("%s: failed to set memory attribute private: %s",
  1329. __func__, strerror(-err));
  1330. exit(1);
  1331. }
  1332. }
  1333. start_addr += slot_size;
  1334. ram_start_offset += slot_size;
  1335. ram += slot_size;
  1336. size -= slot_size;
  1337. kml->nr_slots_used++;
  1338. } while (size);
  1339. }
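/*
 * Dirty ring reaper: a background thread that periodically drains the
 * per-vCPU dirty rings into the KVMSlot dirty bitmaps.  Reaping is done
 * under the BQL and is skipped while the dirty limit throttle is active,
 * which does its own targeted reaping.
 */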
  1340. static void *kvm_dirty_ring_reaper_thread(void *data)
  1341. {
  1342. KVMState *s = data;
  1343. struct KVMDirtyRingReaper *r = &s->reaper;
  1344. rcu_register_thread();
  1345. trace_kvm_dirty_ring_reaper("init");
  1346. while (true) {
  1347. r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
  1348. trace_kvm_dirty_ring_reaper("wait");
  1349. /*
  1350. * TODO: provide a smarter timeout rather than a constant?
  1351. */
  1352. sleep(1);
1353. /* keep sleeping so that the reaper does not interfere with dirtylimit */
  1354. if (dirtylimit_in_service()) {
  1355. continue;
  1356. }
  1357. trace_kvm_dirty_ring_reaper("wakeup");
  1358. r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
  1359. bql_lock();
  1360. kvm_dirty_ring_reap(s, NULL);
  1361. bql_unlock();
  1362. r->reaper_iteration++;
  1363. }
  1364. g_assert_not_reached();
  1365. }
  1366. static void kvm_dirty_ring_reaper_init(KVMState *s)
  1367. {
  1368. struct KVMDirtyRingReaper *r = &s->reaper;
  1369. qemu_thread_create(&r->reaper_thr, "kvm-reaper",
  1370. kvm_dirty_ring_reaper_thread,
  1371. s, QEMU_THREAD_JOINABLE);
  1372. }
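/*
 * kvm_dirty_ring_init() enables the per-vCPU dirty ring if the user asked
 * for a non-zero ring size.  KVM_CHECK_EXTENSION on the ring capability
 * returns the maximum ring size in bytes; if the requested size exceeds it
 * the setup fails, and if the capability is absent we warn and fall back to
 * the dirty-log bitmap.  The optional backup bitmap
 * (KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP) is enabled when available.
 */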
  1373. static int kvm_dirty_ring_init(KVMState *s)
  1374. {
  1375. uint32_t ring_size = s->kvm_dirty_ring_size;
  1376. uint64_t ring_bytes = ring_size * sizeof(struct kvm_dirty_gfn);
  1377. unsigned int capability = KVM_CAP_DIRTY_LOG_RING;
  1378. int ret;
  1379. s->kvm_dirty_ring_size = 0;
  1380. s->kvm_dirty_ring_bytes = 0;
  1381. /* Bail if the dirty ring size isn't specified */
  1382. if (!ring_size) {
  1383. return 0;
  1384. }
  1385. /*
  1386. * Read the max supported pages. Fall back to dirty logging mode
  1387. * if the dirty ring isn't supported.
  1388. */
  1389. ret = kvm_vm_check_extension(s, capability);
  1390. if (ret <= 0) {
  1391. capability = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;
  1392. ret = kvm_vm_check_extension(s, capability);
  1393. }
  1394. if (ret <= 0) {
  1395. warn_report("KVM dirty ring not available, using bitmap method");
  1396. return 0;
  1397. }
  1398. if (ring_bytes > ret) {
  1399. error_report("KVM dirty ring size %" PRIu32 " too big "
  1400. "(maximum is %ld). Please use a smaller value.",
  1401. ring_size, (long)ret / sizeof(struct kvm_dirty_gfn));
  1402. return -EINVAL;
  1403. }
  1404. ret = kvm_vm_enable_cap(s, capability, 0, ring_bytes);
  1405. if (ret) {
  1406. error_report("Enabling of KVM dirty ring failed: %s. "
  1407. "Suggested minimum value is 1024.", strerror(-ret));
  1408. return -EIO;
  1409. }
  1410. /* Enable the backup bitmap if it is supported */
  1411. ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP);
  1412. if (ret > 0) {
  1413. ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP, 0);
  1414. if (ret) {
  1415. error_report("Enabling of KVM dirty ring's backup bitmap failed: "
  1416. "%s. ", strerror(-ret));
  1417. return -EIO;
  1418. }
  1419. s->kvm_dirty_ring_with_bitmap = true;
  1420. }
  1421. s->kvm_dirty_ring_size = ring_size;
  1422. s->kvm_dirty_ring_bytes = ring_bytes;
  1423. return 0;
  1424. }
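/*
 * The memory listener callbacks below do not touch memslots directly.
 * region_add/region_del only queue the affected sections; kvm_region_commit()
 * then applies all removals before all additions, inhibiting concurrent KVM
 * ioctls when an added range overlaps a removed one so that the update
 * appears atomic to vCPUs.
 */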
  1425. static void kvm_region_add(MemoryListener *listener,
  1426. MemoryRegionSection *section)
  1427. {
  1428. KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
  1429. KVMMemoryUpdate *update;
  1430. update = g_new0(KVMMemoryUpdate, 1);
  1431. update->section = *section;
  1432. QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
  1433. }
  1434. static void kvm_region_del(MemoryListener *listener,
  1435. MemoryRegionSection *section)
  1436. {
  1437. KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
  1438. KVMMemoryUpdate *update;
  1439. update = g_new0(KVMMemoryUpdate, 1);
  1440. update->section = *section;
  1441. QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
  1442. }
  1443. static void kvm_region_commit(MemoryListener *listener)
  1444. {
  1445. KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
  1446. listener);
  1447. KVMMemoryUpdate *u1, *u2;
  1448. bool need_inhibit = false;
  1449. if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
  1450. QSIMPLEQ_EMPTY(&kml->transaction_del)) {
  1451. return;
  1452. }
  1453. /*
  1454. * We have to be careful when regions to add overlap with ranges to remove.
  1455. * We have to simulate atomic KVM memslot updates by making sure no ioctl()
  1456. * is currently active.
  1457. *
1458. * The lists are ordered by addresses, so it's easy to find overlaps.
  1459. */
  1460. u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
  1461. u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
  1462. while (u1 && u2) {
  1463. Range r1, r2;
  1464. range_init_nofail(&r1, u1->section.offset_within_address_space,
  1465. int128_get64(u1->section.size));
  1466. range_init_nofail(&r2, u2->section.offset_within_address_space,
  1467. int128_get64(u2->section.size));
  1468. if (range_overlaps_range(&r1, &r2)) {
  1469. need_inhibit = true;
  1470. break;
  1471. }
  1472. if (range_lob(&r1) < range_lob(&r2)) {
  1473. u1 = QSIMPLEQ_NEXT(u1, next);
  1474. } else {
  1475. u2 = QSIMPLEQ_NEXT(u2, next);
  1476. }
  1477. }
  1478. kvm_slots_lock();
  1479. if (need_inhibit) {
  1480. accel_ioctl_inhibit_begin();
  1481. }
  1482. /* Remove all memslots before adding the new ones. */
  1483. while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
  1484. u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
  1485. QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
  1486. kvm_set_phys_mem(kml, &u1->section, false);
  1487. memory_region_unref(u1->section.mr);
  1488. g_free(u1);
  1489. }
  1490. while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
  1491. u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
  1492. QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
  1493. memory_region_ref(u1->section.mr);
  1494. kvm_set_phys_mem(kml, &u1->section, true);
  1495. g_free(u1);
  1496. }
  1497. if (need_inhibit) {
  1498. accel_ioctl_inhibit_end();
  1499. }
  1500. kvm_slots_unlock();
  1501. }
  1502. static void kvm_log_sync(MemoryListener *listener,
  1503. MemoryRegionSection *section)
  1504. {
  1505. KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
  1506. kvm_slots_lock();
  1507. kvm_physical_sync_dirty_bitmap(kml, section);
  1508. kvm_slots_unlock();
  1509. }
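/*
 * With the dirty ring enabled, dirty tracking is synced globally rather than
 * per section: flush every vCPU's ring, then fold the collected bits into
 * each slot's bitmap.  At the last migration stage the backup bitmap is also
 * consulted, since it can hold bits set outside of vCPU context that never
 * reached the ring.
 */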
  1510. static void kvm_log_sync_global(MemoryListener *l, bool last_stage)
  1511. {
  1512. KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
  1513. KVMState *s = kvm_state;
  1514. KVMSlot *mem;
  1515. int i;
  1516. /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
  1517. kvm_dirty_ring_flush();
  1518. kvm_slots_lock();
  1519. for (i = 0; i < kml->nr_slots_allocated; i++) {
  1520. mem = &kml->slots[i];
  1521. if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
  1522. kvm_slot_sync_dirty_pages(mem);
  1523. if (s->kvm_dirty_ring_with_bitmap && last_stage &&
  1524. kvm_slot_get_dirty_log(s, mem)) {
  1525. kvm_slot_sync_dirty_pages(mem);
  1526. }
  1527. /*
  1528. * This is not needed by KVM_GET_DIRTY_LOG because the
  1529. * ioctl will unconditionally overwrite the whole region.
1530. * However, the kvm dirty ring has no such side effect.
  1531. */
  1532. kvm_slot_reset_dirty_pages(mem);
  1533. }
  1534. }
  1535. kvm_slots_unlock();
  1536. }
  1537. static void kvm_log_clear(MemoryListener *listener,
  1538. MemoryRegionSection *section)
  1539. {
  1540. KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
  1541. int r;
  1542. r = kvm_physical_log_clear(kml, section);
  1543. if (r < 0) {
  1544. error_report_once("%s: kvm log clear failed: mr=%s "
  1545. "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
  1546. section->mr->name, section->offset_within_region,
  1547. int128_get64(section->size));
  1548. abort();
  1549. }
  1550. }
  1551. static void kvm_mem_ioeventfd_add(MemoryListener *listener,
  1552. MemoryRegionSection *section,
  1553. bool match_data, uint64_t data,
  1554. EventNotifier *e)
  1555. {
  1556. int fd = event_notifier_get_fd(e);
  1557. int r;
  1558. r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
  1559. data, true, int128_get64(section->size),
  1560. match_data);
  1561. if (r < 0) {
  1562. fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
  1563. __func__, strerror(-r), -r);
  1564. abort();
  1565. }
  1566. }
  1567. static void kvm_mem_ioeventfd_del(MemoryListener *listener,
  1568. MemoryRegionSection *section,
  1569. bool match_data, uint64_t data,
  1570. EventNotifier *e)
  1571. {
  1572. int fd = event_notifier_get_fd(e);
  1573. int r;
  1574. r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
  1575. data, false, int128_get64(section->size),
  1576. match_data);
  1577. if (r < 0) {
  1578. fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
  1579. __func__, strerror(-r), -r);
  1580. abort();
  1581. }
  1582. }
  1583. static void kvm_io_ioeventfd_add(MemoryListener *listener,
  1584. MemoryRegionSection *section,
  1585. bool match_data, uint64_t data,
  1586. EventNotifier *e)
  1587. {
  1588. int fd = event_notifier_get_fd(e);
  1589. int r;
  1590. r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
  1591. data, true, int128_get64(section->size),
  1592. match_data);
  1593. if (r < 0) {
  1594. fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
  1595. __func__, strerror(-r), -r);
  1596. abort();
  1597. }
  1598. }
  1599. static void kvm_io_ioeventfd_del(MemoryListener *listener,
  1600. MemoryRegionSection *section,
  1601. bool match_data, uint64_t data,
  1602. EventNotifier *e)
  1603. {
  1604. int fd = event_notifier_get_fd(e);
  1605. int r;
  1606. r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
  1607. data, false, int128_get64(section->size),
  1608. match_data);
  1609. if (r < 0) {
  1610. fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
  1611. __func__, strerror(-r), -r);
  1612. abort();
  1613. }
  1614. }
  1615. void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
  1616. AddressSpace *as, int as_id, const char *name)
  1617. {
  1618. int i;
  1619. kml->as_id = as_id;
  1620. kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT);
  1621. QSIMPLEQ_INIT(&kml->transaction_add);
  1622. QSIMPLEQ_INIT(&kml->transaction_del);
  1623. kml->listener.region_add = kvm_region_add;
  1624. kml->listener.region_del = kvm_region_del;
  1625. kml->listener.commit = kvm_region_commit;
  1626. kml->listener.log_start = kvm_log_start;
  1627. kml->listener.log_stop = kvm_log_stop;
  1628. kml->listener.priority = MEMORY_LISTENER_PRIORITY_ACCEL;
  1629. kml->listener.name = name;
  1630. if (s->kvm_dirty_ring_size) {
  1631. kml->listener.log_sync_global = kvm_log_sync_global;
  1632. } else {
  1633. kml->listener.log_sync = kvm_log_sync;
  1634. kml->listener.log_clear = kvm_log_clear;
  1635. }
  1636. memory_listener_register(&kml->listener, as);
  1637. for (i = 0; i < s->nr_as; ++i) {
  1638. if (!s->as[i].as) {
  1639. s->as[i].as = as;
  1640. s->as[i].ml = kml;
  1641. break;
  1642. }
  1643. }
  1644. }
  1645. static MemoryListener kvm_io_listener = {
  1646. .name = "kvm-io",
  1647. .coalesced_io_add = kvm_coalesce_pio_add,
  1648. .coalesced_io_del = kvm_coalesce_pio_del,
  1649. .eventfd_add = kvm_io_ioeventfd_add,
  1650. .eventfd_del = kvm_io_ioeventfd_del,
  1651. .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND,
  1652. };
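/*
 * kvm_set_irq() raises or lowers an in-kernel interrupt line.  When the
 * kernel supports KVM_IRQ_LINE_STATUS, the returned status reflects whether
 * the interrupt was delivered or coalesced; with plain KVM_IRQ_LINE we can
 * only report success (1).
 */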
  1653. int kvm_set_irq(KVMState *s, int irq, int level)
  1654. {
  1655. struct kvm_irq_level event;
  1656. int ret;
  1657. assert(kvm_async_interrupts_enabled());
  1658. event.level = level;
  1659. event.irq = irq;
  1660. ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
  1661. if (ret < 0) {
  1662. perror("kvm_set_irq");
  1663. abort();
  1664. }
  1665. return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
  1666. }
  1667. #ifdef KVM_CAP_IRQ_ROUTING
  1668. typedef struct KVMMSIRoute {
  1669. struct kvm_irq_routing_entry kroute;
  1670. QTAILQ_ENTRY(KVMMSIRoute) entry;
  1671. } KVMMSIRoute;
  1672. static void set_gsi(KVMState *s, unsigned int gsi)
  1673. {
  1674. set_bit(gsi, s->used_gsi_bitmap);
  1675. }
  1676. static void clear_gsi(KVMState *s, unsigned int gsi)
  1677. {
  1678. clear_bit(gsi, s->used_gsi_bitmap);
  1679. }
  1680. void kvm_init_irq_routing(KVMState *s)
  1681. {
  1682. int gsi_count;
  1683. gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
  1684. if (gsi_count > 0) {
  1685. /* Round up so we can search ints using ffs */
  1686. s->used_gsi_bitmap = bitmap_new(gsi_count);
  1687. s->gsi_count = gsi_count;
  1688. }
  1689. s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
  1690. s->nr_allocated_irq_routes = 0;
  1691. kvm_arch_init_irq_routing(s);
  1692. }
  1693. void kvm_irqchip_commit_routes(KVMState *s)
  1694. {
  1695. int ret;
  1696. if (kvm_gsi_direct_mapping()) {
  1697. return;
  1698. }
  1699. if (!kvm_gsi_routing_enabled()) {
  1700. return;
  1701. }
  1702. s->irq_routes->flags = 0;
  1703. trace_kvm_irqchip_commit_routes();
  1704. ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
  1705. assert(ret == 0);
  1706. }
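/*
 * GSI routing table management.  kvm_add_routing_entry() grows the table
 * geometrically (doubling, starting at 64 entries) and marks the GSI as used
 * in the allocation bitmap; changes only take effect once the caller pushes
 * the table to the kernel via kvm_irqchip_commit_routes().
 */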
  1707. void kvm_add_routing_entry(KVMState *s,
  1708. struct kvm_irq_routing_entry *entry)
  1709. {
  1710. struct kvm_irq_routing_entry *new;
  1711. int n, size;
  1712. if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
  1713. n = s->nr_allocated_irq_routes * 2;
  1714. if (n < 64) {
  1715. n = 64;
  1716. }
  1717. size = sizeof(struct kvm_irq_routing);
  1718. size += n * sizeof(*new);
  1719. s->irq_routes = g_realloc(s->irq_routes, size);
  1720. s->nr_allocated_irq_routes = n;
  1721. }
  1722. n = s->irq_routes->nr++;
  1723. new = &s->irq_routes->entries[n];
  1724. *new = *entry;
  1725. set_gsi(s, entry->gsi);
  1726. }
  1727. static int kvm_update_routing_entry(KVMState *s,
  1728. struct kvm_irq_routing_entry *new_entry)
  1729. {
  1730. struct kvm_irq_routing_entry *entry;
  1731. int n;
  1732. for (n = 0; n < s->irq_routes->nr; n++) {
  1733. entry = &s->irq_routes->entries[n];
  1734. if (entry->gsi != new_entry->gsi) {
  1735. continue;
  1736. }
1737. if (!memcmp(entry, new_entry, sizeof *entry)) {
  1738. return 0;
  1739. }
  1740. *entry = *new_entry;
  1741. return 0;
  1742. }
  1743. return -ESRCH;
  1744. }
  1745. void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
  1746. {
  1747. struct kvm_irq_routing_entry e = {};
  1748. assert(pin < s->gsi_count);
  1749. e.gsi = irq;
  1750. e.type = KVM_IRQ_ROUTING_IRQCHIP;
  1751. e.flags = 0;
  1752. e.u.irqchip.irqchip = irqchip;
  1753. e.u.irqchip.pin = pin;
  1754. kvm_add_routing_entry(s, &e);
  1755. }
  1756. void kvm_irqchip_release_virq(KVMState *s, int virq)
  1757. {
  1758. struct kvm_irq_routing_entry *e;
  1759. int i;
  1760. if (kvm_gsi_direct_mapping()) {
  1761. return;
  1762. }
  1763. for (i = 0; i < s->irq_routes->nr; i++) {
  1764. e = &s->irq_routes->entries[i];
  1765. if (e->gsi == virq) {
  1766. s->irq_routes->nr--;
  1767. *e = s->irq_routes->entries[s->irq_routes->nr];
  1768. }
  1769. }
  1770. clear_gsi(s, virq);
  1771. kvm_arch_release_virq_post(virq);
  1772. trace_kvm_irqchip_release_virq(virq);
  1773. }
  1774. void kvm_irqchip_add_change_notifier(Notifier *n)
  1775. {
  1776. notifier_list_add(&kvm_irqchip_change_notifiers, n);
  1777. }
  1778. void kvm_irqchip_remove_change_notifier(Notifier *n)
  1779. {
  1780. notifier_remove(n);
  1781. }
  1782. void kvm_irqchip_change_notify(void)
  1783. {
  1784. notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
  1785. }
  1786. int kvm_irqchip_get_virq(KVMState *s)
  1787. {
  1788. int next_virq;
  1789. /* Return the lowest unused GSI in the bitmap */
  1790. next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
  1791. if (next_virq >= s->gsi_count) {
  1792. return -ENOSPC;
  1793. } else {
  1794. return next_virq;
  1795. }
  1796. }
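/*
 * MSI injection and routing.  kvm_irqchip_send_msi() delivers a one-off MSI
 * via KVM_SIGNAL_MSI.  kvm_irqchip_add_msi_route() instead allocates a virq,
 * installs an MSI routing entry for it (with a device ID where the
 * architecture requires one) and lets the arch hook adjust the route; the
 * caller commits the accumulated route changes later.
 */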
  1797. int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
  1798. {
  1799. struct kvm_msi msi;
  1800. msi.address_lo = (uint32_t)msg.address;
  1801. msi.address_hi = msg.address >> 32;
  1802. msi.data = le32_to_cpu(msg.data);
  1803. msi.flags = 0;
  1804. memset(msi.pad, 0, sizeof(msi.pad));
  1805. return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
  1806. }
  1807. int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
  1808. {
  1809. struct kvm_irq_routing_entry kroute = {};
  1810. int virq;
  1811. KVMState *s = c->s;
  1812. MSIMessage msg = {0, 0};
  1813. if (pci_available && dev) {
  1814. msg = pci_get_msi_message(dev, vector);
  1815. }
  1816. if (kvm_gsi_direct_mapping()) {
  1817. return kvm_arch_msi_data_to_gsi(msg.data);
  1818. }
  1819. if (!kvm_gsi_routing_enabled()) {
  1820. return -ENOSYS;
  1821. }
  1822. virq = kvm_irqchip_get_virq(s);
  1823. if (virq < 0) {
  1824. return virq;
  1825. }
  1826. kroute.gsi = virq;
  1827. kroute.type = KVM_IRQ_ROUTING_MSI;
  1828. kroute.flags = 0;
  1829. kroute.u.msi.address_lo = (uint32_t)msg.address;
  1830. kroute.u.msi.address_hi = msg.address >> 32;
  1831. kroute.u.msi.data = le32_to_cpu(msg.data);
  1832. if (pci_available && kvm_msi_devid_required()) {
  1833. kroute.flags = KVM_MSI_VALID_DEVID;
  1834. kroute.u.msi.devid = pci_requester_id(dev);
  1835. }
  1836. if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
  1837. kvm_irqchip_release_virq(s, virq);
  1838. return -EINVAL;
  1839. }
  1840. if (s->irq_routes->nr < s->gsi_count) {
  1841. trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
  1842. vector, virq);
  1843. kvm_add_routing_entry(s, &kroute);
  1844. kvm_arch_add_msi_route_post(&kroute, vector, dev);
  1845. c->changes++;
  1846. } else {
  1847. kvm_irqchip_release_virq(s, virq);
  1848. return -ENOSPC;
  1849. }
  1850. return virq;
  1851. }
  1852. int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
  1853. PCIDevice *dev)
  1854. {
  1855. struct kvm_irq_routing_entry kroute = {};
  1856. if (kvm_gsi_direct_mapping()) {
  1857. return 0;
  1858. }
  1859. if (!kvm_irqchip_in_kernel()) {
  1860. return -ENOSYS;
  1861. }
  1862. kroute.gsi = virq;
  1863. kroute.type = KVM_IRQ_ROUTING_MSI;
  1864. kroute.flags = 0;
  1865. kroute.u.msi.address_lo = (uint32_t)msg.address;
  1866. kroute.u.msi.address_hi = msg.address >> 32;
  1867. kroute.u.msi.data = le32_to_cpu(msg.data);
  1868. if (pci_available && kvm_msi_devid_required()) {
  1869. kroute.flags = KVM_MSI_VALID_DEVID;
  1870. kroute.u.msi.devid = pci_requester_id(dev);
  1871. }
  1872. if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
  1873. return -EINVAL;
  1874. }
  1875. trace_kvm_irqchip_update_msi_route(virq);
  1876. return kvm_update_routing_entry(s, &kroute);
  1877. }
  1878. static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
  1879. EventNotifier *resample, int virq,
  1880. bool assign)
  1881. {
  1882. int fd = event_notifier_get_fd(event);
  1883. int rfd = resample ? event_notifier_get_fd(resample) : -1;
  1884. struct kvm_irqfd irqfd = {
  1885. .fd = fd,
  1886. .gsi = virq,
  1887. .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
  1888. };
  1889. if (rfd != -1) {
  1890. assert(assign);
  1891. if (kvm_irqchip_is_split()) {
  1892. /*
  1893. * When the slow irqchip (e.g. IOAPIC) is in the
  1894. * userspace, KVM kernel resamplefd will not work because
  1895. * the EOI of the interrupt will be delivered to userspace
  1896. * instead, so the KVM kernel resamplefd kick will be
1897. * skipped.  Userspace here mimics what the kernel provides
1898. * with resamplefd: remember the resamplefd and kick it when
1899. * we receive the EOI of this IRQ.
  1900. *
  1901. * This is hackery because IOAPIC is mostly bypassed
  1902. * (except EOI broadcasts) when irqfd is used. However
  1903. * this can bring much performance back for split irqchip
  1904. * with INTx IRQs (for VFIO, this gives 93% perf of the
1905. * full fast path, which is a 46% perf boost compared to
  1906. * the INTx slow path).
  1907. */
  1908. kvm_resample_fd_insert(virq, resample);
  1909. } else {
  1910. irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
  1911. irqfd.resamplefd = rfd;
  1912. }
  1913. } else if (!assign) {
  1914. if (kvm_irqchip_is_split()) {
  1915. kvm_resample_fd_remove(virq);
  1916. }
  1917. }
  1918. return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
  1919. }
  1920. #else /* !KVM_CAP_IRQ_ROUTING */
  1921. void kvm_init_irq_routing(KVMState *s)
  1922. {
  1923. }
  1924. void kvm_irqchip_release_virq(KVMState *s, int virq)
  1925. {
  1926. }
  1927. int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
  1928. {
  1929. abort();
  1930. }
  1931. int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
  1932. {
  1933. return -ENOSYS;
  1934. }
  1935. int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
  1936. {
  1937. return -ENOSYS;
  1938. }
  1939. int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
  1940. {
  1941. return -ENOSYS;
  1942. }
  1943. static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
  1944. EventNotifier *resample, int virq,
  1945. bool assign)
  1946. {
  1947. abort();
  1948. }
  1949. int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
  1950. {
  1951. return -ENOSYS;
  1952. }
  1953. #endif /* !KVM_CAP_IRQ_ROUTING */
  1954. int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
  1955. EventNotifier *rn, int virq)
  1956. {
  1957. return kvm_irqchip_assign_irqfd(s, n, rn, virq, true);
  1958. }
  1959. int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
  1960. int virq)
  1961. {
  1962. return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false);
  1963. }
  1964. int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
  1965. EventNotifier *rn, qemu_irq irq)
  1966. {
  1967. gpointer key, gsi;
  1968. gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
  1969. if (!found) {
  1970. return -ENXIO;
  1971. }
  1972. return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
  1973. }
  1974. int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
  1975. qemu_irq irq)
  1976. {
  1977. gpointer key, gsi;
  1978. gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
  1979. if (!found) {
  1980. return -ENXIO;
  1981. }
  1982. return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
  1983. }
  1984. void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
  1985. {
  1986. g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
  1987. }
  1988. static void kvm_irqchip_create(KVMState *s)
  1989. {
  1990. int ret;
  1991. assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
  1992. if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
  1993. ;
  1994. } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
  1995. ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
  1996. if (ret < 0) {
  1997. fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
  1998. exit(1);
  1999. }
  2000. } else {
  2001. return;
  2002. }
  2003. if (kvm_check_extension(s, KVM_CAP_IRQFD) <= 0) {
  2004. fprintf(stderr, "kvm: irqfd not implemented\n");
  2005. exit(1);
  2006. }
2007. /* First probe and see if there's an arch-specific hook to create the
  2008. * in-kernel irqchip for us */
  2009. ret = kvm_arch_irqchip_create(s);
  2010. if (ret == 0) {
  2011. if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
  2012. error_report("Split IRQ chip mode not supported.");
  2013. exit(1);
  2014. } else {
  2015. ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
  2016. }
  2017. }
  2018. if (ret < 0) {
  2019. fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
  2020. exit(1);
  2021. }
  2022. kvm_kernel_irqchip = true;
  2023. /* If we have an in-kernel IRQ chip then we must have asynchronous
  2024. * interrupt delivery (though the reverse is not necessarily true)
  2025. */
  2026. kvm_async_interrupts_allowed = true;
  2027. kvm_halt_in_kernel_allowed = true;
  2028. kvm_init_irq_routing(s);
  2029. s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
  2030. }
  2031. /* Find number of supported CPUs using the recommended
  2032. * procedure from the kernel API documentation to cope with
  2033. * older kernels that may be missing capabilities.
  2034. */
  2035. static int kvm_recommended_vcpus(KVMState *s)
  2036. {
  2037. int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
  2038. return (ret) ? ret : 4;
  2039. }
  2040. static int kvm_max_vcpus(KVMState *s)
  2041. {
  2042. int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
  2043. return (ret) ? ret : kvm_recommended_vcpus(s);
  2044. }
  2045. static int kvm_max_vcpu_id(KVMState *s)
  2046. {
  2047. int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
  2048. return (ret) ? ret : kvm_max_vcpus(s);
  2049. }
  2050. bool kvm_vcpu_id_is_valid(int vcpu_id)
  2051. {
  2052. KVMState *s = KVM_STATE(current_accel());
  2053. return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
  2054. }
  2055. bool kvm_dirty_ring_enabled(void)
  2056. {
  2057. return kvm_state && kvm_state->kvm_dirty_ring_size;
  2058. }
  2059. static void query_stats_cb(StatsResultList **result, StatsTarget target,
  2060. strList *names, strList *targets, Error **errp);
  2061. static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp);
  2062. uint32_t kvm_dirty_ring_size(void)
  2063. {
  2064. return kvm_state->kvm_dirty_ring_size;
  2065. }
  2066. static int do_kvm_create_vm(MachineState *ms, int type)
  2067. {
  2068. KVMState *s;
  2069. int ret;
  2070. s = KVM_STATE(ms->accelerator);
  2071. do {
  2072. ret = kvm_ioctl(s, KVM_CREATE_VM, type);
  2073. } while (ret == -EINTR);
  2074. if (ret < 0) {
  2075. error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret));
  2076. #ifdef TARGET_S390X
  2077. if (ret == -EINVAL) {
  2078. error_printf("Host kernel setup problem detected."
  2079. " Please verify:\n");
  2080. error_printf("- for kernels supporting the"
  2081. " switch_amode or user_mode parameters, whether");
  2082. error_printf(" user space is running in primary address space\n");
  2083. error_printf("- for kernels supporting the vm.allocate_pgste"
  2084. " sysctl, whether it is enabled\n");
  2085. }
  2086. #elif defined(TARGET_PPC)
  2087. if (ret == -EINVAL) {
  2088. error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
  2089. (type == 2) ? "pr" : "hv");
  2090. }
  2091. #endif
  2092. }
  2093. return ret;
  2094. }
  2095. static int find_kvm_machine_type(MachineState *ms)
  2096. {
  2097. MachineClass *mc = MACHINE_GET_CLASS(ms);
  2098. int type;
  2099. if (object_property_find(OBJECT(current_machine), "kvm-type")) {
  2100. g_autofree char *kvm_type;
  2101. kvm_type = object_property_get_str(OBJECT(current_machine),
  2102. "kvm-type",
  2103. &error_abort);
  2104. type = mc->kvm_type(ms, kvm_type);
  2105. } else if (mc->kvm_type) {
  2106. type = mc->kvm_type(ms, NULL);
  2107. } else {
  2108. type = kvm_arch_get_default_type(ms);
  2109. }
  2110. return type;
  2111. }
  2112. static int kvm_setup_dirty_ring(KVMState *s)
  2113. {
  2114. uint64_t dirty_log_manual_caps;
  2115. int ret;
  2116. /*
  2117. * Enable KVM dirty ring if supported, otherwise fall back to
  2118. * dirty logging mode
  2119. */
  2120. ret = kvm_dirty_ring_init(s);
  2121. if (ret < 0) {
  2122. return ret;
  2123. }
  2124. /*
  2125. * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
  2126. * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
2127. * page is wr-protected initially, which conflicts with how the kvm dirty
2128. * ring is used - the dirty ring requires all pages to be wr-protected at
2129. * the very beginning.  Enabling this feature with the dirty ring causes data corruption.
  2130. *
  2131. * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
  2132. * we may expect a higher stall time when starting the migration. In the
  2133. * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
  2134. * instead of clearing dirty bit, it can be a way to explicitly wr-protect
  2135. * guest pages.
  2136. */
  2137. if (!s->kvm_dirty_ring_size) {
  2138. dirty_log_manual_caps =
  2139. kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
  2140. dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
  2141. KVM_DIRTY_LOG_INITIALLY_SET);
  2142. s->manual_dirty_log_protect = dirty_log_manual_caps;
  2143. if (dirty_log_manual_caps) {
  2144. ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
  2145. dirty_log_manual_caps);
  2146. if (ret) {
  2147. warn_report("Trying to enable capability %"PRIu64" of "
  2148. "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
  2149. "Falling back to the legacy mode. ",
  2150. dirty_log_manual_caps);
  2151. s->manual_dirty_log_protect = 0;
  2152. }
  2153. }
  2154. }
  2155. return 0;
  2156. }
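/*
 * kvm_init() brings up the whole accelerator: open /dev/kvm, check the API
 * version, create the VM with the machine-specific type, validate vCPU
 * limits and required capabilities, configure dirty tracking, create the
 * in-kernel irqchip if allowed, and finally register the memory listeners
 * so that guest RAM gets mapped into KVM memslots.
 */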
  2157. static int kvm_init(MachineState *ms)
  2158. {
  2159. MachineClass *mc = MACHINE_GET_CLASS(ms);
  2160. static const char upgrade_note[] =
  2161. "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
  2162. "(see http://sourceforge.net/projects/kvm).\n";
  2163. const struct {
  2164. const char *name;
  2165. int num;
  2166. } num_cpus[] = {
  2167. { "SMP", ms->smp.cpus },
  2168. { "hotpluggable", ms->smp.max_cpus },
  2169. { /* end of list */ }
  2170. }, *nc = num_cpus;
  2171. int soft_vcpus_limit, hard_vcpus_limit;
  2172. KVMState *s;
  2173. const KVMCapabilityInfo *missing_cap;
  2174. int ret;
  2175. int type;
  2176. qemu_mutex_init(&kml_slots_lock);
  2177. s = KVM_STATE(ms->accelerator);
  2178. /*
  2179. * On systems where the kernel can support different base page
  2180. * sizes, host page size may be different from TARGET_PAGE_SIZE,
  2181. * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum
  2182. * page size for the system though.
  2183. */
  2184. assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size());
  2185. s->sigmask_len = 8;
  2186. accel_blocker_init();
  2187. #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
  2188. QTAILQ_INIT(&s->kvm_sw_breakpoints);
  2189. #endif
  2190. QLIST_INIT(&s->kvm_parked_vcpus);
  2191. s->fd = qemu_open_old(s->device ?: "/dev/kvm", O_RDWR);
  2192. if (s->fd == -1) {
  2193. error_report("Could not access KVM kernel module: %m");
  2194. ret = -errno;
  2195. goto err;
  2196. }
  2197. ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
  2198. if (ret < KVM_API_VERSION) {
  2199. if (ret >= 0) {
  2200. ret = -EINVAL;
  2201. }
  2202. error_report("kvm version too old");
  2203. goto err;
  2204. }
  2205. if (ret > KVM_API_VERSION) {
  2206. ret = -EINVAL;
  2207. error_report("kvm version not supported");
  2208. goto err;
  2209. }
  2210. kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
  2211. s->nr_slots_max = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
  2212. /* If unspecified, use the default value */
  2213. if (!s->nr_slots_max) {
  2214. s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT;
  2215. }
  2216. type = find_kvm_machine_type(ms);
  2217. if (type < 0) {
  2218. ret = -EINVAL;
  2219. goto err;
  2220. }
  2221. ret = do_kvm_create_vm(ms, type);
  2222. if (ret < 0) {
  2223. goto err;
  2224. }
  2225. s->vmfd = ret;
  2226. s->nr_as = kvm_vm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
  2227. if (s->nr_as <= 1) {
  2228. s->nr_as = 1;
  2229. }
  2230. s->as = g_new0(struct KVMAs, s->nr_as);
  2231. /* check the vcpu limits */
  2232. soft_vcpus_limit = kvm_recommended_vcpus(s);
  2233. hard_vcpus_limit = kvm_max_vcpus(s);
  2234. while (nc->name) {
  2235. if (nc->num > soft_vcpus_limit) {
  2236. warn_report("Number of %s cpus requested (%d) exceeds "
  2237. "the recommended cpus supported by KVM (%d)",
  2238. nc->name, nc->num, soft_vcpus_limit);
  2239. if (nc->num > hard_vcpus_limit) {
  2240. error_report("Number of %s cpus requested (%d) exceeds "
  2241. "the maximum cpus supported by KVM (%d)",
  2242. nc->name, nc->num, hard_vcpus_limit);
  2243. exit(1);
  2244. }
  2245. }
  2246. nc++;
  2247. }
  2248. missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
  2249. if (!missing_cap) {
  2250. missing_cap =
  2251. kvm_check_extension_list(s, kvm_arch_required_capabilities);
  2252. }
  2253. if (missing_cap) {
  2254. ret = -EINVAL;
  2255. error_report("kvm does not support %s", missing_cap->name);
  2256. error_printf("%s", upgrade_note);
  2257. goto err;
  2258. }
  2259. s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
  2260. s->coalesced_pio = s->coalesced_mmio &&
  2261. kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
  2262. ret = kvm_setup_dirty_ring(s);
  2263. if (ret < 0) {
  2264. goto err;
  2265. }
  2266. #ifdef KVM_CAP_VCPU_EVENTS
  2267. s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
  2268. #endif
  2269. s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
  2270. s->irq_set_ioctl = KVM_IRQ_LINE;
  2271. if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
  2272. s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
  2273. }
  2274. kvm_readonly_mem_allowed =
  2275. (kvm_vm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
  2276. kvm_resamplefds_allowed =
  2277. (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
  2278. kvm_vm_attributes_allowed =
  2279. (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
  2280. #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
  2281. kvm_has_guest_debug =
  2282. (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0);
  2283. #endif
  2284. kvm_sstep_flags = 0;
  2285. if (kvm_has_guest_debug) {
  2286. kvm_sstep_flags = SSTEP_ENABLE;
  2287. #if defined TARGET_KVM_HAVE_GUEST_DEBUG
  2288. int guest_debug_flags =
  2289. kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2);
  2290. if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) {
  2291. kvm_sstep_flags |= SSTEP_NOIRQ;
  2292. }
  2293. #endif
  2294. }
  2295. kvm_state = s;
  2296. ret = kvm_arch_init(ms, s);
  2297. if (ret < 0) {
  2298. goto err;
  2299. }
  2300. kvm_supported_memory_attributes = kvm_vm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
  2301. kvm_guest_memfd_supported =
  2302. kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
  2303. kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
  2304. (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
  2305. if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
  2306. s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
  2307. }
  2308. qemu_register_reset(kvm_unpoison_all, NULL);
  2309. if (s->kernel_irqchip_allowed) {
  2310. kvm_irqchip_create(s);
  2311. }
  2312. s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
  2313. s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
  2314. s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
  2315. s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;
  2316. kvm_memory_listener_register(s, &s->memory_listener,
  2317. &address_space_memory, 0, "kvm-memory");
  2318. memory_listener_register(&kvm_io_listener,
  2319. &address_space_io);
  2320. s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
  2321. if (!s->sync_mmu) {
  2322. ret = ram_block_discard_disable(true);
  2323. assert(!ret);
  2324. }
  2325. if (s->kvm_dirty_ring_size) {
  2326. kvm_dirty_ring_reaper_init(s);
  2327. }
  2328. if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
  2329. add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb,
  2330. query_stats_schemas_cb);
  2331. }
  2332. return 0;
  2333. err:
  2334. assert(ret < 0);
  2335. if (s->vmfd >= 0) {
  2336. close(s->vmfd);
  2337. }
  2338. if (s->fd != -1) {
  2339. close(s->fd);
  2340. }
  2341. g_free(s->as);
  2342. g_free(s->memory_listener.slots);
  2343. return ret;
  2344. }
  2345. void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
  2346. {
  2347. s->sigmask_len = sigmask_len;
  2348. }
  2349. static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
  2350. int size, uint32_t count)
  2351. {
  2352. int i;
  2353. uint8_t *ptr = data;
  2354. for (i = 0; i < count; i++) {
  2355. address_space_rw(&address_space_io, port, attrs,
  2356. ptr, size,
  2357. direction == KVM_EXIT_IO_OUT);
  2358. ptr += size;
  2359. }
  2360. }
  2361. static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
  2362. {
  2363. int i;
  2364. fprintf(stderr, "KVM internal error. Suberror: %d\n",
  2365. run->internal.suberror);
  2366. for (i = 0; i < run->internal.ndata; ++i) {
  2367. fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n",
  2368. i, (uint64_t)run->internal.data[i]);
  2369. }
  2370. if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
  2371. fprintf(stderr, "emulation failure\n");
  2372. if (!kvm_arch_stop_on_emulation_error(cpu)) {
  2373. cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
  2374. return EXCP_INTERRUPT;
  2375. }
  2376. }
  2377. /* FIXME: Should trigger a qmp message to let management know
  2378. * something went wrong.
  2379. */
  2380. return -1;
  2381. }
  2382. void kvm_flush_coalesced_mmio_buffer(void)
  2383. {
  2384. KVMState *s = kvm_state;
  2385. if (!s || s->coalesced_flush_in_progress) {
  2386. return;
  2387. }
  2388. s->coalesced_flush_in_progress = true;
  2389. if (s->coalesced_mmio_ring) {
  2390. struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
  2391. while (ring->first != ring->last) {
  2392. struct kvm_coalesced_mmio *ent;
  2393. ent = &ring->coalesced_mmio[ring->first];
  2394. if (ent->pio == 1) {
  2395. address_space_write(&address_space_io, ent->phys_addr,
  2396. MEMTXATTRS_UNSPECIFIED, ent->data,
  2397. ent->len);
  2398. } else {
  2399. cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
  2400. }
  2401. smp_wmb();
  2402. ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
  2403. }
  2404. }
  2405. s->coalesced_flush_in_progress = false;
  2406. }
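/*
 * Register synchronization protocol: cpu->vcpu_dirty tracks whether QEMU's
 * copy of the vCPU state is newer than the kernel's.  synchronize_state
 * pulls registers from KVM and marks the state dirty so it is written back
 * before the next KVM_RUN; post_reset/post_init push a full state and clear
 * the flag.  Protected guests skip the pull since their register state is
 * not accessible to userspace.
 */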
  2407. static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
  2408. {
  2409. if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
  2410. Error *err = NULL;
  2411. int ret = kvm_arch_get_registers(cpu, &err);
  2412. if (ret) {
  2413. if (err) {
  2414. error_reportf_err(err, "Failed to synchronize CPU state: ");
  2415. } else {
  2416. error_report("Failed to get registers: %s", strerror(-ret));
  2417. }
  2418. cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
  2419. vm_stop(RUN_STATE_INTERNAL_ERROR);
  2420. }
  2421. cpu->vcpu_dirty = true;
  2422. }
  2423. }
  2424. void kvm_cpu_synchronize_state(CPUState *cpu)
  2425. {
  2426. if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
  2427. run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
  2428. }
  2429. }
  2430. static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
  2431. {
  2432. Error *err = NULL;
  2433. int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE, &err);
  2434. if (ret) {
  2435. if (err) {
  2436. error_reportf_err(err, "Restoring resisters after reset: ");
  2437. } else {
  2438. error_report("Failed to put registers after reset: %s",
  2439. strerror(-ret));
  2440. }
  2441. cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
  2442. vm_stop(RUN_STATE_INTERNAL_ERROR);
  2443. }
  2444. cpu->vcpu_dirty = false;
  2445. }
  2446. void kvm_cpu_synchronize_post_reset(CPUState *cpu)
  2447. {
  2448. run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
  2449. }
  2450. static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
  2451. {
  2452. Error *err = NULL;
  2453. int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE, &err);
  2454. if (ret) {
  2455. if (err) {
  2456. error_reportf_err(err, "Putting registers after init: ");
  2457. } else {
  2458. error_report("Failed to put registers after init: %s",
  2459. strerror(-ret));
  2460. }
  2461. exit(1);
  2462. }
  2463. cpu->vcpu_dirty = false;
  2464. }
  2465. void kvm_cpu_synchronize_post_init(CPUState *cpu)
  2466. {
  2467. if (!kvm_state->guest_state_protected) {
  2468. /*
  2469. * This runs before the machine_init_done notifiers, and is the last
  2470. * opportunity to synchronize the state of confidential guests.
  2471. */
  2472. run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
  2473. }
  2474. }
  2475. static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
  2476. {
  2477. cpu->vcpu_dirty = true;
  2478. }
  2479. void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
  2480. {
  2481. run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
  2482. }
  2483. #ifdef KVM_HAVE_MCE_INJECTION
  2484. static __thread void *pending_sigbus_addr;
  2485. static __thread int pending_sigbus_code;
  2486. static __thread bool have_sigbus_pending;
  2487. #endif
  2488. static void kvm_cpu_kick(CPUState *cpu)
  2489. {
  2490. qatomic_set(&cpu->kvm_run->immediate_exit, 1);
  2491. }
  2492. static void kvm_cpu_kick_self(void)
  2493. {
  2494. if (kvm_immediate_exit) {
  2495. kvm_cpu_kick(current_cpu);
  2496. } else {
  2497. qemu_cpu_kick_self();
  2498. }
  2499. }
  2500. static void kvm_eat_signals(CPUState *cpu)
  2501. {
  2502. struct timespec ts = { 0, 0 };
  2503. siginfo_t siginfo;
  2504. sigset_t waitset;
  2505. sigset_t chkset;
  2506. int r;
  2507. if (kvm_immediate_exit) {
  2508. qatomic_set(&cpu->kvm_run->immediate_exit, 0);
  2509. /* Write kvm_run->immediate_exit before the cpu->exit_request
  2510. * write in kvm_cpu_exec.
  2511. */
  2512. smp_wmb();
  2513. return;
  2514. }
  2515. sigemptyset(&waitset);
  2516. sigaddset(&waitset, SIG_IPI);
  2517. do {
  2518. r = sigtimedwait(&waitset, &siginfo, &ts);
  2519. if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
  2520. perror("sigtimedwait");
  2521. exit(1);
  2522. }
  2523. r = sigpending(&chkset);
  2524. if (r == -1) {
  2525. perror("sigpending");
  2526. exit(1);
  2527. }
  2528. } while (sigismember(&chkset, SIG_IPI));
  2529. }
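/*
 * kvm_convert_memory() flips a guest_memfd-backed range between private and
 * shared in response to explicit conversions or KVM_EXIT_MEMORY_FAULT.
 * After updating the KVM memory attributes, the now-unused backing is
 * discarded: the shared (host) side when converting to private, the
 * guest_memfd side when converting to shared.
 */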
  2530. int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
  2531. {
  2532. MemoryRegionSection section;
  2533. ram_addr_t offset;
  2534. MemoryRegion *mr;
  2535. RAMBlock *rb;
  2536. void *addr;
  2537. int ret = -1;
  2538. trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared");
  2539. if (!QEMU_PTR_IS_ALIGNED(start, qemu_real_host_page_size()) ||
  2540. !QEMU_PTR_IS_ALIGNED(size, qemu_real_host_page_size())) {
  2541. return -1;
  2542. }
  2543. if (!size) {
  2544. return -1;
  2545. }
  2546. section = memory_region_find(get_system_memory(), start, size);
  2547. mr = section.mr;
  2548. if (!mr) {
  2549. /*
  2550. * Ignore converting non-assigned region to shared.
  2551. *
  2552. * TDX requires vMMIO region to be shared to inject #VE to guest.
2553. * OVMF conservatively issues MapGPA(shared) on the 32bit PCI MMIO region,
  2554. * and vIO-APIC 0xFEC00000 4K page.
  2555. * OVMF assigns 32bit PCI MMIO region to
  2556. * [top of low memory: typically 2GB=0xC000000, 0xFC00000)
  2557. */
  2558. if (!to_private) {
  2559. return 0;
  2560. }
  2561. return -1;
  2562. }
  2563. if (!memory_region_has_guest_memfd(mr)) {
  2564. /*
  2565. * Because vMMIO region must be shared, guest TD may convert vMMIO
  2566. * region to shared explicitly. Don't complain such case. See
  2567. * memory_region_type() for checking if the region is MMIO region.
  2568. */
  2569. if (!to_private &&
  2570. !memory_region_is_ram(mr) &&
  2571. !memory_region_is_ram_device(mr) &&
  2572. !memory_region_is_rom(mr) &&
  2573. !memory_region_is_romd(mr)) {
  2574. ret = 0;
  2575. } else {
  2576. error_report("Convert non guest_memfd backed memory region "
  2577. "(0x%"HWADDR_PRIx" ,+ 0x%"HWADDR_PRIx") to %s",
  2578. start, size, to_private ? "private" : "shared");
  2579. }
  2580. goto out_unref;
  2581. }
  2582. if (to_private) {
  2583. ret = kvm_set_memory_attributes_private(start, size);
  2584. } else {
  2585. ret = kvm_set_memory_attributes_shared(start, size);
  2586. }
  2587. if (ret) {
  2588. goto out_unref;
  2589. }
  2590. addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
  2591. rb = qemu_ram_block_from_host(addr, false, &offset);
  2592. if (to_private) {
  2593. if (rb->page_size != qemu_real_host_page_size()) {
  2594. /*
  2595. * shared memory is backed by hugetlb, which is supposed to be
  2596. * pre-allocated and doesn't need to be discarded
  2597. */
  2598. goto out_unref;
  2599. }
  2600. ret = ram_block_discard_range(rb, offset, size);
  2601. } else {
  2602. ret = ram_block_discard_guest_memfd_range(rb, offset, size);
  2603. }
  2604. out_unref:
  2605. memory_region_unref(mr);
  2606. return ret;
  2607. }
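/*
 * Main vCPU run loop.  Outside the BQL we push dirty registers, enter
 * KVM_RUN, and dispatch on run->exit_reason; I/O and MMIO exits are handled
 * directly, everything else either completes here or is forwarded to
 * kvm_arch_handle_exit().  A return value of 0 keeps looping, EXCP_* values
 * return control to the generic cpu loop, and negative values stop the VM
 * with an internal error.
 */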
  2608. int kvm_cpu_exec(CPUState *cpu)
  2609. {
  2610. struct kvm_run *run = cpu->kvm_run;
  2611. int ret, run_ret;
  2612. trace_kvm_cpu_exec();
  2613. if (kvm_arch_process_async_events(cpu)) {
  2614. qatomic_set(&cpu->exit_request, 0);
  2615. return EXCP_HLT;
  2616. }
  2617. bql_unlock();
  2618. cpu_exec_start(cpu);
  2619. do {
  2620. MemTxAttrs attrs;
  2621. if (cpu->vcpu_dirty) {
  2622. Error *err = NULL;
  2623. ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE, &err);
  2624. if (ret) {
  2625. if (err) {
  2626. error_reportf_err(err, "Putting registers after init: ");
  2627. } else {
  2628. error_report("Failed to put registers after init: %s",
  2629. strerror(-ret));
  2630. }
  2631. ret = -1;
  2632. break;
  2633. }
  2634. cpu->vcpu_dirty = false;
  2635. }
  2636. kvm_arch_pre_run(cpu, run);
  2637. if (qatomic_read(&cpu->exit_request)) {
  2638. trace_kvm_interrupt_exit_request();
  2639. /*
  2640. * KVM requires us to reenter the kernel after IO exits to complete
  2641. * instruction emulation. This self-signal will ensure that we
  2642. * leave ASAP again.
  2643. */
  2644. kvm_cpu_kick_self();
  2645. }
  2646. /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
  2647. * Matching barrier in kvm_eat_signals.
  2648. */
  2649. smp_rmb();
  2650. run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
  2651. attrs = kvm_arch_post_run(cpu, run);
  2652. #ifdef KVM_HAVE_MCE_INJECTION
  2653. if (unlikely(have_sigbus_pending)) {
  2654. bql_lock();
  2655. kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
  2656. pending_sigbus_addr);
  2657. have_sigbus_pending = false;
  2658. bql_unlock();
  2659. }
  2660. #endif
  2661. if (run_ret < 0) {
  2662. if (run_ret == -EINTR || run_ret == -EAGAIN) {
  2663. trace_kvm_io_window_exit();
  2664. kvm_eat_signals(cpu);
  2665. ret = EXCP_INTERRUPT;
  2666. break;
  2667. }
  2668. if (!(run_ret == -EFAULT && run->exit_reason == KVM_EXIT_MEMORY_FAULT)) {
  2669. fprintf(stderr, "error: kvm run failed %s\n",
  2670. strerror(-run_ret));
  2671. #ifdef TARGET_PPC
  2672. if (run_ret == -EBUSY) {
  2673. fprintf(stderr,
  2674. "This is probably because your SMT is enabled.\n"
  2675. "VCPU can only run on primary threads with all "
  2676. "secondary threads offline.\n");
  2677. }
  2678. #endif
  2679. ret = -1;
  2680. break;
  2681. }
  2682. }
  2683. trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
  2684. switch (run->exit_reason) {
  2685. case KVM_EXIT_IO:
  2686. /* Called outside BQL */
  2687. kvm_handle_io(run->io.port, attrs,
  2688. (uint8_t *)run + run->io.data_offset,
  2689. run->io.direction,
  2690. run->io.size,
  2691. run->io.count);
  2692. ret = 0;
  2693. break;
  2694. case KVM_EXIT_MMIO:
  2695. /* Called outside BQL */
  2696. address_space_rw(&address_space_memory,
  2697. run->mmio.phys_addr, attrs,
  2698. run->mmio.data,
  2699. run->mmio.len,
  2700. run->mmio.is_write);
  2701. ret = 0;
  2702. break;
  2703. case KVM_EXIT_IRQ_WINDOW_OPEN:
  2704. ret = EXCP_INTERRUPT;
  2705. break;
  2706. case KVM_EXIT_SHUTDOWN:
  2707. qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
  2708. ret = EXCP_INTERRUPT;
  2709. break;
  2710. case KVM_EXIT_UNKNOWN:
  2711. fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
  2712. (uint64_t)run->hw.hardware_exit_reason);
  2713. ret = -1;
  2714. break;
  2715. case KVM_EXIT_INTERNAL_ERROR:
  2716. ret = kvm_handle_internal_error(cpu, run);
  2717. break;
  2718. case KVM_EXIT_DIRTY_RING_FULL:
  2719. /*
  2720. * We shouldn't continue if the dirty ring of this vcpu is
  2721. * still full. Got kicked by KVM_RESET_DIRTY_RINGS.
  2722. */
  2723. trace_kvm_dirty_ring_full(cpu->cpu_index);
  2724. bql_lock();
  2725. /*
2726. * We throttle a vCPU by making it sleep once it exits the kernel
2727. * because its dirty ring is full. In the dirtylimit scenario,
2728. * reaping all vCPUs after a single vCPU's ring fills up would skip
2729. * that sleep, so just reap the vCPU whose ring is full.
  2730. */
            if (dirtylimit_in_service()) {
                kvm_dirty_ring_reap(kvm_state, cpu);
            } else {
                kvm_dirty_ring_reap(kvm_state, NULL);
            }
            bql_unlock();
            dirtylimit_vcpu_execute(cpu);
            ret = 0;
            break;
        case KVM_EXIT_SYSTEM_EVENT:
            trace_kvm_run_exit_system_event(cpu->cpu_index, run->system_event.type);
            switch (run->system_event.type) {
            case KVM_SYSTEM_EVENT_SHUTDOWN:
                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_RESET:
                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_CRASH:
                kvm_cpu_synchronize_state(cpu);
                bql_lock();
                qemu_system_guest_panicked(cpu_get_crash_info(cpu));
                bql_unlock();
                ret = 0;
                break;
            default:
                ret = kvm_arch_handle_exit(cpu, run);
                break;
            }
            break;
        case KVM_EXIT_MEMORY_FAULT:
            trace_kvm_memory_fault(run->memory_fault.gpa,
                                   run->memory_fault.size,
                                   run->memory_fault.flags);
            if (run->memory_fault.flags & ~KVM_MEMORY_EXIT_FLAG_PRIVATE) {
                error_report("KVM_EXIT_MEMORY_FAULT: Unknown flag 0x%" PRIx64,
                             (uint64_t)run->memory_fault.flags);
                ret = -1;
                break;
            }
            ret = kvm_convert_memory(run->memory_fault.gpa, run->memory_fault.size,
                                     run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE);
            break;
        default:
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);

    cpu_exec_end(cpu);
    bql_lock();

    if (ret < 0) {
        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    qatomic_set(&cpu->exit_request, 0);
    return ret;
}
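
/*
 * Thin wrappers around ioctl() for the KVM system, VM, vCPU and device file
 * descriptors.  Each takes a single pointer-sized argument via varargs,
 * traces the call, and converts the syscall's -1/errno convention into a
 * negative errno return value.  The VM, vCPU and device variants additionally
 * bracket the syscall with accel_ioctl_begin()/accel_ioctl_end() (or the
 * per-CPU equivalents) so the accel-blocker machinery can temporarily hold
 * off new ioctls, e.g. while memslots are being updated.
 */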
int kvm_ioctl(KVMState *s, unsigned long type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_ioctl(type, arg);
    ret = ioctl(s->fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_ioctl(KVMState *s, unsigned long type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vm_ioctl(type, arg);
    accel_ioctl_begin();
    ret = ioctl(s->vmfd, type, arg);
    accel_ioctl_end();
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vcpu_ioctl(CPUState *cpu, unsigned long type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
    accel_cpu_ioctl_begin(cpu);
    ret = ioctl(cpu->kvm_fd, type, arg);
    accel_cpu_ioctl_end(cpu);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_device_ioctl(int fd, unsigned long type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_device_ioctl(fd, type, arg);
    accel_ioctl_begin();
    ret = ioctl(fd, type, arg);
    accel_ioctl_end();
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}
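
/*
 * Helpers around the KVM device-attribute API (KVM_HAS_DEVICE_ATTR,
 * KVM_GET_DEVICE_ATTR, KVM_SET_DEVICE_ATTR).  The *_check_attr() helpers
 * return 1 when the attribute is supported and 0 otherwise;
 * kvm_device_access() reads or writes the attribute value and sets *errp on
 * failure.  Illustrative sketch (MY_GROUP and MY_ATTR are placeholders, not
 * constants defined in this file):
 *
 *     uint64_t val = 1;
 *     if (kvm_device_check_attr(dev_fd, MY_GROUP, MY_ATTR)) {
 *         kvm_device_access(dev_fd, MY_GROUP, MY_ATTR, &val, true, &error_abort);
 *     }
 */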
int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
{
    int ret;
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
    };

    if (!kvm_vm_attributes_allowed) {
        return 0;
    }

    ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
    /* kvm returns 0 on success for HAS_DEVICE_ATTR */
    return ret ? 0 : 1;
}

int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
{
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
        .flags = 0,
    };

    return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
}

int kvm_device_access(int fd, int group, uint64_t attr,
                      void *val, bool write, Error **errp)
{
    struct kvm_device_attr kvmattr;
    int err;

    kvmattr.flags = 0;
    kvmattr.group = group;
    kvmattr.attr = attr;
    kvmattr.addr = (uintptr_t)val;

    err = kvm_device_ioctl(fd,
                           write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
                           &kvmattr);
    if (err < 0) {
        error_setg_errno(errp, -err,
                         "KVM_%s_DEVICE_ATTR failed: Group %d "
                         "attr 0x%016" PRIx64,
                         write ? "SET" : "GET", group, attr);
    }
    return err;
}
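
/*
 * Simple capability accessors.  Most of these return values that were probed
 * once during kvm_init() and cached in KVMState; kvm_has_gsi_routing() and
 * kvm_arm_supports_user_irq() query the extension directly instead.
 */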
bool kvm_has_sync_mmu(void)
{
    return kvm_state->sync_mmu;
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_max_nested_state_length(void)
{
    return kvm_state->max_nested_state_len;
}

int kvm_has_gsi_routing(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return false;
#endif
}

bool kvm_arm_supports_user_irq(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
}
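
/*
 * Guest-debug support.  Software breakpoints are tracked in the
 * kvm_state->kvm_sw_breakpoints list and installed by the architecture code
 * (kvm_arch_insert_sw_breakpoint()); hardware breakpoints and watchpoints
 * are delegated entirely to the arch hooks.  Whenever the set of breakpoints
 * or the single-step state changes, kvm_update_guest_debug() pushes the new
 * state to the kernel with KVM_SET_GUEST_DEBUG on the vCPU's own thread.
 */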
#ifdef TARGET_KVM_HAVE_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, vaddr pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *cpu)
{
    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    int err;
};

static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
{
    struct kvm_set_guest_debug_data *dbg_data =
        (struct kvm_set_guest_debug_data *) data.host_ptr;

    dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
                                   &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (cpu->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;

        if (cpu->singlestep_enabled & SSTEP_NOIRQ) {
            data.dbg.control |= KVM_GUESTDBG_BLOCKIRQ;
        }
    }
    kvm_arch_update_guest_debug(cpu, &data.dbg);

    run_on_cpu(cpu, kvm_invoke_set_guest_debug,
               RUN_ON_CPU_HOST_PTR(&data));
    return data.err;
}

bool kvm_supports_guest_debug(void)
{
    /* probed during kvm_init() */
    return kvm_has_guest_debug;
}
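
/*
 * Insert/remove a breakpoint of the given GDB type.  Software breakpoints
 * are reference-counted via bp->use_count so repeated insertions at the same
 * PC only patch the guest once; hardware breakpoints go straight to the arch
 * hooks.  In both cases the updated debug state is then re-pushed to every
 * vCPU with kvm_update_guest_debug().
 */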
int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = g_new(struct kvm_sw_breakpoint, 1);
        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
        if (err) {
            g_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = cpu->kvm_state;
    CPUState *tmpcpu;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            CPU_FOREACH(tmpcpu) {
                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    CPU_FOREACH(cpu) {
        kvm_update_guest_debug(cpu, 0);
    }
}

#endif /* TARGET_KVM_HAVE_GUEST_DEBUG */
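
/*
 * Signal plumbing for vCPU threads.  kvm_init_cpu_signals() installs the
 * SIG_IPI handler and adjusts the thread's signal mask.  When
 * KVM_CAP_IMMEDIATE_EXIT is available (kvm_immediate_exit), SIG_IPI is simply
 * unblocked in the thread and the handler kicks the vCPU; otherwise
 * KVM_SET_SIGNAL_MASK asks the kernel to atomically unblock SIG_IPI only
 * while the vCPU is inside KVM_RUN, so the signal interrupts the run itself.
 */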
static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
    KVMState *s = kvm_state;
    struct kvm_signal_mask *sigmask;
    int r;

    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = s->sigmask_len;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
    g_free(sigmask);

    return r;
}

static void kvm_ipi_signal(int sig)
{
    if (current_cpu) {
        assert(kvm_immediate_exit);
        kvm_cpu_kick(current_cpu);
    }
}

void kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = kvm_ipi_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
#if defined KVM_HAVE_MCE_INJECTION
    sigdelset(&set, SIGBUS);
    pthread_sigmask(SIG_SETMASK, &set, NULL);
#endif
    sigdelset(&set, SIG_IPI);
    if (kvm_immediate_exit) {
        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
    } else {
        r = kvm_set_signal_mask(cpu, &set);
    }
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

/* Called asynchronously in VCPU thread. */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    if (have_sigbus_pending) {
        return 1;
    }
    have_sigbus_pending = true;
    pending_sigbus_addr = addr;
    pending_sigbus_code = code;
    qatomic_set(&cpu->exit_request, 1);
    return 0;
#else
    return 1;
#endif
}

/* Called synchronously (via signalfd) in main thread. */
int kvm_on_sigbus(int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /*
     * An action-required MCE kills the process if SIGBUS is blocked, and
     * that is exactly the situation in the I/O thread, where MCE is handled
     * via signalfd.  Therefore only action-optional MCEs can reach this
     * point.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#else
    return 1;
#endif
}
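
/*
 * Device creation helpers.  kvm_create_device() issues KVM_CREATE_DEVICE and
 * returns the new device fd (or 0 when only testing); kvm_device_supported()
 * probes a device type with the KVM_CREATE_DEVICE_TEST flag, which asks the
 * kernel to validate the request without actually instantiating the device.
 */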
int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    int ret;
    struct kvm_create_device create_dev;

    create_dev.type = type;
    create_dev.fd = -1;
    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    return test ? 0 : create_dev.fd;
}

bool kvm_device_supported(int vmfd, uint64_t type)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = KVM_CREATE_DEVICE_TEST,
    };

    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
        return false;
    }

    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
}
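
/*
 * ONE_REG accessors.  'id' is an architecture-specific KVM_REG_* encoding
 * and 'source'/'target' must point to a buffer of the size implied by that
 * encoding.  Illustrative sketch for a 64-bit register (reg_id stands in for
 * a real KVM_REG_* constant):
 *
 *     uint64_t val;
 *     if (!kvm_get_one_reg(cs, reg_id, &val)) {
 *         val |= 1;
 *         kvm_set_one_reg(cs, reg_id, &val);
 *     }
 */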
int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) source;
    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_set(id, strerror(-r));
    }
    return r;
}

int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) target;
    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_get(id, strerror(-r));
    }
    return r;
}

static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
                                 hwaddr start_addr, hwaddr size)
{
    KVMState *kvm = KVM_STATE(ms->accelerator);
    int i;

    for (i = 0; i < kvm->nr_as; ++i) {
        if (kvm->as[i].as == as && kvm->as[i].ml) {
            size = MIN(kvm_max_slot_size, size);
            return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
                                                    start_addr, size);
        }
    }

    return false;
}
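
/*
 * QOM property accessors for the "kvm" accelerator object.  These back the
 * -accel kvm,... command-line options (kvm-shadow-mem, kernel-irqchip,
 * dirty-ring-size, device, rapl, rapl-helper-socket).  Setters that would
 * change the behaviour of an already-initialized accelerator refuse to run
 * once s->fd is valid.
 */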
static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    int64_t value = s->kvm_shadow_mem;

    visit_type_int(v, name, &value, errp);
}

static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    int64_t value;

    if (s->fd != -1) {
        error_setg(errp, "Cannot set properties after the accelerator has been initialized");
        return;
    }

    if (!visit_type_int(v, name, &value, errp)) {
        return;
    }

    s->kvm_shadow_mem = value;
}

static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    OnOffSplit mode;

    if (s->fd != -1) {
        error_setg(errp, "Cannot set properties after the accelerator has been initialized");
        return;
    }

    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
        return;
    }
    switch (mode) {
    case ON_OFF_SPLIT_ON:
        s->kernel_irqchip_allowed = true;
        s->kernel_irqchip_required = true;
        s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
        break;
    case ON_OFF_SPLIT_OFF:
        s->kernel_irqchip_allowed = false;
        s->kernel_irqchip_required = false;
        s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
        break;
    case ON_OFF_SPLIT_SPLIT:
        s->kernel_irqchip_allowed = true;
        s->kernel_irqchip_required = true;
        s->kernel_irqchip_split = ON_OFF_AUTO_ON;
        break;
    default:
        /*
         * The value was checked in visit_type_OnOffSplit() above.  If
         * we get here, then something is wrong in QEMU.
         */
        abort();
    }
}

bool kvm_kernel_irqchip_allowed(void)
{
    return kvm_state->kernel_irqchip_allowed;
}

bool kvm_kernel_irqchip_required(void)
{
    return kvm_state->kernel_irqchip_required;
}

bool kvm_kernel_irqchip_split(void)
{
    return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
}

static void kvm_get_dirty_ring_size(Object *obj, Visitor *v,
                                    const char *name, void *opaque,
                                    Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    uint32_t value = s->kvm_dirty_ring_size;

    visit_type_uint32(v, name, &value, errp);
}

static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
                                    const char *name, void *opaque,
                                    Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    uint32_t value;

    if (s->fd != -1) {
        error_setg(errp, "Cannot set properties after the accelerator has been initialized");
        return;
    }

    if (!visit_type_uint32(v, name, &value, errp)) {
        return;
    }
    if (value & (value - 1)) {
        error_setg(errp, "dirty-ring-size must be a power of two.");
        return;
    }

    s->kvm_dirty_ring_size = value;
}

static char *kvm_get_device(Object *obj,
                            Error **errp G_GNUC_UNUSED)
{
    KVMState *s = KVM_STATE(obj);

    return g_strdup(s->device);
}

static void kvm_set_device(Object *obj,
                           const char *value,
                           Error **errp G_GNUC_UNUSED)
{
    KVMState *s = KVM_STATE(obj);

    g_free(s->device);
    s->device = g_strdup(value);
}

static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
{
    KVMState *s = KVM_STATE(obj);

    s->msr_energy.enable = value;
}

static void kvm_set_kvm_rapl_socket_path(Object *obj,
                                         const char *str,
                                         Error **errp)
{
    KVMState *s = KVM_STATE(obj);

    g_free(s->msr_energy.socket_path);
    s->msr_energy.socket_path = g_strdup(str);
}
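
/*
 * Instance init establishes the defaults that the property setters above may
 * later override.  fd/vmfd start at -1 so "not yet initialized" can be
 * detected, and the dirty ring stays disabled (size 0) unless requested.
 */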
static void kvm_accel_instance_init(Object *obj)
{
    KVMState *s = KVM_STATE(obj);

    s->fd = -1;
    s->vmfd = -1;
    s->kvm_shadow_mem = -1;
    s->kernel_irqchip_allowed = true;
    s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
    /* KVM dirty ring is by default off */
    s->kvm_dirty_ring_size = 0;
    s->kvm_dirty_ring_with_bitmap = false;
    s->kvm_eager_split_size = 0;
    s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN;
    s->notify_window = 0;
    s->xen_version = 0;
    s->xen_gnttab_max_frames = 64;
    s->xen_evtchn_max_pirq = 256;
    s->device = NULL;
    s->msr_energy.enable = false;
}

/**
 * kvm_gdbstub_sstep_flags():
 *
 * Returns: SSTEP_* flags that KVM supports for guest debug. The
 * support is probed during kvm_init()
 */
static int kvm_gdbstub_sstep_flags(void)
{
    return kvm_sstep_flags;
}

static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);

    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->has_memory = kvm_accel_has_memory;
    ac->allowed = &kvm_allowed;
    ac->gdbstub_supported_sstep_flags = kvm_gdbstub_sstep_flags;

    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
        NULL, kvm_set_kernel_irqchip,
        NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure KVM in-kernel irqchip");

    object_class_property_add(oc, "kvm-shadow-mem", "int",
        kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
        NULL, NULL);
    object_class_property_set_description(oc, "kvm-shadow-mem",
        "KVM shadow MMU size");

    object_class_property_add(oc, "dirty-ring-size", "uint32",
        kvm_get_dirty_ring_size, kvm_set_dirty_ring_size,
        NULL, NULL);
    object_class_property_set_description(oc, "dirty-ring-size",
        "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");

    object_class_property_add_str(oc, "device", kvm_get_device, kvm_set_device);
    object_class_property_set_description(oc, "device",
        "Path to the device node to use (default: /dev/kvm)");

    object_class_property_add_bool(oc, "rapl",
                                   NULL,
                                   kvm_set_kvm_rapl);
    object_class_property_set_description(oc, "rapl",
        "Allow energy-related MSRs for the RAPL interface in the guest");

    object_class_property_add_str(oc, "rapl-helper-socket", NULL,
                                  kvm_set_kvm_rapl_socket_path);
    object_class_property_set_description(oc, "rapl-helper-socket",
        "Socket path for communicating with the Virtual MSR helper daemon");

    kvm_arch_accel_class_init(oc);
}

static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .instance_init = kvm_accel_instance_init,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};

static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);
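
/*
 * Binary stats support.  The code below bridges QEMU's generic stats API
 * (as used by the QMP "query-stats" and "query-stats-schemas" commands) to
 * the KVM binary stats interface: a stats file descriptor is obtained with
 * KVM_GET_STATS_FD for the VM or for each vCPU, the descriptor table is read
 * once and cached, and the actual values are re-read from the data block on
 * every query.
 */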
typedef struct StatsArgs {
    union StatsResultsType {
        StatsResultList **stats;
        StatsSchemaList **schema;
    } result;
    strList *names;
    Error **errp;
} StatsArgs;

static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
                                    uint64_t *stats_data,
                                    StatsList *stats_list,
                                    Error **errp)
{
    Stats *stats;
    uint64List *val_list = NULL;

    /* Only add stats that we understand. */
    switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
    case KVM_STATS_TYPE_CUMULATIVE:
    case KVM_STATS_TYPE_INSTANT:
    case KVM_STATS_TYPE_PEAK:
    case KVM_STATS_TYPE_LINEAR_HIST:
    case KVM_STATS_TYPE_LOG_HIST:
        break;
    default:
        return stats_list;
    }

    switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
    case KVM_STATS_UNIT_NONE:
    case KVM_STATS_UNIT_BYTES:
    case KVM_STATS_UNIT_CYCLES:
    case KVM_STATS_UNIT_SECONDS:
    case KVM_STATS_UNIT_BOOLEAN:
        break;
    default:
        return stats_list;
    }

    switch (pdesc->flags & KVM_STATS_BASE_MASK) {
    case KVM_STATS_BASE_POW10:
    case KVM_STATS_BASE_POW2:
        break;
    default:
        return stats_list;
    }

    /* Alloc and populate data list */
    stats = g_new0(Stats, 1);
    stats->name = g_strdup(pdesc->name);
    stats->value = g_new0(StatsValue, 1);

    if ((pdesc->flags & KVM_STATS_UNIT_MASK) == KVM_STATS_UNIT_BOOLEAN) {
        stats->value->u.boolean = *stats_data;
        stats->value->type = QTYPE_QBOOL;
    } else if (pdesc->size == 1) {
        stats->value->u.scalar = *stats_data;
        stats->value->type = QTYPE_QNUM;
    } else {
        int i;
        for (i = 0; i < pdesc->size; i++) {
            QAPI_LIST_PREPEND(val_list, stats_data[i]);
        }
        stats->value->u.list = val_list;
        stats->value->type = QTYPE_QLIST;
    }

    QAPI_LIST_PREPEND(stats_list, stats);
    return stats_list;
}

static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
                                                 StatsSchemaValueList *list,
                                                 Error **errp)
{
    StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1);

    schema_entry->value = g_new0(StatsSchemaValue, 1);

    switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
    case KVM_STATS_TYPE_CUMULATIVE:
        schema_entry->value->type = STATS_TYPE_CUMULATIVE;
        break;
    case KVM_STATS_TYPE_INSTANT:
        schema_entry->value->type = STATS_TYPE_INSTANT;
        break;
    case KVM_STATS_TYPE_PEAK:
        schema_entry->value->type = STATS_TYPE_PEAK;
        break;
    case KVM_STATS_TYPE_LINEAR_HIST:
        schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM;
        schema_entry->value->bucket_size = pdesc->bucket_size;
        schema_entry->value->has_bucket_size = true;
        break;
    case KVM_STATS_TYPE_LOG_HIST:
        schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM;
        break;
    default:
        goto exit;
    }

    switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
    case KVM_STATS_UNIT_NONE:
        break;
    case KVM_STATS_UNIT_BOOLEAN:
        schema_entry->value->has_unit = true;
        schema_entry->value->unit = STATS_UNIT_BOOLEAN;
        break;
    case KVM_STATS_UNIT_BYTES:
        schema_entry->value->has_unit = true;
        schema_entry->value->unit = STATS_UNIT_BYTES;
        break;
    case KVM_STATS_UNIT_CYCLES:
        schema_entry->value->has_unit = true;
        schema_entry->value->unit = STATS_UNIT_CYCLES;
        break;
    case KVM_STATS_UNIT_SECONDS:
        schema_entry->value->has_unit = true;
        schema_entry->value->unit = STATS_UNIT_SECONDS;
        break;
    default:
        goto exit;
    }

    schema_entry->value->exponent = pdesc->exponent;
    if (pdesc->exponent) {
        switch (pdesc->flags & KVM_STATS_BASE_MASK) {
        case KVM_STATS_BASE_POW10:
            schema_entry->value->has_base = true;
            schema_entry->value->base = 10;
            break;
        case KVM_STATS_BASE_POW2:
            schema_entry->value->has_base = true;
            schema_entry->value->base = 2;
            break;
        default:
            goto exit;
        }
    }

    schema_entry->value->name = g_strdup(pdesc->name);
    schema_entry->next = list;
    return schema_entry;
exit:
    g_free(schema_entry->value);
    g_free(schema_entry);
    return list;
}

/* Cached stats descriptors */
typedef struct StatsDescriptors {
    const char *ident; /* cache key, currently the StatsTarget */
    struct kvm_stats_desc *kvm_stats_desc;
    struct kvm_stats_header kvm_stats_header;
    QTAILQ_ENTRY(StatsDescriptors) next;
} StatsDescriptors;

static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors =
    QTAILQ_HEAD_INITIALIZER(stats_descriptors);

/*
 * Return the descriptors for 'target': either the cached copy, or a fresh
 * one read from 'stats_fd'.
 */
static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd,
                                                Error **errp)
{
    StatsDescriptors *descriptors;
    const char *ident;
    struct kvm_stats_desc *kvm_stats_desc;
    struct kvm_stats_header *kvm_stats_header;
    size_t size_desc;
    ssize_t ret;

    ident = StatsTarget_str(target);
    QTAILQ_FOREACH(descriptors, &stats_descriptors, next) {
        if (g_str_equal(descriptors->ident, ident)) {
            return descriptors;
        }
    }

    descriptors = g_new0(StatsDescriptors, 1);

    /* Read stats header */
    kvm_stats_header = &descriptors->kvm_stats_header;
    ret = pread(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header), 0);
    if (ret != sizeof(*kvm_stats_header)) {
        error_setg(errp, "KVM stats: failed to read stats header: "
                   "expected %zu actual %zu",
                   sizeof(*kvm_stats_header), ret);
        g_free(descriptors);
        return NULL;
    }
    size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;

    /* Read stats descriptors */
    kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc);
    ret = pread(stats_fd, kvm_stats_desc,
                size_desc * kvm_stats_header->num_desc,
                kvm_stats_header->desc_offset);
    if (ret != size_desc * kvm_stats_header->num_desc) {
        error_setg(errp, "KVM stats: failed to read stats descriptors: "
                   "expected %zu actual %zu",
                   size_desc * kvm_stats_header->num_desc, ret);
        g_free(descriptors);
        g_free(kvm_stats_desc);
        return NULL;
    }
    descriptors->kvm_stats_desc = kvm_stats_desc;
    descriptors->ident = ident;
    QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next);
    return descriptors;
}
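
/*
 * Read the current values behind 'stats_fd' and append one entry per
 * selected descriptor to 'result'.  The whole data block is read with a
 * single pread() and then sliced per descriptor using pdesc->offset, so a
 * name filter ('names') only affects which entries are built, not how much
 * data is transferred.
 */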
static void query_stats(StatsResultList **result, StatsTarget target,
                        strList *names, int stats_fd, CPUState *cpu,
                        Error **errp)
{
    struct kvm_stats_desc *kvm_stats_desc;
    struct kvm_stats_header *kvm_stats_header;
    StatsDescriptors *descriptors;
    g_autofree uint64_t *stats_data = NULL;
    struct kvm_stats_desc *pdesc;
    StatsList *stats_list = NULL;
    size_t size_desc, size_data = 0;
    ssize_t ret;
    int i;

    descriptors = find_stats_descriptors(target, stats_fd, errp);
    if (!descriptors) {
        return;
    }

    kvm_stats_header = &descriptors->kvm_stats_header;
    kvm_stats_desc = descriptors->kvm_stats_desc;
    size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;

    /* Tally the total data size */
    for (i = 0; i < kvm_stats_header->num_desc; ++i) {
        pdesc = (void *)kvm_stats_desc + i * size_desc;
        size_data += pdesc->size * sizeof(*stats_data);
    }

    stats_data = g_malloc0(size_data);
    ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset);

    if (ret != size_data) {
        error_setg(errp, "KVM stats: failed to read data: "
                   "expected %zu actual %zu", size_data, ret);
        return;
    }

    for (i = 0; i < kvm_stats_header->num_desc; ++i) {
        uint64_t *stats;
        pdesc = (void *)kvm_stats_desc + i * size_desc;

        /* Add entry to the list */
        stats = (void *)stats_data + pdesc->offset;
        if (!apply_str_list_filter(pdesc->name, names)) {
            continue;
        }
        stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp);
    }

    if (!stats_list) {
        return;
    }

    switch (target) {
    case STATS_TARGET_VM:
        add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list);
        break;
    case STATS_TARGET_VCPU:
        add_stats_entry(result, STATS_PROVIDER_KVM,
                        cpu->parent_obj.canonical_path,
                        stats_list);
        break;
    default:
        g_assert_not_reached();
    }
}

static void query_stats_schema(StatsSchemaList **result, StatsTarget target,
                               int stats_fd, Error **errp)
{
    struct kvm_stats_desc *kvm_stats_desc;
    struct kvm_stats_header *kvm_stats_header;
    StatsDescriptors *descriptors;
    struct kvm_stats_desc *pdesc;
    StatsSchemaValueList *stats_list = NULL;
    size_t size_desc;
    int i;

    descriptors = find_stats_descriptors(target, stats_fd, errp);
    if (!descriptors) {
        return;
    }

    kvm_stats_header = &descriptors->kvm_stats_header;
    kvm_stats_desc = descriptors->kvm_stats_desc;
    size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;

    /* Build one schema entry per descriptor */
    for (i = 0; i < kvm_stats_header->num_desc; ++i) {
        pdesc = (void *)kvm_stats_desc + i * size_desc;
        stats_list = add_kvmschema_entry(pdesc, stats_list, errp);
    }

    add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list);
}

static void query_stats_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
{
    int stats_fd = cpu->kvm_vcpu_stats_fd;
    Error *local_err = NULL;

    if (stats_fd == -1) {
        error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
        error_propagate(kvm_stats_args->errp, local_err);
        return;
    }
    query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU,
                kvm_stats_args->names, stats_fd, cpu,
                kvm_stats_args->errp);
}

static void query_stats_schema_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
{
    int stats_fd = cpu->kvm_vcpu_stats_fd;
    Error *local_err = NULL;

    if (stats_fd == -1) {
        error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
        error_propagate(kvm_stats_args->errp, local_err);
        return;
    }
    query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd,
                       kvm_stats_args->errp);
}

static void query_stats_cb(StatsResultList **result, StatsTarget target,
                           strList *names, strList *targets, Error **errp)
{
    KVMState *s = kvm_state;
    CPUState *cpu;
    int stats_fd;

    switch (target) {
    case STATS_TARGET_VM:
    {
        stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
        if (stats_fd == -1) {
            error_setg_errno(errp, errno, "KVM stats: ioctl failed");
            return;
        }
        query_stats(result, target, names, stats_fd, NULL, errp);
        close(stats_fd);
        break;
    }
    case STATS_TARGET_VCPU:
    {
        StatsArgs stats_args;
        stats_args.result.stats = result;
        stats_args.names = names;
        stats_args.errp = errp;
        CPU_FOREACH(cpu) {
            if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) {
                continue;
            }
            query_stats_vcpu(cpu, &stats_args);
        }
        break;
    }
    default:
        break;
    }
}

void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
{
    StatsArgs stats_args;
    KVMState *s = kvm_state;
    int stats_fd;

    stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
    if (stats_fd == -1) {
        error_setg_errno(errp, errno, "KVM stats: ioctl failed");
        return;
    }
    query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp);
    close(stats_fd);

    if (first_cpu) {
        stats_args.result.schema = result;
        stats_args.errp = errp;
        query_stats_schema_vcpu(first_cpu, &stats_args);
    }
}

void kvm_mark_guest_state_protected(void)
{
    kvm_state->guest_state_protected = true;
}
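
/*
 * Create a guest_memfd for private (confidential-guest) memory via
 * KVM_CREATE_GUEST_MEMFD.  Returns the new file descriptor on success; on
 * failure returns -1 and sets *errp.  Callers typically attach the fd to a
 * guest_memfd-backed RAM block so it can be registered with KVM memslots.
 */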
int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp)
{
    int fd;
    struct kvm_create_guest_memfd guest_memfd = {
        .size = size,
        .flags = flags,
    };

    if (!kvm_guest_memfd_supported) {
        error_setg(errp, "KVM does not support guest_memfd");
        return -1;
    }

    fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_GUEST_MEMFD, &guest_memfd);
    if (fd < 0) {
        error_setg_errno(errp, errno, "Error creating KVM guest_memfd");
        return -1;
    }

    return fd;
}