pci.c

/*
 * vfio based device assignment support
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include <dirent.h>
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "config.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci.h"
#include "qemu-common.h"
#include "qemu/error-report.h"
#include "qemu/event_notifier.h"
#include "qemu/queue.h"
#include "qemu/range.h"
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"
#include "trace.h"
#include "hw/vfio/vfio.h"

/* Extra debugging, trap acceleration paths for more logging */
#define VFIO_ALLOW_MMAP 1
#define VFIO_ALLOW_KVM_INTX 1
#define VFIO_ALLOW_KVM_MSI 1
#define VFIO_ALLOW_KVM_MSIX 1
struct VFIODevice;

typedef struct VFIOQuirk {
    MemoryRegion mem;
    struct VFIODevice *vdev;
    QLIST_ENTRY(VFIOQuirk) next;
    struct {
        uint32_t base_offset:TARGET_PAGE_BITS;
        uint32_t address_offset:TARGET_PAGE_BITS;
        uint32_t address_size:3;
        uint32_t bar:3;
        uint32_t address_match;
        uint32_t address_mask;
        uint32_t address_val:TARGET_PAGE_BITS;
        uint32_t data_offset:TARGET_PAGE_BITS;
        uint32_t data_size:3;
        uint8_t flags;
        uint8_t read_flags;
        uint8_t write_flags;
    } data;
} VFIOQuirk;

typedef struct VFIOBAR {
    off_t fd_offset; /* offset of BAR within device fd */
    int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
    MemoryRegion mem; /* slow, read/write access */
    MemoryRegion mmap_mem; /* direct mapped access */
    void *mmap;
    size_t size;
    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
    uint8_t nr; /* cache the BAR number for debug */
    bool ioport;
    bool mem64;
    QLIST_HEAD(, VFIOQuirk) quirks;
} VFIOBAR;

typedef struct VFIOVGARegion {
    MemoryRegion mem;
    off_t offset;
    int nr;
    QLIST_HEAD(, VFIOQuirk) quirks;
} VFIOVGARegion;

typedef struct VFIOVGA {
    off_t fd_offset;
    int fd;
    VFIOVGARegion region[QEMU_PCI_VGA_NUM_REGIONS];
} VFIOVGA;

typedef struct VFIOINTx {
    bool pending; /* interrupt pending */
    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
    uint8_t pin; /* which pin to pull for qemu_set_irq */
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
    PCIINTxRoute route; /* routing info for QEMU bypass */
    uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
} VFIOINTx;
typedef struct VFIOMSIVector {
    /*
     * Two interrupt paths are configured per vector.  The first is only used
     * for interrupts injected via QEMU.  This is typically the non-accel path,
     * but may also be used when we want QEMU to handle masking and pending
     * bits.  The KVM path bypasses QEMU and is therefore higher performance,
     * but requires masking at the device.  virq is used to track the MSI route
     * through KVM, thus kvm_interrupt is only available when virq is set to a
     * valid (>= 0) value.
     */
    EventNotifier interrupt;
    EventNotifier kvm_interrupt;
    struct VFIODevice *vdev; /* back pointer to device */
    int virq;
    bool use;
} VFIOMSIVector;
enum {
    VFIO_INT_NONE = 0,
    VFIO_INT_INTx = 1,
    VFIO_INT_MSI = 2,
    VFIO_INT_MSIX = 3,
};

typedef struct VFIOAddressSpace {
    AddressSpace *as;
    QLIST_HEAD(, VFIOContainer) containers;
    QLIST_ENTRY(VFIOAddressSpace) list;
} VFIOAddressSpace;

static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
    QLIST_HEAD_INITIALIZER(vfio_address_spaces);

struct VFIOGroup;

typedef struct VFIOType1 {
    MemoryListener listener;
    int error;
    bool initialized;
} VFIOType1;

typedef struct VFIOContainer {
    VFIOAddressSpace *space;
    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
    struct {
        /* enable abstraction to support various iommu backends */
        union {
            VFIOType1 type1;
        };
        void (*release)(struct VFIOContainer *);
    } iommu_data;
    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
    QLIST_HEAD(, VFIOGroup) group_list;
    QLIST_ENTRY(VFIOContainer) next;
} VFIOContainer;

typedef struct VFIOGuestIOMMU {
    VFIOContainer *container;
    MemoryRegion *iommu;
    Notifier n;
    QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
} VFIOGuestIOMMU;

/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
typedef struct VFIOMSIXInfo {
    uint8_t table_bar;
    uint8_t pba_bar;
    uint16_t entries;
    uint32_t table_offset;
    uint32_t pba_offset;
    MemoryRegion mmap_mem;
    void *mmap;
} VFIOMSIXInfo;

typedef struct VFIODevice {
    PCIDevice pdev;
    int fd;
    VFIOINTx intx;
    unsigned int config_size;
    uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
    off_t config_offset; /* Offset of config space region within device fd */
    unsigned int rom_size;
    off_t rom_offset; /* Offset of ROM region within device fd */
    void *rom;
    int msi_cap_size;
    VFIOMSIVector *msi_vectors;
    VFIOMSIXInfo *msix;
    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
    int interrupt; /* Current interrupt type */
    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
    VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */
    PCIHostDeviceAddress host;
    QLIST_ENTRY(VFIODevice) next;
    struct VFIOGroup *group;
    EventNotifier err_notifier;
    uint32_t features;
#define VFIO_FEATURE_ENABLE_VGA_BIT 0
#define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
    int32_t bootindex;
    uint8_t pm_cap;
    bool reset_works;
    bool has_vga;
    bool pci_aer;
    bool has_flr;
    bool has_pm_reset;
    bool needs_reset;
    bool rom_read_failed;
} VFIODevice;

typedef struct VFIOGroup {
    int fd;
    int groupid;
    VFIOContainer *container;
    QLIST_HEAD(, VFIODevice) device_list;
    QLIST_ENTRY(VFIOGroup) next;
    QLIST_ENTRY(VFIOGroup) container_next;
} VFIOGroup;

typedef struct VFIORomBlacklistEntry {
    uint16_t vendor_id;
    uint16_t device_id;
} VFIORomBlacklistEntry;
/*
 * List of device ids/vendor ids for which to disable
 * option rom loading.  This avoids guest hangs during rom
 * execution, as noticed with the BCM 57810 card, for lack of a
 * better way to handle such issues.
 * The user can still override by specifying a romfile or
 * rombar=1.
 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
 * for an analysis of the 57810 card hang.  When adding
 * a new vendor id/device id combination below, please also add
 * your card/environment details and information that could
 * help in debugging to the bug tracking this issue.
 */
static const VFIORomBlacklistEntry romblacklist[] = {
    /* Broadcom BCM 57810 */
    { 0x14e4, 0x168e }
};
#define MSIX_CAP_LENGTH 12

static QLIST_HEAD(, VFIOGroup)
    group_list = QLIST_HEAD_INITIALIZER(group_list);

#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM.  Once created it lives
 * for the life of the VM.  Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm.  Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 */
static int vfio_kvm_device_fd = -1;
#endif

static void vfio_disable_interrupts(VFIODevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len);
static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
/*
 * Common VFIO interrupt disable
 */
static void vfio_disable_irqindex(VFIODevice *vdev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = index,
        .start = 0,
        .count = 0,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

/*
 * INTx
 */
static void vfio_unmask_intx(VFIODevice *vdev)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
        .index = VFIO_PCI_INTX_IRQ_INDEX,
        .start = 0,
        .count = 1,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

#ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
static void vfio_mask_intx(VFIODevice *vdev)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
        .index = VFIO_PCI_INTX_IRQ_INDEX,
        .start = 0,
        .count = 1,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
#endif
/*
 * Disabling BAR mmapping can be slow, but toggling it around INTx can
 * also be a huge overhead.  We try to get the best of both worlds by
 * waiting until an interrupt to disable mmaps (subsequent transitions
 * to the same state are effectively no overhead).  If the interrupt has
 * been serviced and the time gap is long enough, we re-enable mmaps for
 * performance.  This works well for things like graphics cards, which
 * may not use their interrupt at all and are penalized to an unusable
 * level by read/write BAR traps.  Other devices, like NICs, have more
 * regular interrupts and see much better latency by staying in non-mmap
 * mode.  We therefore set the default mmap_timeout such that a ping
 * is just enough to keep the mmap disabled.  Users can experiment with
 * other options with the x-intx-mmap-timeout-ms parameter (a value of
 * zero disables the timer).
 */
static void vfio_intx_mmap_enable(void *opaque)
{
    VFIODevice *vdev = opaque;

    if (vdev->intx.pending) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
        return;
    }

    vfio_mmap_set_enabled(vdev, true);
}
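
/*
 * Handler for the INTx eventfd: the host device has asserted INTx, so mark
 * it pending, raise the guest IRQ, and drop to trapped (non-mmap) BAR access
 * until the interrupt is serviced.
 */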
static void vfio_intx_interrupt(void *opaque)
{
    VFIODevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
        return;
    }

    trace_vfio_intx_interrupt(vdev->host.domain, vdev->host.bus,
                              vdev->host.slot, vdev->host.function,
                              'A' + vdev->intx.pin);

    vdev->intx.pending = true;
    pci_irq_assert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, false);
    if (vdev->intx.mmap_timeout) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
    }
}
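
/*
 * The guest has serviced the interrupt (a BAR access acts as the EOI here),
 * so de-assert the guest IRQ and unmask INTx on the host, allowing it to
 * fire again.
 */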
static void vfio_eoi(VFIODevice *vdev)
{
    if (!vdev->intx.pending) {
        return;
    }

    trace_vfio_eoi(vdev->host.domain, vdev->host.bus,
                   vdev->host.slot, vdev->host.function);

    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_unmask_intx(vdev);
}
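
/*
 * Switch INTx to the KVM bypass path: hand the interrupt eventfd to KVM as
 * an irqfd with a resample eventfd, so injection and unmask no longer pass
 * through QEMU's userspace handler.
 */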
static void vfio_enable_intx_kvm(VFIODevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    struct vfio_irq_set *irq_set;
    int ret, argsz;
    int32_t *pfd;

    if (!VFIO_ALLOW_KVM_INTX || !kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_resamplefds_enabled()) {
        return;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_intx(vdev);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_report("vfio: Error: event_notifier_init failed eoi");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to setup resample irqfd: %m");
        goto fail_irqfd;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = irqfd.resamplefd;

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx unmask fd: %m");
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_intx(vdev);

    vdev->intx.kvm_accel = true;

    trace_vfio_enable_intx_kvm(vdev->host.domain, vdev->host.bus,
                               vdev->host.slot, vdev->host.function);

    return;

fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_intx(vdev);
#endif
}

static void vfio_disable_intx_kvm(VFIODevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_intx(vdev);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    }

    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_intx(vdev);

    trace_vfio_disable_intx_kvm(vdev->host.domain, vdev->host.bus,
                                vdev->host.slot, vdev->host.function);
#endif
}
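
/*
 * Called when the guest's INTx routing may have changed (e.g. the chipset
 * re-routes the pin).  Tear down the KVM bypass and, if the new route is
 * still usable, re-establish it.
 */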
static void vfio_update_irq(PCIDevice *pdev)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    PCIINTxRoute route;

    if (vdev->interrupt != VFIO_INT_INTx) {
        return;
    }

    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);

    if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
        return; /* Nothing changed */
    }

    trace_vfio_update_irq(vdev->host.domain, vdev->host.bus,
                          vdev->host.slot, vdev->host.function,
                          vdev->intx.route.irq, route.irq);

    vfio_disable_intx_kvm(vdev);

    vdev->intx.route = route;

    if (route.mode != PCI_INTX_ENABLED) {
        return;
    }

    vfio_enable_intx_kvm(vdev);

    /* Re-enable the interrupt in case we missed an EOI */
    vfio_eoi(vdev);
}
static int vfio_enable_intx(VFIODevice *vdev)
{
    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    int ret, argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    if (!pin) {
        return 0;
    }

    vfio_disable_interrupts(vdev);

    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
    pci_config_set_interrupt_pin(vdev->pdev.config, pin);

#ifdef CONFIG_KVM
    /*
     * Only conditional to avoid generating error messages on platforms
     * where we won't actually use the result anyway.
     */
    if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
                                                        vdev->intx.pin);
    }
#endif

    ret = event_notifier_init(&vdev->intx.interrupt, 0);
    if (ret) {
        error_report("vfio: Error: event_notifier_init failed");
        return ret;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx fd: %m");
        /* irq_set (and pfd) was freed above, re-derive the fd */
        qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
                            NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->intx.interrupt);
        return -errno;
    }

    vfio_enable_intx_kvm(vdev);

    vdev->interrupt = VFIO_INT_INTx;

    trace_vfio_enable_intx(vdev->host.domain, vdev->host.bus,
                           vdev->host.slot, vdev->host.function);

    return 0;
}

static void vfio_disable_intx(VFIODevice *vdev)
{
    int fd;

    timer_del(vdev->intx.mmap_timer);
    vfio_disable_intx_kvm(vdev);
    vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, true);

    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->intx.interrupt);

    vdev->interrupt = VFIO_INT_NONE;

    trace_vfio_disable_intx(vdev->host.domain, vdev->host.bus,
                            vdev->host.slot, vdev->host.function);
}
/*
 * MSI/X
 */
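/*
 * Userspace MSI/MSI-X injection path: the per-vector eventfd fired, so
 * forward the interrupt to the guest through the emulated MSI/MSI-X core.
 * Only used when the vector is not routed through a KVM irqfd.
 */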
static void vfio_msi_interrupt(void *opaque)
{
    VFIOMSIVector *vector = opaque;
    VFIODevice *vdev = vector->vdev;
    int nr = vector - vdev->msi_vectors;

    if (!event_notifier_test_and_clear(&vector->interrupt)) {
        return;
    }

#ifdef DEBUG_VFIO
    MSIMessage msg;

    if (vdev->interrupt == VFIO_INT_MSIX) {
        msg = msix_get_message(&vdev->pdev, nr);
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        msg = msi_get_message(&vdev->pdev, nr);
    } else {
        abort();
    }

    trace_vfio_msi_interrupt(vdev->host.domain, vdev->host.bus,
                             vdev->host.slot, vdev->host.function,
                             nr, msg.address, msg.data);
#endif

    if (vdev->interrupt == VFIO_INT_MSIX) {
        msix_notify(&vdev->pdev, nr);
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        msi_notify(&vdev->pdev, nr);
    } else {
        error_report("vfio: MSI interrupt received, but not enabled?");
    }
}
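
/*
 * Program every vector into the kernel with a single VFIO_DEVICE_SET_IRQS
 * call.  Each entry carries either the KVM irqfd (bypass) eventfd, the QEMU
 * userspace eventfd, or -1 for unused vectors.
 */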
static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
{
    struct vfio_irq_set *irq_set;
    int ret = 0, i, argsz;
    int32_t *fds;

    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = vdev->nr_vectors;
    fds = (int32_t *)&irq_set->data;

    for (i = 0; i < vdev->nr_vectors; i++) {
        int fd = -1;

        /*
         * MSI vs MSI-X - The guest has direct access to MSI mask and pending
         * bits, therefore we always use the KVM signaling path when setup.
         * MSI-X mask and pending bits are emulated, so we want to use the
         * KVM signaling path only when configured and unmasked.
         */
        if (vdev->msi_vectors[i].use) {
            if (vdev->msi_vectors[i].virq < 0 ||
                (msix && msix_is_masked(&vdev->pdev, i))) {
                fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
            } else {
                fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
            }
        }

        fds[i] = fd;
    }

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    return ret;
}
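
/*
 * Try to set up a KVM bypass for one MSI/MSI-X vector: allocate an MSI route
 * in the KVM irqchip and attach the vector's kvm_interrupt eventfd to it as
 * an irqfd.  On any failure the vector silently stays on the userspace path
 * (virq remains -1).
 */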
static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg,
                                  bool msix)
{
    int virq;

    if ((msix && !VFIO_ALLOW_KVM_MSIX) ||
        (!msix && !VFIO_ALLOW_KVM_MSI) || !msg) {
        return;
    }

    if (event_notifier_init(&vector->kvm_interrupt, 0)) {
        return;
    }

    virq = kvm_irqchip_add_msi_route(kvm_state, *msg);
    if (virq < 0) {
        event_notifier_cleanup(&vector->kvm_interrupt);
        return;
    }

    if (kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
                                       NULL, virq) < 0) {
        kvm_irqchip_release_virq(kvm_state, virq);
        event_notifier_cleanup(&vector->kvm_interrupt);
        return;
    }

    vector->virq = virq;
}

static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
{
    kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
                                      vector->virq);
    kvm_irqchip_release_virq(kvm_state, vector->virq);
    vector->virq = -1;
    event_notifier_cleanup(&vector->kvm_interrupt);
}

static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg)
{
    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
}
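
/*
 * Per-vector "use" callback from the MSI-X core.  Sets up the eventfd and
 * optional KVM route for the vector, then either re-enables the whole MSI-X
 * index with the enlarged vector count or updates just this vector's trigger
 * with a single-vector VFIO_DEVICE_SET_IRQS call.
 */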
static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOMSIVector *vector;
    int ret;

    trace_vfio_msix_vector_do_use(vdev->host.domain, vdev->host.bus,
                                  vdev->host.slot, vdev->host.function,
                                  nr);

    vector = &vdev->msi_vectors[nr];

    if (!vector->use) {
        vector->vdev = vdev;
        vector->virq = -1;
        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }
        vector->use = true;
        msix_vector_use(pdev, nr);
    }

    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                        handler, NULL, vector);

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    if (vector->virq >= 0) {
        if (!msg) {
            vfio_remove_kvm_msi_virq(vector);
        } else {
            vfio_update_kvm_msi_virq(vector, *msg);
        }
    } else {
        vfio_add_kvm_msi_virq(vector, msg, true);
    }

    /*
     * We don't want to have the host allocate all possible MSI vectors
     * for a device if they're not in use, so we shut them all down and
     * incrementally increase them as needed.
     */
    if (vdev->nr_vectors < nr + 1) {
        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
        vdev->nr_vectors = nr + 1;
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d", ret);
        }
    } else {
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        if (vector->virq >= 0) {
            *pfd = event_notifier_get_fd(&vector->kvm_interrupt);
        } else {
            *pfd = event_notifier_get_fd(&vector->interrupt);
        }

        ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
        g_free(irq_set);
        if (ret) {
            error_report("vfio: failed to modify vector, %d", ret);
        }
    }

    return 0;
}

static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}

static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];

    trace_vfio_msix_vector_release(vdev->host.domain, vdev->host.bus,
                                   vdev->host.slot, vdev->host.function,
                                   nr);

    /*
     * There are still old guests that mask and unmask vectors on every
     * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
     * the KVM setup in place, simply switch VFIO to use the non-bypass
     * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
     * core will mask the interrupt and set pending bits, allowing it to
     * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
     */
    if (vector->virq >= 0) {
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;
        *pfd = event_notifier_get_fd(&vector->interrupt);

        ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);

        g_free(irq_set);
    }
}
static void vfio_enable_msix(VFIODevice *vdev)
{
    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));

    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Some communication channels between VF & PF or PF & fw rely on the
     * physical state of the device and expect that enabling MSI-X from the
     * guest enables the same on the host.  When our guest is Linux, the
     * guest driver call to pci_enable_msix() sets the enabling bit in the
     * MSI-X capability, but leaves the vector table masked.  We therefore
     * can't rely on a vector_use callback (from request_irq() in the guest)
     * to switch the physical device into MSI-X mode because that may come a
     * long time after pci_enable_msix().  This code enables vector 0 with
     * triggering to userspace, then immediately releases the vector, leaving
     * the physical device with no vectors enabled, but MSI-X enabled, just
     * like the guest view.
     */
    vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
    vfio_msix_vector_release(&vdev->pdev, 0);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed");
    }

    trace_vfio_enable_msix(vdev->host.domain, vdev->host.bus,
                           vdev->host.slot, vdev->host.function);
}
static void vfio_enable_msi(VFIODevice *vdev)
{
    int ret, i;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
retry:
    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        MSIMessage msg = msi_get_message(&vdev->pdev, i);

        vector->vdev = vdev;
        vector->virq = -1;
        vector->use = true;

        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }

        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            vfio_msi_interrupt, NULL, vector);

        /*
         * Attempt to enable route through KVM irqchip,
         * default to userspace handling if unavailable.
         */
        vfio_add_kvm_msi_virq(vector, &msg, false);
    }

    /* Set interrupt type prior to possible interrupts */
    vdev->interrupt = VFIO_INT_MSI;

    ret = vfio_enable_vectors(vdev, false);
    if (ret) {
        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %m");
        } else if (ret != vdev->nr_vectors) {
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
        }

        for (i = 0; i < vdev->nr_vectors; i++) {
            VFIOMSIVector *vector = &vdev->msi_vectors[i];
            if (vector->virq >= 0) {
                vfio_remove_kvm_msi_virq(vector);
            }
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
            event_notifier_cleanup(&vector->interrupt);
        }

        g_free(vdev->msi_vectors);

        if (ret > 0 && ret != vdev->nr_vectors) {
            vdev->nr_vectors = ret;
            goto retry;
        }
        vdev->nr_vectors = 0;

        /*
         * Failing to setup MSI doesn't really fall within any specification.
         * Let's try leaving interrupts disabled and hope the guest figures
         * out to fall back to INTx for this device.
         */
        error_report("vfio: Error: Failed to enable MSI");
        vdev->interrupt = VFIO_INT_NONE;

        return;
    }

    trace_vfio_enable_msi(vdev->host.domain, vdev->host.bus,
                          vdev->host.slot, vdev->host.function,
                          vdev->nr_vectors);
}
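
/*
 * Teardown shared by the MSI and MSI-X disable paths: release per-vector
 * KVM routes and eventfds, free the vector array, and fall back to INTx.
 */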
static void vfio_disable_msi_common(VFIODevice *vdev)
{
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        if (vdev->msi_vectors[i].use) {
            if (vector->virq >= 0) {
                vfio_remove_kvm_msi_virq(vector);
            }
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
            event_notifier_cleanup(&vector->interrupt);
        }
    }

    g_free(vdev->msi_vectors);
    vdev->msi_vectors = NULL;
    vdev->nr_vectors = 0;
    vdev->interrupt = VFIO_INT_NONE;

    vfio_enable_intx(vdev);
}

static void vfio_disable_msix(VFIODevice *vdev)
{
    int i;

    msix_unset_vector_notifiers(&vdev->pdev);

    /*
     * MSI-X will only release vectors if MSI-X is still enabled on the
     * device, check through the rest and release it ourselves if necessary.
     */
    for (i = 0; i < vdev->nr_vectors; i++) {
        if (vdev->msi_vectors[i].use) {
            vfio_msix_vector_release(&vdev->pdev, i);
            msix_vector_unuse(&vdev->pdev, i);
        }
    }

    if (vdev->nr_vectors) {
        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
    }

    vfio_disable_msi_common(vdev);

    trace_vfio_disable_msix(vdev->host.domain, vdev->host.bus,
                            vdev->host.slot, vdev->host.function);
}

static void vfio_disable_msi(VFIODevice *vdev)
{
    vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);
    vfio_disable_msi_common(vdev);

    trace_vfio_disable_msi(vdev->host.domain, vdev->host.bus,
                           vdev->host.slot, vdev->host.function);
}
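
/*
 * The guest has rewritten the MSI address/data registers; refresh the KVM
 * MSI routes of any vectors currently using the bypass path so interrupts
 * keep landing on the right vCPU/vector.
 */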
static void vfio_update_msi(VFIODevice *vdev)
{
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        MSIMessage msg;

        if (!vector->use || vector->virq < 0) {
            continue;
        }

        msg = msi_get_message(&vdev->pdev, i);
        vfio_update_kvm_msi_virq(vector, msg);
    }
}
/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
static void vfio_bar_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
    VFIOBAR *bar = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %d bytes", size);
        break;
    }

    if (pwrite(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
                     __func__, addr, data, size);
    }

#ifdef DEBUG_VFIO
    {
        VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar->nr]);

        trace_vfio_bar_write(vdev->host.domain, vdev->host.bus,
                             vdev->host.slot, vdev->host.function,
                             bar->nr, addr, data, size);
    }
#endif

    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));
}
static uint64_t vfio_bar_read(void *opaque,
                              hwaddr addr, unsigned size)
{
    VFIOBAR *bar = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, addr, size);
        return (uint64_t)-1;
    }

    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes", size);
        break;
    }

#ifdef DEBUG_VFIO
    {
        VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar->nr]);

        trace_vfio_bar_read(vdev->host.domain, vdev->host.bus,
                            vdev->host.slot, vdev->host.function,
                            bar->nr, addr, size, data);
    }
#endif

    /* Same as write above */
    vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));

    return data;
}

static const MemoryRegionOps vfio_bar_ops = {
    .read = vfio_bar_read,
    .write = vfio_bar_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
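
/*
 * Read the device's option ROM region out of the device fd into a host
 * buffer.  The ROM is loaded lazily, only when the guest first reads it.
 */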
static void vfio_pci_load_rom(VFIODevice *vdev)
{
    struct vfio_region_info reg_info = {
        .argsz = sizeof(reg_info),
        .index = VFIO_PCI_ROM_REGION_INDEX
    };
    uint64_t size;
    off_t off = 0;
    ssize_t bytes; /* signed so a failing pread() is detected below */

    if (ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info)) {
        error_report("vfio: Error getting ROM info: %m");
        return;
    }

    trace_vfio_pci_load_rom(vdev->host.domain, vdev->host.bus,
                            vdev->host.slot, vdev->host.function,
                            (unsigned long)reg_info.size,
                            (unsigned long)reg_info.offset,
                            (unsigned long)reg_info.flags);

    vdev->rom_size = size = reg_info.size;
    vdev->rom_offset = reg_info.offset;

    if (!vdev->rom_size) {
        vdev->rom_read_failed = true;
        error_report("vfio-pci: Cannot read device rom at "
                     "%04x:%02x:%02x.%x",
                     vdev->host.domain, vdev->host.bus, vdev->host.slot,
                     vdev->host.function);
        error_printf("Device option ROM contents are probably invalid "
                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
                     "or load from file with romfile=\n");
        return;
    }

    vdev->rom = g_malloc(size);
    memset(vdev->rom, 0xff, size);

    while (size) {
        bytes = pread(vdev->fd, vdev->rom + off, size, vdev->rom_offset + off);
        if (bytes == 0) {
            break;
        } else if (bytes > 0) {
            off += bytes;
            size -= bytes;
        } else {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("vfio: Error reading device ROM: %m");
            break;
        }
    }
}
static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIODevice *vdev = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } val;
    uint64_t data = 0;

    /* Load the ROM lazily when the guest tries to read it */
    if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
        vfio_pci_load_rom(vdev);
    }

    memcpy(&val, vdev->rom + addr,
           (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);

    switch (size) {
    case 1:
        data = val.byte;
        break;
    case 2:
        data = le16_to_cpu(val.word);
        break;
    case 4:
        data = le32_to_cpu(val.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes\n", size);
        break;
    }

    trace_vfio_rom_read(vdev->host.domain, vdev->host.bus,
                        vdev->host.slot, vdev->host.function,
                        addr, size, data);

    return data;
}

static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
}

static const MemoryRegionOps vfio_rom_ops = {
    .read = vfio_rom_read,
    .write = vfio_rom_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
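
/*
 * Return true if this device's vendor/device ID pair is on the option ROM
 * blacklist above, i.e. its ROM is known to hang guests during execution.
 */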
static bool vfio_blacklist_opt_rom(VFIODevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    uint16_t vendor_id, device_id;
    int count = 0;

    vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
    device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);

    while (count < ARRAY_SIZE(romblacklist)) {
        if (romblacklist[count].vendor_id == vendor_id &&
            romblacklist[count].device_id == device_id) {
            return true;
        }
        count++;
    }

    return false;
}
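
/*
 * Size the ROM BAR using the standard PCI probe: write all 1s to the ROM
 * address register in the host's config space, read back which bits stick,
 * and restore the original value.  The resulting size is used to register
 * an emulated ROM BAR whose contents are filled lazily on read.
 */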
  1095. static void vfio_pci_size_rom(VFIODevice *vdev)
  1096. {
  1097. uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
  1098. off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
  1099. DeviceState *dev = DEVICE(vdev);
  1100. char name[32];
  1101. if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
  1102. /* Since pci handles romfile, just print a message and return */
  1103. if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) {
  1104. error_printf("Warning : Device at %04x:%02x:%02x.%x "
  1105. "is known to cause system instability issues during "
  1106. "option rom execution. "
  1107. "Proceeding anyway since user specified romfile\n",
  1108. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1109. vdev->host.function);
  1110. }
  1111. return;
  1112. }
  1113. /*
  1114. * Use the same size ROM BAR as the physical device. The contents
  1115. * will get filled in later when the guest tries to read it.
  1116. */
  1117. if (pread(vdev->fd, &orig, 4, offset) != 4 ||
  1118. pwrite(vdev->fd, &size, 4, offset) != 4 ||
  1119. pread(vdev->fd, &size, 4, offset) != 4 ||
  1120. pwrite(vdev->fd, &orig, 4, offset) != 4) {
  1121. error_report("%s(%04x:%02x:%02x.%x) failed: %m",
  1122. __func__, vdev->host.domain, vdev->host.bus,
  1123. vdev->host.slot, vdev->host.function);
  1124. return;
  1125. }
  1126. size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
  1127. if (!size) {
  1128. return;
  1129. }
  1130. if (vfio_blacklist_opt_rom(vdev)) {
  1131. if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
  1132. error_printf("Warning : Device at %04x:%02x:%02x.%x "
  1133. "is known to cause system instability issues during "
  1134. "option rom execution. "
  1135. "Proceeding anyway since user specified non zero value for "
  1136. "rombar\n",
  1137. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1138. vdev->host.function);
  1139. } else {
  1140. error_printf("Warning : Rom loading for device at "
  1141. "%04x:%02x:%02x.%x has been disabled due to "
  1142. "system instability issues. "
  1143. "Specify rombar=1 or romfile to force\n",
  1144. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1145. vdev->host.function);
  1146. return;
  1147. }
  1148. }
  1149. trace_vfio_pci_size_rom(vdev->host.domain, vdev->host.bus,
  1150. vdev->host.slot, vdev->host.function,
  1151. size);
  1152. snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
  1153. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1154. vdev->host.function);
  1155. memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
  1156. &vfio_rom_ops, vdev, name, size);
  1157. pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
  1158. PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
  1159. vdev->pdev.has_rom = true;
  1160. vdev->rom_read_failed = false;
  1161. }
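/*
 * Slow-path handlers for the legacy VGA ranges (0xa0000 MMIO, 0x3b0 and
 * 0x3c0 I/O).  Accesses are forwarded to the host device with pread()/
 * pwrite() on the VFIO VGA region at the matching offset; device quirks
 * may overlay parts of these regions below.
 */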
  1162. static void vfio_vga_write(void *opaque, hwaddr addr,
  1163. uint64_t data, unsigned size)
  1164. {
  1165. VFIOVGARegion *region = opaque;
  1166. VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
  1167. union {
  1168. uint8_t byte;
  1169. uint16_t word;
  1170. uint32_t dword;
  1171. uint64_t qword;
  1172. } buf;
  1173. off_t offset = vga->fd_offset + region->offset + addr;
  1174. switch (size) {
  1175. case 1:
  1176. buf.byte = data;
  1177. break;
  1178. case 2:
  1179. buf.word = cpu_to_le16(data);
  1180. break;
  1181. case 4:
  1182. buf.dword = cpu_to_le32(data);
  1183. break;
  1184. default:
  1185. hw_error("vfio: unsupported write size, %d bytes", size);
  1186. break;
  1187. }
  1188. if (pwrite(vga->fd, &buf, size, offset) != size) {
  1189. error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
  1190. __func__, region->offset + addr, data, size);
  1191. }
  1192. trace_vfio_vga_write(region->offset + addr, data, size);
  1193. }
  1194. static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
  1195. {
  1196. VFIOVGARegion *region = opaque;
  1197. VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
  1198. union {
  1199. uint8_t byte;
  1200. uint16_t word;
  1201. uint32_t dword;
  1202. uint64_t qword;
  1203. } buf;
  1204. uint64_t data = 0;
  1205. off_t offset = vga->fd_offset + region->offset + addr;
  1206. if (pread(vga->fd, &buf, size, offset) != size) {
  1207. error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
  1208. __func__, region->offset + addr, size);
  1209. return (uint64_t)-1;
  1210. }
  1211. switch (size) {
  1212. case 1:
  1213. data = buf.byte;
  1214. break;
  1215. case 2:
  1216. data = le16_to_cpu(buf.word);
  1217. break;
  1218. case 4:
  1219. data = le32_to_cpu(buf.dword);
  1220. break;
  1221. default:
  1222. hw_error("vfio: unsupported read size, %d bytes", size);
  1223. break;
  1224. }
  1225. trace_vfio_vga_read(region->offset + addr, size, data);
  1226. return data;
  1227. }
  1228. static const MemoryRegionOps vfio_vga_ops = {
  1229. .read = vfio_vga_read,
  1230. .write = vfio_vga_write,
  1231. .endianness = DEVICE_LITTLE_ENDIAN,
  1232. };
  1233. /*
  1234. * Device specific quirks
  1235. */
  1236. /* Is range1 fully contained within range2? */
  1237. static bool vfio_range_contained(uint64_t first1, uint64_t len1,
  1238. uint64_t first2, uint64_t len2) {
  1239. return (first1 >= first2 && first1 + len1 <= first2 + len2);
  1240. }
  1241. static bool vfio_flags_enabled(uint8_t flags, uint8_t mask)
  1242. {
  1243. return (mask && (flags & mask) == mask);
  1244. }
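/*
 * Generic "window" quirk: several devices expose a backdoor to their PCI
 * config space through an address/data register pair in a BAR.  Writes to
 * the address register are snooped; when the programmed address falls in
 * the config space mirror (address_match/address_mask), data register
 * accesses are redirected to the emulated config space handlers instead of
 * the device.  Anything else passes straight through to the BAR.
 *
 * Hypothetical guest access pattern (ATI BAR4 window, for illustration):
 *   outl(0x4000 + reg, io_bar4 + 0x0);   // program the address register
 *   val = inl(io_bar4 + 0x4);            // data read -> vfio_pci_read_config()
 */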
  1245. static uint64_t vfio_generic_window_quirk_read(void *opaque,
  1246. hwaddr addr, unsigned size)
  1247. {
  1248. VFIOQuirk *quirk = opaque;
  1249. VFIODevice *vdev = quirk->vdev;
  1250. uint64_t data;
  1251. if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
  1252. ranges_overlap(addr, size,
  1253. quirk->data.data_offset, quirk->data.data_size)) {
  1254. hwaddr offset = addr - quirk->data.data_offset;
  1255. if (!vfio_range_contained(addr, size, quirk->data.data_offset,
  1256. quirk->data.data_size)) {
  1257. hw_error("%s: window data read not fully contained: %s",
  1258. __func__, memory_region_name(&quirk->mem));
  1259. }
  1260. data = vfio_pci_read_config(&vdev->pdev,
  1261. quirk->data.address_val + offset, size);
  1262. trace_vfio_generic_window_quirk_read(memory_region_name(&quirk->mem),
  1263. vdev->host.domain,
  1264. vdev->host.bus,
  1265. vdev->host.slot,
  1266. vdev->host.function,
  1267. quirk->data.bar,
  1268. addr, size, data);
  1269. } else {
  1270. data = vfio_bar_read(&vdev->bars[quirk->data.bar],
  1271. addr + quirk->data.base_offset, size);
  1272. }
  1273. return data;
  1274. }
  1275. static void vfio_generic_window_quirk_write(void *opaque, hwaddr addr,
  1276. uint64_t data, unsigned size)
  1277. {
  1278. VFIOQuirk *quirk = opaque;
  1279. VFIODevice *vdev = quirk->vdev;
  1280. if (ranges_overlap(addr, size,
  1281. quirk->data.address_offset, quirk->data.address_size)) {
  1282. if (addr != quirk->data.address_offset) {
  1283. hw_error("%s: offset write into address window: %s",
  1284. __func__, memory_region_name(&quirk->mem));
  1285. }
  1286. if ((data & ~quirk->data.address_mask) == quirk->data.address_match) {
  1287. quirk->data.flags |= quirk->data.write_flags |
  1288. quirk->data.read_flags;
  1289. quirk->data.address_val = data & quirk->data.address_mask;
  1290. } else {
  1291. quirk->data.flags &= ~(quirk->data.write_flags |
  1292. quirk->data.read_flags);
  1293. }
  1294. }
  1295. if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
  1296. ranges_overlap(addr, size,
  1297. quirk->data.data_offset, quirk->data.data_size)) {
  1298. hwaddr offset = addr - quirk->data.data_offset;
  1299. if (!vfio_range_contained(addr, size, quirk->data.data_offset,
  1300. quirk->data.data_size)) {
  1301. hw_error("%s: window data write not fully contained: %s",
  1302. __func__, memory_region_name(&quirk->mem));
  1303. }
  1304. vfio_pci_write_config(&vdev->pdev,
  1305. quirk->data.address_val + offset, data, size);
  1306. trace_vfio_generic_window_quirk_write(memory_region_name(&quirk->mem),
  1307. vdev->host.domain,
  1308. vdev->host.bus,
  1309. vdev->host.slot,
  1310. vdev->host.function,
  1311. quirk->data.bar,
  1312. addr, data, size);
  1313. return;
  1314. }
  1315. vfio_bar_write(&vdev->bars[quirk->data.bar],
  1316. addr + quirk->data.base_offset, data, size);
  1317. }
  1318. static const MemoryRegionOps vfio_generic_window_quirk = {
  1319. .read = vfio_generic_window_quirk_read,
  1320. .write = vfio_generic_window_quirk_write,
  1321. .endianness = DEVICE_LITTLE_ENDIAN,
  1322. };
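/*
 * Generic "mirror" quirk: the device exposes a fixed mirror of its PCI
 * config space inside a BAR at address_match, covering address_mask + 1
 * bytes.  Accesses that fall within the mirror are redirected to the
 * emulated config space handlers; everything else in the quirk's
 * page-aligned window passes through to the BAR.
 */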
  1323. static uint64_t vfio_generic_quirk_read(void *opaque,
  1324. hwaddr addr, unsigned size)
  1325. {
  1326. VFIOQuirk *quirk = opaque;
  1327. VFIODevice *vdev = quirk->vdev;
  1328. hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
  1329. hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
  1330. uint64_t data;
  1331. if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
  1332. ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
  1333. if (!vfio_range_contained(addr, size, offset,
  1334. quirk->data.address_mask + 1)) {
  1335. hw_error("%s: read not fully contained: %s",
  1336. __func__, memory_region_name(&quirk->mem));
  1337. }
  1338. data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);
  1339. trace_vfio_generic_quirk_read(memory_region_name(&quirk->mem),
  1340. vdev->host.domain,
  1341. vdev->host.bus,
  1342. vdev->host.slot,
  1343. vdev->host.function,
  1344. quirk->data.bar,
  1345. addr + base, size, data);
  1346. } else {
  1347. data = vfio_bar_read(&vdev->bars[quirk->data.bar], addr + base, size);
  1348. }
  1349. return data;
  1350. }
  1351. static void vfio_generic_quirk_write(void *opaque, hwaddr addr,
  1352. uint64_t data, unsigned size)
  1353. {
  1354. VFIOQuirk *quirk = opaque;
  1355. VFIODevice *vdev = quirk->vdev;
  1356. hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
  1357. hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
  1358. if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
  1359. ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
  1360. if (!vfio_range_contained(addr, size, offset,
  1361. quirk->data.address_mask + 1)) {
  1362. hw_error("%s: write not fully contained: %s",
  1363. __func__, memory_region_name(&quirk->mem));
  1364. }
  1365. vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);
  1366. trace_vfio_generic_quirk_write(memory_region_name(&quirk->mem),
  1367. vdev->host.domain,
  1368. vdev->host.bus,
  1369. vdev->host.slot,
  1370. vdev->host.function,
  1371. quirk->data.bar,
  1372. addr + base, data, size);
  1373. } else {
  1374. vfio_bar_write(&vdev->bars[quirk->data.bar], addr + base, data, size);
  1375. }
  1376. }
  1377. static const MemoryRegionOps vfio_generic_quirk = {
  1378. .read = vfio_generic_quirk_read,
  1379. .write = vfio_generic_quirk_write,
  1380. .endianness = DEVICE_LITTLE_ENDIAN,
  1381. };
  1382. #define PCI_VENDOR_ID_ATI 0x1002
  1383. /*
  1384. * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
  1385. * through VGA register 0x3c3. On newer cards, the I/O port BAR is always
  1386. * BAR4 (older cards like the X550 used BAR1, but we don't care to support
  1387. * those). Note that on bare metal, a read of 0x3c3 doesn't always return the
  1388. * I/O port BAR address. Originally this was coded to return the virtual BAR
  1389. * address only if the physical register read returns the actual BAR address,
  1390. * but users have reported greater success if we return the virtual address
  1391. * unconditionally.
  1392. */
  1393. static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
  1394. hwaddr addr, unsigned size)
  1395. {
  1396. VFIOQuirk *quirk = opaque;
  1397. VFIODevice *vdev = quirk->vdev;
  1398. uint64_t data = vfio_pci_read_config(&vdev->pdev,
  1399. PCI_BASE_ADDRESS_0 + (4 * 4) + 1,
  1400. size);
  1401. trace_vfio_ati_3c3_quirk_read(data);
  1402. return data;
  1403. }
  1404. static const MemoryRegionOps vfio_ati_3c3_quirk = {
  1405. .read = vfio_ati_3c3_quirk_read,
  1406. .endianness = DEVICE_LITTLE_ENDIAN,
  1407. };
  1408. static void vfio_vga_probe_ati_3c3_quirk(VFIODevice *vdev)
  1409. {
  1410. PCIDevice *pdev = &vdev->pdev;
  1411. VFIOQuirk *quirk;
  1412. if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
  1413. return;
  1414. }
  1415. /*
  1416. * As long as the BAR is >= 256 bytes it will be aligned such that the
  1417. * lower byte is always zero. Filter out anything else, if it exists.
  1418. */
  1419. if (!vdev->bars[4].ioport || vdev->bars[4].size < 256) {
  1420. return;
  1421. }
  1422. quirk = g_malloc0(sizeof(*quirk));
  1423. quirk->vdev = vdev;
  1424. memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, quirk,
  1425. "vfio-ati-3c3-quirk", 1);
  1426. memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
  1427. 3 /* offset 3 bytes from 0x3c0 */, &quirk->mem);
  1428. QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
  1429. quirk, next);
  1430. trace_vfio_vga_probe_ati_3c3_quirk(vdev->host.domain, vdev->host.bus,
  1431. vdev->host.slot, vdev->host.function);
  1432. }
  1433. /*
  1434. * Newer ATI/AMD devices, including HD5450 and HD7850, have a window to PCI
  1435. * config space through MMIO BAR2 at offset 0x4000. Nothing seems to access
  1436. * the MMIO space directly, but a window to this space is provided through
  1437. * I/O port BAR4. Offset 0x0 is the address register and offset 0x4 is the
1438. * data register. When the address is programmed to a range of 0x4000-0x4fff,
1439. * PCI configuration space is available. Experimentation seems to indicate
  1440. * that only read-only access is provided, but we drop writes when the window
  1441. * is enabled to config space nonetheless.
  1442. */
  1443. static void vfio_probe_ati_bar4_window_quirk(VFIODevice *vdev, int nr)
  1444. {
  1445. PCIDevice *pdev = &vdev->pdev;
  1446. VFIOQuirk *quirk;
  1447. if (!vdev->has_vga || nr != 4 ||
  1448. pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
  1449. return;
  1450. }
  1451. quirk = g_malloc0(sizeof(*quirk));
  1452. quirk->vdev = vdev;
  1453. quirk->data.address_size = 4;
  1454. quirk->data.data_offset = 4;
  1455. quirk->data.data_size = 4;
  1456. quirk->data.address_match = 0x4000;
  1457. quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
  1458. quirk->data.bar = nr;
  1459. quirk->data.read_flags = quirk->data.write_flags = 1;
  1460. memory_region_init_io(&quirk->mem, OBJECT(vdev),
  1461. &vfio_generic_window_quirk, quirk,
  1462. "vfio-ati-bar4-window-quirk", 8);
  1463. memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
  1464. quirk->data.base_offset, &quirk->mem, 1);
  1465. QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
  1466. trace_vfio_probe_ati_bar4_window_quirk(vdev->host.domain,
  1467. vdev->host.bus,
  1468. vdev->host.slot,
  1469. vdev->host.function);
  1470. }
  1471. #define PCI_VENDOR_ID_REALTEK 0x10ec
  1472. /*
  1473. * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2
  1474. * offset 0x70 there is a dword data register, offset 0x74 is a dword address
  1475. * register. According to the Linux r8169 driver, the MSI-X table is addressed
  1476. * when the "type" portion of the address register is set to 0x1. This appears
  1477. * to be bits 16:30. Bit 31 is both a write indicator and some sort of
  1478. * "address latched" indicator. Bits 12:15 are a mask field, which we can
  1479. * ignore because the MSI-X table should always be accessed as a dword (full
1480. * mask). Bits 0:11 are the offset within the type.
  1481. *
  1482. * Example trace:
  1483. *
  1484. * Read from MSI-X table offset 0
  1485. * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
  1486. * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
  1487. * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
  1488. *
  1489. * Write 0xfee00000 to MSI-X table offset 0
  1490. * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
  1491. * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
  1492. * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
  1493. */
  1494. static uint64_t vfio_rtl8168_window_quirk_read(void *opaque,
  1495. hwaddr addr, unsigned size)
  1496. {
  1497. VFIOQuirk *quirk = opaque;
  1498. VFIODevice *vdev = quirk->vdev;
  1499. switch (addr) {
  1500. case 4: /* address */
  1501. if (quirk->data.flags) {
  1502. trace_vfio_rtl8168_window_quirk_read_fake(
  1503. memory_region_name(&quirk->mem),
  1504. vdev->host.domain, vdev->host.bus,
  1505. vdev->host.slot, vdev->host.function);
1506. return quirk->data.address_match ^ 0x80000000U; /* toggle bit 31, the "latched" flag */
  1507. }
  1508. break;
  1509. case 0: /* data */
  1510. if (quirk->data.flags) {
  1511. uint64_t val;
  1512. trace_vfio_rtl8168_window_quirk_read_table(
  1513. memory_region_name(&quirk->mem),
  1514. vdev->host.domain, vdev->host.bus,
  1515. vdev->host.slot, vdev->host.function
  1516. );
  1517. if (!(vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
  1518. return 0;
  1519. }
  1520. io_mem_read(&vdev->pdev.msix_table_mmio,
  1521. (hwaddr)(quirk->data.address_match & 0xfff),
  1522. &val, size);
  1523. return val;
  1524. }
  1525. }
  1526. trace_vfio_rtl8168_window_quirk_read_direct(
  1527. memory_region_name(&quirk->mem),
  1528. vdev->host.domain, vdev->host.bus,
  1529. vdev->host.slot, vdev->host.function);
  1530. return vfio_bar_read(&vdev->bars[quirk->data.bar], addr + 0x70, size);
  1531. }
  1532. static void vfio_rtl8168_window_quirk_write(void *opaque, hwaddr addr,
  1533. uint64_t data, unsigned size)
  1534. {
  1535. VFIOQuirk *quirk = opaque;
  1536. VFIODevice *vdev = quirk->vdev;
  1537. switch (addr) {
  1538. case 4: /* address */
  1539. if ((data & 0x7fff0000) == 0x10000) {
1540. if (data & 0x80000000U &&
  1541. vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
  1542. trace_vfio_rtl8168_window_quirk_write_table(
  1543. memory_region_name(&quirk->mem),
  1544. vdev->host.domain, vdev->host.bus,
  1545. vdev->host.slot, vdev->host.function);
  1546. io_mem_write(&vdev->pdev.msix_table_mmio,
  1547. (hwaddr)(quirk->data.address_match & 0xfff),
  1548. data, size);
  1549. }
  1550. quirk->data.flags = 1;
  1551. quirk->data.address_match = data;
  1552. return;
  1553. }
  1554. quirk->data.flags = 0;
  1555. break;
  1556. case 0: /* data */
  1557. quirk->data.address_mask = data;
  1558. break;
  1559. }
  1560. trace_vfio_rtl8168_window_quirk_write_direct(
  1561. memory_region_name(&quirk->mem),
  1562. vdev->host.domain, vdev->host.bus,
  1563. vdev->host.slot, vdev->host.function);
  1564. vfio_bar_write(&vdev->bars[quirk->data.bar], addr + 0x70, data, size);
  1565. }
  1566. static const MemoryRegionOps vfio_rtl8168_window_quirk = {
  1567. .read = vfio_rtl8168_window_quirk_read,
  1568. .write = vfio_rtl8168_window_quirk_write,
  1569. .valid = {
  1570. .min_access_size = 4,
  1571. .max_access_size = 4,
  1572. .unaligned = false,
  1573. },
  1574. .endianness = DEVICE_LITTLE_ENDIAN,
  1575. };
  1576. static void vfio_probe_rtl8168_bar2_window_quirk(VFIODevice *vdev, int nr)
  1577. {
  1578. PCIDevice *pdev = &vdev->pdev;
  1579. VFIOQuirk *quirk;
  1580. if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_REALTEK ||
  1581. pci_get_word(pdev->config + PCI_DEVICE_ID) != 0x8168 || nr != 2) {
  1582. return;
  1583. }
  1584. quirk = g_malloc0(sizeof(*quirk));
  1585. quirk->vdev = vdev;
  1586. quirk->data.bar = nr;
  1587. memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_rtl8168_window_quirk,
  1588. quirk, "vfio-rtl8168-window-quirk", 8);
  1589. memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
  1590. 0x70, &quirk->mem, 1);
  1591. QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
  1592. trace_vfio_probe_rtl8168_bar2_window_quirk(vdev->host.domain,
  1593. vdev->host.bus,
  1594. vdev->host.slot,
  1595. vdev->host.function);
  1596. }
  1597. /*
  1598. * Trap the BAR2 MMIO window to config space as well.
  1599. */
  1600. static void vfio_probe_ati_bar2_4000_quirk(VFIODevice *vdev, int nr)
  1601. {
  1602. PCIDevice *pdev = &vdev->pdev;
  1603. VFIOQuirk *quirk;
  1604. /* Only enable on newer devices where BAR2 is 64bit */
  1605. if (!vdev->has_vga || nr != 2 || !vdev->bars[2].mem64 ||
  1606. pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
  1607. return;
  1608. }
  1609. quirk = g_malloc0(sizeof(*quirk));
  1610. quirk->vdev = vdev;
  1611. quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
  1612. quirk->data.address_match = 0x4000;
  1613. quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
  1614. quirk->data.bar = nr;
  1615. memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
  1616. "vfio-ati-bar2-4000-quirk",
  1617. TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
  1618. memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
  1619. quirk->data.address_match & TARGET_PAGE_MASK,
  1620. &quirk->mem, 1);
  1621. QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
  1622. trace_vfio_probe_ati_bar2_4000_quirk(vdev->host.domain,
  1623. vdev->host.bus,
  1624. vdev->host.slot,
  1625. vdev->host.function);
  1626. }
  1627. /*
  1628. * Older ATI/AMD cards like the X550 have a similar window to that above.
  1629. * I/O port BAR1 provides a window to a mirror of PCI config space located
  1630. * in BAR2 at offset 0xf00. We don't care to support such older cards, but
  1631. * note it for future reference.
  1632. */
  1633. #define PCI_VENDOR_ID_NVIDIA 0x10de
  1634. /*
1635. * Nvidia has several different methods to get to config space; the
1636. * nouveau project has several of these documented here:
  1637. * https://github.com/pathscale/envytools/tree/master/hwdocs
  1638. *
  1639. * The first quirk is actually not documented in envytools and is found
  1640. * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an
  1641. * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access
  1642. * the mirror of PCI config space found at BAR0 offset 0x1800. The access
  1643. * sequence first writes 0x338 to I/O port 0x3d4. The target offset is
  1644. * then written to 0x3d0. Finally 0x538 is written for a read and 0x738
  1645. * is written for a write to 0x3d4. The BAR0 offset is then accessible
  1646. * through 0x3d0. This quirk doesn't seem to be necessary on newer cards
  1647. * that use the I/O port BAR5 window but it doesn't hurt to leave it.
  1648. */
  1649. enum {
  1650. NV_3D0_NONE = 0,
  1651. NV_3D0_SELECT,
  1652. NV_3D0_WINDOW,
  1653. NV_3D0_READ,
  1654. NV_3D0_WRITE,
  1655. };
  1656. static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
  1657. hwaddr addr, unsigned size)
  1658. {
  1659. VFIOQuirk *quirk = opaque;
  1660. VFIODevice *vdev = quirk->vdev;
  1661. PCIDevice *pdev = &vdev->pdev;
  1662. uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
  1663. addr + quirk->data.base_offset, size);
  1664. if (quirk->data.flags == NV_3D0_READ && addr == quirk->data.data_offset) {
  1665. data = vfio_pci_read_config(pdev, quirk->data.address_val, size);
  1666. trace_vfio_nvidia_3d0_quirk_read(size, data);
  1667. }
  1668. quirk->data.flags = NV_3D0_NONE;
  1669. return data;
  1670. }
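/*
 * The write handler below implements the access sequence described above as
 * a small state machine: writing 0x338 to 0x3d4 arms SELECT; a config space
 * offset in the mirror range (0x1800-0x18ff) written to 0x3d0 arms WINDOW;
 * 0x538 or 0x738 written to 0x3d4 then selects READ or WRITE, and the
 * following 0x3d0 access is redirected to emulated config space.  Any other
 * access drops the state back to NV_3D0_NONE.
 */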
  1671. static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
  1672. uint64_t data, unsigned size)
  1673. {
  1674. VFIOQuirk *quirk = opaque;
  1675. VFIODevice *vdev = quirk->vdev;
  1676. PCIDevice *pdev = &vdev->pdev;
  1677. switch (quirk->data.flags) {
  1678. case NV_3D0_NONE:
  1679. if (addr == quirk->data.address_offset && data == 0x338) {
  1680. quirk->data.flags = NV_3D0_SELECT;
  1681. }
  1682. break;
  1683. case NV_3D0_SELECT:
  1684. quirk->data.flags = NV_3D0_NONE;
  1685. if (addr == quirk->data.data_offset &&
  1686. (data & ~quirk->data.address_mask) == quirk->data.address_match) {
  1687. quirk->data.flags = NV_3D0_WINDOW;
  1688. quirk->data.address_val = data & quirk->data.address_mask;
  1689. }
  1690. break;
  1691. case NV_3D0_WINDOW:
  1692. quirk->data.flags = NV_3D0_NONE;
  1693. if (addr == quirk->data.address_offset) {
  1694. if (data == 0x538) {
  1695. quirk->data.flags = NV_3D0_READ;
  1696. } else if (data == 0x738) {
  1697. quirk->data.flags = NV_3D0_WRITE;
  1698. }
  1699. }
  1700. break;
  1701. case NV_3D0_WRITE:
  1702. quirk->data.flags = NV_3D0_NONE;
  1703. if (addr == quirk->data.data_offset) {
  1704. vfio_pci_write_config(pdev, quirk->data.address_val, data, size);
  1705. trace_vfio_nvidia_3d0_quirk_write(data, size);
  1706. return;
  1707. }
  1708. break;
  1709. }
  1710. vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
  1711. addr + quirk->data.base_offset, data, size);
  1712. }
  1713. static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
  1714. .read = vfio_nvidia_3d0_quirk_read,
  1715. .write = vfio_nvidia_3d0_quirk_write,
  1716. .endianness = DEVICE_LITTLE_ENDIAN,
  1717. };
  1718. static void vfio_vga_probe_nvidia_3d0_quirk(VFIODevice *vdev)
  1719. {
  1720. PCIDevice *pdev = &vdev->pdev;
  1721. VFIOQuirk *quirk;
  1722. if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA ||
  1723. !vdev->bars[1].size) {
  1724. return;
  1725. }
  1726. quirk = g_malloc0(sizeof(*quirk));
  1727. quirk->vdev = vdev;
  1728. quirk->data.base_offset = 0x10;
  1729. quirk->data.address_offset = 4;
  1730. quirk->data.address_size = 2;
  1731. quirk->data.address_match = 0x1800;
  1732. quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
  1733. quirk->data.data_offset = 0;
  1734. quirk->data.data_size = 4;
  1735. memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_3d0_quirk,
  1736. quirk, "vfio-nvidia-3d0-quirk", 6);
  1737. memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
  1738. quirk->data.base_offset, &quirk->mem);
  1739. QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
  1740. quirk, next);
  1741. trace_vfio_vga_probe_nvidia_3d0_quirk(vdev->host.domain,
  1742. vdev->host.bus,
  1743. vdev->host.slot,
  1744. vdev->host.function);
  1745. }
  1746. /*
  1747. * The second quirk is documented in envytools. The I/O port BAR5 is just
  1748. * a set of address/data ports to the MMIO BARs. The BAR we care about is
  1749. * again BAR0. This backdoor is apparently a bit newer than the one above
1750. * so we need to trap not only the 256 bytes @0x1800, but all of PCI config
1751. * space, including extended config space, which is available at 4k @0x88000.
  1752. */
  1753. enum {
  1754. NV_BAR5_ADDRESS = 0x1,
  1755. NV_BAR5_ENABLE = 0x2,
  1756. NV_BAR5_MASTER = 0x4,
  1757. NV_BAR5_VALID = 0x7,
  1758. };
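/*
 * State tracking for the BAR5 window: bit 0 of offsets 0x0 and 0x4 toggles
 * the MASTER and ENABLE flags, and a write to the address register at
 * offset 0x8 targeting one of the config space mirrors (0x88000 or 0x1800)
 * while MASTER is set latches ADDRESS.  Only when all three bits are set
 * (NV_BAR5_VALID) does the generic window code redirect data register
 * accesses at offset 0xc to config space.
 */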
  1759. static void vfio_nvidia_bar5_window_quirk_write(void *opaque, hwaddr addr,
  1760. uint64_t data, unsigned size)
  1761. {
  1762. VFIOQuirk *quirk = opaque;
  1763. switch (addr) {
  1764. case 0x0:
  1765. if (data & 0x1) {
  1766. quirk->data.flags |= NV_BAR5_MASTER;
  1767. } else {
  1768. quirk->data.flags &= ~NV_BAR5_MASTER;
  1769. }
  1770. break;
  1771. case 0x4:
  1772. if (data & 0x1) {
  1773. quirk->data.flags |= NV_BAR5_ENABLE;
  1774. } else {
  1775. quirk->data.flags &= ~NV_BAR5_ENABLE;
  1776. }
  1777. break;
  1778. case 0x8:
  1779. if (quirk->data.flags & NV_BAR5_MASTER) {
  1780. if ((data & ~0xfff) == 0x88000) {
  1781. quirk->data.flags |= NV_BAR5_ADDRESS;
  1782. quirk->data.address_val = data & 0xfff;
  1783. } else if ((data & ~0xff) == 0x1800) {
  1784. quirk->data.flags |= NV_BAR5_ADDRESS;
  1785. quirk->data.address_val = data & 0xff;
  1786. } else {
  1787. quirk->data.flags &= ~NV_BAR5_ADDRESS;
  1788. }
  1789. }
  1790. break;
  1791. }
  1792. vfio_generic_window_quirk_write(opaque, addr, data, size);
  1793. }
  1794. static const MemoryRegionOps vfio_nvidia_bar5_window_quirk = {
  1795. .read = vfio_generic_window_quirk_read,
  1796. .write = vfio_nvidia_bar5_window_quirk_write,
  1797. .valid.min_access_size = 4,
  1798. .endianness = DEVICE_LITTLE_ENDIAN,
  1799. };
  1800. static void vfio_probe_nvidia_bar5_window_quirk(VFIODevice *vdev, int nr)
  1801. {
  1802. PCIDevice *pdev = &vdev->pdev;
  1803. VFIOQuirk *quirk;
  1804. if (!vdev->has_vga || nr != 5 ||
  1805. pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
  1806. return;
  1807. }
  1808. quirk = g_malloc0(sizeof(*quirk));
  1809. quirk->vdev = vdev;
  1810. quirk->data.read_flags = quirk->data.write_flags = NV_BAR5_VALID;
  1811. quirk->data.address_offset = 0x8;
  1812. quirk->data.address_size = 0; /* actually 4, but avoids generic code */
  1813. quirk->data.data_offset = 0xc;
  1814. quirk->data.data_size = 4;
  1815. quirk->data.bar = nr;
  1816. memory_region_init_io(&quirk->mem, OBJECT(vdev),
  1817. &vfio_nvidia_bar5_window_quirk, quirk,
  1818. "vfio-nvidia-bar5-window-quirk", 16);
  1819. memory_region_add_subregion_overlap(&vdev->bars[nr].mem, 0, &quirk->mem, 1);
  1820. QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
  1821. trace_vfio_probe_nvidia_bar5_window_quirk(vdev->host.domain,
  1822. vdev->host.bus,
  1823. vdev->host.slot,
  1824. vdev->host.function);
  1825. }
  1826. static void vfio_nvidia_88000_quirk_write(void *opaque, hwaddr addr,
  1827. uint64_t data, unsigned size)
  1828. {
  1829. VFIOQuirk *quirk = opaque;
  1830. VFIODevice *vdev = quirk->vdev;
  1831. PCIDevice *pdev = &vdev->pdev;
  1832. hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
  1833. vfio_generic_quirk_write(opaque, addr, data, size);
  1834. /*
  1835. * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
  1836. * MSI capability ID register. Both the ID and next register are
  1837. * read-only, so we allow writes covering either of those to real hw.
  1838. * NB - only fixed for the 0x88000 MMIO window.
  1839. */
  1840. if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
  1841. vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
  1842. vfio_bar_write(&vdev->bars[quirk->data.bar], addr + base, data, size);
  1843. }
  1844. }
  1845. static const MemoryRegionOps vfio_nvidia_88000_quirk = {
  1846. .read = vfio_generic_quirk_read,
  1847. .write = vfio_nvidia_88000_quirk_write,
  1848. .endianness = DEVICE_LITTLE_ENDIAN,
  1849. };
  1850. /*
  1851. * Finally, BAR0 itself. We want to redirect any accesses to either
  1852. * 0x1800 or 0x88000 through the PCI config space access functions.
  1853. *
  1854. * NB - quirk at a page granularity or else they don't seem to work when
  1855. * BARs are mmap'd
  1856. *
  1857. * Here's offset 0x88000...
  1858. */
  1859. static void vfio_probe_nvidia_bar0_88000_quirk(VFIODevice *vdev, int nr)
  1860. {
  1861. PCIDevice *pdev = &vdev->pdev;
  1862. VFIOQuirk *quirk;
  1863. uint16_t vendor, class;
  1864. vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
  1865. class = pci_get_word(pdev->config + PCI_CLASS_DEVICE);
  1866. if (nr != 0 || vendor != PCI_VENDOR_ID_NVIDIA ||
  1867. class != PCI_CLASS_DISPLAY_VGA) {
  1868. return;
  1869. }
  1870. quirk = g_malloc0(sizeof(*quirk));
  1871. quirk->vdev = vdev;
  1872. quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
  1873. quirk->data.address_match = 0x88000;
  1874. quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
  1875. quirk->data.bar = nr;
  1876. memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_88000_quirk,
  1877. quirk, "vfio-nvidia-bar0-88000-quirk",
  1878. TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
  1879. memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
  1880. quirk->data.address_match & TARGET_PAGE_MASK,
  1881. &quirk->mem, 1);
  1882. QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
  1883. trace_vfio_probe_nvidia_bar0_88000_quirk(vdev->host.domain,
  1884. vdev->host.bus,
  1885. vdev->host.slot,
  1886. vdev->host.function);
  1887. }
  1888. /*
  1889. * And here's the same for BAR0 offset 0x1800...
  1890. */
  1891. static void vfio_probe_nvidia_bar0_1800_quirk(VFIODevice *vdev, int nr)
  1892. {
  1893. PCIDevice *pdev = &vdev->pdev;
  1894. VFIOQuirk *quirk;
  1895. if (!vdev->has_vga || nr != 0 ||
  1896. pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
  1897. return;
  1898. }
  1899. /* Log the chipset ID */
  1900. trace_vfio_probe_nvidia_bar0_1800_quirk_id(
  1901. (unsigned int)(vfio_bar_read(&vdev->bars[0], 0, 4) >> 20) & 0xff);
  1902. quirk = g_malloc0(sizeof(*quirk));
  1903. quirk->vdev = vdev;
  1904. quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
  1905. quirk->data.address_match = 0x1800;
  1906. quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
  1907. quirk->data.bar = nr;
  1908. memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
  1909. "vfio-nvidia-bar0-1800-quirk",
  1910. TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
  1911. memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
  1912. quirk->data.address_match & TARGET_PAGE_MASK,
  1913. &quirk->mem, 1);
  1914. QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
  1915. trace_vfio_probe_nvidia_bar0_1800_quirk(vdev->host.domain,
  1916. vdev->host.bus,
  1917. vdev->host.slot,
  1918. vdev->host.function);
  1919. }
  1920. /*
  1921. * TODO - Some Nvidia devices provide config access to their companion HDA
  1922. * device and even to their parent bridge via these config space mirrors.
  1923. * Add quirks for those regions.
  1924. */
  1925. /*
  1926. * Common quirk probe entry points.
  1927. */
  1928. static void vfio_vga_quirk_setup(VFIODevice *vdev)
  1929. {
  1930. vfio_vga_probe_ati_3c3_quirk(vdev);
  1931. vfio_vga_probe_nvidia_3d0_quirk(vdev);
  1932. }
  1933. static void vfio_vga_quirk_teardown(VFIODevice *vdev)
  1934. {
  1935. int i;
  1936. for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
  1937. while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) {
  1938. VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks);
  1939. memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem);
  1940. object_unparent(OBJECT(&quirk->mem));
  1941. QLIST_REMOVE(quirk, next);
  1942. g_free(quirk);
  1943. }
  1944. }
  1945. }
  1946. static void vfio_bar_quirk_setup(VFIODevice *vdev, int nr)
  1947. {
  1948. vfio_probe_ati_bar4_window_quirk(vdev, nr);
  1949. vfio_probe_ati_bar2_4000_quirk(vdev, nr);
  1950. vfio_probe_nvidia_bar5_window_quirk(vdev, nr);
  1951. vfio_probe_nvidia_bar0_88000_quirk(vdev, nr);
  1952. vfio_probe_nvidia_bar0_1800_quirk(vdev, nr);
  1953. vfio_probe_rtl8168_bar2_window_quirk(vdev, nr);
  1954. }
  1955. static void vfio_bar_quirk_teardown(VFIODevice *vdev, int nr)
  1956. {
  1957. VFIOBAR *bar = &vdev->bars[nr];
  1958. while (!QLIST_EMPTY(&bar->quirks)) {
  1959. VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
  1960. memory_region_del_subregion(&bar->mem, &quirk->mem);
  1961. object_unparent(OBJECT(&quirk->mem));
  1962. QLIST_REMOVE(quirk, next);
  1963. g_free(quirk);
  1964. }
  1965. }
  1966. /*
  1967. * PCI config space
  1968. */
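/*
 * Config reads merge two sources: bits flagged in vdev->emulated_config_bits
 * come from QEMU's emulated config space (pci_default_read_config), while
 * everything else is read from the physical device through the VFIO config
 * region.  This lets us virtualize selected registers (BARs, MSI/MSI-X
 * enables, etc.) and pass the rest straight through.
 */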
  1969. static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
  1970. {
  1971. VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
  1972. uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
  1973. memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
  1974. emu_bits = le32_to_cpu(emu_bits);
  1975. if (emu_bits) {
  1976. emu_val = pci_default_read_config(pdev, addr, len);
  1977. }
  1978. if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
  1979. ssize_t ret;
  1980. ret = pread(vdev->fd, &phys_val, len, vdev->config_offset + addr);
  1981. if (ret != len) {
  1982. error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
  1983. __func__, vdev->host.domain, vdev->host.bus,
  1984. vdev->host.slot, vdev->host.function, addr, len);
  1985. return -errno;
  1986. }
  1987. phys_val = le32_to_cpu(phys_val);
  1988. }
  1989. val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
  1990. trace_vfio_pci_read_config(vdev->host.domain, vdev->host.bus,
  1991. vdev->host.slot, vdev->host.function,
  1992. addr, len, val);
  1993. return val;
  1994. }
  1995. static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
  1996. uint32_t val, int len)
  1997. {
  1998. VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
  1999. uint32_t val_le = cpu_to_le32(val);
  2000. trace_vfio_pci_write_config(vdev->host.domain, vdev->host.bus,
  2001. vdev->host.slot, vdev->host.function,
  2002. addr, val, len);
  2003. /* Write everything to VFIO, let it filter out what we can't write */
  2004. if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
  2005. error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
  2006. __func__, vdev->host.domain, vdev->host.bus,
  2007. vdev->host.slot, vdev->host.function, addr, val, len);
  2008. }
  2009. /* MSI/MSI-X Enabling/Disabling */
  2010. if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
  2011. ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
  2012. int is_enabled, was_enabled = msi_enabled(pdev);
  2013. pci_default_write_config(pdev, addr, val, len);
  2014. is_enabled = msi_enabled(pdev);
  2015. if (!was_enabled) {
  2016. if (is_enabled) {
  2017. vfio_enable_msi(vdev);
  2018. }
  2019. } else {
  2020. if (!is_enabled) {
  2021. vfio_disable_msi(vdev);
  2022. } else {
  2023. vfio_update_msi(vdev);
  2024. }
  2025. }
  2026. } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
  2027. ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
  2028. int is_enabled, was_enabled = msix_enabled(pdev);
  2029. pci_default_write_config(pdev, addr, val, len);
  2030. is_enabled = msix_enabled(pdev);
  2031. if (!was_enabled && is_enabled) {
  2032. vfio_enable_msix(vdev);
  2033. } else if (was_enabled && !is_enabled) {
  2034. vfio_disable_msix(vdev);
  2035. }
  2036. } else {
  2037. /* Write everything to QEMU to keep emulated bits correct */
  2038. pci_default_write_config(pdev, addr, val, len);
  2039. }
  2040. }
  2041. /*
  2042. * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
  2043. */
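/*
 * For the type1 backend the host IOMMU is programmed with an identity map of
 * guest physical addresses: the MemoryListener below replays every RAM
 * region of the guest address space through VFIO_IOMMU_MAP_DMA (and removes
 * it again with VFIO_IOMMU_UNMAP_DMA), so the assigned device can DMA to
 * guest memory using guest physical addresses as IOVAs.  When a guest
 * visible IOMMU is present, mappings are instead driven by the IOMMU
 * notifier (vfio_iommu_map_notify).
 */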
  2044. static int vfio_dma_unmap(VFIOContainer *container,
  2045. hwaddr iova, ram_addr_t size)
  2046. {
  2047. struct vfio_iommu_type1_dma_unmap unmap = {
  2048. .argsz = sizeof(unmap),
  2049. .flags = 0,
  2050. .iova = iova,
  2051. .size = size,
  2052. };
  2053. if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
  2054. error_report("VFIO_UNMAP_DMA: %d\n", -errno);
  2055. return -errno;
  2056. }
  2057. return 0;
  2058. }
  2059. static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
  2060. ram_addr_t size, void *vaddr, bool readonly)
  2061. {
  2062. struct vfio_iommu_type1_dma_map map = {
  2063. .argsz = sizeof(map),
  2064. .flags = VFIO_DMA_MAP_FLAG_READ,
  2065. .vaddr = (__u64)(uintptr_t)vaddr,
  2066. .iova = iova,
  2067. .size = size,
  2068. };
  2069. if (!readonly) {
  2070. map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
  2071. }
  2072. /*
2073. * Try the mapping; if it fails with EBUSY, unmap the region and try
2074. * again. This shouldn't be necessary, but we sometimes see it in
2075. * the VGA ROM space.
  2076. */
  2077. if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
  2078. (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
  2079. ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
  2080. return 0;
  2081. }
  2082. error_report("VFIO_MAP_DMA: %d\n", -errno);
  2083. return -errno;
  2084. }
  2085. static bool vfio_listener_skipped_section(MemoryRegionSection *section)
  2086. {
  2087. return (!memory_region_is_ram(section->mr) &&
  2088. !memory_region_is_iommu(section->mr)) ||
  2089. /*
  2090. * Sizing an enabled 64-bit BAR can cause spurious mappings to
  2091. * addresses in the upper part of the 64-bit address space. These
  2092. * are never accessed by the CPU and beyond the address width of
  2093. * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
  2094. */
  2095. section->offset_within_address_space & (1ULL << 63);
  2096. }
  2097. static void vfio_iommu_map_notify(Notifier *n, void *data)
  2098. {
  2099. VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
  2100. VFIOContainer *container = giommu->container;
  2101. IOMMUTLBEntry *iotlb = data;
  2102. MemoryRegion *mr;
  2103. hwaddr xlat;
  2104. hwaddr len = iotlb->addr_mask + 1;
  2105. void *vaddr;
  2106. int ret;
  2107. trace_vfio_iommu_map_notify(iotlb->iova,
  2108. iotlb->iova + iotlb->addr_mask);
  2109. /*
  2110. * The IOMMU TLB entry we have just covers translation through
  2111. * this IOMMU to its immediate target. We need to translate
  2112. * it the rest of the way through to memory.
  2113. */
  2114. mr = address_space_translate(&address_space_memory,
  2115. iotlb->translated_addr,
  2116. &xlat, &len, iotlb->perm & IOMMU_WO);
  2117. if (!memory_region_is_ram(mr)) {
  2118. error_report("iommu map to non memory area %"HWADDR_PRIx"\n",
  2119. xlat);
  2120. return;
  2121. }
  2122. /*
  2123. * Translation truncates length to the IOMMU page size,
  2124. * check that it did not truncate too much.
  2125. */
  2126. if (len & iotlb->addr_mask) {
  2127. error_report("iommu has granularity incompatible with target AS\n");
  2128. return;
  2129. }
  2130. if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
  2131. vaddr = memory_region_get_ram_ptr(mr) + xlat;
  2132. ret = vfio_dma_map(container, iotlb->iova,
  2133. iotlb->addr_mask + 1, vaddr,
  2134. !(iotlb->perm & IOMMU_WO) || mr->readonly);
  2135. if (ret) {
  2136. error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
  2137. "0x%"HWADDR_PRIx", %p) = %d (%m)",
  2138. container, iotlb->iova,
  2139. iotlb->addr_mask + 1, vaddr, ret);
  2140. }
  2141. } else {
  2142. ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
  2143. if (ret) {
  2144. error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
  2145. "0x%"HWADDR_PRIx") = %d (%m)",
  2146. container, iotlb->iova,
  2147. iotlb->addr_mask + 1, ret);
  2148. }
  2149. }
  2150. }
  2151. static void vfio_listener_region_add(MemoryListener *listener,
  2152. MemoryRegionSection *section)
  2153. {
  2154. VFIOContainer *container = container_of(listener, VFIOContainer,
  2155. iommu_data.type1.listener);
  2156. hwaddr iova, end;
  2157. Int128 llend;
  2158. void *vaddr;
  2159. int ret;
  2160. if (vfio_listener_skipped_section(section)) {
  2161. trace_vfio_listener_region_add_skip(
  2162. section->offset_within_address_space,
  2163. section->offset_within_address_space +
  2164. int128_get64(int128_sub(section->size, int128_one())));
  2165. return;
  2166. }
  2167. if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
  2168. (section->offset_within_region & ~TARGET_PAGE_MASK))) {
  2169. error_report("%s received unaligned region", __func__);
  2170. return;
  2171. }
  2172. iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
  2173. llend = int128_make64(section->offset_within_address_space);
  2174. llend = int128_add(llend, section->size);
  2175. llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
  2176. if (int128_ge(int128_make64(iova), llend)) {
  2177. return;
  2178. }
  2179. memory_region_ref(section->mr);
  2180. if (memory_region_is_iommu(section->mr)) {
  2181. VFIOGuestIOMMU *giommu;
  2182. trace_vfio_listener_region_add_iommu(iova,
  2183. int128_get64(int128_sub(llend, int128_one())));
  2184. /*
  2185. * FIXME: We should do some checking to see if the
  2186. * capabilities of the host VFIO IOMMU are adequate to model
  2187. * the guest IOMMU
  2188. *
  2189. * FIXME: For VFIO iommu types which have KVM acceleration to
  2190. * avoid bouncing all map/unmaps through qemu this way, this
  2191. * would be the right place to wire that up (tell the KVM
  2192. * device emulation the VFIO iommu handles to use).
  2193. */
  2194. /*
  2195. * This assumes that the guest IOMMU is empty of
  2196. * mappings at this point.
  2197. *
  2198. * One way of doing this is:
  2199. * 1. Avoid sharing IOMMUs between emulated devices or different
  2200. * IOMMU groups.
  2201. * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
  2202. * there are some mappings in IOMMU.
  2203. *
2204. * VFIO on SPAPR does that. Other IOMMU models may handle this differently;
2205. * they must make sure there are no existing mappings or
  2206. * loop through existing mappings to map them into VFIO.
  2207. */
  2208. giommu = g_malloc0(sizeof(*giommu));
  2209. giommu->iommu = section->mr;
  2210. giommu->container = container;
  2211. giommu->n.notify = vfio_iommu_map_notify;
  2212. QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
  2213. memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
  2214. return;
  2215. }
  2216. /* Here we assume that memory_region_is_ram(section->mr)==true */
  2217. end = int128_get64(llend);
  2218. vaddr = memory_region_get_ram_ptr(section->mr) +
  2219. section->offset_within_region +
  2220. (iova - section->offset_within_address_space);
  2221. trace_vfio_listener_region_add_ram(iova, end - 1, vaddr);
  2222. ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
  2223. if (ret) {
  2224. error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
  2225. "0x%"HWADDR_PRIx", %p) = %d (%m)",
  2226. container, iova, end - iova, vaddr, ret);
  2227. /*
  2228. * On the initfn path, store the first error in the container so we
  2229. * can gracefully fail. Runtime, there's not much we can do other
  2230. * than throw a hardware error.
  2231. */
  2232. if (!container->iommu_data.type1.initialized) {
  2233. if (!container->iommu_data.type1.error) {
  2234. container->iommu_data.type1.error = ret;
  2235. }
  2236. } else {
  2237. hw_error("vfio: DMA mapping failed, unable to continue");
  2238. }
  2239. }
  2240. }
  2241. static void vfio_listener_region_del(MemoryListener *listener,
  2242. MemoryRegionSection *section)
  2243. {
  2244. VFIOContainer *container = container_of(listener, VFIOContainer,
  2245. iommu_data.type1.listener);
  2246. hwaddr iova, end;
  2247. int ret;
  2248. if (vfio_listener_skipped_section(section)) {
  2249. trace_vfio_listener_region_del_skip(
  2250. section->offset_within_address_space,
  2251. section->offset_within_address_space +
  2252. int128_get64(int128_sub(section->size, int128_one())));
  2253. return;
  2254. }
  2255. if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
  2256. (section->offset_within_region & ~TARGET_PAGE_MASK))) {
  2257. error_report("%s received unaligned region", __func__);
  2258. return;
  2259. }
  2260. if (memory_region_is_iommu(section->mr)) {
  2261. VFIOGuestIOMMU *giommu;
  2262. QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
  2263. if (giommu->iommu == section->mr) {
  2264. memory_region_unregister_iommu_notifier(&giommu->n);
  2265. QLIST_REMOVE(giommu, giommu_next);
  2266. g_free(giommu);
  2267. break;
  2268. }
  2269. }
  2270. /*
  2271. * FIXME: We assume the one big unmap below is adequate to
  2272. * remove any individual page mappings in the IOMMU which
  2273. * might have been copied into VFIO. This works for a page table
  2274. * based IOMMU where a big unmap flattens a large range of IO-PTEs.
  2275. * That may not be true for all IOMMU types.
  2276. */
  2277. }
  2278. iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
  2279. end = (section->offset_within_address_space + int128_get64(section->size)) &
  2280. TARGET_PAGE_MASK;
  2281. if (iova >= end) {
  2282. return;
  2283. }
  2284. trace_vfio_listener_region_del(iova, end - 1);
  2285. ret = vfio_dma_unmap(container, iova, end - iova);
  2286. memory_region_unref(section->mr);
  2287. if (ret) {
  2288. error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
  2289. "0x%"HWADDR_PRIx") = %d (%m)",
  2290. container, iova, end - iova, ret);
  2291. }
  2292. }
  2293. static MemoryListener vfio_memory_listener = {
  2294. .region_add = vfio_listener_region_add,
  2295. .region_del = vfio_listener_region_del,
  2296. };
  2297. static void vfio_listener_release(VFIOContainer *container)
  2298. {
  2299. memory_listener_unregister(&container->iommu_data.type1.listener);
  2300. }
  2301. /*
  2302. * Interrupt setup
  2303. */
  2304. static void vfio_disable_interrupts(VFIODevice *vdev)
  2305. {
  2306. switch (vdev->interrupt) {
  2307. case VFIO_INT_INTx:
  2308. vfio_disable_intx(vdev);
  2309. break;
  2310. case VFIO_INT_MSI:
  2311. vfio_disable_msi(vdev);
  2312. break;
  2313. case VFIO_INT_MSIX:
  2314. vfio_disable_msix(vdev);
  2315. break;
  2316. }
  2317. }
  2318. static int vfio_setup_msi(VFIODevice *vdev, int pos)
  2319. {
  2320. uint16_t ctrl;
  2321. bool msi_64bit, msi_maskbit;
  2322. int ret, entries;
  2323. if (pread(vdev->fd, &ctrl, sizeof(ctrl),
  2324. vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
  2325. return -errno;
  2326. }
  2327. ctrl = le16_to_cpu(ctrl);
  2328. msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
  2329. msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
  2330. entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
  2331. trace_vfio_setup_msi(vdev->host.domain, vdev->host.bus,
  2332. vdev->host.slot, vdev->host.function, pos);
  2333. ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
  2334. if (ret < 0) {
  2335. if (ret == -ENOTSUP) {
  2336. return 0;
  2337. }
  2338. error_report("vfio: msi_init failed");
  2339. return ret;
  2340. }
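/*
 * Length of the MSI capability: 10 bytes for the base 32-bit layout
 * (ID, next, control, address, data), plus 4 for a 64-bit message address,
 * plus 10 (alignment padding and the mask/pending dwords) when per-vector
 * masking is supported.
 */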
  2341. vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
  2342. return 0;
  2343. }
  2344. /*
  2345. * We don't have any control over how pci_add_capability() inserts
2346. * capabilities into the chain. In order to set up MSI-X we need a
2347. * MemoryRegion for the BAR. In order to set up the BAR and not
  2348. * attempt to mmap the MSI-X table area, which VFIO won't allow, we
  2349. * need to first look for where the MSI-X table lives. So we
  2350. * unfortunately split MSI-X setup across two functions.
  2351. */
  2352. static int vfio_early_setup_msix(VFIODevice *vdev)
  2353. {
  2354. uint8_t pos;
  2355. uint16_t ctrl;
  2356. uint32_t table, pba;
  2357. pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
  2358. if (!pos) {
  2359. return 0;
  2360. }
  2361. if (pread(vdev->fd, &ctrl, sizeof(ctrl),
  2362. vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
  2363. return -errno;
  2364. }
  2365. if (pread(vdev->fd, &table, sizeof(table),
  2366. vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
  2367. return -errno;
  2368. }
  2369. if (pread(vdev->fd, &pba, sizeof(pba),
  2370. vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
  2371. return -errno;
  2372. }
  2373. ctrl = le16_to_cpu(ctrl);
  2374. table = le32_to_cpu(table);
  2375. pba = le32_to_cpu(pba);
  2376. vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
  2377. vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
  2378. vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
  2379. vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
  2380. vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
  2381. vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
  2382. trace_vfio_early_setup_msix(vdev->host.domain, vdev->host.bus,
  2383. vdev->host.slot, vdev->host.function,
  2384. pos, vdev->msix->table_bar,
  2385. vdev->msix->table_offset,
  2386. vdev->msix->entries);
  2387. return 0;
  2388. }
  2389. static int vfio_setup_msix(VFIODevice *vdev, int pos)
  2390. {
  2391. int ret;
  2392. ret = msix_init(&vdev->pdev, vdev->msix->entries,
  2393. &vdev->bars[vdev->msix->table_bar].mem,
  2394. vdev->msix->table_bar, vdev->msix->table_offset,
  2395. &vdev->bars[vdev->msix->pba_bar].mem,
  2396. vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
  2397. if (ret < 0) {
  2398. if (ret == -ENOTSUP) {
  2399. return 0;
  2400. }
  2401. error_report("vfio: msix_init failed");
  2402. return ret;
  2403. }
  2404. return 0;
  2405. }
  2406. static void vfio_teardown_msi(VFIODevice *vdev)
  2407. {
  2408. msi_uninit(&vdev->pdev);
  2409. if (vdev->msix) {
  2410. msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
  2411. &vdev->bars[vdev->msix->pba_bar].mem);
  2412. }
  2413. }
  2414. /*
  2415. * Resource setup
  2416. */
  2417. static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled)
  2418. {
  2419. int i;
  2420. for (i = 0; i < PCI_ROM_SLOT; i++) {
  2421. VFIOBAR *bar = &vdev->bars[i];
  2422. if (!bar->size) {
  2423. continue;
  2424. }
  2425. memory_region_set_enabled(&bar->mmap_mem, enabled);
  2426. if (vdev->msix && vdev->msix->table_bar == i) {
  2427. memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
  2428. }
  2429. }
  2430. }
  2431. static void vfio_unmap_bar(VFIODevice *vdev, int nr)
  2432. {
  2433. VFIOBAR *bar = &vdev->bars[nr];
  2434. if (!bar->size) {
  2435. return;
  2436. }
  2437. vfio_bar_quirk_teardown(vdev, nr);
  2438. memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
  2439. munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
  2440. if (vdev->msix && vdev->msix->table_bar == nr) {
  2441. memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
  2442. munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
  2443. }
  2444. }
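/*
 * Try to mmap() (part of) a BAR for direct guest access and wrap it in a
 * RAM MemoryRegion.  If the region can't be mapped (VFIO doesn't allow it,
 * or mmap fails), a zero-sized subregion is created instead so teardown is
 * uniform and accesses fall back to the slow read/write path of the parent
 * region.
 */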
  2445. static int vfio_mmap_bar(VFIODevice *vdev, VFIOBAR *bar,
  2446. MemoryRegion *mem, MemoryRegion *submem,
  2447. void **map, size_t size, off_t offset,
  2448. const char *name)
  2449. {
  2450. int ret = 0;
  2451. if (VFIO_ALLOW_MMAP && size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
  2452. int prot = 0;
  2453. if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
  2454. prot |= PROT_READ;
  2455. }
  2456. if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
  2457. prot |= PROT_WRITE;
  2458. }
  2459. *map = mmap(NULL, size, prot, MAP_SHARED,
  2460. bar->fd, bar->fd_offset + offset);
  2461. if (*map == MAP_FAILED) {
  2462. *map = NULL;
  2463. ret = -errno;
  2464. goto empty_region;
  2465. }
  2466. memory_region_init_ram_ptr(submem, OBJECT(vdev), name, size, *map);
  2467. memory_region_set_skip_dump(submem);
  2468. } else {
  2469. empty_region:
  2470. /* Create a zero sized sub-region to make cleanup easy. */
  2471. memory_region_init(submem, OBJECT(vdev), name, 0);
  2472. }
  2473. memory_region_add_subregion(mem, offset, submem);
  2474. return ret;
  2475. }
static void vfio_map_bar(VFIODevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    unsigned size = bar->size;
    char name[64];
    uint32_t pci_bar;
    uint8_t type;
    int ret;

    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
    if (!size) {
        return;
    }

    snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function, nr);

    /* Determine what type of BAR this is for registration */
    ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
    if (ret != sizeof(pci_bar)) {
        error_report("vfio: Failed to read BAR %d (%m)", nr);
        return;
    }

    pci_bar = le32_to_cpu(pci_bar);
    bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
    bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
    type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
                                    ~PCI_BASE_ADDRESS_MEM_MASK);

    /* A "slow" read/write mapping underlies all BARs */
    memory_region_init_io(&bar->mem, OBJECT(vdev), &vfio_bar_ops,
                          bar, name, size);
    pci_register_bar(&vdev->pdev, nr, type, &bar->mem);

    /*
     * We can't mmap areas overlapping the MSIX vector table, so we
     * potentially insert a direct-mapped subregion before and after it.
     */
    if (vdev->msix && vdev->msix->table_bar == nr) {
        size = vdev->msix->table_offset & qemu_host_page_mask;
    }

    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
    if (vfio_mmap_bar(vdev, bar, &bar->mem,
                      &bar->mmap_mem, &bar->mmap, size, 0, name)) {
        error_report("%s unsupported. Performance may be slow", name);
    }

    if (vdev->msix && vdev->msix->table_bar == nr) {
        unsigned start;

        start = HOST_PAGE_ALIGN(vdev->msix->table_offset +
                                (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));

        size = start < bar->size ? bar->size - start : 0;
        strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
        /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
        if (vfio_mmap_bar(vdev, bar, &bar->mem, &vdev->msix->mmap_mem,
                          &vdev->msix->mmap, size, start, name)) {
            error_report("%s unsupported. Performance may be slow", name);
        }
    }

    vfio_bar_quirk_setup(vdev, nr);
}
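
/*
 * Map every standard BAR and, when the device exposes a VGA region,
 * register the legacy VGA memory and I/O port ranges as well.
 */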
static void vfio_map_bars(VFIODevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_map_bar(vdev, i);
    }

    if (vdev->has_vga) {
        memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
                              OBJECT(vdev), &vfio_vga_ops,
                              &vdev->vga.region[QEMU_PCI_VGA_MEM],
                              "vfio-vga-mmio@0xa0000",
                              QEMU_PCI_VGA_MEM_SIZE);
        memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
                              OBJECT(vdev), &vfio_vga_ops,
                              &vdev->vga.region[QEMU_PCI_VGA_IO_LO],
                              "vfio-vga-io@0x3b0",
                              QEMU_PCI_VGA_IO_LO_SIZE);
        memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
                              OBJECT(vdev), &vfio_vga_ops,
                              &vdev->vga.region[QEMU_PCI_VGA_IO_HI],
                              "vfio-vga-io@0x3c0",
                              QEMU_PCI_VGA_IO_HI_SIZE);

        pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
                         &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
                         &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);

        vfio_vga_quirk_setup(vdev);
    }
}
static void vfio_unmap_bars(VFIODevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_unmap_bar(vdev, i);
    }

    if (vdev->has_vga) {
        vfio_vga_quirk_teardown(vdev);
        pci_unregister_vga(&vdev->pdev);
    }
}

/*
 * General setup
 */
static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
{
    uint8_t tmp, next = 0xff;

    for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
         tmp = pdev->config[tmp + 1]) {
        if (tmp > pos && tmp < next) {
            next = tmp;
        }
    }

    return next - pos;
}
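
/*
 * Helpers for emulating config space fields: the value is placed in QEMU's
 * copy of config space, the write mask is updated, and the same bits are
 * flagged in emulated_config_bits so reads are serviced from QEMU rather
 * than the physical device.
 */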
static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
}

static void vfio_add_emulated_word(VFIODevice *vdev, int pos,
                                   uint16_t val, uint16_t mask)
{
    vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
    vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
    vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
}

static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    pci_set_long(buf, (pci_get_long(buf) & ~mask) | val);
}

static void vfio_add_emulated_long(VFIODevice *vdev, int pos,
                                   uint32_t val, uint32_t mask)
{
    vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
    vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
    vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
}
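
/*
 * Adjust the PCI Express capability for the bus the device lands on:
 * endpoints on a root complex become integrated endpoints (link fields
 * removed), integrated endpoints elsewhere become regular endpoints
 * (link fields synthesized), and unsupported device types are rejected.
 */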
static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
{
    uint16_t flags;
    uint8_t type;

    flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
    type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;

    if (type != PCI_EXP_TYPE_ENDPOINT &&
        type != PCI_EXP_TYPE_LEG_END &&
        type != PCI_EXP_TYPE_RC_END) {
        error_report("vfio: Assignment of PCIe type 0x%x "
                     "devices is not currently supported", type);
        return -EINVAL;
    }

    if (!pci_bus_is_express(vdev->pdev.bus)) {
        /*
         * Use express capability as-is on PCI bus.  It doesn't make much
         * sense to even expose, but some drivers (ex. tg3) depend on it
         * and guests don't seem to be particular about it.  We'll need
         * to revisit this or force express devices to express buses if we
         * ever expose an IOMMU to the guest.
         */
    } else if (pci_bus_is_root(vdev->pdev.bus)) {
        /*
         * On a Root Complex bus Endpoints become Root Complex Integrated
         * Endpoints, which changes the type and clears the LNK & LNK2 fields.
         */
        if (type == PCI_EXP_TYPE_ENDPOINT) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_RC_END << 4,
                                   PCI_EXP_FLAGS_TYPE);

            /* Link Capabilities, Status, and Control go away */
            if (size > PCI_EXP_LNKCTL) {
                vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);

#ifndef PCI_EXP_LNKCAP2
#define PCI_EXP_LNKCAP2 44
#endif
#ifndef PCI_EXP_LNKSTA2
#define PCI_EXP_LNKSTA2 50
#endif
                /* Link 2 Capabilities, Status, and Control go away */
                if (size > PCI_EXP_LNKCAP2) {
                    vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
                }
            }
        } else if (type == PCI_EXP_TYPE_LEG_END) {
            /*
             * Legacy endpoints don't belong on the root complex.  Windows
             * seems to be happier with devices if we skip the capability.
             */
            return 0;
        }
    } else {
        /*
         * Convert Root Complex Integrated Endpoints to regular endpoints.
         * These devices don't support LNK/LNK2 capabilities, so make them up.
         */
        if (type == PCI_EXP_TYPE_RC_END) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_ENDPOINT << 4,
                                   PCI_EXP_FLAGS_TYPE);
            vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
                                   PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0);
            vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
        }

        /* Mark the Link Status bits as emulated to allow virtual negotiation */
        vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA,
                               pci_get_word(vdev->pdev.config + pos +
                                            PCI_EXP_LNKSTA),
                               PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
    }

    pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size);
    if (pos >= 0) {
        vdev->pdev.exp.exp_cap = pos;
    }

    return pos;
}
static void vfio_check_pcie_flr(VFIODevice *vdev, uint8_t pos)
{
    uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);

    if (cap & PCI_EXP_DEVCAP_FLR) {
        trace_vfio_check_pcie_flr(vdev->host.domain, vdev->host.bus,
                                  vdev->host.slot, vdev->host.function);
        vdev->has_flr = true;
    }
}

static void vfio_check_pm_reset(VFIODevice *vdev, uint8_t pos)
{
    uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);

    if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
        trace_vfio_check_pm_reset(vdev->host.domain, vdev->host.bus,
                                  vdev->host.slot, vdev->host.function);
        vdev->has_pm_reset = true;
    }
}

static void vfio_check_af_flr(VFIODevice *vdev, uint8_t pos)
{
    uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);

    if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
        trace_vfio_check_af_flr(vdev->host.domain, vdev->host.bus,
                                vdev->host.slot, vdev->host.function);
        vdev->has_flr = true;
    }
}
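
/*
 * Add a single standard capability to the emulated config space, recursing
 * so that the chain is rebuilt from the tail and ends up in the same order
 * as on the physical device.
 */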
static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
{
    PCIDevice *pdev = &vdev->pdev;
    uint8_t cap_id, next, size;
    int ret;

    cap_id = pdev->config[pos];
    next = pdev->config[pos + 1];

    /*
     * If it becomes important to configure capabilities to their actual
     * size, use this as the default when it's something we don't recognize.
     * Since QEMU doesn't actually handle many of the config accesses,
     * exact size doesn't seem worthwhile.
     */
    size = vfio_std_cap_max_size(pdev, pos);

    /*
     * pci_add_capability always inserts the new capability at the head
     * of the chain.  Therefore to end up with a chain that matches the
     * physical device, we insert from the end by making this recursive.
     * This is also why we pre-calculate size above, as cached config space
     * will be changed as we unwind the stack.
     */
    if (next) {
        ret = vfio_add_std_cap(vdev, next);
        if (ret) {
            return ret;
        }
    } else {
        /* Begin the rebuild, use QEMU emulated list bits */
        pdev->config[PCI_CAPABILITY_LIST] = 0;
        vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
        vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
    }

    /* Use emulated next pointer to allow dropping caps */
    pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff);

    switch (cap_id) {
    case PCI_CAP_ID_MSI:
        ret = vfio_setup_msi(vdev, pos);
        break;
    case PCI_CAP_ID_EXP:
        vfio_check_pcie_flr(vdev, pos);
        ret = vfio_setup_pcie_cap(vdev, pos, size);
        break;
    case PCI_CAP_ID_MSIX:
        ret = vfio_setup_msix(vdev, pos);
        break;
    case PCI_CAP_ID_PM:
        vfio_check_pm_reset(vdev, pos);
        vdev->pm_cap = pos;
        ret = pci_add_capability(pdev, cap_id, pos, size);
        break;
    case PCI_CAP_ID_AF:
        vfio_check_af_flr(vdev, pos);
        ret = pci_add_capability(pdev, cap_id, pos, size);
        break;
    default:
        ret = pci_add_capability(pdev, cap_id, pos, size);
        break;
    }

    if (ret < 0) {
        error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
                     "0x%x[0x%x]@0x%x: %d", vdev->host.domain,
                     vdev->host.bus, vdev->host.slot, vdev->host.function,
                     cap_id, size, pos, ret);
        return ret;
    }

    return 0;
}
static int vfio_add_capabilities(VFIODevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;

    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
        !pdev->config[PCI_CAPABILITY_LIST]) {
        return 0; /* Nothing to add */
    }

    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
}
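
/*
 * Quiesce the device before a reset: disable interrupts, wake it to D0 if
 * it was in a low-power state, and stop any DMA by clearing the I/O,
 * memory, and bus master enables in the command register.
 */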
static void vfio_pci_pre_reset(VFIODevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    uint16_t cmd;

    vfio_disable_interrupts(vdev);

    /* Make sure the device is in D0 */
    if (vdev->pm_cap) {
        uint16_t pmcsr;
        uint8_t state;

        pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
        state = pmcsr & PCI_PM_CTRL_STATE_MASK;
        if (state) {
            pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
            vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
            /* vfio handles the necessary delay here */
            pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
            state = pmcsr & PCI_PM_CTRL_STATE_MASK;
            if (state) {
                error_report("vfio: Unable to power on device, stuck in D%d",
                             state);
            }
        }
    }

    /*
     * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
     * Also put INTx Disable in known state.
     */
    cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
    cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
             PCI_COMMAND_INTX_DISABLE);
    vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
}

static void vfio_pci_post_reset(VFIODevice *vdev)
{
    vfio_enable_intx(vdev);
}

static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
                                PCIHostDeviceAddress *host2)
{
    return (host1->domain == host2->domain && host1->bus == host2->bus &&
            host1->slot == host2->slot && host1->function == host2->function);
}
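
/*
 * Perform a VFIO hot (bus) reset affecting this device.  Every dependent
 * device must belong to a group we own or the reset is refused.  With
 * 'single' set, the reset is only attempted when this is the only in-use
 * device affected.
 */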
static int vfio_pci_hot_reset(VFIODevice *vdev, bool single)
{
    VFIOGroup *group;
    struct vfio_pci_hot_reset_info *info;
    struct vfio_pci_dependent_device *devices;
    struct vfio_pci_hot_reset *reset;
    int32_t *fds;
    int ret, i, count;
    bool multi = false;

    trace_vfio_pci_hot_reset(vdev->host.domain, vdev->host.bus,
                             vdev->host.slot, vdev->host.function,
                             single ? "one" : "multi");

    vfio_pci_pre_reset(vdev);
    vdev->needs_reset = false;

    info = g_malloc0(sizeof(*info));
    info->argsz = sizeof(*info);

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
    if (ret && errno != ENOSPC) {
        ret = -errno;
        if (!vdev->has_pm_reset) {
            error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
                         "no available reset mechanism.", vdev->host.domain,
                         vdev->host.bus, vdev->host.slot, vdev->host.function);
        }
        goto out_single;
    }

    count = info->count;
    info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
    info->argsz = sizeof(*info) + (count * sizeof(*devices));
    devices = &info->devices[0];

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
    if (ret) {
        ret = -errno;
        error_report("vfio: hot reset info failed: %m");
        goto out_single;
    }

    trace_vfio_pci_hot_reset_has_dep_devices(vdev->host.domain,
                                             vdev->host.bus,
                                             vdev->host.slot,
                                             vdev->host.function);

    /* Verify that we have all the groups required */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIODevice *tmp;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        trace_vfio_pci_hot_reset_dep_devices(host.domain,
                host.bus, host.slot, host.function, devices[i].group_id);

        if (vfio_pci_host_match(&host, &vdev->host)) {
            continue;
        }

        QLIST_FOREACH(group, &group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            if (!vdev->has_pm_reset) {
                error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
                             "depends on group %d which is not owned.",
                             vdev->host.domain, vdev->host.bus, vdev->host.slot,
                             vdev->host.function, devices[i].group_id);
            }
            ret = -EPERM;
            goto out;
        }

        /* Prep dependent devices for reset and clear our marker. */
        QLIST_FOREACH(tmp, &group->device_list, next) {
            if (vfio_pci_host_match(&host, &tmp->host)) {
                if (single) {
                    error_report("vfio: found another in-use device "
                                 "%04x:%02x:%02x.%x", host.domain, host.bus,
                                 host.slot, host.function);
                    ret = -EINVAL;
                    goto out_single;
                }
                vfio_pci_pre_reset(tmp);
                tmp->needs_reset = false;
                multi = true;
                break;
            }
        }
    }

    if (!single && !multi) {
        error_report("vfio: No other in-use devices for multi hot reset");
        ret = -EINVAL;
        goto out_single;
    }

    /* Determine how many group fds need to be passed */
    count = 0;
    QLIST_FOREACH(group, &group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                count++;
                break;
            }
        }
    }

    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
    fds = &reset->group_fds[0];

    /* Fill in group fds */
    QLIST_FOREACH(group, &group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                fds[reset->count++] = group->fd;
                break;
            }
        }
    }

    /* Bus reset! */
    ret = ioctl(vdev->fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
    g_free(reset);

    trace_vfio_pci_hot_reset_result(vdev->host.domain,
                                    vdev->host.bus,
                                    vdev->host.slot,
                                    vdev->host.function,
                                    ret ? "%m" : "Success");

out:
    /* Re-enable INTx on affected devices */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIODevice *tmp;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        if (vfio_pci_host_match(&host, &vdev->host)) {
            continue;
        }

        QLIST_FOREACH(group, &group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            break;
        }

        QLIST_FOREACH(tmp, &group->device_list, next) {
            if (vfio_pci_host_match(&host, &tmp->host)) {
                vfio_pci_post_reset(tmp);
                break;
            }
        }
    }
out_single:
    vfio_pci_post_reset(vdev);
    g_free(info);

    return ret;
}
/*
 * We want to differentiate hot reset of multiple in-use devices vs hot reset
 * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
 * of doing hot resets when there is only a single device per bus.  The in-use
 * here refers to how many VFIODevices are affected.  A hot reset that affects
 * multiple devices, but only a single in-use device, means that we can call
 * it from our bus ->reset() callback since the extent is effectively a single
 * device.  This allows us to make use of it in the hotplug path.  When there
 * are multiple in-use devices, we can only trigger the hot reset during a
 * system reset and thus from our reset handler.  We separate _one vs _multi
 * here so that we don't overlap and do a double reset on the system reset
 * path where both our reset handler and ->reset() callback are used.  Calling
 * _one() will only do a hot reset for the one in-use device case, calling
 * _multi() will do nothing if a _one() would have been sufficient.
 */
static int vfio_pci_hot_reset_one(VFIODevice *vdev)
{
    return vfio_pci_hot_reset(vdev, true);
}

static int vfio_pci_hot_reset_multi(VFIODevice *vdev)
{
    return vfio_pci_hot_reset(vdev, false);
}

static void vfio_pci_reset_handler(void *opaque)
{
    VFIOGroup *group;
    VFIODevice *vdev;

    QLIST_FOREACH(group, &group_list, next) {
        QLIST_FOREACH(vdev, &group->device_list, next) {
            if (!vdev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
                vdev->needs_reset = true;
            }
        }
    }

    QLIST_FOREACH(group, &group_list, next) {
        QLIST_FOREACH(vdev, &group->device_list, next) {
            if (vdev->needs_reset) {
                vfio_pci_hot_reset_multi(vdev);
            }
        }
    }
}
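
/*
 * Register/unregister a VFIO group with the KVM VFIO pseudo-device,
 * creating that device on first use.  Both functions are no-ops when KVM
 * is not enabled.
 */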
static void vfio_kvm_device_add_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_ADD,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (!kvm_enabled()) {
        return;
    }

    if (vfio_kvm_device_fd < 0) {
        struct kvm_create_device cd = {
            .type = KVM_DEV_TYPE_VFIO,
        };

        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
            error_report("KVM_CREATE_DEVICE: %m");
            return;
        }

        vfio_kvm_device_fd = cd.fd;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to add group %d to KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}

static void vfio_kvm_device_del_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_DEL,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (vfio_kvm_device_fd < 0) {
        return;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to remove group %d from KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}
static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
{
    VFIOAddressSpace *space;

    QLIST_FOREACH(space, &vfio_address_spaces, list) {
        if (space->as == as) {
            return space;
        }
    }

    /* No suitable VFIOAddressSpace, create a new one */
    space = g_malloc0(sizeof(*space));
    space->as = as;
    QLIST_INIT(&space->containers);

    QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);

    return space;
}

static void vfio_put_address_space(VFIOAddressSpace *space)
{
    if (QLIST_EMPTY(&space->containers)) {
        QLIST_REMOVE(space, list);
        g_free(space);
    }
}
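
/*
 * Attach a group to a container in the given address space.  Reuse an
 * existing container if the kernel accepts the group into it, otherwise
 * open a new /dev/vfio/vfio instance, select an IOMMU model (Type1 or
 * sPAPR TCE), and register the memory listener that maps guest memory.
 */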
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
{
    VFIOContainer *container;
    int ret, fd;
    VFIOAddressSpace *space;

    space = vfio_get_address_space(as);

    QLIST_FOREACH(container, &space->containers, next) {
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            return 0;
        }
    }

    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
    if (fd < 0) {
        error_report("vfio: failed to open /dev/vfio/vfio: %m");
        ret = -errno;
        goto put_space_exit;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_report("vfio: supported vfio version: %d, "
                     "reported version: %d", VFIO_API_VERSION, ret);
        ret = -EINVAL;
        goto close_fd_exit;
    }

    container = g_malloc0(sizeof(*container));
    container->space = space;
    container->fd = fd;

    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            error_report("vfio: failed to set group container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
        if (ret) {
            error_report("vfio: failed to set iommu for container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        container->iommu_data.type1.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;

        memory_listener_register(&container->iommu_data.type1.listener,
                                 container->space->as);

        if (container->iommu_data.type1.error) {
            ret = container->iommu_data.type1.error;
            error_report("vfio: memory listener initialization failed for container");
            goto listener_release_exit;
        }

        container->iommu_data.type1.initialized = true;

    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            error_report("vfio: failed to set group container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
        if (ret) {
            error_report("vfio: failed to set iommu for container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        /*
         * The host kernel code implementing VFIO_IOMMU_DISABLE is called
         * when container fd is closed so we do not call it explicitly
         * in this file.
         */
        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
        if (ret) {
            error_report("vfio: failed to enable container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        container->iommu_data.type1.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;

        memory_listener_register(&container->iommu_data.type1.listener,
                                 container->space->as);

    } else {
        error_report("vfio: No available IOMMU models");
        ret = -EINVAL;
        goto free_container_exit;
    }

    QLIST_INIT(&container->group_list);
    QLIST_INSERT_HEAD(&space->containers, container, next);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    return 0;

listener_release_exit:
    vfio_listener_release(container);

free_container_exit:
    g_free(container);

close_fd_exit:
    close(fd);

put_space_exit:
    vfio_put_address_space(space);

    return ret;
}
static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    if (QLIST_EMPTY(&container->group_list)) {
        VFIOAddressSpace *space = container->space;

        if (container->iommu_data.release) {
            container->iommu_data.release(container);
        }
        QLIST_REMOVE(container, next);
        trace_vfio_disconnect_container(container->fd);
        close(container->fd);
        g_free(container);

        vfio_put_address_space(space);
    }
}
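
/*
 * Look up or open the VFIO group for the given iommu group id and attach
 * it to a container in the requested address space.  The first group also
 * registers the machine reset handler used for bus resets.
 */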
static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &group_list, next) {
        if (group->groupid == groupid) {
            /* Found it.  Now is it already in the right context? */
            if (group->container->space->as == as) {
                return group;
            } else {
                error_report("vfio: group %d used in multiple address spaces",
                             group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open(path, O_RDWR);
    if (group->fd < 0) {
        error_report("vfio: error opening %s: %m", path);
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_report("vfio: error getting group status: %m");
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_report("vfio: error, group %d is not viable, please ensure "
                     "all devices within the iommu_group are bound to their "
                     "vfio bus driver.", groupid);
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group, as)) {
        error_report("vfio: failed to setup container for group %d", groupid);
        goto close_fd_exit;
    }

    if (QLIST_EMPTY(&group_list)) {
        qemu_register_reset(vfio_pci_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&group_list, group, next);

    vfio_kvm_device_add_group(group);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}
static void vfio_put_group(VFIOGroup *group)
{
    if (!QLIST_EMPTY(&group->device_list)) {
        return;
    }

    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);

    if (QLIST_EMPTY(&group_list)) {
        qemu_unregister_reset(vfio_pci_reset_handler, NULL);
    }
}
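
/*
 * Obtain the device fd from its group and sanity check it: confirm it is
 * a PCI device, record its reset capability, cache the BAR and config
 * space region info, probe the optional VGA region, and check for error
 * (AER) interrupt support.
 */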
static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
{
    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
    int ret, i;

    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (ret < 0) {
        error_report("vfio: error getting device %s from group %d: %m",
                     name, group->groupid);
        error_printf("Verify all devices in group %d are bound to vfio-pci "
                     "or pci-stub and not already in use\n", group->groupid);
        return ret;
    }

    vdev->fd = ret;
    vdev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vdev, next);

    /* Sanity check device */
    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
    if (ret) {
        error_report("vfio: error getting device info: %m");
        goto error;
    }

    trace_vfio_get_device_irq(name, dev_info.flags,
                              dev_info.num_regions, dev_info.num_irqs);

    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
        error_report("vfio: Um, this isn't a PCI device");
        goto error;
    }

    vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);

    if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
        error_report("vfio: unexpected number of io regions %u",
                     dev_info.num_regions);
        goto error;
    }

    if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
        error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
        goto error;
    }

    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
        reg_info.index = i;

        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
        if (ret) {
            error_report("vfio: Error getting region %d info: %m", i);
            goto error;
        }

        trace_vfio_get_device_region(name, i,
                                     (unsigned long)reg_info.size,
                                     (unsigned long)reg_info.offset,
                                     (unsigned long)reg_info.flags);

        vdev->bars[i].flags = reg_info.flags;
        vdev->bars[i].size = reg_info.size;
        vdev->bars[i].fd_offset = reg_info.offset;
        vdev->bars[i].fd = vdev->fd;
        vdev->bars[i].nr = i;
        QLIST_INIT(&vdev->bars[i].quirks);
    }

    reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
    if (ret) {
        error_report("vfio: Error getting config info: %m");
        goto error;
    }

    trace_vfio_get_device_config(name, (unsigned long)reg_info.size,
                                 (unsigned long)reg_info.offset,
                                 (unsigned long)reg_info.flags);

    vdev->config_size = reg_info.size;
    if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
        vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
    }
    vdev->config_offset = reg_info.offset;

    if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) &&
        dev_info.num_regions > VFIO_PCI_VGA_REGION_INDEX) {
        struct vfio_region_info vga_info = {
            .argsz = sizeof(vga_info),
            .index = VFIO_PCI_VGA_REGION_INDEX,
        };

        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info);
        if (ret) {
            error_report(
                "vfio: Device does not support requested feature x-vga");
            goto error;
        }

        if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) ||
            !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) ||
            vga_info.size < 0xbffff + 1) {
            error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
                         (unsigned long)vga_info.flags,
                         (unsigned long)vga_info.size);
            goto error;
        }

        vdev->vga.fd_offset = vga_info.offset;
        vdev->vga.fd = vdev->fd;

        vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
        vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
        QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks);

        vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
        vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
        QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks);

        vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
        vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
        QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks);

        vdev->has_vga = true;
    }
    irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
    if (ret) {
        /* This can fail for an old kernel or legacy PCI dev */
        trace_vfio_get_device_get_irq_info_failure();
        ret = 0;
    } else if (irq_info.count == 1) {
        vdev->pci_aer = true;
    } else {
        error_report("vfio: %04x:%02x:%02x.%x "
                     "Could not enable error recovery for the device",
                     vdev->host.domain, vdev->host.bus, vdev->host.slot,
                     vdev->host.function);
    }

error:
    if (ret) {
        QLIST_REMOVE(vdev, next);
        vdev->group = NULL;
        close(vdev->fd);
    }
    return ret;
}
static void vfio_put_device(VFIODevice *vdev)
{
    QLIST_REMOVE(vdev, next);
    vdev->group = NULL;
    trace_vfio_put_device(vdev->fd);
    close(vdev->fd);
    if (vdev->msix) {
        g_free(vdev->msix);
        vdev->msix = NULL;
    }
}
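
/*
 * Triggered via the error IRQ eventfd when the host signals an
 * unrecoverable device error.
 */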
static void vfio_err_notifier_handler(void *opaque)
{
    VFIODevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
        return;
    }

    /*
     * TBD. Retrieve the error details and decide what action
     * needs to be taken. One of the actions could be to pass
     * the error to the guest and have the guest driver recover
     * from the error. This requires that PCIe capabilities be
     * exposed to the guest. For now, we just terminate the
     * guest to contain the error.
     */

    error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. "
                 "Please collect any data possible and then kill the guest",
                 __func__, vdev->host.domain, vdev->host.bus,
                 vdev->host.slot, vdev->host.function);

    vm_stop(RUN_STATE_INTERNAL_ERROR);
}
/*
 * Registers error notifier for devices supporting error recovery.
 * If we encounter a failure in this function, we report an error
 * and continue after disabling error recovery support for the
 * device.
 */
static void vfio_register_err_notifier(VFIODevice *vdev)
{
    int ret;
    int argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    if (!vdev->pci_aer) {
        return;
    }

    if (event_notifier_init(&vdev->err_notifier, 0)) {
        error_report("vfio: Unable to init event notifier for error detection");
        vdev->pci_aer = false;
        return;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = event_notifier_get_fd(&vdev->err_notifier);
    qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev);

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    if (ret) {
        error_report("vfio: Failed to set up error notification");
        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->err_notifier);
        vdev->pci_aer = false;
    }
    g_free(irq_set);
}
static void vfio_unregister_err_notifier(VFIODevice *vdev)
{
    int argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;
    int ret;

    if (!vdev->pci_aer) {
        return;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;
    *pfd = -1;

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    if (ret) {
        error_report("vfio: Failed to de-assign error fd: %m");
    }
    g_free(irq_set);
    qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
                        NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->err_notifier);
}
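
/*
 * Realize callback: locate the host device's iommu group, acquire the
 * device through VFIO, snapshot its config space, and set up the emulated
 * config bits, ROM, MSI-X, BARs, capabilities, INTx routing, and the
 * error notifier.
 */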
static int vfio_initfn(PCIDevice *pdev)
{
    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOGroup *group;
    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
    ssize_t len;
    struct stat st;
    int groupid;
    int ret;

    /* Check that the host device exists */
    snprintf(path, sizeof(path),
             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);
    if (stat(path, &st) < 0) {
        error_report("vfio: error: no such host device: %s", path);
        return -errno;
    }

    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);

    len = readlink(path, iommu_group_path, sizeof(path));
    if (len <= 0 || len >= sizeof(path)) {
        error_report("vfio: error no iommu_group for device");
        return len < 0 ? -errno : -ENAMETOOLONG;
    }

    iommu_group_path[len] = 0;
    group_name = basename(iommu_group_path);

    if (sscanf(group_name, "%d", &groupid) != 1) {
        error_report("vfio: error reading %s: %m", path);
        return -errno;
    }

    trace_vfio_initfn(vdev->host.domain, vdev->host.bus,
                      vdev->host.slot, vdev->host.function, groupid);

    group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev));
    if (!group) {
        error_report("vfio: failed to get group %d", groupid);
        return -ENOENT;
    }

    snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);

    QLIST_FOREACH(pvdev, &group->device_list, next) {
        if (pvdev->host.domain == vdev->host.domain &&
            pvdev->host.bus == vdev->host.bus &&
            pvdev->host.slot == vdev->host.slot &&
            pvdev->host.function == vdev->host.function) {

            error_report("vfio: error: device %s is already attached", path);
            vfio_put_group(group);
            return -EBUSY;
        }
    }

    ret = vfio_get_device(group, path, vdev);
    if (ret) {
        error_report("vfio: failed to get device %s", path);
        vfio_put_group(group);
        return ret;
    }

    /* Get a copy of config space */
    ret = pread(vdev->fd, vdev->pdev.config,
                MIN(pci_config_size(&vdev->pdev), vdev->config_size),
                vdev->config_offset);
    if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
        ret = ret < 0 ? -errno : -EFAULT;
        error_report("vfio: Failed to read device config space");
        goto out_put;
    }

    /* vfio emulates a lot for us, but some bits need extra love */
    vdev->emulated_config_bits = g_malloc0(vdev->config_size);

    /* QEMU can choose to expose the ROM or not */
    memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);

    /* QEMU can change multi-function devices to single function, or reverse */
    vdev->emulated_config_bits[PCI_HEADER_TYPE] =
                                              PCI_HEADER_TYPE_MULTI_FUNCTION;

    /* Restore or clear multifunction, this is always controlled by QEMU */
    if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
        vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
    } else {
        vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
    }

    /*
     * Clear host resource mapping info.  If we choose not to register a
     * BAR, such as might be the case with the option ROM, we can get
     * confusing, unwritable, residual addresses from the host here.
     */
    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);

    vfio_pci_size_rom(vdev);

    ret = vfio_early_setup_msix(vdev);
    if (ret) {
        goto out_put;
    }

    vfio_map_bars(vdev);

    ret = vfio_add_capabilities(vdev);
    if (ret) {
        goto out_teardown;
    }

    /* QEMU emulates all of MSI & MSIX */
    if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
        memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
               MSIX_CAP_LENGTH);
    }

    if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
        memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
               vdev->msi_cap_size);
    }

    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
        vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
                                             vfio_intx_mmap_enable, vdev);
        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
        ret = vfio_enable_intx(vdev);
        if (ret) {
            goto out_teardown;
        }
    }

    vfio_register_err_notifier(vdev);

    return 0;

out_teardown:
    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    vfio_teardown_msi(vdev);
    vfio_unmap_bars(vdev);
out_put:
    g_free(vdev->emulated_config_bits);
    vfio_put_device(vdev);
    vfio_put_group(group);
    return ret;
}
static void vfio_exitfn(PCIDevice *pdev)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOGroup *group = vdev->group;

    vfio_unregister_err_notifier(vdev);
    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    vfio_disable_interrupts(vdev);
    if (vdev->intx.mmap_timer) {
        timer_free(vdev->intx.mmap_timer);
    }
    vfio_teardown_msi(vdev);
    vfio_unmap_bars(vdev);
    g_free(vdev->emulated_config_bits);
    g_free(vdev->rom);
    vfio_put_device(vdev);
    vfio_put_group(group);
}
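
/*
 * Device reset callback.  Preferred order: device/FLR reset via
 * VFIO_DEVICE_RESET when FLR is available, then a single-device hot
 * (bus) reset, and finally a PM reset as a last resort.
 */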
static void vfio_pci_reset(DeviceState *dev)
{
    PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);

    trace_vfio_pci_reset(vdev->host.domain, vdev->host.bus,
                         vdev->host.slot, vdev->host.function);

    vfio_pci_pre_reset(vdev);

    if (vdev->reset_works && (vdev->has_flr || !vdev->has_pm_reset) &&
        !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
        trace_vfio_pci_reset_flr(vdev->host.domain, vdev->host.bus,
                                 vdev->host.slot, vdev->host.function);
        goto post_reset;
    }

    /* See if we can do our own bus reset */
    if (!vfio_pci_hot_reset_one(vdev)) {
        goto post_reset;
    }

    /* If nothing else works and the device supports PM reset, use it */
    if (vdev->reset_works && vdev->has_pm_reset &&
        !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
        trace_vfio_pci_reset_pm(vdev->host.domain, vdev->host.bus,
                                vdev->host.slot, vdev->host.function);
        goto post_reset;
    }

post_reset:
    vfio_pci_post_reset(vdev);
}
static void vfio_instance_init(Object *obj)
{
    PCIDevice *pci_dev = PCI_DEVICE(obj);
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, PCI_DEVICE(obj));

    device_add_bootindex_property(obj, &vdev->bootindex,
                                  "bootindex", NULL,
                                  &pci_dev->qdev, NULL);
}
static Property vfio_pci_dev_properties[] = {
    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
    DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIODevice,
                       intx.mmap_timeout, 1100),
    DEFINE_PROP_BIT("x-vga", VFIODevice, features,
                    VFIO_FEATURE_ENABLE_VGA_BIT, false),
    /*
     * TODO - support passed fds... is this necessary?
     * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
     * DEFINE_PROP_STRING("vfiogroupfd", VFIODevice, vfiogroupfd_name),
     */
    DEFINE_PROP_END_OF_LIST(),
};

static const VMStateDescription vfio_pci_vmstate = {
    .name = "vfio-pci",
    .unmigratable = 1,
};
static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);

    dc->reset = vfio_pci_reset;
    dc->props = vfio_pci_dev_properties;
    dc->vmsd = &vfio_pci_vmstate;
    dc->desc = "VFIO-based PCI device assignment";
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    pdc->init = vfio_initfn;
    pdc->exit = vfio_exitfn;
    pdc->config_read = vfio_pci_read_config;
    pdc->config_write = vfio_pci_write_config;
    pdc->is_express = 1; /* We might be */
}

static const TypeInfo vfio_pci_dev_info = {
    .name = "vfio-pci",
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(VFIODevice),
    .class_init = vfio_pci_dev_class_init,
    .instance_init = vfio_instance_init,
};

static void register_vfio_pci_dev_type(void)
{
    type_register_static(&vfio_pci_dev_info);
}

type_init(register_vfio_pci_dev_type)
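
/*
 * Forward a container ioctl for the container backing the given iommu
 * group in the given address space; only whitelisted requests are allowed
 * through vfio_container_ioctl() below.
 */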
static int vfio_container_do_ioctl(AddressSpace *as, int32_t groupid,
                                   int req, void *param)
{
    VFIOGroup *group;
    VFIOContainer *container;
    int ret = -1;

    group = vfio_get_group(groupid, as);
    if (!group) {
        error_report("vfio: group %d not registered", groupid);
        return ret;
    }

    container = group->container;
    if (group->container) {
        ret = ioctl(container->fd, req, param);
        if (ret < 0) {
            error_report("vfio: failed to ioctl container: ret=%d, %s",
                         ret, strerror(errno));
        }
    }

    vfio_put_group(group);

    return ret;
}

int vfio_container_ioctl(AddressSpace *as, int32_t groupid,
                         int req, void *param)
{
    /* We allow only certain ioctls to the container */
    switch (req) {
    case VFIO_CHECK_EXTENSION:
    case VFIO_IOMMU_SPAPR_TCE_GET_INFO:
        break;
    default:
        /* Return an error on unknown requests */
        error_report("vfio: unsupported ioctl %X", req);
        return -1;
    }

    return vfio_container_do_ioctl(as, groupid, req, param);
}