vfio.c

/*
 * vfio based device assignment support
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include <dirent.h>
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "config.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci.h"
#include "qemu-common.h"
#include "qemu/error-report.h"
#include "qemu/event_notifier.h"
#include "qemu/queue.h"
#include "qemu/range.h"
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"

/* #define DEBUG_VFIO */
#ifdef DEBUG_VFIO
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

/* Extra debugging, trap acceleration paths for more logging */
#define VFIO_ALLOW_MMAP 1
#define VFIO_ALLOW_KVM_INTX 1
#define VFIO_ALLOW_KVM_MSI 1
#define VFIO_ALLOW_KVM_MSIX 1
struct VFIODevice;

typedef struct VFIOQuirk {
    MemoryRegion mem;
    struct VFIODevice *vdev;
    QLIST_ENTRY(VFIOQuirk) next;
    struct {
        uint32_t base_offset:TARGET_PAGE_BITS;
        uint32_t address_offset:TARGET_PAGE_BITS;
        uint32_t address_size:3;
        uint32_t bar:3;
        uint32_t address_match;
        uint32_t address_mask;
        uint32_t address_val:TARGET_PAGE_BITS;
        uint32_t data_offset:TARGET_PAGE_BITS;
        uint32_t data_size:3;
        uint8_t flags;
        uint8_t read_flags;
        uint8_t write_flags;
    } data;
} VFIOQuirk;

typedef struct VFIOBAR {
    off_t fd_offset; /* offset of BAR within device fd */
    int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
    MemoryRegion mem; /* slow, read/write access */
    MemoryRegion mmap_mem; /* direct mapped access */
    void *mmap;
    size_t size;
    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
    uint8_t nr; /* cache the BAR number for debug */
    bool ioport;
    bool mem64;
    QLIST_HEAD(, VFIOQuirk) quirks;
} VFIOBAR;

typedef struct VFIOVGARegion {
    MemoryRegion mem;
    off_t offset;
    int nr;
    QLIST_HEAD(, VFIOQuirk) quirks;
} VFIOVGARegion;

typedef struct VFIOVGA {
    off_t fd_offset;
    int fd;
    VFIOVGARegion region[QEMU_PCI_VGA_NUM_REGIONS];
} VFIOVGA;

typedef struct VFIOINTx {
    bool pending; /* interrupt pending */
    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
    uint8_t pin; /* which pin to pull for qemu_set_irq */
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
    PCIINTxRoute route; /* routing info for QEMU bypass */
    uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
} VFIOINTx;

typedef struct VFIOMSIVector {
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    struct VFIODevice *vdev; /* back pointer to device */
    MSIMessage msg; /* cache the MSI message so we know when it changes */
    int virq; /* KVM irqchip route for QEMU bypass */
    bool use;
} VFIOMSIVector;

enum {
    VFIO_INT_NONE = 0,
    VFIO_INT_INTx = 1,
    VFIO_INT_MSI = 2,
    VFIO_INT_MSIX = 3,
};

struct VFIOGroup;

typedef struct VFIOType1 {
    MemoryListener listener;
    int error;
    bool initialized;
} VFIOType1;

typedef struct VFIOContainer {
    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
    struct {
        /* enable abstraction to support various iommu backends */
        union {
            VFIOType1 type1;
        };
        void (*release)(struct VFIOContainer *);
    } iommu_data;
    QLIST_HEAD(, VFIOGroup) group_list;
    QLIST_ENTRY(VFIOContainer) next;
} VFIOContainer;

/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
typedef struct VFIOMSIXInfo {
    uint8_t table_bar;
    uint8_t pba_bar;
    uint16_t entries;
    uint32_t table_offset;
    uint32_t pba_offset;
    MemoryRegion mmap_mem;
    void *mmap;
} VFIOMSIXInfo;

typedef struct VFIODevice {
    PCIDevice pdev;
    int fd;
    VFIOINTx intx;
    unsigned int config_size;
    uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
    off_t config_offset; /* Offset of config space region within device fd */
    unsigned int rom_size;
    off_t rom_offset; /* Offset of ROM region within device fd */
    void *rom;
    int msi_cap_size;
    VFIOMSIVector *msi_vectors;
    VFIOMSIXInfo *msix;
    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
    int interrupt; /* Current interrupt type */
    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
    VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */
    PCIHostDeviceAddress host;
    QLIST_ENTRY(VFIODevice) next;
    struct VFIOGroup *group;
    EventNotifier err_notifier;
    uint32_t features;
#define VFIO_FEATURE_ENABLE_VGA_BIT 0
#define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
    int32_t bootindex;
    uint8_t pm_cap;
    bool reset_works;
    bool has_vga;
    bool pci_aer;
    bool has_flr;
    bool has_pm_reset;
    bool needs_reset;
    bool rom_read_failed;
} VFIODevice;

typedef struct VFIOGroup {
    int fd;
    int groupid;
    VFIOContainer *container;
    QLIST_HEAD(, VFIODevice) device_list;
    QLIST_ENTRY(VFIOGroup) next;
    QLIST_ENTRY(VFIOGroup) container_next;
} VFIOGroup;

typedef struct VFIORomBlacklistEntry {
    uint16_t vendor_id;
    uint16_t device_id;
} VFIORomBlacklistEntry;
/*
 * List of vendor/device ids for which to disable option ROM loading.
 * This avoids guest hangs during ROM execution, as seen with the
 * BCM 57810 card, for lack of a better way to handle such issues.
 * The user can still override by specifying a romfile or rombar=1.
 * Please see https://bugs.launchpad.net/qemu/+bug/1284874 for an
 * analysis of the 57810 card hang.  When adding a new vendor id/device
 * id combination below, please also add your card/environment details
 * and any information that could help with debugging to the bug
 * tracking this issue.
 */
static const VFIORomBlacklistEntry romblacklist[] = {
    /* Broadcom BCM 57810 */
    { 0x14e4, 0x168e }
};
#define MSIX_CAP_LENGTH 12

static QLIST_HEAD(, VFIOContainer)
    container_list = QLIST_HEAD_INITIALIZER(container_list);

static QLIST_HEAD(, VFIOGroup)
    group_list = QLIST_HEAD_INITIALIZER(group_list);

#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM.  Once created it lives
 * for the life of the VM.  Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm.  Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 */
static int vfio_kvm_device_fd = -1;
#endif

static void vfio_disable_interrupts(VFIODevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len);
static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);

/*
 * Common VFIO interrupt disable
 */
static void vfio_disable_irqindex(VFIODevice *vdev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = index,
        .start = 0,
        .count = 0,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
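
/*
 * VFIO_DEVICE_SET_IRQS is the single ioctl used throughout this file for
 * interrupt plumbing.  With VFIO_IRQ_SET_DATA_NONE | ACTION_TRIGGER and
 * count == 0, as above, the kernel disables all interrupts of the given
 * index.  When an eventfd needs to be handed to the kernel, the helpers
 * below instead build a variable-length buffer, roughly:
 *
 *     argsz = sizeof(struct vfio_irq_set) + n * sizeof(int32_t);
 *     irq_set = g_malloc0(argsz);
 *     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 *     fds = (int32_t *)&irq_set->data;    <-- eventfds follow the header
 *
 * See linux/vfio.h for the authoritative description of the interface.
 */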
/*
 * INTx
 */
static void vfio_unmask_intx(VFIODevice *vdev)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
        .index = VFIO_PCI_INTX_IRQ_INDEX,
        .start = 0,
        .count = 1,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

#ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
static void vfio_mask_intx(VFIODevice *vdev)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
        .index = VFIO_PCI_INTX_IRQ_INDEX,
        .start = 0,
        .count = 1,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
#endif
/*
 * Disabling BAR mmapping can be slow, but toggling it around INTx can
 * also be a huge overhead.  We try to get the best of both worlds by
 * waiting until an interrupt to disable mmaps (subsequent transitions
 * to the same state are effectively no overhead).  If the interrupt has
 * been serviced and the time gap is long enough, we re-enable mmaps for
 * performance.  This works well for things like graphics cards, which
 * may not use their interrupt at all and are penalized to an unusable
 * level by read/write BAR traps.  Other devices, like NICs, have more
 * regular interrupts and see much better latency by staying in non-mmap
 * mode.  We therefore set the default mmap_timeout such that a ping
 * is just enough to keep the mmap disabled.  Users can experiment with
 * other options with the x-intx-mmap-timeout-ms parameter (a value of
 * zero disables the timer).
 */
static void vfio_intx_mmap_enable(void *opaque)
{
    VFIODevice *vdev = opaque;

    if (vdev->intx.pending) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
        return;
    }

    vfio_mmap_set_enabled(vdev, true);
}

static void vfio_intx_interrupt(void *opaque)
{
    VFIODevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
        return;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function,
            'A' + vdev->intx.pin);

    vdev->intx.pending = true;
    pci_irq_assert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, false);
    if (vdev->intx.mmap_timeout) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
    }
}

static void vfio_eoi(VFIODevice *vdev)
{
    if (!vdev->intx.pending) {
        return;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_unmask_intx(vdev);
}
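
/*
 * KVM-accelerated INTx, a rough sketch of the plumbing set up below: the
 * interrupt eventfd is handed to KVM as an irqfd with
 * KVM_IRQFD_FLAG_RESAMPLE so the kernel injects the level IRQ directly,
 * and the unmask eventfd (the resamplefd) is handed to VFIO so that a
 * guest EOI unmasks the host interrupt without bouncing through QEMU.
 * If any step fails we fall back to the userspace path driven by
 * vfio_intx_interrupt()/vfio_eoi() above.
 */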
static void vfio_enable_intx_kvm(VFIODevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    struct vfio_irq_set *irq_set;
    int ret, argsz;
    int32_t *pfd;

    if (!VFIO_ALLOW_KVM_INTX || !kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
        return;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_intx(vdev);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_report("vfio: Error: event_notifier_init failed eoi");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to setup resample irqfd: %m");
        goto fail_irqfd;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = irqfd.resamplefd;

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx unmask fd: %m");
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_intx(vdev);

    vdev->intx.kvm_accel = true;

    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
            __func__, vdev->host.domain, vdev->host.bus,
            vdev->host.slot, vdev->host.function);

    return;

fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_intx(vdev);
#endif
}
static void vfio_disable_intx_kvm(VFIODevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_intx(vdev);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    }

    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_intx(vdev);

    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
            __func__, vdev->host.domain, vdev->host.bus,
            vdev->host.slot, vdev->host.function);
#endif
}
static void vfio_update_irq(PCIDevice *pdev)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    PCIINTxRoute route;

    if (vdev->interrupt != VFIO_INT_INTx) {
        return;
    }

    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);

    if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
        return; /* Nothing changed */
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, vdev->intx.route.irq, route.irq);

    vfio_disable_intx_kvm(vdev);

    vdev->intx.route = route;

    if (route.mode != PCI_INTX_ENABLED) {
        return;
    }

    vfio_enable_intx_kvm(vdev);

    /* Re-enable the interrupt in case we missed an EOI */
    vfio_eoi(vdev);
}
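
/*
 * vfio_update_irq() is registered as the device's INTx routing notifier
 * (via pci_device_set_intx_routing_notifier() in the device init code,
 * outside this excerpt), so the KVM irqfd above is torn down and rebuilt
 * whenever the chipset reroutes the INTx pin to a different IRQ.
 */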
static int vfio_enable_intx(VFIODevice *vdev)
{
    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    int ret, argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    if (!pin) {
        return 0;
    }

    vfio_disable_interrupts(vdev);

    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
    pci_config_set_interrupt_pin(vdev->pdev.config, pin);

#ifdef CONFIG_KVM
    /*
     * Only conditional to avoid generating error messages on platforms
     * where we won't actually use the result anyway.
     */
    if (kvm_irqfds_enabled() &&
        kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
                                                        vdev->intx.pin);
    }
#endif

    ret = event_notifier_init(&vdev->intx.interrupt, 0);
    if (ret) {
        error_report("vfio: Error: event_notifier_init failed");
        return ret;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx fd: %m");
        /* pfd points into the freed irq_set; re-fetch the fd instead */
        qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
                            NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->intx.interrupt);
        return -errno;
    }

    vfio_enable_intx_kvm(vdev);

    vdev->interrupt = VFIO_INT_INTx;

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    return 0;
}
static void vfio_disable_intx(VFIODevice *vdev)
{
    int fd;

    timer_del(vdev->intx.mmap_timer);
    vfio_disable_intx_kvm(vdev);
    vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, true);

    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->intx.interrupt);

    vdev->interrupt = VFIO_INT_NONE;

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

/*
 * MSI/X
 */
static void vfio_msi_interrupt(void *opaque)
{
    VFIOMSIVector *vector = opaque;
    VFIODevice *vdev = vector->vdev;
    int nr = vector - vdev->msi_vectors;

    if (!event_notifier_test_and_clear(&vector->interrupt)) {
        return;
    }

#ifdef DEBUG_VFIO
    MSIMessage msg;

    if (vdev->interrupt == VFIO_INT_MSIX) {
        msg = msix_get_message(&vdev->pdev, nr);
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        msg = msi_get_message(&vdev->pdev, nr);
    } else {
        abort();
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d 0x%"PRIx64"/0x%x\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, nr, msg.address, msg.data);
#endif

    if (vdev->interrupt == VFIO_INT_MSIX) {
        msix_notify(&vdev->pdev, nr);
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        msi_notify(&vdev->pdev, nr);
    } else {
        error_report("vfio: MSI interrupt received, but not enabled?");
    }
}
static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
{
    struct vfio_irq_set *irq_set;
    int ret = 0, i, argsz;
    int32_t *fds;

    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = vdev->nr_vectors;
    fds = (int32_t *)&irq_set->data;

    for (i = 0; i < vdev->nr_vectors; i++) {
        if (!vdev->msi_vectors[i].use) {
            fds[i] = -1;
            continue;
        }

        fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
    }

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    return ret;
}
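
/*
 * In the eventfd array above, a file descriptor of -1 tells the kernel to
 * leave that particular vector de-assigned (per the VFIO_DEVICE_SET_IRQS
 * description in linux/vfio.h), so the guest can enable a sparse set of
 * vectors while unused slots stay silent on the host.
 */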
static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOMSIVector *vector;
    int ret;

    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, nr);

    vector = &vdev->msi_vectors[nr];
    vector->vdev = vdev;
    vector->use = true;

    msix_vector_use(pdev, nr);

    if (event_notifier_init(&vector->interrupt, 0)) {
        error_report("vfio: Error: event_notifier_init failed");
    }

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    vector->virq = msg && VFIO_ALLOW_KVM_MSIX ?
                   kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
    if (vector->virq < 0 ||
        kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
                                       NULL, vector->virq) < 0) {
        if (vector->virq >= 0) {
            kvm_irqchip_release_virq(kvm_state, vector->virq);
            vector->virq = -1;
        }
        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            handler, NULL, vector);
    }

    /*
     * We don't want to have the host allocate all possible MSI vectors
     * for a device if they're not in use, so we shut them all down and
     * incrementally increase them as needed.
     */
    if (vdev->nr_vectors < nr + 1) {
        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
        vdev->nr_vectors = nr + 1;
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d", ret);
        }
    } else {
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        *pfd = event_notifier_get_fd(&vector->interrupt);

        ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
        g_free(irq_set);
        if (ret) {
            error_report("vfio: failed to modify vector, %d", ret);
        }
    }

    return 0;
}
static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}

static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];
    int argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, nr);

    /*
     * XXX What's the right thing to do here?  This turns off the interrupt
     * completely, but do we really just want to switch the interrupt to
     * bouncing through userspace and let msix.c drop it?  Not sure.
     */
    msix_vector_unuse(pdev, nr);

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
    irq_set->start = nr;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = -1;

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    if (vector->virq < 0) {
        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            NULL, NULL, NULL);
    } else {
        kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
                                          vector->virq);
        kvm_irqchip_release_virq(kvm_state, vector->virq);
        vector->virq = -1;
    }

    event_notifier_cleanup(&vector->interrupt);
    vector->use = false;
}
static void vfio_enable_msix(VFIODevice *vdev)
{
    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));

    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Some communication channels between VF & PF or PF & fw rely on the
     * physical state of the device and expect that enabling MSI-X from the
     * guest enables the same on the host.  When our guest is Linux, the
     * guest driver call to pci_enable_msix() sets the enabling bit in the
     * MSI-X capability, but leaves the vector table masked.  We therefore
     * can't rely on a vector_use callback (from request_irq() in the guest)
     * to switch the physical device into MSI-X mode because that may come a
     * long time after pci_enable_msix().  This code enables vector 0 with
     * triggering to userspace, then immediately releases the vector, leaving
     * the physical device with no vectors enabled, but MSI-X enabled, just
     * like the guest view.
     */
    vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
    vfio_msix_vector_release(&vdev->pdev, 0);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed");
    }

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}
static void vfio_enable_msi(VFIODevice *vdev)
{
    int ret, i;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
retry:
    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        vector->vdev = vdev;
        vector->use = true;

        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }

        vector->msg = msi_get_message(&vdev->pdev, i);

        /*
         * Attempt to enable route through KVM irqchip,
         * default to userspace handling if unavailable.
         */
        vector->virq = VFIO_ALLOW_KVM_MSI ?
                       kvm_irqchip_add_msi_route(kvm_state, vector->msg) : -1;
        if (vector->virq < 0 ||
            kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
                                           NULL, vector->virq) < 0) {
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                vfio_msi_interrupt, NULL, vector);
        }
    }

    ret = vfio_enable_vectors(vdev, false);
    if (ret) {
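        /*
         * A negative return is a plain failure; a positive return is, as
         * far as this code assumes, the number of vectors the host could
         * actually back, so we tear everything down and retry with that
         * smaller count below.
         */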
        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %m");
        } else if (ret != vdev->nr_vectors) {
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
        }

        for (i = 0; i < vdev->nr_vectors; i++) {
            VFIOMSIVector *vector = &vdev->msi_vectors[i];
            if (vector->virq >= 0) {
                kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
                                                  vector->virq);
                kvm_irqchip_release_virq(kvm_state, vector->virq);
                vector->virq = -1;
            } else {
                qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                    NULL, NULL, NULL);
            }
            event_notifier_cleanup(&vector->interrupt);
        }

        g_free(vdev->msi_vectors);

        if (ret > 0 && ret != vdev->nr_vectors) {
            vdev->nr_vectors = ret;
            goto retry;
        }

        vdev->nr_vectors = 0;

        return;
    }

    vdev->interrupt = VFIO_INT_MSI;

    DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, vdev->nr_vectors);
}
static void vfio_disable_msi_common(VFIODevice *vdev)
{
    g_free(vdev->msi_vectors);
    vdev->msi_vectors = NULL;
    vdev->nr_vectors = 0;
    vdev->interrupt = VFIO_INT_NONE;

    vfio_enable_intx(vdev);
}

static void vfio_disable_msix(VFIODevice *vdev)
{
    int i;

    msix_unset_vector_notifiers(&vdev->pdev);

    /*
     * MSI-X will only release vectors if MSI-X is still enabled on the
     * device, check through the rest and release it ourselves if necessary.
     */
    for (i = 0; i < vdev->nr_vectors; i++) {
        if (vdev->msi_vectors[i].use) {
            vfio_msix_vector_release(&vdev->pdev, i);
        }
    }

    if (vdev->nr_vectors) {
        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
    }

    vfio_disable_msi_common(vdev);

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

static void vfio_disable_msi(VFIODevice *vdev)
{
    int i;

    vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        if (!vector->use) {
            continue;
        }

        if (vector->virq >= 0) {
            kvm_irqchip_remove_irqfd_notifier(kvm_state,
                                              &vector->interrupt, vector->virq);
            kvm_irqchip_release_virq(kvm_state, vector->virq);
            vector->virq = -1;
        } else {
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
        }

        event_notifier_cleanup(&vector->interrupt);
    }

    vfio_disable_msi_common(vdev);

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

static void vfio_update_msi(VFIODevice *vdev)
{
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        MSIMessage msg;

        if (!vector->use || vector->virq < 0) {
            continue;
        }

        msg = msi_get_message(&vdev->pdev, i);

        if (msg.address != vector->msg.address ||
            msg.data != vector->msg.data) {

            DPRINTF("%s(%04x:%02x:%02x.%x) MSI vector %d changed\n",
                    __func__, vdev->host.domain, vdev->host.bus,
                    vdev->host.slot, vdev->host.function, i);

            kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
            vector->msg = msg;
        }
    }
}
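
/*
 * vfio_update_msi() is called from the config-space write path (outside
 * this excerpt) after the guest touches the MSI address/data registers,
 * so any KVM irqchip routes set up in vfio_enable_msi() keep delivering
 * to the message the guest currently expects.
 */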
/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
static void vfio_bar_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
    VFIOBAR *bar = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %d bytes", size);
        break;
    }

    if (pwrite(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
                     __func__, addr, data, size);
    }

#ifdef DEBUG_VFIO
    {
        VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar->nr]);

        DPRINTF("%s(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                ", %d)\n", __func__, vdev->host.domain, vdev->host.bus,
                vdev->host.slot, vdev->host.function, bar->nr, addr,
                data, size);
    }
#endif

    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));
}

static uint64_t vfio_bar_read(void *opaque,
                              hwaddr addr, unsigned size)
{
    VFIOBAR *bar = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, addr, size);
        return (uint64_t)-1;
    }

    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes", size);
        break;
    }

#ifdef DEBUG_VFIO
    {
        VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar->nr]);

        DPRINTF("%s(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx
                ", %d) = 0x%"PRIx64"\n", __func__, vdev->host.domain,
                vdev->host.bus, vdev->host.slot, vdev->host.function,
                bar->nr, addr, size, data);
    }
#endif

    /* Same as write above */
    vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));

    return data;
}

static const MemoryRegionOps vfio_bar_ops = {
    .read = vfio_bar_read,
    .write = vfio_bar_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
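
/*
 * The MemoryRegionOps above implement the "slow" BAR path: every guest
 * access becomes a pread()/pwrite() on the device fd at fd_offset + addr.
 * When the region supports it (and VFIO_ALLOW_MMAP is set), a second,
 * directly mmap()ed MemoryRegion is layered on top elsewhere in this file
 * so that most accesses never reach these handlers; the trap path remains
 * for quirks and for the INTx EOI heuristic noted above.
 */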
static void vfio_pci_load_rom(VFIODevice *vdev)
{
    struct vfio_region_info reg_info = {
        .argsz = sizeof(reg_info),
        .index = VFIO_PCI_ROM_REGION_INDEX
    };
    uint64_t size;
    off_t off = 0;
    ssize_t bytes; /* signed: pread() returns -1 on error */

    if (ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info)) {
        error_report("vfio: Error getting ROM info: %m");
        return;
    }

    DPRINTF("Device %04x:%02x:%02x.%x ROM:\n", vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
    DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
            (unsigned long)reg_info.flags);

    vdev->rom_size = size = reg_info.size;
    vdev->rom_offset = reg_info.offset;

    if (!vdev->rom_size) {
        vdev->rom_read_failed = true;
        error_report("vfio-pci: Cannot read device rom at "
                     "%04x:%02x:%02x.%x",
                     vdev->host.domain, vdev->host.bus, vdev->host.slot,
                     vdev->host.function);
        error_printf("Device option ROM contents are probably invalid "
                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
                     "or load from file with romfile=\n");
        return;
    }

    vdev->rom = g_malloc(size);
    memset(vdev->rom, 0xff, size);

    while (size) {
        bytes = pread(vdev->fd, vdev->rom + off, size, vdev->rom_offset + off);
        if (bytes == 0) {
            break;
        } else if (bytes > 0) {
            off += bytes;
            size -= bytes;
        } else {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("vfio: Error reading device ROM: %m");
            break;
        }
    }
}

static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIODevice *vdev = opaque;
    uint64_t val = ((uint64_t)1 << (size * 8)) - 1;

    /* Load the ROM lazily when the guest tries to read it */
    if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
        vfio_pci_load_rom(vdev);
    }

    memcpy(&val, vdev->rom + addr,
           (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);

    DPRINTF("%s(%04x:%02x:%02x.%x, 0x%"HWADDR_PRIx", 0x%x) = 0x%"PRIx64"\n",
            __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, addr, size, val);

    return val;
}

static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
}
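
/*
 * Writes to the option ROM BAR are intentionally dropped: the guest only
 * needs to size and read the ROM copy that vfio_pci_load_rom() cached,
 * and nothing is ever written back to the physical device.
 */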
static const MemoryRegionOps vfio_rom_ops = {
    .read = vfio_rom_read,
    .write = vfio_rom_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
static bool vfio_blacklist_opt_rom(VFIODevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    uint16_t vendor_id, device_id;
    int count = 0;

    vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
    device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);

    while (count < ARRAY_SIZE(romblacklist)) {
        if (romblacklist[count].vendor_id == vendor_id &&
            romblacklist[count].device_id == device_id) {
            return true;
        }
        count++;
    }

    return false;
}
static void vfio_pci_size_rom(VFIODevice *vdev)
{
    uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
    off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
    DeviceState *dev = DEVICE(vdev);
    char name[32];

    if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
        /* Since pci handles romfile, just print a message and return */
        if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) {
            error_printf("Warning : Device at %04x:%02x:%02x.%x "
                         "is known to cause system instability issues during "
                         "option rom execution. "
                         "Proceeding anyway since user specified romfile\n",
                         vdev->host.domain, vdev->host.bus, vdev->host.slot,
                         vdev->host.function);
        }
        return;
    }

    /*
     * Use the same size ROM BAR as the physical device.  The contents
     * will get filled in later when the guest tries to read it.
     */
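    /*
     * This is the standard PCI BAR sizing probe: save the original ROM BAR
     * value, write all 1s to the address bits, read back which bits the
     * device let us set, restore the original value, and the size falls
     * out as roughly:
     *
     *     size = ~(readback & PCI_ROM_ADDRESS_MASK) + 1;
     *
     * It is done here through pread()/pwrite() on the device's config
     * region so the probe touches the physical device rather than QEMU's
     * emulated config space.
     */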
    if (pread(vdev->fd, &orig, 4, offset) != 4 ||
        pwrite(vdev->fd, &size, 4, offset) != 4 ||
        pread(vdev->fd, &size, 4, offset) != 4 ||
        pwrite(vdev->fd, &orig, 4, offset) != 4) {
        error_report("%s(%04x:%02x:%02x.%x) failed: %m",
                     __func__, vdev->host.domain, vdev->host.bus,
                     vdev->host.slot, vdev->host.function);
        return;
    }

    size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;

    if (!size) {
        return;
    }

    if (vfio_blacklist_opt_rom(vdev)) {
        if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
            error_printf("Warning : Device at %04x:%02x:%02x.%x "
                         "is known to cause system instability issues during "
                         "option rom execution. "
                         "Proceeding anyway since user specified non zero value for "
                         "rombar\n",
                         vdev->host.domain, vdev->host.bus, vdev->host.slot,
                         vdev->host.function);
        } else {
            error_printf("Warning : Rom loading for device at "
                         "%04x:%02x:%02x.%x has been disabled due to "
                         "system instability issues. "
                         "Specify rombar=1 or romfile to force\n",
                         vdev->host.domain, vdev->host.bus, vdev->host.slot,
                         vdev->host.function);
            return;
        }
    }

    DPRINTF("%04x:%02x:%02x.%x ROM size 0x%x\n", vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function, size);

    snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);

    memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
                          &vfio_rom_ops, vdev, name, size);

    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
                     PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);

    vdev->pdev.has_rom = true;
    vdev->rom_read_failed = false;
}
static void vfio_vga_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    off_t offset = vga->fd_offset + region->offset + addr;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %d bytes", size);
        break;
    }

    if (pwrite(vga->fd, &buf, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
                     __func__, region->offset + addr, data, size);
    }

    DPRINTF("%s(0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n",
            __func__, region->offset + addr, data, size);
}

static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;
    off_t offset = vga->fd_offset + region->offset + addr;

    if (pread(vga->fd, &buf, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, region->offset + addr, size);
        return (uint64_t)-1;
    }

    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes", size);
        break;
    }

    DPRINTF("%s(0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n",
            __func__, region->offset + addr, size, data);

    return data;
}

static const MemoryRegionOps vfio_vga_ops = {
    .read = vfio_vga_read,
    .write = vfio_vga_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
  1168. /*
  1169. * Device specific quirks
  1170. */
  1171. /* Is range1 fully contained within range2? */
  1172. static bool vfio_range_contained(uint64_t first1, uint64_t len1,
  1173. uint64_t first2, uint64_t len2) {
  1174. return (first1 >= first2 && first1 + len1 <= first2 + len2);
  1175. }
  1176. static bool vfio_flags_enabled(uint8_t flags, uint8_t mask)
  1177. {
  1178. return (mask && (flags & mask) == mask);
  1179. }
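/*
 * Generic address/data window quirk: the device exposes a small window in
 * a BAR where an address register selects an offset and a data register
 * accesses it. While the programmed address falls within the config space
 * mirror (address_match/address_mask), data accesses are redirected to the
 * emulated PCI config space handlers; everything else passes straight
 * through to the underlying BAR.
 */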
  1180. static uint64_t vfio_generic_window_quirk_read(void *opaque,
  1181. hwaddr addr, unsigned size)
  1182. {
  1183. VFIOQuirk *quirk = opaque;
  1184. VFIODevice *vdev = quirk->vdev;
  1185. uint64_t data;
  1186. if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
  1187. ranges_overlap(addr, size,
  1188. quirk->data.data_offset, quirk->data.data_size)) {
  1189. hwaddr offset = addr - quirk->data.data_offset;
  1190. if (!vfio_range_contained(addr, size, quirk->data.data_offset,
  1191. quirk->data.data_size)) {
  1192. hw_error("%s: window data read not fully contained: %s",
  1193. __func__, memory_region_name(&quirk->mem));
  1194. }
  1195. data = vfio_pci_read_config(&vdev->pdev,
  1196. quirk->data.address_val + offset, size);
  1197. DPRINTF("%s read(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", %d) = 0x%"
  1198. PRIx64"\n", memory_region_name(&quirk->mem), vdev->host.domain,
  1199. vdev->host.bus, vdev->host.slot, vdev->host.function,
  1200. quirk->data.bar, addr, size, data);
  1201. } else {
  1202. data = vfio_bar_read(&vdev->bars[quirk->data.bar],
  1203. addr + quirk->data.base_offset, size);
  1204. }
  1205. return data;
  1206. }
  1207. static void vfio_generic_window_quirk_write(void *opaque, hwaddr addr,
  1208. uint64_t data, unsigned size)
  1209. {
  1210. VFIOQuirk *quirk = opaque;
  1211. VFIODevice *vdev = quirk->vdev;
  1212. if (ranges_overlap(addr, size,
  1213. quirk->data.address_offset, quirk->data.address_size)) {
  1214. if (addr != quirk->data.address_offset) {
  1215. hw_error("%s: offset write into address window: %s",
  1216. __func__, memory_region_name(&quirk->mem));
  1217. }
  1218. if ((data & ~quirk->data.address_mask) == quirk->data.address_match) {
  1219. quirk->data.flags |= quirk->data.write_flags |
  1220. quirk->data.read_flags;
  1221. quirk->data.address_val = data & quirk->data.address_mask;
  1222. } else {
  1223. quirk->data.flags &= ~(quirk->data.write_flags |
  1224. quirk->data.read_flags);
  1225. }
  1226. }
  1227. if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
  1228. ranges_overlap(addr, size,
  1229. quirk->data.data_offset, quirk->data.data_size)) {
  1230. hwaddr offset = addr - quirk->data.data_offset;
  1231. if (!vfio_range_contained(addr, size, quirk->data.data_offset,
  1232. quirk->data.data_size)) {
  1233. hw_error("%s: window data write not fully contained: %s",
  1234. __func__, memory_region_name(&quirk->mem));
  1235. }
  1236. vfio_pci_write_config(&vdev->pdev,
  1237. quirk->data.address_val + offset, data, size);
  1238. DPRINTF("%s write(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", 0x%"
  1239. PRIx64", %d)\n", memory_region_name(&quirk->mem),
  1240. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1241. vdev->host.function, quirk->data.bar, addr, data, size);
  1242. return;
  1243. }
  1244. vfio_bar_write(&vdev->bars[quirk->data.bar],
  1245. addr + quirk->data.base_offset, data, size);
  1246. }
  1247. static const MemoryRegionOps vfio_generic_window_quirk = {
  1248. .read = vfio_generic_window_quirk_read,
  1249. .write = vfio_generic_window_quirk_write,
  1250. .endianness = DEVICE_LITTLE_ENDIAN,
  1251. };
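/*
 * Generic mirror quirk: a fixed range of the BAR (address_match, sized by
 * address_mask + 1) directly mirrors PCI config space. Accesses within the
 * mirror are redirected to the emulated config space handlers; accesses to
 * the rest of the page pass through to the BAR.
 */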
  1252. static uint64_t vfio_generic_quirk_read(void *opaque,
  1253. hwaddr addr, unsigned size)
  1254. {
  1255. VFIOQuirk *quirk = opaque;
  1256. VFIODevice *vdev = quirk->vdev;
  1257. hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
  1258. hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
  1259. uint64_t data;
  1260. if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
  1261. ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
  1262. if (!vfio_range_contained(addr, size, offset,
  1263. quirk->data.address_mask + 1)) {
  1264. hw_error("%s: read not fully contained: %s",
  1265. __func__, memory_region_name(&quirk->mem));
  1266. }
  1267. data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);
  1268. DPRINTF("%s read(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", %d) = 0x%"
  1269. PRIx64"\n", memory_region_name(&quirk->mem), vdev->host.domain,
  1270. vdev->host.bus, vdev->host.slot, vdev->host.function,
  1271. quirk->data.bar, addr + base, size, data);
  1272. } else {
  1273. data = vfio_bar_read(&vdev->bars[quirk->data.bar], addr + base, size);
  1274. }
  1275. return data;
  1276. }
  1277. static void vfio_generic_quirk_write(void *opaque, hwaddr addr,
  1278. uint64_t data, unsigned size)
  1279. {
  1280. VFIOQuirk *quirk = opaque;
  1281. VFIODevice *vdev = quirk->vdev;
  1282. hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
  1283. hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
  1284. if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
  1285. ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
  1286. if (!vfio_range_contained(addr, size, offset,
  1287. quirk->data.address_mask + 1)) {
  1288. hw_error("%s: write not fully contained: %s",
  1289. __func__, memory_region_name(&quirk->mem));
  1290. }
  1291. vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);
  1292. DPRINTF("%s write(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", 0x%"
  1293. PRIx64", %d)\n", memory_region_name(&quirk->mem),
  1294. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1295. vdev->host.function, quirk->data.bar, addr + base, data, size);
  1296. } else {
  1297. vfio_bar_write(&vdev->bars[quirk->data.bar], addr + base, data, size);
  1298. }
  1299. }
  1300. static const MemoryRegionOps vfio_generic_quirk = {
  1301. .read = vfio_generic_quirk_read,
  1302. .write = vfio_generic_quirk_write,
  1303. .endianness = DEVICE_LITTLE_ENDIAN,
  1304. };
  1305. #define PCI_VENDOR_ID_ATI 0x1002
  1306. /*
  1307. * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
  1308. * through VGA register 0x3c3. On newer cards, the I/O port BAR is always
  1309. * BAR4 (older cards like the X550 used BAR1, but we don't care to support
  1310. * those). Note that on bare metal, a read of 0x3c3 doesn't always return the
  1311. * I/O port BAR address. Originally this was coded to return the virtual BAR
  1312. * address only if the physical register read returns the actual BAR address,
  1313. * but users have reported greater success if we return the virtual address
  1314. * unconditionally.
  1315. */
  1316. static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
  1317. hwaddr addr, unsigned size)
  1318. {
  1319. VFIOQuirk *quirk = opaque;
  1320. VFIODevice *vdev = quirk->vdev;
  1321. uint64_t data = vfio_pci_read_config(&vdev->pdev,
  1322. PCI_BASE_ADDRESS_0 + (4 * 4) + 1,
  1323. size);
  1324. DPRINTF("%s(0x3c3, 1) = 0x%"PRIx64"\n", __func__, data);
  1325. return data;
  1326. }
  1327. static const MemoryRegionOps vfio_ati_3c3_quirk = {
  1328. .read = vfio_ati_3c3_quirk_read,
  1329. .endianness = DEVICE_LITTLE_ENDIAN,
  1330. };
  1331. static void vfio_vga_probe_ati_3c3_quirk(VFIODevice *vdev)
  1332. {
  1333. PCIDevice *pdev = &vdev->pdev;
  1334. VFIOQuirk *quirk;
  1335. if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
  1336. return;
  1337. }
  1338. /*
  1339. * As long as the BAR is >= 256 bytes it will be aligned such that the
  1340. * lower byte is always zero. Filter out anything else, if it exists.
  1341. */
  1342. if (!vdev->bars[4].ioport || vdev->bars[4].size < 256) {
  1343. return;
  1344. }
  1345. quirk = g_malloc0(sizeof(*quirk));
  1346. quirk->vdev = vdev;
  1347. memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, quirk,
  1348. "vfio-ati-3c3-quirk", 1);
  1349. memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
  1350. 3 /* offset 3 bytes from 0x3c0 */, &quirk->mem);
  1351. QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
  1352. quirk, next);
  1353. DPRINTF("Enabled ATI/AMD quirk 0x3c3 BAR4for device %04x:%02x:%02x.%x\n",
  1354. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1355. vdev->host.function);
  1356. }
  1357. /*
  1358. * Newer ATI/AMD devices, including HD5450 and HD7850, have a window to PCI
  1359. * config space through MMIO BAR2 at offset 0x4000. Nothing seems to access
  1360. * the MMIO space directly, but a window to this space is provided through
  1361. * I/O port BAR4. Offset 0x0 is the address register and offset 0x4 is the
1362. data register. When the address is programmed to the range 0x4000-0x4fff,
  1363. * PCI configuration space is available. Experimentation seems to indicate
  1364. * that only read-only access is provided, but we drop writes when the window
  1365. * is enabled to config space nonetheless.
  1366. */
  1367. static void vfio_probe_ati_bar4_window_quirk(VFIODevice *vdev, int nr)
  1368. {
  1369. PCIDevice *pdev = &vdev->pdev;
  1370. VFIOQuirk *quirk;
  1371. if (!vdev->has_vga || nr != 4 ||
  1372. pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
  1373. return;
  1374. }
  1375. quirk = g_malloc0(sizeof(*quirk));
  1376. quirk->vdev = vdev;
  1377. quirk->data.address_size = 4;
  1378. quirk->data.data_offset = 4;
  1379. quirk->data.data_size = 4;
  1380. quirk->data.address_match = 0x4000;
  1381. quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
  1382. quirk->data.bar = nr;
  1383. quirk->data.read_flags = quirk->data.write_flags = 1;
  1384. memory_region_init_io(&quirk->mem, OBJECT(vdev),
  1385. &vfio_generic_window_quirk, quirk,
  1386. "vfio-ati-bar4-window-quirk", 8);
  1387. memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
  1388. quirk->data.base_offset, &quirk->mem, 1);
  1389. QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
  1390. DPRINTF("Enabled ATI/AMD BAR4 window quirk for device %04x:%02x:%02x.%x\n",
  1391. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1392. vdev->host.function);
  1393. }
  1394. /*
  1395. * Trap the BAR2 MMIO window to config space as well.
  1396. */
  1397. static void vfio_probe_ati_bar2_4000_quirk(VFIODevice *vdev, int nr)
  1398. {
  1399. PCIDevice *pdev = &vdev->pdev;
  1400. VFIOQuirk *quirk;
  1401. /* Only enable on newer devices where BAR2 is 64bit */
  1402. if (!vdev->has_vga || nr != 2 || !vdev->bars[2].mem64 ||
  1403. pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
  1404. return;
  1405. }
  1406. quirk = g_malloc0(sizeof(*quirk));
  1407. quirk->vdev = vdev;
  1408. quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
  1409. quirk->data.address_match = 0x4000;
  1410. quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
  1411. quirk->data.bar = nr;
  1412. memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
  1413. "vfio-ati-bar2-4000-quirk",
  1414. TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
  1415. memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
  1416. quirk->data.address_match & TARGET_PAGE_MASK,
  1417. &quirk->mem, 1);
  1418. QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
  1419. DPRINTF("Enabled ATI/AMD BAR2 0x4000 quirk for device %04x:%02x:%02x.%x\n",
  1420. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1421. vdev->host.function);
  1422. }
  1423. /*
  1424. * Older ATI/AMD cards like the X550 have a similar window to that above.
  1425. * I/O port BAR1 provides a window to a mirror of PCI config space located
  1426. * in BAR2 at offset 0xf00. We don't care to support such older cards, but
  1427. * note it for future reference.
  1428. */
  1429. #define PCI_VENDOR_ID_NVIDIA 0x10de
  1430. /*
1431. * Nvidia has several different methods to get to config space; the
1432. * nouveau project has several of these documented here:
  1433. * https://github.com/pathscale/envytools/tree/master/hwdocs
  1434. *
  1435. * The first quirk is actually not documented in envytools and is found
  1436. * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an
  1437. * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access
  1438. * the mirror of PCI config space found at BAR0 offset 0x1800. The access
  1439. * sequence first writes 0x338 to I/O port 0x3d4. The target offset is
1440. * then written to 0x3d0. Finally, 0x538 (for a read) or 0x738 (for a
1441. * write) is written to 0x3d4. The BAR0 offset is then accessible
  1442. * through 0x3d0. This quirk doesn't seem to be necessary on newer cards
  1443. * that use the I/O port BAR5 window but it doesn't hurt to leave it.
  1444. */
  1445. enum {
  1446. NV_3D0_NONE = 0,
  1447. NV_3D0_SELECT,
  1448. NV_3D0_WINDOW,
  1449. NV_3D0_READ,
  1450. NV_3D0_WRITE,
  1451. };
  1452. static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
  1453. hwaddr addr, unsigned size)
  1454. {
  1455. VFIOQuirk *quirk = opaque;
  1456. VFIODevice *vdev = quirk->vdev;
  1457. PCIDevice *pdev = &vdev->pdev;
  1458. uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
  1459. addr + quirk->data.base_offset, size);
  1460. if (quirk->data.flags == NV_3D0_READ && addr == quirk->data.data_offset) {
  1461. data = vfio_pci_read_config(pdev, quirk->data.address_val, size);
  1462. DPRINTF("%s(0x3d0, %d) = 0x%"PRIx64"\n", __func__, size, data);
  1463. }
  1464. quirk->data.flags = NV_3D0_NONE;
  1465. return data;
  1466. }
  1467. static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
  1468. uint64_t data, unsigned size)
  1469. {
  1470. VFIOQuirk *quirk = opaque;
  1471. VFIODevice *vdev = quirk->vdev;
  1472. PCIDevice *pdev = &vdev->pdev;
  1473. switch (quirk->data.flags) {
  1474. case NV_3D0_NONE:
  1475. if (addr == quirk->data.address_offset && data == 0x338) {
  1476. quirk->data.flags = NV_3D0_SELECT;
  1477. }
  1478. break;
  1479. case NV_3D0_SELECT:
  1480. quirk->data.flags = NV_3D0_NONE;
  1481. if (addr == quirk->data.data_offset &&
  1482. (data & ~quirk->data.address_mask) == quirk->data.address_match) {
  1483. quirk->data.flags = NV_3D0_WINDOW;
  1484. quirk->data.address_val = data & quirk->data.address_mask;
  1485. }
  1486. break;
  1487. case NV_3D0_WINDOW:
  1488. quirk->data.flags = NV_3D0_NONE;
  1489. if (addr == quirk->data.address_offset) {
  1490. if (data == 0x538) {
  1491. quirk->data.flags = NV_3D0_READ;
  1492. } else if (data == 0x738) {
  1493. quirk->data.flags = NV_3D0_WRITE;
  1494. }
  1495. }
  1496. break;
  1497. case NV_3D0_WRITE:
  1498. quirk->data.flags = NV_3D0_NONE;
  1499. if (addr == quirk->data.data_offset) {
  1500. vfio_pci_write_config(pdev, quirk->data.address_val, data, size);
  1501. DPRINTF("%s(0x3d0, 0x%"PRIx64", %d)\n", __func__, data, size);
  1502. return;
  1503. }
  1504. break;
  1505. }
  1506. vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
  1507. addr + quirk->data.base_offset, data, size);
  1508. }
  1509. static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
  1510. .read = vfio_nvidia_3d0_quirk_read,
  1511. .write = vfio_nvidia_3d0_quirk_write,
  1512. .endianness = DEVICE_LITTLE_ENDIAN,
  1513. };
  1514. static void vfio_vga_probe_nvidia_3d0_quirk(VFIODevice *vdev)
  1515. {
  1516. PCIDevice *pdev = &vdev->pdev;
  1517. VFIOQuirk *quirk;
  1518. if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA ||
  1519. !vdev->bars[1].size) {
  1520. return;
  1521. }
  1522. quirk = g_malloc0(sizeof(*quirk));
  1523. quirk->vdev = vdev;
  1524. quirk->data.base_offset = 0x10;
  1525. quirk->data.address_offset = 4;
  1526. quirk->data.address_size = 2;
  1527. quirk->data.address_match = 0x1800;
  1528. quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
  1529. quirk->data.data_offset = 0;
  1530. quirk->data.data_size = 4;
  1531. memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_3d0_quirk,
  1532. quirk, "vfio-nvidia-3d0-quirk", 6);
  1533. memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
  1534. quirk->data.base_offset, &quirk->mem);
  1535. QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
  1536. quirk, next);
  1537. DPRINTF("Enabled NVIDIA VGA 0x3d0 quirk for device %04x:%02x:%02x.%x\n",
  1538. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1539. vdev->host.function);
  1540. }
  1541. /*
  1542. * The second quirk is documented in envytools. The I/O port BAR5 is just
  1543. * a set of address/data ports to the MMIO BARs. The BAR we care about is
1544. * again BAR0. This backdoor is apparently a bit newer than the one above,
1545. * so we need to trap not only the 256 bytes @0x1800, but all of PCI config
1546. * space, including extended space, which is available as a 4k window @0x88000.
  1547. */
  1548. enum {
  1549. NV_BAR5_ADDRESS = 0x1,
  1550. NV_BAR5_ENABLE = 0x2,
  1551. NV_BAR5_MASTER = 0x4,
  1552. NV_BAR5_VALID = 0x7,
  1553. };
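/*
 * The BAR5 window is only considered valid once the guest has set the
 * master (offset 0x0) and enable (offset 0x4) bits and programmed the
 * address register (offset 0x8) into one of the config space mirrors:
 * 0x88000 (full 4k) or 0x1800 (standard 256 bytes). The NV_BAR5_* flags
 * track that state so the generic window quirk only engages when all
 * three conditions hold.
 */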
  1554. static void vfio_nvidia_bar5_window_quirk_write(void *opaque, hwaddr addr,
  1555. uint64_t data, unsigned size)
  1556. {
  1557. VFIOQuirk *quirk = opaque;
  1558. switch (addr) {
  1559. case 0x0:
  1560. if (data & 0x1) {
  1561. quirk->data.flags |= NV_BAR5_MASTER;
  1562. } else {
  1563. quirk->data.flags &= ~NV_BAR5_MASTER;
  1564. }
  1565. break;
  1566. case 0x4:
  1567. if (data & 0x1) {
  1568. quirk->data.flags |= NV_BAR5_ENABLE;
  1569. } else {
  1570. quirk->data.flags &= ~NV_BAR5_ENABLE;
  1571. }
  1572. break;
  1573. case 0x8:
  1574. if (quirk->data.flags & NV_BAR5_MASTER) {
  1575. if ((data & ~0xfff) == 0x88000) {
  1576. quirk->data.flags |= NV_BAR5_ADDRESS;
  1577. quirk->data.address_val = data & 0xfff;
  1578. } else if ((data & ~0xff) == 0x1800) {
  1579. quirk->data.flags |= NV_BAR5_ADDRESS;
  1580. quirk->data.address_val = data & 0xff;
  1581. } else {
  1582. quirk->data.flags &= ~NV_BAR5_ADDRESS;
  1583. }
  1584. }
  1585. break;
  1586. }
  1587. vfio_generic_window_quirk_write(opaque, addr, data, size);
  1588. }
  1589. static const MemoryRegionOps vfio_nvidia_bar5_window_quirk = {
  1590. .read = vfio_generic_window_quirk_read,
  1591. .write = vfio_nvidia_bar5_window_quirk_write,
  1592. .valid.min_access_size = 4,
  1593. .endianness = DEVICE_LITTLE_ENDIAN,
  1594. };
  1595. static void vfio_probe_nvidia_bar5_window_quirk(VFIODevice *vdev, int nr)
  1596. {
  1597. PCIDevice *pdev = &vdev->pdev;
  1598. VFIOQuirk *quirk;
  1599. if (!vdev->has_vga || nr != 5 ||
  1600. pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
  1601. return;
  1602. }
  1603. quirk = g_malloc0(sizeof(*quirk));
  1604. quirk->vdev = vdev;
  1605. quirk->data.read_flags = quirk->data.write_flags = NV_BAR5_VALID;
  1606. quirk->data.address_offset = 0x8;
  1607. quirk->data.address_size = 0; /* actually 4, but avoids generic code */
  1608. quirk->data.data_offset = 0xc;
  1609. quirk->data.data_size = 4;
  1610. quirk->data.bar = nr;
  1611. memory_region_init_io(&quirk->mem, OBJECT(vdev),
  1612. &vfio_nvidia_bar5_window_quirk, quirk,
  1613. "vfio-nvidia-bar5-window-quirk", 16);
  1614. memory_region_add_subregion_overlap(&vdev->bars[nr].mem, 0, &quirk->mem, 1);
  1615. QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
  1616. DPRINTF("Enabled NVIDIA BAR5 window quirk for device %04x:%02x:%02x.%x\n",
  1617. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1618. vdev->host.function);
  1619. }
  1620. static void vfio_nvidia_88000_quirk_write(void *opaque, hwaddr addr,
  1621. uint64_t data, unsigned size)
  1622. {
  1623. VFIOQuirk *quirk = opaque;
  1624. VFIODevice *vdev = quirk->vdev;
  1625. PCIDevice *pdev = &vdev->pdev;
  1626. hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
  1627. vfio_generic_quirk_write(opaque, addr, data, size);
  1628. /*
  1629. * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
  1630. * MSI capability ID register. Both the ID and next register are
  1631. * read-only, so we allow writes covering either of those to real hw.
  1632. * NB - only fixed for the 0x88000 MMIO window.
  1633. */
  1634. if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
  1635. vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
  1636. vfio_bar_write(&vdev->bars[quirk->data.bar], addr + base, data, size);
  1637. }
  1638. }
  1639. static const MemoryRegionOps vfio_nvidia_88000_quirk = {
  1640. .read = vfio_generic_quirk_read,
  1641. .write = vfio_nvidia_88000_quirk_write,
  1642. .endianness = DEVICE_LITTLE_ENDIAN,
  1643. };
  1644. /*
  1645. * Finally, BAR0 itself. We want to redirect any accesses to either
  1646. * 0x1800 or 0x88000 through the PCI config space access functions.
  1647. *
1648. * NB - the quirk must be at page granularity or else it doesn't seem to
1649. * work when BARs are mmap'd.
  1650. *
  1651. * Here's offset 0x88000...
  1652. */
  1653. static void vfio_probe_nvidia_bar0_88000_quirk(VFIODevice *vdev, int nr)
  1654. {
  1655. PCIDevice *pdev = &vdev->pdev;
  1656. VFIOQuirk *quirk;
  1657. if (!vdev->has_vga || nr != 0 ||
  1658. pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
  1659. return;
  1660. }
  1661. quirk = g_malloc0(sizeof(*quirk));
  1662. quirk->vdev = vdev;
  1663. quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
  1664. quirk->data.address_match = 0x88000;
  1665. quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
  1666. quirk->data.bar = nr;
  1667. memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_88000_quirk,
  1668. quirk, "vfio-nvidia-bar0-88000-quirk",
  1669. TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
  1670. memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
  1671. quirk->data.address_match & TARGET_PAGE_MASK,
  1672. &quirk->mem, 1);
  1673. QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
  1674. DPRINTF("Enabled NVIDIA BAR0 0x88000 quirk for device %04x:%02x:%02x.%x\n",
  1675. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1676. vdev->host.function);
  1677. }
  1678. /*
  1679. * And here's the same for BAR0 offset 0x1800...
  1680. */
  1681. static void vfio_probe_nvidia_bar0_1800_quirk(VFIODevice *vdev, int nr)
  1682. {
  1683. PCIDevice *pdev = &vdev->pdev;
  1684. VFIOQuirk *quirk;
  1685. if (!vdev->has_vga || nr != 0 ||
  1686. pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
  1687. return;
  1688. }
  1689. /* Log the chipset ID */
  1690. DPRINTF("Nvidia NV%02x\n",
  1691. (unsigned int)(vfio_bar_read(&vdev->bars[0], 0, 4) >> 20) & 0xff);
  1692. quirk = g_malloc0(sizeof(*quirk));
  1693. quirk->vdev = vdev;
  1694. quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
  1695. quirk->data.address_match = 0x1800;
  1696. quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
  1697. quirk->data.bar = nr;
  1698. memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
  1699. "vfio-nvidia-bar0-1800-quirk",
  1700. TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
  1701. memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
  1702. quirk->data.address_match & TARGET_PAGE_MASK,
  1703. &quirk->mem, 1);
  1704. QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
  1705. DPRINTF("Enabled NVIDIA BAR0 0x1800 quirk for device %04x:%02x:%02x.%x\n",
  1706. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1707. vdev->host.function);
  1708. }
  1709. /*
  1710. * TODO - Some Nvidia devices provide config access to their companion HDA
  1711. * device and even to their parent bridge via these config space mirrors.
  1712. * Add quirks for those regions.
  1713. */
  1714. /*
  1715. * Common quirk probe entry points.
  1716. */
  1717. static void vfio_vga_quirk_setup(VFIODevice *vdev)
  1718. {
  1719. vfio_vga_probe_ati_3c3_quirk(vdev);
  1720. vfio_vga_probe_nvidia_3d0_quirk(vdev);
  1721. }
  1722. static void vfio_vga_quirk_teardown(VFIODevice *vdev)
  1723. {
  1724. int i;
  1725. for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
  1726. while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) {
  1727. VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks);
  1728. memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem);
  1729. memory_region_destroy(&quirk->mem);
  1730. QLIST_REMOVE(quirk, next);
  1731. g_free(quirk);
  1732. }
  1733. }
  1734. }
  1735. static void vfio_bar_quirk_setup(VFIODevice *vdev, int nr)
  1736. {
  1737. vfio_probe_ati_bar4_window_quirk(vdev, nr);
  1738. vfio_probe_ati_bar2_4000_quirk(vdev, nr);
  1739. vfio_probe_nvidia_bar5_window_quirk(vdev, nr);
  1740. vfio_probe_nvidia_bar0_88000_quirk(vdev, nr);
  1741. vfio_probe_nvidia_bar0_1800_quirk(vdev, nr);
  1742. }
  1743. static void vfio_bar_quirk_teardown(VFIODevice *vdev, int nr)
  1744. {
  1745. VFIOBAR *bar = &vdev->bars[nr];
  1746. while (!QLIST_EMPTY(&bar->quirks)) {
  1747. VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
  1748. memory_region_del_subregion(&bar->mem, &quirk->mem);
  1749. memory_region_destroy(&quirk->mem);
  1750. QLIST_REMOVE(quirk, next);
  1751. g_free(quirk);
  1752. }
  1753. }
  1754. /*
  1755. * PCI config space
  1756. */
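/*
 * Config space reads merge two sources: bytes flagged in
 * emulated_config_bits come from QEMU's emulated config space, everything
 * else is read from the physical device through the VFIO fd.
 */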
  1757. static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
  1758. {
  1759. VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
  1760. uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
  1761. memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
  1762. emu_bits = le32_to_cpu(emu_bits);
  1763. if (emu_bits) {
  1764. emu_val = pci_default_read_config(pdev, addr, len);
  1765. }
  1766. if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
  1767. ssize_t ret;
  1768. ret = pread(vdev->fd, &phys_val, len, vdev->config_offset + addr);
  1769. if (ret != len) {
  1770. error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
  1771. __func__, vdev->host.domain, vdev->host.bus,
  1772. vdev->host.slot, vdev->host.function, addr, len);
  1773. return -errno;
  1774. }
  1775. phys_val = le32_to_cpu(phys_val);
  1776. }
  1777. val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
  1778. DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
  1779. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1780. vdev->host.function, addr, len, val);
  1781. return val;
  1782. }
  1783. static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
  1784. uint32_t val, int len)
  1785. {
  1786. VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
  1787. uint32_t val_le = cpu_to_le32(val);
  1788. DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
  1789. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  1790. vdev->host.function, addr, val, len);
  1791. /* Write everything to VFIO, let it filter out what we can't write */
  1792. if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
  1793. error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
  1794. __func__, vdev->host.domain, vdev->host.bus,
  1795. vdev->host.slot, vdev->host.function, addr, val, len);
  1796. }
  1797. /* MSI/MSI-X Enabling/Disabling */
  1798. if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
  1799. ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
  1800. int is_enabled, was_enabled = msi_enabled(pdev);
  1801. pci_default_write_config(pdev, addr, val, len);
  1802. is_enabled = msi_enabled(pdev);
  1803. if (!was_enabled) {
  1804. if (is_enabled) {
  1805. vfio_enable_msi(vdev);
  1806. }
  1807. } else {
  1808. if (!is_enabled) {
  1809. vfio_disable_msi(vdev);
  1810. } else {
  1811. vfio_update_msi(vdev);
  1812. }
  1813. }
  1814. } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
  1815. ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
  1816. int is_enabled, was_enabled = msix_enabled(pdev);
  1817. pci_default_write_config(pdev, addr, val, len);
  1818. is_enabled = msix_enabled(pdev);
  1819. if (!was_enabled && is_enabled) {
  1820. vfio_enable_msix(vdev);
  1821. } else if (was_enabled && !is_enabled) {
  1822. vfio_disable_msix(vdev);
  1823. }
  1824. } else {
  1825. /* Write everything to QEMU to keep emulated bits correct */
  1826. pci_default_write_config(pdev, addr, val, len);
  1827. }
  1828. }
  1829. /*
  1830. * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
  1831. */
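/*
 * Thin wrappers around the VFIO_IOMMU_UNMAP_DMA and VFIO_IOMMU_MAP_DMA
 * ioctls on the container fd; vfio_dma_map() only sets the WRITE flag
 * when the mapping is not read-only.
 */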
  1832. static int vfio_dma_unmap(VFIOContainer *container,
  1833. hwaddr iova, ram_addr_t size)
  1834. {
  1835. struct vfio_iommu_type1_dma_unmap unmap = {
  1836. .argsz = sizeof(unmap),
  1837. .flags = 0,
  1838. .iova = iova,
  1839. .size = size,
  1840. };
  1841. if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
  1842. DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
  1843. return -errno;
  1844. }
  1845. return 0;
  1846. }
  1847. static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
  1848. ram_addr_t size, void *vaddr, bool readonly)
  1849. {
  1850. struct vfio_iommu_type1_dma_map map = {
  1851. .argsz = sizeof(map),
  1852. .flags = VFIO_DMA_MAP_FLAG_READ,
  1853. .vaddr = (__u64)(uintptr_t)vaddr,
  1854. .iova = iova,
  1855. .size = size,
  1856. };
  1857. if (!readonly) {
  1858. map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
  1859. }
  1860. /*
  1861. * Try the mapping, if it fails with EBUSY, unmap the region and try
  1862. * again. This shouldn't be necessary, but we sometimes see it in
1863. * the VGA ROM space.
  1864. */
  1865. if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
  1866. (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
  1867. ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
  1868. return 0;
  1869. }
  1870. DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
  1871. return -errno;
  1872. }
  1873. static bool vfio_listener_skipped_section(MemoryRegionSection *section)
  1874. {
  1875. return !memory_region_is_ram(section->mr) ||
  1876. /*
  1877. * Sizing an enabled 64-bit BAR can cause spurious mappings to
  1878. * addresses in the upper part of the 64-bit address space. These
  1879. * are never accessed by the CPU and beyond the address width of
  1880. * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
  1881. */
  1882. section->offset_within_address_space & (1ULL << 63);
  1883. }
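/*
 * MemoryListener callbacks: as guest RAM regions are added to or removed
 * from the address space, map or unmap the page-aligned portion of each
 * section through the container's IOMMU so the device can DMA to it.
 */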
  1884. static void vfio_listener_region_add(MemoryListener *listener,
  1885. MemoryRegionSection *section)
  1886. {
  1887. VFIOContainer *container = container_of(listener, VFIOContainer,
  1888. iommu_data.type1.listener);
  1889. hwaddr iova, end;
  1890. void *vaddr;
  1891. int ret;
  1892. assert(!memory_region_is_iommu(section->mr));
  1893. if (vfio_listener_skipped_section(section)) {
  1894. DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
  1895. section->offset_within_address_space,
  1896. section->offset_within_address_space +
  1897. int128_get64(int128_sub(section->size, int128_one())));
  1898. return;
  1899. }
  1900. if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
  1901. (section->offset_within_region & ~TARGET_PAGE_MASK))) {
  1902. error_report("%s received unaligned region", __func__);
  1903. return;
  1904. }
  1905. iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
  1906. end = (section->offset_within_address_space + int128_get64(section->size)) &
  1907. TARGET_PAGE_MASK;
  1908. if (iova >= end) {
  1909. return;
  1910. }
  1911. vaddr = memory_region_get_ram_ptr(section->mr) +
  1912. section->offset_within_region +
  1913. (iova - section->offset_within_address_space);
  1914. DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
  1915. iova, end - 1, vaddr);
  1916. memory_region_ref(section->mr);
  1917. ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
  1918. if (ret) {
  1919. error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
  1920. "0x%"HWADDR_PRIx", %p) = %d (%m)",
  1921. container, iova, end - iova, vaddr, ret);
  1922. /*
  1923. * On the initfn path, store the first error in the container so we
1924. * can gracefully fail. At runtime, there's not much we can do other
  1925. * than throw a hardware error.
  1926. */
  1927. if (!container->iommu_data.type1.initialized) {
  1928. if (!container->iommu_data.type1.error) {
  1929. container->iommu_data.type1.error = ret;
  1930. }
  1931. } else {
  1932. hw_error("vfio: DMA mapping failed, unable to continue");
  1933. }
  1934. }
  1935. }
  1936. static void vfio_listener_region_del(MemoryListener *listener,
  1937. MemoryRegionSection *section)
  1938. {
  1939. VFIOContainer *container = container_of(listener, VFIOContainer,
  1940. iommu_data.type1.listener);
  1941. hwaddr iova, end;
  1942. int ret;
  1943. if (vfio_listener_skipped_section(section)) {
  1944. DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
  1945. section->offset_within_address_space,
  1946. section->offset_within_address_space +
  1947. int128_get64(int128_sub(section->size, int128_one())));
  1948. return;
  1949. }
  1950. if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
  1951. (section->offset_within_region & ~TARGET_PAGE_MASK))) {
  1952. error_report("%s received unaligned region", __func__);
  1953. return;
  1954. }
  1955. iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
  1956. end = (section->offset_within_address_space + int128_get64(section->size)) &
  1957. TARGET_PAGE_MASK;
  1958. if (iova >= end) {
  1959. return;
  1960. }
  1961. DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
  1962. iova, end - 1);
  1963. ret = vfio_dma_unmap(container, iova, end - iova);
  1964. memory_region_unref(section->mr);
  1965. if (ret) {
  1966. error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
  1967. "0x%"HWADDR_PRIx") = %d (%m)",
  1968. container, iova, end - iova, ret);
  1969. }
  1970. }
  1971. static MemoryListener vfio_memory_listener = {
  1972. .region_add = vfio_listener_region_add,
  1973. .region_del = vfio_listener_region_del,
  1974. };
  1975. static void vfio_listener_release(VFIOContainer *container)
  1976. {
  1977. memory_listener_unregister(&container->iommu_data.type1.listener);
  1978. }
  1979. /*
  1980. * Interrupt setup
  1981. */
  1982. static void vfio_disable_interrupts(VFIODevice *vdev)
  1983. {
  1984. switch (vdev->interrupt) {
  1985. case VFIO_INT_INTx:
  1986. vfio_disable_intx(vdev);
  1987. break;
  1988. case VFIO_INT_MSI:
  1989. vfio_disable_msi(vdev);
  1990. break;
  1991. case VFIO_INT_MSIX:
  1992. vfio_disable_msix(vdev);
  1993. break;
  1994. }
  1995. }
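/*
 * Read the MSI capability from the device's config space to determine
 * 64-bit address and per-vector masking support plus the number of
 * vectors, then register the capability with QEMU via msi_init().
 */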
  1996. static int vfio_setup_msi(VFIODevice *vdev, int pos)
  1997. {
  1998. uint16_t ctrl;
  1999. bool msi_64bit, msi_maskbit;
  2000. int ret, entries;
  2001. if (pread(vdev->fd, &ctrl, sizeof(ctrl),
  2002. vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
  2003. return -errno;
  2004. }
  2005. ctrl = le16_to_cpu(ctrl);
  2006. msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
  2007. msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
  2008. entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
  2009. DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
  2010. vdev->host.bus, vdev->host.slot, vdev->host.function, pos);
  2011. ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
  2012. if (ret < 0) {
  2013. if (ret == -ENOTSUP) {
  2014. return 0;
  2015. }
  2016. error_report("vfio: msi_init failed");
  2017. return ret;
  2018. }
  2019. vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
  2020. return 0;
  2021. }
  2022. /*
  2023. * We don't have any control over how pci_add_capability() inserts
2024. * capabilities into the chain. In order to set up MSI-X we need a
2025. * MemoryRegion for the BAR. In order to set up the BAR and not
  2026. * attempt to mmap the MSI-X table area, which VFIO won't allow, we
  2027. * need to first look for where the MSI-X table lives. So we
  2028. * unfortunately split MSI-X setup across two functions.
  2029. */
  2030. static int vfio_early_setup_msix(VFIODevice *vdev)
  2031. {
  2032. uint8_t pos;
  2033. uint16_t ctrl;
  2034. uint32_t table, pba;
  2035. pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
  2036. if (!pos) {
  2037. return 0;
  2038. }
  2039. if (pread(vdev->fd, &ctrl, sizeof(ctrl),
  2040. vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
  2041. return -errno;
  2042. }
  2043. if (pread(vdev->fd, &table, sizeof(table),
  2044. vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
  2045. return -errno;
  2046. }
  2047. if (pread(vdev->fd, &pba, sizeof(pba),
  2048. vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
  2049. return -errno;
  2050. }
  2051. ctrl = le16_to_cpu(ctrl);
  2052. table = le32_to_cpu(table);
  2053. pba = le32_to_cpu(pba);
  2054. vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
  2055. vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
  2056. vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
  2057. vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
  2058. vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
  2059. vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
  2060. DPRINTF("%04x:%02x:%02x.%x "
  2061. "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
  2062. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  2063. vdev->host.function, pos, vdev->msix->table_bar,
  2064. vdev->msix->table_offset, vdev->msix->entries);
  2065. return 0;
  2066. }
  2067. static int vfio_setup_msix(VFIODevice *vdev, int pos)
  2068. {
  2069. int ret;
  2070. ret = msix_init(&vdev->pdev, vdev->msix->entries,
  2071. &vdev->bars[vdev->msix->table_bar].mem,
  2072. vdev->msix->table_bar, vdev->msix->table_offset,
  2073. &vdev->bars[vdev->msix->pba_bar].mem,
  2074. vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
  2075. if (ret < 0) {
  2076. if (ret == -ENOTSUP) {
  2077. return 0;
  2078. }
  2079. error_report("vfio: msix_init failed");
  2080. return ret;
  2081. }
  2082. return 0;
  2083. }
  2084. static void vfio_teardown_msi(VFIODevice *vdev)
  2085. {
  2086. msi_uninit(&vdev->pdev);
  2087. if (vdev->msix) {
  2088. msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
  2089. &vdev->bars[vdev->msix->pba_bar].mem);
  2090. }
  2091. }
  2092. /*
  2093. * Resource setup
  2094. */
  2095. static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled)
  2096. {
  2097. int i;
  2098. for (i = 0; i < PCI_ROM_SLOT; i++) {
  2099. VFIOBAR *bar = &vdev->bars[i];
  2100. if (!bar->size) {
  2101. continue;
  2102. }
  2103. memory_region_set_enabled(&bar->mmap_mem, enabled);
  2104. if (vdev->msix && vdev->msix->table_bar == i) {
  2105. memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
  2106. }
  2107. }
  2108. }
  2109. static void vfio_unmap_bar(VFIODevice *vdev, int nr)
  2110. {
  2111. VFIOBAR *bar = &vdev->bars[nr];
  2112. if (!bar->size) {
  2113. return;
  2114. }
  2115. vfio_bar_quirk_teardown(vdev, nr);
  2116. memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
  2117. munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
  2118. memory_region_destroy(&bar->mmap_mem);
  2119. if (vdev->msix && vdev->msix->table_bar == nr) {
  2120. memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
  2121. munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
  2122. memory_region_destroy(&vdev->msix->mmap_mem);
  2123. }
  2124. memory_region_destroy(&bar->mem);
  2125. }
  2126. static int vfio_mmap_bar(VFIODevice *vdev, VFIOBAR *bar,
  2127. MemoryRegion *mem, MemoryRegion *submem,
  2128. void **map, size_t size, off_t offset,
  2129. const char *name)
  2130. {
  2131. int ret = 0;
  2132. if (VFIO_ALLOW_MMAP && size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
  2133. int prot = 0;
  2134. if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
  2135. prot |= PROT_READ;
  2136. }
  2137. if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
  2138. prot |= PROT_WRITE;
  2139. }
  2140. *map = mmap(NULL, size, prot, MAP_SHARED,
  2141. bar->fd, bar->fd_offset + offset);
  2142. if (*map == MAP_FAILED) {
  2143. *map = NULL;
  2144. ret = -errno;
  2145. goto empty_region;
  2146. }
  2147. memory_region_init_ram_ptr(submem, OBJECT(vdev), name, size, *map);
  2148. } else {
  2149. empty_region:
2150. /* Create a zero-sized sub-region to make cleanup easy. */
  2151. memory_region_init(submem, OBJECT(vdev), name, 0);
  2152. }
  2153. memory_region_add_subregion(mem, offset, submem);
  2154. return ret;
  2155. }
  2156. static void vfio_map_bar(VFIODevice *vdev, int nr)
  2157. {
  2158. VFIOBAR *bar = &vdev->bars[nr];
  2159. unsigned size = bar->size;
  2160. char name[64];
  2161. uint32_t pci_bar;
  2162. uint8_t type;
  2163. int ret;
  2164. /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
  2165. if (!size) {
  2166. return;
  2167. }
  2168. snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
  2169. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  2170. vdev->host.function, nr);
  2171. /* Determine what type of BAR this is for registration */
  2172. ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
  2173. vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
  2174. if (ret != sizeof(pci_bar)) {
  2175. error_report("vfio: Failed to read BAR %d (%m)", nr);
  2176. return;
  2177. }
  2178. pci_bar = le32_to_cpu(pci_bar);
  2179. bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
  2180. bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
  2181. type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
  2182. ~PCI_BASE_ADDRESS_MEM_MASK);
  2183. /* A "slow" read/write mapping underlies all BARs */
  2184. memory_region_init_io(&bar->mem, OBJECT(vdev), &vfio_bar_ops,
  2185. bar, name, size);
  2186. pci_register_bar(&vdev->pdev, nr, type, &bar->mem);
  2187. /*
  2188. * We can't mmap areas overlapping the MSIX vector table, so we
  2189. * potentially insert a direct-mapped subregion before and after it.
  2190. */
  2191. if (vdev->msix && vdev->msix->table_bar == nr) {
  2192. size = vdev->msix->table_offset & qemu_host_page_mask;
  2193. }
  2194. strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
  2195. if (vfio_mmap_bar(vdev, bar, &bar->mem,
  2196. &bar->mmap_mem, &bar->mmap, size, 0, name)) {
  2197. error_report("%s unsupported. Performance may be slow", name);
  2198. }
  2199. if (vdev->msix && vdev->msix->table_bar == nr) {
  2200. unsigned start;
  2201. start = HOST_PAGE_ALIGN(vdev->msix->table_offset +
  2202. (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
  2203. size = start < bar->size ? bar->size - start : 0;
  2204. strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
  2205. /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
  2206. if (vfio_mmap_bar(vdev, bar, &bar->mem, &vdev->msix->mmap_mem,
  2207. &vdev->msix->mmap, size, start, name)) {
  2208. error_report("%s unsupported. Performance may be slow", name);
  2209. }
  2210. }
  2211. vfio_bar_quirk_setup(vdev, nr);
  2212. }
  2213. static void vfio_map_bars(VFIODevice *vdev)
  2214. {
  2215. int i;
  2216. for (i = 0; i < PCI_ROM_SLOT; i++) {
  2217. vfio_map_bar(vdev, i);
  2218. }
  2219. if (vdev->has_vga) {
  2220. memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
  2221. OBJECT(vdev), &vfio_vga_ops,
  2222. &vdev->vga.region[QEMU_PCI_VGA_MEM],
  2223. "vfio-vga-mmio@0xa0000",
  2224. QEMU_PCI_VGA_MEM_SIZE);
  2225. memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
  2226. OBJECT(vdev), &vfio_vga_ops,
  2227. &vdev->vga.region[QEMU_PCI_VGA_IO_LO],
  2228. "vfio-vga-io@0x3b0",
  2229. QEMU_PCI_VGA_IO_LO_SIZE);
  2230. memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
  2231. OBJECT(vdev), &vfio_vga_ops,
  2232. &vdev->vga.region[QEMU_PCI_VGA_IO_HI],
  2233. "vfio-vga-io@0x3c0",
  2234. QEMU_PCI_VGA_IO_HI_SIZE);
  2235. pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
  2236. &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
  2237. &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
  2238. vfio_vga_quirk_setup(vdev);
  2239. }
  2240. }
  2241. static void vfio_unmap_bars(VFIODevice *vdev)
  2242. {
  2243. int i;
  2244. for (i = 0; i < PCI_ROM_SLOT; i++) {
  2245. vfio_unmap_bar(vdev, i);
  2246. }
  2247. if (vdev->has_vga) {
  2248. vfio_vga_quirk_teardown(vdev);
  2249. pci_unregister_vga(&vdev->pdev);
  2250. memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem);
  2251. memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem);
  2252. memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
  2253. }
  2254. }
  2255. /*
  2256. * General setup
  2257. */
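/*
 * Walk the standard capability list to find the capability that follows
 * @pos and return the distance to it (or to the end of standard config
 * space), which we use as the size of the capability at @pos.
 */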
  2258. static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
  2259. {
  2260. uint8_t tmp, next = 0xff;
  2261. for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
  2262. tmp = pdev->config[tmp + 1]) {
  2263. if (tmp > pos && tmp < next) {
  2264. next = tmp;
  2265. }
  2266. }
  2267. return next - pos;
  2268. }
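/*
 * Helpers to emulate selected config space bits: update the value in
 * QEMU's copy of config space, adjust the write mask, and flag the bits
 * in emulated_config_bits so reads come from the emulated copy rather
 * than the device.
 */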
  2269. static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
  2270. {
  2271. pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
  2272. }
  2273. static void vfio_add_emulated_word(VFIODevice *vdev, int pos,
  2274. uint16_t val, uint16_t mask)
  2275. {
  2276. vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
  2277. vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
  2278. vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
  2279. }
  2280. static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
  2281. {
  2282. pci_set_long(buf, (pci_get_long(buf) & ~mask) | val);
  2283. }
  2284. static void vfio_add_emulated_long(VFIODevice *vdev, int pos,
  2285. uint32_t val, uint32_t mask)
  2286. {
  2287. vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
  2288. vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
  2289. vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
  2290. }
  2291. static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
  2292. {
  2293. uint16_t flags;
  2294. uint8_t type;
  2295. flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
  2296. type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
  2297. if (type != PCI_EXP_TYPE_ENDPOINT &&
  2298. type != PCI_EXP_TYPE_LEG_END &&
  2299. type != PCI_EXP_TYPE_RC_END) {
  2300. error_report("vfio: Assignment of PCIe type 0x%x "
  2301. "devices is not currently supported", type);
  2302. return -EINVAL;
  2303. }
  2304. if (!pci_bus_is_express(vdev->pdev.bus)) {
  2305. /*
  2306. * Use express capability as-is on PCI bus. It doesn't make much
2307. * sense to even expose, but some drivers (e.g. tg3) depend on it
2308. * and guests don't seem to be particular about it. We'll need
2309. * to revisit this or force express devices to express buses if we
  2310. * ever expose an IOMMU to the guest.
  2311. */
  2312. } else if (pci_bus_is_root(vdev->pdev.bus)) {
  2313. /*
  2314. * On a Root Complex bus Endpoints become Root Complex Integrated
  2315. * Endpoints, which changes the type and clears the LNK & LNK2 fields.
  2316. */
  2317. if (type == PCI_EXP_TYPE_ENDPOINT) {
  2318. vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
  2319. PCI_EXP_TYPE_RC_END << 4,
  2320. PCI_EXP_FLAGS_TYPE);
2321. /* Link Capabilities, Status, and Control go away */
  2322. if (size > PCI_EXP_LNKCTL) {
  2323. vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
  2324. vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
  2325. vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
  2326. #ifndef PCI_EXP_LNKCAP2
  2327. #define PCI_EXP_LNKCAP2 44
  2328. #endif
  2329. #ifndef PCI_EXP_LNKSTA2
  2330. #define PCI_EXP_LNKSTA2 50
  2331. #endif
2332. /* Link 2 Capabilities, Status, and Control go away */
  2333. if (size > PCI_EXP_LNKCAP2) {
  2334. vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
  2335. vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
  2336. vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
  2337. }
  2338. }
  2339. } else if (type == PCI_EXP_TYPE_LEG_END) {
  2340. /*
  2341. * Legacy endpoints don't belong on the root complex. Windows
  2342. * seems to be happier with devices if we skip the capability.
  2343. */
  2344. return 0;
  2345. }
  2346. } else {
  2347. /*
  2348. * Convert Root Complex Integrated Endpoints to regular endpoints.
  2349. * These devices don't support LNK/LNK2 capabilities, so make them up.
  2350. */
  2351. if (type == PCI_EXP_TYPE_RC_END) {
  2352. vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
  2353. PCI_EXP_TYPE_ENDPOINT << 4,
  2354. PCI_EXP_FLAGS_TYPE);
  2355. vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
  2356. PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0);
  2357. vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
  2358. }
  2359. /* Mark the Link Status bits as emulated to allow virtual negotiation */
  2360. vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA,
  2361. pci_get_word(vdev->pdev.config + pos +
  2362. PCI_EXP_LNKSTA),
  2363. PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
  2364. }
  2365. pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size);
  2366. if (pos >= 0) {
  2367. vdev->pdev.exp.exp_cap = pos;
  2368. }
  2369. return pos;
  2370. }
  2371. static void vfio_check_pcie_flr(VFIODevice *vdev, uint8_t pos)
  2372. {
  2373. uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
  2374. if (cap & PCI_EXP_DEVCAP_FLR) {
  2375. DPRINTF("%04x:%02x:%02x.%x Supports FLR via PCIe cap\n",
  2376. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  2377. vdev->host.function);
  2378. vdev->has_flr = true;
  2379. }
  2380. }
  2381. static void vfio_check_pm_reset(VFIODevice *vdev, uint8_t pos)
  2382. {
  2383. uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
  2384. if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
  2385. DPRINTF("%04x:%02x:%02x.%x Supports PM reset\n",
  2386. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  2387. vdev->host.function);
  2388. vdev->has_pm_reset = true;
  2389. }
  2390. }
  2391. static void vfio_check_af_flr(VFIODevice *vdev, uint8_t pos)
  2392. {
  2393. uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
  2394. if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
  2395. DPRINTF("%04x:%02x:%02x.%x Supports FLR via AF cap\n",
  2396. vdev->host.domain, vdev->host.bus, vdev->host.slot,
  2397. vdev->host.function);
  2398. vdev->has_flr = true;
  2399. }
  2400. }
  2401. static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
  2402. {
  2403. PCIDevice *pdev = &vdev->pdev;
  2404. uint8_t cap_id, next, size;
  2405. int ret;
  2406. cap_id = pdev->config[pos];
  2407. next = pdev->config[pos + 1];
  2408. /*
  2409. * If it becomes important to configure capabilities to their actual
  2410. * size, use this as the default when it's something we don't recognize.
  2411. * Since QEMU doesn't actually handle many of the config accesses,
  2412. * exact size doesn't seem worthwhile.
  2413. */
  2414. size = vfio_std_cap_max_size(pdev, pos);
  2415. /*
  2416. * pci_add_capability always inserts the new capability at the head
  2417. * of the chain. Therefore to end up with a chain that matches the
  2418. * physical device, we insert from the end by making this recursive.
2419. * This is also why we pre-calculate size above, as cached config space
  2420. * will be changed as we unwind the stack.
  2421. */
  2422. if (next) {
  2423. ret = vfio_add_std_cap(vdev, next);
  2424. if (ret) {
  2425. return ret;
  2426. }
  2427. } else {
  2428. /* Begin the rebuild, use QEMU emulated list bits */
  2429. pdev->config[PCI_CAPABILITY_LIST] = 0;
  2430. vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
  2431. vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
  2432. }
  2433. /* Use emulated next pointer to allow dropping caps */
  2434. pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff);
  2435. switch (cap_id) {
  2436. case PCI_CAP_ID_MSI:
  2437. ret = vfio_setup_msi(vdev, pos);
  2438. break;
  2439. case PCI_CAP_ID_EXP:
  2440. vfio_check_pcie_flr(vdev, pos);
  2441. ret = vfio_setup_pcie_cap(vdev, pos, size);
  2442. break;
  2443. case PCI_CAP_ID_MSIX:
  2444. ret = vfio_setup_msix(vdev, pos);
  2445. break;
  2446. case PCI_CAP_ID_PM:
  2447. vfio_check_pm_reset(vdev, pos);
  2448. vdev->pm_cap = pos;
  2449. ret = pci_add_capability(pdev, cap_id, pos, size);
  2450. break;
  2451. case PCI_CAP_ID_AF:
  2452. vfio_check_af_flr(vdev, pos);
  2453. ret = pci_add_capability(pdev, cap_id, pos, size);
  2454. break;
  2455. default:
  2456. ret = pci_add_capability(pdev, cap_id, pos, size);
  2457. break;
  2458. }
  2459. if (ret < 0) {
  2460. error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
  2461. "0x%x[0x%x]@0x%x: %d", vdev->host.domain,
  2462. vdev->host.bus, vdev->host.slot, vdev->host.function,
  2463. cap_id, size, pos, ret);
  2464. return ret;
  2465. }
  2466. return 0;
  2467. }
  2468. static int vfio_add_capabilities(VFIODevice *vdev)
  2469. {
  2470. PCIDevice *pdev = &vdev->pdev;
  2471. if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
  2472. !pdev->config[PCI_CAPABILITY_LIST]) {
  2473. return 0; /* Nothing to add */
  2474. }
  2475. return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
  2476. }
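/*
 * Quiesce the device before a reset: disable interrupts, force it back
 * to the D0 power state if necessary, and clear I/O, memory, and bus
 * master so no DMA is in flight. vfio_pci_post_reset() re-enables INTx
 * afterwards.
 */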
  2477. static void vfio_pci_pre_reset(VFIODevice *vdev)
  2478. {
  2479. PCIDevice *pdev = &vdev->pdev;
  2480. uint16_t cmd;
  2481. vfio_disable_interrupts(vdev);
  2482. /* Make sure the device is in D0 */
  2483. if (vdev->pm_cap) {
  2484. uint16_t pmcsr;
  2485. uint8_t state;
  2486. pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
  2487. state = pmcsr & PCI_PM_CTRL_STATE_MASK;
  2488. if (state) {
  2489. pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
  2490. vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
  2491. /* vfio handles the necessary delay here */
  2492. pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
  2493. state = pmcsr & PCI_PM_CTRL_STATE_MASK;
  2494. if (state) {
  2495. error_report("vfio: Unable to power on device, stuck in D%d",
  2496. state);
  2497. }
  2498. }
  2499. }
  2500. /*
2501. * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
  2502. * Also put INTx Disable in known state.
  2503. */
  2504. cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
  2505. cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
  2506. PCI_COMMAND_INTX_DISABLE);
  2507. vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
  2508. }
  2509. static void vfio_pci_post_reset(VFIODevice *vdev)
  2510. {
  2511. vfio_enable_intx(vdev);
  2512. }
  2513. static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
  2514. PCIHostDeviceAddress *host2)
  2515. {
  2516. return (host1->domain == host2->domain && host1->bus == host2->bus &&
  2517. host1->slot == host2->slot && host1->function == host2->function);
  2518. }
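
/*
 * Perform a VFIO hot (bus) reset of the device.  With 'single' set, fail if
 * any other in-use VFIODevice would be affected; otherwise pre-reset every
 * affected device we own, issue VFIO_DEVICE_PCI_HOT_RESET with the group fds
 * covering all dependent devices, and re-enable INTx on them afterwards.
 */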
static int vfio_pci_hot_reset(VFIODevice *vdev, bool single)
{
    VFIOGroup *group;
    struct vfio_pci_hot_reset_info *info;
    struct vfio_pci_dependent_device *devices;
    struct vfio_pci_hot_reset *reset;
    int32_t *fds;
    int ret, i, count;
    bool multi = false;

    DPRINTF("%s(%04x:%02x:%02x.%x) %s\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function,
            single ? "one" : "multi");

    vfio_pci_pre_reset(vdev);
    vdev->needs_reset = false;

    info = g_malloc0(sizeof(*info));
    info->argsz = sizeof(*info);

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
    if (ret && errno != ENOSPC) {
        ret = -errno;
        if (!vdev->has_pm_reset) {
            error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
                         "no available reset mechanism.", vdev->host.domain,
                         vdev->host.bus, vdev->host.slot, vdev->host.function);
        }
        goto out_single;
    }

    count = info->count;
    info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
    info->argsz = sizeof(*info) + (count * sizeof(*devices));
    devices = &info->devices[0];

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
    if (ret) {
        ret = -errno;
        error_report("vfio: hot reset info failed: %m");
        goto out_single;
    }

    DPRINTF("%04x:%02x:%02x.%x: hot reset dependent devices:\n",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function);

    /* Verify that we have all the groups required */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIODevice *tmp;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        DPRINTF("\t%04x:%02x:%02x.%x group %d\n", host.domain,
                host.bus, host.slot, host.function, devices[i].group_id);

        if (vfio_pci_host_match(&host, &vdev->host)) {
            continue;
        }

        QLIST_FOREACH(group, &group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            if (!vdev->has_pm_reset) {
                error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
                             "depends on group %d which is not owned.",
                             vdev->host.domain, vdev->host.bus, vdev->host.slot,
                             vdev->host.function, devices[i].group_id);
            }
            ret = -EPERM;
            goto out;
        }

        /* Prep dependent devices for reset and clear our marker. */
        QLIST_FOREACH(tmp, &group->device_list, next) {
            if (vfio_pci_host_match(&host, &tmp->host)) {
                if (single) {
                    DPRINTF("vfio: found another in-use device "
                            "%04x:%02x:%02x.%x\n", host.domain, host.bus,
                            host.slot, host.function);
                    ret = -EINVAL;
                    goto out_single;
                }
                vfio_pci_pre_reset(tmp);
                tmp->needs_reset = false;
                multi = true;
                break;
            }
        }
    }

    if (!single && !multi) {
        DPRINTF("vfio: No other in-use devices for multi hot reset\n");
        ret = -EINVAL;
        goto out_single;
    }

    /* Determine how many group fds need to be passed */
    count = 0;
    QLIST_FOREACH(group, &group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                count++;
                break;
            }
        }
    }

    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
    fds = &reset->group_fds[0];

    /* Fill in group fds */
    QLIST_FOREACH(group, &group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                fds[reset->count++] = group->fd;
                break;
            }
        }
    }

    /* Bus reset! */
    ret = ioctl(vdev->fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
    g_free(reset);

    DPRINTF("%04x:%02x:%02x.%x hot reset: %s\n", vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function,
            ret ? "%m" : "Success");

out:
    /* Re-enable INTx on affected devices */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIODevice *tmp;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        if (vfio_pci_host_match(&host, &vdev->host)) {
            continue;
        }

        QLIST_FOREACH(group, &group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            break;
        }

        QLIST_FOREACH(tmp, &group->device_list, next) {
            if (vfio_pci_host_match(&host, &tmp->host)) {
                vfio_pci_post_reset(tmp);
                break;
            }
        }
    }
out_single:
    vfio_pci_post_reset(vdev);
    g_free(info);

    return ret;
}

/*
 * We want to differentiate hot reset of multiple in-use devices vs hot reset
 * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
 * of doing hot resets when there is only a single device per bus.  The in-use
 * here refers to how many VFIODevices are affected.  A hot reset that affects
 * multiple devices, but only a single in-use device, means that we can call
 * it from our bus ->reset() callback since the extent is effectively a single
 * device.  This allows us to make use of it in the hotplug path.  When there
 * are multiple in-use devices, we can only trigger the hot reset during a
 * system reset and thus from our reset handler.  We separate _one vs _multi
 * here so that we don't overlap and do a double reset on the system reset
 * path where both our reset handler and ->reset() callback are used.  Calling
 * _one() will only do a hot reset for the single in-use device case, calling
 * _multi() will do nothing if a _one() would have been sufficient.
 */
static int vfio_pci_hot_reset_one(VFIODevice *vdev)
{
    return vfio_pci_hot_reset(vdev, true);
}

static int vfio_pci_hot_reset_multi(VFIODevice *vdev)
{
    return vfio_pci_hot_reset(vdev, false);
}
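
/*
 * System reset handler: first mark every device that cannot reset itself
 * cleanly (no working reset, or only a PM reset without FLR), then attempt
 * a multi-device hot reset for each device still flagged; the flag is
 * cleared for every device covered by an earlier hot reset.
 */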
static void vfio_pci_reset_handler(void *opaque)
{
    VFIOGroup *group;
    VFIODevice *vdev;

    QLIST_FOREACH(group, &group_list, next) {
        QLIST_FOREACH(vdev, &group->device_list, next) {
            if (!vdev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
                vdev->needs_reset = true;
            }
        }
    }

    QLIST_FOREACH(group, &group_list, next) {
        QLIST_FOREACH(vdev, &group->device_list, next) {
            if (vdev->needs_reset) {
                vfio_pci_hot_reset_multi(vdev);
            }
        }
    }
}
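
/*
 * Notify the in-kernel KVM VFIO pseudo device about this group.  The pseudo
 * device is created lazily on first use (KVM_CREATE_DEVICE with
 * KVM_DEV_TYPE_VFIO) and the group fd is then registered via the
 * KVM_DEV_VFIO_GROUP_ADD attribute.  Failures are reported but not fatal.
 */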
static void vfio_kvm_device_add_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_ADD,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (!kvm_enabled()) {
        return;
    }

    if (vfio_kvm_device_fd < 0) {
        struct kvm_create_device cd = {
            .type = KVM_DEV_TYPE_VFIO,
        };

        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
            DPRINTF("KVM_CREATE_DEVICE: %m\n");
            return;
        }

        vfio_kvm_device_fd = cd.fd;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to add group %d to KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}

static void vfio_kvm_device_del_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_DEL,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (vfio_kvm_device_fd < 0) {
        return;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to remove group %d from KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}
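
/*
 * Attach a group to a VFIO container.  An existing container is reused if
 * the kernel accepts VFIO_GROUP_SET_CONTAINER for it; otherwise a new
 * container is opened from /dev/vfio/vfio, the Type1 IOMMU model is
 * selected, and a memory listener is registered to map guest memory
 * through the IOMMU.
 */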
static int vfio_connect_container(VFIOGroup *group)
{
    VFIOContainer *container;
    int ret, fd;

    if (group->container) {
        return 0;
    }

    QLIST_FOREACH(container, &container_list, next) {
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            return 0;
        }
    }

    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
    if (fd < 0) {
        error_report("vfio: failed to open /dev/vfio/vfio: %m");
        return -errno;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_report("vfio: supported vfio version: %d, "
                     "reported version: %d", VFIO_API_VERSION, ret);
        close(fd);
        return -EINVAL;
    }

    container = g_malloc0(sizeof(*container));
    container->fd = fd;

    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            error_report("vfio: failed to set group container: %m");
            g_free(container);
            close(fd);
            return -errno;
        }

        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
        if (ret) {
            error_report("vfio: failed to set iommu for container: %m");
            g_free(container);
            close(fd);
            return -errno;
        }

        container->iommu_data.type1.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;

        memory_listener_register(&container->iommu_data.type1.listener,
                                 &address_space_memory);

        if (container->iommu_data.type1.error) {
            ret = container->iommu_data.type1.error;
            vfio_listener_release(container);
            g_free(container);
            close(fd);
            error_report("vfio: memory listener initialization failed for container");
            return ret;
        }

        container->iommu_data.type1.initialized = true;
    } else {
        error_report("vfio: No available IOMMU models");
        g_free(container);
        close(fd);
        return -EINVAL;
    }

    QLIST_INIT(&container->group_list);
    QLIST_INSERT_HEAD(&container_list, container, next);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    return 0;
}

static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    if (QLIST_EMPTY(&container->group_list)) {
        if (container->iommu_data.release) {
            container->iommu_data.release(container);
        }
        QLIST_REMOVE(container, next);
        DPRINTF("vfio_disconnect_container: close container->fd\n");
        close(container->fd);
        g_free(container);
    }
}
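
/*
 * Look up an already-open VFIOGroup by id, or open /dev/vfio/<groupid>,
 * verify that the group is viable (all devices in the iommu_group bound to
 * vfio drivers), and connect it to a container.  The first group also
 * registers the system reset handler; every group is announced to the KVM
 * VFIO device.
 */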
static VFIOGroup *vfio_get_group(int groupid)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &group_list, next) {
        if (group->groupid == groupid) {
            return group;
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open(path, O_RDWR);
    if (group->fd < 0) {
        error_report("vfio: error opening %s: %m", path);
        g_free(group);
        return NULL;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_report("vfio: error getting group status: %m");
        close(group->fd);
        g_free(group);
        return NULL;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_report("vfio: error, group %d is not viable, please ensure "
                     "all devices within the iommu_group are bound to their "
                     "vfio bus driver.", groupid);
        close(group->fd);
        g_free(group);
        return NULL;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group)) {
        error_report("vfio: failed to setup container for group %d", groupid);
        close(group->fd);
        g_free(group);
        return NULL;
    }

    if (QLIST_EMPTY(&group_list)) {
        qemu_register_reset(vfio_pci_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&group_list, group, next);

    vfio_kvm_device_add_group(group);

    return group;
}

static void vfio_put_group(VFIOGroup *group)
{
    if (!QLIST_EMPTY(&group->device_list)) {
        return;
    }

    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    DPRINTF("vfio_put_group: close group->fd\n");
    close(group->fd);
    g_free(group);

    if (QLIST_EMPTY(&group_list)) {
        qemu_unregister_reset(vfio_pci_reset_handler, NULL);
    }
}
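
/*
 * Open the device fd via VFIO_GROUP_GET_DEVICE_FD and populate the
 * VFIODevice: sanity check that this is a PCI device with the expected
 * regions and irqs, cache the BAR, config space and optional VGA region
 * layout, and probe for an error (AER) interrupt index.
 */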
static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
{
    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
    int ret, i;

    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (ret < 0) {
        error_report("vfio: error getting device %s from group %d: %m",
                     name, group->groupid);
        error_printf("Verify all devices in group %d are bound to vfio-pci "
                     "or pci-stub and not already in use\n", group->groupid);
        return ret;
    }

    vdev->fd = ret;
    vdev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vdev, next);

    /* Sanity check device */
    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
    if (ret) {
        error_report("vfio: error getting device info: %m");
        goto error;
    }

    DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name,
            dev_info.flags, dev_info.num_regions, dev_info.num_irqs);

    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
        error_report("vfio: Um, this isn't a PCI device");
        goto error;
    }

    vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);

    if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
        error_report("vfio: unexpected number of io regions %u",
                     dev_info.num_regions);
        goto error;
    }

    if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
        error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
        goto error;
    }

    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
        reg_info.index = i;

        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
        if (ret) {
            error_report("vfio: Error getting region %d info: %m", i);
            goto error;
        }

        DPRINTF("Device %s region %d:\n", name, i);
        DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
                (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
                (unsigned long)reg_info.flags);

        vdev->bars[i].flags = reg_info.flags;
        vdev->bars[i].size = reg_info.size;
        vdev->bars[i].fd_offset = reg_info.offset;
        vdev->bars[i].fd = vdev->fd;
        vdev->bars[i].nr = i;
        QLIST_INIT(&vdev->bars[i].quirks);
    }

    reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
    if (ret) {
        error_report("vfio: Error getting config info: %m");
        goto error;
    }

    DPRINTF("Device %s config:\n", name);
    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
            (unsigned long)reg_info.flags);

    vdev->config_size = reg_info.size;
    if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
        vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
    }
    vdev->config_offset = reg_info.offset;

    if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) &&
        dev_info.num_regions > VFIO_PCI_VGA_REGION_INDEX) {
        struct vfio_region_info vga_info = {
            .argsz = sizeof(vga_info),
            .index = VFIO_PCI_VGA_REGION_INDEX,
        };

        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info);
        if (ret) {
            error_report(
                "vfio: Device does not support requested feature x-vga");
            goto error;
        }

        if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) ||
            !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) ||
            vga_info.size < 0xbffff + 1) {
            error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
                         (unsigned long)vga_info.flags,
                         (unsigned long)vga_info.size);
            goto error;
        }

        vdev->vga.fd_offset = vga_info.offset;
        vdev->vga.fd = vdev->fd;

        vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
        vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
        QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks);

        vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
        vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
        QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks);

        vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
        vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
        QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks);

        vdev->has_vga = true;
    }
    irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
    if (ret) {
        /* This can fail for an old kernel or legacy PCI dev */
        DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure: %m\n");
        ret = 0;
    } else if (irq_info.count == 1) {
        vdev->pci_aer = true;
    } else {
        error_report("vfio: %04x:%02x:%02x.%x "
                     "Could not enable error recovery for the device",
                     vdev->host.domain, vdev->host.bus, vdev->host.slot,
                     vdev->host.function);
    }

error:
    if (ret) {
        QLIST_REMOVE(vdev, next);
        vdev->group = NULL;
        close(vdev->fd);
    }

    return ret;
}

static void vfio_put_device(VFIODevice *vdev)
{
    QLIST_REMOVE(vdev, next);
    vdev->group = NULL;
    DPRINTF("vfio_put_device: close vdev->fd\n");
    close(vdev->fd);
    if (vdev->msix) {
        g_free(vdev->msix);
        vdev->msix = NULL;
    }
}
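
/*
 * Invoked from the main loop when the error notifier eventfd fires, i.e.
 * when the host kernel has signalled an error condition on the assigned
 * device that QEMU cannot currently recover from.
 */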
static void vfio_err_notifier_handler(void *opaque)
{
    VFIODevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
        return;
    }

    /*
     * TBD. Retrieve the error details and decide what action
     * needs to be taken. One of the actions could be to pass
     * the error to the guest and have the guest driver recover
     * from the error. This requires that PCIe capabilities be
     * exposed to the guest. For now, we just terminate the
     * guest to contain the error.
     */

    error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. "
                 "Please collect any data possible and then kill the guest",
                 __func__, vdev->host.domain, vdev->host.bus,
                 vdev->host.slot, vdev->host.function);

    vm_stop(RUN_STATE_IO_ERROR);
}

/*
 * Registers error notifier for devices supporting error recovery.
 * If we encounter a failure in this function, we report an error
 * and continue after disabling error recovery support for the
 * device.
 */
static void vfio_register_err_notifier(VFIODevice *vdev)
{
    int ret;
    int argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    if (!vdev->pci_aer) {
        return;
    }

    if (event_notifier_init(&vdev->err_notifier, 0)) {
        error_report("vfio: Unable to init event notifier for error detection");
        vdev->pci_aer = false;
        return;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = event_notifier_get_fd(&vdev->err_notifier);
    qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev);

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    if (ret) {
        error_report("vfio: Failed to set up error notification");
        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->err_notifier);
        vdev->pci_aer = false;
    }
    g_free(irq_set);
}

static void vfio_unregister_err_notifier(VFIODevice *vdev)
{
    int argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;
    int ret;

    if (!vdev->pci_aer) {
        return;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;
    *pfd = -1;

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    if (ret) {
        error_report("vfio: Failed to de-assign error fd: %m");
    }
    g_free(irq_set);
    qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
                        NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->err_notifier);
}
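
/*
 * PCIDevice init callback: resolve the host device's iommu group from
 * sysfs, acquire the group and device fds, take a snapshot of config
 * space, and then build up the emulated device: config space overrides,
 * ROM sizing, MSI-X, BAR mappings, capabilities, INTx and the error
 * notifier.  On failure, everything acquired so far is torn down again.
 */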
static int vfio_initfn(PCIDevice *pdev)
{
    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOGroup *group;
    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
    ssize_t len;
    struct stat st;
    int groupid;
    int ret;

    /* Check that the host device exists */
    snprintf(path, sizeof(path),
             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);
    if (stat(path, &st) < 0) {
        error_report("vfio: error: no such host device: %s", path);
        return -errno;
    }

    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);

    len = readlink(path, iommu_group_path, sizeof(path));
    if (len <= 0 || len >= sizeof(path)) {
        error_report("vfio: error no iommu_group for device");
        return len < 0 ? -errno : -ENAMETOOLONG;
    }

    iommu_group_path[len] = 0;
    group_name = basename(iommu_group_path);

    if (sscanf(group_name, "%d", &groupid) != 1) {
        error_report("vfio: error reading %s: %m", path);
        return -errno;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);

    group = vfio_get_group(groupid);
    if (!group) {
        error_report("vfio: failed to get group %d", groupid);
        return -ENOENT;
    }

    snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);

    QLIST_FOREACH(pvdev, &group->device_list, next) {
        if (pvdev->host.domain == vdev->host.domain &&
            pvdev->host.bus == vdev->host.bus &&
            pvdev->host.slot == vdev->host.slot &&
            pvdev->host.function == vdev->host.function) {

            error_report("vfio: error: device %s is already attached", path);
            vfio_put_group(group);
            return -EBUSY;
        }
    }

    ret = vfio_get_device(group, path, vdev);
    if (ret) {
        error_report("vfio: failed to get device %s", path);
        vfio_put_group(group);
        return ret;
    }

    /* Get a copy of config space */
    ret = pread(vdev->fd, vdev->pdev.config,
                MIN(pci_config_size(&vdev->pdev), vdev->config_size),
                vdev->config_offset);
    if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
        ret = ret < 0 ? -errno : -EFAULT;
        error_report("vfio: Failed to read device config space");
        goto out_put;
    }

    /* vfio emulates a lot for us, but some bits need extra love */
    vdev->emulated_config_bits = g_malloc0(vdev->config_size);

    /* QEMU can choose to expose the ROM or not */
    memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);

    /* QEMU can change multi-function devices to single function, or reverse */
    vdev->emulated_config_bits[PCI_HEADER_TYPE] =
                                              PCI_HEADER_TYPE_MULTI_FUNCTION;

    /* Restore or clear multifunction, this is always controlled by QEMU */
    if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
        vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
    } else {
        vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
    }

    /*
     * Clear host resource mapping info.  If we choose not to register a
     * BAR, such as might be the case with the option ROM, we can get
     * confusing, unwritable, residual addresses from the host here.
     */
    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);

    vfio_pci_size_rom(vdev);

    ret = vfio_early_setup_msix(vdev);
    if (ret) {
        goto out_put;
    }

    vfio_map_bars(vdev);

    ret = vfio_add_capabilities(vdev);
    if (ret) {
        goto out_teardown;
    }

    /* QEMU emulates all of MSI & MSIX */
    if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
        memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
               MSIX_CAP_LENGTH);
    }

    if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
        memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
               vdev->msi_cap_size);
    }

    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
        vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
                                             vfio_intx_mmap_enable, vdev);
        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
        ret = vfio_enable_intx(vdev);
        if (ret) {
            goto out_teardown;
        }
    }

    add_boot_device_path(vdev->bootindex, &pdev->qdev, NULL);
    vfio_register_err_notifier(vdev);

    return 0;

out_teardown:
    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    vfio_teardown_msi(vdev);
    vfio_unmap_bars(vdev);
out_put:
    g_free(vdev->emulated_config_bits);
    vfio_put_device(vdev);
    vfio_put_group(group);
    return ret;
}

static void vfio_exitfn(PCIDevice *pdev)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOGroup *group = vdev->group;

    vfio_unregister_err_notifier(vdev);
    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    vfio_disable_interrupts(vdev);
    if (vdev->intx.mmap_timer) {
        timer_free(vdev->intx.mmap_timer);
    }
    vfio_teardown_msi(vdev);
    vfio_unmap_bars(vdev);
    g_free(vdev->emulated_config_bits);
    g_free(vdev->rom);
    vfio_put_device(vdev);
    vfio_put_group(group);
}
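
/*
 * Device reset callback.  Reset mechanisms are tried in order of preference:
 * a function-level or device reset via VFIO_DEVICE_RESET, then a hot (bus)
 * reset limited to this device, and finally a PM reset as a last resort.
 */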
static void vfio_pci_reset(DeviceState *dev)
{
    PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    vfio_pci_pre_reset(vdev);

    if (vdev->reset_works && (vdev->has_flr || !vdev->has_pm_reset) &&
        !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
        DPRINTF("%04x:%02x:%02x.%x FLR/VFIO_DEVICE_RESET\n", vdev->host.domain,
                vdev->host.bus, vdev->host.slot, vdev->host.function);
        goto post_reset;
    }

    /* See if we can do our own bus reset */
    if (!vfio_pci_hot_reset_one(vdev)) {
        goto post_reset;
    }

    /* If nothing else works and the device supports PM reset, use it */
    if (vdev->reset_works && vdev->has_pm_reset &&
        !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
        DPRINTF("%04x:%02x:%02x.%x PCI PM Reset\n", vdev->host.domain,
                vdev->host.bus, vdev->host.slot, vdev->host.function);
        goto post_reset;
    }

post_reset:
    vfio_pci_post_reset(vdev);
}

static Property vfio_pci_dev_properties[] = {
    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
    DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIODevice,
                       intx.mmap_timeout, 1100),
    DEFINE_PROP_BIT("x-vga", VFIODevice, features,
                    VFIO_FEATURE_ENABLE_VGA_BIT, false),
    DEFINE_PROP_INT32("bootindex", VFIODevice, bootindex, -1),
    /*
     * TODO - support passed fds... is this necessary?
     * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
     * DEFINE_PROP_STRING("vfiogroupfd", VFIODevice, vfiogroupfd_name),
     */
    DEFINE_PROP_END_OF_LIST(),
};

static const VMStateDescription vfio_pci_vmstate = {
    .name = "vfio-pci",
    .unmigratable = 1,
};

static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);

    dc->reset = vfio_pci_reset;
    dc->props = vfio_pci_dev_properties;
    dc->vmsd = &vfio_pci_vmstate;
    dc->desc = "VFIO-based PCI device assignment";
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    pdc->init = vfio_initfn;
    pdc->exit = vfio_exitfn;
    pdc->config_read = vfio_pci_read_config;
    pdc->config_write = vfio_pci_write_config;
    pdc->is_express = 1; /* We might be */
}

static const TypeInfo vfio_pci_dev_info = {
    .name = "vfio-pci",
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(VFIODevice),
    .class_init = vfio_pci_dev_class_init,
};

static void register_vfio_pci_dev_type(void)
{
    type_register_static(&vfio_pci_dev_info);
}

type_init(register_vfio_pci_dev_type)