/*
 * Copyright (c) 2007, Neocleus Corporation.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Assign a PCI device from the host to a guest VM.
 *
 * This implementation uses the classic device assignment interface of KVM
 * and is only available on x86 hosts. It is expected to be obsoleted by VFIO
 * based device assignment.
 *
 * Adapted for KVM (qemu-kvm) by Qumranet. QEMU version was based on qemu-kvm
 * revision 4144fe9d48. See its repository for the history.
 *
 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/hw.h"
#include "hw/i386/pc.h"
#include "qemu/error-report.h"
#include "ui/console.h"
#include "hw/loader.h"
#include "monitor/monitor.h"
#include "qemu/range.h"
#include "sysemu/sysemu.h"
#include "hw/pci/pci.h"
#include "hw/pci/msi.h"
#include <linux/kvm.h>
#include "kvm_i386.h"
#include "hw/pci/pci-assign.h"

#define MSIX_PAGE_SIZE 0x1000

/* From linux/ioport.h */
#define IORESOURCE_IO       0x00000100  /* Resource type */
#define IORESOURCE_MEM      0x00000200
#define IORESOURCE_IRQ      0x00000400
#define IORESOURCE_DMA      0x00000800
#define IORESOURCE_PREFETCH 0x00002000  /* No side effects */
#define IORESOURCE_MEM_64   0x00100000

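/*
 * The DEBUG() calls below need a definition; the usual QEMU debug
 * scaffolding is assumed here, compiled out unless
 * DEVICE_ASSIGNMENT_DEBUG is defined.
 */
/* #define DEVICE_ASSIGNMENT_DEBUG */

#ifdef DEVICE_ASSIGNMENT_DEBUG
#define DEBUG(fmt, ...)                                         \
    do {                                                        \
        fprintf(stderr, "%s: " fmt, __func__, ## __VA_ARGS__);  \
    } while (0)
#else
#define DEBUG(fmt, ...)
#endif
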
typedef struct PCIRegion {
    int type;           /* Memory or port I/O */
    int valid;
    uint64_t base_addr;
    uint64_t size;      /* size of the region */
    int resource_fd;
} PCIRegion;

typedef struct PCIDevRegions {
    uint8_t bus, dev, func; /* Bus inside domain, device and function */
    int irq;                /* IRQ number */
    uint16_t region_number; /* number of active regions */

    /* Port I/O or MMIO Regions */
    PCIRegion regions[PCI_NUM_REGIONS - 1];
    int config_fd;
} PCIDevRegions;

typedef struct AssignedDevRegion {
    MemoryRegion container;
    MemoryRegion real_iomem;
    union {
        uint8_t *r_virtbase; /* mmapped access address for memory regions */
        uint32_t r_baseport; /* the base guest port for I/O regions */
    } u;
    pcibus_t e_size;         /* emulated size of region in bytes */
    pcibus_t r_size;         /* real size of region in bytes */
    PCIRegion *region;
} AssignedDevRegion;

#define ASSIGNED_DEVICE_PREFER_MSI_BIT  0
#define ASSIGNED_DEVICE_SHARE_INTX_BIT  1

#define ASSIGNED_DEVICE_PREFER_MSI_MASK (1 << ASSIGNED_DEVICE_PREFER_MSI_BIT)
#define ASSIGNED_DEVICE_SHARE_INTX_MASK (1 << ASSIGNED_DEVICE_SHARE_INTX_BIT)

typedef struct MSIXTableEntry {
    uint32_t addr_lo;
    uint32_t addr_hi;
    uint32_t data;
    uint32_t ctrl;
} MSIXTableEntry;

typedef enum AssignedIRQType {
    ASSIGNED_IRQ_NONE = 0,
    ASSIGNED_IRQ_INTX_HOST_INTX,
    ASSIGNED_IRQ_INTX_HOST_MSI,
    ASSIGNED_IRQ_MSI,
    ASSIGNED_IRQ_MSIX
} AssignedIRQType;

typedef struct AssignedDevice {
    PCIDevice dev;
    PCIHostDeviceAddress host;
    uint32_t dev_id;
    uint32_t features;
    int intpin;
    AssignedDevRegion v_addrs[PCI_NUM_REGIONS - 1];
    PCIDevRegions real_device;
    PCIINTxRoute intx_route;
    AssignedIRQType assigned_irq_type;
    struct {
#define ASSIGNED_DEVICE_CAP_MSI (1 << 0)
#define ASSIGNED_DEVICE_CAP_MSIX (1 << 1)
        uint32_t available;
#define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0)
#define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1)
#define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2)
        uint32_t state;
    } cap;
    uint8_t emulate_config_read[PCI_CONFIG_SPACE_SIZE];
    uint8_t emulate_config_write[PCI_CONFIG_SPACE_SIZE];
    int msi_virq_nr;
    int *msi_virq;
    MSIXTableEntry *msix_table;
    hwaddr msix_table_addr;
    uint16_t msix_max;
    MemoryRegion mmio;
    char *configfd_name;
    int32_t bootindex;
} AssignedDevice;

#define TYPE_PCI_ASSIGN "kvm-pci-assign"
#define PCI_ASSIGN(obj) OBJECT_CHECK(AssignedDevice, (obj), TYPE_PCI_ASSIGN)

static void assigned_dev_update_irq_routing(PCIDevice *dev);
static void assigned_dev_load_option_rom(AssignedDevice *dev);
static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);

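/*
 * Forward a port I/O access to the host device through its sysfs
 * resource fd. A non-NULL data pointer means write; NULL means read.
 */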
static uint64_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
                                       hwaddr addr, int size,
                                       uint64_t *data)
{
    uint64_t val = 0;
    int fd = dev_region->region->resource_fd;

    if (data) {
        DEBUG("pwrite data=%" PRIx64 ", size=%d, e_phys=" TARGET_FMT_plx
              ", addr=" TARGET_FMT_plx "\n", *data, size, addr, addr);
        if (pwrite(fd, data, size, addr) != size) {
            error_report("%s - pwrite failed %s", __func__, strerror(errno));
        }
    } else {
        if (pread(fd, &val, size, addr) != size) {
            error_report("%s - pread failed %s", __func__, strerror(errno));
            val = (1UL << (size * 8)) - 1;
        }
        DEBUG("pread val=%" PRIx64 ", size=%d, e_phys=" TARGET_FMT_plx
              ", addr=" TARGET_FMT_plx "\n", val, size, addr, addr);
    }

    return val;
}

static void assigned_dev_ioport_write(void *opaque, hwaddr addr,
                                      uint64_t data, unsigned size)
{
    assigned_dev_ioport_rw(opaque, addr, size, &data);
}

static uint64_t assigned_dev_ioport_read(void *opaque,
                                         hwaddr addr, unsigned size)
{
    return assigned_dev_ioport_rw(opaque, addr, size, NULL);
}

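/*
 * Slow-path MMIO accessors, used when a BAR is not 4K-aligned and thus
 * cannot be handed to the guest as a directly mapped RAM region; every
 * access goes through the mmap'ed host mapping instead.
 */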
static uint32_t slow_bar_readb(void *opaque, hwaddr addr)
{
    AssignedDevRegion *d = opaque;
    uint8_t *in = d->u.r_virtbase + addr;
    uint32_t r;

    r = *in;
    DEBUG("addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);

    return r;
}

static uint32_t slow_bar_readw(void *opaque, hwaddr addr)
{
    AssignedDevRegion *d = opaque;
    uint16_t *in = (uint16_t *)(d->u.r_virtbase + addr);
    uint32_t r;

    r = *in;
    DEBUG("addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);

    return r;
}

static uint32_t slow_bar_readl(void *opaque, hwaddr addr)
{
    AssignedDevRegion *d = opaque;
    uint32_t *in = (uint32_t *)(d->u.r_virtbase + addr);
    uint32_t r;

    r = *in;
    DEBUG("addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);

    return r;
}

static void slow_bar_writeb(void *opaque, hwaddr addr, uint32_t val)
{
    AssignedDevRegion *d = opaque;
    uint8_t *out = d->u.r_virtbase + addr;

    DEBUG("addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
    *out = val;
}

static void slow_bar_writew(void *opaque, hwaddr addr, uint32_t val)
{
    AssignedDevRegion *d = opaque;
    uint16_t *out = (uint16_t *)(d->u.r_virtbase + addr);

    DEBUG("addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
    *out = val;
}

static void slow_bar_writel(void *opaque, hwaddr addr, uint32_t val)
{
    AssignedDevRegion *d = opaque;
    uint32_t *out = (uint32_t *)(d->u.r_virtbase + addr);

    DEBUG("addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
    *out = val;
}

static const MemoryRegionOps slow_bar_ops = {
    .old_mmio = {
        .read = { slow_bar_readb, slow_bar_readw, slow_bar_readl, },
        .write = { slow_bar_writeb, slow_bar_writew, slow_bar_writel, },
    },
    .endianness = DEVICE_NATIVE_ENDIAN,
};

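/*
 * Build the container region for a memory BAR and, if the device's MSI-X
 * table lives inside this BAR, overlay the emulated MSI-X page on top of
 * the directly mapped host memory.
 */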
static void assigned_dev_iomem_setup(PCIDevice *pci_dev, int region_num,
                                     pcibus_t e_size)
{
    AssignedDevice *r_dev = PCI_ASSIGN(pci_dev);
    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
    PCIRegion *real_region = &r_dev->real_device.regions[region_num];

    if (e_size > 0) {
        memory_region_init(&region->container, OBJECT(pci_dev),
                           "assigned-dev-container", e_size);
        memory_region_add_subregion(&region->container, 0,
                                    &region->real_iomem);

        /* deal with MSI-X MMIO page */
        if (real_region->base_addr <= r_dev->msix_table_addr &&
            real_region->base_addr + real_region->size >
            r_dev->msix_table_addr) {
            uint64_t offset = r_dev->msix_table_addr - real_region->base_addr;

            memory_region_add_subregion_overlap(&region->container,
                                                offset,
                                                &r_dev->mmio,
                                                1);
        }
    }
}

static const MemoryRegionOps assigned_dev_ioport_ops = {
    .read = assigned_dev_ioport_read,
    .write = assigned_dev_ioport_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
};

static void assigned_dev_ioport_setup(PCIDevice *pci_dev, int region_num,
                                      pcibus_t size)
{
    AssignedDevice *r_dev = PCI_ASSIGN(pci_dev);
    AssignedDevRegion *region = &r_dev->v_addrs[region_num];

    region->e_size = size;
    memory_region_init(&region->container, OBJECT(pci_dev),
                       "assigned-dev-container", size);
    memory_region_init_io(&region->real_iomem, OBJECT(pci_dev),
                          &assigned_dev_ioport_ops,
                          r_dev->v_addrs + region_num,
                          "assigned-dev-iomem", size);
    memory_region_add_subregion(&region->container, 0, &region->real_iomem);
}

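/*
 * Access the host device's real config space through the sysfs config fd,
 * retrying on EINTR/EAGAIN.
 */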
static uint32_t assigned_dev_pci_read(PCIDevice *d, int pos, int len)
{
    AssignedDevice *pci_dev = PCI_ASSIGN(d);
    uint32_t val;
    ssize_t ret;
    int fd = pci_dev->real_device.config_fd;

again:
    ret = pread(fd, &val, len, pos);
    if (ret != len) {
        if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) {
            goto again;
        }

        hw_error("pci read failed, ret = %zd errno = %d\n", ret, errno);
    }

    return val;
}

static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
{
    return (uint8_t)assigned_dev_pci_read(d, pos, 1);
}

static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
{
    AssignedDevice *pci_dev = PCI_ASSIGN(d);
    ssize_t ret;
    int fd = pci_dev->real_device.config_fd;

again:
    ret = pwrite(fd, &val, len, pos);
    if (ret != len) {
        if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) {
            goto again;
        }

        hw_error("pci write failed, ret = %zd errno = %d\n", ret, errno);
    }
}

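/*
 * emulate_config_read/write are per-byte masks over config space:
 * 0xff selects QEMU's emulated copy of the byte, 0x00 passes the access
 * through to the real hardware.
 */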
static void assigned_dev_emulate_config_read(AssignedDevice *dev,
                                             uint32_t offset, uint32_t len)
{
    memset(dev->emulate_config_read + offset, 0xff, len);
}

static void assigned_dev_direct_config_read(AssignedDevice *dev,
                                            uint32_t offset, uint32_t len)
{
    memset(dev->emulate_config_read + offset, 0, len);
}

static void assigned_dev_direct_config_write(AssignedDevice *dev,
                                             uint32_t offset, uint32_t len)
{
    memset(dev->emulate_config_write + offset, 0, len);
}

static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap, uint8_t start)
{
    int id;
    int max_cap = 48;
    int pos = start ? start : PCI_CAPABILITY_LIST;
    int status;

    status = assigned_dev_pci_read_byte(d, PCI_STATUS);
    if ((status & PCI_STATUS_CAP_LIST) == 0) {
        return 0;
    }

    while (max_cap--) {
        pos = assigned_dev_pci_read_byte(d, pos);
        if (pos < 0x40) {
            break;
        }

        pos &= ~3;
        id = assigned_dev_pci_read_byte(d, pos + PCI_CAP_LIST_ID);
        if (id == 0xff) {
            break;
        }
        if (id == cap) {
            return pos;
        }

        pos += PCI_CAP_LIST_NEXT;
    }
    return 0;
}

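/*
 * Map every valid host BAR into the guest: page-aligned memory BARs are
 * mmap'ed and exposed as RAM regions, unaligned ones fall back to the
 * slow-path accessors, and port I/O BARs are forwarded via the resource fd.
 */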
static void assigned_dev_register_regions(PCIRegion *io_regions,
                                          unsigned long regions_num,
                                          AssignedDevice *pci_dev,
                                          Error **errp)
{
    uint32_t i;
    PCIRegion *cur_region = io_regions;

    for (i = 0; i < regions_num; i++, cur_region++) {
        if (!cur_region->valid) {
            continue;
        }

        /* handle memory io regions */
        if (cur_region->type & IORESOURCE_MEM) {
            int t = PCI_BASE_ADDRESS_SPACE_MEMORY;

            if (cur_region->type & IORESOURCE_PREFETCH) {
                t |= PCI_BASE_ADDRESS_MEM_PREFETCH;
            }
            if (cur_region->type & IORESOURCE_MEM_64) {
                t |= PCI_BASE_ADDRESS_MEM_TYPE_64;
            }

            /* map physical memory */
            pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL, cur_region->size,
                                                    PROT_WRITE | PROT_READ,
                                                    MAP_SHARED,
                                                    cur_region->resource_fd,
                                                    (off_t)0);

            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
                pci_dev->v_addrs[i].u.r_virtbase = NULL;
                error_setg_errno(errp, errno, "Couldn't mmap 0x%" PRIx64 "!",
                                 cur_region->base_addr);
                return;
            }

            pci_dev->v_addrs[i].r_size = cur_region->size;
            pci_dev->v_addrs[i].e_size = 0;

            /* add offset */
            pci_dev->v_addrs[i].u.r_virtbase +=
                (cur_region->base_addr & 0xFFF);

            if (cur_region->size & 0xFFF) {
                error_report("PCI region %d at address 0x%" PRIx64 " has "
                             "size 0x%" PRIx64 ", which is not a multiple of "
                             "4K. You might experience some performance hit "
                             "due to that.",
                             i, cur_region->base_addr, cur_region->size);
                memory_region_init_io(&pci_dev->v_addrs[i].real_iomem,
                                      OBJECT(pci_dev), &slow_bar_ops,
                                      &pci_dev->v_addrs[i],
                                      "assigned-dev-slow-bar",
                                      cur_region->size);
            } else {
                void *virtbase = pci_dev->v_addrs[i].u.r_virtbase;
                char name[32];

                snprintf(name, sizeof(name), "%s.bar%d",
                         object_get_typename(OBJECT(pci_dev)), i);
                memory_region_init_ram_ptr(&pci_dev->v_addrs[i].real_iomem,
                                           OBJECT(pci_dev), name,
                                           cur_region->size, virtbase);
                vmstate_register_ram(&pci_dev->v_addrs[i].real_iomem,
                                     &pci_dev->dev.qdev);
            }

            assigned_dev_iomem_setup(&pci_dev->dev, i, cur_region->size);
            pci_register_bar((PCIDevice *) pci_dev, i, t,
                             &pci_dev->v_addrs[i].container);
            continue;
        } else {
            /* handle port io regions */
            uint32_t val;
            int ret;

            /* Test kernel support for ioport resource read/write. Old
             * kernels return EIO. New kernels only allow 1/2/4 byte reads
             * so should return EINVAL for a 3 byte read */
            ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
            if (ret >= 0) {
                error_report("Unexpected return from I/O port read: %d", ret);
                abort();
            } else if (errno != EINVAL) {
                error_report("Kernel doesn't support ioport resource "
                             "access, hiding this region.");
                close(pci_dev->v_addrs[i].region->resource_fd);
                cur_region->valid = 0;
                continue;
            }

            pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
            pci_dev->v_addrs[i].r_size = cur_region->size;
            pci_dev->v_addrs[i].e_size = 0;

            assigned_dev_ioport_setup(&pci_dev->dev, i, cur_region->size);
            pci_register_bar((PCIDevice *) pci_dev, i,
                             PCI_BASE_ADDRESS_SPACE_IO,
                             &pci_dev->v_addrs[i].container);
        }
    }

    /* success */
}

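/* Read a numeric ID file (e.g. "vendor" or "device") from the device's
 * sysfs directory. */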
static void get_real_id(const char *devpath, const char *idname, uint16_t *val,
                        Error **errp)
{
    FILE *f;
    char name[128];
    long id;

    snprintf(name, sizeof(name), "%s%s", devpath, idname);
    f = fopen(name, "r");
    if (f == NULL) {
        error_setg_file_open(errp, errno, name);
        return;
    }
    if (fscanf(f, "%li\n", &id) == 1) {
        *val = id;
    } else {
        error_setg(errp, "Failed to parse contents of '%s'", name);
    }
    fclose(f);
}

static void get_real_vendor_id(const char *devpath, uint16_t *val,
                               Error **errp)
{
    get_real_id(devpath, "vendor", val, errp);
}

static void get_real_device_id(const char *devpath, uint16_t *val,
                               Error **errp)
{
    get_real_id(devpath, "device", val, errp);
}

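/*
 * Open the host device's sysfs config space (or a monitor-supplied fd),
 * snapshot its config header, and enumerate its BARs from the sysfs
 * "resource" file into dev->real_device.
 */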
static void get_real_device(AssignedDevice *pci_dev, Error **errp)
{
    char dir[128], name[128];
    int fd, r = 0;
    FILE *f;
    uint64_t start, end, size, flags;
    uint16_t id;
    PCIRegion *rp;
    PCIDevRegions *dev = &pci_dev->real_device;
    Error *local_err = NULL;

    dev->region_number = 0;

    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",
             pci_dev->host.domain, pci_dev->host.bus,
             pci_dev->host.slot, pci_dev->host.function);

    snprintf(name, sizeof(name), "%sconfig", dir);

    if (pci_dev->configfd_name && *pci_dev->configfd_name) {
        dev->config_fd = monitor_fd_param(cur_mon, pci_dev->configfd_name,
                                          &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    } else {
        dev->config_fd = open(name, O_RDWR);
        if (dev->config_fd == -1) {
            error_setg_file_open(errp, errno, name);
            return;
        }
    }
again:
    r = read(dev->config_fd, pci_dev->dev.config,
             pci_config_size(&pci_dev->dev));
    if (r < 0) {
        if (errno == EINTR || errno == EAGAIN) {
            goto again;
        }
        error_setg_errno(errp, errno, "read(\"%s\")",
                         (pci_dev->configfd_name && *pci_dev->configfd_name) ?
                         pci_dev->configfd_name : name);
        return;
    }

    /* Restore or clear multifunction, this is always controlled by qemu */
    if (pci_dev->dev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
        pci_dev->dev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
    } else {
        pci_dev->dev.config[PCI_HEADER_TYPE] &=
            ~PCI_HEADER_TYPE_MULTI_FUNCTION;
    }

    /* Clear host resource mapping info. If we choose not to register a
     * BAR, such as might be the case with the option ROM, we can get
     * confusing, unwritable, residual addresses from the host here. */
    memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24);
    memset(&pci_dev->dev.config[PCI_ROM_ADDRESS], 0, 4);

    snprintf(name, sizeof(name), "%sresource", dir);

    f = fopen(name, "r");
    if (f == NULL) {
        error_setg_file_open(errp, errno, name);
        return;
    }

    for (r = 0; r < PCI_ROM_SLOT; r++) {
        if (fscanf(f, "%" SCNi64 " %" SCNi64 " %" SCNi64 "\n",
                   &start, &end, &flags) != 3) {
            break;
        }

        rp = dev->regions + r;
        rp->valid = 0;
        rp->resource_fd = -1;
        size = end - start + 1;
        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH
                 | IORESOURCE_MEM_64;
        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0) {
            continue;
        }
        if (flags & IORESOURCE_MEM) {
            flags &= ~IORESOURCE_IO;
        } else {
            flags &= ~IORESOURCE_PREFETCH;
        }
        snprintf(name, sizeof(name), "%sresource%d", dir, r);
        fd = open(name, O_RDWR);
        if (fd == -1) {
            continue;
        }
        rp->resource_fd = fd;

        rp->type = flags;
        rp->valid = 1;
        rp->base_addr = start;
        rp->size = size;
        pci_dev->v_addrs[r].region = rp;
        DEBUG("region %d size %" PRIu64 " start 0x%" PRIx64
              " type %d resource_fd %d\n",
              r, rp->size, start, rp->type, rp->resource_fd);
    }

    fclose(f);

    /* read and fill vendor ID */
    get_real_vendor_id(dir, &id, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }
    pci_dev->dev.config[0] = id & 0xff;
    pci_dev->dev.config[1] = (id & 0xff00) >> 8;

    /* read and fill device ID */
    get_real_device_id(dir, &id, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }
    pci_dev->dev.config[2] = id & 0xff;
    pci_dev->dev.config[3] = (id & 0xff00) >> 8;

    pci_word_test_and_clear_mask(pci_dev->emulate_config_write + PCI_COMMAND,
                                 PCI_COMMAND_MASTER |
                                 PCI_COMMAND_INTX_DISABLE);

    dev->region_number = r;
}

static void free_msi_virqs(AssignedDevice *dev)
{
    int i;

    for (i = 0; i < dev->msi_virq_nr; i++) {
        if (dev->msi_virq[i] >= 0) {
            kvm_irqchip_release_virq(kvm_state, dev->msi_virq[i]);
            dev->msi_virq[i] = -1;
        }
    }
    g_free(dev->msi_virq);
    dev->msi_virq = NULL;
    dev->msi_virq_nr = 0;
}

static void free_assigned_device(AssignedDevice *dev)
{
    int i;

    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
        assigned_dev_unregister_msix_mmio(dev);
    }
    for (i = 0; i < dev->real_device.region_number; i++) {
        PCIRegion *pci_region = &dev->real_device.regions[i];
        AssignedDevRegion *region = &dev->v_addrs[i];

        if (!pci_region->valid) {
            continue;
        }
        if (pci_region->type & IORESOURCE_IO) {
            if (region->u.r_baseport) {
                memory_region_del_subregion(&region->container,
                                            &region->real_iomem);
            }
        } else if (pci_region->type & IORESOURCE_MEM) {
            if (region->u.r_virtbase) {
                memory_region_del_subregion(&region->container,
                                            &region->real_iomem);

                /* Remove MSI-X table subregion */
                if (pci_region->base_addr <= dev->msix_table_addr &&
                    pci_region->base_addr + pci_region->size >
                    dev->msix_table_addr) {
                    memory_region_del_subregion(&region->container,
                                                &dev->mmio);
                }
                if (munmap(region->u.r_virtbase,
                           (pci_region->size + 0xFFF) & 0xFFFFF000)) {
                    error_report("Failed to unmap assigned device region: %s",
                                 strerror(errno));
                }
            }
        }
        if (pci_region->resource_fd >= 0) {
            close(pci_region->resource_fd);
        }
    }

    if (dev->real_device.config_fd >= 0) {
        close(dev->real_device.config_fd);
    }

    free_msi_virqs(dev);
}

/* This function tries to determine the cause of the PCI assignment failure. It
 * always returns the cause as a dynamically allocated, human readable string.
 * If the function fails to determine the cause for any internal reason, then
 * the returned string will state that fact.
 */
static char *assign_failed_examine(const AssignedDevice *dev)
{
    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
    uint16_t vendor_id, device_id;
    int r;
    Error *local_err = NULL;

    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
             dev->host.domain, dev->host.bus, dev->host.slot,
             dev->host.function);

    snprintf(name, sizeof(name), "%sdriver", dir);

    r = readlink(name, driver, sizeof(driver));
    if ((r <= 0) || r >= sizeof(driver)) {
        goto fail;
    }

    driver[r] = 0;
    ns = strrchr(driver, '/');
    if (!ns) {
        goto fail;
    }

    ns++;

    if ((get_real_vendor_id(dir, &vendor_id, &local_err), local_err) ||
        (get_real_device_id(dir, &device_id, &local_err), local_err)) {
        /* We're already analyzing an assignment error, so we suppress this
         * one just like the others above.
         */
        error_free(local_err);
        goto fail;
    }

    return g_strdup_printf(
        "*** The driver '%s' is occupying your device %04x:%02x:%02x.%x.\n"
        "***\n"
        "*** You can try the following commands to free it:\n"
        "***\n"
        "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/new_id\n"
        "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/%s/unbind\n"
        "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
        "pci-stub/bind\n"
        "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/remove_id\n"
        "***\n",
        ns, dev->host.domain, dev->host.bus, dev->host.slot,
        dev->host.function, vendor_id, device_id,
        dev->host.domain, dev->host.bus, dev->host.slot, dev->host.function,
        ns, dev->host.domain, dev->host.bus, dev->host.slot,
        dev->host.function, vendor_id, device_id);

fail:
    return g_strdup("Couldn't find out why.\n");
}

static void assign_device(AssignedDevice *dev, Error **errp)
{
    uint32_t flags = KVM_DEV_ASSIGN_ENABLE_IOMMU;
    int r;

    /* Only pass non-zero PCI segment to capable module */
    if (!kvm_check_extension(kvm_state, KVM_CAP_PCI_SEGMENT) &&
        dev->host.domain) {
        error_setg(errp, "Can't assign device inside non-zero PCI segment "
                   "as this KVM module doesn't support it.");
        return;
    }

    if (!kvm_check_extension(kvm_state, KVM_CAP_IOMMU)) {
        error_setg(errp, "No IOMMU found. Unable to assign device \"%s\"",
                   dev->dev.qdev.id);
        return;
    }

    if (dev->features & ASSIGNED_DEVICE_SHARE_INTX_MASK &&
        kvm_has_intx_set_mask()) {
        flags |= KVM_DEV_ASSIGN_PCI_2_3;
    }

    r = kvm_device_pci_assign(kvm_state, &dev->host, flags, &dev->dev_id);
    if (r < 0) {
        switch (r) {
        case -EBUSY: {
            char *cause;

            cause = assign_failed_examine(dev);
            error_setg_errno(errp, -r, "Failed to assign device \"%s\"",
                             dev->dev.qdev.id);
            error_append_hint(errp, "%s", cause);
            g_free(cause);
            break;
        }
        default:
            error_setg_errno(errp, -r, "Failed to assign device \"%s\"",
                             dev->dev.qdev.id);
            break;
        }
    }
}

static void verify_irqchip_in_kernel(Error **errp)
{
    if (kvm_irqchip_in_kernel()) {
        return;
    }
    error_setg(errp, "pci-assign requires KVM with in-kernel irqchip enabled");
}

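/*
 * (Re)assign the device's INTx interrupt: tear down whatever IRQ type was
 * previously assigned, then hook the guest INTx pin up to the host IRQ,
 * optionally using a host-side MSI when INTx sharing is not possible.
 */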
static int assign_intx(AssignedDevice *dev, Error **errp)
{
    AssignedIRQType new_type;
    PCIINTxRoute intx_route;
    bool intx_host_msi;
    int r;
    Error *local_err = NULL;

    /* Interrupt PIN 0 means don't use INTx */
    if (assigned_dev_pci_read_byte(&dev->dev, PCI_INTERRUPT_PIN) == 0) {
        pci_device_set_intx_routing_notifier(&dev->dev, NULL);
        return 0;
    }

    verify_irqchip_in_kernel(&local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -ENOTSUP;
    }

    pci_device_set_intx_routing_notifier(&dev->dev,
                                         assigned_dev_update_irq_routing);

    intx_route = pci_device_route_intx_to_irq(&dev->dev, dev->intpin);
    assert(intx_route.mode != PCI_INTX_INVERTED);

    if (!pci_intx_route_changed(&dev->intx_route, &intx_route)) {
        return 0;
    }

    switch (dev->assigned_irq_type) {
    case ASSIGNED_IRQ_INTX_HOST_INTX:
    case ASSIGNED_IRQ_INTX_HOST_MSI:
        intx_host_msi = dev->assigned_irq_type == ASSIGNED_IRQ_INTX_HOST_MSI;
        r = kvm_device_intx_deassign(kvm_state, dev->dev_id, intx_host_msi);
        break;
    case ASSIGNED_IRQ_MSI:
        r = kvm_device_msi_deassign(kvm_state, dev->dev_id);
        break;
    case ASSIGNED_IRQ_MSIX:
        r = kvm_device_msix_deassign(kvm_state, dev->dev_id);
        break;
    default:
        r = 0;
        break;
    }
    if (r) {
        perror("assign_intx: deassignment of previous interrupt failed");
    }
    dev->assigned_irq_type = ASSIGNED_IRQ_NONE;

    if (intx_route.mode == PCI_INTX_DISABLED) {
        dev->intx_route = intx_route;
        return 0;
    }

retry:
    if (dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK &&
        dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
        intx_host_msi = true;
        new_type = ASSIGNED_IRQ_INTX_HOST_MSI;
    } else {
        intx_host_msi = false;
        new_type = ASSIGNED_IRQ_INTX_HOST_INTX;
    }

    r = kvm_device_intx_assign(kvm_state, dev->dev_id, intx_host_msi,
                               intx_route.irq);
    if (r < 0) {
        if (r == -EIO && !(dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK) &&
            dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
            /* Retry with host-side MSI. There might be an IRQ conflict and
             * either the kernel or the device doesn't support sharing. */
            error_report("Host-side INTx sharing not supported, "
                         "using MSI instead");
            error_printf("Some devices do not work properly in this mode.\n");
            dev->features |= ASSIGNED_DEVICE_PREFER_MSI_MASK;
            goto retry;
        }

        error_setg_errno(errp, -r, "Failed to assign irq for \"%s\"",
                         dev->dev.qdev.id);
        error_append_hint(errp, "Perhaps you are assigning a device "
                          "that shares an IRQ with another device?\n");
        return r;
    }

    dev->intx_route = intx_route;
    dev->assigned_irq_type = new_type;

    return r;
}

static void deassign_device(AssignedDevice *dev)
{
    int r;

    r = kvm_device_pci_deassign(kvm_state, dev->dev_id);
    assert(r == 0);
}

/* The PCI config space has been updated; check whether the IRQ routing
 * for our device has changed.
 */
static void assigned_dev_update_irq_routing(PCIDevice *dev)
{
    AssignedDevice *assigned_dev = PCI_ASSIGN(dev);
    Error *err = NULL;
    int r;

    r = assign_intx(assigned_dev, &err);
    if (r < 0) {
        error_report_err(err);
        err = NULL;
        qdev_unplug(&dev->qdev, &err);
        assert(!err);
    }
}

static void assigned_dev_update_msi(PCIDevice *pci_dev)
{
    AssignedDevice *assigned_dev = PCI_ASSIGN(pci_dev);
    uint8_t ctrl_byte = pci_get_byte(pci_dev->config + pci_dev->msi_cap +
                                     PCI_MSI_FLAGS);
    int r;

    /* Some guests gratuitously disable MSI even if they're not using it,
     * try to catch this by only deassigning irqs if the guest is using
     * MSI or intends to start. */
    if (assigned_dev->assigned_irq_type == ASSIGNED_IRQ_MSI ||
        (ctrl_byte & PCI_MSI_FLAGS_ENABLE)) {
        r = kvm_device_msi_deassign(kvm_state, assigned_dev->dev_id);
        /* -ENXIO means no assigned irq */
        if (r && r != -ENXIO) {
            perror("assigned_dev_update_msi: deassign irq");
        }

        free_msi_virqs(assigned_dev);

        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
        pci_device_set_intx_routing_notifier(pci_dev, NULL);
    }

    if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) {
        MSIMessage msg = msi_get_message(pci_dev, 0);
        int virq;

        virq = kvm_irqchip_add_msi_route(kvm_state, msg, pci_dev);
        if (virq < 0) {
            perror("assigned_dev_update_msi: kvm_irqchip_add_msi_route");
            return;
        }

        assigned_dev->msi_virq = g_malloc(sizeof(*assigned_dev->msi_virq));
        assigned_dev->msi_virq_nr = 1;
        assigned_dev->msi_virq[0] = virq;
        if (kvm_device_msi_assign(kvm_state, assigned_dev->dev_id, virq) < 0) {
            perror("assigned_dev_update_msi: kvm_device_msi_assign");
        }

        assigned_dev->intx_route.mode = PCI_INTX_DISABLED;
        assigned_dev->intx_route.irq = -1;
        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSI;
    } else {
        Error *local_err = NULL;

        assign_intx(assigned_dev, &local_err);
        if (local_err) {
            error_report_err(local_err);
        }
    }
}

static void assigned_dev_update_msi_msg(PCIDevice *pci_dev)
{
    AssignedDevice *assigned_dev = PCI_ASSIGN(pci_dev);
    uint8_t ctrl_byte = pci_get_byte(pci_dev->config + pci_dev->msi_cap +
                                     PCI_MSI_FLAGS);

    if (assigned_dev->assigned_irq_type != ASSIGNED_IRQ_MSI ||
        !(ctrl_byte & PCI_MSI_FLAGS_ENABLE)) {
        return;
    }

    kvm_irqchip_update_msi_route(kvm_state, assigned_dev->msi_virq[0],
                                 msi_get_message(pci_dev, 0), pci_dev);
}

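/* Bit 0 of the MSI-X vector control word is the per-vector mask bit. */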
static bool assigned_dev_msix_masked(MSIXTableEntry *entry)
{
    return (entry->ctrl & cpu_to_le32(0x1)) != 0;
}

/*
 * When MSI-X is first enabled the vector table typically has all the
 * vectors masked, so we can't use that as the obvious test to figure out
 * how many vectors to initially enable. Instead we look at the data field
 * because this is what worked for pci-assign for a long time. This makes
 * sure the physical MSI-X state tracks the guest's view, which is important
 * for some VF/PF and PF/fw communication channels.
 */
static bool assigned_dev_msix_skipped(MSIXTableEntry *entry)
{
    return !entry->data;
}

static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
{
    AssignedDevice *adev = PCI_ASSIGN(pci_dev);
    uint16_t entries_nr = 0;
    int i, r = 0;
    MSIXTableEntry *entry = adev->msix_table;
    MSIMessage msg;

    /* Get the usable entry number for allocating */
    for (i = 0; i < adev->msix_max; i++, entry++) {
        if (assigned_dev_msix_skipped(entry)) {
            continue;
        }
        entries_nr++;
    }

    DEBUG("MSI-X entries: %d\n", entries_nr);

    /* It's valid to enable MSI-X with all entries masked */
    if (!entries_nr) {
        return 0;
    }

    r = kvm_device_msix_init_vectors(kvm_state, adev->dev_id, entries_nr);
    if (r != 0) {
        error_report("failed to set MSI-X entry number for MSIX! %s",
                     strerror(-r));
        return r;
    }

    free_msi_virqs(adev);

    adev->msi_virq_nr = adev->msix_max;
    adev->msi_virq = g_malloc(adev->msix_max * sizeof(*adev->msi_virq));

    entry = adev->msix_table;
    for (i = 0; i < adev->msix_max; i++, entry++) {
        adev->msi_virq[i] = -1;

        if (assigned_dev_msix_skipped(entry)) {
            continue;
        }

        msg.address = entry->addr_lo | ((uint64_t)entry->addr_hi << 32);
        msg.data = entry->data;
        r = kvm_irqchip_add_msi_route(kvm_state, msg, pci_dev);
        if (r < 0) {
            return r;
        }
        adev->msi_virq[i] = r;

        DEBUG("MSI-X vector %d, gsi %d, addr %08x_%08x, data %08x\n", i,
              r, entry->addr_hi, entry->addr_lo, entry->data);

        r = kvm_device_msix_set_vector(kvm_state, adev->dev_id, i,
                                       adev->msi_virq[i]);
        if (r) {
            error_report("failed to set MSI-X entry! %s", strerror(-r));
            break;
        }
    }

    return r;
}

static void assigned_dev_update_msix(PCIDevice *pci_dev)
{
    AssignedDevice *assigned_dev = PCI_ASSIGN(pci_dev);
    uint16_t ctrl_word = pci_get_word(pci_dev->config + pci_dev->msix_cap +
                                      PCI_MSIX_FLAGS);
    int r;

    /* Some guests gratuitously disable MSIX even if they're not using it,
     * try to catch this by only deassigning irqs if the guest is using
     * MSIX or intends to start. */
    if ((assigned_dev->assigned_irq_type == ASSIGNED_IRQ_MSIX) ||
        (ctrl_word & PCI_MSIX_FLAGS_ENABLE)) {
        r = kvm_device_msix_deassign(kvm_state, assigned_dev->dev_id);
        /* -ENXIO means no assigned irq */
        if (r && r != -ENXIO) {
            perror("assigned_dev_update_msix: deassign irq");
        }

        free_msi_virqs(assigned_dev);

        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_NONE;
        pci_device_set_intx_routing_notifier(pci_dev, NULL);
    }

    if (ctrl_word & PCI_MSIX_FLAGS_ENABLE) {
        if (assigned_dev_update_msix_mmio(pci_dev) < 0) {
            perror("assigned_dev_update_msix_mmio");
            return;
        }

        if (assigned_dev->msi_virq_nr > 0) {
            if (kvm_device_msix_assign(kvm_state, assigned_dev->dev_id) < 0) {
                perror("assigned_dev_enable_msix: assign irq");
                return;
            }
        }
        assigned_dev->intx_route.mode = PCI_INTX_DISABLED;
        assigned_dev->intx_route.irq = -1;
        assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSIX;
    } else {
        Error *local_err = NULL;

        assign_intx(assigned_dev, &local_err);
        if (local_err) {
            error_report_err(local_err);
        }
    }
}

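/*
 * Config space reads merge two sources byte by byte: bytes whose
 * emulate_config_read mask is set come from QEMU's emulated copy, all
 * others are read from the real device.
 */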
static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev,
                                             uint32_t address, int len)
{
    AssignedDevice *assigned_dev = PCI_ASSIGN(pci_dev);
    uint32_t virt_val = pci_default_read_config(pci_dev, address, len);
    uint32_t real_val, emulate_mask, full_emulation_mask;

    emulate_mask = 0;
    memcpy(&emulate_mask, assigned_dev->emulate_config_read + address, len);
    emulate_mask = le32_to_cpu(emulate_mask);

    full_emulation_mask = 0xffffffff >> (32 - len * 8);

    if (emulate_mask != full_emulation_mask) {
        real_val = assigned_dev_pci_read(pci_dev, address, len);
        return (virt_val & emulate_mask) | (real_val & ~emulate_mask);
    } else {
        return virt_val;
    }
}

static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address,
                                          uint32_t val, int len)
{
    AssignedDevice *assigned_dev = PCI_ASSIGN(pci_dev);
    uint16_t old_cmd = pci_get_word(pci_dev->config + PCI_COMMAND);
    uint32_t emulate_mask, full_emulation_mask;
    int ret;

    pci_default_write_config(pci_dev, address, val, len);

    if (kvm_has_intx_set_mask() &&
        range_covers_byte(address, len, PCI_COMMAND + 1)) {
        bool intx_masked = (pci_get_word(pci_dev->config + PCI_COMMAND) &
                            PCI_COMMAND_INTX_DISABLE);

        if (intx_masked != !!(old_cmd & PCI_COMMAND_INTX_DISABLE)) {
            ret = kvm_device_intx_set_mask(kvm_state, assigned_dev->dev_id,
                                           intx_masked);
            if (ret) {
                perror("assigned_dev_pci_write_config: set intx mask");
            }
        }
    }
    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
        if (range_covers_byte(address, len,
                              pci_dev->msi_cap + PCI_MSI_FLAGS)) {
            assigned_dev_update_msi(pci_dev);
        } else if (ranges_overlap(address, len, /* 32bit MSI only */
                                  pci_dev->msi_cap + PCI_MSI_ADDRESS_LO, 6)) {
            assigned_dev_update_msi_msg(pci_dev);
        }
    }
    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
        if (range_covers_byte(address, len,
                              pci_dev->msix_cap + PCI_MSIX_FLAGS + 1)) {
            assigned_dev_update_msix(pci_dev);
        }
    }

    emulate_mask = 0;
    memcpy(&emulate_mask, assigned_dev->emulate_config_write + address, len);
    emulate_mask = le32_to_cpu(emulate_mask);

    full_emulation_mask = 0xffffffff >> (32 - len * 8);

    if (emulate_mask != full_emulation_mask) {
        if (emulate_mask) {
            val &= ~emulate_mask;
            val |= assigned_dev_pci_read(pci_dev, address, len) & emulate_mask;
        }
        assigned_dev_pci_write(pci_dev, address, val, len);
    }
}

static void assigned_dev_setup_cap_read(AssignedDevice *dev, uint32_t offset,
                                        uint32_t len)
{
    assigned_dev_direct_config_read(dev, offset, len);
    assigned_dev_emulate_config_read(dev, offset + PCI_CAP_LIST_NEXT, 1);
}

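/*
 * Rebuild the capability list the guest sees: only capabilities that
 * pci-assign knows how to virtualize (MSI, MSI-X, PM, PCIe, PCI-X, VPD,
 * vendor-specific) are re-added to the emulated config space.
 */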
static int assigned_device_pci_cap_init(PCIDevice *pci_dev, Error **errp)
{
    AssignedDevice *dev = PCI_ASSIGN(pci_dev);
    PCIRegion *pci_region = dev->real_device.regions;
    int ret, pos;
    Error *local_err = NULL;

    /* Clear initial capabilities pointer and status copied from hw */
    pci_set_byte(pci_dev->config + PCI_CAPABILITY_LIST, 0);
    pci_set_word(pci_dev->config + PCI_STATUS,
                 pci_get_word(pci_dev->config + PCI_STATUS) &
                 ~PCI_STATUS_CAP_LIST);

    /* Expose MSI capability
     * MSI capability is the 1st capability in capability config */
    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSI, 0);
    if (pos != 0 && kvm_check_extension(kvm_state, KVM_CAP_ASSIGN_DEV_IRQ)) {
        verify_irqchip_in_kernel(&local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -ENOTSUP;
        }
        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI;
        /* Only 32-bit/no-mask currently supported */
        ret = pci_add_capability2(pci_dev, PCI_CAP_ID_MSI, pos, 10,
                                  &local_err);
        if (ret < 0) {
            error_propagate(errp, local_err);
            return ret;
        }
        pci_dev->msi_cap = pos;

        pci_set_word(pci_dev->config + pos + PCI_MSI_FLAGS,
                     pci_get_word(pci_dev->config + pos + PCI_MSI_FLAGS) &
                     PCI_MSI_FLAGS_QMASK);
        pci_set_long(pci_dev->config + pos + PCI_MSI_ADDRESS_LO, 0);
        pci_set_word(pci_dev->config + pos + PCI_MSI_DATA_32, 0);

        /* Set writable fields */
        pci_set_word(pci_dev->wmask + pos + PCI_MSI_FLAGS,
                     PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
        pci_set_long(pci_dev->wmask + pos + PCI_MSI_ADDRESS_LO, 0xfffffffc);
        pci_set_word(pci_dev->wmask + pos + PCI_MSI_DATA_32, 0xffff);
    }

    /* Expose MSI-X capability */
    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX, 0);
    if (pos != 0 && kvm_device_msix_supported(kvm_state)) {
        int bar_nr;
        uint32_t msix_table_entry;
        uint16_t msix_max;

        verify_irqchip_in_kernel(&local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -ENOTSUP;
        }
        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX;
        ret = pci_add_capability2(pci_dev, PCI_CAP_ID_MSIX, pos, 12,
                                  &local_err);
        if (ret < 0) {
            error_propagate(errp, local_err);
            return ret;
        }
        pci_dev->msix_cap = pos;

        msix_max = (pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) &
                    PCI_MSIX_FLAGS_QSIZE) + 1;
        msix_max = MIN(msix_max, KVM_MAX_MSIX_PER_DEV);
        pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS, msix_max - 1);

        /* Only enable and function mask bits are writable */
        pci_set_word(pci_dev->wmask + pos + PCI_MSIX_FLAGS,
                     PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);

        msix_table_entry = pci_get_long(pci_dev->config + pos +
                                        PCI_MSIX_TABLE);
        bar_nr = msix_table_entry & PCI_MSIX_FLAGS_BIRMASK;
        msix_table_entry &= ~PCI_MSIX_FLAGS_BIRMASK;
        dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
        dev->msix_max = msix_max;
    }

    /* Minimal PM support, nothing writable, device appears to NAK changes */
    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM, 0);
    if (pos) {
        uint16_t pmc;

        ret = pci_add_capability2(pci_dev, PCI_CAP_ID_PM, pos, PCI_PM_SIZEOF,
                                  &local_err);
        if (ret < 0) {
            error_propagate(errp, local_err);
            return ret;
        }

        assigned_dev_setup_cap_read(dev, pos, PCI_PM_SIZEOF);

        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);

        /* assign_device will bring the device up to D0, so we don't need
         * to worry about doing that ourselves here. */
        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL,
                     PCI_PM_CTRL_NO_SOFT_RESET);

        pci_set_byte(pci_dev->config + pos + PCI_PM_PPB_EXTENSIONS, 0);
        pci_set_byte(pci_dev->config + pos + PCI_PM_DATA_REGISTER, 0);
    }

    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP, 0);
    if (pos) {
        uint8_t version, size = 0;
        uint16_t type, devctl, lnksta;
        uint32_t devcap, lnkcap;

        version = pci_get_byte(pci_dev->config + pos + PCI_EXP_FLAGS);
        version &= PCI_EXP_FLAGS_VERS;
        if (version == 1) {
            size = 0x14;
        } else if (version == 2) {
            /*
             * Check for a non-standard size; accept a size reduced to
             * 0x34, which is what the bcm5761 implemented, violating the
             * PCIe v3.0 spec that the registers should exist and read as
             * 0 rather than be optionally provided, shortening the struct.
             */
            size = MIN(0x3c, PCI_CONFIG_SPACE_SIZE - pos);
            if (size < 0x34) {
                error_setg(errp, "Invalid size PCIe cap-id 0x%x",
                           PCI_CAP_ID_EXP);
                return -EINVAL;
            } else if (size != 0x3c) {
                error_report("WARNING, %s: PCIe cap-id 0x%x has "
                             "non-standard size 0x%x; std size should be 0x3c",
                             __func__, PCI_CAP_ID_EXP, size);
            }
        } else if (version == 0) {
            uint16_t vid, did;

            vid = pci_get_word(pci_dev->config + PCI_VENDOR_ID);
            did = pci_get_word(pci_dev->config + PCI_DEVICE_ID);
            if (vid == PCI_VENDOR_ID_INTEL && did == 0x10ed) {
                /*
                 * quirk for Intel 82599 VF with invalid PCIe capability
                 * version, should really be version 2 (same as PF)
                 */
                size = 0x3c;
            }
        }

        if (size == 0) {
            error_setg(errp, "Unsupported PCI express capability version %d",
                       version);
            return -EINVAL;
        }

        ret = pci_add_capability2(pci_dev, PCI_CAP_ID_EXP, pos, size,
                                  &local_err);
        if (ret < 0) {
            error_propagate(errp, local_err);
            return ret;
        }

        assigned_dev_setup_cap_read(dev, pos, size);

        type = pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS);
        type = (type & PCI_EXP_FLAGS_TYPE) >> 4;
        if (type != PCI_EXP_TYPE_ENDPOINT &&
            type != PCI_EXP_TYPE_LEG_END && type != PCI_EXP_TYPE_RC_END) {
            error_setg(errp, "Device assignment only supports endpoint "
                       "assignment, device type %d", type);
            return -EINVAL;
        }

        /* capabilities, pass existing read-only copy
         * PCI_EXP_FLAGS_IRQ: updated by hardware, should be direct read */

        /* device capabilities: hide FLR */
        devcap = pci_get_long(pci_dev->config + pos + PCI_EXP_DEVCAP);
        devcap &= ~PCI_EXP_DEVCAP_FLR;
        pci_set_long(pci_dev->config + pos + PCI_EXP_DEVCAP, devcap);

        /* device control: clear all error reporting enable bits, leaving
         * only a few host values. Note, these are all writable, but not
         * passed to hw.
         */
        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
                 PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);

        /* Clear device status */
        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVSTA, 0);

        /* Link capabilities, expose links and latencies, clear reporting */
        lnkcap = pci_get_long(pci_dev->config + pos + PCI_EXP_LNKCAP);
        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
                   PCI_EXP_LNKCAP_L1EL);
        pci_set_long(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);

        /* Link control, pass existing read-only copy. Should be writable? */

        /* Link status, only expose current speed and width */
        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);

        if (version >= 2) {
            /* Slot capabilities, control, status - not needed for endpoints */
            pci_set_long(pci_dev->config + pos + PCI_EXP_SLTCAP, 0);
            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTCTL, 0);
            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTSTA, 0);

            /* Root control, capabilities, status - not needed for endpoints */
            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCTL, 0);
            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCAP, 0);
            pci_set_long(pci_dev->config + pos + PCI_EXP_RTSTA, 0);

            /* Device capabilities/control 2, pass existing read-only copy */
            /* Link control 2, pass existing read-only copy */
        }
    }

    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX, 0);
    if (pos) {
        uint16_t cmd;
        uint32_t status;

        /* Only expose the minimum, 8 byte capability */
        ret = pci_add_capability2(pci_dev, PCI_CAP_ID_PCIX, pos, 8,
                                  &local_err);
        if (ret < 0) {
            error_propagate(errp, local_err);
            return ret;
        }

        assigned_dev_setup_cap_read(dev, pos, 8);

        /* Command register, clear upper bits, including extended modes */
        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
                PCI_X_CMD_MAX_SPLIT);
        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);

        /* Status register, update with emulated PCI bus location, clear
         * error bits, leave the rest. */
        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
        status |= pci_requester_id(pci_dev);
        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
                    PCI_X_STATUS_SPL_ERR);
        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
    }

    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD, 0);
    if (pos) {
        /* Direct R/W passthrough */
        ret = pci_add_capability2(pci_dev, PCI_CAP_ID_VPD, pos, 8,
                                  &local_err);
        if (ret < 0) {
            error_propagate(errp, local_err);
            return ret;
        }

        assigned_dev_setup_cap_read(dev, pos, 8);

        /* direct write for cap content */
        assigned_dev_direct_config_write(dev, pos + 2, 6);
    }

    /* Devices can have multiple vendor capabilities, get them all */
    for (pos = 0; (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos));
         pos += PCI_CAP_LIST_NEXT) {
        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);

        /* Direct R/W passthrough */
        ret = pci_add_capability2(pci_dev, PCI_CAP_ID_VNDR, pos, len,
                                  &local_err);
        if (ret < 0) {
            error_propagate(errp, local_err);
            return ret;
        }

        assigned_dev_setup_cap_read(dev, pos, len);

        /* direct write for cap content */
        assigned_dev_direct_config_write(dev, pos + 2, len - 2);
    }

    /* If real and virtual capability list status bits differ, virtualize the
     * access. */
    if ((pci_get_word(pci_dev->config + PCI_STATUS) & PCI_STATUS_CAP_LIST) !=
        (assigned_dev_pci_read_byte(pci_dev, PCI_STATUS) &
         PCI_STATUS_CAP_LIST)) {
        dev->emulate_config_read[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
    }

    return 0;
}
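
/*
 * MMIO emulation of the MSI-X vector table.  Guest accesses are serviced
 * from a shadow copy of the table rather than the device's real table.
 */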
static uint64_t
assigned_dev_msix_mmio_read(void *opaque, hwaddr addr,
                            unsigned size)
{
    AssignedDevice *adev = opaque;
    uint64_t val;

    memcpy(&val, (void *)((uint8_t *)adev->msix_table + addr), size);

    return val;
}
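
/*
 * Writes update the shadow table.  While MSI-X is enabled, a mask-to-unmask
 * transition either reprograms the existing KVM routing entry in place or,
 * for a vector with no route yet, triggers a full MSI-X reconfiguration.
 */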
static void assigned_dev_msix_mmio_write(void *opaque, hwaddr addr,
                                         uint64_t val, unsigned size)
{
    AssignedDevice *adev = opaque;
    PCIDevice *pdev = &adev->dev;
    uint16_t ctrl;
    MSIXTableEntry orig;
    int i = addr >> 4;

    if (i >= adev->msix_max) {
        return; /* Drop write */
    }

    ctrl = pci_get_word(pdev->config + pdev->msix_cap + PCI_MSIX_FLAGS);

    DEBUG("write to MSI-X table offset 0x%lx, val 0x%lx\n", addr, val);

    if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
        orig = adev->msix_table[i];
    }

    memcpy((uint8_t *)adev->msix_table + addr, &val, size);

    if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
        MSIXTableEntry *entry = &adev->msix_table[i];

        if (!assigned_dev_msix_masked(&orig) &&
            assigned_dev_msix_masked(entry)) {
            /*
             * Vector masked, disable it
             *
             * XXX It's not clear if we can or should actually attempt
             * to mask or disable the interrupt.  KVM doesn't have
             * support for pending bits and kvm_assign_set_msix_entry
             * doesn't modify the device hardware mask.  Interrupts
             * while masked are simply not injected to the guest, so
             * are lost.  Can we get away with always injecting an
             * interrupt on unmask?
             */
        } else if (assigned_dev_msix_masked(&orig) &&
                   !assigned_dev_msix_masked(entry)) {
            /* Vector unmasked */
            if (i >= adev->msi_virq_nr || adev->msi_virq[i] < 0) {
                /* Previously unassigned vector, start from scratch */
                assigned_dev_update_msix(pdev);
                return;
            } else {
                /* Update an existing, previously masked vector */
                MSIMessage msg;
                int ret;

                msg.address = entry->addr_lo |
                    ((uint64_t)entry->addr_hi << 32);
                msg.data = entry->data;

                ret = kvm_irqchip_update_msi_route(kvm_state,
                                                   adev->msi_virq[i], msg,
                                                   pdev);
                if (ret) {
                    error_report("Error updating irq routing entry (%d)", ret);
                }
            }
        }
    }
}

static const MemoryRegionOps assigned_dev_msix_mmio_ops = {
    .read = assigned_dev_msix_mmio_read,
    .write = assigned_dev_msix_mmio_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 8,
    },
    .impl = {
        .min_access_size = 4,
        .max_access_size = 8,
    },
};
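
/*
 * Per the PCI spec, MSI-X vectors come out of reset masked; reflect that in
 * the shadow table by setting the Mask bit in each vector control word.
 */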
static void assigned_dev_msix_reset(AssignedDevice *dev)
{
    MSIXTableEntry *entry;
    int i;

    if (!dev->msix_table) {
        return;
    }

    memset(dev->msix_table, 0, MSIX_PAGE_SIZE);

    for (i = 0, entry = dev->msix_table; i < dev->msix_max; i++, entry++) {
        entry->ctrl = cpu_to_le32(0x1); /* Masked */
    }
}
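
/*
 * The shadow MSI-X table is backed by an anonymous mapping rather than the
 * device's real table, so guest accesses never touch the hardware directly.
 */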
static void assigned_dev_register_msix_mmio(AssignedDevice *dev, Error **errp)
{
    dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, PROT_READ|PROT_WRITE,
                           MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
    if (dev->msix_table == MAP_FAILED) {
        error_setg_errno(errp, errno, "failed to allocate msix_table");
        dev->msix_table = NULL;
        return;
    }

    assigned_dev_msix_reset(dev);

    memory_region_init_io(&dev->mmio, OBJECT(dev), &assigned_dev_msix_mmio_ops,
                          dev, "assigned-dev-msix", MSIX_PAGE_SIZE);
}

static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
{
    if (!dev->msix_table) {
        return;
    }

    if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) {
        error_report("error unmapping msix_table! %s", strerror(errno));
    }
    dev->msix_table = NULL;
}
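
/* Assigned devices depend on host-side state and cannot be migrated. */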
static const VMStateDescription vmstate_assigned_device = {
    .name = "pci-assign",
    .unmigratable = 1,
};
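
/*
 * Guest-initiated reset: quiesce MSI/MSI-X, attempt a host device reset
 * through pci-sysfs, and clear bus mastering to stop ongoing DMA.
 */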
static void reset_assigned_device(DeviceState *dev)
{
    PCIDevice *pci_dev = PCI_DEVICE(dev);
    AssignedDevice *adev = PCI_ASSIGN(pci_dev);
    char reset_file[64];
    const char reset[] = "1";
    int fd, ret;

    /*
     * If a guest is reset without being shut down, MSI/MSI-X can still
     * be running.  We want to return the device to a known state on
     * reset, so disable those here.  We especially do not want MSI-X
     * enabled since it lives in MMIO space, which is about to get
     * disabled.
     */
    if (adev->assigned_irq_type == ASSIGNED_IRQ_MSIX) {
        uint16_t ctrl = pci_get_word(pci_dev->config +
                                     pci_dev->msix_cap + PCI_MSIX_FLAGS);

        pci_set_word(pci_dev->config + pci_dev->msix_cap + PCI_MSIX_FLAGS,
                     ctrl & ~PCI_MSIX_FLAGS_ENABLE);
        assigned_dev_update_msix(pci_dev);
    } else if (adev->assigned_irq_type == ASSIGNED_IRQ_MSI) {
        uint8_t ctrl = pci_get_byte(pci_dev->config +
                                    pci_dev->msi_cap + PCI_MSI_FLAGS);

        pci_set_byte(pci_dev->config + pci_dev->msi_cap + PCI_MSI_FLAGS,
                     ctrl & ~PCI_MSI_FLAGS_ENABLE);
        assigned_dev_update_msi(pci_dev);
    }

    snprintf(reset_file, sizeof(reset_file),
             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset",
             adev->host.domain, adev->host.bus, adev->host.slot,
             adev->host.function);

    /*
     * Issue a device reset via pci-sysfs.  Note that we use write(2) here
     * and ignore the return value because some kernels have a bug that
     * returns 0 rather than bytes written on success, sending us into an
     * infinite retry loop using other write mechanisms.
     */
    fd = open(reset_file, O_WRONLY);
    if (fd != -1) {
        ret = write(fd, reset, strlen(reset));
        (void)ret;
        close(fd);
    }

    /*
     * When a 0 is written to the bus master register, the device is logically
     * disconnected from the PCI bus.  This avoids further DMA transfers.
     */
    assigned_dev_pci_write_config(pci_dev, PCI_COMMAND, 0, 1);
}
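
/*
 * Realize: validate the configuration, set up config space filtering, map
 * the host device's resources, and attach the device to the guest via KVM
 * device assignment, starting out on legacy INTx until the guest enables
 * MSI or MSI-X.
 */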
static void assigned_realize(struct PCIDevice *pci_dev, Error **errp)
{
    AssignedDevice *dev = PCI_ASSIGN(pci_dev);
    uint8_t e_intx;
    int r;
    Error *local_err = NULL;

    if (!kvm_enabled()) {
        error_setg(&local_err, "pci-assign requires KVM support");
        goto exit_with_error;
    }

    if (!dev->host.domain && !dev->host.bus && !dev->host.slot &&
        !dev->host.function) {
        error_setg(&local_err, "no host device specified");
        goto exit_with_error;
    }

    /*
     * Set up basic config space access control.  Will be further refined
     * during device initialization.
     */
    assigned_dev_emulate_config_read(dev, 0, PCI_CONFIG_SPACE_SIZE);
    assigned_dev_direct_config_read(dev, PCI_STATUS, 2);
    assigned_dev_direct_config_read(dev, PCI_REVISION_ID, 1);
    assigned_dev_direct_config_read(dev, PCI_CLASS_PROG, 3);
    assigned_dev_direct_config_read(dev, PCI_CACHE_LINE_SIZE, 1);
    assigned_dev_direct_config_read(dev, PCI_LATENCY_TIMER, 1);
    assigned_dev_direct_config_read(dev, PCI_BIST, 1);
    assigned_dev_direct_config_read(dev, PCI_CARDBUS_CIS, 4);
    assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_VENDOR_ID, 2);
    assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_ID, 2);
    assigned_dev_direct_config_read(dev, PCI_CAPABILITY_LIST + 1, 7);
    assigned_dev_direct_config_read(dev, PCI_MIN_GNT, 1);
    assigned_dev_direct_config_read(dev, PCI_MAX_LAT, 1);
    memcpy(dev->emulate_config_write, dev->emulate_config_read,
           sizeof(dev->emulate_config_read));

    get_real_device(dev, &local_err);
    if (local_err) {
        goto out;
    }

    if (assigned_device_pci_cap_init(pci_dev, &local_err) < 0) {
        goto out;
    }

    /* intercept MSI-X entry page in the MMIO */
    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
        assigned_dev_register_msix_mmio(dev, &local_err);
        if (local_err) {
            goto out;
        }
    }

    /* handle real device's MMIO/PIO BARs */
    assigned_dev_register_regions(dev->real_device.regions,
                                  dev->real_device.region_number, dev,
                                  &local_err);
    if (local_err) {
        goto out;
    }

    /* handle interrupt routing */
    e_intx = dev->dev.config[PCI_INTERRUPT_PIN] - 1;
    dev->intpin = e_intx;
    dev->intx_route.mode = PCI_INTX_DISABLED;
    dev->intx_route.irq = -1;

    /* assign device to guest */
    assign_device(dev, &local_err);
    if (local_err) {
        goto out;
    }

    /* assign legacy INTx to the device */
    r = assign_intx(dev, &local_err);
    if (r < 0) {
        goto assigned_out;
    }

    assigned_dev_load_option_rom(dev);

    return;

assigned_out:
    deassign_device(dev);

out:
    free_assigned_device(dev);

exit_with_error:
    assert(local_err);
    error_propagate(errp, local_err);
}
static void assigned_exitfn(struct PCIDevice *pci_dev)
{
    AssignedDevice *dev = PCI_ASSIGN(pci_dev);

    deassign_device(dev);
    free_assigned_device(dev);
}

static void assigned_dev_instance_init(Object *obj)
{
    PCIDevice *pci_dev = PCI_DEVICE(obj);
    AssignedDevice *d = PCI_ASSIGN(pci_dev);

    device_add_bootindex_property(obj, &d->bootindex,
                                  "bootindex", NULL,
                                  &pci_dev->qdev, NULL);
}

static Property assigned_dev_properties[] = {
    DEFINE_PROP_PCI_HOST_DEVADDR("host", AssignedDevice, host),
    DEFINE_PROP_BIT("prefer_msi", AssignedDevice, features,
                    ASSIGNED_DEVICE_PREFER_MSI_BIT, false),
    DEFINE_PROP_BIT("share_intx", AssignedDevice, features,
                    ASSIGNED_DEVICE_SHARE_INTX_BIT, true),
    DEFINE_PROP_STRING("configfd", AssignedDevice, configfd_name),
    DEFINE_PROP_END_OF_LIST(),
};
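
/*
 * Example usage (a sketch; the host address below is hypothetical):
 *
 *   qemu-system-x86_64 ... -device pci-assign,host=01:00.0
 *
 * "configfd" may name a pre-opened PCI config space file descriptor;
 * "prefer_msi" and "share_intx" tune interrupt delivery.
 */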
static void assign_class_init(ObjectClass *klass, void *data)
{
    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
    DeviceClass *dc = DEVICE_CLASS(klass);

    k->realize = assigned_realize;
    k->exit = assigned_exitfn;
    k->config_read = assigned_dev_pci_read_config;
    k->config_write = assigned_dev_pci_write_config;
    dc->props = assigned_dev_properties;
    dc->vmsd = &vmstate_assigned_device;
    dc->reset = reset_assigned_device;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    dc->desc = "KVM-based PCI passthrough";
}

static const TypeInfo assign_info = {
    .name = TYPE_PCI_ASSIGN,
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(AssignedDevice),
    .class_init = assign_class_init,
    .instance_init = assigned_dev_instance_init,
};

static void assign_register_types(void)
{
    type_register_static(&assign_info);
}

type_init(assign_register_types)
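
/*
 * Load the option ROM via the shared pci-assign helper; a size of zero
 * indicates the ROM could not be loaded.
 */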
static void assigned_dev_load_option_rom(AssignedDevice *dev)
{
    int size = 0;

    pci_assign_dev_load_option_rom(&dev->dev, OBJECT(dev), &size,
                                   dev->host.domain, dev->host.bus,
                                   dev->host.slot, dev->host.function);

    if (!size) {
        error_report("pci-assign: Invalid ROM.");
    }
}