
Merge tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu into staging

virtio,pc,pci: features, cleanups, fixes

more memslots support in libvhost-user
support PCIe Gen5/Gen6 link speeds in pcie
more traces in vdpa
network simulation devices support in vdpa
SMBIOS type 9 descriptor implementation
Bump max_cpus to 4096 vcpus in q35
aw-bits and granule options in VIRTIO-IOMMU
Support reporting NUMA nodes for device memory using GI in acpi
Beginning of shutdown event support in pvpanic

fixes, cleanups all over the place.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

# -----BEGIN PGP SIGNATURE-----
#
# iQFDBAABCAAtFiEEXQn9CHHI+FuUyooNKB8NuNKNVGkFAmXw0TMPHG1zdEByZWRo
# YXQuY29tAAoJECgfDbjSjVRp8x4H+gLMoGwaGAX7gDGPgn2Ix4j/3kO77ZJ9X9k/
# 1KqZu/9eMS1j2Ei+vZqf05w7qRjxxhwDq3ilEXF/+UFqgAehLqpRRB8j5inqvzYt
# +jv0DbL11PBp/oFjWcytm5CbiVsvq8KlqCF29VNzc162XdtcduUOWagL96y8lJfZ
# uPrOoyeR7SMH9lp3LLLHWgu+9W4nOS03RroZ6Umj40y5B7yR0Rrppz8lMw5AoQtr
# 0gMRnFhYXeiW6CXdz+Tzcr7XfvkkYDi/j7ibiNSURLBfOpZa6Y8+kJGKxz5H1K1G
# 6ZY4PBcOpQzl+NMrktPHogczgJgOK10t+1i/R3bGZYw2Qn/93Eg=
# =C0UU
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 12 Mar 2024 22:03:31 GMT
# gpg:                using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469
# gpg:                issuer "mst@redhat.com"
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg:                 aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17  0970 C350 3912 AFBE 8E67
#      Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA  8A0D 281F 0DB8 D28D 5469

* tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu: (68 commits)
  docs/specs/pvpanic: document shutdown event
  hw/cxl: Fix missing reserved data in CXL Device DVSEC
  hmat acpi: Fix out of bounds access due to missing use of indirection
  hmat acpi: Do not add Memory Proximity Domain Attributes Structure targetting non existent memory.
  qemu-options.hx: Document the virtio-iommu-pci aw-bits option
  hw/arm/virt: Set virtio-iommu aw-bits default value to 48
  hw/i386/q35: Set virtio-iommu aw-bits default value to 39
  virtio-iommu: Add an option to define the input range width
  virtio-iommu: Trace domain range limits as unsigned int
  qemu-options.hx: Document the virtio-iommu-pci granule option
  virtio-iommu: Change the default granule to the host page size
  virtio-iommu: Add a granule property
  hw/i386/acpi-build: Add support for SRAT Generic Initiator structures
  hw/acpi: Implement the SRAT GI affinity structure
  qom: new object to associate device to NUMA node
  hw/i386/pc: Inline pc_cmos_init() into pc_cmos_init_late() and remove it
  hw/i386/pc: Set "normal" boot device order in pc_basic_device_init()
  hw/i386/pc: Avoid one use of the current_machine global
  hw/i386/pc: Remove "rtc_state" link again
  Revert "hw/i386/pc: Confine system flash handling to pc_sysfw"
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

# Conflicts:
#	hw/core/machine.c
Peter Maydell 1 year ago
parent
commit
6fc6931231
56 changed files with 1425 additions and 381 deletions
   1. MAINTAINERS (+5 -0)
   2. docs/interop/vhost-user.rst (+3 -1)
   3. docs/specs/pvpanic.rst (+2 -0)
   4. docs/system/device-emulation.rst (+1 -0)
   5. docs/system/devices/vdpa-net.rst (+121 -0)
   6. hw/acpi/acpi_generic_initiator.c (+148 -0)
   7. hw/acpi/hmat.c (+13 -2)
   8. hw/acpi/meson.build (+1 -0)
   9. hw/arm/virt-acpi-build.c (+3 -0)
  10. hw/arm/virt.c (+17 -0)
  11. hw/audio/virtio-snd.c (+5 -2)
  12. hw/core/machine.c (+3 -0)
  13. hw/core/numa.c (+2 -1)
  14. hw/core/qdev-properties-system.c (+14 -2)
  15. hw/cxl/cxl-component-utils.c (+17 -4)
  16. hw/i386/acpi-build.c (+3 -0)
  17. hw/i386/pc.c (+7 -23)
  18. hw/i386/pc_piix.c (+1 -2)
  19. hw/i386/pc_q35.c (+11 -3)
  20. hw/i386/pc_sysfw.c (+13 -4)
  21. hw/net/igb.c (+0 -2)
  22. hw/net/virtio-net.c (+16 -0)
  23. hw/nvme/ctrl.c (+8 -22)
  24. hw/pci-bridge/pci_expander_bridge.c (+1 -1)
  25. hw/pci/pci.c (+1 -0)
  26. hw/pci/pcie.c (+8 -0)
  27. hw/pci/pcie_sriov.c (+23 -9)
  28. hw/smbios/smbios.c (+142 -0)
  29. hw/virtio/trace-events (+4 -3)
  30. hw/virtio/vhost-user.c (+19 -2)
  31. hw/virtio/vhost-vdpa.c (+33 -11)
  32. hw/virtio/virtio-iommu.c (+32 -4)
  33. hw/virtio/virtio-pci.c (+186 -3)
  34. hw/virtio/virtio.c (+39 -0)
  35. include/hw/acpi/acpi_generic_initiator.h (+47 -0)
  36. include/hw/audio/virtio-snd.h (+1 -0)
  37. include/hw/cxl/cxl_component.h (+1 -0)
  38. include/hw/cxl/cxl_pci.h (+2 -1)
  39. include/hw/firmware/smbios.h (+17 -0)
  40. include/hw/i386/pc.h (+2 -2)
  41. include/hw/pci/pcie_regs.h (+2 -0)
  42. include/hw/pci/pcie_sriov.h (+2 -2)
  43. include/hw/virtio/vhost-vdpa.h (+9 -0)
  44. include/hw/virtio/virtio-iommu.h (+3 -0)
  45. include/hw/virtio/virtio-pci.h (+5 -0)
  46. include/hw/virtio/virtio.h (+19 -0)
  47. include/standard-headers/linux/virtio_pci.h (+7 -0)
  48. include/sysemu/numa.h (+1 -0)
  49. net/trace-events (+6 -0)
  50. net/vhost-vdpa.c (+28 -2)
  51. qapi/common.json (+5 -1)
  52. qapi/qom.json (+17 -0)
  53. qemu-options.hx (+14 -0)
  54. subprojects/libvhost-user/libvhost-user.c (+328 -267)
  55. subprojects/libvhost-user/libvhost-user.h (+6 -4)
  56. tests/qtest/virtio-iommu-test.c (+1 -1)

+ 5 - 0
MAINTAINERS

@@ -2370,6 +2370,11 @@ F: hw/virtio/vhost-user-scmi*
 F: include/hw/virtio/vhost-user-scmi.h
 F: tests/qtest/libqos/virtio-scmi.*
 
+vdpa-net
+M: Hao Chen <chenh@yusur.tech>
+S: Maintained
+F: docs/system/devices/vdpa-net.rst
+
 virtio-crypto
 M: Gonglei <arei.gonglei@huawei.com>
 S: Supported

+ 3 - 1
docs/interop/vhost-user.rst

@@ -1839,7 +1839,9 @@ is sent by the front-end.
   When the ``VHOST_USER_PROTOCOL_F_SHARED_OBJECT`` protocol
   feature has been successfully negotiated, this message can be submitted
   by the backend to remove themselves from to the virtio-dmabuf shared
-  table API. The shared table will remove the back-end device associated with
+  table API. Only the back-end owning the entry (i.e., the one that first added
+  it) will have permission to remove it. Otherwise, the message is ignored.
+  The shared table will remove the back-end device associated with
   the UUID. If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, and the
   back-end sets the ``VHOST_USER_NEED_REPLY`` flag, the front-end must respond
   with zero when operation is successfully completed, or non-zero otherwise.
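
Expressed as code, the ownership rule added above amounts to the following
sketch (the struct and helper names are illustrative, not QEMU's actual
virtio-dmabuf API):

    /* Hypothetical sketch: only the back-end that first added a UUID
     * entry may remove it; any other back-end's request is ignored. */
    typedef struct SharedEntry {
        unsigned char uuid[16];
        int owner_fd;              /* back-end that registered the entry */
    } SharedEntry;

    static bool shared_table_remove(SharedEntry *e, int requester_fd)
    {
        if (e->owner_fd != requester_fd) {
            return false;          /* not the owner: message is ignored */
        }
        /* ... drop the UUID -> resource mapping ... */
        return true;
    }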

+ 2 - 0
docs/specs/pvpanic.rst

@@ -29,6 +29,8 @@ bit 1
   a guest panic has happened and will be handled by the guest;
   the host should record it or report it, but should not affect
   the execution of the guest.
+bit 2
+  a regular guest shutdown has happened and should be processed by the host
 
 PCI Interface
 -------------
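
For orientation, a guest-side sketch of raising the new event through the
ISA variant of the device; 0x505 is the conventional pvpanic I/O port, and
the macro names here are illustrative, not taken from the patch:

    #include <sys/io.h>   /* outb(); x86 Linux, requires ioperm() first */

    #define PVPANIC_PORT            0x505
    #define PVPANIC_SHUTDOWN_EVENT  (1 << 2)   /* bit 2 documented above */

    static void pvpanic_signal_shutdown(void)
    {
        outb(PVPANIC_SHUTDOWN_EVENT, PVPANIC_PORT);
    }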

+ 1 - 0
docs/system/device-emulation.rst

@@ -99,3 +99,4 @@ Emulated Devices
    devices/canokey.rst
    devices/usb-u2f.rst
    devices/igb.rst
+   devices/vdpa-net.rst

+ 121 - 0
docs/system/devices/vdpa-net.rst

@@ -0,0 +1,121 @@
+vdpa net
+============
+
+This document explains the setup and usage of the vdpa network device.
+The vdpa network device is a paravirtualized vdpa emulated device.
+
+Description
+-----------
+
+VDPA net devices support dirty page bitmap marking, and vring state saving and recovery.
+
+Users can use this VDPA device for live migration simulation testing in a nested virtualization environment.
+
+Registers layout
+----------------
+
+The vdpa device adds the following live-migration register layout::
+
+  Offset       Register Name          Bitwidth     Associated vq
+  0x0          LM_LOGGING_CTRL        4bits
+  0x10         LM_BASE_ADDR_LOW       32bits
+  0x14         LM_BASE_ADDR_HIGH      32bits
+  0x18         LM_END_ADDR_LOW        32bits
+  0x1c         LM_END_ADDR_HIGH       32bits
+  0x20         LM_RING_STATE_OFFSET   32bits       vq0
+  0x24         LM_RING_STATE_OFFSET   32bits       vq1
+  0x28         LM_RING_STATE_OFFSET   32bits       vq2
+  ......
+  0x20+1023*4  LM_RING_STATE_OFFSET   32bits       vq1023
+
+These registers are appended at the end of the notify BAR space.
+
+Architecture diagram
+--------------------
+::
+
+  |------------------------------------------------------------------------|
+  | guest-L1-user-space                                                    |
+  |                                                                        |
+  |                               |----------------------------------------|
+  |                               |       [virtio-net driver]              |
+  |                               |              ^  guest-L2-src(iommu=on) |
+  |                               |--------------|-------------------------|
+  |                               |              |  qemu-L2-src(viommu)    |
+  | [dpdk-vdpa]<->[vhost socket]<-+->[vhost-user backend(iommu=on)]        |
+  --------------------------------------------------------------------------
+  --------------------------------------------------------------------------
+  |       ^                             guest-L1-kernel-space              |
+  |       |                                                                |
+  |    [VFIO]                                                              |
+  |       ^                                                                |
+  |       |                             guest-L1-src(iommu=on)             |
+  --------|-----------------------------------------------------------------
+  --------|-----------------------------------------------------------------
+  | [vdpa net device(iommu=on)]        [manager nic device]                |
+  |          |                                    |                        |
+  |          |                                    |                        |
+  |     [tap device]     qemu-L1-src(viommu)      |                        |
+  ------------------------------------------------+-------------------------
+                                                  |
+                                                  |
+                        ---------------------     |
+                        | kernel net bridge |<-----
+                        |     virbr0        |<----------------------------------
+                        ---------------------                                  |
+                                                                               |
+                                                                               |
+  --------------------------------------------------------------------------   |
+  | guest-L1-user-space                                                    |   |
+  |                                                                        |   |
+  |                               |----------------------------------------|   |
+  |                               |       [virtio-net driver]              |   |
+  |                               |              ^  guest-L2-dst(iommu=on) |   |
+  |                               |--------------|-------------------------|   |
+  |                               |              |  qemu-L2-dst(viommu)    |   |
+  | [dpdk-vdpa]<->[vhost socket]<-+->[vhost-user backend(iommu=on)]        |   |
+  --------------------------------------------------------------------------   |
+  --------------------------------------------------------------------------   |
+  |       ^                             guest-L1-kernel-space              |   |
+  |       |                                                                |   |
+  |    [VFIO]                                                              |   |
+  |       ^                                                                |   |
+  |       |                             guest-L1-dst(iommu=on)             |   |
+  --------|-----------------------------------------------------------------   |
+  --------|-----------------------------------------------------------------   |
+  | [vdpa net device(iommu=on)]        [manager nic device]----------------+----
+  |          |                                                             |
+  |          |                                                             |
+  |     [tap device]     qemu-L1-dst(viommu)                               |
+  --------------------------------------------------------------------------
+
+
+Device properties
+-----------------
+
+The Virtio vdpa device can be configured with the following properties:
+
+ * ``vdpa=on`` enables the emulated vdpa device.
+
+Usage
+-----
+This patch adds virtio SR-IOV support and vdpa live-migration support.
+You can enable vdpa by setting the domain XML file as follows::
+
+  <qemu:commandline  xmlns:qemu='http://libvirt.org/schemas/domain/qemu/1.0'>
+  <qemu:arg value='-device'/>
+  <qemu:arg value='intel-iommu,intremap=on,device-iotlb=on,aw-bits=48'/>
+  <qemu:arg value='-netdev'/>
+  <qemu:arg value='tap,id=hostnet1,script=no,downscript=no,vhost=off'/>
+  <qemu:arg value='-device'/>
+  <qemu:arg value='virtio-net-pci,netdev=hostnet1,id=net1,mac=56:4a:b7:4f:4d:a9,bus=pci.6,addr=0x0,iommu_platform=on,ats=on,vdpa=on'/>
+  </qemu:commandline>
+
+Limitations
+-----------
+1. Depends on a tap device with the ``vhost=off`` parameter.
+2. A nested virtualization environment only supports ``q35`` machines.
+3. Currently only split vring live migration is supported.
+
+
+
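
As a companion to the register table above, a driver-side sketch of arming
dirty-page logging; the offsets come from the table, while the helper and
the LM_ENABLE value (defined elsewhere in this series) are assumptions:

    #include <stdint.h>

    #define LM_LOGGING_CTRL          0x00
    #define LM_BASE_ADDR_LOW         0x10
    #define LM_BASE_ADDR_HIGH        0x14
    #define LM_END_ADDR_LOW          0x18
    #define LM_END_ADDR_HIGH         0x1c
    #define LM_RING_STATE_OFFSET(vq) (0x20 + 4 * (vq))   /* vq0 ... vq1023 */
    #define LM_ENABLE                0x01                /* assumed value */

    /* lm_base points at the registers appended to the notify BAR. */
    static void lm_start_logging(volatile uint8_t *lm_base,
                                 uint64_t bitmap_start, uint64_t bitmap_end)
    {
        *(volatile uint32_t *)(lm_base + LM_BASE_ADDR_LOW)  = (uint32_t)bitmap_start;
        *(volatile uint32_t *)(lm_base + LM_BASE_ADDR_HIGH) = bitmap_start >> 32;
        *(volatile uint32_t *)(lm_base + LM_END_ADDR_LOW)   = (uint32_t)bitmap_end;
        *(volatile uint32_t *)(lm_base + LM_END_ADDR_HIGH)  = bitmap_end >> 32;
        *(volatile uint32_t *)(lm_base + LM_LOGGING_CTRL)   = LM_ENABLE;
    }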

+ 148 - 0
hw/acpi/acpi_generic_initiator.c

@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include "qemu/osdep.h"
+#include "hw/acpi/acpi_generic_initiator.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/boards.h"
+#include "hw/pci/pci_device.h"
+#include "qemu/error-report.h"
+
+typedef struct AcpiGenericInitiatorClass {
+    ObjectClass parent_class;
+} AcpiGenericInitiatorClass;
+
+OBJECT_DEFINE_TYPE_WITH_INTERFACES(AcpiGenericInitiator, acpi_generic_initiator,
+                   ACPI_GENERIC_INITIATOR, OBJECT,
+                   { TYPE_USER_CREATABLE },
+                   { NULL })
+
+OBJECT_DECLARE_SIMPLE_TYPE(AcpiGenericInitiator, ACPI_GENERIC_INITIATOR)
+
+static void acpi_generic_initiator_init(Object *obj)
+{
+    AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj);
+
+    gi->node = MAX_NODES;
+    gi->pci_dev = NULL;
+}
+
+static void acpi_generic_initiator_finalize(Object *obj)
+{
+    AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj);
+
+    g_free(gi->pci_dev);
+}
+
+static void acpi_generic_initiator_set_pci_device(Object *obj, const char *val,
+                                                  Error **errp)
+{
+    AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj);
+
+    gi->pci_dev = g_strdup(val);
+}
+
+static void acpi_generic_initiator_set_node(Object *obj, Visitor *v,
+                                            const char *name, void *opaque,
+                                            Error **errp)
+{
+    AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj);
+    MachineState *ms = MACHINE(qdev_get_machine());
+    uint32_t value;
+
+    if (!visit_type_uint32(v, name, &value, errp)) {
+        return;
+    }
+
+    if (value >= MAX_NODES) {
+        error_printf("%s: Invalid NUMA node specified\n",
+                     TYPE_ACPI_GENERIC_INITIATOR);
+        exit(1);
+    }
+
+    gi->node = value;
+    ms->numa_state->nodes[gi->node].has_gi = true;
+}
+
+static void acpi_generic_initiator_class_init(ObjectClass *oc, void *data)
+{
+    object_class_property_add_str(oc, "pci-dev", NULL,
+        acpi_generic_initiator_set_pci_device);
+    object_class_property_add(oc, "node", "int", NULL,
+        acpi_generic_initiator_set_node, NULL, NULL);
+}
+
+/*
+ * ACPI 6.3:
+ * Table 5-78 Generic Initiator Affinity Structure
+ */
+static void
+build_srat_generic_pci_initiator_affinity(GArray *table_data, int node,
+                                          PCIDeviceHandle *handle)
+{
+    uint8_t index;
+
+    build_append_int_noprefix(table_data, 5, 1);  /* Type */
+    build_append_int_noprefix(table_data, 32, 1); /* Length */
+    build_append_int_noprefix(table_data, 0, 1);  /* Reserved */
+    build_append_int_noprefix(table_data, 1, 1);  /* Device Handle Type: PCI */
+    build_append_int_noprefix(table_data, node, 4);  /* Proximity Domain */
+
+    /* Device Handle - PCI */
+    build_append_int_noprefix(table_data, handle->segment, 2);
+    build_append_int_noprefix(table_data, handle->bdf, 2);
+    for (index = 0; index < 12; index++) {
+        build_append_int_noprefix(table_data, 0, 1);
+    }
+
+    build_append_int_noprefix(table_data, GEN_AFFINITY_ENABLED, 4); /* Flags */
+    build_append_int_noprefix(table_data, 0, 4);     /* Reserved */
+}
+
+static int build_all_acpi_generic_initiators(Object *obj, void *opaque)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    AcpiGenericInitiator *gi;
+    GArray *table_data = opaque;
+    PCIDeviceHandle dev_handle;
+    PCIDevice *pci_dev;
+    Object *o;
+
+    if (!object_dynamic_cast(obj, TYPE_ACPI_GENERIC_INITIATOR)) {
+        return 0;
+    }
+
+    gi = ACPI_GENERIC_INITIATOR(obj);
+    if (gi->node >= ms->numa_state->num_nodes) {
+        error_printf("%s: Specified node %d is invalid.\n",
+                     TYPE_ACPI_GENERIC_INITIATOR, gi->node);
+        exit(1);
+    }
+
+    o = object_resolve_path_type(gi->pci_dev, TYPE_PCI_DEVICE, NULL);
+    if (!o) {
+        error_printf("%s: Specified device must be a PCI device.\n",
+                     TYPE_ACPI_GENERIC_INITIATOR);
+        exit(1);
+    }
+
+    pci_dev = PCI_DEVICE(o);
+
+    dev_handle.segment = 0;
+    dev_handle.bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
+                                               pci_dev->devfn);
+
+    build_srat_generic_pci_initiator_affinity(table_data,
+                                              gi->node, &dev_handle);
+
+    return 0;
+}
+
+void build_srat_generic_pci_initiator(GArray *table_data)
+{
+    object_child_foreach_recursive(object_get_root(),
+                                   build_all_acpi_generic_initiators,
+                                   table_data);
+}

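The 32-byte record emitted by build_srat_generic_pci_initiator_affinity()
above corresponds to this layout (ACPI 6.3, Table 5-78), shown here as a
packed struct purely for illustration; QEMU builds it field by field with
build_append_int_noprefix():

    #include <stdint.h>

    struct srat_generic_initiator_affinity {
        uint8_t  type;                 /* 5 */
        uint8_t  length;               /* 32 */
        uint8_t  reserved0;
        uint8_t  device_handle_type;   /* 1 = PCI */
        uint32_t proximity_domain;     /* gi->node */
        uint16_t segment;              /* PCI device handle: segment */
        uint16_t bdf;                  /* PCI_BUILD_BDF(bus, devfn) */
        uint8_t  reserved1[12];
        uint32_t flags;                /* GEN_AFFINITY_ENABLED */
        uint32_t reserved2;
    } __attribute__((packed));         /* sizeof == 32 */

Such objects are created from the command line, e.g.
``-object acpi-generic-initiator,id=gi0,pci-dev=dev0,node=1`` (the
qapi/qom.json change later in this series defines the type).
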
+ 13 - 2
hw/acpi/hmat.c

@@ -78,6 +78,7 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb,
                           uint32_t *initiator_list)
 {
     int i, index;
+    uint32_t initiator_to_index[MAX_NODES] = {};
     HMAT_LB_Data *lb_data;
     uint16_t *entry_list;
     uint32_t base;
@@ -121,6 +122,8 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb,
     /* Initiator Proximity Domain List */
     for (i = 0; i < num_initiator; i++) {
         build_append_int_noprefix(table_data, initiator_list[i], 4);
+        /* Reverse mapping for array positions */
+        initiator_to_index[initiator_list[i]] = i;
     }
 
     /* Target Proximity Domain List */
@@ -132,7 +135,8 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb,
     entry_list = g_new0(uint16_t, num_initiator * num_target);
     for (i = 0; i < hmat_lb->list->len; i++) {
         lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);
-        index = lb_data->initiator * num_target + lb_data->target;
+        index = initiator_to_index[lb_data->initiator] * num_target +
+            lb_data->target;
 
         entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base);
     }
@@ -204,6 +208,13 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state)
     build_append_int_noprefix(table_data, 0, 4); /* Reserved */
 
     for (i = 0; i < numa_state->num_nodes; i++) {
+        /*
+         * Linux rejects the whole HMAT table if a node with no memory
+         * has one of these structures listing it as a target.
+         */
+        if (!numa_state->nodes[i].node_mem) {
+            continue;
+        }
         flags = 0;
 
         if (numa_state->nodes[i].initiator < MAX_NODES) {
@@ -214,7 +225,7 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state)
     }
 
     for (i = 0; i < numa_state->num_nodes; i++) {
-        if (numa_state->nodes[i].has_cpu) {
+        if (numa_state->nodes[i].has_cpu || numa_state->nodes[i].has_gi) {
             initiator_list[num_initiator++] = i;
         }
     }
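
A quick worked example of why the reverse mapping matters: with initiator
nodes {0, 2} (node 1 is memory-only) and three targets, the entry matrix
has two rows, so an entry for initiator node 2 must land in row 1; the old
code would have indexed row 2, past the end of the 6-entry array:

    uint32_t initiator_to_index[MAX_NODES] = {0};
    uint32_t initiator_list[] = { 0, 2 };   /* nodes with a CPU or GI */
    int num_initiator = 2, num_target = 3;

    for (int i = 0; i < num_initiator; i++) {
        initiator_to_index[initiator_list[i]] = i;
    }

    /* Entry for initiator node 2, target 1:
     * before: 2 * num_target + 1 == 7                     (out of bounds)
     * after:  initiator_to_index[2] * num_target + 1 == 4 (row 1, col 1) */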

+ 1 - 0
hw/acpi/meson.build

@@ -1,5 +1,6 @@
 acpi_ss = ss.source_set()
 acpi_ss.add(files(
+  'acpi_generic_initiator.c',
   'acpi_interface.c',
   'aml-build.c',
   'bios-linker-loader.c',

+ 3 - 0
hw/arm/virt-acpi-build.c

@@ -57,6 +57,7 @@
 #include "migration/vmstate.h"
 #include "migration/vmstate.h"
 #include "hw/acpi/ghes.h"
 #include "hw/acpi/ghes.h"
 #include "hw/acpi/viot.h"
 #include "hw/acpi/viot.h"
+#include "hw/acpi/acpi_generic_initiator.h"
 #include "hw/virtio/virtio-acpi.h"
 #include "hw/virtio/virtio-acpi.h"
 #include "target/arm/multiprocessing.h"
 #include "target/arm/multiprocessing.h"
 
 
@@ -504,6 +505,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
         }
         }
     }
     }
 
 
+    build_srat_generic_pci_initiator(table_data);
+
     if (ms->nvdimms_state->is_enabled) {
     if (ms->nvdimms_state->is_enabled) {
         nvdimm_build_srat(table_data);
         nvdimm_build_srat(table_data);
     }
     }

+ 17 - 0
hw/arm/virt.c

@@ -85,11 +85,28 @@
 #include "hw/char/pl011.h"
 #include "hw/char/pl011.h"
 #include "qemu/guest-random.h"
 #include "qemu/guest-random.h"
 
 
+static GlobalProperty arm_virt_compat[] = {
+    { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "48" },
+};
+static const size_t arm_virt_compat_len = G_N_ELEMENTS(arm_virt_compat);
+
+/*
+ * This cannot be called from the virt_machine_class_init() because
+ * TYPE_VIRT_MACHINE is abstract and mc->compat_props g_ptr_array_new()
+ * only is called on virt non abstract class init.
+ */
+static void arm_virt_compat_set(MachineClass *mc)
+{
+    compat_props_add(mc->compat_props, arm_virt_compat,
+                     arm_virt_compat_len);
+}
+
 #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
 #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
     static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
     static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
                                                     void *data) \
                                                     void *data) \
     { \
     { \
         MachineClass *mc = MACHINE_CLASS(oc); \
         MachineClass *mc = MACHINE_CLASS(oc); \
+        arm_virt_compat_set(mc); \
         virt_machine_##major##_##minor##_options(mc); \
         virt_machine_##major##_##minor##_options(mc); \
         mc->desc = "QEMU " # major "." # minor " ARM Virtual Machine"; \
         mc->desc = "QEMU " # major "." # minor " ARM Virtual Machine"; \
         if (latest) { \
         if (latest) { \

+ 5 - 2
hw/audio/virtio-snd.c

@@ -243,12 +243,13 @@ static void virtio_snd_handle_pcm_info(VirtIOSound *s,
         memset(&pcm_info[i].padding, 0, 5);
     }
 
+    cmd->payload_size = sizeof(virtio_snd_pcm_info) * count;
     cmd->resp.code = cpu_to_le32(VIRTIO_SND_S_OK);
     iov_from_buf(cmd->elem->in_sg,
                  cmd->elem->in_num,
                  sizeof(virtio_snd_hdr),
                  pcm_info,
-                 sizeof(virtio_snd_pcm_info) * count);
+                 cmd->payload_size);
 }
 
 /*
@@ -749,7 +750,8 @@ process_cmd(VirtIOSound *s, virtio_snd_ctrl_command *cmd)
                  0,
                  &cmd->resp,
                  sizeof(virtio_snd_hdr));
-    virtqueue_push(cmd->vq, cmd->elem, sizeof(virtio_snd_hdr));
+    virtqueue_push(cmd->vq, cmd->elem,
+                   sizeof(virtio_snd_hdr) + cmd->payload_size);
     virtio_notify(VIRTIO_DEVICE(s), cmd->vq);
 }
 
@@ -808,6 +810,7 @@ static void virtio_snd_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
         cmd->elem = elem;
         cmd->vq = vq;
         cmd->resp.code = cpu_to_le32(VIRTIO_SND_S_OK);
+        /* implicit cmd->payload_size = 0; */
         QTAILQ_INSERT_TAIL(&s->cmdq, cmd, next);
         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
     }

+ 3 - 0
hw/core/machine.c

@@ -30,10 +30,13 @@
 #include "exec/confidential-guest-support.h"
 #include "exec/confidential-guest-support.h"
 #include "hw/virtio/virtio-pci.h"
 #include "hw/virtio/virtio-pci.h"
 #include "hw/virtio/virtio-net.h"
 #include "hw/virtio/virtio-net.h"
+#include "hw/virtio/virtio-iommu.h"
 #include "audio/audio.h"
 #include "audio/audio.h"
 
 
 GlobalProperty hw_compat_8_2[] = {
 GlobalProperty hw_compat_8_2[] = {
     { "migration", "zero-page-detection", "legacy"},
     { "migration", "zero-page-detection", "legacy"},
+    { TYPE_VIRTIO_IOMMU_PCI, "granule", "4k" },
+    { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "64" },
 };
 };
 const size_t hw_compat_8_2_len = G_N_ELEMENTS(hw_compat_8_2);
 const size_t hw_compat_8_2_len = G_N_ELEMENTS(hw_compat_8_2);
 
 

+ 2 - 1
hw/core/numa.c

@@ -227,7 +227,8 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
                    node->target, numa_state->num_nodes);
         return;
     }
-    if (!numa_info[node->initiator].has_cpu) {
+    if (!numa_info[node->initiator].has_cpu &&
+        !numa_info[node->initiator].has_gi) {
         error_setg(errp, "Invalid initiator=%d, it isn't an "
                    "initiator proximity domain", node->initiator);
         return;

+ 14 - 2
hw/core/qdev-properties-system.c

@@ -966,7 +966,7 @@ const PropertyInfo qdev_prop_off_auto_pcibar = {
     .set_default_value = qdev_propinfo_set_default_value_enum,
 };
 
-/* --- PCIELinkSpeed 2_5/5/8/16 -- */
+/* --- PCIELinkSpeed 2_5/5/8/16/32/64 -- */
 
 static void get_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name,
                                    void *opaque, Error **errp)
@@ -988,6 +988,12 @@ static void get_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name,
     case QEMU_PCI_EXP_LNK_16GT:
         speed = PCIE_LINK_SPEED_16;
         break;
+    case QEMU_PCI_EXP_LNK_32GT:
+        speed = PCIE_LINK_SPEED_32;
+        break;
+    case QEMU_PCI_EXP_LNK_64GT:
+        speed = PCIE_LINK_SPEED_64;
+        break;
     default:
         /* Unreachable */
         abort();
@@ -1021,6 +1027,12 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name,
     case PCIE_LINK_SPEED_16:
         *p = QEMU_PCI_EXP_LNK_16GT;
         break;
+    case PCIE_LINK_SPEED_32:
+        *p = QEMU_PCI_EXP_LNK_32GT;
+        break;
+    case PCIE_LINK_SPEED_64:
+        *p = QEMU_PCI_EXP_LNK_64GT;
+        break;
     default:
         /* Unreachable */
         abort();
@@ -1029,7 +1041,7 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name,
 
 const PropertyInfo qdev_prop_pcie_link_speed = {
     .name = "PCIELinkSpeed",
-    .description = "2_5/5/8/16",
+    .description = "2_5/5/8/16/32/64",
     .enum_table = &PCIELinkSpeed_lookup,
     .get = get_prop_pcielinkspeed,
     .set = set_prop_pcielinkspeed,

+ 17 - 4
hw/cxl/cxl-component-utils.c

@@ -297,6 +297,7 @@ void cxl_component_register_init_common(uint32_t *reg_state,
         caps = 3;
         break;
     case CXL2_ROOT_PORT:
+    case CXL2_RC:
         /* + Extended Security, + Snoop */
         caps = 5;
         break;
@@ -326,8 +327,19 @@ void cxl_component_register_init_common(uint32_t *reg_state,
                        CXL_##reg##_REGISTERS_OFFSET);                         \
     } while (0)
 
+    switch (type) {
+    case CXL2_DEVICE:
+    case CXL2_TYPE3_DEVICE:
+    case CXL2_LOGICAL_DEVICE:
+    case CXL2_ROOT_PORT:
+    case CXL2_UPSTREAM_PORT:
+    case CXL2_DOWNSTREAM_PORT:
     init_cap_reg(RAS, 2, CXL_RAS_CAPABILITY_VERSION);
-    ras_init_common(reg_state, write_msk);
+        ras_init_common(reg_state, write_msk);
+        break;
+    default:
+        break;
+    }
 
     init_cap_reg(LINK, 4, CXL_LINK_CAPABILITY_VERSION);
 
@@ -335,9 +347,10 @@ void cxl_component_register_init_common(uint32_t *reg_state,
         return;
     }
 
-    init_cap_reg(HDM, 5, CXL_HDM_CAPABILITY_VERSION);
-    hdm_init_common(reg_state, write_msk, type);
-
+    if (type != CXL2_ROOT_PORT) {
+        init_cap_reg(HDM, 5, CXL_HDM_CAPABILITY_VERSION);
+        hdm_init_common(reg_state, write_msk, type);
+    }
     if (caps < 5) {
         return;
     }

+ 3 - 0
hw/i386/acpi-build.c

@@ -68,6 +68,7 @@
 #include "hw/acpi/utils.h"
 #include "hw/acpi/utils.h"
 #include "hw/acpi/pci.h"
 #include "hw/acpi/pci.h"
 #include "hw/acpi/cxl.h"
 #include "hw/acpi/cxl.h"
+#include "hw/acpi/acpi_generic_initiator.h"
 
 
 #include "qom/qom-qobject.h"
 #include "qom/qom-qobject.h"
 #include "hw/i386/amd_iommu.h"
 #include "hw/i386/amd_iommu.h"
@@ -2046,6 +2047,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
         build_srat_memory(table_data, 0, 0, 0, MEM_AFFINITY_NOFLAGS);
         build_srat_memory(table_data, 0, 0, 0, MEM_AFFINITY_NOFLAGS);
     }
     }
 
 
+    build_srat_generic_pci_initiator(table_data);
+
     /*
     /*
      * Entry is required for Windows to enable memory hotplug in OS
      * Entry is required for Windows to enable memory hotplug in OS
      * and for Linux to enable SWIOTLB when booted with less than
      * and for Linux to enable SWIOTLB when booted with less than

+ 7 - 23
hw/i386/pc.c

@@ -425,9 +425,10 @@ static void set_boot_dev(PCMachineState *pcms, MC146818RtcState *s,
 
 
 static void pc_boot_set(void *opaque, const char *boot_device, Error **errp)
 {
-    PCMachineState *pcms = PC_MACHINE(current_machine);
+    PCMachineState *pcms = opaque;
+    X86MachineState *x86ms = X86_MACHINE(pcms);
 
-    set_boot_dev(pcms, opaque, boot_device, errp);
+    set_boot_dev(pcms, MC146818_RTC(x86ms->rtc), boot_device, errp);
 }
 
 static void pc_cmos_init_floppy(MC146818RtcState *rtc_state, ISADevice *floppy)
@@ -569,14 +570,6 @@ static void pc_cmos_init_late(PCMachineState *pcms)
     mc146818rtc_set_cmos_data(s, 0x39, val);
 
     pc_cmos_init_floppy(s, pc_find_fdc0());
-}
-
-void pc_cmos_init(PCMachineState *pcms,
-                  ISADevice *rtc)
-{
-    int val;
-    X86MachineState *x86ms = X86_MACHINE(pcms);
-    MC146818RtcState *s = MC146818_RTC(rtc);
 
     /* various important CMOS locations needed by PC/Bochs bios */
 
@@ -613,22 +606,10 @@ void pc_cmos_init(PCMachineState *pcms,
     mc146818rtc_set_cmos_data(s, 0x5c, val >> 8);
     mc146818rtc_set_cmos_data(s, 0x5d, val >> 16);
 
-    object_property_add_link(OBJECT(pcms), "rtc_state",
-                             TYPE_ISA_DEVICE,
-                             (Object **)&x86ms->rtc,
-                             object_property_allow_set_link,
-                             OBJ_PROP_LINK_STRONG);
-    object_property_set_link(OBJECT(pcms), "rtc_state", OBJECT(s),
-                             &error_abort);
-
-    set_boot_dev(pcms, s, MACHINE(pcms)->boot_config.order, &error_fatal);
-
     val = 0;
     val |= 0x02; /* FPU is there */
     val |= 0x04; /* PS/2 mouse installed */
     mc146818rtc_set_cmos_data(s, REG_EQUIPMENT_BYTE, val);
-
-    /* hard drives and FDC are handled by pc_cmos_init_late() */
 }
 
 static void handle_a20_line_change(void *opaque, int irq, int level)
@@ -1261,7 +1242,9 @@ void pc_basic_device_init(struct PCMachineState *pcms,
     }
 #endif
 
-    qemu_register_boot_set(pc_boot_set, rtc_state);
+    qemu_register_boot_set(pc_boot_set, pcms);
+    set_boot_dev(pcms, MC146818_RTC(rtc_state),
+                 MACHINE(pcms)->boot_config.order, &error_fatal);
 
     if (!xen_enabled() &&
         (x86ms->pit == ON_OFF_AUTO_AUTO || x86ms->pit == ON_OFF_AUTO_ON)) {
@@ -1751,6 +1734,7 @@ static void pc_machine_initfn(Object *obj)
     pcms->fd_bootchk = true;
     pcms->default_bus_bypass_iommu = false;
 
+    pc_system_flash_create(pcms);
     pcms->pcspk = isa_new(TYPE_PC_SPEAKER);
     object_property_add_alias(OBJECT(pcms), "pcspk-audiodev",
                               OBJECT(pcms->pcspk), "audiodev");

+ 1 - 2
hw/i386/pc_piix.c

@@ -228,6 +228,7 @@ static void pc_init1(MachineState *machine, const char *pci_type)
         assert(machine->ram_size == x86ms->below_4g_mem_size +
                                     x86ms->above_4g_mem_size);
 
+        pc_system_flash_cleanup_unused(pcms);
         if (machine->kernel_filename != NULL) {
             /* For xen HVM direct kernel boot, load linux here */
             xen_load_linux(pcms);
@@ -343,8 +344,6 @@ static void pc_init1(MachineState *machine, const char *pci_type)
     }
 #endif
 
-    pc_cmos_init(pcms, x86ms->rtc);
-
     if (piix4_pm) {
         smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0);
 

+ 11 - 3
hw/i386/pc_q35.c

@@ -45,6 +45,7 @@
 #include "hw/i386/pc.h"
 #include "hw/i386/pc.h"
 #include "hw/i386/amd_iommu.h"
 #include "hw/i386/amd_iommu.h"
 #include "hw/i386/intel_iommu.h"
 #include "hw/i386/intel_iommu.h"
+#include "hw/virtio/virtio-iommu.h"
 #include "hw/display/ramfb.h"
 #include "hw/display/ramfb.h"
 #include "hw/ide/pci.h"
 #include "hw/ide/pci.h"
 #include "hw/ide/ahci-pci.h"
 #include "hw/ide/ahci-pci.h"
@@ -63,6 +64,12 @@
 /* ICH9 AHCI has 6 ports */
 /* ICH9 AHCI has 6 ports */
 #define MAX_SATA_PORTS     6
 #define MAX_SATA_PORTS     6
 
 
+static GlobalProperty pc_q35_compat_defaults[] = {
+    { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "39" },
+};
+static const size_t pc_q35_compat_defaults_len =
+    G_N_ELEMENTS(pc_q35_compat_defaults);
+
 struct ehci_companions {
 struct ehci_companions {
     const char *name;
     const char *name;
     int func;
     int func;
@@ -311,8 +318,6 @@ static void pc_q35_init(MachineState *machine)
         smbus_eeprom_init(pcms->smbus, 8, NULL, 0);
         smbus_eeprom_init(pcms->smbus, 8, NULL, 0);
     }
     }
 
 
-    pc_cmos_init(pcms, x86ms->rtc);
-
     /* the rest devices to which pci devfn is automatically assigned */
     /* the rest devices to which pci devfn is automatically assigned */
     pc_vga_init(isa_bus, pcms->pcibus);
     pc_vga_init(isa_bus, pcms->pcibus);
     pc_nic_init(pcmc, isa_bus, pcms->pcibus);
     pc_nic_init(pcmc, isa_bus, pcms->pcibus);
@@ -350,12 +355,14 @@ static void pc_q35_machine_options(MachineClass *m)
     m->default_nic = "e1000e";
     m->default_nic = "e1000e";
     m->default_kernel_irqchip_split = false;
     m->default_kernel_irqchip_split = false;
     m->no_floppy = 1;
     m->no_floppy = 1;
-    m->max_cpus = 1024;
+    m->max_cpus = 4096;
     m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL);
     m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE);
+    compat_props_add(m->compat_props,
+                     pc_q35_compat_defaults, pc_q35_compat_defaults_len);
 }
 }
 
 
 static void pc_q35_9_0_machine_options(MachineClass *m)
 static void pc_q35_9_0_machine_options(MachineClass *m)
@@ -371,6 +378,7 @@ static void pc_q35_8_2_machine_options(MachineClass *m)
 {
 {
     pc_q35_9_0_machine_options(m);
     pc_q35_9_0_machine_options(m);
     m->alias = NULL;
     m->alias = NULL;
+    m->max_cpus = 1024;
     compat_props_add(m->compat_props, hw_compat_8_2, hw_compat_8_2_len);
     compat_props_add(m->compat_props, hw_compat_8_2, hw_compat_8_2_len);
     compat_props_add(m->compat_props, pc_compat_8_2, pc_compat_8_2_len);
     compat_props_add(m->compat_props, pc_compat_8_2, pc_compat_8_2_len);
 }
 }

+ 13 - 4
hw/i386/pc_sysfw.c

@@ -91,7 +91,19 @@ static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms,
     return PFLASH_CFI01(dev);
 }
 
-static void pc_system_flash_cleanup_unused(PCMachineState *pcms)
+void pc_system_flash_create(PCMachineState *pcms)
+{
+    PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+
+    if (pcmc->pci_enabled) {
+        pcms->flash[0] = pc_pflash_create(pcms, "system.flash0",
+                                          "pflash0");
+        pcms->flash[1] = pc_pflash_create(pcms, "system.flash1",
+                                          "pflash1");
+    }
+}
+
+void pc_system_flash_cleanup_unused(PCMachineState *pcms)
 {
     char *prop_name;
     int i;
@@ -198,9 +210,6 @@ void pc_system_firmware_init(PCMachineState *pcms,
         return;
     }
 
-    pcms->flash[0] = pc_pflash_create(pcms, "system.flash0", "pflash0");
-    pcms->flash[1] = pc_pflash_create(pcms, "system.flash1", "pflash1");
-
     /* Map legacy -drive if=pflash to machine properties */
     for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) {
         pflash_cfi01_legacy_drive(pcms->flash[i],

+ 0 - 2
hw/net/igb.c

@@ -488,12 +488,10 @@ static void igb_pci_uninit(PCIDevice *pci_dev)
 
 
 static void igb_qdev_reset_hold(Object *obj)
 {
-    PCIDevice *d = PCI_DEVICE(obj);
     IGBState *s = IGB(obj);
 
     trace_e1000e_cb_qdev_reset_hold();
 
-    pcie_sriov_pf_disable_vfs(d);
     igb_core_reset(&s->core);
 }
 
+ 16 - 0
hw/net/virtio-net.c

@@ -2039,6 +2039,22 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
             goto err;
         }
 
+        /* Mark dirty page's bitmap of guest memory */
+        if (vdev->lm_logging_ctrl == LM_ENABLE) {
+            uint64_t chunk = elem->in_addr[i] / VHOST_LOG_CHUNK;
+            /* Get chunk index */
+            BitmapMemoryRegionCaches *caches = qatomic_rcu_read(&vdev->caches);
+            uint64_t index = chunk / 8;
+            uint64_t shift = chunk % 8;
+            uint8_t val = 0;
+            address_space_read_cached(&caches->bitmap, index, &val,
+                                      sizeof(val));
+            val |= 1 << shift;
+            address_space_write_cached(&caches->bitmap, index, &val,
+                                       sizeof(val));
+            address_space_cache_invalidate(&caches->bitmap, index, sizeof(val));
+        }
+
         elems[i] = elem;
         lens[i] = total;
         i++;
         i++;
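The chunk-to-bit arithmetic above, worked through once; VHOST_LOG_CHUNK is
taken as 0x1000 purely for illustration (the real value comes from the
vhost headers):

    uint64_t addr  = 0x12345678;     /* guest physical address of the buffer */
    uint64_t chunk = addr / 0x1000;  /* 0x12345: which chunk became dirty */
    uint64_t index = chunk / 8;      /* 0x2468: byte offset into the bitmap */
    uint64_t shift = chunk % 8;      /* 5: bit within that byte */
    uint8_t  mask  = 1 << shift;     /* 0x20, ORed into the cached byte */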

+ 8 - 22
hw/nvme/ctrl.c

@@ -7126,10 +7126,6 @@ static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
                 sctrl = &n->sec_ctrl_list.sec[i];
                 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
             }
-
-            if (rst != NVME_RESET_CONTROLLER) {
-                pcie_sriov_pf_disable_vfs(pci_dev);
-            }
         }
 
         if (rst != NVME_RESET_CONTROLLER) {
@@ -8509,36 +8505,26 @@ static void nvme_pci_reset(DeviceState *qdev)
     nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
 }
 
-static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address,
-                                      uint32_t val, int len)
+static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs)
 {
     NvmeCtrl *n = NVME(dev);
     NvmeSecCtrlEntry *sctrl;
-    uint16_t sriov_cap = dev->exp.sriov_cap;
-    uint32_t off = address - sriov_cap;
-    int i, num_vfs;
+    int i;
 
-    if (!sriov_cap) {
-        return;
-    }
-
-    if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
-        if (!(val & PCI_SRIOV_CTRL_VFE)) {
-            num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
-            for (i = 0; i < num_vfs; i++) {
-                sctrl = &n->sec_ctrl_list.sec[i];
-                nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
-            }
-        }
+    for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) {
+        sctrl = &n->sec_ctrl_list.sec[i];
+        nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
     }
 }
 
 static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
                                   uint32_t val, int len)
 {
-    nvme_sriov_pre_write_ctrl(dev, address, val, len);
+    uint16_t old_num_vfs = pcie_sriov_num_vfs(dev);
+
     pci_default_write_config(dev, address, val, len);
     pcie_cap_flr_write_config(dev, address, val, len);
+    nvme_sriov_post_write_config(dev, old_num_vfs);
 }
 
 static const VMStateDescription nvme_vmstate = {

+ 1 - 1
hw/pci-bridge/pci_expander_bridge.c

@@ -290,7 +290,7 @@ static void pxb_cxl_dev_reset(DeviceState *dev)
     uint32_t *write_msk = cxl_cstate->crb.cache_mem_regs_write_mask;
     int dsp_count = 0;
 
-    cxl_component_register_init_common(reg_state, write_msk, CXL2_ROOT_PORT);
+    cxl_component_register_init_common(reg_state, write_msk, CXL2_RC);
     /*
      * The CXL specification allows for host bridges with no HDM decoders
      * if they only have a single root port.

+ 1 - 0
hw/pci/pci.c

@@ -409,6 +409,7 @@ static void pci_do_device_reset(PCIDevice *dev)
 
 
     msi_reset(dev);
     msix_reset(dev);
+    pcie_sriov_pf_reset(dev);
 }
 
 /*

+ 8 - 0
hw/pci/pcie.c

@@ -171,6 +171,14 @@ static void pcie_cap_fill_slot_lnk(PCIDevice *dev)
             pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
                                        PCI_EXP_LNKCAP2_SLS_16_0GB);
         }
+        if (s->speed > QEMU_PCI_EXP_LNK_16GT) {
+            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
+                                       PCI_EXP_LNKCAP2_SLS_32_0GB);
+        }
+        if (s->speed > QEMU_PCI_EXP_LNK_32GT) {
+            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
+                                       PCI_EXP_LNKCAP2_SLS_64_0GB);
+        }
     }
 }
 
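Because each test above is "greater than", the Supported Link Speeds field
accumulates every rate up to the configured one (the lower bits are set by
the earlier branches of the same chain, outside this hunk). For a device
configured at 32 GT/s the register ends up as:

    /* LNKCAP2 supported-speeds vector for s->speed == QEMU_PCI_EXP_LNK_32GT */
    uint32_t sls = PCI_EXP_LNKCAP2_SLS_2_5GB |
                   PCI_EXP_LNKCAP2_SLS_5_0GB |
                   PCI_EXP_LNKCAP2_SLS_8_0GB |
                   PCI_EXP_LNKCAP2_SLS_16_0GB |
                   PCI_EXP_LNKCAP2_SLS_32_0GB;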

+ 23 - 9
hw/pci/pcie_sriov.c

@@ -176,6 +176,9 @@ static void register_vfs(PCIDevice *dev)
 
 
     assert(sriov_cap > 0);
     num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
+    if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) {
+        return;
+    }
 
     dev->exp.sriov_pf.vf = g_new(PCIDevice *, num_vfs);
 
@@ -212,7 +215,6 @@ static void unregister_vfs(PCIDevice *dev)
     g_free(dev->exp.sriov_pf.vf);
     dev->exp.sriov_pf.vf = NULL;
     dev->exp.sriov_pf.num_vfs = 0;
-    pci_set_word(dev->config + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0);
 }
 
 void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
@@ -246,16 +248,28 @@ void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
 }
 
 
-/* Reset SR/IOV VF Enable bit to trigger an unregister of all VFs */
-void pcie_sriov_pf_disable_vfs(PCIDevice *dev)
+/* Reset SR/IOV */
+void pcie_sriov_pf_reset(PCIDevice *dev)
 {
     uint16_t sriov_cap = dev->exp.sriov_cap;
-    if (sriov_cap) {
-        uint32_t val = pci_get_byte(dev->config + sriov_cap + PCI_SRIOV_CTRL);
-        if (val & PCI_SRIOV_CTRL_VFE) {
-            val &= ~PCI_SRIOV_CTRL_VFE;
-            pcie_sriov_config_write(dev, sriov_cap + PCI_SRIOV_CTRL, val, 1);
-        }
+    if (!sriov_cap) {
+        return;
+    }
+
+    pci_set_word(dev->config + sriov_cap + PCI_SRIOV_CTRL, 0);
+    unregister_vfs(dev);
+
+    pci_set_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF, 0);
+
+    /*
+     * Default is to use 4K pages, software can modify it
+     * to any of the supported bits
+     */
+    pci_set_word(dev->config + sriov_cap + PCI_SRIOV_SYS_PGSIZE, 0x1);
+
+    for (uint16_t i = 0; i < PCI_NUM_REGIONS; i++) {
+        pci_set_quad(dev->config + sriov_cap + PCI_SRIOV_BAR + i * 4,
+                     dev->exp.sriov_pf.vf_bar_type[i]);
     }
 }
 

+ 142 - 0
hw/smbios/smbios.c

@@ -121,6 +121,16 @@ struct type8_instance {
 };
 };
 static QTAILQ_HEAD(, type8_instance) type8 = QTAILQ_HEAD_INITIALIZER(type8);
 static QTAILQ_HEAD(, type8_instance) type8 = QTAILQ_HEAD_INITIALIZER(type8);
 
 
+/* type 9 instance for parsing */
+struct type9_instance {
+    const char *slot_designation, *pcidev;
+    uint8_t slot_type, slot_data_bus_width, current_usage, slot_length,
+            slot_characteristics1, slot_characteristics2;
+    uint16_t slot_id;
+    QTAILQ_ENTRY(type9_instance) next;
+};
+static QTAILQ_HEAD(, type9_instance) type9 = QTAILQ_HEAD_INITIALIZER(type9);
+
 static struct {
 static struct {
     size_t nvalues;
     size_t nvalues;
     char **values;
     char **values;
@@ -380,6 +390,59 @@ static const QemuOptDesc qemu_smbios_type8_opts[] = {
     { /* end of list */ }
     { /* end of list */ }
 };
 };
 
 
+static const QemuOptDesc qemu_smbios_type9_opts[] = {
+    {
+        .name = "type",
+        .type = QEMU_OPT_NUMBER,
+        .help = "SMBIOS element type",
+    },
+    {
+        .name = "slot_designation",
+        .type = QEMU_OPT_STRING,
+        .help = "string number for reference designation",
+    },
+    {
+        .name = "slot_type",
+        .type = QEMU_OPT_NUMBER,
+        .help = "connector type",
+    },
+    {
+        .name = "slot_data_bus_width",
+        .type = QEMU_OPT_NUMBER,
+        .help = "port type",
+    },
+    {
+        .name = "current_usage",
+        .type = QEMU_OPT_NUMBER,
+        .help = "current usage",
+    },
+    {
+        .name = "slot_length",
+        .type = QEMU_OPT_NUMBER,
+        .help = "system slot length",
+    },
+    {
+        .name = "slot_id",
+        .type = QEMU_OPT_NUMBER,
+        .help = "system slot id",
+    },
+    {
+        .name = "slot_characteristics1",
+        .type = QEMU_OPT_NUMBER,
+        .help = "slot characteristics1, see the spec",
+    },
+    {
+        .name = "slot_characteristics2",
+        .type = QEMU_OPT_NUMBER,
+        .help = "slot characteristics2, see the spec",
+    },
+    {
+        .name = "pci_device",
+        .type = QEMU_OPT_STRING,
+        .help = "PCI device, if provided."
+    }
+};
+
 static const QemuOptDesc qemu_smbios_type11_opts[] = {
 static const QemuOptDesc qemu_smbios_type11_opts[] = {
     {
     {
         .name = "type",
         .name = "type",
@@ -609,6 +672,7 @@ bool smbios_skip_table(uint8_t type, bool required_table)
 #define T2_BASE 0x200
 #define T2_BASE 0x200
 #define T3_BASE 0x300
 #define T3_BASE 0x300
 #define T4_BASE 0x400
 #define T4_BASE 0x400
+#define T9_BASE 0x900
 #define T11_BASE 0xe00
 #define T11_BASE 0xe00
 
 
 #define T16_BASE 0x1000
 #define T16_BASE 0x1000
@@ -807,6 +871,65 @@ static void smbios_build_type_8_table(void)
     }
     }
 }
 }
 
 
+static void smbios_build_type_9_table(Error **errp)
+{
+    unsigned instance = 0;
+    struct type9_instance *t9;
+
+    QTAILQ_FOREACH(t9, &type9, next) {
+        SMBIOS_BUILD_TABLE_PRE(9, T9_BASE + instance, true);
+
+        SMBIOS_TABLE_SET_STR(9, slot_designation, t9->slot_designation);
+        t->slot_type = t9->slot_type;
+        t->slot_data_bus_width = t9->slot_data_bus_width;
+        t->current_usage = t9->current_usage;
+        t->slot_length = t9->slot_length;
+        t->slot_id = t9->slot_id;
+        t->slot_characteristics1 = t9->slot_characteristics1;
+        t->slot_characteristics2 = t9->slot_characteristics2;
+
+        if (t9->pcidev) {
+            PCIDevice *pdev = NULL;
+            int rc = pci_qdev_find_device(t9->pcidev, &pdev);
+            if (rc != 0) {
+                error_setg(errp,
+                           "No PCI device %s for SMBIOS type 9 entry %s",
+                           t9->pcidev, t9->slot_designation);
+                return;
+            }
+            /*
+             * We only handle the case were the device is attached to
+             * the PCI root bus. The general case is more complex as
+             * bridges are enumerated later and the table would need
+             * to be updated at this moment.
+             */
+            if (!pci_bus_is_root(pci_get_bus(pdev))) {
+                error_setg(errp,
+                           "Cannot create type 9 entry for PCI device %s: "
+                           "not attached to the root bus",
+                           t9->pcidev);
+                return;
+            }
+            t->segment_group_number = cpu_to_le16(0);
+            t->bus_number = pci_dev_bus_num(pdev);
+            t->device_number = pdev->devfn;
+        } else {
+            /*
+             * Per SMBIOS spec, For slots that are not of the PCI, AGP, PCI-X,
+             * or PCI-Express type that do not have bus/device/function
+             * information, 0FFh should be populated in the fields of Segment
+             * Group Number, Bus Number, Device/Function Number.
+             */
+            t->segment_group_number = 0xff;
+            t->bus_number = 0xff;
+            t->device_number = 0xff;
+        }
+
+        SMBIOS_BUILD_TABLE_POST;
+        instance++;
+    }
+}
+
 static void smbios_build_type_11_table(void)
 static void smbios_build_type_11_table(void)
 {
 {
     char count_str[128];
     char count_str[128];
@@ -1126,6 +1249,7 @@ void smbios_get_tables(MachineState *ms,
         }
         }
 
 
         smbios_build_type_8_table();
         smbios_build_type_8_table();
+        smbios_build_type_9_table(errp);
         smbios_build_type_11_table();
         smbios_build_type_11_table();
 
 
 #define MAX_DIMM_SZ (16 * GiB)
 #define MAX_DIMM_SZ (16 * GiB)
@@ -1460,6 +1584,24 @@ void smbios_entry_add(QemuOpts *opts, Error **errp)
             t8_i->port_type = qemu_opt_get_number(opts, "port_type", 0);
             QTAILQ_INSERT_TAIL(&type8, t8_i, next);
             return;
+        case 9: {
+            if (!qemu_opts_validate(opts, qemu_smbios_type9_opts, errp)) {
+                return;
+            }
+            struct type9_instance *t;
+            t = g_new0(struct type9_instance, 1);
+            save_opt(&t->slot_designation, opts, "slot_designation");
+            t->slot_type = qemu_opt_get_number(opts, "slot_type", 0);
+            t->slot_data_bus_width = qemu_opt_get_number(opts, "slot_data_bus_width", 0);
+            t->current_usage = qemu_opt_get_number(opts, "current_usage", 0);
+            t->slot_length = qemu_opt_get_number(opts, "slot_length", 0);
+            t->slot_id = qemu_opt_get_number(opts, "slot_id", 0);
+            t->slot_characteristics1 = qemu_opt_get_number(opts, "slot_characteristics1", 0);
+            t->slot_characteristics2 = qemu_opt_get_number(opts, "slot_characteristics2", 0);
+            save_opt(&t->pcidev, opts, "pcidev");
+            QTAILQ_INSERT_TAIL(&type9, t, next);
+            return;
+        }
         case 11:
             if (!qemu_opts_validate(opts, qemu_smbios_type11_opts, errp)) {
                 return;

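Usage sketch for the new type 9 entry (option names match the parser above; the
device id and field values are illustrative placeholders, and the referenced PCI
device must sit on the root bus, per the check in smbios_build_type_9_table()):

    qemu-system-x86_64 ... \
        -device virtio-net-pci,id=dev0 \
        -smbios type=9,slot_designation=Slot1,slot_type=0xa6,slot_id=1,pcidev=dev0
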
+ 4 - 3
hw/virtio/trace-events

@@ -30,6 +30,7 @@ vhost_user_write(uint32_t req, uint32_t flags) "req:%d flags:0x%"PRIx32""
 vhost_user_create_notifier(int idx, void *n) "idx:%d n:%p"

 # vhost-vdpa.c
+vhost_vdpa_skipped_memory_section(int is_ram, int is_iommu, int is_protected, int is_ram_device, uint64_t first, uint64_t last, int page_mask) "is_ram=%d, is_iommu=%d, is_protected=%d, is_ram_device=%d iova_min=0x%"PRIx64" iova_last=0x%"PRIx64" page_mask=0x%x"
 vhost_vdpa_dma_map(void *vdpa, int fd, uint32_t msg_type, uint32_t asid, uint64_t iova, uint64_t size, uint64_t uaddr, uint8_t perm, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" asid: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" uaddr: 0x%"PRIx64" perm: 0x%"PRIx8" type: %"PRIu8
 vhost_vdpa_dma_unmap(void *vdpa, int fd, uint32_t msg_type, uint32_t asid, uint64_t iova, uint64_t size, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" asid: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" type: %"PRIu8
 vhost_vdpa_listener_begin_batch(void *v, int fd, uint32_t msg_type, uint8_t type)  "vdpa_shared:%p fd: %d msg_type: %"PRIu32" type: %"PRIu8
@@ -57,8 +58,8 @@ vhost_vdpa_dev_start(void *dev, bool started) "dev: %p started: %d"
 vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int refcnt, int fd, void *log) "dev: %p base: 0x%"PRIx64" size: %llu refcnt: %d fd: %d log: %p"
 vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64
 vhost_vdpa_set_vring_num(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u"
-vhost_vdpa_set_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u"
-vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u"
+vhost_vdpa_set_dev_vring_base(void *dev, unsigned int index, unsigned int num, bool svq) "dev: %p index: %u num: %u svq: %d"
+vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num, bool svq) "dev: %p index: %u num: %u svq: %d"
 vhost_vdpa_set_vring_kick(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d"
 vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d"
 vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
@@ -111,7 +112,7 @@ virtio_iommu_device_reset(void) "reset!"
 virtio_iommu_system_reset(void) "system reset!"
 virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64
 virtio_iommu_device_status(uint8_t status) "driver status = %d"
-virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%d domain range end=%d probe_size=0x%x bypass=0x%x"
+virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%u domain range end=%u probe_size=0x%x bypass=0x%x"
 virtio_iommu_set_config(uint8_t bypass) "bypass=0x%x"
 virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
 virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"

+ 19 - 2
hw/virtio/vhost-user.c

@@ -1610,11 +1610,27 @@ vhost_user_backend_handle_shared_object_add(struct vhost_dev *dev,
 }

 static int
-vhost_user_backend_handle_shared_object_remove(VhostUserShared *object)
+vhost_user_backend_handle_shared_object_remove(struct vhost_dev *dev,
+                                               VhostUserShared *object)
 {
     QemuUUID uuid;

     memcpy(uuid.data, object->uuid, sizeof(object->uuid));
+    switch (virtio_object_type(&uuid)) {
+    case TYPE_VHOST_DEV:
+    {
+        struct vhost_dev *owner = virtio_lookup_vhost_device(&uuid);
+        if (dev != owner) {
+            /* Not allowed to remove non-owned entries */
+            return 0;
+        }
+        break;
+    }
+    default:
+        /* Not allowed to remove non-owned entries */
+        return 0;
+    }
+
     return virtio_remove_resource(&uuid);
 }

@@ -1793,7 +1809,8 @@ static gboolean backend_read(QIOChannel *ioc, GIOCondition condition,
         ret = vhost_user_backend_handle_shared_object_add(dev, &payload.object);
         break;
     case VHOST_USER_BACKEND_SHARED_OBJECT_REMOVE:
-        ret = vhost_user_backend_handle_shared_object_remove(&payload.object);
+        ret = vhost_user_backend_handle_shared_object_remove(dev,
+                                                             &payload.object);
         break;
     case VHOST_USER_BACKEND_SHARED_OBJECT_LOOKUP:
         ret = vhost_user_backend_handle_shared_object_lookup(dev->opaque, ioc,

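The net effect of the new ownership check, as a condensed sketch (same
identifiers as in the hunk above): a REMOVE request from a backend that did not
register the UUID is silently ignored.

    /* Sketch: only the owning vhost_dev may drop its shared object. */
    if (virtio_object_type(&uuid) != TYPE_VHOST_DEV ||
        virtio_lookup_vhost_device(&uuid) != dev) {
        return 0;            /* not owned: ignore, do not remove */
    }
    return virtio_remove_resource(&uuid);
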
+ 33 - 11
hw/virtio/vhost-vdpa.c

@@ -47,12 +47,17 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                 int page_mask)
 {
     Int128 llend;
+    bool is_ram = memory_region_is_ram(section->mr);
+    bool is_iommu = memory_region_is_iommu(section->mr);
+    bool is_protected = memory_region_is_protected(section->mr);

-    if ((!memory_region_is_ram(section->mr) &&
-         !memory_region_is_iommu(section->mr)) ||
-        memory_region_is_protected(section->mr) ||
-        /* vhost-vDPA doesn't allow MMIO to be mapped  */
-        memory_region_is_ram_device(section->mr)) {
+    /* vhost-vDPA doesn't allow MMIO to be mapped  */
+    bool is_ram_device = memory_region_is_ram_device(section->mr);
+
+    if ((!is_ram && !is_iommu) || is_protected || is_ram_device) {
+        trace_vhost_vdpa_skipped_memory_section(is_ram, is_iommu, is_protected,
+                                                is_ram_device, iova_min,
+                                                iova_max, page_mask);
         return true;
     }

@@ -69,7 +74,7 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
      * size that maps to the kernel
      */

-    if (!memory_region_is_iommu(section->mr)) {
+    if (!is_iommu) {
         llend = vhost_vdpa_section_end(section, page_mask);
         if (int128_gt(llend, int128_make64(iova_max))) {
             error_report("RAM section out of device range (max=0x%" PRIx64
@@ -555,6 +560,11 @@ static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
     return v->index == 0;
 }

+static bool vhost_vdpa_last_dev(struct vhost_dev *dev)
+{
+    return dev->vq_index + dev->nvqs == dev->vq_index_end;
+}
+
 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                        uint64_t *features)
 {
@@ -965,7 +975,10 @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                          struct vhost_vring_state *ring)
 {
-    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
+    struct vhost_vdpa *v = dev->opaque;
+
+    trace_vhost_vdpa_set_dev_vring_base(dev, ring->index, ring->num,
+                                        v->shadow_vqs_enabled);
     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
 }

@@ -1315,7 +1328,7 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
     }

-    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
+    if (!vhost_vdpa_last_dev(dev)) {
         return 0;
     }

@@ -1337,7 +1350,7 @@ static void vhost_vdpa_reset_status(struct vhost_dev *dev)
 {
     struct vhost_vdpa *v = dev->opaque;

-    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
+    if (!vhost_vdpa_last_dev(dev)) {
         return;
     }

@@ -1407,6 +1420,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,

     if (v->shadow_vqs_enabled) {
         ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
+        trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, true);
         return 0;
     }

@@ -1419,7 +1433,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
     }

     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
-    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
+    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, false);
     return ret;
 }

@@ -1447,7 +1461,15 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,

     /* Remember last call fd because we can switch to SVQ anytime. */
     vhost_svq_set_svq_call_fd(svq, file->fd);
-    if (v->shadow_vqs_enabled) {
+    /*
+     * When SVQ is transitioning to off, shadow_vqs_enabled has
+     * not been set back to false yet, but the underlying call fd
+     * will have to switch back to the guest notifier to signal the
+     * passthrough virtqueues. In other situations, SVQ's own call
+     * fd shall be used to signal the device model.
+     */
+    if (v->shadow_vqs_enabled &&
+        v->shared->svq_switching != SVQ_TSTATE_DISABLING) {
         return 0;
     }


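The call-fd handling above reduces to a single predicate; a sketch, reusing the
names from the hunk:

    /* Sketch: SVQ's own call fd keeps signalling the device model only
     * while SVQ is enabled and not mid-transition to off; otherwise the
     * guest notifier fd (file->fd) is forwarded to the device. */
    bool svq_owns_call_fd = v->shadow_vqs_enabled &&
                            v->shared->svq_switching != SVQ_TSTATE_DISABLING;
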
+ 32 - 4
hw/virtio/virtio-iommu.c

@@ -29,6 +29,7 @@
 #include "sysemu/reset.h"
 #include "sysemu/reset.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/sysemu.h"
 #include "qemu/reserved-region.h"
 #include "qemu/reserved-region.h"
+#include "qemu/units.h"
 #include "qapi/error.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "qemu/error-report.h"
 #include "trace.h"
 #include "trace.h"
@@ -1115,8 +1116,8 @@ static int virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr,
 }

 /*
- * The default mask (TARGET_PAGE_MASK) is the smallest supported guest granule,
- * for example 0xfffffffffffff000. When an assigned device has page size
+ * The default mask depends on the "granule" property. For example, with
+ * 4k granule, it is -(4 * KiB). When an assigned device has page size
  * restrictions due to the hardware IOMMU configuration, apply this restriction
  * to the mask.
  */
@@ -1313,8 +1314,32 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
      * in vfio realize
      */
     s->config.bypass = s->boot_bypass;
-    s->config.page_size_mask = qemu_target_page_mask();
-    s->config.input_range.end = UINT64_MAX;
+    if (s->aw_bits < 32 || s->aw_bits > 64) {
+        error_setg(errp, "aw-bits must be within [32,64]");
+        return;
+    }
+    s->config.input_range.end =
+        s->aw_bits == 64 ? UINT64_MAX : BIT_ULL(s->aw_bits) - 1;
+
+    switch (s->granule_mode) {
+    case GRANULE_MODE_4K:
+        s->config.page_size_mask = -(4 * KiB);
+        break;
+    case GRANULE_MODE_8K:
+        s->config.page_size_mask = -(8 * KiB);
+        break;
+    case GRANULE_MODE_16K:
+        s->config.page_size_mask = -(16 * KiB);
+        break;
+    case GRANULE_MODE_64K:
+        s->config.page_size_mask = -(64 * KiB);
+        break;
+    case GRANULE_MODE_HOST:
+        s->config.page_size_mask = qemu_real_host_page_mask();
+        break;
+    default:
+        error_setg(errp, "Unsupported granule mode");
+    }
     s->config.domain_range.end = UINT32_MAX;
     s->config.probe_size = VIOMMU_PROBE_SIZE;

@@ -1522,6 +1547,9 @@ static Property virtio_iommu_properties[] = {
     DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus,
     DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus,
                      TYPE_PCI_BUS, PCIBus *),
                      TYPE_PCI_BUS, PCIBus *),
     DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true),
     DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true),
+    DEFINE_PROP_GRANULE_MODE("granule", VirtIOIOMMU, granule_mode,
+                             GRANULE_MODE_HOST),
+    DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 64),
     DEFINE_PROP_END_OF_LIST(),
 };


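Worked values for the two new virtio-iommu properties, following directly from
the realize code above:

    /* aw-bits=48 (the virt machine default set elsewhere in this series) */
    s->config.input_range.end = BIT_ULL(48) - 1;   /* 0x0000ffffffffffff */
    /* granule=4k */
    s->config.page_size_mask = -(4 * KiB);         /* 0xfffffffffffff000 */
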
+ 186 - 3
hw/virtio/virtio-pci.c

@@ -1442,6 +1442,155 @@ int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy,
     return virtio_pci_add_mem_cap(proxy, &cap.cap);
 }

+/* Called within call_rcu().  */
+static void bitmap_free_region_cache(BitmapMemoryRegionCaches *caches)
+{
+    assert(caches != NULL);
+    address_space_cache_destroy(&caches->bitmap);
+    g_free(caches);
+}
+
+static void lm_disable(VirtIODevice *vdev)
+{
+    BitmapMemoryRegionCaches *caches;
+    caches = qatomic_read(&vdev->caches);
+    qatomic_rcu_set(&vdev->caches, NULL);
+    if (caches) {
+        call_rcu(caches, bitmap_free_region_cache, rcu);
+    }
+}
+
+static void lm_enable(VirtIODevice *vdev)
+{
+    BitmapMemoryRegionCaches *old = vdev->caches;
+    BitmapMemoryRegionCaches *new = NULL;
+    hwaddr addr, end, size;
+    int64_t len;
+
+    addr = vdev->lm_base_addr_low | ((hwaddr)(vdev->lm_base_addr_high) << 32);
+    end = vdev->lm_end_addr_low | ((hwaddr)(vdev->lm_end_addr_high) << 32);
+    size = end - addr;
+    if (size <= 0) {
+        error_report("Invalid lm size.");
+        return;
+    }
+
+    new = g_new0(BitmapMemoryRegionCaches, 1);
+    len = address_space_cache_init(&new->bitmap, vdev->dma_as, addr, size,
+                                   true);
+    if (len < size) {
+        virtio_error(vdev, "Cannot map bitmap");
+        goto err_bitmap;
+    }
+    qatomic_rcu_set(&vdev->caches, new);
+
+    if (old) {
+        call_rcu(old, bitmap_free_region_cache, rcu);
+    }
+
+    return;
+
+err_bitmap:
+    address_space_cache_destroy(&new->bitmap);
+    g_free(new);
+}
+
+static uint64_t virtio_pci_lm_read(void *opaque, hwaddr addr,
+                                       unsigned size)
+{
+    VirtIOPCIProxy *proxy = opaque;
+    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+    hwaddr offset_end = LM_VRING_STATE_OFFSET +
+                        virtio_pci_queue_mem_mult(proxy) * VIRTIO_QUEUE_MAX;
+    uint32_t val;
+    int qid;
+
+    if (vdev == NULL) {
+        return UINT64_MAX;
+    }
+    switch (addr) {
+    case LM_LOGGING_CTRL:
+        val = vdev->lm_logging_ctrl;
+        break;
+    case LM_BASE_ADDR_LOW:
+        val = vdev->lm_base_addr_low;
+        break;
+    case LM_BASE_ADDR_HIGH:
+        val = vdev->lm_base_addr_high;
+        break;
+    case LM_END_ADDR_LOW:
+        val = vdev->lm_end_addr_low;
+        break;
+    case LM_END_ADDR_HIGH:
+        val = vdev->lm_end_addr_high;
+        break;
+    default:
+        if (addr >= LM_VRING_STATE_OFFSET && addr <= offset_end) {
+            qid = (addr - LM_VRING_STATE_OFFSET) /
+                  virtio_pci_queue_mem_mult(proxy);
+            val = virtio_queue_get_vring_states(vdev, qid);
+        } else {
+            val = 0;
+        }
+
+        break;
+    }
+
+    return val;
+}
+
+static void virtio_pci_lm_write(void *opaque, hwaddr addr,
+                                    uint64_t val, unsigned size)
+{
+    VirtIOPCIProxy *proxy = opaque;
+    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+    hwaddr offset_end = LM_VRING_STATE_OFFSET +
+                        virtio_pci_queue_mem_mult(proxy) * VIRTIO_QUEUE_MAX;
+    int qid;
+
+    if (vdev == NULL) {
+        return;
+    }
+
+    switch (addr) {
+    case LM_LOGGING_CTRL:
+        vdev->lm_logging_ctrl = val;
+        switch (val) {
+        case LM_DISABLE:
+            lm_disable(vdev);
+            break;
+        case LM_ENABLE:
+            lm_enable(vdev);
+            break;
+        default:
+            virtio_error(vdev, "Unsupport LM_LOGGING_CTRL value: %"PRIx64,
+                         val);
+                break;
+        };
+
+        break;
+    case LM_BASE_ADDR_LOW:
+        vdev->lm_base_addr_low = val;
+        break;
+    case LM_BASE_ADDR_HIGH:
+        vdev->lm_base_addr_high = val;
+        break;
+    case LM_END_ADDR_LOW:
+        vdev->lm_end_addr_low = val;
+        break;
+    case LM_END_ADDR_HIGH:
+        vdev->lm_end_addr_high = val;
+        break;
+    default:
+        if (addr >= LM_VRING_STATE_OFFSET && addr <= offset_end) {
+            qid = (addr - LM_VRING_STATE_OFFSET) /
+                  virtio_pci_queue_mem_mult(proxy);
+            virtio_queue_set_vring_states(vdev, qid, val);
+        } else {
+            virtio_error(vdev, "Unsupported addr: %"PRIx64, addr);
+        }
+        break;
+    }
+}
+
 static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr,
                                        unsigned size)
 {
@@ -1823,6 +1972,15 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy,
         },
         .endianness = DEVICE_LITTLE_ENDIAN,
     };
+    static const MemoryRegionOps lm_ops = {
+        .read = virtio_pci_lm_read,
+        .write = virtio_pci_lm_write,
+        .impl = {
+            .min_access_size = 1,
+            .max_access_size = 4,
+        },
+        .endianness = DEVICE_LITTLE_ENDIAN,
+    };
     g_autoptr(GString) name = g_string_new(NULL);

     g_string_printf(name, "virtio-pci-common-%s", vdev_name);
@@ -1859,6 +2017,14 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy,
                           proxy,
                           name->str,
                           proxy->notify_pio.size);
+    if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) {
+        g_string_printf(name, "virtio-pci-lm-%s", vdev_name);
+        memory_region_init_io(&proxy->lm.mr, OBJECT(proxy),
+                              &lm_ops,
+                              proxy,
+                              name->str,
+                              proxy->lm.size);
+    }
 }

 static void virtio_pci_modern_region_map(VirtIOPCIProxy *proxy,
@@ -2021,6 +2187,10 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
         virtio_pci_modern_mem_region_map(proxy, &proxy->isr, &cap);
         virtio_pci_modern_mem_region_map(proxy, &proxy->device, &cap);
         virtio_pci_modern_mem_region_map(proxy, &proxy->notify, &notify.cap);
+        if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) {
+            memory_region_add_subregion(&proxy->modern_bar,
+                                        proxy->lm.offset, &proxy->lm.mr);
+        }

         if (modern_pio) {
             memory_region_init(&proxy->io_bar, OBJECT(proxy),
@@ -2090,6 +2260,9 @@ static void virtio_pci_device_unplugged(DeviceState *d)
         virtio_pci_modern_mem_region_unmap(proxy, &proxy->isr);
         virtio_pci_modern_mem_region_unmap(proxy, &proxy->device);
         virtio_pci_modern_mem_region_unmap(proxy, &proxy->notify);
+        if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) {
+            memory_region_del_subregion(&proxy->modern_bar, &proxy->lm.mr);
+        }
         if (modern_pio) {
             virtio_pci_modern_io_region_unmap(proxy, &proxy->notify_pio);
         }
@@ -2144,9 +2317,17 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
     proxy->notify_pio.type = VIRTIO_PCI_CAP_NOTIFY_CFG;

     /* subclasses can enforce modern, so do this unconditionally */
-    memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci",
-                       /* PCI BAR regions must be powers of 2 */
-                       pow2ceil(proxy->notify.offset + proxy->notify.size));
+    if (!(proxy->flags & VIRTIO_PCI_FLAG_VDPA)) {
+        memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci",
+                           /* PCI BAR regions must be powers of 2 */
+                           pow2ceil(proxy->notify.offset + proxy->notify.size));
+    } else {
+        proxy->lm.offset = proxy->notify.offset + proxy->notify.size;
+        proxy->lm.size = 0x20 + VIRTIO_QUEUE_MAX * 4;
+        memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci",
+                           /* PCI BAR regions must be powers of 2 */
+                           pow2ceil(proxy->lm.offset + proxy->lm.size));
+    }

     if (proxy->disable_legacy == ON_OFF_AUTO_AUTO) {
         proxy->disable_legacy = pcie_port ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
@@ -2301,6 +2482,8 @@ static Property virtio_pci_properties[] = {
                     VIRTIO_PCI_FLAG_INIT_FLR_BIT, true),
     DEFINE_PROP_BIT("aer", VirtIOPCIProxy, flags,
                     VIRTIO_PCI_FLAG_AER_BIT, false),
+    DEFINE_PROP_BIT("vdpa", VirtIOPCIProxy, flags,
+                    VIRTIO_PCI_FLAG_VDPA_BIT, false),
     DEFINE_PROP_END_OF_LIST(),
 };


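How a driver would be expected to program the new LM window, as a hedged sketch
(register offsets are the LM_* constants added to virtio_pci.h later in this
series; lm_mmio_write() is a hypothetical 32-bit MMIO write helper):

    /* Sketch: point the device at a guest bitmap [base, end), then enable
     * logging, which triggers lm_enable() above on the QEMU side. */
    lm_mmio_write(LM_BASE_ADDR_LOW,  base & 0xffffffff);
    lm_mmio_write(LM_BASE_ADDR_HIGH, base >> 32);
    lm_mmio_write(LM_END_ADDR_LOW,   end & 0xffffffff);
    lm_mmio_write(LM_END_ADDR_HIGH,  end >> 32);
    lm_mmio_write(LM_LOGGING_CTRL,   LM_ENABLE);
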
+ 39 - 0
hw/virtio/virtio.c

@@ -3368,6 +3368,18 @@ static uint16_t virtio_queue_split_get_last_avail_idx(VirtIODevice *vdev,
     return vdev->vq[n].last_avail_idx;
 }

+static uint32_t virtio_queue_split_get_vring_states(VirtIODevice *vdev,
+                                                      int n)
+{
+    struct VirtQueue *vq = &vdev->vq[n];
+    uint16_t avail, used;
+
+    avail = vq->last_avail_idx;
+    used = vq->used_idx;
+
+    return avail | (uint32_t)used << 16;
+}
+
 unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n)
 {
     if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
@@ -3377,6 +3389,33 @@ unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n)
     }
 }

+unsigned int virtio_queue_get_vring_states(VirtIODevice *vdev, int n)
+{
+    if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+        return -1;
+    } else {
+        return virtio_queue_split_get_vring_states(vdev, n);
+    }
+}
+
+static void virtio_queue_split_set_vring_states(VirtIODevice *vdev,
+                                                int n, uint32_t idx)
+{
+    struct VirtQueue *vq = &vdev->vq[n];
+    vq->last_avail_idx = (uint16_t)(idx & 0xffff);
+    vq->shadow_avail_idx = (uint16_t)(idx & 0xffff);
+    vq->used_idx = (uint16_t)(idx >> 16);
+}
+
+void virtio_queue_set_vring_states(VirtIODevice *vdev, int n, uint32_t idx)
+{
+    if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+        return;
+    } else {
+        virtio_queue_split_set_vring_states(vdev, n, idx);
+    }
+}
+
 static void virtio_queue_packed_set_last_avail_idx(VirtIODevice *vdev,
                                                    int n, unsigned int idx)
 {

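The split-ring state accessors above pack two 16-bit ring indices into one
32-bit word; a self-contained sketch of the encoding:

    #include <stdint.h>

    /* Sketch: low half carries last_avail_idx, high half used_idx,
     * mirroring virtio_queue_split_get/set_vring_states(). */
    static uint32_t pack_vring_states(uint16_t avail, uint16_t used)
    {
        return avail | (uint32_t)used << 16;
    }

    static void unpack_vring_states(uint32_t v, uint16_t *avail,
                                    uint16_t *used)
    {
        *avail = v & 0xffff;
        *used = v >> 16;
    }
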
+ 47 - 0
include/hw/acpi/acpi_generic_initiator.h

@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#ifndef ACPI_GENERIC_INITIATOR_H
+#define ACPI_GENERIC_INITIATOR_H
+
+#include "qom/object_interfaces.h"
+
+#define TYPE_ACPI_GENERIC_INITIATOR "acpi-generic-initiator"
+
+typedef struct AcpiGenericInitiator {
+    /* private */
+    Object parent;
+
+    /* public */
+    char *pci_dev;
+    uint16_t node;
+} AcpiGenericInitiator;
+
+/*
+ * ACPI 6.3:
+ * Table 5-81 Flags – Generic Initiator Affinity Structure
+ */
+typedef enum {
+    /*
+     * If clear, the OSPM ignores the contents of the Generic
+     * Initiator/Port Affinity Structure. This allows system firmware
+     * to populate the SRAT with a static number of structures, but only
+     * enable them as necessary.
+     */
+    GEN_AFFINITY_ENABLED = (1 << 0),
+} GenericAffinityFlags;
+
+/*
+ * ACPI 6.3:
+ * Table 5-80 Device Handle - PCI
+ */
+typedef struct PCIDeviceHandle {
+    uint16_t segment;
+    uint16_t bdf;
+} PCIDeviceHandle;
+
+void build_srat_generic_pci_initiator(GArray *table_data);
+
+#endif

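Usage sketch for the new object (property names per the header above and the
QAPI schema further down; the host address and ids are placeholders):

    -device vfio-pci,host=0000:01:00.0,id=dev0 \
    -object acpi-generic-initiator,id=gi0,pci-dev=dev0,node=1
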
+ 1 - 0
include/hw/audio/virtio-snd.h

@@ -230,6 +230,7 @@ struct virtio_snd_ctrl_command {
     VirtQueue *vq;
     virtio_snd_hdr ctrl;
     virtio_snd_hdr resp;
+    size_t payload_size;
     QTAILQ_ENTRY(virtio_snd_ctrl_command) next;
 };
 #endif

+ 1 - 0
include/hw/cxl/cxl_component.h

@@ -25,6 +25,7 @@ enum reg_type {
     CXL2_TYPE3_DEVICE,
     CXL2_LOGICAL_DEVICE,
     CXL2_ROOT_PORT,
+    CXL2_RC,
     CXL2_UPSTREAM_PORT,
     CXL2_DOWNSTREAM_PORT,
     CXL3_SWITCH_MAILBOX_CCI,

+ 2 - 1
include/hw/cxl/cxl_pci.h

@@ -92,8 +92,9 @@ typedef struct CXLDVSECDevice {
     uint32_t range2_base_hi;
     uint32_t range2_base_lo;
     uint16_t cap3;
+    uint16_t resv;
 } QEMU_PACKED CXLDVSECDevice;
-QEMU_BUILD_BUG_ON(sizeof(CXLDVSECDevice) != 0x3A);
+QEMU_BUILD_BUG_ON(sizeof(CXLDVSECDevice) != PCIE_CXL_DEVICE_DVSEC_LENGTH);

 /*
  * CXL r3.1 Section 8.1.5: CXL Extensions DVSEC for Ports

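The size arithmetic behind the fix, assuming PCIE_CXL_DEVICE_DVSEC_LENGTH is
0x3C as mandated for the device DVSEC: the struct previously packed to 0x3A
bytes, so the 2-byte reserved field restores the spec length.

    /* Sketch: 0x3A bytes of fields + 2-byte resv == 0x3C. */
    QEMU_BUILD_BUG_ON(0x3A + sizeof(uint16_t) != 0x3C);
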
+ 17 - 0
include/hw/firmware/smbios.h

@@ -211,6 +211,23 @@ struct smbios_type_8 {
     uint8_t port_type;
 } QEMU_PACKED;

+/* SMBIOS type 9 - System Slots (v2.1+) */
+struct smbios_type_9 {
+    struct smbios_structure_header header;
+    uint8_t slot_designation;
+    uint8_t slot_type;
+    uint8_t slot_data_bus_width;
+    uint8_t current_usage;
+    uint8_t slot_length;
+    uint16_t slot_id;
+    uint8_t slot_characteristics1;
+    uint8_t slot_characteristics2;
+    /* SMBIOS spec v2.6+ */
+    uint16_t segment_group_number;
+    uint8_t bus_number;
+    uint8_t device_number;
+} QEMU_PACKED;
+
 /* SMBIOS type 11 - OEM strings */
 struct smbios_type_11 {
     struct smbios_structure_header header;

+ 2 - 2
include/hw/i386/pc.h

@@ -178,8 +178,6 @@ void pc_basic_device_init(struct PCMachineState *pcms,
                           ISADevice *rtc_state,
                           bool create_fdctrl,
                           uint32_t hpet_irqs);
-void pc_cmos_init(PCMachineState *pcms,
-                  ISADevice *s);
 void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus);

 void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs);
@@ -190,6 +188,8 @@ void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs);
 #define TYPE_PORT92 "port92"
 #define TYPE_PORT92 "port92"
 
 
 /* pc_sysfw.c */
 /* pc_sysfw.c */
+void pc_system_flash_create(PCMachineState *pcms);
+void pc_system_flash_cleanup_unused(PCMachineState *pcms);
 void pc_system_firmware_init(PCMachineState *pcms, MemoryRegion *rom_memory);
 bool pc_system_ovmf_table_find(const char *entry, uint8_t **data,
                                int *data_len);

+ 2 - 0
include/hw/pci/pcie_regs.h

@@ -39,6 +39,8 @@ typedef enum PCIExpLinkSpeed {
     QEMU_PCI_EXP_LNK_5GT,
     QEMU_PCI_EXP_LNK_8GT,
     QEMU_PCI_EXP_LNK_16GT,
+    QEMU_PCI_EXP_LNK_32GT,
+    QEMU_PCI_EXP_LNK_64GT,
 } PCIExpLinkSpeed;

 #define QEMU_PCI_EXP_LNKCAP_MLS(speed)  (speed)

+ 2 - 2
include/hw/pci/pcie_sriov.h

@@ -58,8 +58,8 @@ void pcie_sriov_pf_add_sup_pgsize(PCIDevice *dev, uint16_t opt_sup_pgsize);
 void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
                              uint32_t val, int len);

-/* Reset SR/IOV VF Enable bit to unregister all VFs */
-void pcie_sriov_pf_disable_vfs(PCIDevice *dev);
+/* Reset SR/IOV */
+void pcie_sriov_pf_reset(PCIDevice *dev);

 /* Get logical VF number of a VF - only valid for VFs */
 uint16_t pcie_sriov_vf_number(PCIDevice *dev);

+ 9 - 0
include/hw/virtio/vhost-vdpa.h

@@ -30,6 +30,12 @@ typedef struct VhostVDPAHostNotifier {
     void *addr;
 } VhostVDPAHostNotifier;

+typedef enum SVQTransitionState {
+    SVQ_TSTATE_DISABLING = -1,
+    SVQ_TSTATE_DONE,
+    SVQ_TSTATE_ENABLING
+} SVQTransitionState;
+
 /* Info shared by all vhost_vdpa device models */
 typedef struct vhost_vdpa_shared {
     int device_fd;
@@ -47,6 +53,9 @@ typedef struct vhost_vdpa_shared {

     /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
     bool shadow_data;
+
+    /* SVQ switching is in progress, or already completed? */
+    SVQTransitionState svq_switching;
 } VhostVDPAShared;

 typedef struct vhost_vdpa {

+ 3 - 0
include/hw/virtio/virtio-iommu.h

@@ -24,6 +24,7 @@
 #include "hw/virtio/virtio.h"
 #include "hw/virtio/virtio.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/pci.h"
 #include "qom/object.h"
 #include "qom/object.h"
+#include "qapi/qapi-types-virtio.h"
 
 
 #define TYPE_VIRTIO_IOMMU "virtio-iommu-device"
 #define TYPE_VIRTIO_IOMMU "virtio-iommu-device"
 #define TYPE_VIRTIO_IOMMU_PCI "virtio-iommu-pci"
 #define TYPE_VIRTIO_IOMMU_PCI "virtio-iommu-pci"
@@ -66,6 +67,8 @@ struct VirtIOIOMMU {
     bool boot_bypass;
     Notifier machine_done;
     bool granule_frozen;
+    GranuleMode granule_mode;
+    uint8_t aw_bits;
 };

 #endif

+ 5 - 0
include/hw/virtio/virtio-pci.h

@@ -43,6 +43,7 @@ enum {
     VIRTIO_PCI_FLAG_INIT_FLR_BIT,
     VIRTIO_PCI_FLAG_AER_BIT,
     VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT,
+    VIRTIO_PCI_FLAG_VDPA_BIT,
 };

 /* Need to activate work-arounds for buggy guests at vmstate load. */
@@ -89,6 +90,9 @@ enum {
 #define VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED \
   (1 << VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT)

+/* VDPA supported flags */
+#define VIRTIO_PCI_FLAG_VDPA (1 << VIRTIO_PCI_FLAG_VDPA_BIT)
+
 typedef struct {
     MSIMessage msg;
     int virq;
@@ -140,6 +144,7 @@ struct VirtIOPCIProxy {
         };
         VirtIOPCIRegion regs[5];
     };
+    VirtIOPCIRegion lm;
     MemoryRegion modern_bar;
     MemoryRegion io_bar;
     uint32_t legacy_io_bar_idx;

+ 19 - 0
include/hw/virtio/virtio.h

@@ -35,6 +35,9 @@
                                 (0x1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \
                                 (0x1ULL << VIRTIO_F_ANY_LAYOUT))

+#define LM_DISABLE      0x00
+#define LM_ENABLE       0x01
+
 struct VirtQueue;

 static inline hwaddr vring_align(hwaddr addr,
@@ -95,6 +98,11 @@ enum virtio_device_endian {
     VIRTIO_DEVICE_ENDIAN_BIG,
 };

+typedef struct BitmapMemoryRegionCaches {
+    struct rcu_head rcu;
+    MemoryRegionCache bitmap;
+} BitmapMemoryRegionCaches;
+
 /**
  * struct VirtIODevice - common VirtIO structure
  * @name: name of the device
@@ -128,6 +136,14 @@ struct VirtIODevice
     uint32_t generation;
     int nvectors;
     VirtQueue *vq;
+    uint8_t lm_logging_ctrl;
+    uint32_t lm_base_addr_low;
+    uint32_t lm_base_addr_high;
+    uint32_t lm_end_addr_low;
+    uint32_t lm_end_addr_high;
+
+    BitmapMemoryRegionCaches *caches;
+
     MemoryListener listener;
     uint16_t device_id;
     /* @vm_running: current VM running state via virtio_vmstate_change() */
@@ -379,8 +395,11 @@ hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n);
 hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n);
 hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n);
 unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n);
+unsigned int virtio_queue_get_vring_states(VirtIODevice *vdev, int n);
 void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n,
                                      unsigned int idx);
+void virtio_queue_set_vring_states(VirtIODevice *vdev, int n,
+                                   unsigned int idx);
 void virtio_queue_restore_last_avail_idx(VirtIODevice *vdev, int n);
 void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n);
 void virtio_queue_update_used_idx(VirtIODevice *vdev, int n);

+ 7 - 0
include/standard-headers/linux/virtio_pci.h

@@ -221,6 +221,13 @@ struct virtio_pci_cfg_cap {
 #define VIRTIO_PCI_COMMON_ADM_Q_IDX	60
 #define VIRTIO_PCI_COMMON_ADM_Q_NUM	62

+#define LM_LOGGING_CTRL                 0
+#define LM_BASE_ADDR_LOW                4
+#define LM_BASE_ADDR_HIGH               8
+#define LM_END_ADDR_LOW                 12
+#define LM_END_ADDR_HIGH                16
+#define LM_VRING_STATE_OFFSET           0x20
+
 #endif /* VIRTIO_PCI_NO_MODERN */

 /* Admin command status. */

+ 1 - 0
include/sysemu/numa.h

@@ -41,6 +41,7 @@ struct NodeInfo {
     struct HostMemoryBackend *node_memdev;
     bool present;
     bool has_cpu;
+    bool has_gi;
     uint8_t lb_info_provided;
     uint16_t initiator;
     uint8_t distance[MAX_NODES];

+ 6 - 0
net/trace-events

@@ -23,3 +23,9 @@ colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int hdlen, in
 # filter-rewriter.c
 colo_filter_rewriter_pkt_info(const char *func, const char *src, const char *dst, uint32_t seq, uint32_t ack, uint32_t flag) "%s: src/dst: %s/%s p: seq/ack=%u/%u  flags=0x%x"
 colo_filter_rewriter_conn_offset(uint32_t offset) ": offset=%u"
+
+# vhost-vdpa.c
+vhost_vdpa_set_address_space_id(void *v, unsigned vq_group, unsigned asid_num) "vhost_vdpa: %p vq_group: %u asid: %u"
+vhost_vdpa_net_load_cmd(void *s, uint8_t class, uint8_t cmd, int data_num, int data_size) "vdpa state: %p class: %u cmd: %u sg_num: %d size: %d"
+vhost_vdpa_net_load_cmd_retval(void *s, uint8_t class, uint8_t cmd, int r) "vdpa state: %p class: %u cmd: %u retval: %d"
+vhost_vdpa_net_load_mq(void *s, int ncurqps) "vdpa state: %p current_qpairs: %d"

+ 28 - 2
net/vhost-vdpa.c

@@ -28,6 +28,7 @@
 #include "monitor/monitor.h"
 #include "monitor/monitor.h"
 #include "migration/misc.h"
 #include "migration/misc.h"
 #include "hw/virtio/vhost.h"
 #include "hw/virtio/vhost.h"
+#include "trace.h"
 
 
 /* Todo:need to add the multiqueue support here */
 /* Todo:need to add the multiqueue support here */
 typedef struct VhostVDPAState {
 typedef struct VhostVDPAState {
@@ -286,6 +287,21 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
     return size;
 }

+
+/** From any vdpa net client, get the netclient of the i-th queue pair */
+static VhostVDPAState *vhost_vdpa_net_get_nc_vdpa(VhostVDPAState *s, int i)
+{
+    NICState *nic = qemu_get_nic(s->nc.peer);
+    NetClientState *nc_i = qemu_get_peer(nic->ncs, i);
+
+    return DO_UPCAST(VhostVDPAState, nc, nc_i);
+}
+
+static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
+{
+    return vhost_vdpa_net_get_nc_vdpa(s, 0);
+}
+
 static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
 {
     struct vhost_vdpa *v = &s->vhost_vdpa;
@@ -307,6 +323,8 @@ static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
     data_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
     cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
                                   n->max_ncs - n->max_queue_pairs : 0;
+    v->shared->svq_switching = enable ?
+        SVQ_TSTATE_ENABLING : SVQ_TSTATE_DISABLING;
     /*
      * TODO: vhost_net_stop does suspend, get_base and reset. We can be smarter
      * in the future and resume the device if read-only operations between
@@ -319,6 +337,7 @@ static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
     if (unlikely(r < 0)) {
         error_report("unable to start vhost net: %s(%d)", g_strerror(-r), -r);
     }
+    v->shared->svq_switching = SVQ_TSTATE_DONE;
 }

 static int vdpa_net_migration_state_notifier(NotifierWithReturn *notifier,
@@ -444,6 +463,8 @@ static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
     };
     int r;

+    trace_vhost_vdpa_set_address_space_id(v, vq_group, asid_num);
+
     r = ioctl(v->shared->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
     if (unlikely(r < 0)) {
         error_report("Can't set vq group %u asid %u, errno=%d (%s)",
@@ -510,7 +531,7 @@ dma_map_err:

 static int vhost_vdpa_net_cvq_start(NetClientState *nc)
 {
-    VhostVDPAState *s;
+    VhostVDPAState *s, *s0;
     struct vhost_vdpa *v;
     int64_t cvq_group;
     int r;
@@ -521,7 +542,8 @@ static int vhost_vdpa_net_cvq_start(NetClientState *nc)
     s = DO_UPCAST(VhostVDPAState, nc, nc);
     v = &s->vhost_vdpa;

-    v->shadow_vqs_enabled = v->shared->shadow_data;
+    s0 = vhost_vdpa_net_first_nc_vdpa(s);
+    v->shadow_vqs_enabled = s0->vhost_vdpa.shadow_vqs_enabled;
     s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;

     if (v->shared->shadow_data) {
@@ -695,6 +717,7 @@ static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s,

     assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));
     cmd_size = sizeof(ctrl) + data_size;
+    trace_vhost_vdpa_net_load_cmd(s, class, cmd, data_num, data_size);
     if (vhost_svq_available_slots(svq) < 2 ||
         iov_size(out_cursor, 1) < cmd_size) {
         /*
@@ -726,6 +749,7 @@ static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s,

     r = vhost_vdpa_net_cvq_add(s, &out, 1, &in, 1);
     if (unlikely(r < 0)) {
+        trace_vhost_vdpa_net_load_cmd_retval(s, class, cmd, r);
         return r;
     }

@@ -917,6 +941,8 @@ static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
         return 0;
     }

+    trace_vhost_vdpa_net_load_mq(s, n->curr_queue_pairs);
+
     mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
     const struct iovec data = {
         .iov_base = &mq,

+ 5 - 1
qapi/common.json

@@ -107,10 +107,14 @@
 #
 # @16: 16.0GT/s
 #
+# @32: 32.0GT/s
+#
+# @64: 64.0GT/s
+#
 # Since: 4.0
 ##
 { 'enum': 'PCIELinkSpeed',
-  'data': [ '2_5', '5', '8', '16' ] }
+  'data': [ '2_5', '5', '8', '16', '32', '64' ] }

 ##
 # @PCIELinkWidth:

+ 17 - 0
qapi/qom.json

@@ -811,6 +811,21 @@
 { 'struct': 'IOMMUFDProperties',
   'data': { '*fd': 'str' } }

+##
+# @AcpiGenericInitiatorProperties:
+#
+# Properties for acpi-generic-initiator objects.
+#
+# @pci-dev: PCI device ID to be associated with the node
+#
+# @node: NUMA node associated with the PCI device
+#
+# Since: 9.0
+##
+{ 'struct': 'AcpiGenericInitiatorProperties',
+  'data': { 'pci-dev': 'str',
+            'node': 'uint32' } }
+
 ##
 # @RngProperties:
 #
@@ -928,6 +943,7 @@
 ##
 { 'enum': 'ObjectType',
   'data': [
+    'acpi-generic-initiator',
     'authz-list',
     'authz-listfile',
     'authz-pam',
@@ -999,6 +1015,7 @@
             'id': 'str' },
   'discriminator': 'qom-type',
   'data': {
+      'acpi-generic-initiator':     'AcpiGenericInitiatorProperties',
       'authz-list':                 'AuthZListProperties',
       'authz-listfile':             'AuthZListFileProperties',
       'authz-pam':                  'AuthZPAMProperties',

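With the schema wired up, the object can also be created over QMP; an
illustrative request (ids are placeholders):

    { "execute": "object-add",
      "arguments": { "qom-type": "acpi-generic-initiator",
                     "id": "gi0", "pci-dev": "dev0", "node": 1 } }
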
+ 14 - 0
qemu-options.hx

@@ -1172,6 +1172,17 @@ SRST
     Please also refer to the wiki page for general scenarios of VT-d
     emulation in QEMU: https://wiki.qemu.org/Features/VT-d.

+``-device virtio-iommu-pci[,option=...]``
+    This is only supported by ``-machine q35`` (x86_64) and ``-machine virt`` (ARM).
+    It supports the following options:
+
+    ``granule=val`` (possible values are 4k, 8k, 16k, 64k and host; default: host)
+        This decides the default granule to be exposed by the
+        virtio-iommu. If host, the granule matches the host page size.
+
+    ``aw-bits=val`` (val between 32 and 64, default depends on machine)
+        This decides the address width of the IOVA address space.
+
 ERST

 DEF("name", HAS_ARG, QEMU_OPTION_name,
@@ -2718,6 +2729,9 @@ SRST
 ``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-family=%d][,processor-id=%d]``
     Specify SMBIOS type 4 fields

+``-smbios type=9[,slot_designation=str][,slot_type=%d][,slot_data_bus_width=%d][,current_usage=%d][,slot_length=%d][,slot_id=%d][,slot_characteristics1=%d][,slot_characteristics2=%d][,pcidev=str]``
+    Specify SMBIOS type 9 fields
+
 ``-smbios type=11[,value=str][,path=filename]``
     Specify SMBIOS type 11 fields


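For example, an illustrative command line combining both documented
virtio-iommu options, with values inside the stated ranges:

    -device virtio-iommu-pci,granule=4k,aw-bits=48
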
+ 328 - 267
subprojects/libvhost-user/libvhost-user.c

@@ -43,6 +43,8 @@
 #include <fcntl.h>
 #include <sys/ioctl.h>
 #include <linux/vhost.h>
+#include <sys/vfs.h>
+#include <linux/magic.h>

 #ifdef __NR_userfaultfd
 #include <linux/userfaultfd.h>
@@ -195,30 +197,58 @@ vu_panic(VuDev *dev, const char *msg, ...)
      */
 }

+/* Search for a memory region that covers this guest physical address. */
+static VuDevRegion *
+vu_gpa_to_mem_region(VuDev *dev, uint64_t guest_addr)
+{
+    int low = 0;
+    int high = dev->nregions - 1;
+
+    /*
+     * Memory regions cannot overlap in guest physical address space. Each
+     * GPA belongs to exactly one memory region, so there can only be one
+     * match.
+     *
+     * We store our memory regions ordered by GPA and can simply perform a
+     * binary search.
+     */
+    while (low <= high) {
+        unsigned int mid = low + (high - low) / 2;
+        VuDevRegion *cur = &dev->regions[mid];
+
+        if (guest_addr >= cur->gpa && guest_addr < cur->gpa + cur->size) {
+            return cur;
+        }
+        if (guest_addr >= cur->gpa + cur->size) {
+            low = mid + 1;
+        }
+        if (guest_addr < cur->gpa) {
+            high = mid - 1;
+        }
+    }
+    return NULL;
+}
+
 /* Translate guest physical address to our virtual address.  */
 void *
 vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
 {
-    unsigned int i;
+    VuDevRegion *r;

     if (*plen == 0) {
         return NULL;
     }

-    /* Find matching memory region.  */
-    for (i = 0; i < dev->nregions; i++) {
-        VuDevRegion *r = &dev->regions[i];
-
-        if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
-            if ((guest_addr + *plen) > (r->gpa + r->size)) {
-                *plen = r->gpa + r->size - guest_addr;
-            }
-            return (void *)(uintptr_t)
-                guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
-        }
+    r = vu_gpa_to_mem_region(dev, guest_addr);
+    if (!r) {
+        return NULL;
     }
     }

+    if ((guest_addr + *plen) > (r->gpa + r->size)) {
+        *plen = r->gpa + r->size - guest_addr;
+    }
+    return (void *)(uintptr_t)guest_addr - r->gpa + r->mmap_addr +
+           r->mmap_offset;
 }
 }

 /* Translate qemu virtual address to our virtual address.  */
     return NULL;
     return NULL;
 }

+vu_remove_all_mem_regs(VuDev *dev)
+{
+    unsigned int i;
+
+    for (i = 0; i < dev->nregions; i++) {
+        VuDevRegion *r = &dev->regions[i];
+
+        munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset);
+    }
+    dev->nregions = 0;
+}
+
+static bool
+map_ring(VuDev *dev, VuVirtq *vq)
+{
+    vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr);
+    vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr);
+    vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr);
+
+    DPRINT("Setting virtq addresses:\n");
+    DPRINT("    vring_desc  at %p\n", vq->vring.desc);
+    DPRINT("    vring_used  at %p\n", vq->vring.used);
+    DPRINT("    vring_avail at %p\n", vq->vring.avail);
+
+    return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
+}
+
+static bool
+vu_is_vq_usable(VuDev *dev, VuVirtq *vq)
+{
+    if (unlikely(dev->broken)) {
+        return false;
+    }
+
+    if (likely(vq->vring.avail)) {
+        return true;
+    }
+
+    /*
+     * In corner cases, we might temporarily remove a memory region that
+     * mapped a ring. When removing a memory region we make sure to
+     * unmap any rings that would be impacted. Let's try to remap if we
+     * already succeeded mapping this ring once.
+     */
+    if (!vq->vra.desc_user_addr || !vq->vra.used_user_addr ||
+        !vq->vra.avail_user_addr) {
+        return false;
+    }
+    if (map_ring(dev, vq)) {
+        vu_panic(dev, "remapping queue on access");
+        return false;
+    }
+    return true;
+}
+
+static void
+unmap_rings(VuDev *dev, VuDevRegion *r)
+{
+    int i;
+
+    for (i = 0; i < dev->max_queues; i++) {
+        VuVirtq *vq = &dev->vq[i];
+        const uintptr_t desc = (uintptr_t)vq->vring.desc;
+        const uintptr_t used = (uintptr_t)vq->vring.used;
+        const uintptr_t avail = (uintptr_t)vq->vring.avail;
+
+        if (desc < r->mmap_addr || desc >= r->mmap_addr + r->size) {
+            continue;
+        }
+        if (used < r->mmap_addr || used >= r->mmap_addr + r->size) {
+            continue;
+        }
+        if (avail < r->mmap_addr || avail >= r->mmap_addr + r->size) {
+            continue;
+        }
+
+        DPRINT("Unmapping rings of queue %d\n", i);
+        vq->vring.desc = NULL;
+        vq->vring.used = NULL;
+        vq->vring.avail = NULL;
+    }
+}
+
+static size_t
+get_fd_hugepagesize(int fd)
+{
+#if defined(__linux__)
+    struct statfs fs;
+    int ret;
+
+    do {
+        ret = fstatfs(fd, &fs);
+    } while (ret != 0 && errno == EINTR);
+
+    if (!ret && (unsigned int)fs.f_type == HUGETLBFS_MAGIC) {
+        return fs.f_bsize;
+    }
+#endif
+    return 0;
+}
+
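The offset split below is pure ALIGN_DOWN arithmetic: fd_offset stays hugepage-aligned for mmap() while fd_offset + mmap_offset always recombines to the original offset. A small sketch with hypothetical values, assuming the usual ALIGN_DOWN macro:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ALIGN_DOWN(n, m) ((n) / (m) * (m))

    int main(void)
    {
        /* Hypothetical: an offset into a file backed by 2 MiB hugetlb pages. */
        const uint64_t hugepagesize = 2 * 1024 * 1024;
        const uint64_t msg_offset = 5 * 1024 * 1024 + 4096;

        const uint64_t fd_offset = ALIGN_DOWN(msg_offset, hugepagesize);
        const uint64_t mmap_offset = msg_offset - fd_offset;

        /* fd_offset = 0x400000 (aligned), mmap_offset = 0x101000 (remainder). */
        printf("fd_offset=0x%" PRIx64 " mmap_offset=0x%" PRIx64 "\n",
               fd_offset, mmap_offset);
        return 0;
    }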
+static void
+_vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd)
+{
+    const uint64_t start_gpa = msg_region->guest_phys_addr;
+    const uint64_t end_gpa = start_gpa + msg_region->memory_size;
+    int prot = PROT_READ | PROT_WRITE;
+    uint64_t mmap_offset, fd_offset;
+    size_t hugepagesize;
+    VuDevRegion *r;
+    void *mmap_addr;
+    int low = 0;
+    int high = dev->nregions - 1;
+    unsigned int idx;
+
+    DPRINT("Adding region %d\n", dev->nregions);
+    DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
+           msg_region->guest_phys_addr);
+    DPRINT("    memory_size:     0x%016"PRIx64"\n",
+           msg_region->memory_size);
+    DPRINT("    userspace_addr:  0x%016"PRIx64"\n",
+           msg_region->userspace_addr);
+    DPRINT("    old mmap_offset: 0x%016"PRIx64"\n",
+           msg_region->mmap_offset);
+
+    if (dev->postcopy_listening) {
+        /*
+         * In postcopy we're using PROT_NONE here to catch anyone
+         * accessing it before we userfault
+         */
+        prot = PROT_NONE;
+    }
+
+    /*
+     * We will add memory regions into the array sorted by GPA. Perform a
+     * binary search to locate the insertion point: it will be at the low
+     * index.
+     */
+    while (low <= high) {
+        unsigned int mid = low + (high - low) / 2;
+        VuDevRegion *cur = &dev->regions[mid];
+
+        /* Overlap of GPA addresses. */
+        if (start_gpa < cur->gpa + cur->size && cur->gpa < end_gpa) {
+            vu_panic(dev, "regions with overlapping guest physical addresses");
+            return;
+        }
+        if (start_gpa >= cur->gpa + cur->size) {
+            low = mid + 1;
+        }
+        if (start_gpa < cur->gpa) {
+            high = mid - 1;
+        }
+    }
+    idx = low;
+
+    /*
+     * Convert most of msg_region->mmap_offset to fd_offset. In almost all
+     * cases, this will leave us with mmap_offset == 0, mmap()'ing only
+     * what we really need. Only if a memory region were to partially cover
+     * hugetlb pages would we get mmap_offset != 0, which usually doesn't
+     * happen anymore (i.e., with modern QEMU).
+     *
+     * Note that mmap() with hugetlb would fail if the offset into the file
+     * is not aligned to the huge page size.
+     */
+    hugepagesize = get_fd_hugepagesize(fd);
+    if (hugepagesize) {
+        fd_offset = ALIGN_DOWN(msg_region->mmap_offset, hugepagesize);
+        mmap_offset = msg_region->mmap_offset - fd_offset;
+    } else {
+        fd_offset = msg_region->mmap_offset;
+        mmap_offset = 0;
+    }
+
+    DPRINT("    fd_offset:       0x%016"PRIx64"\n",
+           fd_offset);
+    DPRINT("    new mmap_offset: 0x%016"PRIx64"\n",
+           mmap_offset);
+
+    mmap_addr = mmap(0, msg_region->memory_size + mmap_offset,
+                     prot, MAP_SHARED | MAP_NORESERVE, fd, fd_offset);
+    if (mmap_addr == MAP_FAILED) {
+        vu_panic(dev, "region mmap error: %s", strerror(errno));
+        return;
+    }
+    DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
+           (uint64_t)(uintptr_t)mmap_addr);
+
+#if defined(__linux__)
+    /* Don't include all guest memory in a coredump. */
+    madvise(mmap_addr, msg_region->memory_size + mmap_offset,
+            MADV_DONTDUMP);
+#endif
+
+    /* Shift all affected entries by 1 to open a hole at idx. */
+    r = &dev->regions[idx];
+    memmove(r + 1, r, sizeof(VuDevRegion) * (dev->nregions - idx));
+    r->gpa = msg_region->guest_phys_addr;
+    r->size = msg_region->memory_size;
+    r->qva = msg_region->userspace_addr;
+    r->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
+    r->mmap_offset = mmap_offset;
+    dev->nregions++;
+
+    if (dev->postcopy_listening) {
+        /*
+         * Return the address to QEMU so that it can translate the ufd
+         * fault addresses back.
+         */
+        msg_region->userspace_addr = r->mmap_addr + r->mmap_offset;
+    }
+}
+
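The sorted insert above, and the matching removal in vu_rem_mem_reg() further down, are plain memmove() shifts around an index. A toy sketch of the open-hole/close-hole pattern (plain ints standing in for VuDevRegion; illustration only):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* Hypothetical sorted keys; insert 3 at idx 2, then remove it again. */
        int a[5] = { 1, 2, 4, 5, 0 };
        int n = 4;
        const int idx = 2;

        memmove(&a[idx + 1], &a[idx], sizeof(a[0]) * (n - idx)); /* open a hole */
        a[idx] = 3;
        n++;

        memmove(&a[idx], &a[idx + 1], sizeof(a[0]) * (n - idx - 1)); /* close it */
        n--;

        for (int i = 0; i < n; i++) {
            printf("%d ", a[i]); /* prints: 1 2 4 5 */
        }
        printf("\n");
        return 0;
    }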
 static void
 vmsg_close_fds(VhostUserMsg *vmsg)
 {
@@ -612,21 +857,6 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
     return false;
 }

-static bool
-map_ring(VuDev *dev, VuVirtq *vq)
-{
-    vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr);
-    vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr);
-    vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr);
-
-    DPRINT("Setting virtq addresses:\n");
-    DPRINT("    vring_desc  at %p\n", vq->vring.desc);
-    DPRINT("    vring_used  at %p\n", vq->vring.used);
-    DPRINT("    vring_avail at %p\n", vq->vring.avail);
-
-    return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
-}
-
 static bool
 generate_faults(VuDev *dev) {
     unsigned int i;
@@ -710,11 +940,7 @@ generate_faults(VuDev *dev) {

 static bool
 vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
-    int i;
-    bool track_ramblocks = dev->postcopy_listening;
     VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
-    VuDevRegion *dev_region = &dev->regions[dev->nregions];
-    void *mmap_addr;

     if (vmsg->fd_num != 1) {
         vmsg_close_fds(vmsg);
@@ -744,84 +970,24 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
      * we know all the postcopy client bases have been received, and we
      * should start generating faults.
      */
-    if (track_ramblocks &&
+    if (dev->postcopy_listening &&
         vmsg->size == sizeof(vmsg->payload.u64) &&
         vmsg->payload.u64 == 0) {
         (void)generate_faults(dev);
         return false;
     }

-    DPRINT("Adding region: %u\n", dev->nregions);
-    DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
-           msg_region->guest_phys_addr);
-    DPRINT("    memory_size:     0x%016"PRIx64"\n",
-           msg_region->memory_size);
-    DPRINT("    userspace_addr   0x%016"PRIx64"\n",
-           msg_region->userspace_addr);
-    DPRINT("    mmap_offset      0x%016"PRIx64"\n",
-           msg_region->mmap_offset);
-
-    dev_region->gpa = msg_region->guest_phys_addr;
-    dev_region->size = msg_region->memory_size;
-    dev_region->qva = msg_region->userspace_addr;
-    dev_region->mmap_offset = msg_region->mmap_offset;
-
-    /*
-     * We don't use offset argument of mmap() since the
-     * mapped address has to be page aligned, and we use huge
-     * pages.
-     */
-    if (track_ramblocks) {
-        /*
-         * In postcopy we're using PROT_NONE here to catch anyone
-         * accessing it before we userfault.
-         */
-        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
-                         PROT_NONE, MAP_SHARED | MAP_NORESERVE,
-                         vmsg->fds[0], 0);
-    } else {
-        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
-                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE,
-                         vmsg->fds[0], 0);
-    }
-
-    if (mmap_addr == MAP_FAILED) {
-        vu_panic(dev, "region mmap error: %s", strerror(errno));
-    } else {
-        dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
-        DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
-               dev_region->mmap_addr);
-    }
-
+    _vu_add_mem_reg(dev, msg_region, vmsg->fds[0]);
     close(vmsg->fds[0]);

-    if (track_ramblocks) {
-        /*
-         * Return the address to QEMU so that it can translate the ufd
-         * fault addresses back.
-         */
-        msg_region->userspace_addr = (uintptr_t)(mmap_addr +
-                                                 dev_region->mmap_offset);
-
+    if (dev->postcopy_listening) {
         /* Send the message back to qemu with the addresses filled in. */
         vmsg->fd_num = 0;
         DPRINT("Successfully added new region in postcopy\n");
-        dev->nregions++;
         return true;
-    } else {
-        for (i = 0; i < dev->max_queues; i++) {
-            if (dev->vq[i].vring.desc) {
-                if (map_ring(dev, &dev->vq[i])) {
-                    vu_panic(dev, "remapping queue %d for new memory region",
-                             i);
-                }
-            }
-        }
-
-        DPRINT("Successfully added new region\n");
-        dev->nregions++;
-        return false;
     }
+    DPRINT("Successfully added new region\n");
+    return false;
 }

 static inline bool reg_equal(VuDevRegion *vudev_reg,
@@ -839,8 +1005,8 @@ static inline bool reg_equal(VuDevRegion *vudev_reg,
 static bool
 vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
     VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
-    unsigned int i;
-    bool found = false;
+    unsigned int idx;
+    VuDevRegion *r;

     if (vmsg->fd_num > 1) {
         vmsg_close_fds(vmsg);
@@ -867,35 +1033,31 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
     DPRINT("    mmap_offset      0x%016"PRIx64"\n",
     DPRINT("    mmap_offset      0x%016"PRIx64"\n",
            msg_region->mmap_offset);
            msg_region->mmap_offset);
 
 
-    for (i = 0; i < dev->nregions; i++) {
-        if (reg_equal(&dev->regions[i], msg_region)) {
-            VuDevRegion *r = &dev->regions[i];
-            void *ma = (void *) (uintptr_t) r->mmap_addr;
-
-            if (ma) {
-                munmap(ma, r->size + r->mmap_offset);
-            }
-
-            /*
-             * Shift all affected entries by 1 to close the hole at index i and
-             * zero out the last entry.
-             */
-            memmove(dev->regions + i, dev->regions + i + 1,
-                    sizeof(VuDevRegion) * (dev->nregions - i - 1));
-            memset(dev->regions + dev->nregions - 1, 0, sizeof(VuDevRegion));
-            DPRINT("Successfully removed a region\n");
-            dev->nregions--;
-            i--;
+    r = vu_gpa_to_mem_region(dev, msg_region->guest_phys_addr);
+    if (!r || !reg_equal(r, msg_region)) {
+        vmsg_close_fds(vmsg);
+        vu_panic(dev, "Specified region not found\n");
+        return false;
+    }

-            found = true;
+    /*
+     * There might be valid cases where we temporarily remove memory regions
+     * to re-add them later, or remove memory regions and don't use the rings
+     * anymore before we set the ring addresses and restart the device.
+     *
+     * Unmap all affected rings, remapping them on demand later. This should
+     * be a corner case.
+     */
+    unmap_rings(dev, r);

-            /* Continue the search for eventual duplicates. */
-        }
-    }
+    munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset);

-    if (!found) {
-        vu_panic(dev, "Specified region not found\n");
-    }
+    idx = r - dev->regions;
+    assert(idx < dev->nregions);
+    /* Shift all affected entries by 1 to close the hole. */
+    memmove(r, r + 1, sizeof(VuDevRegion) * (dev->nregions - idx - 1));
+    DPRINT("Successfully removed a region\n");
+    dev->nregions--;

     vmsg_close_fds(vmsg);

@@ -920,140 +1082,42 @@ vu_get_shared_object(VuDev *dev, VhostUserMsg *vmsg)
     return true;
 }

-static bool
-vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
-{
-    unsigned int i;
-    VhostUserMemory m = vmsg->payload.memory, *memory = &m;
-    dev->nregions = memory->nregions;
-
-    DPRINT("Nregions: %u\n", memory->nregions);
-    for (i = 0; i < dev->nregions; i++) {
-        void *mmap_addr;
-        VhostUserMemoryRegion *msg_region = &memory->regions[i];
-        VuDevRegion *dev_region = &dev->regions[i];
-
-        DPRINT("Region %d\n", i);
-        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
-               msg_region->guest_phys_addr);
-        DPRINT("    memory_size:     0x%016"PRIx64"\n",
-               msg_region->memory_size);
-        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
-               msg_region->userspace_addr);
-        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
-               msg_region->mmap_offset);
-
-        dev_region->gpa = msg_region->guest_phys_addr;
-        dev_region->size = msg_region->memory_size;
-        dev_region->qva = msg_region->userspace_addr;
-        dev_region->mmap_offset = msg_region->mmap_offset;
-
-        /* We don't use offset argument of mmap() since the
-         * mapped address has to be page aligned, and we use huge
-         * pages.
-         * In postcopy we're using PROT_NONE here to catch anyone
-         * accessing it before we userfault
-         */
-        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
-                         PROT_NONE, MAP_SHARED | MAP_NORESERVE,
-                         vmsg->fds[i], 0);
-
-        if (mmap_addr == MAP_FAILED) {
-            vu_panic(dev, "region mmap error: %s", strerror(errno));
-        } else {
-            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
-            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
-                   dev_region->mmap_addr);
-        }
-
-        /* Return the address to QEMU so that it can translate the ufd
-         * fault addresses back.
-         */
-        msg_region->userspace_addr = (uintptr_t)(mmap_addr +
-                                                 dev_region->mmap_offset);
-        close(vmsg->fds[i]);
-    }
-
-    /* Send the message back to qemu with the addresses filled in */
-    vmsg->fd_num = 0;
-    if (!vu_send_reply(dev, dev->sock, vmsg)) {
-        vu_panic(dev, "failed to respond to set-mem-table for postcopy");
-        return false;
-    }
-
-    /* Wait for QEMU to confirm that it's registered the handler for the
-     * faults.
-     */
-    if (!dev->read_msg(dev, dev->sock, vmsg) ||
-        vmsg->size != sizeof(vmsg->payload.u64) ||
-        vmsg->payload.u64 != 0) {
-        vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
-        return false;
-    }
-
-    /* OK, now we can go and register the memory and generate faults */
-    (void)generate_faults(dev);
-
-    return false;
-}
-
 static bool
 vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
 {
-    unsigned int i;
     VhostUserMemory m = vmsg->payload.memory, *memory = &m;
+    unsigned int i;

-    for (i = 0; i < dev->nregions; i++) {
-        VuDevRegion *r = &dev->regions[i];
-        void *ma = (void *) (uintptr_t) r->mmap_addr;
+    vu_remove_all_mem_regs(dev);

-        if (ma) {
-            munmap(ma, r->size + r->mmap_offset);
-        }
+    DPRINT("Nregions: %u\n", memory->nregions);
+    for (i = 0; i < memory->nregions; i++) {
+        _vu_add_mem_reg(dev, &memory->regions[i], vmsg->fds[i]);
+        close(vmsg->fds[i]);
     }
     }
 
 
     if (dev->postcopy_listening) {
     if (dev->postcopy_listening) {
-        return vu_set_mem_table_exec_postcopy(dev, vmsg);
-    }
-
-    DPRINT("Nregions: %u\n", memory->nregions);
-    for (i = 0; i < dev->nregions; i++) {
-        void *mmap_addr;
-        VhostUserMemoryRegion *msg_region = &memory->regions[i];
-        VuDevRegion *dev_region = &dev->regions[i];
+        /* Send the message back to qemu with the addresses filled in */
+        vmsg->fd_num = 0;
+        if (!vu_send_reply(dev, dev->sock, vmsg)) {
+            vu_panic(dev, "failed to respond to set-mem-table for postcopy");
+            return false;
+        }
 
 
-        DPRINT("Region %d\n", i);
-        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
-               msg_region->guest_phys_addr);
-        DPRINT("    memory_size:     0x%016"PRIx64"\n",
-               msg_region->memory_size);
-        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
-               msg_region->userspace_addr);
-        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
-               msg_region->mmap_offset);
-
-        dev_region->gpa = msg_region->guest_phys_addr;
-        dev_region->size = msg_region->memory_size;
-        dev_region->qva = msg_region->userspace_addr;
-        dev_region->mmap_offset = msg_region->mmap_offset;
-
-        /* We don't use offset argument of mmap() since the
-         * mapped address has to be page aligned, and we use huge
-         * pages.  */
-        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
-                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE,
-                         vmsg->fds[i], 0);
-
-        if (mmap_addr == MAP_FAILED) {
-            vu_panic(dev, "region mmap error: %s", strerror(errno));
-        } else {
-            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
-            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
-                   dev_region->mmap_addr);
+        /*
+         * Wait for QEMU to confirm that it's registered the handler for the
+         * faults.
+         */
+        if (!dev->read_msg(dev, dev->sock, vmsg) ||
+            vmsg->size != sizeof(vmsg->payload.u64) ||
+            vmsg->payload.u64 != 0) {
+            vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
+            return false;
         }

-        close(vmsg->fds[i]);
+        /* OK, now we can go and register the memory and generate faults */
+        (void)generate_faults(dev);
+        return false;
     }

     for (i = 0; i < dev->max_queues; i++) {
@@ -2112,14 +2176,7 @@ vu_deinit(VuDev *dev)
 {
     unsigned int i;

-    for (i = 0; i < dev->nregions; i++) {
-        VuDevRegion *r = &dev->regions[i];
-        void *m = (void *) (uintptr_t) r->mmap_addr;
-        if (m != MAP_FAILED) {
-            munmap(m, r->size + r->mmap_offset);
-        }
-    }
-    dev->nregions = 0;
+    vu_remove_all_mem_regs(dev);

     for (i = 0; i < dev->max_queues; i++) {
         VuVirtq *vq = &dev->vq[i];
@@ -2171,6 +2228,8 @@ vu_deinit(VuDev *dev)

     free(dev->vq);
     dev->vq = NULL;
+    free(dev->regions);
+    dev->regions = NULL;
 }

 bool
@@ -2205,9 +2264,17 @@ vu_init(VuDev *dev,
     dev->backend_fd = -1;
     dev->max_queues = max_queues;

+    dev->regions = malloc(VHOST_USER_MAX_RAM_SLOTS * sizeof(dev->regions[0]));
+    if (!dev->regions) {
+        DPRINT("%s: failed to malloc mem regions\n", __func__);
+        return false;
+    }
+
     dev->vq = malloc(max_queues * sizeof(dev->vq[0]));
     if (!dev->vq) {
         DPRINT("%s: failed to malloc virtqueues\n", __func__);
+        free(dev->regions);
+        dev->regions = NULL;
         return false;
     }

@@ -2374,8 +2441,7 @@ vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
     idx = vq->last_avail_idx;

     total_bufs = in_total = out_total = 0;
-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
+    if (!vu_is_vq_usable(dev, vq)) {
         goto done;
     }

@@ -2490,8 +2556,7 @@ vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
 bool
 vu_queue_empty(VuDev *dev, VuVirtq *vq)
 {
-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
+    if (!vu_is_vq_usable(dev, vq)) {
         return true;
     }

@@ -2530,8 +2595,7 @@ vring_notify(VuDev *dev, VuVirtq *vq)

 static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync)
 {
-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
+    if (!vu_is_vq_usable(dev, vq)) {
         return;
     }

@@ -2856,8 +2920,7 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
     unsigned int head;
     VuVirtqElement *elem;

-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
+    if (!vu_is_vq_usable(dev, vq)) {
         return NULL;
     }

@@ -3014,8 +3077,7 @@ vu_queue_fill(VuDev *dev, VuVirtq *vq,
 {
     struct vring_used_elem uelem;

-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
+    if (!vu_is_vq_usable(dev, vq)) {
         return;
     }

@@ -3044,8 +3106,7 @@ vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
 {
     uint16_t old, new;

-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
+    if (!vu_is_vq_usable(dev, vq)) {
         return;
     }

+ 6 - 4
subprojects/libvhost-user/libvhost-user.h

@@ -31,10 +31,12 @@
 #define VHOST_MEMORY_BASELINE_NREGIONS 8

 /*
- * Set a reasonable maximum number of ram slots, which will be supported by
- * any architecture.
+ * vhost in the kernel usually supports 509 mem slots. 509 used to be the
+ * KVM limit: KVM supported 512 slots, but 3 were reserved for internal
+ * purposes. This limit is sufficient to support many DIMMs and virtio-mem
+ * in "dynamic-memslots" mode.
  */
  */
+#define VHOST_USER_MAX_RAM_SLOTS 509
 
 
 #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
 #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
 
 
@@ -398,7 +400,7 @@ typedef struct VuDevInflightInfo {
 struct VuDev {
 struct VuDev {
     int sock;
     uint32_t nregions;
+    VuDevRegion *regions;
     VuVirtq *vq;
     VuVirtq *vq;
     VuDevInflightInfo inflight_info;
     int log_call_fd;
+ 1 - 1
tests/qtest/virtio-iommu-test.c

@@ -34,7 +34,7 @@ static void pci_config(void *obj, void *data, QGuestAllocator *t_alloc)
     uint8_t bypass = qvirtio_config_readb(dev, 36);
     uint8_t bypass = qvirtio_config_readb(dev, 36);

     g_assert_cmpint(input_range_start, ==, 0);
+    g_assert_cmphex(input_range_end, >=, UINT32_MAX);
     g_assert_cmpint(domain_range_start, ==, 0);
     g_assert_cmpint(domain_range_start, ==, 0);
     g_assert_cmpint(domain_range_end, ==, UINT32_MAX);
     g_assert_cmpint(bypass, ==, 1);
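The relaxed assertion reflects that the end of the input range is now derived from the configurable address width (the aw-bits property added elsewhere in this series) rather than being a fixed 64-bit span. A sketch of the assumed relationship, using the series' new defaults of 39 bits (q35) and 48 bits (arm virt):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Assumption: input_range.end = 2^aw_bits - 1 for address width aw_bits. */
    static uint64_t input_range_end(unsigned int aw_bits)
    {
        return aw_bits >= 64 ? UINT64_MAX : (UINT64_C(1) << aw_bits) - 1;
    }

    int main(void)
    {
        /* Both defaults satisfy the test's new lower bound of UINT32_MAX. */
        printf("aw=39 -> 0x%016" PRIx64 "\n", input_range_end(39));
        printf("aw=48 -> 0x%016" PRIx64 "\n", input_range_end(48));
        return 0;
    }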