
Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging

* target/i386/kvm: support for reading RAPL MSRs using a helper program
* hpet: emulation improvements

# -----BEGIN PGP SIGNATURE-----
#
# iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmaelL4UHHBib256aW5p
# QHJlZGhhdC5jb20ACgkQv/vSX3jHroMXoQf+K77lNlHLETSgeeP3dr7yZPOmXjjN
# qFY/18jiyLw7MK1rZC09fF+n9SoaTH8JDKupt0z9M1R10HKHLIO04f8zDE+dOxaE
# Rou3yKnlTgFPGSoPPFr1n1JJfxtYlLZRoUzaAcHUaa4W7JR/OHJX90n1Rb9MXeDk
# jV6P0v1FWtIDdM6ERm9qBGoQdYhj6Ra2T4/NZKJFXwIhKEkxgu4yO7WXv8l0dxQz
# jE4fKotqAvrkYW1EsiVZm30lw/19duhvGiYeQXoYhk8KKXXjAbJMblLITSNWsCio
# 3l6Uud/lOxekkJDAq5nH3H9hCBm0WwvwL+0vRf3Mkr+/xRGvrhtmUdp8NQ==
# =00mB
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 23 Jul 2024 03:19:58 AM AEST
# gpg:                using RSA key F13338574B662389866C7682BFFBD25F78C7AE83
# gpg:                issuer "pbonzini@redhat.com"
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>" [full]
# gpg:                 aka "Paolo Bonzini <pbonzini@redhat.com>" [full]

* tag 'for-upstream' of https://gitlab.com/bonzini/qemu:
  hpet: avoid timer storms on periodic timers
  hpet: store full 64-bit target value of the counter
  hpet: accept 64-bit reads and writes
  hpet: place read-only bits directly in "new_val"
  hpet: remove unnecessary variable "index"
  hpet: ignore high bits of comparator in 32-bit mode
  hpet: fix and cleanup persistence of interrupt status
  Add support for RAPL MSRs in KVM/Qemu
  tools: build qemu-vmsr-helper
  qio: add support for SO_PEERCRED for socket channel
  target/i386: do not crash if microvm guest uses SGX CPUID leaves

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Richard Henderson 1 year ago
parent
commit
43f59bf765

+ 1 - 0
MAINTAINERS

@@ -140,6 +140,7 @@ F: docs/system/target-i386*
 F: target/i386/*.[ch]
 F: target/i386/Kconfig
 F: target/i386/meson.build
+F: tools/i386/
 
 Guest CPU cores (TCG)
 ---------------------

+ 27 - 0
accel/kvm/kvm-all.c

@@ -3776,6 +3776,21 @@ static void kvm_set_device(Object *obj,
     s->device = g_strdup(value);
 }
 
+static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
+{
+    KVMState *s = KVM_STATE(obj);
+    s->msr_energy.enable = value;
+}
+
+static void kvm_set_kvm_rapl_socket_path(Object *obj,
+                                         const char *str,
+                                         Error **errp)
+{
+    KVMState *s = KVM_STATE(obj);
+    g_free(s->msr_energy.socket_path);
+    s->msr_energy.socket_path = g_strdup(str);
+}
+
 static void kvm_accel_instance_init(Object *obj)
 {
     KVMState *s = KVM_STATE(obj);
@@ -3795,6 +3810,7 @@ static void kvm_accel_instance_init(Object *obj)
     s->xen_gnttab_max_frames = 64;
     s->xen_evtchn_max_pirq = 256;
     s->device = NULL;
+    s->msr_energy.enable = false;
 }
 
 /**
@@ -3839,6 +3855,17 @@ static void kvm_accel_class_init(ObjectClass *oc, void *data)
     object_class_property_set_description(oc, "device",
         "Path to the device node to use (default: /dev/kvm)");
 
+    object_class_property_add_bool(oc, "rapl",
+                                   NULL,
+                                   kvm_set_kvm_rapl);
+    object_class_property_set_description(oc, "rapl",
+        "Allow energy related MSRs for RAPL interface in Guest");
+
+    object_class_property_add_str(oc, "rapl-helper-socket", NULL,
+                                  kvm_set_kvm_rapl_socket_path);
+    object_class_property_set_description(oc, "rapl-helper-socket",
+        "Socket Path for comminucating with the Virtual MSR helper daemon");
+
     kvm_arch_accel_class_init(oc);
 }
 

+ 15 - 0
contrib/systemd/qemu-vmsr-helper.service

@@ -0,0 +1,15 @@
+[Unit]
+Description=Virtual RAPL MSR Daemon for QEMU
+
+[Service]
+WorkingDirectory=/tmp
+Type=simple
+ExecStart=/usr/bin/qemu-vmsr-helper
+PrivateTmp=yes
+ProtectSystem=strict
+ReadWritePaths=/var/run
+RestrictAddressFamilies=AF_UNIX
+Restart=always
+RestartSec=0
+
+[Install]

+ 9 - 0
contrib/systemd/qemu-vmsr-helper.socket

@@ -0,0 +1,9 @@
+[Unit]
+Description=Virtual RAPL MSR helper for QEMU
+
+[Socket]
+ListenStream=/run/qemu-vmsr-helper.sock
+SocketMode=0600
+
+[Install]
+WantedBy=multi-user.target

+ 1 - 0
docs/specs/index.rst

@@ -34,3 +34,4 @@ guest hardware that is specific to QEMU.
    virt-ctlr
    vmcoreinfo
    vmgenid
+   rapl-msr

+ 155 - 0
docs/specs/rapl-msr.rst

@@ -0,0 +1,155 @@
+================
+RAPL MSR support
+================
+
+The RAPL interface (Running Average Power Limit) advertises the accumulated
+energy consumption of various power domains (e.g. CPU packages, DRAM, etc.).
+
+The consumption is reported via MSRs (model specific registers) like
+MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are 64-bit
+registers that report the accumulated energy consumption in micro Joules.
+
+Thanks to the MSR filtering patch [#a]_, not all MSRs have to be handled by
+KVM; some of them can now be handled by userspace (QEMU). A list of MSRs is
+given to KVM at VM init time so that a userspace callback is put in place for
+them. The design of this patch relies only on this mechanism for handling
+these MSRs between guest and host.
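To make the callback shape concrete, here is a minimal sketch of a userspace
read handler of the kind this series adds in target/i386/kvm/kvm.c (compare
kvm_rdmsr_pkg_energy_status() further down). The registration step is only
hinted at in a comment, because the exact helper used to program KVM's MSR
filter is not shown in this excerpt and is therefore an assumption here.

.. code:: C

    /* Sketch: userspace handler invoked when the guest executes RDMSR on a
     * filtered MSR; returning true means "handled, do not inject #GP". */
    static bool rdmsr_pkg_energy_status(X86CPU *cpu, uint32_t msr, uint64_t *val)
    {
        CPUState *cs = CPU(cpu);

        *val = cs->kvm_state->msr_energy.msr_value[cs->cpu_index];
        return true;
    }

    /* Registration (assumed helper name, for illustration only): tell KVM to
     * exit to userspace for reads of this MSR and route them to the handler
     * above, e.g.
     * kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS, rdmsr_pkg_energy_status, NULL);
     */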
+
+At the moment the following MSRs are involved:
+
+.. code:: C
+
+    #define MSR_RAPL_POWER_UNIT             0x00000606
+    #define MSR_PKG_POWER_LIMIT             0x00000610
+    #define MSR_PKG_ENERGY_STATUS           0x00000611
+    #define MSR_PKG_POWER_INFO              0x00000614
+
+The ``*_POWER_UNIT``, ``*_POWER_LIMIT`` and ``*_POWER_INFO`` MSRs are part of
+the RAPL spec: they specify the power limit of the package, provide the range
+of its parameters (min power, max power, ...), and carry the unit multiplier
+needed to convert the raw energy counter into Joules. Those MSRs are populated
+once at startup by reading the host CPU MSRs and are given back to the guest
+1:1 when requested.
+
+MSR_PKG_ENERGY_STATUS is a counter; it represents the total amount of energy
+consumed since the last time the register was cleared. Multiplying it by the
+unit provided above yields the energy in micro-joules. The counter is always
+increasing, and it increases faster or slower depending on the consumption of
+the package. It is expected to overflow at some point.
+
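As a worked example of the unit conversion (the bit layout of
MSR_RAPL_POWER_UNIT follows Intel's RAPL documentation; all numbers here are
illustrative, not measured):

.. code:: C

    /* Worked example: convert a raw MSR_PKG_ENERGY_STATUS delta to Joules,
     * assuming the Energy Status Units live in bits 12:8 of
     * MSR_RAPL_POWER_UNIT (typical Intel layout). */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t power_unit = 0x000a0e03;           /* sample MSR_RAPL_POWER_UNIT */
        unsigned esu = (power_unit >> 8) & 0x1f;    /* energy status units: 14 */
        double joules_per_count = 1.0 / (1ULL << esu);  /* ~61.04 micro-joules */

        uint64_t before = 1000000, after = 2000000; /* two counter snapshots */
        double joules = (after - before) * joules_per_count;

        printf("%.2f J consumed between snapshots\n", joules); /* ~61.04 J */
        return 0;
    }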
+Every core belonging to the same package reads the same value from
+MSR_PKG_ENERGY_STATUS (i.e. "rdmsr 0x611"): the value represents the energy
+of the whole package. A core that belongs to PKG-0 cannot read the value of
+PKG-1 and vice versa.
+
+High level implementation
+-------------------------
+
+In order to update the value of the virtual MSR, a QEMU thread is created.
+The thread is basically an infinite loop that does the following (a simplified
+sketch of the loop appears after this list):
+
+1. Snapshot of the time metrics of all QEMU threads (Time spent scheduled in
+   Userspace and System)
+
+2. Snapshot of the actual MSR_PKG_ENERGY_STATUS counter of all packages where
+   the QEMU threads are running on.
+
+3. Sleep for 1 second - During this pause the vcpu and other non-vcpu threads
+   will do what they have to do and so the energy counter will increase.
+
+4. Repeat 2. and 3. and calculate the delta of every metric representing the
+   time spent scheduled for each QEMU thread *and* the energy spent by the
+   packages during the pause.
+
+5. Filter the vcpu threads and the non-vcpu threads.
+
+6. Retrieve the topology of the Virtual Machine. This helps identify which
+   vCPU is running on which virtual package.
+
+7. The total energy spent by the non-vcpu threads is divided by the number
+   of vcpu threads so that each vcpu thread will get an equal part of the
+   energy spent by the QEMU workers.
+
+8. Calculate the ratio of energy spent per vcpu threads.
+
+9. Calculate the energy for each virtual package.
+
+10. The virtual MSRs are updated for each virtual package. Each vCPU that
+    belongs to the same package will return the same value when accessing
+    the MSR.
+
+11. Loop back to 1.
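The sketch referenced above: a runnable toy model of steps 1-11, using
synthetic numbers in place of /proc accounting and real MSR reads (all names
and values below are illustrative, not taken from the patch):

.. code:: C

    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    static uint64_t read_pkg_energy(int iteration)   /* stand-in for rdmsr 0x611 */
    {
        return 100000ULL + iteration * 50000ULL;     /* monotonic counter */
    }

    static unsigned thread_ticks(int iteration)      /* stand-in for /proc stat */
    {
        return iteration * 25;                       /* 25 ticks per second */
    }

    int main(void)
    {
        unsigned maxticks = 400;                     /* 4 cores * 100 ticks/s */

        for (int it = 0; it < 3; it++) {
            uint64_t e_start = read_pkg_energy(it);  /* steps 1-2: snapshots */
            unsigned t_start = thread_ticks(it);

            sleep(1);                                /* step 3: let guest run */

            uint64_t e_delta = read_pkg_energy(it + 1) - e_start;  /* step 4 */
            unsigned t_delta = thread_ticks(it + 1) - t_start;

            /* steps 5-10: the thread's share of the package energy */
            double share = (double)t_delta / maxticks;
            printf("thread consumed ~%.0f energy units this second\n",
                   share * e_delta);
        }                                            /* step 11: loop back */
        return 0;
    }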
+
+Ratio calculation
+-----------------
+
+In Linux, a process has an execution time associated with it. The scheduler
+divides time into clock ticks. The number of clock ticks per second can be
+obtained with sysconf(). A typical value of clock ticks per second is 100, so
+a core can run a process for at most 100 ticks per second. If a package has 4
+cores, at most 400 ticks can be scheduled on all the cores of the package over
+a period of 1 second.
+
+/proc/[pid]/stat [#b]_ is a procfs file that gives the execution time of the
+process with process ID [pid]. It reports the number of ticks the process has
+been scheduled in userspace (utime) and in kernel space (stime).
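A minimal sketch of reading those two fields, assuming the proc(5) layout
where utime and stime are fields 14 and 15 of /proc/<pid>/task/<tid>/stat
(Linux only; error handling kept short):

.. code:: C

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static int read_thread_ticks(int pid, int tid,
                                 unsigned long long *utime,
                                 unsigned long long *stime)
    {
        char path[64], line[1024];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/%d/task/%d/stat", pid, tid);
        f = fopen(path, "r");
        if (!f || !fgets(line, sizeof(line), f)) {
            if (f) {
                fclose(f);
            }
            return -1;
        }
        fclose(f);

        /* comm (field 2) is in parentheses and may contain spaces, so parse
         * from the last ')'; utime and stime are fields 14 and 15. */
        char *p = strrchr(line, ')');
        if (!p) {
            return -1;
        }
        return sscanf(p + 2,
                      "%*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %llu %llu",
                      utime, stime) == 2 ? 0 : -1;
    }

    int main(void)
    {
        unsigned long long ut, st;
        int self = getpid();

        if (read_thread_ticks(self, self, &ut, &st) == 0) {
            printf("utime=%llu stime=%llu ticks\n", ut, st);
        }
        return 0;
    }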
+
+By reading those metrics for a thread, one can calculate the ratio of time the
+package has spent executing the thread.
+
+Example:
+
+A 4-core package can schedule a maximum of 400 ticks per second, with 100
+ticks per second per core. If a thread was scheduled for 100 ticks during one
+second on this package, it used 1/4 of the whole package. The energy spent by
+the thread on this package during that second is therefore 1/4 of the total
+energy spent by the package.
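The same example in formula form (notation introduced here for clarity, not
taken from the patch):

.. math::

    \mathrm{ratio} = \frac{\mathrm{thread\ ticks}}{\mathrm{cores} \times \mathrm{ticks/s}}
                   = \frac{100}{4 \times 100} = \frac{1}{4},
    \qquad
    E_{\mathrm{thread}} = \mathrm{ratio} \times E_{\mathrm{package}}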
+
+Usage
+-----
+
+Currently this feature only works on an Intel CPU that has the RAPL driver
+mounted and available in sysfs; if not, QEMU fails at start-up.
+
+The feature is activated with
+``-accel kvm,rapl=true,rapl-helper-socket=/path/sock.sock``.
+
+It is important that the socket path is the same as the one
+:program:`qemu-vmsr-helper` is listening to.
+
+qemu-vmsr-helper
+----------------
+
+qemu-vmsr-helper works very much like qemu-pr-helper. Instead of making
+persistent reservations, qemu-vmsr-helper exists to work around CVE-2020-8694,
+which removed unprivileged user access to the RAPL MSR attributes.
+
+A socket communication is established between the QEMU processes that have the
+RAPL MSR support activated and the qemu-vmsr-helper. A systemd service and
+socket activation unit are provided in contrib/systemd/qemu-vmsr-helper.(service/socket).
+
+The systemd socket uses mode 0600, like contrib/systemd/qemu-pr-helper.socket.
+The socket can be passed via SCM_RIGHTS by libvirt, or its permissions can be
+changed (e.g. 0660 and root:kvm on a Debian system). Libvirt could also start
+a separate helper if needed. All in all, the policy is left to the user.
+
+See the qemu-pr-helper documentation or manpage for further details.
+
+Current Limitations
+-------------------
+
+- Works only on Intel host CPUs because AMD CPUs are using different MSR
+  addresses.
+
+- Only the Package Power-Plane (MSR_PKG_ENERGY_STATUS) is reported at the
+  moment.
+
+References
+----------
+
+.. [#a] https://patchwork.kernel.org/project/kvm/patch/20200916202951.23760-7-graf@amazon.com/
+.. [#b] https://man7.org/linux/man-pages/man5/proc.5.html

+ 1 - 0
docs/tools/index.rst

@@ -16,3 +16,4 @@ command line utilities and other standalone programs.
    qemu-pr-helper
    qemu-trace-stap
    virtfs-proxy-helper
+   qemu-vmsr-helper

+ 89 - 0
docs/tools/qemu-vmsr-helper.rst

@@ -0,0 +1,89 @@
+==================================
+QEMU virtual RAPL MSR helper
+==================================
+
+Synopsis
+--------
+
+**qemu-vmsr-helper** [*OPTION*]
+
+Description
+-----------
+
+Implements the virtual RAPL MSR helper for QEMU.
+
+Accessing the RAPL (Running Average Power Limit) MSR enables the RAPL powercap
+driver to advertise and monitor the power consumption or accumulated energy
+consumption of different power domains, such as CPU packages, DRAM, and other
+components when available.
+
+However, those registers are only accessible with privileged access
+(CAP_SYS_RAWIO). QEMU can use an external helper to access those privileged
+registers.
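For background, this is the kind of privileged read the helper performs on
QEMU's behalf. The sketch below goes through the Linux ``msr`` driver
(``/dev/cpu/<n>/msr``, 8-byte read at the offset of the MSR index); it
illustrates the general technique only and is not the helper's actual
implementation:

.. code:: C

    /* Sketch only: read MSR_PKG_ENERGY_STATUS (0x611) for a given CPU via the
     * Linux msr driver. Requires root or CAP_SYS_RAWIO, which is exactly why
     * a separate helper process is used. */
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    static int read_msr(unsigned cpu, uint32_t index, uint64_t *val)
    {
        char path[64];
        int fd;

        snprintf(path, sizeof(path), "/dev/cpu/%u/msr", cpu);
        fd = open(path, O_RDONLY);
        if (fd < 0) {
            return -1;
        }
        if (pread(fd, val, sizeof(*val), index) != sizeof(*val)) {
            close(fd);
            return -1;
        }
        close(fd);
        return 0;
    }

    int main(void)
    {
        uint64_t energy;

        if (read_msr(0, 0x611, &energy) == 0) {
            printf("raw MSR_PKG_ENERGY_STATUS: %#llx\n",
                   (unsigned long long)energy);
        }
        return 0;
    }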
+
+:program:`qemu-vmsr-helper` is that external helper; it creates a listener
+socket which will accept incoming connections for communication with QEMU.
+
+If you want to run VMs in a setup like this, this helper should be started as a
+system service, and you should read the QEMU manual section on "RAPL MSR
+support" to find out how to configure QEMU to connect to the socket created by
+:program:`qemu-vmsr-helper`.
+
+After connecting to the socket, :program:`qemu-vmsr-helper` can
+optionally drop root privileges, except for those capabilities that
+are needed for its operation.
+
+:program:`qemu-vmsr-helper` can also use the systemd socket activation
+protocol.  In this case, the systemd socket unit should specify a
+Unix stream socket, like this::
+
+    [Socket]
+    ListenStream=/var/run/qemu-vmsr-helper.sock
+
+Options
+-------
+
+.. program:: qemu-vmsr-helper
+
+.. option:: -d, --daemon
+
+  run in the background (and create a PID file)
+
+.. option:: -q, --quiet
+
+  decrease verbosity
+
+.. option:: -v, --verbose
+
+  increase verbosity
+
+.. option:: -f, --pidfile=PATH
+
+  PID file when running as a daemon. By default the PID file
+  is created in the system runtime state directory, for example
+  :file:`/var/run/qemu-vmsr-helper.pid`.
+
+.. option:: -k, --socket=PATH
+
+  path to the socket. By default the socket is created in
+  the system runtime state directory, for example
+  :file:`/var/run/qemu-vmsr-helper.sock`.
+
+.. option:: -T, --trace [[enable=]PATTERN][,events=FILE][,file=FILE]
+
+  .. include:: ../qemu-option-trace.rst.inc
+
+.. option:: -u, --user=USER
+
+  user to drop privileges to
+
+.. option:: -g, --group=GROUP
+
+  group to drop privileges to
+
+.. option:: -h, --help
+
+  Display a help message and exit.
+
+.. option:: -V, --version
+
+  Display version information and exit.

+ 4 - 2
hw/i386/sgx.c

@@ -268,10 +268,12 @@ void hmp_info_sgx(Monitor *mon, const QDict *qdict)
 
 
 bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size)
 {
-    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+    PCMachineState *pcms =
+        (PCMachineState *)object_dynamic_cast(qdev_get_machine(),
+                                              TYPE_PC_MACHINE);
     SGXEPCDevice *epc;
 
-    if (pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) {
+    if (!pcms || pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) {
         return true;
     }
 

+ 146 - 177
hw/timer/hpet.c

@@ -54,10 +54,12 @@ typedef struct HPETTimer {  /* timers */
     uint64_t cmp;           /* comparator */
     uint64_t cmp;           /* comparator */
     uint64_t fsb;           /* FSB route */
     uint64_t fsb;           /* FSB route */
     /* Hidden register state */
     /* Hidden register state */
+    uint64_t cmp64;         /* comparator (extended to counter width) */
     uint64_t period;        /* Last value written to comparator */
     uint64_t period;        /* Last value written to comparator */
     uint8_t wrap_flag;      /* timer pop will indicate wrap for one-shot 32-bit
     uint8_t wrap_flag;      /* timer pop will indicate wrap for one-shot 32-bit
                              * mode. Next pop will be actual timer expiration.
                              * mode. Next pop will be actual timer expiration.
                              */
                              */
+    uint64_t last;          /* last value armed, to avoid timer storms */
 } HPETTimer;
 } HPETTimer;
 
 
 struct HPETState {
 struct HPETState {
@@ -115,11 +117,6 @@ static uint32_t timer_enabled(HPETTimer *t)
 }
 }
 
 
 static uint32_t hpet_time_after(uint64_t a, uint64_t b)
 static uint32_t hpet_time_after(uint64_t a, uint64_t b)
-{
-    return ((int32_t)(b - a) < 0);
-}
-
-static uint32_t hpet_time_after64(uint64_t a, uint64_t b)
 {
 {
     return ((int64_t)(b - a) < 0);
     return ((int64_t)(b - a) < 0);
 }
 }
@@ -156,29 +153,34 @@ static uint64_t hpet_get_ticks(HPETState *s)
     return ns_to_ticks(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->hpet_offset);
     return ns_to_ticks(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->hpet_offset);
 }
 }
 
 
+static uint64_t hpet_get_ns(HPETState *s, uint64_t tick)
+{
+    return ticks_to_ns(tick) - s->hpet_offset;
+}
+
 /*
 /*
- * calculate diff between comparator value and current ticks
+ * calculate next value of the general counter that matches the
+ * target (either entirely, or the low 32-bit only depending on
+ * the timer mode).
  */
  */
-static inline uint64_t hpet_calculate_diff(HPETTimer *t, uint64_t current)
+static uint64_t hpet_calculate_cmp64(HPETTimer *t, uint64_t cur_tick, uint64_t target)
 {
 {
-
     if (t->config & HPET_TN_32BIT) {
     if (t->config & HPET_TN_32BIT) {
-        uint32_t diff, cmp;
-
-        cmp = (uint32_t)t->cmp;
-        diff = cmp - (uint32_t)current;
-        diff = (int32_t)diff > 0 ? diff : (uint32_t)1;
-        return (uint64_t)diff;
+        uint64_t result = deposit64(cur_tick, 0, 32, target);
+        if (result < cur_tick) {
+            result += 0x100000000ULL;
+        }
+        return result;
     } else {
     } else {
-        uint64_t diff, cmp;
-
-        cmp = t->cmp;
-        diff = cmp - current;
-        diff = (int64_t)diff > 0 ? diff : (uint64_t)1;
-        return diff;
+        return target;
     }
     }
 }
 }
 
 
+static uint64_t hpet_next_wrap(uint64_t cur_tick)
+{
+    return (cur_tick | 0xffffffffU) + 1;
+}
+
 static void update_irq(struct HPETTimer *timer, int set)
 static void update_irq(struct HPETTimer *timer, int set)
 {
 {
     uint64_t mask;
     uint64_t mask;
@@ -196,21 +198,31 @@ static void update_irq(struct HPETTimer *timer, int set)
     }
     }
     s = timer->state;
     s = timer->state;
     mask = 1 << timer->tn;
     mask = 1 << timer->tn;
-    if (!set || !timer_enabled(timer) || !hpet_enabled(timer->state)) {
+
+    if (set && (timer->config & HPET_TN_TYPE_LEVEL)) {
+        /*
+         * If HPET_TN_ENABLE bit is 0, "the timer will still operate and
+         * generate appropriate status bits, but will not cause an interrupt"
+         */
+        s->isr |= mask;
+    } else {
         s->isr &= ~mask;
         s->isr &= ~mask;
+    }
+
+    if (set && timer_enabled(timer) && hpet_enabled(s)) {
+        if (timer_fsb_route(timer)) {
+            address_space_stl_le(&address_space_memory, timer->fsb >> 32,
+                                 timer->fsb & 0xffffffff, MEMTXATTRS_UNSPECIFIED,
+                                 NULL);
+        } else if (timer->config & HPET_TN_TYPE_LEVEL) {
+            qemu_irq_raise(s->irqs[route]);
+        } else {
+            qemu_irq_pulse(s->irqs[route]);
+        }
+    } else {
         if (!timer_fsb_route(timer)) {
         if (!timer_fsb_route(timer)) {
             qemu_irq_lower(s->irqs[route]);
             qemu_irq_lower(s->irqs[route]);
         }
         }
-    } else if (timer_fsb_route(timer)) {
-        address_space_stl_le(&address_space_memory, timer->fsb >> 32,
-                             timer->fsb & 0xffffffff, MEMTXATTRS_UNSPECIFIED,
-                             NULL);
-    } else if (timer->config & HPET_TN_TYPE_LEVEL) {
-        s->isr |= mask;
-        qemu_irq_raise(s->irqs[route]);
-    } else {
-        s->isr &= ~mask;
-        qemu_irq_pulse(s->irqs[route]);
     }
     }
 }
 }
 
 
@@ -250,7 +262,13 @@ static bool hpet_validate_num_timers(void *opaque, int version_id)
 static int hpet_post_load(void *opaque, int version_id)
 static int hpet_post_load(void *opaque, int version_id)
 {
 {
     HPETState *s = opaque;
     HPETState *s = opaque;
+    int i;
 
 
+    for (i = 0; i < s->num_timers; i++) {
+        HPETTimer *t = &s->timer[i];
+        t->cmp64 = hpet_calculate_cmp64(t, s->hpet_counter, t->cmp);
+        t->last = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) - NANOSECONDS_PER_SECOND;
+    }
     /* Recalculate the offset between the main counter and guest time */
     /* Recalculate the offset between the main counter and guest time */
     if (!s->hpet_offset_saved) {
     if (!s->hpet_offset_saved) {
         s->hpet_offset = ticks_to_ns(s->hpet_counter)
         s->hpet_offset = ticks_to_ns(s->hpet_counter)
@@ -346,14 +364,17 @@ static const VMStateDescription vmstate_hpet = {
     }
     }
 };
 };
 
 
-static void hpet_arm(HPETTimer *t, uint64_t ticks)
+static void hpet_arm(HPETTimer *t, uint64_t tick)
 {
 {
-    if (ticks < ns_to_ticks(INT64_MAX / 2)) {
-        timer_mod(t->qemu_timer,
-                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + ticks_to_ns(ticks));
-    } else {
-        timer_del(t->qemu_timer);
+    uint64_t ns = hpet_get_ns(t->state, tick);
+
+    /* Clamp period to reasonable min value (1 us) */
+    if (timer_is_periodic(t) && ns - t->last < 1000) {
+        ns = t->last + 1000;
     }
     }
+
+    t->last = ns;
+    timer_mod(t->qemu_timer, ns);
 }
 }
 
 
 /*
 /*
@@ -362,72 +383,68 @@ static void hpet_arm(HPETTimer *t, uint64_t ticks)
 static void hpet_timer(void *opaque)
 static void hpet_timer(void *opaque)
 {
 {
     HPETTimer *t = opaque;
     HPETTimer *t = opaque;
-    uint64_t diff;
-
     uint64_t period = t->period;
     uint64_t period = t->period;
     uint64_t cur_tick = hpet_get_ticks(t->state);
     uint64_t cur_tick = hpet_get_ticks(t->state);
 
 
     if (timer_is_periodic(t) && period != 0) {
     if (timer_is_periodic(t) && period != 0) {
+        while (hpet_time_after(cur_tick, t->cmp64)) {
+            t->cmp64 += period;
+        }
         if (t->config & HPET_TN_32BIT) {
         if (t->config & HPET_TN_32BIT) {
-            while (hpet_time_after(cur_tick, t->cmp)) {
-                t->cmp = (uint32_t)(t->cmp + t->period);
-            }
+            t->cmp = (uint32_t)t->cmp64;
         } else {
         } else {
-            while (hpet_time_after64(cur_tick, t->cmp)) {
-                t->cmp += period;
-            }
-        }
-        diff = hpet_calculate_diff(t, cur_tick);
-        hpet_arm(t, diff);
-    } else if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
-        if (t->wrap_flag) {
-            diff = hpet_calculate_diff(t, cur_tick);
-            hpet_arm(t, diff);
-            t->wrap_flag = 0;
+            t->cmp = t->cmp64;
         }
         }
+        hpet_arm(t, t->cmp64);
+    } else if (t->wrap_flag) {
+        t->wrap_flag = 0;
+        hpet_arm(t, t->cmp64);
     }
     }
     update_irq(t, 1);
     update_irq(t, 1);
 }
 }
 
 
 static void hpet_set_timer(HPETTimer *t)
 static void hpet_set_timer(HPETTimer *t)
 {
 {
-    uint64_t diff;
-    uint32_t wrap_diff;  /* how many ticks until we wrap? */
     uint64_t cur_tick = hpet_get_ticks(t->state);
     uint64_t cur_tick = hpet_get_ticks(t->state);
 
 
-    /* whenever new timer is being set up, make sure wrap_flag is 0 */
     t->wrap_flag = 0;
     t->wrap_flag = 0;
-    diff = hpet_calculate_diff(t, cur_tick);
-
-    /* hpet spec says in one-shot 32-bit mode, generate an interrupt when
-     * counter wraps in addition to an interrupt with comparator match.
-     */
-    if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
-        wrap_diff = 0xffffffff - (uint32_t)cur_tick;
-        if (wrap_diff < (uint32_t)diff) {
-            diff = wrap_diff;
+    t->cmp64 = hpet_calculate_cmp64(t, cur_tick, t->cmp);
+    if (t->config & HPET_TN_32BIT) {
+
+        /* hpet spec says in one-shot 32-bit mode, generate an interrupt when
+         * counter wraps in addition to an interrupt with comparator match.
+         */
+        if (!timer_is_periodic(t) && t->cmp64 > hpet_next_wrap(cur_tick)) {
             t->wrap_flag = 1;
             t->wrap_flag = 1;
+            hpet_arm(t, hpet_next_wrap(cur_tick));
+            return;
         }
         }
     }
     }
-    hpet_arm(t, diff);
+    hpet_arm(t, t->cmp64);
 }
 }
 
 
 static void hpet_del_timer(HPETTimer *t)
 static void hpet_del_timer(HPETTimer *t)
 {
 {
+    HPETState *s = t->state;
     timer_del(t->qemu_timer);
     timer_del(t->qemu_timer);
-    update_irq(t, 0);
+
+    if (s->isr & (1 << t->tn)) {
+        /* For level-triggered interrupt, this leaves ISR set but lowers irq.  */
+        update_irq(t, 1);
+    }
 }
 }
 
 
 static uint64_t hpet_ram_read(void *opaque, hwaddr addr,
 static uint64_t hpet_ram_read(void *opaque, hwaddr addr,
                               unsigned size)
                               unsigned size)
 {
 {
     HPETState *s = opaque;
     HPETState *s = opaque;
-    uint64_t cur_tick, index;
+    int shift = (addr & 4) * 8;
+    uint64_t cur_tick;
 
 
     trace_hpet_ram_read(addr);
     trace_hpet_ram_read(addr);
-    index = addr;
+
     /*address range of all TN regs*/
     /*address range of all TN regs*/
-    if (index >= 0x100 && index <= 0x3ff) {
+    if (addr >= 0x100 && addr <= 0x3ff) {
         uint8_t timer_id = (addr - 0x100) / 0x20;
         uint8_t timer_id = (addr - 0x100) / 0x20;
         HPETTimer *timer = &s->timer[timer_id];
         HPETTimer *timer = &s->timer[timer_id];
 
 
@@ -436,52 +453,33 @@ static uint64_t hpet_ram_read(void *opaque, hwaddr addr,
             return 0;
             return 0;
         }
         }
 
 
-        switch ((addr - 0x100) % 0x20) {
-        case HPET_TN_CFG:
-            return timer->config;
-        case HPET_TN_CFG + 4: // Interrupt capabilities
-            return timer->config >> 32;
+        switch (addr & 0x18) {
+        case HPET_TN_CFG: // including interrupt capabilities
+            return timer->config >> shift;
         case HPET_TN_CMP: // comparator register
         case HPET_TN_CMP: // comparator register
-            return timer->cmp;
-        case HPET_TN_CMP + 4:
-            return timer->cmp >> 32;
+            return timer->cmp >> shift;
         case HPET_TN_ROUTE:
         case HPET_TN_ROUTE:
-            return timer->fsb;
-        case HPET_TN_ROUTE + 4:
-            return timer->fsb >> 32;
+            return timer->fsb >> shift;
         default:
         default:
             trace_hpet_ram_read_invalid();
             trace_hpet_ram_read_invalid();
             break;
             break;
         }
         }
     } else {
     } else {
-        switch (index) {
-        case HPET_ID:
-            return s->capability;
-        case HPET_PERIOD:
-            return s->capability >> 32;
+        switch (addr & ~4) {
+        case HPET_ID: // including HPET_PERIOD
+            return s->capability >> shift;
         case HPET_CFG:
         case HPET_CFG:
-            return s->config;
-        case HPET_CFG + 4:
-            trace_hpet_invalid_hpet_cfg(4);
-            return 0;
+            return s->config >> shift;
         case HPET_COUNTER:
         case HPET_COUNTER:
             if (hpet_enabled(s)) {
             if (hpet_enabled(s)) {
                 cur_tick = hpet_get_ticks(s);
                 cur_tick = hpet_get_ticks(s);
             } else {
             } else {
                 cur_tick = s->hpet_counter;
                 cur_tick = s->hpet_counter;
             }
             }
-            trace_hpet_ram_read_reading_counter(0, cur_tick);
-            return cur_tick;
-        case HPET_COUNTER + 4:
-            if (hpet_enabled(s)) {
-                cur_tick = hpet_get_ticks(s);
-            } else {
-                cur_tick = s->hpet_counter;
-            }
-            trace_hpet_ram_read_reading_counter(4, cur_tick);
-            return cur_tick >> 32;
+            trace_hpet_ram_read_reading_counter(addr & 4, cur_tick);
+            return cur_tick >> shift;
         case HPET_STATUS:
         case HPET_STATUS:
-            return s->isr;
+            return s->isr >> shift;
         default:
         default:
             trace_hpet_ram_read_invalid();
             trace_hpet_ram_read_invalid();
             break;
             break;
@@ -495,15 +493,14 @@ static void hpet_ram_write(void *opaque, hwaddr addr,
 {
 {
     int i;
     int i;
     HPETState *s = opaque;
     HPETState *s = opaque;
-    uint64_t old_val, new_val, val, index;
+    int shift = (addr & 4) * 8;
+    int len = MIN(size * 8, 64 - shift);
+    uint64_t old_val, new_val, cleared;
 
 
     trace_hpet_ram_write(addr, value);
     trace_hpet_ram_write(addr, value);
-    index = addr;
-    old_val = hpet_ram_read(opaque, addr, 4);
-    new_val = value;
 
 
     /*address range of all TN regs*/
     /*address range of all TN regs*/
-    if (index >= 0x100 && index <= 0x3ff) {
+    if (addr >= 0x100 && addr <= 0x3ff) {
         uint8_t timer_id = (addr - 0x100) / 0x20;
         uint8_t timer_id = (addr - 0x100) / 0x20;
         HPETTimer *timer = &s->timer[timer_id];
         HPETTimer *timer = &s->timer[timer_id];
 
 
@@ -512,71 +509,49 @@ static void hpet_ram_write(void *opaque, hwaddr addr,
             trace_hpet_timer_id_out_of_range(timer_id);
             trace_hpet_timer_id_out_of_range(timer_id);
             return;
             return;
         }
         }
-        switch ((addr - 0x100) % 0x20) {
+        switch (addr & 0x18) {
         case HPET_TN_CFG:
         case HPET_TN_CFG:
-            trace_hpet_ram_write_tn_cfg();
-            if (activating_bit(old_val, new_val, HPET_TN_FSB_ENABLE)) {
+            trace_hpet_ram_write_tn_cfg(addr & 4);
+            old_val = timer->config;
+            new_val = deposit64(old_val, shift, len, value);
+            new_val = hpet_fixup_reg(new_val, old_val, HPET_TN_CFG_WRITE_MASK);
+            if (deactivating_bit(old_val, new_val, HPET_TN_TYPE_LEVEL)) {
+                /*
+                 * Do this before changing timer->config; otherwise, if
+                 * HPET_TN_FSB is set, update_irq will not lower the qemu_irq.
+                 */
                 update_irq(timer, 0);
                 update_irq(timer, 0);
             }
             }
-            val = hpet_fixup_reg(new_val, old_val, HPET_TN_CFG_WRITE_MASK);
-            timer->config = (timer->config & 0xffffffff00000000ULL) | val;
+            timer->config = new_val;
+            if (activating_bit(old_val, new_val, HPET_TN_ENABLE)
+                && (s->isr & (1 << timer_id))) {
+                update_irq(timer, 1);
+            }
             if (new_val & HPET_TN_32BIT) {
             if (new_val & HPET_TN_32BIT) {
                 timer->cmp = (uint32_t)timer->cmp;
                 timer->cmp = (uint32_t)timer->cmp;
                 timer->period = (uint32_t)timer->period;
                 timer->period = (uint32_t)timer->period;
             }
             }
-            if (activating_bit(old_val, new_val, HPET_TN_ENABLE) &&
-                hpet_enabled(s)) {
+            if (hpet_enabled(s)) {
                 hpet_set_timer(timer);
                 hpet_set_timer(timer);
-            } else if (deactivating_bit(old_val, new_val, HPET_TN_ENABLE)) {
-                hpet_del_timer(timer);
             }
             }
             break;
             break;
-        case HPET_TN_CFG + 4: // Interrupt capabilities
-            trace_hpet_ram_write_invalid_tn_cfg(4);
-            break;
         case HPET_TN_CMP: // comparator register
         case HPET_TN_CMP: // comparator register
-            trace_hpet_ram_write_tn_cmp(0);
             if (timer->config & HPET_TN_32BIT) {
             if (timer->config & HPET_TN_32BIT) {
-                new_val = (uint32_t)new_val;
-            }
-            if (!timer_is_periodic(timer)
-                || (timer->config & HPET_TN_SETVAL)) {
-                timer->cmp = (timer->cmp & 0xffffffff00000000ULL) | new_val;
-            }
-            if (timer_is_periodic(timer)) {
-                /*
-                 * FIXME: Clamp period to reasonable min value?
-                 * Clamp period to reasonable max value
-                 */
-                if (timer->config & HPET_TN_32BIT) {
-                    new_val = MIN(new_val, ~0u >> 1);
+                /* High 32-bits are zero, leave them untouched.  */
+                if (shift) {
+                    trace_hpet_ram_write_invalid_tn_cmp();
+                    break;
                 }
                 }
-                timer->period =
-                    (timer->period & 0xffffffff00000000ULL) | new_val;
-            }
-            /*
-             * FIXME: on a 64-bit write, HPET_TN_SETVAL should apply to the
-             * high bits part as well.
-             */
-            timer->config &= ~HPET_TN_SETVAL;
-            if (hpet_enabled(s)) {
-                hpet_set_timer(timer);
+                len = 64;
+                value = (uint32_t) value;
             }
             }
-            break;
-        case HPET_TN_CMP + 4: // comparator register high order
-            trace_hpet_ram_write_tn_cmp(4);
+            trace_hpet_ram_write_tn_cmp(addr & 4);
             if (!timer_is_periodic(timer)
             if (!timer_is_periodic(timer)
                 || (timer->config & HPET_TN_SETVAL)) {
                 || (timer->config & HPET_TN_SETVAL)) {
-                timer->cmp = (timer->cmp & 0xffffffffULL) | new_val << 32;
+                timer->cmp = deposit64(timer->cmp, shift, len, value);
             }
             }
             if (timer_is_periodic(timer)) {
             if (timer_is_periodic(timer)) {
-                /*
-                 * FIXME: Clamp period to reasonable min value?
-                 * Clamp period to reasonable max value
-                 */
-                new_val = MIN(new_val, ~0u >> 1);
-                timer->period =
-                    (timer->period & 0xffffffffULL) | new_val << 32;
+                timer->period = deposit64(timer->period, shift, len, value);
             }
             }
             timer->config &= ~HPET_TN_SETVAL;
             timer->config &= ~HPET_TN_SETVAL;
             if (hpet_enabled(s)) {
             if (hpet_enabled(s)) {
@@ -584,10 +559,7 @@ static void hpet_ram_write(void *opaque, hwaddr addr,
             }
             }
             break;
             break;
         case HPET_TN_ROUTE:
         case HPET_TN_ROUTE:
-            timer->fsb = (timer->fsb & 0xffffffff00000000ULL) | new_val;
-            break;
-        case HPET_TN_ROUTE + 4:
-            timer->fsb = (new_val << 32) | (timer->fsb & 0xffffffff);
+            timer->fsb = deposit64(timer->fsb, shift, len, value);
             break;
             break;
         default:
         default:
             trace_hpet_ram_write_invalid();
             trace_hpet_ram_write_invalid();
@@ -595,20 +567,23 @@ static void hpet_ram_write(void *opaque, hwaddr addr,
         }
         }
         return;
         return;
     } else {
     } else {
-        switch (index) {
+        switch (addr & ~4) {
         case HPET_ID:
         case HPET_ID:
             return;
             return;
         case HPET_CFG:
         case HPET_CFG:
-            val = hpet_fixup_reg(new_val, old_val, HPET_CFG_WRITE_MASK);
-            s->config = (s->config & 0xffffffff00000000ULL) | val;
+            old_val = s->config;
+            new_val = deposit64(old_val, shift, len, value);
+            new_val = hpet_fixup_reg(new_val, old_val, HPET_CFG_WRITE_MASK);
+            s->config = new_val;
             if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
             if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
                 /* Enable main counter and interrupt generation. */
                 /* Enable main counter and interrupt generation. */
                 s->hpet_offset =
                 s->hpet_offset =
                     ticks_to_ns(s->hpet_counter) - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
                     ticks_to_ns(s->hpet_counter) - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
                 for (i = 0; i < s->num_timers; i++) {
                 for (i = 0; i < s->num_timers; i++) {
-                    if ((&s->timer[i])->cmp != ~0ULL) {
-                        hpet_set_timer(&s->timer[i]);
+                    if (timer_enabled(&s->timer[i]) && (s->isr & (1 << i))) {
+                        update_irq(&s->timer[i], 1);
                     }
                     }
+                    hpet_set_timer(&s->timer[i]);
                 }
                 }
             } else if (deactivating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
             } else if (deactivating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
                 /* Halt main counter and disable interrupt generation. */
                 /* Halt main counter and disable interrupt generation. */
@@ -629,13 +604,11 @@ static void hpet_ram_write(void *opaque, hwaddr addr,
                 qemu_set_irq(s->irqs[RTC_ISA_IRQ], s->rtc_irq_level);
                 qemu_set_irq(s->irqs[RTC_ISA_IRQ], s->rtc_irq_level);
             }
             }
             break;
             break;
-        case HPET_CFG + 4:
-            trace_hpet_invalid_hpet_cfg(4);
-            break;
         case HPET_STATUS:
         case HPET_STATUS:
-            val = new_val & s->isr;
+            new_val = value << shift;
+            cleared = new_val & s->isr;
             for (i = 0; i < s->num_timers; i++) {
             for (i = 0; i < s->num_timers; i++) {
-                if (val & (1 << i)) {
+                if (cleared & (1 << i)) {
                     update_irq(&s->timer[i], 0);
                     update_irq(&s->timer[i], 0);
                 }
                 }
             }
             }
@@ -644,15 +617,7 @@ static void hpet_ram_write(void *opaque, hwaddr addr,
             if (hpet_enabled(s)) {
             if (hpet_enabled(s)) {
                 trace_hpet_ram_write_counter_write_while_enabled();
                 trace_hpet_ram_write_counter_write_while_enabled();
             }
             }
-            s->hpet_counter =
-                (s->hpet_counter & 0xffffffff00000000ULL) | value;
-            trace_hpet_ram_write_counter_written(0, value, s->hpet_counter);
-            break;
-        case HPET_COUNTER + 4:
-            trace_hpet_ram_write_counter_write_while_enabled();
-            s->hpet_counter =
-                (s->hpet_counter & 0xffffffffULL) | (((uint64_t)value) << 32);
-            trace_hpet_ram_write_counter_written(4, value, s->hpet_counter);
+            s->hpet_counter = deposit64(s->hpet_counter, shift, len, value);
             break;
             break;
         default:
         default:
             trace_hpet_ram_write_invalid();
             trace_hpet_ram_write_invalid();
@@ -666,7 +631,11 @@ static const MemoryRegionOps hpet_ram_ops = {
     .write = hpet_ram_write,
     .write = hpet_ram_write,
     .valid = {
     .valid = {
         .min_access_size = 4,
         .min_access_size = 4,
-        .max_access_size = 4,
+        .max_access_size = 8,
+    },
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 8,
     },
     },
     .endianness = DEVICE_NATIVE_ENDIAN,
     .endianness = DEVICE_NATIVE_ENDIAN,
 };
 };
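The rewritten hpet_ram_read()/hpet_ram_write() above collapse the old +0/+4
register cases into a single path keyed on ``shift = (addr & 4) * 8``. A small
standalone sketch of that access pattern follows, with a local stand-in for
QEMU's ``deposit64()`` so it compiles on its own (semantics assumed to match
include/qemu/bitops.h):

.. code:: C

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for QEMU's deposit64(): replace 'len' bits of 'val' starting
     * at bit 'start' with 'field'. */
    static uint64_t deposit64(uint64_t val, int start, int len, uint64_t field)
    {
        uint64_t mask = (len == 64 ? ~0ULL : ((1ULL << len) - 1)) << start;
        return (val & ~mask) | ((field << start) & mask);
    }

    int main(void)
    {
        uint64_t reg = 0x1122334455667788ULL;   /* a 64-bit HPET register */
        uint64_t addr = 0x4;                    /* guest accesses the high half */
        int shift = (addr & 4) * 8;             /* 0 for low word, 32 for high */
        int size = 4;                           /* 32-bit access */
        int len = size * 8 > 64 - shift ? 64 - shift : size * 8;

        printf("read  -> 0x%08llx\n",
               (unsigned long long)(uint32_t)(reg >> shift));  /* 0x11223344 */
        reg = deposit64(reg, shift, len, 0xdeadbeef);          /* 32-bit write */
        printf("write -> 0x%016llx\n", (unsigned long long)reg);
        return 0;
    }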

+ 2 - 2
hw/timer/trace-events

@@ -108,9 +108,9 @@ hpet_ram_read_reading_counter(uint8_t reg_off, uint64_t cur_tick) "reading count
 hpet_ram_read_invalid(void) "invalid hpet_ram_readl"
 hpet_ram_write(uint64_t addr, uint64_t value) "enter hpet_ram_writel at 0x%" PRIx64 " = 0x%" PRIx64
 hpet_ram_write_timer_id(uint64_t timer_id) "hpet_ram_writel timer_id = 0x%" PRIx64
-hpet_ram_write_tn_cfg(void) "hpet_ram_writel HPET_TN_CFG"
-hpet_ram_write_invalid_tn_cfg(uint8_t reg_off) "invalid HPET_TN_CFG + %" PRIu8 " write"
+hpet_ram_write_tn_cfg(uint8_t reg_off) "hpet_ram_writel HPET_TN_CFG + %" PRIu8
 hpet_ram_write_tn_cmp(uint8_t reg_off) "hpet_ram_writel HPET_TN_CMP + %" PRIu8
+hpet_ram_write_invalid_tn_cmp(void) "invalid HPET_TN_CMP + 4 write"
 hpet_ram_write_invalid(void) "invalid hpet_ram_writel"
 hpet_ram_write_counter_write_while_enabled(void) "Writing counter while HPET enabled!"
 hpet_ram_write_counter_written(uint8_t reg_off, uint64_t value, uint64_t counter) "HPET counter + %" PRIu8 "written. crt = 0x%" PRIx64 " -> 0x%" PRIx64

+ 21 - 0
include/io/channel.h

@@ -160,6 +160,9 @@ struct QIOChannelClass {
                                   void *opaque);
     int (*io_flush)(QIOChannel *ioc,
                     Error **errp);
+    int (*io_peerpid)(QIOChannel *ioc,
+                       unsigned int *pid,
+                       Error **errp);
 };
 
 /* General I/O handling functions */
@@ -981,4 +984,22 @@ int coroutine_mixed_fn qio_channel_writev_full_all(QIOChannel *ioc,
 int qio_channel_flush(QIOChannel *ioc,
                       Error **errp);
 
+/**
+ * qio_channel_get_peerpid:
+ * @ioc: the channel object
+ * @pid: pointer to pid
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Returns the pid of the peer process connected to this socket.
+ *
+ * The use of this function is possible only for connected
+ * AF_UNIX stream sockets and for AF_UNIX stream and datagram
+ * socket pairs on Linux.
+ * Returns -1 on error; on non-Linux hosts, *pid is set to -1.
+ *
+ */
+int qio_channel_get_peerpid(QIOChannel *ioc,
+                             unsigned int *pid,
+                             Error **errp);
+
 #endif /* QIO_CHANNEL_H */
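A short usage sketch of the new call, e.g. from code that has just accepted a
helper connection. ``sioc`` is assumed to be an already-connected
QIOChannelSocket, and ``trace_peer_pid()`` is a hypothetical trace point; both
are assumptions of this fragment, not part of the patch:

.. code:: C

    unsigned int peer_pid;
    Error *local_err = NULL;

    /* Ask the channel which process is on the other end of the socket. */
    if (qio_channel_get_peerpid(QIO_CHANNEL(sioc), &peer_pid, &local_err) < 0) {
        error_report_err(local_err);   /* unsupported channel or getsockopt failure */
    } else {
        trace_peer_pid(peer_pid);      /* hypothetical trace point */
    }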

+ 32 - 0
include/sysemu/kvm_int.h

@@ -14,6 +14,9 @@
 #include "qemu/accel.h"
 #include "qemu/accel.h"
 #include "qemu/queue.h"
 #include "qemu/queue.h"
 #include "sysemu/kvm.h"
 #include "sysemu/kvm.h"
+#include "hw/boards.h"
+#include "hw/i386/topology.h"
+#include "io/channel-socket.h"
 
 
 typedef struct KVMSlot
 typedef struct KVMSlot
 {
 {
@@ -50,6 +53,34 @@ typedef struct KVMMemoryListener {
 
 
 #define KVM_MSI_HASHTAB_SIZE    256
 #define KVM_MSI_HASHTAB_SIZE    256
 
 
+typedef struct KVMHostTopoInfo {
+    /* Number of packages on the host */
+    unsigned int maxpkgs;
+    /* Number of cpus on the host */
+    unsigned int maxcpus;
+    /* Number of cpus on each different package */
+    unsigned int *pkg_cpu_count;
+    /* Each package can have different maxticks */
+    unsigned int *maxticks;
+} KVMHostTopoInfo;
+
+struct KVMMsrEnergy {
+    pid_t pid;
+    bool enable;
+    char *socket_path;
+    QIOChannelSocket *sioc;
+    QemuThread msr_thr;
+    unsigned int guest_vcpus;
+    unsigned int guest_vsockets;
+    X86CPUTopoInfo guest_topo_info;
+    KVMHostTopoInfo host_topo;
+    const CPUArchIdList *guest_cpu_list;
+    uint64_t *msr_value;
+    uint64_t msr_unit;
+    uint64_t msr_limit;
+    uint64_t msr_info;
+};
+
 enum KVMDirtyRingReaperState {
 enum KVMDirtyRingReaperState {
     KVM_DIRTY_RING_REAPER_NONE = 0,
     KVM_DIRTY_RING_REAPER_NONE = 0,
     /* The reaper is sleeping */
     /* The reaper is sleeping */
@@ -117,6 +148,7 @@ struct KVMState
     bool kvm_dirty_ring_with_bitmap;
     bool kvm_dirty_ring_with_bitmap;
     uint64_t kvm_eager_split_size;  /* Eager Page Splitting chunk size */
     uint64_t kvm_eager_split_size;  /* Eager Page Splitting chunk size */
     struct KVMDirtyRingReaper reaper;
     struct KVMDirtyRingReaper reaper;
+    struct KVMMsrEnergy msr_energy;
     NotifyVmexitOption notify_vmexit;
     NotifyVmexitOption notify_vmexit;
     uint32_t notify_window;
     uint32_t notify_window;
     uint32_t xen_version;
     uint32_t xen_version;

+ 28 - 0
io/channel-socket.c

@@ -841,6 +841,33 @@ qio_channel_socket_set_cork(QIOChannel *ioc,
     socket_set_cork(sioc->fd, v);
     socket_set_cork(sioc->fd, v);
 }
 }
 
 
+static int
+qio_channel_socket_get_peerpid(QIOChannel *ioc,
+                               unsigned int *pid,
+                               Error **errp)
+{
+#ifdef CONFIG_LINUX
+    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+    Error *err = NULL;
+    socklen_t len = sizeof(struct ucred);
+
+    struct ucred cred;
+    if (getsockopt(sioc->fd,
+               SOL_SOCKET, SO_PEERCRED,
+               &cred, &len) == -1) {
+        error_setg_errno(&err, errno, "Unable to get peer credentials");
+        error_propagate(errp, err);
+        *pid = -1;
+        return -1;
+    }
+    *pid = (unsigned int)cred.pid;
+    return 0;
+#else
+    error_setg(errp, "Unsupported feature");
+    *pid = -1;
+    return -1;
+#endif
+}
 
 
 static int
 static int
 qio_channel_socket_close(QIOChannel *ioc,
 qio_channel_socket_close(QIOChannel *ioc,
@@ -938,6 +965,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass,
 #ifdef QEMU_MSG_ZEROCOPY
 #ifdef QEMU_MSG_ZEROCOPY
     ioc_klass->io_flush = qio_channel_socket_flush;
     ioc_klass->io_flush = qio_channel_socket_flush;
 #endif
 #endif
+    ioc_klass->io_peerpid = qio_channel_socket_get_peerpid;
 }
 }
 
 
 static const TypeInfo qio_channel_socket_info = {
 static const TypeInfo qio_channel_socket_info = {

+ 13 - 0
io/channel.c

@@ -548,6 +548,19 @@ void qio_channel_set_cork(QIOChannel *ioc,
     }
     }
 }
 }
 
 
+int qio_channel_get_peerpid(QIOChannel *ioc,
+                             unsigned int *pid,
+                             Error **errp)
+{
+    QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
+
+    if (!klass->io_peerpid) {
+        error_setg(errp, "Channel does not support peer pid");
+        return -1;
+    }
+    return klass->io_peerpid(ioc, pid, errp);
+}
 
 
 off_t qio_channel_io_seek(QIOChannel *ioc,
 off_t qio_channel_io_seek(QIOChannel *ioc,
                           off_t offset,
                           off_t offset,

+ 7 - 0
meson.build

@@ -4089,6 +4089,13 @@ if have_tools
                dependencies: [authz, crypto, io, qom, qemuutil,
                dependencies: [authz, crypto, io, qom, qemuutil,
                               libcap_ng, mpathpersist],
                               libcap_ng, mpathpersist],
                install: true)
                install: true)
+
+    if cpu in ['x86', 'x86_64']
+      executable('qemu-vmsr-helper', files('tools/i386/qemu-vmsr-helper.c'),
+               dependencies: [authz, crypto, io, qom, qemuutil,
+                              libcap_ng, mpathpersist],
+               install: true)
+    endif
   endif
   endif
 
 
   if have_ivshmem
   if have_ivshmem

+ 8 - 0
target/i386/cpu.h

@@ -414,6 +414,10 @@ typedef enum X86Seg {
 #define MSR_IA32_TSX_CTRL		0x122
 #define MSR_IA32_TSX_CTRL		0x122
 #define MSR_IA32_TSCDEADLINE            0x6e0
 #define MSR_IA32_TSCDEADLINE            0x6e0
 #define MSR_IA32_PKRS                   0x6e1
 #define MSR_IA32_PKRS                   0x6e1
+#define MSR_RAPL_POWER_UNIT             0x00000606
+#define MSR_PKG_POWER_LIMIT             0x00000610
+#define MSR_PKG_ENERGY_STATUS           0x00000611
+#define MSR_PKG_POWER_INFO              0x00000614
 #define MSR_ARCH_LBR_CTL                0x000014ce
 #define MSR_ARCH_LBR_CTL                0x000014ce
 #define MSR_ARCH_LBR_DEPTH              0x000014cf
 #define MSR_ARCH_LBR_DEPTH              0x000014cf
 #define MSR_ARCH_LBR_FROM_0             0x00001500
 #define MSR_ARCH_LBR_FROM_0             0x00001500
@@ -1880,6 +1884,10 @@ typedef struct CPUArchState {
 
 
     uintptr_t retaddr;
     uintptr_t retaddr;
 
 
+    /* RAPL MSR */
+    uint64_t msr_rapl_power_unit;
+    uint64_t msr_pkg_energy_status;
+
     /* Fields up to this point are cleared by a CPU reset */
     /* Fields up to this point are cleared by a CPU reset */
     struct {} end_reset_fields;
     struct {} end_reset_fields;
 
 

+ 430 - 1
target/i386/kvm/kvm.c

@@ -16,9 +16,12 @@
 #include "qapi/qapi-events-run-state.h"
 #include "qapi/qapi-events-run-state.h"
 #include "qapi/error.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
 #include "qapi/visitor.h"
+#include <math.h>
 #include <sys/ioctl.h>
 #include <sys/ioctl.h>
 #include <sys/utsname.h>
 #include <sys/utsname.h>
 #include <sys/syscall.h>
 #include <sys/syscall.h>
+#include <sys/resource.h>
+#include <sys/time.h>
 
 
 #include <linux/kvm.h>
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
 #include <linux/kvm_para.h>
@@ -27,6 +30,7 @@
 
 
 #include "cpu.h"
 #include "cpu.h"
 #include "host-cpu.h"
 #include "host-cpu.h"
+#include "vmsr_energy.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/hw_accel.h"
 #include "sysemu/hw_accel.h"
 #include "sysemu/kvm_int.h"
 #include "sysemu/kvm_int.h"
@@ -2559,7 +2563,8 @@ static int kvm_get_supported_msrs(KVMState *s)
     return ret;
     return ret;
 }
 }
 
 
-static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
+static bool kvm_rdmsr_core_thread_count(X86CPU *cpu,
+                                        uint32_t msr,
                                         uint64_t *val)
                                         uint64_t *val)
 {
 {
     CPUState *cs = CPU(cpu);
     CPUState *cs = CPU(cpu);
@@ -2570,6 +2575,53 @@ static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
     return true;
     return true;
 }
 }
 
 
+static bool kvm_rdmsr_rapl_power_unit(X86CPU *cpu,
+                                      uint32_t msr,
+                                      uint64_t *val)
+{
+
+    CPUState *cs = CPU(cpu);
+
+    *val = cs->kvm_state->msr_energy.msr_unit;
+
+    return true;
+}
+
+static bool kvm_rdmsr_pkg_power_limit(X86CPU *cpu,
+                                      uint32_t msr,
+                                      uint64_t *val)
+{
+
+    CPUState *cs = CPU(cpu);
+
+    *val = cs->kvm_state->msr_energy.msr_limit;
+
+    return true;
+}
+
+static bool kvm_rdmsr_pkg_power_info(X86CPU *cpu,
+                                     uint32_t msr,
+                                     uint64_t *val)
+{
+
+    CPUState *cs = CPU(cpu);
+
+    *val = cs->kvm_state->msr_energy.msr_info;
+
+    return true;
+}
+
+static bool kvm_rdmsr_pkg_energy_status(X86CPU *cpu,
+                                        uint32_t msr,
+                                        uint64_t *val)
+{
+
+    CPUState *cs = CPU(cpu);
+    *val = cs->kvm_state->msr_energy.msr_value[cs->cpu_index];
+
+    return true;
+}
+
 static Notifier smram_machine_done;
 static Notifier smram_machine_done;
 static KVMMemoryListener smram_listener;
 static KVMMemoryListener smram_listener;
 static AddressSpace smram_address_space;
 static AddressSpace smram_address_space;
@@ -2604,6 +2656,340 @@ static void register_smram_listener(Notifier *n, void *unused)
                                  &smram_address_space, 1, "kvm-smram");
                                  &smram_address_space, 1, "kvm-smram");
 }
 }
 
 
+static void *kvm_msr_energy_thread(void *data)
+{
+    KVMState *s = data;
+    struct KVMMsrEnergy *vmsr = &s->msr_energy;
+
+    g_autofree vmsr_package_energy_stat *pkg_stat = NULL;
+    g_autofree vmsr_thread_stat *thd_stat = NULL;
+    g_autofree CPUState *cpu = NULL;
+    g_autofree unsigned int *vpkgs_energy_stat = NULL;
+    unsigned int num_threads = 0;
+
+    X86CPUTopoIDs topo_ids;
+
+    rcu_register_thread();
+
+    /* Allocate memory for each package energy status */
+    pkg_stat = g_new0(vmsr_package_energy_stat, vmsr->host_topo.maxpkgs);
+
+    /* Allocate memory for thread stats */
+    thd_stat = g_new0(vmsr_thread_stat, 1);
+
+    /* Allocate memory for holding virtual package energy counter */
+    vpkgs_energy_stat = g_new0(unsigned int, vmsr->guest_vsockets);
+
+    /* Populate the max ticks of each package */
+    for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+        /*
+         * Max numbers of ticks per package
+         * Time in second * Number of ticks/second * Number of cores/package
+         * ex: 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max
+         */
+        vmsr->host_topo.maxticks[i] = (MSR_ENERGY_THREAD_SLEEP_US / 1000000)
+                        * sysconf(_SC_CLK_TCK)
+                        * vmsr->host_topo.pkg_cpu_count[i];
+    }
+
+    while (true) {
+        /* Get all qemu threads id */
+        g_autofree pid_t *thread_ids =
+            vmsr_get_thread_ids(vmsr->pid, &num_threads);
+
+        if (thread_ids == NULL) {
+            goto clean;
+        }
+
+        thd_stat = g_renew(vmsr_thread_stat, thd_stat, num_threads);
+        /* Unlike g_new0, g_renew0 function doesn't exist yet... */
+        memset(thd_stat, 0, num_threads * sizeof(vmsr_thread_stat));
+
+        /* Populate all the thread stats */
+        for (int i = 0; i < num_threads; i++) {
+            thd_stat[i].utime = g_new0(unsigned long long, 2);
+            thd_stat[i].stime = g_new0(unsigned long long, 2);
+            thd_stat[i].thread_id = thread_ids[i];
+            vmsr_read_thread_stat(vmsr->pid,
+                                  thd_stat[i].thread_id,
+                                  thd_stat[i].utime,
+                                  thd_stat[i].stime,
+                                  &thd_stat[i].cpu_id);
+            thd_stat[i].pkg_id =
+                vmsr_get_physical_package_id(thd_stat[i].cpu_id);
+        }
+
+        /* Retrieve all packages power plane energy counter */
+        for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+            for (int j = 0; j < num_threads; j++) {
+                /*
+                 * Use the first thread we found that ran on the CPU
+                 * of the package to read the packages energy counter
+                 */
+                if (thd_stat[j].pkg_id == i) {
+                    pkg_stat[i].e_start =
+                    vmsr_read_msr(MSR_PKG_ENERGY_STATUS,
+                                  thd_stat[j].cpu_id,
+                                  thd_stat[j].thread_id,
+                                  s->msr_energy.sioc);
+                    break;
+                }
+            }
+        }
+
+        /* Sleep a short period while the other threads are working */
+        usleep(MSR_ENERGY_THREAD_SLEEP_US);
+
+        /*
+         * Retrieve all packages power plane energy counter
+         * Calculate the delta of all packages
+         */
+        for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+            for (int j = 0; j < num_threads; j++) {
+                /*
+                 * Use the first thread we found that ran on the CPU
+                 * of the package to read the packages energy counter
+                 */
+                if (thd_stat[j].pkg_id == i) {
+                    pkg_stat[i].e_end =
+                    vmsr_read_msr(MSR_PKG_ENERGY_STATUS,
+                                  thd_stat[j].cpu_id,
+                                  thd_stat[j].thread_id,
+                                  s->msr_energy.sioc);
+                    /*
+                     * Prevent the case where we have migrated the VM
+                     * during the sleep period, or any other case
+                     * where the energy counter might be lower after
+                     * the sleep period.
+                     */
+                    if (pkg_stat[i].e_end > pkg_stat[i].e_start) {
+                        pkg_stat[i].e_delta =
+                            pkg_stat[i].e_end - pkg_stat[i].e_start;
+                    } else {
+                        pkg_stat[i].e_delta = 0;
+                    }
+                    break;
+                }
+            }
+        }
+
+        /* Delta of ticks spent by each thread between the samples */
+        for (int i = 0; i < num_threads; i++) {
+            vmsr_read_thread_stat(vmsr->pid,
+                                  thd_stat[i].thread_id,
+                                  thd_stat[i].utime,
+                                  thd_stat[i].stime,
+                                  &thd_stat[i].cpu_id);
+
+            if (vmsr->pid < 0) {
+                /*
+                 * Don't count dead threads, i.e. threads that
+                 * existed before the sleep and are gone now
+                 */
+                thd_stat[i].delta_ticks = 0;
+            } else {
+                vmsr_delta_ticks(thd_stat, i);
+            }
+        }
+
+        /*
+         * Identify the vCPU threads and
+         * count the number of vCPUs per package
+         */
+        CPU_FOREACH(cpu) {
+            for (int i = 0; i < num_threads; i++) {
+                if (cpu->thread_id == thd_stat[i].thread_id) {
+                    thd_stat[i].is_vcpu = true;
+                    thd_stat[i].vcpu_id = cpu->cpu_index;
+                    pkg_stat[thd_stat[i].pkg_id].nb_vcpu++;
+                    thd_stat[i].acpi_id = kvm_arch_vcpu_id(cpu);
+                    break;
+                }
+            }
+        }
+
+        /* Retrieve the virtual package number of each vCPU */
+        for (int i = 0; i < vmsr->guest_cpu_list->len; i++) {
+            for (int j = 0; j < num_threads; j++) {
+                if ((thd_stat[j].acpi_id ==
+                        vmsr->guest_cpu_list->cpus[i].arch_id)
+                    && (thd_stat[j].is_vcpu == true)) {
+                    x86_topo_ids_from_apicid(thd_stat[j].acpi_id,
+                        &vmsr->guest_topo_info, &topo_ids);
+                    thd_stat[j].vpkg_id = topo_ids.pkg_id;
+                }
+            }
+        }
+
+        /* Calculate the total energy of all non-vCPU threads */
+        for (int i = 0; i < num_threads; i++) {
+            if ((thd_stat[i].is_vcpu != true) &&
+                (thd_stat[i].delta_ticks > 0)) {
+                double temp;
+                temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta,
+                    thd_stat[i].delta_ticks,
+                    vmsr->host_topo.maxticks[thd_stat[i].pkg_id]);
+                pkg_stat[thd_stat[i].pkg_id].e_ratio
+                    += (uint64_t)lround(temp);
+            }
+        }
+
+        /* Calculate each vCPU's share of the package's non-vCPU energy */
+        for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+            if (pkg_stat[i].nb_vcpu > 0) {
+                pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu;
+            }
+        }
+
+        /*
+         * Calculate the energy of each virtual package:
+         * the sum of the energy of every vCPU that belongs to it,
+         * including its share of the non-vCPU energy
+         */
+        for (int i = 0; i < num_threads; i++) {
+            if ((thd_stat[i].is_vcpu == true) && \
+                    (thd_stat[i].delta_ticks > 0)) {
+                double temp;
+                temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta,
+                    thd_stat[i].delta_ticks,
+                    vmsr->host_topo.maxticks[thd_stat[i].pkg_id]);
+                vpkgs_energy_stat[thd_stat[i].vpkg_id] +=
+                    (uint64_t)lround(temp);
+                vpkgs_energy_stat[thd_stat[i].vpkg_id] +=
+                    pkg_stat[thd_stat[i].pkg_id].e_ratio;
+            }
+        }
+
+        /*
+         * Finally populate the vmsr register of each vCPU with the total
+         * package value to emulate the real hardware, where each CPU returns
+         * the value of the package it belongs to.
+         */
+        for (int i = 0; i < num_threads; i++) {
+            if ((thd_stat[i].is_vcpu == true) &&
+                    (thd_stat[i].delta_ticks > 0)) {
+                vmsr->msr_value[thd_stat[i].vcpu_id] =
+                    vpkgs_energy_stat[thd_stat[i].vpkg_id];
+            }
+        }
+
+        /* Free the per-thread buffers before the pointers are cleared again */
+        for (int i = 0; i < num_threads; i++) {
+            g_free(thd_stat[i].utime);
+            g_free(thd_stat[i].stime);
+        }
+    }
+
+clean:
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms)
+{
+    MachineClass *mc = MACHINE_GET_CLASS(ms);
+    struct KVMMsrEnergy *r = &s->msr_energy;
+    int ret = 0;
+
+    /*
+     * Sanity check
+     * 1. The host CPU must be an Intel CPU
+     * 2. RAPL must be enabled on the host
+     */
+    if (!is_host_cpu_intel()) {
+        error_report("The RAPL feature can only be enabled on hosts "
+                     "with Intel CPU models");
+        ret = 1;
+        goto out;
+    }
+
+    if (!is_rapl_enabled()) {
+        ret = 1;
+        goto out;
+    }
+
+    /* Retrieve the virtual topology */
+    vmsr_init_topo_info(&r->guest_topo_info, ms);
+
+    /* Retrieve the number of vcpu */
+    r->guest_vcpus = ms->smp.cpus;
+
+    /* Retrieve the number of virtual sockets */
+    r->guest_vsockets = ms->smp.sockets;
+
+    /* Allocate register memory (MSR_PKG_STATUS) for each vcpu */
+    r->msr_value = g_new0(uint64_t, r->guest_vcpus);
+
+    /* Retrieve the CPUArchIDlist */
+    r->guest_cpu_list = mc->possible_cpu_arch_ids(ms);
+
+    /* Max number of cpus on the Host */
+    r->host_topo.maxcpus = vmsr_get_maxcpus();
+    if (r->host_topo.maxcpus == 0) {
+        error_report("host max cpus = 0");
+        ret = 1;
+        goto out;
+    }
+
+    /* Max number of packages on the host */
+    r->host_topo.maxpkgs = vmsr_get_max_physical_package(r->host_topo.maxcpus);
+    if (r->host_topo.maxpkgs == 0) {
+        error_report("host max pkgs = 0");
+        ret = 1;
+        goto out;
+    }
+
+    /* Allocate memory for each package on the host */
+    r->host_topo.pkg_cpu_count = g_new0(unsigned int, r->host_topo.maxpkgs);
+    r->host_topo.maxticks = g_new0(unsigned int, r->host_topo.maxpkgs);
+
+    vmsr_count_cpus_per_package(r->host_topo.pkg_cpu_count,
+                                r->host_topo.maxpkgs);
+    for (int i = 0; i < r->host_topo.maxpkgs; i++) {
+        if (r->host_topo.pkg_cpu_count[i] == 0) {
+            error_report("cpus per package = 0 on package_%d", i);
+            ret = 1;
+            goto out;
+        }
+    }
+
+    /* Get QEMU PID */
+    r->pid = getpid();
+
+    /* Compute the socket path if necessary */
+    if (s->msr_energy.socket_path == NULL) {
+        s->msr_energy.socket_path = vmsr_compute_default_paths();
+    }
+
+    /* Open socket with vmsr helper */
+    s->msr_energy.sioc = vmsr_open_socket(s->msr_energy.socket_path);
+
+    if (s->msr_energy.sioc == NULL) {
+        error_report("vmsr socket opening failed");
+        ret = 1;
+        goto out;
+    }
+
+    /* Those MSR values should not change */
+    r->msr_unit  = vmsr_read_msr(MSR_RAPL_POWER_UNIT, 0, r->pid,
+                                    s->msr_energy.sioc);
+    r->msr_limit = vmsr_read_msr(MSR_PKG_POWER_LIMIT, 0, r->pid,
+                                    s->msr_energy.sioc);
+    r->msr_info  = vmsr_read_msr(MSR_PKG_POWER_INFO, 0, r->pid,
+                                    s->msr_energy.sioc);
+    if (r->msr_unit == 0 || r->msr_limit == 0 || r->msr_info == 0) {
+        error_report("can't read any virtual msr");
+        ret = 1;
+        goto out;
+    }
+
+    qemu_thread_create(&r->msr_thr, "kvm-msr",
+                       kvm_msr_energy_thread,
+                       s, QEMU_THREAD_JOINABLE);
+out:
+    return ret;
+}
+
 int kvm_arch_get_default_type(MachineState *ms)
 {
     return 0;
@@ -2804,6 +3190,49 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
                          strerror(-ret));
             exit(1);
         }
+
+        if (s->msr_energy.enable == true) {
+            r = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT,
+                               kvm_rdmsr_rapl_power_unit, NULL);
+            if (!r) {
+                error_report("Could not install MSR_RAPL_POWER_UNIT "
+                             "handler: %s", strerror(-ret));
+                exit(1);
+            }
+
+            r = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT,
+                               kvm_rdmsr_pkg_power_limit, NULL);
+            if (!r) {
+                error_report("Could not install MSR_PKG_POWER_LIMIT "
+                             "handler: %s", strerror(-ret));
+                exit(1);
+            }
+
+            r = kvm_filter_msr(s, MSR_PKG_POWER_INFO,
+                               kvm_rdmsr_pkg_power_info, NULL);
+            if (!r) {
+                error_report("Could not install MSR_PKG_POWER_INFO "
+                             "handler: %s", strerror(-ret));
+                exit(1);
+            }
+            r = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS,
+                               kvm_rdmsr_pkg_energy_status, NULL);
+            if (!r) {
+                error_report("Could not install MSR_PKG_ENERGY_STATUS "
+                             "handler: %s", strerror(-ret));
+                exit(1);
+            }
+            r = kvm_msr_energy_thread_init(s, ms);
+            if (r) {
+                error_report("kvm: error RAPL feature requirements not met");
+                exit(1);
+            }
+
+        }
     }
 
     return 0;

+ 1 - 0
target/i386/kvm/meson.build

@@ -3,6 +3,7 @@ i386_kvm_ss = ss.source_set()
 i386_kvm_ss.add(files(
   'kvm.c',
   'kvm-cpu.c',
+  'vmsr_energy.c',
 ))
 
 i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c'))

+ 345 - 0
target/i386/kvm/vmsr_energy.c

@@ -0,0 +1,345 @@
+/*
+ * QEMU KVM support -- x86 virtual RAPL msr
+ *
+ * Copyright 2024 Red Hat, Inc. 2024
+ *
+ *  Author:
+ *      Anthony Harivel <aharivel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "vmsr_energy.h"
+#include "io/channel.h"
+#include "io/channel-socket.h"
+#include "hw/boards.h"
+#include "cpu.h"
+#include "host-cpu.h"
+
+char *vmsr_compute_default_paths(void)
+{
+    g_autofree char *state = qemu_get_local_state_dir();
+
+    return g_build_filename(state, "run", "qemu-vmsr-helper.sock", NULL);
+}
+
+bool is_host_cpu_intel(void)
+{
+    int family, model, stepping;
+    char vendor[CPUID_VENDOR_SZ + 1];
+
+    host_cpu_vendor_fms(vendor, &family, &model, &stepping);
+
+    return strcmp(vendor, CPUID_VENDOR_INTEL) == 0;
+}
+
+int is_rapl_enabled(void)
+{
+    const char *path = "/sys/class/powercap/intel-rapl/enabled";
+    FILE *file = fopen(path, "r");
+    int value = 0;
+
+    if (file != NULL) {
+        if (fscanf(file, "%d", &value) != 1) {
+            error_report("INTEL RAPL not enabled");
+        }
+        fclose(file);
+    } else {
+        error_report("Error opening %s", path);
+    }
+
+    return value;
+}
+
+QIOChannelSocket *vmsr_open_socket(const char *path)
+{
+    g_autofree char *socket_path = NULL;
+
+    socket_path = g_strdup(path);
+
+    SocketAddress saddr = {
+        .type = SOCKET_ADDRESS_TYPE_UNIX,
+        .u.q_unix.path = socket_path
+    };
+
+    QIOChannelSocket *sioc = qio_channel_socket_new();
+    Error *local_err = NULL;
+
+    qio_channel_set_name(QIO_CHANNEL(sioc), "vmsr-helper");
+    qio_channel_socket_connect_sync(sioc,
+                                    &saddr,
+                                    &local_err);
+    if (local_err) {
+        /* Close socket. */
+        qio_channel_close(QIO_CHANNEL(sioc), NULL);
+        object_unref(OBJECT(sioc));
+        sioc = NULL;
+        goto out;
+    }
+
+    qio_channel_set_delay(QIO_CHANNEL(sioc), false);
+out:
+    return sioc;
+}
+
+uint64_t vmsr_read_msr(uint32_t reg, uint32_t cpu_id, uint32_t tid,
+                       QIOChannelSocket *sioc)
+{
+    uint64_t data = 0;
+    int r = 0;
+    Error *local_err = NULL;
+    uint32_t buffer[3];
+    /*
+     * Send the required arguments:
+     * 1. RAPL MSR register to read
+     * 2. On which CPU ID
+     * 3. From which vCPU (Thread ID)
+     */
+    buffer[0] = reg;
+    buffer[1] = cpu_id;
+    buffer[2] = tid;
+
+    r = qio_channel_write_all(QIO_CHANNEL(sioc),
+                              (char *)buffer, sizeof(buffer),
+                              &local_err);
+    if (r < 0) {
+        goto out_close;
+    }
+
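+    /*
+     * The helper replies with the raw 64-bit MSR value,
+     * or 0 if the read was rejected.
+     */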
+    r = qio_channel_read(QIO_CHANNEL(sioc),
+                             (char *)&data, sizeof(data),
+                             &local_err);
+    if (r < 0) {
+        data = 0;
+        goto out_close;
+    }
+
+out_close:
+    return data;
+}
+
+/* Retrieve the max number of physical packages */
+unsigned int vmsr_get_max_physical_package(unsigned int max_cpus)
+{
+    const char *dir = "/sys/devices/system/cpu/";
+    const char *topo_path = "topology/physical_package_id";
+    g_autofree int *uniquePackages = g_new0(int, max_cpus);
+    unsigned int packageCount = 0;
+    FILE *file = NULL;
+
+    for (int i = 0; i < max_cpus; i++) {
+        g_autofree char *filePath = NULL;
+        g_autofree char *cpuid = g_strdup_printf("cpu%d", i);
+
+        filePath = g_build_filename(dir, cpuid, topo_path, NULL);
+
+        file = fopen(filePath, "r");
+
+        if (file == NULL) {
+            error_report("Error opening physical_package_id file");
+            return 0;
+        }
+
+        char packageId[10];
+        if (fgets(packageId, sizeof(packageId), file) == NULL) {
+            packageCount = 0;
+        }
+
+        fclose(file);
+
+        int currentPackageId = atoi(packageId);
+
+        bool isUnique = true;
+        for (int j = 0; j < packageCount; j++) {
+            if (uniquePackages[j] == currentPackageId) {
+                isUnique = false;
+                break;
+            }
+        }
+
+        if (isUnique) {
+            uniquePackages[packageCount] = currentPackageId;
+            packageCount++;
+
+            if (packageCount >= max_cpus) {
+                break;
+            }
+        }
+    }
+
+    return (packageCount == 0) ? 1 : packageCount;
+}
+
+/* Retrieve the max number of physical cpus on the host */
+unsigned int vmsr_get_maxcpus(void)
+{
+    GDir *dir;
+    const gchar *entry_name;
+    unsigned int cpu_count = 0;
+    const char *path = "/sys/devices/system/cpu/";
+
+    dir = g_dir_open(path, 0, NULL);
+    if (dir == NULL) {
+        error_report("Unable to open cpu directory");
+        return 0;
+    }
+
+    while ((entry_name = g_dir_read_name(dir)) != NULL) {
+        if (g_ascii_strncasecmp(entry_name, "cpu", 3) == 0 &&
+            isdigit(entry_name[3])) {
+            cpu_count++;
+        }
+    }
+
+    g_dir_close(dir);
+
+    return cpu_count;
+}
+
+/* Count the number of physical cpus on each package */
+unsigned int vmsr_count_cpus_per_package(unsigned int *package_count,
+                                         unsigned int max_pkgs)
+{
+    /* Iterate over cpus and count cpus in each package */
+    for (int cpu_id = 0; ; cpu_id++) {
+        /* Declared in the loop so each iteration's allocations are freed */
+        g_autofree char *file_contents = NULL;
+        g_autofree char *path = NULL;
+        gsize length;
+
+        path = g_strdup_printf("/sys/devices/system/cpu/cpu%d/"
+            "topology/physical_package_id", cpu_id);
+
+        if (!g_file_get_contents(path, &file_contents, &length, NULL)) {
+            break; /* No more cpus */
+        }
+
+        /* Get the physical package ID for this CPU */
+        int package_id = atoi(file_contents);
+
+        /* Check if the package ID is within the known number of packages */
+        if (package_id >= 0 && package_id < max_pkgs) {
+            /* If yes, count the cpu for this package*/
+            package_count[package_id]++;
+        }
+    }
+
+    return 0;
+}
+
+/* Get the physical package id from a given cpu id */
+int vmsr_get_physical_package_id(int cpu_id)
+{
+    g_autofree char *file_contents = NULL;
+    g_autofree char *file_path = NULL;
+    int package_id = -1;
+    gsize length;
+
+    file_path = g_strdup_printf("/sys/devices/system/cpu/cpu%d"
+        "/topology/physical_package_id", cpu_id);
+
+    if (!g_file_get_contents(file_path, &file_contents, &length, NULL)) {
+        goto out;
+    }
+
+    package_id = atoi(file_contents);
+
+out:
+    return package_id;
+}
+
+/* Read the scheduled time for a given thread of a given pid */
+void vmsr_read_thread_stat(pid_t pid,
+                      unsigned int thread_id,
+                      unsigned long long *utime,
+                      unsigned long long *stime,
+                      unsigned int *cpu_id)
+{
+    g_autofree char *path = NULL;
+    g_autofree char *path_name = NULL;
+
+    path_name = g_strdup_printf("/proc/%u/task/%d/stat", pid, thread_id);
+
+    path = g_build_filename(path_name, NULL);
+
+    FILE *file = fopen(path, "r");
+    if (file == NULL) {
+        pid = -1;
+        return;
+    }
+
+    if (fscanf(file, "%*d (%*[^)]) %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u"
+        " %llu %llu %*d %*d %*d %*d %*d %*d %*u %*u %*d %*u %*u"
+        " %*u %*u %*u %*u %*u %*u %*u %*u %*u %*d %*u %*u %u",
+           utime, stime, cpu_id) != 3)
+    {
+        pid = -1;
+        fclose(file);
+        return;
+    }
+
+    fclose(file);
+    return;
+}
+
+/* Read the QEMU task folder to retrieve all QEMU thread IDs */
+pid_t *vmsr_get_thread_ids(pid_t pid, unsigned int *num_threads)
+{
+    g_autofree char *task_path = g_strdup_printf("%d/task", pid);
+    g_autofree char *path = g_build_filename("/proc", task_path, NULL);
+
+    DIR *dir = opendir(path);
+    if (dir == NULL) {
+        error_report("Error opening %s", path);
+        return NULL;
+    }
+
+    pid_t *thread_ids = NULL;
+    unsigned int thread_count = 0;
+
+    /* readdir() returns storage owned by the DIR handle; do not free it */
+    struct dirent *ent = NULL;
+    while ((ent = readdir(dir)) != NULL) {
+        if (ent->d_name[0] == '.') {
+            continue;
+        }
+        pid_t tid = atoi(ent->d_name);
+        if (pid != tid) {
+            thread_ids = g_renew(pid_t, thread_ids, (thread_count + 1));
+            thread_ids[thread_count] = tid;
+            thread_count++;
+        }
+    }
+
+    closedir(dir);
+
+    *num_threads = thread_count;
+    return thread_ids;
+}
+
+void vmsr_delta_ticks(vmsr_thread_stat *thd_stat, int i)
+{
+    thd_stat[i].delta_ticks = (thd_stat[i].utime[1] + thd_stat[i].stime[1])
+                            - (thd_stat[i].utime[0] + thd_stat[i].stime[0]);
+}
+
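+/*
+ * Share of a package's energy delta attributed to one thread:
+ * e_delta scaled by the fraction (delta_ticks / maxticks) of the
+ * package's tick budget consumed by that thread during the sample.
+ */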
+double vmsr_get_ratio(uint64_t e_delta,
+                      unsigned long long delta_ticks,
+                      unsigned int maxticks)
+{
+    return (e_delta / 100.0) * ((100.0 / maxticks) * delta_ticks);
+}
+
+void vmsr_init_topo_info(X86CPUTopoInfo *topo_info,
+                           const MachineState *ms)
+{
+    topo_info->dies_per_pkg = ms->smp.dies;
+    topo_info->modules_per_die = ms->smp.modules;
+    topo_info->cores_per_module = ms->smp.cores;
+    topo_info->threads_per_core = ms->smp.threads;
+}
+

+ 99 - 0
target/i386/kvm/vmsr_energy.h

@@ -0,0 +1,99 @@
+/*
+ * QEMU KVM support -- x86 virtual energy-related MSR.
+ *
+ * Copyright 2024 Red Hat, Inc. 2024
+ *
+ *  Author:
+ *      Anthony Harivel <aharivel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef VMSR_ENERGY_H
+#define VMSR_ENERGY_H
+
+#include <stdint.h>
+#include "qemu/osdep.h"
+#include "io/channel-socket.h"
+#include "hw/i386/topology.h"
+
+/*
+ * Interval, in microseconds, between two samples of the
+ * energy-related MSRs
+ */
+#define MSR_ENERGY_THREAD_SLEEP_US 1000000.0
+
+/*
+ * Thread statistic
+ * @ thread_id: TID (thread ID)
+ * @ is_vcpu: true if TID is vCPU thread
+ * @ cpu_id: CPU number last executed on
+ * @ pkg_id: package number of the CPU
+ * @ vcpu_id: vCPU ID
+ * @ vpkg_id: virtual package number
+ * @ acpi_id: APIC id of the vCPU
+ * @ utime: amount of clock ticks the thread
+ *          has been scheduled in User mode
+ * @ stime: amount of clock ticks the thread
+ *          has been scheduled in System mode
+ * @ delta_ticks: delta of utime+stime between
+ *          the two samples (before/after sleep)
+ */
+struct vmsr_thread_stat {
+    unsigned int thread_id;
+    bool is_vcpu;
+    unsigned int cpu_id;
+    unsigned int pkg_id;
+    unsigned int vpkg_id;
+    unsigned int vcpu_id;
+    unsigned long acpi_id;
+    unsigned long long *utime;
+    unsigned long long *stime;
+    unsigned long long delta_ticks;
+};
+
+/*
+ * Package statistic
+ * @ e_start: package energy counter before the sleep
+ * @ e_end: package energy counter after the sleep
+ * @ e_delta: delta of package energy counter
+ * @ e_ratio: store the energy ratio of non-vCPU thread
+ * @ nb_vcpu: number of vCPU running on this package
+ */
+struct vmsr_package_energy_stat {
+    uint64_t e_start;
+    uint64_t e_end;
+    uint64_t e_delta;
+    uint64_t e_ratio;
+    unsigned int nb_vcpu;
+};
+
+typedef struct vmsr_thread_stat vmsr_thread_stat;
+typedef struct vmsr_package_energy_stat vmsr_package_energy_stat;
+
+char *vmsr_compute_default_paths(void);
+void vmsr_read_thread_stat(pid_t pid,
+                      unsigned int thread_id,
+                      unsigned long long *utime,
+                      unsigned long long *stime,
+                      unsigned int *cpu_id);
+
+QIOChannelSocket *vmsr_open_socket(const char *path);
+uint64_t vmsr_read_msr(uint32_t reg, uint32_t cpu_id,
+                       uint32_t tid, QIOChannelSocket *sioc);
+void vmsr_delta_ticks(vmsr_thread_stat *thd_stat, int i);
+unsigned int vmsr_get_maxcpus(void);
+unsigned int vmsr_get_max_physical_package(unsigned int max_cpus);
+unsigned int vmsr_count_cpus_per_package(unsigned int *package_count,
+                                         unsigned int max_pkgs);
+int vmsr_get_physical_package_id(int cpu_id);
+pid_t *vmsr_get_thread_ids(pid_t pid, unsigned int *num_threads);
+double vmsr_get_ratio(uint64_t e_delta,
+                      unsigned long long delta_ticks,
+                      unsigned int maxticks);
+void vmsr_init_topo_info(X86CPUTopoInfo *topo_info, const MachineState *ms);
+bool is_host_cpu_intel(void);
+int is_rapl_enabled(void);
+#endif /* VMSR_ENERGY_H */

+ 530 - 0
tools/i386/qemu-vmsr-helper.c

@@ -0,0 +1,530 @@
+/*
+ * Privileged RAPL MSR helper commands for QEMU
+ *
+ * Copyright (C) 2024 Red Hat, Inc. <aharivel@redhat.com>
+ *
+ * Author: Anthony Harivel <aharivel@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include <getopt.h>
+#include <stdbool.h>
+#include <sys/ioctl.h>
+#ifdef CONFIG_LIBCAP_NG
+#include <cap-ng.h>
+#endif
+#include <pwd.h>
+#include <grp.h>
+
+#include "qemu/help-texts.h"
+#include "qapi/error.h"
+#include "qemu/cutils.h"
+#include "qemu/main-loop.h"
+#include "qemu/module.h"
+#include "qemu/error-report.h"
+#include "qemu/config-file.h"
+#include "qemu-version.h"
+#include "qemu/log.h"
+#include "qemu/systemd.h"
+#include "io/channel.h"
+#include "io/channel-socket.h"
+#include "trace/control.h"
+#include "rapl-msr-index.h"
+
+#define MSR_PATH_TEMPLATE "/dev/cpu/%u/msr"
+
+static char *socket_path;
+static char *pidfile;
+static enum { RUNNING, TERMINATE, TERMINATING } state;
+static QIOChannelSocket *server_ioc;
+static int server_watch;
+static int num_active_sockets = 1;
+
+#ifdef CONFIG_LIBCAP_NG
+static int uid = -1;
+static int gid = -1;
+#endif
+
+static void compute_default_paths(void)
+{
+    g_autofree char *state = qemu_get_local_state_dir();
+
+    socket_path = g_build_filename(state, "run", "qemu-vmsr-helper.sock", NULL);
+    pidfile = g_build_filename(state, "run", "qemu-vmsr-helper.pid", NULL);
+}
+
+static int is_intel_processor(void)
+{
+    int result;
+    int ebx, ecx, edx;
+
+    /* Execute CPUID instruction with eax=0 (basic identification) */
+    asm volatile (
+        "cpuid"
+        : "=b" (ebx), "=c" (ecx), "=d" (edx)
+        : "a" (0)
+    );
+
+    /*
+     *  Check if processor is "GenuineIntel"
+     *  0x756e6547 = "Genu"
+     *  0x49656e69 = "ineI"
+     *  0x6c65746e = "ntel"
+     */
+    result = (ebx == 0x756e6547) && (edx == 0x49656e69) && (ecx == 0x6c65746e);
+
+    return result;
+}
+
+static int is_rapl_enabled(void)
+{
+    const char *path = "/sys/class/powercap/intel-rapl/enabled";
+    FILE *file = fopen(path, "r");
+    int value = 0;
+
+    if (file != NULL) {
+        if (fscanf(file, "%d", &value) != 1) {
+            error_report("INTEL RAPL not enabled");
+        }
+        fclose(file);
+    } else {
+        error_report("Error opening %s", path);
+    }
+
+    return value;
+}
+
+/*
+ * Check if the TID that requests the MSR read
+ * belongs to the peer. It should be a TID of a vCPU.
+ */
+static bool is_tid_present(pid_t pid, pid_t tid)
+{
+    g_autofree char *tidPath = g_strdup_printf("/proc/%d/task/%d", pid, tid);
+
+    /* Check if the TID directory exists within the PID directory */
+    if (access(tidPath, F_OK) == 0) {
+        return true;
+    }
+
+    error_report("Failed to open /proc at %s", tidPath);
+    return false;
+}
+
+/*
+ * Only the RAPL MSRs in target/i386/cpu.h are allowed
+ */
+static bool is_msr_allowed(uint32_t reg)
+{
+    switch (reg) {
+    case MSR_RAPL_POWER_UNIT:
+    case MSR_PKG_POWER_LIMIT:
+    case MSR_PKG_ENERGY_STATUS:
+    case MSR_PKG_POWER_INFO:
+        return true;
+    default:
+        return false;
+    }
+}
+
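+/*
+ * Read an MSR for a given CPU through the msr(4) interface,
+ * i.e. a pread() at the register offset in /dev/cpu/<cpu_id>/msr.
+ */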
+static uint64_t vmsr_read_msr(uint32_t msr_register, unsigned int cpu_id)
+{
+    int fd;
+    uint64_t result = 0;
+
+    g_autofree char *path = g_strdup_printf(MSR_PATH_TEMPLATE, cpu_id);
+
+    fd = open(path, O_RDONLY);
+    if (fd < 0) {
+        error_report("Failed to open MSR file at %s", path);
+        return result;
+    }
+
+    if (pread(fd, &result, sizeof(result), msr_register) != sizeof(result)) {
+        error_report("Failed to read MSR");
+        result = 0;
+    }
+
+    close(fd);
+    return result;
+}
+
+static void usage(const char *name)
+{
+    (printf) (
+"Usage: %s [OPTIONS] FILE\n"
+"Virtual RAPL MSR helper program for QEMU\n"
+"\n"
+"  -h, --help                display this help and exit\n"
+"  -V, --version             output version information and exit\n"
+"\n"
+"  -d, --daemon              run in the background\n"
+"  -f, --pidfile=PATH        PID file when running as a daemon\n"
+"                            (default '%s')\n"
+"  -k, --socket=PATH         path to the unix socket\n"
+"                            (default '%s')\n"
+"  -T, --trace [[enable=]<pattern>][,events=<file>][,file=<file>]\n"
+"                            specify tracing options\n"
+#ifdef CONFIG_LIBCAP_NG
+"  -u, --user=USER           user to drop privileges to\n"
+"  -g, --group=GROUP         group to drop privileges to\n"
+#endif
+"\n"
+QEMU_HELP_BOTTOM "\n"
+    , name, pidfile, socket_path);
+}
+
+static void version(const char *name)
+{
+    printf(
+"%s " QEMU_FULL_VERSION "\n"
+"Written by Anthony Harivel.\n"
+"\n"
+QEMU_COPYRIGHT "\n"
+"This is free software; see the source for copying conditions.  There is NO\n"
+"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"
+    , name);
+}
+
+typedef struct VMSRHelperClient {
+    QIOChannelSocket *ioc;
+    Coroutine *co;
+} VMSRHelperClient;
+
+static void coroutine_fn vh_co_entry(void *opaque)
+{
+    VMSRHelperClient *client = opaque;
+    Error *local_err = NULL;
+    unsigned int peer_pid;
+    uint32_t request[3];
+    uint64_t vmsr;
+    int r;
+
+    qio_channel_set_blocking(QIO_CHANNEL(client->ioc),
+                             false, NULL);
+
+    qio_channel_set_follow_coroutine_ctx(QIO_CHANNEL(client->ioc), true);
+
+    /*
+     * Check peer credentials
+     */
+    r = qio_channel_get_peerpid(QIO_CHANNEL(client->ioc),
+                                &peer_pid,
+                                &local_err);
+    if (r < 0) {
+        error_report_err(local_err);
+        goto out;
+    }
+
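+    /*
+     * Serve MSR read requests on this connection until the client
+     * disconnects or an error occurs.
+     */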
+    while (true) {
+        /*
+         * Read the requested MSR
+         * Only RAPL MSR in rapl-msr-index.h is allowed
+         */
+        r = qio_channel_read_all(QIO_CHANNEL(client->ioc),
+                                (char *) &request, sizeof(request), &local_err);
+        if (r < 0) {
+            error_report_err(local_err);
+            break;
+        }
+
+        if (!is_msr_allowed(request[0])) {
+            error_report("Requested unallowed msr: %d", request[0]);
+            break;
+        }
+
+        vmsr = vmsr_read_msr(request[0], request[1]);
+
+        if (!is_tid_present(peer_pid, request[2])) {
+            error_report("Requested TID not in peer PID: %d %d",
+                peer_pid, request[2]);
+            vmsr = 0;
+        }
+
+        r = qio_channel_write_all(QIO_CHANNEL(client->ioc),
+                                  (char *) &vmsr,
+                                  sizeof(vmsr),
+                                  &local_err);
+        if (r < 0) {
+            error_report_err(local_err);
+            break;
+        }
+    }
+out:
+    object_unref(OBJECT(client->ioc));
+    g_free(client);
+}
+
+static gboolean accept_client(QIOChannel *ioc,
+                              GIOCondition cond,
+                              gpointer opaque)
+{
+    QIOChannelSocket *cioc;
+    VMSRHelperClient *vmsrh;
+
+    cioc = qio_channel_socket_accept(QIO_CHANNEL_SOCKET(ioc),
+                                     NULL);
+    if (!cioc) {
+        return TRUE;
+    }
+
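+    /* Handle each accepted client in its own coroutine */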
+    vmsrh = g_new(VMSRHelperClient, 1);
+    vmsrh->ioc = cioc;
+    vmsrh->co = qemu_coroutine_create(vh_co_entry, vmsrh);
+    qemu_coroutine_enter(vmsrh->co);
+
+    return TRUE;
+}
+
+static void termsig_handler(int signum)
+{
+    qatomic_cmpxchg(&state, RUNNING, TERMINATE);
+    qemu_notify_event();
+}
+
+static void close_server_socket(void)
+{
+    assert(server_ioc);
+
+    g_source_remove(server_watch);
+    server_watch = -1;
+    object_unref(OBJECT(server_ioc));
+    num_active_sockets--;
+}
+
+#ifdef CONFIG_LIBCAP_NG
+static int drop_privileges(void)
+{
+    /* clear all capabilities */
+    capng_clear(CAPNG_SELECT_BOTH);
+
+    if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE | CAPNG_PERMITTED,
+                     CAP_SYS_RAWIO) < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+    const char *sopt = "hVk:f:dT:u:g:vq";
+    struct option lopt[] = {
+        { "help", no_argument, NULL, 'h' },
+        { "version", no_argument, NULL, 'V' },
+        { "socket", required_argument, NULL, 'k' },
+        { "pidfile", required_argument, NULL, 'f' },
+        { "daemon", no_argument, NULL, 'd' },
+        { "trace", required_argument, NULL, 'T' },
+        { "verbose", no_argument, NULL, 'v' },
+        { NULL, 0, NULL, 0 }
+    };
+    int opt_ind = 0;
+    int ch;
+    Error *local_err = NULL;
+    bool daemonize = false;
+    bool pidfile_specified = false;
+    bool socket_path_specified = false;
+    unsigned socket_activation;
+
+    struct sigaction sa_sigterm;
+    memset(&sa_sigterm, 0, sizeof(sa_sigterm));
+    sa_sigterm.sa_handler = termsig_handler;
+    sigaction(SIGTERM, &sa_sigterm, NULL);
+    sigaction(SIGINT, &sa_sigterm, NULL);
+    sigaction(SIGHUP, &sa_sigterm, NULL);
+
+    signal(SIGPIPE, SIG_IGN);
+
+    error_init(argv[0]);
+    module_call_init(MODULE_INIT_TRACE);
+    module_call_init(MODULE_INIT_QOM);
+    qemu_add_opts(&qemu_trace_opts);
+    qemu_init_exec_dir(argv[0]);
+
+    compute_default_paths();
+
+    /*
+     * Sanity check
+     * 1. cpu must be Intel cpu
+     * 2. RAPL must be enabled
+     */
+    if (!is_intel_processor()) {
+        error_report("error: host CPU is not an Intel CPU");
+        exit(EXIT_FAILURE);
+    }
+
+    if (!is_rapl_enabled()) {
+        error_report("error: RAPL driver not enabled");
+        exit(EXIT_FAILURE);
+    }
+
+    while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) {
+        switch (ch) {
+        case 'k':
+            g_free(socket_path);
+            socket_path = g_strdup(optarg);
+            socket_path_specified = true;
+            if (socket_path[0] != '/') {
+                error_report("socket path must be absolute");
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 'f':
+            g_free(pidfile);
+            pidfile = g_strdup(optarg);
+            pidfile_specified = true;
+            break;
+#ifdef CONFIG_LIBCAP_NG
+        case 'u': {
+            unsigned long res;
+            struct passwd *userinfo = getpwnam(optarg);
+            if (userinfo) {
+                uid = userinfo->pw_uid;
+            } else if (qemu_strtoul(optarg, NULL, 10, &res) == 0 &&
+                       (uid_t)res == res) {
+                uid = res;
+            } else {
+                error_report("invalid user '%s'", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        }
+        case 'g': {
+            unsigned long res;
+            struct group *groupinfo = getgrnam(optarg);
+            if (groupinfo) {
+                gid = groupinfo->gr_gid;
+            } else if (qemu_strtoul(optarg, NULL, 10, &res) == 0 &&
+                       (gid_t)res == res) {
+                gid = res;
+            } else {
+                error_report("invalid group '%s'", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        }
+#else
+        case 'u':
+        case 'g':
+            error_report("-%c not supported by this %s", ch, argv[0]);
+            exit(1);
+#endif
+        case 'd':
+            daemonize = true;
+            break;
+        case 'T':
+            trace_opt_parse(optarg);
+            break;
+        case 'V':
+            version(argv[0]);
+            exit(EXIT_SUCCESS);
+            break;
+        case 'h':
+            usage(argv[0]);
+            exit(EXIT_SUCCESS);
+            break;
+        case '?':
+            error_report("Try `%s --help' for more information.", argv[0]);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    if (!trace_init_backends()) {
+        exit(EXIT_FAILURE);
+    }
+    trace_init_file();
+    qemu_set_log(LOG_TRACE, &error_fatal);
+
+    socket_activation = check_socket_activation();
+    if (socket_activation == 0) {
+        SocketAddress saddr;
+        saddr = (SocketAddress){
+            .type = SOCKET_ADDRESS_TYPE_UNIX,
+            .u.q_unix.path = socket_path,
+        };
+        server_ioc = qio_channel_socket_new();
+        if (qio_channel_socket_listen_sync(server_ioc, &saddr,
+                                           1, &local_err) < 0) {
+            object_unref(OBJECT(server_ioc));
+            error_report_err(local_err);
+            return 1;
+        }
+    } else {
+        /* Using socket activation - check user didn't use -k etc. */
+        if (socket_path_specified) {
+            error_report("Unix socket can't be set when "
+                         "using socket activation");
+            exit(EXIT_FAILURE);
+        }
+
+        /* Can only listen on a single socket.  */
+        if (socket_activation > 1) {
+            error_report("%s does not support socket activation "
+                         "with LISTEN_FDS > 1",
+                         argv[0]);
+            exit(EXIT_FAILURE);
+        }
+        server_ioc = qio_channel_socket_new_fd(FIRST_SOCKET_ACTIVATION_FD,
+                                               &local_err);
+        if (server_ioc == NULL) {
+            error_reportf_err(local_err,
+                              "Failed to use socket activation: ");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    qemu_init_main_loop(&error_fatal);
+
+    server_watch = qio_channel_add_watch(QIO_CHANNEL(server_ioc),
+                                         G_IO_IN,
+                                         accept_client,
+                                         NULL, NULL);
+
+    if (daemonize) {
+        if (daemon(0, 0) < 0) {
+            error_report("Failed to daemonize: %s", strerror(errno));
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    if (daemonize || pidfile_specified) {
+        qemu_write_pidfile(pidfile, &error_fatal);
+    }
+
+#ifdef CONFIG_LIBCAP_NG
+    if (drop_privileges() < 0) {
+        error_report("Failed to drop privileges: %s", strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+#endif
+
+    info_report("Listening on %s", socket_path);
+
+    state = RUNNING;
+    do {
+        main_loop_wait(false);
+        if (state == TERMINATE) {
+            state = TERMINATING;
+            close_server_socket();
+        }
+    } while (num_active_sockets > 0);
+
+    exit(EXIT_SUCCESS);
+}

+ 28 - 0
tools/i386/rapl-msr-index.h

@@ -0,0 +1,28 @@
+/*
+ * Allowed list of MSR for Privileged RAPL MSR helper commands for QEMU
+ *
+ * Copyright (C) 2023 Red Hat, Inc. <aharivel@redhat.com>
+ *
+ * Author: Anthony Harivel <aharivel@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Should stay in sync with the RAPL MSR
+ * in target/i386/cpu.h
+ */
+#define MSR_RAPL_POWER_UNIT             0x00000606
+#define MSR_PKG_POWER_LIMIT             0x00000610
+#define MSR_PKG_ENERGY_STATUS           0x00000611
+#define MSR_PKG_POWER_INFO              0x00000614
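
A rough usage sketch (not part of the patch): the helper serves the allowed RAPL MSRs over a Unix socket, and the KVM accelerator consumes them when RAPL emulation is enabled. The socket path below is only an example, and the "rapl-helper-socket" accelerator property name is assumed rather than shown in these hunks:

  # start the privileged helper (needs read access to /dev/cpu/*/msr)
  sudo qemu-vmsr-helper -k /run/qemu-vmsr-helper.sock

  # point the KVM accelerator at the helper socket
  qemu-system-x86_64 -accel kvm,rapl=true,rapl-helper-socket=/run/qemu-vmsr-helper.sock ...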