/* x86.c */
  1. /*
  2. * Copyright (c) 2003-2004 Fabrice Bellard
  3. * Copyright (c) 2019 Red Hat, Inc.
  4. *
  5. * Permission is hereby granted, free of charge, to any person obtaining a copy
  6. * of this software and associated documentation files (the "Software"), to deal
  7. * in the Software without restriction, including without limitation the rights
  8. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. * copies of the Software, and to permit persons to whom the Software is
  10. * furnished to do so, subject to the following conditions:
  11. *
  12. * The above copyright notice and this permission notice shall be included in
  13. * all copies or substantial portions of the Software.
  14. *
  15. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  18. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  21. * THE SOFTWARE.
  22. */
  23. #include "qemu/osdep.h"
  24. #include "qemu/error-report.h"
  25. #include "qemu/option.h"
  26. #include "qemu/cutils.h"
  27. #include "qemu/units.h"
  28. #include "qemu/datadir.h"
  29. #include "qemu/guest-random.h"
  30. #include "qapi/error.h"
  31. #include "qapi/qapi-visit-common.h"
  32. #include "qapi/clone-visitor.h"
  33. #include "qapi/qapi-visit-machine.h"
  34. #include "qapi/visitor.h"
  35. #include "sysemu/qtest.h"
  36. #include "sysemu/whpx.h"
  37. #include "sysemu/numa.h"
  38. #include "sysemu/replay.h"
  39. #include "sysemu/reset.h"
  40. #include "sysemu/sysemu.h"
  41. #include "sysemu/cpu-timers.h"
  42. #include "sysemu/xen.h"
  43. #include "trace.h"
  44. #include "hw/i386/x86.h"
  45. #include "target/i386/cpu.h"
  46. #include "hw/i386/topology.h"
  47. #include "hw/i386/fw_cfg.h"
  48. #include "hw/intc/i8259.h"
  49. #include "hw/rtc/mc146818rtc.h"
  50. #include "target/i386/sev.h"
  51. #include "hw/i386/microvm.h"
  52. #include "hw/acpi/cpu_hotplug.h"
  53. #include "hw/irq.h"
  54. #include "hw/nmi.h"
  55. #include "hw/loader.h"
  56. #include "multiboot.h"
  57. #include "elf.h"
  58. #include "standard-headers/asm-x86/bootparam.h"
  59. #include CONFIG_DEVICES
  60. #include "kvm/kvm_i386.h"
/*
 * Physical address of the PVH entry point, read from the kernel ELF
 * NOTE by read_pvh_start_addr() and consumed by load_elfboot().
 * Zero means no PVH entry point was found.
 */
static size_t pvh_start_addr;
  63. inline void init_topo_info(X86CPUTopoInfo *topo_info,
  64. const X86MachineState *x86ms)
  65. {
  66. MachineState *ms = MACHINE(x86ms);
  67. topo_info->dies_per_pkg = ms->smp.dies;
  68. topo_info->cores_per_die = ms->smp.cores;
  69. topo_info->threads_per_core = ms->smp.threads;
  70. }
/*
 * Calculates initial APIC ID for a specific CPU index
 *
 * Currently we need to be able to calculate the APIC ID from the CPU index
 * alone (without requiring a CPU object), as the QEMU<->Seabios interfaces have
 * no concept of "CPU index", and the NUMA tables on fw_cfg need the APIC ID of
 * all CPUs up to max_cpus.
 */
uint32_t x86_cpu_apic_id_from_index(X86MachineState *x86ms,
                                    unsigned int cpu_index)
{
    X86CPUTopoInfo topo_info;

    /* Derive the ID purely from the -smp topology and the index. */
    init_topo_info(&topo_info, x86ms);

    return x86_apicid_from_cpu_idx(&topo_info, cpu_index);
}
  86. void x86_cpu_new(X86MachineState *x86ms, int64_t apic_id, Error **errp)
  87. {
  88. Object *cpu = object_new(MACHINE(x86ms)->cpu_type);
  89. if (!object_property_set_uint(cpu, "apic-id", apic_id, errp)) {
  90. goto out;
  91. }
  92. qdev_realize(DEVICE(cpu), NULL, errp);
  93. out:
  94. object_unref(cpu);
  95. }
/*
 * Create the boot-time vCPUs.
 *
 * Sets the default CPU model version, computes x86ms->apic_id_limit,
 * validates that the accel configuration can deliver APIC IDs that
 * large, then instantiates ms->smp.cpus CPUs from the possible-CPU list.
 */
void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
{
    int i;
    const CPUArchIdList *possible_cpus;
    MachineState *ms = MACHINE(x86ms);
    MachineClass *mc = MACHINE_GET_CLASS(x86ms);

    /* Must be set before any X86CPU object is created below. */
    x86_cpu_set_default_version(default_cpu_version);

    /*
     * Calculates the limit to CPU APIC ID values
     *
     * Limit for the APIC ID value, so that all
     * CPU APIC IDs are < x86ms->apic_id_limit.
     *
     * This is used for FW_CFG_MAX_CPUS. See comments on fw_cfg_arch_create().
     */
    x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms,
                                                      ms->smp.max_cpus - 1) + 1;

    /*
     * Can we support APIC ID 255 or higher?
     *
     * Under Xen: yes.
     * With userspace emulated lapic: no
     * With KVM's in-kernel lapic: only if X2APIC API is enabled.
     */
    if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
        (!kvm_irqchip_in_kernel() || !kvm_enable_x2apic())) {
        error_report("current -smp configuration requires kernel "
                     "irqchip and X2APIC API support.");
        exit(EXIT_FAILURE);
    }

    if (kvm_enabled()) {
        kvm_set_max_apic_id(x86ms->apic_id_limit);
    }

    /* Instantiate only the cold-plugged CPUs (indices < smp.cpus). */
    possible_cpus = mc->possible_cpu_arch_ids(ms);
    for (i = 0; i < ms->smp.cpus; i++) {
        x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
    }
}
  134. void x86_rtc_set_cpus_count(ISADevice *rtc, uint16_t cpus_count)
  135. {
  136. if (cpus_count > 0xff) {
  137. /*
  138. * If the number of CPUs can't be represented in 8 bits, the
  139. * BIOS must use "FW_CFG_NB_CPUS". Set RTC field to 0 just
  140. * to make old BIOSes fail more predictably.
  141. */
  142. rtc_set_memory(rtc, 0x5f, 0);
  143. } else {
  144. rtc_set_memory(rtc, 0x5f, cpus_count - 1);
  145. }
  146. }
  147. static int x86_apic_cmp(const void *a, const void *b)
  148. {
  149. CPUArchId *apic_a = (CPUArchId *)a;
  150. CPUArchId *apic_b = (CPUArchId *)b;
  151. return apic_a->arch_id - apic_b->arch_id;
  152. }
  153. /*
  154. * returns pointer to CPUArchId descriptor that matches CPU's apic_id
  155. * in ms->possible_cpus->cpus, if ms->possible_cpus->cpus has no
  156. * entry corresponding to CPU's apic_id returns NULL.
  157. */
  158. CPUArchId *x86_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
  159. {
  160. CPUArchId apic_id, *found_cpu;
  161. apic_id.arch_id = id;
  162. found_cpu = bsearch(&apic_id, ms->possible_cpus->cpus,
  163. ms->possible_cpus->len, sizeof(*ms->possible_cpus->cpus),
  164. x86_apic_cmp);
  165. if (found_cpu && idx) {
  166. *idx = found_cpu - ms->possible_cpus->cpus;
  167. }
  168. return found_cpu;
  169. }
/*
 * Plug handler, called after a CPU device is realized: notifies the
 * ACPI device (if any), bumps the CPU counters published via RTC CMOS
 * and fw_cfg, and records the CPU object in its topology slot.
 */
void x86_cpu_plug(HotplugHandler *hotplug_dev,
                  DeviceState *dev, Error **errp)
{
    CPUArchId *found_cpu;
    Error *local_err = NULL;
    X86CPU *cpu = X86_CPU(dev);
    X86MachineState *x86ms = X86_MACHINE(hotplug_dev);

    if (x86ms->acpi_dev) {
        hotplug_handler_plug(x86ms->acpi_dev, dev, &local_err);
        if (local_err) {
            goto out;
        }
    }

    /* increment the number of CPUs */
    x86ms->boot_cpus++;
    if (x86ms->rtc) {
        x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
    }
    if (x86ms->fw_cfg) {
        fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
    }

    /*
     * found_cpu is not NULL-checked — presumably the slot's existence
     * was validated in x86_cpu_pre_plug(); confirm before relying on it.
     */
    found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL);
    found_cpu->cpu = OBJECT(dev);
out:
    error_propagate(errp, local_err);
}
  196. void x86_cpu_unplug_request_cb(HotplugHandler *hotplug_dev,
  197. DeviceState *dev, Error **errp)
  198. {
  199. int idx = -1;
  200. X86CPU *cpu = X86_CPU(dev);
  201. X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
  202. if (!x86ms->acpi_dev) {
  203. error_setg(errp, "CPU hot unplug not supported without ACPI");
  204. return;
  205. }
  206. x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx);
  207. assert(idx != -1);
  208. if (idx == 0) {
  209. error_setg(errp, "Boot CPU is unpluggable");
  210. return;
  211. }
  212. hotplug_handler_unplug_request(x86ms->acpi_dev, dev,
  213. errp);
  214. }
/*
 * Unplug handler: reverses x86_cpu_plug() after ACPI ejection — clears
 * the topology slot, unrealizes the device, and updates the CPU
 * counters in CMOS and fw_cfg.
 */
void x86_cpu_unplug_cb(HotplugHandler *hotplug_dev,
                       DeviceState *dev, Error **errp)
{
    CPUArchId *found_cpu;
    Error *local_err = NULL;
    X86CPU *cpu = X86_CPU(dev);
    X86MachineState *x86ms = X86_MACHINE(hotplug_dev);

    hotplug_handler_unplug(x86ms->acpi_dev, dev, &local_err);
    if (local_err) {
        goto out;
    }

    found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL);
    found_cpu->cpu = NULL;
    qdev_unrealize(dev);

    /* decrement the number of CPUs */
    x86ms->boot_cpus--;
    /*
     * Update the number of CPUs in CMOS.  NOTE(review): unlike
     * x86_cpu_plug(), x86ms->rtc and x86ms->fw_cfg are not NULL-checked
     * here — presumably hot-unplug only occurs on machines with both;
     * confirm against callers.
     */
    x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
    fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
out:
    error_propagate(errp, local_err);
}
/*
 * Pre-plug handler: validates a CPU's type and topology properties,
 * derives its APIC ID when unset (and, conversely, derives the topology
 * properties from an explicitly set APIC ID), checks that the target
 * slot exists and is free, and assigns the CPU index.  Runs before the
 * device is realized, so every failure path simply sets @errp.
 */
void x86_cpu_pre_plug(HotplugHandler *hotplug_dev,
                      DeviceState *dev, Error **errp)
{
    int idx;
    CPUState *cs;
    CPUArchId *cpu_slot;
    X86CPUTopoIDs topo_ids;
    X86CPU *cpu = X86_CPU(dev);
    CPUX86State *env = &cpu->env;
    MachineState *ms = MACHINE(hotplug_dev);
    X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
    unsigned int smp_cores = ms->smp.cores;
    unsigned int smp_threads = ms->smp.threads;
    X86CPUTopoInfo topo_info;

    if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) {
        error_setg(errp, "Invalid CPU type, expected cpu type: '%s'",
                   ms->cpu_type);
        return;
    }

    /* Let the ACPI device veto the plug first, if one is present. */
    if (x86ms->acpi_dev) {
        Error *local_err = NULL;
        hotplug_handler_pre_plug(HOTPLUG_HANDLER(x86ms->acpi_dev), dev,
                                 &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }

    init_topo_info(&topo_info, x86ms);

    env->nr_dies = ms->smp.dies;

    /*
     * If APIC ID is not set,
     * set it based on socket/die/core/thread properties.
     */
    if (cpu->apic_id == UNASSIGNED_APIC_ID) {
        int max_socket = (ms->smp.max_cpus - 1) /
                         smp_threads / smp_cores / ms->smp.dies;

        /*
         * die-id was optional in QEMU 4.0 and older, so keep it optional
         * if there's only one die per socket.
         */
        if (cpu->die_id < 0 && ms->smp.dies == 1) {
            cpu->die_id = 0;
        }

        /* Each topology property must be set and within its -smp range. */
        if (cpu->socket_id < 0) {
            error_setg(errp, "CPU socket-id is not set");
            return;
        } else if (cpu->socket_id > max_socket) {
            error_setg(errp, "Invalid CPU socket-id: %u must be in range 0:%u",
                       cpu->socket_id, max_socket);
            return;
        }
        if (cpu->die_id < 0) {
            error_setg(errp, "CPU die-id is not set");
            return;
        } else if (cpu->die_id > ms->smp.dies - 1) {
            error_setg(errp, "Invalid CPU die-id: %u must be in range 0:%u",
                       cpu->die_id, ms->smp.dies - 1);
            return;
        }
        if (cpu->core_id < 0) {
            error_setg(errp, "CPU core-id is not set");
            return;
        } else if (cpu->core_id > (smp_cores - 1)) {
            error_setg(errp, "Invalid CPU core-id: %u must be in range 0:%u",
                       cpu->core_id, smp_cores - 1);
            return;
        }
        if (cpu->thread_id < 0) {
            error_setg(errp, "CPU thread-id is not set");
            return;
        } else if (cpu->thread_id > (smp_threads - 1)) {
            error_setg(errp, "Invalid CPU thread-id: %u must be in range 0:%u",
                       cpu->thread_id, smp_threads - 1);
            return;
        }

        topo_ids.pkg_id = cpu->socket_id;
        topo_ids.die_id = cpu->die_id;
        topo_ids.core_id = cpu->core_id;
        topo_ids.smt_id = cpu->thread_id;
        cpu->apic_id = x86_apicid_from_topo_ids(&topo_info, &topo_ids);
    }

    cpu_slot = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx);
    if (!cpu_slot) {
        MachineState *ms = MACHINE(x86ms);  /* shadows outer ms on purpose */

        x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids);
        error_setg(errp,
            "Invalid CPU [socket: %u, die: %u, core: %u, thread: %u] with"
            " APIC ID %" PRIu32 ", valid index range 0:%d",
            topo_ids.pkg_id, topo_ids.die_id, topo_ids.core_id, topo_ids.smt_id,
            cpu->apic_id, ms->possible_cpus->len - 1);
        return;
    }

    if (cpu_slot->cpu) {
        error_setg(errp, "CPU[%d] with APIC ID %" PRIu32 " exists",
                   idx, cpu->apic_id);
        return;
    }

    /* if 'address' properties socket-id/core-id/thread-id are not set, set them
     * so that machine_query_hotpluggable_cpus would show correct values
     */
    /* TODO: move socket_id/core_id/thread_id checks into x86_cpu_realizefn()
     * once -smp refactoring is complete and there will be CPU private
     * CPUState::nr_cores and CPUState::nr_threads fields instead of globals */
    x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids);

    /* Any property that was set explicitly must agree with the APIC ID. */
    if (cpu->socket_id != -1 && cpu->socket_id != topo_ids.pkg_id) {
        error_setg(errp, "property socket-id: %u doesn't match set apic-id:"
            " 0x%x (socket-id: %u)", cpu->socket_id, cpu->apic_id,
            topo_ids.pkg_id);
        return;
    }
    cpu->socket_id = topo_ids.pkg_id;

    if (cpu->die_id != -1 && cpu->die_id != topo_ids.die_id) {
        error_setg(errp, "property die-id: %u doesn't match set apic-id:"
            " 0x%x (die-id: %u)", cpu->die_id, cpu->apic_id, topo_ids.die_id);
        return;
    }
    cpu->die_id = topo_ids.die_id;

    if (cpu->core_id != -1 && cpu->core_id != topo_ids.core_id) {
        error_setg(errp, "property core-id: %u doesn't match set apic-id:"
            " 0x%x (core-id: %u)", cpu->core_id, cpu->apic_id,
            topo_ids.core_id);
        return;
    }
    cpu->core_id = topo_ids.core_id;

    if (cpu->thread_id != -1 && cpu->thread_id != topo_ids.smt_id) {
        error_setg(errp, "property thread-id: %u doesn't match set apic-id:"
            " 0x%x (thread-id: %u)", cpu->thread_id, cpu->apic_id,
            topo_ids.smt_id);
        return;
    }
    cpu->thread_id = topo_ids.smt_id;

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) &&
        !kvm_hv_vpindex_settable()) {
        error_setg(errp, "kernel doesn't allow setting HyperV VP_INDEX");
        return;
    }

    cs = CPU(cpu);
    cs->cpu_index = idx;

    numa_cpu_pre_plug(cpu_slot, dev, errp);
}
  378. CpuInstanceProperties
  379. x86_cpu_index_to_props(MachineState *ms, unsigned cpu_index)
  380. {
  381. MachineClass *mc = MACHINE_GET_CLASS(ms);
  382. const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms);
  383. assert(cpu_index < possible_cpus->len);
  384. return possible_cpus->cpus[cpu_index].props;
  385. }
  386. int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx)
  387. {
  388. X86CPUTopoIDs topo_ids;
  389. X86MachineState *x86ms = X86_MACHINE(ms);
  390. X86CPUTopoInfo topo_info;
  391. init_topo_info(&topo_info, x86ms);
  392. assert(idx < ms->possible_cpus->len);
  393. x86_topo_ids_from_apicid(ms->possible_cpus->cpus[idx].arch_id,
  394. &topo_info, &topo_ids);
  395. return topo_ids.pkg_id % ms->numa_state->num_nodes;
  396. }
/*
 * Build (once) and return the list of all possible CPUs for this
 * machine, one entry per index up to -smp maxcpus, each with its APIC
 * ID and decoded socket/die/core/thread properties.
 */
const CPUArchIdList *x86_possible_cpu_arch_ids(MachineState *ms)
{
    X86MachineState *x86ms = X86_MACHINE(ms);
    unsigned int max_cpus = ms->smp.max_cpus;
    X86CPUTopoInfo topo_info;
    int i;

    if (ms->possible_cpus) {
        /*
         * make sure that max_cpus hasn't changed since the first use, i.e.
         * -smp hasn't been parsed after it
         */
        assert(ms->possible_cpus->len == max_cpus);
        return ms->possible_cpus;
    }

    /* CPUArchIdList has a flexible array member, hence the manual sizing. */
    ms->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
                                  sizeof(CPUArchId) * max_cpus);
    ms->possible_cpus->len = max_cpus;
    init_topo_info(&topo_info, x86ms);
    for (i = 0; i < ms->possible_cpus->len; i++) {
        X86CPUTopoIDs topo_ids;

        ms->possible_cpus->cpus[i].type = ms->cpu_type;
        ms->possible_cpus->cpus[i].vcpus_count = 1;
        ms->possible_cpus->cpus[i].arch_id =
            x86_cpu_apic_id_from_index(x86ms, i);
        x86_topo_ids_from_apicid(ms->possible_cpus->cpus[i].arch_id,
                                 &topo_info, &topo_ids);
        ms->possible_cpus->cpus[i].props.has_socket_id = true;
        ms->possible_cpus->cpus[i].props.socket_id = topo_ids.pkg_id;
        /* die-id only shown when multiple dies are configured */
        if (ms->smp.dies > 1) {
            ms->possible_cpus->cpus[i].props.has_die_id = true;
            ms->possible_cpus->cpus[i].props.die_id = topo_ids.die_id;
        }
        ms->possible_cpus->cpus[i].props.has_core_id = true;
        ms->possible_cpus->cpus[i].props.core_id = topo_ids.core_id;
        ms->possible_cpus->cpus[i].props.has_thread_id = true;
        ms->possible_cpus->cpus[i].props.thread_id = topo_ids.smt_id;
    }
    return ms->possible_cpus;
}
  436. static void x86_nmi(NMIState *n, int cpu_index, Error **errp)
  437. {
  438. /* cpu index isn't used */
  439. CPUState *cs;
  440. CPU_FOREACH(cs) {
  441. X86CPU *cpu = X86_CPU(cs);
  442. if (!cpu->apic_state) {
  443. cpu_interrupt(cs, CPU_INTERRUPT_NMI);
  444. } else {
  445. apic_deliver_nmi(cpu->apic_state);
  446. }
  447. }
  448. }
  449. static long get_file_size(FILE *f)
  450. {
  451. long where, size;
  452. /* XXX: on Unix systems, using fstat() probably makes more sense */
  453. where = ftell(f);
  454. fseek(f, 0, SEEK_END);
  455. size = ftell(f);
  456. fseek(f, where, SEEK_SET);
  457. return size;
  458. }
/* TSC handling */

/* Return the guest TSC value, derived from the elapsed tick counter. */
uint64_t cpu_get_tsc(CPUX86State *env)
{
    return cpus_get_elapsed_ticks();
}
/* IRQ handling */

/*
 * qemu_irq handler for the PIC output line.  With a userspace APIC
 * (no in-kernel irqchip, no WHPX platform APIC) the interrupt is
 * offered to every CPU's APIC that accepts PIC interrupts; otherwise
 * it is raised/lowered directly on the first CPU.
 */
static void pic_irq_request(void *opaque, int irq, int level)
{
    CPUState *cs = first_cpu;
    X86CPU *cpu = X86_CPU(cs);

    trace_x86_pic_interrupt(irq, level);
    if (cpu->apic_state && !kvm_irqchip_in_kernel() &&
        !whpx_apic_in_platform()) {
        CPU_FOREACH(cs) {
            cpu = X86_CPU(cs);
            if (apic_accept_pic_intr(cpu->apic_state)) {
                apic_deliver_pic_intr(cpu->apic_state, level);
            }
        }
    } else {
        /* No (usable) APIC: drive CPU_INTERRUPT_HARD on the boot CPU. */
        if (level) {
            cpu_interrupt(cs, CPU_INTERRUPT_HARD);
        } else {
            cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
        }
    }
}
/* Allocate the qemu_irq that feeds pic_irq_request() (the CPU INTR line). */
qemu_irq x86_allocate_cpu_irq(void)
{
    return qemu_allocate_irq(pic_irq_request, NULL, 0);
}
  490. int cpu_get_pic_interrupt(CPUX86State *env)
  491. {
  492. X86CPU *cpu = env_archcpu(env);
  493. int intno;
  494. if (!kvm_irqchip_in_kernel() && !whpx_apic_in_platform()) {
  495. intno = apic_get_interrupt(cpu->apic_state);
  496. if (intno >= 0) {
  497. return intno;
  498. }
  499. /* read the irq from the PIC */
  500. if (!apic_accept_pic_intr(cpu->apic_state)) {
  501. return -1;
  502. }
  503. }
  504. intno = pic_read_irq(isa_pic);
  505. return intno;
  506. }
  507. DeviceState *cpu_get_current_apic(void)
  508. {
  509. if (current_cpu) {
  510. X86CPU *cpu = X86_CPU(current_cpu);
  511. return cpu->apic_state;
  512. } else {
  513. return NULL;
  514. }
  515. }
/*
 * Route GSI @n to the interrupt controllers that can see it: IRQs
 * below ISA_NUM_IRQS go to the i8259 (when wired) AND — via the
 * deliberate fallthrough — to the primary IOAPIC; the remaining
 * primary-IOAPIC pins go to the IOAPIC only; the secondary IOAPIC
 * serves its own IRQ base range.
 */
void gsi_handler(void *opaque, int n, int level)
{
    GSIState *s = opaque;

    trace_x86_gsi_interrupt(n, level);
    switch (n) {
    case 0 ... ISA_NUM_IRQS - 1:
        if (s->i8259_irq[n]) {
            /* Under KVM, Kernel will forward to both PIC and IOAPIC */
            qemu_set_irq(s->i8259_irq[n], level);
        }
        /* fall through */
    case ISA_NUM_IRQS ... IOAPIC_NUM_PINS - 1:
        qemu_set_irq(s->ioapic_irq[n], level);
        break;
    case IO_APIC_SECONDARY_IRQBASE
        ... IO_APIC_SECONDARY_IRQBASE + IOAPIC_NUM_PINS - 1:
        qemu_set_irq(s->ioapic2_irq[n - IO_APIC_SECONDARY_IRQBASE], level);
        break;
    }
}
  536. void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name)
  537. {
  538. DeviceState *dev;
  539. SysBusDevice *d;
  540. unsigned int i;
  541. assert(parent_name);
  542. if (kvm_ioapic_in_kernel()) {
  543. dev = qdev_new(TYPE_KVM_IOAPIC);
  544. } else {
  545. dev = qdev_new(TYPE_IOAPIC);
  546. }
  547. object_property_add_child(object_resolve_path(parent_name, NULL),
  548. "ioapic", OBJECT(dev));
  549. d = SYS_BUS_DEVICE(dev);
  550. sysbus_realize_and_unref(d, &error_fatal);
  551. sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS);
  552. for (i = 0; i < IOAPIC_NUM_PINS; i++) {
  553. gsi_state->ioapic_irq[i] = qdev_get_gpio_in(dev, i);
  554. }
  555. }
  556. DeviceState *ioapic_init_secondary(GSIState *gsi_state)
  557. {
  558. DeviceState *dev;
  559. SysBusDevice *d;
  560. unsigned int i;
  561. dev = qdev_new(TYPE_IOAPIC);
  562. d = SYS_BUS_DEVICE(dev);
  563. sysbus_realize_and_unref(d, &error_fatal);
  564. sysbus_mmio_map(d, 0, IO_APIC_SECONDARY_ADDRESS);
  565. for (i = 0; i < IOAPIC_NUM_PINS; i++) {
  566. gsi_state->ioapic2_irq[i] = qdev_get_gpio_in(dev, i);
  567. }
  568. return dev;
  569. }
/*
 * One entry in the Linux boot protocol's setup_data linked list: a
 * typed blob chained via 'next'.  NOTE(review): 'len' is read with
 * le32_to_cpu() below (reset_rng_seed), so the fields are stored
 * little-endian as the boot protocol expects.
 */
typedef struct SetupData {
    uint64_t next;      /* physical address of the next entry, 0 = end */
    uint32_t type;      /* SETUP_* type tag */
    uint32_t len;       /* byte length of data[] */
    uint8_t data[];     /* payload (flexible array member) */
} __attribute__((packed)) SetupData;
/*
 * The entry point into the kernel for PVH boot is different from
 * the native entry point. The PVH entry is defined by the x86/HVM
 * direct boot ABI and is available in an ELFNOTE in the kernel binary.
 *
 * This function is passed to load_elf() when it is called from
 * load_elfboot() which then additionally checks for an ELF Note of
 * type XEN_ELFNOTE_PHYS32_ENTRY and passes it to this function to
 * parse the PVH entry address from the ELF Note.
 *
 * Due to trickery in elf_opts.h, load_elf() is actually available as
 * load_elf32() or load_elf64() and this routine needs to be able
 * to deal with being called as 32 or 64 bit.
 *
 * The address of the PVH entry point is saved to the 'pvh_start_addr'
 * global variable. (although the entry point is 32-bit, the kernel
 * binary can be either 32-bit or 64-bit).
 *
 * @arg1: pointer to the ELF note header (elf32_note/elf64_note);
 * @arg2: pointer to the program header alignment value.
 * Returns the parsed entry address, or 0 if @arg1 is NULL.
 */
static uint64_t read_pvh_start_addr(void *arg1, void *arg2, bool is64)
{
    size_t *elf_note_data_addr;

    /* Check if ELF Note header passed in is valid */
    if (arg1 == NULL) {
        return 0;
    }

    if (is64) {
        struct elf64_note *nhdr64 = (struct elf64_note *)arg1;
        uint64_t nhdr_size64 = sizeof(struct elf64_note);
        uint64_t phdr_align = *(uint64_t *)arg2;
        uint64_t nhdr_namesz = nhdr64->n_namesz;

        /* Payload sits after the header and the alignment-padded name. */
        elf_note_data_addr =
            ((void *)nhdr64) + nhdr_size64 +
            QEMU_ALIGN_UP(nhdr_namesz, phdr_align);

        pvh_start_addr = *elf_note_data_addr;
    } else {
        struct elf32_note *nhdr32 = (struct elf32_note *)arg1;
        uint32_t nhdr_size32 = sizeof(struct elf32_note);
        uint32_t phdr_align = *(uint32_t *)arg2;
        uint32_t nhdr_namesz = nhdr32->n_namesz;

        elf_note_data_addr =
            ((void *)nhdr32) + nhdr_size32 +
            QEMU_ALIGN_UP(nhdr_namesz, phdr_align);

        /* 32-bit note: read only 32 bits of payload. */
        pvh_start_addr = *(uint32_t *)elf_note_data_addr;
    }

    return pvh_start_addr;
}
/*
 * Try to load @kernel_filename as an uncompressed ELF kernel for PVH
 * boot.  Returns false when @header lacks the ELF magic; otherwise
 * loads the image (read_pvh_start_addr() records the
 * XEN_ELFNOTE_PHYS32_ENTRY note into pvh_start_addr), fills the fw_cfg
 * kernel entry/addr/size slots and returns true.  Any load failure,
 * unsupported flag, or missing PVH note exits QEMU.
 *
 * NOTE(review): @kernel_file_size and @pvh_xen_start_addr are unused
 * in this body.
 */
static bool load_elfboot(const char *kernel_filename,
                         int kernel_file_size,
                         uint8_t *header,
                         size_t pvh_xen_start_addr,
                         FWCfgState *fw_cfg)
{
    uint32_t flags = 0;
    uint32_t mh_load_addr = 0;
    uint32_t elf_kernel_size = 0;
    uint64_t elf_entry;
    uint64_t elf_low, elf_high;
    int kernel_size;

    if (ldl_p(header) != 0x464c457f) {  /* "\x7fELF" read little-endian */
        return false; /* no elfboot */
    }

    bool elf_is64 = header[EI_CLASS] == ELFCLASS64;
    flags = elf_is64 ?
        ((Elf64_Ehdr *)header)->e_flags : ((Elf32_Ehdr *)header)->e_flags;

    if (flags & 0x00010004) { /* LOAD_ELF_HEADER_HAS_ADDR */
        error_report("elfboot unsupported flags = %x", flags);
        exit(1);
    }

    uint64_t elf_note_type = XEN_ELFNOTE_PHYS32_ENTRY;
    kernel_size = load_elf(kernel_filename, read_pvh_start_addr,
                           NULL, &elf_note_type, &elf_entry,
                           &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE,
                           0, 0);

    if (kernel_size < 0) {
        error_report("Error while loading elf kernel");
        exit(1);
    }
    mh_load_addr = elf_low;
    elf_kernel_size = elf_high - elf_low;

    if (pvh_start_addr == 0) {
        error_report("Error loading uncompressed kernel without PVH ELF Note");
        exit(1);
    }
    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, pvh_start_addr);
    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr);
    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, elf_kernel_size);

    return true;
}
/*
 * State for patching a 64-bit value in guest setup data: 'pos' is the
 * host pointer to patch, 'new_val' is written by fixup_setup_data()
 * and 'orig_val' restored by reset_setup_data().
 */
typedef struct SetupDataFixup {
    void *pos;
    hwaddr orig_val, new_val;
    uint32_t addr;
} SetupDataFixup;
/* Write fixup->new_val (64-bit, via stq_p) at fixup->pos. */
static void fixup_setup_data(void *opaque)
{
    SetupDataFixup *fixup = opaque;
    stq_p(fixup->pos, fixup->new_val);
}
/* Restore the original 64-bit value at fixup->pos (undo fixup_setup_data). */
static void reset_setup_data(void *opaque)
{
    SetupDataFixup *fixup = opaque;
    stq_p(fixup->pos, fixup->orig_val);
}
/*
 * Refill the RNG-seed setup_data payload with fresh random bytes so a
 * reset guest does not reuse the previous seed.
 */
static void reset_rng_seed(void *opaque)
{
    SetupData *setup_data = opaque;
    qemu_guest_getrandom_nofail(setup_data->data, le32_to_cpu(setup_data->len));
}
/*
 * Load a Linux kernel (plus optional initrd, dtb and RNG-seed setup_data)
 * and publish all boot blobs to the guest firmware through fw_cfg.
 *
 * If the image lacks the Linux real-mode boot header, this falls back to
 * multiboot and then (if @pvh_enabled) to PVH/ELF direct boot.  All
 * unrecoverable errors print a message and exit(1).
 *
 * @x86ms: machine whose -kernel/-initrd/-append/-dtb options are consumed
 * @fw_cfg: fw_cfg device the boot blobs are registered with
 * @acpi_data_size: bytes reserved below 4G for ACPI tables; the initrd is
 *                  kept clear of this region
 * @pvh_enabled: allow the x86/HVM PVH direct-boot path for ELF images
 * @legacy_no_rng_seed: suppress the SETUP_RNG_SEED node (compat knob for
 *                      older machine types)
 */
void x86_load_linux(X86MachineState *x86ms,
                    FWCfgState *fw_cfg,
                    int acpi_data_size,
                    bool pvh_enabled,
                    bool legacy_no_rng_seed)
{
    bool linuxboot_dma_enabled = X86_MACHINE_GET_CLASS(x86ms)->fwcfg_dma_enabled;
    uint16_t protocol;
    int setup_size, kernel_size, cmdline_size;
    int dtb_size, setup_data_offset;
    uint32_t initrd_max;
    uint8_t header[8192], *setup, *kernel;
    hwaddr real_addr, prot_addr, cmdline_addr, initrd_addr = 0, first_setup_data = 0;
    FILE *f;
    char *vmode;
    MachineState *machine = MACHINE(x86ms);
    SetupData *setup_data;
    const char *kernel_filename = machine->kernel_filename;
    const char *initrd_filename = machine->initrd_filename;
    const char *dtb_filename = machine->dtb;
    char *kernel_cmdline;
    SevKernelLoaderContext sev_load_ctx = {};
    enum { RNG_SEED_LENGTH = 32 };

    /*
     * Add the NUL terminator, some padding for the microvm cmdline fiddling
     * hack, and then align to 16 bytes as a paranoia measure
     */
    cmdline_size = (strlen(machine->kernel_cmdline) + 1 +
                    VIRTIO_CMDLINE_TOTAL_MAX_LEN + 16) & ~15;
    /* Make a copy, since we might append arbitrary bytes to it later. */
    kernel_cmdline = g_strndup(machine->kernel_cmdline, cmdline_size);

    /* load the kernel header */
    f = fopen(kernel_filename, "rb");
    if (!f) {
        fprintf(stderr, "qemu: could not open kernel file '%s': %s\n",
                kernel_filename, strerror(errno));
        exit(1);
    }

    kernel_size = get_file_size(f);
    /* Read up to 8K of header; a short read of a larger file is an error. */
    if (!kernel_size ||
        fread(header, 1, MIN(ARRAY_SIZE(header), kernel_size), f) !=
        MIN(ARRAY_SIZE(header), kernel_size)) {
        fprintf(stderr, "qemu: could not load kernel '%s': %s\n",
                kernel_filename, strerror(errno));
        exit(1);
    }

    /* kernel protocol version */
    /* 0x53726448 is little-endian "HdrS", the Linux boot-header magic. */
    if (ldl_p(header + 0x202) == 0x53726448) {
        protocol = lduw_p(header + 0x206);
    } else {
        /*
         * This could be a multiboot kernel. If it is, let's stop treating it
         * like a Linux kernel.
         * Note: some multiboot images could be in the ELF format (the same of
         * PVH), so we try multiboot first since we check the multiboot magic
         * header before to load it.
         */
        if (load_multiboot(x86ms, fw_cfg, f, kernel_filename, initrd_filename,
                           kernel_cmdline, kernel_size, header)) {
            return;
        }
        /*
         * Check if the file is an uncompressed kernel file (ELF) and load it,
         * saving the PVH entry point used by the x86/HVM direct boot ABI.
         * If load_elfboot() is successful, populate the fw_cfg info.
         */
        if (pvh_enabled &&
            load_elfboot(kernel_filename, kernel_size,
                         header, pvh_start_addr, fw_cfg)) {
            fclose(f);

            fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE,
                           strlen(kernel_cmdline) + 1);
            fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline);

            fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, sizeof(header));
            fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA,
                             header, sizeof(header));

            /* load initrd */
            if (initrd_filename) {
                GMappedFile *mapped_file;
                gsize initrd_size;
                gchar *initrd_data;
                GError *gerr = NULL;

                mapped_file = g_mapped_file_new(initrd_filename, false, &gerr);
                if (!mapped_file) {
                    fprintf(stderr, "qemu: error reading initrd %s: %s\n",
                            initrd_filename, gerr->message);
                    exit(1);
                }
                /* Keep the mapping alive for the life of the machine. */
                x86ms->initrd_mapped_file = mapped_file;

                initrd_data = g_mapped_file_get_contents(mapped_file);
                initrd_size = g_mapped_file_get_length(mapped_file);
                /* Stay below 4G and clear of the ACPI table reservation. */
                initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;
                if (initrd_size >= initrd_max) {
                    fprintf(stderr, "qemu: initrd is too large, cannot support."
                            "(max: %"PRIu32", need %"PRId64")\n",
                            initrd_max, (uint64_t)initrd_size);
                    exit(1);
                }

                /* Place the initrd as high as possible, 4K-aligned down. */
                initrd_addr = (initrd_max - initrd_size) & ~4095;
                fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr);
                fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size);
                fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data,
                                 initrd_size);
            }

            option_rom[nb_option_roms].bootindex = 0;
            option_rom[nb_option_roms].name = "pvh.bin";
            nb_option_roms++;

            return;
        }
        /* Neither multiboot nor PVH: treat as a pre-protocol zImage. */
        protocol = 0;
    }

    /*
     * Pick the load layout from the protocol version and the LOADED_HIGH
     * flag (header byte 0x211 bit 0).
     */
    if (protocol < 0x200 || !(header[0x211] & 0x01)) {
        /* Low kernel */
        real_addr = 0x90000;
        cmdline_addr = 0x9a000 - cmdline_size;
        prot_addr = 0x10000;
    } else if (protocol < 0x202) {
        /* High but ancient kernel */
        real_addr = 0x90000;
        cmdline_addr = 0x9a000 - cmdline_size;
        prot_addr = 0x100000;
    } else {
        /* High and recent kernel */
        real_addr = 0x10000;
        cmdline_addr = 0x20000;
        prot_addr = 0x100000;
    }

    /* highest address for loading the initrd */
    if (protocol >= 0x20c &&
        lduw_p(header + 0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) {
        /*
         * Linux has supported initrd up to 4 GB for a very long time (2007,
         * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 2013),
         * though it only sets initrd_max to 2 GB to "work around bootloader
         * bugs". Luckily, QEMU firmware(which does something like bootloader)
         * has supported this.
         *
         * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, initrd can
         * be loaded into any address.
         *
         * In addition, initrd_max is uint32_t simply because QEMU doesn't
         * support the 64-bit boot protocol (specifically the ext_ramdisk_image
         * field).
         *
         * Therefore here just limit initrd_max to UINT32_MAX simply as well.
         */
        initrd_max = UINT32_MAX;
    } else if (protocol >= 0x203) {
        /* Limit advertised by the kernel itself (header field at 0x22c). */
        initrd_max = ldl_p(header + 0x22c);
    } else {
        initrd_max = 0x37ffffff;
    }

    /* Never let the initrd overlap the ACPI table reservation below 4G. */
    if (initrd_max >= x86ms->below_4g_mem_size - acpi_data_size) {
        initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;
    }

    /* Tell the kernel where the command line lives. */
    if (protocol >= 0x202) {
        stl_p(header + 0x228, cmdline_addr);
    } else {
        /* Old-style: magic 0xA33F plus a real-mode-relative offset. */
        stw_p(header + 0x20, 0xA33F);
        stw_p(header + 0x22, cmdline_addr - real_addr);
    }

    /* handle vga= parameter */
    vmode = strstr(kernel_cmdline, "vga=");
    if (vmode) {
        unsigned int video_mode;
        const char *end;
        int ret;
        /* skip "vga=" */
        vmode += 4;
        if (!strncmp(vmode, "normal", 6)) {
            video_mode = 0xffff;
        } else if (!strncmp(vmode, "ext", 3)) {
            video_mode = 0xfffe;
        } else if (!strncmp(vmode, "ask", 3)) {
            video_mode = 0xfffd;
        } else {
            ret = qemu_strtoui(vmode, &end, 0, &video_mode);
            /* Accept a trailing space (more parameters) or end of string. */
            if (ret != 0 || (*end && *end != ' ')) {
                fprintf(stderr, "qemu: invalid 'vga=' kernel parameter.\n");
                exit(1);
            }
        }
        stw_p(header + 0x1fa, video_mode);
    }

    /* loader type */
    /*
     * High nybble = B reserved for QEMU; low nybble is revision number.
     * If this code is substantially changed, you may want to consider
     * incrementing the revision.
     */
    if (protocol >= 0x200) {
        header[0x210] = 0xB0;
    }
    /* heap */
    if (protocol >= 0x201) {
        header[0x211] |= 0x80; /* CAN_USE_HEAP */
        stw_p(header + 0x224, cmdline_addr - real_addr - 0x200);
    }

    /* load initrd */
    if (initrd_filename) {
        GMappedFile *mapped_file;
        gsize initrd_size;
        gchar *initrd_data;
        GError *gerr = NULL;

        if (protocol < 0x200) {
            fprintf(stderr, "qemu: linux kernel too old to load a ram disk\n");
            exit(1);
        }

        mapped_file = g_mapped_file_new(initrd_filename, false, &gerr);
        if (!mapped_file) {
            fprintf(stderr, "qemu: error reading initrd %s: %s\n",
                    initrd_filename, gerr->message);
            exit(1);
        }
        /* Keep the mapping alive for the life of the machine. */
        x86ms->initrd_mapped_file = mapped_file;

        initrd_data = g_mapped_file_get_contents(mapped_file);
        initrd_size = g_mapped_file_get_length(mapped_file);
        if (initrd_size >= initrd_max) {
            fprintf(stderr, "qemu: initrd is too large, cannot support."
                    "(max: %"PRIu32", need %"PRId64")\n",
                    initrd_max, (uint64_t)initrd_size);
            exit(1);
        }

        /* Place the initrd as high as allowed, 4K-aligned down. */
        initrd_addr = (initrd_max - initrd_size) & ~4095;

        fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr);
        fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size);
        fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, initrd_size);
        sev_load_ctx.initrd_data = initrd_data;
        sev_load_ctx.initrd_size = initrd_size;

        /* Record initrd location in the boot header for the kernel. */
        stl_p(header + 0x218, initrd_addr);
        stl_p(header + 0x21c, initrd_size);
    }

    /* load kernel and setup */
    /* Byte 0x1f1 = number of 512-byte setup sectors; 0 means 4. */
    setup_size = header[0x1f1];
    if (setup_size == 0) {
        setup_size = 4;
    }
    /* +1 accounts for the boot sector itself. */
    setup_size = (setup_size + 1) * 512;
    if (setup_size > kernel_size) {
        fprintf(stderr, "qemu: invalid kernel header\n");
        exit(1);
    }
    /* Split the image: real-mode setup first, protected-mode kernel after. */
    kernel_size -= setup_size;

    setup  = g_malloc(setup_size);
    kernel = g_malloc(kernel_size);
    fseek(f, 0, SEEK_SET);
    if (fread(setup, 1, setup_size, f) != setup_size) {
        fprintf(stderr, "fread() failed\n");
        exit(1);
    }
    if (fread(kernel, 1, kernel_size, f) != kernel_size) {
        fprintf(stderr, "fread() failed\n");
        exit(1);
    }
    fclose(f);

    /* append dtb to kernel */
    /*
     * The dtb (and later the RNG seed) ride in the cmdline blob as
     * setup_data nodes chained through first_setup_data; guest-physical
     * addresses are relative to cmdline_addr.
     */
    if (dtb_filename) {
        if (protocol < 0x209) {
            fprintf(stderr, "qemu: Linux kernel too old to load a dtb\n");
            exit(1);
        }

        dtb_size = get_image_size(dtb_filename);
        if (dtb_size <= 0) {
            fprintf(stderr, "qemu: error reading dtb %s: %s\n",
                    dtb_filename, strerror(errno));
            exit(1);
        }

        setup_data_offset = cmdline_size;
        cmdline_size += sizeof(SetupData) + dtb_size;
        kernel_cmdline = g_realloc(kernel_cmdline, cmdline_size);
        setup_data = (void *)kernel_cmdline + setup_data_offset;
        /* Link into the (currently empty) setup_data chain. */
        setup_data->next = cpu_to_le64(first_setup_data);
        first_setup_data = cmdline_addr + setup_data_offset;
        setup_data->type = cpu_to_le32(SETUP_DTB);
        setup_data->len = cpu_to_le32(dtb_size);
        load_image_size(dtb_filename, setup_data->data, dtb_size);
    }

    if (!legacy_no_rng_seed && protocol >= 0x209) {
        setup_data_offset = cmdline_size;
        cmdline_size += sizeof(SetupData) + RNG_SEED_LENGTH;
        kernel_cmdline = g_realloc(kernel_cmdline, cmdline_size);
        setup_data = (void *)kernel_cmdline + setup_data_offset;
        setup_data->next = cpu_to_le64(first_setup_data);
        first_setup_data = cmdline_addr + setup_data_offset;
        setup_data->type = cpu_to_le32(SETUP_RNG_SEED);
        setup_data->len = cpu_to_le32(RNG_SEED_LENGTH);
        qemu_guest_getrandom_nofail(setup_data->data, RNG_SEED_LENGTH);
        /* Re-seed on reset so a rebooted guest never sees a stale seed. */
        qemu_register_reset_nosnapshotload(reset_rng_seed, setup_data);
        fw_cfg_add_bytes_callback(fw_cfg, FW_CFG_KERNEL_DATA, reset_rng_seed, NULL,
                                  setup_data, kernel, kernel_size, true);
    } else {
        fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, kernel, kernel_size);
    }

    fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr);
    fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, cmdline_size);
    fw_cfg_add_bytes(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline, cmdline_size);
    sev_load_ctx.cmdline_data = (char *)kernel_cmdline;
    sev_load_ctx.cmdline_size = cmdline_size;

    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, prot_addr);
    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, kernel_size);
    sev_load_ctx.kernel_data = (char *)kernel;
    sev_load_ctx.kernel_size = kernel_size;

    /*
     * If we're starting an encrypted VM, it will be OVMF based, which uses the
     * efi stub for booting and doesn't require any values to be placed in the
     * kernel header. We therefore don't update the header so the hash of the
     * kernel on the other side of the fw_cfg interface matches the hash of the
     * file the user passed in.
     */
    if (!sev_enabled() && first_setup_data) {
        SetupDataFixup *fixup = g_malloc(sizeof(*fixup));

        memcpy(setup, header, MIN(sizeof(header), setup_size));
        /* Offset 0x250 is a pointer to the first setup_data link. */
        fixup->pos = setup + 0x250;
        fixup->orig_val = ldq_p(fixup->pos);
        fixup->new_val = first_setup_data;
        fixup->addr = cpu_to_le32(real_addr);
        /* Patch the pointer in when firmware reads SETUP_ADDR; undo on reset. */
        fw_cfg_add_bytes_callback(fw_cfg, FW_CFG_SETUP_ADDR, fixup_setup_data, NULL,
                                  fixup, &fixup->addr, sizeof(fixup->addr), true);
        qemu_register_reset(reset_setup_data, fixup);
    } else {
        fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_ADDR, real_addr);
    }
    fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, setup_size);
    fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, setup, setup_size);
    sev_load_ctx.setup_data = (char *)setup;
    sev_load_ctx.setup_size = setup_size;

    /* SEV measures the boot blobs so the guest owner can attest them. */
    if (sev_enabled()) {
        sev_add_kernel_loader_hashes(&sev_load_ctx, &error_fatal);
    }

    option_rom[nb_option_roms].bootindex = 0;
    option_rom[nb_option_roms].name = "linuxboot.bin";
    if (linuxboot_dma_enabled && fw_cfg_dma_enabled(fw_cfg)) {
        option_rom[nb_option_roms].name = "linuxboot_dma.bin";
    }
    nb_option_roms++;
}
/*
 * Load the firmware image (ms->firmware, falling back to
 * @default_firmware) and map it into the guest address space: the whole
 * image ends just below 4G, and its last 128KB are aliased into the
 * legacy ISA BIOS window below 1M.  Exits fatally if the image is
 * missing or not a multiple of 64K.
 *
 * @isapc_ram_fw: when true, leave the firmware writable (isapc-style
 *                RAM firmware) instead of marking it read-only.
 */
void x86_bios_rom_init(MachineState *ms, const char *default_firmware,
                       MemoryRegion *rom_memory, bool isapc_ram_fw)
{
    const char *bios_name;
    char *filename;
    MemoryRegion *bios, *isa_bios;
    int bios_size, isa_bios_size;
    ssize_t ret;

    /* BIOS load */
    bios_name = ms->firmware ?: default_firmware;
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
    if (filename) {
        bios_size = get_image_size(filename);
    } else {
        bios_size = -1;
    }
    /* The image must exist and be a whole number of 64K blocks. */
    if (bios_size <= 0 ||
        (bios_size % 65536) != 0) {
        goto bios_error;
    }
    bios = g_malloc(sizeof(*bios));
    memory_region_init_ram(bios, NULL, "pc.bios", bios_size, &error_fatal);
    if (sev_enabled()) {
        /*
         * The concept of a "reset" simply doesn't exist for
         * confidential computing guests, we have to destroy and
         * re-launch them instead. So there is no need to register
         * the firmware as rom to properly re-initialize on reset.
         * Just go for a straight file load instead.
         */
        void *ptr = memory_region_get_ram_ptr(bios);
        load_image_size(filename, ptr, bios_size);
        x86_firmware_configure(ptr, bios_size);
    } else {
        if (!isapc_ram_fw) {
            memory_region_set_readonly(bios, true);
        }
        /* Register as a ROM so the contents are restored on reset. */
        ret = rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1);
        if (ret != 0) {
            goto bios_error;
        }
    }
    g_free(filename);

    /* map the last 128KB of the BIOS in ISA space */
    isa_bios_size = MIN(bios_size, 128 * KiB);
    isa_bios = g_malloc(sizeof(*isa_bios));
    memory_region_init_alias(isa_bios, NULL, "isa-bios", bios,
                             bios_size - isa_bios_size, isa_bios_size);
    /* Priority 1 so the alias wins over plain RAM in the same range. */
    memory_region_add_subregion_overlap(rom_memory,
                                        0x100000 - isa_bios_size,
                                        isa_bios,
                                        1);
    if (!isapc_ram_fw) {
        memory_region_set_readonly(isa_bios, true);
    }

    /* map all the bios at the top of memory */
    memory_region_add_subregion(rom_memory,
                                (uint32_t)(-bios_size),
                                bios);
    return;

bios_error:
    fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name);
    exit(1);
}
  1085. bool x86_machine_is_smm_enabled(const X86MachineState *x86ms)
  1086. {
  1087. bool smm_available = false;
  1088. if (x86ms->smm == ON_OFF_AUTO_OFF) {
  1089. return false;
  1090. }
  1091. if (tcg_enabled() || qtest_enabled()) {
  1092. smm_available = true;
  1093. } else if (kvm_enabled()) {
  1094. smm_available = kvm_has_smm();
  1095. }
  1096. if (smm_available) {
  1097. return true;
  1098. }
  1099. if (x86ms->smm == ON_OFF_AUTO_ON) {
  1100. error_report("System Management Mode not supported by this hypervisor.");
  1101. exit(1);
  1102. }
  1103. return false;
  1104. }
  1105. static void x86_machine_get_smm(Object *obj, Visitor *v, const char *name,
  1106. void *opaque, Error **errp)
  1107. {
  1108. X86MachineState *x86ms = X86_MACHINE(obj);
  1109. OnOffAuto smm = x86ms->smm;
  1110. visit_type_OnOffAuto(v, name, &smm, errp);
  1111. }
  1112. static void x86_machine_set_smm(Object *obj, Visitor *v, const char *name,
  1113. void *opaque, Error **errp)
  1114. {
  1115. X86MachineState *x86ms = X86_MACHINE(obj);
  1116. visit_type_OnOffAuto(v, name, &x86ms->smm, errp);
  1117. }
  1118. bool x86_machine_is_acpi_enabled(const X86MachineState *x86ms)
  1119. {
  1120. if (x86ms->acpi == ON_OFF_AUTO_OFF) {
  1121. return false;
  1122. }
  1123. return true;
  1124. }
  1125. static void x86_machine_get_acpi(Object *obj, Visitor *v, const char *name,
  1126. void *opaque, Error **errp)
  1127. {
  1128. X86MachineState *x86ms = X86_MACHINE(obj);
  1129. OnOffAuto acpi = x86ms->acpi;
  1130. visit_type_OnOffAuto(v, name, &acpi, errp);
  1131. }
  1132. static void x86_machine_set_acpi(Object *obj, Visitor *v, const char *name,
  1133. void *opaque, Error **errp)
  1134. {
  1135. X86MachineState *x86ms = X86_MACHINE(obj);
  1136. visit_type_OnOffAuto(v, name, &x86ms->acpi, errp);
  1137. }
  1138. static void x86_machine_get_pit(Object *obj, Visitor *v, const char *name,
  1139. void *opaque, Error **errp)
  1140. {
  1141. X86MachineState *x86ms = X86_MACHINE(obj);
  1142. OnOffAuto pit = x86ms->pit;
  1143. visit_type_OnOffAuto(v, name, &pit, errp);
  1144. }
  1145. static void x86_machine_set_pit(Object *obj, Visitor *v, const char *name,
  1146. void *opaque, Error **errp)
  1147. {
  1148. X86MachineState *x86ms = X86_MACHINE(obj);;
  1149. visit_type_OnOffAuto(v, name, &x86ms->pit, errp);
  1150. }
  1151. static void x86_machine_get_pic(Object *obj, Visitor *v, const char *name,
  1152. void *opaque, Error **errp)
  1153. {
  1154. X86MachineState *x86ms = X86_MACHINE(obj);
  1155. OnOffAuto pic = x86ms->pic;
  1156. visit_type_OnOffAuto(v, name, &pic, errp);
  1157. }
  1158. static void x86_machine_set_pic(Object *obj, Visitor *v, const char *name,
  1159. void *opaque, Error **errp)
  1160. {
  1161. X86MachineState *x86ms = X86_MACHINE(obj);
  1162. visit_type_OnOffAuto(v, name, &x86ms->pic, errp);
  1163. }
  1164. static char *x86_machine_get_oem_id(Object *obj, Error **errp)
  1165. {
  1166. X86MachineState *x86ms = X86_MACHINE(obj);
  1167. return g_strdup(x86ms->oem_id);
  1168. }
  1169. static void x86_machine_set_oem_id(Object *obj, const char *value, Error **errp)
  1170. {
  1171. X86MachineState *x86ms = X86_MACHINE(obj);
  1172. size_t len = strlen(value);
  1173. if (len > 6) {
  1174. error_setg(errp,
  1175. "User specified "X86_MACHINE_OEM_ID" value is bigger than "
  1176. "6 bytes in size");
  1177. return;
  1178. }
  1179. strncpy(x86ms->oem_id, value, 6);
  1180. }
  1181. static char *x86_machine_get_oem_table_id(Object *obj, Error **errp)
  1182. {
  1183. X86MachineState *x86ms = X86_MACHINE(obj);
  1184. return g_strdup(x86ms->oem_table_id);
  1185. }
  1186. static void x86_machine_set_oem_table_id(Object *obj, const char *value,
  1187. Error **errp)
  1188. {
  1189. X86MachineState *x86ms = X86_MACHINE(obj);
  1190. size_t len = strlen(value);
  1191. if (len > 8) {
  1192. error_setg(errp,
  1193. "User specified "X86_MACHINE_OEM_TABLE_ID
  1194. " value is bigger than "
  1195. "8 bytes in size");
  1196. return;
  1197. }
  1198. strncpy(x86ms->oem_table_id, value, 8);
  1199. }
  1200. static void x86_machine_get_bus_lock_ratelimit(Object *obj, Visitor *v,
  1201. const char *name, void *opaque, Error **errp)
  1202. {
  1203. X86MachineState *x86ms = X86_MACHINE(obj);
  1204. uint64_t bus_lock_ratelimit = x86ms->bus_lock_ratelimit;
  1205. visit_type_uint64(v, name, &bus_lock_ratelimit, errp);
  1206. }
  1207. static void x86_machine_set_bus_lock_ratelimit(Object *obj, Visitor *v,
  1208. const char *name, void *opaque, Error **errp)
  1209. {
  1210. X86MachineState *x86ms = X86_MACHINE(obj);
  1211. visit_type_uint64(v, name, &x86ms->bus_lock_ratelimit, errp);
  1212. }
  1213. static void machine_get_sgx_epc(Object *obj, Visitor *v, const char *name,
  1214. void *opaque, Error **errp)
  1215. {
  1216. X86MachineState *x86ms = X86_MACHINE(obj);
  1217. SgxEPCList *list = x86ms->sgx_epc_list;
  1218. visit_type_SgxEPCList(v, name, &list, errp);
  1219. }
  1220. static void machine_set_sgx_epc(Object *obj, Visitor *v, const char *name,
  1221. void *opaque, Error **errp)
  1222. {
  1223. X86MachineState *x86ms = X86_MACHINE(obj);
  1224. SgxEPCList *list;
  1225. list = x86ms->sgx_epc_list;
  1226. visit_type_SgxEPCList(v, name, &x86ms->sgx_epc_list, errp);
  1227. qapi_free_SgxEPCList(list);
  1228. }
  1229. static void x86_machine_initfn(Object *obj)
  1230. {
  1231. X86MachineState *x86ms = X86_MACHINE(obj);
  1232. x86ms->smm = ON_OFF_AUTO_AUTO;
  1233. x86ms->acpi = ON_OFF_AUTO_AUTO;
  1234. x86ms->pit = ON_OFF_AUTO_AUTO;
  1235. x86ms->pic = ON_OFF_AUTO_AUTO;
  1236. x86ms->pci_irq_mask = ACPI_BUILD_PCI_IRQS;
  1237. x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6);
  1238. x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8);
  1239. x86ms->bus_lock_ratelimit = 0;
  1240. x86ms->above_4g_mem_start = 4 * GiB;
  1241. }
/*
 * Class init for the abstract x86 machine type: install the CPU topology
 * hooks, the NMI monitor handler, and all user-visible QOM properties
 * shared by every x86 machine model.
 */
static void x86_machine_class_init(ObjectClass *oc, void *data)
{
    MachineClass *mc = MACHINE_CLASS(oc);
    X86MachineClass *x86mc = X86_MACHINE_CLASS(oc);
    NMIClass *nc = NMI_CLASS(oc);

    /* CPU topology / hotplug hooks shared by all x86 machines. */
    mc->cpu_index_to_instance_props = x86_cpu_index_to_props;
    mc->get_default_cpu_node_id = x86_get_default_cpu_node_id;
    mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids;
    x86mc->save_tsc_khz = true;
    x86mc->fwcfg_dma_enabled = true;
    nc->nmi_monitor_handler = x86_nmi;

    /* Tri-state (on/off/auto) machine options. */
    object_class_property_add(oc, X86_MACHINE_SMM, "OnOffAuto",
        x86_machine_get_smm, x86_machine_set_smm,
        NULL, NULL);
    object_class_property_set_description(oc, X86_MACHINE_SMM,
        "Enable SMM");

    object_class_property_add(oc, X86_MACHINE_ACPI, "OnOffAuto",
        x86_machine_get_acpi, x86_machine_set_acpi,
        NULL, NULL);
    object_class_property_set_description(oc, X86_MACHINE_ACPI,
        "Enable ACPI");

    object_class_property_add(oc, X86_MACHINE_PIT, "OnOffAuto",
                              x86_machine_get_pit,
                              x86_machine_set_pit,
                              NULL, NULL);
    object_class_property_set_description(oc, X86_MACHINE_PIT,
        "Enable i8254 PIT");

    object_class_property_add(oc, X86_MACHINE_PIC, "OnOffAuto",
                              x86_machine_get_pic,
                              x86_machine_set_pic,
                              NULL, NULL);
    object_class_property_set_description(oc, X86_MACHINE_PIC,
        "Enable i8259 PIC");

    /* ACPI table OEM identifier overrides (fixed-width string fields). */
    object_class_property_add_str(oc, X86_MACHINE_OEM_ID,
                                  x86_machine_get_oem_id,
                                  x86_machine_set_oem_id);
    object_class_property_set_description(oc, X86_MACHINE_OEM_ID,
                                          "Override the default value of field OEMID "
                                          "in ACPI table header."
                                          "The string may be up to 6 bytes in size");

    object_class_property_add_str(oc, X86_MACHINE_OEM_TABLE_ID,
                                  x86_machine_get_oem_table_id,
                                  x86_machine_set_oem_table_id);
    object_class_property_set_description(oc, X86_MACHINE_OEM_TABLE_ID,
                                          "Override the default value of field OEM Table ID "
                                          "in ACPI table header."
                                          "The string may be up to 8 bytes in size");

    object_class_property_add(oc, X86_MACHINE_BUS_LOCK_RATELIMIT, "uint64_t",
                                x86_machine_get_bus_lock_ratelimit,
                                x86_machine_set_bus_lock_ratelimit, NULL, NULL);
    object_class_property_set_description(oc, X86_MACHINE_BUS_LOCK_RATELIMIT,
            "Set the ratelimit for the bus locks acquired in VMs");

    /* SGX Enclave Page Cache configuration. */
    object_class_property_add(oc, "sgx-epc", "SgxEPC",
        machine_get_sgx_epc, machine_set_sgx_epc,
        NULL, NULL);
    object_class_property_set_description(oc, "sgx-epc",
        "SGX EPC device");
}
/* QOM type description for the abstract base of all x86 machine models. */
static const TypeInfo x86_machine_info = {
    .name = TYPE_X86_MACHINE,
    .parent = TYPE_MACHINE,
    .abstract = true,                      /* only subclassed, never created */
    .instance_size = sizeof(X86MachineState),
    .instance_init = x86_machine_initfn,
    .class_size = sizeof(X86MachineClass),
    .class_init = x86_machine_class_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_NMI },                      /* handled by x86_nmi (class_init) */
        { }
    },
};
/* Register the abstract x86 machine type with QOM at module-init time. */
static void x86_machine_register_types(void)
{
    type_register_static(&x86_machine_info);
}

type_init(x86_machine_register_types)