microvm.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574
  1. /*
  2. * Copyright (c) 2018 Intel Corporation
  3. * Copyright (c) 2019 Red Hat, Inc.
  4. *
  5. * This program is free software; you can redistribute it and/or modify it
  6. * under the terms and conditions of the GNU General Public License,
  7. * version 2 or later, as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope it will be useful, but WITHOUT
  10. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11. * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  12. * more details.
  13. *
  14. * You should have received a copy of the GNU General Public License along with
  15. * this program. If not, see <http://www.gnu.org/licenses/>.
  16. */
  17. #include "qemu/osdep.h"
  18. #include "qemu/error-report.h"
  19. #include "qemu/cutils.h"
  20. #include "qemu/units.h"
  21. #include "qapi/error.h"
  22. #include "qapi/visitor.h"
  23. #include "qapi/qapi-visit-common.h"
  24. #include "sysemu/sysemu.h"
  25. #include "sysemu/cpus.h"
  26. #include "sysemu/numa.h"
  27. #include "sysemu/reset.h"
  28. #include "hw/loader.h"
  29. #include "hw/irq.h"
  30. #include "hw/kvm/clock.h"
  31. #include "hw/i386/microvm.h"
  32. #include "hw/i386/x86.h"
  33. #include "hw/i386/pc.h"
  34. #include "target/i386/cpu.h"
  35. #include "hw/timer/i8254.h"
  36. #include "hw/rtc/mc146818rtc.h"
  37. #include "hw/char/serial.h"
  38. #include "hw/i386/topology.h"
  39. #include "hw/i386/e820_memory_layout.h"
  40. #include "hw/i386/fw_cfg.h"
  41. #include "hw/virtio/virtio-mmio.h"
  42. #include "cpu.h"
  43. #include "elf.h"
  44. #include "kvm_i386.h"
  45. #include "hw/xen/start_info.h"
  46. #define MICROVM_BIOS_FILENAME "bios-microvm.bin"
  47. static void microvm_set_rtc(MicrovmMachineState *mms, ISADevice *s)
  48. {
  49. X86MachineState *x86ms = X86_MACHINE(mms);
  50. int val;
  51. val = MIN(x86ms->below_4g_mem_size / KiB, 640);
  52. rtc_set_memory(s, 0x15, val);
  53. rtc_set_memory(s, 0x16, val >> 8);
  54. /* extended memory (next 64MiB) */
  55. if (x86ms->below_4g_mem_size > 1 * MiB) {
  56. val = (x86ms->below_4g_mem_size - 1 * MiB) / KiB;
  57. } else {
  58. val = 0;
  59. }
  60. if (val > 65535) {
  61. val = 65535;
  62. }
  63. rtc_set_memory(s, 0x17, val);
  64. rtc_set_memory(s, 0x18, val >> 8);
  65. rtc_set_memory(s, 0x30, val);
  66. rtc_set_memory(s, 0x31, val >> 8);
  67. /* memory between 16MiB and 4GiB */
  68. if (x86ms->below_4g_mem_size > 16 * MiB) {
  69. val = (x86ms->below_4g_mem_size - 16 * MiB) / (64 * KiB);
  70. } else {
  71. val = 0;
  72. }
  73. if (val > 65535) {
  74. val = 65535;
  75. }
  76. rtc_set_memory(s, 0x34, val);
  77. rtc_set_memory(s, 0x35, val >> 8);
  78. /* memory above 4GiB */
  79. val = x86ms->above_4g_mem_size / 65536;
  80. rtc_set_memory(s, 0x5b, val);
  81. rtc_set_memory(s, 0x5c, val >> 8);
  82. rtc_set_memory(s, 0x5d, val >> 16);
  83. }
  84. static void microvm_gsi_handler(void *opaque, int n, int level)
  85. {
  86. GSIState *s = opaque;
  87. qemu_set_irq(s->ioapic_irq[n], level);
  88. }
  89. static void microvm_devices_init(MicrovmMachineState *mms)
  90. {
  91. X86MachineState *x86ms = X86_MACHINE(mms);
  92. ISABus *isa_bus;
  93. ISADevice *rtc_state;
  94. GSIState *gsi_state;
  95. int i;
  96. /* Core components */
  97. gsi_state = g_malloc0(sizeof(*gsi_state));
  98. if (mms->pic == ON_OFF_AUTO_ON || mms->pic == ON_OFF_AUTO_AUTO) {
  99. x86ms->gsi = qemu_allocate_irqs(gsi_handler, gsi_state, GSI_NUM_PINS);
  100. } else {
  101. x86ms->gsi = qemu_allocate_irqs(microvm_gsi_handler,
  102. gsi_state, GSI_NUM_PINS);
  103. }
  104. isa_bus = isa_bus_new(NULL, get_system_memory(), get_system_io(),
  105. &error_abort);
  106. isa_bus_irqs(isa_bus, x86ms->gsi);
  107. ioapic_init_gsi(gsi_state, "machine");
  108. kvmclock_create();
  109. for (i = 0; i < VIRTIO_NUM_TRANSPORTS; i++) {
  110. sysbus_create_simple("virtio-mmio",
  111. VIRTIO_MMIO_BASE + i * 512,
  112. x86ms->gsi[VIRTIO_IRQ_BASE + i]);
  113. }
  114. /* Optional and legacy devices */
  115. if (mms->pic == ON_OFF_AUTO_ON || mms->pic == ON_OFF_AUTO_AUTO) {
  116. qemu_irq *i8259;
  117. i8259 = i8259_init(isa_bus, pc_allocate_cpu_irq());
  118. for (i = 0; i < ISA_NUM_IRQS; i++) {
  119. gsi_state->i8259_irq[i] = i8259[i];
  120. }
  121. g_free(i8259);
  122. }
  123. if (mms->pit == ON_OFF_AUTO_ON || mms->pit == ON_OFF_AUTO_AUTO) {
  124. if (kvm_pit_in_kernel()) {
  125. kvm_pit_init(isa_bus, 0x40);
  126. } else {
  127. i8254_pit_init(isa_bus, 0x40, 0, NULL);
  128. }
  129. }
  130. if (mms->rtc == ON_OFF_AUTO_ON ||
  131. (mms->rtc == ON_OFF_AUTO_AUTO && !kvm_enabled())) {
  132. rtc_state = mc146818_rtc_init(isa_bus, 2000, NULL);
  133. microvm_set_rtc(mms, rtc_state);
  134. }
  135. if (mms->isa_serial) {
  136. serial_hds_isa_init(isa_bus, 0, 1);
  137. }
  138. if (bios_name == NULL) {
  139. bios_name = MICROVM_BIOS_FILENAME;
  140. }
  141. x86_bios_rom_init(get_system_memory(), true);
  142. }
  143. static void microvm_memory_init(MicrovmMachineState *mms)
  144. {
  145. MachineState *machine = MACHINE(mms);
  146. X86MachineState *x86ms = X86_MACHINE(mms);
  147. MemoryRegion *ram, *ram_below_4g, *ram_above_4g;
  148. MemoryRegion *system_memory = get_system_memory();
  149. FWCfgState *fw_cfg;
  150. ram_addr_t lowmem;
  151. int i;
  152. /*
  153. * Check whether RAM fits below 4G (leaving 1/2 GByte for IO memory
  154. * and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping
  155. * also known as MMCFG).
  156. * If it doesn't, we need to split it in chunks below and above 4G.
  157. * In any case, try to make sure that guest addresses aligned at
  158. * 1G boundaries get mapped to host addresses aligned at 1G boundaries.
  159. */
  160. if (machine->ram_size >= 0xb0000000) {
  161. lowmem = 0x80000000;
  162. } else {
  163. lowmem = 0xb0000000;
  164. }
  165. /*
  166. * Handle the machine opt max-ram-below-4g. It is basically doing
  167. * min(qemu limit, user limit).
  168. */
  169. if (!x86ms->max_ram_below_4g) {
  170. x86ms->max_ram_below_4g = 4 * GiB;
  171. }
  172. if (lowmem > x86ms->max_ram_below_4g) {
  173. lowmem = x86ms->max_ram_below_4g;
  174. if (machine->ram_size - lowmem > lowmem &&
  175. lowmem & (1 * GiB - 1)) {
  176. warn_report("There is possibly poor performance as the ram size "
  177. " (0x%" PRIx64 ") is more then twice the size of"
  178. " max-ram-below-4g (%"PRIu64") and"
  179. " max-ram-below-4g is not a multiple of 1G.",
  180. (uint64_t)machine->ram_size, x86ms->max_ram_below_4g);
  181. }
  182. }
  183. if (machine->ram_size > lowmem) {
  184. x86ms->above_4g_mem_size = machine->ram_size - lowmem;
  185. x86ms->below_4g_mem_size = lowmem;
  186. } else {
  187. x86ms->above_4g_mem_size = 0;
  188. x86ms->below_4g_mem_size = machine->ram_size;
  189. }
  190. ram = g_malloc(sizeof(*ram));
  191. memory_region_allocate_system_memory(ram, NULL, "microvm.ram",
  192. machine->ram_size);
  193. ram_below_4g = g_malloc(sizeof(*ram_below_4g));
  194. memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
  195. 0, x86ms->below_4g_mem_size);
  196. memory_region_add_subregion(system_memory, 0, ram_below_4g);
  197. e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
  198. if (x86ms->above_4g_mem_size > 0) {
  199. ram_above_4g = g_malloc(sizeof(*ram_above_4g));
  200. memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
  201. x86ms->below_4g_mem_size,
  202. x86ms->above_4g_mem_size);
  203. memory_region_add_subregion(system_memory, 0x100000000ULL,
  204. ram_above_4g);
  205. e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
  206. }
  207. fw_cfg = fw_cfg_init_io_dma(FW_CFG_IO_BASE, FW_CFG_IO_BASE + 4,
  208. &address_space_memory);
  209. fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, machine->smp.cpus);
  210. fw_cfg_add_i16(fw_cfg, FW_CFG_MAX_CPUS, machine->smp.max_cpus);
  211. fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, (uint64_t)machine->ram_size);
  212. fw_cfg_add_i32(fw_cfg, FW_CFG_IRQ0_OVERRIDE, kvm_allows_irq0_override());
  213. fw_cfg_add_bytes(fw_cfg, FW_CFG_E820_TABLE,
  214. &e820_reserve, sizeof(e820_reserve));
  215. fw_cfg_add_file(fw_cfg, "etc/e820", e820_table,
  216. sizeof(struct e820_entry) * e820_get_num_entries());
  217. rom_set_fw(fw_cfg);
  218. if (machine->kernel_filename != NULL) {
  219. x86_load_linux(x86ms, fw_cfg, 0, true, true);
  220. }
  221. if (mms->option_roms) {
  222. for (i = 0; i < nb_option_roms; i++) {
  223. rom_add_option(option_rom[i].name, option_rom[i].bootindex);
  224. }
  225. }
  226. x86ms->fw_cfg = fw_cfg;
  227. x86ms->ioapic_as = &address_space_memory;
  228. }
  229. static gchar *microvm_get_mmio_cmdline(gchar *name)
  230. {
  231. gchar *cmdline;
  232. gchar *separator;
  233. long int index;
  234. int ret;
  235. separator = g_strrstr(name, ".");
  236. if (!separator) {
  237. return NULL;
  238. }
  239. if (qemu_strtol(separator + 1, NULL, 10, &index) != 0) {
  240. return NULL;
  241. }
  242. cmdline = g_malloc0(VIRTIO_CMDLINE_MAXLEN);
  243. ret = g_snprintf(cmdline, VIRTIO_CMDLINE_MAXLEN,
  244. " virtio_mmio.device=512@0x%lx:%ld",
  245. VIRTIO_MMIO_BASE + index * 512,
  246. VIRTIO_IRQ_BASE + index);
  247. if (ret < 0 || ret >= VIRTIO_CMDLINE_MAXLEN) {
  248. g_free(cmdline);
  249. return NULL;
  250. }
  251. return cmdline;
  252. }
  253. static void microvm_fix_kernel_cmdline(MachineState *machine)
  254. {
  255. X86MachineState *x86ms = X86_MACHINE(machine);
  256. BusState *bus;
  257. BusChild *kid;
  258. char *cmdline;
  259. /*
  260. * Find MMIO transports with attached devices, and add them to the kernel
  261. * command line.
  262. *
  263. * Yes, this is a hack, but one that heavily improves the UX without
  264. * introducing any significant issues.
  265. */
  266. cmdline = g_strdup(machine->kernel_cmdline);
  267. bus = sysbus_get_default();
  268. QTAILQ_FOREACH(kid, &bus->children, sibling) {
  269. DeviceState *dev = kid->child;
  270. ObjectClass *class = object_get_class(OBJECT(dev));
  271. if (class == object_class_by_name(TYPE_VIRTIO_MMIO)) {
  272. VirtIOMMIOProxy *mmio = VIRTIO_MMIO(OBJECT(dev));
  273. VirtioBusState *mmio_virtio_bus = &mmio->bus;
  274. BusState *mmio_bus = &mmio_virtio_bus->parent_obj;
  275. if (!QTAILQ_EMPTY(&mmio_bus->children)) {
  276. gchar *mmio_cmdline = microvm_get_mmio_cmdline(mmio_bus->name);
  277. if (mmio_cmdline) {
  278. char *newcmd = g_strjoin(NULL, cmdline, mmio_cmdline, NULL);
  279. g_free(mmio_cmdline);
  280. g_free(cmdline);
  281. cmdline = newcmd;
  282. }
  283. }
  284. }
  285. }
  286. fw_cfg_modify_i32(x86ms->fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(cmdline) + 1);
  287. fw_cfg_modify_string(x86ms->fw_cfg, FW_CFG_CMDLINE_DATA, cmdline);
  288. g_free(cmdline);
  289. }
  290. static void microvm_machine_state_init(MachineState *machine)
  291. {
  292. MicrovmMachineState *mms = MICROVM_MACHINE(machine);
  293. X86MachineState *x86ms = X86_MACHINE(machine);
  294. Error *local_err = NULL;
  295. microvm_memory_init(mms);
  296. x86_cpus_init(x86ms, CPU_VERSION_LATEST);
  297. if (local_err) {
  298. error_report_err(local_err);
  299. exit(1);
  300. }
  301. microvm_devices_init(mms);
  302. }
  303. static void microvm_machine_reset(MachineState *machine)
  304. {
  305. MicrovmMachineState *mms = MICROVM_MACHINE(machine);
  306. CPUState *cs;
  307. X86CPU *cpu;
  308. if (machine->kernel_filename != NULL &&
  309. mms->auto_kernel_cmdline && !mms->kernel_cmdline_fixed) {
  310. microvm_fix_kernel_cmdline(machine);
  311. mms->kernel_cmdline_fixed = true;
  312. }
  313. qemu_devices_reset();
  314. CPU_FOREACH(cs) {
  315. cpu = X86_CPU(cs);
  316. if (cpu->apic_state) {
  317. device_reset(cpu->apic_state);
  318. }
  319. }
  320. }
  321. static void microvm_machine_get_pic(Object *obj, Visitor *v, const char *name,
  322. void *opaque, Error **errp)
  323. {
  324. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  325. OnOffAuto pic = mms->pic;
  326. visit_type_OnOffAuto(v, name, &pic, errp);
  327. }
  328. static void microvm_machine_set_pic(Object *obj, Visitor *v, const char *name,
  329. void *opaque, Error **errp)
  330. {
  331. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  332. visit_type_OnOffAuto(v, name, &mms->pic, errp);
  333. }
  334. static void microvm_machine_get_pit(Object *obj, Visitor *v, const char *name,
  335. void *opaque, Error **errp)
  336. {
  337. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  338. OnOffAuto pit = mms->pit;
  339. visit_type_OnOffAuto(v, name, &pit, errp);
  340. }
  341. static void microvm_machine_set_pit(Object *obj, Visitor *v, const char *name,
  342. void *opaque, Error **errp)
  343. {
  344. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  345. visit_type_OnOffAuto(v, name, &mms->pit, errp);
  346. }
  347. static void microvm_machine_get_rtc(Object *obj, Visitor *v, const char *name,
  348. void *opaque, Error **errp)
  349. {
  350. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  351. OnOffAuto rtc = mms->rtc;
  352. visit_type_OnOffAuto(v, name, &rtc, errp);
  353. }
  354. static void microvm_machine_set_rtc(Object *obj, Visitor *v, const char *name,
  355. void *opaque, Error **errp)
  356. {
  357. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  358. visit_type_OnOffAuto(v, name, &mms->rtc, errp);
  359. }
  360. static bool microvm_machine_get_isa_serial(Object *obj, Error **errp)
  361. {
  362. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  363. return mms->isa_serial;
  364. }
  365. static void microvm_machine_set_isa_serial(Object *obj, bool value,
  366. Error **errp)
  367. {
  368. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  369. mms->isa_serial = value;
  370. }
  371. static bool microvm_machine_get_option_roms(Object *obj, Error **errp)
  372. {
  373. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  374. return mms->option_roms;
  375. }
  376. static void microvm_machine_set_option_roms(Object *obj, bool value,
  377. Error **errp)
  378. {
  379. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  380. mms->option_roms = value;
  381. }
  382. static bool microvm_machine_get_auto_kernel_cmdline(Object *obj, Error **errp)
  383. {
  384. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  385. return mms->auto_kernel_cmdline;
  386. }
  387. static void microvm_machine_set_auto_kernel_cmdline(Object *obj, bool value,
  388. Error **errp)
  389. {
  390. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  391. mms->auto_kernel_cmdline = value;
  392. }
  393. static void microvm_machine_initfn(Object *obj)
  394. {
  395. MicrovmMachineState *mms = MICROVM_MACHINE(obj);
  396. /* Configuration */
  397. mms->pic = ON_OFF_AUTO_AUTO;
  398. mms->pit = ON_OFF_AUTO_AUTO;
  399. mms->rtc = ON_OFF_AUTO_AUTO;
  400. mms->isa_serial = true;
  401. mms->option_roms = true;
  402. mms->auto_kernel_cmdline = true;
  403. /* State */
  404. mms->kernel_cmdline_fixed = false;
  405. }
  406. static void microvm_class_init(ObjectClass *oc, void *data)
  407. {
  408. MachineClass *mc = MACHINE_CLASS(oc);
  409. mc->init = microvm_machine_state_init;
  410. mc->family = "microvm_i386";
  411. mc->desc = "microvm (i386)";
  412. mc->units_per_default_bus = 1;
  413. mc->no_floppy = 1;
  414. mc->max_cpus = 288;
  415. mc->has_hotpluggable_cpus = false;
  416. mc->auto_enable_numa_with_memhp = false;
  417. mc->default_cpu_type = TARGET_DEFAULT_CPU_TYPE;
  418. mc->nvdimm_supported = false;
  419. /* Avoid relying too much on kernel components */
  420. mc->default_kernel_irqchip_split = true;
  421. /* Machine class handlers */
  422. mc->reset = microvm_machine_reset;
  423. object_class_property_add(oc, MICROVM_MACHINE_PIC, "OnOffAuto",
  424. microvm_machine_get_pic,
  425. microvm_machine_set_pic,
  426. NULL, NULL, &error_abort);
  427. object_class_property_set_description(oc, MICROVM_MACHINE_PIC,
  428. "Enable i8259 PIC", &error_abort);
  429. object_class_property_add(oc, MICROVM_MACHINE_PIT, "OnOffAuto",
  430. microvm_machine_get_pit,
  431. microvm_machine_set_pit,
  432. NULL, NULL, &error_abort);
  433. object_class_property_set_description(oc, MICROVM_MACHINE_PIT,
  434. "Enable i8254 PIT", &error_abort);
  435. object_class_property_add(oc, MICROVM_MACHINE_RTC, "OnOffAuto",
  436. microvm_machine_get_rtc,
  437. microvm_machine_set_rtc,
  438. NULL, NULL, &error_abort);
  439. object_class_property_set_description(oc, MICROVM_MACHINE_RTC,
  440. "Enable MC146818 RTC", &error_abort);
  441. object_class_property_add_bool(oc, MICROVM_MACHINE_ISA_SERIAL,
  442. microvm_machine_get_isa_serial,
  443. microvm_machine_set_isa_serial,
  444. &error_abort);
  445. object_class_property_set_description(oc, MICROVM_MACHINE_ISA_SERIAL,
  446. "Set off to disable the instantiation an ISA serial port",
  447. &error_abort);
  448. object_class_property_add_bool(oc, MICROVM_MACHINE_OPTION_ROMS,
  449. microvm_machine_get_option_roms,
  450. microvm_machine_set_option_roms,
  451. &error_abort);
  452. object_class_property_set_description(oc, MICROVM_MACHINE_OPTION_ROMS,
  453. "Set off to disable loading option ROMs", &error_abort);
  454. object_class_property_add_bool(oc, MICROVM_MACHINE_AUTO_KERNEL_CMDLINE,
  455. microvm_machine_get_auto_kernel_cmdline,
  456. microvm_machine_set_auto_kernel_cmdline,
  457. &error_abort);
  458. object_class_property_set_description(oc,
  459. MICROVM_MACHINE_AUTO_KERNEL_CMDLINE,
  460. "Set off to disable adding virtio-mmio devices to the kernel cmdline",
  461. &error_abort);
  462. }
  463. static const TypeInfo microvm_machine_info = {
  464. .name = TYPE_MICROVM_MACHINE,
  465. .parent = TYPE_X86_MACHINE,
  466. .instance_size = sizeof(MicrovmMachineState),
  467. .instance_init = microvm_machine_initfn,
  468. .class_size = sizeof(MicrovmMachineClass),
  469. .class_init = microvm_class_init,
  470. .interfaces = (InterfaceInfo[]) {
  471. { }
  472. },
  473. };
  474. static void microvm_machine_init(void)
  475. {
  476. type_register_static(&microvm_machine_info);
  477. }
  478. type_init(microvm_machine_init);