Diffstat (limited to 'hw/i386')
-rw-r--r--  hw/i386/Kconfig                       139
-rw-r--r--  hw/i386/acpi-build.c                 2873
-rw-r--r--  hw/i386/acpi-build.h                   15
-rw-r--r--  hw/i386/acpi-common.c                 165
-rw-r--r--  hw/i386/acpi-common.h                  15
-rw-r--r--  hw/i386/acpi-microvm.c                258
-rw-r--r--  hw/i386/acpi-microvm.h                  8
-rw-r--r--  hw/i386/amd_iommu.c                  1662
-rw-r--r--  hw/i386/amd_iommu.h                   372
-rw-r--r--  hw/i386/e820_memory_layout.c           59
-rw-r--r--  hw/i386/e820_memory_layout.h           42
-rw-r--r--  hw/i386/fw_cfg.c                      221
-rw-r--r--  hw/i386/fw_cfg.h                       30
-rw-r--r--  hw/i386/generic_event_device_x86.c     36
-rw-r--r--  hw/i386/intel_iommu.c                3900
-rw-r--r--  hw/i386/intel_iommu_internal.h        518
-rw-r--r--  hw/i386/kvm/apic.c                    271
-rw-r--r--  hw/i386/kvm/clock.c                   350
-rw-r--r--  hw/i386/kvm/i8254.c                   337
-rw-r--r--  hw/i386/kvm/i8259.c                   167
-rw-r--r--  hw/i386/kvm/ioapic.c                  165
-rw-r--r--  hw/i386/kvm/meson.build                 8
-rw-r--r--  hw/i386/kvmvapic.c                    871
-rw-r--r--  hw/i386/meson.build                    37
-rw-r--r--  hw/i386/microvm-dt.c                  348
-rw-r--r--  hw/i386/microvm-dt.h                    8
-rw-r--r--  hw/i386/microvm.c                     779
-rw-r--r--  hw/i386/multiboot.c                   415
-rw-r--r--  hw/i386/multiboot.h                    16
-rw-r--r--  hw/i386/pc.c                         1777
-rw-r--r--  hw/i386/pc_piix.c                     951
-rw-r--r--  hw/i386/pc_q35.c                      620
-rw-r--r--  hw/i386/pc_sysfw.c                    262
-rw-r--r--  hw/i386/pc_sysfw_ovmf-stubs.c          26
-rw-r--r--  hw/i386/pc_sysfw_ovmf.c               151
-rw-r--r--  hw/i386/port92.c                      127
-rw-r--r--  hw/i386/sgx-epc.c                     185
-rw-r--r--  hw/i386/sgx-stub.c                     34
-rw-r--r--  hw/i386/sgx.c                         243
-rw-r--r--  hw/i386/trace-events                  121
-rw-r--r--  hw/i386/trace.h                         1
-rw-r--r--  hw/i386/vmmouse.c                     327
-rw-r--r--  hw/i386/vmport.c                      313
-rw-r--r--  hw/i386/x86-iommu-stub.c               38
-rw-r--r--  hw/i386/x86-iommu.c                   162
-rw-r--r--  hw/i386/x86.c                        1399
-rw-r--r--  hw/i386/xen/meson.build                 7
-rw-r--r--  hw/i386/xen/trace-events               28
-rw-r--r--  hw/i386/xen/trace.h                     1
-rw-r--r--  hw/i386/xen/xen-hvm.c                1620
-rw-r--r--  hw/i386/xen/xen-mapcache.c            593
-rw-r--r--  hw/i386/xen/xen_apic.c                103
-rw-r--r--  hw/i386/xen/xen_platform.c            519
-rw-r--r--  hw/i386/xen/xen_pvdevice.c            154
54 files changed, 23847 insertions, 0 deletions
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
new file mode 100644
index 000000000..d22ac4a4b
--- /dev/null
+++ b/hw/i386/Kconfig
@@ -0,0 +1,139 @@
+config X86_FW_OVMF
+ bool
+
+config SEV
+ bool
+ select X86_FW_OVMF
+ depends on KVM
+
+config SGX
+ bool
+ depends on KVM
+
+config PC
+ bool
+ imply APPLESMC
+ imply HYPERV
+ imply ISA_IPMI_KCS
+ imply ISA_IPMI_BT
+ imply PCI_IPMI_KCS
+ imply PCI_IPMI_BT
+ imply IPMI_SSIF
+ imply ISA_DEBUG
+ imply PARALLEL
+ imply PCI_DEVICES
+ imply PVPANIC_ISA
+ imply QXL
+ imply SEV
+ imply SGX
+ imply SGA
+ imply TEST_DEVICES
+ imply TPM_CRB
+ imply TPM_TIS_ISA
+ imply VGA_PCI
+ imply VIRTIO_VGA
+ imply NVDIMM
+ select FDC_ISA
+ select I8259
+ select I8254
+ select PCKBD
+ select PCSPK
+ select I8257
+ select MC146818RTC
+ # For ACPI builder:
+ select SERIAL_ISA
+ select ACPI_PCI
+ select ACPI_VMGENID
+ select VIRTIO_PMEM_SUPPORTED
+ select VIRTIO_MEM_SUPPORTED
+
+config PC_PCI
+ bool
+ select APIC
+ select IOAPIC
+ select APM
+ select PC
+
+config PC_ACPI
+ bool
+ select ACPI_X86
+ select ACPI_CPU_HOTPLUG
+ select ACPI_MEMORY_HOTPLUG
+ select ACPI_VIOT
+ select SMBUS_EEPROM
+ select PFLASH_CFI01
+ depends on ACPI_SMBUS
+
+config I440FX
+ bool
+ imply E1000_PCI
+ imply VMPORT
+ imply VMMOUSE
+ select PC_PCI
+ select PC_ACPI
+ select ACPI_SMBUS
+ select PCI_I440FX
+ select PIIX3
+ select IDE_PIIX
+ select DIMM
+ select SMBIOS
+ select FW_CFG_DMA
+
+config ISAPC
+ bool
+ select ISA_BUS
+ select PC
+ select IDE_ISA
+ select VGA_ISA
+ # FIXME: it is in the same file as i440fx, and does not compile
+ # if separated
+ depends on I440FX
+
+config Q35
+ bool
+ imply VTD
+ imply AMD_IOMMU
+ imply E1000E_PCI_EXPRESS
+ imply VMPORT
+ imply VMMOUSE
+ select PC_PCI
+ select PC_ACPI
+ select PCI_EXPRESS_Q35
+ select LPC_ICH9
+ select AHCI_ICH9
+ select DIMM
+ select SMBIOS
+ select FW_CFG_DMA
+
+config MICROVM
+ bool
+ select SERIAL_ISA # for serial_hds_isa_init()
+ select ISA_BUS
+ select APIC
+ select IOAPIC
+ select I8259
+ select MC146818RTC
+ select VIRTIO_MMIO
+ select ACPI_HW_REDUCED
+ select PCI_EXPRESS_GENERIC_BRIDGE
+ select USB_XHCI_SYSBUS
+ select I8254
+
+config X86_IOMMU
+ bool
+ depends on PC
+
+config VTD
+ bool
+ select X86_IOMMU
+
+config AMD_IOMMU
+ bool
+ select X86_IOMMU
+
+config VMPORT
+ bool
+
+config VMMOUSE
+ bool
+ depends on VMPORT
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
new file mode 100644
index 000000000..a99c6e4fe
--- /dev/null
+++ b/hw/i386/acpi-build.c
@@ -0,0 +1,2873 @@
+/* Support for generating ACPI tables and passing them to Guests
+ *
+ * Copyright (C) 2008-2010 Kevin O'Connor <kevin@koconnor.net>
+ * Copyright (C) 2006 Fabrice Bellard
+ * Copyright (C) 2013 Red Hat Inc
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qnum.h"
+#include "acpi-build.h"
+#include "acpi-common.h"
+#include "qemu/bitmap.h"
+#include "qemu/error-report.h"
+#include "hw/pci/pci.h"
+#include "hw/core/cpu.h"
+#include "target/i386/cpu.h"
+#include "hw/misc/pvpanic.h"
+#include "hw/timer/hpet.h"
+#include "hw/acpi/acpi-defs.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/cpu.h"
+#include "hw/nvram/fw_cfg.h"
+#include "hw/acpi/bios-linker-loader.h"
+#include "hw/isa/isa.h"
+#include "hw/block/fdc.h"
+#include "hw/acpi/memory_hotplug.h"
+#include "sysemu/tpm.h"
+#include "hw/acpi/tpm.h"
+#include "hw/acpi/vmgenid.h"
+#include "sysemu/tpm_backend.h"
+#include "hw/rtc/mc146818rtc_regs.h"
+#include "migration/vmstate.h"
+#include "hw/mem/memory-device.h"
+#include "hw/mem/nvdimm.h"
+#include "sysemu/numa.h"
+#include "sysemu/reset.h"
+#include "hw/hyperv/vmbus-bridge.h"
+
+/* Supported chipsets: */
+#include "hw/southbridge/piix.h"
+#include "hw/acpi/pcihp.h"
+#include "hw/i386/fw_cfg.h"
+#include "hw/i386/ich9.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci-host/q35.h"
+#include "hw/i386/x86-iommu.h"
+
+#include "hw/acpi/aml-build.h"
+#include "hw/acpi/utils.h"
+#include "hw/acpi/pci.h"
+
+#include "qom/qom-qobject.h"
+#include "hw/i386/amd_iommu.h"
+#include "hw/i386/intel_iommu.h"
+#include "hw/virtio/virtio-iommu.h"
+
+#include "hw/acpi/ipmi.h"
+#include "hw/acpi/hmat.h"
+#include "hw/acpi/viot.h"
+
+/* These are used to size the ACPI tables for -M pc-i440fx-1.7 and
+ * -M pc-i440fx-2.0. Even if the actual amount of AML generated grows
+ * a little bit, there should be plenty of free space since the DSDT
+ * shrunk by ~1.5k between QEMU 2.0 and QEMU 2.1.
+ */
+#define ACPI_BUILD_LEGACY_CPU_AML_SIZE 97
+#define ACPI_BUILD_ALIGN_SIZE 0x1000
+
+#define ACPI_BUILD_TABLE_SIZE 0x20000
+
+/* #define DEBUG_ACPI_BUILD */
+#ifdef DEBUG_ACPI_BUILD
+#define ACPI_BUILD_DPRINTF(fmt, ...) \
+ do {printf("ACPI_BUILD: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define ACPI_BUILD_DPRINTF(fmt, ...)
+#endif
+
+typedef struct AcpiPmInfo {
+ bool s3_disabled;
+ bool s4_disabled;
+ bool pcihp_bridge_en;
+ bool smi_on_cpuhp;
+ bool smi_on_cpu_unplug;
+ bool pcihp_root_en;
+ uint8_t s4_val;
+ AcpiFadtData fadt;
+ uint16_t cpu_hp_io_base;
+ uint16_t pcihp_io_base;
+ uint16_t pcihp_io_len;
+} AcpiPmInfo;
+
+typedef struct AcpiMiscInfo {
+ bool is_piix4;
+ bool has_hpet;
+#ifdef CONFIG_TPM
+ TPMVersion tpm_version;
+#endif
+ const unsigned char *dsdt_code;
+ unsigned dsdt_size;
+ uint16_t pvpanic_port;
+ uint16_t applesmc_io_base;
+} AcpiMiscInfo;
+
+typedef struct AcpiBuildPciBusHotplugState {
+ GArray *device_table;
+ GArray *notify_table;
+ struct AcpiBuildPciBusHotplugState *parent;
+ bool pcihp_bridge_en;
+} AcpiBuildPciBusHotplugState;
+
+typedef struct FwCfgTPMConfig {
+ uint32_t tpmppi_address;
+ uint8_t tpm_version;
+ uint8_t tpmppi_version;
+} QEMU_PACKED FwCfgTPMConfig;
+
+static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg);
+
+const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio = {
+ .space_id = AML_AS_SYSTEM_IO,
+ .address = NVDIMM_ACPI_IO_BASE,
+ .bit_width = NVDIMM_ACPI_IO_LEN << 3
+};
+
+static void init_common_fadt_data(MachineState *ms, Object *o,
+ AcpiFadtData *data)
+{
+ X86MachineState *x86ms = X86_MACHINE(ms);
+ /*
+ * "ICH9-LPC" or "PIIX4_PM" has "smm-compat" property to keep the old
+ * behavior for compatibility irrelevant to smm_enabled, which doesn't
+ * comforms to ACPI spec.
+ */
+ bool smm_enabled = object_property_get_bool(o, "smm-compat", NULL) ?
+ true : x86_machine_is_smm_enabled(x86ms);
+ uint32_t io = object_property_get_uint(o, ACPI_PM_PROP_PM_IO_BASE, NULL);
+ AmlAddressSpace as = AML_AS_SYSTEM_IO;
+ AcpiFadtData fadt = {
+ .rev = 3,
+ .flags =
+ (1 << ACPI_FADT_F_WBINVD) |
+ (1 << ACPI_FADT_F_PROC_C1) |
+ (1 << ACPI_FADT_F_SLP_BUTTON) |
+ (1 << ACPI_FADT_F_RTC_S4) |
+ (1 << ACPI_FADT_F_USE_PLATFORM_CLOCK) |
+ /* APIC destination mode ("Flat Logical") has an upper limit of 8
+ * CPUs; for more than 8 CPUs, "Clustered Logical" mode has to be
+ * used.
+ */
+ ((ms->smp.max_cpus > 8) ?
+ (1 << ACPI_FADT_F_FORCE_APIC_CLUSTER_MODEL) : 0),
+ .int_model = 1 /* Multiple APIC */,
+ .rtc_century = RTC_CENTURY,
+ .plvl2_lat = 0xfff /* C2 state not supported */,
+ .plvl3_lat = 0xfff /* C3 state not supported */,
+ .smi_cmd = smm_enabled ? ACPI_PORT_SMI_CMD : 0,
+ .sci_int = object_property_get_uint(o, ACPI_PM_PROP_SCI_INT, NULL),
+ .acpi_enable_cmd =
+ smm_enabled ?
+ object_property_get_uint(o, ACPI_PM_PROP_ACPI_ENABLE_CMD, NULL) :
+ 0,
+ .acpi_disable_cmd =
+ smm_enabled ?
+ object_property_get_uint(o, ACPI_PM_PROP_ACPI_DISABLE_CMD, NULL) :
+ 0,
+ .pm1a_evt = { .space_id = as, .bit_width = 4 * 8, .address = io },
+ .pm1a_cnt = { .space_id = as, .bit_width = 2 * 8,
+ .address = io + 0x04 },
+ .pm_tmr = { .space_id = as, .bit_width = 4 * 8, .address = io + 0x08 },
+ .gpe0_blk = { .space_id = as, .bit_width =
+ object_property_get_uint(o, ACPI_PM_PROP_GPE0_BLK_LEN, NULL) * 8,
+ .address = object_property_get_uint(o, ACPI_PM_PROP_GPE0_BLK, NULL)
+ },
+ };
+ *data = fadt;
+}
+
+static Object *object_resolve_type_unambiguous(const char *typename)
+{
+ bool ambig;
+ Object *o = object_resolve_path_type("", typename, &ambig);
+
+ if (ambig || !o) {
+ return NULL;
+ }
+ return o;
+}
+
+static void acpi_get_pm_info(MachineState *machine, AcpiPmInfo *pm)
+{
+ Object *piix = object_resolve_type_unambiguous(TYPE_PIIX4_PM);
+ Object *lpc = object_resolve_type_unambiguous(TYPE_ICH9_LPC_DEVICE);
+ Object *obj = piix ? piix : lpc;
+ QObject *o;
+ pm->cpu_hp_io_base = 0;
+ pm->pcihp_io_base = 0;
+ pm->pcihp_io_len = 0;
+ pm->smi_on_cpuhp = false;
+ pm->smi_on_cpu_unplug = false;
+
+ assert(obj);
+ init_common_fadt_data(machine, obj, &pm->fadt);
+ if (piix) {
+ /* w2k requires FADT(rev1) or it won't boot, keep PC compatible */
+ pm->fadt.rev = 1;
+ pm->cpu_hp_io_base = PIIX4_CPU_HOTPLUG_IO_BASE;
+ }
+ if (lpc) {
+ uint64_t smi_features = object_property_get_uint(lpc,
+ ICH9_LPC_SMI_NEGOTIATED_FEAT_PROP, NULL);
+ struct AcpiGenericAddress r = { .space_id = AML_AS_SYSTEM_IO,
+ .bit_width = 8, .address = ICH9_RST_CNT_IOPORT };
+ pm->fadt.reset_reg = r;
+ pm->fadt.reset_val = 0xf;
+ pm->fadt.flags |= 1 << ACPI_FADT_F_RESET_REG_SUP;
+ pm->cpu_hp_io_base = ICH9_CPU_HOTPLUG_IO_BASE;
+ pm->smi_on_cpuhp =
+ !!(smi_features & BIT_ULL(ICH9_LPC_SMI_F_CPU_HOTPLUG_BIT));
+ pm->smi_on_cpu_unplug =
+ !!(smi_features & BIT_ULL(ICH9_LPC_SMI_F_CPU_HOT_UNPLUG_BIT));
+ }
+ pm->pcihp_io_base =
+ object_property_get_uint(obj, ACPI_PCIHP_IO_BASE_PROP, NULL);
+ pm->pcihp_io_len =
+ object_property_get_uint(obj, ACPI_PCIHP_IO_LEN_PROP, NULL);
+
+ /* The above need not be conditional on machine type because the reset port
+ * happens to be the same on PIIX (pc) and ICH9 (q35). */
+ QEMU_BUILD_BUG_ON(ICH9_RST_CNT_IOPORT != PIIX_RCR_IOPORT);
+
+ /* Fill in optional s3/s4 related properties */
+ o = object_property_get_qobject(obj, ACPI_PM_PROP_S3_DISABLED, NULL);
+ if (o) {
+ pm->s3_disabled = qnum_get_uint(qobject_to(QNum, o));
+ } else {
+ pm->s3_disabled = false;
+ }
+ qobject_unref(o);
+ o = object_property_get_qobject(obj, ACPI_PM_PROP_S4_DISABLED, NULL);
+ if (o) {
+ pm->s4_disabled = qnum_get_uint(qobject_to(QNum, o));
+ } else {
+ pm->s4_disabled = false;
+ }
+ qobject_unref(o);
+ o = object_property_get_qobject(obj, ACPI_PM_PROP_S4_VAL, NULL);
+ if (o) {
+ pm->s4_val = qnum_get_uint(qobject_to(QNum, o));
+ } else {
+ pm->s4_val = false;
+ }
+ qobject_unref(o);
+
+ pm->pcihp_bridge_en =
+ object_property_get_bool(obj, ACPI_PM_PROP_ACPI_PCIHP_BRIDGE,
+ NULL);
+ pm->pcihp_root_en =
+ object_property_get_bool(obj, ACPI_PM_PROP_ACPI_PCI_ROOTHP,
+ NULL);
+}
+
+static void acpi_get_misc_info(AcpiMiscInfo *info)
+{
+ Object *piix = object_resolve_type_unambiguous(TYPE_PIIX4_PM);
+ Object *lpc = object_resolve_type_unambiguous(TYPE_ICH9_LPC_DEVICE);
+ assert(!!piix != !!lpc);
+
+ if (piix) {
+ info->is_piix4 = true;
+ }
+ if (lpc) {
+ info->is_piix4 = false;
+ }
+
+ info->has_hpet = hpet_find();
+#ifdef CONFIG_TPM
+ info->tpm_version = tpm_get_version(tpm_find());
+#endif
+ info->pvpanic_port = pvpanic_port();
+ info->applesmc_io_base = applesmc_port();
+}
+
+/*
+ * Because of the PXB hosts we cannot simply query TYPE_PCI_HOST_BRIDGE.
+ * On i386 arch we only have two pci hosts, so we can look only for them.
+ */
+Object *acpi_get_i386_pci_host(void)
+{
+ PCIHostState *host;
+
+ host = PCI_HOST_BRIDGE(object_resolve_path("/machine/i440fx", NULL));
+ if (!host) {
+ host = PCI_HOST_BRIDGE(object_resolve_path("/machine/q35", NULL));
+ }
+
+ return OBJECT(host);
+}
+
+static void acpi_get_pci_holes(Range *hole, Range *hole64)
+{
+ Object *pci_host;
+
+ pci_host = acpi_get_i386_pci_host();
+
+ if (!pci_host) {
+ return;
+ }
+
+ range_set_bounds1(hole,
+ object_property_get_uint(pci_host,
+ PCI_HOST_PROP_PCI_HOLE_START,
+ NULL),
+ object_property_get_uint(pci_host,
+ PCI_HOST_PROP_PCI_HOLE_END,
+ NULL));
+ range_set_bounds1(hole64,
+ object_property_get_uint(pci_host,
+ PCI_HOST_PROP_PCI_HOLE64_START,
+ NULL),
+ object_property_get_uint(pci_host,
+ PCI_HOST_PROP_PCI_HOLE64_END,
+ NULL));
+}
+
+static void acpi_align_size(GArray *blob, unsigned align)
+{
+ /* Align size to a multiple of the given size. This reduces the chance
+ * we need to change the size later (which would break cross-version migration).
+ */
+ g_array_set_size(blob, ROUND_UP(acpi_data_len(blob), align));
+}
+
+/*
+ * ACPI spec 1.0b,
+ * 5.2.6 Firmware ACPI Control Structure
+ */
+static void
+build_facs(GArray *table_data)
+{
+ const char *sig = "FACS";
+ const uint8_t reserved[40] = {};
+
+ g_array_append_vals(table_data, sig, 4); /* Signature */
+ build_append_int_noprefix(table_data, 64, 4); /* Length */
+ build_append_int_noprefix(table_data, 0, 4); /* Hardware Signature */
+ build_append_int_noprefix(table_data, 0, 4); /* Firmware Waking Vector */
+ build_append_int_noprefix(table_data, 0, 4); /* Global Lock */
+ build_append_int_noprefix(table_data, 0, 4); /* Flags */
+ g_array_append_vals(table_data, reserved, 40); /* Reserved */
+}
+
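+/*
+ * Emit "If (And (Arg0, 1 << slot)) { Notify (S<devfn>, Arg1) }" so the
+ * DVNT handler can forward a hotplug event code to the matching slot device.
+ */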
+static void build_append_pcihp_notify_entry(Aml *method, int slot)
+{
+ Aml *if_ctx;
+ int32_t devfn = PCI_DEVFN(slot, 0);
+
+ if_ctx = aml_if(aml_and(aml_arg(0), aml_int(0x1U << slot), NULL));
+ aml_append(if_ctx, aml_notify(aml_name("S%.02X", devfn), aml_arg(1)));
+ aml_append(method, if_ctx);
+}
+
+static void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus,
+ bool pcihp_bridge_en)
+{
+ Aml *dev, *notify_method = NULL, *method;
+ QObject *bsel;
+ PCIBus *sec;
+ int devfn;
+
+ bsel = object_property_get_qobject(OBJECT(bus), ACPI_PCIHP_PROP_BSEL, NULL);
+ if (bsel) {
+ uint64_t bsel_val = qnum_get_uint(qobject_to(QNum, bsel));
+
+ aml_append(parent_scope, aml_name_decl("BSEL", aml_int(bsel_val)));
+ notify_method = aml_method("DVNT", 2, AML_NOTSERIALIZED);
+ }
+
+ for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
+ DeviceClass *dc;
+ PCIDeviceClass *pc;
+ PCIDevice *pdev = bus->devices[devfn];
+ int slot = PCI_SLOT(devfn);
+ int func = PCI_FUNC(devfn);
+ /* ACPI spec: 1.0b: Table 6-2 _ADR Object Bus Types, PCI type */
+ int adr = slot << 16 | func;
+ bool hotplug_enabled_dev;
+ bool bridge_in_acpi;
+ bool cold_plugged_bridge;
+
+ if (!pdev) {
+ /*
+ * Add hotplug slots for non-present devices.
+ * Hotplug is supported only for non-multifunction devices,
+ * so generate a device description only for function 0.
+ */
+ if (bsel && !func) {
+ if (pci_bus_is_express(bus) && slot > 0) {
+ break;
+ }
+ dev = aml_device("S%.02X", devfn);
+ aml_append(dev, aml_name_decl("_SUN", aml_int(slot)));
+ aml_append(dev, aml_name_decl("_ADR", aml_int(adr)));
+ method = aml_method("_EJ0", 1, AML_NOTSERIALIZED);
+ aml_append(method,
+ aml_call2("PCEJ", aml_name("BSEL"), aml_name("_SUN"))
+ );
+ aml_append(dev, method);
+ method = aml_method("_DSM", 4, AML_SERIALIZED);
+ aml_append(method,
+ aml_return(aml_call6("PDSM", aml_arg(0), aml_arg(1),
+ aml_arg(2), aml_arg(3),
+ aml_name("BSEL"), aml_name("_SUN")))
+ );
+ aml_append(dev, method);
+ aml_append(parent_scope, dev);
+
+ build_append_pcihp_notify_entry(notify_method, slot);
+ }
+ continue;
+ }
+
+ pc = PCI_DEVICE_GET_CLASS(pdev);
+ dc = DEVICE_GET_CLASS(pdev);
+
+ /*
+ * Cold plugged bridges aren't themselves hot-pluggable.
+ * Hotplugged bridges *are* hot-pluggable.
+ */
+ cold_plugged_bridge = pc->is_bridge && !DEVICE(pdev)->hotplugged;
+ bridge_in_acpi = cold_plugged_bridge && pcihp_bridge_en;
+
+ hotplug_enabled_dev = bsel && dc->hotpluggable && !cold_plugged_bridge;
+
+ if (pc->class_id == PCI_CLASS_BRIDGE_ISA) {
+ continue;
+ }
+
+ /*
+ * Allow describing coldplugged bridges in ACPI even if they are not
+ * on function 0, as they are not unpluggable; for all other devices,
+ * generate a description only for function 0 of each slot.
+ */
+ if (func && !bridge_in_acpi) {
+ continue;
+ }
+
+ /* start to compose PCI device descriptor */
+ dev = aml_device("S%.02X", devfn);
+ aml_append(dev, aml_name_decl("_ADR", aml_int(adr)));
+
+ if (bsel) {
+ /*
+ * Can't declare _SUN here for every device as it changes 'slot'
+ * enumeration order in linux kernel, so use another variable for it
+ */
+ aml_append(dev, aml_name_decl("ASUN", aml_int(slot)));
+ method = aml_method("_DSM", 4, AML_SERIALIZED);
+ aml_append(method, aml_return(
+ aml_call6("PDSM", aml_arg(0), aml_arg(1), aml_arg(2),
+ aml_arg(3), aml_name("BSEL"), aml_name("ASUN"))
+ ));
+ aml_append(dev, method);
+ }
+
+ if (pc->class_id == PCI_CLASS_DISPLAY_VGA) {
+ /* add VGA specific AML methods */
+ int s3d;
+
+ if (object_dynamic_cast(OBJECT(pdev), "qxl-vga")) {
+ s3d = 3;
+ } else {
+ s3d = 0;
+ }
+
+ method = aml_method("_S1D", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_return(aml_int(0)));
+ aml_append(dev, method);
+
+ method = aml_method("_S2D", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_return(aml_int(0)));
+ aml_append(dev, method);
+
+ method = aml_method("_S3D", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_return(aml_int(s3d)));
+ aml_append(dev, method);
+ } else if (hotplug_enabled_dev) {
+ aml_append(dev, aml_name_decl("_SUN", aml_int(slot)));
+ /* add _EJ0 to make slot hotpluggable */
+ method = aml_method("_EJ0", 1, AML_NOTSERIALIZED);
+ aml_append(method,
+ aml_call2("PCEJ", aml_name("BSEL"), aml_name("_SUN"))
+ );
+ aml_append(dev, method);
+
+ if (bsel) {
+ build_append_pcihp_notify_entry(notify_method, slot);
+ }
+ } else if (bridge_in_acpi) {
+ /*
+ * device is coldplugged bridge,
+ * add child device descriptions into its scope
+ */
+ PCIBus *sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev));
+
+ build_append_pci_bus_devices(dev, sec_bus, pcihp_bridge_en);
+ }
+ /* device descriptor has been composed, add it into parent context */
+ aml_append(parent_scope, dev);
+ }
+
+ if (bsel) {
+ aml_append(parent_scope, notify_method);
+ }
+
+ /* Append PCNT method to notify about events on local and child buses.
+ * Add this method for root bus only when hotplug is enabled since DSDT
+ * expects it.
+ */
+ if (bsel || pcihp_bridge_en) {
+ method = aml_method("PCNT", 0, AML_NOTSERIALIZED);
+
+ /* If bus supports hotplug select it and notify about local events */
+ if (bsel) {
+ uint64_t bsel_val = qnum_get_uint(qobject_to(QNum, bsel));
+
+ aml_append(method, aml_store(aml_int(bsel_val), aml_name("BNUM")));
+ aml_append(method, aml_call2("DVNT", aml_name("PCIU"),
+ aml_int(1))); /* Device Check */
+ aml_append(method, aml_call2("DVNT", aml_name("PCID"),
+ aml_int(3))); /* Eject Request */
+ }
+
+ /* Notify about child bus events in any case */
+ if (pcihp_bridge_en) {
+ QLIST_FOREACH(sec, &bus->child, sibling) {
+ if (pci_bus_is_root(sec)) {
+ continue;
+ }
+
+ aml_append(method, aml_name("^S%.02X.PCNT",
+ sec->parent_dev->devfn));
+ }
+ }
+
+ aml_append(parent_scope, method);
+ }
+ qobject_unref(bsel);
+}
+
+Aml *aml_pci_device_dsm(void)
+{
+ Aml *method, *UUID, *ifctx, *ifctx1, *ifctx2, *ifctx3, *elsectx;
+ Aml *acpi_index = aml_local(0);
+ Aml *zero = aml_int(0);
+ Aml *bnum = aml_arg(4);
+ Aml *func = aml_arg(2);
+ Aml *rev = aml_arg(1);
+ Aml *sun = aml_arg(5);
+
+ method = aml_method("PDSM", 6, AML_SERIALIZED);
+
+ /*
+ * PCI Firmware Specification 3.1
+ * 4.6. _DSM Definitions for PCI
+ */
+ UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D");
+ ifctx = aml_if(aml_equal(aml_arg(0), UUID));
+ {
+ aml_append(ifctx, aml_store(aml_call2("AIDX", bnum, sun), acpi_index));
+ ifctx1 = aml_if(aml_equal(func, zero));
+ {
+ uint8_t byte_list[1];
+
+ ifctx2 = aml_if(aml_equal(rev, aml_int(2)));
+ {
+ /*
+ * advertise function 7 if device has acpi-index
+ * acpi_index values:
+ * 0: not present (default value)
+ * FFFFFFFF: not supported (old QEMU without PIDX reg)
+ * other: device's acpi-index
+ */
+ ifctx3 = aml_if(aml_lnot(
+ aml_or(aml_equal(acpi_index, zero),
+ aml_equal(acpi_index, aml_int(0xFFFFFFFF)), NULL)
+ ));
+ {
+ byte_list[0] =
+ 1 /* have supported functions */ |
+ 1 << 7 /* support for function 7 */
+ ;
+ aml_append(ifctx3, aml_return(aml_buffer(1, byte_list)));
+ }
+ aml_append(ifctx2, ifctx3);
+ }
+ aml_append(ifctx1, ifctx2);
+
+ byte_list[0] = 0; /* nothing supported */
+ aml_append(ifctx1, aml_return(aml_buffer(1, byte_list)));
+ }
+ aml_append(ifctx, ifctx1);
+ elsectx = aml_else();
+ /*
+ * PCI Firmware Specification 3.1
+ * 4.6.7. _DSM for Naming a PCI or PCI Express Device Under
+ * Operating Systems
+ */
+ ifctx1 = aml_if(aml_equal(func, aml_int(7)));
+ {
+ Aml *pkg = aml_package(2);
+ Aml *ret = aml_local(1);
+
+ aml_append(pkg, zero);
+ /*
+ * optional, if not impl. should return null string
+ */
+ aml_append(pkg, aml_string("%s", ""));
+ aml_append(ifctx1, aml_store(pkg, ret));
+ /*
+ * update acpi-index to actual value
+ */
+ aml_append(ifctx1, aml_store(acpi_index, aml_index(ret, zero)));
+ aml_append(ifctx1, aml_return(ret));
+ }
+ aml_append(elsectx, ifctx1);
+ aml_append(ifctx, elsectx);
+ }
+ aml_append(method, ifctx);
+ return method;
+}
+
+/**
+ * build_prt_entry:
+ * @link_name: link name for PCI route entry
+ *
+ * build AML package containing a PCI route entry for @link_name
+ */
+static Aml *build_prt_entry(const char *link_name)
+{
+ Aml *a_zero = aml_int(0);
+ Aml *pkg = aml_package(4);
+ aml_append(pkg, a_zero);
+ aml_append(pkg, a_zero);
+ aml_append(pkg, aml_name("%s", link_name));
+ aml_append(pkg, a_zero);
+ return pkg;
+}
+
+/*
+ * initialize_route - Initialize the interrupt routing rule
+ * through a specific LINK:
+ * if (lnk_idx == idx)
+ * route using link 'link_name'
+ */
+static Aml *initialize_route(Aml *route, const char *link_name,
+ Aml *lnk_idx, int idx)
+{
+ Aml *if_ctx = aml_if(aml_equal(lnk_idx, aml_int(idx)));
+ Aml *pkg = build_prt_entry(link_name);
+
+ aml_append(if_ctx, aml_store(pkg, route));
+
+ return if_ctx;
+}
+
+/*
+ * build_prt - Define interrupt routing rules
+ *
+ * Returns an array of 128 routes, one for each device,
+ * based on device location.
+ * The main goal is to equally distribute the interrupts
+ * over the 4 existing ACPI links (works only for i440fx).
+ * The hash function is (slot + pin) & 3 -> "LNK[D|A|B|C]".
+ *
+ */
+static Aml *build_prt(bool is_pci0_prt)
+{
+ Aml *method, *while_ctx, *pin, *res;
+
+ method = aml_method("_PRT", 0, AML_NOTSERIALIZED);
+ res = aml_local(0);
+ pin = aml_local(1);
+ aml_append(method, aml_store(aml_package(128), res));
+ aml_append(method, aml_store(aml_int(0), pin));
+
+ /* while (pin < 128) */
+ while_ctx = aml_while(aml_lless(pin, aml_int(128)));
+ {
+ Aml *slot = aml_local(2);
+ Aml *lnk_idx = aml_local(3);
+ Aml *route = aml_local(4);
+
+ /* slot = pin >> 2 */
+ aml_append(while_ctx,
+ aml_store(aml_shiftright(pin, aml_int(2), NULL), slot));
+ /* lnk_idx = (slot + pin) & 3 */
+ aml_append(while_ctx,
+ aml_store(aml_and(aml_add(pin, slot, NULL), aml_int(3), NULL),
+ lnk_idx));
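+ /* e.g. entry 5: slot = 1 (INTB), lnk_idx = (1 + 5) & 3 = 2 -> routed via LNKB */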
+
+ /* route[2] = "LNK[D|A|B|C]", selection based on lnk_idx */
+ aml_append(while_ctx, initialize_route(route, "LNKD", lnk_idx, 0));
+ if (is_pci0_prt) {
+ Aml *if_device_1, *if_pin_4, *else_pin_4;
+
+ /* device 1 is the power-management device, needs SCI */
+ if_device_1 = aml_if(aml_equal(lnk_idx, aml_int(1)));
+ {
+ if_pin_4 = aml_if(aml_equal(pin, aml_int(4)));
+ {
+ aml_append(if_pin_4,
+ aml_store(build_prt_entry("LNKS"), route));
+ }
+ aml_append(if_device_1, if_pin_4);
+ else_pin_4 = aml_else();
+ {
+ aml_append(else_pin_4,
+ aml_store(build_prt_entry("LNKA"), route));
+ }
+ aml_append(if_device_1, else_pin_4);
+ }
+ aml_append(while_ctx, if_device_1);
+ } else {
+ aml_append(while_ctx, initialize_route(route, "LNKA", lnk_idx, 1));
+ }
+ aml_append(while_ctx, initialize_route(route, "LNKB", lnk_idx, 2));
+ aml_append(while_ctx, initialize_route(route, "LNKC", lnk_idx, 3));
+
+ /* route[0] = 0x[slot]FFFF */
+ aml_append(while_ctx,
+ aml_store(aml_or(aml_shiftleft(slot, aml_int(16)), aml_int(0xFFFF),
+ NULL),
+ aml_index(route, aml_int(0))));
+ /* route[1] = pin & 3 */
+ aml_append(while_ctx,
+ aml_store(aml_and(pin, aml_int(3), NULL),
+ aml_index(route, aml_int(1))));
+ /* res[pin] = route */
+ aml_append(while_ctx, aml_store(route, aml_index(res, pin)));
+ /* pin++ */
+ aml_append(while_ctx, aml_increment(pin));
+ }
+ aml_append(method, while_ctx);
+ /* return res*/
+ aml_append(method, aml_return(res));
+
+ return method;
+}
+
+static void build_hpet_aml(Aml *table)
+{
+ Aml *crs;
+ Aml *field;
+ Aml *method;
+ Aml *if_ctx;
+ Aml *scope = aml_scope("_SB");
+ Aml *dev = aml_device("HPET");
+ Aml *zero = aml_int(0);
+ Aml *id = aml_local(0);
+ Aml *period = aml_local(1);
+
+ aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0103")));
+ aml_append(dev, aml_name_decl("_UID", zero));
+
+ aml_append(dev,
+ aml_operation_region("HPTM", AML_SYSTEM_MEMORY, aml_int(HPET_BASE),
+ HPET_LEN));
+ field = aml_field("HPTM", AML_DWORD_ACC, AML_LOCK, AML_PRESERVE);
+ aml_append(field, aml_named_field("VEND", 32));
+ aml_append(field, aml_named_field("PRD", 32));
+ aml_append(dev, field);
+
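+ /* _STA: report the HPET as absent if the vendor ID read from the MMIO
+ * registers looks bogus (0 or 0xffff) or the tick period is 0 or above
+ * 100000000 femtoseconds (100 ns). */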
+ method = aml_method("_STA", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_store(aml_name("VEND"), id));
+ aml_append(method, aml_store(aml_name("PRD"), period));
+ aml_append(method, aml_shiftright(id, aml_int(16), id));
+ if_ctx = aml_if(aml_lor(aml_equal(id, zero),
+ aml_equal(id, aml_int(0xffff))));
+ {
+ aml_append(if_ctx, aml_return(zero));
+ }
+ aml_append(method, if_ctx);
+
+ if_ctx = aml_if(aml_lor(aml_equal(period, zero),
+ aml_lgreater(period, aml_int(100000000))));
+ {
+ aml_append(if_ctx, aml_return(zero));
+ }
+ aml_append(method, if_ctx);
+
+ aml_append(method, aml_return(aml_int(0x0F)));
+ aml_append(dev, method);
+
+ crs = aml_resource_template();
+ aml_append(crs, aml_memory32_fixed(HPET_BASE, HPET_LEN, AML_READ_ONLY));
+ aml_append(dev, aml_name_decl("_CRS", crs));
+
+ aml_append(scope, dev);
+ aml_append(table, scope);
+}
+
+static Aml *build_vmbus_device_aml(VMBusBridge *vmbus_bridge)
+{
+ Aml *dev;
+ Aml *method;
+ Aml *crs;
+
+ dev = aml_device("VMBS");
+ aml_append(dev, aml_name_decl("STA", aml_int(0xF)));
+ aml_append(dev, aml_name_decl("_HID", aml_string("VMBus")));
+ aml_append(dev, aml_name_decl("_UID", aml_int(0x0)));
+ aml_append(dev, aml_name_decl("_DDN", aml_string("VMBUS")));
+
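+ /* _DIS clears _STA bit 1 (device enabled/decoding); _PS0 sets all bits again */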
+ method = aml_method("_DIS", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_store(aml_and(aml_name("STA"), aml_int(0xD), NULL),
+ aml_name("STA")));
+ aml_append(dev, method);
+
+ method = aml_method("_PS0", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_store(aml_or(aml_name("STA"), aml_int(0xF), NULL),
+ aml_name("STA")));
+ aml_append(dev, method);
+
+ method = aml_method("_STA", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_return(aml_name("STA")));
+ aml_append(dev, method);
+
+ aml_append(dev, aml_name_decl("_PS3", aml_int(0x0)));
+
+ crs = aml_resource_template();
+ aml_append(crs, aml_irq_no_flags(vmbus_bridge->irq));
+ aml_append(dev, aml_name_decl("_CRS", crs));
+
+ return dev;
+}
+
+static void build_isa_devices_aml(Aml *table)
+{
+ bool ambiguous;
+ Object *obj = object_resolve_path_type("", TYPE_ISA_BUS, &ambiguous);
+ Aml *scope;
+
+ assert(obj && !ambiguous);
+
+ scope = aml_scope("_SB.PCI0.ISA");
+ build_acpi_ipmi_devices(scope, BUS(obj), "\\_SB.PCI0.ISA");
+ isa_build_aml(ISA_BUS(obj), scope);
+
+ aml_append(table, scope);
+}
+
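+/*
+ * Define a DBUG helper method: convert Arg0 to a hex string and write it,
+ * byte by byte, to the 0x0402 debug I/O port, terminated by a newline.
+ */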
+static void build_dbg_aml(Aml *table)
+{
+ Aml *field;
+ Aml *method;
+ Aml *while_ctx;
+ Aml *scope = aml_scope("\\");
+ Aml *buf = aml_local(0);
+ Aml *len = aml_local(1);
+ Aml *idx = aml_local(2);
+
+ aml_append(scope,
+ aml_operation_region("DBG", AML_SYSTEM_IO, aml_int(0x0402), 0x01));
+ field = aml_field("DBG", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE);
+ aml_append(field, aml_named_field("DBGB", 8));
+ aml_append(scope, field);
+
+ method = aml_method("DBUG", 1, AML_NOTSERIALIZED);
+
+ aml_append(method, aml_to_hexstring(aml_arg(0), buf));
+ aml_append(method, aml_to_buffer(buf, buf));
+ aml_append(method, aml_subtract(aml_sizeof(buf), aml_int(1), len));
+ aml_append(method, aml_store(aml_int(0), idx));
+
+ while_ctx = aml_while(aml_lless(idx, len));
+ aml_append(while_ctx,
+ aml_store(aml_derefof(aml_index(buf, idx)), aml_name("DBGB")));
+ aml_append(while_ctx, aml_increment(idx));
+ aml_append(method, while_ctx);
+
+ aml_append(method, aml_store(aml_int(0x0A), aml_name("DBGB")));
+ aml_append(scope, method);
+
+ aml_append(table, scope);
+}
+
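+/*
+ * PCI interrupt link device (PNP0C0F): 'reg' names the chipset PIRQ routing
+ * register backing the link; bit 7 set there means the link is disabled
+ * (see IQST/IQCR and the _DIS method below).
+ */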
+static Aml *build_link_dev(const char *name, uint8_t uid, Aml *reg)
+{
+ Aml *dev;
+ Aml *crs;
+ Aml *method;
+ uint32_t irqs[] = {5, 10, 11};
+
+ dev = aml_device("%s", name);
+ aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0C0F")));
+ aml_append(dev, aml_name_decl("_UID", aml_int(uid)));
+
+ crs = aml_resource_template();
+ aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH,
+ AML_SHARED, irqs, ARRAY_SIZE(irqs)));
+ aml_append(dev, aml_name_decl("_PRS", crs));
+
+ method = aml_method("_STA", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_return(aml_call1("IQST", reg)));
+ aml_append(dev, method);
+
+ method = aml_method("_DIS", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_or(reg, aml_int(0x80), reg));
+ aml_append(dev, method);
+
+ method = aml_method("_CRS", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_return(aml_call1("IQCR", reg)));
+ aml_append(dev, method);
+
+ method = aml_method("_SRS", 1, AML_NOTSERIALIZED);
+ aml_append(method, aml_create_dword_field(aml_arg(0), aml_int(5), "PRRI"));
+ aml_append(method, aml_store(aml_name("PRRI"), reg));
+ aml_append(dev, method);
+
+ return dev;
+}
+
+static Aml *build_gsi_link_dev(const char *name, uint8_t uid, uint8_t gsi)
+{
+ Aml *dev;
+ Aml *crs;
+ Aml *method;
+ uint32_t irqs;
+
+ dev = aml_device("%s", name);
+ aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0C0F")));
+ aml_append(dev, aml_name_decl("_UID", aml_int(uid)));
+
+ crs = aml_resource_template();
+ irqs = gsi;
+ aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH,
+ AML_SHARED, &irqs, 1));
+ aml_append(dev, aml_name_decl("_PRS", crs));
+
+ aml_append(dev, aml_name_decl("_CRS", crs));
+
+ /*
+ * _DIS can be a no-op because the interrupt cannot be disabled.
+ */
+ method = aml_method("_DIS", 0, AML_NOTSERIALIZED);
+ aml_append(dev, method);
+
+ method = aml_method("_SRS", 1, AML_NOTSERIALIZED);
+ aml_append(dev, method);
+
+ return dev;
+}
+
+/* _CRS method - get current settings */
+static Aml *build_iqcr_method(bool is_piix4)
+{
+ Aml *if_ctx;
+ uint32_t irqs;
+ Aml *method = aml_method("IQCR", 1, AML_SERIALIZED);
+ Aml *crs = aml_resource_template();
+
+ irqs = 0;
+ aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL,
+ AML_ACTIVE_HIGH, AML_SHARED, &irqs, 1));
+ aml_append(method, aml_name_decl("PRR0", crs));
+
+ aml_append(method,
+ aml_create_dword_field(aml_name("PRR0"), aml_int(5), "PRRI"));
+
+ if (is_piix4) {
+ if_ctx = aml_if(aml_lless(aml_arg(0), aml_int(0x80)));
+ aml_append(if_ctx, aml_store(aml_arg(0), aml_name("PRRI")));
+ aml_append(method, if_ctx);
+ } else {
+ aml_append(method,
+ aml_store(aml_and(aml_arg(0), aml_int(0xF), NULL),
+ aml_name("PRRI")));
+ }
+
+ aml_append(method, aml_return(aml_name("PRR0")));
+ return method;
+}
+
+/* _STA method - get status */
+static Aml *build_irq_status_method(void)
+{
+ Aml *if_ctx;
+ Aml *method = aml_method("IQST", 1, AML_NOTSERIALIZED);
+
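+ /* 0x09: present but not enabled (bit 7 of the routing reg set), 0x0B: enabled */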
+ if_ctx = aml_if(aml_and(aml_int(0x80), aml_arg(0), NULL));
+ aml_append(if_ctx, aml_return(aml_int(0x09)));
+ aml_append(method, if_ctx);
+ aml_append(method, aml_return(aml_int(0x0B)));
+ return method;
+}
+
+static void build_piix4_pci0_int(Aml *table)
+{
+ Aml *dev;
+ Aml *crs;
+ Aml *field;
+ Aml *method;
+ uint32_t irqs;
+ Aml *sb_scope = aml_scope("_SB");
+ Aml *pci0_scope = aml_scope("PCI0");
+
+ aml_append(pci0_scope, build_prt(true));
+ aml_append(sb_scope, pci0_scope);
+
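+ /* PRQ0-3 map the PIIX4 PIRQA-D routing registers (PCI config 0x60-0x63) */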
+ field = aml_field("PCI0.ISA.P40C", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE);
+ aml_append(field, aml_named_field("PRQ0", 8));
+ aml_append(field, aml_named_field("PRQ1", 8));
+ aml_append(field, aml_named_field("PRQ2", 8));
+ aml_append(field, aml_named_field("PRQ3", 8));
+ aml_append(sb_scope, field);
+
+ aml_append(sb_scope, build_irq_status_method());
+ aml_append(sb_scope, build_iqcr_method(true));
+
+ aml_append(sb_scope, build_link_dev("LNKA", 0, aml_name("PRQ0")));
+ aml_append(sb_scope, build_link_dev("LNKB", 1, aml_name("PRQ1")));
+ aml_append(sb_scope, build_link_dev("LNKC", 2, aml_name("PRQ2")));
+ aml_append(sb_scope, build_link_dev("LNKD", 3, aml_name("PRQ3")));
+
+ dev = aml_device("LNKS");
+ {
+ aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0C0F")));
+ aml_append(dev, aml_name_decl("_UID", aml_int(4)));
+
+ crs = aml_resource_template();
+ irqs = 9;
+ aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL,
+ AML_ACTIVE_HIGH, AML_SHARED,
+ &irqs, 1));
+ aml_append(dev, aml_name_decl("_PRS", crs));
+
+ /* The SCI cannot be disabled and is always attached to GSI 9,
+ * so these are no-ops. We only need this link to override the
+ * polarity to active high and match the content of the MADT.
+ */
+ method = aml_method("_STA", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_return(aml_int(0x0b)));
+ aml_append(dev, method);
+
+ method = aml_method("_DIS", 0, AML_NOTSERIALIZED);
+ aml_append(dev, method);
+
+ method = aml_method("_CRS", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_return(aml_name("_PRS")));
+ aml_append(dev, method);
+
+ method = aml_method("_SRS", 1, AML_NOTSERIALIZED);
+ aml_append(dev, method);
+ }
+ aml_append(sb_scope, dev);
+
+ aml_append(table, sb_scope);
+}
+
+static void append_q35_prt_entry(Aml *ctx, uint32_t nr, const char *name)
+{
+ int i;
+ int head;
+ Aml *pkg;
+ char base = name[3] < 'E' ? 'A' : 'E';
+ char *s = g_strdup(name);
+ Aml *a_nr = aml_int((nr << 16) | 0xffff);
+
+ assert(strlen(s) == 4);
+
+ head = name[3] - base;
+ for (i = 0; i < 4; i++) {
+ if (head + i > 3) {
+ head = i * -1;
+ }
+ s[3] = base + head + i;
+ pkg = aml_package(4);
+ aml_append(pkg, a_nr);
+ aml_append(pkg, aml_int(i));
+ aml_append(pkg, aml_name("%s", s));
+ aml_append(pkg, aml_int(0));
+ aml_append(ctx, pkg);
+ }
+ g_free(s);
+}
+
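+/*
+ * Build a 128-entry _PRT package (32 slots x 4 pins) for Q35, mapping each
+ * pin onto one of the LNKx/GSIx link devices according to the chipset's
+ * default D<N>IR routing.
+ */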
+static Aml *build_q35_routing_table(const char *str)
+{
+ int i;
+ Aml *pkg;
+ char *name = g_strdup_printf("%s ", str);
+
+ pkg = aml_package(128);
+ for (i = 0; i < 0x18; i++) {
+ name[3] = 'E' + (i & 0x3);
+ append_q35_prt_entry(pkg, i, name);
+ }
+
+ name[3] = 'E';
+ append_q35_prt_entry(pkg, 0x18, name);
+
+ /* INTA -> PIRQA for slot 25 - 31, see the default value of D<N>IR */
+ for (i = 0x0019; i < 0x1e; i++) {
+ name[3] = 'A';
+ append_q35_prt_entry(pkg, i, name);
+ }
+
+ /* PCIe->PCI bridge. use PIRQ[E-H] */
+ name[3] = 'E';
+ append_q35_prt_entry(pkg, 0x1e, name);
+ name[3] = 'A';
+ append_q35_prt_entry(pkg, 0x1f, name);
+
+ g_free(name);
+ return pkg;
+}
+
+static void build_q35_pci0_int(Aml *table)
+{
+ Aml *field;
+ Aml *method;
+ Aml *sb_scope = aml_scope("_SB");
+ Aml *pci0_scope = aml_scope("PCI0");
+
+ /* Zero => PIC mode, One => APIC Mode */
+ aml_append(table, aml_name_decl("PICF", aml_int(0)));
+ method = aml_method("_PIC", 1, AML_NOTSERIALIZED);
+ {
+ aml_append(method, aml_store(aml_arg(0), aml_name("PICF")));
+ }
+ aml_append(table, method);
+
+ aml_append(pci0_scope,
+ aml_name_decl("PRTP", build_q35_routing_table("LNK")));
+ aml_append(pci0_scope,
+ aml_name_decl("PRTA", build_q35_routing_table("GSI")));
+
+ method = aml_method("_PRT", 0, AML_NOTSERIALIZED);
+ {
+ Aml *if_ctx;
+ Aml *else_ctx;
+
+ /* PCI IRQ routing table, example from ACPI 2.0a specification,
+ section 6.2.8.1 */
+ /* Note: we provide the same info as the PCI routing
+ table of the Bochs BIOS */
+ if_ctx = aml_if(aml_equal(aml_name("PICF"), aml_int(0)));
+ aml_append(if_ctx, aml_return(aml_name("PRTP")));
+ aml_append(method, if_ctx);
+ else_ctx = aml_else();
+ aml_append(else_ctx, aml_return(aml_name("PRTA")));
+ aml_append(method, else_ctx);
+ }
+ aml_append(pci0_scope, method);
+ aml_append(sb_scope, pci0_scope);
+
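+ /* PRQA-H map the ICH9 PIRQ[A-H] routing registers (PCI config 0x60-0x6b) */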
+ field = aml_field("PCI0.ISA.PIRQ", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE);
+ aml_append(field, aml_named_field("PRQA", 8));
+ aml_append(field, aml_named_field("PRQB", 8));
+ aml_append(field, aml_named_field("PRQC", 8));
+ aml_append(field, aml_named_field("PRQD", 8));
+ aml_append(field, aml_reserved_field(0x20));
+ aml_append(field, aml_named_field("PRQE", 8));
+ aml_append(field, aml_named_field("PRQF", 8));
+ aml_append(field, aml_named_field("PRQG", 8));
+ aml_append(field, aml_named_field("PRQH", 8));
+ aml_append(sb_scope, field);
+
+ aml_append(sb_scope, build_irq_status_method());
+ aml_append(sb_scope, build_iqcr_method(false));
+
+ aml_append(sb_scope, build_link_dev("LNKA", 0, aml_name("PRQA")));
+ aml_append(sb_scope, build_link_dev("LNKB", 1, aml_name("PRQB")));
+ aml_append(sb_scope, build_link_dev("LNKC", 2, aml_name("PRQC")));
+ aml_append(sb_scope, build_link_dev("LNKD", 3, aml_name("PRQD")));
+ aml_append(sb_scope, build_link_dev("LNKE", 4, aml_name("PRQE")));
+ aml_append(sb_scope, build_link_dev("LNKF", 5, aml_name("PRQF")));
+ aml_append(sb_scope, build_link_dev("LNKG", 6, aml_name("PRQG")));
+ aml_append(sb_scope, build_link_dev("LNKH", 7, aml_name("PRQH")));
+
+ aml_append(sb_scope, build_gsi_link_dev("GSIA", 0x10, 0x10));
+ aml_append(sb_scope, build_gsi_link_dev("GSIB", 0x11, 0x11));
+ aml_append(sb_scope, build_gsi_link_dev("GSIC", 0x12, 0x12));
+ aml_append(sb_scope, build_gsi_link_dev("GSID", 0x13, 0x13));
+ aml_append(sb_scope, build_gsi_link_dev("GSIE", 0x14, 0x14));
+ aml_append(sb_scope, build_gsi_link_dev("GSIF", 0x15, 0x15));
+ aml_append(sb_scope, build_gsi_link_dev("GSIG", 0x16, 0x16));
+ aml_append(sb_scope, build_gsi_link_dev("GSIH", 0x17, 0x17));
+
+ aml_append(table, sb_scope);
+}
+
+static Aml *build_q35_dram_controller(const AcpiMcfgInfo *mcfg)
+{
+ Aml *dev;
+ Aml *resource_template;
+
+ /* DRAM controller */
+ dev = aml_device("DRAC");
+ aml_append(dev, aml_name_decl("_HID", aml_string("PNP0C01")));
+
+ resource_template = aml_resource_template();
+ if (mcfg->base + mcfg->size - 1 >= (1ULL << 32)) {
+ aml_append(resource_template,
+ aml_qword_memory(AML_POS_DECODE,
+ AML_MIN_FIXED,
+ AML_MAX_FIXED,
+ AML_NON_CACHEABLE,
+ AML_READ_WRITE,
+ 0x0000000000000000,
+ mcfg->base,
+ mcfg->base + mcfg->size - 1,
+ 0x0000000000000000,
+ mcfg->size));
+ } else {
+ aml_append(resource_template,
+ aml_dword_memory(AML_POS_DECODE,
+ AML_MIN_FIXED,
+ AML_MAX_FIXED,
+ AML_NON_CACHEABLE,
+ AML_READ_WRITE,
+ 0x0000000000000000,
+ mcfg->base,
+ mcfg->base + mcfg->size - 1,
+ 0x0000000000000000,
+ mcfg->size));
+ }
+ aml_append(dev, aml_name_decl("_CRS", resource_template));
+
+ return dev;
+}
+
+static void build_q35_isa_bridge(Aml *table)
+{
+ Aml *dev;
+ Aml *scope;
+
+ scope = aml_scope("_SB.PCI0");
+ dev = aml_device("ISA");
+ aml_append(dev, aml_name_decl("_ADR", aml_int(0x001F0000)));
+
+ /* ICH9 PCI to ISA irq remapping */
+ aml_append(dev, aml_operation_region("PIRQ", AML_PCI_CONFIG,
+ aml_int(0x60), 0x0C));
+
+ aml_append(scope, dev);
+ aml_append(table, scope);
+}
+
+static void build_piix4_isa_bridge(Aml *table)
+{
+ Aml *dev;
+ Aml *scope;
+
+ scope = aml_scope("_SB.PCI0");
+ dev = aml_device("ISA");
+ aml_append(dev, aml_name_decl("_ADR", aml_int(0x00010000)));
+
+ /* PIIX PCI to ISA irq remapping */
+ aml_append(dev, aml_operation_region("P40C", AML_PCI_CONFIG,
+ aml_int(0x60), 0x04));
+
+ aml_append(scope, dev);
+ aml_append(table, scope);
+}
+
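+/*
+ * Declare the ACPI PCI hotplug register block under \_SB.PCI0: PCIU/PCID
+ * (slot up/down event bitmaps), B0EJ (slot eject), BNUM/PIDX (bus select
+ * and acpi-index readback), all serialized by the BLCK mutex.
+ */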
+static void build_x86_acpi_pci_hotplug(Aml *table, uint64_t pcihp_addr)
+{
+ Aml *scope;
+ Aml *field;
+ Aml *method;
+
+ scope = aml_scope("_SB.PCI0");
+
+ aml_append(scope,
+ aml_operation_region("PCST", AML_SYSTEM_IO, aml_int(pcihp_addr), 0x08));
+ field = aml_field("PCST", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS);
+ aml_append(field, aml_named_field("PCIU", 32));
+ aml_append(field, aml_named_field("PCID", 32));
+ aml_append(scope, field);
+
+ aml_append(scope,
+ aml_operation_region("SEJ", AML_SYSTEM_IO,
+ aml_int(pcihp_addr + ACPI_PCIHP_SEJ_BASE), 0x04));
+ field = aml_field("SEJ", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS);
+ aml_append(field, aml_named_field("B0EJ", 32));
+ aml_append(scope, field);
+
+ aml_append(scope,
+ aml_operation_region("BNMR", AML_SYSTEM_IO,
+ aml_int(pcihp_addr + ACPI_PCIHP_BNMR_BASE), 0x08));
+ field = aml_field("BNMR", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS);
+ aml_append(field, aml_named_field("BNUM", 32));
+ aml_append(field, aml_named_field("PIDX", 32));
+ aml_append(scope, field);
+
+ aml_append(scope, aml_mutex("BLCK", 0));
+
+ method = aml_method("PCEJ", 2, AML_NOTSERIALIZED);
+ aml_append(method, aml_acquire(aml_name("BLCK"), 0xFFFF));
+ aml_append(method, aml_store(aml_arg(0), aml_name("BNUM")));
+ aml_append(method,
+ aml_store(aml_shiftleft(aml_int(1), aml_arg(1)), aml_name("B0EJ")));
+ aml_append(method, aml_release(aml_name("BLCK")));
+ aml_append(method, aml_return(aml_int(0)));
+ aml_append(scope, method);
+
+ method = aml_method("AIDX", 2, AML_NOTSERIALIZED);
+ aml_append(method, aml_acquire(aml_name("BLCK"), 0xFFFF));
+ aml_append(method, aml_store(aml_arg(0), aml_name("BNUM")));
+ aml_append(method,
+ aml_store(aml_shiftleft(aml_int(1), aml_arg(1)), aml_name("PIDX")));
+ aml_append(method, aml_store(aml_name("PIDX"), aml_local(0)));
+ aml_append(method, aml_release(aml_name("BLCK")));
+ aml_append(method, aml_return(aml_local(0)));
+ aml_append(scope, method);
+
+ aml_append(scope, aml_pci_device_dsm());
+
+ aml_append(table, scope);
+}
+
+static Aml *build_q35_osc_method(bool enable_native_pcie_hotplug)
+{
+ Aml *if_ctx;
+ Aml *if_ctx2;
+ Aml *else_ctx;
+ Aml *method;
+ Aml *a_cwd1 = aml_name("CDW1");
+ Aml *a_ctrl = aml_local(0);
+
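+ /* Arg3 layout per the _OSC definition: CDW1 = status, CDW2 = support, CDW3 = control */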
+ method = aml_method("_OSC", 4, AML_NOTSERIALIZED);
+ aml_append(method, aml_create_dword_field(aml_arg(3), aml_int(0), "CDW1"));
+
+ if_ctx = aml_if(aml_equal(
+ aml_arg(0), aml_touuid("33DB4D5B-1FF7-401C-9657-7441C03DD766")));
+ aml_append(if_ctx, aml_create_dword_field(aml_arg(3), aml_int(4), "CDW2"));
+ aml_append(if_ctx, aml_create_dword_field(aml_arg(3), aml_int(8), "CDW3"));
+
+ aml_append(if_ctx, aml_store(aml_name("CDW3"), a_ctrl));
+
+ /*
+ * Always allow native PME, AER (no dependencies)
+ * Allow SHPC (PCI bridges can have SHPC controller)
+ * Disable PCIe Native Hot-plug if ACPI PCI Hot-plug is enabled.
+ */
+ aml_append(if_ctx, aml_and(a_ctrl,
+ aml_int(0x1E | (enable_native_pcie_hotplug ? 0x1 : 0x0)), a_ctrl));
+
+ if_ctx2 = aml_if(aml_lnot(aml_equal(aml_arg(1), aml_int(1))));
+ /* Unknown revision */
+ aml_append(if_ctx2, aml_or(a_cwd1, aml_int(0x08), a_cwd1));
+ aml_append(if_ctx, if_ctx2);
+
+ if_ctx2 = aml_if(aml_lnot(aml_equal(aml_name("CDW3"), a_ctrl)));
+ /* Capabilities bits were masked */
+ aml_append(if_ctx2, aml_or(a_cwd1, aml_int(0x10), a_cwd1));
+ aml_append(if_ctx, if_ctx2);
+
+ /* Update DWORD3 in the buffer */
+ aml_append(if_ctx, aml_store(a_ctrl, aml_name("CDW3")));
+ aml_append(method, if_ctx);
+
+ else_ctx = aml_else();
+ /* Unrecognized UUID */
+ aml_append(else_ctx, aml_or(a_cwd1, aml_int(4), a_cwd1));
+ aml_append(method, else_ctx);
+
+ aml_append(method, aml_return(aml_arg(3)));
+ return method;
+}
+
+static void build_smb0(Aml *table, I2CBus *smbus, int devnr, int func)
+{
+ Aml *scope = aml_scope("_SB.PCI0");
+ Aml *dev = aml_device("SMB0");
+
+ aml_append(dev, aml_name_decl("_ADR", aml_int(devnr << 16 | func)));
+ build_acpi_ipmi_devices(dev, BUS(smbus), "\\_SB.PCI0.SMB0");
+ aml_append(scope, dev);
+ aml_append(table, scope);
+}
+
+static void
+build_dsdt(GArray *table_data, BIOSLinker *linker,
+ AcpiPmInfo *pm, AcpiMiscInfo *misc,
+ Range *pci_hole, Range *pci_hole64, MachineState *machine)
+{
+ CrsRangeEntry *entry;
+ Aml *dsdt, *sb_scope, *scope, *dev, *method, *field, *pkg, *crs;
+ CrsRangeSet crs_range_set;
+ PCMachineState *pcms = PC_MACHINE(machine);
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(machine);
+ X86MachineState *x86ms = X86_MACHINE(machine);
+ AcpiMcfgInfo mcfg;
+ bool mcfg_valid = !!acpi_get_mcfg(&mcfg);
+ uint32_t nr_mem = machine->ram_slots;
+ int root_bus_limit = 0xFF;
+ PCIBus *bus = NULL;
+#ifdef CONFIG_TPM
+ TPMIf *tpm = tpm_find();
+#endif
+ int i;
+ VMBusBridge *vmbus_bridge = vmbus_bridge_find();
+ AcpiTable table = { .sig = "DSDT", .rev = 1, .oem_id = x86ms->oem_id,
+ .oem_table_id = x86ms->oem_table_id };
+
+ acpi_table_begin(&table, table_data);
+ dsdt = init_aml_allocator();
+
+ build_dbg_aml(dsdt);
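+ /* Host bridge: PIIX4 exposes PNP0A03, Q35 exposes PNP0A08 plus _OSC */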
+ if (misc->is_piix4) {
+ sb_scope = aml_scope("_SB");
+ dev = aml_device("PCI0");
+ aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A03")));
+ aml_append(dev, aml_name_decl("_ADR", aml_int(0)));
+ aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid)));
+ aml_append(sb_scope, dev);
+ aml_append(dsdt, sb_scope);
+
+ if (misc->has_hpet) {
+ build_hpet_aml(dsdt);
+ }
+ build_piix4_isa_bridge(dsdt);
+ build_isa_devices_aml(dsdt);
+ if (pm->pcihp_bridge_en || pm->pcihp_root_en) {
+ build_x86_acpi_pci_hotplug(dsdt, pm->pcihp_io_base);
+ }
+ build_piix4_pci0_int(dsdt);
+ } else {
+ sb_scope = aml_scope("_SB");
+ dev = aml_device("PCI0");
+ aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A08")));
+ aml_append(dev, aml_name_decl("_CID", aml_eisaid("PNP0A03")));
+ aml_append(dev, aml_name_decl("_ADR", aml_int(0)));
+ aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid)));
+ aml_append(dev, build_q35_osc_method(!pm->pcihp_bridge_en));
+ aml_append(sb_scope, dev);
+ if (mcfg_valid) {
+ aml_append(sb_scope, build_q35_dram_controller(&mcfg));
+ }
+
+ if (pm->smi_on_cpuhp) {
+ /* reserve SMI block resources, IO ports 0xB2, 0xB3 */
+ dev = aml_device("PCI0.SMI0");
+ aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A06")));
+ aml_append(dev, aml_name_decl("_UID", aml_string("SMI resources")));
+ crs = aml_resource_template();
+ aml_append(crs,
+ aml_io(
+ AML_DECODE16,
+ ACPI_PORT_SMI_CMD,
+ ACPI_PORT_SMI_CMD,
+ 1,
+ 2)
+ );
+ aml_append(dev, aml_name_decl("_CRS", crs));
+ aml_append(dev, aml_operation_region("SMIR", AML_SYSTEM_IO,
+ aml_int(ACPI_PORT_SMI_CMD), 2));
+ field = aml_field("SMIR", AML_BYTE_ACC, AML_NOLOCK,
+ AML_WRITE_AS_ZEROS);
+ aml_append(field, aml_named_field("SMIC", 8));
+ aml_append(field, aml_reserved_field(8));
+ aml_append(dev, field);
+ aml_append(sb_scope, dev);
+ }
+
+ aml_append(dsdt, sb_scope);
+
+ if (misc->has_hpet) {
+ build_hpet_aml(dsdt);
+ }
+ build_q35_isa_bridge(dsdt);
+ build_isa_devices_aml(dsdt);
+ if (pm->pcihp_bridge_en) {
+ build_x86_acpi_pci_hotplug(dsdt, pm->pcihp_io_base);
+ }
+ build_q35_pci0_int(dsdt);
+ if (pcms->smbus && !pcmc->do_not_add_smb_acpi) {
+ build_smb0(dsdt, pcms->smbus, ICH9_SMB_DEV, ICH9_SMB_FUNC);
+ }
+ }
+
+ if (vmbus_bridge) {
+ sb_scope = aml_scope("_SB");
+ aml_append(sb_scope, build_vmbus_device_aml(vmbus_bridge));
+ aml_append(dsdt, sb_scope);
+ }
+
+ if (pcmc->legacy_cpu_hotplug) {
+ build_legacy_cpu_hotplug_aml(dsdt, machine, pm->cpu_hp_io_base);
+ } else {
+ CPUHotplugFeatures opts = {
+ .acpi_1_compatible = true, .has_legacy_cphp = true,
+ .smi_path = pm->smi_on_cpuhp ? "\\_SB.PCI0.SMI0.SMIC" : NULL,
+ .fw_unplugs_cpu = pm->smi_on_cpu_unplug,
+ };
+ build_cpus_aml(dsdt, machine, opts, pm->cpu_hp_io_base,
+ "\\_SB.PCI0", "\\_GPE._E02");
+ }
+
+ if (pcms->memhp_io_base && nr_mem) {
+ build_memory_hotplug_aml(dsdt, nr_mem, "\\_SB.PCI0",
+ "\\_GPE._E03", AML_SYSTEM_IO,
+ pcms->memhp_io_base);
+ }
+
+ scope = aml_scope("_GPE");
+ {
+ aml_append(scope, aml_name_decl("_HID", aml_string("ACPI0006")));
+
+ if (pm->pcihp_bridge_en || pm->pcihp_root_en) {
+ method = aml_method("_E01", 0, AML_NOTSERIALIZED);
+ aml_append(method,
+ aml_acquire(aml_name("\\_SB.PCI0.BLCK"), 0xFFFF));
+ aml_append(method, aml_call0("\\_SB.PCI0.PCNT"));
+ aml_append(method, aml_release(aml_name("\\_SB.PCI0.BLCK")));
+ aml_append(scope, method);
+ }
+
+ if (machine->nvdimms_state->is_enabled) {
+ method = aml_method("_E04", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_notify(aml_name("\\_SB.NVDR"),
+ aml_int(0x80)));
+ aml_append(scope, method);
+ }
+ }
+ aml_append(dsdt, scope);
+
+ crs_range_set_init(&crs_range_set);
+ bus = PC_MACHINE(machine)->bus;
+ if (bus) {
+ QLIST_FOREACH(bus, &bus->child, sibling) {
+ uint8_t bus_num = pci_bus_num(bus);
+ uint8_t numa_node = pci_bus_numa_node(bus);
+
+ /* look only for expander root buses */
+ if (!pci_bus_is_root(bus)) {
+ continue;
+ }
+
+ if (bus_num < root_bus_limit) {
+ root_bus_limit = bus_num - 1;
+ }
+
+ scope = aml_scope("\\_SB");
+ dev = aml_device("PC%.02X", bus_num);
+ aml_append(dev, aml_name_decl("_UID", aml_int(bus_num)));
+ aml_append(dev, aml_name_decl("_BBN", aml_int(bus_num)));
+ if (pci_bus_is_express(bus)) {
+ aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A08")));
+ aml_append(dev, aml_name_decl("_CID", aml_eisaid("PNP0A03")));
+
+ /* Expander bridges do not have ACPI PCI Hot-plug enabled */
+ aml_append(dev, build_q35_osc_method(true));
+ } else {
+ aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A03")));
+ }
+
+ if (numa_node != NUMA_NODE_UNASSIGNED) {
+ aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node)));
+ }
+
+ aml_append(dev, build_prt(false));
+ crs = build_crs(PCI_HOST_BRIDGE(BUS(bus)->parent), &crs_range_set,
+ 0, 0, 0, 0);
+ aml_append(dev, aml_name_decl("_CRS", crs));
+ aml_append(scope, dev);
+ aml_append(dsdt, scope);
+ }
+ }
+
+ /*
+ * At this point crs_range_set has all the ranges used by pci
+ * busses *other* than PCI0. These ranges will be excluded from
+ * the PCI0._CRS. Add mmconfig to the set so it will be excluded
+ * too.
+ */
+ if (mcfg_valid) {
+ crs_range_insert(crs_range_set.mem_ranges,
+ mcfg.base, mcfg.base + mcfg.size - 1);
+ }
+
+ scope = aml_scope("\\_SB.PCI0");
+ /* build PCI0._CRS */
+ crs = aml_resource_template();
+ aml_append(crs,
+ aml_word_bus_number(AML_MIN_FIXED, AML_MAX_FIXED, AML_POS_DECODE,
+ 0x0000, 0x0, root_bus_limit,
+ 0x0000, root_bus_limit + 1));
+ aml_append(crs, aml_io(AML_DECODE16, 0x0CF8, 0x0CF8, 0x01, 0x08));
+
+ aml_append(crs,
+ aml_word_io(AML_MIN_FIXED, AML_MAX_FIXED,
+ AML_POS_DECODE, AML_ENTIRE_RANGE,
+ 0x0000, 0x0000, 0x0CF7, 0x0000, 0x0CF8));
+
+ crs_replace_with_free_ranges(crs_range_set.io_ranges, 0x0D00, 0xFFFF);
+ for (i = 0; i < crs_range_set.io_ranges->len; i++) {
+ entry = g_ptr_array_index(crs_range_set.io_ranges, i);
+ aml_append(crs,
+ aml_word_io(AML_MIN_FIXED, AML_MAX_FIXED,
+ AML_POS_DECODE, AML_ENTIRE_RANGE,
+ 0x0000, entry->base, entry->limit,
+ 0x0000, entry->limit - entry->base + 1));
+ }
+
+ aml_append(crs,
+ aml_dword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED,
+ AML_CACHEABLE, AML_READ_WRITE,
+ 0, 0x000A0000, 0x000BFFFF, 0, 0x00020000));
+
+ crs_replace_with_free_ranges(crs_range_set.mem_ranges,
+ range_lob(pci_hole),
+ range_upb(pci_hole));
+ for (i = 0; i < crs_range_set.mem_ranges->len; i++) {
+ entry = g_ptr_array_index(crs_range_set.mem_ranges, i);
+ aml_append(crs,
+ aml_dword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED,
+ AML_NON_CACHEABLE, AML_READ_WRITE,
+ 0, entry->base, entry->limit,
+ 0, entry->limit - entry->base + 1));
+ }
+
+ if (!range_is_empty(pci_hole64)) {
+ crs_replace_with_free_ranges(crs_range_set.mem_64bit_ranges,
+ range_lob(pci_hole64),
+ range_upb(pci_hole64));
+ for (i = 0; i < crs_range_set.mem_64bit_ranges->len; i++) {
+ entry = g_ptr_array_index(crs_range_set.mem_64bit_ranges, i);
+ aml_append(crs,
+ aml_qword_memory(AML_POS_DECODE, AML_MIN_FIXED,
+ AML_MAX_FIXED,
+ AML_CACHEABLE, AML_READ_WRITE,
+ 0, entry->base, entry->limit,
+ 0, entry->limit - entry->base + 1));
+ }
+ }
+
+#ifdef CONFIG_TPM
+ if (TPM_IS_TIS_ISA(tpm_find())) {
+ aml_append(crs, aml_memory32_fixed(TPM_TIS_ADDR_BASE,
+ TPM_TIS_ADDR_SIZE, AML_READ_WRITE));
+ }
+#endif
+ aml_append(scope, aml_name_decl("_CRS", crs));
+
+ /* reserve GPE0 block resources */
+ dev = aml_device("GPE0");
+ aml_append(dev, aml_name_decl("_HID", aml_string("PNP0A06")));
+ aml_append(dev, aml_name_decl("_UID", aml_string("GPE0 resources")));
+ /* device present, functioning, decoding, not shown in UI */
+ aml_append(dev, aml_name_decl("_STA", aml_int(0xB)));
+ crs = aml_resource_template();
+ aml_append(crs,
+ aml_io(
+ AML_DECODE16,
+ pm->fadt.gpe0_blk.address,
+ pm->fadt.gpe0_blk.address,
+ 1,
+ pm->fadt.gpe0_blk.bit_width / 8)
+ );
+ aml_append(dev, aml_name_decl("_CRS", crs));
+ aml_append(scope, dev);
+
+ crs_range_set_free(&crs_range_set);
+
+ /* reserve PCIHP resources */
+ if (pm->pcihp_io_len && (pm->pcihp_bridge_en || pm->pcihp_root_en)) {
+ dev = aml_device("PHPR");
+ aml_append(dev, aml_name_decl("_HID", aml_string("PNP0A06")));
+ aml_append(dev,
+ aml_name_decl("_UID", aml_string("PCI Hotplug resources")));
+ /* device present, functioning, decoding, not shown in UI */
+ aml_append(dev, aml_name_decl("_STA", aml_int(0xB)));
+ crs = aml_resource_template();
+ aml_append(crs,
+ aml_io(AML_DECODE16, pm->pcihp_io_base, pm->pcihp_io_base, 1,
+ pm->pcihp_io_len)
+ );
+ aml_append(dev, aml_name_decl("_CRS", crs));
+ aml_append(scope, dev);
+ }
+ aml_append(dsdt, scope);
+
+ /* create S3_ / S4_ / S5_ packages if necessary */
+ scope = aml_scope("\\");
+ if (!pm->s3_disabled) {
+ pkg = aml_package(4);
+ aml_append(pkg, aml_int(1)); /* PM1a_CNT.SLP_TYP */
+ aml_append(pkg, aml_int(1)); /* PM1b_CNT.SLP_TYP, FIXME: not impl. */
+ aml_append(pkg, aml_int(0)); /* reserved */
+ aml_append(pkg, aml_int(0)); /* reserved */
+ aml_append(scope, aml_name_decl("_S3", pkg));
+ }
+
+ if (!pm->s4_disabled) {
+ pkg = aml_package(4);
+ aml_append(pkg, aml_int(pm->s4_val)); /* PM1a_CNT.SLP_TYP */
+ /* PM1b_CNT.SLP_TYP, FIXME: not impl. */
+ aml_append(pkg, aml_int(pm->s4_val));
+ aml_append(pkg, aml_int(0)); /* reserved */
+ aml_append(pkg, aml_int(0)); /* reserved */
+ aml_append(scope, aml_name_decl("_S4", pkg));
+ }
+
+ pkg = aml_package(4);
+ aml_append(pkg, aml_int(0)); /* PM1a_CNT.SLP_TYP */
+ aml_append(pkg, aml_int(0)); /* PM1b_CNT.SLP_TYP not impl. */
+ aml_append(pkg, aml_int(0)); /* reserved */
+ aml_append(pkg, aml_int(0)); /* reserved */
+ aml_append(scope, aml_name_decl("_S5", pkg));
+ aml_append(dsdt, scope);
+
+ /* create fw_cfg node, unconditionally */
+ {
+ scope = aml_scope("\\_SB.PCI0");
+ fw_cfg_add_acpi_dsdt(scope, x86ms->fw_cfg);
+ aml_append(dsdt, scope);
+ }
+
+ if (misc->applesmc_io_base) {
+ scope = aml_scope("\\_SB.PCI0.ISA");
+ dev = aml_device("SMC");
+
+ aml_append(dev, aml_name_decl("_HID", aml_eisaid("APP0001")));
+ /* device present, functioning, decoding, not shown in UI */
+ aml_append(dev, aml_name_decl("_STA", aml_int(0xB)));
+
+ crs = aml_resource_template();
+ aml_append(crs,
+ aml_io(AML_DECODE16, misc->applesmc_io_base, misc->applesmc_io_base,
+ 0x01, APPLESMC_MAX_DATA_LENGTH)
+ );
+ aml_append(crs, aml_irq_no_flags(6));
+ aml_append(dev, aml_name_decl("_CRS", crs));
+
+ aml_append(scope, dev);
+ aml_append(dsdt, scope);
+ }
+
+ if (misc->pvpanic_port) {
+ scope = aml_scope("\\_SB.PCI0.ISA");
+
+ dev = aml_device("PEVT");
+ aml_append(dev, aml_name_decl("_HID", aml_string("QEMU0001")));
+
+ crs = aml_resource_template();
+ aml_append(crs,
+ aml_io(AML_DECODE16, misc->pvpanic_port, misc->pvpanic_port, 1, 1)
+ );
+ aml_append(dev, aml_name_decl("_CRS", crs));
+
+ aml_append(dev, aml_operation_region("PEOR", AML_SYSTEM_IO,
+ aml_int(misc->pvpanic_port), 1));
+ field = aml_field("PEOR", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE);
+ aml_append(field, aml_named_field("PEPT", 8));
+ aml_append(dev, field);
+
+ /* device present, functioning, decoding, shown in UI */
+ aml_append(dev, aml_name_decl("_STA", aml_int(0xF)));
+
+ method = aml_method("RDPT", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_store(aml_name("PEPT"), aml_local(0)));
+ aml_append(method, aml_return(aml_local(0)));
+ aml_append(dev, method);
+
+ method = aml_method("WRPT", 1, AML_NOTSERIALIZED);
+ aml_append(method, aml_store(aml_arg(0), aml_name("PEPT")));
+ aml_append(dev, method);
+
+ aml_append(scope, dev);
+ aml_append(dsdt, scope);
+ }
+
+ sb_scope = aml_scope("\\_SB");
+ {
+ Object *pci_host;
+ PCIBus *bus = NULL;
+
+ pci_host = acpi_get_i386_pci_host();
+
+ if (pci_host) {
+ bus = PCI_HOST_BRIDGE(pci_host)->bus;
+ }
+
+ if (bus) {
+ Aml *scope = aml_scope("PCI0");
+ /* Scan all PCI buses. Generate tables to support hotplug. */
+ build_append_pci_bus_devices(scope, bus, pm->pcihp_bridge_en);
+
+#ifdef CONFIG_TPM
+ if (TPM_IS_TIS_ISA(tpm)) {
+ if (misc->tpm_version == TPM_VERSION_2_0) {
+ dev = aml_device("TPM");
+ aml_append(dev, aml_name_decl("_HID",
+ aml_string("MSFT0101")));
+ } else {
+ dev = aml_device("ISA.TPM");
+ aml_append(dev, aml_name_decl("_HID",
+ aml_eisaid("PNP0C31")));
+ }
+
+ aml_append(dev, aml_name_decl("_STA", aml_int(0xF)));
+ crs = aml_resource_template();
+ aml_append(crs, aml_memory32_fixed(TPM_TIS_ADDR_BASE,
+ TPM_TIS_ADDR_SIZE, AML_READ_WRITE));
+ /*
+ FIXME: TPM_TIS_IRQ=5 conflicts with PNP0C0F irqs,
+ Rewrite to take IRQ from TPM device model and
+ fix default IRQ value there to use some unused IRQ
+ */
+ /* aml_append(crs, aml_irq_no_flags(TPM_TIS_IRQ)); */
+ aml_append(dev, aml_name_decl("_CRS", crs));
+
+ tpm_build_ppi_acpi(tpm, dev);
+
+ aml_append(scope, dev);
+ }
+#endif
+
+ aml_append(sb_scope, scope);
+ }
+ }
+
+#ifdef CONFIG_TPM
+ if (TPM_IS_CRB(tpm)) {
+ dev = aml_device("TPM");
+ aml_append(dev, aml_name_decl("_HID", aml_string("MSFT0101")));
+ crs = aml_resource_template();
+ aml_append(crs, aml_memory32_fixed(TPM_CRB_ADDR_BASE,
+ TPM_CRB_ADDR_SIZE, AML_READ_WRITE));
+ aml_append(dev, aml_name_decl("_CRS", crs));
+
+ aml_append(dev, aml_name_decl("_STA", aml_int(0xf)));
+
+ tpm_build_ppi_acpi(tpm, dev);
+
+ aml_append(sb_scope, dev);
+ }
+#endif
+
+ if (pcms->sgx_epc.size != 0) {
+ uint64_t epc_base = pcms->sgx_epc.base;
+ uint64_t epc_size = pcms->sgx_epc.size;
+
+ dev = aml_device("EPC");
+ aml_append(dev, aml_name_decl("_HID", aml_eisaid("INT0E0C")));
+ aml_append(dev, aml_name_decl("_STR",
+ aml_unicode("Enclave Page Cache 1.0")));
+ crs = aml_resource_template();
+ aml_append(crs,
+ aml_qword_memory(AML_POS_DECODE, AML_MIN_FIXED,
+ AML_MAX_FIXED, AML_NON_CACHEABLE,
+ AML_READ_WRITE, 0, epc_base,
+ epc_base + epc_size - 1, 0, epc_size));
+ aml_append(dev, aml_name_decl("_CRS", crs));
+
+ method = aml_method("_STA", 0, AML_NOTSERIALIZED);
+ aml_append(method, aml_return(aml_int(0x0f)));
+ aml_append(dev, method);
+
+ aml_append(sb_scope, dev);
+ }
+ aml_append(dsdt, sb_scope);
+
+ /* copy AML table into ACPI tables blob and patch header there */
+ g_array_append_vals(table_data, dsdt->buf->data, dsdt->buf->len);
+ acpi_table_end(linker, &table);
+ free_aml_allocator();
+}
+
+/*
+ * IA-PC HPET (High Precision Event Timers) Specification (Revision: 1.0a)
+ * 3.2.4 The ACPI 2.0 HPET Description Table (HPET)
+ */
+static void
+build_hpet(GArray *table_data, BIOSLinker *linker, const char *oem_id,
+ const char *oem_table_id)
+{
+ AcpiTable table = { .sig = "HPET", .rev = 1,
+ .oem_id = oem_id, .oem_table_id = oem_table_id };
+
+ acpi_table_begin(&table, table_data);
+ /* Note timer_block_id value must be kept in sync with value advertised by
+ * emulated hpet
+ */
+ /* Event Timer Block ID */
+ build_append_int_noprefix(table_data, 0x8086a201, 4);
+ /* BASE_ADDRESS */
+ build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 0, 0, 0, HPET_BASE);
+ /* HPET Number */
+ build_append_int_noprefix(table_data, 0, 1);
+ /* Main Counter Minimum Clock_tick in Periodic Mode */
+ build_append_int_noprefix(table_data, 0, 2);
+ /* Page Protection And OEM Attribute */
+ build_append_int_noprefix(table_data, 0, 1);
+ acpi_table_end(linker, &table);
+}
+
+#ifdef CONFIG_TPM
+/*
+ * TCPA Description Table
+ *
+ * Following Level 00, Rev 00.37 of specs:
+ * http://www.trustedcomputinggroup.org/resources/tcg_acpi_specification
+ * 7.1.2 ACPI Table Layout
+ */
+static void
+build_tpm_tcpa(GArray *table_data, BIOSLinker *linker, GArray *tcpalog,
+ const char *oem_id, const char *oem_table_id)
+{
+ unsigned log_addr_offset;
+ AcpiTable table = { .sig = "TCPA", .rev = 2,
+ .oem_id = oem_id, .oem_table_id = oem_table_id };
+
+ acpi_table_begin(&table, table_data);
+ /* Platform Class */
+ build_append_int_noprefix(table_data, TPM_TCPA_ACPI_CLASS_CLIENT, 2);
+ /* Log Area Minimum Length (LAML) */
+ build_append_int_noprefix(table_data, TPM_LOG_AREA_MINIMUM_SIZE, 4);
+ /* Log Area Start Address (LASA) */
+ log_addr_offset = table_data->len;
+ build_append_int_noprefix(table_data, 0, 8);
+
+ /* allocate/reserve space for TPM log area */
+ acpi_data_push(tcpalog, TPM_LOG_AREA_MINIMUM_SIZE);
+ bios_linker_loader_alloc(linker, ACPI_BUILD_TPMLOG_FILE, tcpalog, 1,
+ false /* high memory */);
+ /* log area start address to be filled by Guest linker */
+ bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
+ log_addr_offset, 8, ACPI_BUILD_TPMLOG_FILE, 0);
+
+ acpi_table_end(linker, &table);
+}
+#endif
+
+#define HOLE_640K_START (640 * KiB)
+#define HOLE_640K_END (1 * MiB)
+
+/*
+ * ACPI spec, Revision 3.0
+ * 5.2.15 System Resource Affinity Table (SRAT)
+ */
+static void
+build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
+{
+ int i;
+ int numa_mem_start, slots;
+ uint64_t mem_len, mem_base, next_base;
+ MachineClass *mc = MACHINE_GET_CLASS(machine);
+ X86MachineState *x86ms = X86_MACHINE(machine);
+ const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(machine);
+ PCMachineState *pcms = PC_MACHINE(machine);
+ int nb_numa_nodes = machine->numa_state->num_nodes;
+ NodeInfo *numa_info = machine->numa_state->nodes;
+ ram_addr_t hotpluggable_address_space_size =
+ object_property_get_int(OBJECT(pcms), PC_MACHINE_DEVMEM_REGION_SIZE,
+ NULL);
+ AcpiTable table = { .sig = "SRAT", .rev = 1, .oem_id = x86ms->oem_id,
+ .oem_table_id = x86ms->oem_table_id };
+
+ acpi_table_begin(&table, table_data);
+ build_append_int_noprefix(table_data, 1, 4); /* Reserved */
+ build_append_int_noprefix(table_data, 0, 8); /* Reserved */
+
+ for (i = 0; i < apic_ids->len; i++) {
+ int node_id = apic_ids->cpus[i].props.node_id;
+ uint32_t apic_id = apic_ids->cpus[i].arch_id;
+
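+        /*
+         * xAPIC IDs are 8 bits wide and 0xFF is the broadcast ID, so only
+         * IDs up to 254 fit the Local APIC affinity structure; larger IDs
+         * use the x2APIC form below.
+         */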
+ if (apic_id < 255) {
+ /* 5.2.15.1 Processor Local APIC/SAPIC Affinity Structure */
+ build_append_int_noprefix(table_data, 0, 1); /* Type */
+ build_append_int_noprefix(table_data, 16, 1); /* Length */
+ /* Proximity Domain [7:0] */
+ build_append_int_noprefix(table_data, node_id, 1);
+ build_append_int_noprefix(table_data, apic_id, 1); /* APIC ID */
+ /* Flags, Table 5-36 */
+ build_append_int_noprefix(table_data, 1, 4);
+ build_append_int_noprefix(table_data, 0, 1); /* Local SAPIC EID */
+ /* Proximity Domain [31:8] */
+ build_append_int_noprefix(table_data, 0, 3);
+ build_append_int_noprefix(table_data, 0, 4); /* Reserved */
+ } else {
+ /*
+ * ACPI spec, Revision 4.0
+ * 5.2.16.3 Processor Local x2APIC Affinity Structure
+ */
+ build_append_int_noprefix(table_data, 2, 1); /* Type */
+ build_append_int_noprefix(table_data, 24, 1); /* Length */
+ build_append_int_noprefix(table_data, 0, 2); /* Reserved */
+ /* Proximity Domain */
+ build_append_int_noprefix(table_data, node_id, 4);
+ build_append_int_noprefix(table_data, apic_id, 4); /* X2APIC ID */
+ /* Flags, Table 5-39 */
+ build_append_int_noprefix(table_data, 1 /* Enabled */, 4);
+ build_append_int_noprefix(table_data, 0, 4); /* Clock Domain */
+ build_append_int_noprefix(table_data, 0, 4); /* Reserved */
+ }
+ }
+
+    /* The memory map is a bit tricky: it contains at least one hole
+ * from 640k-1M and possibly another one from 3.5G-4G.
+ */
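+    /*
+     * For instance (assuming more than 2 MiB of RAM below 4G), a first
+     * node of 2 MiB starting at address 0 ends up as two ranges,
+     * [0, 640K) and [1M, 2M); the part shadowed by the 640K-1M hole is
+     * simply not reported.
+     */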
+ next_base = 0;
+ numa_mem_start = table_data->len;
+
+ for (i = 1; i < nb_numa_nodes + 1; ++i) {
+ mem_base = next_base;
+ mem_len = numa_info[i - 1].node_mem;
+ next_base = mem_base + mem_len;
+
+ /* Cut out the 640K hole */
+ if (mem_base <= HOLE_640K_START &&
+ next_base > HOLE_640K_START) {
+ mem_len -= next_base - HOLE_640K_START;
+ if (mem_len > 0) {
+ build_srat_memory(table_data, mem_base, mem_len, i - 1,
+ MEM_AFFINITY_ENABLED);
+ }
+
+ /* Check for the rare case: 640K < RAM < 1M */
+ if (next_base <= HOLE_640K_END) {
+ next_base = HOLE_640K_END;
+ continue;
+ }
+ mem_base = HOLE_640K_END;
+ mem_len = next_base - HOLE_640K_END;
+ }
+
+ /* Cut out the ACPI_PCI hole */
+ if (mem_base <= x86ms->below_4g_mem_size &&
+ next_base > x86ms->below_4g_mem_size) {
+ mem_len -= next_base - x86ms->below_4g_mem_size;
+ if (mem_len > 0) {
+ build_srat_memory(table_data, mem_base, mem_len, i - 1,
+ MEM_AFFINITY_ENABLED);
+ }
+ mem_base = 1ULL << 32;
+ mem_len = next_base - x86ms->below_4g_mem_size;
+ next_base = mem_base + mem_len;
+ }
+
+ if (mem_len > 0) {
+ build_srat_memory(table_data, mem_base, mem_len, i - 1,
+ MEM_AFFINITY_ENABLED);
+ }
+ }
+
+ if (machine->nvdimms_state->is_enabled) {
+ nvdimm_build_srat(table_data);
+ }
+
+ /*
+ * TODO: this part is not in ACPI spec and current linux kernel boots fine
+ * without these entries. But I recall there were issues the last time I
+ * tried to remove it with some ancient guest OS, however I can't remember
+ * what that was so keep this around for now
+ */
+ slots = (table_data->len - numa_mem_start) / 40 /* mem affinity len */;
+ for (; slots < nb_numa_nodes + 2; slots++) {
+ build_srat_memory(table_data, 0, 0, 0, MEM_AFFINITY_NOFLAGS);
+ }
+
+ /*
+ * Entry is required for Windows to enable memory hotplug in OS
+ * and for Linux to enable SWIOTLB when booted with less than
+ * 4G of RAM. Windows works better if the entry sets proximity
+ * to the highest NUMA node in the machine.
+ * Memory devices may override proximity set by this entry,
+ * providing _PXM method if necessary.
+ */
+ if (hotpluggable_address_space_size) {
+ build_srat_memory(table_data, machine->device_memory->base,
+ hotpluggable_address_space_size, nb_numa_nodes - 1,
+ MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
+ }
+
+ acpi_table_end(linker, &table);
+}
+
+/*
+ * Insert DMAR scope for PCI bridges and endpoint devices
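+ *
+ * For example (illustrative values), an endpoint at 01:02.0 is described
+ * by the 8 bytes: type 0x01, length 8, two reserved bytes, enumeration
+ * id 0, bus 0x01, device 0x02, function 0x00.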
+ */
+static void
+insert_scope(PCIBus *bus, PCIDevice *dev, void *opaque)
+{
+ const size_t device_scope_size = 6 /* device scope structure */ +
+ 2 /* 1 path entry */;
+ GArray *scope_blob = opaque;
+
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
+ /* Dmar Scope Type: 0x02 for PCI Bridge */
+ build_append_int_noprefix(scope_blob, 0x02, 1);
+ } else {
+ /* Dmar Scope Type: 0x01 for PCI Endpoint Device */
+ build_append_int_noprefix(scope_blob, 0x01, 1);
+ }
+
+ /* length */
+ build_append_int_noprefix(scope_blob, device_scope_size, 1);
+ /* reserved */
+ build_append_int_noprefix(scope_blob, 0, 2);
+ /* enumeration_id */
+ build_append_int_noprefix(scope_blob, 0, 1);
+ /* bus */
+ build_append_int_noprefix(scope_blob, pci_bus_num(bus), 1);
+ /* device */
+ build_append_int_noprefix(scope_blob, PCI_SLOT(dev->devfn), 1);
+ /* function */
+ build_append_int_noprefix(scope_blob, PCI_FUNC(dev->devfn), 1);
+}
+
+/* For a given PCI host bridge, walk and insert DMAR scope */
+static int
+dmar_host_bridges(Object *obj, void *opaque)
+{
+ GArray *scope_blob = opaque;
+
+ if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) {
+ PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus;
+
+ if (bus && !pci_bus_bypass_iommu(bus)) {
+ pci_for_each_device_under_bus(bus, insert_scope, scope_blob);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Intel ® Virtualization Technology for Directed I/O
+ * Architecture Specification. Revision 3.3
+ * 8.1 DMA Remapping Reporting Structure
+ */
+static void
+build_dmar_q35(GArray *table_data, BIOSLinker *linker, const char *oem_id,
+ const char *oem_table_id)
+{
+ uint8_t dmar_flags = 0;
+ uint8_t rsvd10[10] = {};
+ /* Root complex IOAPIC uses one path only */
+ const size_t ioapic_scope_size = 6 /* device scope structure */ +
+ 2 /* 1 path entry */;
+ X86IOMMUState *iommu = x86_iommu_get_default();
+ IntelIOMMUState *intel_iommu = INTEL_IOMMU_DEVICE(iommu);
+ GArray *scope_blob = g_array_new(false, true, 1);
+
+ AcpiTable table = { .sig = "DMAR", .rev = 1, .oem_id = oem_id,
+ .oem_table_id = oem_table_id };
+
+ /*
+ * A PCI bus walk, for each PCI host bridge.
+ * Insert scope for each PCI bridge and endpoint device which
+ * is attached to a bus with iommu enabled.
+ */
+ object_child_foreach_recursive(object_get_root(),
+ dmar_host_bridges, scope_blob);
+
+ assert(iommu);
+ if (x86_iommu_ir_supported(iommu)) {
+ dmar_flags |= 0x1; /* Flags: 0x1: INT_REMAP */
+ }
+
+ acpi_table_begin(&table, table_data);
+ /* Host Address Width */
+ build_append_int_noprefix(table_data, intel_iommu->aw_bits - 1, 1);
+ build_append_int_noprefix(table_data, dmar_flags, 1); /* Flags */
+ g_array_append_vals(table_data, rsvd10, sizeof(rsvd10)); /* Reserved */
+
+ /* 8.3 DMAR Remapping Hardware Unit Definition structure */
+ build_append_int_noprefix(table_data, 0, 2); /* Type */
+ /* Length */
+ build_append_int_noprefix(table_data,
+ 16 + ioapic_scope_size + scope_blob->len, 2);
+ /* Flags */
+ build_append_int_noprefix(table_data, 0 /* Don't include all pci device */ ,
+ 1);
+ build_append_int_noprefix(table_data, 0 , 1); /* Reserved */
+ build_append_int_noprefix(table_data, 0 , 2); /* Segment Number */
+ /* Register Base Address */
+ build_append_int_noprefix(table_data, Q35_HOST_BRIDGE_IOMMU_ADDR , 8);
+
+ /* Scope definition for the root-complex IOAPIC. See VT-d spec
+ * 8.3.1 (version Oct. 2014 or later). */
+ build_append_int_noprefix(table_data, 0x03 /* IOAPIC */, 1); /* Type */
+ build_append_int_noprefix(table_data, ioapic_scope_size, 1); /* Length */
+ build_append_int_noprefix(table_data, 0, 2); /* Reserved */
+ /* Enumeration ID */
+ build_append_int_noprefix(table_data, ACPI_BUILD_IOAPIC_ID, 1);
+ /* Start Bus Number */
+ build_append_int_noprefix(table_data, Q35_PSEUDO_BUS_PLATFORM, 1);
+ /* Path, {Device, Function} pair */
+ build_append_int_noprefix(table_data, PCI_SLOT(Q35_PSEUDO_DEVFN_IOAPIC), 1);
+ build_append_int_noprefix(table_data, PCI_FUNC(Q35_PSEUDO_DEVFN_IOAPIC), 1);
+
+ /* Add scope found above */
+ g_array_append_vals(table_data, scope_blob->data, scope_blob->len);
+ g_array_free(scope_blob, true);
+
+ if (iommu->dt_supported) {
+ /* 8.5 Root Port ATS Capability Reporting Structure */
+ build_append_int_noprefix(table_data, 2, 2); /* Type */
+ build_append_int_noprefix(table_data, 8, 2); /* Length */
+ build_append_int_noprefix(table_data, 1 /* ALL_PORTS */, 1); /* Flags */
+ build_append_int_noprefix(table_data, 0, 1); /* Reserved */
+ build_append_int_noprefix(table_data, 0, 2); /* Segment Number */
+ }
+
+ acpi_table_end(linker, &table);
+}
+
+/*
+ * Windows ACPI Emulated Devices Table
+ * (Version 1.0 - April 6, 2009)
+ * Spec: http://download.microsoft.com/download/7/E/7/7E7662CF-CBEA-470B-A97E-CE7CE0D98DC2/WAET.docx
+ *
+ * Helpful to speed up Windows guests and ignored by others.
+ */
+static void
+build_waet(GArray *table_data, BIOSLinker *linker, const char *oem_id,
+ const char *oem_table_id)
+{
+ AcpiTable table = { .sig = "WAET", .rev = 1, .oem_id = oem_id,
+ .oem_table_id = oem_table_id };
+
+ acpi_table_begin(&table, table_data);
+ /*
+ * Set "ACPI PM timer good" flag.
+ *
+ * Tells Windows guests that our ACPI PM timer is reliable in the
+ * sense that the guest can read it only once to obtain a reliable value,
+ * which avoids costly VMExits caused by the guest re-reading it unnecessarily.
+ */
+ build_append_int_noprefix(table_data, 1 << 1 /* ACPI PM timer good */, 4);
+ acpi_table_end(linker, &table);
+}
+
+/*
+ * IVRS table as specified in AMD IOMMU Specification v2.62, Section 5.2
+ * accessible here http://support.amd.com/TechDocs/48882_IOMMU.pdf
+ */
+#define IOAPIC_SB_DEVID (uint64_t)PCI_BUILD_BDF(0, PCI_DEVFN(0x14, 0))
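+/* i.e. the device id of 00:14.0, the IO-APIC location reported to the guest */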
+
+/*
+ * Insert IVHD entry for device and recurse, insert alias, or insert range as
+ * necessary for the PCI topology.
+ */
+static void
+insert_ivhd(PCIBus *bus, PCIDevice *dev, void *opaque)
+{
+ GArray *table_data = opaque;
+ uint32_t entry;
+
+ /* "Select" IVHD entry, type 0x2 */
+ entry = PCI_BUILD_BDF(pci_bus_num(bus), dev->devfn) << 8 | 0x2;
+ build_append_int_noprefix(table_data, entry, 4);
+
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
+ PCIBus *sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(dev));
+ uint8_t sec = pci_bus_num(sec_bus);
+ uint8_t sub = dev->config[PCI_SUBORDINATE_BUS];
+
+ if (pci_bus_is_express(sec_bus)) {
+ /*
+ * Walk the bus if there are subordinates, otherwise use a range
+ * to cover an entire leaf bus. We could potentially also use a
+ * range for traversed buses, but we'd need to take care not to
+ * create both Select and Range entries covering the same device.
+ * This is easier and potentially more compact.
+ *
+ * An example bare metal system seems to use Select entries for
+             * root ports without a slot (i.e. built-ins) and Range entries
+ * when there is a slot. The same system also only hard-codes
+ * the alias range for an onboard PCIe-to-PCI bridge, apparently
+ * making no effort to support nested bridges. We attempt to
+ * be more thorough here.
+ */
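+            /*
+             * For example (illustrative values), a leaf express secondary
+             * bus 0x02 yields the 4-byte entries 0x00020003 ("Start of
+             * Range" at 02:00.0) and 0x0002ff04 ("End of Range" at 02:1f.7).
+             */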
+ if (sec == sub) { /* leaf bus */
+ /* "Start of Range" IVHD entry, type 0x3 */
+ entry = PCI_BUILD_BDF(sec, PCI_DEVFN(0, 0)) << 8 | 0x3;
+ build_append_int_noprefix(table_data, entry, 4);
+ /* "End of Range" IVHD entry, type 0x4 */
+ entry = PCI_BUILD_BDF(sub, PCI_DEVFN(31, 7)) << 8 | 0x4;
+ build_append_int_noprefix(table_data, entry, 4);
+ } else {
+ pci_for_each_device(sec_bus, sec, insert_ivhd, table_data);
+ }
+ } else {
+ /*
+ * If the secondary bus is conventional, then we need to create an
+ * Alias range for everything downstream. The range covers the
+ * first devfn on the secondary bus to the last devfn on the
+ * subordinate bus. The alias target depends on legacy versus
+ * express bridges, just as in pci_device_iommu_address_space().
+ * DeviceIDa vs DeviceIDb as per the AMD IOMMU spec.
+ */
+ uint16_t dev_id_a, dev_id_b;
+
+ dev_id_a = PCI_BUILD_BDF(sec, PCI_DEVFN(0, 0));
+
+ if (pci_is_express(dev) &&
+ pcie_cap_get_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE) {
+ dev_id_b = dev_id_a;
+ } else {
+ dev_id_b = PCI_BUILD_BDF(pci_bus_num(bus), dev->devfn);
+ }
+
+ /* "Alias Start of Range" IVHD entry, type 0x43, 8 bytes */
+ build_append_int_noprefix(table_data, dev_id_a << 8 | 0x43, 4);
+ build_append_int_noprefix(table_data, dev_id_b << 8 | 0x0, 4);
+
+ /* "End of Range" IVHD entry, type 0x4 */
+ entry = PCI_BUILD_BDF(sub, PCI_DEVFN(31, 7)) << 8 | 0x4;
+ build_append_int_noprefix(table_data, entry, 4);
+ }
+ }
+}
+
+/* For all PCI host bridges, walk and insert IVHD entries */
+static int
+ivrs_host_bridges(Object *obj, void *opaque)
+{
+ GArray *ivhd_blob = opaque;
+
+ if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) {
+ PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus;
+
+ if (bus && !pci_bus_bypass_iommu(bus)) {
+ pci_for_each_device_under_bus(bus, insert_ivhd, ivhd_blob);
+ }
+ }
+
+ return 0;
+}
+
+static void
+build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id,
+ const char *oem_table_id)
+{
+ int ivhd_table_len = 24;
+ AMDVIState *s = AMD_IOMMU_DEVICE(x86_iommu_get_default());
+ GArray *ivhd_blob = g_array_new(false, true, 1);
+ AcpiTable table = { .sig = "IVRS", .rev = 1, .oem_id = oem_id,
+ .oem_table_id = oem_table_id };
+
+ acpi_table_begin(&table, table_data);
+ /* IVinfo - IO virtualization information common to all
+ * IOMMU units in a system
+ */
+ build_append_int_noprefix(table_data, 40UL << 8/* PASize */, 4);
+ /* reserved */
+ build_append_int_noprefix(table_data, 0, 8);
+
+ /* IVHD definition - type 10h */
+ build_append_int_noprefix(table_data, 0x10, 1);
+ /* virtualization flags */
+ build_append_int_noprefix(table_data,
+ (1UL << 0) | /* HtTunEn */
+ (1UL << 4) | /* iotblSup */
+ (1UL << 6) | /* PrefSup */
+ (1UL << 7), /* PPRSup */
+ 1);
+
+ /*
+ * A PCI bus walk, for each PCI host bridge, is necessary to create a
+ * complete set of IVHD entries. Do this into a separate blob so that we
+ * can calculate the total IVRS table length here and then append the new
+ * blob further below. Fall back to an entry covering all devices, which
+ * is sufficient when no aliases are present.
+ */
+ object_child_foreach_recursive(object_get_root(),
+ ivrs_host_bridges, ivhd_blob);
+
+ if (!ivhd_blob->len) {
+ /*
+         * Type 1 device entry reporting all devices.
+         * This is a single 4-byte entry covering the entire device range;
+         * refer to Spec - Table 95: IVHD Device Entry Type Codes (4-byte).
+ */
+ build_append_int_noprefix(ivhd_blob, 0x0000001, 4);
+ }
+
+ ivhd_table_len += ivhd_blob->len;
+
+ /*
+ * When interrupt remapping is supported, we add a special IVHD device
+ * for type IO-APIC.
+ */
+ if (x86_iommu_ir_supported(x86_iommu_get_default())) {
+ ivhd_table_len += 8;
+ }
+
+ /* IVHD length */
+ build_append_int_noprefix(table_data, ivhd_table_len, 2);
+ /* DeviceID */
+ build_append_int_noprefix(table_data, s->devid, 2);
+ /* Capability offset */
+ build_append_int_noprefix(table_data, s->capab_offset, 2);
+ /* IOMMU base address */
+ build_append_int_noprefix(table_data, s->mmio.addr, 8);
+ /* PCI Segment Group */
+ build_append_int_noprefix(table_data, 0, 2);
+ /* IOMMU info */
+ build_append_int_noprefix(table_data, 0, 2);
+ /* IOMMU Feature Reporting */
+ build_append_int_noprefix(table_data,
+ (48UL << 30) | /* HATS */
+ (48UL << 28) | /* GATS */
+ (1UL << 2) | /* GTSup */
+ (1UL << 6), /* GASup */
+ 4);
+
+ /* IVHD entries as found above */
+ g_array_append_vals(table_data, ivhd_blob->data, ivhd_blob->len);
+ g_array_free(ivhd_blob, TRUE);
+
+ /*
+ * Add a special IVHD device type.
+ * Refer to spec - Table 95: IVHD device entry type codes
+ *
+ * Linux IOMMU driver checks for the special IVHD device (type IO-APIC).
+ * See Linux kernel commit 'c2ff5cf5294bcbd7fa50f7d860e90a66db7e5059'
+ */
+ if (x86_iommu_ir_supported(x86_iommu_get_default())) {
+ build_append_int_noprefix(table_data,
+ (0x1ull << 56) | /* type IOAPIC */
+ (IOAPIC_SB_DEVID << 40) | /* IOAPIC devid */
+ 0x48, /* special device */
+ 8);
+ }
+ acpi_table_end(linker, &table);
+}
+
+typedef
+struct AcpiBuildState {
+ /* Copy of table in RAM (for patching). */
+ MemoryRegion *table_mr;
+ /* Is table patched? */
+ uint8_t patched;
+ void *rsdp;
+ MemoryRegion *rsdp_mr;
+ MemoryRegion *linker_mr;
+} AcpiBuildState;
+
+static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg)
+{
+ Object *pci_host;
+ QObject *o;
+
+ pci_host = acpi_get_i386_pci_host();
+ if (!pci_host) {
+ return false;
+ }
+
+ o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL);
+ if (!o) {
+ return false;
+ }
+ mcfg->base = qnum_get_uint(qobject_to(QNum, o));
+ qobject_unref(o);
+ if (mcfg->base == PCIE_BASE_ADDR_UNMAPPED) {
+ return false;
+ }
+
+ o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL);
+ assert(o);
+ mcfg->size = qnum_get_uint(qobject_to(QNum, o));
+ qobject_unref(o);
+ return true;
+}
+
+static
+void acpi_build(AcpiBuildTables *tables, MachineState *machine)
+{
+ PCMachineState *pcms = PC_MACHINE(machine);
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+ X86MachineState *x86ms = X86_MACHINE(machine);
+ DeviceState *iommu = pcms->iommu;
+ GArray *table_offsets;
+ unsigned facs, dsdt, rsdt, fadt;
+ AcpiPmInfo pm;
+ AcpiMiscInfo misc;
+ AcpiMcfgInfo mcfg;
+ Range pci_hole = {}, pci_hole64 = {};
+ uint8_t *u;
+ size_t aml_len = 0;
+ GArray *tables_blob = tables->table_data;
+ AcpiSlicOem slic_oem = { .id = NULL, .table_id = NULL };
+ Object *vmgenid_dev;
+ char *oem_id;
+ char *oem_table_id;
+
+ acpi_get_pm_info(machine, &pm);
+ acpi_get_misc_info(&misc);
+ acpi_get_pci_holes(&pci_hole, &pci_hole64);
+ acpi_get_slic_oem(&slic_oem);
+
+ if (slic_oem.id) {
+ oem_id = slic_oem.id;
+ } else {
+ oem_id = x86ms->oem_id;
+ }
+
+ if (slic_oem.table_id) {
+ oem_table_id = slic_oem.table_id;
+ } else {
+ oem_table_id = x86ms->oem_table_id;
+ }
+
+ table_offsets = g_array_new(false, true /* clear */,
+ sizeof(uint32_t));
+ ACPI_BUILD_DPRINTF("init ACPI tables\n");
+
+ bios_linker_loader_alloc(tables->linker,
+ ACPI_BUILD_TABLE_FILE, tables_blob,
+ 64 /* Ensure FACS is aligned */,
+ false /* high memory */);
+
+ /*
+ * FACS is pointed to by FADT.
+ * We place it first since it's the only table that has alignment
+ * requirements.
+ */
+ facs = tables_blob->len;
+ build_facs(tables_blob);
+
+ /* DSDT is pointed to by FADT */
+ dsdt = tables_blob->len;
+ build_dsdt(tables_blob, tables->linker, &pm, &misc,
+ &pci_hole, &pci_hole64, machine);
+
+ /* Count the size of the DSDT and SSDT, we will need it for legacy
+ * sizing of ACPI tables.
+ */
+ aml_len += tables_blob->len - dsdt;
+
+ /* ACPI tables pointed to by RSDT */
+ fadt = tables_blob->len;
+ acpi_add_table(table_offsets, tables_blob);
+ pm.fadt.facs_tbl_offset = &facs;
+ pm.fadt.dsdt_tbl_offset = &dsdt;
+ pm.fadt.xdsdt_tbl_offset = &dsdt;
+ build_fadt(tables_blob, tables->linker, &pm.fadt, oem_id, oem_table_id);
+ aml_len += tables_blob->len - fadt;
+
+ acpi_add_table(table_offsets, tables_blob);
+ acpi_build_madt(tables_blob, tables->linker, x86ms,
+ ACPI_DEVICE_IF(x86ms->acpi_dev), x86ms->oem_id,
+ x86ms->oem_table_id);
+
+ vmgenid_dev = find_vmgenid_dev();
+ if (vmgenid_dev) {
+ acpi_add_table(table_offsets, tables_blob);
+ vmgenid_build_acpi(VMGENID(vmgenid_dev), tables_blob,
+ tables->vmgenid, tables->linker, x86ms->oem_id);
+ }
+
+ if (misc.has_hpet) {
+ acpi_add_table(table_offsets, tables_blob);
+ build_hpet(tables_blob, tables->linker, x86ms->oem_id,
+ x86ms->oem_table_id);
+ }
+#ifdef CONFIG_TPM
+ if (misc.tpm_version != TPM_VERSION_UNSPEC) {
+ if (misc.tpm_version == TPM_VERSION_1_2) {
+ acpi_add_table(table_offsets, tables_blob);
+ build_tpm_tcpa(tables_blob, tables->linker, tables->tcpalog,
+ x86ms->oem_id, x86ms->oem_table_id);
+ } else { /* TPM_VERSION_2_0 */
+ acpi_add_table(table_offsets, tables_blob);
+ build_tpm2(tables_blob, tables->linker, tables->tcpalog,
+ x86ms->oem_id, x86ms->oem_table_id);
+ }
+ }
+#endif
+ if (machine->numa_state->num_nodes) {
+ acpi_add_table(table_offsets, tables_blob);
+ build_srat(tables_blob, tables->linker, machine);
+ if (machine->numa_state->have_numa_distance) {
+ acpi_add_table(table_offsets, tables_blob);
+ build_slit(tables_blob, tables->linker, machine, x86ms->oem_id,
+ x86ms->oem_table_id);
+ }
+ if (machine->numa_state->hmat_enabled) {
+ acpi_add_table(table_offsets, tables_blob);
+ build_hmat(tables_blob, tables->linker, machine->numa_state,
+ x86ms->oem_id, x86ms->oem_table_id);
+ }
+ }
+ if (acpi_get_mcfg(&mcfg)) {
+ acpi_add_table(table_offsets, tables_blob);
+ build_mcfg(tables_blob, tables->linker, &mcfg, x86ms->oem_id,
+ x86ms->oem_table_id);
+ }
+ if (object_dynamic_cast(OBJECT(iommu), TYPE_AMD_IOMMU_DEVICE)) {
+ acpi_add_table(table_offsets, tables_blob);
+ build_amd_iommu(tables_blob, tables->linker, x86ms->oem_id,
+ x86ms->oem_table_id);
+ } else if (object_dynamic_cast(OBJECT(iommu), TYPE_INTEL_IOMMU_DEVICE)) {
+ acpi_add_table(table_offsets, tables_blob);
+ build_dmar_q35(tables_blob, tables->linker, x86ms->oem_id,
+ x86ms->oem_table_id);
+ } else if (object_dynamic_cast(OBJECT(iommu), TYPE_VIRTIO_IOMMU_PCI)) {
+ PCIDevice *pdev = PCI_DEVICE(iommu);
+
+ acpi_add_table(table_offsets, tables_blob);
+ build_viot(machine, tables_blob, tables->linker, pci_get_bdf(pdev),
+ x86ms->oem_id, x86ms->oem_table_id);
+ }
+ if (machine->nvdimms_state->is_enabled) {
+ nvdimm_build_acpi(table_offsets, tables_blob, tables->linker,
+ machine->nvdimms_state, machine->ram_slots,
+ x86ms->oem_id, x86ms->oem_table_id);
+ }
+
+ acpi_add_table(table_offsets, tables_blob);
+ build_waet(tables_blob, tables->linker, x86ms->oem_id, x86ms->oem_table_id);
+
+ /* Add tables supplied by user (if any) */
+ for (u = acpi_table_first(); u; u = acpi_table_next(u)) {
+ unsigned len = acpi_table_len(u);
+
+ acpi_add_table(table_offsets, tables_blob);
+ g_array_append_vals(tables_blob, u, len);
+ }
+
+ /* RSDT is pointed to by RSDP */
+ rsdt = tables_blob->len;
+ build_rsdt(tables_blob, tables->linker, table_offsets,
+ oem_id, oem_table_id);
+
+ /* RSDP is in FSEG memory, so allocate it separately */
+ {
+ AcpiRsdpData rsdp_data = {
+ .revision = 0,
+ .oem_id = x86ms->oem_id,
+ .xsdt_tbl_offset = NULL,
+ .rsdt_tbl_offset = &rsdt,
+ };
+ build_rsdp(tables->rsdp, tables->linker, &rsdp_data);
+ if (!pcmc->rsdp_in_ram) {
+ /* We used to allocate some extra space for RSDP revision 2 but
+ * only used the RSDP revision 0 space. The extra bytes were
+ * zeroed out and not used.
+ * Here we continue wasting those extra 16 bytes to make sure we
+ * don't break migration for machine types 2.2 and older due to
+ * RSDP blob size mismatch.
+ */
+ build_append_int_noprefix(tables->rsdp, 0, 16);
+ }
+ }
+
+ /* We'll expose it all to Guest so we want to reduce
+ * chance of size changes.
+ *
+ * We used to align the tables to 4k, but of course this would
+ * too simple to be enough. 4k turned out to be too small an
+ * alignment very soon, and in fact it is almost impossible to
+ * keep the table size stable for all (max_cpus, max_memory_slots)
+ * combinations. So the table size is always 64k for pc-i440fx-2.1
+ * and we give an error if the table grows beyond that limit.
+ *
+ * We still have the problem of migrating from "-M pc-i440fx-2.0". For
+ * that, we exploit the fact that QEMU 2.1 generates _smaller_ tables
+ * than 2.0 and we can always pad the smaller tables with zeros. We can
+ * then use the exact size of the 2.0 tables.
+ *
+ * All this is for PIIX4, since QEMU 2.0 didn't support Q35 migration.
+ */
+ if (pcmc->legacy_acpi_table_size) {
+ /* Subtracting aml_len gives the size of fixed tables. Then add the
+ * size of the PIIX4 DSDT/SSDT in QEMU 2.0.
+ */
+ int legacy_aml_len =
+ pcmc->legacy_acpi_table_size +
+ ACPI_BUILD_LEGACY_CPU_AML_SIZE * x86ms->apic_id_limit;
+ int legacy_table_size =
+ ROUND_UP(tables_blob->len - aml_len + legacy_aml_len,
+ ACPI_BUILD_ALIGN_SIZE);
+ if (tables_blob->len > legacy_table_size) {
+ /* Should happen only with PCI bridges and -M pc-i440fx-2.0. */
+ warn_report("ACPI table size %u exceeds %d bytes,"
+ " migration may not work",
+ tables_blob->len, legacy_table_size);
+ error_printf("Try removing CPUs, NUMA nodes, memory slots"
+ " or PCI bridges.");
+ }
+ g_array_set_size(tables_blob, legacy_table_size);
+ } else {
+ /* Make sure we have a buffer in case we need to resize the tables. */
+ if (tables_blob->len > ACPI_BUILD_TABLE_SIZE / 2) {
+ /* As of QEMU 2.1, this fires with 160 VCPUs and 255 memory slots. */
+ warn_report("ACPI table size %u exceeds %d bytes,"
+ " migration may not work",
+ tables_blob->len, ACPI_BUILD_TABLE_SIZE / 2);
+ error_printf("Try removing CPUs, NUMA nodes, memory slots"
+ " or PCI bridges.");
+ }
+ acpi_align_size(tables_blob, ACPI_BUILD_TABLE_SIZE);
+ }
+
+ acpi_align_size(tables->linker->cmd_blob, ACPI_BUILD_ALIGN_SIZE);
+
+ /* Cleanup memory that's no longer used. */
+ g_array_free(table_offsets, true);
+}
+
+static void acpi_ram_update(MemoryRegion *mr, GArray *data)
+{
+ uint32_t size = acpi_data_len(data);
+
+ /* Make sure RAM size is correct - in case it got changed e.g. by migration */
+ memory_region_ram_resize(mr, size, &error_abort);
+
+ memcpy(memory_region_get_ram_ptr(mr), data->data, size);
+ memory_region_set_dirty(mr, 0, size);
+}
+
+static void acpi_build_update(void *build_opaque)
+{
+ AcpiBuildState *build_state = build_opaque;
+ AcpiBuildTables tables;
+
+ /* No state to update or already patched? Nothing to do. */
+ if (!build_state || build_state->patched) {
+ return;
+ }
+ build_state->patched = 1;
+
+ acpi_build_tables_init(&tables);
+
+ acpi_build(&tables, MACHINE(qdev_get_machine()));
+
+ acpi_ram_update(build_state->table_mr, tables.table_data);
+
+ if (build_state->rsdp) {
+ memcpy(build_state->rsdp, tables.rsdp->data, acpi_data_len(tables.rsdp));
+ } else {
+ acpi_ram_update(build_state->rsdp_mr, tables.rsdp);
+ }
+
+ acpi_ram_update(build_state->linker_mr, tables.linker->cmd_blob);
+ acpi_build_tables_cleanup(&tables, true);
+}
+
+static void acpi_build_reset(void *build_opaque)
+{
+ AcpiBuildState *build_state = build_opaque;
+ build_state->patched = 0;
+}
+
+static const VMStateDescription vmstate_acpi_build = {
+ .name = "acpi_build",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT8(patched, AcpiBuildState),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+void acpi_setup(void)
+{
+ PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+ X86MachineState *x86ms = X86_MACHINE(pcms);
+ AcpiBuildTables tables;
+ AcpiBuildState *build_state;
+ Object *vmgenid_dev;
+#ifdef CONFIG_TPM
+ TPMIf *tpm;
+ static FwCfgTPMConfig tpm_config;
+#endif
+
+ if (!x86ms->fw_cfg) {
+ ACPI_BUILD_DPRINTF("No fw cfg. Bailing out.\n");
+ return;
+ }
+
+ if (!pcms->acpi_build_enabled) {
+ ACPI_BUILD_DPRINTF("ACPI build disabled. Bailing out.\n");
+ return;
+ }
+
+ if (!x86_machine_is_acpi_enabled(X86_MACHINE(pcms))) {
+ ACPI_BUILD_DPRINTF("ACPI disabled. Bailing out.\n");
+ return;
+ }
+
+ build_state = g_malloc0(sizeof *build_state);
+
+ acpi_build_tables_init(&tables);
+ acpi_build(&tables, MACHINE(pcms));
+
+ /* Now expose it all to Guest */
+ build_state->table_mr = acpi_add_rom_blob(acpi_build_update,
+ build_state, tables.table_data,
+ ACPI_BUILD_TABLE_FILE);
+ assert(build_state->table_mr != NULL);
+
+ build_state->linker_mr =
+ acpi_add_rom_blob(acpi_build_update, build_state,
+ tables.linker->cmd_blob, ACPI_BUILD_LOADER_FILE);
+
+#ifdef CONFIG_TPM
+ fw_cfg_add_file(x86ms->fw_cfg, ACPI_BUILD_TPMLOG_FILE,
+ tables.tcpalog->data, acpi_data_len(tables.tcpalog));
+
+ tpm = tpm_find();
+ if (tpm && object_property_get_bool(OBJECT(tpm), "ppi", &error_abort)) {
+ tpm_config = (FwCfgTPMConfig) {
+ .tpmppi_address = cpu_to_le32(TPM_PPI_ADDR_BASE),
+ .tpm_version = tpm_get_version(tpm),
+ .tpmppi_version = TPM_PPI_VERSION_1_30
+ };
+ fw_cfg_add_file(x86ms->fw_cfg, "etc/tpm/config",
+ &tpm_config, sizeof tpm_config);
+ }
+#endif
+
+ vmgenid_dev = find_vmgenid_dev();
+ if (vmgenid_dev) {
+ vmgenid_add_fw_cfg(VMGENID(vmgenid_dev), x86ms->fw_cfg,
+ tables.vmgenid);
+ }
+
+ if (!pcmc->rsdp_in_ram) {
+ /*
+ * Keep for compatibility with old machine types.
+ * Though RSDP is small, its contents aren't immutable, so
+ * we'll update it along with the rest of tables on guest access.
+ */
+ uint32_t rsdp_size = acpi_data_len(tables.rsdp);
+
+ build_state->rsdp = g_memdup(tables.rsdp->data, rsdp_size);
+ fw_cfg_add_file_callback(x86ms->fw_cfg, ACPI_BUILD_RSDP_FILE,
+ acpi_build_update, NULL, build_state,
+ build_state->rsdp, rsdp_size, true);
+ build_state->rsdp_mr = NULL;
+ } else {
+ build_state->rsdp = NULL;
+ build_state->rsdp_mr = acpi_add_rom_blob(acpi_build_update,
+ build_state, tables.rsdp,
+ ACPI_BUILD_RSDP_FILE);
+ }
+
+ qemu_register_reset(acpi_build_reset, build_state);
+ acpi_build_reset(build_state);
+ vmstate_register(NULL, 0, &vmstate_acpi_build, build_state);
+
+ /* Cleanup tables but don't free the memory: we track it
+ * in build_state.
+ */
+ acpi_build_tables_cleanup(&tables, false);
+}
diff --git a/hw/i386/acpi-build.h b/hw/i386/acpi-build.h
new file mode 100644
index 000000000..0dce155c8
--- /dev/null
+++ b/hw/i386/acpi-build.h
@@ -0,0 +1,15 @@
+
+#ifndef HW_I386_ACPI_BUILD_H
+#define HW_I386_ACPI_BUILD_H
+#include "hw/acpi/acpi-defs.h"
+
+extern const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio;
+
+/* PCI Hot-plug registers bases. See docs/spec/acpi_pci_hotplug.txt */
+#define ACPI_PCIHP_SEJ_BASE 0x8
+#define ACPI_PCIHP_BNMR_BASE 0x10
+
+void acpi_setup(void);
+Object *acpi_get_i386_pci_host(void);
+
+#endif
diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c
new file mode 100644
index 000000000..4aaafbdd7
--- /dev/null
+++ b/hw/i386/acpi-common.c
@@ -0,0 +1,165 @@
+/* Support for generating ACPI tables and passing them to Guests
+ *
+ * Copyright (C) 2008-2010 Kevin O'Connor <kevin@koconnor.net>
+ * Copyright (C) 2006 Fabrice Bellard
+ * Copyright (C) 2013 Red Hat Inc
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+
+#include "exec/memory.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/acpi/utils.h"
+#include "hw/i386/pc.h"
+#include "target/i386/cpu.h"
+
+#include "acpi-build.h"
+#include "acpi-common.h"
+
+void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
+ const CPUArchIdList *apic_ids, GArray *entry,
+ bool force_enabled)
+{
+ uint32_t apic_id = apic_ids->cpus[uid].arch_id;
+ /* Flags – Local APIC Flags */
+ uint32_t flags = apic_ids->cpus[uid].cpu != NULL || force_enabled ?
+ 1 /* Enabled */ : 0;
+
+    /* The ACPI spec says that the LAPIC entry for a non-present
+     * CPU may either be omitted from the MADT or be marked
+     * as disabled. However, omitting non-present CPUs from the
+     * MADT breaks hotplug on Linux, so possible CPUs
+     * should be put in the MADT but kept disabled.
+     */
+ if (apic_id < 255) {
+ /* Rev 1.0b, Table 5-13 Processor Local APIC Structure */
+ build_append_int_noprefix(entry, 0, 1); /* Type */
+ build_append_int_noprefix(entry, 8, 1); /* Length */
+ build_append_int_noprefix(entry, uid, 1); /* ACPI Processor ID */
+ build_append_int_noprefix(entry, apic_id, 1); /* APIC ID */
+ build_append_int_noprefix(entry, flags, 4); /* Flags */
+ } else {
+ /* Rev 4.0, 5.2.12.12 Processor Local x2APIC Structure */
+ build_append_int_noprefix(entry, 9, 1); /* Type */
+ build_append_int_noprefix(entry, 16, 1); /* Length */
+ build_append_int_noprefix(entry, 0, 2); /* Reserved */
+ build_append_int_noprefix(entry, apic_id, 4); /* X2APIC ID */
+ build_append_int_noprefix(entry, flags, 4); /* Flags */
+ build_append_int_noprefix(entry, uid, 4); /* ACPI Processor UID */
+ }
+}
+
+static void build_ioapic(GArray *entry, uint8_t id, uint32_t addr, uint32_t irq)
+{
+ /* Rev 1.0b, 5.2.8.2 IO APIC */
+ build_append_int_noprefix(entry, 1, 1); /* Type */
+ build_append_int_noprefix(entry, 12, 1); /* Length */
+ build_append_int_noprefix(entry, id, 1); /* IO APIC ID */
+ build_append_int_noprefix(entry, 0, 1); /* Reserved */
+ build_append_int_noprefix(entry, addr, 4); /* IO APIC Address */
+ build_append_int_noprefix(entry, irq, 4); /* System Vector Base */
+}
+
+static void
+build_xrupt_override(GArray *entry, uint8_t src, uint32_t gsi, uint16_t flags)
+{
+ /* Rev 1.0b, 5.2.8.3.1 Interrupt Source Overrides */
+ build_append_int_noprefix(entry, 2, 1); /* Type */
+ build_append_int_noprefix(entry, 10, 1); /* Length */
+ build_append_int_noprefix(entry, 0, 1); /* Bus */
+ build_append_int_noprefix(entry, src, 1); /* Source */
+ /* Global System Interrupt Vector */
+ build_append_int_noprefix(entry, gsi, 4);
+ build_append_int_noprefix(entry, flags, 2); /* Flags */
+}
+
+/*
+ * ACPI spec, Revision 1.0b
+ * 5.2.8 Multiple APIC Description Table
+ */
+void acpi_build_madt(GArray *table_data, BIOSLinker *linker,
+ X86MachineState *x86ms, AcpiDeviceIf *adev,
+ const char *oem_id, const char *oem_table_id)
+{
+ int i;
+ bool x2apic_mode = false;
+ MachineClass *mc = MACHINE_GET_CLASS(x86ms);
+ const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(x86ms));
+ AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(adev);
+ AcpiTable table = { .sig = "APIC", .rev = 1, .oem_id = oem_id,
+ .oem_table_id = oem_table_id };
+
+ acpi_table_begin(&table, table_data);
+ /* Local APIC Address */
+ build_append_int_noprefix(table_data, APIC_DEFAULT_ADDRESS, 4);
+ build_append_int_noprefix(table_data, 1 /* PCAT_COMPAT */, 4); /* Flags */
+
+ for (i = 0; i < apic_ids->len; i++) {
+ adevc->madt_cpu(adev, i, apic_ids, table_data, false);
+ if (apic_ids->cpus[i].arch_id > 254) {
+ x2apic_mode = true;
+ }
+ }
+
+ build_ioapic(table_data, ACPI_BUILD_IOAPIC_ID, IO_APIC_DEFAULT_ADDRESS, 0);
+ if (x86ms->ioapic2) {
+ build_ioapic(table_data, ACPI_BUILD_IOAPIC_ID + 1,
+ IO_APIC_SECONDARY_ADDRESS, IO_APIC_SECONDARY_IRQBASE);
+ }
+
+ if (x86ms->apic_xrupt_override) {
+ build_xrupt_override(table_data, 0, 2,
+ 0 /* Flags: Conforms to the specifications of the bus */);
+ }
+
+ for (i = 1; i < 16; i++) {
+ if (!(x86ms->pci_irq_mask & (1 << i))) {
+            /* No need for an INT source override structure. */
+ continue;
+ }
+ build_xrupt_override(table_data, i, i,
+ 0xd /* Flags: Active high, Level Triggered */);
+ }
+
+ if (x2apic_mode) {
+        /* Rev 4.0, 5.2.12.13 Local x2APIC NMI Structure */
+ build_append_int_noprefix(table_data, 0xA, 1); /* Type */
+ build_append_int_noprefix(table_data, 12, 1); /* Length */
+ build_append_int_noprefix(table_data, 0, 2); /* Flags */
+ /* ACPI Processor UID */
+ build_append_int_noprefix(table_data, 0xFFFFFFFF /* all processors */,
+ 4);
+ /* Local x2APIC LINT# */
+ build_append_int_noprefix(table_data, 1 /* ACPI_LINT1 */, 1);
+ build_append_int_noprefix(table_data, 0, 3); /* Reserved */
+ } else {
+ /* Rev 1.0b, 5.2.8.3.3 Local APIC NMI */
+ build_append_int_noprefix(table_data, 4, 1); /* Type */
+ build_append_int_noprefix(table_data, 6, 1); /* Length */
+ /* ACPI Processor ID */
+ build_append_int_noprefix(table_data, 0xFF /* all processors */, 1);
+ build_append_int_noprefix(table_data, 0, 2); /* Flags */
+ /* Local APIC INTI# */
+ build_append_int_noprefix(table_data, 1 /* ACPI_LINT1 */, 1);
+ }
+
+ acpi_table_end(linker, &table);
+}
+
diff --git a/hw/i386/acpi-common.h b/hw/i386/acpi-common.h
new file mode 100644
index 000000000..a68825acf
--- /dev/null
+++ b/hw/i386/acpi-common.h
@@ -0,0 +1,15 @@
+#ifndef HW_I386_ACPI_COMMON_H
+#define HW_I386_ACPI_COMMON_H
+
+#include "hw/acpi/acpi_dev_interface.h"
+#include "hw/acpi/bios-linker-loader.h"
+#include "hw/i386/x86.h"
+
+/* Default IOAPIC ID */
+#define ACPI_BUILD_IOAPIC_ID 0x0
+
+void acpi_build_madt(GArray *table_data, BIOSLinker *linker,
+ X86MachineState *x86ms, AcpiDeviceIf *adev,
+ const char *oem_id, const char *oem_table_id);
+
+#endif
diff --git a/hw/i386/acpi-microvm.c b/hw/i386/acpi-microvm.c
new file mode 100644
index 000000000..196d31849
--- /dev/null
+++ b/hw/i386/acpi-microvm.c
@@ -0,0 +1,258 @@
+/* Support for generating ACPI tables and passing them to Guests
+ *
+ * Copyright (C) 2008-2010 Kevin O'Connor <kevin@koconnor.net>
+ * Copyright (C) 2006 Fabrice Bellard
+ * Copyright (C) 2013 Red Hat Inc
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+
+#include "exec/memory.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/acpi/bios-linker-loader.h"
+#include "hw/acpi/generic_event_device.h"
+#include "hw/acpi/utils.h"
+#include "hw/i386/fw_cfg.h"
+#include "hw/i386/microvm.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pcie_host.h"
+#include "hw/usb/xhci.h"
+#include "hw/virtio/virtio-mmio.h"
+
+#include "acpi-common.h"
+#include "acpi-microvm.h"
+
+static void acpi_dsdt_add_virtio(Aml *scope,
+ MicrovmMachineState *mms)
+{
+ gchar *separator;
+ long int index;
+ BusState *bus;
+ BusChild *kid;
+
+ bus = sysbus_get_default();
+ QTAILQ_FOREACH(kid, &bus->children, sibling) {
+ DeviceState *dev = kid->child;
+ Object *obj = object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MMIO);
+
+ if (obj) {
+ VirtIOMMIOProxy *mmio = VIRTIO_MMIO(obj);
+ VirtioBusState *mmio_virtio_bus = &mmio->bus;
+ BusState *mmio_bus = &mmio_virtio_bus->parent_obj;
+
+ if (QTAILQ_EMPTY(&mmio_bus->children)) {
+ continue;
+ }
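+            /*
+             * The virtio-mmio bus name is expected to end in ".<index>"
+             * (e.g. "virtio-mmio-bus.3"); that index picks the fixed MMIO
+             * window and IRQ used below.
+             */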
+ separator = g_strrstr(mmio_bus->name, ".");
+ if (!separator) {
+ continue;
+ }
+ if (qemu_strtol(separator + 1, NULL, 10, &index) != 0) {
+ continue;
+ }
+
+ uint32_t irq = mms->virtio_irq_base + index;
+ hwaddr base = VIRTIO_MMIO_BASE + index * 512;
+ hwaddr size = 512;
+
+ Aml *dev = aml_device("VR%02u", (unsigned)index);
+ aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0005")));
+ aml_append(dev, aml_name_decl("_UID", aml_int(index)));
+ aml_append(dev, aml_name_decl("_CCA", aml_int(1)));
+
+ Aml *crs = aml_resource_template();
+ aml_append(crs, aml_memory32_fixed(base, size, AML_READ_WRITE));
+ aml_append(crs,
+ aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH,
+ AML_EXCLUSIVE, &irq, 1));
+ aml_append(dev, aml_name_decl("_CRS", crs));
+ aml_append(scope, dev);
+ }
+ }
+}
+
+static void acpi_dsdt_add_xhci(Aml *scope, MicrovmMachineState *mms)
+{
+ if (machine_usb(MACHINE(mms))) {
+ xhci_sysbus_build_aml(scope, MICROVM_XHCI_BASE, MICROVM_XHCI_IRQ);
+ }
+}
+
+static void acpi_dsdt_add_pci(Aml *scope, MicrovmMachineState *mms)
+{
+ if (mms->pcie != ON_OFF_AUTO_ON) {
+ return;
+ }
+
+ acpi_dsdt_add_gpex(scope, &mms->gpex);
+}
+
+static void
+build_dsdt_microvm(GArray *table_data, BIOSLinker *linker,
+ MicrovmMachineState *mms)
+{
+ X86MachineState *x86ms = X86_MACHINE(mms);
+ Aml *dsdt, *sb_scope, *scope, *pkg;
+ bool ambiguous;
+ Object *isabus;
+ AcpiTable table = { .sig = "DSDT", .rev = 2, .oem_id = x86ms->oem_id,
+ .oem_table_id = x86ms->oem_table_id };
+
+ isabus = object_resolve_path_type("", TYPE_ISA_BUS, &ambiguous);
+ assert(isabus);
+ assert(!ambiguous);
+
+ acpi_table_begin(&table, table_data);
+ dsdt = init_aml_allocator();
+
+ sb_scope = aml_scope("_SB");
+ fw_cfg_add_acpi_dsdt(sb_scope, x86ms->fw_cfg);
+ isa_build_aml(ISA_BUS(isabus), sb_scope);
+ build_ged_aml(sb_scope, GED_DEVICE, x86ms->acpi_dev,
+ GED_MMIO_IRQ, AML_SYSTEM_MEMORY, GED_MMIO_BASE);
+ acpi_dsdt_add_power_button(sb_scope);
+ acpi_dsdt_add_virtio(sb_scope, mms);
+ acpi_dsdt_add_xhci(sb_scope, mms);
+ acpi_dsdt_add_pci(sb_scope, mms);
+ aml_append(dsdt, sb_scope);
+
+ /* ACPI 5.0: Table 7-209 System State Package */
+ scope = aml_scope("\\");
+ pkg = aml_package(4);
+ aml_append(pkg, aml_int(ACPI_GED_SLP_TYP_S5));
+ aml_append(pkg, aml_int(0)); /* ignored */
+ aml_append(pkg, aml_int(0)); /* reserved */
+ aml_append(pkg, aml_int(0)); /* reserved */
+ aml_append(scope, aml_name_decl("_S5", pkg));
+ aml_append(dsdt, scope);
+
+ /* copy AML bytecode into ACPI tables blob */
+ g_array_append_vals(table_data, dsdt->buf->data, dsdt->buf->len);
+
+ acpi_table_end(linker, &table);
+ free_aml_allocator();
+}
+
+static void acpi_build_microvm(AcpiBuildTables *tables,
+ MicrovmMachineState *mms)
+{
+ MachineState *machine = MACHINE(mms);
+ X86MachineState *x86ms = X86_MACHINE(mms);
+ GArray *table_offsets;
+ GArray *tables_blob = tables->table_data;
+ unsigned dsdt, xsdt;
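+    /*
+     * Hardware-reduced FADT: no PM1/PM timer/GPE register blocks are
+     * advertised; sleep control/status and reset all go through the GED
+     * MMIO registers filled in below.
+     */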
+ AcpiFadtData pmfadt = {
+ /* ACPI 5.0: 4.1 Hardware-Reduced ACPI */
+ .rev = 5,
+ .flags = ((1 << ACPI_FADT_F_HW_REDUCED_ACPI) |
+ (1 << ACPI_FADT_F_RESET_REG_SUP)),
+
+ /* ACPI 5.0: 4.8.3.7 Sleep Control and Status Registers */
+ .sleep_ctl = {
+ .space_id = AML_AS_SYSTEM_MEMORY,
+ .bit_width = 8,
+ .address = GED_MMIO_BASE_REGS + ACPI_GED_REG_SLEEP_CTL,
+ },
+ .sleep_sts = {
+ .space_id = AML_AS_SYSTEM_MEMORY,
+ .bit_width = 8,
+ .address = GED_MMIO_BASE_REGS + ACPI_GED_REG_SLEEP_STS,
+ },
+
+ /* ACPI 5.0: 4.8.3.6 Reset Register */
+ .reset_reg = {
+ .space_id = AML_AS_SYSTEM_MEMORY,
+ .bit_width = 8,
+ .address = GED_MMIO_BASE_REGS + ACPI_GED_REG_RESET,
+ },
+ .reset_val = ACPI_GED_RESET_VALUE,
+ };
+
+ table_offsets = g_array_new(false, true /* clear */,
+ sizeof(uint32_t));
+ bios_linker_loader_alloc(tables->linker,
+ ACPI_BUILD_TABLE_FILE, tables_blob,
+ 64 /* Ensure FACS is aligned */,
+ false /* high memory */);
+
+ dsdt = tables_blob->len;
+ build_dsdt_microvm(tables_blob, tables->linker, mms);
+
+ pmfadt.dsdt_tbl_offset = &dsdt;
+ pmfadt.xdsdt_tbl_offset = &dsdt;
+ acpi_add_table(table_offsets, tables_blob);
+ build_fadt(tables_blob, tables->linker, &pmfadt, x86ms->oem_id,
+ x86ms->oem_table_id);
+
+ acpi_add_table(table_offsets, tables_blob);
+ acpi_build_madt(tables_blob, tables->linker, X86_MACHINE(machine),
+ ACPI_DEVICE_IF(x86ms->acpi_dev), x86ms->oem_id,
+ x86ms->oem_table_id);
+
+ xsdt = tables_blob->len;
+ build_xsdt(tables_blob, tables->linker, table_offsets, x86ms->oem_id,
+ x86ms->oem_table_id);
+
+ /* RSDP is in FSEG memory, so allocate it separately */
+ {
+ AcpiRsdpData rsdp_data = {
+ /* ACPI 2.0: 5.2.4.3 RSDP Structure */
+ .revision = 2, /* xsdt needs v2 */
+ .oem_id = x86ms->oem_id,
+ .xsdt_tbl_offset = &xsdt,
+ .rsdt_tbl_offset = NULL,
+ };
+ build_rsdp(tables->rsdp, tables->linker, &rsdp_data);
+ }
+
+ /* Cleanup memory that's no longer used. */
+ g_array_free(table_offsets, true);
+}
+
+static void acpi_build_no_update(void *build_opaque)
+{
+ /* nothing, microvm tables don't change at runtime */
+}
+
+void acpi_setup_microvm(MicrovmMachineState *mms)
+{
+ X86MachineState *x86ms = X86_MACHINE(mms);
+ AcpiBuildTables tables;
+
+ assert(x86ms->fw_cfg);
+
+ if (!x86_machine_is_acpi_enabled(x86ms)) {
+ return;
+ }
+
+ acpi_build_tables_init(&tables);
+ acpi_build_microvm(&tables, mms);
+
+ /* Now expose it all to Guest */
+ acpi_add_rom_blob(acpi_build_no_update, NULL, tables.table_data,
+ ACPI_BUILD_TABLE_FILE);
+ acpi_add_rom_blob(acpi_build_no_update, NULL, tables.linker->cmd_blob,
+ ACPI_BUILD_LOADER_FILE);
+ acpi_add_rom_blob(acpi_build_no_update, NULL, tables.rsdp,
+ ACPI_BUILD_RSDP_FILE);
+
+ acpi_build_tables_cleanup(&tables, false);
+}
diff --git a/hw/i386/acpi-microvm.h b/hw/i386/acpi-microvm.h
new file mode 100644
index 000000000..dfe853690
--- /dev/null
+++ b/hw/i386/acpi-microvm.h
@@ -0,0 +1,8 @@
+#ifndef HW_I386_ACPI_MICROVM_H
+#define HW_I386_ACPI_MICROVM_H
+
+#include "hw/i386/microvm.h"
+
+void acpi_setup_microvm(MicrovmMachineState *mms);
+
+#endif
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
new file mode 100644
index 000000000..91fe34ae5
--- /dev/null
+++ b/hw/i386/amd_iommu.c
@@ -0,0 +1,1662 @@
+/*
+ * QEMU emulation of AMD IOMMU (AMD-Vi)
+ *
+ * Copyright (C) 2011 Eduard - Gabriel Munteanu
+ * Copyright (C) 2015, 2016 David Kiarie Kahurani
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Cache implementation inspired by hw/i386/intel_iommu.c
+ */
+
+#include "qemu/osdep.h"
+#include "hw/i386/pc.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/pci_bus.h"
+#include "migration/vmstate.h"
+#include "amd_iommu.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "hw/i386/apic_internal.h"
+#include "trace.h"
+#include "hw/i386/apic-msidef.h"
+
+/* used AMD-Vi MMIO registers */
+const char *amdvi_mmio_low[] = {
+ "AMDVI_MMIO_DEVTAB_BASE",
+ "AMDVI_MMIO_CMDBUF_BASE",
+ "AMDVI_MMIO_EVTLOG_BASE",
+ "AMDVI_MMIO_CONTROL",
+ "AMDVI_MMIO_EXCL_BASE",
+ "AMDVI_MMIO_EXCL_LIMIT",
+ "AMDVI_MMIO_EXT_FEATURES",
+ "AMDVI_MMIO_PPR_BASE",
+ "UNHANDLED"
+};
+const char *amdvi_mmio_high[] = {
+ "AMDVI_MMIO_COMMAND_HEAD",
+ "AMDVI_MMIO_COMMAND_TAIL",
+ "AMDVI_MMIO_EVTLOG_HEAD",
+ "AMDVI_MMIO_EVTLOG_TAIL",
+ "AMDVI_MMIO_STATUS",
+ "AMDVI_MMIO_PPR_HEAD",
+ "AMDVI_MMIO_PPR_TAIL",
+ "UNHANDLED"
+};
+
+struct AMDVIAddressSpace {
+ uint8_t bus_num; /* bus number */
+ uint8_t devfn; /* device function */
+ AMDVIState *iommu_state; /* AMDVI - one per machine */
+ MemoryRegion root; /* AMDVI Root memory map region */
+ IOMMUMemoryRegion iommu; /* Device's address translation region */
+ MemoryRegion iommu_ir; /* Device's interrupt remapping region */
+ AddressSpace as; /* device's corresponding address space */
+};
+
+/* AMDVI cache entry */
+typedef struct AMDVIIOTLBEntry {
+ uint16_t domid; /* assigned domain id */
+ uint16_t devid; /* device owning entry */
+ uint64_t perms; /* access permissions */
+ uint64_t translated_addr; /* translated address */
+ uint64_t page_mask; /* physical page size */
+} AMDVIIOTLBEntry;
+
+/* configure MMIO registers at startup/reset */
+static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val,
+ uint64_t romask, uint64_t w1cmask)
+{
+ stq_le_p(&s->mmior[addr], val);
+ stq_le_p(&s->romask[addr], romask);
+ stq_le_p(&s->w1cmask[addr], w1cmask);
+}
+
+static uint16_t amdvi_readw(AMDVIState *s, hwaddr addr)
+{
+ return lduw_le_p(&s->mmior[addr]);
+}
+
+static uint32_t amdvi_readl(AMDVIState *s, hwaddr addr)
+{
+ return ldl_le_p(&s->mmior[addr]);
+}
+
+static uint64_t amdvi_readq(AMDVIState *s, hwaddr addr)
+{
+ return ldq_le_p(&s->mmior[addr]);
+}
+
+/* internal write */
+static void amdvi_writeq_raw(AMDVIState *s, hwaddr addr, uint64_t val)
+{
+ stq_le_p(&s->mmior[addr], val);
+}
+
+/*
+ * external write: bits set in romask are read-only and keep their old
+ * value, bits set in w1cmask are write-1-to-clear
+ */
+static void amdvi_writew(AMDVIState *s, hwaddr addr, uint16_t val)
+{
+ uint16_t romask = lduw_le_p(&s->romask[addr]);
+ uint16_t w1cmask = lduw_le_p(&s->w1cmask[addr]);
+ uint16_t oldval = lduw_le_p(&s->mmior[addr]);
+ stw_le_p(&s->mmior[addr],
+ ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
+}
+
+static void amdvi_writel(AMDVIState *s, hwaddr addr, uint32_t val)
+{
+ uint32_t romask = ldl_le_p(&s->romask[addr]);
+ uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
+ uint32_t oldval = ldl_le_p(&s->mmior[addr]);
+ stl_le_p(&s->mmior[addr],
+ ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
+}
+
+static void amdvi_writeq(AMDVIState *s, hwaddr addr, uint64_t val)
+{
+ uint64_t romask = ldq_le_p(&s->romask[addr]);
+ uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
+    uint64_t oldval = ldq_le_p(&s->mmior[addr]);
+ stq_le_p(&s->mmior[addr],
+ ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
+}
+
+/* OR a 64-bit register with a 64-bit value */
+static bool amdvi_test_mask(AMDVIState *s, hwaddr addr, uint64_t val)
+{
+ return amdvi_readq(s, addr) | val;
+}
+
+/* OR a 64-bit register with a 64-bit value storing result in the register */
+static void amdvi_assign_orq(AMDVIState *s, hwaddr addr, uint64_t val)
+{
+ amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) | val);
+}
+
+/* AND a 64-bit register with a 64-bit value storing result in the register */
+static void amdvi_assign_andq(AMDVIState *s, hwaddr addr, uint64_t val)
+{
+ amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) & val);
+}
+
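+/*
+ * Raise the IOMMU's own MSI, if enabled on its PCI function, by writing
+ * the MSI message data to the message address via system memory.
+ */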
+static void amdvi_generate_msi_interrupt(AMDVIState *s)
+{
+ MSIMessage msg = {};
+ MemTxAttrs attrs = {
+ .requester_id = pci_requester_id(&s->pci.dev)
+ };
+
+ if (msi_enabled(&s->pci.dev)) {
+ msg = msi_get_message(&s->pci.dev, 0);
+ address_space_stl_le(&address_space_memory, msg.address, msg.data,
+ attrs, NULL);
+ }
+}
+
+static void amdvi_log_event(AMDVIState *s, uint64_t *evt)
+{
+ /* event logging not enabled */
+ if (!s->evtlog_enabled || amdvi_test_mask(s, AMDVI_MMIO_STATUS,
+ AMDVI_MMIO_STATUS_EVT_OVF)) {
+ return;
+ }
+
+ /* event log buffer full */
+ if (s->evtlog_tail >= s->evtlog_len) {
+ amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF);
+ /* generate interrupt */
+ amdvi_generate_msi_interrupt(s);
+ return;
+ }
+
+ if (dma_memory_write(&address_space_memory, s->evtlog + s->evtlog_tail,
+ evt, AMDVI_EVENT_LEN)) {
+ trace_amdvi_evntlog_fail(s->evtlog, s->evtlog_tail);
+ }
+
+ s->evtlog_tail += AMDVI_EVENT_LEN;
+ amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT);
+ amdvi_generate_msi_interrupt(s);
+}
+
+static void amdvi_setevent_bits(uint64_t *buffer, uint64_t value, int start,
+ int length)
+{
+ int index = start / 64, bitpos = start % 64;
+ uint64_t mask = MAKE_64BIT_MASK(start, length);
+ buffer[index] &= ~mask;
+ buffer[index] |= (value << bitpos) & mask;
+}
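+
+/*
+ * Illustration with made-up arguments: amdvi_setevent_bits(evt, 0x3, 60, 4)
+ * computes index = 60 / 64 = 0 and bitpos = 60, builds a mask covering bits
+ * 60..63, and replaces only that nibble of evt[0] while preserving every
+ * other bit of the event.
+ */
+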
+/*
+ * AMDVi event structure
+ * 0:15 -> DeviceID
+ * 55:63 -> event type + miscellaneous info
+ * 63:127 -> related address
+ */
+static void amdvi_encode_event(uint64_t *evt, uint16_t devid, uint64_t addr,
+ uint16_t info)
+{
+ amdvi_setevent_bits(evt, devid, 0, 16);
+ amdvi_setevent_bits(evt, info, 55, 8);
+ amdvi_setevent_bits(evt, addr, 63, 64);
+}
+/* log an error encountered during a page walk
+ *
+ * @addr: virtual address in translation request
+ */
+static void amdvi_page_fault(AMDVIState *s, uint16_t devid,
+ hwaddr addr, uint16_t info)
+{
+ uint64_t evt[4];
+
+ info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF;
+ amdvi_encode_event(evt, devid, addr, info);
+ amdvi_log_event(s, evt);
+ pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
+ PCI_STATUS_SIG_TARGET_ABORT);
+}
+/*
+ * log a master abort accessing device table
+ * @devtab : address of device table entry
+ * @info : error flags
+ */
+static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid,
+ hwaddr devtab, uint16_t info)
+{
+ uint64_t evt[4];
+
+ info |= AMDVI_EVENT_DEV_TAB_HW_ERROR;
+
+ amdvi_encode_event(evt, devid, devtab, info);
+ amdvi_log_event(s, evt);
+ pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
+ PCI_STATUS_SIG_TARGET_ABORT);
+}
+/* log an event trying to access command buffer
+ * @addr : address that couldn't be accessed
+ */
+static void amdvi_log_command_error(AMDVIState *s, hwaddr addr)
+{
+ uint64_t evt[4], info = AMDVI_EVENT_COMMAND_HW_ERROR;
+
+ amdvi_encode_event(evt, 0, addr, info);
+ amdvi_log_event(s, evt);
+ pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
+ PCI_STATUS_SIG_TARGET_ABORT);
+}
+/* log an illegal command event
+ * @addr : address of illegal command
+ */
+static void amdvi_log_illegalcom_error(AMDVIState *s, uint16_t info,
+ hwaddr addr)
+{
+ uint64_t evt[4];
+
+ info |= AMDVI_EVENT_ILLEGAL_COMMAND_ERROR;
+ amdvi_encode_event(evt, 0, addr, info);
+ amdvi_log_event(s, evt);
+}
+/* log an error accessing device table
+ *
+ * @devid : device owning the table entry
+ * @devtab : address of device table entry
+ * @info : error flags
+ */
+static void amdvi_log_illegaldevtab_error(AMDVIState *s, uint16_t devid,
+ hwaddr addr, uint16_t info)
+{
+ uint64_t evt[4];
+
+ info |= AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY;
+ amdvi_encode_event(evt, devid, addr, info);
+ amdvi_log_event(s, evt);
+}
+/* log an error accessing a PTE entry
+ * @addr : address that couldn't be accessed
+ */
+static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid,
+ hwaddr addr, uint16_t info)
+{
+ uint64_t evt[4];
+
+ info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR;
+ amdvi_encode_event(evt, devid, addr, info);
+ amdvi_log_event(s, evt);
+ pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
+ PCI_STATUS_SIG_TARGET_ABORT);
+}
+
+static gboolean amdvi_uint64_equal(gconstpointer v1, gconstpointer v2)
+{
+ return *((const uint64_t *)v1) == *((const uint64_t *)v2);
+}
+
+static guint amdvi_uint64_hash(gconstpointer v)
+{
+ return (guint)*(const uint64_t *)v;
+}
+
+static AMDVIIOTLBEntry *amdvi_iotlb_lookup(AMDVIState *s, hwaddr addr,
+ uint64_t devid)
+{
+ uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) |
+ ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
+ return g_hash_table_lookup(s->iotlb, &key);
+}
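+
+/*
+ * The IOTLB key used here (and when inserting entries in amdvi_update_iotlb()
+ * below) packs the guest frame number and the requester id into one 64-bit
+ * value. With made-up values, devid 0x0010 and addr 0x12345000 give
+ * key = (0x12345000 >> 12) | (0x10ULL << AMDVI_DEVID_SHIFT)
+ *     = 0x12345 | (0x10ULL << 36) = 0x0000010000012345.
+ */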
+
+static void amdvi_iotlb_reset(AMDVIState *s)
+{
+ assert(s->iotlb);
+ trace_amdvi_iotlb_reset();
+ g_hash_table_remove_all(s->iotlb);
+}
+
+static gboolean amdvi_iotlb_remove_by_devid(gpointer key, gpointer value,
+ gpointer user_data)
+{
+ AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value;
+ uint16_t devid = *(uint16_t *)user_data;
+ return entry->devid == devid;
+}
+
+static void amdvi_iotlb_remove_page(AMDVIState *s, hwaddr addr,
+ uint64_t devid)
+{
+ uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) |
+ ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
+ g_hash_table_remove(s->iotlb, &key);
+}
+
+static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid,
+ uint64_t gpa, IOMMUTLBEntry to_cache,
+ uint16_t domid)
+{
+ AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1);
+ uint64_t *key = g_new(uint64_t, 1);
+ uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K;
+
+ /* don't cache erroneous translations */
+ if (to_cache.perm != IOMMU_NONE) {
+ trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid),
+ PCI_FUNC(devid), gpa, to_cache.translated_addr);
+
+ if (g_hash_table_size(s->iotlb) >= AMDVI_IOTLB_MAX_SIZE) {
+ amdvi_iotlb_reset(s);
+ }
+
+ entry->domid = domid;
+ entry->perms = to_cache.perm;
+ entry->translated_addr = to_cache.translated_addr;
+ entry->page_mask = to_cache.addr_mask;
+ *key = gfn | ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
+ g_hash_table_replace(s->iotlb, key, entry);
+ }
+}
+
+static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd)
+{
+ /* the store address occupies bits 3:51; the low 3 bits are always zero */
+ hwaddr addr = cpu_to_le64(extract64(cmd[0], 3, 49)) << 3;
+ uint64_t data = cpu_to_le64(cmd[1]);
+
+ if (extract64(cmd[0], 52, 8)) {
+ amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
+ s->cmdbuf + s->cmdbuf_head);
+ }
+ if (extract64(cmd[0], 0, 1)) {
+ if (dma_memory_write(&address_space_memory, addr, &data,
+ AMDVI_COMPLETION_DATA_SIZE)) {
+ trace_amdvi_completion_wait_fail(addr);
+ }
+ }
+ /* set completion interrupt */
+ if (extract64(cmd[0], 1, 1)) {
+ amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT);
+ /* generate interrupt */
+ amdvi_generate_msi_interrupt(s);
+ }
+ trace_amdvi_completion_wait(addr, data);
+}
+
+/* log error without aborting since linux seems to be using reserved bits */
+static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd)
+{
+ uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16));
+
+ /* This command should invalidate internal caches, of which this emulation has none */
+ if (extract64(cmd[0], 16, 44) || cmd[1]) {
+ amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
+ s->cmdbuf + s->cmdbuf_head);
+ }
+ trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid),
+ PCI_FUNC(devid));
+}
+
+static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd)
+{
+ if (extract64(cmd[0], 16, 16) || extract64(cmd[0], 52, 8) ||
+ extract64(cmd[1], 0, 2) || extract64(cmd[1], 3, 29)
+ || extract64(cmd[1], 48, 16)) {
+ amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
+ s->cmdbuf + s->cmdbuf_head);
+ }
+ trace_amdvi_ppr_exec();
+}
+
+static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd)
+{
+ if (extract64(cmd[0], 0, 60) || cmd[1]) {
+ amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
+ s->cmdbuf + s->cmdbuf_head);
+ }
+
+ amdvi_iotlb_reset(s);
+ trace_amdvi_all_inval();
+}
+
+static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value,
+ gpointer user_data)
+{
+ AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value;
+ uint16_t domid = *(uint16_t *)user_data;
+ return entry->domid == domid;
+}
+
+/* the command gives a domain id and address, but the IOTLB key needs a devid,
+ * so we cannot remove individual pages; drop all entries for the domain instead
+ */
+static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd)
+{
+ uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16));
+
+ if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 48, 12) ||
+ extract64(cmd[1], 3, 9)) {
+ amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
+ s->cmdbuf + s->cmdbuf_head);
+ }
+
+ g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid,
+ &domid);
+ trace_amdvi_pages_inval(domid);
+}
+
+static void amdvi_prefetch_pages(AMDVIState *s, uint64_t *cmd)
+{
+ if (extract64(cmd[0], 16, 8) || extract64(cmd[0], 52, 8) ||
+ extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) ||
+ extract64(cmd[1], 5, 7)) {
+ amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
+ s->cmdbuf + s->cmdbuf_head);
+ }
+
+ trace_amdvi_prefetch_pages();
+}
+
+static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd)
+{
+ if (extract64(cmd[0], 16, 44) || cmd[1]) {
+ amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
+ s->cmdbuf + s->cmdbuf_head);
+ return;
+ }
+
+ trace_amdvi_intr_inval();
+}
+
+/* FIXME: Try to work with the specified size instead of all the pages
+ * when the S bit is on
+ */
+static void iommu_inval_iotlb(AMDVIState *s, uint64_t *cmd)
+{
+
+ uint16_t devid = extract64(cmd[0], 0, 16);
+ if (extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) ||
+ extract64(cmd[1], 6, 6)) {
+ amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
+ s->cmdbuf + s->cmdbuf_head);
+ return;
+ }
+
+ if (extract64(cmd[1], 0, 1)) {
+ g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_devid,
+ &devid);
+ } else {
+ amdvi_iotlb_remove_page(s, cpu_to_le64(extract64(cmd[1], 12, 52)) << 12,
+ cpu_to_le16(extract64(cmd[1], 0, 16)));
+ }
+ trace_amdvi_iotlb_inval();
+}
+
+/* not honouring reserved bits is regarded as an illegal command */
+static void amdvi_cmdbuf_exec(AMDVIState *s)
+{
+ uint64_t cmd[2];
+
+ if (dma_memory_read(&address_space_memory, s->cmdbuf + s->cmdbuf_head,
+ cmd, AMDVI_COMMAND_SIZE)) {
+ trace_amdvi_command_read_fail(s->cmdbuf, s->cmdbuf_head);
+ amdvi_log_command_error(s, s->cmdbuf + s->cmdbuf_head);
+ return;
+ }
+
+ switch (extract64(cmd[0], 60, 4)) {
+ case AMDVI_CMD_COMPLETION_WAIT:
+ amdvi_completion_wait(s, cmd);
+ break;
+ case AMDVI_CMD_INVAL_DEVTAB_ENTRY:
+ amdvi_inval_devtab_entry(s, cmd);
+ break;
+ case AMDVI_CMD_INVAL_AMDVI_PAGES:
+ amdvi_inval_pages(s, cmd);
+ break;
+ case AMDVI_CMD_INVAL_IOTLB_PAGES:
+ iommu_inval_iotlb(s, cmd);
+ break;
+ case AMDVI_CMD_INVAL_INTR_TABLE:
+ amdvi_inval_inttable(s, cmd);
+ break;
+ case AMDVI_CMD_PREFETCH_AMDVI_PAGES:
+ amdvi_prefetch_pages(s, cmd);
+ break;
+ case AMDVI_CMD_COMPLETE_PPR_REQUEST:
+ amdvi_complete_ppr(s, cmd);
+ break;
+ case AMDVI_CMD_INVAL_AMDVI_ALL:
+ amdvi_inval_all(s, cmd);
+ break;
+ default:
+ trace_amdvi_unhandled_command(extract64(cmd[1], 60, 4));
+ /* log illegal command */
+ amdvi_log_illegalcom_error(s, extract64(cmd[1], 60, 4),
+ s->cmdbuf + s->cmdbuf_head);
+ }
+}
+
+static void amdvi_cmdbuf_run(AMDVIState *s)
+{
+ if (!s->cmdbuf_enabled) {
+ trace_amdvi_command_error(amdvi_readq(s, AMDVI_MMIO_CONTROL));
+ return;
+ }
+
+ /* check if there is work to do. */
+ while (s->cmdbuf_head != s->cmdbuf_tail) {
+ trace_amdvi_command_exec(s->cmdbuf_head, s->cmdbuf_tail, s->cmdbuf);
+ amdvi_cmdbuf_exec(s);
+ s->cmdbuf_head += AMDVI_COMMAND_SIZE;
+ amdvi_writeq_raw(s, AMDVI_MMIO_COMMAND_HEAD, s->cmdbuf_head);
+
+ /* wrap head pointer */
+ if (s->cmdbuf_head >= s->cmdbuf_len * AMDVI_COMMAND_SIZE) {
+ s->cmdbuf_head = 0;
+ }
+ }
+}
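+
+/*
+ * Example of the wrap condition above with a made-up size: a command buffer
+ * of 256 entries (cmdbuf_len == 256) spans 256 * AMDVI_COMMAND_SIZE = 4096
+ * bytes, so the head offset wraps back to 0 once it reaches 4096.
+ */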
+
+static void amdvi_mmio_trace(hwaddr addr, unsigned size)
+{
+ uint8_t index = (addr & ~0x2000) / 8;
+
+ if ((addr & 0x2000)) {
+ /* high table */
+ index = index >= AMDVI_MMIO_REGS_HIGH ? AMDVI_MMIO_REGS_HIGH : index;
+ trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07);
+ } else {
+ index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index;
+ trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07);
+ }
+}
+
+static uint64_t amdvi_mmio_read(void *opaque, hwaddr addr, unsigned size)
+{
+ AMDVIState *s = opaque;
+
+ uint64_t val = -1;
+ if (addr + size > AMDVI_MMIO_SIZE) {
+ trace_amdvi_mmio_read_invalid(AMDVI_MMIO_SIZE, addr, size);
+ return (uint64_t)-1;
+ }
+
+ if (size == 2) {
+ val = amdvi_readw(s, addr);
+ } else if (size == 4) {
+ val = amdvi_readl(s, addr);
+ } else if (size == 8) {
+ val = amdvi_readq(s, addr);
+ }
+ amdvi_mmio_trace(addr, size);
+
+ return val;
+}
+
+static void amdvi_handle_control_write(AMDVIState *s)
+{
+ unsigned long control = amdvi_readq(s, AMDVI_MMIO_CONTROL);
+ s->enabled = !!(control & AMDVI_MMIO_CONTROL_AMDVIEN);
+
+ s->ats_enabled = !!(control & AMDVI_MMIO_CONTROL_HTTUNEN);
+ s->evtlog_enabled = s->enabled && !!(control &
+ AMDVI_MMIO_CONTROL_EVENTLOGEN);
+
+ s->evtlog_intr = !!(control & AMDVI_MMIO_CONTROL_EVENTINTEN);
+ s->completion_wait_intr = !!(control & AMDVI_MMIO_CONTROL_COMWAITINTEN);
+ s->cmdbuf_enabled = s->enabled && !!(control &
+ AMDVI_MMIO_CONTROL_CMDBUFLEN);
+ s->ga_enabled = !!(control & AMDVI_MMIO_CONTROL_GAEN);
+
+ /* update the flags depending on the control register */
+ if (s->cmdbuf_enabled) {
+ amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_CMDBUF_RUN);
+ } else {
+ amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_CMDBUF_RUN);
+ }
+ if (s->evtlog_enabled) {
+ amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_RUN);
+ } else {
+ amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_EVT_RUN);
+ }
+
+ trace_amdvi_control_status(control);
+ amdvi_cmdbuf_run(s);
+}
+
+static inline void amdvi_handle_devtab_write(AMDVIState *s)
+{
+ uint64_t val = amdvi_readq(s, AMDVI_MMIO_DEVICE_TABLE);
+ s->devtab = (val & AMDVI_MMIO_DEVTAB_BASE_MASK);
+
+ /* set device table length */
+ s->devtab_len = ((val & AMDVI_MMIO_DEVTAB_SIZE_MASK) + 1) *
+ (AMDVI_MMIO_DEVTAB_SIZE_UNIT /
+ AMDVI_MMIO_DEVTAB_ENTRY_SIZE);
+}
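+
+/*
+ * Example with a made-up register value: a Device Table Size field of 1
+ * selects a (1 + 1) * 4K = 8K table, so the computation above yields
+ * (1 + 1) * (4096 / 32) = 256 device table entries.
+ */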
+
+static inline void amdvi_handle_cmdhead_write(AMDVIState *s)
+{
+ s->cmdbuf_head = amdvi_readq(s, AMDVI_MMIO_COMMAND_HEAD)
+ & AMDVI_MMIO_CMDBUF_HEAD_MASK;
+ amdvi_cmdbuf_run(s);
+}
+
+static inline void amdvi_handle_cmdbase_write(AMDVIState *s)
+{
+ s->cmdbuf = amdvi_readq(s, AMDVI_MMIO_COMMAND_BASE)
+ & AMDVI_MMIO_CMDBUF_BASE_MASK;
+ s->cmdbuf_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_CMDBUF_SIZE_BYTE)
+ & AMDVI_MMIO_CMDBUF_SIZE_MASK);
+ s->cmdbuf_head = s->cmdbuf_tail = 0;
+}
+
+static inline void amdvi_handle_cmdtail_write(AMDVIState *s)
+{
+ s->cmdbuf_tail = amdvi_readq(s, AMDVI_MMIO_COMMAND_TAIL)
+ & AMDVI_MMIO_CMDBUF_TAIL_MASK;
+ amdvi_cmdbuf_run(s);
+}
+
+static inline void amdvi_handle_excllim_write(AMDVIState *s)
+{
+ uint64_t val = amdvi_readq(s, AMDVI_MMIO_EXCL_LIMIT);
+ s->excl_limit = (val & AMDVI_MMIO_EXCL_LIMIT_MASK) |
+ AMDVI_MMIO_EXCL_LIMIT_LOW;
+}
+
+static inline void amdvi_handle_evtbase_write(AMDVIState *s)
+{
+ uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_BASE);
+ s->evtlog = val & AMDVI_MMIO_EVTLOG_BASE_MASK;
+ s->evtlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_EVTLOG_SIZE_BYTE)
+ & AMDVI_MMIO_EVTLOG_SIZE_MASK);
+}
+
+static inline void amdvi_handle_evttail_write(AMDVIState *s)
+{
+ uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_TAIL);
+ s->evtlog_tail = val & AMDVI_MMIO_EVTLOG_TAIL_MASK;
+}
+
+static inline void amdvi_handle_evthead_write(AMDVIState *s)
+{
+ uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_HEAD);
+ s->evtlog_head = val & AMDVI_MMIO_EVTLOG_HEAD_MASK;
+}
+
+static inline void amdvi_handle_pprbase_write(AMDVIState *s)
+{
+ uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_BASE);
+ s->ppr_log = val & AMDVI_MMIO_PPRLOG_BASE_MASK;
+ s->pprlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_PPRLOG_SIZE_BYTE)
+ & AMDVI_MMIO_PPRLOG_SIZE_MASK);
+}
+
+static inline void amdvi_handle_pprhead_write(AMDVIState *s)
+{
+ uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_HEAD);
+ s->pprlog_head = val & AMDVI_MMIO_PPRLOG_HEAD_MASK;
+}
+
+static inline void amdvi_handle_pprtail_write(AMDVIState *s)
+{
+ uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_TAIL);
+ s->pprlog_tail = val & AMDVI_MMIO_PPRLOG_TAIL_MASK;
+}
+
+/* FIXME: something might go wrong if system software writes these registers
+ * in chunks of one byte. Linux writes in chunks of 4 bytes, so this
+ * currently works correctly with Linux, but it will definitely be broken
+ * if software reads/writes 8 bytes at a time.
+ */
+static void amdvi_mmio_reg_write(AMDVIState *s, unsigned size, uint64_t val,
+ hwaddr addr)
+{
+ if (size == 2) {
+ amdvi_writew(s, addr, val);
+ } else if (size == 4) {
+ amdvi_writel(s, addr, val);
+ } else if (size == 8) {
+ amdvi_writeq(s, addr, val);
+ }
+}
+
+static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val,
+ unsigned size)
+{
+ AMDVIState *s = opaque;
+ unsigned long offset = addr & 0x07;
+
+ if (addr + size > AMDVI_MMIO_SIZE) {
+ trace_amdvi_mmio_write("error: addr outside region: max ",
+ (uint64_t)AMDVI_MMIO_SIZE, size, val, offset);
+ return;
+ }
+
+ amdvi_mmio_trace(addr, size);
+ switch (addr & ~0x07) {
+ case AMDVI_MMIO_CONTROL:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ amdvi_handle_control_write(s);
+ break;
+ case AMDVI_MMIO_DEVICE_TABLE:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ /* set device table address
+ * This also suffers from the inability to tell whether software
+ * has finished writing.
+ */
+ if (offset || (size == 8)) {
+ amdvi_handle_devtab_write(s);
+ }
+ break;
+ case AMDVI_MMIO_COMMAND_HEAD:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ amdvi_handle_cmdhead_write(s);
+ break;
+ case AMDVI_MMIO_COMMAND_BASE:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ /* FIXME - make sure system software has finished writing, in case
+ * it writes in chunks of less than 8 bytes, in a robust way. For
+ * now, this hack works for the Linux driver.
+ */
+ if (offset || (size == 8)) {
+ amdvi_handle_cmdbase_write(s);
+ }
+ break;
+ case AMDVI_MMIO_COMMAND_TAIL:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ amdvi_handle_cmdtail_write(s);
+ break;
+ case AMDVI_MMIO_EVENT_BASE:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ amdvi_handle_evtbase_write(s);
+ break;
+ case AMDVI_MMIO_EVENT_HEAD:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ amdvi_handle_evthead_write(s);
+ break;
+ case AMDVI_MMIO_EVENT_TAIL:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ amdvi_handle_evttail_write(s);
+ break;
+ case AMDVI_MMIO_EXCL_LIMIT:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ amdvi_handle_excllim_write(s);
+ break;
+ /* PPR log base - unused for now */
+ case AMDVI_MMIO_PPR_BASE:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ amdvi_handle_pprbase_write(s);
+ break;
+ /* PPR log head - also unused for now */
+ case AMDVI_MMIO_PPR_HEAD:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ amdvi_handle_pprhead_write(s);
+ break;
+ /* PPR log tail - unused for now */
+ case AMDVI_MMIO_PPR_TAIL:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ amdvi_handle_pprtail_write(s);
+ break;
+ }
+}
+
+static inline uint64_t amdvi_get_perms(uint64_t entry)
+{
+ return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >>
+ AMDVI_DEV_PERM_SHIFT;
+}
+
+/* validate that reserved bits are honoured */
+static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid,
+ uint64_t *dte)
+{
+ if ((dte[0] & AMDVI_DTE_LOWER_QUAD_RESERVED)
+ || (dte[1] & AMDVI_DTE_MIDDLE_QUAD_RESERVED)
+ || (dte[2] & AMDVI_DTE_UPPER_QUAD_RESERVED) || dte[3]) {
+ amdvi_log_illegaldevtab_error(s, devid,
+ s->devtab +
+ devid * AMDVI_DEVTAB_ENTRY_SIZE, 0);
+ return false;
+ }
+
+ return true;
+}
+
+/* get a device table entry given the devid */
+static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry)
+{
+ uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE;
+
+ if (dma_memory_read(&address_space_memory, s->devtab + offset, entry,
+ AMDVI_DEVTAB_ENTRY_SIZE)) {
+ trace_amdvi_dte_get_fail(s->devtab, offset);
+ /* log error accessing dte */
+ amdvi_log_devtab_error(s, devid, s->devtab + offset, 0);
+ return false;
+ }
+
+ *entry = le64_to_cpu(*entry);
+ if (!amdvi_validate_dte(s, devid, entry)) {
+ trace_amdvi_invalid_dte(entry[0]);
+ return false;
+ }
+
+ return true;
+}
+
+/* get pte translation mode */
+static inline uint8_t get_pte_translation_mode(uint64_t pte)
+{
+ return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK;
+}
+
+static inline uint64_t pte_override_page_mask(uint64_t pte)
+{
+ uint8_t page_mask = 13;
+ uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) >> 12;
+ /* find the first zero bit */
+ while (addr & 1) {
+ page_mask++;
+ addr = addr >> 1;
+ }
+
+ return ~((1ULL << page_mask) - 1);
+}
+
+static inline uint64_t pte_get_page_mask(uint64_t oldlevel)
+{
+ return ~((1ULL << ((oldlevel * 9) + 3)) - 1);
+}
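+
+/*
+ * For example, a PTE found one level above the data page (oldlevel == 1)
+ * yields a mask over 9 + 3 = 12 bits, i.e. a 4K page, while oldlevel == 2
+ * yields a 21-bit (2M) mask, following the 9-bits-per-level AMD-Vi table
+ * layout.
+ */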
+
+static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
+ uint16_t devid)
+{
+ uint64_t pte;
+
+ if (dma_memory_read(&address_space_memory, pte_addr, &pte, sizeof(pte))) {
+ trace_amdvi_get_pte_hwerror(pte_addr);
+ amdvi_log_pagetab_error(s, devid, pte_addr, 0);
+ pte = 0;
+ return pte;
+ }
+
+ pte = le64_to_cpu(pte);
+ return pte;
+}
+
+static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
+ IOMMUTLBEntry *ret, unsigned perms,
+ hwaddr addr)
+{
+ unsigned level, present, pte_perms, oldlevel;
+ uint64_t pte = dte[0], pte_addr, page_mask;
+
+ /* make sure the DTE has TV = 1 */
+ if (pte & AMDVI_DEV_TRANSLATION_VALID) {
+ level = get_pte_translation_mode(pte);
+ if (level >= 7) {
+ trace_amdvi_mode_invalid(level, addr);
+ return;
+ }
+ if (level == 0) {
+ goto no_remap;
+ }
+
+ /* walk the table until we reach a leaf PTE or a PTE encoding a huge page */
+ while (level > 0) {
+ pte_perms = amdvi_get_perms(pte);
+ present = pte & 1;
+ if (!present || perms != (perms & pte_perms)) {
+ amdvi_page_fault(as->iommu_state, as->devfn, addr, perms);
+ trace_amdvi_page_fault(addr);
+ return;
+ }
+
+ /* go to the next lower level */
+ pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK;
+ /* add offset and load pte */
+ pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3;
+ pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn);
+ if (!pte) {
+ return;
+ }
+ oldlevel = level;
+ level = get_pte_translation_mode(pte);
+ if (level == 0x7) {
+ break;
+ }
+ }
+
+ if (level == 0x7) {
+ page_mask = pte_override_page_mask(pte);
+ } else {
+ page_mask = pte_get_page_mask(oldlevel);
+ }
+
+ /* get access permissions from pte */
+ ret->iova = addr & page_mask;
+ ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask;
+ ret->addr_mask = ~page_mask;
+ ret->perm = amdvi_get_perms(pte);
+ return;
+ }
+no_remap:
+ ret->iova = addr & AMDVI_PAGE_MASK_4K;
+ ret->translated_addr = addr & AMDVI_PAGE_MASK_4K;
+ ret->addr_mask = ~AMDVI_PAGE_MASK_4K;
+ ret->perm = amdvi_get_perms(pte);
+}
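+
+/*
+ * For illustration: at level == 2 the table index computed in the walk above
+ * is (addr >> 21) & 0x1FF, i.e. bits 21..29 of the untranslated address,
+ * selecting one of the 512 entries of that level's table.
+ */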
+
+static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr,
+ bool is_write, IOMMUTLBEntry *ret)
+{
+ AMDVIState *s = as->iommu_state;
+ uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn);
+ AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid);
+ uint64_t entry[4];
+
+ if (iotlb_entry) {
+ trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid),
+ PCI_FUNC(devid), addr, iotlb_entry->translated_addr);
+ ret->iova = addr & ~iotlb_entry->page_mask;
+ ret->translated_addr = iotlb_entry->translated_addr;
+ ret->addr_mask = iotlb_entry->page_mask;
+ ret->perm = iotlb_entry->perms;
+ return;
+ }
+
+ if (!amdvi_get_dte(s, devid, entry)) {
+ return;
+ }
+
+ /* devices with V = 0 are not translated */
+ if (!(entry[0] & AMDVI_DEV_VALID)) {
+ goto out;
+ }
+
+ amdvi_page_walk(as, entry, ret,
+ is_write ? AMDVI_PERM_WRITE : AMDVI_PERM_READ, addr);
+
+ amdvi_update_iotlb(s, devid, addr, *ret,
+ entry[1] & AMDVI_DEV_DOMID_ID_MASK);
+ return;
+
+out:
+ ret->iova = addr & AMDVI_PAGE_MASK_4K;
+ ret->translated_addr = addr & AMDVI_PAGE_MASK_4K;
+ ret->addr_mask = ~AMDVI_PAGE_MASK_4K;
+ ret->perm = IOMMU_RW;
+}
+
+static inline bool amdvi_is_interrupt_addr(hwaddr addr)
+{
+ return addr >= AMDVI_INT_ADDR_FIRST && addr <= AMDVI_INT_ADDR_LAST;
+}
+
+static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
+ IOMMUAccessFlags flag, int iommu_idx)
+{
+ AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
+ AMDVIState *s = as->iommu_state;
+ IOMMUTLBEntry ret = {
+ .target_as = &address_space_memory,
+ .iova = addr,
+ .translated_addr = 0,
+ .addr_mask = ~(hwaddr)0,
+ .perm = IOMMU_NONE
+ };
+
+ if (!s->enabled) {
+ /* AMDVI disabled - this corresponds to iommu=off, not to a
+ * failure to provide any parameter
+ */
+ ret.iova = addr & AMDVI_PAGE_MASK_4K;
+ ret.translated_addr = addr & AMDVI_PAGE_MASK_4K;
+ ret.addr_mask = ~AMDVI_PAGE_MASK_4K;
+ ret.perm = IOMMU_RW;
+ return ret;
+ } else if (amdvi_is_interrupt_addr(addr)) {
+ ret.iova = addr & AMDVI_PAGE_MASK_4K;
+ ret.translated_addr = addr & AMDVI_PAGE_MASK_4K;
+ ret.addr_mask = ~AMDVI_PAGE_MASK_4K;
+ ret.perm = IOMMU_WO;
+ return ret;
+ }
+
+ amdvi_do_translate(as, addr, flag & IOMMU_WO, &ret);
+ trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn),
+ PCI_FUNC(as->devfn), addr, ret.translated_addr);
+ return ret;
+}
+
+static int amdvi_get_irte(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
+ union irte *irte, uint16_t devid)
+{
+ uint64_t irte_root, offset;
+
+ irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;
+ offset = (origin->data & AMDVI_IRTE_OFFSET) << 2;
+
+ trace_amdvi_ir_irte(irte_root, offset);
+
+ if (dma_memory_read(&address_space_memory, irte_root + offset,
+ irte, sizeof(*irte))) {
+ trace_amdvi_ir_err("failed to get irte");
+ return -AMDVI_IR_GET_IRTE;
+ }
+
+ trace_amdvi_ir_irte_val(irte->val);
+
+ return 0;
+}
+
+static int amdvi_int_remap_legacy(AMDVIState *iommu,
+ MSIMessage *origin,
+ MSIMessage *translated,
+ uint64_t *dte,
+ X86IOMMUIrq *irq,
+ uint16_t sid)
+{
+ int ret;
+ union irte irte;
+
+ /* get interrupt remapping table */
+ ret = amdvi_get_irte(iommu, origin, dte, &irte, sid);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (!irte.fields.valid) {
+ trace_amdvi_ir_target_abort("RemapEn is disabled");
+ return -AMDVI_IR_TARGET_ABORT;
+ }
+
+ if (irte.fields.guest_mode) {
+ error_report_once("guest mode is not zero");
+ return -AMDVI_IR_ERR;
+ }
+
+ if (irte.fields.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
+ error_report_once("reserved int_type");
+ return -AMDVI_IR_ERR;
+ }
+
+ irq->delivery_mode = irte.fields.int_type;
+ irq->vector = irte.fields.vector;
+ irq->dest_mode = irte.fields.dm;
+ irq->redir_hint = irte.fields.rq_eoi;
+ irq->dest = irte.fields.destination;
+
+ return 0;
+}
+
+static int amdvi_get_irte_ga(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
+ struct irte_ga *irte, uint16_t devid)
+{
+ uint64_t irte_root, offset;
+
+ irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;
+ offset = (origin->data & AMDVI_IRTE_OFFSET) << 4;
+ trace_amdvi_ir_irte(irte_root, offset);
+
+ if (dma_memory_read(&address_space_memory, irte_root + offset,
+ irte, sizeof(*irte))) {
+ trace_amdvi_ir_err("failed to get irte_ga");
+ return -AMDVI_IR_GET_IRTE;
+ }
+
+ trace_amdvi_ir_irte_ga_val(irte->hi.val, irte->lo.val);
+ return 0;
+}
+
+static int amdvi_int_remap_ga(AMDVIState *iommu,
+ MSIMessage *origin,
+ MSIMessage *translated,
+ uint64_t *dte,
+ X86IOMMUIrq *irq,
+ uint16_t sid)
+{
+ int ret;
+ struct irte_ga irte;
+
+ /* get interrupt remapping table */
+ ret = amdvi_get_irte_ga(iommu, origin, dte, &irte, sid);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (!irte.lo.fields_remap.valid) {
+ trace_amdvi_ir_target_abort("RemapEn is disabled");
+ return -AMDVI_IR_TARGET_ABORT;
+ }
+
+ if (irte.lo.fields_remap.guest_mode) {
+ error_report_once("guest mode is not zero");
+ return -AMDVI_IR_ERR;
+ }
+
+ if (irte.lo.fields_remap.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
+ error_report_once("reserved int_type is set");
+ return -AMDVI_IR_ERR;
+ }
+
+ irq->delivery_mode = irte.lo.fields_remap.int_type;
+ irq->vector = irte.hi.fields.vector;
+ irq->dest_mode = irte.lo.fields_remap.dm;
+ irq->redir_hint = irte.lo.fields_remap.rq_eoi;
+ irq->dest = irte.lo.fields_remap.destination;
+
+ return 0;
+}
+
+static int __amdvi_int_remap_msi(AMDVIState *iommu,
+ MSIMessage *origin,
+ MSIMessage *translated,
+ uint64_t *dte,
+ X86IOMMUIrq *irq,
+ uint16_t sid)
+{
+ int ret;
+ uint8_t int_ctl;
+
+ int_ctl = (dte[2] >> AMDVI_IR_INTCTL_SHIFT) & 3;
+ trace_amdvi_ir_intctl(int_ctl);
+
+ switch (int_ctl) {
+ case AMDVI_IR_INTCTL_PASS:
+ memcpy(translated, origin, sizeof(*origin));
+ return 0;
+ case AMDVI_IR_INTCTL_REMAP:
+ break;
+ case AMDVI_IR_INTCTL_ABORT:
+ trace_amdvi_ir_target_abort("int_ctl abort");
+ return -AMDVI_IR_TARGET_ABORT;
+ default:
+ trace_amdvi_ir_err("int_ctl reserved");
+ return -AMDVI_IR_ERR;
+ }
+
+ if (iommu->ga_enabled) {
+ ret = amdvi_int_remap_ga(iommu, origin, translated, dte, irq, sid);
+ } else {
+ ret = amdvi_int_remap_legacy(iommu, origin, translated, dte, irq, sid);
+ }
+
+ return ret;
+}
+
+/* Interrupt remapping for MSI/MSI-X entry */
+static int amdvi_int_remap_msi(AMDVIState *iommu,
+ MSIMessage *origin,
+ MSIMessage *translated,
+ uint16_t sid)
+{
+ int ret = 0;
+ uint64_t pass = 0;
+ uint64_t dte[4] = { 0 };
+ X86IOMMUIrq irq = { 0 };
+ uint8_t dest_mode, delivery_mode;
+
+ assert(origin && translated);
+
+ /*
+ * When the IOMMU is enabled, an interrupt remap request will come either
+ * from the IO-APIC or from a PCI device. If the interrupt is from a PCI
+ * device it will have a valid requester id, but if it is from the IO-APIC
+ * the requester id will be invalid.
+ */
+ if (sid == X86_IOMMU_SID_INVALID) {
+ sid = AMDVI_IOAPIC_SB_DEVID;
+ }
+
+ trace_amdvi_ir_remap_msi_req(origin->address, origin->data, sid);
+
+ /* check if device table entry is set before we go further. */
+ if (!iommu || !iommu->devtab_len) {
+ memcpy(translated, origin, sizeof(*origin));
+ goto out;
+ }
+
+ if (!amdvi_get_dte(iommu, sid, dte)) {
+ return -AMDVI_IR_ERR;
+ }
+
+ /* Check if IR is enabled in DTE */
+ if (!(dte[2] & AMDVI_IR_REMAP_ENABLE)) {
+ memcpy(translated, origin, sizeof(*origin));
+ goto out;
+ }
+
+ /* validate that we are configured with intremap=on */
+ if (!x86_iommu_ir_supported(X86_IOMMU_DEVICE(iommu))) {
+ trace_amdvi_err("Interrupt remapping is enabled in the guest but "
+ "not in the host. Use intremap=on to enable interrupt "
+ "remapping in amd-iommu.");
+ return -AMDVI_IR_ERR;
+ }
+
+ if (origin->address & AMDVI_MSI_ADDR_HI_MASK) {
+ trace_amdvi_err("MSI address high 32 bits non-zero when "
+ "Interrupt Remapping enabled.");
+ return -AMDVI_IR_ERR;
+ }
+
+ if ((origin->address & AMDVI_MSI_ADDR_LO_MASK) != APIC_DEFAULT_ADDRESS) {
+ trace_amdvi_err("MSI is not from IOAPIC.");
+ return -AMDVI_IR_ERR;
+ }
+
+ /*
+ * Bits [10:8] of the MSI data register are used to get the upstream interrupt type.
+ *
+ * See MSI/MSI-X format:
+ * https://pdfs.semanticscholar.org/presentation/9420/c279e942eca568157711ef5c92b800c40a79.pdf
+ * (page 5)
+ */
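+ /*
+ * For example (made-up data values): (data >> 8) & 7 is 0 (fixed) for
+ * data 0x0030 and 4 (NMI) for data 0x0430, matching the
+ * AMDVI_IOAPIC_INT_TYPE_* cases handled below.
+ */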
+ delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7;
+
+ switch (delivery_mode) {
+ case AMDVI_IOAPIC_INT_TYPE_FIXED:
+ case AMDVI_IOAPIC_INT_TYPE_ARBITRATED:
+ trace_amdvi_ir_delivery_mode("fixed/arbitrated");
+ ret = __amdvi_int_remap_msi(iommu, origin, translated, dte, &irq, sid);
+ if (ret < 0) {
+ goto remap_fail;
+ } else {
+ /* Translate IRQ to MSI messages */
+ x86_iommu_irq_to_msi_message(&irq, translated);
+ goto out;
+ }
+ break;
+ case AMDVI_IOAPIC_INT_TYPE_SMI:
+ error_report("SMI is not supported!");
+ ret = -AMDVI_IR_ERR;
+ break;
+ case AMDVI_IOAPIC_INT_TYPE_NMI:
+ pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK;
+ trace_amdvi_ir_delivery_mode("nmi");
+ break;
+ case AMDVI_IOAPIC_INT_TYPE_INIT:
+ pass = dte[3] & AMDVI_DEV_INT_PASS_MASK;
+ trace_amdvi_ir_delivery_mode("init");
+ break;
+ case AMDVI_IOAPIC_INT_TYPE_EINT:
+ pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK;
+ trace_amdvi_ir_delivery_mode("eint");
+ break;
+ default:
+ trace_amdvi_ir_delivery_mode("unsupported delivery_mode");
+ ret = -AMDVI_IR_ERR;
+ break;
+ }
+
+ if (ret < 0) {
+ goto remap_fail;
+ }
+
+ /*
+ * Bit [2] of the MSI address register is used to get the destination
+ * mode. dest_mode == 1 is only valid for fixed and arbitrated
+ * interrupts.
+ */
+ dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1;
+ if (dest_mode) {
+ trace_amdvi_ir_err("invalid dest_mode");
+ ret = -AMDVI_IR_ERR;
+ goto remap_fail;
+ }
+
+ if (pass) {
+ memcpy(translated, origin, sizeof(*origin));
+ } else {
+ trace_amdvi_ir_err("passthrough is not enabled");
+ ret = -AMDVI_IR_ERR;
+ goto remap_fail;
+ }
+
+out:
+ trace_amdvi_ir_remap_msi(origin->address, origin->data,
+ translated->address, translated->data);
+ return 0;
+
+remap_fail:
+ return ret;
+}
+
+static int amdvi_int_remap(X86IOMMUState *iommu,
+ MSIMessage *origin,
+ MSIMessage *translated,
+ uint16_t sid)
+{
+ return amdvi_int_remap_msi(AMD_IOMMU_DEVICE(iommu), origin,
+ translated, sid);
+}
+
+static MemTxResult amdvi_mem_ir_write(void *opaque, hwaddr addr,
+ uint64_t value, unsigned size,
+ MemTxAttrs attrs)
+{
+ int ret;
+ MSIMessage from = { 0, 0 }, to = { 0, 0 };
+ uint16_t sid = AMDVI_IOAPIC_SB_DEVID;
+
+ from.address = (uint64_t) addr + AMDVI_INT_ADDR_FIRST;
+ from.data = (uint32_t) value;
+
+ trace_amdvi_mem_ir_write_req(addr, value, size);
+
+ if (!attrs.unspecified) {
+ /* We have explicit Source ID */
+ sid = attrs.requester_id;
+ }
+
+ ret = amdvi_int_remap_msi(opaque, &from, &to, sid);
+ if (ret < 0) {
+ /* TODO: log the event using IOMMU log event interface */
+ error_report_once("failed to remap interrupt from devid 0x%x", sid);
+ return MEMTX_ERROR;
+ }
+
+ apic_get_class()->send_msi(&to);
+
+ trace_amdvi_mem_ir_write(to.address, to.data);
+ return MEMTX_OK;
+}
+
+static MemTxResult amdvi_mem_ir_read(void *opaque, hwaddr addr,
+ uint64_t *data, unsigned size,
+ MemTxAttrs attrs)
+{
+ return MEMTX_OK;
+}
+
+static const MemoryRegionOps amdvi_ir_ops = {
+ .read_with_attrs = amdvi_mem_ir_read,
+ .write_with_attrs = amdvi_mem_ir_write,
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ .impl = {
+ .min_access_size = 4,
+ .max_access_size = 4,
+ },
+ .valid = {
+ .min_access_size = 4,
+ .max_access_size = 4,
+ }
+};
+
+static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
+{
+ char name[128];
+ AMDVIState *s = opaque;
+ AMDVIAddressSpace **iommu_as, *amdvi_dev_as;
+ int bus_num = pci_bus_num(bus);
+
+ iommu_as = s->address_spaces[bus_num];
+
+ /* allocate memory during the first run */
+ if (!iommu_as) {
+ iommu_as = g_malloc0(sizeof(AMDVIAddressSpace *) * PCI_DEVFN_MAX);
+ s->address_spaces[bus_num] = iommu_as;
+ }
+
+ /* set up AMD-Vi region */
+ if (!iommu_as[devfn]) {
+ snprintf(name, sizeof(name), "amd_iommu_devfn_%d", devfn);
+
+ iommu_as[devfn] = g_malloc0(sizeof(AMDVIAddressSpace));
+ iommu_as[devfn]->bus_num = (uint8_t)bus_num;
+ iommu_as[devfn]->devfn = (uint8_t)devfn;
+ iommu_as[devfn]->iommu_state = s;
+
+ amdvi_dev_as = iommu_as[devfn];
+
+ /*
+ * The memory region relationships look like this (address ranges
+ * show only the lower 32 bits for brevity):
+ *
+ * |-----------------+-------------------+----------|
+ * | Name | Address range | Priority |
+ * |-----------------+-------------------+----------+
+ * | amdvi_root | 00000000-ffffffff | 0 |
+ * | amdvi_iommu | 00000000-ffffffff | 1 |
+ * | amdvi_iommu_ir | fee00000-feefffff | 64 |
+ * |-----------------+-------------------+----------|
+ */
+ memory_region_init_iommu(&amdvi_dev_as->iommu,
+ sizeof(amdvi_dev_as->iommu),
+ TYPE_AMD_IOMMU_MEMORY_REGION,
+ OBJECT(s),
+ "amd_iommu", UINT64_MAX);
+ memory_region_init(&amdvi_dev_as->root, OBJECT(s),
+ "amdvi_root", UINT64_MAX);
+ address_space_init(&amdvi_dev_as->as, &amdvi_dev_as->root, name);
+ memory_region_init_io(&amdvi_dev_as->iommu_ir, OBJECT(s),
+ &amdvi_ir_ops, s, "amd_iommu_ir",
+ AMDVI_INT_ADDR_SIZE);
+ memory_region_add_subregion_overlap(&amdvi_dev_as->root,
+ AMDVI_INT_ADDR_FIRST,
+ &amdvi_dev_as->iommu_ir,
+ 64);
+ memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
+ MEMORY_REGION(&amdvi_dev_as->iommu),
+ 1);
+ }
+ return &iommu_as[devfn]->as;
+}
+
+static const MemoryRegionOps mmio_mem_ops = {
+ .read = amdvi_mmio_read,
+ .write = amdvi_mmio_write,
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 8,
+ .unaligned = false,
+ },
+ .valid = {
+ .min_access_size = 1,
+ .max_access_size = 8,
+ }
+};
+
+static int amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
+ IOMMUNotifierFlag old,
+ IOMMUNotifierFlag new,
+ Error **errp)
+{
+ AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
+
+ if (new & IOMMU_NOTIFIER_MAP) {
+ error_setg(errp,
+ "device %02x.%02x.%x requires iommu notifier which is not "
+ "currently supported", as->bus_num, PCI_SLOT(as->devfn),
+ PCI_FUNC(as->devfn));
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void amdvi_init(AMDVIState *s)
+{
+ amdvi_iotlb_reset(s);
+
+ s->devtab_len = 0;
+ s->cmdbuf_len = 0;
+ s->cmdbuf_head = 0;
+ s->cmdbuf_tail = 0;
+ s->evtlog_head = 0;
+ s->evtlog_tail = 0;
+ s->excl_enabled = false;
+ s->excl_allow = false;
+ s->mmio_enabled = false;
+ s->enabled = false;
+ s->ats_enabled = false;
+ s->cmdbuf_enabled = false;
+
+ /* reset MMIO */
+ memset(s->mmior, 0, AMDVI_MMIO_SIZE);
+ amdvi_set_quad(s, AMDVI_MMIO_EXT_FEATURES, AMDVI_EXT_FEATURES,
+ 0xffffffffffffffef, 0);
+ amdvi_set_quad(s, AMDVI_MMIO_STATUS, 0, 0x98, 0x67);
+
+ /* reset device ident */
+ pci_config_set_vendor_id(s->pci.dev.config, PCI_VENDOR_ID_AMD);
+ pci_config_set_prog_interface(s->pci.dev.config, 00);
+ pci_config_set_device_id(s->pci.dev.config, s->devid);
+ pci_config_set_class(s->pci.dev.config, 0x0806);
+
+ /* reset AMDVI specific capabilities, all r/o */
+ pci_set_long(s->pci.dev.config + s->capab_offset, AMDVI_CAPAB_FEATURES);
+ pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_LOW,
+ s->mmio.addr & ~(0xffff0000));
+ pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH,
+ (s->mmio.addr & ~(0xffff)) >> 16);
+ pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_RANGE,
+ 0xff000000);
+ pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC, 0);
+ pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC,
+ AMDVI_MAX_PH_ADDR | AMDVI_MAX_GVA_ADDR | AMDVI_MAX_VA_ADDR);
+}
+
+static void amdvi_sysbus_reset(DeviceState *dev)
+{
+ AMDVIState *s = AMD_IOMMU_DEVICE(dev);
+
+ msi_reset(&s->pci.dev);
+ amdvi_init(s);
+}
+
+static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
+{
+ int ret = 0;
+ AMDVIState *s = AMD_IOMMU_DEVICE(dev);
+ MachineState *ms = MACHINE(qdev_get_machine());
+ PCMachineState *pcms = PC_MACHINE(ms);
+ X86MachineState *x86ms = X86_MACHINE(ms);
+ PCIBus *bus = pcms->bus;
+
+ s->iotlb = g_hash_table_new_full(amdvi_uint64_hash,
+ amdvi_uint64_equal, g_free, g_free);
+
+ /* This device should take care of IOMMU PCI properties */
+ if (!qdev_realize(DEVICE(&s->pci), &bus->qbus, errp)) {
+ return;
+ }
+ ret = pci_add_capability(&s->pci.dev, AMDVI_CAPAB_ID_SEC, 0,
+ AMDVI_CAPAB_SIZE, errp);
+ if (ret < 0) {
+ return;
+ }
+ s->capab_offset = ret;
+
+ ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_MSI, 0,
+ AMDVI_CAPAB_REG_SIZE, errp);
+ if (ret < 0) {
+ return;
+ }
+ ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_HT, 0,
+ AMDVI_CAPAB_REG_SIZE, errp);
+ if (ret < 0) {
+ return;
+ }
+
+ /* Pseudo address space under root PCI bus. */
+ x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID);
+
+ /* set up MMIO */
+ memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio",
+ AMDVI_MMIO_SIZE);
+
+ sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio);
+ sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR);
+ pci_setup_iommu(bus, amdvi_host_dma_iommu, s);
+ s->devid = object_property_get_int(OBJECT(&s->pci), "addr", &error_abort);
+ msi_init(&s->pci.dev, 0, 1, true, false, errp);
+ amdvi_init(s);
+}
+
+static const VMStateDescription vmstate_amdvi_sysbus = {
+ .name = "amd-iommu",
+ .unmigratable = 1
+};
+
+static void amdvi_sysbus_instance_init(Object *klass)
+{
+ AMDVIState *s = AMD_IOMMU_DEVICE(klass);
+
+ object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI);
+}
+
+static void amdvi_sysbus_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ X86IOMMUClass *dc_class = X86_IOMMU_DEVICE_CLASS(klass);
+
+ dc->reset = amdvi_sysbus_reset;
+ dc->vmsd = &vmstate_amdvi_sysbus;
+ dc->hotpluggable = false;
+ dc_class->realize = amdvi_sysbus_realize;
+ dc_class->int_remap = amdvi_int_remap;
+ /* Supported by the pc-q35-* machine types */
+ dc->user_creatable = true;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device";
+}
+
+static const TypeInfo amdvi_sysbus = {
+ .name = TYPE_AMD_IOMMU_DEVICE,
+ .parent = TYPE_X86_IOMMU_DEVICE,
+ .instance_size = sizeof(AMDVIState),
+ .instance_init = amdvi_sysbus_instance_init,
+ .class_init = amdvi_sysbus_class_init
+};
+
+static void amdvi_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device";
+}
+
+static const TypeInfo amdvi_pci = {
+ .name = TYPE_AMD_IOMMU_PCI,
+ .parent = TYPE_PCI_DEVICE,
+ .instance_size = sizeof(AMDVIPCIState),
+ .class_init = amdvi_pci_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+ { },
+ },
+};
+
+static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, void *data)
+{
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
+
+ imrc->translate = amdvi_translate;
+ imrc->notify_flag_changed = amdvi_iommu_notify_flag_changed;
+}
+
+static const TypeInfo amdvi_iommu_memory_region_info = {
+ .parent = TYPE_IOMMU_MEMORY_REGION,
+ .name = TYPE_AMD_IOMMU_MEMORY_REGION,
+ .class_init = amdvi_iommu_memory_region_class_init,
+};
+
+static void amdvi_register_types(void)
+{
+ type_register_static(&amdvi_pci);
+ type_register_static(&amdvi_sysbus);
+ type_register_static(&amdvi_iommu_memory_region_info);
+}
+
+type_init(amdvi_register_types);
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
new file mode 100644
index 000000000..79d38a3e4
--- /dev/null
+++ b/hw/i386/amd_iommu.h
@@ -0,0 +1,372 @@
+/*
+ * QEMU emulation of an AMD IOMMU (AMD-Vi)
+ *
+ * Copyright (C) 2011 Eduard - Gabriel Munteanu
+ * Copyright (C) 2015, 2016 David Kiarie Kahurani
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef AMD_IOMMU_H
+#define AMD_IOMMU_H
+
+#include "hw/pci/pci.h"
+#include "hw/i386/x86-iommu.h"
+#include "qom/object.h"
+
+/* Capability registers */
+#define AMDVI_CAPAB_BAR_LOW 0x04
+#define AMDVI_CAPAB_BAR_HIGH 0x08
+#define AMDVI_CAPAB_RANGE 0x0C
+#define AMDVI_CAPAB_MISC 0x10
+
+#define AMDVI_CAPAB_SIZE 0x18
+#define AMDVI_CAPAB_REG_SIZE 0x04
+
+/* Capability header data */
+#define AMDVI_CAPAB_ID_SEC 0xf
+#define AMDVI_CAPAB_FLAT_EXT (1 << 28)
+#define AMDVI_CAPAB_EFR_SUP (1 << 27)
+#define AMDVI_CAPAB_FLAG_NPCACHE (1 << 26)
+#define AMDVI_CAPAB_FLAG_HTTUNNEL (1 << 25)
+#define AMDVI_CAPAB_FLAG_IOTLBSUP (1 << 24)
+#define AMDVI_CAPAB_INIT_TYPE (3 << 16)
+
+/* No. of used MMIO registers */
+#define AMDVI_MMIO_REGS_HIGH 7
+#define AMDVI_MMIO_REGS_LOW 8
+
+/* MMIO registers */
+#define AMDVI_MMIO_DEVICE_TABLE 0x0000
+#define AMDVI_MMIO_COMMAND_BASE 0x0008
+#define AMDVI_MMIO_EVENT_BASE 0x0010
+#define AMDVI_MMIO_CONTROL 0x0018
+#define AMDVI_MMIO_EXCL_BASE 0x0020
+#define AMDVI_MMIO_EXCL_LIMIT 0x0028
+#define AMDVI_MMIO_EXT_FEATURES 0x0030
+#define AMDVI_MMIO_COMMAND_HEAD 0x2000
+#define AMDVI_MMIO_COMMAND_TAIL 0x2008
+#define AMDVI_MMIO_EVENT_HEAD 0x2010
+#define AMDVI_MMIO_EVENT_TAIL 0x2018
+#define AMDVI_MMIO_STATUS 0x2020
+#define AMDVI_MMIO_PPR_BASE 0x0038
+#define AMDVI_MMIO_PPR_HEAD 0x2030
+#define AMDVI_MMIO_PPR_TAIL 0x2038
+
+#define AMDVI_MMIO_SIZE 0x4000
+
+#define AMDVI_MMIO_DEVTAB_SIZE_MASK ((1ULL << 12) - 1)
+#define AMDVI_MMIO_DEVTAB_BASE_MASK (((1ULL << 52) - 1) & ~ \
+ AMDVI_MMIO_DEVTAB_SIZE_MASK)
+#define AMDVI_MMIO_DEVTAB_ENTRY_SIZE 32
+#define AMDVI_MMIO_DEVTAB_SIZE_UNIT 4096
+
+/* some of these are similar to the masks above, but are defined separately for readability */
+#define AMDVI_MMIO_CMDBUF_SIZE_BYTE (AMDVI_MMIO_COMMAND_BASE + 7)
+#define AMDVI_MMIO_CMDBUF_SIZE_MASK 0x0f
+#define AMDVI_MMIO_CMDBUF_BASE_MASK AMDVI_MMIO_DEVTAB_BASE_MASK
+#define AMDVI_MMIO_CMDBUF_HEAD_MASK (((1ULL << 19) - 1) & ~0x0f)
+#define AMDVI_MMIO_CMDBUF_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK
+
+#define AMDVI_MMIO_EVTLOG_SIZE_BYTE (AMDVI_MMIO_EVENT_BASE + 7)
+#define AMDVI_MMIO_EVTLOG_SIZE_MASK AMDVI_MMIO_CMDBUF_SIZE_MASK
+#define AMDVI_MMIO_EVTLOG_BASE_MASK AMDVI_MMIO_CMDBUF_BASE_MASK
+#define AMDVI_MMIO_EVTLOG_HEAD_MASK (((1ULL << 19) - 1) & ~0x0f)
+#define AMDVI_MMIO_EVTLOG_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK
+
+#define AMDVI_MMIO_PPRLOG_SIZE_BYTE (AMDVI_MMIO_EVENT_BASE + 7)
+#define AMDVI_MMIO_PPRLOG_HEAD_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK
+#define AMDVI_MMIO_PPRLOG_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK
+#define AMDVI_MMIO_PPRLOG_BASE_MASK AMDVI_MMIO_EVTLOG_BASE_MASK
+#define AMDVI_MMIO_PPRLOG_SIZE_MASK AMDVI_MMIO_EVTLOG_SIZE_MASK
+
+#define AMDVI_MMIO_EXCL_ENABLED_MASK (1ULL << 0)
+#define AMDVI_MMIO_EXCL_ALLOW_MASK (1ULL << 1)
+#define AMDVI_MMIO_EXCL_LIMIT_MASK AMDVI_MMIO_DEVTAB_BASE_MASK
+#define AMDVI_MMIO_EXCL_LIMIT_LOW 0xfff
+
+/* mmio control register flags */
+#define AMDVI_MMIO_CONTROL_AMDVIEN (1ULL << 0)
+#define AMDVI_MMIO_CONTROL_HTTUNEN (1ULL << 1)
+#define AMDVI_MMIO_CONTROL_EVENTLOGEN (1ULL << 2)
+#define AMDVI_MMIO_CONTROL_EVENTINTEN (1ULL << 3)
+#define AMDVI_MMIO_CONTROL_COMWAITINTEN (1ULL << 4)
+#define AMDVI_MMIO_CONTROL_CMDBUFLEN (1ULL << 12)
+#define AMDVI_MMIO_CONTROL_GAEN (1ULL << 17)
+
+/* MMIO status register bits */
+#define AMDVI_MMIO_STATUS_CMDBUF_RUN (1 << 4)
+#define AMDVI_MMIO_STATUS_EVT_RUN (1 << 3)
+#define AMDVI_MMIO_STATUS_COMP_INT (1 << 2)
+#define AMDVI_MMIO_STATUS_EVT_OVF (1 << 0)
+
+#define AMDVI_CMDBUF_ID_BYTE 0x07
+#define AMDVI_CMDBUF_ID_RSHIFT 4
+
+#define AMDVI_CMD_COMPLETION_WAIT 0x01
+#define AMDVI_CMD_INVAL_DEVTAB_ENTRY 0x02
+#define AMDVI_CMD_INVAL_AMDVI_PAGES 0x03
+#define AMDVI_CMD_INVAL_IOTLB_PAGES 0x04
+#define AMDVI_CMD_INVAL_INTR_TABLE 0x05
+#define AMDVI_CMD_PREFETCH_AMDVI_PAGES 0x06
+#define AMDVI_CMD_COMPLETE_PPR_REQUEST 0x07
+#define AMDVI_CMD_INVAL_AMDVI_ALL 0x08
+
+#define AMDVI_DEVTAB_ENTRY_SIZE 32
+
+/* Device table entry bits 0:63 */
+#define AMDVI_DEV_VALID (1ULL << 0)
+#define AMDVI_DEV_TRANSLATION_VALID (1ULL << 1)
+#define AMDVI_DEV_MODE_MASK 0x7
+#define AMDVI_DEV_MODE_RSHIFT 9
+#define AMDVI_DEV_PT_ROOT_MASK 0xffffffffff000
+#define AMDVI_DEV_PT_ROOT_RSHIFT 12
+#define AMDVI_DEV_PERM_SHIFT 61
+#define AMDVI_DEV_PERM_READ (1ULL << 61)
+#define AMDVI_DEV_PERM_WRITE (1ULL << 62)
+
+/* Device table entry bits 64:127 */
+#define AMDVI_DEV_DOMID_ID_MASK ((1ULL << 16) - 1)
+
+/* Event codes and flags, as stored in the info field */
+#define AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY (0x1U << 12)
+#define AMDVI_EVENT_IOPF (0x2U << 12)
+#define AMDVI_EVENT_IOPF_I (1U << 3)
+#define AMDVI_EVENT_DEV_TAB_HW_ERROR (0x3U << 12)
+#define AMDVI_EVENT_PAGE_TAB_HW_ERROR (0x4U << 12)
+#define AMDVI_EVENT_ILLEGAL_COMMAND_ERROR (0x5U << 12)
+#define AMDVI_EVENT_COMMAND_HW_ERROR (0x6U << 12)
+
+#define AMDVI_EVENT_LEN 16
+#define AMDVI_PERM_READ (1 << 0)
+#define AMDVI_PERM_WRITE (1 << 1)
+
+#define AMDVI_FEATURE_PREFETCH (1ULL << 0) /* page prefetch */
+#define AMDVI_FEATURE_PPR (1ULL << 1) /* PPR Support */
+#define AMDVI_FEATURE_GT (1ULL << 4) /* Guest Translation */
+#define AMDVI_FEATURE_IA (1ULL << 6) /* inval all support */
+#define AMDVI_FEATURE_GA (1ULL << 7) /* guest VAPIC support */
+#define AMDVI_FEATURE_HE (1ULL << 8) /* hardware error regs */
+#define AMDVI_FEATURE_PC (1ULL << 9) /* Perf counters */
+
+/* reserved DTE bits */
+#define AMDVI_DTE_LOWER_QUAD_RESERVED 0x80300000000000fc
+#define AMDVI_DTE_MIDDLE_QUAD_RESERVED 0x0000000000000100
+#define AMDVI_DTE_UPPER_QUAD_RESERVED 0x08f0000000000000
+
+/* AMDVI paging mode */
+#define AMDVI_GATS_MODE (2ULL << 12)
+#define AMDVI_HATS_MODE (2ULL << 10)
+
+/* IOTLB */
+#define AMDVI_IOTLB_MAX_SIZE 1024
+#define AMDVI_DEVID_SHIFT 36
+
+/* extended feature support */
+#define AMDVI_EXT_FEATURES (AMDVI_FEATURE_PREFETCH | AMDVI_FEATURE_PPR | \
+ AMDVI_FEATURE_IA | AMDVI_FEATURE_GT | AMDVI_FEATURE_HE | \
+ AMDVI_GATS_MODE | AMDVI_HATS_MODE | AMDVI_FEATURE_GA)
+
+/* capabilities header */
+#define AMDVI_CAPAB_FEATURES (AMDVI_CAPAB_FLAT_EXT | \
+ AMDVI_CAPAB_FLAG_NPCACHE | AMDVI_CAPAB_FLAG_IOTLBSUP \
+ | AMDVI_CAPAB_ID_SEC | AMDVI_CAPAB_INIT_TYPE | \
+ AMDVI_CAPAB_FLAG_HTTUNNEL | AMDVI_CAPAB_EFR_SUP)
+
+/* AMDVI default address */
+#define AMDVI_BASE_ADDR 0xfed80000
+
+/* page management constants */
+#define AMDVI_PAGE_SHIFT 12
+#define AMDVI_PAGE_SIZE (1ULL << AMDVI_PAGE_SHIFT)
+
+#define AMDVI_PAGE_SHIFT_4K 12
+#define AMDVI_PAGE_MASK_4K (~((1ULL << AMDVI_PAGE_SHIFT_4K) - 1))
+
+#define AMDVI_MAX_VA_ADDR (48UL << 5)
+#define AMDVI_MAX_PH_ADDR (40UL << 8)
+#define AMDVI_MAX_GVA_ADDR (48UL << 15)
+
+/* Completion Wait data size */
+#define AMDVI_COMPLETION_DATA_SIZE 8
+
+#define AMDVI_COMMAND_SIZE 16
+
+#define AMDVI_INT_ADDR_FIRST 0xfee00000
+#define AMDVI_INT_ADDR_LAST 0xfeefffff
+#define AMDVI_INT_ADDR_SIZE (AMDVI_INT_ADDR_LAST - AMDVI_INT_ADDR_FIRST + 1)
+#define AMDVI_MSI_ADDR_HI_MASK (0xffffffff00000000ULL)
+#define AMDVI_MSI_ADDR_LO_MASK (0x00000000ffffffffULL)
+
+/* SB IOAPIC is always on this device in AMD systems */
+#define AMDVI_IOAPIC_SB_DEVID PCI_BUILD_BDF(0, PCI_DEVFN(0x14, 0))
+
+/* Interrupt remapping errors */
+#define AMDVI_IR_ERR 0x1
+#define AMDVI_IR_GET_IRTE 0x2
+#define AMDVI_IR_TARGET_ABORT 0x3
+
+/* Interrupt remapping */
+#define AMDVI_IR_REMAP_ENABLE 1ULL
+#define AMDVI_IR_INTCTL_SHIFT 60
+#define AMDVI_IR_INTCTL_ABORT 0
+#define AMDVI_IR_INTCTL_PASS 1
+#define AMDVI_IR_INTCTL_REMAP 2
+
+#define AMDVI_IR_PHYS_ADDR_MASK (((1ULL << 45) - 1) << 6)
+
+/* MSI data 10:0 bits (section 2.2.5.1 Fig 14) */
+#define AMDVI_IRTE_OFFSET 0x7ff
+
+/* Delivery mode of MSI data (same as the IOAPIC delivery mode encoding) */
+#define AMDVI_IOAPIC_INT_TYPE_FIXED 0x0
+#define AMDVI_IOAPIC_INT_TYPE_ARBITRATED 0x1
+#define AMDVI_IOAPIC_INT_TYPE_SMI 0x2
+#define AMDVI_IOAPIC_INT_TYPE_NMI 0x4
+#define AMDVI_IOAPIC_INT_TYPE_INIT 0x5
+#define AMDVI_IOAPIC_INT_TYPE_EINT 0x7
+
+/* Pass through interrupt */
+#define AMDVI_DEV_INT_PASS_MASK (1ULL << 56)
+#define AMDVI_DEV_EINT_PASS_MASK (1ULL << 57)
+#define AMDVI_DEV_NMI_PASS_MASK (1ULL << 58)
+#define AMDVI_DEV_LINT0_PASS_MASK (1ULL << 62)
+#define AMDVI_DEV_LINT1_PASS_MASK (1ULL << 63)
+
+/* Interrupt remapping table fields (Guest VAPIC not enabled) */
+union irte {
+ uint32_t val;
+ struct {
+ uint32_t valid:1,
+ no_fault:1,
+ int_type:3,
+ rq_eoi:1,
+ dm:1,
+ guest_mode:1,
+ destination:8,
+ vector:8,
+ rsvd:8;
+ } fields;
+};
+
+/* Interrupt remapping table fields (Guest VAPIC is enabled) */
+union irte_ga_lo {
+ uint64_t val;
+
+ /* For int remapping */
+ struct {
+ uint64_t valid:1,
+ no_fault:1,
+ /* ------ */
+ int_type:3,
+ rq_eoi:1,
+ dm:1,
+ /* ------ */
+ guest_mode:1,
+ destination:8,
+ rsvd_1:48;
+ } fields_remap;
+};
+
+union irte_ga_hi {
+ uint64_t val;
+ struct {
+ uint64_t vector:8,
+ rsvd_2:56;
+ } fields;
+};
+
+struct irte_ga {
+ union irte_ga_lo lo;
+ union irte_ga_hi hi;
+};
+
+#define TYPE_AMD_IOMMU_DEVICE "amd-iommu"
+OBJECT_DECLARE_SIMPLE_TYPE(AMDVIState, AMD_IOMMU_DEVICE)
+
+#define TYPE_AMD_IOMMU_PCI "AMDVI-PCI"
+
+#define TYPE_AMD_IOMMU_MEMORY_REGION "amd-iommu-iommu-memory-region"
+
+typedef struct AMDVIAddressSpace AMDVIAddressSpace;
+
+/* helper PCI device used to own the IOMMU's PCI config space */
+typedef struct AMDVIPCIState {
+ PCIDevice dev; /* The PCI device itself */
+} AMDVIPCIState;
+
+struct AMDVIState {
+ X86IOMMUState iommu; /* IOMMU bus device */
+ AMDVIPCIState pci; /* IOMMU PCI device */
+
+ uint32_t version;
+ uint32_t capab_offset; /* capability offset pointer */
+
+ uint64_t mmio_addr;
+
+ uint32_t devid; /* auto-assigned devid */
+
+ bool enabled; /* IOMMU enabled */
+ bool ats_enabled; /* address translation enabled */
+ bool cmdbuf_enabled; /* command buffer enabled */
+ bool evtlog_enabled; /* event log enabled */
+ bool excl_enabled;
+
+ hwaddr devtab; /* base address device table */
+ size_t devtab_len; /* device table length */
+
+ hwaddr cmdbuf; /* command buffer base address */
+ uint64_t cmdbuf_len; /* command buffer length */
+ uint32_t cmdbuf_head; /* current IOMMU read position */
+ uint32_t cmdbuf_tail; /* next Software write position */
+ bool completion_wait_intr;
+
+ hwaddr evtlog; /* base address event log */
+ bool evtlog_intr;
+ uint32_t evtlog_len; /* event log length */
+ uint32_t evtlog_head; /* current IOMMU write position */
+ uint32_t evtlog_tail; /* current Software read position */
+
+ /* unused for now */
+ hwaddr excl_base; /* base DVA - IOMMU exclusion range */
+ hwaddr excl_limit; /* limit of IOMMU exclusion range */
+ bool excl_allow; /* translate accesses to the exclusion range */
+ bool excl_enable; /* exclusion range enabled */
+
+ hwaddr ppr_log; /* base address ppr log */
+ uint32_t pprlog_len; /* ppr log len */
+ uint32_t pprlog_head; /* ppr log head */
+ uint32_t pprlog_tail; /* ppr log tail */
+
+ MemoryRegion mmio; /* MMIO region */
+ uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */
+ uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */
+ uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */
+ bool mmio_enabled;
+
+ /* for each served device */
+ AMDVIAddressSpace **address_spaces[PCI_BUS_MAX];
+
+ /* IOTLB */
+ GHashTable *iotlb;
+
+ /* Interrupt remapping */
+ bool ga_enabled;
+};
+
+#endif
diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
new file mode 100644
index 000000000..bcf9eaf83
--- /dev/null
+++ b/hw/i386/e820_memory_layout.c
@@ -0,0 +1,59 @@
+/*
+ * QEMU BIOS e820 routines
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/bswap.h"
+#include "e820_memory_layout.h"
+
+static size_t e820_entries;
+struct e820_table e820_reserve;
+struct e820_entry *e820_table;
+
+int e820_add_entry(uint64_t address, uint64_t length, uint32_t type)
+{
+ int index = le32_to_cpu(e820_reserve.count);
+ struct e820_entry *entry;
+
+ if (type != E820_RAM) {
+ /* old FW_CFG_E820_TABLE entry -- reservations only */
+ if (index >= E820_NR_ENTRIES) {
+ return -EBUSY;
+ }
+ entry = &e820_reserve.entry[index++];
+
+ entry->address = cpu_to_le64(address);
+ entry->length = cpu_to_le64(length);
+ entry->type = cpu_to_le32(type);
+
+ e820_reserve.count = cpu_to_le32(index);
+ }
+
+ /* new "etc/e820" file -- include ram too */
+ e820_table = g_renew(struct e820_entry, e820_table, e820_entries + 1);
+ e820_table[e820_entries].address = cpu_to_le64(address);
+ e820_table[e820_entries].length = cpu_to_le64(length);
+ e820_table[e820_entries].type = cpu_to_le32(type);
+ e820_entries++;
+
+ return e820_entries;
+}
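+
+/*
+ * Typical usage (illustrative, not taken from this file; sizes are
+ * hypothetical): board code adds RAM and reserved ranges at machine init,
+ * e.g.
+ *     e820_add_entry(0x100000, ram_below_4g - 0x100000, E820_RAM);
+ *     e820_add_entry(0xfeffc000, 0x4000, E820_RESERVED);
+ * and firmware consumers read them back with e820_get_num_entries() and
+ * e820_get_entry().
+ */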
+
+int e820_get_num_entries(void)
+{
+ return e820_entries;
+}
+
+bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
+{
+ if (idx < e820_entries && e820_table[idx].type == cpu_to_le32(type)) {
+ *address = le64_to_cpu(e820_table[idx].address);
+ *length = le64_to_cpu(e820_table[idx].length);
+ return true;
+ }
+ return false;
+}
diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
new file mode 100644
index 000000000..2a0ceb8b9
--- /dev/null
+++ b/hw/i386/e820_memory_layout.h
@@ -0,0 +1,42 @@
+/*
+ * QEMU BIOS e820 routines
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef HW_I386_E820_H
+#define HW_I386_E820_H
+
+/* e820 types */
+#define E820_RAM 1
+#define E820_RESERVED 2
+#define E820_ACPI 3
+#define E820_NVS 4
+#define E820_UNUSABLE 5
+
+#define E820_NR_ENTRIES 16
+
+struct e820_entry {
+ uint64_t address;
+ uint64_t length;
+ uint32_t type;
+} QEMU_PACKED __attribute((__aligned__(4)));
+
+struct e820_table {
+ uint32_t count;
+ struct e820_entry entry[E820_NR_ENTRIES];
+} QEMU_PACKED __attribute((__aligned__(4)));
+
+extern struct e820_table e820_reserve;
+extern struct e820_entry *e820_table;
+
+int e820_add_entry(uint64_t address, uint64_t length, uint32_t type);
+int e820_get_num_entries(void);
+bool e820_get_entry(int index, uint32_t type,
+ uint64_t *address, uint64_t *length);
+
+
+
+#endif
diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
new file mode 100644
index 000000000..a283785a8
--- /dev/null
+++ b/hw/i386/fw_cfg.c
@@ -0,0 +1,221 @@
+/*
+ * QEMU fw_cfg helpers (X86 specific)
+ *
+ * Copyright (c) 2019 Red Hat, Inc.
+ *
+ * Author:
+ * Philippe Mathieu-Daudé <philmd@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/numa.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/firmware/smbios.h"
+#include "hw/i386/fw_cfg.h"
+#include "hw/timer/hpet.h"
+#include "hw/nvram/fw_cfg.h"
+#include "e820_memory_layout.h"
+#include "kvm/kvm_i386.h"
+#include "qapi/error.h"
+#include CONFIG_DEVICES
+
+struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX};
+
+const char *fw_cfg_arch_key_name(uint16_t key)
+{
+ static const struct {
+ uint16_t key;
+ const char *name;
+ } fw_cfg_arch_wellknown_keys[] = {
+ {FW_CFG_ACPI_TABLES, "acpi_tables"},
+ {FW_CFG_SMBIOS_ENTRIES, "smbios_entries"},
+ {FW_CFG_IRQ0_OVERRIDE, "irq0_override"},
+ {FW_CFG_E820_TABLE, "e820_table"},
+ {FW_CFG_HPET, "hpet"},
+ };
+
+ for (size_t i = 0; i < ARRAY_SIZE(fw_cfg_arch_wellknown_keys); i++) {
+ if (fw_cfg_arch_wellknown_keys[i].key == key) {
+ return fw_cfg_arch_wellknown_keys[i].name;
+ }
+ }
+ return NULL;
+}
+
+void fw_cfg_build_smbios(MachineState *ms, FWCfgState *fw_cfg)
+{
+#ifdef CONFIG_SMBIOS
+ uint8_t *smbios_tables, *smbios_anchor;
+ size_t smbios_tables_len, smbios_anchor_len;
+ struct smbios_phys_mem_area *mem_array;
+ unsigned i, array_count;
+ X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu);
+
+ /* tell smbios about cpuid version and features */
+ smbios_set_cpuid(cpu->env.cpuid_version, cpu->env.features[FEAT_1_EDX]);
+
+ smbios_tables = smbios_get_table_legacy(ms, &smbios_tables_len);
+ if (smbios_tables) {
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_SMBIOS_ENTRIES,
+ smbios_tables, smbios_tables_len);
+ }
+
+ /* build the array of physical mem area from e820 table */
+ mem_array = g_malloc0(sizeof(*mem_array) * e820_get_num_entries());
+ for (i = 0, array_count = 0; i < e820_get_num_entries(); i++) {
+ uint64_t addr, len;
+
+ if (e820_get_entry(i, E820_RAM, &addr, &len)) {
+ mem_array[array_count].address = addr;
+ mem_array[array_count].length = len;
+ array_count++;
+ }
+ }
+ smbios_get_tables(ms, mem_array, array_count,
+ &smbios_tables, &smbios_tables_len,
+ &smbios_anchor, &smbios_anchor_len,
+ &error_fatal);
+ g_free(mem_array);
+
+ if (smbios_anchor) {
+ fw_cfg_add_file(fw_cfg, "etc/smbios/smbios-tables",
+ smbios_tables, smbios_tables_len);
+ fw_cfg_add_file(fw_cfg, "etc/smbios/smbios-anchor",
+ smbios_anchor, smbios_anchor_len);
+ }
+#endif
+}
+
+FWCfgState *fw_cfg_arch_create(MachineState *ms,
+ uint16_t boot_cpus,
+ uint16_t apic_id_limit)
+{
+ FWCfgState *fw_cfg;
+ uint64_t *numa_fw_cfg;
+ int i;
+ MachineClass *mc = MACHINE_GET_CLASS(ms);
+ const CPUArchIdList *cpus = mc->possible_cpu_arch_ids(ms);
+ int nb_numa_nodes = ms->numa_state->num_nodes;
+
+ fw_cfg = fw_cfg_init_io_dma(FW_CFG_IO_BASE, FW_CFG_IO_BASE + 4,
+ &address_space_memory);
+ fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, boot_cpus);
+
+    /* FW_CFG_MAX_CPUS is a bit confusing/problematic on x86:
+     *
+     * For machine types prior to 1.8, SeaBIOS needs FW_CFG_MAX_CPUS for
+     * building the MPTable, ACPI MADT, ACPI CPU hotplug and ACPI SRAT
+     * tables, as those tables are based on the xAPIC ID, and the
+     * QEMU<->SeaBIOS interface for CPU hotplug also uses APIC IDs rather
+     * than a "CPU index".
+     * This means that FW_CFG_MAX_CPUS is not the "maximum number of CPUs",
+     * but the "limit to the APIC ID values SeaBIOS may see".
+     *
+     * So for compatibility reasons with old BIOSes we are stuck with
+     * "etc/max-cpus" actually being apic_id_limit.
+     */
+ fw_cfg_add_i16(fw_cfg, FW_CFG_MAX_CPUS, apic_id_limit);
+ fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, ms->ram_size);
+#ifdef CONFIG_ACPI
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_ACPI_TABLES,
+ acpi_tables, acpi_tables_len);
+#endif
+ fw_cfg_add_i32(fw_cfg, FW_CFG_IRQ0_OVERRIDE, 1);
+
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_E820_TABLE,
+ &e820_reserve, sizeof(e820_reserve));
+ fw_cfg_add_file(fw_cfg, "etc/e820", e820_table,
+ sizeof(struct e820_entry) * e820_get_num_entries());
+
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, &hpet_cfg, sizeof(hpet_cfg));
+    /* allocate memory for the NUMA channel: one (64-bit) word for the number
+     * of nodes, one word per possible APIC ID mapping a vCPU to its node,
+     * and one word per node holding the amount of memory (see the layout
+     * sketch after this function).
+     */
+ numa_fw_cfg = g_new0(uint64_t, 1 + apic_id_limit + nb_numa_nodes);
+ numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
+ for (i = 0; i < cpus->len; i++) {
+ unsigned int apic_id = cpus->cpus[i].arch_id;
+ assert(apic_id < apic_id_limit);
+ numa_fw_cfg[apic_id + 1] = cpu_to_le64(cpus->cpus[i].props.node_id);
+ }
+ for (i = 0; i < nb_numa_nodes; i++) {
+ numa_fw_cfg[apic_id_limit + 1 + i] =
+ cpu_to_le64(ms->numa_state->nodes[i].node_mem);
+ }
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg,
+ (1 + apic_id_limit + nb_numa_nodes) *
+ sizeof(*numa_fw_cfg));
+
+ return fw_cfg;
+}
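+
+/*
+ * Layout of the FW_CFG_NUMA blob built in fw_cfg_arch_create() (all
+ * fields are little-endian 64-bit words), shown here for illustration:
+ *
+ *   word 0                    : number of NUMA nodes
+ *   words 1 .. apic_id_limit  : node id, indexed by APIC ID
+ *   next nb_numa_nodes words  : memory size of each node in bytes
+ *
+ * e.g. with apic_id_limit == 4 and two nodes the blob takes
+ * (1 + 4 + 2) * sizeof(uint64_t) = 56 bytes.
+ */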
+
+void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg)
+{
+ X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu);
+ CPUX86State *env = &cpu->env;
+ uint32_t unused, ebx, ecx, edx;
+ uint64_t feature_control_bits = 0;
+ uint64_t *val;
+
+ cpu_x86_cpuid(env, 1, 0, &unused, &unused, &ecx, &edx);
+ if (ecx & CPUID_EXT_VMX) {
+ feature_control_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ }
+
+ if ((edx & (CPUID_EXT2_MCE | CPUID_EXT2_MCA)) ==
+ (CPUID_EXT2_MCE | CPUID_EXT2_MCA) &&
+ (env->mcg_cap & MCG_LMCE_P)) {
+ feature_control_bits |= FEATURE_CONTROL_LMCE;
+ }
+
+ if (env->cpuid_level >= 7) {
+ cpu_x86_cpuid(env, 0x7, 0, &unused, &ebx, &ecx, &unused);
+ if (ebx & CPUID_7_0_EBX_SGX) {
+ feature_control_bits |= FEATURE_CONTROL_SGX;
+ }
+ if (ecx & CPUID_7_0_ECX_SGX_LC) {
+ feature_control_bits |= FEATURE_CONTROL_SGX_LC;
+ }
+ }
+
+ if (!feature_control_bits) {
+ return;
+ }
+
+ val = g_malloc(sizeof(*val));
+ *val = cpu_to_le64(feature_control_bits | FEATURE_CONTROL_LOCKED);
+ fw_cfg_add_file(fw_cfg, "etc/msr_feature_control", val, sizeof(*val));
+}
+
+void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg)
+{
+ /*
+ * when using port i/o, the 8-bit data register *always* overlaps
+ * with half of the 16-bit control register. Hence, the total size
+ * of the i/o region used is FW_CFG_CTL_SIZE; when using DMA, the
+ * DMA control register is located at FW_CFG_DMA_IO_BASE + 4
+ */
+ Object *obj = OBJECT(fw_cfg);
+ uint8_t io_size = object_property_get_bool(obj, "dma_enabled", NULL) ?
+ ROUND_UP(FW_CFG_CTL_SIZE, 4) + sizeof(dma_addr_t) :
+ FW_CFG_CTL_SIZE;
+ Aml *dev = aml_device("FWCF");
+ Aml *crs = aml_resource_template();
+
+ aml_append(dev, aml_name_decl("_HID", aml_string("QEMU0002")));
+
+ /* device present, functioning, decoding, not shown in UI */
+ aml_append(dev, aml_name_decl("_STA", aml_int(0xB)));
+
+ aml_append(crs,
+ aml_io(AML_DECODE16, FW_CFG_IO_BASE, FW_CFG_IO_BASE, 0x01, io_size));
+
+ aml_append(dev, aml_name_decl("_CRS", crs));
+ aml_append(scope, dev);
+}
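+
+/*
+ * For illustration, with the 2-byte fw_cfg control register
+ * (FW_CFG_CTL_SIZE) and an 8-byte dma_addr_t, the io_size computed above
+ * resolves to:
+ *   dma_enabled == false:  io_size = 2   (0x510..0x511)
+ *   dma_enabled == true :  io_size = 12  (ROUND_UP(2, 4) + 8, 0x510..0x51b)
+ * which is the window reserved in the _CRS resource above.
+ */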
diff --git a/hw/i386/fw_cfg.h b/hw/i386/fw_cfg.h
new file mode 100644
index 000000000..275f15c1c
--- /dev/null
+++ b/hw/i386/fw_cfg.h
@@ -0,0 +1,30 @@
+/*
+ * QEMU fw_cfg helpers (X86 specific)
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef HW_I386_FW_CFG_H
+#define HW_I386_FW_CFG_H
+
+#include "hw/boards.h"
+#include "hw/nvram/fw_cfg.h"
+
+#define FW_CFG_IO_BASE 0x510
+
+#define FW_CFG_ACPI_TABLES (FW_CFG_ARCH_LOCAL + 0)
+#define FW_CFG_SMBIOS_ENTRIES (FW_CFG_ARCH_LOCAL + 1)
+#define FW_CFG_IRQ0_OVERRIDE (FW_CFG_ARCH_LOCAL + 2)
+#define FW_CFG_E820_TABLE (FW_CFG_ARCH_LOCAL + 3)
+#define FW_CFG_HPET (FW_CFG_ARCH_LOCAL + 4)
+
+FWCfgState *fw_cfg_arch_create(MachineState *ms,
+ uint16_t boot_cpus,
+ uint16_t apic_id_limit);
+void fw_cfg_build_smbios(MachineState *ms, FWCfgState *fw_cfg);
+void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg);
+void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg);
+
+#endif
diff --git a/hw/i386/generic_event_device_x86.c b/hw/i386/generic_event_device_x86.c
new file mode 100644
index 000000000..e26fb02a2
--- /dev/null
+++ b/hw/i386/generic_event_device_x86.c
@@ -0,0 +1,36 @@
+/*
+ * x86 variant of the generic event device for hw reduced acpi
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/acpi/generic_event_device.h"
+#include "hw/i386/pc.h"
+
+static void acpi_ged_x86_class_init(ObjectClass *class, void *data)
+{
+ AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_CLASS(class);
+
+ adevc->madt_cpu = pc_madt_cpu_entry;
+}
+
+static const TypeInfo acpi_ged_x86_info = {
+ .name = TYPE_ACPI_GED_X86,
+ .parent = TYPE_ACPI_GED,
+ .class_init = acpi_ged_x86_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_HOTPLUG_HANDLER },
+ { TYPE_ACPI_DEVICE_IF },
+ { }
+ }
+};
+
+static void acpi_ged_x86_register_types(void)
+{
+ type_register_static(&acpi_ged_x86_info);
+}
+
+type_init(acpi_ged_x86_register_types)
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
new file mode 100644
index 000000000..f584449d8
--- /dev/null
+++ b/hw/i386/intel_iommu.c
@@ -0,0 +1,3900 @@
+/*
+ * QEMU emulation of an Intel IOMMU (VT-d)
+ * (DMA Remapping device)
+ *
+ * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com>
+ * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+#include "hw/sysbus.h"
+#include "intel_iommu_internal.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/qdev-properties.h"
+#include "hw/i386/pc.h"
+#include "hw/i386/apic-msidef.h"
+#include "hw/i386/x86-iommu.h"
+#include "hw/pci-host/q35.h"
+#include "sysemu/kvm.h"
+#include "sysemu/dma.h"
+#include "sysemu/sysemu.h"
+#include "hw/i386/apic_internal.h"
+#include "kvm/kvm_i386.h"
+#include "migration/vmstate.h"
+#include "trace.h"
+
+/* context entry operations */
+#define VTD_CE_GET_RID2PASID(ce) \
+ ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK)
+#define VTD_CE_GET_PASID_DIR_TABLE(ce) \
+ ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK)
+
+/* pe operations */
+#define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
+#define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
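+/*
+ * Turn a fault-reason return value into either a "fault disabled" trace
+ * (when FPD is set and the fault is qualified) or a recorded/reported
+ * DMAR fault, then jump to the caller's "error" label.
+ */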
+#define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\
+ if (ret_fr) { \
+ ret_fr = -ret_fr; \
+ if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { \
+ trace_vtd_fault_disabled(); \
+ } else { \
+ vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); \
+ } \
+ goto error; \
+ } \
+}
+
+static void vtd_address_space_refresh_all(IntelIOMMUState *s);
+static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
+
+static void vtd_panic_require_caching_mode(void)
+{
+ error_report("We need to set caching-mode=on for intel-iommu to enable "
+ "device assignment with IOMMU protection.");
+ exit(1);
+}
+
+static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
+ uint64_t wmask, uint64_t w1cmask)
+{
+ stq_le_p(&s->csr[addr], val);
+ stq_le_p(&s->wmask[addr], wmask);
+ stq_le_p(&s->w1cmask[addr], w1cmask);
+}
+
+static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask)
+{
+ stq_le_p(&s->womask[addr], mask);
+}
+
+static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val,
+ uint32_t wmask, uint32_t w1cmask)
+{
+ stl_le_p(&s->csr[addr], val);
+ stl_le_p(&s->wmask[addr], wmask);
+ stl_le_p(&s->w1cmask[addr], w1cmask);
+}
+
+static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask)
+{
+ stl_le_p(&s->womask[addr], mask);
+}
+
+/* "External" get/set operations */
+static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val)
+{
+ uint64_t oldval = ldq_le_p(&s->csr[addr]);
+ uint64_t wmask = ldq_le_p(&s->wmask[addr]);
+ uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
+ stq_le_p(&s->csr[addr],
+ ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
+}
+
+static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val)
+{
+ uint32_t oldval = ldl_le_p(&s->csr[addr]);
+ uint32_t wmask = ldl_le_p(&s->wmask[addr]);
+ uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
+ stl_le_p(&s->csr[addr],
+ ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
+}
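+
+/*
+ * Worked example of the masked update above (illustrative values): with
+ * oldval = 0xf0, wmask = 0x0f, w1cmask = 0x80 and a guest write of
+ * val = 0x8a, the stored value becomes
+ *   ((0xf0 & ~0x0f) | (0x8a & 0x0f)) & ~(0x80 & 0x8a)
+ *   = (0xf0 | 0x0a) & ~0x80
+ *   = 0x7a
+ * i.e. read-only bits keep their old value, writable bits take the new
+ * value, and a write-1-to-clear bit is cleared when 1 is written to it.
+ */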
+
+static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr)
+{
+ uint64_t val = ldq_le_p(&s->csr[addr]);
+ uint64_t womask = ldq_le_p(&s->womask[addr]);
+ return val & ~womask;
+}
+
+static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr)
+{
+ uint32_t val = ldl_le_p(&s->csr[addr]);
+ uint32_t womask = ldl_le_p(&s->womask[addr]);
+ return val & ~womask;
+}
+
+/* "Internal" get/set operations */
+static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr)
+{
+ return ldq_le_p(&s->csr[addr]);
+}
+
+static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr)
+{
+ return ldl_le_p(&s->csr[addr]);
+}
+
+static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val)
+{
+ stq_le_p(&s->csr[addr], val);
+}
+
+static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
+ uint32_t clear, uint32_t mask)
+{
+ uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask;
+ stl_le_p(&s->csr[addr], new_val);
+ return new_val;
+}
+
+static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
+ uint64_t clear, uint64_t mask)
+{
+ uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask;
+ stq_le_p(&s->csr[addr], new_val);
+ return new_val;
+}
+
+static inline void vtd_iommu_lock(IntelIOMMUState *s)
+{
+ qemu_mutex_lock(&s->iommu_lock);
+}
+
+static inline void vtd_iommu_unlock(IntelIOMMUState *s)
+{
+ qemu_mutex_unlock(&s->iommu_lock);
+}
+
+static void vtd_update_scalable_state(IntelIOMMUState *s)
+{
+ uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
+
+ if (s->scalable_mode) {
+ s->root_scalable = val & VTD_RTADDR_SMT;
+ }
+}
+
+/* Whether the address space needs to notify new mappings */
+static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
+{
+ return as->notifier_flags & IOMMU_NOTIFIER_MAP;
+}
+
+/* GHashTable functions */
+static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
+{
+ return *((const uint64_t *)v1) == *((const uint64_t *)v2);
+}
+
+static guint vtd_uint64_hash(gconstpointer v)
+{
+ return (guint)*(const uint64_t *)v;
+}
+
+static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
+ gpointer user_data)
+{
+ VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
+ uint16_t domain_id = *(uint16_t *)user_data;
+ return entry->domain_id == domain_id;
+}
+
+/* The shift of an addr for a certain level of paging structure */
+static inline uint32_t vtd_slpt_level_shift(uint32_t level)
+{
+ assert(level != 0);
+ return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
+}
+
+static inline uint64_t vtd_slpt_level_page_mask(uint32_t level)
+{
+ return ~((1ULL << vtd_slpt_level_shift(level)) - 1);
+}
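+
+/*
+ * With 4KiB base pages and 9 index bits per level (VTD_PAGE_SHIFT_4K = 12,
+ * VTD_SL_LEVEL_BITS = 9) this gives, for example:
+ *   level 1 (4K pages) -> shift 12, page mask ~0xfffULL
+ *   level 2 (2M pages) -> shift 21, page mask ~0x1fffffULL
+ *   level 3 (1G pages) -> shift 30, page mask ~0x3fffffffULL
+ */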
+
+static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
+ gpointer user_data)
+{
+ VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
+ VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
+ uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
+ uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
+ return (entry->domain_id == info->domain_id) &&
+ (((entry->gfn & info->mask) == gfn) ||
+ (entry->gfn == gfn_tlb));
+}
+
+/* Reset the context-cache generation of all VTDAddressSpaces to zero and set
+ * the generation of IntelIOMMUState to 1. Must be called with IOMMU lock held.
+ */
+static void vtd_reset_context_cache_locked(IntelIOMMUState *s)
+{
+ VTDAddressSpace *vtd_as;
+ VTDBus *vtd_bus;
+ GHashTableIter bus_it;
+ uint32_t devfn_it;
+
+ trace_vtd_context_cache_reset();
+
+ g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr);
+
+ while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) {
+ for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
+ vtd_as = vtd_bus->dev_as[devfn_it];
+ if (!vtd_as) {
+ continue;
+ }
+ vtd_as->context_cache_entry.context_cache_gen = 0;
+ }
+ }
+ s->context_cache_gen = 1;
+}
+
+/* Must be called with IOMMU lock held. */
+static void vtd_reset_iotlb_locked(IntelIOMMUState *s)
+{
+ assert(s->iotlb);
+ g_hash_table_remove_all(s->iotlb);
+}
+
+static void vtd_reset_iotlb(IntelIOMMUState *s)
+{
+ vtd_iommu_lock(s);
+ vtd_reset_iotlb_locked(s);
+ vtd_iommu_unlock(s);
+}
+
+static void vtd_reset_caches(IntelIOMMUState *s)
+{
+ vtd_iommu_lock(s);
+ vtd_reset_iotlb_locked(s);
+ vtd_reset_context_cache_locked(s);
+ vtd_iommu_unlock(s);
+}
+
+static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
+ uint32_t level)
+{
+ return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) |
+ ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT);
+}
+
+static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
+{
+ return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
+}
+
+/* Must be called with IOMMU lock held */
+static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
+ hwaddr addr)
+{
+ VTDIOTLBEntry *entry;
+ uint64_t key;
+ int level;
+
+ for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
+ key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level),
+ source_id, level);
+ entry = g_hash_table_lookup(s->iotlb, &key);
+ if (entry) {
+ goto out;
+ }
+ }
+
+out:
+ return entry;
+}
+
+/* Must be called with IOMMU lock held */
+static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
+ uint16_t domain_id, hwaddr addr, uint64_t slpte,
+ uint8_t access_flags, uint32_t level)
+{
+ VTDIOTLBEntry *entry = g_malloc(sizeof(*entry));
+ uint64_t *key = g_malloc(sizeof(*key));
+ uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
+
+ trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
+ if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
+ trace_vtd_iotlb_reset("iotlb exceeds size limit");
+ vtd_reset_iotlb_locked(s);
+ }
+
+ entry->gfn = gfn;
+ entry->domain_id = domain_id;
+ entry->slpte = slpte;
+ entry->access_flags = access_flags;
+ entry->mask = vtd_slpt_level_page_mask(level);
+ *key = vtd_get_iotlb_key(gfn, source_id, level);
+ g_hash_table_replace(s->iotlb, key, entry);
+}
+
+/* Given the register addresses of both the MSI message data and address,
+ * generate an interrupt via MSI.
+ */
+static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg,
+ hwaddr mesg_data_reg)
+{
+ MSIMessage msi;
+
+ assert(mesg_data_reg < DMAR_REG_SIZE);
+ assert(mesg_addr_reg < DMAR_REG_SIZE);
+
+ msi.address = vtd_get_long_raw(s, mesg_addr_reg);
+ msi.data = vtd_get_long_raw(s, mesg_data_reg);
+
+ trace_vtd_irq_generate(msi.address, msi.data);
+
+ apic_get_class()->send_msi(&msi);
+}
+
+/* Generate a fault event to software via MSI if conditions are met.
+ * Notice that the value of FSTS_REG being passed to it should be the one
+ * before any update.
+ */
+static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts)
+{
+ if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO ||
+ pre_fsts & VTD_FSTS_IQE) {
+ error_report_once("There are previous interrupt conditions "
+ "to be serviced by software, fault event "
+ "is not generated");
+ return;
+ }
+ vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP);
+ if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) {
+ error_report_once("Interrupt Mask set, irq is not generated");
+ } else {
+ vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
+ vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
+ }
+}
+
+/* Check if the Fault (F) field of the Fault Recording Register referenced by
+ * @index is Set.
+ */
+static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index)
+{
+ /* Each reg is 128-bit */
+ hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
+ addr += 8; /* Access the high 64-bit half */
+
+ assert(index < DMAR_FRCD_REG_NR);
+
+ return vtd_get_quad_raw(s, addr) & VTD_FRCD_F;
+}
+
+/* Update the PPF field of the Fault Status Register.
+ * Should be called whenever the F field of any fault recording register
+ * is changed.
+ */
+static void vtd_update_fsts_ppf(IntelIOMMUState *s)
+{
+ uint32_t i;
+ uint32_t ppf_mask = 0;
+
+ for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
+ if (vtd_is_frcd_set(s, i)) {
+ ppf_mask = VTD_FSTS_PPF;
+ break;
+ }
+ }
+ vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask);
+ trace_vtd_fsts_ppf(!!ppf_mask);
+}
+
+static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index)
+{
+ /* Each reg is 128-bit */
+ hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
+ addr += 8; /* Access the high 64-bit half */
+
+ assert(index < DMAR_FRCD_REG_NR);
+
+ vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F);
+ vtd_update_fsts_ppf(s);
+}
+
+/* Must not update F field now, should be done later */
+static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
+ uint16_t source_id, hwaddr addr,
+ VTDFaultReason fault, bool is_write)
+{
+ uint64_t hi = 0, lo;
+ hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
+
+ assert(index < DMAR_FRCD_REG_NR);
+
+ lo = VTD_FRCD_FI(addr);
+ hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
+ if (!is_write) {
+ hi |= VTD_FRCD_T;
+ }
+ vtd_set_quad_raw(s, frcd_reg_addr, lo);
+ vtd_set_quad_raw(s, frcd_reg_addr + 8, hi);
+
+ trace_vtd_frr_new(index, hi, lo);
+}
+
+/* Try to collapse multiple pending faults from the same requester */
+static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id)
+{
+ uint32_t i;
+ uint64_t frcd_reg;
+ hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */
+
+ for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
+ frcd_reg = vtd_get_quad_raw(s, addr);
+ if ((frcd_reg & VTD_FRCD_F) &&
+ ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) {
+ return true;
+ }
+ addr += 16; /* 128-bit for each */
+ }
+ return false;
+}
+
+/* Log and report a DMAR (address translation) fault to software */
+static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
+ hwaddr addr, VTDFaultReason fault,
+ bool is_write)
+{
+ uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
+
+ assert(fault < VTD_FR_MAX);
+
+ if (fault == VTD_FR_RESERVED_ERR) {
+ /* This is not a normal fault reason case. Drop it. */
+ return;
+ }
+
+ trace_vtd_dmar_fault(source_id, fault, addr, is_write);
+
+ if (fsts_reg & VTD_FSTS_PFO) {
+ error_report_once("New fault is not recorded due to "
+ "Primary Fault Overflow");
+ return;
+ }
+
+ if (vtd_try_collapse_fault(s, source_id)) {
+ error_report_once("New fault is not recorded due to "
+ "compression of faults");
+ return;
+ }
+
+ if (vtd_is_frcd_set(s, s->next_frcd_reg)) {
+ error_report_once("Next Fault Recording Reg is used, "
+ "new fault is not recorded, set PFO field");
+ vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO);
+ return;
+ }
+
+ vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);
+
+ if (fsts_reg & VTD_FSTS_PPF) {
+ error_report_once("There are pending faults already, "
+ "fault event is not generated");
+ vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg);
+ s->next_frcd_reg++;
+ if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
+ s->next_frcd_reg = 0;
+ }
+ } else {
+ vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK,
+ VTD_FSTS_FRI(s->next_frcd_reg));
+ vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */
+ s->next_frcd_reg++;
+ if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
+ s->next_frcd_reg = 0;
+ }
+        /* This case actually causes the PPF to be Set.
+         * So generate a fault event (interrupt).
+         */
+ vtd_generate_fault_event(s, fsts_reg);
+ }
+}
+
+/* Handle Invalidation Queue Errors (IQE) reported by the queued
+ * invalidation interface.
+ */
+static void vtd_handle_inv_queue_error(IntelIOMMUState *s)
+{
+ uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
+
+ vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE);
+ vtd_generate_fault_event(s, fsts_reg);
+}
+
+/* Set the IWC field and try to generate an invalidation completion interrupt */
+static void vtd_generate_completion_event(IntelIOMMUState *s)
+{
+ if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) {
+ trace_vtd_inv_desc_wait_irq("One pending, skip current");
+ return;
+ }
+ vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC);
+ vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP);
+ if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) {
+ trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, "
+ "new event not generated");
+ return;
+ } else {
+ /* Generate the interrupt event */
+ trace_vtd_inv_desc_wait_irq("Generating complete event");
+ vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
+ vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
+ }
+}
+
+static inline bool vtd_root_entry_present(IntelIOMMUState *s,
+ VTDRootEntry *re,
+ uint8_t devfn)
+{
+ if (s->root_scalable && devfn > UINT8_MAX / 2) {
+ return re->hi & VTD_ROOT_ENTRY_P;
+ }
+
+ return re->lo & VTD_ROOT_ENTRY_P;
+}
+
+static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index,
+ VTDRootEntry *re)
+{
+ dma_addr_t addr;
+
+ addr = s->root + index * sizeof(*re);
+ if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) {
+ re->lo = 0;
+ return -VTD_FR_ROOT_TABLE_INV;
+ }
+ re->lo = le64_to_cpu(re->lo);
+ re->hi = le64_to_cpu(re->hi);
+ return 0;
+}
+
+static inline bool vtd_ce_present(VTDContextEntry *context)
+{
+ return context->lo & VTD_CONTEXT_ENTRY_P;
+}
+
+static int vtd_get_context_entry_from_root(IntelIOMMUState *s,
+ VTDRootEntry *re,
+ uint8_t index,
+ VTDContextEntry *ce)
+{
+ dma_addr_t addr, ce_size;
+
+ /* we have checked that root entry is present */
+ ce_size = s->root_scalable ? VTD_CTX_ENTRY_SCALABLE_SIZE :
+ VTD_CTX_ENTRY_LEGACY_SIZE;
+
+ if (s->root_scalable && index > UINT8_MAX / 2) {
+ index = index & (~VTD_DEVFN_CHECK_MASK);
+ addr = re->hi & VTD_ROOT_ENTRY_CTP;
+ } else {
+ addr = re->lo & VTD_ROOT_ENTRY_CTP;
+ }
+
+ addr = addr + index * ce_size;
+ if (dma_memory_read(&address_space_memory, addr, ce, ce_size)) {
+ return -VTD_FR_CONTEXT_TABLE_INV;
+ }
+
+ ce->lo = le64_to_cpu(ce->lo);
+ ce->hi = le64_to_cpu(ce->hi);
+ if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) {
+ ce->val[2] = le64_to_cpu(ce->val[2]);
+ ce->val[3] = le64_to_cpu(ce->val[3]);
+ }
+ return 0;
+}
+
+static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
+{
+ return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
+}
+
+static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
+{
+ return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);
+}
+
+/* Whether the pte indicates the address of the page frame */
+static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level)
+{
+ return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK);
+}
+
+/* Get the content of a spte located in @base_addr[@index] */
+static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index)
+{
+ uint64_t slpte;
+
+ assert(index < VTD_SL_PT_ENTRY_NR);
+
+ if (dma_memory_read(&address_space_memory,
+ base_addr + index * sizeof(slpte), &slpte,
+ sizeof(slpte))) {
+ slpte = (uint64_t)-1;
+ return slpte;
+ }
+ slpte = le64_to_cpu(slpte);
+ return slpte;
+}
+
+/* Given an iova and the level of the paging structure, return the offset
+ * into the table at the current level.
+ */
+static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level)
+{
+ return (iova >> vtd_slpt_level_shift(level)) &
+ ((1ULL << VTD_SL_LEVEL_BITS) - 1);
+}
+
+/* Check Capability Register to see if the @level of page-table is supported */
+static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level)
+{
+ return VTD_CAP_SAGAW_MASK & s->cap &
+ (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
+}
+
+/* Return true if check passed, otherwise false */
+static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
+ VTDPASIDEntry *pe)
+{
+ switch (VTD_PE_GET_TYPE(pe)) {
+ case VTD_SM_PASID_ENTRY_FLT:
+ case VTD_SM_PASID_ENTRY_SLT:
+ case VTD_SM_PASID_ENTRY_NESTED:
+ break;
+ case VTD_SM_PASID_ENTRY_PT:
+ if (!x86_iommu->pt_supported) {
+ return false;
+ }
+ break;
+ default:
+ /* Unknown type */
+ return false;
+ }
+ return true;
+}
+
+static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
+{
+ return pdire->val & 1;
+}
+
+/**
+ * The caller of this function should check the present bit if it wants
+ * to use the pdir entry for anything other than the fpd bit check.
+ */
+static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base,
+ uint32_t pasid,
+ VTDPASIDDirEntry *pdire)
+{
+ uint32_t index;
+ dma_addr_t addr, entry_size;
+
+ index = VTD_PASID_DIR_INDEX(pasid);
+ entry_size = VTD_PASID_DIR_ENTRY_SIZE;
+ addr = pasid_dir_base + index * entry_size;
+ if (dma_memory_read(&address_space_memory, addr, pdire, entry_size)) {
+ return -VTD_FR_PASID_TABLE_INV;
+ }
+
+ return 0;
+}
+
+static inline bool vtd_pe_present(VTDPASIDEntry *pe)
+{
+ return pe->val[0] & VTD_PASID_ENTRY_P;
+}
+
+static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s,
+ uint32_t pasid,
+ dma_addr_t addr,
+ VTDPASIDEntry *pe)
+{
+ uint32_t index;
+ dma_addr_t entry_size;
+ X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+ index = VTD_PASID_TABLE_INDEX(pasid);
+ entry_size = VTD_PASID_ENTRY_SIZE;
+ addr = addr + index * entry_size;
+ if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) {
+ return -VTD_FR_PASID_TABLE_INV;
+ }
+
+ /* Do translation type check */
+ if (!vtd_pe_type_check(x86_iommu, pe)) {
+ return -VTD_FR_PASID_TABLE_INV;
+ }
+
+ if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
+ return -VTD_FR_PASID_TABLE_INV;
+ }
+
+ return 0;
+}
+
+/**
+ * The caller of this function should check the present bit if it wants
+ * to use the pasid entry for anything other than the fpd bit check.
+ */
+static int vtd_get_pe_from_pdire(IntelIOMMUState *s,
+ uint32_t pasid,
+ VTDPASIDDirEntry *pdire,
+ VTDPASIDEntry *pe)
+{
+ dma_addr_t addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK;
+
+ return vtd_get_pe_in_pasid_leaf_table(s, pasid, addr, pe);
+}
+
+/**
+ * This function gets a pasid entry from a specified pasid table
+ * (including both the directory and the leaf table) for a specified
+ * pasid. Sanity checks ensure that only a present pasid entry is
+ * returned to the caller.
+ */
+static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s,
+ dma_addr_t pasid_dir_base,
+ uint32_t pasid,
+ VTDPASIDEntry *pe)
+{
+ int ret;
+ VTDPASIDDirEntry pdire;
+
+ ret = vtd_get_pdire_from_pdir_table(pasid_dir_base,
+ pasid, &pdire);
+ if (ret) {
+ return ret;
+ }
+
+ if (!vtd_pdire_present(&pdire)) {
+ return -VTD_FR_PASID_TABLE_INV;
+ }
+
+ ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe);
+ if (ret) {
+ return ret;
+ }
+
+ if (!vtd_pe_present(pe)) {
+ return -VTD_FR_PASID_TABLE_INV;
+ }
+
+ return 0;
+}
+
+static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s,
+ VTDContextEntry *ce,
+ VTDPASIDEntry *pe)
+{
+ uint32_t pasid;
+ dma_addr_t pasid_dir_base;
+ int ret = 0;
+
+ pasid = VTD_CE_GET_RID2PASID(ce);
+ pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
+ ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe);
+
+ return ret;
+}
+
+static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
+ VTDContextEntry *ce,
+ bool *pe_fpd_set)
+{
+ int ret;
+ uint32_t pasid;
+ dma_addr_t pasid_dir_base;
+ VTDPASIDDirEntry pdire;
+ VTDPASIDEntry pe;
+
+ pasid = VTD_CE_GET_RID2PASID(ce);
+ pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
+
+ /*
+ * No present bit check since fpd is meaningful even
+ * if the present bit is clear.
+ */
+ ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, pasid, &pdire);
+ if (ret) {
+ return ret;
+ }
+
+ if (pdire.val & VTD_PASID_DIR_FPD) {
+ *pe_fpd_set = true;
+ return 0;
+ }
+
+ if (!vtd_pdire_present(&pdire)) {
+ return -VTD_FR_PASID_TABLE_INV;
+ }
+
+ /*
+ * No present bit check since fpd is meaningful even
+ * if the present bit is clear.
+ */
+ ret = vtd_get_pe_from_pdire(s, pasid, &pdire, &pe);
+ if (ret) {
+ return ret;
+ }
+
+ if (pe.val[0] & VTD_PASID_ENTRY_FPD) {
+ *pe_fpd_set = true;
+ }
+
+ return 0;
+}
+
+/* Get the page-table level that hardware should use for the second-level
+ * page-table walk from the Address Width field of context-entry.
+ */
+static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce)
+{
+ return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW);
+}
+
+static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
+ VTDContextEntry *ce)
+{
+ VTDPASIDEntry pe;
+
+ if (s->root_scalable) {
+ vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+ return VTD_PE_GET_LEVEL(&pe);
+ }
+
+ return vtd_ce_get_level(ce);
+}
+
+static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce)
+{
+ return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
+}
+
+static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s,
+ VTDContextEntry *ce)
+{
+ VTDPASIDEntry pe;
+
+ if (s->root_scalable) {
+ vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+ return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9;
+ }
+
+ return vtd_ce_get_agaw(ce);
+}
+
+static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce)
+{
+ return ce->lo & VTD_CONTEXT_ENTRY_TT;
+}
+
+/* Only for Legacy Mode. Return true if check passed, otherwise false */
+static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
+ VTDContextEntry *ce)
+{
+ switch (vtd_ce_get_type(ce)) {
+ case VTD_CONTEXT_TT_MULTI_LEVEL:
+ /* Always supported */
+ break;
+ case VTD_CONTEXT_TT_DEV_IOTLB:
+ if (!x86_iommu->dt_supported) {
+ error_report_once("%s: DT specified but not supported", __func__);
+ return false;
+ }
+ break;
+ case VTD_CONTEXT_TT_PASS_THROUGH:
+ if (!x86_iommu->pt_supported) {
+ error_report_once("%s: PT specified but not supported", __func__);
+ return false;
+ }
+ break;
+ default:
+ /* Unknown type */
+ error_report_once("%s: unknown ce type: %"PRIu32, __func__,
+ vtd_ce_get_type(ce));
+ return false;
+ }
+ return true;
+}
+
+static inline uint64_t vtd_iova_limit(IntelIOMMUState *s,
+ VTDContextEntry *ce, uint8_t aw)
+{
+ uint32_t ce_agaw = vtd_get_iova_agaw(s, ce);
+ return 1ULL << MIN(ce_agaw, aw);
+}
+
+/* Return true if IOVA passes range check, otherwise false. */
+static inline bool vtd_iova_range_check(IntelIOMMUState *s,
+ uint64_t iova, VTDContextEntry *ce,
+ uint8_t aw)
+{
+ /*
+ * Check if @iova is above 2^X-1, where X is the minimum of MGAW
+ * in CAP_REG and AW in context-entry.
+ */
+ return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1));
+}
+
+static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
+ VTDContextEntry *ce)
+{
+ VTDPASIDEntry pe;
+
+ if (s->root_scalable) {
+ vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+ return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
+ }
+
+ return vtd_ce_get_slpt_base(ce);
+}
+
+/*
+ * Rsvd field masks for spte:
+ * vtd_spte_rsvd 4k pages
+ * vtd_spte_rsvd_large large pages
+ */
+static uint64_t vtd_spte_rsvd[5];
+static uint64_t vtd_spte_rsvd_large[5];
+
+static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
+{
+ uint64_t rsvd_mask = vtd_spte_rsvd[level];
+
+ if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) &&
+ (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) {
+ /* large page */
+ rsvd_mask = vtd_spte_rsvd_large[level];
+ }
+
+ return slpte & rsvd_mask;
+}
+
+/* Find the VTD address space associated with a given bus number */
+static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
+{
+ VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
+ GHashTableIter iter;
+
+ if (vtd_bus) {
+ return vtd_bus;
+ }
+
+ /*
+ * Iterate over the registered buses to find the one which
+ * currently holds this bus number and update the bus_num
+ * lookup table.
+ */
+ g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
+ while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
+ if (pci_bus_num(vtd_bus->bus) == bus_num) {
+ s->vtd_as_by_bus_num[bus_num] = vtd_bus;
+ return vtd_bus;
+ }
+ }
+
+ return NULL;
+}
+
+/* Given the @iova, get the relevant @slptep. @slpte_level will be the last
+ * level of the translation, which can be used to decide the size of a
+ * large page.
+ */
+static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce,
+ uint64_t iova, bool is_write,
+ uint64_t *slptep, uint32_t *slpte_level,
+ bool *reads, bool *writes, uint8_t aw_bits)
+{
+ dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
+ uint32_t level = vtd_get_iova_level(s, ce);
+ uint32_t offset;
+ uint64_t slpte;
+ uint64_t access_right_check;
+
+ if (!vtd_iova_range_check(s, iova, ce, aw_bits)) {
+ error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")",
+ __func__, iova);
+ return -VTD_FR_ADDR_BEYOND_MGAW;
+ }
+
+ /* FIXME: what is the Atomics request here? */
+ access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
+
+ while (true) {
+ offset = vtd_iova_level_offset(iova, level);
+ slpte = vtd_get_slpte(addr, offset);
+
+ if (slpte == (uint64_t)-1) {
+ error_report_once("%s: detected read error on DMAR slpte "
+ "(iova=0x%" PRIx64 ")", __func__, iova);
+ if (level == vtd_get_iova_level(s, ce)) {
+ /* Invalid programming of context-entry */
+ return -VTD_FR_CONTEXT_ENTRY_INV;
+ } else {
+ return -VTD_FR_PAGING_ENTRY_INV;
+ }
+ }
+ *reads = (*reads) && (slpte & VTD_SL_R);
+ *writes = (*writes) && (slpte & VTD_SL_W);
+ if (!(slpte & access_right_check)) {
+ error_report_once("%s: detected slpte permission error "
+ "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
+ "slpte=0x%" PRIx64 ", write=%d)", __func__,
+ iova, level, slpte, is_write);
+ return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
+ }
+ if (vtd_slpte_nonzero_rsvd(slpte, level)) {
+            error_report_once("%s: detected slpte reserved non-zero "
+                              "(iova=0x%" PRIx64 ", level=0x%" PRIx32
+                              ", slpte=0x%" PRIx64 ")", __func__, iova,
+ level, slpte);
+ return -VTD_FR_PAGING_ENTRY_RSVD;
+ }
+
+ if (vtd_is_last_slpte(slpte, level)) {
+ *slptep = slpte;
+ *slpte_level = level;
+ return 0;
+ }
+ addr = vtd_get_slpte_addr(slpte, aw_bits);
+ level--;
+ }
+}
+
+typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private);
+
+/**
+ * Constant information used during page walking
+ *
+ * @hook_fn: hook func to be called when a page is detected
+ * @private: private data to be passed into hook func
+ * @notify_unmap: whether we should notify invalid entries
+ * @as: VT-d address space of the device
+ * @aw: maximum address width
+ * @domain_id: domain ID of the page walk
+ */
+typedef struct {
+ VTDAddressSpace *as;
+ vtd_page_walk_hook hook_fn;
+ void *private;
+ bool notify_unmap;
+ uint8_t aw;
+ uint16_t domain_id;
+} vtd_page_walk_info;
+
+static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info)
+{
+ VTDAddressSpace *as = info->as;
+ vtd_page_walk_hook hook_fn = info->hook_fn;
+ void *private = info->private;
+ IOMMUTLBEntry *entry = &event->entry;
+ DMAMap target = {
+ .iova = entry->iova,
+ .size = entry->addr_mask,
+ .translated_addr = entry->translated_addr,
+ .perm = entry->perm,
+ };
+ const DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
+
+ if (event->type == IOMMU_NOTIFIER_UNMAP && !info->notify_unmap) {
+ trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+ return 0;
+ }
+
+ assert(hook_fn);
+
+ /* Update local IOVA mapped ranges */
+ if (event->type == IOMMU_NOTIFIER_MAP) {
+ if (mapped) {
+ /* If it's exactly the same translation, skip */
+ if (!memcmp(mapped, &target, sizeof(target))) {
+ trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
+ entry->translated_addr);
+ return 0;
+ } else {
+ /*
+                 * Translation changed. Normally this should not
+                 * happen, but it can happen with buggy guest OSes.
+                 * Note that there will be a small window during which
+                 * we have no mapping at all, but that's the best we
+                 * can do. The ideal way to emulate this would be to
+                 * atomically modify the PTE to follow what has
+                 * changed, but we can't. One example is that the vfio
+                 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
+                 * interface to modify a mapping (meanwhile it seems
+                 * meaningless to even provide one). Anyway, let's
+                 * mark this as a TODO in case one day we'll have
+                 * a better solution.
+ */
+ IOMMUAccessFlags cache_perm = entry->perm;
+ int ret;
+
+ /* Emulate an UNMAP */
+ event->type = IOMMU_NOTIFIER_UNMAP;
+ entry->perm = IOMMU_NONE;
+ trace_vtd_page_walk_one(info->domain_id,
+ entry->iova,
+ entry->translated_addr,
+ entry->addr_mask,
+ entry->perm);
+ ret = hook_fn(event, private);
+ if (ret) {
+ return ret;
+ }
+ /* Drop any existing mapping */
+ iova_tree_remove(as->iova_tree, &target);
+ /* Recover the correct type */
+ event->type = IOMMU_NOTIFIER_MAP;
+ entry->perm = cache_perm;
+ }
+ }
+ iova_tree_insert(as->iova_tree, &target);
+ } else {
+ if (!mapped) {
+ /* Skip since we didn't map this range at all */
+ trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+ return 0;
+ }
+ iova_tree_remove(as->iova_tree, &target);
+ }
+
+ trace_vtd_page_walk_one(info->domain_id, entry->iova,
+ entry->translated_addr, entry->addr_mask,
+ entry->perm);
+ return hook_fn(event, private);
+}
+
+/**
+ * vtd_page_walk_level - walk over specific level for IOVA range
+ *
+ * @addr: base GPA addr to start the walk
+ * @start: IOVA range start address
+ * @end: IOVA range end address (start <= addr < end)
+ * @level: paging-structure level to walk at
+ * @read: whether parent level has read permission
+ * @write: whether parent level has write permission
+ * @info: constant information for the page walk
+ */
+static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
+ uint64_t end, uint32_t level, bool read,
+ bool write, vtd_page_walk_info *info)
+{
+ bool read_cur, write_cur, entry_valid;
+ uint32_t offset;
+ uint64_t slpte;
+ uint64_t subpage_size, subpage_mask;
+ IOMMUTLBEvent event;
+ uint64_t iova = start;
+ uint64_t iova_next;
+ int ret = 0;
+
+ trace_vtd_page_walk_level(addr, level, start, end);
+
+ subpage_size = 1ULL << vtd_slpt_level_shift(level);
+ subpage_mask = vtd_slpt_level_page_mask(level);
+
+ while (iova < end) {
+ iova_next = (iova & subpage_mask) + subpage_size;
+
+ offset = vtd_iova_level_offset(iova, level);
+ slpte = vtd_get_slpte(addr, offset);
+
+ if (slpte == (uint64_t)-1) {
+ trace_vtd_page_walk_skip_read(iova, iova_next);
+ goto next;
+ }
+
+ if (vtd_slpte_nonzero_rsvd(slpte, level)) {
+ trace_vtd_page_walk_skip_reserve(iova, iova_next);
+ goto next;
+ }
+
+ /* Permissions are stacked with parents' */
+ read_cur = read && (slpte & VTD_SL_R);
+ write_cur = write && (slpte & VTD_SL_W);
+
+ /*
+ * As long as we have either read/write permission, this is a
+ * valid entry. The rule works for both page entries and page
+ * table entries.
+ */
+ entry_valid = read_cur | write_cur;
+
+ if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
+ /*
+ * This is a valid PDE (or even bigger than PDE). We need
+ * to walk one further level.
+ */
+ ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
+ iova, MIN(iova_next, end), level - 1,
+ read_cur, write_cur, info);
+ } else {
+ /*
+ * This means we are either:
+ *
+ * (1) the real page entry (either 4K page, or huge page)
+ * (2) the whole range is invalid
+ *
+ * In either case, we send an IOTLB notification down.
+ */
+ event.entry.target_as = &address_space_memory;
+ event.entry.iova = iova & subpage_mask;
+ event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
+ event.entry.addr_mask = ~subpage_mask;
+ /* NOTE: this is only meaningful if entry_valid == true */
+ event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
+ event.type = event.entry.perm ? IOMMU_NOTIFIER_MAP :
+ IOMMU_NOTIFIER_UNMAP;
+ ret = vtd_page_walk_one(&event, info);
+ }
+
+ if (ret < 0) {
+ return ret;
+ }
+
+next:
+ iova = iova_next;
+ }
+
+ return 0;
+}
+
+/**
+ * vtd_page_walk - walk specific IOVA range, and call the hook
+ *
+ * @s: intel iommu state
+ * @ce: context entry to walk upon
+ * @start: IOVA address to start the walk
+ * @end: IOVA range end address (start <= addr < end)
+ * @info: page walking information struct
+ */
+static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce,
+ uint64_t start, uint64_t end,
+ vtd_page_walk_info *info)
+{
+ dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
+ uint32_t level = vtd_get_iova_level(s, ce);
+
+ if (!vtd_iova_range_check(s, start, ce, info->aw)) {
+ return -VTD_FR_ADDR_BEYOND_MGAW;
+ }
+
+ if (!vtd_iova_range_check(s, end, ce, info->aw)) {
+ /* Fix end so that it reaches the maximum */
+ end = vtd_iova_limit(s, ce, info->aw);
+ }
+
+ return vtd_page_walk_level(addr, start, end, level, true, true, info);
+}
+
+static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s,
+ VTDRootEntry *re)
+{
+ /* Legacy Mode reserved bits check */
+ if (!s->root_scalable &&
+ (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits))))
+ goto rsvd_err;
+
+ /* Scalable Mode reserved bits check */
+ if (s->root_scalable &&
+ ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) ||
+ (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits))))
+ goto rsvd_err;
+
+ return 0;
+
+rsvd_err:
+ error_report_once("%s: invalid root entry: hi=0x%"PRIx64
+ ", lo=0x%"PRIx64,
+ __func__, re->hi, re->lo);
+ return -VTD_FR_ROOT_ENTRY_RSVD;
+}
+
+static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s,
+ VTDContextEntry *ce)
+{
+ if (!s->root_scalable &&
+ (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI ||
+ ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
+ error_report_once("%s: invalid context entry: hi=%"PRIx64
+ ", lo=%"PRIx64" (reserved nonzero)",
+ __func__, ce->hi, ce->lo);
+ return -VTD_FR_CONTEXT_ENTRY_RSVD;
+ }
+
+ if (s->root_scalable &&
+ (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) ||
+ ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 ||
+ ce->val[2] ||
+ ce->val[3])) {
+ error_report_once("%s: invalid context entry: val[3]=%"PRIx64
+ ", val[2]=%"PRIx64
+ ", val[1]=%"PRIx64
+ ", val[0]=%"PRIx64" (reserved nonzero)",
+ __func__, ce->val[3], ce->val[2],
+ ce->val[1], ce->val[0]);
+ return -VTD_FR_CONTEXT_ENTRY_RSVD;
+ }
+
+ return 0;
+}
+
+static int vtd_ce_rid2pasid_check(IntelIOMMUState *s,
+ VTDContextEntry *ce)
+{
+ VTDPASIDEntry pe;
+
+ /*
+     * Make sure that, in Scalable Mode, a present context entry has a
+     * valid rid2pasid setting: both a valid rid2pasid field and a
+     * correspondingly valid pasid entry.
+ */
+ return vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+}
+
+/* Map a device to its corresponding domain (context-entry) */
+static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
+ uint8_t devfn, VTDContextEntry *ce)
+{
+ VTDRootEntry re;
+ int ret_fr;
+ X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+ ret_fr = vtd_get_root_entry(s, bus_num, &re);
+ if (ret_fr) {
+ return ret_fr;
+ }
+
+ if (!vtd_root_entry_present(s, &re, devfn)) {
+        /* Not an error - it's okay we don't have a root entry. */
+ trace_vtd_re_not_present(bus_num);
+ return -VTD_FR_ROOT_ENTRY_P;
+ }
+
+ ret_fr = vtd_root_entry_rsvd_bits_check(s, &re);
+ if (ret_fr) {
+ return ret_fr;
+ }
+
+ ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce);
+ if (ret_fr) {
+ return ret_fr;
+ }
+
+ if (!vtd_ce_present(ce)) {
+        /* Not an error - it's okay we don't have a context entry. */
+ trace_vtd_ce_not_present(bus_num, devfn);
+ return -VTD_FR_CONTEXT_ENTRY_P;
+ }
+
+ ret_fr = vtd_context_entry_rsvd_bits_check(s, ce);
+ if (ret_fr) {
+ return ret_fr;
+ }
+
+ /* Check if the programming of context-entry is valid */
+ if (!s->root_scalable &&
+ !vtd_is_level_supported(s, vtd_ce_get_level(ce))) {
+ error_report_once("%s: invalid context entry: hi=%"PRIx64
+ ", lo=%"PRIx64" (level %d not supported)",
+ __func__, ce->hi, ce->lo,
+ vtd_ce_get_level(ce));
+ return -VTD_FR_CONTEXT_ENTRY_INV;
+ }
+
+ if (!s->root_scalable) {
+ /* Do translation type check */
+ if (!vtd_ce_type_check(x86_iommu, ce)) {
+ /* Errors dumped in vtd_ce_type_check() */
+ return -VTD_FR_CONTEXT_ENTRY_INV;
+ }
+ } else {
+ /*
+         * Check if the programming of context-entry.rid2pasid
+         * and the corresponding pasid setting is valid, so that
+         * later helper functions do not need to re-check the
+         * result of fetching the pasid entry.
+ */
+ ret_fr = vtd_ce_rid2pasid_check(s, ce);
+ if (ret_fr) {
+ return ret_fr;
+ }
+ }
+
+ return 0;
+}
+
+static int vtd_sync_shadow_page_hook(IOMMUTLBEvent *event,
+ void *private)
+{
+ memory_region_notify_iommu(private, 0, *event);
+ return 0;
+}
+
+static uint16_t vtd_get_domain_id(IntelIOMMUState *s,
+ VTDContextEntry *ce)
+{
+ VTDPASIDEntry pe;
+
+ if (s->root_scalable) {
+ vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+ return VTD_SM_PASID_ENTRY_DID(pe.val[1]);
+ }
+
+ return VTD_CONTEXT_ENTRY_DID(ce->hi);
+}
+
+static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
+ VTDContextEntry *ce,
+ hwaddr addr, hwaddr size)
+{
+ IntelIOMMUState *s = vtd_as->iommu_state;
+ vtd_page_walk_info info = {
+ .hook_fn = vtd_sync_shadow_page_hook,
+ .private = (void *)&vtd_as->iommu,
+ .notify_unmap = true,
+ .aw = s->aw_bits,
+ .as = vtd_as,
+ .domain_id = vtd_get_domain_id(s, ce),
+ };
+
+ return vtd_page_walk(s, ce, addr, addr + size, &info);
+}
+
+static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
+{
+ int ret;
+ VTDContextEntry ce;
+ IOMMUNotifier *n;
+
+ if (!(vtd_as->iommu.iommu_notify_flags & IOMMU_NOTIFIER_IOTLB_EVENTS)) {
+ return 0;
+ }
+
+ ret = vtd_dev_to_context_entry(vtd_as->iommu_state,
+ pci_bus_num(vtd_as->bus),
+ vtd_as->devfn, &ce);
+ if (ret) {
+ if (ret == -VTD_FR_CONTEXT_ENTRY_P) {
+ /*
+ * It's a valid scenario to have a context entry that is
+             * not present. For example, when a device is removed
+             * from an existing domain, the context entry will be
+             * zeroed by the guest before it is put into another
+             * domain. When this happens, instead of synchronizing
+ * the shadow pages we should invalidate all existing
+ * mappings and notify the backends.
+ */
+ IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
+ vtd_address_space_unmap(vtd_as, n);
+ }
+ ret = 0;
+ }
+ return ret;
+ }
+
+ return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX);
+}
+
+/*
+ * Check if a specific device is configured to bypass address
+ * translation for DMA requests. In Scalable Mode, whether
+ * 1st-level or 2nd-level translation is bypassed depends on
+ * the PGTT setting.
+ */
+static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
+{
+ IntelIOMMUState *s;
+ VTDContextEntry ce;
+ VTDPASIDEntry pe;
+ int ret;
+
+ assert(as);
+
+ s = as->iommu_state;
+ ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
+ as->devfn, &ce);
+ if (ret) {
+ /*
+ * Possibly failed to parse the context entry for some reason
+ * (e.g., during init, or any guest configuration errors on
+ * context entries). We should assume PT not enabled for
+ * safety.
+ */
+ return false;
+ }
+
+ if (s->root_scalable) {
+ ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe);
+ if (ret) {
+ error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32,
+ __func__, ret);
+ return false;
+ }
+ return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
+ }
+
+ return (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH);
+}
+
+/* Return whether the device is using IOMMU translation. */
+static bool vtd_switch_address_space(VTDAddressSpace *as)
+{
+ bool use_iommu;
+ /* Whether we need to take the BQL on our own */
+ bool take_bql = !qemu_mutex_iothread_locked();
+
+ assert(as);
+
+ use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as);
+
+ trace_vtd_switch_address_space(pci_bus_num(as->bus),
+ VTD_PCI_SLOT(as->devfn),
+ VTD_PCI_FUNC(as->devfn),
+ use_iommu);
+
+ /*
+     * It's possible that we reach here without the BQL, e.g., when called
+     * from vtd_pt_enable_fast_path(). However the memory APIs need it,
+     * so make sure we already hold it, or take it now.
+ */
+ if (take_bql) {
+ qemu_mutex_lock_iothread();
+ }
+
+    /* Turn one region off first, then turn the other one on */
+ if (use_iommu) {
+ memory_region_set_enabled(&as->nodmar, false);
+ memory_region_set_enabled(MEMORY_REGION(&as->iommu), true);
+ } else {
+ memory_region_set_enabled(MEMORY_REGION(&as->iommu), false);
+ memory_region_set_enabled(&as->nodmar, true);
+ }
+
+ if (take_bql) {
+ qemu_mutex_unlock_iothread();
+ }
+
+ return use_iommu;
+}
+
+static void vtd_switch_address_space_all(IntelIOMMUState *s)
+{
+ GHashTableIter iter;
+ VTDBus *vtd_bus;
+ int i;
+
+ g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
+ while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
+ for (i = 0; i < PCI_DEVFN_MAX; i++) {
+ if (!vtd_bus->dev_as[i]) {
+ continue;
+ }
+ vtd_switch_address_space(vtd_bus->dev_as[i]);
+ }
+ }
+}
+
+static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
+{
+ return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
+}
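+
+/*
+ * e.g. the device at 01:02.3 (bus 0x01, devfn 0x13) gets source_id 0x0113,
+ * matching the PCI requester ID layout bus[15:8] | dev[7:3] | fn[2:0].
+ */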
+
+static const bool vtd_qualified_faults[] = {
+ [VTD_FR_RESERVED] = false,
+ [VTD_FR_ROOT_ENTRY_P] = false,
+ [VTD_FR_CONTEXT_ENTRY_P] = true,
+ [VTD_FR_CONTEXT_ENTRY_INV] = true,
+ [VTD_FR_ADDR_BEYOND_MGAW] = true,
+ [VTD_FR_WRITE] = true,
+ [VTD_FR_READ] = true,
+ [VTD_FR_PAGING_ENTRY_INV] = true,
+ [VTD_FR_ROOT_TABLE_INV] = false,
+ [VTD_FR_CONTEXT_TABLE_INV] = false,
+ [VTD_FR_ROOT_ENTRY_RSVD] = false,
+ [VTD_FR_PAGING_ENTRY_RSVD] = true,
+ [VTD_FR_CONTEXT_ENTRY_TT] = true,
+ [VTD_FR_PASID_TABLE_INV] = false,
+ [VTD_FR_RESERVED_ERR] = false,
+ [VTD_FR_MAX] = false,
+};
+
+/* Check whether a fault condition is "qualified", i.e. reported to software
+ * only if the FPD field in the context-entry used to process the faulting
+ * request is 0.
+ */
+static inline bool vtd_is_qualified_fault(VTDFaultReason fault)
+{
+ return vtd_qualified_faults[fault];
+}
+
+static inline bool vtd_is_interrupt_addr(hwaddr addr)
+{
+ return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
+}
+
+static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
+{
+ VTDBus *vtd_bus;
+ VTDAddressSpace *vtd_as;
+ bool success = false;
+
+ vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
+ if (!vtd_bus) {
+ goto out;
+ }
+
+ vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)];
+ if (!vtd_as) {
+ goto out;
+ }
+
+ if (vtd_switch_address_space(vtd_as) == false) {
+ /* We switched off IOMMU region successfully. */
+ success = true;
+ }
+
+out:
+ trace_vtd_pt_enable_fast_path(source_id, success);
+}
+
+/* Map the device to its context-entry, then do a paging-structures walk to
+ * perform an IOMMU translation.
+ *
+ * Called from RCU critical section.
+ *
+ * @bus_num: The bus number
+ * @devfn: The devfn, which combines the device and function numbers
+ * @is_write: The access is a write operation
+ * @entry: IOMMUTLBEntry that contains the addr to be translated and the result
+ *
+ * Returns true if translation is successful, otherwise false.
+ */
+static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
+ uint8_t devfn, hwaddr addr, bool is_write,
+ IOMMUTLBEntry *entry)
+{
+ IntelIOMMUState *s = vtd_as->iommu_state;
+ VTDContextEntry ce;
+ uint8_t bus_num = pci_bus_num(bus);
+ VTDContextCacheEntry *cc_entry;
+ uint64_t slpte, page_mask;
+ uint32_t level;
+ uint16_t source_id = vtd_make_source_id(bus_num, devfn);
+ int ret_fr;
+ bool is_fpd_set = false;
+ bool reads = true;
+ bool writes = true;
+ uint8_t access_flags;
+ VTDIOTLBEntry *iotlb_entry;
+
+ /*
+     * We have a standalone memory region for interrupt addresses; we
+     * should never receive translation requests in this region.
+ */
+ assert(!vtd_is_interrupt_addr(addr));
+
+ vtd_iommu_lock(s);
+
+ cc_entry = &vtd_as->context_cache_entry;
+
+    /* Try to fetch slpte from the IOTLB */
+ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
+ if (iotlb_entry) {
+ trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
+ iotlb_entry->domain_id);
+ slpte = iotlb_entry->slpte;
+ access_flags = iotlb_entry->access_flags;
+ page_mask = iotlb_entry->mask;
+ goto out;
+ }
+
+ /* Try to fetch context-entry from cache first */
+ if (cc_entry->context_cache_gen == s->context_cache_gen) {
+ trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi,
+ cc_entry->context_entry.lo,
+ cc_entry->context_cache_gen);
+ ce = cc_entry->context_entry;
+ is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
+ if (!is_fpd_set && s->root_scalable) {
+ ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
+ VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
+ }
+ } else {
+ ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+ is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
+ if (!ret_fr && !is_fpd_set && s->root_scalable) {
+ ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
+ }
+ VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
+ /* Update context-cache */
+ trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo,
+ cc_entry->context_cache_gen,
+ s->context_cache_gen);
+ cc_entry->context_entry = ce;
+ cc_entry->context_cache_gen = s->context_cache_gen;
+ }
+
+ /*
+ * We don't need to translate for pass-through context entries.
+ * Also skip IOTLB caching for PT devices.
+ */
+ if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) {
+ entry->iova = addr & VTD_PAGE_MASK_4K;
+ entry->translated_addr = entry->iova;
+ entry->addr_mask = ~VTD_PAGE_MASK_4K;
+ entry->perm = IOMMU_RW;
+ trace_vtd_translate_pt(source_id, entry->iova);
+
+ /*
+ * When this happens, it means that caching-mode is not enabled
+ * and this is the first passthrough translation for
+ * the device. Let's enable the fast path for passthrough.
+ *
+ * When passthrough is disabled again for the device, we can
+ * capture it via the context entry invalidation, then the
+ * IOMMU region can be swapped back.
+ */
+ vtd_pt_enable_fast_path(s, source_id);
+ vtd_iommu_unlock(s);
+ return true;
+ }
+
+ ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level,
+ &reads, &writes, s->aw_bits);
+ VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
+
+ page_mask = vtd_slpt_level_page_mask(level);
+ access_flags = IOMMU_ACCESS_FLAG(reads, writes);
+ vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte,
+ access_flags, level);
+out:
+ vtd_iommu_unlock(s);
+ entry->iova = addr & page_mask;
+ entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
+ entry->addr_mask = ~page_mask;
+ entry->perm = access_flags;
+ return true;
+
+error:
+ vtd_iommu_unlock(s);
+ entry->iova = 0;
+ entry->translated_addr = 0;
+ entry->addr_mask = 0;
+ entry->perm = IOMMU_NONE;
+ return false;
+}
+
+static void vtd_root_table_setup(IntelIOMMUState *s)
+{
+ s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
+ s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
+
+ vtd_update_scalable_state(s);
+
+ trace_vtd_reg_dmar_root(s->root, s->root_scalable);
+}
+
+static void vtd_iec_notify_all(IntelIOMMUState *s, bool global,
+ uint32_t index, uint32_t mask)
+{
+ x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask);
+}
+
+static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
+{
+ uint64_t value = 0;
+ value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
+ s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
+ s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits);
+ s->intr_eime = value & VTD_IRTA_EIME;
+
+ /* Notify global invalidation */
+ vtd_iec_notify_all(s, true, 0, 0);
+
+ trace_vtd_reg_ir_root(s->intr_root, s->intr_size);
+}
+
+static void vtd_iommu_replay_all(IntelIOMMUState *s)
+{
+ VTDAddressSpace *vtd_as;
+
+ QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
+ vtd_sync_shadow_page_table(vtd_as);
+ }
+}
+
+static void vtd_context_global_invalidate(IntelIOMMUState *s)
+{
+ trace_vtd_inv_desc_cc_global();
+ /* Protects context cache */
+ vtd_iommu_lock(s);
+ s->context_cache_gen++;
+ if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
+ vtd_reset_context_cache_locked(s);
+ }
+ vtd_iommu_unlock(s);
+ vtd_address_space_refresh_all(s);
+ /*
+ * From VT-d spec 6.5.2.1, a global context entry invalidation
+ * should be followed by an IOTLB global invalidation, so we should
+ * be safe even without this. However, let's replay the region as
+ * well to be safer, and revisit this when we need finer tuning
+ * of the VT-d emulation code.
+ */
+ vtd_iommu_replay_all(s);
+}
+
+/* Do a context-cache device-selective invalidation.
+ * @func_mask: FM field after shifting
+ */
+static void vtd_context_device_invalidate(IntelIOMMUState *s,
+ uint16_t source_id,
+ uint16_t func_mask)
+{
+ uint16_t mask;
+ VTDBus *vtd_bus;
+ VTDAddressSpace *vtd_as;
+ uint8_t bus_n, devfn;
+ uint16_t devfn_it;
+
+ trace_vtd_inv_desc_cc_devices(source_id, func_mask);
+
+ switch (func_mask & 3) {
+ case 0:
+ mask = 0; /* No bits in the SID field masked */
+ break;
+ case 1:
+ mask = 4; /* Mask bit 2 in the SID field */
+ break;
+ case 2:
+ mask = 6; /* Mask bit 2:1 in the SID field */
+ break;
+ case 3:
+ mask = 7; /* Mask bit 2:0 in the SID field */
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ mask = ~mask;
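+ /* After inversion, 'mask' selects the devfn bits that must match the SID. */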
+
+ bus_n = VTD_SID_TO_BUS(source_id);
+ vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
+ if (vtd_bus) {
+ devfn = VTD_SID_TO_DEVFN(source_id);
+ for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
+ vtd_as = vtd_bus->dev_as[devfn_it];
+ if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
+ trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
+ VTD_PCI_FUNC(devfn_it));
+ vtd_iommu_lock(s);
+ vtd_as->context_cache_entry.context_cache_gen = 0;
+ vtd_iommu_unlock(s);
+ /*
+ * Switch the address space when needed, in case the
+ * device passthrough bit has been toggled.
+ */
+ vtd_switch_address_space(vtd_as);
+ /*
+ * A device is moving out of (or into) a domain, so resync
+ * the shadow page table.
+ * This does no harm even if we have no such notifier
+ * registered - the IOMMU notification framework will skip
+ * MAP notifications in that case.
+ */
+ vtd_sync_shadow_page_table(vtd_as);
+ }
+ }
+ }
+}
+
+/* Context-cache invalidation
+ * Returns the Context Actual Invalidation Granularity.
+ * @val: the content of the CCMD_REG
+ */
+static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val)
+{
+ uint64_t caig;
+ uint64_t type = val & VTD_CCMD_CIRG_MASK;
+
+ switch (type) {
+ case VTD_CCMD_DOMAIN_INVL:
+ /* Fall through */
+ case VTD_CCMD_GLOBAL_INVL:
+ caig = VTD_CCMD_GLOBAL_INVL_A;
+ vtd_context_global_invalidate(s);
+ break;
+
+ case VTD_CCMD_DEVICE_INVL:
+ caig = VTD_CCMD_DEVICE_INVL_A;
+ vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val));
+ break;
+
+ default:
+ error_report_once("%s: invalid context: 0x%" PRIx64,
+ __func__, val);
+ caig = 0;
+ }
+ return caig;
+}
+
+static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
+{
+ trace_vtd_inv_desc_iotlb_global();
+ vtd_reset_iotlb(s);
+ vtd_iommu_replay_all(s);
+}
+
+static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
+{
+ VTDContextEntry ce;
+ VTDAddressSpace *vtd_as;
+
+ trace_vtd_inv_desc_iotlb_domain(domain_id);
+
+ vtd_iommu_lock(s);
+ g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
+ &domain_id);
+ vtd_iommu_unlock(s);
+
+ QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
+ if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+ vtd_as->devfn, &ce) &&
+ domain_id == vtd_get_domain_id(s, &ce)) {
+ vtd_sync_shadow_page_table(vtd_as);
+ }
+ }
+}
+
+static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
+ uint16_t domain_id, hwaddr addr,
+ uint8_t am)
+{
+ VTDAddressSpace *vtd_as;
+ VTDContextEntry ce;
+ int ret;
+ hwaddr size = (1 << am) * VTD_PAGE_SIZE;
+
+ QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
+ ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+ vtd_as->devfn, &ce);
+ if (!ret && domain_id == vtd_get_domain_id(s, &ce)) {
+ if (vtd_as_has_map_notifier(vtd_as)) {
+ /*
+ * As long as we have MAP notifications registered in
+ * any of our IOMMU notifiers, we need to sync the
+ * shadow page table.
+ */
+ vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
+ } else {
+ /*
+ * For UNMAP-only notifiers, we don't need to walk the
+ * page tables. We just deliver the PSI down to
+ * invalidate caches.
+ */
+ IOMMUTLBEvent event = {
+ .type = IOMMU_NOTIFIER_UNMAP,
+ .entry = {
+ .target_as = &address_space_memory,
+ .iova = addr,
+ .translated_addr = 0,
+ .addr_mask = size - 1,
+ .perm = IOMMU_NONE,
+ },
+ };
+ memory_region_notify_iommu(&vtd_as->iommu, 0, event);
+ }
+ }
+ }
+}
+
+static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
+ hwaddr addr, uint8_t am)
+{
+ VTDIOTLBPageInvInfo info;
+
+ trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am);
+
+ assert(am <= VTD_MAMV);
+ info.domain_id = domain_id;
+ info.addr = addr;
+ info.mask = ~((1 << am) - 1);
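+ /*
+ * The mask drops the low 'am' page-number bits, so every page in the
+ * 2^am-page region is matched when removing IOTLB entries below.
+ */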
+ vtd_iommu_lock(s);
+ g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+ vtd_iommu_unlock(s);
+ vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
+}
+
+/* Flush IOTLB
+ * Returns the IOTLB Actual Invalidation Granularity.
+ * @val: the content of the IOTLB_REG
+ */
+static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val)
+{
+ uint64_t iaig;
+ uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK;
+ uint16_t domain_id;
+ hwaddr addr;
+ uint8_t am;
+
+ switch (type) {
+ case VTD_TLB_GLOBAL_FLUSH:
+ iaig = VTD_TLB_GLOBAL_FLUSH_A;
+ vtd_iotlb_global_invalidate(s);
+ break;
+
+ case VTD_TLB_DSI_FLUSH:
+ domain_id = VTD_TLB_DID(val);
+ iaig = VTD_TLB_DSI_FLUSH_A;
+ vtd_iotlb_domain_invalidate(s, domain_id);
+ break;
+
+ case VTD_TLB_PSI_FLUSH:
+ domain_id = VTD_TLB_DID(val);
+ addr = vtd_get_quad_raw(s, DMAR_IVA_REG);
+ am = VTD_IVA_AM(addr);
+ addr = VTD_IVA_ADDR(addr);
+ if (am > VTD_MAMV) {
+ error_report_once("%s: address mask overflow: 0x%" PRIx64,
+ __func__, vtd_get_quad_raw(s, DMAR_IVA_REG));
+ iaig = 0;
+ break;
+ }
+ iaig = VTD_TLB_PSI_FLUSH_A;
+ vtd_iotlb_page_invalidate(s, domain_id, addr, am);
+ break;
+
+ default:
+ error_report_once("%s: invalid granularity: 0x%" PRIx64,
+ __func__, val);
+ iaig = 0;
+ }
+ return iaig;
+}
+
+static void vtd_fetch_inv_desc(IntelIOMMUState *s);
+
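+/*
+ * Sanity check used before clearing QIES: the queue must be drained
+ * (head == tail) and the last descriptor processed must have been an
+ * Invalidation Wait descriptor.
+ */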
+static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s)
+{
+ return s->qi_enabled && (s->iq_tail == s->iq_head) &&
+ (s->iq_last_desc_type == VTD_INV_DESC_WAIT);
+}
+
+static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
+{
+ uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG);
+
+ trace_vtd_inv_qi_enable(en);
+
+ if (en) {
+ s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
+ /* 2^(x+8) entries */
+ s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0));
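+ /* With 256-bit descriptors (iq_dw) the same queue holds half as many entries. */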
+ s->qi_enabled = true;
+ trace_vtd_inv_qi_setup(s->iq, s->iq_size);
+ /* Ok - report back to driver */
+ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES);
+
+ if (s->iq_tail != 0) {
+ /*
+ * This is a spec violation but Windows guests are known to set up
+ * Queued Invalidation this way so we allow the write and process
+ * Invalidation Descriptors right away.
+ */
+ trace_vtd_warn_invalid_qi_tail(s->iq_tail);
+ if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
+ vtd_fetch_inv_desc(s);
+ }
+ }
+ } else {
+ if (vtd_queued_inv_disable_check(s)) {
+ /* disable Queued Invalidation */
+ vtd_set_quad_raw(s, DMAR_IQH_REG, 0);
+ s->iq_head = 0;
+ s->qi_enabled = false;
+ /* Ok - report back to driver */
+ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0);
+ } else {
+ error_report_once("%s: detected improper state when disabling QI "
+ "(head=0x%x, tail=0x%x, last_type=%d)",
+ __func__,
+ s->iq_head, s->iq_tail, s->iq_last_desc_type);
+ }
+ }
+}
+
+/* Set Root Table Pointer */
+static void vtd_handle_gcmd_srtp(IntelIOMMUState *s)
+{
+ vtd_root_table_setup(s);
+ /* Ok - report back to driver */
+ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS);
+ vtd_reset_caches(s);
+ vtd_address_space_refresh_all(s);
+}
+
+/* Set Interrupt Remap Table Pointer */
+static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
+{
+ vtd_interrupt_remap_table_setup(s);
+ /* Ok - report back to driver */
+ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
+}
+
+/* Handle Translation Enable/Disable */
+static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
+{
+ if (s->dmar_enabled == en) {
+ return;
+ }
+
+ trace_vtd_dmar_enable(en);
+
+ if (en) {
+ s->dmar_enabled = true;
+ /* Ok - report back to driver */
+ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
+ } else {
+ s->dmar_enabled = false;
+
+ /* Clear the index of Fault Recording Register */
+ s->next_frcd_reg = 0;
+ /* Ok - report back to driver */
+ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
+ }
+
+ vtd_reset_caches(s);
+ vtd_address_space_refresh_all(s);
+}
+
+/* Handle Interrupt Remap Enable/Disable */
+static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en)
+{
+ trace_vtd_ir_enable(en);
+
+ if (en) {
+ s->intr_enabled = true;
+ /* Ok - report back to driver */
+ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES);
+ } else {
+ s->intr_enabled = false;
+ /* Ok - report back to driver */
+ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0);
+ }
+}
+
+/* Handle write to Global Command Register */
+static void vtd_handle_gcmd_write(IntelIOMMUState *s)
+{
+ uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG);
+ uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG);
+ uint32_t changed = status ^ val;
+
+ trace_vtd_reg_write_gcmd(status, val);
+ if (changed & VTD_GCMD_TE) {
+ /* Translation enable/disable */
+ vtd_handle_gcmd_te(s, val & VTD_GCMD_TE);
+ }
+ if (val & VTD_GCMD_SRTP) {
+ /* Set/update the root-table pointer */
+ vtd_handle_gcmd_srtp(s);
+ }
+ if (changed & VTD_GCMD_QIE) {
+ /* Queued Invalidation Enable */
+ vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE);
+ }
+ if (val & VTD_GCMD_SIRTP) {
+ /* Set/update the interrupt remapping root-table pointer */
+ vtd_handle_gcmd_sirtp(s);
+ }
+ if (changed & VTD_GCMD_IRE) {
+ /* Interrupt remap enable/disable */
+ vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE);
+ }
+}
+
+/* Handle write to Context Command Register */
+static void vtd_handle_ccmd_write(IntelIOMMUState *s)
+{
+ uint64_t ret;
+ uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG);
+
+ /* Context-cache invalidation request */
+ if (val & VTD_CCMD_ICC) {
+ if (s->qi_enabled) {
+ error_report_once("Queued Invalidation enabled, "
+ "should not use register-based invalidation");
+ return;
+ }
+ ret = vtd_context_cache_invalidate(s, val);
+ /* Invalidation completed. Clear the ICC bit and report the actual granularity. */
+ vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL);
+ ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK,
+ ret);
+ }
+}
+
+/* Handle write to IOTLB Invalidation Register */
+static void vtd_handle_iotlb_write(IntelIOMMUState *s)
+{
+ uint64_t ret;
+ uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG);
+
+ /* IOTLB invalidation request */
+ if (val & VTD_TLB_IVT) {
+ if (s->qi_enabled) {
+ error_report_once("Queued Invalidation enabled, "
+ "should not use register-based invalidation");
+ return;
+ }
+ ret = vtd_iotlb_flush(s, val);
+ /* Invalidation completed. Clear the IVT bit and report the actual granularity. */
+ vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL);
+ ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG,
+ VTD_TLB_FLUSH_GRANU_MASK_A, ret);
+ }
+}
+
+/* Fetch an Invalidation Descriptor from the Invalidation Queue */
+static bool vtd_get_inv_desc(IntelIOMMUState *s,
+ VTDInvDesc *inv_desc)
+{
+ dma_addr_t base_addr = s->iq;
+ uint32_t offset = s->iq_head;
+ uint32_t dw = s->iq_dw ? 32 : 16;
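+ /* dw is the descriptor size in bytes: 32 (256-bit) when iq_dw is set, 16 (128-bit) otherwise. */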
+ dma_addr_t addr = base_addr + offset * dw;
+
+ if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) {
+ error_report_once("Read INV DESC failed.");
+ return false;
+ }
+ inv_desc->lo = le64_to_cpu(inv_desc->lo);
+ inv_desc->hi = le64_to_cpu(inv_desc->hi);
+ if (dw == 32) {
+ inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]);
+ inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]);
+ }
+ return true;
+}
+
+static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
+{
+ if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) ||
+ (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) {
+ error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
+ " (reserved nonzero)", __func__, inv_desc->hi,
+ inv_desc->lo);
+ return false;
+ }
+ if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) {
+ /* Status Write */
+ uint32_t status_data = (uint32_t)(inv_desc->lo >>
+ VTD_INV_DESC_WAIT_DATA_SHIFT);
+
+ assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF));
+
+ /* FIXME: need to be masked with HAW? */
+ dma_addr_t status_addr = inv_desc->hi;
+ trace_vtd_inv_desc_wait_sw(status_addr, status_data);
+ status_data = cpu_to_le32(status_data);
+ if (dma_memory_write(&address_space_memory, status_addr, &status_data,
+ sizeof(status_data))) {
+ trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo);
+ return false;
+ }
+ } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) {
+ /* Interrupt flag */
+ vtd_generate_completion_event(s);
+ } else {
+ error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
+ " (unknown type)", __func__, inv_desc->hi,
+ inv_desc->lo);
+ return false;
+ }
+ return true;
+}
+
+static bool vtd_process_context_cache_desc(IntelIOMMUState *s,
+ VTDInvDesc *inv_desc)
+{
+ uint16_t sid, fmask;
+
+ if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) {
+ error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
+ " (reserved nonzero)", __func__, inv_desc->hi,
+ inv_desc->lo);
+ return false;
+ }
+ switch (inv_desc->lo & VTD_INV_DESC_CC_G) {
+ case VTD_INV_DESC_CC_DOMAIN:
+ trace_vtd_inv_desc_cc_domain(
+ (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo));
+ /* Fall through */
+ case VTD_INV_DESC_CC_GLOBAL:
+ vtd_context_global_invalidate(s);
+ break;
+
+ case VTD_INV_DESC_CC_DEVICE:
+ sid = VTD_INV_DESC_CC_SID(inv_desc->lo);
+ fmask = VTD_INV_DESC_CC_FM(inv_desc->lo);
+ vtd_context_device_invalidate(s, sid, fmask);
+ break;
+
+ default:
+ error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
+ " (invalid type)", __func__, inv_desc->hi,
+ inv_desc->lo);
+ return false;
+ }
+ return true;
+}
+
+static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
+{
+ uint16_t domain_id;
+ uint8_t am;
+ hwaddr addr;
+
+ if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) ||
+ (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) {
+ error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
+ ", lo=0x%"PRIx64" (reserved bits nonzero)",
+ __func__, inv_desc->hi, inv_desc->lo);
+ return false;
+ }
+
+ switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) {
+ case VTD_INV_DESC_IOTLB_GLOBAL:
+ vtd_iotlb_global_invalidate(s);
+ break;
+
+ case VTD_INV_DESC_IOTLB_DOMAIN:
+ domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
+ vtd_iotlb_domain_invalidate(s, domain_id);
+ break;
+
+ case VTD_INV_DESC_IOTLB_PAGE:
+ domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
+ addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi);
+ am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi);
+ if (am > VTD_MAMV) {
+ error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
+ ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)",
+ __func__, inv_desc->hi, inv_desc->lo,
+ am, (unsigned)VTD_MAMV);
+ return false;
+ }
+ vtd_iotlb_page_invalidate(s, domain_id, addr, am);
+ break;
+
+ default:
+ error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
+ ", lo=0x%"PRIx64" (type mismatch: 0x%llx)",
+ __func__, inv_desc->hi, inv_desc->lo,
+ inv_desc->lo & VTD_INV_DESC_IOTLB_G);
+ return false;
+ }
+ return true;
+}
+
+static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
+ VTDInvDesc *inv_desc)
+{
+ trace_vtd_inv_desc_iec(inv_desc->iec.granularity,
+ inv_desc->iec.index,
+ inv_desc->iec.index_mask);
+
+ vtd_iec_notify_all(s, !inv_desc->iec.granularity,
+ inv_desc->iec.index,
+ inv_desc->iec.index_mask);
+ return true;
+}
+
+static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
+ VTDInvDesc *inv_desc)
+{
+ VTDAddressSpace *vtd_dev_as;
+ IOMMUTLBEvent event;
+ struct VTDBus *vtd_bus;
+ hwaddr addr;
+ uint64_t sz;
+ uint16_t sid;
+ uint8_t devfn;
+ bool size;
+ uint8_t bus_num;
+
+ addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi);
+ sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo);
+ devfn = sid & 0xff;
+ bus_num = sid >> 8;
+ size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi);
+
+ if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) ||
+ (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) {
+ error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64
+ ", lo=%"PRIx64" (reserved nonzero)", __func__,
+ inv_desc->hi, inv_desc->lo);
+ return false;
+ }
+
+ vtd_bus = vtd_find_as_from_bus_num(s, bus_num);
+ if (!vtd_bus) {
+ goto done;
+ }
+
+ vtd_dev_as = vtd_bus->dev_as[devfn];
+ if (!vtd_dev_as) {
+ goto done;
+ }
+
+ /* According to ATS spec table 2.4:
+ * S = 0, bits 15:12 = xxxx range size: 4K
+ * S = 1, bits 15:12 = xxx0 range size: 8K
+ * S = 1, bits 15:12 = xx01 range size: 16K
+ * S = 1, bits 15:12 = x011 range size: 32K
+ * S = 1, bits 15:12 = 0111 range size: 64K
+ * ...
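+ *
+ * i.e. the size is 8K shifted left by the number of trailing ones in
+ * bits 15:12, which is what the cto64() computation below produces.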
+ */
+ if (size) {
+ sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT);
+ addr &= ~(sz - 1);
+ } else {
+ sz = VTD_PAGE_SIZE;
+ }
+
+ event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP;
+ event.entry.target_as = &vtd_dev_as->as;
+ event.entry.addr_mask = sz - 1;
+ event.entry.iova = addr;
+ event.entry.perm = IOMMU_NONE;
+ event.entry.translated_addr = 0;
+ memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event);
+
+done:
+ return true;
+}
+
+static bool vtd_process_inv_desc(IntelIOMMUState *s)
+{
+ VTDInvDesc inv_desc;
+ uint8_t desc_type;
+
+ trace_vtd_inv_qi_head(s->iq_head);
+ if (!vtd_get_inv_desc(s, &inv_desc)) {
+ s->iq_last_desc_type = VTD_INV_DESC_NONE;
+ return false;
+ }
+
+ desc_type = inv_desc.lo & VTD_INV_DESC_TYPE;
+ /* FIXME: should update at first or at last? */
+ s->iq_last_desc_type = desc_type;
+
+ switch (desc_type) {
+ case VTD_INV_DESC_CC:
+ trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo);
+ if (!vtd_process_context_cache_desc(s, &inv_desc)) {
+ return false;
+ }
+ break;
+
+ case VTD_INV_DESC_IOTLB:
+ trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo);
+ if (!vtd_process_iotlb_desc(s, &inv_desc)) {
+ return false;
+ }
+ break;
+
+ /*
+ * TODO: the bodies of the two cases below will be implemented in a future
+ * series. To make a guest (whose iommu driver integrates the scalable mode
+ * support patch set) work, just returning true is enough for now.
+ */
+ case VTD_INV_DESC_PC:
+ break;
+
+ case VTD_INV_DESC_PIOTLB:
+ break;
+
+ case VTD_INV_DESC_WAIT:
+ trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo);
+ if (!vtd_process_wait_desc(s, &inv_desc)) {
+ return false;
+ }
+ break;
+
+ case VTD_INV_DESC_IEC:
+ trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo);
+ if (!vtd_process_inv_iec_desc(s, &inv_desc)) {
+ return false;
+ }
+ break;
+
+ case VTD_INV_DESC_DEVICE:
+ trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
+ if (!vtd_process_device_iotlb_desc(s, &inv_desc)) {
+ return false;
+ }
+ break;
+
+ default:
+ error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64
+ " (unknown type)", __func__, inv_desc.hi,
+ inv_desc.lo);
+ return false;
+ }
+ s->iq_head++;
+ if (s->iq_head == s->iq_size) {
+ s->iq_head = 0;
+ }
+ return true;
+}
+
+/* Try to fetch and process more Invalidation Descriptors */
+static void vtd_fetch_inv_desc(IntelIOMMUState *s)
+{
+ int qi_shift;
+
+ /* Refer to 10.4.23 of VT-d spec 3.0 */
+ qi_shift = s->iq_dw ? VTD_IQH_QH_SHIFT_5 : VTD_IQH_QH_SHIFT_4;
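+ /*
+ * The shift places the descriptor index into the QH field of IQH_REG:
+ * bit 4 for 128-bit descriptors, bit 5 for 256-bit ones.
+ */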
+
+ trace_vtd_inv_qi_fetch();
+
+ if (s->iq_tail >= s->iq_size) {
+ /* Detected an invalid Tail pointer */
+ error_report_once("%s: detected invalid QI tail "
+ "(tail=0x%x, size=0x%x)",
+ __func__, s->iq_tail, s->iq_size);
+ vtd_handle_inv_queue_error(s);
+ return;
+ }
+ while (s->iq_head != s->iq_tail) {
+ if (!vtd_process_inv_desc(s)) {
+ /* Invalidation Queue Errors */
+ vtd_handle_inv_queue_error(s);
+ break;
+ }
+ /* Must update the IQH_REG in time */
+ vtd_set_quad_raw(s, DMAR_IQH_REG,
+ (((uint64_t)(s->iq_head)) << qi_shift) &
+ VTD_IQH_QH_MASK);
+ }
+}
+
+/* Handle write to Invalidation Queue Tail Register */
+static void vtd_handle_iqt_write(IntelIOMMUState *s)
+{
+ uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG);
+
+ if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) {
+ error_report_once("%s: RSV bit is set: val=0x%"PRIx64,
+ __func__, val);
+ return;
+ }
+ s->iq_tail = VTD_IQT_QT(s->iq_dw, val);
+ trace_vtd_inv_qi_tail(s->iq_tail);
+
+ if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
+ /* Process Invalidation Queue here */
+ vtd_fetch_inv_desc(s);
+ }
+}
+
+static void vtd_handle_fsts_write(IntelIOMMUState *s)
+{
+ uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
+ uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
+ uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE;
+
+ if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) {
+ vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
+ trace_vtd_fsts_clear_ip();
+ }
+ /* FIXME: when IQE is Clear, should we try to fetch some Invalidation
+ * Descriptors if there are any when Queued Invalidation is enabled?
+ */
+}
+
+static void vtd_handle_fectl_write(IntelIOMMUState *s)
+{
+ uint32_t fectl_reg;
+ /* FIXME: when software clears the IM field, check the IP field. But do we
+ * need to compare the old value and the new value to conclude that
+ * software clears the IM field? Or just check if the IM field is zero?
+ */
+ fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
+
+ trace_vtd_reg_write_fectl(fectl_reg);
+
+ if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) {
+ vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
+ vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
+ }
+}
+
+static void vtd_handle_ics_write(IntelIOMMUState *s)
+{
+ uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG);
+ uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
+
+ if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) {
+ trace_vtd_reg_ics_clear_ip();
+ vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
+ }
+}
+
+static void vtd_handle_iectl_write(IntelIOMMUState *s)
+{
+ uint32_t iectl_reg;
+ /* FIXME: when software clears the IM field, check the IP field. But do we
+ * need to compare the old value and the new value to conclude that
+ * software clears the IM field? Or just check if the IM field is zero?
+ */
+ iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
+
+ trace_vtd_reg_write_iectl(iectl_reg);
+
+ if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) {
+ vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
+ vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
+ }
+}
+
+static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
+{
+ IntelIOMMUState *s = opaque;
+ uint64_t val;
+
+ trace_vtd_reg_read(addr, size);
+
+ if (addr + size > DMAR_REG_SIZE) {
+ error_report_once("%s: MMIO over range: addr=0x%" PRIx64
+ " size=0x%x", __func__, addr, size);
+ return (uint64_t)-1;
+ }
+
+ switch (addr) {
+ /* Root Table Address Register, 64-bit */
+ case DMAR_RTADDR_REG:
+ val = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
+ if (size == 4) {
+ val = val & ((1ULL << 32) - 1);
+ }
+ break;
+
+ case DMAR_RTADDR_REG_HI:
+ assert(size == 4);
+ val = vtd_get_quad_raw(s, DMAR_RTADDR_REG) >> 32;
+ break;
+
+ /* Invalidation Queue Address Register, 64-bit */
+ case DMAR_IQA_REG:
+ val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS);
+ if (size == 4) {
+ val = val & ((1ULL << 32) - 1);
+ }
+ break;
+
+ case DMAR_IQA_REG_HI:
+ assert(size == 4);
+ val = s->iq >> 32;
+ break;
+
+ default:
+ if (size == 4) {
+ val = vtd_get_long(s, addr);
+ } else {
+ val = vtd_get_quad(s, addr);
+ }
+ }
+
+ return val;
+}
+
+static void vtd_mem_write(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+ IntelIOMMUState *s = opaque;
+
+ trace_vtd_reg_write(addr, size, val);
+
+ if (addr + size > DMAR_REG_SIZE) {
+ error_report_once("%s: MMIO over range: addr=0x%" PRIx64
+ " size=0x%x", __func__, addr, size);
+ return;
+ }
+
+ switch (addr) {
+ /* Global Command Register, 32-bit */
+ case DMAR_GCMD_REG:
+ vtd_set_long(s, addr, val);
+ vtd_handle_gcmd_write(s);
+ break;
+
+ /* Context Command Register, 64-bit */
+ case DMAR_CCMD_REG:
+ if (size == 4) {
+ vtd_set_long(s, addr, val);
+ } else {
+ vtd_set_quad(s, addr, val);
+ vtd_handle_ccmd_write(s);
+ }
+ break;
+
+ case DMAR_CCMD_REG_HI:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ vtd_handle_ccmd_write(s);
+ break;
+
+ /* IOTLB Invalidation Register, 64-bit */
+ case DMAR_IOTLB_REG:
+ if (size == 4) {
+ vtd_set_long(s, addr, val);
+ } else {
+ vtd_set_quad(s, addr, val);
+ vtd_handle_iotlb_write(s);
+ }
+ break;
+
+ case DMAR_IOTLB_REG_HI:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ vtd_handle_iotlb_write(s);
+ break;
+
+ /* Invalidate Address Register, 64-bit */
+ case DMAR_IVA_REG:
+ if (size == 4) {
+ vtd_set_long(s, addr, val);
+ } else {
+ vtd_set_quad(s, addr, val);
+ }
+ break;
+
+ case DMAR_IVA_REG_HI:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ break;
+
+ /* Fault Status Register, 32-bit */
+ case DMAR_FSTS_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ vtd_handle_fsts_write(s);
+ break;
+
+ /* Fault Event Control Register, 32-bit */
+ case DMAR_FECTL_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ vtd_handle_fectl_write(s);
+ break;
+
+ /* Fault Event Data Register, 32-bit */
+ case DMAR_FEDATA_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ break;
+
+ /* Fault Event Address Register, 32-bit */
+ case DMAR_FEADDR_REG:
+ if (size == 4) {
+ vtd_set_long(s, addr, val);
+ } else {
+ /*
+ * While the register is 32-bit only, some guests (Xen...)
+ * write to it with 64-bit accesses.
+ */
+ vtd_set_quad(s, addr, val);
+ }
+ break;
+
+ /* Fault Event Upper Address Register, 32-bit */
+ case DMAR_FEUADDR_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ break;
+
+ /* Protected Memory Enable Register, 32-bit */
+ case DMAR_PMEN_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ break;
+
+ /* Root Table Address Register, 64-bit */
+ case DMAR_RTADDR_REG:
+ if (size == 4) {
+ vtd_set_long(s, addr, val);
+ } else {
+ vtd_set_quad(s, addr, val);
+ }
+ break;
+
+ case DMAR_RTADDR_REG_HI:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ break;
+
+ /* Invalidation Queue Tail Register, 64-bit */
+ case DMAR_IQT_REG:
+ if (size == 4) {
+ vtd_set_long(s, addr, val);
+ } else {
+ vtd_set_quad(s, addr, val);
+ }
+ vtd_handle_iqt_write(s);
+ break;
+
+ case DMAR_IQT_REG_HI:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ /* Bits 63:19 of IQT_REG are RsvdZ, do nothing here */
+ break;
+
+ /* Invalidation Queue Address Register, 64-bit */
+ case DMAR_IQA_REG:
+ if (size == 4) {
+ vtd_set_long(s, addr, val);
+ } else {
+ vtd_set_quad(s, addr, val);
+ }
+ if (s->ecap & VTD_ECAP_SMTS &&
+ val & VTD_IQA_DW_MASK) {
+ s->iq_dw = true;
+ } else {
+ s->iq_dw = false;
+ }
+ break;
+
+ case DMAR_IQA_REG_HI:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ break;
+
+ /* Invalidation Completion Status Register, 32-bit */
+ case DMAR_ICS_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ vtd_handle_ics_write(s);
+ break;
+
+ /* Invalidation Event Control Register, 32-bit */
+ case DMAR_IECTL_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ vtd_handle_iectl_write(s);
+ break;
+
+ /* Invalidation Event Data Register, 32-bit */
+ case DMAR_IEDATA_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ break;
+
+ /* Invalidation Event Address Register, 32-bit */
+ case DMAR_IEADDR_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ break;
+
+ /* Invalidation Event Upper Address Register, 32-bit */
+ case DMAR_IEUADDR_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ break;
+
+ /* Fault Recording Registers, 128-bit */
+ case DMAR_FRCD_REG_0_0:
+ if (size == 4) {
+ vtd_set_long(s, addr, val);
+ } else {
+ vtd_set_quad(s, addr, val);
+ }
+ break;
+
+ case DMAR_FRCD_REG_0_1:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ break;
+
+ case DMAR_FRCD_REG_0_2:
+ if (size == 4) {
+ vtd_set_long(s, addr, val);
+ } else {
+ vtd_set_quad(s, addr, val);
+ /* May clear bit 127 (Fault), update PPF */
+ vtd_update_fsts_ppf(s);
+ }
+ break;
+
+ case DMAR_FRCD_REG_0_3:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ /* May clear bit 127 (Fault), update PPF */
+ vtd_update_fsts_ppf(s);
+ break;
+
+ case DMAR_IRTA_REG:
+ if (size == 4) {
+ vtd_set_long(s, addr, val);
+ } else {
+ vtd_set_quad(s, addr, val);
+ }
+ break;
+
+ case DMAR_IRTA_REG_HI:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ break;
+
+ default:
+ if (size == 4) {
+ vtd_set_long(s, addr, val);
+ } else {
+ vtd_set_quad(s, addr, val);
+ }
+ }
+}
+
+static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
+ IOMMUAccessFlags flag, int iommu_idx)
+{
+ VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
+ IntelIOMMUState *s = vtd_as->iommu_state;
+ IOMMUTLBEntry iotlb = {
+ /* We'll fill in the rest later. */
+ .target_as = &address_space_memory,
+ };
+ bool success;
+
+ if (likely(s->dmar_enabled)) {
+ success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
+ addr, flag & IOMMU_WO, &iotlb);
+ } else {
+ /* DMAR disabled, passthrough, use 4k-page */
+ iotlb.iova = addr & VTD_PAGE_MASK_4K;
+ iotlb.translated_addr = addr & VTD_PAGE_MASK_4K;
+ iotlb.addr_mask = ~VTD_PAGE_MASK_4K;
+ iotlb.perm = IOMMU_RW;
+ success = true;
+ }
+
+ if (likely(success)) {
+ trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus),
+ VTD_PCI_SLOT(vtd_as->devfn),
+ VTD_PCI_FUNC(vtd_as->devfn),
+ iotlb.iova, iotlb.translated_addr,
+ iotlb.addr_mask);
+ } else {
+ error_report_once("%s: detected translation failure "
+ "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")",
+ __func__, pci_bus_num(vtd_as->bus),
+ VTD_PCI_SLOT(vtd_as->devfn),
+ VTD_PCI_FUNC(vtd_as->devfn),
+ addr);
+ }
+
+ return iotlb;
+}
+
+static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
+ IOMMUNotifierFlag old,
+ IOMMUNotifierFlag new,
+ Error **errp)
+{
+ VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
+ IntelIOMMUState *s = vtd_as->iommu_state;
+
+ /* Update per-address-space notifier flags */
+ vtd_as->notifier_flags = new;
+
+ if (old == IOMMU_NOTIFIER_NONE) {
+ QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);
+ } else if (new == IOMMU_NOTIFIER_NONE) {
+ QLIST_REMOVE(vtd_as, next);
+ }
+ return 0;
+}
+
+static int vtd_post_load(void *opaque, int version_id)
+{
+ IntelIOMMUState *iommu = opaque;
+
+ /*
+ * Memory regions are dynamically turned on/off depending on
+ * context entry configurations from the guest. After migration,
+ * we need to make sure the memory regions are still correct.
+ */
+ vtd_switch_address_space_all(iommu);
+
+ /*
+ * We don't need to migrate the root_scalable because we can
+ * simply do the calculation after the loading is complete. We
+ * can actually do similar things with root, dmar_enabled, etc.;
+ * however, since we already have them, we'd better keep them
+ * for migration compatibility.
+ */
+ vtd_update_scalable_state(iommu);
+
+ return 0;
+}
+
+static const VMStateDescription vtd_vmstate = {
+ .name = "iommu-intel",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .priority = MIG_PRI_IOMMU,
+ .post_load = vtd_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(root, IntelIOMMUState),
+ VMSTATE_UINT64(intr_root, IntelIOMMUState),
+ VMSTATE_UINT64(iq, IntelIOMMUState),
+ VMSTATE_UINT32(intr_size, IntelIOMMUState),
+ VMSTATE_UINT16(iq_head, IntelIOMMUState),
+ VMSTATE_UINT16(iq_tail, IntelIOMMUState),
+ VMSTATE_UINT16(iq_size, IntelIOMMUState),
+ VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState),
+ VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE),
+ VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState),
+ VMSTATE_UNUSED(1), /* bool root_extended is obsolete by VT-d */
+ VMSTATE_BOOL(dmar_enabled, IntelIOMMUState),
+ VMSTATE_BOOL(qi_enabled, IntelIOMMUState),
+ VMSTATE_BOOL(intr_enabled, IntelIOMMUState),
+ VMSTATE_BOOL(intr_eime, IntelIOMMUState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const MemoryRegionOps vtd_mem_ops = {
+ .read = vtd_mem_read,
+ .write = vtd_mem_write,
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ .impl = {
+ .min_access_size = 4,
+ .max_access_size = 8,
+ },
+ .valid = {
+ .min_access_size = 4,
+ .max_access_size = 8,
+ },
+};
+
+static Property vtd_properties[] = {
+ DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
+ DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
+ ON_OFF_AUTO_AUTO),
+ DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
+ DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
+ VTD_HOST_ADDRESS_WIDTH),
+ DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
+ DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
+ DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+/* Read IRTE entry with specific index */
+static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index,
+ VTD_IR_TableEntry *entry, uint16_t sid)
+{
+ static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \
+ {0xffff, 0xfffb, 0xfff9, 0xfff8};
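+ /* SQ-indexed masks: larger SQ values ignore more low function-number bits of the SID. */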
+ dma_addr_t addr = 0x00;
+ uint16_t mask, source_id;
+ uint8_t bus, bus_max, bus_min;
+
+ if (index >= iommu->intr_size) {
+ error_report_once("%s: index too large: ind=0x%x",
+ __func__, index);
+ return -VTD_FR_IR_INDEX_OVER;
+ }
+
+ addr = iommu->intr_root + index * sizeof(*entry);
+ if (dma_memory_read(&address_space_memory, addr, entry,
+ sizeof(*entry))) {
+ error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64,
+ __func__, index, addr);
+ return -VTD_FR_IR_ROOT_INVAL;
+ }
+
+ trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]),
+ le64_to_cpu(entry->data[0]));
+
+ if (!entry->irte.present) {
+ error_report_once("%s: detected non-present IRTE "
+ "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
+ __func__, index, le64_to_cpu(entry->data[1]),
+ le64_to_cpu(entry->data[0]));
+ return -VTD_FR_IR_ENTRY_P;
+ }
+
+ if (entry->irte.__reserved_0 || entry->irte.__reserved_1 ||
+ entry->irte.__reserved_2) {
+ error_report_once("%s: detected non-zero reserved IRTE "
+ "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
+ __func__, index, le64_to_cpu(entry->data[1]),
+ le64_to_cpu(entry->data[0]));
+ return -VTD_FR_IR_IRTE_RSVD;
+ }
+
+ if (sid != X86_IOMMU_SID_INVALID) {
+ /* Validate IRTE SID */
+ source_id = le32_to_cpu(entry->irte.source_id);
+ switch (entry->irte.sid_vtype) {
+ case VTD_SVT_NONE:
+ break;
+
+ case VTD_SVT_ALL:
+ mask = vtd_svt_mask[entry->irte.sid_q];
+ if ((source_id & mask) != (sid & mask)) {
+ error_report_once("%s: invalid IRTE SID "
+ "(index=%u, sid=%u, source_id=%u)",
+ __func__, index, sid, source_id);
+ return -VTD_FR_IR_SID_ERR;
+ }
+ break;
+
+ case VTD_SVT_BUS:
+ bus_max = source_id >> 8;
+ bus_min = source_id & 0xff;
+ bus = sid >> 8;
+ if (bus > bus_max || bus < bus_min) {
+ error_report_once("%s: invalid SVT_BUS "
+ "(index=%u, bus=%u, min=%u, max=%u)",
+ __func__, index, bus, bus_min, bus_max);
+ return -VTD_FR_IR_SID_ERR;
+ }
+ break;
+
+ default:
+ error_report_once("%s: detected invalid IRTE SVT "
+ "(index=%u, type=%d)", __func__,
+ index, entry->irte.sid_vtype);
+ /* Take this as verification failure. */
+ return -VTD_FR_IR_SID_ERR;
+ }
+ }
+
+ return 0;
+}
+
+/* Fetch IRQ information of specific IR index */
+static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index,
+ X86IOMMUIrq *irq, uint16_t sid)
+{
+ VTD_IR_TableEntry irte = {};
+ int ret = 0;
+
+ ret = vtd_irte_get(iommu, index, &irte, sid);
+ if (ret) {
+ return ret;
+ }
+
+ irq->trigger_mode = irte.irte.trigger_mode;
+ irq->vector = irte.irte.vector;
+ irq->delivery_mode = irte.irte.delivery_mode;
+ irq->dest = le32_to_cpu(irte.irte.dest_id);
+ if (!iommu->intr_eime) {
+#define VTD_IR_APIC_DEST_MASK (0xff00ULL)
+#define VTD_IR_APIC_DEST_SHIFT (8)
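+ /*
+ * Without EIM (xAPIC mode) the 8-bit APIC destination ID lives in
+ * bits 15:8 of the IRTE destination field, so extract it here.
+ */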
+ irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >>
+ VTD_IR_APIC_DEST_SHIFT;
+ }
+ irq->dest_mode = irte.irte.dest_mode;
+ irq->redir_hint = irte.irte.redir_hint;
+
+ trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector,
+ irq->delivery_mode, irq->dest, irq->dest_mode);
+
+ return 0;
+}
+
+/* Interrupt remapping for MSI/MSI-X entry */
+static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
+ MSIMessage *origin,
+ MSIMessage *translated,
+ uint16_t sid)
+{
+ int ret = 0;
+ VTD_IR_MSIAddress addr;
+ uint16_t index;
+ X86IOMMUIrq irq = {};
+
+ assert(origin && translated);
+
+ trace_vtd_ir_remap_msi_req(origin->address, origin->data);
+
+ if (!iommu || !iommu->intr_enabled) {
+ memcpy(translated, origin, sizeof(*origin));
+ goto out;
+ }
+
+ if (origin->address & VTD_MSI_ADDR_HI_MASK) {
+ error_report_once("%s: MSI address high 32 bits non-zero detected: "
+ "address=0x%" PRIx64, __func__, origin->address);
+ return -VTD_FR_IR_REQ_RSVD;
+ }
+
+ addr.data = origin->address & VTD_MSI_ADDR_LO_MASK;
+ if (addr.addr.__head != 0xfee) {
+ error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32,
+ __func__, addr.data);
+ return -VTD_FR_IR_REQ_RSVD;
+ }
+
+ /* This is a compatibility format interrupt, pass it through unremapped. */
+ if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) {
+ memcpy(translated, origin, sizeof(*origin));
+ goto out;
+ }
+
+ index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l);
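+ /* The 16-bit interrupt index is split across the MSI address: bit 15 in index_h, bits 14:0 in index_l. */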
+
+#define VTD_IR_MSI_DATA_SUBHANDLE (0x0000ffff)
+#define VTD_IR_MSI_DATA_RESERVED (0xffff0000)
+
+ if (addr.addr.sub_valid) {
+ /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */
+ index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE;
+ }
+
+ ret = vtd_remap_irq_get(iommu, index, &irq, sid);
+ if (ret) {
+ return ret;
+ }
+
+ if (addr.addr.sub_valid) {
+ trace_vtd_ir_remap_type("MSI");
+ if (origin->data & VTD_IR_MSI_DATA_RESERVED) {
+ error_report_once("%s: invalid IR MSI "
+ "(sid=%u, address=0x%" PRIx64
+ ", data=0x%" PRIx32 ")",
+ __func__, sid, origin->address, origin->data);
+ return -VTD_FR_IR_REQ_RSVD;
+ }
+ } else {
+ uint8_t vector = origin->data & 0xff;
+ uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
+
+ trace_vtd_ir_remap_type("IOAPIC");
+ /* IOAPIC entry vector should be aligned with IRTE vector
+ * (see vt-d spec 5.1.5.1). */
+ if (vector != irq.vector) {
+ trace_vtd_warn_ir_vector(sid, index, vector, irq.vector);
+ }
+
+ /* The Trigger Mode field must match the Trigger Mode in the IRTE.
+ * (see vt-d spec 5.1.5.1). */
+ if (trigger_mode != irq.trigger_mode) {
+ trace_vtd_warn_ir_trigger(sid, index, trigger_mode,
+ irq.trigger_mode);
+ }
+ }
+
+ /*
+ * We'd better keep the last two bits, assuming that the guest OS
+ * might modify them. Keeping them does not hurt after all.
+ */
+ irq.msi_addr_last_bits = addr.addr.__not_care;
+
+ /* Translate X86IOMMUIrq to MSI message */
+ x86_iommu_irq_to_msi_message(&irq, translated);
+
+out:
+ trace_vtd_ir_remap_msi(origin->address, origin->data,
+ translated->address, translated->data);
+ return 0;
+}
+
+static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src,
+ MSIMessage *dst, uint16_t sid)
+{
+ return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu),
+ src, dst, sid);
+}
+
+static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr,
+ uint64_t *data, unsigned size,
+ MemTxAttrs attrs)
+{
+ return MEMTX_OK;
+}
+
+static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr,
+ uint64_t value, unsigned size,
+ MemTxAttrs attrs)
+{
+ int ret = 0;
+ MSIMessage from = {}, to = {};
+ uint16_t sid = X86_IOMMU_SID_INVALID;
+
+ from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST;
+ from.data = (uint32_t) value;
+
+ if (!attrs.unspecified) {
+ /* We have explicit Source ID */
+ sid = attrs.requester_id;
+ }
+
+ ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid);
+ if (ret) {
+ /* TODO: report error */
+ /* Drop this interrupt */
+ return MEMTX_ERROR;
+ }
+
+ apic_get_class()->send_msi(&to);
+
+ return MEMTX_OK;
+}
+
+static const MemoryRegionOps vtd_mem_ir_ops = {
+ .read_with_attrs = vtd_mem_ir_read,
+ .write_with_attrs = vtd_mem_ir_write,
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ .impl = {
+ .min_access_size = 4,
+ .max_access_size = 4,
+ },
+ .valid = {
+ .min_access_size = 4,
+ .max_access_size = 4,
+ },
+};
+
+VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
+{
+ uintptr_t key = (uintptr_t)bus;
+ VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key);
+ VTDAddressSpace *vtd_dev_as;
+ char name[128];
+
+ if (!vtd_bus) {
+ uintptr_t *new_key = g_malloc(sizeof(*new_key));
+ *new_key = (uintptr_t)bus;
+ /* No corresponding free() */
+ vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \
+ PCI_DEVFN_MAX);
+ vtd_bus->bus = bus;
+ g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus);
+ }
+
+ vtd_dev_as = vtd_bus->dev_as[devfn];
+
+ if (!vtd_dev_as) {
+ snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn),
+ PCI_FUNC(devfn));
+ vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace));
+
+ vtd_dev_as->bus = bus;
+ vtd_dev_as->devfn = (uint8_t)devfn;
+ vtd_dev_as->iommu_state = s;
+ vtd_dev_as->context_cache_entry.context_cache_gen = 0;
+ vtd_dev_as->iova_tree = iova_tree_new();
+
+ memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX);
+ address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root");
+
+ /*
+ * Build the DMAR-disabled container with aliases to the
+ * shared MRs. Note that aliasing to a shared memory region
+ * could help the memory API to detect identical FlatViews, so
+ * devices can share the same FlatView when DMAR is disabled
+ * (either by not providing "intel_iommu=on" or with
+ * "iommu=pt"). This greatly reduces the total number of
+ * FlatViews in the system, hence the VM runs faster.
+ */
+ memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s),
+ "vtd-nodmar", &s->mr_nodmar, 0,
+ memory_region_size(&s->mr_nodmar));
+
+ /*
+ * Build the per-device DMAR-enabled container.
+ *
+ * TODO: currently we have per-device IOMMU memory region only
+ * because we have per-device IOMMU notifiers for devices. If
+ * one day we can abstract the IOMMU notifiers out of the
+ * memory regions then we can also share the same memory
+ * region here just like what we've done above with the nodmar
+ * region.
+ */
+ strcat(name, "-dmar");
+ memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu),
+ TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s),
+ name, UINT64_MAX);
+ memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir",
+ &s->mr_ir, 0, memory_region_size(&s->mr_ir));
+ memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu),
+ VTD_INTERRUPT_ADDR_FIRST,
+ &vtd_dev_as->iommu_ir, 1);
+
+ /*
+ * Hook both containers under the root container; we switch
+ * between DMAR & noDMAR by enabling/disabling the
+ * corresponding sub-containers.
+ */
+ memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
+ MEMORY_REGION(&vtd_dev_as->iommu),
+ 0);
+ memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
+ &vtd_dev_as->nodmar, 0);
+
+ vtd_switch_address_space(vtd_dev_as);
+ }
+ return vtd_dev_as;
+}
+
+/* Unmap the whole range in the notifier's scope. */
+static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
+{
+ hwaddr size, remain;
+ hwaddr start = n->start;
+ hwaddr end = n->end;
+ IntelIOMMUState *s = as->iommu_state;
+ DMAMap map;
+
+ /*
+ * Note: all the code in this function assumes that the IOVA has no
+ * more than VTD_MGAW bits (which is restricted by the VT-d spec);
+ * otherwise we would need to consider 64-bit overflow.
+ */
+
+ if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) {
+ /*
+ * No need to unmap regions that are bigger than the whole
+ * address space size supported by VT-d.
+ */
+ end = VTD_ADDRESS_SIZE(s->aw_bits) - 1;
+ }
+
+ assert(start <= end);
+ size = remain = end - start + 1;
+
+ while (remain >= VTD_PAGE_SIZE) {
+ IOMMUTLBEvent event;
+ uint64_t mask = dma_aligned_pow2_mask(start, end, s->aw_bits);
+ uint64_t size = mask + 1;
+
+ assert(size);
+
+ event.type = IOMMU_NOTIFIER_UNMAP;
+ event.entry.iova = start;
+ event.entry.addr_mask = mask;
+ event.entry.target_as = &address_space_memory;
+ event.entry.perm = IOMMU_NONE;
+ /* This field is meaningless for unmap */
+ event.entry.translated_addr = 0;
+
+ memory_region_notify_iommu_one(n, &event);
+
+ start += size;
+ remain -= size;
+ }
+
+ assert(!remain);
+
+ trace_vtd_as_unmap_whole(pci_bus_num(as->bus),
+ VTD_PCI_SLOT(as->devfn),
+ VTD_PCI_FUNC(as->devfn),
+ n->start, size);
+
+ map.iova = n->start;
+ map.size = size;
+ iova_tree_remove(as->iova_tree, &map);
+}
+
+static void vtd_address_space_unmap_all(IntelIOMMUState *s)
+{
+ VTDAddressSpace *vtd_as;
+ IOMMUNotifier *n;
+
+ QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
+ IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
+ vtd_address_space_unmap(vtd_as, n);
+ }
+ }
+}
+
+static void vtd_address_space_refresh_all(IntelIOMMUState *s)
+{
+ vtd_address_space_unmap_all(s);
+ vtd_switch_address_space_all(s);
+}
+
+static int vtd_replay_hook(IOMMUTLBEvent *event, void *private)
+{
+ memory_region_notify_iommu_one(private, event);
+ return 0;
+}
+
+static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
+{
+ VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu);
+ IntelIOMMUState *s = vtd_as->iommu_state;
+ uint8_t bus_n = pci_bus_num(vtd_as->bus);
+ VTDContextEntry ce;
+
+ /*
+ * The replay can be triggered by either an invalidation or a newly
+ * created entry. No matter what, we release existing mappings first
+ * (which means flushing the caches of UNMAP-only notifiers).
+ */
+ vtd_address_space_unmap(vtd_as, n);
+
+ if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
+ trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :
+ "legacy mode",
+ bus_n, PCI_SLOT(vtd_as->devfn),
+ PCI_FUNC(vtd_as->devfn),
+ vtd_get_domain_id(s, &ce),
+ ce.hi, ce.lo);
+ if (vtd_as_has_map_notifier(vtd_as)) {
+ /* This is required only for MAP typed notifiers */
+ vtd_page_walk_info info = {
+ .hook_fn = vtd_replay_hook,
+ .private = (void *)n,
+ .notify_unmap = false,
+ .aw = s->aw_bits,
+ .as = vtd_as,
+ .domain_id = vtd_get_domain_id(s, &ce),
+ };
+
+ vtd_page_walk(s, &ce, 0, ~0ULL, &info);
+ }
+ } else {
+ trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
+ PCI_FUNC(vtd_as->devfn));
+ }
+
+ return;
+}
+
+/* Do the initialization. It will also be called on reset, so pay
+ * attention when adding new initialization code.
+ */
+static void vtd_init(IntelIOMMUState *s)
+{
+ X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+ memset(s->csr, 0, DMAR_REG_SIZE);
+ memset(s->wmask, 0, DMAR_REG_SIZE);
+ memset(s->w1cmask, 0, DMAR_REG_SIZE);
+ memset(s->womask, 0, DMAR_REG_SIZE);
+
+ s->root = 0;
+ s->root_scalable = false;
+ s->dmar_enabled = false;
+ s->intr_enabled = false;
+ s->iq_head = 0;
+ s->iq_tail = 0;
+ s->iq = 0;
+ s->iq_size = 0;
+ s->qi_enabled = false;
+ s->iq_last_desc_type = VTD_INV_DESC_NONE;
+ s->iq_dw = false;
+ s->next_frcd_reg = 0;
+ s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
+ VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
+ VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
+ if (s->dma_drain) {
+ s->cap |= VTD_CAP_DRAIN;
+ }
+ if (s->aw_bits == VTD_HOST_AW_48BIT) {
+ s->cap |= VTD_CAP_SAGAW_48bit;
+ }
+ s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
+
+ /*
+ * Rsvd field masks for spte
+ */
+ vtd_spte_rsvd[0] = ~0ULL;
+ vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
+ x86_iommu->dt_supported);
+ vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+ vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+ vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+
+ vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
+ x86_iommu->dt_supported);
+ vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
+ x86_iommu->dt_supported);
+
+ if (s->scalable_mode) {
+ vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
+ vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
+ vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
+ }
+
+ if (x86_iommu_ir_supported(x86_iommu)) {
+ s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
+ if (s->intr_eim == ON_OFF_AUTO_ON) {
+ s->ecap |= VTD_ECAP_EIM;
+ }
+ assert(s->intr_eim != ON_OFF_AUTO_AUTO);
+ }
+
+ if (x86_iommu->dt_supported) {
+ s->ecap |= VTD_ECAP_DT;
+ }
+
+ if (x86_iommu->pt_supported) {
+ s->ecap |= VTD_ECAP_PT;
+ }
+
+ if (s->caching_mode) {
+ s->cap |= VTD_CAP_CM;
+ }
+
+ /* TODO: read cap/ecap from host to decide which cap to be exposed. */
+ if (s->scalable_mode) {
+ s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
+ }
+
+ vtd_reset_caches(s);
+
+ /* Define registers with default values and bit semantics */
+ vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
+ vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
+ vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
+ vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
+ vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
+ vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
+ vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0);
+ vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
+ vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);
+
+ /* Advanced Fault Logging not supported */
+ vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL);
+ vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0);
+ vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0);
+ vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0);
+
+ /* Treated as RsvdZ when EIM in ECAP_REG is not supported
+ * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0);
+ */
+ vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0);
+
+ /* Treated as RO for implementations that report the PLMR and PHMR
+ * fields as Clear in the CAP_REG.
+ * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0);
+ */
+ vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0);
+
+ vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
+ vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
+ vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0);
+ vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
+ vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
+ vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
+ vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0);
+ /* Treated as RsvdZ when EIM in ECAP_REG is not supported */
+ vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0);
+
+ /* IOTLB registers */
+ vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0xb003ffff00000000ULL, 0);
+ vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0);
+ vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL);
+
+ /* Fault Recording Registers, 128-bit */
+ vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0);
+ vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL);
+
+ /*
+ * Interrupt remapping registers.
+ */
+ vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
+}
+
+/* Should not reset address_spaces on reset because devices will still use
+ * the address space they got at first (they won't ask the bus again).
+ */
+static void vtd_reset(DeviceState *dev)
+{
+ IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
+
+ vtd_init(s);
+ vtd_address_space_refresh_all(s);
+}
+
+static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
+{
+ IntelIOMMUState *s = opaque;
+ VTDAddressSpace *vtd_as;
+
+ assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+
+ vtd_as = vtd_find_add_as(s, bus, devfn);
+ return &vtd_as->as;
+}
+
+static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
+{
+ X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+ if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) {
+ error_setg(errp, "eim=on cannot be selected without intremap=on");
+ return false;
+ }
+
+ if (s->intr_eim == ON_OFF_AUTO_AUTO) {
+ s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim)
+ && x86_iommu_ir_supported(x86_iommu) ?
+ ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
+ }
+ if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
+ if (!kvm_irqchip_in_kernel()) {
+ error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
+ return false;
+ }
+ if (!kvm_enable_x2apic()) {
+ error_setg(errp, "eim=on requires support on the KVM side "
+ "(X2APIC_API, first shipped in v4.7)");
+ return false;
+ }
+ }
+
+ /* Currently the only supported address widths are 39 and 48 bits */
+ if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
+ (s->aw_bits != VTD_HOST_AW_48BIT)) {
+ error_setg(errp, "Supported values for aw-bits are: %d, %d",
+ VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT);
+ return false;
+ }
+
+ if (s->scalable_mode && !s->dma_drain) {
+ error_setg(errp, "Need to set dma_drain for scalable mode");
+ return false;
+ }
+
+ return true;
+}
+
+static int vtd_machine_done_notify_one(Object *child, void *unused)
+{
+ IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
+
+ /*
+ * We hard-code "vfio-pci" here because it is the only special
+ * case. Let's be more elegant in the future when we can, but so
+ * far there seems to be no better way.
+ */
+ if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) {
+ vtd_panic_require_caching_mode();
+ }
+
+ return 0;
+}
+
+static void vtd_machine_done_hook(Notifier *notifier, void *unused)
+{
+ object_child_foreach_recursive(object_get_root(),
+ vtd_machine_done_notify_one, NULL);
+}
+
+static Notifier vtd_machine_done_notify = {
+ .notify = vtd_machine_done_hook,
+};
+
+static void vtd_realize(DeviceState *dev, Error **errp)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ PCMachineState *pcms = PC_MACHINE(ms);
+ X86MachineState *x86ms = X86_MACHINE(ms);
+ PCIBus *bus = pcms->bus;
+ IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
+
+ if (!vtd_decide_config(s, errp)) {
+ return;
+ }
+
+ QLIST_INIT(&s->vtd_as_with_notifiers);
+ qemu_mutex_init(&s->iommu_lock);
+ memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
+ memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
+ "intel_iommu", DMAR_REG_SIZE);
+
+ /* Create the memory regions shared by all devices */
+ memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar",
+ UINT64_MAX);
+ memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops,
+ s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE);
+ memory_region_init_alias(&s->mr_sys_alias, OBJECT(s),
+ "vtd-sys-alias", get_system_memory(), 0,
+ memory_region_size(get_system_memory()));
+ memory_region_add_subregion_overlap(&s->mr_nodmar, 0,
+ &s->mr_sys_alias, 0);
+ memory_region_add_subregion_overlap(&s->mr_nodmar,
+ VTD_INTERRUPT_ADDR_FIRST,
+ &s->mr_ir, 1);
+
+ sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem);
+ /* No corresponding destroy */
+ s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
+ g_free, g_free);
+ s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
+ g_free, g_free);
+ vtd_init(s);
+ sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR);
+ pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
+ /* Pseudo address space under root PCI bus. */
+ x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
+ qemu_add_machine_init_done_notifier(&vtd_machine_done_notify);
+}
+
+static void vtd_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass);
+
+ dc->reset = vtd_reset;
+ dc->vmsd = &vtd_vmstate;
+ device_class_set_props(dc, vtd_properties);
+ dc->hotpluggable = false;
+ x86_class->realize = vtd_realize;
+ x86_class->int_remap = vtd_int_remap;
+ /* Supported by the pc-q35-* machine types */
+ dc->user_creatable = true;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ dc->desc = "Intel IOMMU (VT-d) DMA Remapping device";
+}
+
+static const TypeInfo vtd_info = {
+ .name = TYPE_INTEL_IOMMU_DEVICE,
+ .parent = TYPE_X86_IOMMU_DEVICE,
+ .instance_size = sizeof(IntelIOMMUState),
+ .class_init = vtd_class_init,
+};
+
+static void vtd_iommu_memory_region_class_init(ObjectClass *klass,
+ void *data)
+{
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
+
+ imrc->translate = vtd_iommu_translate;
+ imrc->notify_flag_changed = vtd_iommu_notify_flag_changed;
+ imrc->replay = vtd_iommu_replay;
+}
+
+static const TypeInfo vtd_iommu_memory_region_info = {
+ .parent = TYPE_IOMMU_MEMORY_REGION,
+ .name = TYPE_INTEL_IOMMU_MEMORY_REGION,
+ .class_init = vtd_iommu_memory_region_class_init,
+};
+
+static void vtd_register_types(void)
+{
+ type_register_static(&vtd_info);
+ type_register_static(&vtd_iommu_memory_region_info);
+}
+
+type_init(vtd_register_types)
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
new file mode 100644
index 000000000..a6c788049
--- /dev/null
+++ b/hw/i386/intel_iommu_internal.h
@@ -0,0 +1,518 @@
+/*
+ * QEMU emulation of an Intel IOMMU (VT-d)
+ * (DMA Remapping device)
+ *
+ * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com>
+ * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Lots of defines copied from kernel/include/linux/intel-iommu.h:
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Author: Ashok Raj <ashok.raj@intel.com>
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ *
+ */
+
+#ifndef HW_I386_INTEL_IOMMU_INTERNAL_H
+#define HW_I386_INTEL_IOMMU_INTERNAL_H
+#include "hw/i386/intel_iommu.h"
+
+/*
+ * Intel IOMMU register specification
+ */
+#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */
+#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */
+#define DMAR_CAP_REG_HI 0xc /* High 32-bit of DMAR_CAP_REG */
+#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */
+#define DMAR_ECAP_REG_HI 0X14
+#define DMAR_GCMD_REG 0x18 /* Global command */
+#define DMAR_GSTS_REG 0x1c /* Global status */
+#define DMAR_RTADDR_REG 0x20 /* Root entry table */
+#define DMAR_RTADDR_REG_HI 0X24
+#define DMAR_CCMD_REG 0x28 /* Context command */
+#define DMAR_CCMD_REG_HI 0x2c
+#define DMAR_FSTS_REG 0x34 /* Fault status */
+#define DMAR_FECTL_REG 0x38 /* Fault control */
+#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data */
+#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr */
+#define DMAR_FEUADDR_REG 0x44 /* Fault event interrupt upper addr */
+#define DMAR_AFLOG_REG 0x58 /* Advanced fault control */
+#define DMAR_AFLOG_REG_HI 0X5c
+#define DMAR_PMEN_REG 0x64 /* Enable protected memory region */
+#define DMAR_PLMBASE_REG 0x68 /* PMRR low addr */
+#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */
+#define DMAR_PHMBASE_REG 0x70 /* PMRR high base addr */
+#define DMAR_PHMBASE_REG_HI 0X74
+#define DMAR_PHMLIMIT_REG 0x78 /* PMRR high limit */
+#define DMAR_PHMLIMIT_REG_HI 0x7c
+#define DMAR_IQH_REG 0x80 /* Invalidation queue head */
+#define DMAR_IQH_REG_HI 0X84
+#define DMAR_IQT_REG 0x88 /* Invalidation queue tail */
+#define DMAR_IQT_REG_HI 0X8c
+#define DMAR_IQA_REG 0x90 /* Invalidation queue addr */
+#define DMAR_IQA_REG_HI 0x94
+#define DMAR_ICS_REG 0x9c /* Invalidation complete status */
+#define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr */
+#define DMAR_IRTA_REG_HI 0xbc
+#define DMAR_IECTL_REG 0xa0 /* Invalidation event control */
+#define DMAR_IEDATA_REG 0xa4 /* Invalidation event data */
+#define DMAR_IEADDR_REG 0xa8 /* Invalidation event address */
+#define DMAR_IEUADDR_REG 0xac /* Invalidation event upper address */
+#define DMAR_PQH_REG 0xc0 /* Page request queue head */
+#define DMAR_PQH_REG_HI 0xc4
+#define DMAR_PQT_REG 0xc8 /* Page request queue tail */
+#define DMAR_PQT_REG_HI 0xcc
+#define DMAR_PQA_REG 0xd0 /* Page request queue address */
+#define DMAR_PQA_REG_HI 0xd4
+#define DMAR_PRS_REG 0xdc /* Page request status */
+#define DMAR_PECTL_REG 0xe0 /* Page request event control */
+#define DMAR_PEDATA_REG 0xe4 /* Page request event data */
+#define DMAR_PEADDR_REG 0xe8 /* Page request event address */
+#define DMAR_PEUADDR_REG 0xec /* Page event upper address */
+#define DMAR_MTRRCAP_REG 0x100 /* MTRR capability */
+#define DMAR_MTRRCAP_REG_HI 0x104
+#define DMAR_MTRRDEF_REG 0x108 /* MTRR default type */
+#define DMAR_MTRRDEF_REG_HI 0x10c
+
+/* IOTLB registers */
+#define DMAR_IOTLB_REG_OFFSET 0xf0 /* Offset to the IOTLB registers */
+#define DMAR_IVA_REG DMAR_IOTLB_REG_OFFSET /* Invalidate address */
+#define DMAR_IVA_REG_HI (DMAR_IVA_REG + 4)
+/* IOTLB invalidate register */
+#define DMAR_IOTLB_REG (DMAR_IOTLB_REG_OFFSET + 0x8)
+#define DMAR_IOTLB_REG_HI (DMAR_IOTLB_REG + 4)
+
+/* FRCD */
+#define DMAR_FRCD_REG_OFFSET 0x220 /* Offset to the fault recording regs */
+/* NOTICE: If you change the DMAR_FRCD_REG_NR, please remember to change the
+ * DMAR_REG_SIZE in include/hw/i386/intel_iommu.h.
+ * #define DMAR_REG_SIZE (DMAR_FRCD_REG_OFFSET + 16 * DMAR_FRCD_REG_NR)
+ */
+#define DMAR_FRCD_REG_NR 1ULL /* Num of fault recording regs */
+
+#define DMAR_FRCD_REG_0_0 0x220 /* The 0th fault recording regs */
+#define DMAR_FRCD_REG_0_1 0x224
+#define DMAR_FRCD_REG_0_2 0x228
+#define DMAR_FRCD_REG_0_3 0x22c
+
+/* Interrupt Address Range */
+#define VTD_INTERRUPT_ADDR_FIRST 0xfee00000ULL
+#define VTD_INTERRUPT_ADDR_LAST 0xfeefffffULL
+#define VTD_INTERRUPT_ADDR_SIZE (VTD_INTERRUPT_ADDR_LAST - \
+ VTD_INTERRUPT_ADDR_FIRST + 1)
+
+/* The shift of source_id in the key of IOTLB hash table */
+#define VTD_IOTLB_SID_SHIFT 36
+#define VTD_IOTLB_LVL_SHIFT 52
+#define VTD_IOTLB_MAX_SIZE 1024 /* Max size of the hash table */
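+/*
+ * Illustrative sketch (assumed helper, not defined in this header): an IOTLB
+ * hash key can be built by keeping the guest frame number in the low bits and
+ * folding the source-id and page-table level in at the shifts above:
+ *
+ *   key = gfn | ((uint64_t)source_id << VTD_IOTLB_SID_SHIFT)
+ *             | ((uint64_t)level << VTD_IOTLB_LVL_SHIFT);
+ */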
+
+/* IOTLB_REG */
+#define VTD_TLB_GLOBAL_FLUSH (1ULL << 60) /* Global invalidation */
+#define VTD_TLB_DSI_FLUSH (2ULL << 60) /* Domain-selective */
+#define VTD_TLB_PSI_FLUSH (3ULL << 60) /* Page-selective */
+#define VTD_TLB_FLUSH_GRANU_MASK (3ULL << 60)
+#define VTD_TLB_GLOBAL_FLUSH_A (1ULL << 57)
+#define VTD_TLB_DSI_FLUSH_A (2ULL << 57)
+#define VTD_TLB_PSI_FLUSH_A (3ULL << 57)
+#define VTD_TLB_FLUSH_GRANU_MASK_A (3ULL << 57)
+#define VTD_TLB_IVT (1ULL << 63)
+#define VTD_TLB_DID(val) (((val) >> 32) & VTD_DOMAIN_ID_MASK)
+
+/* IVA_REG */
+#define VTD_IVA_ADDR(val) ((val) & ~0xfffULL)
+#define VTD_IVA_AM(val) ((val) & 0x3fULL)
+
+/* GCMD_REG */
+#define VTD_GCMD_TE (1UL << 31)
+#define VTD_GCMD_SRTP (1UL << 30)
+#define VTD_GCMD_SFL (1UL << 29)
+#define VTD_GCMD_EAFL (1UL << 28)
+#define VTD_GCMD_WBF (1UL << 27)
+#define VTD_GCMD_QIE (1UL << 26)
+#define VTD_GCMD_IRE (1UL << 25)
+#define VTD_GCMD_SIRTP (1UL << 24)
+#define VTD_GCMD_CFI (1UL << 23)
+
+/* GSTS_REG */
+#define VTD_GSTS_TES (1UL << 31)
+#define VTD_GSTS_RTPS (1UL << 30)
+#define VTD_GSTS_FLS (1UL << 29)
+#define VTD_GSTS_AFLS (1UL << 28)
+#define VTD_GSTS_WBFS (1UL << 27)
+#define VTD_GSTS_QIES (1UL << 26)
+#define VTD_GSTS_IRES (1UL << 25)
+#define VTD_GSTS_IRTPS (1UL << 24)
+#define VTD_GSTS_CFIS (1UL << 23)
+
+/* CCMD_REG */
+#define VTD_CCMD_ICC (1ULL << 63)
+#define VTD_CCMD_GLOBAL_INVL (1ULL << 61)
+#define VTD_CCMD_DOMAIN_INVL (2ULL << 61)
+#define VTD_CCMD_DEVICE_INVL (3ULL << 61)
+#define VTD_CCMD_CIRG_MASK (3ULL << 61)
+#define VTD_CCMD_GLOBAL_INVL_A (1ULL << 59)
+#define VTD_CCMD_DOMAIN_INVL_A (2ULL << 59)
+#define VTD_CCMD_DEVICE_INVL_A (3ULL << 59)
+#define VTD_CCMD_CAIG_MASK (3ULL << 59)
+#define VTD_CCMD_DID(val) ((val) & VTD_DOMAIN_ID_MASK)
+#define VTD_CCMD_SID(val) (((val) >> 16) & 0xffffULL)
+#define VTD_CCMD_FM(val) (((val) >> 32) & 3ULL)
+
+/* RTADDR_REG */
+#define VTD_RTADDR_SMT (1ULL << 10)
+#define VTD_RTADDR_ADDR_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL)
+
+/* IRTA_REG */
+#define VTD_IRTA_ADDR_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL)
+#define VTD_IRTA_EIME (1ULL << 11)
+#define VTD_IRTA_SIZE_MASK (0xfULL)
+
+/* ECAP_REG */
+/* IRO: IOTLB register offset in 16-byte units, reported at ECAP bits 17:8, i.e. (offset >> 4) << 8 */
+#define VTD_ECAP_IRO (DMAR_IOTLB_REG_OFFSET << 4)
+#define VTD_ECAP_QI (1ULL << 1)
+#define VTD_ECAP_DT (1ULL << 2)
+/* Interrupt Remapping support */
+#define VTD_ECAP_IR (1ULL << 3)
+#define VTD_ECAP_EIM (1ULL << 4)
+#define VTD_ECAP_PT (1ULL << 6)
+#define VTD_ECAP_MHMV (15ULL << 20)
+#define VTD_ECAP_SRS (1ULL << 31)
+#define VTD_ECAP_SMTS (1ULL << 43)
+#define VTD_ECAP_SLTS (1ULL << 46)
+
+/* CAP_REG */
+/* FRO: fault recording register offset in 16-byte units, reported at CAP bits 33:24, i.e. (offset >> 4) << 24 */
+#define VTD_CAP_FRO (DMAR_FRCD_REG_OFFSET << 20)
+#define VTD_CAP_NFR ((DMAR_FRCD_REG_NR - 1) << 40)
+#define VTD_DOMAIN_ID_SHIFT 16 /* 16-bit domain id for 64K domains */
+#define VTD_DOMAIN_ID_MASK ((1UL << VTD_DOMAIN_ID_SHIFT) - 1)
+#define VTD_CAP_ND (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL)
+#define VTD_ADDRESS_SIZE(aw) (1ULL << (aw))
+#define VTD_CAP_MGAW(aw) ((((aw) - 1) & 0x3fULL) << 16)
+#define VTD_MAMV 18ULL
+#define VTD_CAP_MAMV (VTD_MAMV << 48)
+#define VTD_CAP_PSI (1ULL << 39)
+#define VTD_CAP_SLLPS ((1ULL << 34) | (1ULL << 35))
+#define VTD_CAP_DRAIN_WRITE (1ULL << 54)
+#define VTD_CAP_DRAIN_READ (1ULL << 55)
+#define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE)
+#define VTD_CAP_CM (1ULL << 7)
+
+/* Supported Adjusted Guest Address Widths */
+#define VTD_CAP_SAGAW_SHIFT 8
+#define VTD_CAP_SAGAW_MASK (0x1fULL << VTD_CAP_SAGAW_SHIFT)
+ /* 39-bit AGAW, 3-level page-table */
+#define VTD_CAP_SAGAW_39bit (0x2ULL << VTD_CAP_SAGAW_SHIFT)
+ /* 48-bit AGAW, 4-level page-table */
+#define VTD_CAP_SAGAW_48bit (0x4ULL << VTD_CAP_SAGAW_SHIFT)
+
+/* IQT_REG */
+#define VTD_IQT_QT(dw_bit, val) (dw_bit ? (((val) >> 5) & 0x3fffULL) : \
+ (((val) >> 4) & 0x7fffULL))
+#define VTD_IQT_QT_256_RSV_BIT 0x10
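+/*
+ * Example (illustrative; vtd_get_quad_raw() and iq_dw are assumed names): the
+ * tail index starts at bit 5 for 256-bit descriptors and at bit 4 otherwise:
+ *
+ *   tail = VTD_IQT_QT(s->iq_dw, vtd_get_quad_raw(s, DMAR_IQT_REG));
+ */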
+
+/* IQA_REG */
+#define VTD_IQA_IQA_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL)
+#define VTD_IQA_QS 0x7ULL
+#define VTD_IQA_DW_MASK 0x800
+
+/* IQH_REG */
+#define VTD_IQH_QH_SHIFT_4 4
+#define VTD_IQH_QH_SHIFT_5 5
+#define VTD_IQH_QH_MASK 0x7fff0ULL
+
+/* ICS_REG */
+#define VTD_ICS_IWC 1UL
+
+/* IECTL_REG */
+#define VTD_IECTL_IM (1UL << 31)
+#define VTD_IECTL_IP (1UL << 30)
+
+/* FSTS_REG */
+#define VTD_FSTS_FRI_MASK 0xff00UL
+#define VTD_FSTS_FRI(val) ((((uint32_t)(val)) << 8) & VTD_FSTS_FRI_MASK)
+#define VTD_FSTS_IQE (1UL << 4)
+#define VTD_FSTS_PPF (1UL << 1)
+#define VTD_FSTS_PFO 1UL
+
+/* FECTL_REG */
+#define VTD_FECTL_IM (1UL << 31)
+#define VTD_FECTL_IP (1UL << 30)
+
+/* Fault Recording Register */
+/* For the high 64-bit of 128-bit */
+#define VTD_FRCD_F (1ULL << 63)
+#define VTD_FRCD_T (1ULL << 62)
+#define VTD_FRCD_FR(val) (((val) & 0xffULL) << 32)
+#define VTD_FRCD_SID_MASK 0xffffULL
+#define VTD_FRCD_SID(val) ((val) & VTD_FRCD_SID_MASK)
+/* For the low 64-bit of 128-bit */
+#define VTD_FRCD_FI(val) ((val) & ~0xfffULL)
+
+/* DMA Remapping Fault Conditions */
+typedef enum VTDFaultReason {
+ VTD_FR_RESERVED = 0, /* Reserved for Advanced Fault logging */
+ VTD_FR_ROOT_ENTRY_P = 1, /* The Present(P) field of root-entry is 0 */
+ VTD_FR_CONTEXT_ENTRY_P, /* The Present(P) field of context-entry is 0 */
+ VTD_FR_CONTEXT_ENTRY_INV, /* Invalid programming of a context-entry */
+ VTD_FR_ADDR_BEYOND_MGAW, /* Input-address above (2^x-1) */
+ VTD_FR_WRITE, /* No write permission */
+ VTD_FR_READ, /* No read permission */
+ /* Fail to access a second-level paging entry (not SL_PML4E) */
+ VTD_FR_PAGING_ENTRY_INV,
+ VTD_FR_ROOT_TABLE_INV, /* Fail to access a root-entry */
+ VTD_FR_CONTEXT_TABLE_INV, /* Fail to access a context-entry */
+ /* Non-zero reserved field in a present root-entry */
+ VTD_FR_ROOT_ENTRY_RSVD,
+ /* Non-zero reserved field in a present context-entry */
+ VTD_FR_CONTEXT_ENTRY_RSVD,
+ /* Non-zero reserved field in a second-level paging entry with at least one
+ * of the Read (R), Write (W), or Execute (E) fields set.
+ */
+ VTD_FR_PAGING_ENTRY_RSVD,
+ /* Translation request or translated request explicitly blocked due to the
+ * programming of the Translation Type (T) field in the present
+ * context-entry.
+ */
+ VTD_FR_CONTEXT_ENTRY_TT,
+
+ /* Interrupt remapping transition faults */
+ VTD_FR_IR_REQ_RSVD = 0x20, /* One or more IR request reserved
+ * fields set */
+ VTD_FR_IR_INDEX_OVER = 0x21, /* Index value greater than max */
+ VTD_FR_IR_ENTRY_P = 0x22, /* Present (P) not set in IRTE */
+ VTD_FR_IR_ROOT_INVAL = 0x23, /* IR Root table invalid */
+ VTD_FR_IR_IRTE_RSVD = 0x24, /* IRTE Rsvd field non-zero with
+ * Present flag set */
+ VTD_FR_IR_REQ_COMPAT = 0x25, /* Encountered compatible IR
+ * request while disabled */
+ VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */
+
+ VTD_FR_PASID_TABLE_INV = 0x58, /* Invalid PASID table entry */
+
+ /* This is not a normal fault reason. We use it to indicate faults that
+ * are not referenced by the VT-d specification.
+ * Fault events with this reason should not be recorded.
+ */
+ VTD_FR_RESERVED_ERR,
+ VTD_FR_MAX, /* Guard */
+} VTDFaultReason;
+
+#define VTD_CONTEXT_CACHE_GEN_MAX 0xffffffffUL
+
+/* Interrupt Entry Cache Invalidation Descriptor: VT-d 6.5.2.7. */
+struct VTDInvDescIEC {
+ uint32_t type:4; /* Should always be 0x4 */
+ uint32_t granularity:1; /* If set, it's global IR invalidation */
+ uint32_t resved_1:22;
+ uint32_t index_mask:5; /* 2^N contiguous IRTE entries to invalidate */
+ uint32_t index:16; /* Start index to invalidate */
+ uint32_t reserved_2:16;
+};
+typedef struct VTDInvDescIEC VTDInvDescIEC;
+
+/* Queued Invalidation Descriptor */
+union VTDInvDesc {
+ struct {
+ uint64_t lo;
+ uint64_t hi;
+ };
+ struct {
+ uint64_t val[4];
+ };
+ union {
+ VTDInvDescIEC iec;
+ };
+};
+typedef union VTDInvDesc VTDInvDesc;
+
+/* Masks for struct VTDInvDesc */
+#define VTD_INV_DESC_TYPE 0xf
+#define VTD_INV_DESC_CC 0x1 /* Context-cache Invalidate Desc */
+#define VTD_INV_DESC_IOTLB 0x2
+#define VTD_INV_DESC_DEVICE 0x3
+#define VTD_INV_DESC_IEC 0x4 /* Interrupt Entry Cache
+ Invalidate Descriptor */
+#define VTD_INV_DESC_WAIT 0x5 /* Invalidation Wait Descriptor */
+#define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */
+#define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */
+#define VTD_INV_DESC_NONE 0 /* Not an Invalidate Descriptor */
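+/*
+ * Illustrative dispatch (a sketch, not the full handler): the descriptor type
+ * is held in the low 4 bits of the first 64-bit word, so a fetched descriptor
+ * can be routed roughly as:
+ *
+ *   switch (inv_desc.lo & VTD_INV_DESC_TYPE) {
+ *   case VTD_INV_DESC_CC:    ... context-cache invalidation ...
+ *   case VTD_INV_DESC_IOTLB: ... IOTLB invalidation ...
+ *   case VTD_INV_DESC_WAIT:  ... invalidation wait ...
+ *   }
+ */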
+
+/* Masks for Invalidation Wait Descriptor*/
+#define VTD_INV_DESC_WAIT_SW (1ULL << 5)
+#define VTD_INV_DESC_WAIT_IF (1ULL << 4)
+#define VTD_INV_DESC_WAIT_FN (1ULL << 6)
+#define VTD_INV_DESC_WAIT_DATA_SHIFT 32
+#define VTD_INV_DESC_WAIT_RSVD_LO 0Xffffff80ULL
+#define VTD_INV_DESC_WAIT_RSVD_HI 3ULL
+
+/* Masks for Context-cache Invalidation Descriptor */
+#define VTD_INV_DESC_CC_G (3ULL << 4)
+#define VTD_INV_DESC_CC_GLOBAL (1ULL << 4)
+#define VTD_INV_DESC_CC_DOMAIN (2ULL << 4)
+#define VTD_INV_DESC_CC_DEVICE (3ULL << 4)
+#define VTD_INV_DESC_CC_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_CC_SID(val) (((val) >> 32) & 0xffffUL)
+#define VTD_INV_DESC_CC_FM(val) (((val) >> 48) & 3UL)
+#define VTD_INV_DESC_CC_RSVD 0xfffc00000000ffc0ULL
+
+/* Masks for IOTLB Invalidate Descriptor */
+#define VTD_INV_DESC_IOTLB_G (3ULL << 4)
+#define VTD_INV_DESC_IOTLB_GLOBAL (1ULL << 4)
+#define VTD_INV_DESC_IOTLB_DOMAIN (2ULL << 4)
+#define VTD_INV_DESC_IOTLB_PAGE (3ULL << 4)
+#define VTD_INV_DESC_IOTLB_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_IOTLB_ADDR(val) ((val) & ~0xfffULL)
+#define VTD_INV_DESC_IOTLB_AM(val) ((val) & 0x3fULL)
+#define VTD_INV_DESC_IOTLB_RSVD_LO 0xffffffff0000ff00ULL
+#define VTD_INV_DESC_IOTLB_RSVD_HI 0xf80ULL
+
+/* Mask for Device IOTLB Invalidate Descriptor */
+#define VTD_INV_DESC_DEVICE_IOTLB_ADDR(val) ((val) & 0xfffffffffffff000ULL)
+#define VTD_INV_DESC_DEVICE_IOTLB_SIZE(val) ((val) & 0x1)
+#define VTD_INV_DESC_DEVICE_IOTLB_SID(val) (((val) >> 32) & 0xFFFFULL)
+#define VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI 0xffeULL
+#define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffff0000ffe0fff8ULL
+
+/* Rsvd field masks for spte */
+#define VTD_SPTE_SNP 0x800ULL
+
+#define VTD_SPTE_PAGE_L1_RSVD_MASK(aw, dt_supported) \
+ ((dt_supported) ? \
+ (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \
+ (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)))
+#define VTD_SPTE_PAGE_L2_RSVD_MASK(aw) \
+ (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_PAGE_L3_RSVD_MASK(aw) \
+ (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_PAGE_L4_RSVD_MASK(aw) \
+ (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+
+#define VTD_SPTE_LPAGE_L2_RSVD_MASK(aw, dt_supported) \
+ ((dt_supported) ? \
+ (0x1ff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \
+ (0x1ff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)))
+#define VTD_SPTE_LPAGE_L3_RSVD_MASK(aw, dt_supported) \
+ ((dt_supported) ? \
+ (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \
+ (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)))
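+/*
+ * Illustrative check (a sketch of how these masks are consumed): a
+ * second-level PTE is rejected when any reserved bit for its level is set:
+ *
+ *   if (slpte & rsvd_mask_for_level) {
+ *       return -VTD_FR_PAGING_ENTRY_RSVD;
+ *   }
+ */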
+
+/* Information about page-selective IOTLB invalidate */
+struct VTDIOTLBPageInvInfo {
+ uint16_t domain_id;
+ uint64_t addr;
+ uint8_t mask;
+};
+typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
+
+/* Page size of VTD paging structures, including root and context tables */
+#define VTD_PAGE_SHIFT 12
+#define VTD_PAGE_SIZE (1ULL << VTD_PAGE_SHIFT)
+
+#define VTD_PAGE_SHIFT_4K 12
+#define VTD_PAGE_MASK_4K (~((1ULL << VTD_PAGE_SHIFT_4K) - 1))
+#define VTD_PAGE_SHIFT_2M 21
+#define VTD_PAGE_MASK_2M (~((1ULL << VTD_PAGE_SHIFT_2M) - 1))
+#define VTD_PAGE_SHIFT_1G 30
+#define VTD_PAGE_MASK_1G (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
+
+struct VTDRootEntry {
+ uint64_t lo;
+ uint64_t hi;
+};
+typedef struct VTDRootEntry VTDRootEntry;
+
+/* Masks for struct VTDRootEntry */
+#define VTD_ROOT_ENTRY_P 1ULL
+#define VTD_ROOT_ENTRY_CTP (~0xfffULL)
+
+#define VTD_ROOT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(VTDRootEntry))
+#define VTD_ROOT_ENTRY_RSVD(aw) (0xffeULL | ~VTD_HAW_MASK(aw))
+
+#define VTD_DEVFN_CHECK_MASK 0x80
+
+/* Masks for struct VTDContextEntry */
+/* lo */
+#define VTD_CONTEXT_ENTRY_P (1ULL << 0)
+#define VTD_CONTEXT_ENTRY_FPD (1ULL << 1) /* Fault Processing Disable */
+#define VTD_CONTEXT_ENTRY_TT (3ULL << 2) /* Translation Type */
+#define VTD_CONTEXT_TT_MULTI_LEVEL 0
+#define VTD_CONTEXT_TT_DEV_IOTLB (1ULL << 2)
+#define VTD_CONTEXT_TT_PASS_THROUGH (2ULL << 2)
+/* Second Level Page Translation Pointer*/
+#define VTD_CONTEXT_ENTRY_SLPTPTR (~0xfffULL)
+#define VTD_CONTEXT_ENTRY_RSVD_LO(aw) (0xff0ULL | ~VTD_HAW_MASK(aw))
+/* hi */
+#define VTD_CONTEXT_ENTRY_AW 7ULL /* Adjusted guest-address-width */
+#define VTD_CONTEXT_ENTRY_DID(val) (((val) >> 8) & VTD_DOMAIN_ID_MASK)
+#define VTD_CONTEXT_ENTRY_RSVD_HI 0xffffffffff000080ULL
+
+#define VTD_CONTEXT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(VTDContextEntry))
+
+#define VTD_CTX_ENTRY_LEGACY_SIZE 16
+#define VTD_CTX_ENTRY_SCALABLE_SIZE 32
+
+#define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff
+#define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw) (0x1e0ULL | ~VTD_HAW_MASK(aw))
+#define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 0xffffffffffe00000ULL
+
+/* PASID Table Related Definitions */
+#define VTD_PASID_DIR_BASE_ADDR_MASK (~0xfffULL)
+#define VTD_PASID_TABLE_BASE_ADDR_MASK (~0xfffULL)
+#define VTD_PASID_DIR_ENTRY_SIZE 8
+#define VTD_PASID_ENTRY_SIZE 64
+#define VTD_PASID_DIR_BITS_MASK (0x3fffULL)
+#define VTD_PASID_DIR_INDEX(pasid) (((pasid) >> 6) & VTD_PASID_DIR_BITS_MASK)
+#define VTD_PASID_DIR_FPD (1ULL << 1) /* Fault Processing Disable */
+#define VTD_PASID_TABLE_BITS_MASK (0x3fULL)
+#define VTD_PASID_TABLE_INDEX(pasid) ((pasid) & VTD_PASID_TABLE_BITS_MASK)
+#define VTD_PASID_ENTRY_FPD (1ULL << 1) /* Fault Processing Disable */
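+/*
+ * Worked example: with the 6-bit split above, PASID 0x1234 resolves to
+ * directory index VTD_PASID_DIR_INDEX(0x1234) == 0x48 and table index
+ * VTD_PASID_TABLE_INDEX(0x1234) == 0x34.
+ */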
+
+/* PASID Granular Translation Type Mask */
+#define VTD_PASID_ENTRY_P 1ULL
+#define VTD_SM_PASID_ENTRY_PGTT (7ULL << 6)
+#define VTD_SM_PASID_ENTRY_FLT (1ULL << 6)
+#define VTD_SM_PASID_ENTRY_SLT (2ULL << 6)
+#define VTD_SM_PASID_ENTRY_NESTED (3ULL << 6)
+#define VTD_SM_PASID_ENTRY_PT (4ULL << 6)
+
+#define VTD_SM_PASID_ENTRY_AW 7ULL /* Adjusted guest-address-width */
+#define VTD_SM_PASID_ENTRY_DID(val) ((val) & VTD_DOMAIN_ID_MASK)
+
+/* Second Level Page Translation Pointer*/
+#define VTD_SM_PASID_ENTRY_SLPTPTR (~0xfffULL)
+
+/* Paging Structure common */
+#define VTD_SL_PT_PAGE_SIZE_MASK (1ULL << 7)
+/* Bits to decide the offset for each level */
+#define VTD_SL_LEVEL_BITS 9
+
+/* Second Level Paging Structure */
+#define VTD_SL_PML4_LEVEL 4
+#define VTD_SL_PDP_LEVEL 3
+#define VTD_SL_PD_LEVEL 2
+#define VTD_SL_PT_LEVEL 1
+#define VTD_SL_PT_ENTRY_NR 512
+
+/* Masks for Second Level Paging Entry */
+#define VTD_SL_RW_MASK 3ULL
+#define VTD_SL_R 1ULL
+#define VTD_SL_W (1ULL << 1)
+#define VTD_SL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
+#define VTD_SL_IGN_COM 0xbff0000000000000ULL
+#define VTD_SL_TM (1ULL << 62)
+
+#endif
diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c
new file mode 100644
index 000000000..1e89ca089
--- /dev/null
+++ b/hw/i386/kvm/apic.c
@@ -0,0 +1,271 @@
+/*
+ * KVM in-kernel APIC support
+ *
+ * Copyright (c) 2011 Siemens AG
+ *
+ * Authors:
+ * Jan Kiszka <jan.kiszka@siemens.com>
+ *
+ * This work is licensed under the terms of the GNU GPL version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/module.h"
+#include "hw/i386/apic_internal.h"
+#include "hw/pci/msi.h"
+#include "sysemu/hw_accel.h"
+#include "sysemu/kvm.h"
+#include "kvm/kvm_i386.h"
+
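+/*
+ * struct kvm_lapic_state mirrors the APIC register page: each 32-bit register
+ * occupies a 16-byte slot, so reg_id below is the register's MMIO offset
+ * divided by 16 (e.g. reg_id 0x8 is the TPR at offset 0x80).
+ */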
+static inline void kvm_apic_set_reg(struct kvm_lapic_state *kapic,
+ int reg_id, uint32_t val)
+{
+ *((uint32_t *)(kapic->regs + (reg_id << 4))) = val;
+}
+
+static inline uint32_t kvm_apic_get_reg(struct kvm_lapic_state *kapic,
+ int reg_id)
+{
+ return *((uint32_t *)(kapic->regs + (reg_id << 4)));
+}
+
+static void kvm_put_apic_state(APICCommonState *s, struct kvm_lapic_state *kapic)
+{
+ int i;
+
+ memset(kapic, 0, sizeof(*kapic));
+ if (kvm_has_x2apic_api() && s->apicbase & MSR_IA32_APICBASE_EXTD) {
+ kvm_apic_set_reg(kapic, 0x2, s->initial_apic_id);
+ } else {
+ kvm_apic_set_reg(kapic, 0x2, s->id << 24);
+ }
+ kvm_apic_set_reg(kapic, 0x8, s->tpr);
+ kvm_apic_set_reg(kapic, 0xd, s->log_dest << 24);
+ kvm_apic_set_reg(kapic, 0xe, s->dest_mode << 28 | 0x0fffffff);
+ kvm_apic_set_reg(kapic, 0xf, s->spurious_vec);
+ for (i = 0; i < 8; i++) {
+ kvm_apic_set_reg(kapic, 0x10 + i, s->isr[i]);
+ kvm_apic_set_reg(kapic, 0x18 + i, s->tmr[i]);
+ kvm_apic_set_reg(kapic, 0x20 + i, s->irr[i]);
+ }
+ kvm_apic_set_reg(kapic, 0x28, s->esr);
+ kvm_apic_set_reg(kapic, 0x30, s->icr[0]);
+ kvm_apic_set_reg(kapic, 0x31, s->icr[1]);
+ for (i = 0; i < APIC_LVT_NB; i++) {
+ kvm_apic_set_reg(kapic, 0x32 + i, s->lvt[i]);
+ }
+ kvm_apic_set_reg(kapic, 0x38, s->initial_count);
+ kvm_apic_set_reg(kapic, 0x3e, s->divide_conf);
+}
+
+void kvm_get_apic_state(DeviceState *dev, struct kvm_lapic_state *kapic)
+{
+ APICCommonState *s = APIC_COMMON(dev);
+ int i, v;
+
+ if (kvm_has_x2apic_api() && s->apicbase & MSR_IA32_APICBASE_EXTD) {
+ assert(kvm_apic_get_reg(kapic, 0x2) == s->initial_apic_id);
+ } else {
+ s->id = kvm_apic_get_reg(kapic, 0x2) >> 24;
+ }
+ s->tpr = kvm_apic_get_reg(kapic, 0x8);
+ s->arb_id = kvm_apic_get_reg(kapic, 0x9);
+ s->log_dest = kvm_apic_get_reg(kapic, 0xd) >> 24;
+ s->dest_mode = kvm_apic_get_reg(kapic, 0xe) >> 28;
+ s->spurious_vec = kvm_apic_get_reg(kapic, 0xf);
+ for (i = 0; i < 8; i++) {
+ s->isr[i] = kvm_apic_get_reg(kapic, 0x10 + i);
+ s->tmr[i] = kvm_apic_get_reg(kapic, 0x18 + i);
+ s->irr[i] = kvm_apic_get_reg(kapic, 0x20 + i);
+ }
+ s->esr = kvm_apic_get_reg(kapic, 0x28);
+ s->icr[0] = kvm_apic_get_reg(kapic, 0x30);
+ s->icr[1] = kvm_apic_get_reg(kapic, 0x31);
+ for (i = 0; i < APIC_LVT_NB; i++) {
+ s->lvt[i] = kvm_apic_get_reg(kapic, 0x32 + i);
+ }
+ s->initial_count = kvm_apic_get_reg(kapic, 0x38);
+ s->divide_conf = kvm_apic_get_reg(kapic, 0x3e);
+
+ v = (s->divide_conf & 3) | ((s->divide_conf >> 1) & 4);
+ s->count_shift = (v + 1) & 7;
+
+ s->initial_count_load_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+ apic_next_timer(s, s->initial_count_load_time);
+}
+
+static void kvm_apic_set_base(APICCommonState *s, uint64_t val)
+{
+ s->apicbase = val;
+}
+
+static void kvm_apic_set_tpr(APICCommonState *s, uint8_t val)
+{
+ s->tpr = (val & 0x0f) << 4;
+}
+
+static uint8_t kvm_apic_get_tpr(APICCommonState *s)
+{
+ return s->tpr >> 4;
+}
+
+static void kvm_apic_enable_tpr_reporting(APICCommonState *s, bool enable)
+{
+ struct kvm_tpr_access_ctl ctl = {
+ .enabled = enable
+ };
+
+ kvm_vcpu_ioctl(CPU(s->cpu), KVM_TPR_ACCESS_REPORTING, &ctl);
+}
+
+static void kvm_apic_vapic_base_update(APICCommonState *s)
+{
+ struct kvm_vapic_addr vapic_addr = {
+ .vapic_addr = s->vapic_paddr,
+ };
+ int ret;
+
+ ret = kvm_vcpu_ioctl(CPU(s->cpu), KVM_SET_VAPIC_ADDR, &vapic_addr);
+ if (ret < 0) {
+ fprintf(stderr, "KVM: setting VAPIC address failed (%s)\n",
+ strerror(-ret));
+ abort();
+ }
+}
+
+static void kvm_apic_put(CPUState *cs, run_on_cpu_data data)
+{
+ APICCommonState *s = data.host_ptr;
+ struct kvm_lapic_state kapic;
+ int ret;
+
+ kvm_put_apicbase(s->cpu, s->apicbase);
+ kvm_put_apic_state(s, &kapic);
+
+ ret = kvm_vcpu_ioctl(CPU(s->cpu), KVM_SET_LAPIC, &kapic);
+ if (ret < 0) {
+ fprintf(stderr, "KVM_SET_LAPIC failed: %s\n", strerror(-ret));
+ abort();
+ }
+}
+
+static void kvm_apic_post_load(APICCommonState *s)
+{
+ run_on_cpu(CPU(s->cpu), kvm_apic_put, RUN_ON_CPU_HOST_PTR(s));
+}
+
+static void do_inject_external_nmi(CPUState *cpu, run_on_cpu_data data)
+{
+ APICCommonState *s = data.host_ptr;
+ uint32_t lvt;
+ int ret;
+
+ cpu_synchronize_state(cpu);
+
+ lvt = s->lvt[APIC_LVT_LINT1];
+ if (!(lvt & APIC_LVT_MASKED) && ((lvt >> 8) & 7) == APIC_DM_NMI) {
+ ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
+ if (ret < 0) {
+ fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
+ strerror(-ret));
+ }
+ }
+}
+
+static void kvm_apic_external_nmi(APICCommonState *s)
+{
+ run_on_cpu(CPU(s->cpu), do_inject_external_nmi, RUN_ON_CPU_HOST_PTR(s));
+}
+
+static void kvm_send_msi(MSIMessage *msg)
+{
+ int ret;
+
+ /*
+ * The message has already passed through interrupt remapping if enabled,
+ * but the legacy extended destination ID in low bits still needs to be
+ * handled.
+ */
+ msg->address = kvm_swizzle_msi_ext_dest_id(msg->address);
+
+ ret = kvm_irqchip_send_msi(kvm_state, *msg);
+ if (ret < 0) {
+ fprintf(stderr, "KVM: injection failed, MSI lost (%s)\n",
+ strerror(-ret));
+ }
+}
+
+static uint64_t kvm_apic_mem_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+ return ~(uint64_t)0;
+}
+
+static void kvm_apic_mem_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+ MSIMessage msg = { .address = addr, .data = data };
+
+ kvm_send_msi(&msg);
+}
+
+static const MemoryRegionOps kvm_apic_io_ops = {
+ .read = kvm_apic_mem_read,
+ .write = kvm_apic_mem_write,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static void kvm_apic_reset(APICCommonState *s)
+{
+ /* Not used by KVM, which uses the CPU mp_state instead. */
+ s->wait_for_sipi = 0;
+
+ run_on_cpu(CPU(s->cpu), kvm_apic_put, RUN_ON_CPU_HOST_PTR(s));
+}
+
+static void kvm_apic_realize(DeviceState *dev, Error **errp)
+{
+ APICCommonState *s = APIC_COMMON(dev);
+
+ memory_region_init_io(&s->io_memory, OBJECT(s), &kvm_apic_io_ops, s,
+ "kvm-apic-msi", APIC_SPACE_SIZE);
+
+ assert(kvm_has_gsi_routing());
+ msi_nonbroken = true;
+}
+
+static void kvm_apic_unrealize(DeviceState *dev)
+{
+}
+
+static void kvm_apic_class_init(ObjectClass *klass, void *data)
+{
+ APICCommonClass *k = APIC_COMMON_CLASS(klass);
+
+ k->realize = kvm_apic_realize;
+ k->unrealize = kvm_apic_unrealize;
+ k->reset = kvm_apic_reset;
+ k->set_base = kvm_apic_set_base;
+ k->set_tpr = kvm_apic_set_tpr;
+ k->get_tpr = kvm_apic_get_tpr;
+ k->post_load = kvm_apic_post_load;
+ k->enable_tpr_reporting = kvm_apic_enable_tpr_reporting;
+ k->vapic_base_update = kvm_apic_vapic_base_update;
+ k->external_nmi = kvm_apic_external_nmi;
+ k->send_msi = kvm_send_msi;
+}
+
+static const TypeInfo kvm_apic_info = {
+ .name = "kvm-apic",
+ .parent = TYPE_APIC_COMMON,
+ .instance_size = sizeof(APICCommonState),
+ .class_init = kvm_apic_class_init,
+};
+
+static void kvm_apic_register_types(void)
+{
+ type_register_static(&kvm_apic_info);
+}
+
+type_init(kvm_apic_register_types)
diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
new file mode 100644
index 000000000..df70b4a03
--- /dev/null
+++ b/hw/i386/kvm/clock.c
@@ -0,0 +1,350 @@
+/*
+ * QEMU KVM support, paravirtual clock device
+ *
+ * Copyright (C) 2011 Siemens AG
+ *
+ * Authors:
+ * Jan Kiszka <jan.kiszka@siemens.com>
+ *
+ * This work is licensed under the terms of the GNU GPL version 2.
+ * See the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "sysemu/kvm.h"
+#include "sysemu/runstate.h"
+#include "sysemu/hw_accel.h"
+#include "kvm/kvm_i386.h"
+#include "migration/vmstate.h"
+#include "hw/sysbus.h"
+#include "hw/kvm/clock.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+
+#include <linux/kvm.h>
+#include "standard-headers/asm-x86/kvm_para.h"
+#include "qom/object.h"
+
+#define TYPE_KVM_CLOCK "kvmclock"
+OBJECT_DECLARE_SIMPLE_TYPE(KVMClockState, KVM_CLOCK)
+
+struct KVMClockState {
+ /*< private >*/
+ SysBusDevice busdev;
+ /*< public >*/
+
+ uint64_t clock;
+ bool clock_valid;
+
+ /* whether the 'clock' value was obtained in the 'paused' state */
+ bool runstate_paused;
+
+ /* whether machine type supports reliable KVM_GET_CLOCK */
+ bool mach_use_reliable_get_clock;
+
+ /* whether the 'clock' value was obtained in a host with
+ * reliable KVM_GET_CLOCK */
+ bool clock_is_reliable;
+};
+
+struct pvclock_vcpu_time_info {
+ uint32_t version;
+ uint32_t pad0;
+ uint64_t tsc_timestamp;
+ uint64_t system_time;
+ uint32_t tsc_to_system_mul;
+ int8_t tsc_shift;
+ uint8_t flags;
+ uint8_t pad[2];
+} __attribute__((__packed__)); /* 32 bytes */
+
+static uint64_t kvmclock_current_nsec(KVMClockState *s)
+{
+ CPUState *cpu = first_cpu;
+ CPUX86State *env = cpu->env_ptr;
+ hwaddr kvmclock_struct_pa;
+ uint64_t migration_tsc = env->tsc;
+ struct pvclock_vcpu_time_info time;
+ uint64_t delta;
+ uint64_t nsec_lo;
+ uint64_t nsec_hi;
+ uint64_t nsec;
+
+ cpu_synchronize_state(cpu);
+
+ if (!(env->system_time_msr & 1ULL)) {
+ /* KVM clock not active */
+ return 0;
+ }
+
+ kvmclock_struct_pa = env->system_time_msr & ~1ULL;
+ cpu_physical_memory_read(kvmclock_struct_pa, &time, sizeof(time));
+
+ assert(time.tsc_timestamp <= migration_tsc);
+ delta = migration_tsc - time.tsc_timestamp;
+ if (time.tsc_shift < 0) {
+ delta >>= -time.tsc_shift;
+ } else {
+ delta <<= time.tsc_shift;
+ }
+
+ mulu64(&nsec_lo, &nsec_hi, delta, time.tsc_to_system_mul);
+ nsec = (nsec_lo >> 32) | (nsec_hi << 32);
+ return nsec + time.system_time;
+}
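+/*
+ * The function above is a restatement of the pvclock formula (assuming the
+ * standard guest-visible pvclock ABI):
+ *
+ *   nsec = system_time
+ *          + (((tsc - tsc_timestamp) shifted by tsc_shift) * tsc_to_system_mul) / 2^32
+ *
+ * where the shift is left for a positive tsc_shift and right for a negative
+ * one, and the final division by 2^32 is what the mulu64() recombination
+ * implements.
+ */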
+
+static void kvm_update_clock(KVMClockState *s)
+{
+ struct kvm_clock_data data;
+ int ret;
+
+ ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
+ if (ret < 0) {
+ fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(-ret));
+ abort();
+ }
+ s->clock = data.clock;
+
+ /* If kvm_has_adjust_clock_stable() is false, KVM_GET_CLOCK returns
+ * essentially CLOCK_MONOTONIC plus a guest-specific adjustment. This
+ * can drift from the TSC-based value that is computed by the guest,
+ * so we need to go through kvmclock_current_nsec(). If
+ * kvm_has_adjust_clock_stable() is true, and the flags contain
+ * KVM_CLOCK_TSC_STABLE, then KVM_GET_CLOCK returns a TSC-based value
+ * and kvmclock_current_nsec() is not necessary.
+ *
+ * Here, however, we need not check KVM_CLOCK_TSC_STABLE. This is because:
+ *
+ * - if the host has disabled the kvmclock master clock, the guest already
+ * has protection against time going backwards. This "safety net" is only
+ * absent when kvmclock is stable;
+ *
+ * - therefore, we can replace a check like
+ *
+ * if last KVM_GET_CLOCK was not reliable then
+ * read from memory
+ *
+ * with
+ *
+ * if last KVM_GET_CLOCK was not reliable && masterclock is enabled
+ * read from memory
+ *
+ * However:
+ *
+ * - if kvm_has_adjust_clock_stable() returns false, the left side is
+ * always true (KVM_GET_CLOCK is never reliable), and the right side is
+ * unknown (because we don't have data.flags). We must assume it's true
+ * and read from memory.
+ *
+ * - if kvm_has_adjust_clock_stable() returns true, the result of the &&
+ * is always false (masterclock is enabled iff KVM_GET_CLOCK is reliable)
+ *
+ * So we can just use this instead:
+ *
+ * if !kvm_has_adjust_clock_stable() then
+ * read from memory
+ */
+ s->clock_is_reliable = kvm_has_adjust_clock_stable();
+}
+
+static void do_kvmclock_ctrl(CPUState *cpu, run_on_cpu_data data)
+{
+ int ret = kvm_vcpu_ioctl(cpu, KVM_KVMCLOCK_CTRL, 0);
+
+ if (ret && ret != -EINVAL) {
+ fprintf(stderr, "%s: %s\n", __func__, strerror(-ret));
+ }
+}
+
+static void kvmclock_vm_state_change(void *opaque, bool running,
+ RunState state)
+{
+ KVMClockState *s = opaque;
+ CPUState *cpu;
+ int cap_clock_ctrl = kvm_check_extension(kvm_state, KVM_CAP_KVMCLOCK_CTRL);
+ int ret;
+
+ if (running) {
+ struct kvm_clock_data data = {};
+
+ /*
+ * If the host where s->clock was read did not support reliable
+ * KVM_GET_CLOCK, read kvmclock value from memory.
+ */
+ if (!s->clock_is_reliable) {
+ uint64_t pvclock_via_mem = kvmclock_current_nsec(s);
+ /* The saved clock value is unreliable; prefer the in-memory pvclock value */
+ if (pvclock_via_mem) {
+ s->clock = pvclock_via_mem;
+ }
+ }
+
+ s->clock_valid = false;
+
+ data.clock = s->clock;
+ ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
+ if (ret < 0) {
+ fprintf(stderr, "KVM_SET_CLOCK failed: %s\n", strerror(-ret));
+ abort();
+ }
+
+ if (!cap_clock_ctrl) {
+ return;
+ }
+ CPU_FOREACH(cpu) {
+ run_on_cpu(cpu, do_kvmclock_ctrl, RUN_ON_CPU_NULL);
+ }
+ } else {
+
+ if (s->clock_valid) {
+ return;
+ }
+
+ s->runstate_paused = runstate_check(RUN_STATE_PAUSED);
+
+ kvm_synchronize_all_tsc();
+
+ kvm_update_clock(s);
+ /*
+ * If the VM is stopped, declare the clock state valid to
+ * avoid re-reading it on next vmsave (which would return
+ * a different value). Will be reset when the VM is continued.
+ */
+ s->clock_valid = true;
+ }
+}
+
+static void kvmclock_realize(DeviceState *dev, Error **errp)
+{
+ KVMClockState *s = KVM_CLOCK(dev);
+
+ if (!kvm_enabled()) {
+ error_setg(errp, "kvmclock device requires KVM");
+ return;
+ }
+
+ kvm_update_clock(s);
+
+ qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
+}
+
+static bool kvmclock_clock_is_reliable_needed(void *opaque)
+{
+ KVMClockState *s = opaque;
+
+ return s->mach_use_reliable_get_clock;
+}
+
+static const VMStateDescription kvmclock_reliable_get_clock = {
+ .name = "kvmclock/clock_is_reliable",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = kvmclock_clock_is_reliable_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_BOOL(clock_is_reliable, KVMClockState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+/*
+ * When migrating, assume the source has an unreliable
+ * KVM_GET_CLOCK unless told otherwise.
+ */
+static int kvmclock_pre_load(void *opaque)
+{
+ KVMClockState *s = opaque;
+
+ s->clock_is_reliable = false;
+
+ return 0;
+}
+
+/*
+ * When migrating a running guest, read the clock just
+ * before migration, so that the guest clock counts
+ * during the events between:
+ *
+ * * vm_stop()
+ * * pre_save()
+ *
+ * This reduces kvmclock difference on migration from 5s
+ * to 0.1s (when max_downtime == 5s), because sending the
+ * final pages of memory (which happens between vm_stop()
+ * and pre_save()) takes max_downtime.
+ */
+static int kvmclock_pre_save(void *opaque)
+{
+ KVMClockState *s = opaque;
+
+ if (!s->runstate_paused) {
+ kvm_update_clock(s);
+ }
+
+ return 0;
+}
+
+static const VMStateDescription kvmclock_vmsd = {
+ .name = "kvmclock",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .pre_load = kvmclock_pre_load,
+ .pre_save = kvmclock_pre_save,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(clock, KVMClockState),
+ VMSTATE_END_OF_LIST()
+ },
+ .subsections = (const VMStateDescription * []) {
+ &kvmclock_reliable_get_clock,
+ NULL
+ }
+};
+
+static Property kvmclock_properties[] = {
+ DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
+ mach_use_reliable_get_clock, true),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void kvmclock_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->realize = kvmclock_realize;
+ dc->vmsd = &kvmclock_vmsd;
+ device_class_set_props(dc, kvmclock_properties);
+}
+
+static const TypeInfo kvmclock_info = {
+ .name = TYPE_KVM_CLOCK,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(KVMClockState),
+ .class_init = kvmclock_class_init,
+};
+
+/* Note: Must be called after VCPU initialization. */
+void kvmclock_create(bool create_always)
+{
+ X86CPU *cpu = X86_CPU(first_cpu);
+
+ if (!kvm_enabled() || !kvm_has_adjust_clock()) {
+ return;
+ }
+
+ if (create_always ||
+ cpu->env.features[FEAT_KVM] & ((1ULL << KVM_FEATURE_CLOCKSOURCE) |
+ (1ULL << KVM_FEATURE_CLOCKSOURCE2))) {
+ sysbus_create_simple(TYPE_KVM_CLOCK, -1, NULL);
+ }
+}
+
+static void kvmclock_register_types(void)
+{
+ type_register_static(&kvmclock_info);
+}
+
+type_init(kvmclock_register_types)
diff --git a/hw/i386/kvm/i8254.c b/hw/i386/kvm/i8254.c
new file mode 100644
index 000000000..191a26fa5
--- /dev/null
+++ b/hw/i386/kvm/i8254.c
@@ -0,0 +1,337 @@
+/*
+ * KVM in-kernel PIT (i8254) support
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2012 Jan Kiszka, Siemens AG
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include <linux/kvm.h>
+#include "qapi/qapi-types-machine.h"
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "qemu/timer.h"
+#include "sysemu/runstate.h"
+#include "hw/timer/i8254.h"
+#include "hw/timer/i8254_internal.h"
+#include "hw/qdev-properties-system.h"
+#include "sysemu/kvm.h"
+#include "qom/object.h"
+
+#define KVM_PIT_REINJECT_BIT 0
+
+#define CALIBRATION_ROUNDS 3
+
+typedef struct KVMPITClass KVMPITClass;
+typedef struct KVMPITState KVMPITState;
+DECLARE_OBJ_CHECKERS(KVMPITState, KVMPITClass,
+ KVM_PIT, TYPE_KVM_I8254)
+
+struct KVMPITState {
+ PITCommonState parent_obj;
+
+ LostTickPolicy lost_tick_policy;
+ bool vm_stopped;
+ int64_t kernel_clock_offset;
+};
+
+struct KVMPITClass {
+ PITCommonClass parent_class;
+
+ DeviceRealize parent_realize;
+};
+
+static void kvm_pit_update_clock_offset(KVMPITState *s)
+{
+ int64_t offset, clock_offset;
+ struct timespec ts;
+ int i;
+
+ /*
+ * Measure the delta between CLOCK_MONOTONIC, the base used for
+ * kvm_pit_channel_state::count_load_time, and QEMU_CLOCK_VIRTUAL. Take the
+ * minimum of several samples to filter out scheduling noise.
+ */
+ clock_offset = INT64_MAX;
+ for (i = 0; i < CALIBRATION_ROUNDS; i++) {
+ offset = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ offset -= ts.tv_nsec;
+ offset -= (int64_t)ts.tv_sec * 1000000000;
+ if (uabs64(offset) < uabs64(clock_offset)) {
+ clock_offset = offset;
+ }
+ }
+ s->kernel_clock_offset = clock_offset;
+}
+
+static void kvm_pit_get(PITCommonState *pit)
+{
+ KVMPITState *s = KVM_PIT(pit);
+ struct kvm_pit_state2 kpit;
+ struct kvm_pit_channel_state *kchan;
+ struct PITChannelState *sc;
+ int i, ret;
+
+ /* No need to re-read the state if VM is stopped. */
+ if (s->vm_stopped) {
+ return;
+ }
+
+ if (kvm_has_pit_state2()) {
+ ret = kvm_vm_ioctl(kvm_state, KVM_GET_PIT2, &kpit);
+ if (ret < 0) {
+ fprintf(stderr, "KVM_GET_PIT2 failed: %s\n", strerror(-ret));
+ abort();
+ }
+ pit->channels[0].irq_disabled = kpit.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ } else {
+ /*
+ * kvm_pit_state2 is a superset of the kvm_pit_state struct,
+ * so we can use it for KVM_GET_PIT as well.
+ */
+ ret = kvm_vm_ioctl(kvm_state, KVM_GET_PIT, &kpit);
+ if (ret < 0) {
+ fprintf(stderr, "KVM_GET_PIT failed: %s\n", strerror(-ret));
+ abort();
+ }
+ }
+ for (i = 0; i < 3; i++) {
+ kchan = &kpit.channels[i];
+ sc = &pit->channels[i];
+ sc->count = kchan->count;
+ sc->latched_count = kchan->latched_count;
+ sc->count_latched = kchan->count_latched;
+ sc->status_latched = kchan->status_latched;
+ sc->status = kchan->status;
+ sc->read_state = kchan->read_state;
+ sc->write_state = kchan->write_state;
+ sc->write_latch = kchan->write_latch;
+ sc->rw_mode = kchan->rw_mode;
+ sc->mode = kchan->mode;
+ sc->bcd = kchan->bcd;
+ sc->gate = kchan->gate;
+ sc->count_load_time = kchan->count_load_time + s->kernel_clock_offset;
+ }
+
+ sc = &pit->channels[0];
+ sc->next_transition_time =
+ pit_get_next_transition_time(sc, sc->count_load_time);
+}
+
+static void kvm_pit_put(PITCommonState *pit)
+{
+ KVMPITState *s = KVM_PIT(pit);
+ struct kvm_pit_state2 kpit = {};
+ struct kvm_pit_channel_state *kchan;
+ struct PITChannelState *sc;
+ int i, ret;
+
+ /* The offset keeps changing as long as the VM is stopped. */
+ if (s->vm_stopped) {
+ kvm_pit_update_clock_offset(s);
+ }
+
+ kpit.flags = pit->channels[0].irq_disabled ? KVM_PIT_FLAGS_HPET_LEGACY : 0;
+ for (i = 0; i < 3; i++) {
+ kchan = &kpit.channels[i];
+ sc = &pit->channels[i];
+ kchan->count = sc->count;
+ kchan->latched_count = sc->latched_count;
+ kchan->count_latched = sc->count_latched;
+ kchan->status_latched = sc->status_latched;
+ kchan->status = sc->status;
+ kchan->read_state = sc->read_state;
+ kchan->write_state = sc->write_state;
+ kchan->write_latch = sc->write_latch;
+ kchan->rw_mode = sc->rw_mode;
+ kchan->mode = sc->mode;
+ kchan->bcd = sc->bcd;
+ kchan->gate = sc->gate;
+ kchan->count_load_time = sc->count_load_time - s->kernel_clock_offset;
+ }
+
+ ret = kvm_vm_ioctl(kvm_state,
+ kvm_has_pit_state2() ? KVM_SET_PIT2 : KVM_SET_PIT,
+ &kpit);
+ if (ret < 0) {
+ fprintf(stderr, "%s failed: %s\n",
+ kvm_has_pit_state2() ? "KVM_SET_PIT2" : "KVM_SET_PIT",
+ strerror(-ret));
+ abort();
+ }
+}
+
+static void kvm_pit_set_gate(PITCommonState *s, PITChannelState *sc, int val)
+{
+ kvm_pit_get(s);
+
+ switch (sc->mode) {
+ default:
+ case 0:
+ case 4:
+ /* XXX: just disable/enable counting */
+ break;
+ case 1:
+ case 2:
+ case 3:
+ case 5:
+ if (sc->gate < val) {
+ /* restart counting on rising edge */
+ sc->count_load_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+ }
+ break;
+ }
+ sc->gate = val;
+
+ kvm_pit_put(s);
+}
+
+static void kvm_pit_get_channel_info(PITCommonState *s, PITChannelState *sc,
+ PITChannelInfo *info)
+{
+ kvm_pit_get(s);
+
+ pit_get_channel_info_common(s, sc, info);
+}
+
+static void kvm_pit_reset(DeviceState *dev)
+{
+ PITCommonState *s = PIT_COMMON(dev);
+
+ pit_reset_common(s);
+
+ kvm_pit_put(s);
+}
+
+static void kvm_pit_irq_control(void *opaque, int n, int enable)
+{
+ PITCommonState *pit = opaque;
+ PITChannelState *s = &pit->channels[0];
+
+ kvm_pit_get(pit);
+
+ s->irq_disabled = !enable;
+
+ kvm_pit_put(pit);
+}
+
+static void kvm_pit_vm_state_change(void *opaque, bool running,
+ RunState state)
+{
+ KVMPITState *s = opaque;
+
+ if (running) {
+ kvm_pit_update_clock_offset(s);
+ kvm_pit_put(PIT_COMMON(s));
+ s->vm_stopped = false;
+ } else {
+ kvm_pit_update_clock_offset(s);
+ kvm_pit_get(PIT_COMMON(s));
+ s->vm_stopped = true;
+ }
+}
+
+static void kvm_pit_realizefn(DeviceState *dev, Error **errp)
+{
+ PITCommonState *pit = PIT_COMMON(dev);
+ KVMPITClass *kpc = KVM_PIT_GET_CLASS(dev);
+ KVMPITState *s = KVM_PIT(pit);
+ struct kvm_pit_config config = {
+ .flags = 0,
+ };
+ int ret;
+
+ if (kvm_check_extension(kvm_state, KVM_CAP_PIT2)) {
+ ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_PIT2, &config);
+ } else {
+ ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_PIT);
+ }
+ if (ret < 0) {
+ error_setg(errp, "Create kernel PIC irqchip failed: %s",
+ strerror(-ret));
+ return;
+ }
+ switch (s->lost_tick_policy) {
+ case LOST_TICK_POLICY_DELAY:
+ break; /* enabled by default */
+ case LOST_TICK_POLICY_DISCARD:
+ if (kvm_check_extension(kvm_state, KVM_CAP_REINJECT_CONTROL)) {
+ struct kvm_reinject_control control = { .pit_reinject = 0 };
+
+ ret = kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
+ if (ret < 0) {
+ error_setg(errp,
+ "Can't disable in-kernel PIT reinjection: %s",
+ strerror(-ret));
+ return;
+ }
+ }
+ break;
+ default:
+ error_setg(errp, "Lost tick policy not supported.");
+ return;
+ }
+
+ memory_region_init_io(&pit->ioports, OBJECT(dev), NULL, NULL, "kvm-pit", 4);
+
+ qdev_init_gpio_in(dev, kvm_pit_irq_control, 1);
+
+ qemu_add_vm_change_state_handler(kvm_pit_vm_state_change, s);
+
+ kpc->parent_realize(dev, errp);
+}
+
+static Property kvm_pit_properties[] = {
+ DEFINE_PROP_UINT32("iobase", PITCommonState, iobase, -1),
+ DEFINE_PROP_LOSTTICKPOLICY("lost_tick_policy", KVMPITState,
+ lost_tick_policy, LOST_TICK_POLICY_DELAY),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void kvm_pit_class_init(ObjectClass *klass, void *data)
+{
+ KVMPITClass *kpc = KVM_PIT_CLASS(klass);
+ PITCommonClass *k = PIT_COMMON_CLASS(klass);
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ device_class_set_parent_realize(dc, kvm_pit_realizefn,
+ &kpc->parent_realize);
+ k->set_channel_gate = kvm_pit_set_gate;
+ k->get_channel_info = kvm_pit_get_channel_info;
+ dc->reset = kvm_pit_reset;
+ device_class_set_props(dc, kvm_pit_properties);
+}
+
+static const TypeInfo kvm_pit_info = {
+ .name = TYPE_KVM_I8254,
+ .parent = TYPE_PIT_COMMON,
+ .instance_size = sizeof(KVMPITState),
+ .class_init = kvm_pit_class_init,
+ .class_size = sizeof(KVMPITClass),
+};
+
+static void kvm_pit_register(void)
+{
+ type_register_static(&kvm_pit_info);
+}
+
+type_init(kvm_pit_register)
diff --git a/hw/i386/kvm/i8259.c b/hw/i386/kvm/i8259.c
new file mode 100644
index 000000000..d61bae4dc
--- /dev/null
+++ b/hw/i386/kvm/i8259.c
@@ -0,0 +1,167 @@
+/*
+ * KVM in-kernel PIC (i8259) support
+ *
+ * Copyright (c) 2011 Siemens AG
+ *
+ * Authors:
+ * Jan Kiszka <jan.kiszka@siemens.com>
+ *
+ * This work is licensed under the terms of the GNU GPL version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/isa/i8259_internal.h"
+#include "hw/intc/i8259.h"
+#include "qemu/module.h"
+#include "hw/i386/apic_internal.h"
+#include "hw/irq.h"
+#include "sysemu/kvm.h"
+#include "qom/object.h"
+
+#define TYPE_KVM_I8259 "kvm-i8259"
+typedef struct KVMPICClass KVMPICClass;
+DECLARE_CLASS_CHECKERS(KVMPICClass, KVM_PIC,
+ TYPE_KVM_I8259)
+
+/**
+ * KVMPICClass:
+ * @parent_realize: The parent's realizefn.
+ */
+struct KVMPICClass {
+ PICCommonClass parent_class;
+
+ DeviceRealize parent_realize;
+};
+
+static void kvm_pic_get(PICCommonState *s)
+{
+ struct kvm_irqchip chip;
+ struct kvm_pic_state *kpic;
+ int ret;
+
+ chip.chip_id = s->master ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE;
+ ret = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, &chip);
+ if (ret < 0) {
+ fprintf(stderr, "KVM_GET_IRQCHIP failed: %s\n", strerror(-ret));
+ abort();
+ }
+
+ kpic = &chip.chip.pic;
+
+ s->last_irr = kpic->last_irr;
+ s->irr = kpic->irr;
+ s->imr = kpic->imr;
+ s->isr = kpic->isr;
+ s->priority_add = kpic->priority_add;
+ s->irq_base = kpic->irq_base;
+ s->read_reg_select = kpic->read_reg_select;
+ s->poll = kpic->poll;
+ s->special_mask = kpic->special_mask;
+ s->init_state = kpic->init_state;
+ s->auto_eoi = kpic->auto_eoi;
+ s->rotate_on_auto_eoi = kpic->rotate_on_auto_eoi;
+ s->special_fully_nested_mode = kpic->special_fully_nested_mode;
+ s->init4 = kpic->init4;
+ s->elcr = kpic->elcr;
+ s->elcr_mask = kpic->elcr_mask;
+}
+
+static void kvm_pic_put(PICCommonState *s)
+{
+ struct kvm_irqchip chip;
+ struct kvm_pic_state *kpic;
+ int ret;
+
+ chip.chip_id = s->master ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE;
+
+ kpic = &chip.chip.pic;
+
+ kpic->last_irr = s->last_irr;
+ kpic->irr = s->irr;
+ kpic->imr = s->imr;
+ kpic->isr = s->isr;
+ kpic->priority_add = s->priority_add;
+ kpic->irq_base = s->irq_base;
+ kpic->read_reg_select = s->read_reg_select;
+ kpic->poll = s->poll;
+ kpic->special_mask = s->special_mask;
+ kpic->init_state = s->init_state;
+ kpic->auto_eoi = s->auto_eoi;
+ kpic->rotate_on_auto_eoi = s->rotate_on_auto_eoi;
+ kpic->special_fully_nested_mode = s->special_fully_nested_mode;
+ kpic->init4 = s->init4;
+ kpic->elcr = s->elcr;
+ kpic->elcr_mask = s->elcr_mask;
+
+ ret = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, &chip);
+ if (ret < 0) {
+ fprintf(stderr, "KVM_SET_IRQCHIP failed: %s\n", strerror(-ret));
+ abort();
+ }
+}
+
+static void kvm_pic_reset(DeviceState *dev)
+{
+ PICCommonState *s = PIC_COMMON(dev);
+
+ s->elcr = 0;
+ pic_reset_common(s);
+
+ kvm_pic_put(s);
+}
+
+static void kvm_pic_set_irq(void *opaque, int irq, int level)
+{
+ int delivered;
+
+ pic_stat_update_irq(irq, level);
+ delivered = kvm_set_irq(kvm_state, irq, level);
+ apic_report_irq_delivered(delivered);
+}
+
+static void kvm_pic_realize(DeviceState *dev, Error **errp)
+{
+ PICCommonState *s = PIC_COMMON(dev);
+ KVMPICClass *kpc = KVM_PIC_GET_CLASS(dev);
+
+ memory_region_init_io(&s->base_io, OBJECT(dev), NULL, NULL, "kvm-pic", 2);
+ memory_region_init_io(&s->elcr_io, OBJECT(dev), NULL, NULL, "kvm-elcr", 1);
+
+ kpc->parent_realize(dev, errp);
+}
+
+qemu_irq *kvm_i8259_init(ISABus *bus)
+{
+ i8259_init_chip(TYPE_KVM_I8259, bus, true);
+ i8259_init_chip(TYPE_KVM_I8259, bus, false);
+
+ return qemu_allocate_irqs(kvm_pic_set_irq, NULL, ISA_NUM_IRQS);
+}
+
+static void kvm_i8259_class_init(ObjectClass *klass, void *data)
+{
+ KVMPICClass *kpc = KVM_PIC_CLASS(klass);
+ PICCommonClass *k = PIC_COMMON_CLASS(klass);
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->reset = kvm_pic_reset;
+ device_class_set_parent_realize(dc, kvm_pic_realize, &kpc->parent_realize);
+ k->pre_save = kvm_pic_get;
+ k->post_load = kvm_pic_put;
+}
+
+static const TypeInfo kvm_i8259_info = {
+ .name = TYPE_KVM_I8259,
+ .parent = TYPE_PIC_COMMON,
+ .instance_size = sizeof(PICCommonState),
+ .class_init = kvm_i8259_class_init,
+ .class_size = sizeof(KVMPICClass),
+};
+
+static void kvm_pic_register_types(void)
+{
+ type_register_static(&kvm_i8259_info);
+}
+
+type_init(kvm_pic_register_types)
diff --git a/hw/i386/kvm/ioapic.c b/hw/i386/kvm/ioapic.c
new file mode 100644
index 000000000..ee7c8ef68
--- /dev/null
+++ b/hw/i386/kvm/ioapic.c
@@ -0,0 +1,165 @@
+/*
+ * KVM in-kernel IOAPIC support
+ *
+ * Copyright (c) 2011 Siemens AG
+ *
+ * Authors:
+ * Jan Kiszka <jan.kiszka@siemens.com>
+ *
+ * This work is licensed under the terms of the GNU GPL version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "monitor/monitor.h"
+#include "hw/i386/x86.h"
+#include "hw/qdev-properties.h"
+#include "hw/i386/ioapic_internal.h"
+#include "hw/i386/apic_internal.h"
+#include "sysemu/kvm.h"
+
+/* PC Utility function */
+void kvm_pc_setup_irq_routing(bool pci_enabled)
+{
+ KVMState *s = kvm_state;
+ int i;
+
+ assert(kvm_has_gsi_routing());
+ for (i = 0; i < 8; ++i) {
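+ /* IRQ2 on the master PIC is the cascade to the slave and gets no route */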
+ if (i == 2) {
+ continue;
+ }
+ kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_PIC_MASTER, i);
+ }
+ for (i = 8; i < 16; ++i) {
+ kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_PIC_SLAVE, i - 8);
+ }
+ if (pci_enabled) {
+ for (i = 0; i < 24; ++i) {
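+ /*
+ * ISA IRQ0 (the PIT) is delivered on IOAPIC pin 2, matching the usual
+ * ACPI IRQ0 -> GSI2 interrupt source override; GSI2 itself gets no
+ * separate route.
+ */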
+ if (i == 0) {
+ kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_IOAPIC, 2);
+ } else if (i != 2) {
+ kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_IOAPIC, i);
+ }
+ }
+ }
+ kvm_irqchip_commit_routes(s);
+}
+
+typedef struct KVMIOAPICState KVMIOAPICState;
+
+struct KVMIOAPICState {
+ IOAPICCommonState ioapic;
+ uint32_t kvm_gsi_base;
+};
+
+static void kvm_ioapic_get(IOAPICCommonState *s)
+{
+ struct kvm_irqchip chip;
+ struct kvm_ioapic_state *kioapic;
+ int ret, i;
+
+ chip.chip_id = KVM_IRQCHIP_IOAPIC;
+ ret = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, &chip);
+ if (ret < 0) {
+ fprintf(stderr, "KVM_GET_IRQCHIP failed: %s\n", strerror(-ret));
+ abort();
+ }
+
+ kioapic = &chip.chip.ioapic;
+
+ s->id = kioapic->id;
+ s->ioregsel = kioapic->ioregsel;
+ s->irr = kioapic->irr;
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ s->ioredtbl[i] = kioapic->redirtbl[i].bits;
+ }
+}
+
+static void kvm_ioapic_put(IOAPICCommonState *s)
+{
+ struct kvm_irqchip chip;
+ struct kvm_ioapic_state *kioapic;
+ int ret, i;
+
+ chip.chip_id = KVM_IRQCHIP_IOAPIC;
+ kioapic = &chip.chip.ioapic;
+
+ kioapic->id = s->id;
+ kioapic->ioregsel = s->ioregsel;
+ kioapic->base_address = s->busdev.mmio[0].addr;
+ kioapic->irr = s->irr;
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ kioapic->redirtbl[i].bits = s->ioredtbl[i];
+ }
+
+ ret = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, &chip);
+ if (ret < 0) {
+ fprintf(stderr, "KVM_SET_IRQCHIP failed: %s\n", strerror(-ret));
+ abort();
+ }
+}
+
+static void kvm_ioapic_reset(DeviceState *dev)
+{
+ IOAPICCommonState *s = IOAPIC_COMMON(dev);
+
+ ioapic_reset_common(dev);
+ kvm_ioapic_put(s);
+}
+
+static void kvm_ioapic_set_irq(void *opaque, int irq, int level)
+{
+ KVMIOAPICState *s = opaque;
+ IOAPICCommonState *common = IOAPIC_COMMON(s);
+ int delivered;
+
+ ioapic_stat_update_irq(common, irq, level);
+ delivered = kvm_set_irq(kvm_state, s->kvm_gsi_base + irq, level);
+ apic_report_irq_delivered(delivered);
+}
+
+static void kvm_ioapic_realize(DeviceState *dev, Error **errp)
+{
+ IOAPICCommonState *s = IOAPIC_COMMON(dev);
+
+ memory_region_init_io(&s->io_memory, OBJECT(dev), NULL, NULL, "kvm-ioapic", 0x1000);
+ /*
+ * The KVM in-kernel ioapic only implements version 0x11 for now. This
+ * value is only used when we want to dump the ioapic version.
+ */
+ s->version = 0x11;
+
+ qdev_init_gpio_in(dev, kvm_ioapic_set_irq, IOAPIC_NUM_PINS);
+}
+
+static Property kvm_ioapic_properties[] = {
+ DEFINE_PROP_UINT32("gsi_base", KVMIOAPICState, kvm_gsi_base, 0),
+ DEFINE_PROP_END_OF_LIST()
+};
+
+static void kvm_ioapic_class_init(ObjectClass *klass, void *data)
+{
+ IOAPICCommonClass *k = IOAPIC_COMMON_CLASS(klass);
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ k->realize = kvm_ioapic_realize;
+ k->pre_save = kvm_ioapic_get;
+ k->post_load = kvm_ioapic_put;
+ dc->reset = kvm_ioapic_reset;
+ device_class_set_props(dc, kvm_ioapic_properties);
+}
+
+static const TypeInfo kvm_ioapic_info = {
+ .name = TYPE_KVM_IOAPIC,
+ .parent = TYPE_IOAPIC_COMMON,
+ .instance_size = sizeof(KVMIOAPICState),
+ .class_init = kvm_ioapic_class_init,
+};
+
+static void kvm_ioapic_register_types(void)
+{
+ type_register_static(&kvm_ioapic_info);
+}
+
+type_init(kvm_ioapic_register_types)
diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
new file mode 100644
index 000000000..95467f1de
--- /dev/null
+++ b/hw/i386/kvm/meson.build
@@ -0,0 +1,8 @@
+i386_kvm_ss = ss.source_set()
+i386_kvm_ss.add(files('clock.c'))
+i386_kvm_ss.add(when: 'CONFIG_APIC', if_true: files('apic.c'))
+i386_kvm_ss.add(when: 'CONFIG_I8254', if_true: files('i8254.c'))
+i386_kvm_ss.add(when: 'CONFIG_I8259', if_true: files('i8259.c'))
+i386_kvm_ss.add(when: 'CONFIG_IOAPIC', if_true: files('ioapic.c'))
+
+i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c
new file mode 100644
index 000000000..43f8a8f67
--- /dev/null
+++ b/hw/i386/kvmvapic.c
@@ -0,0 +1,871 @@
+/*
+ * TPR optimization for 32-bit Windows guests (XP and Server 2003)
+ *
+ * Copyright (C) 2007-2008 Qumranet Technologies
+ * Copyright (C) 2012 Jan Kiszka, Siemens AG
+ *
+ * This work is licensed under the terms of the GNU GPL version 2, or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/module.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/cpus.h"
+#include "sysemu/hw_accel.h"
+#include "sysemu/kvm.h"
+#include "sysemu/runstate.h"
+#include "hw/i386/apic_internal.h"
+#include "hw/sysbus.h"
+#include "hw/boards.h"
+#include "migration/vmstate.h"
+#include "qom/object.h"
+
+#define VAPIC_IO_PORT 0x7e
+
+#define VAPIC_CPU_SHIFT 7
+
+#define ROM_BLOCK_SIZE 512
+#define ROM_BLOCK_MASK (~(ROM_BLOCK_SIZE - 1))
+
+typedef enum VAPICMode {
+ VAPIC_INACTIVE = 0,
+ VAPIC_ACTIVE = 1,
+ VAPIC_STANDBY = 2,
+} VAPICMode;
+
+typedef struct VAPICHandlers {
+ uint32_t set_tpr;
+ uint32_t set_tpr_eax;
+ uint32_t get_tpr[8];
+ uint32_t get_tpr_stack;
+} QEMU_PACKED VAPICHandlers;
+
+typedef struct GuestROMState {
+ char signature[8];
+ uint32_t vaddr;
+ uint32_t fixup_start;
+ uint32_t fixup_end;
+ uint32_t vapic_vaddr;
+ uint32_t vapic_size;
+ uint32_t vcpu_shift;
+ uint32_t real_tpr_addr;
+ VAPICHandlers up;
+ VAPICHandlers mp;
+} QEMU_PACKED GuestROMState;
+
+struct VAPICROMState {
+ SysBusDevice busdev;
+ MemoryRegion io;
+ MemoryRegion rom;
+ uint32_t state;
+ uint32_t rom_state_paddr;
+ uint32_t rom_state_vaddr;
+ uint32_t vapic_paddr;
+ uint32_t real_tpr_addr;
+ GuestROMState rom_state;
+ size_t rom_size;
+ bool rom_mapped_writable;
+ VMChangeStateEntry *vmsentry;
+};
+
+#define TYPE_VAPIC "kvmvapic"
+OBJECT_DECLARE_SIMPLE_TYPE(VAPICROMState, VAPIC)
+
+#define TPR_INSTR_ABS_MODRM 0x1
+#define TPR_INSTR_MATCH_MODRM_REG 0x2
+
+typedef struct TPRInstruction {
+ uint8_t opcode;
+ uint8_t modrm_reg;
+ unsigned int flags;
+ TPRAccess access;
+ size_t length;
+ off_t addr_offset;
+} TPRInstruction;
+
+/* must be sorted by length, shortest first */
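+/*
+ * Example entry, for orientation: opcode 0xa1 is "mov eax, ds:<abs32>", a
+ * 5-byte instruction (opcode plus 32-bit address) whose absolute address
+ * starts at byte offset 1. The ModRM-based forms carry an extra ModRM byte,
+ * so their address field starts at offset 2.
+ */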
+static const TPRInstruction tpr_instr[] = {
+ { /* mov abs to eax */
+ .opcode = 0xa1,
+ .access = TPR_ACCESS_READ,
+ .length = 5,
+ .addr_offset = 1,
+ },
+ { /* mov eax to abs */
+ .opcode = 0xa3,
+ .access = TPR_ACCESS_WRITE,
+ .length = 5,
+ .addr_offset = 1,
+ },
+ { /* mov r32 to r/m32 */
+ .opcode = 0x89,
+ .flags = TPR_INSTR_ABS_MODRM,
+ .access = TPR_ACCESS_WRITE,
+ .length = 6,
+ .addr_offset = 2,
+ },
+ { /* mov r/m32 to r32 */
+ .opcode = 0x8b,
+ .flags = TPR_INSTR_ABS_MODRM,
+ .access = TPR_ACCESS_READ,
+ .length = 6,
+ .addr_offset = 2,
+ },
+ { /* push r/m32 */
+ .opcode = 0xff,
+ .modrm_reg = 6,
+ .flags = TPR_INSTR_ABS_MODRM | TPR_INSTR_MATCH_MODRM_REG,
+ .access = TPR_ACCESS_READ,
+ .length = 6,
+ .addr_offset = 2,
+ },
+ { /* mov imm32, r/m32 (c7/0) */
+ .opcode = 0xc7,
+ .modrm_reg = 0,
+ .flags = TPR_INSTR_ABS_MODRM | TPR_INSTR_MATCH_MODRM_REG,
+ .access = TPR_ACCESS_WRITE,
+ .length = 10,
+ .addr_offset = 2,
+ },
+};
+
+static void read_guest_rom_state(VAPICROMState *s)
+{
+ cpu_physical_memory_read(s->rom_state_paddr, &s->rom_state,
+ sizeof(GuestROMState));
+}
+
+static void write_guest_rom_state(VAPICROMState *s)
+{
+ cpu_physical_memory_write(s->rom_state_paddr, &s->rom_state,
+ sizeof(GuestROMState));
+}
+
+static void update_guest_rom_state(VAPICROMState *s)
+{
+ read_guest_rom_state(s);
+
+ s->rom_state.real_tpr_addr = cpu_to_le32(s->real_tpr_addr);
+ s->rom_state.vcpu_shift = cpu_to_le32(VAPIC_CPU_SHIFT);
+
+ write_guest_rom_state(s);
+}
+
+static int find_real_tpr_addr(VAPICROMState *s, CPUX86State *env)
+{
+ CPUState *cs = env_cpu(env);
+ hwaddr paddr;
+ target_ulong addr;
+
+ if (s->state == VAPIC_ACTIVE) {
+ return 0;
+ }
+ /*
+ * If there is no prior TPR access instruction we could analyze (which is
+ * the case after resume from hibernation), we need to scan the possible
+ * virtual address space for the APIC mapping.
+ */
+ for (addr = 0xfffff000; addr >= 0x80000000; addr -= TARGET_PAGE_SIZE) {
+ paddr = cpu_get_phys_page_debug(cs, addr);
+ if (paddr != APIC_DEFAULT_ADDRESS) {
+ continue;
+ }
+ s->real_tpr_addr = addr + 0x80;
+ update_guest_rom_state(s);
+ return 0;
+ }
+ return -1;
+}
+
+static uint8_t modrm_reg(uint8_t modrm)
+{
+ return (modrm >> 3) & 7;
+}
+
+static bool is_abs_modrm(uint8_t modrm)
+{
+ return (modrm & 0xc7) == 0x05;
+}
+
+static bool opcode_matches(uint8_t *opcode, const TPRInstruction *instr)
+{
+ return opcode[0] == instr->opcode &&
+ (!(instr->flags & TPR_INSTR_ABS_MODRM) || is_abs_modrm(opcode[1])) &&
+ (!(instr->flags & TPR_INSTR_MATCH_MODRM_REG) ||
+ modrm_reg(opcode[1]) == instr->modrm_reg);
+}
+
+static int evaluate_tpr_instruction(VAPICROMState *s, X86CPU *cpu,
+ target_ulong *pip, TPRAccess access)
+{
+ CPUState *cs = CPU(cpu);
+ const TPRInstruction *instr;
+ target_ulong ip = *pip;
+ uint8_t opcode[2];
+ uint32_t real_tpr_addr;
+ int i;
+
+ if ((ip & 0xf0000000ULL) != 0x80000000ULL &&
+ (ip & 0xf0000000ULL) != 0xe0000000ULL) {
+ return -1;
+ }
+
+ /*
+ * Early Windows 2003 SMP initialization contains a
+ *
+ * mov imm32, r/m32
+ *
+ * instruction that is patched by TPR optimization. The problem is that
+ * RSP, used by the patched instruction, is zero, so the guest gets a
+ * double fault and dies.
+ */
+ if (cpu->env.regs[R_ESP] == 0) {
+ return -1;
+ }
+
+ if (kvm_enabled() && !kvm_irqchip_in_kernel()) {
+ /*
+ * KVM without kernel-based TPR access reporting will pass an IP that
+ * points after the accessing instruction. So we need to look backward
+ * to find the reason.
+ */
+ for (i = 0; i < ARRAY_SIZE(tpr_instr); i++) {
+ instr = &tpr_instr[i];
+ if (instr->access != access) {
+ continue;
+ }
+ if (cpu_memory_rw_debug(cs, ip - instr->length, opcode,
+ sizeof(opcode), 0) < 0) {
+ return -1;
+ }
+ if (opcode_matches(opcode, instr)) {
+ ip -= instr->length;
+ goto instruction_ok;
+ }
+ }
+ return -1;
+ } else {
+ if (cpu_memory_rw_debug(cs, ip, opcode, sizeof(opcode), 0) < 0) {
+ return -1;
+ }
+ for (i = 0; i < ARRAY_SIZE(tpr_instr); i++) {
+ instr = &tpr_instr[i];
+ if (opcode_matches(opcode, instr)) {
+ goto instruction_ok;
+ }
+ }
+ return -1;
+ }
+
+instruction_ok:
+ /*
+ * Grab the virtual TPR address from the instruction
+ * and update the cached values.
+ */
+ if (cpu_memory_rw_debug(cs, ip + instr->addr_offset,
+ (void *)&real_tpr_addr,
+ sizeof(real_tpr_addr), 0) < 0) {
+ return -1;
+ }
+ real_tpr_addr = le32_to_cpu(real_tpr_addr);
+ if ((real_tpr_addr & 0xfff) != 0x80) {
+ return -1;
+ }
+ s->real_tpr_addr = real_tpr_addr;
+ update_guest_rom_state(s);
+
+ *pip = ip;
+ return 0;
+}
+
+static int update_rom_mapping(VAPICROMState *s, CPUX86State *env, target_ulong ip)
+{
+ CPUState *cs = env_cpu(env);
+ hwaddr paddr;
+ uint32_t rom_state_vaddr;
+ uint32_t pos, patch, offset;
+
+ /* nothing to do if already activated */
+ if (s->state == VAPIC_ACTIVE) {
+ return 0;
+ }
+
+ /* bail out if ROM init code was not executed (missing ROM?) */
+ if (s->state == VAPIC_INACTIVE) {
+ return -1;
+ }
+
+ /* find out virtual address of the ROM */
+ rom_state_vaddr = s->rom_state_paddr + (ip & 0xf0000000);
+ paddr = cpu_get_phys_page_debug(cs, rom_state_vaddr);
+ if (paddr == -1) {
+ return -1;
+ }
+ paddr += rom_state_vaddr & ~TARGET_PAGE_MASK;
+ if (paddr != s->rom_state_paddr) {
+ return -1;
+ }
+ read_guest_rom_state(s);
+ if (memcmp(s->rom_state.signature, "kvm aPiC", 8) != 0) {
+ return -1;
+ }
+ s->rom_state_vaddr = rom_state_vaddr;
+
+ /* fixup addresses in ROM if needed */
+ if (rom_state_vaddr == le32_to_cpu(s->rom_state.vaddr)) {
+ return 0;
+ }
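+ /*
+ * The range [fixup_start, fixup_end) is a table of 32-bit ROM offsets; the
+ * 32-bit value stored at each such offset is rebased by the delta between
+ * the ROM's current virtual base and the base recorded in the guest ROM
+ * state.
+ */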
+ for (pos = le32_to_cpu(s->rom_state.fixup_start);
+ pos < le32_to_cpu(s->rom_state.fixup_end);
+ pos += 4) {
+ cpu_physical_memory_read(paddr + pos - s->rom_state.vaddr,
+ &offset, sizeof(offset));
+ offset = le32_to_cpu(offset);
+ cpu_physical_memory_read(paddr + offset, &patch, sizeof(patch));
+ patch = le32_to_cpu(patch);
+ patch += rom_state_vaddr - le32_to_cpu(s->rom_state.vaddr);
+ patch = cpu_to_le32(patch);
+ cpu_physical_memory_write(paddr + offset, &patch, sizeof(patch));
+ }
+ read_guest_rom_state(s);
+ s->vapic_paddr = paddr + le32_to_cpu(s->rom_state.vapic_vaddr) -
+ le32_to_cpu(s->rom_state.vaddr);
+
+ return 0;
+}
+
+/*
+ * Tries to read the unique processor number from the Kernel Processor Control
+ * Region (KPCR) of 32-bit Windows XP and Server 2003. Returns -1 if the KPCR
+ * cannot be accessed or is considered invalid. This also ensures that we are
+ * not patching the wrong guest.
+ */
+static int get_kpcr_number(X86CPU *cpu)
+{
+ CPUX86State *env = &cpu->env;
+ struct kpcr {
+ uint8_t fill1[0x1c];
+ uint32_t self;
+ uint8_t fill2[0x31];
+ uint8_t number;
+ } QEMU_PACKED kpcr;
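+ /*
+ * With the padding above, "self" sits at KPCR offset 0x1c and "number" at
+ * offset 0x1c + 4 + 0x31 = 0x51, the layout used by the 32-bit Windows
+ * versions this code targets.
+ */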
+
+ if (cpu_memory_rw_debug(CPU(cpu), env->segs[R_FS].base,
+ (void *)&kpcr, sizeof(kpcr), 0) < 0 ||
+ kpcr.self != env->segs[R_FS].base) {
+ return -1;
+ }
+ return kpcr.number;
+}
+
+static int vapic_enable(VAPICROMState *s, X86CPU *cpu)
+{
+ int cpu_number = get_kpcr_number(cpu);
+ hwaddr vapic_paddr;
+ static const uint8_t enabled = 1;
+
+ if (cpu_number < 0) {
+ return -1;
+ }
+ vapic_paddr = s->vapic_paddr +
+ (((hwaddr)cpu_number) << VAPIC_CPU_SHIFT);
+ cpu_physical_memory_write(vapic_paddr + offsetof(VAPICState, enabled),
+ &enabled, sizeof(enabled));
+ apic_enable_vapic(cpu->apic_state, vapic_paddr);
+
+ s->state = VAPIC_ACTIVE;
+
+ return 0;
+}
+
+static void patch_byte(X86CPU *cpu, target_ulong addr, uint8_t byte)
+{
+ cpu_memory_rw_debug(CPU(cpu), addr, &byte, 1, 1);
+}
+
+static void patch_call(X86CPU *cpu, target_ulong ip, uint32_t target)
+{
+ uint32_t offset;
+
+ offset = cpu_to_le32(target - ip - 5);
+ patch_byte(cpu, ip, 0xe8); /* call near */
+ cpu_memory_rw_debug(CPU(cpu), ip + 1, (void *)&offset, sizeof(offset), 1);
+}
+
+typedef struct PatchInfo {
+ VAPICHandlers *handler;
+ target_ulong ip;
+} PatchInfo;
+
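+/*
+ * Each patched sequence is exactly as long as the instruction it replaces,
+ * so no instruction boundaries move. For example, the 5-byte
+ * "mov eax, ds:<abs>" (0xa1) becomes a 5-byte near call to the get_tpr
+ * handler for EAX, and the 10-byte "mov imm32, r/m32" (0xc7/0) becomes a
+ * 5-byte "push imm32" followed by a 5-byte call to set_tpr.
+ */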
+static void do_patch_instruction(CPUState *cs, run_on_cpu_data data)
+{
+ X86CPU *x86_cpu = X86_CPU(cs);
+ PatchInfo *info = (PatchInfo *) data.host_ptr;
+ VAPICHandlers *handlers = info->handler;
+ target_ulong ip = info->ip;
+ uint8_t opcode[2];
+ uint32_t imm32 = 0;
+
+ cpu_memory_rw_debug(cs, ip, opcode, sizeof(opcode), 0);
+
+ switch (opcode[0]) {
+ case 0x89: /* mov r32 to r/m32 */
+ patch_byte(x86_cpu, ip, 0x50 + modrm_reg(opcode[1])); /* push reg */
+ patch_call(x86_cpu, ip + 1, handlers->set_tpr);
+ break;
+ case 0x8b: /* mov r/m32 to r32 */
+ patch_byte(x86_cpu, ip, 0x90);
+ patch_call(x86_cpu, ip + 1, handlers->get_tpr[modrm_reg(opcode[1])]);
+ break;
+ case 0xa1: /* mov abs to eax */
+ patch_call(x86_cpu, ip, handlers->get_tpr[0]);
+ break;
+ case 0xa3: /* mov eax to abs */
+ patch_call(x86_cpu, ip, handlers->set_tpr_eax);
+ break;
+ case 0xc7: /* mov imm32, r/m32 (c7/0) */
+ patch_byte(x86_cpu, ip, 0x68); /* push imm32 */
+ cpu_memory_rw_debug(cs, ip + 6, (void *)&imm32, sizeof(imm32), 0);
+ cpu_memory_rw_debug(cs, ip + 1, (void *)&imm32, sizeof(imm32), 1);
+ patch_call(x86_cpu, ip + 5, handlers->set_tpr);
+ break;
+ case 0xff: /* push r/m32 */
+ patch_byte(x86_cpu, ip, 0x50); /* push eax */
+ patch_call(x86_cpu, ip + 1, handlers->get_tpr_stack);
+ break;
+ default:
+ abort();
+ }
+
+ g_free(info);
+}
+
+static void patch_instruction(VAPICROMState *s, X86CPU *cpu, target_ulong ip)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ CPUState *cs = CPU(cpu);
+ VAPICHandlers *handlers;
+ PatchInfo *info;
+
+ if (ms->smp.cpus == 1) {
+ handlers = &s->rom_state.up;
+ } else {
+ handlers = &s->rom_state.mp;
+ }
+
+ info = g_new(PatchInfo, 1);
+ info->handler = handlers;
+ info->ip = ip;
+
+ async_safe_run_on_cpu(cs, do_patch_instruction, RUN_ON_CPU_HOST_PTR(info));
+}
+
+void vapic_report_tpr_access(DeviceState *dev, CPUState *cs, target_ulong ip,
+ TPRAccess access)
+{
+ VAPICROMState *s = VAPIC(dev);
+ X86CPU *cpu = X86_CPU(cs);
+ CPUX86State *env = &cpu->env;
+
+ cpu_synchronize_state(cs);
+
+ if (evaluate_tpr_instruction(s, cpu, &ip, access) < 0) {
+ if (s->state == VAPIC_ACTIVE) {
+ vapic_enable(s, cpu);
+ }
+ return;
+ }
+ if (update_rom_mapping(s, env, ip) < 0) {
+ return;
+ }
+ if (vapic_enable(s, cpu) < 0) {
+ return;
+ }
+ patch_instruction(s, cpu, ip);
+}
+
+typedef struct VAPICEnableTPRReporting {
+ DeviceState *apic;
+ bool enable;
+} VAPICEnableTPRReporting;
+
+static void vapic_do_enable_tpr_reporting(CPUState *cpu, run_on_cpu_data data)
+{
+ VAPICEnableTPRReporting *info = data.host_ptr;
+ apic_enable_tpr_access_reporting(info->apic, info->enable);
+}
+
+static void vapic_enable_tpr_reporting(bool enable)
+{
+ VAPICEnableTPRReporting info = {
+ .enable = enable,
+ };
+ CPUState *cs;
+ X86CPU *cpu;
+
+ CPU_FOREACH(cs) {
+ cpu = X86_CPU(cs);
+ info.apic = cpu->apic_state;
+ run_on_cpu(cs, vapic_do_enable_tpr_reporting, RUN_ON_CPU_HOST_PTR(&info));
+ }
+}
+
+static void vapic_reset(DeviceState *dev)
+{
+ VAPICROMState *s = VAPIC(dev);
+
+ s->state = VAPIC_INACTIVE;
+ s->rom_state_paddr = 0;
+ vapic_enable_tpr_reporting(false);
+}
+
+/*
+ * Set the IRQ polling hypercalls to the supported variant:
+ * - vmcall if using KVM in-kernel irqchip
+ * - 32-bit VAPIC port write otherwise
+ */
+static int patch_hypercalls(VAPICROMState *s)
+{
+ hwaddr rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK;
+ static const uint8_t vmcall_pattern[] = { /* vmcall */
+ 0xb8, 0x1, 0, 0, 0, 0xf, 0x1, 0xc1
+ };
+ static const uint8_t outl_pattern[] = { /* nop; outl %eax,0x7e */
+ 0xb8, 0x1, 0, 0, 0, 0x90, 0xe7, 0x7e
+ };
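+ /*
+ * Pattern decoding, for reference: "b8 01 00 00 00" is "mov eax, 1",
+ * "0f 01 c1" is VMCALL, "0f 01 d9" is AMD's VMMCALL, and "90 e7 7e" is a
+ * NOP followed by "out 0x7e, eax".
+ */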
+ uint8_t alternates[2];
+ const uint8_t *pattern;
+ const uint8_t *patch;
+ off_t pos;
+ uint8_t *rom;
+
+ rom = g_malloc(s->rom_size);
+ cpu_physical_memory_read(rom_paddr, rom, s->rom_size);
+
+ for (pos = 0; pos < s->rom_size - sizeof(vmcall_pattern); pos++) {
+ if (kvm_irqchip_in_kernel()) {
+ pattern = outl_pattern;
+ alternates[0] = outl_pattern[7];
+ alternates[1] = outl_pattern[7];
+ patch = &vmcall_pattern[5];
+ } else {
+ pattern = vmcall_pattern;
+ alternates[0] = vmcall_pattern[7];
+ alternates[1] = 0xd9; /* AMD's VMMCALL */
+ patch = &outl_pattern[5];
+ }
+ if (memcmp(rom + pos, pattern, 7) == 0 &&
+ (rom[pos + 7] == alternates[0] || rom[pos + 7] == alternates[1])) {
+ cpu_physical_memory_write(rom_paddr + pos + 5, patch, 3);
+ /*
+ * Don't flush the tb here. Under ordinary conditions, the patched
+ * calls are miles away from the current IP. Under malicious
+ * conditions, the guest could trick us into crashing.
+ */
+ }
+ }
+
+ g_free(rom);
+ return 0;
+}
+
+/*
+ * For TCG mode, or for the time when KVM honors read-only memory regions, we
+ * need to
+ * enable write access to the option ROM so that variables can be updated by
+ * the guest.
+ */
+static int vapic_map_rom_writable(VAPICROMState *s)
+{
+ hwaddr rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK;
+ MemoryRegionSection section;
+ MemoryRegion *as;
+ size_t rom_size;
+ uint8_t *ram;
+
+ as = sysbus_address_space(&s->busdev);
+
+ if (s->rom_mapped_writable) {
+ memory_region_del_subregion(as, &s->rom);
+ object_unparent(OBJECT(&s->rom));
+ }
+
+ /* grab RAM memory region (region @rom_paddr may still be pc.rom) */
+ section = memory_region_find(as, 0, 1);
+
+ /* read ROM size from RAM region */
+ if (rom_paddr + 2 >= memory_region_size(section.mr)) {
+ return -1;
+ }
+ ram = memory_region_get_ram_ptr(section.mr);
+ rom_size = ram[rom_paddr + 2] * ROM_BLOCK_SIZE;
+ if (rom_size == 0) {
+ return -1;
+ }
+ s->rom_size = rom_size;
+
+ /* We need to round to avoid creating subpages
+ * from which we cannot run code. */
+ rom_size += rom_paddr & ~TARGET_PAGE_MASK;
+ rom_paddr &= TARGET_PAGE_MASK;
+ rom_size = TARGET_PAGE_ALIGN(rom_size);
+
+ memory_region_init_alias(&s->rom, OBJECT(s), "kvmvapic-rom", section.mr,
+ rom_paddr, rom_size);
+ memory_region_add_subregion_overlap(as, rom_paddr, &s->rom, 1000);
+ s->rom_mapped_writable = true;
+ memory_region_unref(section.mr);
+
+ return 0;
+}
+
+static int vapic_prepare(VAPICROMState *s)
+{
+ if (vapic_map_rom_writable(s) < 0) {
+ return -1;
+ }
+
+ if (patch_hypercalls(s) < 0) {
+ return -1;
+ }
+
+ vapic_enable_tpr_reporting(true);
+
+ return 0;
+}
+
+static void vapic_write(void *opaque, hwaddr addr, uint64_t data,
+ unsigned int size)
+{
+ VAPICROMState *s = opaque;
+ X86CPU *cpu;
+ CPUX86State *env;
+ hwaddr rom_paddr;
+
+ if (!current_cpu) {
+ return;
+ }
+
+ cpu_synchronize_state(current_cpu);
+ cpu = X86_CPU(current_cpu);
+ env = &cpu->env;
+
+ /*
+ * The VAPIC supports three PIO-based hypercalls, all via port 0x7E.
+ * o 16-bit write access:
+ * Reports the option ROM initialization to the hypervisor. Written
+ * value is the offset of the state structure in the ROM.
+ * o 8-bit write access:
+ * Reactivates the VAPIC after a guest hibernation, i.e. after the
+ * option ROM content has been re-initialized by a guest power cycle.
+ * o 32-bit write access:
+ * Poll for pending IRQs, considering the current VAPIC state.
+ */
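+ /*
+ * Guest-side usage, for illustration only: the option ROM announces itself
+ * with a 16-bit "out 0x7e, ax" carrying the offset of the state structure
+ * within its ROM block, and the patched handlers poll for IRQs with a
+ * 32-bit "out 0x7e, eax".
+ */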
+ switch (size) {
+ case 2:
+ if (s->state == VAPIC_INACTIVE) {
+ rom_paddr = (env->segs[R_CS].base + env->eip) & ROM_BLOCK_MASK;
+ s->rom_state_paddr = rom_paddr + data;
+
+ s->state = VAPIC_STANDBY;
+ }
+ if (vapic_prepare(s) < 0) {
+ s->state = VAPIC_INACTIVE;
+ s->rom_state_paddr = 0;
+ break;
+ }
+ break;
+ case 1:
+ if (kvm_enabled()) {
+ /*
+ * Disable triggering instruction in ROM by writing a NOP.
+ *
+ * We cannot do this in TCG mode as the reported IP is not
+ * accurate.
+ */
+ pause_all_vcpus();
+ patch_byte(cpu, env->eip - 2, 0x66);
+ patch_byte(cpu, env->eip - 1, 0x90);
+ resume_all_vcpus();
+ }
+
+ if (s->state == VAPIC_ACTIVE) {
+ break;
+ }
+ if (update_rom_mapping(s, env, env->eip) < 0) {
+ break;
+ }
+ if (find_real_tpr_addr(s, env) < 0) {
+ break;
+ }
+ vapic_enable(s, cpu);
+ break;
+ default:
+ case 4:
+ if (!kvm_irqchip_in_kernel()) {
+ apic_poll_irq(cpu->apic_state);
+ }
+ break;
+ }
+}
+
+static uint64_t vapic_read(void *opaque, hwaddr addr, unsigned size)
+{
+ return 0xffffffff;
+}
+
+static const MemoryRegionOps vapic_ops = {
+ .write = vapic_write,
+ .read = vapic_read,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static void vapic_realize(DeviceState *dev, Error **errp)
+{
+ SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
+ VAPICROMState *s = VAPIC(dev);
+
+ memory_region_init_io(&s->io, OBJECT(s), &vapic_ops, s, "kvmvapic", 2);
+ sysbus_add_io(sbd, VAPIC_IO_PORT, &s->io);
+ sysbus_init_ioports(sbd, VAPIC_IO_PORT, 2);
+
+ option_rom[nb_option_roms].name = "kvmvapic.bin";
+ option_rom[nb_option_roms].bootindex = -1;
+ nb_option_roms++;
+}
+
+static void do_vapic_enable(CPUState *cs, run_on_cpu_data data)
+{
+ VAPICROMState *s = data.host_ptr;
+ X86CPU *cpu = X86_CPU(cs);
+
+ static const uint8_t enabled = 1;
+ cpu_physical_memory_write(s->vapic_paddr + offsetof(VAPICState, enabled),
+ &enabled, sizeof(enabled));
+ apic_enable_vapic(cpu->apic_state, s->vapic_paddr);
+ s->state = VAPIC_ACTIVE;
+}
+
+static void kvmvapic_vm_state_change(void *opaque, bool running,
+ RunState state)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ VAPICROMState *s = opaque;
+ uint8_t *zero;
+
+ if (!running) {
+ return;
+ }
+
+ if (s->state == VAPIC_ACTIVE) {
+ if (ms->smp.cpus == 1) {
+ run_on_cpu(first_cpu, do_vapic_enable, RUN_ON_CPU_HOST_PTR(s));
+ } else {
+ zero = g_malloc0(s->rom_state.vapic_size);
+ cpu_physical_memory_write(s->vapic_paddr, zero,
+ s->rom_state.vapic_size);
+ g_free(zero);
+ }
+ }
+
+ qemu_del_vm_change_state_handler(s->vmsentry);
+ s->vmsentry = NULL;
+}
+
+static int vapic_post_load(void *opaque, int version_id)
+{
+ VAPICROMState *s = opaque;
+
+ /*
+ * The old implementation of qemu-kvm did not provide the state
+ * VAPIC_STANDBY. Reconstruct it.
+ */
+ if (s->state == VAPIC_INACTIVE && s->rom_state_paddr != 0) {
+ s->state = VAPIC_STANDBY;
+ }
+
+ if (s->state != VAPIC_INACTIVE) {
+ if (vapic_prepare(s) < 0) {
+ return -1;
+ }
+ }
+
+ if (!s->vmsentry) {
+ s->vmsentry =
+ qemu_add_vm_change_state_handler(kvmvapic_vm_state_change, s);
+ }
+ return 0;
+}
+
+static const VMStateDescription vmstate_handlers = {
+ .name = "kvmvapic-handlers",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(set_tpr, VAPICHandlers),
+ VMSTATE_UINT32(set_tpr_eax, VAPICHandlers),
+ VMSTATE_UINT32_ARRAY(get_tpr, VAPICHandlers, 8),
+ VMSTATE_UINT32(get_tpr_stack, VAPICHandlers),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_guest_rom = {
+ .name = "kvmvapic-guest-rom",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UNUSED(8), /* signature */
+ VMSTATE_UINT32(vaddr, GuestROMState),
+ VMSTATE_UINT32(fixup_start, GuestROMState),
+ VMSTATE_UINT32(fixup_end, GuestROMState),
+ VMSTATE_UINT32(vapic_vaddr, GuestROMState),
+ VMSTATE_UINT32(vapic_size, GuestROMState),
+ VMSTATE_UINT32(vcpu_shift, GuestROMState),
+ VMSTATE_UINT32(real_tpr_addr, GuestROMState),
+ VMSTATE_STRUCT(up, GuestROMState, 0, vmstate_handlers, VAPICHandlers),
+ VMSTATE_STRUCT(mp, GuestROMState, 0, vmstate_handlers, VAPICHandlers),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_vapic = {
+ .name = "kvm-tpr-opt", /* compatible with qemu-kvm VAPIC */
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .post_load = vapic_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_STRUCT(rom_state, VAPICROMState, 0, vmstate_guest_rom,
+ GuestROMState),
+ VMSTATE_UINT32(state, VAPICROMState),
+ VMSTATE_UINT32(real_tpr_addr, VAPICROMState),
+ VMSTATE_UINT32(rom_state_vaddr, VAPICROMState),
+ VMSTATE_UINT32(vapic_paddr, VAPICROMState),
+ VMSTATE_UINT32(rom_state_paddr, VAPICROMState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static void vapic_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->reset = vapic_reset;
+ dc->vmsd = &vmstate_vapic;
+ dc->realize = vapic_realize;
+}
+
+static const TypeInfo vapic_type = {
+ .name = TYPE_VAPIC,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(VAPICROMState),
+ .class_init = vapic_class_init,
+};
+
+static void vapic_register(void)
+{
+ type_register_static(&vapic_type);
+}
+
+type_init(vapic_register);
diff --git a/hw/i386/meson.build b/hw/i386/meson.build
new file mode 100644
index 000000000..213e2e82b
--- /dev/null
+++ b/hw/i386/meson.build
@@ -0,0 +1,37 @@
+i386_ss = ss.source_set()
+i386_ss.add(files(
+ 'fw_cfg.c',
+ 'kvmvapic.c',
+ 'e820_memory_layout.c',
+ 'multiboot.c',
+ 'x86.c',
+))
+
+i386_ss.add(when: 'CONFIG_X86_IOMMU', if_true: files('x86-iommu.c'),
+ if_false: files('x86-iommu-stub.c'))
+i386_ss.add(when: 'CONFIG_AMD_IOMMU', if_true: files('amd_iommu.c'))
+i386_ss.add(when: 'CONFIG_I440FX', if_true: files('pc_piix.c'))
+i386_ss.add(when: 'CONFIG_MICROVM', if_true: files('microvm.c', 'acpi-microvm.c', 'microvm-dt.c'))
+i386_ss.add(when: 'CONFIG_Q35', if_true: files('pc_q35.c'))
+i386_ss.add(when: 'CONFIG_VMMOUSE', if_true: files('vmmouse.c'))
+i386_ss.add(when: 'CONFIG_VMPORT', if_true: files('vmport.c'))
+i386_ss.add(when: 'CONFIG_VTD', if_true: files('intel_iommu.c'))
+i386_ss.add(when: 'CONFIG_SGX', if_true: files('sgx-epc.c','sgx.c'),
+ if_false: files('sgx-stub.c'))
+
+i386_ss.add(when: 'CONFIG_ACPI', if_true: files('acpi-common.c'))
+i386_ss.add(when: 'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device_x86.c'))
+i386_ss.add(when: 'CONFIG_PC', if_true: files(
+ 'pc.c',
+ 'pc_sysfw.c',
+ 'acpi-build.c',
+ 'port92.c'))
+i386_ss.add(when: 'CONFIG_X86_FW_OVMF', if_true: files('pc_sysfw_ovmf.c'),
+ if_false: files('pc_sysfw_ovmf-stubs.c'))
+
+subdir('kvm')
+subdir('xen')
+
+i386_ss.add_all(xenpv_ss)
+
+hw_arch += {'i386': i386_ss}
diff --git a/hw/i386/microvm-dt.c b/hw/i386/microvm-dt.c
new file mode 100644
index 000000000..9c3c4995b
--- /dev/null
+++ b/hw/i386/microvm-dt.c
@@ -0,0 +1,348 @@
+/*
+ * microvm device tree support
+ *
+ * This generates a device tree for microvm and exports it via fw_cfg
+ * as "etc/fdt" to the firmware (edk2 specifically).
+ *
+ * The use case is to allow edk2 to find the PCIe ECAM and the virtio
+ * devices without adding an ACPI parser, reusing the fdt parser
+ * which is needed anyway for the arm platform.
+ *
+ * Note 1: The device tree is incomplete. CPUs and memory are missing,
+ * for example; those can be detected using other fw_cfg files.
+ * PCIe ECAM IRQ routing is also absent, as edk2 doesn't use
+ * interrupts.
+ *
+ * Note 2: This is for firmware only. OSes should use the more
+ * complete ACPI tables for hardware discovery.
+ *
+ * ----------------------------------------------------------------------
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "sysemu/device_tree.h"
+#include "hw/char/serial.h"
+#include "hw/i386/fw_cfg.h"
+#include "hw/rtc/mc146818rtc.h"
+#include "hw/sysbus.h"
+#include "hw/virtio/virtio-mmio.h"
+#include "hw/usb/xhci.h"
+
+#include "microvm-dt.h"
+
+static bool debug;
+
+static void dt_add_microvm_irq(MicrovmMachineState *mms,
+ const char *nodename, uint32_t irq)
+{
+ int index = 0;
+
+ if (irq >= IO_APIC_SECONDARY_IRQBASE) {
+ irq -= IO_APIC_SECONDARY_IRQBASE;
+ index++;
+ }
+
+ qemu_fdt_setprop_cell(mms->fdt, nodename, "interrupt-parent",
+ mms->ioapic_phandle[index]);
+ qemu_fdt_setprop_cells(mms->fdt, nodename, "interrupts", irq, 0);
+}
+
+static void dt_add_virtio(MicrovmMachineState *mms, VirtIOMMIOProxy *mmio)
+{
+ SysBusDevice *dev = SYS_BUS_DEVICE(mmio);
+ VirtioBusState *mmio_virtio_bus = &mmio->bus;
+ BusState *mmio_bus = &mmio_virtio_bus->parent_obj;
+ char *nodename;
+
+ if (QTAILQ_EMPTY(&mmio_bus->children)) {
+ return;
+ }
+
+ hwaddr base = dev->mmio[0].addr;
+ hwaddr size = 512;
+ unsigned index = (base - VIRTIO_MMIO_BASE) / size;
+ uint32_t irq = mms->virtio_irq_base + index;
+
+ nodename = g_strdup_printf("/virtio_mmio@%" PRIx64, base);
+ qemu_fdt_add_subnode(mms->fdt, nodename);
+ qemu_fdt_setprop_string(mms->fdt, nodename, "compatible", "virtio,mmio");
+ qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", 2, base, 2, size);
+ qemu_fdt_setprop(mms->fdt, nodename, "dma-coherent", NULL, 0);
+ dt_add_microvm_irq(mms, nodename, irq);
+ g_free(nodename);
+}
+
+static void dt_add_xhci(MicrovmMachineState *mms)
+{
+ const char compat[] = "generic-xhci";
+ uint32_t irq = MICROVM_XHCI_IRQ;
+ hwaddr base = MICROVM_XHCI_BASE;
+ hwaddr size = XHCI_LEN_REGS;
+ char *nodename;
+
+ nodename = g_strdup_printf("/usb@%" PRIx64, base);
+ qemu_fdt_add_subnode(mms->fdt, nodename);
+ qemu_fdt_setprop(mms->fdt, nodename, "compatible", compat, sizeof(compat));
+ qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", 2, base, 2, size);
+ qemu_fdt_setprop(mms->fdt, nodename, "dma-coherent", NULL, 0);
+ dt_add_microvm_irq(mms, nodename, irq);
+ g_free(nodename);
+}
+
+static void dt_add_pcie(MicrovmMachineState *mms)
+{
+ hwaddr base = PCIE_MMIO_BASE;
+ int nr_pcie_buses;
+ char *nodename;
+
+ nodename = g_strdup_printf("/pcie@%" PRIx64, base);
+ qemu_fdt_add_subnode(mms->fdt, nodename);
+ qemu_fdt_setprop_string(mms->fdt, nodename,
+ "compatible", "pci-host-ecam-generic");
+ qemu_fdt_setprop_string(mms->fdt, nodename, "device_type", "pci");
+ qemu_fdt_setprop_cell(mms->fdt, nodename, "#address-cells", 3);
+ qemu_fdt_setprop_cell(mms->fdt, nodename, "#size-cells", 2);
+ qemu_fdt_setprop_cell(mms->fdt, nodename, "linux,pci-domain", 0);
+ qemu_fdt_setprop(mms->fdt, nodename, "dma-coherent", NULL, 0);
+
+ qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg",
+ 2, PCIE_ECAM_BASE, 2, PCIE_ECAM_SIZE);
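+ /*
+ * Each "ranges" entry below is 7 cells: a 3-cell PCI child address (one
+ * cell of space/flags plus a 64-bit address), a 2-cell parent CPU address
+ * and a 2-cell size, matching #address-cells = 3 / #size-cells = 2 on this
+ * node and the 2/2 cells of the root node.
+ */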
+ if (mms->gpex.mmio64.size) {
+ qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "ranges",
+
+ 1, FDT_PCI_RANGE_MMIO,
+ 2, mms->gpex.mmio32.base,
+ 2, mms->gpex.mmio32.base,
+ 2, mms->gpex.mmio32.size,
+
+ 1, FDT_PCI_RANGE_MMIO_64BIT,
+ 2, mms->gpex.mmio64.base,
+ 2, mms->gpex.mmio64.base,
+ 2, mms->gpex.mmio64.size);
+ } else {
+ qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "ranges",
+
+ 1, FDT_PCI_RANGE_MMIO,
+ 2, mms->gpex.mmio32.base,
+ 2, mms->gpex.mmio32.base,
+ 2, mms->gpex.mmio32.size);
+ }
+
+ nr_pcie_buses = PCIE_ECAM_SIZE / PCIE_MMCFG_SIZE_MIN;
+ qemu_fdt_setprop_cells(mms->fdt, nodename, "bus-range", 0,
+ nr_pcie_buses - 1);
+
+ g_free(nodename);
+}
+
+static void dt_add_ioapic(MicrovmMachineState *mms, SysBusDevice *dev)
+{
+ hwaddr base = dev->mmio[0].addr;
+ char *nodename;
+ uint32_t ph;
+ int index;
+
+ switch (base) {
+ case IO_APIC_DEFAULT_ADDRESS:
+ index = 0;
+ break;
+ case IO_APIC_SECONDARY_ADDRESS:
+ index = 1;
+ break;
+ default:
+ fprintf(stderr, "unknown ioapic @ %" PRIx64 "\n", base);
+ return;
+ }
+
+ nodename = g_strdup_printf("/ioapic%d@%" PRIx64, index + 1, base);
+ qemu_fdt_add_subnode(mms->fdt, nodename);
+ qemu_fdt_setprop_string(mms->fdt, nodename,
+ "compatible", "intel,ce4100-ioapic");
+ qemu_fdt_setprop(mms->fdt, nodename, "interrupt-controller", NULL, 0);
+ qemu_fdt_setprop_cell(mms->fdt, nodename, "#interrupt-cells", 0x2);
+ qemu_fdt_setprop_cell(mms->fdt, nodename, "#address-cells", 0x2);
+ qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg",
+ 2, base, 2, 0x1000);
+
+ ph = qemu_fdt_alloc_phandle(mms->fdt);
+ qemu_fdt_setprop_cell(mms->fdt, nodename, "phandle", ph);
+ qemu_fdt_setprop_cell(mms->fdt, nodename, "linux,phandle", ph);
+ mms->ioapic_phandle[index] = ph;
+
+ g_free(nodename);
+}
+
+static void dt_add_isa_serial(MicrovmMachineState *mms, ISADevice *dev)
+{
+ const char compat[] = "ns16550";
+ uint32_t irq = object_property_get_int(OBJECT(dev), "irq", NULL);
+ hwaddr base = object_property_get_int(OBJECT(dev), "iobase", NULL);
+ hwaddr size = 8;
+ char *nodename;
+
+ nodename = g_strdup_printf("/serial@%" PRIx64, base);
+ qemu_fdt_add_subnode(mms->fdt, nodename);
+ qemu_fdt_setprop(mms->fdt, nodename, "compatible", compat, sizeof(compat));
+ qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", 2, base, 2, size);
+ dt_add_microvm_irq(mms, nodename, irq);
+
+ if (base == 0x3f8 /* com1 */) {
+ qemu_fdt_setprop_string(mms->fdt, "/chosen", "stdout-path", nodename);
+ }
+
+ g_free(nodename);
+}
+
+static void dt_add_isa_rtc(MicrovmMachineState *mms, ISADevice *dev)
+{
+ const char compat[] = "motorola,mc146818";
+ uint32_t irq = RTC_ISA_IRQ;
+ hwaddr base = RTC_ISA_BASE;
+ hwaddr size = 8;
+ char *nodename;
+
+ nodename = g_strdup_printf("/rtc@%" PRIx64, base);
+ qemu_fdt_add_subnode(mms->fdt, nodename);
+ qemu_fdt_setprop(mms->fdt, nodename, "compatible", compat, sizeof(compat));
+ qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", 2, base, 2, size);
+ dt_add_microvm_irq(mms, nodename, irq);
+ g_free(nodename);
+}
+
+static void dt_setup_isa_bus(MicrovmMachineState *mms, DeviceState *bridge)
+{
+ BusState *bus = qdev_get_child_bus(bridge, "isa.0");
+ BusChild *kid;
+ Object *obj;
+
+ QTAILQ_FOREACH(kid, &bus->children, sibling) {
+ DeviceState *dev = kid->child;
+
+ /* serial */
+ obj = object_dynamic_cast(OBJECT(dev), TYPE_ISA_SERIAL);
+ if (obj) {
+ dt_add_isa_serial(mms, ISA_DEVICE(obj));
+ continue;
+ }
+
+ /* rtc */
+ obj = object_dynamic_cast(OBJECT(dev), TYPE_MC146818_RTC);
+ if (obj) {
+ dt_add_isa_rtc(mms, ISA_DEVICE(obj));
+ continue;
+ }
+
+ if (debug) {
+ fprintf(stderr, "%s: unhandled: %s\n", __func__,
+ object_get_typename(OBJECT(dev)));
+ }
+ }
+}
+
+static void dt_setup_sys_bus(MicrovmMachineState *mms)
+{
+ BusState *bus;
+ BusChild *kid;
+ Object *obj;
+
+ /* sysbus devices */
+ bus = sysbus_get_default();
+ QTAILQ_FOREACH(kid, &bus->children, sibling) {
+ DeviceState *dev = kid->child;
+
+ /* ioapic */
+ obj = object_dynamic_cast(OBJECT(dev), TYPE_IOAPIC);
+ if (obj) {
+ dt_add_ioapic(mms, SYS_BUS_DEVICE(obj));
+ continue;
+ }
+ }
+
+ QTAILQ_FOREACH(kid, &bus->children, sibling) {
+ DeviceState *dev = kid->child;
+
+ /* virtio */
+ obj = object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MMIO);
+ if (obj) {
+ dt_add_virtio(mms, VIRTIO_MMIO(obj));
+ continue;
+ }
+
+ /* xhci */
+ obj = object_dynamic_cast(OBJECT(dev), TYPE_XHCI_SYSBUS);
+ if (obj) {
+ dt_add_xhci(mms);
+ continue;
+ }
+
+ /* pcie */
+ obj = object_dynamic_cast(OBJECT(dev), TYPE_GPEX_HOST);
+ if (obj) {
+ dt_add_pcie(mms);
+ continue;
+ }
+
+ /* isa */
+ obj = object_dynamic_cast(OBJECT(dev), "isabus-bridge");
+ if (obj) {
+ dt_setup_isa_bus(mms, DEVICE(obj));
+ continue;
+ }
+
+ if (debug) {
+ obj = object_dynamic_cast(OBJECT(dev), TYPE_IOAPIC);
+ if (obj) {
+ /* ioapic already added in first pass */
+ continue;
+ }
+ fprintf(stderr, "%s: unhandled: %s\n", __func__,
+ object_get_typename(OBJECT(dev)));
+ }
+ }
+}
+
+void dt_setup_microvm(MicrovmMachineState *mms)
+{
+ X86MachineState *x86ms = X86_MACHINE(mms);
+ int size = 0;
+
+ mms->fdt = create_device_tree(&size);
+
+ /* root node */
+ qemu_fdt_setprop_string(mms->fdt, "/", "compatible", "linux,microvm");
+ qemu_fdt_setprop_cell(mms->fdt, "/", "#address-cells", 0x2);
+ qemu_fdt_setprop_cell(mms->fdt, "/", "#size-cells", 0x2);
+
+ qemu_fdt_add_subnode(mms->fdt, "/chosen");
+ dt_setup_sys_bus(mms);
+
+ /* add to fw_cfg */
+ if (debug) {
+ fprintf(stderr, "%s: add etc/fdt to fw_cfg\n", __func__);
+ }
+ fw_cfg_add_file(x86ms->fw_cfg, "etc/fdt", mms->fdt, size);
+
+ if (debug) {
+ fprintf(stderr, "%s: writing microvm.fdt\n", __func__);
+ if (!g_file_set_contents("microvm.fdt", mms->fdt, size, NULL)) {
+ fprintf(stderr, "%s: writing microvm.fdt failed\n", __func__);
+ return;
+ }
+ int ret = system("dtc -I dtb -O dts microvm.fdt");
+ if (ret != 0) {
+ fprintf(stderr, "%s: oops, dtc not installed?\n", __func__);
+ }
+ }
+}
diff --git a/hw/i386/microvm-dt.h b/hw/i386/microvm-dt.h
new file mode 100644
index 000000000..77c79cbdd
--- /dev/null
+++ b/hw/i386/microvm-dt.h
@@ -0,0 +1,8 @@
+#ifndef HW_I386_MICROVM_DT_H
+#define HW_I386_MICROVM_DT_H
+
+#include "hw/i386/microvm.h"
+
+void dt_setup_microvm(MicrovmMachineState *mms);
+
+#endif
diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
new file mode 100644
index 000000000..4b3b1dd26
--- /dev/null
+++ b/hw/i386/microvm.c
@@ -0,0 +1,779 @@
+/*
+ * Copyright (c) 2018 Intel Corporation
+ * Copyright (c) 2019 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qemu/cutils.h"
+#include "qemu/units.h"
+#include "qapi/error.h"
+#include "qapi/visitor.h"
+#include "qapi/qapi-visit-common.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/cpus.h"
+#include "sysemu/numa.h"
+#include "sysemu/reset.h"
+#include "sysemu/runstate.h"
+#include "acpi-microvm.h"
+#include "microvm-dt.h"
+
+#include "hw/loader.h"
+#include "hw/irq.h"
+#include "hw/kvm/clock.h"
+#include "hw/i386/microvm.h"
+#include "hw/i386/x86.h"
+#include "target/i386/cpu.h"
+#include "hw/intc/i8259.h"
+#include "hw/timer/i8254.h"
+#include "hw/rtc/mc146818rtc.h"
+#include "hw/char/serial.h"
+#include "hw/display/ramfb.h"
+#include "hw/i386/topology.h"
+#include "hw/i386/e820_memory_layout.h"
+#include "hw/i386/fw_cfg.h"
+#include "hw/virtio/virtio-mmio.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/generic_event_device.h"
+#include "hw/pci-host/gpex.h"
+#include "hw/usb/xhci.h"
+
+#include "elf.h"
+#include "kvm/kvm_i386.h"
+#include "hw/xen/start_info.h"
+
+#define MICROVM_QBOOT_FILENAME "qboot.rom"
+#define MICROVM_BIOS_FILENAME "bios-microvm.bin"
+
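+/*
+ * Mirror the RAM size into the classic CMOS registers so guest firmware can
+ * discover it the traditional way: 0x15/0x16 hold base memory in KiB
+ * (capped at 640), 0x17/0x18 and 0x30/0x31 hold extended memory above 1 MiB
+ * in KiB, 0x34/0x35 memory between 16 MiB and 4 GiB in 64 KiB units, and
+ * 0x5b-0x5d memory above 4 GiB in 64 KiB units. For example, with 4 GiB of
+ * guest RAM and the 3 GiB lowmem split used below, 0x34/0x35 encode
+ * (3 GiB - 16 MiB) / 64 KiB = 48896 and 0x5b-0x5d encode
+ * 1 GiB / 64 KiB = 16384.
+ */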
+static void microvm_set_rtc(MicrovmMachineState *mms, ISADevice *s)
+{
+ X86MachineState *x86ms = X86_MACHINE(mms);
+ int val;
+
+ val = MIN(x86ms->below_4g_mem_size / KiB, 640);
+ rtc_set_memory(s, 0x15, val);
+ rtc_set_memory(s, 0x16, val >> 8);
+ /* extended memory (next 64MiB) */
+ if (x86ms->below_4g_mem_size > 1 * MiB) {
+ val = (x86ms->below_4g_mem_size - 1 * MiB) / KiB;
+ } else {
+ val = 0;
+ }
+ if (val > 65535) {
+ val = 65535;
+ }
+ rtc_set_memory(s, 0x17, val);
+ rtc_set_memory(s, 0x18, val >> 8);
+ rtc_set_memory(s, 0x30, val);
+ rtc_set_memory(s, 0x31, val >> 8);
+ /* memory between 16MiB and 4GiB */
+ if (x86ms->below_4g_mem_size > 16 * MiB) {
+ val = (x86ms->below_4g_mem_size - 16 * MiB) / (64 * KiB);
+ } else {
+ val = 0;
+ }
+ if (val > 65535) {
+ val = 65535;
+ }
+ rtc_set_memory(s, 0x34, val);
+ rtc_set_memory(s, 0x35, val >> 8);
+ /* memory above 4GiB */
+ val = x86ms->above_4g_mem_size / 65536;
+ rtc_set_memory(s, 0x5b, val);
+ rtc_set_memory(s, 0x5c, val >> 8);
+ rtc_set_memory(s, 0x5d, val >> 16);
+}
+
+static void create_gpex(MicrovmMachineState *mms)
+{
+ X86MachineState *x86ms = X86_MACHINE(mms);
+ MemoryRegion *mmio32_alias;
+ MemoryRegion *mmio64_alias;
+ MemoryRegion *mmio_reg;
+ MemoryRegion *ecam_alias;
+ MemoryRegion *ecam_reg;
+ DeviceState *dev;
+ int i;
+
+ dev = qdev_new(TYPE_GPEX_HOST);
+ sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
+
+ /* Map only the first mms->gpex.ecam.size bytes of the ECAM space */
+ ecam_alias = g_new0(MemoryRegion, 1);
+ ecam_reg = sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 0);
+ memory_region_init_alias(ecam_alias, OBJECT(dev), "pcie-ecam",
+ ecam_reg, 0, mms->gpex.ecam.size);
+ memory_region_add_subregion(get_system_memory(),
+ mms->gpex.ecam.base, ecam_alias);
+
+ /* Map the MMIO window into system address space so as to expose
+ * the section of PCI MMIO space which starts at the same base address
+ * (ie 1:1 mapping for that part of PCI MMIO space visible through
+ * the window).
+ */
+ mmio_reg = sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 1);
+ if (mms->gpex.mmio32.size) {
+ mmio32_alias = g_new0(MemoryRegion, 1);
+ memory_region_init_alias(mmio32_alias, OBJECT(dev), "pcie-mmio32", mmio_reg,
+ mms->gpex.mmio32.base, mms->gpex.mmio32.size);
+ memory_region_add_subregion(get_system_memory(),
+ mms->gpex.mmio32.base, mmio32_alias);
+ }
+ if (mms->gpex.mmio64.size) {
+ mmio64_alias = g_new0(MemoryRegion, 1);
+ memory_region_init_alias(mmio64_alias, OBJECT(dev), "pcie-mmio64", mmio_reg,
+ mms->gpex.mmio64.base, mms->gpex.mmio64.size);
+ memory_region_add_subregion(get_system_memory(),
+ mms->gpex.mmio64.base, mmio64_alias);
+ }
+
+ for (i = 0; i < GPEX_NUM_IRQS; i++) {
+ sysbus_connect_irq(SYS_BUS_DEVICE(dev), i,
+ x86ms->gsi[mms->gpex.irq + i]);
+ }
+}
+
+static int microvm_ioapics(MicrovmMachineState *mms)
+{
+ if (!x86_machine_is_acpi_enabled(X86_MACHINE(mms))) {
+ return 1;
+ }
+ if (mms->ioapic2 == ON_OFF_AUTO_OFF) {
+ return 1;
+ }
+ return 2;
+}
+
+static void microvm_devices_init(MicrovmMachineState *mms)
+{
+ const char *default_firmware;
+ X86MachineState *x86ms = X86_MACHINE(mms);
+ ISABus *isa_bus;
+ ISADevice *rtc_state;
+ GSIState *gsi_state;
+ int ioapics;
+ int i;
+
+ /* Core components */
+ ioapics = microvm_ioapics(mms);
+ gsi_state = g_malloc0(sizeof(*gsi_state));
+ x86ms->gsi = qemu_allocate_irqs(gsi_handler, gsi_state,
+ IOAPIC_NUM_PINS * ioapics);
+
+ isa_bus = isa_bus_new(NULL, get_system_memory(), get_system_io(),
+ &error_abort);
+ isa_bus_irqs(isa_bus, x86ms->gsi);
+
+ ioapic_init_gsi(gsi_state, "machine");
+ if (ioapics > 1) {
+ x86ms->ioapic2 = ioapic_init_secondary(gsi_state);
+ }
+
+ kvmclock_create(true);
+
+ mms->virtio_irq_base = 5;
+ mms->virtio_num_transports = 8;
+ if (x86ms->ioapic2) {
+ mms->pcie_irq_base = 16; /* 16 -> 19 */
+ /* use second ioapic (24 -> 47) for virtio-mmio irq lines */
+ mms->virtio_irq_base = IO_APIC_SECONDARY_IRQBASE;
+ mms->virtio_num_transports = IOAPIC_NUM_PINS;
+ } else if (x86_machine_is_acpi_enabled(x86ms)) {
+ mms->pcie_irq_base = 12; /* 12 -> 15 */
+ mms->virtio_irq_base = 16; /* 16 -> 23 */
+ }
+
+ for (i = 0; i < mms->virtio_num_transports; i++) {
+ sysbus_create_simple("virtio-mmio",
+ VIRTIO_MMIO_BASE + i * 512,
+ x86ms->gsi[mms->virtio_irq_base + i]);
+ }
+
+ /* Optional and legacy devices */
+ if (x86_machine_is_acpi_enabled(x86ms)) {
+ DeviceState *dev = qdev_new(TYPE_ACPI_GED_X86);
+ qdev_prop_set_uint32(dev, "ged-event", ACPI_GED_PWR_DOWN_EVT);
+ sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, GED_MMIO_BASE);
+ /* sysbus_mmio_map(SYS_BUS_DEVICE(dev), 1, GED_MMIO_BASE_MEMHP); */
+ sysbus_mmio_map(SYS_BUS_DEVICE(dev), 2, GED_MMIO_BASE_REGS);
+ sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0,
+ x86ms->gsi[GED_MMIO_IRQ]);
+ sysbus_realize(SYS_BUS_DEVICE(dev), &error_fatal);
+ x86ms->acpi_dev = HOTPLUG_HANDLER(dev);
+ }
+
+ if (x86_machine_is_acpi_enabled(x86ms) && machine_usb(MACHINE(mms))) {
+ DeviceState *dev = qdev_new(TYPE_XHCI_SYSBUS);
+ qdev_prop_set_uint32(dev, "intrs", 1);
+ qdev_prop_set_uint32(dev, "slots", XHCI_MAXSLOTS);
+ qdev_prop_set_uint32(dev, "p2", 8);
+ qdev_prop_set_uint32(dev, "p3", 8);
+ sysbus_realize(SYS_BUS_DEVICE(dev), &error_fatal);
+ sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, MICROVM_XHCI_BASE);
+ sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0,
+ x86ms->gsi[MICROVM_XHCI_IRQ]);
+ }
+
+ if (x86_machine_is_acpi_enabled(x86ms) && mms->pcie == ON_OFF_AUTO_ON) {
+ /* use topmost 25% of the address space available */
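+ /*
+ * E.g. with phys-bits = 40 the guest address space is 1 TiB, so the
+ * 64-bit MMIO window becomes the top 256 GiB, starting at 768 GiB.
+ */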
+ hwaddr phys_size = (hwaddr)1 << X86_CPU(first_cpu)->phys_bits;
+ if (phys_size > 0x1000000ll) {
+ mms->gpex.mmio64.size = phys_size / 4;
+ mms->gpex.mmio64.base = phys_size - mms->gpex.mmio64.size;
+ }
+ mms->gpex.mmio32.base = PCIE_MMIO_BASE;
+ mms->gpex.mmio32.size = PCIE_MMIO_SIZE;
+ mms->gpex.ecam.base = PCIE_ECAM_BASE;
+ mms->gpex.ecam.size = PCIE_ECAM_SIZE;
+ mms->gpex.irq = mms->pcie_irq_base;
+ create_gpex(mms);
+ x86ms->pci_irq_mask = ((1 << (mms->pcie_irq_base + 0)) |
+ (1 << (mms->pcie_irq_base + 1)) |
+ (1 << (mms->pcie_irq_base + 2)) |
+ (1 << (mms->pcie_irq_base + 3)));
+ } else {
+ x86ms->pci_irq_mask = 0;
+ }
+
+ if (mms->pic == ON_OFF_AUTO_ON || mms->pic == ON_OFF_AUTO_AUTO) {
+ qemu_irq *i8259;
+
+ i8259 = i8259_init(isa_bus, x86_allocate_cpu_irq());
+ for (i = 0; i < ISA_NUM_IRQS; i++) {
+ gsi_state->i8259_irq[i] = i8259[i];
+ }
+ g_free(i8259);
+ }
+
+ if (mms->pit == ON_OFF_AUTO_ON || mms->pit == ON_OFF_AUTO_AUTO) {
+ if (kvm_pit_in_kernel()) {
+ kvm_pit_init(isa_bus, 0x40);
+ } else {
+ i8254_pit_init(isa_bus, 0x40, 0, NULL);
+ }
+ }
+
+ if (mms->rtc == ON_OFF_AUTO_ON ||
+ (mms->rtc == ON_OFF_AUTO_AUTO && !kvm_enabled())) {
+ rtc_state = mc146818_rtc_init(isa_bus, 2000, NULL);
+ microvm_set_rtc(mms, rtc_state);
+ }
+
+ if (mms->isa_serial) {
+ serial_hds_isa_init(isa_bus, 0, 1);
+ }
+
+ default_firmware = x86_machine_is_acpi_enabled(x86ms)
+ ? MICROVM_BIOS_FILENAME
+ : MICROVM_QBOOT_FILENAME;
+ x86_bios_rom_init(MACHINE(mms), default_firmware, get_system_memory(), true);
+}
+
+static void microvm_memory_init(MicrovmMachineState *mms)
+{
+ MachineState *machine = MACHINE(mms);
+ X86MachineState *x86ms = X86_MACHINE(mms);
+ MemoryRegion *ram_below_4g, *ram_above_4g;
+ MemoryRegion *system_memory = get_system_memory();
+ FWCfgState *fw_cfg;
+ ram_addr_t lowmem = 0xc0000000; /* 3G */
+ int i;
+
+ if (machine->ram_size > lowmem) {
+ x86ms->above_4g_mem_size = machine->ram_size - lowmem;
+ x86ms->below_4g_mem_size = lowmem;
+ } else {
+ x86ms->above_4g_mem_size = 0;
+ x86ms->below_4g_mem_size = machine->ram_size;
+ }
+
+ ram_below_4g = g_malloc(sizeof(*ram_below_4g));
+ memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram,
+ 0, x86ms->below_4g_mem_size);
+ memory_region_add_subregion(system_memory, 0, ram_below_4g);
+
+ e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
+
+ if (x86ms->above_4g_mem_size > 0) {
+ ram_above_4g = g_malloc(sizeof(*ram_above_4g));
+ memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
+ machine->ram,
+ x86ms->below_4g_mem_size,
+ x86ms->above_4g_mem_size);
+ memory_region_add_subregion(system_memory, 0x100000000ULL,
+ ram_above_4g);
+ e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
+ }
+
+ fw_cfg = fw_cfg_init_io_dma(FW_CFG_IO_BASE, FW_CFG_IO_BASE + 4,
+ &address_space_memory);
+
+ fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, machine->smp.cpus);
+ fw_cfg_add_i16(fw_cfg, FW_CFG_MAX_CPUS, machine->smp.max_cpus);
+ fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, (uint64_t)machine->ram_size);
+ fw_cfg_add_i32(fw_cfg, FW_CFG_IRQ0_OVERRIDE, 1);
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_E820_TABLE,
+ &e820_reserve, sizeof(e820_reserve));
+ fw_cfg_add_file(fw_cfg, "etc/e820", e820_table,
+ sizeof(struct e820_entry) * e820_get_num_entries());
+
+ rom_set_fw(fw_cfg);
+
+ if (machine->kernel_filename != NULL) {
+ x86_load_linux(x86ms, fw_cfg, 0, true);
+ }
+
+ if (mms->option_roms) {
+ for (i = 0; i < nb_option_roms; i++) {
+ rom_add_option(option_rom[i].name, option_rom[i].bootindex);
+ }
+ }
+
+ x86ms->fw_cfg = fw_cfg;
+ x86ms->ioapic_as = &address_space_memory;
+}
+
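+/*
+ * Builds a " virtio_mmio.device=512@0x<base>:<irq>" kernel command line
+ * clause for one virtio-mmio transport; the transport index is taken from
+ * the numeric suffix after the last '.' in the transport's bus name.
+ */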
+static gchar *microvm_get_mmio_cmdline(gchar *name, uint32_t virtio_irq_base)
+{
+ gchar *cmdline;
+ gchar *separator;
+ long int index;
+ int ret;
+
+ separator = g_strrstr(name, ".");
+ if (!separator) {
+ return NULL;
+ }
+
+ if (qemu_strtol(separator + 1, NULL, 10, &index) != 0) {
+ return NULL;
+ }
+
+ cmdline = g_malloc0(VIRTIO_CMDLINE_MAXLEN);
+ ret = g_snprintf(cmdline, VIRTIO_CMDLINE_MAXLEN,
+ " virtio_mmio.device=512@0x%lx:%ld",
+ VIRTIO_MMIO_BASE + index * 512,
+ virtio_irq_base + index);
+ if (ret < 0 || ret >= VIRTIO_CMDLINE_MAXLEN) {
+ g_free(cmdline);
+ return NULL;
+ }
+
+ return cmdline;
+}
+
+static void microvm_fix_kernel_cmdline(MachineState *machine)
+{
+ X86MachineState *x86ms = X86_MACHINE(machine);
+ MicrovmMachineState *mms = MICROVM_MACHINE(machine);
+ BusState *bus;
+ BusChild *kid;
+ char *cmdline;
+
+ /*
+ * Find MMIO transports with attached devices, and add them to the kernel
+ * command line.
+ *
+ * Yes, this is a hack, but one that heavily improves the UX without
+ * introducing any significant issues.
+ */
+ cmdline = g_strdup(machine->kernel_cmdline);
+ bus = sysbus_get_default();
+ QTAILQ_FOREACH(kid, &bus->children, sibling) {
+ DeviceState *dev = kid->child;
+ ObjectClass *class = object_get_class(OBJECT(dev));
+
+ if (class == object_class_by_name(TYPE_VIRTIO_MMIO)) {
+ VirtIOMMIOProxy *mmio = VIRTIO_MMIO(OBJECT(dev));
+ VirtioBusState *mmio_virtio_bus = &mmio->bus;
+ BusState *mmio_bus = &mmio_virtio_bus->parent_obj;
+
+ if (!QTAILQ_EMPTY(&mmio_bus->children)) {
+ gchar *mmio_cmdline = microvm_get_mmio_cmdline
+ (mmio_bus->name, mms->virtio_irq_base);
+ if (mmio_cmdline) {
+ char *newcmd = g_strjoin(NULL, cmdline, mmio_cmdline, NULL);
+ g_free(mmio_cmdline);
+ g_free(cmdline);
+ cmdline = newcmd;
+ }
+ }
+ }
+ }
+
+ fw_cfg_modify_i32(x86ms->fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(cmdline) + 1);
+ fw_cfg_modify_string(x86ms->fw_cfg, FW_CFG_CMDLINE_DATA, cmdline);
+
+ g_free(cmdline);
+}
+
+static void microvm_device_pre_plug_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ X86CPU *cpu = X86_CPU(dev);
+
+ cpu->host_phys_bits = true; /* need reliable phys-bits */
+ x86_cpu_pre_plug(hotplug_dev, dev, errp);
+}
+
+static void microvm_device_plug_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ x86_cpu_plug(hotplug_dev, dev, errp);
+}
+
+static void microvm_device_unplug_request_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ error_setg(errp, "unplug not supported by microvm");
+}
+
+static void microvm_device_unplug_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ error_setg(errp, "unplug not supported by microvm");
+}
+
+static HotplugHandler *microvm_get_hotplug_handler(MachineState *machine,
+ DeviceState *dev)
+{
+ if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
+ return HOTPLUG_HANDLER(machine);
+ }
+ return NULL;
+}
+
+static void microvm_machine_state_init(MachineState *machine)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(machine);
+ X86MachineState *x86ms = X86_MACHINE(machine);
+
+ microvm_memory_init(mms);
+
+ x86_cpus_init(x86ms, CPU_VERSION_LATEST);
+
+ microvm_devices_init(mms);
+}
+
+static void microvm_machine_reset(MachineState *machine)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(machine);
+ CPUState *cs;
+ X86CPU *cpu;
+
+ if (!x86_machine_is_acpi_enabled(X86_MACHINE(machine)) &&
+ machine->kernel_filename != NULL &&
+ mms->auto_kernel_cmdline && !mms->kernel_cmdline_fixed) {
+ microvm_fix_kernel_cmdline(machine);
+ mms->kernel_cmdline_fixed = true;
+ }
+
+ qemu_devices_reset();
+
+ CPU_FOREACH(cs) {
+ cpu = X86_CPU(cs);
+
+ if (cpu->apic_state) {
+ device_legacy_reset(cpu->apic_state);
+ }
+ }
+}
+
+static void microvm_machine_get_pic(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+ OnOffAuto pic = mms->pic;
+
+ visit_type_OnOffAuto(v, name, &pic, errp);
+}
+
+static void microvm_machine_set_pic(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+
+ visit_type_OnOffAuto(v, name, &mms->pic, errp);
+}
+
+static void microvm_machine_get_pit(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+ OnOffAuto pit = mms->pit;
+
+ visit_type_OnOffAuto(v, name, &pit, errp);
+}
+
+static void microvm_machine_set_pit(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+
+ visit_type_OnOffAuto(v, name, &mms->pit, errp);
+}
+
+static void microvm_machine_get_rtc(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+ OnOffAuto rtc = mms->rtc;
+
+ visit_type_OnOffAuto(v, name, &rtc, errp);
+}
+
+static void microvm_machine_set_rtc(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+
+ visit_type_OnOffAuto(v, name, &mms->rtc, errp);
+}
+
+static void microvm_machine_get_pcie(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+ OnOffAuto pcie = mms->pcie;
+
+ visit_type_OnOffAuto(v, name, &pcie, errp);
+}
+
+static void microvm_machine_set_pcie(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+
+ visit_type_OnOffAuto(v, name, &mms->pcie, errp);
+}
+
+static void microvm_machine_get_ioapic2(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+ OnOffAuto ioapic2 = mms->ioapic2;
+
+ visit_type_OnOffAuto(v, name, &ioapic2, errp);
+}
+
+static void microvm_machine_set_ioapic2(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+
+ visit_type_OnOffAuto(v, name, &mms->ioapic2, errp);
+}
+
+static bool microvm_machine_get_isa_serial(Object *obj, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+
+ return mms->isa_serial;
+}
+
+static void microvm_machine_set_isa_serial(Object *obj, bool value,
+ Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+
+ mms->isa_serial = value;
+}
+
+static bool microvm_machine_get_option_roms(Object *obj, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+
+ return mms->option_roms;
+}
+
+static void microvm_machine_set_option_roms(Object *obj, bool value,
+ Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+
+ mms->option_roms = value;
+}
+
+static bool microvm_machine_get_auto_kernel_cmdline(Object *obj, Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+
+ return mms->auto_kernel_cmdline;
+}
+
+static void microvm_machine_set_auto_kernel_cmdline(Object *obj, bool value,
+ Error **errp)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+
+ mms->auto_kernel_cmdline = value;
+}
+
+static void microvm_machine_done(Notifier *notifier, void *data)
+{
+ MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState,
+ machine_done);
+
+ acpi_setup_microvm(mms);
+ dt_setup_microvm(mms);
+}
+
+static void microvm_powerdown_req(Notifier *notifier, void *data)
+{
+ MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState,
+ powerdown_req);
+ X86MachineState *x86ms = X86_MACHINE(mms);
+
+ if (x86ms->acpi_dev) {
+ Object *obj = OBJECT(x86ms->acpi_dev);
+ AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj);
+ adevc->send_event(ACPI_DEVICE_IF(x86ms->acpi_dev),
+ ACPI_POWER_DOWN_STATUS);
+ }
+}
+
+static void microvm_machine_initfn(Object *obj)
+{
+ MicrovmMachineState *mms = MICROVM_MACHINE(obj);
+
+ /* Configuration */
+ mms->pic = ON_OFF_AUTO_AUTO;
+ mms->pit = ON_OFF_AUTO_AUTO;
+ mms->rtc = ON_OFF_AUTO_AUTO;
+ mms->pcie = ON_OFF_AUTO_AUTO;
+ mms->ioapic2 = ON_OFF_AUTO_AUTO;
+ mms->isa_serial = true;
+ mms->option_roms = true;
+ mms->auto_kernel_cmdline = true;
+
+ /* State */
+ mms->kernel_cmdline_fixed = false;
+
+ mms->machine_done.notify = microvm_machine_done;
+ qemu_add_machine_init_done_notifier(&mms->machine_done);
+ mms->powerdown_req.notify = microvm_powerdown_req;
+ qemu_register_powerdown_notifier(&mms->powerdown_req);
+}
+
+static void microvm_class_init(ObjectClass *oc, void *data)
+{
+ X86MachineClass *x86mc = X86_MACHINE_CLASS(oc);
+ MachineClass *mc = MACHINE_CLASS(oc);
+ HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
+
+ mc->init = microvm_machine_state_init;
+
+ mc->family = "microvm_i386";
+ mc->desc = "microvm (i386)";
+ mc->units_per_default_bus = 1;
+ mc->no_floppy = 1;
+ mc->max_cpus = 288;
+ mc->has_hotpluggable_cpus = false;
+ mc->auto_enable_numa_with_memhp = false;
+ mc->auto_enable_numa_with_memdev = false;
+ mc->default_cpu_type = TARGET_DEFAULT_CPU_TYPE;
+ mc->nvdimm_supported = false;
+ mc->default_ram_id = "microvm.ram";
+
+ /* Avoid relying too much on kernel components */
+ mc->default_kernel_irqchip_split = true;
+
+ /* Machine class handlers */
+ mc->reset = microvm_machine_reset;
+
+ /* hotplug (for cpu coldplug) */
+ mc->get_hotplug_handler = microvm_get_hotplug_handler;
+ hc->pre_plug = microvm_device_pre_plug_cb;
+ hc->plug = microvm_device_plug_cb;
+ hc->unplug_request = microvm_device_unplug_request_cb;
+ hc->unplug = microvm_device_unplug_cb;
+
+ x86mc->fwcfg_dma_enabled = true;
+
+ object_class_property_add(oc, MICROVM_MACHINE_PIC, "OnOffAuto",
+ microvm_machine_get_pic,
+ microvm_machine_set_pic,
+ NULL, NULL);
+ object_class_property_set_description(oc, MICROVM_MACHINE_PIC,
+ "Enable i8259 PIC");
+
+ object_class_property_add(oc, MICROVM_MACHINE_PIT, "OnOffAuto",
+ microvm_machine_get_pit,
+ microvm_machine_set_pit,
+ NULL, NULL);
+ object_class_property_set_description(oc, MICROVM_MACHINE_PIT,
+ "Enable i8254 PIT");
+
+ object_class_property_add(oc, MICROVM_MACHINE_RTC, "OnOffAuto",
+ microvm_machine_get_rtc,
+ microvm_machine_set_rtc,
+ NULL, NULL);
+ object_class_property_set_description(oc, MICROVM_MACHINE_RTC,
+ "Enable MC146818 RTC");
+
+ object_class_property_add(oc, MICROVM_MACHINE_PCIE, "OnOffAuto",
+ microvm_machine_get_pcie,
+ microvm_machine_set_pcie,
+ NULL, NULL);
+ object_class_property_set_description(oc, MICROVM_MACHINE_PCIE,
+ "Enable PCIe");
+
+ object_class_property_add(oc, MICROVM_MACHINE_IOAPIC2, "OnOffAuto",
+ microvm_machine_get_ioapic2,
+ microvm_machine_set_ioapic2,
+ NULL, NULL);
+ object_class_property_set_description(oc, MICROVM_MACHINE_IOAPIC2,
+ "Enable second IO-APIC");
+
+ object_class_property_add_bool(oc, MICROVM_MACHINE_ISA_SERIAL,
+ microvm_machine_get_isa_serial,
+ microvm_machine_set_isa_serial);
+ object_class_property_set_description(oc, MICROVM_MACHINE_ISA_SERIAL,
+ "Set off to disable the instantiation an ISA serial port");
+
+ object_class_property_add_bool(oc, MICROVM_MACHINE_OPTION_ROMS,
+ microvm_machine_get_option_roms,
+ microvm_machine_set_option_roms);
+ object_class_property_set_description(oc, MICROVM_MACHINE_OPTION_ROMS,
+ "Set off to disable loading option ROMs");
+
+ object_class_property_add_bool(oc, MICROVM_MACHINE_AUTO_KERNEL_CMDLINE,
+ microvm_machine_get_auto_kernel_cmdline,
+ microvm_machine_set_auto_kernel_cmdline);
+ object_class_property_set_description(oc,
+ MICROVM_MACHINE_AUTO_KERNEL_CMDLINE,
+ "Set off to disable adding virtio-mmio devices to the kernel cmdline");
+
+ machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE);
+}
+
+static const TypeInfo microvm_machine_info = {
+ .name = TYPE_MICROVM_MACHINE,
+ .parent = TYPE_X86_MACHINE,
+ .instance_size = sizeof(MicrovmMachineState),
+ .instance_init = microvm_machine_initfn,
+ .class_size = sizeof(MicrovmMachineClass),
+ .class_init = microvm_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_HOTPLUG_HANDLER },
+ { }
+ },
+};
+
+static void microvm_machine_init(void)
+{
+ type_register_static(&microvm_machine_info);
+}
+type_init(microvm_machine_init);
diff --git a/hw/i386/multiboot.c b/hw/i386/multiboot.c
new file mode 100644
index 000000000..0a10089f1
--- /dev/null
+++ b/hw/i386/multiboot.c
@@ -0,0 +1,415 @@
+/*
+ * QEMU PC System Emulator
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/option.h"
+#include "cpu.h"
+#include "hw/nvram/fw_cfg.h"
+#include "multiboot.h"
+#include "hw/loader.h"
+#include "elf.h"
+#include "sysemu/sysemu.h"
+#include "qemu/error-report.h"
+
+/* Show multiboot debug output */
+//#define DEBUG_MULTIBOOT
+
+#ifdef DEBUG_MULTIBOOT
+#define mb_debug(a...) error_report(a)
+#else
+#define mb_debug(a...)
+#endif
+
+#define MULTIBOOT_STRUCT_ADDR 0x9000
+
+#if MULTIBOOT_STRUCT_ADDR > 0xf0000
+#error multiboot struct needs to fit in 16 bit real mode
+#endif
+
+enum {
+ /* Multiboot info */
+ MBI_FLAGS = 0,
+ MBI_MEM_LOWER = 4,
+ MBI_MEM_UPPER = 8,
+ MBI_BOOT_DEVICE = 12,
+ MBI_CMDLINE = 16,
+ MBI_MODS_COUNT = 20,
+ MBI_MODS_ADDR = 24,
+ MBI_MMAP_ADDR = 48,
+ MBI_BOOTLOADER = 64,
+
+ MBI_SIZE = 88,
+
+ /* Multiboot modules */
+ MB_MOD_START = 0,
+ MB_MOD_END = 4,
+ MB_MOD_CMDLINE = 8,
+
+ MB_MOD_SIZE = 16,
+
+ /* Region offsets */
+ ADDR_E820_MAP = MULTIBOOT_STRUCT_ADDR + 0,
+ ADDR_MBI = ADDR_E820_MAP + 0x500,
+
+ /* Multiboot flags */
+ MULTIBOOT_FLAGS_MEMORY = 1 << 0,
+ MULTIBOOT_FLAGS_BOOT_DEVICE = 1 << 1,
+ MULTIBOOT_FLAGS_CMDLINE = 1 << 2,
+ MULTIBOOT_FLAGS_MODULES = 1 << 3,
+ MULTIBOOT_FLAGS_MMAP = 1 << 6,
+ MULTIBOOT_FLAGS_BOOTLOADER = 1 << 9,
+};
+
+typedef struct {
+ /* buffer holding kernel, cmdlines and mb_infos */
+ void *mb_buf;
+ /* address in target */
+ hwaddr mb_buf_phys;
+ /* size of mb_buf in bytes */
+ unsigned mb_buf_size;
+ /* offset of mb-info's in bytes */
+ hwaddr offset_mbinfo;
+ /* offset in buffer for cmdlines in bytes */
+ hwaddr offset_cmdlines;
+ /* offset in buffer for bootloader name in bytes */
+ hwaddr offset_bootloader;
+ /* offset of modules in bytes */
+ hwaddr offset_mods;
+ /* available slots for mb modules infos */
+ int mb_mods_avail;
+ /* currently used slots of mb modules */
+ int mb_mods_count;
+} MultibootState;
+
+const char *bootloader_name = "qemu";
+
+static uint32_t mb_add_cmdline(MultibootState *s, const char *cmdline)
+{
+ hwaddr p = s->offset_cmdlines;
+ char *b = (char *)s->mb_buf + p;
+
+ memcpy(b, cmdline, strlen(cmdline) + 1);
+ s->offset_cmdlines += strlen(b) + 1;
+ return s->mb_buf_phys + p;
+}
+
+static uint32_t mb_add_bootloader(MultibootState *s, const char *bootloader)
+{
+ hwaddr p = s->offset_bootloader;
+ char *b = (char *)s->mb_buf + p;
+
+ memcpy(b, bootloader, strlen(bootloader) + 1);
+ s->offset_bootloader += strlen(b) + 1;
+ return s->mb_buf_phys + p;
+}
+
+static void mb_add_mod(MultibootState *s,
+ hwaddr start, hwaddr end,
+ hwaddr cmdline_phys)
+{
+ char *p;
+ assert(s->mb_mods_count < s->mb_mods_avail);
+
+ p = (char *)s->mb_buf + s->offset_mbinfo + MB_MOD_SIZE * s->mb_mods_count;
+
+ stl_p(p + MB_MOD_START, start);
+ stl_p(p + MB_MOD_END, end);
+ stl_p(p + MB_MOD_CMDLINE, cmdline_phys);
+
+ mb_debug("mod%02d: "TARGET_FMT_plx" - "TARGET_FMT_plx,
+ s->mb_mods_count, start, end);
+
+ s->mb_mods_count++;
+}
+
+int load_multiboot(X86MachineState *x86ms,
+ FWCfgState *fw_cfg,
+ FILE *f,
+ const char *kernel_filename,
+ const char *initrd_filename,
+ const char *kernel_cmdline,
+ int kernel_file_size,
+ uint8_t *header)
+{
+ bool multiboot_dma_enabled = X86_MACHINE_GET_CLASS(x86ms)->fwcfg_dma_enabled;
+ int i, is_multiboot = 0;
+ uint32_t flags = 0;
+ uint32_t mh_entry_addr;
+ uint32_t mh_load_addr;
+ uint32_t mb_kernel_size;
+ MultibootState mbs;
+ uint8_t bootinfo[MBI_SIZE];
+ uint8_t *mb_bootinfo_data;
+ uint32_t cmdline_len;
+ GList *mods = NULL;
+
+ /* OK, let's see if it is a multiboot image.
+ The header is 12 x 32 bits long, so the last possible start offset is 8192 - 48. */
+ for (i = 0; i < (8192 - 48); i += 4) {
+ if (ldl_p(header+i) == 0x1BADB002) {
+ uint32_t checksum = ldl_p(header+i+8);
+ flags = ldl_p(header+i+4);
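+ /* Per the Multiboot spec, magic + flags + checksum must sum to
+ zero (mod 2^32) for the header to be valid. */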
+ checksum += flags;
+ checksum += (uint32_t)0x1BADB002;
+ if (!checksum) {
+ is_multiboot = 1;
+ break;
+ }
+ }
+ }
+
+ if (!is_multiboot)
+ return 0; /* no multiboot */
+
+ mb_debug("I believe we found a multiboot image!");
+ memset(bootinfo, 0, sizeof(bootinfo));
+ memset(&mbs, 0, sizeof(mbs));
+
+ if (flags & 0x00000004) { /* MULTIBOOT_HEADER_HAS_VBE */
+ error_report("multiboot knows VBE. we don't");
+ }
+ if (!(flags & 0x00010000)) { /* MULTIBOOT_HEADER_HAS_ADDR */
+ uint64_t elf_entry;
+ uint64_t elf_low, elf_high;
+ int kernel_size;
+ fclose(f);
+
+ if (((struct elf64_hdr*)header)->e_machine == EM_X86_64) {
+ error_report("Cannot load x86-64 image, give a 32bit one.");
+ exit(1);
+ }
+
+ kernel_size = load_elf(kernel_filename, NULL, NULL, NULL, &elf_entry,
+ &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE,
+ 0, 0);
+ if (kernel_size < 0) {
+ error_report("Error while loading elf kernel");
+ exit(1);
+ }
+ mh_load_addr = elf_low;
+ mb_kernel_size = elf_high - elf_low;
+ mh_entry_addr = elf_entry;
+
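+ /* load_elf() has registered the image as a ROM blob; copy it back into
+ a flat buffer so cmdlines and module info can be appended behind it. */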
+ mbs.mb_buf = g_malloc(mb_kernel_size);
+ if (rom_copy(mbs.mb_buf, mh_load_addr, mb_kernel_size) != mb_kernel_size) {
+ error_report("Error while fetching elf kernel from rom");
+ exit(1);
+ }
+
+ mb_debug("loading multiboot-elf kernel "
+ "(%#x bytes) with entry %#zx",
+ mb_kernel_size, (size_t)mh_entry_addr);
+ } else {
+ /* Valid if mh_flags sets MULTIBOOT_HEADER_HAS_ADDR. */
+ uint32_t mh_header_addr = ldl_p(header+i+12);
+ uint32_t mh_load_end_addr = ldl_p(header+i+20);
+ uint32_t mh_bss_end_addr = ldl_p(header+i+24);
+
+ mh_load_addr = ldl_p(header+i+16);
+ if (mh_header_addr < mh_load_addr) {
+ error_report("invalid load_addr address");
+ exit(1);
+ }
+ if (mh_header_addr - mh_load_addr > i) {
+ error_report("invalid header_addr address");
+ exit(1);
+ }
+
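+ /* The header sits at file offset i and at guest address mh_header_addr;
+ with a flat load at mh_load_addr, the loadable text therefore starts
+ at file offset i - (mh_header_addr - mh_load_addr). */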
+ uint32_t mb_kernel_text_offset = i - (mh_header_addr - mh_load_addr);
+ uint32_t mb_load_size = 0;
+ mh_entry_addr = ldl_p(header+i+28);
+
+ if (mh_load_end_addr) {
+ if (mh_load_end_addr < mh_load_addr) {
+ error_report("invalid load_end_addr address");
+ exit(1);
+ }
+ mb_load_size = mh_load_end_addr - mh_load_addr;
+ } else {
+ if (kernel_file_size < mb_kernel_text_offset) {
+ error_report("invalid kernel_file_size");
+ exit(1);
+ }
+ mb_load_size = kernel_file_size - mb_kernel_text_offset;
+ }
+ if (mb_load_size > UINT32_MAX - mh_load_addr) {
+ error_report("kernel does not fit in address space");
+ exit(1);
+ }
+ if (mh_bss_end_addr) {
+ if (mh_bss_end_addr < (mh_load_addr + mb_load_size)) {
+ error_report("invalid bss_end_addr address");
+ exit(1);
+ }
+ mb_kernel_size = mh_bss_end_addr - mh_load_addr;
+ } else {
+ mb_kernel_size = mb_load_size;
+ }
+
+ mb_debug("multiboot: header_addr = %#x", mh_header_addr);
+ mb_debug("multiboot: load_addr = %#x", mh_load_addr);
+ mb_debug("multiboot: load_end_addr = %#x", mh_load_end_addr);
+ mb_debug("multiboot: bss_end_addr = %#x", mh_bss_end_addr);
+ mb_debug("loading multiboot kernel (%#x bytes) at %#x",
+ mb_load_size, mh_load_addr);
+
+ mbs.mb_buf = g_malloc(mb_kernel_size);
+ fseek(f, mb_kernel_text_offset, SEEK_SET);
+ if (fread(mbs.mb_buf, 1, mb_load_size, f) != mb_load_size) {
+ error_report("fread() failed");
+ exit(1);
+ }
+ memset(mbs.mb_buf + mb_load_size, 0, mb_kernel_size - mb_load_size);
+ fclose(f);
+ }
+
+ mbs.mb_buf_phys = mh_load_addr;
+
+ mbs.mb_buf_size = TARGET_PAGE_ALIGN(mb_kernel_size);
+ mbs.offset_mbinfo = mbs.mb_buf_size;
+
+ /* Calculate space for cmdlines, bootloader name, and mb_mods */
+ cmdline_len = strlen(kernel_filename) + 1;
+ cmdline_len += strlen(kernel_cmdline) + 1;
+ if (initrd_filename) {
+ const char *r = initrd_filename;
+ cmdline_len += strlen(initrd_filename) + 1;
+ while (*r) {
+ char *value;
+ r = get_opt_value(r, &value);
+ mbs.mb_mods_avail++;
+ mods = g_list_append(mods, value);
+ if (*r) {
+ r++;
+ }
+ }
+ }
+
+ mbs.mb_buf_size += cmdline_len;
+ mbs.mb_buf_size += MB_MOD_SIZE * mbs.mb_mods_avail;
+ mbs.mb_buf_size += strlen(bootloader_name) + 1;
+
+ mbs.mb_buf_size = TARGET_PAGE_ALIGN(mbs.mb_buf_size);
+
+ /* enlarge mb_buf to hold cmdlines, bootloader, mb-info structs */
+ mbs.mb_buf = g_realloc(mbs.mb_buf, mbs.mb_buf_size);
+ mbs.offset_cmdlines = mbs.offset_mbinfo + mbs.mb_mods_avail * MB_MOD_SIZE;
+ mbs.offset_bootloader = mbs.offset_cmdlines + cmdline_len;
+
+ if (mods) {
+ GList *tmpl = mods;
+ mbs.offset_mods = mbs.mb_buf_size;
+
+ while (tmpl) {
+ char *next_space;
+ int mb_mod_length;
+ uint32_t offs = mbs.mb_buf_size;
+ char *one_file = tmpl->data;
+
+ /* if a space comes after the module filename, treat everything
+ after that as parameters */
+ hwaddr c = mb_add_cmdline(&mbs, one_file);
+ next_space = strchr(one_file, ' ');
+ if (next_space) {
+ *next_space = '\0';
+ }
+ mb_debug("multiboot loading module: %s", one_file);
+ mb_mod_length = get_image_size(one_file);
+ if (mb_mod_length < 0) {
+ error_report("Failed to open file '%s'", one_file);
+ exit(1);
+ }
+
+ mbs.mb_buf_size = TARGET_PAGE_ALIGN(mb_mod_length + mbs.mb_buf_size);
+ mbs.mb_buf = g_realloc(mbs.mb_buf, mbs.mb_buf_size);
+
+ if (load_image_size(one_file, (unsigned char *)mbs.mb_buf + offs,
+ mbs.mb_buf_size - offs) < 0) {
+ error_report("Error loading file '%s'", one_file);
+ exit(1);
+ }
+ mb_add_mod(&mbs, mbs.mb_buf_phys + offs,
+ mbs.mb_buf_phys + offs + mb_mod_length, c);
+
+ mb_debug("mod_start: %p\nmod_end: %p\n cmdline: "TARGET_FMT_plx,
+ (char *)mbs.mb_buf + offs,
+ (char *)mbs.mb_buf + offs + mb_mod_length, c);
+ g_free(one_file);
+ tmpl = tmpl->next;
+ }
+ g_list_free(mods);
+ }
+
+ /* Commandline support */
+ char kcmdline[strlen(kernel_filename) + strlen(kernel_cmdline) + 2];
+ snprintf(kcmdline, sizeof(kcmdline), "%s %s",
+ kernel_filename, kernel_cmdline);
+ stl_p(bootinfo + MBI_CMDLINE, mb_add_cmdline(&mbs, kcmdline));
+
+ stl_p(bootinfo + MBI_BOOTLOADER, mb_add_bootloader(&mbs, bootloader_name));
+
+ stl_p(bootinfo + MBI_MODS_ADDR, mbs.mb_buf_phys + mbs.offset_mbinfo);
+ stl_p(bootinfo + MBI_MODS_COUNT, mbs.mb_mods_count); /* mods_count */
+
+ /* the kernel is where we want it to be now */
+ stl_p(bootinfo + MBI_FLAGS, MULTIBOOT_FLAGS_MEMORY
+ | MULTIBOOT_FLAGS_BOOT_DEVICE
+ | MULTIBOOT_FLAGS_CMDLINE
+ | MULTIBOOT_FLAGS_MODULES
+ | MULTIBOOT_FLAGS_MMAP
+ | MULTIBOOT_FLAGS_BOOTLOADER);
+ stl_p(bootinfo + MBI_BOOT_DEVICE, 0x8000ffff); /* XXX: use the -boot switch? */
+ stl_p(bootinfo + MBI_MMAP_ADDR, ADDR_E820_MAP);
+
+ mb_debug("multiboot: entry_addr = %#x", mh_entry_addr);
+ mb_debug(" mb_buf_phys = "TARGET_FMT_plx, mbs.mb_buf_phys);
+ mb_debug(" mod_start = "TARGET_FMT_plx,
+ mbs.mb_buf_phys + mbs.offset_mods);
+ mb_debug(" mb_mods_count = %d", mbs.mb_mods_count);
+
+ /* save bootinfo off the stack */
+ mb_bootinfo_data = g_memdup(bootinfo, sizeof(bootinfo));
+
+ /* Pass variables to option rom */
+ fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, mh_entry_addr);
+ fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr);
+ fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, mbs.mb_buf_size);
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA,
+ mbs.mb_buf, mbs.mb_buf_size);
+
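+ /* The multiboot option ROM reuses the initrd fw_cfg slots to receive the
+ boot info structure and copies it to ADDR_MBI before starting the kernel. */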
+ fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, ADDR_MBI);
+ fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, sizeof(bootinfo));
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, mb_bootinfo_data,
+ sizeof(bootinfo));
+
+ if (multiboot_dma_enabled) {
+ option_rom[nb_option_roms].name = "multiboot_dma.bin";
+ } else {
+ option_rom[nb_option_roms].name = "multiboot.bin";
+ }
+ option_rom[nb_option_roms].bootindex = 0;
+ nb_option_roms++;
+
+ return 1; /* yes, we are multiboot */
+}
diff --git a/hw/i386/multiboot.h b/hw/i386/multiboot.h
new file mode 100644
index 000000000..2b9182a8e
--- /dev/null
+++ b/hw/i386/multiboot.h
@@ -0,0 +1,16 @@
+#ifndef QEMU_MULTIBOOT_H
+#define QEMU_MULTIBOOT_H
+
+#include "hw/nvram/fw_cfg.h"
+#include "hw/i386/x86.h"
+
+int load_multiboot(X86MachineState *x86ms,
+ FWCfgState *fw_cfg,
+ FILE *f,
+ const char *kernel_filename,
+ const char *initrd_filename,
+ const char *kernel_cmdline,
+ int kernel_file_size,
+ uint8_t *header);
+
+#endif
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
new file mode 100644
index 000000000..a2ef40ecb
--- /dev/null
+++ b/hw/i386/pc.c
@@ -0,0 +1,1777 @@
+/*
+ * QEMU PC System Emulator
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/units.h"
+#include "hw/i386/x86.h"
+#include "hw/i386/pc.h"
+#include "hw/char/serial.h"
+#include "hw/char/parallel.h"
+#include "hw/i386/apic.h"
+#include "hw/i386/topology.h"
+#include "hw/i386/fw_cfg.h"
+#include "hw/i386/vmport.h"
+#include "sysemu/cpus.h"
+#include "hw/block/fdc.h"
+#include "hw/ide.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/nvram/fw_cfg.h"
+#include "hw/timer/hpet.h"
+#include "hw/firmware/smbios.h"
+#include "hw/loader.h"
+#include "elf.h"
+#include "migration/vmstate.h"
+#include "multiboot.h"
+#include "hw/rtc/mc146818rtc.h"
+#include "hw/intc/i8259.h"
+#include "hw/dma/i8257.h"
+#include "hw/timer/i8254.h"
+#include "hw/input/i8042.h"
+#include "hw/irq.h"
+#include "hw/audio/pcspk.h"
+#include "hw/pci/msi.h"
+#include "hw/sysbus.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/tcg.h"
+#include "sysemu/numa.h"
+#include "sysemu/kvm.h"
+#include "sysemu/xen.h"
+#include "sysemu/reset.h"
+#include "sysemu/runstate.h"
+#include "kvm/kvm_i386.h"
+#include "hw/xen/xen.h"
+#include "hw/xen/start_info.h"
+#include "ui/qemu-spice.h"
+#include "exec/memory.h"
+#include "qemu/bitmap.h"
+#include "qemu/config-file.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qemu/cutils.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/cpu_hotplug.h"
+#include "acpi-build.h"
+#include "hw/mem/pc-dimm.h"
+#include "hw/mem/nvdimm.h"
+#include "qapi/error.h"
+#include "qapi/qapi-visit-common.h"
+#include "qapi/visitor.h"
+#include "hw/core/cpu.h"
+#include "hw/usb.h"
+#include "hw/i386/intel_iommu.h"
+#include "hw/net/ne2000-isa.h"
+#include "standard-headers/asm-x86/bootparam.h"
+#include "hw/virtio/virtio-iommu.h"
+#include "hw/virtio/virtio-pmem-pci.h"
+#include "hw/virtio/virtio-mem-pci.h"
+#include "hw/mem/memory-device.h"
+#include "sysemu/replay.h"
+#include "qapi/qmp/qerror.h"
+#include "e820_memory_layout.h"
+#include "fw_cfg.h"
+#include "trace.h"
+#include CONFIG_DEVICES
+
+GlobalProperty pc_compat_6_1[] = {
+ { TYPE_X86_CPU, "hv-version-id-build", "0x1bbc" },
+ { TYPE_X86_CPU, "hv-version-id-major", "0x0006" },
+ { TYPE_X86_CPU, "hv-version-id-minor", "0x0001" },
+ { "ICH9-LPC", "x-keep-pci-slot-hpc", "false" },
+};
+const size_t pc_compat_6_1_len = G_N_ELEMENTS(pc_compat_6_1);
+
+GlobalProperty pc_compat_6_0[] = {
+ { "qemu64" "-" TYPE_X86_CPU, "family", "6" },
+ { "qemu64" "-" TYPE_X86_CPU, "model", "6" },
+ { "qemu64" "-" TYPE_X86_CPU, "stepping", "3" },
+ { TYPE_X86_CPU, "x-vendor-cpuid-only", "off" },
+ { "ICH9-LPC", ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, "off" },
+ { "ICH9-LPC", "x-keep-pci-slot-hpc", "true" },
+};
+const size_t pc_compat_6_0_len = G_N_ELEMENTS(pc_compat_6_0);
+
+GlobalProperty pc_compat_5_2[] = {
+ { "ICH9-LPC", "x-smi-cpu-hotunplug", "off" },
+};
+const size_t pc_compat_5_2_len = G_N_ELEMENTS(pc_compat_5_2);
+
+GlobalProperty pc_compat_5_1[] = {
+ { "ICH9-LPC", "x-smi-cpu-hotplug", "off" },
+ { TYPE_X86_CPU, "kvm-msi-ext-dest-id", "off" },
+};
+const size_t pc_compat_5_1_len = G_N_ELEMENTS(pc_compat_5_1);
+
+GlobalProperty pc_compat_5_0[] = {
+};
+const size_t pc_compat_5_0_len = G_N_ELEMENTS(pc_compat_5_0);
+
+GlobalProperty pc_compat_4_2[] = {
+ { "mch", "smbase-smram", "off" },
+};
+const size_t pc_compat_4_2_len = G_N_ELEMENTS(pc_compat_4_2);
+
+GlobalProperty pc_compat_4_1[] = {};
+const size_t pc_compat_4_1_len = G_N_ELEMENTS(pc_compat_4_1);
+
+GlobalProperty pc_compat_4_0[] = {};
+const size_t pc_compat_4_0_len = G_N_ELEMENTS(pc_compat_4_0);
+
+GlobalProperty pc_compat_3_1[] = {
+ { "intel-iommu", "dma-drain", "off" },
+ { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "off" },
+ { "Opteron_G4" "-" TYPE_X86_CPU, "rdtscp", "off" },
+ { "Opteron_G4" "-" TYPE_X86_CPU, "npt", "off" },
+ { "Opteron_G4" "-" TYPE_X86_CPU, "nrip-save", "off" },
+ { "Opteron_G5" "-" TYPE_X86_CPU, "rdtscp", "off" },
+ { "Opteron_G5" "-" TYPE_X86_CPU, "npt", "off" },
+ { "Opteron_G5" "-" TYPE_X86_CPU, "nrip-save", "off" },
+ { "EPYC" "-" TYPE_X86_CPU, "npt", "off" },
+ { "EPYC" "-" TYPE_X86_CPU, "nrip-save", "off" },
+ { "EPYC-IBPB" "-" TYPE_X86_CPU, "npt", "off" },
+ { "EPYC-IBPB" "-" TYPE_X86_CPU, "nrip-save", "off" },
+ { "Skylake-Client" "-" TYPE_X86_CPU, "mpx", "on" },
+ { "Skylake-Client-IBRS" "-" TYPE_X86_CPU, "mpx", "on" },
+ { "Skylake-Server" "-" TYPE_X86_CPU, "mpx", "on" },
+ { "Skylake-Server-IBRS" "-" TYPE_X86_CPU, "mpx", "on" },
+ { "Cascadelake-Server" "-" TYPE_X86_CPU, "mpx", "on" },
+ { "Icelake-Client" "-" TYPE_X86_CPU, "mpx", "on" },
+ { "Icelake-Server" "-" TYPE_X86_CPU, "mpx", "on" },
+ { "Cascadelake-Server" "-" TYPE_X86_CPU, "stepping", "5" },
+ { TYPE_X86_CPU, "x-intel-pt-auto-level", "off" },
+};
+const size_t pc_compat_3_1_len = G_N_ELEMENTS(pc_compat_3_1);
+
+GlobalProperty pc_compat_3_0[] = {
+ { TYPE_X86_CPU, "x-hv-synic-kvm-only", "on" },
+ { "Skylake-Server" "-" TYPE_X86_CPU, "pku", "off" },
+ { "Skylake-Server-IBRS" "-" TYPE_X86_CPU, "pku", "off" },
+};
+const size_t pc_compat_3_0_len = G_N_ELEMENTS(pc_compat_3_0);
+
+GlobalProperty pc_compat_2_12[] = {
+ { TYPE_X86_CPU, "legacy-cache", "on" },
+ { TYPE_X86_CPU, "topoext", "off" },
+ { "EPYC-" TYPE_X86_CPU, "xlevel", "0x8000000a" },
+ { "EPYC-IBPB-" TYPE_X86_CPU, "xlevel", "0x8000000a" },
+};
+const size_t pc_compat_2_12_len = G_N_ELEMENTS(pc_compat_2_12);
+
+GlobalProperty pc_compat_2_11[] = {
+ { TYPE_X86_CPU, "x-migrate-smi-count", "off" },
+ { "Skylake-Server" "-" TYPE_X86_CPU, "clflushopt", "off" },
+};
+const size_t pc_compat_2_11_len = G_N_ELEMENTS(pc_compat_2_11);
+
+GlobalProperty pc_compat_2_10[] = {
+ { TYPE_X86_CPU, "x-hv-max-vps", "0x40" },
+ { "i440FX-pcihost", "x-pci-hole64-fix", "off" },
+ { "q35-pcihost", "x-pci-hole64-fix", "off" },
+};
+const size_t pc_compat_2_10_len = G_N_ELEMENTS(pc_compat_2_10);
+
+GlobalProperty pc_compat_2_9[] = {
+ { "mch", "extended-tseg-mbytes", "0" },
+};
+const size_t pc_compat_2_9_len = G_N_ELEMENTS(pc_compat_2_9);
+
+GlobalProperty pc_compat_2_8[] = {
+ { TYPE_X86_CPU, "tcg-cpuid", "off" },
+ { "kvmclock", "x-mach-use-reliable-get-clock", "off" },
+ { "ICH9-LPC", "x-smi-broadcast", "off" },
+ { TYPE_X86_CPU, "vmware-cpuid-freq", "off" },
+ { "Haswell-" TYPE_X86_CPU, "stepping", "1" },
+};
+const size_t pc_compat_2_8_len = G_N_ELEMENTS(pc_compat_2_8);
+
+GlobalProperty pc_compat_2_7[] = {
+ { TYPE_X86_CPU, "l3-cache", "off" },
+ { TYPE_X86_CPU, "full-cpuid-auto-level", "off" },
+ { "Opteron_G3" "-" TYPE_X86_CPU, "family", "15" },
+ { "Opteron_G3" "-" TYPE_X86_CPU, "model", "6" },
+ { "Opteron_G3" "-" TYPE_X86_CPU, "stepping", "1" },
+ { "isa-pcspk", "migrate", "off" },
+};
+const size_t pc_compat_2_7_len = G_N_ELEMENTS(pc_compat_2_7);
+
+GlobalProperty pc_compat_2_6[] = {
+ { TYPE_X86_CPU, "cpuid-0xb", "off" },
+ { "vmxnet3", "romfile", "" },
+ { TYPE_X86_CPU, "fill-mtrr-mask", "off" },
+ { "apic-common", "legacy-instance-id", "on", }
+};
+const size_t pc_compat_2_6_len = G_N_ELEMENTS(pc_compat_2_6);
+
+GlobalProperty pc_compat_2_5[] = {};
+const size_t pc_compat_2_5_len = G_N_ELEMENTS(pc_compat_2_5);
+
+GlobalProperty pc_compat_2_4[] = {
+ PC_CPU_MODEL_IDS("2.4.0")
+ { "Haswell-" TYPE_X86_CPU, "abm", "off" },
+ { "Haswell-noTSX-" TYPE_X86_CPU, "abm", "off" },
+ { "Broadwell-" TYPE_X86_CPU, "abm", "off" },
+ { "Broadwell-noTSX-" TYPE_X86_CPU, "abm", "off" },
+ { "host" "-" TYPE_X86_CPU, "host-cache-info", "on" },
+ { TYPE_X86_CPU, "check", "off" },
+ { "qemu64" "-" TYPE_X86_CPU, "sse4a", "on" },
+ { "qemu64" "-" TYPE_X86_CPU, "abm", "on" },
+ { "qemu64" "-" TYPE_X86_CPU, "popcnt", "on" },
+ { "qemu32" "-" TYPE_X86_CPU, "popcnt", "on" },
+ { "Opteron_G2" "-" TYPE_X86_CPU, "rdtscp", "on" },
+ { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "on" },
+ { "Opteron_G4" "-" TYPE_X86_CPU, "rdtscp", "on" },
+ { "Opteron_G5" "-" TYPE_X86_CPU, "rdtscp", "on", }
+};
+const size_t pc_compat_2_4_len = G_N_ELEMENTS(pc_compat_2_4);
+
+GlobalProperty pc_compat_2_3[] = {
+ PC_CPU_MODEL_IDS("2.3.0")
+ { TYPE_X86_CPU, "arat", "off" },
+ { "qemu64" "-" TYPE_X86_CPU, "min-level", "4" },
+ { "kvm64" "-" TYPE_X86_CPU, "min-level", "5" },
+ { "pentium3" "-" TYPE_X86_CPU, "min-level", "2" },
+ { "n270" "-" TYPE_X86_CPU, "min-level", "5" },
+ { "Conroe" "-" TYPE_X86_CPU, "min-level", "4" },
+ { "Penryn" "-" TYPE_X86_CPU, "min-level", "4" },
+ { "Nehalem" "-" TYPE_X86_CPU, "min-level", "4" },
+ { "n270" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" },
+ { "Penryn" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" },
+ { "Conroe" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" },
+ { "Nehalem" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" },
+ { "Westmere" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" },
+ { "SandyBridge" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" },
+ { "IvyBridge" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" },
+ { "Haswell" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" },
+ { "Haswell-noTSX" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" },
+ { "Broadwell" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" },
+ { "Broadwell-noTSX" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" },
+ { TYPE_X86_CPU, "kvm-no-smi-migration", "on" },
+};
+const size_t pc_compat_2_3_len = G_N_ELEMENTS(pc_compat_2_3);
+
+GlobalProperty pc_compat_2_2[] = {
+ PC_CPU_MODEL_IDS("2.2.0")
+ { "kvm64" "-" TYPE_X86_CPU, "vme", "off" },
+ { "kvm32" "-" TYPE_X86_CPU, "vme", "off" },
+ { "Conroe" "-" TYPE_X86_CPU, "vme", "off" },
+ { "Penryn" "-" TYPE_X86_CPU, "vme", "off" },
+ { "Nehalem" "-" TYPE_X86_CPU, "vme", "off" },
+ { "Westmere" "-" TYPE_X86_CPU, "vme", "off" },
+ { "SandyBridge" "-" TYPE_X86_CPU, "vme", "off" },
+ { "Haswell" "-" TYPE_X86_CPU, "vme", "off" },
+ { "Broadwell" "-" TYPE_X86_CPU, "vme", "off" },
+ { "Opteron_G1" "-" TYPE_X86_CPU, "vme", "off" },
+ { "Opteron_G2" "-" TYPE_X86_CPU, "vme", "off" },
+ { "Opteron_G3" "-" TYPE_X86_CPU, "vme", "off" },
+ { "Opteron_G4" "-" TYPE_X86_CPU, "vme", "off" },
+ { "Opteron_G5" "-" TYPE_X86_CPU, "vme", "off" },
+ { "Haswell" "-" TYPE_X86_CPU, "f16c", "off" },
+ { "Haswell" "-" TYPE_X86_CPU, "rdrand", "off" },
+ { "Broadwell" "-" TYPE_X86_CPU, "f16c", "off" },
+ { "Broadwell" "-" TYPE_X86_CPU, "rdrand", "off" },
+};
+const size_t pc_compat_2_2_len = G_N_ELEMENTS(pc_compat_2_2);
+
+GlobalProperty pc_compat_2_1[] = {
+ PC_CPU_MODEL_IDS("2.1.0")
+ { "coreduo" "-" TYPE_X86_CPU, "vmx", "on" },
+ { "core2duo" "-" TYPE_X86_CPU, "vmx", "on" },
+};
+const size_t pc_compat_2_1_len = G_N_ELEMENTS(pc_compat_2_1);
+
+GlobalProperty pc_compat_2_0[] = {
+ PC_CPU_MODEL_IDS("2.0.0")
+ { "virtio-scsi-pci", "any_layout", "off" },
+ { "PIIX4_PM", "memory-hotplug-support", "off" },
+ { "apic", "version", "0x11" },
+ { "nec-usb-xhci", "superspeed-ports-first", "off" },
+ { "nec-usb-xhci", "force-pcie-endcap", "on" },
+ { "pci-serial", "prog_if", "0" },
+ { "pci-serial-2x", "prog_if", "0" },
+ { "pci-serial-4x", "prog_if", "0" },
+ { "virtio-net-pci", "guest_announce", "off" },
+ { "ICH9-LPC", "memory-hotplug-support", "off" },
+ { "xio3130-downstream", COMPAT_PROP_PCP, "off" },
+ { "ioh3420", COMPAT_PROP_PCP, "off" },
+};
+const size_t pc_compat_2_0_len = G_N_ELEMENTS(pc_compat_2_0);
+
+GlobalProperty pc_compat_1_7[] = {
+ PC_CPU_MODEL_IDS("1.7.0")
+ { TYPE_USB_DEVICE, "msos-desc", "no" },
+ { "PIIX4_PM", ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, "off" },
+ { "hpet", HPET_INTCAP, "4" },
+};
+const size_t pc_compat_1_7_len = G_N_ELEMENTS(pc_compat_1_7);
+
+GlobalProperty pc_compat_1_6[] = {
+ PC_CPU_MODEL_IDS("1.6.0")
+ { "e1000", "mitigation", "off" },
+ { "qemu64-" TYPE_X86_CPU, "model", "2" },
+ { "qemu32-" TYPE_X86_CPU, "model", "3" },
+ { "i440FX-pcihost", "short_root_bus", "1" },
+ { "q35-pcihost", "short_root_bus", "1" },
+};
+const size_t pc_compat_1_6_len = G_N_ELEMENTS(pc_compat_1_6);
+
+GlobalProperty pc_compat_1_5[] = {
+ PC_CPU_MODEL_IDS("1.5.0")
+ { "Conroe-" TYPE_X86_CPU, "model", "2" },
+ { "Conroe-" TYPE_X86_CPU, "min-level", "2" },
+ { "Penryn-" TYPE_X86_CPU, "model", "2" },
+ { "Penryn-" TYPE_X86_CPU, "min-level", "2" },
+ { "Nehalem-" TYPE_X86_CPU, "model", "2" },
+ { "Nehalem-" TYPE_X86_CPU, "min-level", "2" },
+ { "virtio-net-pci", "any_layout", "off" },
+ { TYPE_X86_CPU, "pmu", "on" },
+ { "i440FX-pcihost", "short_root_bus", "0" },
+ { "q35-pcihost", "short_root_bus", "0" },
+};
+const size_t pc_compat_1_5_len = G_N_ELEMENTS(pc_compat_1_5);
+
+GlobalProperty pc_compat_1_4[] = {
+ PC_CPU_MODEL_IDS("1.4.0")
+ { "scsi-hd", "discard_granularity", "0" },
+ { "scsi-cd", "discard_granularity", "0" },
+ { "ide-hd", "discard_granularity", "0" },
+ { "ide-cd", "discard_granularity", "0" },
+ { "virtio-blk-pci", "discard_granularity", "0" },
+ /* DEV_NVECTORS_UNSPECIFIED as a uint32_t string: */
+ { "virtio-serial-pci", "vectors", "0xFFFFFFFF" },
+ { "virtio-net-pci", "ctrl_guest_offloads", "off" },
+ { "e1000", "romfile", "pxe-e1000.rom" },
+ { "ne2k_pci", "romfile", "pxe-ne2k_pci.rom" },
+ { "pcnet", "romfile", "pxe-pcnet.rom" },
+ { "rtl8139", "romfile", "pxe-rtl8139.rom" },
+ { "virtio-net-pci", "romfile", "pxe-virtio.rom" },
+ { "486-" TYPE_X86_CPU, "model", "0" },
+ { "n270" "-" TYPE_X86_CPU, "movbe", "off" },
+ { "Westmere" "-" TYPE_X86_CPU, "pclmulqdq", "off" },
+};
+const size_t pc_compat_1_4_len = G_N_ELEMENTS(pc_compat_1_4);
+
+GSIState *pc_gsi_create(qemu_irq **irqs, bool pci_enabled)
+{
+ GSIState *s;
+
+ s = g_new0(GSIState, 1);
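+ /*
+ * gsi_handler() fans each GSI out to the PIC and IOAPIC inputs that the
+ * board wires up later; with the in-kernel IOAPIC the routing table is
+ * also programmed into KVM here.
+ */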
+ if (kvm_ioapic_in_kernel()) {
+ kvm_pc_setup_irq_routing(pci_enabled);
+ }
+ *irqs = qemu_allocate_irqs(gsi_handler, s, GSI_NUM_PINS);
+
+ return s;
+}
+
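+ /* I/O port 0x80 is the traditional POST/diagnostic port, also used by
+ guests for short delays; reads and writes are simply ignored. */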
+static void ioport80_write(void *opaque, hwaddr addr, uint64_t data,
+ unsigned size)
+{
+}
+
+static uint64_t ioport80_read(void *opaque, hwaddr addr, unsigned size)
+{
+ return 0xffffffffffffffffULL;
+}
+
+/* MSDOS compatibility mode FPU exception support */
+static void ioportF0_write(void *opaque, hwaddr addr, uint64_t data,
+ unsigned size)
+{
+ if (tcg_enabled()) {
+ cpu_set_ignne();
+ }
+}
+
+static uint64_t ioportF0_read(void *opaque, hwaddr addr, unsigned size)
+{
+ return 0xffffffffffffffffULL;
+}
+
+/* PC cmos mappings */
+
+#define REG_EQUIPMENT_BYTE 0x14
+
+static void cmos_init_hd(ISADevice *s, int type_ofs, int info_ofs,
+ int16_t cylinders, int8_t heads, int8_t sectors)
+{
+ rtc_set_memory(s, type_ofs, 47);
+ rtc_set_memory(s, info_ofs, cylinders);
+ rtc_set_memory(s, info_ofs + 1, cylinders >> 8);
+ rtc_set_memory(s, info_ofs + 2, heads);
+ rtc_set_memory(s, info_ofs + 3, 0xff);
+ rtc_set_memory(s, info_ofs + 4, 0xff);
+ rtc_set_memory(s, info_ofs + 5, 0xc0 | ((heads > 8) << 3));
+ rtc_set_memory(s, info_ofs + 6, cylinders);
+ rtc_set_memory(s, info_ofs + 7, cylinders >> 8);
+ rtc_set_memory(s, info_ofs + 8, sectors);
+}
+
+/* convert boot_device letter to something recognizable by the bios */
+static int boot_device2nibble(char boot_device)
+{
+ switch(boot_device) {
+ case 'a':
+ case 'b':
+ return 0x01; /* floppy boot */
+ case 'c':
+ return 0x02; /* hard drive boot */
+ case 'd':
+ return 0x03; /* CD-ROM boot */
+ case 'n':
+ return 0x04; /* Network boot */
+ }
+ return 0;
+}
+
+static void set_boot_dev(ISADevice *s, const char *boot_device, Error **errp)
+{
+#define PC_MAX_BOOT_DEVICES 3
+ int nbds, bds[3] = { 0, };
+ int i;
+
+ nbds = strlen(boot_device);
+ if (nbds > PC_MAX_BOOT_DEVICES) {
+ error_setg(errp, "Too many boot devices for PC");
+ return;
+ }
+ for (i = 0; i < nbds; i++) {
+ bds[i] = boot_device2nibble(boot_device[i]);
+ if (bds[i] == 0) {
+ error_setg(errp, "Invalid boot device for PC: '%c'",
+ boot_device[i]);
+ return;
+ }
+ }
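+ /* Bochs/SeaBIOS boot-order convention: CMOS 0x3d holds the first and
+ second boot devices in its low and high nibbles, CMOS 0x38 the third
+ in its high nibble; bit 0 of 0x38, when set, disables the floppy
+ boot signature check. */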
+ rtc_set_memory(s, 0x3d, (bds[1] << 4) | bds[0]);
+ rtc_set_memory(s, 0x38, (bds[2] << 4) | (fd_bootchk ? 0x0 : 0x1));
+}
+
+static void pc_boot_set(void *opaque, const char *boot_device, Error **errp)
+{
+ set_boot_dev(opaque, boot_device, errp);
+}
+
+static void pc_cmos_init_floppy(ISADevice *rtc_state, ISADevice *floppy)
+{
+ int val, nb, i;
+ FloppyDriveType fd_type[2] = { FLOPPY_DRIVE_TYPE_NONE,
+ FLOPPY_DRIVE_TYPE_NONE };
+
+ /* floppy type */
+ if (floppy) {
+ for (i = 0; i < 2; i++) {
+ fd_type[i] = isa_fdc_get_drive_type(floppy, i);
+ }
+ }
+ val = (cmos_get_fd_drive_type(fd_type[0]) << 4) |
+ cmos_get_fd_drive_type(fd_type[1]);
+ rtc_set_memory(rtc_state, 0x10, val);
+
+ val = rtc_get_memory(rtc_state, REG_EQUIPMENT_BYTE);
+ nb = 0;
+ if (fd_type[0] != FLOPPY_DRIVE_TYPE_NONE) {
+ nb++;
+ }
+ if (fd_type[1] != FLOPPY_DRIVE_TYPE_NONE) {
+ nb++;
+ }
+ switch (nb) {
+ case 0:
+ break;
+ case 1:
+ val |= 0x01; /* 1 drive, ready for boot */
+ break;
+ case 2:
+ val |= 0x41; /* 2 drives, ready for boot */
+ break;
+ }
+ rtc_set_memory(rtc_state, REG_EQUIPMENT_BYTE, val);
+}
+
+typedef struct pc_cmos_init_late_arg {
+ ISADevice *rtc_state;
+ BusState *idebus[2];
+} pc_cmos_init_late_arg;
+
+typedef struct check_fdc_state {
+ ISADevice *floppy;
+ bool multiple;
+} CheckFdcState;
+
+static int check_fdc(Object *obj, void *opaque)
+{
+ CheckFdcState *state = opaque;
+ Object *fdc;
+ uint32_t iobase;
+ Error *local_err = NULL;
+
+ fdc = object_dynamic_cast(obj, TYPE_ISA_FDC);
+ if (!fdc) {
+ return 0;
+ }
+
+ iobase = object_property_get_uint(obj, "iobase", &local_err);
+ if (local_err || iobase != 0x3f0) {
+ error_free(local_err);
+ return 0;
+ }
+
+ if (state->floppy) {
+ state->multiple = true;
+ } else {
+ state->floppy = ISA_DEVICE(obj);
+ }
+ return 0;
+}
+
+static const char * const fdc_container_path[] = {
+ "/unattached", "/peripheral", "/peripheral-anon"
+};
+
+/*
+ * Locate the FDC at IO address 0x3f0, in order to configure the CMOS registers
+ * and ACPI objects.
+ */
+ISADevice *pc_find_fdc0(void)
+{
+ int i;
+ Object *container;
+ CheckFdcState state = { 0 };
+
+ for (i = 0; i < ARRAY_SIZE(fdc_container_path); i++) {
+ container = container_get(qdev_get_machine(), fdc_container_path[i]);
+ object_child_foreach(container, check_fdc, &state);
+ }
+
+ if (state.multiple) {
+ warn_report("multiple floppy disk controllers with "
+ "iobase=0x3f0 have been found");
+ error_printf("the one being picked for CMOS setup might not reflect "
+ "your intent");
+ }
+
+ return state.floppy;
+}
+
+static void pc_cmos_init_late(void *opaque)
+{
+ pc_cmos_init_late_arg *arg = opaque;
+ ISADevice *s = arg->rtc_state;
+ int16_t cylinders;
+ int8_t heads, sectors;
+ int val;
+ int i, trans;
+
+ val = 0;
+ if (arg->idebus[0] && ide_get_geometry(arg->idebus[0], 0,
+ &cylinders, &heads, &sectors) >= 0) {
+ cmos_init_hd(s, 0x19, 0x1b, cylinders, heads, sectors);
+ val |= 0xf0;
+ }
+ if (arg->idebus[0] && ide_get_geometry(arg->idebus[0], 1,
+ &cylinders, &heads, &sectors) >= 0) {
+ cmos_init_hd(s, 0x1a, 0x24, cylinders, heads, sectors);
+ val |= 0x0f;
+ }
+ rtc_set_memory(s, 0x12, val);
+
+ val = 0;
+ for (i = 0; i < 4; i++) {
+ /* NOTE: ide_get_geometry() returns the physical
+ geometry. It is always such that: 1 <= sects <= 63, 1
+ <= heads <= 16, 1 <= cylinders <= 16383. The BIOS
+ geometry can be different if a translation is done. */
+ if (arg->idebus[i / 2] &&
+ ide_get_geometry(arg->idebus[i / 2], i % 2,
+ &cylinders, &heads, &sectors) >= 0) {
+ trans = ide_get_bios_chs_trans(arg->idebus[i / 2], i % 2) - 1;
+ assert((trans & ~3) == 0);
+ val |= trans << (i * 2);
+ }
+ }
+ rtc_set_memory(s, 0x39, val);
+
+ pc_cmos_init_floppy(s, pc_find_fdc0());
+
+ qemu_unregister_reset(pc_cmos_init_late, opaque);
+}
+
+void pc_cmos_init(PCMachineState *pcms,
+ BusState *idebus0, BusState *idebus1,
+ ISADevice *s)
+{
+ int val;
+ static pc_cmos_init_late_arg arg;
+ X86MachineState *x86ms = X86_MACHINE(pcms);
+
+ /* various important CMOS locations needed by PC/Bochs bios */
+
+ /* memory size */
+ /* base memory (first MiB) */
+ val = MIN(x86ms->below_4g_mem_size / KiB, 640);
+ rtc_set_memory(s, 0x15, val);
+ rtc_set_memory(s, 0x16, val >> 8);
+ /* extended memory (next 64MiB) */
+ if (x86ms->below_4g_mem_size > 1 * MiB) {
+ val = (x86ms->below_4g_mem_size - 1 * MiB) / KiB;
+ } else {
+ val = 0;
+ }
+ if (val > 65535)
+ val = 65535;
+ rtc_set_memory(s, 0x17, val);
+ rtc_set_memory(s, 0x18, val >> 8);
+ rtc_set_memory(s, 0x30, val);
+ rtc_set_memory(s, 0x31, val >> 8);
+ /* memory between 16MiB and 4GiB */
+ if (x86ms->below_4g_mem_size > 16 * MiB) {
+ val = (x86ms->below_4g_mem_size - 16 * MiB) / (64 * KiB);
+ } else {
+ val = 0;
+ }
+ if (val > 65535)
+ val = 65535;
+ rtc_set_memory(s, 0x34, val);
+ rtc_set_memory(s, 0x35, val >> 8);
+ /* memory above 4GiB, in 64 KiB units */
+ val = x86ms->above_4g_mem_size / 65536;
+ rtc_set_memory(s, 0x5b, val);
+ rtc_set_memory(s, 0x5c, val >> 8);
+ rtc_set_memory(s, 0x5d, val >> 16);
+
+ object_property_add_link(OBJECT(pcms), "rtc_state",
+ TYPE_ISA_DEVICE,
+ (Object **)&x86ms->rtc,
+ object_property_allow_set_link,
+ OBJ_PROP_LINK_STRONG);
+ object_property_set_link(OBJECT(pcms), "rtc_state", OBJECT(s),
+ &error_abort);
+
+ set_boot_dev(s, MACHINE(pcms)->boot_order, &error_fatal);
+
+ val = 0;
+ val |= 0x02; /* FPU is there */
+ val |= 0x04; /* PS/2 mouse installed */
+ rtc_set_memory(s, REG_EQUIPMENT_BYTE, val);
+
+ /* hard drives and FDC */
+ arg.rtc_state = s;
+ arg.idebus[0] = idebus0;
+ arg.idebus[1] = idebus1;
+ qemu_register_reset(pc_cmos_init_late, &arg);
+}
+
+static void handle_a20_line_change(void *opaque, int irq, int level)
+{
+ X86CPU *cpu = opaque;
+
+ /* XXX: send to all CPUs ? */
+ /* XXX: add logic to handle multiple A20 line sources */
+ x86_cpu_set_a20(cpu, level);
+}
+
+#define NE2000_NB_MAX 6
+
+static const int ne2000_io[NE2000_NB_MAX] = { 0x300, 0x320, 0x340, 0x360,
+ 0x280, 0x380 };
+static const int ne2000_irq[NE2000_NB_MAX] = { 9, 10, 11, 3, 4, 5 };
+
+void pc_init_ne2k_isa(ISABus *bus, NICInfo *nd)
+{
+ static int nb_ne2k = 0;
+
+ if (nb_ne2k == NE2000_NB_MAX)
+ return;
+ isa_ne2000_init(bus, ne2000_io[nb_ne2k],
+ ne2000_irq[nb_ne2k], nd);
+ nb_ne2k++;
+}
+
+void pc_acpi_smi_interrupt(void *opaque, int irq, int level)
+{
+ X86CPU *cpu = opaque;
+
+ if (level) {
+ cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
+ }
+}
+
+static
+void pc_machine_done(Notifier *notifier, void *data)
+{
+ PCMachineState *pcms = container_of(notifier,
+ PCMachineState, machine_done);
+ X86MachineState *x86ms = X86_MACHINE(pcms);
+
+ /* set the number of CPUs */
+ x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
+
+ fw_cfg_add_extra_pci_roots(pcms->bus, x86ms->fw_cfg);
+
+ acpi_setup();
+ if (x86ms->fw_cfg) {
+ fw_cfg_build_smbios(MACHINE(pcms), x86ms->fw_cfg);
+ fw_cfg_build_feature_control(MACHINE(pcms), x86ms->fw_cfg);
+ /* update FW_CFG_NB_CPUS to account for -device added CPUs */
+ fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
+ }
+
+ if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
+ !kvm_irqchip_in_kernel()) {
+ error_report("current -smp configuration requires kernel "
+ "irqchip support.");
+ exit(EXIT_FAILURE);
+ }
+}
+
+void pc_guest_info_init(PCMachineState *pcms)
+{
+ X86MachineState *x86ms = X86_MACHINE(pcms);
+
+ x86ms->apic_xrupt_override = true;
+ pcms->machine_done.notify = pc_machine_done;
+ qemu_add_machine_init_done_notifier(&pcms->machine_done);
+}
+
+/* setup pci memory address space mapping into system address space */
+void pc_pci_as_mapping_init(Object *owner, MemoryRegion *system_memory,
+ MemoryRegion *pci_address_space)
+{
+ /* Set to lower priority than RAM */
+ memory_region_add_subregion_overlap(system_memory, 0x0,
+ pci_address_space, -1);
+}
+
+void xen_load_linux(PCMachineState *pcms)
+{
+ int i;
+ FWCfgState *fw_cfg;
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+ X86MachineState *x86ms = X86_MACHINE(pcms);
+
+ assert(MACHINE(pcms)->kernel_filename != NULL);
+
+ fw_cfg = fw_cfg_init_io(FW_CFG_IO_BASE);
+ fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
+ rom_set_fw(fw_cfg);
+
+ x86_load_linux(x86ms, fw_cfg, pcmc->acpi_data_size,
+ pcmc->pvh_enabled);
+ for (i = 0; i < nb_option_roms; i++) {
+ assert(!strcmp(option_rom[i].name, "linuxboot.bin") ||
+ !strcmp(option_rom[i].name, "linuxboot_dma.bin") ||
+ !strcmp(option_rom[i].name, "pvh.bin") ||
+ !strcmp(option_rom[i].name, "multiboot.bin") ||
+ !strcmp(option_rom[i].name, "multiboot_dma.bin"));
+ rom_add_option(option_rom[i].name, option_rom[i].bootindex);
+ }
+ x86ms->fw_cfg = fw_cfg;
+}
+
+#define PC_ROM_MIN_VGA 0xc0000
+#define PC_ROM_MIN_OPTION 0xc8000
+#define PC_ROM_MAX 0xe0000
+#define PC_ROM_ALIGN 0x800
+#define PC_ROM_SIZE (PC_ROM_MAX - PC_ROM_MIN_VGA)
+
+void pc_memory_init(PCMachineState *pcms,
+ MemoryRegion *system_memory,
+ MemoryRegion *rom_memory,
+ MemoryRegion **ram_memory)
+{
+ int linux_boot, i;
+ MemoryRegion *option_rom_mr;
+ MemoryRegion *ram_below_4g, *ram_above_4g;
+ FWCfgState *fw_cfg;
+ MachineState *machine = MACHINE(pcms);
+ MachineClass *mc = MACHINE_GET_CLASS(machine);
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+ X86MachineState *x86ms = X86_MACHINE(pcms);
+
+ assert(machine->ram_size == x86ms->below_4g_mem_size +
+ x86ms->above_4g_mem_size);
+
+ linux_boot = (machine->kernel_filename != NULL);
+
+ /*
+ * Split single memory region and use aliases to address portions of it,
+ * done for backwards compatibility with older qemus.
+ */
+ *ram_memory = machine->ram;
+ ram_below_4g = g_malloc(sizeof(*ram_below_4g));
+ memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram,
+ 0, x86ms->below_4g_mem_size);
+ memory_region_add_subregion(system_memory, 0, ram_below_4g);
+ e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
+ if (x86ms->above_4g_mem_size > 0) {
+ ram_above_4g = g_malloc(sizeof(*ram_above_4g));
+ memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
+ machine->ram,
+ x86ms->below_4g_mem_size,
+ x86ms->above_4g_mem_size);
+ memory_region_add_subregion(system_memory, 0x100000000ULL,
+ ram_above_4g);
+ e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
+ }
+
+ if (pcms->sgx_epc.size != 0) {
+ e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED);
+ }
+
+ if (!pcmc->has_reserved_memory &&
+ (machine->ram_slots ||
+ (machine->maxram_size > machine->ram_size))) {
+
+ error_report("\"-memory 'slots|maxmem'\" is not supported by: %s",
+ mc->name);
+ exit(EXIT_FAILURE);
+ }
+
+ /* always allocate the device memory information */
+ machine->device_memory = g_malloc0(sizeof(*machine->device_memory));
+
+ /* initialize device memory address space */
+ if (pcmc->has_reserved_memory &&
+ (machine->ram_size < machine->maxram_size)) {
+ ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size;
+
+ if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) {
+ error_report("unsupported amount of memory slots: %"PRIu64,
+ machine->ram_slots);
+ exit(EXIT_FAILURE);
+ }
+
+ if (QEMU_ALIGN_UP(machine->maxram_size,
+ TARGET_PAGE_SIZE) != machine->maxram_size) {
+ error_report("maximum memory size must by aligned to multiple of "
+ "%d bytes", TARGET_PAGE_SIZE);
+ exit(EXIT_FAILURE);
+ }
+
+ if (pcms->sgx_epc.size != 0) {
+ machine->device_memory->base = sgx_epc_above_4g_end(&pcms->sgx_epc);
+ } else {
+ machine->device_memory->base =
+ 0x100000000ULL + x86ms->above_4g_mem_size;
+ }
+
+ machine->device_memory->base =
+ ROUND_UP(machine->device_memory->base, 1 * GiB);
+
+ if (pcmc->enforce_aligned_dimm) {
+ /* size device region assuming 1G page max alignment per slot */
+ device_mem_size += (1 * GiB) * machine->ram_slots;
+ }
+
+ if ((machine->device_memory->base + device_mem_size) <
+ device_mem_size) {
+ error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT,
+ machine->maxram_size);
+ exit(EXIT_FAILURE);
+ }
+
+ memory_region_init(&machine->device_memory->mr, OBJECT(pcms),
+ "device-memory", device_mem_size);
+ memory_region_add_subregion(system_memory, machine->device_memory->base,
+ &machine->device_memory->mr);
+ }
+
+ /* Initialize PC system firmware */
+ pc_system_firmware_init(pcms, rom_memory);
+
+ option_rom_mr = g_malloc(sizeof(*option_rom_mr));
+ memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE,
+ &error_fatal);
+ if (pcmc->pci_enabled) {
+ memory_region_set_readonly(option_rom_mr, true);
+ }
+ memory_region_add_subregion_overlap(rom_memory,
+ PC_ROM_MIN_VGA,
+ option_rom_mr,
+ 1);
+
+ fw_cfg = fw_cfg_arch_create(machine,
+ x86ms->boot_cpus, x86ms->apic_id_limit);
+
+ rom_set_fw(fw_cfg);
+
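+ /* Expose the end of the device (hotplug) memory region via fw_cfg so the
+ firmware can place the 64-bit PCI hole above it. */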
+ if (pcmc->has_reserved_memory && machine->device_memory->base) {
+ uint64_t *val = g_malloc(sizeof(*val));
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+ uint64_t res_mem_end = machine->device_memory->base;
+
+ if (!pcmc->broken_reserved_end) {
+ res_mem_end += memory_region_size(&machine->device_memory->mr);
+ }
+ *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB));
+ fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val));
+ }
+
+ if (linux_boot) {
+ x86_load_linux(x86ms, fw_cfg, pcmc->acpi_data_size,
+ pcmc->pvh_enabled);
+ }
+
+ for (i = 0; i < nb_option_roms; i++) {
+ rom_add_option(option_rom[i].name, option_rom[i].bootindex);
+ }
+ x86ms->fw_cfg = fw_cfg;
+
+ /* Init default IOAPIC address space */
+ x86ms->ioapic_as = &address_space_memory;
+
+ /* Init ACPI memory hotplug IO base address */
+ pcms->memhp_io_base = ACPI_MEMORY_HOTPLUG_BASE;
+}
+
+/*
+ * The 64bit pci hole starts after "above 4G RAM" and
+ * potentially the space reserved for memory hotplug.
+ */
+uint64_t pc_pci_hole64_start(void)
+{
+ PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+ MachineState *ms = MACHINE(pcms);
+ X86MachineState *x86ms = X86_MACHINE(pcms);
+ uint64_t hole64_start = 0;
+
+ if (pcmc->has_reserved_memory && ms->device_memory->base) {
+ hole64_start = ms->device_memory->base;
+ if (!pcmc->broken_reserved_end) {
+ hole64_start += memory_region_size(&ms->device_memory->mr);
+ }
+ } else if (pcms->sgx_epc.size != 0) {
+ hole64_start = sgx_epc_above_4g_end(&pcms->sgx_epc);
+ } else {
+ hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
+ }
+
+ return ROUND_UP(hole64_start, 1 * GiB);
+}
+
+DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus)
+{
+ DeviceState *dev = NULL;
+
+ rom_set_order_override(FW_CFG_ORDER_OVERRIDE_VGA);
+ if (pci_bus) {
+ PCIDevice *pcidev = pci_vga_init(pci_bus);
+ dev = pcidev ? &pcidev->qdev : NULL;
+ } else if (isa_bus) {
+ ISADevice *isadev = isa_vga_init(isa_bus);
+ dev = isadev ? DEVICE(isadev) : NULL;
+ }
+ rom_reset_order_override();
+ return dev;
+}
+
+static const MemoryRegionOps ioport80_io_ops = {
+ .write = ioport80_write,
+ .read = ioport80_read,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 1,
+ },
+};
+
+static const MemoryRegionOps ioportF0_io_ops = {
+ .write = ioportF0_write,
+ .read = ioportF0_read,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 1,
+ },
+};
+
+static void pc_superio_init(ISABus *isa_bus, bool create_fdctrl, bool no_vmport)
+{
+ int i;
+ DriveInfo *fd[MAX_FD];
+ qemu_irq *a20_line;
+ ISADevice *fdc, *i8042, *port92, *vmmouse;
+
+ serial_hds_isa_init(isa_bus, 0, MAX_ISA_SERIAL_PORTS);
+ parallel_hds_isa_init(isa_bus, MAX_PARALLEL_PORTS);
+
+ for (i = 0; i < MAX_FD; i++) {
+ fd[i] = drive_get(IF_FLOPPY, 0, i);
+ create_fdctrl |= !!fd[i];
+ }
+ if (create_fdctrl) {
+ fdc = isa_new(TYPE_ISA_FDC);
+ if (fdc) {
+ isa_realize_and_unref(fdc, isa_bus, &error_fatal);
+ isa_fdc_init_drives(fdc, fd);
+ }
+ }
+
+ i8042 = isa_create_simple(isa_bus, "i8042");
+ if (!no_vmport) {
+ isa_create_simple(isa_bus, TYPE_VMPORT);
+ vmmouse = isa_try_new("vmmouse");
+ } else {
+ vmmouse = NULL;
+ }
+ if (vmmouse) {
+ object_property_set_link(OBJECT(vmmouse), "i8042", OBJECT(i8042),
+ &error_abort);
+ isa_realize_and_unref(vmmouse, isa_bus, &error_fatal);
+ }
+ port92 = isa_create_simple(isa_bus, TYPE_PORT92);
+
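+ /* Both the i8042 keyboard controller and port 0x92 can drive the A20
+ gate, so give each its own input of the A20 handler. */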
+ a20_line = qemu_allocate_irqs(handle_a20_line_change, first_cpu, 2);
+ i8042_setup_a20_line(i8042, a20_line[0]);
+ qdev_connect_gpio_out_named(DEVICE(port92),
+ PORT92_A20_LINE, 0, a20_line[1]);
+ g_free(a20_line);
+}
+
+void pc_basic_device_init(struct PCMachineState *pcms,
+ ISABus *isa_bus, qemu_irq *gsi,
+ ISADevice **rtc_state,
+ bool create_fdctrl,
+ uint32_t hpet_irqs)
+{
+ int i;
+ DeviceState *hpet = NULL;
+ int pit_isa_irq = 0;
+ qemu_irq pit_alt_irq = NULL;
+ qemu_irq rtc_irq = NULL;
+ ISADevice *pit = NULL;
+ MemoryRegion *ioport80_io = g_new(MemoryRegion, 1);
+ MemoryRegion *ioportF0_io = g_new(MemoryRegion, 1);
+
+ memory_region_init_io(ioport80_io, NULL, &ioport80_io_ops, NULL, "ioport80", 1);
+ memory_region_add_subregion(isa_bus->address_space_io, 0x80, ioport80_io);
+
+ memory_region_init_io(ioportF0_io, NULL, &ioportF0_io_ops, NULL, "ioportF0", 1);
+ memory_region_add_subregion(isa_bus->address_space_io, 0xf0, ioportF0_io);
+
+ /*
+ * Check if an HPET shall be created.
+ *
+ * Without KVM_CAP_PIT_STATE2, we cannot switch off the in-kernel PIT
+ * when the HPET wants to take over. Thus we have to disable the latter.
+ */
+ if (pcms->hpet_enabled && (!kvm_irqchip_in_kernel() ||
+ kvm_has_pit_state2())) {
+ hpet = qdev_try_new(TYPE_HPET);
+ if (!hpet) {
+ error_report("couldn't create HPET device");
+ exit(1);
+ }
+ /*
+ * For pc-piix-*, hpet's intcap is always IRQ2. For pc-q35-1.7 and
+ * earlier, use IRQ2 for compat. Otherwise, use IRQ16~23, IRQ8 and
+ * IRQ2.
+ */
+ uint8_t compat = object_property_get_uint(OBJECT(hpet),
+ HPET_INTCAP, NULL);
+ if (!compat) {
+ qdev_prop_set_uint32(hpet, HPET_INTCAP, hpet_irqs);
+ }
+ sysbus_realize_and_unref(SYS_BUS_DEVICE(hpet), &error_fatal);
+ sysbus_mmio_map(SYS_BUS_DEVICE(hpet), 0, HPET_BASE);
+
+ for (i = 0; i < GSI_NUM_PINS; i++) {
+ sysbus_connect_irq(SYS_BUS_DEVICE(hpet), i, gsi[i]);
+ }
+ pit_isa_irq = -1;
+ pit_alt_irq = qdev_get_gpio_in(hpet, HPET_LEGACY_PIT_INT);
+ rtc_irq = qdev_get_gpio_in(hpet, HPET_LEGACY_RTC_INT);
+ }
+ *rtc_state = mc146818_rtc_init(isa_bus, 2000, rtc_irq);
+
+ qemu_register_boot_set(pc_boot_set, *rtc_state);
+
+ if (!xen_enabled() && pcms->pit_enabled) {
+ if (kvm_pit_in_kernel()) {
+ pit = kvm_pit_init(isa_bus, 0x40);
+ } else {
+ pit = i8254_pit_init(isa_bus, 0x40, pit_isa_irq, pit_alt_irq);
+ }
+ if (hpet) {
+ /* connect PIT to output control line of the HPET */
+ qdev_connect_gpio_out(hpet, 0, qdev_get_gpio_in(DEVICE(pit), 0));
+ }
+ pcspk_init(pcms->pcspk, isa_bus, pit);
+ }
+
+ i8257_dma_init(isa_bus, 0);
+
+ /* Super I/O */
+ pc_superio_init(isa_bus, create_fdctrl, pcms->vmport != ON_OFF_AUTO_ON);
+}
+
+void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus)
+{
+ int i;
+
+ rom_set_order_override(FW_CFG_ORDER_OVERRIDE_NIC);
+ for (i = 0; i < nb_nics; i++) {
+ NICInfo *nd = &nd_table[i];
+ const char *model = nd->model ? nd->model : pcmc->default_nic_model;
+
+ if (g_str_equal(model, "ne2k_isa")) {
+ pc_init_ne2k_isa(isa_bus, nd);
+ } else {
+ pci_nic_init_nofail(nd, pci_bus, model, NULL);
+ }
+ }
+ rom_reset_order_override();
+}
+
+void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs)
+{
+ qemu_irq *i8259;
+
+ if (kvm_pic_in_kernel()) {
+ i8259 = kvm_i8259_init(isa_bus);
+ } else if (xen_enabled()) {
+ i8259 = xen_interrupt_controller_init();
+ } else {
+ i8259 = i8259_init(isa_bus, x86_allocate_cpu_irq());
+ }
+
+ for (size_t i = 0; i < ISA_NUM_IRQS; i++) {
+ i8259_irqs[i] = i8259[i];
+ }
+
+ g_free(i8259);
+}
+
+static void pc_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
+ Error **errp)
+{
+ const PCMachineState *pcms = PC_MACHINE(hotplug_dev);
+ const X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
+ const PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+ const MachineState *ms = MACHINE(hotplug_dev);
+ const bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
+ const uint64_t legacy_align = TARGET_PAGE_SIZE;
+ Error *local_err = NULL;
+
+ /*
+ * When -no-acpi is used with Q35 machine type, no ACPI is built,
+ * but pcms->acpi_dev is still created. Check !acpi_enabled in
+ * addition to cover this case.
+ */
+ if (!x86ms->acpi_dev || !x86_machine_is_acpi_enabled(x86ms)) {
+ error_setg(errp,
+ "memory hotplug is not enabled: missing acpi device or acpi disabled");
+ return;
+ }
+
+ if (is_nvdimm && !ms->nvdimms_state->is_enabled) {
+ error_setg(errp, "nvdimm is not enabled: missing 'nvdimm' in '-M'");
+ return;
+ }
+
+ hotplug_handler_pre_plug(x86ms->acpi_dev, dev, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+
+ pc_dimm_pre_plug(PC_DIMM(dev), MACHINE(hotplug_dev),
+ pcmc->enforce_aligned_dimm ? NULL : &legacy_align, errp);
+}
+
+static void pc_memory_plug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(hotplug_dev);
+ X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
+ MachineState *ms = MACHINE(hotplug_dev);
+ bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
+
+ pc_dimm_plug(PC_DIMM(dev), MACHINE(pcms));
+
+ if (is_nvdimm) {
+ nvdimm_plug(ms->nvdimms_state);
+ }
+
+ hotplug_handler_plug(x86ms->acpi_dev, dev, &error_abort);
+}
+
+static void pc_memory_unplug_request(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
+
+ /*
+ * When -no-acpi is used with Q35 machine type, no ACPI is built,
+ * but pcms->acpi_dev is still created. Check !acpi_enabled in
+ * addition to cover this case.
+ */
+ if (!x86ms->acpi_dev || !x86_machine_is_acpi_enabled(x86ms)) {
+ error_setg(errp,
+ "memory hotplug is not enabled: missing acpi device or acpi disabled");
+ return;
+ }
+
+ if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) {
+ error_setg(errp, "nvdimm device hot unplug is not supported yet.");
+ return;
+ }
+
+ hotplug_handler_unplug_request(x86ms->acpi_dev, dev,
+ errp);
+}
+
+static void pc_memory_unplug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(hotplug_dev);
+ X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
+ Error *local_err = NULL;
+
+ hotplug_handler_unplug(x86ms->acpi_dev, dev, &local_err);
+ if (local_err) {
+ goto out;
+ }
+
+ pc_dimm_unplug(PC_DIMM(dev), MACHINE(pcms));
+ qdev_unrealize(dev);
+ out:
+ error_propagate(errp, local_err);
+}
+
+static void pc_virtio_md_pci_pre_plug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev);
+ Error *local_err = NULL;
+
+ if (!hotplug_dev2 && dev->hotplugged) {
+ /*
+ * Without a bus hotplug handler, we cannot control the plug/unplug
+ * order. We should never reach this point when hotplugging on x86,
+ * however, better add a safety net.
+ */
+ error_setg(errp, "hotplug of virtio based memory devices not supported"
+ " on this bus.");
+ return;
+ }
+ /*
+ * First, see if we can plug this memory device at all. If that
+ * succeeds, branch off to the actual hotplug handler.
+ */
+ memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL,
+ &local_err);
+ if (!local_err && hotplug_dev2) {
+ hotplug_handler_pre_plug(hotplug_dev2, dev, &local_err);
+ }
+ error_propagate(errp, local_err);
+}
+
+static void pc_virtio_md_pci_plug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev);
+ Error *local_err = NULL;
+
+ /*
+ * Plug the memory device first and then branch off to the actual
+ * hotplug handler. If that one fails, we can easily undo the memory
+ * device bits.
+ */
+ memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
+ if (hotplug_dev2) {
+ hotplug_handler_plug(hotplug_dev2, dev, &local_err);
+ if (local_err) {
+ memory_device_unplug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
+ }
+ }
+ error_propagate(errp, local_err);
+}
+
+static void pc_virtio_md_pci_unplug_request(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ /* We don't support hot unplug of virtio based memory devices */
+ error_setg(errp, "virtio based memory devices cannot be unplugged.");
+}
+
+static void pc_virtio_md_pci_unplug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ /* We don't support hot unplug of virtio based memory devices */
+}
+
+static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+ pc_memory_pre_plug(hotplug_dev, dev, errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
+ x86_cpu_pre_plug(hotplug_dev, dev, errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
+ pc_virtio_md_pci_pre_plug(hotplug_dev, dev, errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
+ /* Declare the APIC range as the reserved MSI region */
+ char *resv_prop_str = g_strdup_printf("0xfee00000:0xfeefffff:%d",
+ VIRTIO_IOMMU_RESV_MEM_T_MSI);
+
+ object_property_set_uint(OBJECT(dev), "len-reserved-regions", 1, errp);
+ object_property_set_str(OBJECT(dev), "reserved-regions[0]",
+ resv_prop_str, errp);
+ g_free(resv_prop_str);
+ }
+
+ if (object_dynamic_cast(OBJECT(dev), TYPE_X86_IOMMU_DEVICE) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
+ PCMachineState *pcms = PC_MACHINE(hotplug_dev);
+
+ if (pcms->iommu) {
+ error_setg(errp, "QEMU does not support multiple vIOMMUs "
+ "for x86 yet.");
+ return;
+ }
+ pcms->iommu = dev;
+ }
+}
+
+static void pc_machine_device_plug_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+ pc_memory_plug(hotplug_dev, dev, errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
+ x86_cpu_plug(hotplug_dev, dev, errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
+ pc_virtio_md_pci_plug(hotplug_dev, dev, errp);
+ }
+}
+
+static void pc_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+ pc_memory_unplug_request(hotplug_dev, dev, errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
+ x86_cpu_unplug_request_cb(hotplug_dev, dev, errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
+ pc_virtio_md_pci_unplug_request(hotplug_dev, dev, errp);
+ } else {
+ error_setg(errp, "acpi: device unplug request for unsupported device"
+ " type: %s", object_get_typename(OBJECT(dev)));
+ }
+}
+
+static void pc_machine_device_unplug_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+ pc_memory_unplug(hotplug_dev, dev, errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
+ x86_cpu_unplug_cb(hotplug_dev, dev, errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
+ pc_virtio_md_pci_unplug(hotplug_dev, dev, errp);
+ } else {
+ error_setg(errp, "acpi: device unplug for unsupported device"
+ " type: %s", object_get_typename(OBJECT(dev)));
+ }
+}
+
+static HotplugHandler *pc_get_hotplug_handler(MachineState *machine,
+ DeviceState *dev)
+{
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_CPU) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_X86_IOMMU_DEVICE)) {
+ return HOTPLUG_HANDLER(machine);
+ }
+
+ return NULL;
+}
+
+static void
+pc_machine_get_device_memory_region_size(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ MachineState *ms = MACHINE(obj);
+ int64_t value = 0;
+
+ if (ms->device_memory) {
+ value = memory_region_size(&ms->device_memory->mr);
+ }
+
+ visit_type_int(v, name, &value, errp);
+}
+
+static void pc_machine_get_vmport(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+ OnOffAuto vmport = pcms->vmport;
+
+ visit_type_OnOffAuto(v, name, &vmport, errp);
+}
+
+static void pc_machine_set_vmport(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+
+ visit_type_OnOffAuto(v, name, &pcms->vmport, errp);
+}
+
+static bool pc_machine_get_smbus(Object *obj, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+
+ return pcms->smbus_enabled;
+}
+
+static void pc_machine_set_smbus(Object *obj, bool value, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+
+ pcms->smbus_enabled = value;
+}
+
+static bool pc_machine_get_sata(Object *obj, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+
+ return pcms->sata_enabled;
+}
+
+static void pc_machine_set_sata(Object *obj, bool value, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+
+ pcms->sata_enabled = value;
+}
+
+static bool pc_machine_get_pit(Object *obj, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+
+ return pcms->pit_enabled;
+}
+
+static void pc_machine_set_pit(Object *obj, bool value, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+
+ pcms->pit_enabled = value;
+}
+
+static bool pc_machine_get_hpet(Object *obj, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+
+ return pcms->hpet_enabled;
+}
+
+static void pc_machine_set_hpet(Object *obj, bool value, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+
+ pcms->hpet_enabled = value;
+}
+
+static bool pc_machine_get_default_bus_bypass_iommu(Object *obj, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+
+ return pcms->default_bus_bypass_iommu;
+}
+
+static void pc_machine_set_default_bus_bypass_iommu(Object *obj, bool value,
+ Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+
+ pcms->default_bus_bypass_iommu = value;
+}
+
+static void pc_machine_get_max_ram_below_4g(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+ uint64_t value = pcms->max_ram_below_4g;
+
+ visit_type_size(v, name, &value, errp);
+}
+
+static void pc_machine_set_max_ram_below_4g(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+ uint64_t value;
+
+ if (!visit_type_size(v, name, &value, errp)) {
+ return;
+ }
+ if (value > 4 * GiB) {
+ error_setg(errp,
+ "Machine option 'max-ram-below-4g=%"PRIu64
+ "' expects size less than or equal to 4G", value);
+ return;
+ }
+
+ if (value < 1 * MiB) {
+ warn_report("Only %" PRIu64 " bytes of RAM below the 4GiB boundary, "
+ "BIOS may not work with less than 1MiB", value);
+ }
+
+ pcms->max_ram_below_4g = value;
+}
+
+static void pc_machine_get_max_fw_size(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+ uint64_t value = pcms->max_fw_size;
+
+ visit_type_size(v, name, &value, errp);
+}
+
+static void pc_machine_set_max_fw_size(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+ Error *error = NULL;
+ uint64_t value;
+
+ visit_type_size(v, name, &value, &error);
+ if (error) {
+ error_propagate(errp, error);
+ return;
+ }
+
+ /*
+ * We don't have a theoretically justifiable exact lower bound on the base
+ * address of any flash mapping. In practice, the IO-APIC MMIO range is
+ * [0xFEE00000..0xFEE01000] -- see IO_APIC_DEFAULT_ADDRESS --, leaving free
+ * only 18MiB-4KiB below 4GiB. The default cumulative mapping is 8MiB; this
+ * setter accepts values up to 16MiB.
+ */
+ if (value > 16 * MiB) {
+ error_setg(errp,
+ "User specified max allowed firmware size %" PRIu64 " is "
+ "greater than 16MiB. If combined firmware size exceeds "
+ "16MiB the system may not boot, or experience intermittent "
+ "stability issues.",
+ value);
+ return;
+ }
+
+ pcms->max_fw_size = value;
+}
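+
+/*
+ * A rough sketch of the arithmetic behind the cap above, assuming the flash
+ * is mapped directly below 4GiB: the mapping must stay above the IO-APIC
+ * page, so the available window is 0x100000000 - 0xFEE01000 = 0x11FF000
+ * bytes, i.e. 18MiB - 4KiB. The 16MiB limit enforced above therefore leaves
+ * roughly 2MiB of headroom, and the 8MiB default set in pc_machine_initfn()
+ * is well within that.
+ */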
+
+
+static void pc_machine_initfn(Object *obj)
+{
+ PCMachineState *pcms = PC_MACHINE(obj);
+
+#ifdef CONFIG_VMPORT
+ pcms->vmport = ON_OFF_AUTO_AUTO;
+#else
+ pcms->vmport = ON_OFF_AUTO_OFF;
+#endif /* CONFIG_VMPORT */
+ pcms->max_ram_below_4g = 0; /* use default */
+ /* acpi build is enabled by default if machine supports it */
+ pcms->acpi_build_enabled = PC_MACHINE_GET_CLASS(pcms)->has_acpi_build;
+ pcms->smbus_enabled = true;
+ pcms->sata_enabled = true;
+ pcms->pit_enabled = true;
+ pcms->max_fw_size = 8 * MiB;
+#ifdef CONFIG_HPET
+ pcms->hpet_enabled = true;
+#endif
+ pcms->default_bus_bypass_iommu = false;
+
+ pc_system_flash_create(pcms);
+ pcms->pcspk = isa_new(TYPE_PC_SPEAKER);
+ object_property_add_alias(OBJECT(pcms), "pcspk-audiodev",
+ OBJECT(pcms->pcspk), "audiodev");
+}
+
+static void pc_machine_reset(MachineState *machine)
+{
+ CPUState *cs;
+ X86CPU *cpu;
+
+ qemu_devices_reset();
+
+ /* Reset APIC after devices have been reset to cancel
+ * any changes that qemu_devices_reset() might have done.
+ */
+ CPU_FOREACH(cs) {
+ cpu = X86_CPU(cs);
+
+ if (cpu->apic_state) {
+ device_legacy_reset(cpu->apic_state);
+ }
+ }
+}
+
+static void pc_machine_wakeup(MachineState *machine)
+{
+ cpu_synchronize_all_states();
+ pc_machine_reset(machine);
+ cpu_synchronize_all_post_reset();
+}
+
+static bool pc_hotplug_allowed(MachineState *ms, DeviceState *dev, Error **errp)
+{
+ X86IOMMUState *iommu = x86_iommu_get_default();
+ IntelIOMMUState *intel_iommu;
+
+ if (iommu &&
+ object_dynamic_cast((Object *)iommu, TYPE_INTEL_IOMMU_DEVICE) &&
+ object_dynamic_cast((Object *)dev, "vfio-pci")) {
+ intel_iommu = INTEL_IOMMU_DEVICE(iommu);
+ if (!intel_iommu->caching_mode) {
+ error_setg(errp, "Device assignment is not allowed without "
+ "enabling caching-mode=on for Intel IOMMU.");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void pc_machine_class_init(ObjectClass *oc, void *data)
+{
+ MachineClass *mc = MACHINE_CLASS(oc);
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(oc);
+ HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
+
+ pcmc->pci_enabled = true;
+ pcmc->has_acpi_build = true;
+ pcmc->rsdp_in_ram = true;
+ pcmc->smbios_defaults = true;
+ pcmc->smbios_uuid_encoded = true;
+ pcmc->gigabyte_align = true;
+ pcmc->has_reserved_memory = true;
+ pcmc->kvmclock_enabled = true;
+ pcmc->enforce_aligned_dimm = true;
+ /* BIOS ACPI tables: 128K. Other BIOS data structures: less than 4K reported
+ * to be used at the moment; 32K should be enough for a while. */
+ pcmc->acpi_data_size = 0x20000 + 0x8000;
+ pcmc->pvh_enabled = true;
+ pcmc->kvmclock_create_always = true;
+ assert(!mc->get_hotplug_handler);
+ mc->get_hotplug_handler = pc_get_hotplug_handler;
+ mc->hotplug_allowed = pc_hotplug_allowed;
+ mc->cpu_index_to_instance_props = x86_cpu_index_to_props;
+ mc->get_default_cpu_node_id = x86_get_default_cpu_node_id;
+ mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids;
+ mc->auto_enable_numa_with_memhp = true;
+ mc->auto_enable_numa_with_memdev = true;
+ mc->has_hotpluggable_cpus = true;
+ mc->default_boot_order = "cad";
+ mc->block_default_type = IF_IDE;
+ mc->max_cpus = 255;
+ mc->reset = pc_machine_reset;
+ mc->wakeup = pc_machine_wakeup;
+ hc->pre_plug = pc_machine_device_pre_plug_cb;
+ hc->plug = pc_machine_device_plug_cb;
+ hc->unplug_request = pc_machine_device_unplug_request_cb;
+ hc->unplug = pc_machine_device_unplug_cb;
+ mc->default_cpu_type = TARGET_DEFAULT_CPU_TYPE;
+ mc->nvdimm_supported = true;
+ mc->smp_props.dies_supported = true;
+ mc->default_ram_id = "pc.ram";
+
+ object_class_property_add(oc, PC_MACHINE_MAX_RAM_BELOW_4G, "size",
+ pc_machine_get_max_ram_below_4g, pc_machine_set_max_ram_below_4g,
+ NULL, NULL);
+ object_class_property_set_description(oc, PC_MACHINE_MAX_RAM_BELOW_4G,
+ "Maximum ram below the 4G boundary (32bit boundary)");
+
+ object_class_property_add(oc, PC_MACHINE_DEVMEM_REGION_SIZE, "int",
+ pc_machine_get_device_memory_region_size, NULL,
+ NULL, NULL);
+
+ object_class_property_add(oc, PC_MACHINE_VMPORT, "OnOffAuto",
+ pc_machine_get_vmport, pc_machine_set_vmport,
+ NULL, NULL);
+ object_class_property_set_description(oc, PC_MACHINE_VMPORT,
+ "Enable vmport (pc & q35)");
+
+ object_class_property_add_bool(oc, PC_MACHINE_SMBUS,
+ pc_machine_get_smbus, pc_machine_set_smbus);
+
+ object_class_property_add_bool(oc, PC_MACHINE_SATA,
+ pc_machine_get_sata, pc_machine_set_sata);
+
+ object_class_property_add_bool(oc, PC_MACHINE_PIT,
+ pc_machine_get_pit, pc_machine_set_pit);
+
+ object_class_property_add_bool(oc, "hpet",
+ pc_machine_get_hpet, pc_machine_set_hpet);
+
+ object_class_property_add_bool(oc, "default-bus-bypass-iommu",
+ pc_machine_get_default_bus_bypass_iommu,
+ pc_machine_set_default_bus_bypass_iommu);
+
+ object_class_property_add(oc, PC_MACHINE_MAX_FW_SIZE, "size",
+ pc_machine_get_max_fw_size, pc_machine_set_max_fw_size,
+ NULL, NULL);
+ object_class_property_set_description(oc, PC_MACHINE_MAX_FW_SIZE,
+ "Maximum combined firmware size");
+}
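+
+/*
+ * For illustration only: the properties registered above surface as regular
+ * "-machine" options. A plausible invocation (not taken from this tree)
+ * would be
+ *
+ *   qemu-system-x86_64 -machine pc,vmport=off,hpet=off,max-ram-below-4g=2G -m 4G
+ *
+ * which routes the values through pc_machine_set_vmport(),
+ * pc_machine_set_hpet() and pc_machine_set_max_ram_below_4g() above.
+ */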
+
+static const TypeInfo pc_machine_info = {
+ .name = TYPE_PC_MACHINE,
+ .parent = TYPE_X86_MACHINE,
+ .abstract = true,
+ .instance_size = sizeof(PCMachineState),
+ .instance_init = pc_machine_initfn,
+ .class_size = sizeof(PCMachineClass),
+ .class_init = pc_machine_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_HOTPLUG_HANDLER },
+ { }
+ },
+};
+
+static void pc_machine_register_types(void)
+{
+ type_register_static(&pc_machine_info);
+}
+
+type_init(pc_machine_register_types)
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
new file mode 100644
index 000000000..223dd3e05
--- /dev/null
+++ b/hw/i386/pc_piix.c
@@ -0,0 +1,951 @@
+/*
+ * QEMU PC System Emulator
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include CONFIG_DEVICES
+
+#include "qemu/units.h"
+#include "hw/loader.h"
+#include "hw/i386/x86.h"
+#include "hw/i386/pc.h"
+#include "hw/i386/apic.h"
+#include "hw/pci-host/i440fx.h"
+#include "hw/southbridge/piix.h"
+#include "hw/display/ramfb.h"
+#include "hw/firmware/smbios.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_ids.h"
+#include "hw/usb.h"
+#include "net/net.h"
+#include "hw/ide/pci.h"
+#include "hw/irq.h"
+#include "sysemu/kvm.h"
+#include "hw/kvm/clock.h"
+#include "hw/sysbus.h"
+#include "hw/i2c/smbus_eeprom.h"
+#include "hw/xen/xen-x86.h"
+#include "exec/memory.h"
+#include "hw/acpi/acpi.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "sysemu/xen.h"
+#ifdef CONFIG_XEN
+#include <xen/hvm/hvm_info_table.h>
+#include "hw/xen/xen_pt.h"
+#endif
+#include "migration/global_state.h"
+#include "migration/misc.h"
+#include "sysemu/numa.h"
+#include "hw/hyperv/vmbus-bridge.h"
+#include "hw/mem/nvdimm.h"
+#include "hw/i386/acpi-build.h"
+#include "kvm/kvm-cpu.h"
+
+#define MAX_IDE_BUS 2
+
+#ifdef CONFIG_IDE_ISA
+static const int ide_iobase[MAX_IDE_BUS] = { 0x1f0, 0x170 };
+static const int ide_iobase2[MAX_IDE_BUS] = { 0x3f6, 0x376 };
+static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
+#endif
+
+/* PC hardware initialisation */
+static void pc_init1(MachineState *machine,
+ const char *host_type, const char *pci_type)
+{
+ PCMachineState *pcms = PC_MACHINE(machine);
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+ X86MachineState *x86ms = X86_MACHINE(machine);
+ MemoryRegion *system_memory = get_system_memory();
+ MemoryRegion *system_io = get_system_io();
+ PCIBus *pci_bus;
+ ISABus *isa_bus;
+ PCII440FXState *i440fx_state;
+ int piix3_devfn = -1;
+ qemu_irq smi_irq;
+ GSIState *gsi_state;
+ BusState *idebus[MAX_IDE_BUS];
+ ISADevice *rtc_state;
+ MemoryRegion *ram_memory;
+ MemoryRegion *pci_memory;
+ MemoryRegion *rom_memory;
+ ram_addr_t lowmem;
+
+ /*
+ * Calculate ram split, for memory below and above 4G. It's a bit
+ * complicated for backward compatibility reasons ...
+ *
+ * - Traditional split is 3.5G (lowmem = 0xe0000000). This is the
+ * default value for max_ram_below_4g now.
+ *
+ * - Then, to gigabyte align the memory, we move the split to 3G
+ * (lowmem = 0xc0000000). But only in case we have to split in
+ * the first place, i.e. ram_size is larger than (traditional)
+ * lowmem. And for new machine types (gigabyte_align = true)
+ * only, for live migration compatibility reasons.
+ *
+ * - Next the max-ram-below-4g option was added, which allows lowmem to
+ * be reduced to a smaller value, making room for a larger PCI I/O
+ * window below 4G. qemu doesn't enforce gigabyte alignment here,
+ * but prints a warning.
+ *
+ * - Finally max-ram-below-4g got updated to also allow raising lowmem,
+ * so legacy non-PAE guests can get as much memory as possible in
+ * the 32bit address space below 4G.
+ *
+ * - Note that Xen has its own ram setup code in xen_ram_init(),
+ * called via xen_hvm_init_pc().
+ *
+ * Examples:
+ * qemu -M pc-1.7 -m 4G (old default) -> 3584M low, 512M high
+ * qemu -M pc -m 4G (new default) -> 3072M low, 1024M high
+ * qemu -M pc,max-ram-below-4g=2G -m 4G -> 2048M low, 2048M high
+ * qemu -M pc,max-ram-below-4g=4G -m 3968M -> 3968M low (=4G-128M)
+ */
+ if (xen_enabled()) {
+ xen_hvm_init_pc(pcms, &ram_memory);
+ } else {
+ if (!pcms->max_ram_below_4g) {
+ pcms->max_ram_below_4g = 0xe0000000; /* default: 3.5G */
+ }
+ lowmem = pcms->max_ram_below_4g;
+ if (machine->ram_size >= pcms->max_ram_below_4g) {
+ if (pcmc->gigabyte_align) {
+ if (lowmem > 0xc0000000) {
+ lowmem = 0xc0000000;
+ }
+ if (lowmem & (1 * GiB - 1)) {
+ warn_report("Large machine and max_ram_below_4g "
+ "(%" PRIu64 ") not a multiple of 1G; "
+ "possible bad performance.",
+ pcms->max_ram_below_4g);
+ }
+ }
+ }
+
+ if (machine->ram_size >= lowmem) {
+ x86ms->above_4g_mem_size = machine->ram_size - lowmem;
+ x86ms->below_4g_mem_size = lowmem;
+ } else {
+ x86ms->above_4g_mem_size = 0;
+ x86ms->below_4g_mem_size = machine->ram_size;
+ }
+ }
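+
+ /*
+ * A worked example of the split above, matching the "new default" case in
+ * the comment: with -m 4G and gigabyte_align, lowmem starts at the
+ * 0xe0000000 default, is clamped to 0xc0000000 (3G) because ram_size is at
+ * least max_ram_below_4g, so below_4g_mem_size ends up as 3072M and
+ * above_4g_mem_size as 1024M.
+ */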
+
+ pc_machine_init_sgx_epc(pcms);
+ x86_cpus_init(x86ms, pcmc->default_cpu_version);
+
+ if (pcmc->kvmclock_enabled) {
+ kvmclock_create(pcmc->kvmclock_create_always);
+ }
+
+ if (pcmc->pci_enabled) {
+ pci_memory = g_new(MemoryRegion, 1);
+ memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
+ rom_memory = pci_memory;
+ } else {
+ pci_memory = NULL;
+ rom_memory = system_memory;
+ }
+
+ pc_guest_info_init(pcms);
+
+ if (pcmc->smbios_defaults) {
+ MachineClass *mc = MACHINE_GET_CLASS(machine);
+ /* These values are guest ABI, do not change */
+ smbios_set_defaults("QEMU", "Standard PC (i440FX + PIIX, 1996)",
+ mc->name, pcmc->smbios_legacy_mode,
+ pcmc->smbios_uuid_encoded,
+ SMBIOS_ENTRY_POINT_21);
+ }
+
+ /* allocate ram and load rom/bios */
+ if (!xen_enabled()) {
+ pc_memory_init(pcms, system_memory,
+ rom_memory, &ram_memory);
+ } else {
+ pc_system_flash_cleanup_unused(pcms);
+ if (machine->kernel_filename != NULL) {
+ /* For xen HVM direct kernel boot, load linux here */
+ xen_load_linux(pcms);
+ }
+ }
+
+ gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled);
+
+ if (pcmc->pci_enabled) {
+ PIIX3State *piix3;
+
+ pci_bus = i440fx_init(host_type,
+ pci_type,
+ &i440fx_state,
+ system_memory, system_io, machine->ram_size,
+ x86ms->below_4g_mem_size,
+ x86ms->above_4g_mem_size,
+ pci_memory, ram_memory);
+ pcms->bus = pci_bus;
+
+ piix3 = piix3_create(pci_bus, &isa_bus);
+ piix3->pic = x86ms->gsi;
+ piix3_devfn = piix3->dev.devfn;
+ } else {
+ pci_bus = NULL;
+ i440fx_state = NULL;
+ isa_bus = isa_bus_new(NULL, get_system_memory(), system_io,
+ &error_abort);
+ pcms->hpet_enabled = false;
+ }
+ isa_bus_irqs(isa_bus, x86ms->gsi);
+
+ pc_i8259_create(isa_bus, gsi_state->i8259_irq);
+
+ if (pcmc->pci_enabled) {
+ ioapic_init_gsi(gsi_state, "i440fx");
+ }
+
+ if (tcg_enabled()) {
+ x86_register_ferr_irq(x86ms->gsi[13]);
+ }
+
+ pc_vga_init(isa_bus, pcmc->pci_enabled ? pci_bus : NULL);
+
+ assert(pcms->vmport != ON_OFF_AUTO__MAX);
+ if (pcms->vmport == ON_OFF_AUTO_AUTO) {
+ pcms->vmport = xen_enabled() ? ON_OFF_AUTO_OFF : ON_OFF_AUTO_ON;
+ }
+
+ /* init basic PC hardware */
+ pc_basic_device_init(pcms, isa_bus, x86ms->gsi, &rtc_state, true,
+ 0x4);
+
+ pc_nic_init(pcmc, isa_bus, pci_bus);
+
+ if (pcmc->pci_enabled) {
+ PCIDevice *dev;
+
+ dev = pci_create_simple(pci_bus, piix3_devfn + 1,
+ xen_enabled() ? "piix3-ide-xen" : "piix3-ide");
+ pci_ide_create_devs(dev);
+ idebus[0] = qdev_get_child_bus(&dev->qdev, "ide.0");
+ idebus[1] = qdev_get_child_bus(&dev->qdev, "ide.1");
+ pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state);
+ }
+#ifdef CONFIG_IDE_ISA
+ else {
+ DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS];
+ int i;
+
+ ide_drive_get(hd, ARRAY_SIZE(hd));
+ for (i = 0; i < MAX_IDE_BUS; i++) {
+ ISADevice *dev;
+ char busname[] = "ide.0";
+ dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i],
+ ide_irq[i],
+ hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]);
+ /*
+ * The ide bus name is ide.0 for the first bus and ide.1 for the
+ * second one.
+ */
+ busname[4] = '0' + i;
+ idebus[i] = qdev_get_child_bus(DEVICE(dev), busname);
+ }
+ pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state);
+ }
+#endif
+
+ if (pcmc->pci_enabled && machine_usb(machine)) {
+ pci_create_simple(pci_bus, piix3_devfn + 2, "piix3-usb-uhci");
+ }
+
+ if (pcmc->pci_enabled && x86_machine_is_acpi_enabled(X86_MACHINE(pcms))) {
+ DeviceState *piix4_pm;
+
+ smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0);
+ /* TODO: Populate SPD eeprom data. */
+ pcms->smbus = piix4_pm_init(pci_bus, piix3_devfn + 3, 0xb100,
+ x86ms->gsi[9], smi_irq,
+ x86_machine_is_smm_enabled(x86ms),
+ &piix4_pm);
+ smbus_eeprom_init(pcms->smbus, 8, NULL, 0);
+
+ object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
+ TYPE_HOTPLUG_HANDLER,
+ (Object **)&x86ms->acpi_dev,
+ object_property_allow_set_link,
+ OBJ_PROP_LINK_STRONG);
+ object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
+ OBJECT(piix4_pm), &error_abort);
+ }
+
+ if (machine->nvdimms_state->is_enabled) {
+ nvdimm_init_acpi_state(machine->nvdimms_state, system_io,
+ x86_nvdimm_acpi_dsmio,
+ x86ms->fw_cfg, OBJECT(pcms));
+ }
+}
+
+/* Looking for a pc_compat_2_4() function? It doesn't exist.
+ * pc_compat_*() functions that run at machine-init time and
+ * change global QEMU state are deprecated. Please don't create
+ * one, and implement any pc-*-2.4 (and newer) compat code in
+ * hw_compat_*, pc_compat_*, or pc_*_machine_options().
+ */
+
+static void pc_compat_2_3_fn(MachineState *machine)
+{
+ X86MachineState *x86ms = X86_MACHINE(machine);
+ if (kvm_enabled()) {
+ x86ms->smm = ON_OFF_AUTO_OFF;
+ }
+}
+
+static void pc_compat_2_2_fn(MachineState *machine)
+{
+ pc_compat_2_3_fn(machine);
+}
+
+static void pc_compat_2_1_fn(MachineState *machine)
+{
+ pc_compat_2_2_fn(machine);
+ x86_cpu_change_kvm_default("svm", NULL);
+}
+
+static void pc_compat_2_0_fn(MachineState *machine)
+{
+ pc_compat_2_1_fn(machine);
+}
+
+static void pc_compat_1_7_fn(MachineState *machine)
+{
+ pc_compat_2_0_fn(machine);
+ x86_cpu_change_kvm_default("x2apic", NULL);
+}
+
+static void pc_compat_1_6_fn(MachineState *machine)
+{
+ pc_compat_1_7_fn(machine);
+}
+
+static void pc_compat_1_5_fn(MachineState *machine)
+{
+ pc_compat_1_6_fn(machine);
+}
+
+static void pc_compat_1_4_fn(MachineState *machine)
+{
+ pc_compat_1_5_fn(machine);
+}
+
+static void pc_init_isa(MachineState *machine)
+{
+ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, TYPE_I440FX_PCI_DEVICE);
+}
+
+#ifdef CONFIG_XEN
+static void pc_xen_hvm_init_pci(MachineState *machine)
+{
+ const char *pci_type = xen_igd_gfx_pt_enabled() ?
+ TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE : TYPE_I440FX_PCI_DEVICE;
+
+ pc_init1(machine,
+ TYPE_I440FX_PCI_HOST_BRIDGE,
+ pci_type);
+}
+
+static void pc_xen_hvm_init(MachineState *machine)
+{
+ PCMachineState *pcms = PC_MACHINE(machine);
+
+ if (!xen_enabled()) {
+ error_report("xenfv machine requires the xen accelerator");
+ exit(1);
+ }
+
+ pc_xen_hvm_init_pci(machine);
+ pci_create_simple(pcms->bus, -1, "xen-platform");
+}
+#endif
+
+#define DEFINE_I440FX_MACHINE(suffix, name, compatfn, optionfn) \
+ static void pc_init_##suffix(MachineState *machine) \
+ { \
+ void (*compat)(MachineState *m) = (compatfn); \
+ if (compat) { \
+ compat(machine); \
+ } \
+ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \
+ TYPE_I440FX_PCI_DEVICE); \
+ } \
+ DEFINE_PC_MACHINE(suffix, name, pc_init_##suffix, optionfn)
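+
+/*
+ * For instance, DEFINE_I440FX_MACHINE(v6_2, "pc-i440fx-6.2", NULL,
+ * pc_i440fx_6_2_machine_options) below expands to a pc_init_v6_2() wrapper
+ * around pc_init1() (with no compat callback) and registers the machine
+ * type via DEFINE_PC_MACHINE().
+ */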
+
+static void pc_i440fx_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+ pcmc->default_nic_model = "e1000";
+ pcmc->pci_root_uid = 0;
+
+ m->family = "pc_piix";
+ m->desc = "Standard PC (i440FX + PIIX, 1996)";
+ m->default_machine_opts = "firmware=bios-256k.bin";
+ m->default_display = "std";
+ machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE);
+ machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE);
+}
+
+static void pc_i440fx_6_2_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+ pc_i440fx_machine_options(m);
+ m->alias = "pc";
+ m->is_default = true;
+ pcmc->default_cpu_version = 1;
+}
+
+DEFINE_I440FX_MACHINE(v6_2, "pc-i440fx-6.2", NULL,
+ pc_i440fx_6_2_machine_options);
+
+static void pc_i440fx_6_1_machine_options(MachineClass *m)
+{
+ pc_i440fx_6_2_machine_options(m);
+ m->alias = NULL;
+ m->is_default = false;
+ compat_props_add(m->compat_props, hw_compat_6_1, hw_compat_6_1_len);
+ compat_props_add(m->compat_props, pc_compat_6_1, pc_compat_6_1_len);
+ m->smp_props.prefer_sockets = true;
+}
+
+DEFINE_I440FX_MACHINE(v6_1, "pc-i440fx-6.1", NULL,
+ pc_i440fx_6_1_machine_options);
+
+static void pc_i440fx_6_0_machine_options(MachineClass *m)
+{
+ pc_i440fx_6_1_machine_options(m);
+ m->alias = NULL;
+ m->is_default = false;
+ compat_props_add(m->compat_props, hw_compat_6_0, hw_compat_6_0_len);
+ compat_props_add(m->compat_props, pc_compat_6_0, pc_compat_6_0_len);
+}
+
+DEFINE_I440FX_MACHINE(v6_0, "pc-i440fx-6.0", NULL,
+ pc_i440fx_6_0_machine_options);
+
+static void pc_i440fx_5_2_machine_options(MachineClass *m)
+{
+ pc_i440fx_6_0_machine_options(m);
+ m->alias = NULL;
+ m->is_default = false;
+ compat_props_add(m->compat_props, hw_compat_5_2, hw_compat_5_2_len);
+ compat_props_add(m->compat_props, pc_compat_5_2, pc_compat_5_2_len);
+}
+
+DEFINE_I440FX_MACHINE(v5_2, "pc-i440fx-5.2", NULL,
+ pc_i440fx_5_2_machine_options);
+
+static void pc_i440fx_5_1_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_i440fx_5_2_machine_options(m);
+ m->alias = NULL;
+ m->is_default = false;
+ compat_props_add(m->compat_props, hw_compat_5_1, hw_compat_5_1_len);
+ compat_props_add(m->compat_props, pc_compat_5_1, pc_compat_5_1_len);
+ pcmc->kvmclock_create_always = false;
+ pcmc->pci_root_uid = 1;
+}
+
+DEFINE_I440FX_MACHINE(v5_1, "pc-i440fx-5.1", NULL,
+ pc_i440fx_5_1_machine_options);
+
+static void pc_i440fx_5_0_machine_options(MachineClass *m)
+{
+ pc_i440fx_5_1_machine_options(m);
+ m->alias = NULL;
+ m->is_default = false;
+ m->numa_mem_supported = true;
+ compat_props_add(m->compat_props, hw_compat_5_0, hw_compat_5_0_len);
+ compat_props_add(m->compat_props, pc_compat_5_0, pc_compat_5_0_len);
+ m->auto_enable_numa_with_memdev = false;
+}
+
+DEFINE_I440FX_MACHINE(v5_0, "pc-i440fx-5.0", NULL,
+ pc_i440fx_5_0_machine_options);
+
+static void pc_i440fx_4_2_machine_options(MachineClass *m)
+{
+ pc_i440fx_5_0_machine_options(m);
+ m->alias = NULL;
+ m->is_default = false;
+ compat_props_add(m->compat_props, hw_compat_4_2, hw_compat_4_2_len);
+ compat_props_add(m->compat_props, pc_compat_4_2, pc_compat_4_2_len);
+}
+
+DEFINE_I440FX_MACHINE(v4_2, "pc-i440fx-4.2", NULL,
+ pc_i440fx_4_2_machine_options);
+
+static void pc_i440fx_4_1_machine_options(MachineClass *m)
+{
+ pc_i440fx_4_2_machine_options(m);
+ m->alias = NULL;
+ m->is_default = false;
+ compat_props_add(m->compat_props, hw_compat_4_1, hw_compat_4_1_len);
+ compat_props_add(m->compat_props, pc_compat_4_1, pc_compat_4_1_len);
+}
+
+DEFINE_I440FX_MACHINE(v4_1, "pc-i440fx-4.1", NULL,
+ pc_i440fx_4_1_machine_options);
+
+static void pc_i440fx_4_0_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+ pc_i440fx_4_1_machine_options(m);
+ m->alias = NULL;
+ m->is_default = false;
+ pcmc->default_cpu_version = CPU_VERSION_LEGACY;
+ compat_props_add(m->compat_props, hw_compat_4_0, hw_compat_4_0_len);
+ compat_props_add(m->compat_props, pc_compat_4_0, pc_compat_4_0_len);
+}
+
+DEFINE_I440FX_MACHINE(v4_0, "pc-i440fx-4.0", NULL,
+ pc_i440fx_4_0_machine_options);
+
+static void pc_i440fx_3_1_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_i440fx_4_0_machine_options(m);
+ m->is_default = false;
+ pcmc->do_not_add_smb_acpi = true;
+ m->smbus_no_migration_support = true;
+ m->alias = NULL;
+ pcmc->pvh_enabled = false;
+ compat_props_add(m->compat_props, hw_compat_3_1, hw_compat_3_1_len);
+ compat_props_add(m->compat_props, pc_compat_3_1, pc_compat_3_1_len);
+}
+
+DEFINE_I440FX_MACHINE(v3_1, "pc-i440fx-3.1", NULL,
+ pc_i440fx_3_1_machine_options);
+
+static void pc_i440fx_3_0_machine_options(MachineClass *m)
+{
+ pc_i440fx_3_1_machine_options(m);
+ compat_props_add(m->compat_props, hw_compat_3_0, hw_compat_3_0_len);
+ compat_props_add(m->compat_props, pc_compat_3_0, pc_compat_3_0_len);
+}
+
+DEFINE_I440FX_MACHINE(v3_0, "pc-i440fx-3.0", NULL,
+ pc_i440fx_3_0_machine_options);
+
+static void pc_i440fx_2_12_machine_options(MachineClass *m)
+{
+ pc_i440fx_3_0_machine_options(m);
+ compat_props_add(m->compat_props, hw_compat_2_12, hw_compat_2_12_len);
+ compat_props_add(m->compat_props, pc_compat_2_12, pc_compat_2_12_len);
+}
+
+DEFINE_I440FX_MACHINE(v2_12, "pc-i440fx-2.12", NULL,
+ pc_i440fx_2_12_machine_options);
+
+static void pc_i440fx_2_11_machine_options(MachineClass *m)
+{
+ pc_i440fx_2_12_machine_options(m);
+ compat_props_add(m->compat_props, hw_compat_2_11, hw_compat_2_11_len);
+ compat_props_add(m->compat_props, pc_compat_2_11, pc_compat_2_11_len);
+}
+
+DEFINE_I440FX_MACHINE(v2_11, "pc-i440fx-2.11", NULL,
+ pc_i440fx_2_11_machine_options);
+
+static void pc_i440fx_2_10_machine_options(MachineClass *m)
+{
+ pc_i440fx_2_11_machine_options(m);
+ compat_props_add(m->compat_props, hw_compat_2_10, hw_compat_2_10_len);
+ compat_props_add(m->compat_props, pc_compat_2_10, pc_compat_2_10_len);
+ m->auto_enable_numa_with_memhp = false;
+}
+
+DEFINE_I440FX_MACHINE(v2_10, "pc-i440fx-2.10", NULL,
+ pc_i440fx_2_10_machine_options);
+
+static void pc_i440fx_2_9_machine_options(MachineClass *m)
+{
+ pc_i440fx_2_10_machine_options(m);
+ compat_props_add(m->compat_props, hw_compat_2_9, hw_compat_2_9_len);
+ compat_props_add(m->compat_props, pc_compat_2_9, pc_compat_2_9_len);
+}
+
+DEFINE_I440FX_MACHINE(v2_9, "pc-i440fx-2.9", NULL,
+ pc_i440fx_2_9_machine_options);
+
+static void pc_i440fx_2_8_machine_options(MachineClass *m)
+{
+ pc_i440fx_2_9_machine_options(m);
+ compat_props_add(m->compat_props, hw_compat_2_8, hw_compat_2_8_len);
+ compat_props_add(m->compat_props, pc_compat_2_8, pc_compat_2_8_len);
+}
+
+DEFINE_I440FX_MACHINE(v2_8, "pc-i440fx-2.8", NULL,
+ pc_i440fx_2_8_machine_options);
+
+static void pc_i440fx_2_7_machine_options(MachineClass *m)
+{
+ pc_i440fx_2_8_machine_options(m);
+ compat_props_add(m->compat_props, hw_compat_2_7, hw_compat_2_7_len);
+ compat_props_add(m->compat_props, pc_compat_2_7, pc_compat_2_7_len);
+}
+
+DEFINE_I440FX_MACHINE(v2_7, "pc-i440fx-2.7", NULL,
+ pc_i440fx_2_7_machine_options);
+
+static void pc_i440fx_2_6_machine_options(MachineClass *m)
+{
+ X86MachineClass *x86mc = X86_MACHINE_CLASS(m);
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_i440fx_2_7_machine_options(m);
+ pcmc->legacy_cpu_hotplug = true;
+ x86mc->fwcfg_dma_enabled = false;
+ compat_props_add(m->compat_props, hw_compat_2_6, hw_compat_2_6_len);
+ compat_props_add(m->compat_props, pc_compat_2_6, pc_compat_2_6_len);
+}
+
+DEFINE_I440FX_MACHINE(v2_6, "pc-i440fx-2.6", NULL,
+ pc_i440fx_2_6_machine_options);
+
+static void pc_i440fx_2_5_machine_options(MachineClass *m)
+{
+ X86MachineClass *x86mc = X86_MACHINE_CLASS(m);
+
+ pc_i440fx_2_6_machine_options(m);
+ x86mc->save_tsc_khz = false;
+ m->legacy_fw_cfg_order = 1;
+ compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len);
+ compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len);
+}
+
+DEFINE_I440FX_MACHINE(v2_5, "pc-i440fx-2.5", NULL,
+ pc_i440fx_2_5_machine_options);
+
+static void pc_i440fx_2_4_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_i440fx_2_5_machine_options(m);
+ m->hw_version = "2.4.0";
+ pcmc->broken_reserved_end = true;
+ compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len);
+ compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len);
+}
+
+DEFINE_I440FX_MACHINE(v2_4, "pc-i440fx-2.4", NULL,
+ pc_i440fx_2_4_machine_options)
+
+static void pc_i440fx_2_3_machine_options(MachineClass *m)
+{
+ pc_i440fx_2_4_machine_options(m);
+ m->hw_version = "2.3.0";
+ compat_props_add(m->compat_props, hw_compat_2_3, hw_compat_2_3_len);
+ compat_props_add(m->compat_props, pc_compat_2_3, pc_compat_2_3_len);
+}
+
+DEFINE_I440FX_MACHINE(v2_3, "pc-i440fx-2.3", pc_compat_2_3_fn,
+ pc_i440fx_2_3_machine_options);
+
+static void pc_i440fx_2_2_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_i440fx_2_3_machine_options(m);
+ m->hw_version = "2.2.0";
+ m->default_machine_opts = "firmware=bios-256k.bin,suppress-vmdesc=on";
+ compat_props_add(m->compat_props, hw_compat_2_2, hw_compat_2_2_len);
+ compat_props_add(m->compat_props, pc_compat_2_2, pc_compat_2_2_len);
+ pcmc->rsdp_in_ram = false;
+}
+
+DEFINE_I440FX_MACHINE(v2_2, "pc-i440fx-2.2", pc_compat_2_2_fn,
+ pc_i440fx_2_2_machine_options);
+
+static void pc_i440fx_2_1_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_i440fx_2_2_machine_options(m);
+ m->hw_version = "2.1.0";
+ m->default_display = NULL;
+ compat_props_add(m->compat_props, hw_compat_2_1, hw_compat_2_1_len);
+ compat_props_add(m->compat_props, pc_compat_2_1, pc_compat_2_1_len);
+ pcmc->smbios_uuid_encoded = false;
+ pcmc->enforce_aligned_dimm = false;
+}
+
+DEFINE_I440FX_MACHINE(v2_1, "pc-i440fx-2.1", pc_compat_2_1_fn,
+ pc_i440fx_2_1_machine_options);
+
+static void pc_i440fx_2_0_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_i440fx_2_1_machine_options(m);
+ m->hw_version = "2.0.0";
+ compat_props_add(m->compat_props, pc_compat_2_0, pc_compat_2_0_len);
+ pcmc->smbios_legacy_mode = true;
+ pcmc->has_reserved_memory = false;
+ /* This value depends on the actual DSDT and SSDT compiled into
+ * the source QEMU; unfortunately it depends on the binary and
+ * not on the machine type, so we cannot make pc-i440fx-1.7 work on
+ * both QEMU 1.7 and QEMU 2.0.
+ *
+ * Large variations cause migration to fail for more than one
+ * consecutive value of the "-smp" maxcpus option.
+ *
+ * For small variations of the kind caused by different iasl versions,
+ * the 4k rounding usually leaves slack. However, there could be still
+ * one or two values that break. For QEMU 1.7 and QEMU 2.0 the
+ * slack is only ~10 bytes before one "-smp maxcpus" value breaks!
+ *
+ * 6652 is valid for QEMU 2.0; the right value for pc-i440fx-1.7 on
+ * QEMU 1.7 is 6414. For RHEL/CentOS 7.0 it is 6418.
+ */
+ pcmc->legacy_acpi_table_size = 6652;
+ pcmc->acpi_data_size = 0x10000;
+}
+
+DEFINE_I440FX_MACHINE(v2_0, "pc-i440fx-2.0", pc_compat_2_0_fn,
+ pc_i440fx_2_0_machine_options);
+
+static void pc_i440fx_1_7_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_i440fx_2_0_machine_options(m);
+ m->hw_version = "1.7.0";
+ m->default_machine_opts = NULL;
+ m->option_rom_has_mr = true;
+ compat_props_add(m->compat_props, pc_compat_1_7, pc_compat_1_7_len);
+ pcmc->smbios_defaults = false;
+ pcmc->gigabyte_align = false;
+ pcmc->legacy_acpi_table_size = 6414;
+}
+
+DEFINE_I440FX_MACHINE(v1_7, "pc-i440fx-1.7", pc_compat_1_7_fn,
+ pc_i440fx_1_7_machine_options);
+
+static void pc_i440fx_1_6_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_i440fx_1_7_machine_options(m);
+ m->hw_version = "1.6.0";
+ m->rom_file_has_mr = false;
+ compat_props_add(m->compat_props, pc_compat_1_6, pc_compat_1_6_len);
+ pcmc->has_acpi_build = false;
+}
+
+DEFINE_I440FX_MACHINE(v1_6, "pc-i440fx-1.6", pc_compat_1_6_fn,
+ pc_i440fx_1_6_machine_options);
+
+static void pc_i440fx_1_5_machine_options(MachineClass *m)
+{
+ pc_i440fx_1_6_machine_options(m);
+ m->hw_version = "1.5.0";
+ compat_props_add(m->compat_props, pc_compat_1_5, pc_compat_1_5_len);
+}
+
+DEFINE_I440FX_MACHINE(v1_5, "pc-i440fx-1.5", pc_compat_1_5_fn,
+ pc_i440fx_1_5_machine_options);
+
+static void pc_i440fx_1_4_machine_options(MachineClass *m)
+{
+ pc_i440fx_1_5_machine_options(m);
+ m->hw_version = "1.4.0";
+ compat_props_add(m->compat_props, pc_compat_1_4, pc_compat_1_4_len);
+}
+
+DEFINE_I440FX_MACHINE(v1_4, "pc-i440fx-1.4", pc_compat_1_4_fn,
+ pc_i440fx_1_4_machine_options);
+
+typedef struct {
+ uint16_t gpu_device_id;
+ uint16_t pch_device_id;
+ uint8_t pch_revision_id;
+} IGDDeviceIDInfo;
+
+/* In the real world, different GPUs are paired with different PCHs, but in
+ * practice the different PCH DIDs likely just map to different PCH SKUs;
+ * the same applies to the GPU. For the PCH, the different SKUs share the
+ * same silicon design and implementation, with individual features turned
+ * on or off by fuses. The SW interfaces should be consistent across all
+ * SKUs in a given family (e.g. LPT), although not every feature may be
+ * supported on every SKU.
+ *
+ * Most of these PCH differences probably don't matter to the Gfx driver,
+ * but any difference in display port connections obviously will, so
+ * passthrough should be fine with any PCH.
+ *
+ * So currently one PCH version is used: 0x8c4e to cover all HSW (Haswell)
+ * scenarios, and 0x9cc3 for BDW (Broadwell).
+ */
+static const IGDDeviceIDInfo igd_combo_id_infos[] = {
+ /* HSW Classic */
+ {0x0402, 0x8c4e, 0x04}, /* HSWGT1D, HSWD_w7 */
+ {0x0406, 0x8c4e, 0x04}, /* HSWGT1M, HSWM_w7 */
+ {0x0412, 0x8c4e, 0x04}, /* HSWGT2D, HSWD_w7 */
+ {0x0416, 0x8c4e, 0x04}, /* HSWGT2M, HSWM_w7 */
+ {0x041E, 0x8c4e, 0x04}, /* HSWGT15D, HSWD_w7 */
+ /* HSW ULT */
+ {0x0A06, 0x8c4e, 0x04}, /* HSWGT1UT, HSWM_w7 */
+ {0x0A16, 0x8c4e, 0x04}, /* HSWGT2UT, HSWM_w7 */
+ {0x0A26, 0x8c4e, 0x06}, /* HSWGT3UT, HSWM_w7 */
+ {0x0A2E, 0x8c4e, 0x04}, /* HSWGT3UT28W, HSWM_w7 */
+ {0x0A1E, 0x8c4e, 0x04}, /* HSWGT2UX, HSWM_w7 */
+ {0x0A0E, 0x8c4e, 0x04}, /* HSWGT1ULX, HSWM_w7 */
+ /* HSW CRW */
+ {0x0D26, 0x8c4e, 0x04}, /* HSWGT3CW, HSWM_w7 */
+ {0x0D22, 0x8c4e, 0x04}, /* HSWGT3CWDT, HSWD_w7 */
+ /* HSW Server */
+ {0x041A, 0x8c4e, 0x04}, /* HSWSVGT2, HSWD_w7 */
+ /* HSW SRVR */
+ {0x040A, 0x8c4e, 0x04}, /* HSWSVGT1, HSWD_w7 */
+ /* BSW */
+ {0x1606, 0x9cc3, 0x03}, /* BDWULTGT1, BDWM_w7 */
+ {0x1616, 0x9cc3, 0x03}, /* BDWULTGT2, BDWM_w7 */
+ {0x1626, 0x9cc3, 0x03}, /* BDWULTGT3, BDWM_w7 */
+ {0x160E, 0x9cc3, 0x03}, /* BDWULXGT1, BDWM_w7 */
+ {0x161E, 0x9cc3, 0x03}, /* BDWULXGT2, BDWM_w7 */
+ {0x1602, 0x9cc3, 0x03}, /* BDWHALOGT1, BDWM_w7 */
+ {0x1612, 0x9cc3, 0x03}, /* BDWHALOGT2, BDWM_w7 */
+ {0x1622, 0x9cc3, 0x03}, /* BDWHALOGT3, BDWM_w7 */
+ {0x162B, 0x9cc3, 0x03}, /* BDWHALO28W, BDWM_w7 */
+ {0x162A, 0x9cc3, 0x03}, /* BDWGT3WRKS, BDWM_w7 */
+ {0x162D, 0x9cc3, 0x03}, /* BDWGT3SRVR, BDWM_w7 */
+};
+
+static void isa_bridge_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+ dc->desc = "ISA bridge faked to support IGD PT";
+ set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+ k->vendor_id = PCI_VENDOR_ID_INTEL;
+ k->class_id = PCI_CLASS_BRIDGE_ISA;
+};
+
+static TypeInfo isa_bridge_info = {
+ .name = "igd-passthrough-isa-bridge",
+ .parent = TYPE_PCI_DEVICE,
+ .instance_size = sizeof(PCIDevice),
+ .class_init = isa_bridge_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+ { },
+ },
+};
+
+static void pt_graphics_register_types(void)
+{
+ type_register_static(&isa_bridge_info);
+}
+type_init(pt_graphics_register_types)
+
+void igd_passthrough_isa_bridge_create(PCIBus *bus, uint16_t gpu_dev_id)
+{
+ struct PCIDevice *bridge_dev;
+ int i, num;
+ uint16_t pch_dev_id = 0xffff;
+ uint8_t pch_rev_id = 0;
+
+ num = ARRAY_SIZE(igd_combo_id_infos);
+ for (i = 0; i < num; i++) {
+ if (gpu_dev_id == igd_combo_id_infos[i].gpu_device_id) {
+ pch_dev_id = igd_combo_id_infos[i].pch_device_id;
+ pch_rev_id = igd_combo_id_infos[i].pch_revision_id;
+ }
+ }
+
+ if (pch_dev_id == 0xffff) {
+ return;
+ }
+
+ /* Currently IGD drivers always need to access the PCH at devfn 1f.0. */
+ bridge_dev = pci_create_simple(bus, PCI_DEVFN(0x1f, 0),
+ "igd-passthrough-isa-bridge");
+
+ /*
+ * Note that vendor id is always PCI_VENDOR_ID_INTEL.
+ */
+ if (!bridge_dev) {
+ fprintf(stderr, "set igd-passthrough-isa-bridge failed!\n");
+ return;
+ }
+ pci_config_set_device_id(bridge_dev->config, pch_dev_id);
+ pci_config_set_revision(bridge_dev->config, pch_rev_id);
+}
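+
+/*
+ * For example, a passed-through HSWGT2D IGD (gpu_dev_id 0x0412) matches the
+ * table entry {0x0412, 0x8c4e, 0x04} above, so the fake bridge created at
+ * 1f.0 advertises PCH device id 0x8c4e with revision 0x04.
+ */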
+
+static void isapc_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+ m->desc = "ISA-only PC";
+ m->max_cpus = 1;
+ m->option_rom_has_mr = true;
+ m->rom_file_has_mr = false;
+ pcmc->pci_enabled = false;
+ pcmc->has_acpi_build = false;
+ pcmc->smbios_defaults = false;
+ pcmc->gigabyte_align = false;
+ pcmc->smbios_legacy_mode = true;
+ pcmc->has_reserved_memory = false;
+ pcmc->default_nic_model = "ne2k_isa";
+ m->default_cpu_type = X86_CPU_TYPE_NAME("486");
+}
+
+DEFINE_PC_MACHINE(isapc, "isapc", pc_init_isa,
+ isapc_machine_options);
+
+
+#ifdef CONFIG_XEN
+static void xenfv_4_2_machine_options(MachineClass *m)
+{
+ pc_i440fx_4_2_machine_options(m);
+ m->desc = "Xen Fully-virtualized PC";
+ m->max_cpus = HVM_MAX_VCPUS;
+ m->default_machine_opts = "accel=xen,suppress-vmdesc=on";
+}
+
+DEFINE_PC_MACHINE(xenfv_4_2, "xenfv-4.2", pc_xen_hvm_init,
+ xenfv_4_2_machine_options);
+
+static void xenfv_3_1_machine_options(MachineClass *m)
+{
+ pc_i440fx_3_1_machine_options(m);
+ m->desc = "Xen Fully-virtualized PC";
+ m->alias = "xenfv";
+ m->max_cpus = HVM_MAX_VCPUS;
+ m->default_machine_opts = "accel=xen,suppress-vmdesc=on";
+}
+
+DEFINE_PC_MACHINE(xenfv, "xenfv-3.1", pc_xen_hvm_init,
+ xenfv_3_1_machine_options);
+#endif
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
new file mode 100644
index 000000000..e1e100316
--- /dev/null
+++ b/hw/i386/pc_q35.c
@@ -0,0 +1,620 @@
+/*
+ * Q35 chipset based pc system emulator
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2009, 2010
+ * Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ * Copyright (C) 2012 Jason Baron <jbaron@redhat.com>
+ *
+ * This is based on pc.c, but heavily modified.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/units.h"
+#include "hw/loader.h"
+#include "hw/i2c/smbus_eeprom.h"
+#include "hw/rtc/mc146818rtc.h"
+#include "sysemu/kvm.h"
+#include "hw/kvm/clock.h"
+#include "hw/pci-host/q35.h"
+#include "hw/pci/pcie_port.h"
+#include "hw/qdev-properties.h"
+#include "hw/i386/x86.h"
+#include "hw/i386/pc.h"
+#include "hw/i386/ich9.h"
+#include "hw/i386/amd_iommu.h"
+#include "hw/i386/intel_iommu.h"
+#include "hw/display/ramfb.h"
+#include "hw/firmware/smbios.h"
+#include "hw/ide/pci.h"
+#include "hw/ide/ahci.h"
+#include "hw/usb.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "sysemu/numa.h"
+#include "hw/hyperv/vmbus-bridge.h"
+#include "hw/mem/nvdimm.h"
+#include "hw/i386/acpi-build.h"
+
+/* ICH9 AHCI has 6 ports */
+#define MAX_SATA_PORTS 6
+
+struct ehci_companions {
+ const char *name;
+ int func;
+ int port;
+};
+
+static const struct ehci_companions ich9_1d[] = {
+ { .name = "ich9-usb-uhci1", .func = 0, .port = 0 },
+ { .name = "ich9-usb-uhci2", .func = 1, .port = 2 },
+ { .name = "ich9-usb-uhci3", .func = 2, .port = 4 },
+};
+
+static const struct ehci_companions ich9_1a[] = {
+ { .name = "ich9-usb-uhci4", .func = 0, .port = 0 },
+ { .name = "ich9-usb-uhci5", .func = 1, .port = 2 },
+ { .name = "ich9-usb-uhci6", .func = 2, .port = 4 },
+};
+
+static int ehci_create_ich9_with_companions(PCIBus *bus, int slot)
+{
+ const struct ehci_companions *comp;
+ PCIDevice *ehci, *uhci;
+ BusState *usbbus;
+ const char *name;
+ int i;
+
+ switch (slot) {
+ case 0x1d:
+ name = "ich9-usb-ehci1";
+ comp = ich9_1d;
+ break;
+ case 0x1a:
+ name = "ich9-usb-ehci2";
+ comp = ich9_1a;
+ break;
+ default:
+ return -1;
+ }
+
+ ehci = pci_new_multifunction(PCI_DEVFN(slot, 7), true, name);
+ pci_realize_and_unref(ehci, bus, &error_fatal);
+ usbbus = QLIST_FIRST(&ehci->qdev.child_bus);
+
+ for (i = 0; i < 3; i++) {
+ uhci = pci_new_multifunction(PCI_DEVFN(slot, comp[i].func), true,
+ comp[i].name);
+ qdev_prop_set_string(&uhci->qdev, "masterbus", usbbus->name);
+ qdev_prop_set_uint32(&uhci->qdev, "firstport", comp[i].port);
+ pci_realize_and_unref(uhci, bus, &error_fatal);
+ }
+ return 0;
+}
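+
+/*
+ * For example, the call further below with slot 0x1d creates
+ * "ich9-usb-ehci1" at devfn 1d.7 plus the three UHCI companions
+ * "ich9-usb-uhci1/2/3" at functions 0, 1 and 2 of the same slot, wired to
+ * ports 0, 2 and 4 of the EHCI master bus.
+ */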
+
+/* PC hardware initialisation */
+static void pc_q35_init(MachineState *machine)
+{
+ PCMachineState *pcms = PC_MACHINE(machine);
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+ X86MachineState *x86ms = X86_MACHINE(machine);
+ Q35PCIHost *q35_host;
+ PCIHostState *phb;
+ PCIBus *host_bus;
+ PCIDevice *lpc;
+ DeviceState *lpc_dev;
+ BusState *idebus[MAX_SATA_PORTS];
+ ISADevice *rtc_state;
+ MemoryRegion *system_io = get_system_io();
+ MemoryRegion *pci_memory;
+ MemoryRegion *rom_memory;
+ MemoryRegion *ram_memory;
+ GSIState *gsi_state;
+ ISABus *isa_bus;
+ int i;
+ ICH9LPCState *ich9_lpc;
+ PCIDevice *ahci;
+ ram_addr_t lowmem;
+ DriveInfo *hd[MAX_SATA_PORTS];
+ MachineClass *mc = MACHINE_GET_CLASS(machine);
+ bool acpi_pcihp;
+ bool keep_pci_slot_hpc;
+
+ /* Check whether RAM fits below 4G (leaving 1/2 GByte for IO memory
+ * and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping
+ * also known as MMCFG).
+ * If it doesn't, we need to split it in chunks below and above 4G.
+ * In any case, try to make sure that guest addresses aligned at
+ * 1G boundaries get mapped to host addresses aligned at 1G boundaries.
+ */
+ if (machine->ram_size >= 0xb0000000) {
+ lowmem = 0x80000000;
+ } else {
+ lowmem = 0xb0000000;
+ }
+
+ /* Handle the machine opt max-ram-below-4g. It is basically doing
+ * min(qemu limit, user limit).
+ */
+ if (!pcms->max_ram_below_4g) {
+ pcms->max_ram_below_4g = 4 * GiB;
+ }
+ if (lowmem > pcms->max_ram_below_4g) {
+ lowmem = pcms->max_ram_below_4g;
+ if (machine->ram_size - lowmem > lowmem &&
+ lowmem & (1 * GiB - 1)) {
+ warn_report("There is possibly poor performance as the ram size "
+ " (0x%" PRIx64 ") is more than twice the size of"
+ " max-ram-below-4g (%"PRIu64") and"
+ " max-ram-below-4g is not a multiple of 1G.",
+ (uint64_t)machine->ram_size, pcms->max_ram_below_4g);
+ }
+ }
+
+ if (machine->ram_size >= lowmem) {
+ x86ms->above_4g_mem_size = machine->ram_size - lowmem;
+ x86ms->below_4g_mem_size = lowmem;
+ } else {
+ x86ms->above_4g_mem_size = 0;
+ x86ms->below_4g_mem_size = machine->ram_size;
+ }
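+
+ /*
+ * A worked example of the split above: with -m 4G, ram_size (0x100000000)
+ * is at least 0xb0000000, so lowmem is 0x80000000 (2G); max-ram-below-4g
+ * defaults to 4G and does not lower it further, giving 2048M below and
+ * 2048M above the 4G boundary.
+ */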
+
+ pc_machine_init_sgx_epc(pcms);
+ x86_cpus_init(x86ms, pcmc->default_cpu_version);
+
+ kvmclock_create(pcmc->kvmclock_create_always);
+
+ /* pci enabled */
+ if (pcmc->pci_enabled) {
+ pci_memory = g_new(MemoryRegion, 1);
+ memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
+ rom_memory = pci_memory;
+ } else {
+ pci_memory = NULL;
+ rom_memory = get_system_memory();
+ }
+
+ pc_guest_info_init(pcms);
+
+ if (pcmc->smbios_defaults) {
+ /* These values are guest ABI, do not change */
+ smbios_set_defaults("QEMU", "Standard PC (Q35 + ICH9, 2009)",
+ mc->name, pcmc->smbios_legacy_mode,
+ pcmc->smbios_uuid_encoded,
+ SMBIOS_ENTRY_POINT_21);
+ }
+
+ /* allocate ram and load rom/bios */
+ pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory);
+
+ /* create pci host bus */
+ q35_host = Q35_HOST_DEVICE(qdev_new(TYPE_Q35_HOST_DEVICE));
+
+ object_property_add_child(qdev_get_machine(), "q35", OBJECT(q35_host));
+ object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_RAM_MEM,
+ OBJECT(ram_memory), NULL);
+ object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_PCI_MEM,
+ OBJECT(pci_memory), NULL);
+ object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_SYSTEM_MEM,
+ OBJECT(get_system_memory()), NULL);
+ object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_IO_MEM,
+ OBJECT(system_io), NULL);
+ object_property_set_int(OBJECT(q35_host), PCI_HOST_BELOW_4G_MEM_SIZE,
+ x86ms->below_4g_mem_size, NULL);
+ object_property_set_int(OBJECT(q35_host), PCI_HOST_ABOVE_4G_MEM_SIZE,
+ x86ms->above_4g_mem_size, NULL);
+ /* pci */
+ sysbus_realize_and_unref(SYS_BUS_DEVICE(q35_host), &error_fatal);
+ phb = PCI_HOST_BRIDGE(q35_host);
+ host_bus = phb->bus;
+ /* create ISA bus */
+ lpc = pci_create_simple_multifunction(host_bus, PCI_DEVFN(ICH9_LPC_DEV,
+ ICH9_LPC_FUNC), true,
+ TYPE_ICH9_LPC_DEVICE);
+
+ object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
+ TYPE_HOTPLUG_HANDLER,
+ (Object **)&x86ms->acpi_dev,
+ object_property_allow_set_link,
+ OBJ_PROP_LINK_STRONG);
+ object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
+ OBJECT(lpc), &error_abort);
+
+ acpi_pcihp = object_property_get_bool(OBJECT(lpc),
+ ACPI_PM_PROP_ACPI_PCIHP_BRIDGE,
+ NULL);
+
+ keep_pci_slot_hpc = object_property_get_bool(OBJECT(lpc),
+ "x-keep-pci-slot-hpc",
+ NULL);
+
+ if (!keep_pci_slot_hpc && acpi_pcihp) {
+ object_register_sugar_prop(TYPE_PCIE_SLOT, "x-native-hotplug",
+ "false", true);
+ }
+
+ /* irq lines */
+ gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled);
+
+ ich9_lpc = ICH9_LPC_DEVICE(lpc);
+ lpc_dev = DEVICE(lpc);
+ for (i = 0; i < GSI_NUM_PINS; i++) {
+ qdev_connect_gpio_out_named(lpc_dev, ICH9_GPIO_GSI, i, x86ms->gsi[i]);
+ }
+ pci_bus_irqs(host_bus, ich9_lpc_set_irq, ich9_lpc_map_irq, ich9_lpc,
+ ICH9_LPC_NB_PIRQS);
+ pci_bus_set_route_irq_fn(host_bus, ich9_route_intx_pin_to_irq);
+ isa_bus = ich9_lpc->isa_bus;
+
+ pc_i8259_create(isa_bus, gsi_state->i8259_irq);
+
+ if (pcmc->pci_enabled) {
+ ioapic_init_gsi(gsi_state, "q35");
+ }
+
+ if (tcg_enabled()) {
+ x86_register_ferr_irq(x86ms->gsi[13]);
+ }
+
+ assert(pcms->vmport != ON_OFF_AUTO__MAX);
+ if (pcms->vmport == ON_OFF_AUTO_AUTO) {
+ pcms->vmport = ON_OFF_AUTO_ON;
+ }
+
+ /* init basic PC hardware */
+ pc_basic_device_init(pcms, isa_bus, x86ms->gsi, &rtc_state, !mc->no_floppy,
+ 0xff0104);
+
+ /* connect pm stuff to lpc */
+ ich9_lpc_pm_init(lpc, x86_machine_is_smm_enabled(x86ms));
+
+ if (pcms->sata_enabled) {
+ /* AHCI and SATA devices; for Q35, one AHCI controller is built-in */
+ ahci = pci_create_simple_multifunction(host_bus,
+ PCI_DEVFN(ICH9_SATA1_DEV,
+ ICH9_SATA1_FUNC),
+ true, "ich9-ahci");
+ idebus[0] = qdev_get_child_bus(&ahci->qdev, "ide.0");
+ idebus[1] = qdev_get_child_bus(&ahci->qdev, "ide.1");
+ g_assert(MAX_SATA_PORTS == ahci_get_num_ports(ahci));
+ ide_drive_get(hd, ahci_get_num_ports(ahci));
+ ahci_ide_create_devs(ahci, hd);
+ } else {
+ idebus[0] = idebus[1] = NULL;
+ }
+
+ if (machine_usb(machine)) {
+ /* Should we create six UHCI controllers, per the ICH9 spec? */
+ ehci_create_ich9_with_companions(host_bus, 0x1d);
+ }
+
+ if (pcms->smbus_enabled) {
+ /* TODO: Populate SPD eeprom data. */
+ pcms->smbus = ich9_smb_init(host_bus,
+ PCI_DEVFN(ICH9_SMB_DEV, ICH9_SMB_FUNC),
+ 0xb100);
+ smbus_eeprom_init(pcms->smbus, 8, NULL, 0);
+ }
+
+ pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state);
+
+ /* the remaining devices, to which PCI devfns are assigned automatically */
+ pc_vga_init(isa_bus, host_bus);
+ pc_nic_init(pcmc, isa_bus, host_bus);
+
+ if (machine->nvdimms_state->is_enabled) {
+ nvdimm_init_acpi_state(machine->nvdimms_state, system_io,
+ x86_nvdimm_acpi_dsmio,
+ x86ms->fw_cfg, OBJECT(pcms));
+ }
+}
+
+#define DEFINE_Q35_MACHINE(suffix, name, compatfn, optionfn) \
+ static void pc_init_##suffix(MachineState *machine) \
+ { \
+ void (*compat)(MachineState *m) = (compatfn); \
+ if (compat) { \
+ compat(machine); \
+ } \
+ pc_q35_init(machine); \
+ } \
+ DEFINE_PC_MACHINE(suffix, name, pc_init_##suffix, optionfn)
+
+
+static void pc_q35_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+ pcmc->default_nic_model = "e1000e";
+ pcmc->pci_root_uid = 0;
+
+ m->family = "pc_q35";
+ m->desc = "Standard PC (Q35 + ICH9, 2009)";
+ m->units_per_default_bus = 1;
+ m->default_machine_opts = "firmware=bios-256k.bin";
+ m->default_display = "std";
+ m->default_kernel_irqchip_split = false;
+ m->no_floppy = 1;
+ machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE);
+ machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE);
+ machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE);
+ machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE);
+ m->max_cpus = 288;
+}
+
+static void pc_q35_6_2_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+ pc_q35_machine_options(m);
+ m->alias = "q35";
+ pcmc->default_cpu_version = 1;
+}
+
+DEFINE_Q35_MACHINE(v6_2, "pc-q35-6.2", NULL,
+ pc_q35_6_2_machine_options);
+
+static void pc_q35_6_1_machine_options(MachineClass *m)
+{
+ pc_q35_6_2_machine_options(m);
+ m->alias = NULL;
+ compat_props_add(m->compat_props, hw_compat_6_1, hw_compat_6_1_len);
+ compat_props_add(m->compat_props, pc_compat_6_1, pc_compat_6_1_len);
+ m->smp_props.prefer_sockets = true;
+}
+
+DEFINE_Q35_MACHINE(v6_1, "pc-q35-6.1", NULL,
+ pc_q35_6_1_machine_options);
+
+static void pc_q35_6_0_machine_options(MachineClass *m)
+{
+ pc_q35_6_1_machine_options(m);
+ m->alias = NULL;
+ compat_props_add(m->compat_props, hw_compat_6_0, hw_compat_6_0_len);
+ compat_props_add(m->compat_props, pc_compat_6_0, pc_compat_6_0_len);
+}
+
+DEFINE_Q35_MACHINE(v6_0, "pc-q35-6.0", NULL,
+ pc_q35_6_0_machine_options);
+
+static void pc_q35_5_2_machine_options(MachineClass *m)
+{
+ pc_q35_6_0_machine_options(m);
+ m->alias = NULL;
+ compat_props_add(m->compat_props, hw_compat_5_2, hw_compat_5_2_len);
+ compat_props_add(m->compat_props, pc_compat_5_2, pc_compat_5_2_len);
+}
+
+DEFINE_Q35_MACHINE(v5_2, "pc-q35-5.2", NULL,
+ pc_q35_5_2_machine_options);
+
+static void pc_q35_5_1_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_q35_5_2_machine_options(m);
+ m->alias = NULL;
+ compat_props_add(m->compat_props, hw_compat_5_1, hw_compat_5_1_len);
+ compat_props_add(m->compat_props, pc_compat_5_1, pc_compat_5_1_len);
+ pcmc->kvmclock_create_always = false;
+ pcmc->pci_root_uid = 1;
+}
+
+DEFINE_Q35_MACHINE(v5_1, "pc-q35-5.1", NULL,
+ pc_q35_5_1_machine_options);
+
+static void pc_q35_5_0_machine_options(MachineClass *m)
+{
+ pc_q35_5_1_machine_options(m);
+ m->alias = NULL;
+ m->numa_mem_supported = true;
+ compat_props_add(m->compat_props, hw_compat_5_0, hw_compat_5_0_len);
+ compat_props_add(m->compat_props, pc_compat_5_0, pc_compat_5_0_len);
+ m->auto_enable_numa_with_memdev = false;
+}
+
+DEFINE_Q35_MACHINE(v5_0, "pc-q35-5.0", NULL,
+ pc_q35_5_0_machine_options);
+
+static void pc_q35_4_2_machine_options(MachineClass *m)
+{
+ pc_q35_5_0_machine_options(m);
+ m->alias = NULL;
+ compat_props_add(m->compat_props, hw_compat_4_2, hw_compat_4_2_len);
+ compat_props_add(m->compat_props, pc_compat_4_2, pc_compat_4_2_len);
+}
+
+DEFINE_Q35_MACHINE(v4_2, "pc-q35-4.2", NULL,
+ pc_q35_4_2_machine_options);
+
+static void pc_q35_4_1_machine_options(MachineClass *m)
+{
+ pc_q35_4_2_machine_options(m);
+ m->alias = NULL;
+ compat_props_add(m->compat_props, hw_compat_4_1, hw_compat_4_1_len);
+ compat_props_add(m->compat_props, pc_compat_4_1, pc_compat_4_1_len);
+}
+
+DEFINE_Q35_MACHINE(v4_1, "pc-q35-4.1", NULL,
+ pc_q35_4_1_machine_options);
+
+static void pc_q35_4_0_1_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+ pc_q35_4_1_machine_options(m);
+ m->alias = NULL;
+ pcmc->default_cpu_version = CPU_VERSION_LEGACY;
+ /*
+ * This is the default machine for the 4.0-stable branch. It is basically
+ * a 4.0 that doesn't use split irqchip by default. It MUST hence apply the
+ * 4.0 compat props.
+ */
+ compat_props_add(m->compat_props, hw_compat_4_0, hw_compat_4_0_len);
+ compat_props_add(m->compat_props, pc_compat_4_0, pc_compat_4_0_len);
+}
+
+DEFINE_Q35_MACHINE(v4_0_1, "pc-q35-4.0.1", NULL,
+ pc_q35_4_0_1_machine_options);
+
+static void pc_q35_4_0_machine_options(MachineClass *m)
+{
+ pc_q35_4_0_1_machine_options(m);
+ m->default_kernel_irqchip_split = true;
+ m->alias = NULL;
+ /* Compat props are applied by the 4.0.1 machine */
+}
+
+DEFINE_Q35_MACHINE(v4_0, "pc-q35-4.0", NULL,
+ pc_q35_4_0_machine_options);
+
+static void pc_q35_3_1_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_q35_4_0_machine_options(m);
+ m->default_kernel_irqchip_split = false;
+ pcmc->do_not_add_smb_acpi = true;
+ m->smbus_no_migration_support = true;
+ m->alias = NULL;
+ pcmc->pvh_enabled = false;
+ compat_props_add(m->compat_props, hw_compat_3_1, hw_compat_3_1_len);
+ compat_props_add(m->compat_props, pc_compat_3_1, pc_compat_3_1_len);
+}
+
+DEFINE_Q35_MACHINE(v3_1, "pc-q35-3.1", NULL,
+ pc_q35_3_1_machine_options);
+
+static void pc_q35_3_0_machine_options(MachineClass *m)
+{
+ pc_q35_3_1_machine_options(m);
+ compat_props_add(m->compat_props, hw_compat_3_0, hw_compat_3_0_len);
+ compat_props_add(m->compat_props, pc_compat_3_0, pc_compat_3_0_len);
+}
+
+DEFINE_Q35_MACHINE(v3_0, "pc-q35-3.0", NULL,
+ pc_q35_3_0_machine_options);
+
+static void pc_q35_2_12_machine_options(MachineClass *m)
+{
+ pc_q35_3_0_machine_options(m);
+ compat_props_add(m->compat_props, hw_compat_2_12, hw_compat_2_12_len);
+ compat_props_add(m->compat_props, pc_compat_2_12, pc_compat_2_12_len);
+}
+
+DEFINE_Q35_MACHINE(v2_12, "pc-q35-2.12", NULL,
+ pc_q35_2_12_machine_options);
+
+static void pc_q35_2_11_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_q35_2_12_machine_options(m);
+ pcmc->default_nic_model = "e1000";
+ compat_props_add(m->compat_props, hw_compat_2_11, hw_compat_2_11_len);
+ compat_props_add(m->compat_props, pc_compat_2_11, pc_compat_2_11_len);
+}
+
+DEFINE_Q35_MACHINE(v2_11, "pc-q35-2.11", NULL,
+ pc_q35_2_11_machine_options);
+
+static void pc_q35_2_10_machine_options(MachineClass *m)
+{
+ pc_q35_2_11_machine_options(m);
+ compat_props_add(m->compat_props, hw_compat_2_10, hw_compat_2_10_len);
+ compat_props_add(m->compat_props, pc_compat_2_10, pc_compat_2_10_len);
+ m->auto_enable_numa_with_memhp = false;
+}
+
+DEFINE_Q35_MACHINE(v2_10, "pc-q35-2.10", NULL,
+ pc_q35_2_10_machine_options);
+
+static void pc_q35_2_9_machine_options(MachineClass *m)
+{
+ pc_q35_2_10_machine_options(m);
+ compat_props_add(m->compat_props, hw_compat_2_9, hw_compat_2_9_len);
+ compat_props_add(m->compat_props, pc_compat_2_9, pc_compat_2_9_len);
+}
+
+DEFINE_Q35_MACHINE(v2_9, "pc-q35-2.9", NULL,
+ pc_q35_2_9_machine_options);
+
+static void pc_q35_2_8_machine_options(MachineClass *m)
+{
+ pc_q35_2_9_machine_options(m);
+ compat_props_add(m->compat_props, hw_compat_2_8, hw_compat_2_8_len);
+ compat_props_add(m->compat_props, pc_compat_2_8, pc_compat_2_8_len);
+}
+
+DEFINE_Q35_MACHINE(v2_8, "pc-q35-2.8", NULL,
+ pc_q35_2_8_machine_options);
+
+static void pc_q35_2_7_machine_options(MachineClass *m)
+{
+ pc_q35_2_8_machine_options(m);
+ m->max_cpus = 255;
+ compat_props_add(m->compat_props, hw_compat_2_7, hw_compat_2_7_len);
+ compat_props_add(m->compat_props, pc_compat_2_7, pc_compat_2_7_len);
+}
+
+DEFINE_Q35_MACHINE(v2_7, "pc-q35-2.7", NULL,
+ pc_q35_2_7_machine_options);
+
+static void pc_q35_2_6_machine_options(MachineClass *m)
+{
+ X86MachineClass *x86mc = X86_MACHINE_CLASS(m);
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_q35_2_7_machine_options(m);
+ pcmc->legacy_cpu_hotplug = true;
+ x86mc->fwcfg_dma_enabled = false;
+ compat_props_add(m->compat_props, hw_compat_2_6, hw_compat_2_6_len);
+ compat_props_add(m->compat_props, pc_compat_2_6, pc_compat_2_6_len);
+}
+
+DEFINE_Q35_MACHINE(v2_6, "pc-q35-2.6", NULL,
+ pc_q35_2_6_machine_options);
+
+static void pc_q35_2_5_machine_options(MachineClass *m)
+{
+ X86MachineClass *x86mc = X86_MACHINE_CLASS(m);
+
+ pc_q35_2_6_machine_options(m);
+ x86mc->save_tsc_khz = false;
+ m->legacy_fw_cfg_order = 1;
+ compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len);
+ compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len);
+}
+
+DEFINE_Q35_MACHINE(v2_5, "pc-q35-2.5", NULL,
+ pc_q35_2_5_machine_options);
+
+static void pc_q35_2_4_machine_options(MachineClass *m)
+{
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ pc_q35_2_5_machine_options(m);
+ m->hw_version = "2.4.0";
+ pcmc->broken_reserved_end = true;
+ compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len);
+ compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len);
+}
+
+DEFINE_Q35_MACHINE(v2_4, "pc-q35-2.4", NULL,
+ pc_q35_2_4_machine_options);
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
new file mode 100644
index 000000000..c8b17af95
--- /dev/null
+++ b/hw/i386/pc_sysfw.c
@@ -0,0 +1,262 @@
+/*
+ * QEMU PC System Firmware
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2011-2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qapi/error.h"
+#include "sysemu/block-backend.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qemu/units.h"
+#include "hw/sysbus.h"
+#include "hw/i386/x86.h"
+#include "hw/i386/pc.h"
+#include "hw/loader.h"
+#include "hw/qdev-properties.h"
+#include "hw/block/flash.h"
+#include "sysemu/kvm.h"
+#include "sev.h"
+
+#define FLASH_SECTOR_SIZE 4096
+
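+/*
+ * Mirror the top of the system flash into a read-only "isa-bios" region
+ * just below 1 MiB.  With the usual 128 KiB limit this places the copy at
+ * 0xE0000..0xFFFFF, i.e. the legacy BIOS window.
+ */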
+static void pc_isa_bios_init(MemoryRegion *rom_memory,
+ MemoryRegion *flash_mem,
+ int ram_size)
+{
+ int isa_bios_size;
+ MemoryRegion *isa_bios;
+ uint64_t flash_size;
+ void *flash_ptr, *isa_bios_ptr;
+
+ flash_size = memory_region_size(flash_mem);
+
+ /* map the last 128KB of the BIOS in ISA space */
+ isa_bios_size = MIN(flash_size, 128 * KiB);
+ isa_bios = g_malloc(sizeof(*isa_bios));
+ memory_region_init_ram(isa_bios, NULL, "isa-bios", isa_bios_size,
+ &error_fatal);
+ memory_region_add_subregion_overlap(rom_memory,
+ 0x100000 - isa_bios_size,
+ isa_bios,
+ 1);
+
+ /* copy ISA rom image from top of flash memory */
+ flash_ptr = memory_region_get_ram_ptr(flash_mem);
+ isa_bios_ptr = memory_region_get_ram_ptr(isa_bios);
+ memcpy(isa_bios_ptr,
+ ((uint8_t*)flash_ptr) + (flash_size - isa_bios_size),
+ isa_bios_size);
+
+ memory_region_set_readonly(isa_bios, true);
+}
+
+static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms,
+ const char *name,
+ const char *alias_prop_name)
+{
+ DeviceState *dev = qdev_new(TYPE_PFLASH_CFI01);
+
+ qdev_prop_set_uint64(dev, "sector-length", FLASH_SECTOR_SIZE);
+ qdev_prop_set_uint8(dev, "width", 1);
+ qdev_prop_set_string(dev, "name", name);
+ object_property_add_child(OBJECT(pcms), name, OBJECT(dev));
+ object_property_add_alias(OBJECT(pcms), alias_prop_name,
+ OBJECT(dev), "drive");
+ /*
+ * The returned reference is tied to the child property and
+ * will be removed with object_unparent.
+ */
+ object_unref(OBJECT(dev));
+ return PFLASH_CFI01(dev);
+}
+
+void pc_system_flash_create(PCMachineState *pcms)
+{
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+
+ if (pcmc->pci_enabled) {
+ pcms->flash[0] = pc_pflash_create(pcms, "system.flash0",
+ "pflash0");
+ pcms->flash[1] = pc_pflash_create(pcms, "system.flash1",
+ "pflash1");
+ }
+}
+
+void pc_system_flash_cleanup_unused(PCMachineState *pcms)
+{
+ char *prop_name;
+ int i;
+ Object *dev_obj;
+
+ assert(PC_MACHINE_GET_CLASS(pcms)->pci_enabled);
+
+ for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) {
+ dev_obj = OBJECT(pcms->flash[i]);
+ if (!object_property_get_bool(dev_obj, "realized", &error_abort)) {
+ prop_name = g_strdup_printf("pflash%d", i);
+ object_property_del(OBJECT(pcms), prop_name);
+ g_free(prop_name);
+ object_unparent(dev_obj);
+ pcms->flash[i] = NULL;
+ }
+ }
+}
+
+/*
+ * Map the pcms->flash[] from 4GiB downward, and realize.
+ * Map them in descending order, i.e. pcms->flash[0] at the top,
+ * without gaps.
+ * Stop at the first pcms->flash[i] lacking a block backend.
+ * Set each flash's size from its block backend. Fatal error if the
+ * size isn't a non-zero multiple of 4KiB, or the total size exceeds
+ * pcms->max_fw_size.
+ *
+ * If pcms->flash[0] has a block backend, its memory is passed to
+ * pc_isa_bios_init(). Merging several flash devices for isa-bios is
+ * not supported.
+ */
+static void pc_system_flash_map(PCMachineState *pcms,
+ MemoryRegion *rom_memory)
+{
+ hwaddr total_size = 0;
+ int i;
+ BlockBackend *blk;
+ int64_t size;
+ PFlashCFI01 *system_flash;
+ MemoryRegion *flash_mem;
+ void *flash_ptr;
+ int flash_size;
+ int ret;
+
+ assert(PC_MACHINE_GET_CLASS(pcms)->pci_enabled);
+
+ for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) {
+ system_flash = pcms->flash[i];
+ blk = pflash_cfi01_get_blk(system_flash);
+ if (!blk) {
+ break;
+ }
+ size = blk_getlength(blk);
+ if (size < 0) {
+ error_report("can't get size of block device %s: %s",
+ blk_name(blk), strerror(-size));
+ exit(1);
+ }
+ if (size == 0 || !QEMU_IS_ALIGNED(size, FLASH_SECTOR_SIZE)) {
+ error_report("system firmware block device %s has invalid size "
+ "%" PRId64,
+ blk_name(blk), size);
+ info_report("its size must be a non-zero multiple of 0x%x",
+ FLASH_SECTOR_SIZE);
+ exit(1);
+ }
+ if ((hwaddr)size != size
+ || total_size > HWADDR_MAX - size
+ || total_size + size > pcms->max_fw_size) {
+ error_report("combined size of system firmware exceeds "
+ "%" PRIu64 " bytes",
+ pcms->max_fw_size);
+ exit(1);
+ }
+
+ total_size += size;
+ qdev_prop_set_uint32(DEVICE(system_flash), "num-blocks",
+ size / FLASH_SECTOR_SIZE);
+ sysbus_realize_and_unref(SYS_BUS_DEVICE(system_flash), &error_fatal);
+ sysbus_mmio_map(SYS_BUS_DEVICE(system_flash), 0,
+ 0x100000000ULL - total_size);
+
+ if (i == 0) {
+ flash_mem = pflash_cfi01_get_memory(system_flash);
+ pc_isa_bios_init(rom_memory, flash_mem, size);
+
+ /* Encrypt the pflash boot ROM */
+ if (sev_enabled()) {
+ flash_ptr = memory_region_get_ram_ptr(flash_mem);
+ flash_size = memory_region_size(flash_mem);
+ /*
+ * OVMF places GUIDed structures in the flash, so
+ * search for them
+ */
+ pc_system_parse_ovmf_flash(flash_ptr, flash_size);
+
+ ret = sev_es_save_reset_vector(flash_ptr, flash_size);
+ if (ret) {
+ error_report("failed to locate and/or save reset vector");
+ exit(1);
+ }
+
+ sev_encrypt_flash(flash_ptr, flash_size, &error_fatal);
+ }
+ }
+ }
+}
+
+void pc_system_firmware_init(PCMachineState *pcms,
+ MemoryRegion *rom_memory)
+{
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+ int i;
+ BlockBackend *pflash_blk[ARRAY_SIZE(pcms->flash)];
+
+ if (!pcmc->pci_enabled) {
+ x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, true);
+ return;
+ }
+
+ /* Map legacy -drive if=pflash to machine properties */
+ for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) {
+ pflash_cfi01_legacy_drive(pcms->flash[i],
+ drive_get(IF_PFLASH, 0, i));
+ pflash_blk[i] = pflash_cfi01_get_blk(pcms->flash[i]);
+ }
+
+ /* Reject gaps */
+ for (i = 1; i < ARRAY_SIZE(pcms->flash); i++) {
+ if (pflash_blk[i] && !pflash_blk[i - 1]) {
+ error_report("pflash%d requires pflash%d", i, i - 1);
+ exit(1);
+ }
+ }
+
+ if (!pflash_blk[0]) {
+ /* Machine property pflash0 not set, use ROM mode */
+ x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, false);
+ } else {
+ if (kvm_enabled() && !kvm_readonly_mem_enabled()) {
+ /*
+ * Older KVM cannot execute from device memory. So, flash
+ * memory cannot be used unless the readonly memory kvm
+ * capability is present.
+ */
+ error_report("pflash with kvm requires KVM readonly memory support");
+ exit(1);
+ }
+
+ pc_system_flash_map(pcms, rom_memory);
+ }
+
+ pc_system_flash_cleanup_unused(pcms);
+}
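+
+/*
+ * Example usage (illustrative, not the only supported layout): with split
+ * OVMF images, pflash unit 0 carries the read-only code image and unit 1
+ * the writable variable store:
+ *
+ *   qemu-system-x86_64 -machine q35 \
+ *     -drive if=pflash,format=raw,readonly=on,file=OVMF_CODE.fd \
+ *     -drive if=pflash,format=raw,file=OVMF_VARS.fd
+ *
+ * The file names are the conventional edk2 build artifacts and are given
+ * only as an example.
+ */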
diff --git a/hw/i386/pc_sysfw_ovmf-stubs.c b/hw/i386/pc_sysfw_ovmf-stubs.c
new file mode 100644
index 000000000..aabe78b27
--- /dev/null
+++ b/hw/i386/pc_sysfw_ovmf-stubs.c
@@ -0,0 +1,26 @@
+/*
+ * QEMU PC System Firmware (OVMF stubs)
+ *
+ * Copyright (c) 2021 Red Hat, Inc.
+ *
+ * Author:
+ * Philippe Mathieu-Daudé <philmd@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/i386/pc.h"
+
+bool pc_system_ovmf_table_find(const char *entry, uint8_t **data, int *data_len)
+{
+ g_assert_not_reached();
+}
+
+void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size)
+{
+ g_assert_not_reached();
+}
diff --git a/hw/i386/pc_sysfw_ovmf.c b/hw/i386/pc_sysfw_ovmf.c
new file mode 100644
index 000000000..f4dd92c58
--- /dev/null
+++ b/hw/i386/pc_sysfw_ovmf.c
@@ -0,0 +1,151 @@
+/*
+ * QEMU PC System Firmware (OVMF specific)
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2011-2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/i386/pc.h"
+#include "cpu.h"
+
+#define OVMF_TABLE_FOOTER_GUID "96b582de-1fb2-45f7-baea-a366c55a082d"
+
+static bool ovmf_flash_parsed;
+static uint8_t *ovmf_table;
+static int ovmf_table_len;
+
+void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size)
+{
+ uint8_t *ptr;
+ QemuUUID guid;
+ int tot_len;
+
+ /* should only be called once */
+ if (ovmf_flash_parsed) {
+ return;
+ }
+
+ ovmf_flash_parsed = true;
+
+ if (flash_size < TARGET_PAGE_SIZE) {
+ return;
+ }
+
+ /*
+ * if this is OVMF there will be a table footer
+ * guid 48 bytes before the end of the flash file. If it's
+ * not found, silently abort the flash parsing.
+ */
+ qemu_uuid_parse(OVMF_TABLE_FOOTER_GUID, &guid);
+ guid = qemu_uuid_bswap(guid); /* guids are LE */
+ ptr = flash_ptr + flash_size - 48;
+ if (!qemu_uuid_is_equal((QemuUUID *)ptr, &guid)) {
+ return;
+ }
+
+ /* if found, the two-byte total table length sits just before the GUID */
+ ptr -= sizeof(uint16_t);
+ tot_len = le16_to_cpu(*(uint16_t *)ptr) - sizeof(guid) - sizeof(uint16_t);
+
+ if (tot_len <= 0) {
+ return;
+ }
+
+ ovmf_table = g_malloc(tot_len);
+ ovmf_table_len = tot_len;
+
+ /*
+ * ptr points at the foot of the table, so copy the whole table into the
+ * newly allocated ovmf_table and then advance ovmf_table so that it too
+ * points at the table foot
+ */
+ memcpy(ovmf_table, ptr - tot_len, tot_len);
+ ovmf_table += tot_len;
+}
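+
+/*
+ * Layout implied by the parsing above, going downwards from the end of the
+ * flash image:
+ *
+ *   [ entries ... ][ 2: total length ][ 16: footer GUID ][ 32 bytes unparsed ]
+ *
+ * where each entry, scanned from the highest one downwards, is
+ *
+ *   [ data ][ 2: entry length (covers data + length + GUID) ][ 16: entry GUID ]
+ */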
+
+/**
+ * pc_system_ovmf_table_find - Find the data associated with an entry in OVMF's
+ * reset vector GUIDed table.
+ *
+ * @entry: GUID string of the entry to lookup
+ * @data: Filled with a pointer to the entry's value (if not NULL)
+ * @data_len: Filled with the length of the entry's value (if not NULL). Pass
+ * NULL here if the length of data is known.
+ *
+ * Return: true if the entry was found in the OVMF table; false otherwise.
+ */
+bool pc_system_ovmf_table_find(const char *entry, uint8_t **data,
+ int *data_len)
+{
+ uint8_t *ptr = ovmf_table;
+ int tot_len = ovmf_table_len;
+ QemuUUID entry_guid;
+
+ assert(ovmf_flash_parsed);
+
+ if (qemu_uuid_parse(entry, &entry_guid) < 0) {
+ return false;
+ }
+
+ if (!ptr) {
+ return false;
+ }
+
+ entry_guid = qemu_uuid_bswap(entry_guid); /* guids are LE */
+ while (tot_len >= sizeof(QemuUUID) + sizeof(uint16_t)) {
+ int len;
+ QemuUUID *guid;
+
+ /*
+ * The data structure is
+ * arbitrary length data
+ * 2 byte length of entire entry
+ * 16 byte guid
+ */
+ guid = (QemuUUID *)(ptr - sizeof(QemuUUID));
+ len = le16_to_cpu(*(uint16_t *)(ptr - sizeof(QemuUUID) -
+ sizeof(uint16_t)));
+
+ /*
+ * Guard against a corrupt table: a zero or too-small length
+ * would otherwise make this loop spin forever
+ */
+ if (len < sizeof(QemuUUID) + sizeof(uint16_t)) {
+ return false;
+ } else if (len > tot_len) {
+ return false;
+ }
+
+ ptr -= len;
+ tot_len -= len;
+ if (qemu_uuid_is_equal(guid, &entry_guid)) {
+ if (data) {
+ *data = ptr;
+ }
+ if (data_len) {
+ *data_len = len - sizeof(QemuUUID) - sizeof(uint16_t);
+ }
+ return true;
+ }
+ }
+ return false;
+}
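+
+/*
+ * Usage sketch (illustrative): a caller that knows an entry's GUID can look
+ * up its payload without knowing its offset, e.g.
+ *
+ *   uint8_t *data;
+ *   int len;
+ *   if (pc_system_ovmf_table_find("01234567-89ab-cdef-0123-456789abcdef",
+ *                                 &data, &len)) {
+ *       // use data[0] .. data[len - 1]
+ *   }
+ *
+ * The GUID above is a placeholder, not a real table entry.
+ */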
diff --git a/hw/i386/port92.c b/hw/i386/port92.c
new file mode 100644
index 000000000..e1379a4f9
--- /dev/null
+++ b/hw/i386/port92.c
@@ -0,0 +1,127 @@
+/*
+ * QEMU I/O port 0x92 (System Control Port A, to handle Fast Gate A20)
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/runstate.h"
+#include "migration/vmstate.h"
+#include "hw/irq.h"
+#include "hw/i386/pc.h"
+#include "trace.h"
+#include "qom/object.h"
+
+OBJECT_DECLARE_SIMPLE_TYPE(Port92State, PORT92)
+
+struct Port92State {
+ ISADevice parent_obj;
+
+ MemoryRegion io;
+ uint8_t outport;
+ qemu_irq a20_out;
+};
+
+static void port92_write(void *opaque, hwaddr addr, uint64_t val,
+ unsigned size)
+{
+ Port92State *s = opaque;
+ int oldval = s->outport;
+
+ trace_port92_write(val);
+ s->outport = val;
+ qemu_set_irq(s->a20_out, (val >> 1) & 1);
+ if ((val & 1) && !(oldval & 1)) {
+ qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
+ }
+}
+
+static uint64_t port92_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+ Port92State *s = opaque;
+ uint32_t ret;
+
+ ret = s->outport;
+ trace_port92_read(ret);
+
+ return ret;
+}
+
+static const VMStateDescription vmstate_port92_isa = {
+ .name = "port92",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT8(outport, Port92State),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static void port92_reset(DeviceState *d)
+{
+ Port92State *s = PORT92(d);
+
+ s->outport &= ~1;
+}
+
+static const MemoryRegionOps port92_ops = {
+ .read = port92_read,
+ .write = port92_write,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 1,
+ },
+ .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void port92_initfn(Object *obj)
+{
+ Port92State *s = PORT92(obj);
+
+ memory_region_init_io(&s->io, OBJECT(s), &port92_ops, s, "port92", 1);
+
+ s->outport = 0;
+
+ qdev_init_gpio_out_named(DEVICE(obj), &s->a20_out, PORT92_A20_LINE, 1);
+}
+
+static void port92_realizefn(DeviceState *dev, Error **errp)
+{
+ ISADevice *isadev = ISA_DEVICE(dev);
+ Port92State *s = PORT92(dev);
+
+ isa_register_ioport(isadev, &s->io, 0x92);
+}
+
+static void port92_class_initfn(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->realize = port92_realizefn;
+ dc->reset = port92_reset;
+ dc->vmsd = &vmstate_port92_isa;
+ /*
+ * Reason: unlike ordinary ISA devices, this one needs additional
+ * wiring: its A20 output line needs to be wired up with
+ * qdev_connect_gpio_out_named().
+ */
+ dc->user_creatable = false;
+}
+
+static const TypeInfo port92_info = {
+ .name = TYPE_PORT92,
+ .parent = TYPE_ISA_DEVICE,
+ .instance_size = sizeof(Port92State),
+ .instance_init = port92_initfn,
+ .class_init = port92_class_initfn,
+};
+
+static void port92_register_types(void)
+{
+ type_register_static(&port92_info);
+}
+
+type_init(port92_register_types)
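+
+/*
+ * Wiring sketch (illustrative): board code is expected to create the device
+ * and connect its A20 output itself, along the lines of
+ *
+ *   ISADevice *port92 = isa_create_simple(isa_bus, TYPE_PORT92);
+ *   qdev_connect_gpio_out_named(DEVICE(port92), PORT92_A20_LINE, 0, a20_out);
+ *
+ * where a20_out is a qemu_irq handling A20 gate changes; the variable names
+ * here are examples only.
+ */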
diff --git a/hw/i386/sgx-epc.c b/hw/i386/sgx-epc.c
new file mode 100644
index 000000000..e508827e7
--- /dev/null
+++ b/hw/i386/sgx-epc.c
@@ -0,0 +1,185 @@
+/*
+ * SGX EPC device
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Authors:
+ * Sean Christopherson <sean.j.christopherson@intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "hw/i386/pc.h"
+#include "hw/i386/sgx-epc.h"
+#include "hw/mem/memory-device.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qapi/visitor.h"
+#include "target/i386/cpu.h"
+#include "exec/address-spaces.h"
+
+static Property sgx_epc_properties[] = {
+ DEFINE_PROP_UINT64(SGX_EPC_ADDR_PROP, SGXEPCDevice, addr, 0),
+ DEFINE_PROP_LINK(SGX_EPC_MEMDEV_PROP, SGXEPCDevice, hostmem,
+ TYPE_MEMORY_BACKEND_EPC, HostMemoryBackendEpc *),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void sgx_epc_get_size(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ Error *local_err = NULL;
+ uint64_t value;
+
+ value = memory_device_get_region_size(MEMORY_DEVICE(obj), &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+
+ visit_type_uint64(v, name, &value, errp);
+}
+
+static void sgx_epc_init(Object *obj)
+{
+ object_property_add(obj, SGX_EPC_SIZE_PROP, "uint64", sgx_epc_get_size,
+ NULL, NULL, NULL);
+}
+
+static void sgx_epc_realize(DeviceState *dev, Error **errp)
+{
+ PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+ X86MachineState *x86ms = X86_MACHINE(pcms);
+ MemoryDeviceState *md = MEMORY_DEVICE(dev);
+ SGXEPCState *sgx_epc = &pcms->sgx_epc;
+ SGXEPCDevice *epc = SGX_EPC(dev);
+ HostMemoryBackend *hostmem;
+ const char *path;
+
+ if (x86ms->boot_cpus != 0) {
+ error_setg(errp, "'" TYPE_SGX_EPC "' can't be created after vCPUs,"
+ "e.g. via -device");
+ return;
+ }
+
+ if (!epc->hostmem) {
+ error_setg(errp, "'" SGX_EPC_MEMDEV_PROP "' property is not set");
+ return;
+ }
+ hostmem = MEMORY_BACKEND(epc->hostmem);
+ if (host_memory_backend_is_mapped(hostmem)) {
+ path = object_get_canonical_path_component(OBJECT(hostmem));
+ error_setg(errp, "can't use already busy memdev: %s", path);
+ return;
+ }
+
+ epc->addr = sgx_epc->base + sgx_epc->size;
+
+ memory_region_add_subregion(&sgx_epc->mr, epc->addr - sgx_epc->base,
+ host_memory_backend_get_memory(hostmem));
+
+ host_memory_backend_set_mapped(hostmem, true);
+
+ sgx_epc->sections = g_renew(SGXEPCDevice *, sgx_epc->sections,
+ sgx_epc->nr_sections + 1);
+ sgx_epc->sections[sgx_epc->nr_sections++] = epc;
+
+ sgx_epc->size += memory_device_get_region_size(md, errp);
+}
+
+static void sgx_epc_unrealize(DeviceState *dev)
+{
+ SGXEPCDevice *epc = SGX_EPC(dev);
+ HostMemoryBackend *hostmem = MEMORY_BACKEND(epc->hostmem);
+
+ host_memory_backend_set_mapped(hostmem, false);
+}
+
+static uint64_t sgx_epc_md_get_addr(const MemoryDeviceState *md)
+{
+ const SGXEPCDevice *epc = SGX_EPC(md);
+
+ return epc->addr;
+}
+
+static void sgx_epc_md_set_addr(MemoryDeviceState *md, uint64_t addr,
+ Error **errp)
+{
+ object_property_set_uint(OBJECT(md), SGX_EPC_ADDR_PROP, addr, errp);
+}
+
+static uint64_t sgx_epc_md_get_plugged_size(const MemoryDeviceState *md,
+ Error **errp)
+{
+ return 0;
+}
+
+static MemoryRegion *sgx_epc_md_get_memory_region(MemoryDeviceState *md,
+ Error **errp)
+{
+ SGXEPCDevice *epc = SGX_EPC(md);
+ HostMemoryBackend *hostmem;
+
+ if (!epc->hostmem) {
+ error_setg(errp, "'" SGX_EPC_MEMDEV_PROP "' property must be set");
+ return NULL;
+ }
+
+ hostmem = MEMORY_BACKEND(epc->hostmem);
+ return host_memory_backend_get_memory(hostmem);
+}
+
+static void sgx_epc_md_fill_device_info(const MemoryDeviceState *md,
+ MemoryDeviceInfo *info)
+{
+ SgxEPCDeviceInfo *se = g_new0(SgxEPCDeviceInfo, 1);
+ SGXEPCDevice *epc = SGX_EPC(md);
+
+ se->memaddr = epc->addr;
+ se->size = object_property_get_uint(OBJECT(epc), SGX_EPC_SIZE_PROP,
+ NULL);
+ se->memdev = object_get_canonical_path(OBJECT(epc->hostmem));
+
+ info->u.sgx_epc.data = se;
+ info->type = MEMORY_DEVICE_INFO_KIND_SGX_EPC;
+}
+
+static void sgx_epc_class_init(ObjectClass *oc, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(oc);
+ MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(oc);
+
+ dc->hotpluggable = false;
+ dc->realize = sgx_epc_realize;
+ dc->unrealize = sgx_epc_unrealize;
+ dc->desc = "SGX EPC section";
+ dc->user_creatable = false;
+ device_class_set_props(dc, sgx_epc_properties);
+
+ mdc->get_addr = sgx_epc_md_get_addr;
+ mdc->set_addr = sgx_epc_md_set_addr;
+ mdc->get_plugged_size = sgx_epc_md_get_plugged_size;
+ mdc->get_memory_region = sgx_epc_md_get_memory_region;
+ mdc->fill_device_info = sgx_epc_md_fill_device_info;
+}
+
+static TypeInfo sgx_epc_info = {
+ .name = TYPE_SGX_EPC,
+ .parent = TYPE_DEVICE,
+ .instance_size = sizeof(SGXEPCDevice),
+ .instance_init = sgx_epc_init,
+ .class_init = sgx_epc_class_init,
+ .class_size = sizeof(DeviceClass),
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_MEMORY_DEVICE },
+ { }
+ },
+};
+
+static void sgx_epc_register_types(void)
+{
+ type_register_static(&sgx_epc_info);
+}
+
+type_init(sgx_epc_register_types)
diff --git a/hw/i386/sgx-stub.c b/hw/i386/sgx-stub.c
new file mode 100644
index 000000000..c9b379e66
--- /dev/null
+++ b/hw/i386/sgx-stub.c
@@ -0,0 +1,34 @@
+#include "qemu/osdep.h"
+#include "monitor/monitor.h"
+#include "monitor/hmp-target.h"
+#include "hw/i386/pc.h"
+#include "hw/i386/sgx-epc.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-misc-target.h"
+
+SGXInfo *qmp_query_sgx(Error **errp)
+{
+ error_setg(errp, "SGX support is not compiled in");
+ return NULL;
+}
+
+SGXInfo *qmp_query_sgx_capabilities(Error **errp)
+{
+ error_setg(errp, "SGX support is not compiled in");
+ return NULL;
+}
+
+void hmp_info_sgx(Monitor *mon, const QDict *qdict)
+{
+ monitor_printf(mon, "SGX is not available in this QEMU\n");
+}
+
+void pc_machine_init_sgx_epc(PCMachineState *pcms)
+{
+ memset(&pcms->sgx_epc, 0, sizeof(SGXEPCState));
+}
+
+bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size)
+{
+ g_assert_not_reached();
+}
diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c
new file mode 100644
index 000000000..8fef3dd8f
--- /dev/null
+++ b/hw/i386/sgx.c
@@ -0,0 +1,243 @@
+/*
+ * SGX common code
+ *
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * Authors:
+ * Yang Zhong <yang.zhong@intel.com>
+ * Sean Christopherson <sean.j.christopherson@intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "hw/i386/pc.h"
+#include "hw/i386/sgx-epc.h"
+#include "hw/mem/memory-device.h"
+#include "monitor/qdev.h"
+#include "monitor/monitor.h"
+#include "monitor/hmp-target.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-misc-target.h"
+#include "exec/address-spaces.h"
+#include "sysemu/hw_accel.h"
+#include "sysemu/reset.h"
+#include <sys/ioctl.h>
+
+#define SGX_MAX_EPC_SECTIONS 8
+#define SGX_CPUID_EPC_INVALID 0x0
+
+/* A valid EPC section. */
+#define SGX_CPUID_EPC_SECTION 0x1
+#define SGX_CPUID_EPC_MASK 0xF
+
+#define SGX_MAGIC 0xA4
+#define SGX_IOC_VEPC_REMOVE_ALL _IO(SGX_MAGIC, 0x04)
+
+#define RETRY_NUM 2
+
+static uint64_t sgx_calc_section_metric(uint64_t low, uint64_t high)
+{
+ return (low & MAKE_64BIT_MASK(12, 20)) +
+ ((high & MAKE_64BIT_MASK(0, 20)) << 32);
+}
+
+static uint64_t sgx_calc_host_epc_section_size(void)
+{
+ uint32_t i, type;
+ uint32_t eax, ebx, ecx, edx;
+ uint64_t size = 0;
+
+ for (i = 0; i < SGX_MAX_EPC_SECTIONS; i++) {
+ host_cpuid(0x12, i + 2, &eax, &ebx, &ecx, &edx);
+
+ type = eax & SGX_CPUID_EPC_MASK;
+ if (type == SGX_CPUID_EPC_INVALID) {
+ break;
+ }
+
+ if (type != SGX_CPUID_EPC_SECTION) {
+ break;
+ }
+
+ size += sgx_calc_section_metric(ecx, edx);
+ }
+
+ return size;
+}
+
+static void sgx_epc_reset(void *opaque)
+{
+ PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+ HostMemoryBackend *hostmem;
+ SGXEPCDevice *epc;
+ int failures;
+ int fd, i, j, r;
+ static bool warned = false;
+
+ /*
+ * The second pass is needed to remove SECS pages that could not
+ * be removed during the first.
+ */
+ for (i = 0; i < RETRY_NUM; i++) {
+ failures = 0;
+ for (j = 0; j < pcms->sgx_epc.nr_sections; j++) {
+ epc = pcms->sgx_epc.sections[j];
+ hostmem = MEMORY_BACKEND(epc->hostmem);
+ fd = memory_region_get_fd(host_memory_backend_get_memory(hostmem));
+
+ r = ioctl(fd, SGX_IOC_VEPC_REMOVE_ALL);
+ if (r == -ENOTTY && !warned) {
+ warned = true;
+ warn_report("kernel does not support SGX_IOC_VEPC_REMOVE_ALL");
+ warn_report("SGX might operate incorrectly in the guest after reset");
+ break;
+ } else if (r > 0) {
+ /* SECS pages remain */
+ failures++;
+ if (i == 1) {
+ error_report("cannot reset vEPC section %d", j);
+ }
+ }
+ }
+ if (!failures) {
+ break;
+ }
+ }
+}
+
+SGXInfo *qmp_query_sgx_capabilities(Error **errp)
+{
+ SGXInfo *info = NULL;
+ uint32_t eax, ebx, ecx, edx;
+
+ int fd = qemu_open_old("/dev/sgx_vepc", O_RDWR);
+ if (fd < 0) {
+ error_setg(errp, "SGX is not enabled in KVM");
+ return NULL;
+ }
+
+ info = g_new0(SGXInfo, 1);
+ host_cpuid(0x7, 0, &eax, &ebx, &ecx, &edx);
+
+ info->sgx = ebx & (1U << 2) ? true : false;
+ info->flc = ecx & (1U << 30) ? true : false;
+
+ host_cpuid(0x12, 0, &eax, &ebx, &ecx, &edx);
+ info->sgx1 = eax & (1U << 0) ? true : false;
+ info->sgx2 = eax & (1U << 1) ? true : false;
+
+ info->section_size = sgx_calc_host_epc_section_size();
+
+ close(fd);
+
+ return info;
+}
+
+SGXInfo *qmp_query_sgx(Error **errp)
+{
+ SGXInfo *info = NULL;
+ X86MachineState *x86ms;
+ PCMachineState *pcms =
+ (PCMachineState *)object_dynamic_cast(qdev_get_machine(),
+ TYPE_PC_MACHINE);
+ if (!pcms) {
+ error_setg(errp, "SGX is only supported on PC machines");
+ return NULL;
+ }
+
+ x86ms = X86_MACHINE(pcms);
+ if (!x86ms->sgx_epc_list) {
+ error_setg(errp, "No EPC regions defined, SGX not available");
+ return NULL;
+ }
+
+ SGXEPCState *sgx_epc = &pcms->sgx_epc;
+ info = g_new0(SGXInfo, 1);
+
+ info->sgx = true;
+ info->sgx1 = true;
+ info->sgx2 = true;
+ info->flc = true;
+ info->section_size = sgx_epc->size;
+
+ return info;
+}
+
+void hmp_info_sgx(Monitor *mon, const QDict *qdict)
+{
+ Error *err = NULL;
+ g_autoptr(SGXInfo) info = qmp_query_sgx(&err);
+
+ if (err) {
+ error_report_err(err);
+ return;
+ }
+ monitor_printf(mon, "SGX support: %s\n",
+ info->sgx ? "enabled" : "disabled");
+ monitor_printf(mon, "SGX1 support: %s\n",
+ info->sgx1 ? "enabled" : "disabled");
+ monitor_printf(mon, "SGX2 support: %s\n",
+ info->sgx2 ? "enabled" : "disabled");
+ monitor_printf(mon, "FLC support: %s\n",
+ info->flc ? "enabled" : "disabled");
+ monitor_printf(mon, "size: %" PRIu64 "\n",
+ info->section_size);
+}
+
+bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size)
+{
+ PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+ SGXEPCDevice *epc;
+
+ if (pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) {
+ return true;
+ }
+
+ epc = pcms->sgx_epc.sections[section_nr];
+
+ *addr = epc->addr;
+ *size = memory_device_get_region_size(MEMORY_DEVICE(epc), &error_fatal);
+
+ return false;
+}
+
+void pc_machine_init_sgx_epc(PCMachineState *pcms)
+{
+ SGXEPCState *sgx_epc = &pcms->sgx_epc;
+ X86MachineState *x86ms = X86_MACHINE(pcms);
+ SgxEPCList *list = NULL;
+ Object *obj;
+
+ memset(sgx_epc, 0, sizeof(SGXEPCState));
+ if (!x86ms->sgx_epc_list) {
+ return;
+ }
+
+ sgx_epc->base = 0x100000000ULL + x86ms->above_4g_mem_size;
+
+ memory_region_init(&sgx_epc->mr, OBJECT(pcms), "sgx-epc", UINT64_MAX);
+ memory_region_add_subregion(get_system_memory(), sgx_epc->base,
+ &sgx_epc->mr);
+
+ for (list = x86ms->sgx_epc_list; list; list = list->next) {
+ obj = object_new("sgx-epc");
+
+ /* set the memdev link with memory backend */
+ object_property_parse(obj, SGX_EPC_MEMDEV_PROP, list->value->memdev,
+ &error_fatal);
+ object_property_set_bool(obj, "realized", true, &error_fatal);
+ object_unref(obj);
+ }
+
+ if ((sgx_epc->base + sgx_epc->size) < sgx_epc->base) {
+ error_report("Size of all 'sgx-epc' =0x%"PRIu64" causes EPC to wrap",
+ sgx_epc->size);
+ exit(EXIT_FAILURE);
+ }
+
+ memory_region_set_size(&sgx_epc->mr, sgx_epc->size);
+
+ /* register the reset callback for sgx epc */
+ qemu_register_reset(sgx_epc_reset, NULL);
+}
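+
+/*
+ * Example invocation (illustrative): exposing a 64 MiB EPC section to the
+ * guest combines a memory-backend-epc object with the machine's sgx-epc
+ * compound property:
+ *
+ *   qemu-system-x86_64 -cpu host,+sgx-provisionkey \
+ *     -object memory-backend-epc,id=mem1,size=64M,prealloc=on \
+ *     -M q35,sgx-epc.0.memdev=mem1
+ *
+ * See docs/system/i386/sgx.rst for the full option spelling.
+ */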
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
new file mode 100644
index 000000000..5bf7e52bf
--- /dev/null
+++ b/hw/i386/trace-events
@@ -0,0 +1,121 @@
+# See docs/devel/tracing.rst for syntax documentation.
+
+# x86-iommu.c
+x86_iommu_iec_notify(bool global, uint32_t index, uint32_t mask) "Notify IEC invalidation: global=%d index=%" PRIu32 " mask=%" PRIu32
+
+# intel_iommu.c
+vtd_inv_desc(const char *type, uint64_t hi, uint64_t lo) "invalidate desc type %s high 0x%"PRIx64" low 0x%"PRIx64
+vtd_inv_desc_cc_domain(uint16_t domain) "context invalidate domain 0x%"PRIx16
+vtd_inv_desc_cc_global(void) "context invalidate globally"
+vtd_inv_desc_cc_device(uint8_t bus, uint8_t dev, uint8_t fn) "context invalidate device %02"PRIx8":%02"PRIx8".%02"PRIx8
+vtd_inv_desc_cc_devices(uint16_t sid, uint16_t fmask) "context invalidate devices sid 0x%"PRIx16" fmask 0x%"PRIx16
+vtd_inv_desc_iotlb_global(void) "iotlb invalidate global"
+vtd_inv_desc_iotlb_domain(uint16_t domain) "iotlb invalidate whole domain 0x%"PRIx16
+vtd_inv_desc_iotlb_pages(uint16_t domain, uint64_t addr, uint8_t mask) "iotlb invalidate domain 0x%"PRIx16" addr 0x%"PRIx64" mask 0x%"PRIx8
+vtd_inv_desc_wait_sw(uint64_t addr, uint32_t data) "wait invalidate status write addr 0x%"PRIx64" data 0x%"PRIx32
+vtd_inv_desc_wait_irq(const char *msg) "%s"
+vtd_inv_desc_wait_write_fail(uint64_t hi, uint64_t lo) "write fail for wait desc hi 0x%"PRIx64" lo 0x%"PRIx64
+vtd_inv_desc_iec(uint32_t granularity, uint32_t index, uint32_t mask) "granularity 0x%"PRIx32" index 0x%"PRIx32" mask 0x%"PRIx32
+vtd_inv_qi_enable(bool enable) "enabled %d"
+vtd_inv_qi_setup(uint64_t addr, int size) "addr 0x%"PRIx64" size %d"
+vtd_inv_qi_head(uint16_t head) "read head %d"
+vtd_inv_qi_tail(uint16_t head) "write tail %d"
+vtd_inv_qi_fetch(void) ""
+vtd_context_cache_reset(void) ""
+vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present"
+vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present"
+vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16
+vtd_iotlb_page_update(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page update sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16
+vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen) "IOTLB context hit bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32
+vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32
+vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)"
+vtd_fault_disabled(void) "Fault processing disabled for context entry"
+vtd_replay_ce_valid(const char *mode, uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "%s: replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64
+vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
+vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
+vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
+vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64
+vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
+vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
+vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
+vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
+vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
+vtd_translate_pt(uint16_t sid, uint64_t addr) "source id 0x%"PRIu16", iova 0x%"PRIx64
+vtd_pt_enable_fast_path(uint16_t sid, bool success) "sid 0x%"PRIu16" %d"
+vtd_irq_generate(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64
+vtd_reg_read(uint64_t addr, uint64_t size) "addr 0x%"PRIx64" size 0x%"PRIx64
+vtd_reg_write(uint64_t addr, uint64_t size, uint64_t val) "addr 0x%"PRIx64" size 0x%"PRIx64" value 0x%"PRIx64
+vtd_reg_dmar_root(uint64_t addr, bool scalable) "addr 0x%"PRIx64" scalable %d"
+vtd_reg_ir_root(uint64_t addr, uint32_t size) "addr 0x%"PRIx64" size 0x%"PRIx32
+vtd_reg_write_gcmd(uint32_t status, uint32_t val) "status 0x%"PRIx32" value 0x%"PRIx32
+vtd_reg_write_fectl(uint32_t value) "value 0x%"PRIx32
+vtd_reg_write_iectl(uint32_t value) "value 0x%"PRIx32
+vtd_reg_ics_clear_ip(void) ""
+vtd_dmar_translate(uint8_t bus, uint8_t slot, uint8_t func, uint64_t iova, uint64_t gpa, uint64_t mask) "dev %02x:%02x.%02x iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64
+vtd_dmar_enable(bool en) "enable %d"
+vtd_dmar_fault(uint16_t sid, int fault, uint64_t addr, bool is_write) "sid 0x%"PRIx16" fault %d addr 0x%"PRIx64" write %d"
+vtd_ir_enable(bool en) "enable %d"
+vtd_ir_irte_get(int index, uint64_t lo, uint64_t hi) "index %d low 0x%"PRIx64" high 0x%"PRIx64
+vtd_ir_remap(int index, int tri, int vec, int deliver, uint32_t dest, int dest_mode) "index %d trigger %d vector %d deliver %d dest 0x%"PRIx32" mode %d"
+vtd_ir_remap_type(const char *type) "%s"
+vtd_ir_remap_msi(uint64_t addr, uint64_t data, uint64_t addr2, uint64_t data2) "(addr 0x%"PRIx64", data 0x%"PRIx64") -> (addr 0x%"PRIx64", data 0x%"PRIx64")"
+vtd_ir_remap_msi_req(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64
+vtd_fsts_ppf(bool set) "FSTS PPF bit set to %d"
+vtd_fsts_clear_ip(void) ""
+vtd_frr_new(int index, uint64_t hi, uint64_t lo) "index %d high 0x%"PRIx64" low 0x%"PRIx64
+vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
+vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)"
+vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)"
+
+# amd_iommu.c
+amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32
+amdvi_cache_update(uint16_t domid, uint8_t bus, uint8_t slot, uint8_t func, uint64_t gpa, uint64_t txaddr) " update iotlb domid 0x%"PRIx16" devid: %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64
+amdvi_completion_wait_fail(uint64_t addr) "error: fail to write at address 0x%"PRIx64
+amdvi_mmio_write(const char *reg, uint64_t addr, unsigned size, uint64_t val, uint64_t offset) "%s write addr 0x%"PRIx64", size %u, val 0x%"PRIx64", offset 0x%"PRIx64
+amdvi_mmio_read(const char *reg, uint64_t addr, unsigned size, uint64_t offset) "%s read addr 0x%"PRIx64", size %u offset 0x%"PRIx64
+amdvi_mmio_read_invalid(int max, uint64_t addr, unsigned size) "error: addr outside region (max 0x%x): read addr 0x%" PRIx64 ", size %u"
+amdvi_command_error(uint64_t status) "error: Executing commands with command buffer disabled 0x%"PRIx64
+amdvi_command_read_fail(uint64_t addr, uint32_t head) "error: fail to access memory at 0x%"PRIx64" + 0x%"PRIx32
+amdvi_command_exec(uint32_t head, uint32_t tail, uint64_t buf) "command buffer head at 0x%"PRIx32" command buffer tail at 0x%"PRIx32" command buffer base at 0x%"PRIx64
+amdvi_unhandled_command(uint8_t type) "unhandled command 0x%"PRIx8
+amdvi_intr_inval(void) "Interrupt table invalidated"
+amdvi_iotlb_inval(void) "IOTLB pages invalidated"
+amdvi_prefetch_pages(void) "Pre-fetch of AMD-Vi pages requested"
+amdvi_pages_inval(uint16_t domid) "AMD-Vi pages for domain 0x%"PRIx16 " invalidated"
+amdvi_all_inval(void) "Invalidation of all AMD-Vi cache requested "
+amdvi_ppr_exec(void) "Execution of PPR queue requested "
+amdvi_devtab_inval(uint8_t bus, uint8_t slot, uint8_t func) "device table entry for devid: %02x:%02x.%x invalidated"
+amdvi_completion_wait(uint64_t addr, uint64_t data) "completion wait requested with store address 0x%"PRIx64" and store data 0x%"PRIx64
+amdvi_control_status(uint64_t val) "MMIO_STATUS state 0x%"PRIx64
+amdvi_iotlb_reset(void) "IOTLB exceed size limit - reset "
+amdvi_dte_get_fail(uint64_t addr, uint32_t offset) "error: failed to access Device Entry devtab 0x%"PRIx64" offset 0x%"PRIx32
+amdvi_invalid_dte(uint64_t addr) "PTE entry at 0x%"PRIx64" is invalid "
+amdvi_get_pte_hwerror(uint64_t addr) "hardware error accessing PTE at addr 0x%"PRIx64
+amdvi_mode_invalid(uint8_t level, uint64_t addr) "error: translation level 0x%"PRIx8" translating addr 0x%"PRIx64
+amdvi_page_fault(uint64_t addr) "error: page fault accessing guest physical address 0x%"PRIx64
+amdvi_iotlb_hit(uint8_t bus, uint8_t slot, uint8_t func, uint64_t addr, uint64_t txaddr) "hit iotlb devid %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64
+amdvi_translation_result(uint8_t bus, uint8_t slot, uint8_t func, uint64_t addr, uint64_t txaddr) "devid: %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64
+amdvi_mem_ir_write_req(uint64_t addr, uint64_t val, uint32_t size) "addr 0x%"PRIx64" data 0x%"PRIx64" size 0x%"PRIx32
+amdvi_mem_ir_write(uint64_t addr, uint64_t val) "addr 0x%"PRIx64" data 0x%"PRIx64
+amdvi_ir_remap_msi_req(uint64_t addr, uint64_t data, uint8_t devid) "addr 0x%"PRIx64" data 0x%"PRIx64" devid 0x%"PRIx8
+amdvi_ir_remap_msi(uint64_t addr, uint64_t data, uint64_t addr2, uint64_t data2) "(addr 0x%"PRIx64", data 0x%"PRIx64") -> (addr 0x%"PRIx64", data 0x%"PRIx64")"
+amdvi_err(const char *str) "%s"
+amdvi_ir_irte(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" offset 0x%"PRIx64
+amdvi_ir_irte_val(uint32_t data) "data 0x%"PRIx32
+amdvi_ir_err(const char *str) "%s"
+amdvi_ir_intctl(uint8_t val) "int_ctl 0x%"PRIx8
+amdvi_ir_target_abort(const char *str) "%s"
+amdvi_ir_delivery_mode(const char *str) "%s"
+amdvi_ir_irte_ga_val(uint64_t hi, uint64_t lo) "hi 0x%"PRIx64" lo 0x%"PRIx64
+
+# vmport.c
+vmport_register(unsigned char command, void *func, void *opaque) "command: 0x%02x func: %p opaque: %p"
+vmport_command(unsigned char command) "command: 0x%02x"
+
+# x86.c
+x86_gsi_interrupt(int irqn, int level) "GSI interrupt #%d level:%d"
+x86_pic_interrupt(int irqn, int level) "PIC interrupt #%d level:%d"
+
+# port92.c
+port92_read(uint8_t val) "port92: read 0x%02x"
+port92_write(uint8_t val) "port92: write 0x%02x"
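+
+# Note (illustrative): any of the events above can be enabled at run time,
+# e.g. with "-trace port92_write" or a glob such as -trace "vtd_inv_*".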
diff --git a/hw/i386/trace.h b/hw/i386/trace.h
new file mode 100644
index 000000000..37a9f67e5
--- /dev/null
+++ b/hw/i386/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-hw_i386.h"
diff --git a/hw/i386/vmmouse.c b/hw/i386/vmmouse.c
new file mode 100644
index 000000000..3d6636828
--- /dev/null
+++ b/hw/i386/vmmouse.c
@@ -0,0 +1,327 @@
+/*
+ * QEMU VMMouse emulation
+ *
+ * Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "ui/console.h"
+#include "hw/i386/vmport.h"
+#include "hw/input/i8042.h"
+#include "hw/qdev-properties.h"
+#include "migration/vmstate.h"
+#include "cpu.h"
+#include "qom/object.h"
+
+/* debug only vmmouse */
+//#define DEBUG_VMMOUSE
+
+#define VMMOUSE_READ_ID 0x45414552
+#define VMMOUSE_DISABLE 0x000000f5
+#define VMMOUSE_REQUEST_RELATIVE 0x4c455252
+#define VMMOUSE_REQUEST_ABSOLUTE 0x53424152
+
+#define VMMOUSE_QUEUE_SIZE 1024
+
+#define VMMOUSE_VERSION 0x3442554a
+
+#ifdef DEBUG_VMMOUSE
+#define DPRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__)
+#else
+#define DPRINTF(fmt, ...) do { } while (0)
+#endif
+
+#define TYPE_VMMOUSE "vmmouse"
+OBJECT_DECLARE_SIMPLE_TYPE(VMMouseState, VMMOUSE)
+
+struct VMMouseState {
+ ISADevice parent_obj;
+
+ uint32_t queue[VMMOUSE_QUEUE_SIZE];
+ int32_t queue_size;
+ uint16_t nb_queue;
+ uint16_t status;
+ uint8_t absolute;
+ QEMUPutMouseEntry *entry;
+ ISAKBDState *i8042;
+};
+
+static void vmmouse_get_data(uint32_t *data)
+{
+ X86CPU *cpu = X86_CPU(current_cpu);
+ CPUX86State *env = &cpu->env;
+
+ data[0] = env->regs[R_EAX]; data[1] = env->regs[R_EBX];
+ data[2] = env->regs[R_ECX]; data[3] = env->regs[R_EDX];
+ data[4] = env->regs[R_ESI]; data[5] = env->regs[R_EDI];
+}
+
+static void vmmouse_set_data(const uint32_t *data)
+{
+ X86CPU *cpu = X86_CPU(current_cpu);
+ CPUX86State *env = &cpu->env;
+
+ env->regs[R_EAX] = data[0]; env->regs[R_EBX] = data[1];
+ env->regs[R_ECX] = data[2]; env->regs[R_EDX] = data[3];
+ env->regs[R_ESI] = data[4]; env->regs[R_EDI] = data[5];
+}
+
+static uint32_t vmmouse_get_status(VMMouseState *s)
+{
+ DPRINTF("vmmouse_get_status()\n");
+ return (s->status << 16) | s->nb_queue;
+}
+
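+/*
+ * Queue one mouse event as four 32-bit words (buttons, x, y, dz); this is
+ * the record layout the guest driver later fetches via
+ * VMPORT_CMD_VMMOUSE_DATA / vmmouse_data().
+ */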
+static void vmmouse_mouse_event(void *opaque, int x, int y, int dz, int buttons_state)
+{
+ VMMouseState *s = opaque;
+ int buttons = 0;
+
+ if (s->nb_queue > (VMMOUSE_QUEUE_SIZE - 4))
+ return;
+
+ DPRINTF("vmmouse_mouse_event(%d, %d, %d, %d)\n",
+ x, y, dz, buttons_state);
+
+ if ((buttons_state & MOUSE_EVENT_LBUTTON))
+ buttons |= 0x20;
+ if ((buttons_state & MOUSE_EVENT_RBUTTON))
+ buttons |= 0x10;
+ if ((buttons_state & MOUSE_EVENT_MBUTTON))
+ buttons |= 0x08;
+
+ if (s->absolute) {
+ x <<= 1;
+ y <<= 1;
+ }
+
+ s->queue[s->nb_queue++] = buttons;
+ s->queue[s->nb_queue++] = x;
+ s->queue[s->nb_queue++] = y;
+ s->queue[s->nb_queue++] = dz;
+
+ /* we still need to generate a PS/2 event to notify the driver to
+ read from the queue */
+ i8042_isa_mouse_fake_event(s->i8042);
+}
+
+static void vmmouse_remove_handler(VMMouseState *s)
+{
+ if (s->entry) {
+ qemu_remove_mouse_event_handler(s->entry);
+ s->entry = NULL;
+ }
+}
+
+static void vmmouse_update_handler(VMMouseState *s, int absolute)
+{
+ if (s->status != 0) {
+ return;
+ }
+ if (s->absolute != absolute) {
+ s->absolute = absolute;
+ vmmouse_remove_handler(s);
+ }
+ if (s->entry == NULL) {
+ s->entry = qemu_add_mouse_event_handler(vmmouse_mouse_event,
+ s, s->absolute,
+ "vmmouse");
+ qemu_activate_mouse_event_handler(s->entry);
+ }
+}
+
+static void vmmouse_read_id(VMMouseState *s)
+{
+ DPRINTF("vmmouse_read_id()\n");
+
+ if (s->nb_queue == VMMOUSE_QUEUE_SIZE)
+ return;
+
+ s->queue[s->nb_queue++] = VMMOUSE_VERSION;
+ s->status = 0;
+ vmmouse_update_handler(s, s->absolute);
+}
+
+static void vmmouse_request_relative(VMMouseState *s)
+{
+ DPRINTF("vmmouse_request_relative()\n");
+ vmmouse_update_handler(s, 0);
+}
+
+static void vmmouse_request_absolute(VMMouseState *s)
+{
+ DPRINTF("vmmouse_request_absolute()\n");
+ vmmouse_update_handler(s, 1);
+}
+
+static void vmmouse_disable(VMMouseState *s)
+{
+ DPRINTF("vmmouse_disable()\n");
+ s->status = 0xffff;
+ vmmouse_remove_handler(s);
+}
+
+static void vmmouse_data(VMMouseState *s, uint32_t *data, uint32_t size)
+{
+ int i;
+
+ DPRINTF("vmmouse_data(%d)\n", size);
+
+ if (size == 0 || size > 6 || size > s->nb_queue) {
+ printf("vmmouse: driver requested too much data %d\n", size);
+ s->status = 0xffff;
+ vmmouse_remove_handler(s);
+ return;
+ }
+
+ for (i = 0; i < size; i++)
+ data[i] = s->queue[i];
+
+ s->nb_queue -= size;
+ if (s->nb_queue)
+ memmove(s->queue, &s->queue[size], sizeof(s->queue[0]) * s->nb_queue);
+}
+
+static uint32_t vmmouse_ioport_read(void *opaque, uint32_t addr)
+{
+ VMMouseState *s = opaque;
+ uint32_t data[6];
+ uint16_t command;
+
+ vmmouse_get_data(data);
+
+ command = data[2] & 0xFFFF;
+
+ switch (command) {
+ case VMPORT_CMD_VMMOUSE_STATUS:
+ data[0] = vmmouse_get_status(s);
+ break;
+ case VMPORT_CMD_VMMOUSE_COMMAND:
+ switch (data[1]) {
+ case VMMOUSE_DISABLE:
+ vmmouse_disable(s);
+ break;
+ case VMMOUSE_READ_ID:
+ vmmouse_read_id(s);
+ break;
+ case VMMOUSE_REQUEST_RELATIVE:
+ vmmouse_request_relative(s);
+ break;
+ case VMMOUSE_REQUEST_ABSOLUTE:
+ vmmouse_request_absolute(s);
+ break;
+ default:
+ printf("vmmouse: unknown command %x\n", data[1]);
+ break;
+ }
+ break;
+ case VMPORT_CMD_VMMOUSE_DATA:
+ vmmouse_data(s, data, data[1]);
+ break;
+ default:
+ printf("vmmouse: unknown command %x\n", command);
+ break;
+ }
+
+ vmmouse_set_data(data);
+ return data[0];
+}
+
+static int vmmouse_post_load(void *opaque, int version_id)
+{
+ VMMouseState *s = opaque;
+
+ vmmouse_remove_handler(s);
+ vmmouse_update_handler(s, s->absolute);
+ return 0;
+}
+
+static const VMStateDescription vmstate_vmmouse = {
+ .name = "vmmouse",
+ .version_id = 0,
+ .minimum_version_id = 0,
+ .post_load = vmmouse_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_INT32_EQUAL(queue_size, VMMouseState, NULL),
+ VMSTATE_UINT32_ARRAY(queue, VMMouseState, VMMOUSE_QUEUE_SIZE),
+ VMSTATE_UINT16(nb_queue, VMMouseState),
+ VMSTATE_UINT16(status, VMMouseState),
+ VMSTATE_UINT8(absolute, VMMouseState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static void vmmouse_reset(DeviceState *d)
+{
+ VMMouseState *s = VMMOUSE(d);
+
+ s->queue_size = VMMOUSE_QUEUE_SIZE;
+ s->nb_queue = 0;
+
+ vmmouse_disable(s);
+}
+
+static void vmmouse_realizefn(DeviceState *dev, Error **errp)
+{
+ VMMouseState *s = VMMOUSE(dev);
+
+ DPRINTF("vmmouse_init\n");
+
+ if (!object_resolve_path_type("", TYPE_VMPORT, NULL)) {
+ error_setg(errp, "vmmouse needs a machine with vmport");
+ return;
+ }
+
+ vmport_register(VMPORT_CMD_VMMOUSE_STATUS, vmmouse_ioport_read, s);
+ vmport_register(VMPORT_CMD_VMMOUSE_COMMAND, vmmouse_ioport_read, s);
+ vmport_register(VMPORT_CMD_VMMOUSE_DATA, vmmouse_ioport_read, s);
+}
+
+static Property vmmouse_properties[] = {
+ DEFINE_PROP_LINK("i8042", VMMouseState, i8042, TYPE_I8042, ISAKBDState *),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vmmouse_class_initfn(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->realize = vmmouse_realizefn;
+ dc->reset = vmmouse_reset;
+ dc->vmsd = &vmstate_vmmouse;
+ device_class_set_props(dc, vmmouse_properties);
+ set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+}
+
+static const TypeInfo vmmouse_info = {
+ .name = TYPE_VMMOUSE,
+ .parent = TYPE_ISA_DEVICE,
+ .instance_size = sizeof(VMMouseState),
+ .class_init = vmmouse_class_initfn,
+};
+
+static void vmmouse_register_types(void)
+{
+ type_register_static(&vmmouse_info);
+}
+
+type_init(vmmouse_register_types)
diff --git a/hw/i386/vmport.c b/hw/i386/vmport.c
new file mode 100644
index 000000000..7cc75dbc6
--- /dev/null
+++ b/hw/i386/vmport.c
@@ -0,0 +1,313 @@
+/*
+ * QEMU VMPort emulation
+ *
+ * Copyright (C) 2007 Hervé Poussineau
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/*
+ * Guest code that interacts with this virtual device can be found
+ * in VMware open-vm-tools open-source project:
+ * https://github.com/vmware/open-vm-tools
+ */
+
+#include "qemu/osdep.h"
+#include "hw/isa/isa.h"
+#include "hw/i386/vmport.h"
+#include "hw/qdev-properties.h"
+#include "hw/boards.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/hw_accel.h"
+#include "sysemu/qtest.h"
+#include "qemu/log.h"
+#include "trace.h"
+#include "qom/object.h"
+
+#define VMPORT_MAGIC 0x564D5868
+
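+/*
+ * Protocol sketch (illustrative): the guest loads VMPORT_MAGIC into EAX and
+ * a command number into ECX, then executes an "in" on I/O port 0x5658
+ * (registered in vmport_realizefn() below); vmport_ioport_read() dispatches
+ * on ECX and the result is returned to the guest in EAX.
+ */
+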
+/* Compatibility flags for migration */
+#define VMPORT_COMPAT_READ_SET_EAX_BIT 0
+#define VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD_BIT 1
+#define VMPORT_COMPAT_REPORT_VMX_TYPE_BIT 2
+#define VMPORT_COMPAT_CMDS_V2_BIT 3
+#define VMPORT_COMPAT_READ_SET_EAX \
+ (1 << VMPORT_COMPAT_READ_SET_EAX_BIT)
+#define VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD \
+ (1 << VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD_BIT)
+#define VMPORT_COMPAT_REPORT_VMX_TYPE \
+ (1 << VMPORT_COMPAT_REPORT_VMX_TYPE_BIT)
+#define VMPORT_COMPAT_CMDS_V2 \
+ (1 << VMPORT_COMPAT_CMDS_V2_BIT)
+
+/* vCPU features reported by CMD_GET_VCPU_INFO */
+#define VCPU_INFO_SLC64_BIT 0
+#define VCPU_INFO_SYNC_VTSCS_BIT 1
+#define VCPU_INFO_HV_REPLAY_OK_BIT 2
+#define VCPU_INFO_LEGACY_X2APIC_BIT 3
+#define VCPU_INFO_RESERVED_BIT 31
+
+OBJECT_DECLARE_SIMPLE_TYPE(VMPortState, VMPORT)
+
+struct VMPortState {
+ ISADevice parent_obj;
+
+ MemoryRegion io;
+ VMPortReadFunc *func[VMPORT_ENTRIES];
+ void *opaque[VMPORT_ENTRIES];
+
+ uint32_t vmware_vmx_version;
+ uint8_t vmware_vmx_type;
+
+ uint32_t compat_flags;
+};
+
+static VMPortState *port_state;
+
+void vmport_register(VMPortCommand command, VMPortReadFunc *func, void *opaque)
+{
+ assert(command < VMPORT_ENTRIES);
+ assert(port_state);
+
+ trace_vmport_register(command, func, opaque);
+ port_state->func[command] = func;
+ port_state->opaque[command] = opaque;
+}
+
+static uint64_t vmport_ioport_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+ VMPortState *s = opaque;
+ CPUState *cs = current_cpu;
+ X86CPU *cpu = X86_CPU(cs);
+ CPUX86State *env;
+ unsigned char command;
+ uint32_t eax;
+
+ if (qtest_enabled()) {
+ return -1;
+ }
+ env = &cpu->env;
+ cpu_synchronize_state(cs);
+
+ eax = env->regs[R_EAX];
+ if (eax != VMPORT_MAGIC) {
+ goto err;
+ }
+
+ command = env->regs[R_ECX];
+ trace_vmport_command(command);
+ if (command >= VMPORT_ENTRIES || !s->func[command]) {
+ qemu_log_mask(LOG_UNIMP, "vmport: unknown command %x\n", command);
+ goto err;
+ }
+
+ eax = s->func[command](s->opaque[command], addr);
+ goto out;
+
+err:
+ if (s->compat_flags & VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD) {
+ eax = UINT32_MAX;
+ }
+
+out:
+ /*
+ * The call above to cpu_synchronize_state() reads the vCPU register
+ * values into QEMU, but it also causes QEMU to write its copy of the
+ * register values back to the vCPU implementation (e.g. an accelerator
+ * such as KVM) just before resuming the guest.
+ *
+ * Therefore, in order to make the IOPort return value propagate to
+ * guest EAX, we need to explicitly update QEMU's EAX register value.
+ */
+ if (s->compat_flags & VMPORT_COMPAT_READ_SET_EAX) {
+ cpu->env.regs[R_EAX] = eax;
+ }
+
+ return eax;
+}
+
+static void vmport_ioport_write(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+ X86CPU *cpu = X86_CPU(current_cpu);
+
+ if (qtest_enabled()) {
+ return;
+ }
+ cpu->env.regs[R_EAX] = vmport_ioport_read(opaque, addr, 4);
+}
+
+static uint32_t vmport_cmd_get_version(void *opaque, uint32_t addr)
+{
+ X86CPU *cpu = X86_CPU(current_cpu);
+
+ if (qtest_enabled()) {
+ return -1;
+ }
+ cpu->env.regs[R_EBX] = VMPORT_MAGIC;
+ if (port_state->compat_flags & VMPORT_COMPAT_REPORT_VMX_TYPE) {
+ cpu->env.regs[R_ECX] = port_state->vmware_vmx_type;
+ }
+ return port_state->vmware_vmx_version;
+}
+
+static uint32_t vmport_cmd_get_bios_uuid(void *opaque, uint32_t addr)
+{
+ X86CPU *cpu = X86_CPU(current_cpu);
+ uint32_t *uuid_parts = (uint32_t *)(qemu_uuid.data);
+
+ cpu->env.regs[R_EAX] = le32_to_cpu(uuid_parts[0]);
+ cpu->env.regs[R_EBX] = le32_to_cpu(uuid_parts[1]);
+ cpu->env.regs[R_ECX] = le32_to_cpu(uuid_parts[2]);
+ cpu->env.regs[R_EDX] = le32_to_cpu(uuid_parts[3]);
+ return cpu->env.regs[R_EAX];
+}
+
+static uint32_t vmport_cmd_ram_size(void *opaque, uint32_t addr)
+{
+ X86CPU *cpu = X86_CPU(current_cpu);
+
+ if (qtest_enabled()) {
+ return -1;
+ }
+ cpu->env.regs[R_EBX] = 0x1177;
+ return current_machine->ram_size;
+}
+
+static uint32_t vmport_cmd_get_hz(void *opaque, uint32_t addr)
+{
+ X86CPU *cpu = X86_CPU(current_cpu);
+
+ if (cpu->env.tsc_khz && cpu->env.apic_bus_freq) {
+ uint64_t tsc_freq = (uint64_t)cpu->env.tsc_khz * 1000;
+
+ cpu->env.regs[R_ECX] = cpu->env.apic_bus_freq;
+ cpu->env.regs[R_EBX] = (uint32_t)(tsc_freq >> 32);
+ cpu->env.regs[R_EAX] = (uint32_t)tsc_freq;
+ } else {
+ /* Signal cmd as not supported */
+ cpu->env.regs[R_EBX] = UINT32_MAX;
+ }
+
+ return cpu->env.regs[R_EAX];
+}
+
+static uint32_t vmport_cmd_get_vcpu_info(void *opaque, uint32_t addr)
+{
+ X86CPU *cpu = X86_CPU(current_cpu);
+ uint32_t ret = 0;
+
+ if (cpu->env.features[FEAT_1_ECX] & CPUID_EXT_X2APIC) {
+ ret |= 1 << VCPU_INFO_LEGACY_X2APIC_BIT;
+ }
+
+ return ret;
+}
+
+static const MemoryRegionOps vmport_ops = {
+ .read = vmport_ioport_read,
+ .write = vmport_ioport_write,
+ .impl = {
+ .min_access_size = 4,
+ .max_access_size = 4,
+ },
+ .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void vmport_realizefn(DeviceState *dev, Error **errp)
+{
+ ISADevice *isadev = ISA_DEVICE(dev);
+ VMPortState *s = VMPORT(dev);
+
+ memory_region_init_io(&s->io, OBJECT(s), &vmport_ops, s, "vmport", 1);
+ isa_register_ioport(isadev, &s->io, 0x5658);
+
+ port_state = s;
+
+ /* Register some generic port commands */
+ vmport_register(VMPORT_CMD_GETVERSION, vmport_cmd_get_version, NULL);
+ vmport_register(VMPORT_CMD_GETRAMSIZE, vmport_cmd_ram_size, NULL);
+ if (s->compat_flags & VMPORT_COMPAT_CMDS_V2) {
+ vmport_register(VMPORT_CMD_GETBIOSUUID, vmport_cmd_get_bios_uuid, NULL);
+ vmport_register(VMPORT_CMD_GETHZ, vmport_cmd_get_hz, NULL);
+ vmport_register(VMPORT_CMD_GET_VCPU_INFO, vmport_cmd_get_vcpu_info,
+ NULL);
+ }
+}
+
+static Property vmport_properties[] = {
+ /* Used to enforce compatibility for migration */
+ DEFINE_PROP_BIT("x-read-set-eax", VMPortState, compat_flags,
+ VMPORT_COMPAT_READ_SET_EAX_BIT, true),
+ DEFINE_PROP_BIT("x-signal-unsupported-cmd", VMPortState, compat_flags,
+ VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD_BIT, true),
+ DEFINE_PROP_BIT("x-report-vmx-type", VMPortState, compat_flags,
+ VMPORT_COMPAT_REPORT_VMX_TYPE_BIT, true),
+ DEFINE_PROP_BIT("x-cmds-v2", VMPortState, compat_flags,
+ VMPORT_COMPAT_CMDS_V2_BIT, true),
+
+ /* Default value taken from open-vm-tools code VERSION_MAGIC definition */
+ DEFINE_PROP_UINT32("vmware-vmx-version", VMPortState,
+ vmware_vmx_version, 6),
+ /*
+ * This value determines which VMware product type the host reports
+ * itself as to the guest.
+ *
+ * Most guests are fine with the host exposing itself as a VMware ESX
+ * server. Some legacy/proprietary guests hard-code a given type.
+ *
+ * For a complete list of values, refer to enum VMXType in the
+ * open-vm-tools project (defined in lib/include/vm_vmx_type.h).
+ *
+ * Reasonable options:
+ * 0 - Unset
+ * 1 - VMware Express (deprecated)
+ * 2 - VMware ESX Server
+ * 3 - VMware Server (Deprecated)
+ * 4 - VMware Workstation
+ * 5 - ACE 1.x (Deprecated)
+ */
+ DEFINE_PROP_UINT8("vmware-vmx-type", VMPortState, vmware_vmx_type, 2),
+
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vmport_class_initfn(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->realize = vmport_realizefn;
+ /* Reason: realize sets global port_state */
+ dc->user_creatable = false;
+ device_class_set_props(dc, vmport_properties);
+}
+
+static const TypeInfo vmport_info = {
+ .name = TYPE_VMPORT,
+ .parent = TYPE_ISA_DEVICE,
+ .instance_size = sizeof(VMPortState),
+ .class_init = vmport_class_initfn,
+};
+
+static void vmport_register_types(void)
+{
+ type_register_static(&vmport_info);
+}
+
+type_init(vmport_register_types)
diff --git a/hw/i386/x86-iommu-stub.c b/hw/i386/x86-iommu-stub.c
new file mode 100644
index 000000000..781b5ff92
--- /dev/null
+++ b/hw/i386/x86-iommu-stub.c
@@ -0,0 +1,38 @@
+/*
+ * Stubs for X86 IOMMU emulation
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/i386/x86-iommu.h"
+
+void x86_iommu_iec_register_notifier(X86IOMMUState *iommu,
+ iec_notify_fn fn, void *data)
+{
+}
+
+X86IOMMUState *x86_iommu_get_default(void)
+{
+ return NULL;
+}
+
+bool x86_iommu_ir_supported(X86IOMMUState *s)
+{
+ return false;
+}
diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c
new file mode 100644
index 000000000..01d11325a
--- /dev/null
+++ b/hw/i386/x86-iommu.c
@@ -0,0 +1,162 @@
+/*
+ * QEMU emulation of common X86 IOMMU
+ *
+ * Copyright (C) 2016 Peter Xu, Red Hat <peterx@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/sysbus.h"
+#include "hw/i386/x86-iommu.h"
+#include "hw/qdev-properties.h"
+#include "hw/i386/pc.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+#include "sysemu/kvm.h"
+
+void x86_iommu_iec_register_notifier(X86IOMMUState *iommu,
+ iec_notify_fn fn, void *data)
+{
+ IEC_Notifier *notifier = g_new0(IEC_Notifier, 1);
+
+ notifier->iec_notify = fn;
+ notifier->private = data;
+
+ QLIST_INSERT_HEAD(&iommu->iec_notifiers, notifier, list);
+}
+
+void x86_iommu_iec_notify_all(X86IOMMUState *iommu, bool global,
+ uint32_t index, uint32_t mask)
+{
+ IEC_Notifier *notifier;
+
+ trace_x86_iommu_iec_notify(global, index, mask);
+
+ QLIST_FOREACH(notifier, &iommu->iec_notifiers, list) {
+ if (notifier->iec_notify) {
+ notifier->iec_notify(notifier->private, global,
+ index, mask);
+ }
+ }
+}
+
+/* Generate one MSI message from X86IOMMUIrq info */
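+/*
+ * A rough sketch of the standard x86 MSI encoding that the bitfields below
+ * assemble (bit positions follow the architected MSI format):
+ *
+ *     addr = 0xFEE00000u | (dest << 12) | (redir_hint << 3) | (dest_mode << 2);
+ *     data = (trigger_mode << 15) | (level << 14) | (delivery_mode << 8) | vector;
+ */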
+void x86_iommu_irq_to_msi_message(X86IOMMUIrq *irq, MSIMessage *msg_out)
+{
+ X86IOMMU_MSIMessage msg = {};
+
+ /* Generate address bits */
+ msg.dest_mode = irq->dest_mode;
+ msg.redir_hint = irq->redir_hint;
+ msg.dest = irq->dest;
+ msg.__addr_hi = irq->dest & 0xffffff00;
+ msg.__addr_head = cpu_to_le32(0xfee);
+ /* Keep this from original MSI address bits */
+ msg.__not_used = irq->msi_addr_last_bits;
+
+ /* Generate data bits */
+ msg.vector = irq->vector;
+ msg.delivery_mode = irq->delivery_mode;
+ msg.level = 1;
+ msg.trigger_mode = irq->trigger_mode;
+
+ msg_out->address = msg.msi_addr;
+ msg_out->data = msg.msi_data;
+}
+
+X86IOMMUState *x86_iommu_get_default(void)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ PCMachineState *pcms =
+ PC_MACHINE(object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE));
+
+ if (pcms &&
+ object_dynamic_cast(OBJECT(pcms->iommu), TYPE_X86_IOMMU_DEVICE)) {
+ return X86_IOMMU_DEVICE(pcms->iommu);
+ }
+ return NULL;
+}
+
+static void x86_iommu_realize(DeviceState *dev, Error **errp)
+{
+ X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev);
+ X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_GET_CLASS(dev);
+ MachineState *ms = MACHINE(qdev_get_machine());
+ MachineClass *mc = MACHINE_GET_CLASS(ms);
+ PCMachineState *pcms =
+ PC_MACHINE(object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE));
+ QLIST_INIT(&x86_iommu->iec_notifiers);
+ bool irq_all_kernel = kvm_irqchip_in_kernel() && !kvm_irqchip_is_split();
+
+ if (!pcms || !pcms->bus) {
+ error_setg(errp, "Machine-type '%s' not supported by IOMMU",
+ mc->name);
+ return;
+ }
+
+ /* If the user didn't specify IR, choose a default value for it */
+ if (x86_iommu->intr_supported == ON_OFF_AUTO_AUTO) {
+ x86_iommu->intr_supported = irq_all_kernel ?
+ ON_OFF_AUTO_OFF : ON_OFF_AUTO_ON;
+ }
+
+ /* Both Intel and AMD IOMMU IR only support "kernel-irqchip={off|split}" */
+ if (x86_iommu_ir_supported(x86_iommu) && irq_all_kernel) {
+ error_setg(errp, "Interrupt Remapping cannot work with "
+ "kernel-irqchip=on, please use 'split|off'.");
+ return;
+ }
+
+ if (x86_class->realize) {
+ x86_class->realize(dev, errp);
+ }
+}
+
+static Property x86_iommu_properties[] = {
+ DEFINE_PROP_ON_OFF_AUTO("intremap", X86IOMMUState,
+ intr_supported, ON_OFF_AUTO_AUTO),
+ DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false),
+ DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void x86_iommu_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ dc->realize = x86_iommu_realize;
+ device_class_set_props(dc, x86_iommu_properties);
+}
+
+bool x86_iommu_ir_supported(X86IOMMUState *s)
+{
+ return s->intr_supported == ON_OFF_AUTO_ON;
+}
+
+static const TypeInfo x86_iommu_info = {
+ .name = TYPE_X86_IOMMU_DEVICE,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(X86IOMMUState),
+ .class_init = x86_iommu_class_init,
+ .class_size = sizeof(X86IOMMUClass),
+ .abstract = true,
+};
+
+static void x86_iommu_register_types(void)
+{
+ type_register_static(&x86_iommu_info);
+}
+
+type_init(x86_iommu_register_types)
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
new file mode 100644
index 000000000..b84840a1b
--- /dev/null
+++ b/hw/i386/x86.c
@@ -0,0 +1,1399 @@
+/*
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2019 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qemu/cutils.h"
+#include "qemu/units.h"
+#include "qemu-common.h"
+#include "qemu/datadir.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qapi-visit-common.h"
+#include "qapi/clone-visitor.h"
+#include "qapi/qapi-visit-machine.h"
+#include "qapi/visitor.h"
+#include "sysemu/qtest.h"
+#include "sysemu/whpx.h"
+#include "sysemu/numa.h"
+#include "sysemu/replay.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/cpu-timers.h"
+#include "trace.h"
+
+#include "hw/i386/x86.h"
+#include "target/i386/cpu.h"
+#include "hw/i386/topology.h"
+#include "hw/i386/fw_cfg.h"
+#include "hw/intc/i8259.h"
+#include "hw/rtc/mc146818rtc.h"
+#include "target/i386/sev.h"
+
+#include "hw/acpi/cpu_hotplug.h"
+#include "hw/irq.h"
+#include "hw/nmi.h"
+#include "hw/loader.h"
+#include "multiboot.h"
+#include "elf.h"
+#include "standard-headers/asm-x86/bootparam.h"
+#include CONFIG_DEVICES
+#include "kvm/kvm_i386.h"
+
+/* Physical Address of PVH entry point read from kernel ELF NOTE */
+static size_t pvh_start_addr;
+
+inline void init_topo_info(X86CPUTopoInfo *topo_info,
+ const X86MachineState *x86ms)
+{
+ MachineState *ms = MACHINE(x86ms);
+
+ topo_info->dies_per_pkg = ms->smp.dies;
+ topo_info->cores_per_die = ms->smp.cores;
+ topo_info->threads_per_core = ms->smp.threads;
+}
+
+/*
+ * Calculates initial APIC ID for a specific CPU index
+ *
+ * Currently we need to be able to calculate the APIC ID from the CPU index
+ * alone (without requiring a CPU object), as the QEMU<->Seabios interfaces have
+ * no concept of "CPU index", and the NUMA tables on fw_cfg need the APIC ID of
+ * all CPUs up to max_cpus.
+ */
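+/*
+ * As a rough sketch, x86_apicid_from_cpu_idx() packs the topology IDs into
+ * bit fields, each just wide enough (in bits, rounded up) to hold its
+ * maximum value:
+ *
+ *     apic_id = (pkg_id  << (smt_width + core_width + die_width)) |
+ *               (die_id  << (smt_width + core_width)) |
+ *               (core_id <<  smt_width) |
+ *               smt_id;
+ */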
+uint32_t x86_cpu_apic_id_from_index(X86MachineState *x86ms,
+ unsigned int cpu_index)
+{
+ X86MachineClass *x86mc = X86_MACHINE_GET_CLASS(x86ms);
+ X86CPUTopoInfo topo_info;
+ uint32_t correct_id;
+ static bool warned;
+
+ init_topo_info(&topo_info, x86ms);
+
+ correct_id = x86_apicid_from_cpu_idx(&topo_info, cpu_index);
+ if (x86mc->compat_apic_id_mode) {
+ if (cpu_index != correct_id && !warned && !qtest_enabled()) {
+ error_report("APIC IDs set in compatibility mode, "
+ "CPU topology won't match the configuration");
+ warned = true;
+ }
+ return cpu_index;
+ } else {
+ return correct_id;
+ }
+}
+
+
+void x86_cpu_new(X86MachineState *x86ms, int64_t apic_id, Error **errp)
+{
+ Object *cpu = object_new(MACHINE(x86ms)->cpu_type);
+
+ if (!object_property_set_uint(cpu, "apic-id", apic_id, errp)) {
+ goto out;
+ }
+ qdev_realize(DEVICE(cpu), NULL, errp);
+
+out:
+ object_unref(cpu);
+}
+
+void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
+{
+ int i;
+ const CPUArchIdList *possible_cpus;
+ MachineState *ms = MACHINE(x86ms);
+ MachineClass *mc = MACHINE_GET_CLASS(x86ms);
+
+ x86_cpu_set_default_version(default_cpu_version);
+
+ /*
+ * Calculate the limit on CPU APIC ID values, so that all
+ * CPU APIC IDs are < x86ms->apic_id_limit.
+ *
+ * This is used for FW_CFG_MAX_CPUS. See comments on fw_cfg_arch_create().
+ */
+ x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms,
+ ms->smp.max_cpus - 1) + 1;
+ possible_cpus = mc->possible_cpu_arch_ids(ms);
+ for (i = 0; i < ms->smp.cpus; i++) {
+ x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
+ }
+}
+
+void x86_rtc_set_cpus_count(ISADevice *rtc, uint16_t cpus_count)
+{
+ if (cpus_count > 0xff) {
+ /*
+ * If the number of CPUs can't be represented in 8 bits, the
+ * BIOS must use "FW_CFG_NB_CPUS". Set RTC field to 0 just
+ * to make old BIOSes fail more predictably.
+ */
+ rtc_set_memory(rtc, 0x5f, 0);
+ } else {
+ rtc_set_memory(rtc, 0x5f, cpus_count - 1);
+ }
+}
+
+static int x86_apic_cmp(const void *a, const void *b)
+{
+ CPUArchId *apic_a = (CPUArchId *)a;
+ CPUArchId *apic_b = (CPUArchId *)b;
+
+ return apic_a->arch_id - apic_b->arch_id;
+}
+
+/*
+ * Returns a pointer to the CPUArchId descriptor that matches the CPU's
+ * apic_id in ms->possible_cpus->cpus. If ms->possible_cpus->cpus has no
+ * entry corresponding to the CPU's apic_id, returns NULL.
+ */
+CPUArchId *x86_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
+{
+ CPUArchId apic_id, *found_cpu;
+
+ apic_id.arch_id = id;
+ found_cpu = bsearch(&apic_id, ms->possible_cpus->cpus,
+ ms->possible_cpus->len, sizeof(*ms->possible_cpus->cpus),
+ x86_apic_cmp);
+ if (found_cpu && idx) {
+ *idx = found_cpu - ms->possible_cpus->cpus;
+ }
+ return found_cpu;
+}
+
+void x86_cpu_plug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ CPUArchId *found_cpu;
+ Error *local_err = NULL;
+ X86CPU *cpu = X86_CPU(dev);
+ X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
+
+ if (x86ms->acpi_dev) {
+ hotplug_handler_plug(x86ms->acpi_dev, dev, &local_err);
+ if (local_err) {
+ goto out;
+ }
+ }
+
+ /* increment the number of CPUs */
+ x86ms->boot_cpus++;
+ if (x86ms->rtc) {
+ x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
+ }
+ if (x86ms->fw_cfg) {
+ fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
+ }
+
+ found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL);
+ found_cpu->cpu = OBJECT(dev);
+out:
+ error_propagate(errp, local_err);
+}
+
+void x86_cpu_unplug_request_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ int idx = -1;
+ X86CPU *cpu = X86_CPU(dev);
+ X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
+
+ if (!x86ms->acpi_dev) {
+ error_setg(errp, "CPU hot unplug not supported without ACPI");
+ return;
+ }
+
+ x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx);
+ assert(idx != -1);
+ if (idx == 0) {
+ error_setg(errp, "Boot CPU is unpluggable");
+ return;
+ }
+
+ hotplug_handler_unplug_request(x86ms->acpi_dev, dev,
+ errp);
+}
+
+void x86_cpu_unplug_cb(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ CPUArchId *found_cpu;
+ Error *local_err = NULL;
+ X86CPU *cpu = X86_CPU(dev);
+ X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
+
+ hotplug_handler_unplug(x86ms->acpi_dev, dev, &local_err);
+ if (local_err) {
+ goto out;
+ }
+
+ found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL);
+ found_cpu->cpu = NULL;
+ qdev_unrealize(dev);
+
+ /* decrement the number of CPUs */
+ x86ms->boot_cpus--;
+ /* Update the number of CPUs in CMOS */
+ x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
+ fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
+ out:
+ error_propagate(errp, local_err);
+}
+
+void x86_cpu_pre_plug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ int idx;
+ CPUState *cs;
+ CPUArchId *cpu_slot;
+ X86CPUTopoIDs topo_ids;
+ X86CPU *cpu = X86_CPU(dev);
+ CPUX86State *env = &cpu->env;
+ MachineState *ms = MACHINE(hotplug_dev);
+ X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
+ unsigned int smp_cores = ms->smp.cores;
+ unsigned int smp_threads = ms->smp.threads;
+ X86CPUTopoInfo topo_info;
+
+ if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) {
+ error_setg(errp, "Invalid CPU type, expected cpu type: '%s'",
+ ms->cpu_type);
+ return;
+ }
+
+ if (x86ms->acpi_dev) {
+ Error *local_err = NULL;
+
+ hotplug_handler_pre_plug(HOTPLUG_HANDLER(x86ms->acpi_dev), dev,
+ &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ }
+
+ init_topo_info(&topo_info, x86ms);
+
+ env->nr_dies = ms->smp.dies;
+
+ /*
+ * If APIC ID is not set,
+ * set it based on socket/die/core/thread properties.
+ */
+ if (cpu->apic_id == UNASSIGNED_APIC_ID) {
+ int max_socket = (ms->smp.max_cpus - 1) /
+ smp_threads / smp_cores / ms->smp.dies;
+
+ /*
+ * die-id was optional in QEMU 4.0 and older, so keep it optional
+ * if there's only one die per socket.
+ */
+ if (cpu->die_id < 0 && ms->smp.dies == 1) {
+ cpu->die_id = 0;
+ }
+
+ if (cpu->socket_id < 0) {
+ error_setg(errp, "CPU socket-id is not set");
+ return;
+ } else if (cpu->socket_id > max_socket) {
+ error_setg(errp, "Invalid CPU socket-id: %u must be in range 0:%u",
+ cpu->socket_id, max_socket);
+ return;
+ }
+ if (cpu->die_id < 0) {
+ error_setg(errp, "CPU die-id is not set");
+ return;
+ } else if (cpu->die_id > ms->smp.dies - 1) {
+ error_setg(errp, "Invalid CPU die-id: %u must be in range 0:%u",
+ cpu->die_id, ms->smp.dies - 1);
+ return;
+ }
+ if (cpu->core_id < 0) {
+ error_setg(errp, "CPU core-id is not set");
+ return;
+ } else if (cpu->core_id > (smp_cores - 1)) {
+ error_setg(errp, "Invalid CPU core-id: %u must be in range 0:%u",
+ cpu->core_id, smp_cores - 1);
+ return;
+ }
+ if (cpu->thread_id < 0) {
+ error_setg(errp, "CPU thread-id is not set");
+ return;
+ } else if (cpu->thread_id > (smp_threads - 1)) {
+ error_setg(errp, "Invalid CPU thread-id: %u must be in range 0:%u",
+ cpu->thread_id, smp_threads - 1);
+ return;
+ }
+
+ topo_ids.pkg_id = cpu->socket_id;
+ topo_ids.die_id = cpu->die_id;
+ topo_ids.core_id = cpu->core_id;
+ topo_ids.smt_id = cpu->thread_id;
+ cpu->apic_id = x86_apicid_from_topo_ids(&topo_info, &topo_ids);
+ }
+
+ cpu_slot = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx);
+ if (!cpu_slot) {
+ MachineState *ms = MACHINE(x86ms);
+
+ x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids);
+ error_setg(errp,
+ "Invalid CPU [socket: %u, die: %u, core: %u, thread: %u] with"
+ " APIC ID %" PRIu32 ", valid index range 0:%d",
+ topo_ids.pkg_id, topo_ids.die_id, topo_ids.core_id, topo_ids.smt_id,
+ cpu->apic_id, ms->possible_cpus->len - 1);
+ return;
+ }
+
+ if (cpu_slot->cpu) {
+ error_setg(errp, "CPU[%d] with APIC ID %" PRIu32 " exists",
+ idx, cpu->apic_id);
+ return;
+ }
+
+ /*
+ * If the 'address' properties socket-id/core-id/thread-id are not set,
+ * set them so that machine_query_hotpluggable_cpus shows correct values.
+ */
+ /*
+ * TODO: move the socket_id/core_id/thread_id checks into
+ * x86_cpu_realizefn() once the -smp refactoring is complete and
+ * CPU-private CPUState::nr_cores and CPUState::nr_threads fields
+ * replace the current globals.
+ */
+ x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids);
+ if (cpu->socket_id != -1 && cpu->socket_id != topo_ids.pkg_id) {
+ error_setg(errp, "property socket-id: %u doesn't match set apic-id:"
+ " 0x%x (socket-id: %u)", cpu->socket_id, cpu->apic_id,
+ topo_ids.pkg_id);
+ return;
+ }
+ cpu->socket_id = topo_ids.pkg_id;
+
+ if (cpu->die_id != -1 && cpu->die_id != topo_ids.die_id) {
+ error_setg(errp, "property die-id: %u doesn't match set apic-id:"
+ " 0x%x (die-id: %u)", cpu->die_id, cpu->apic_id, topo_ids.die_id);
+ return;
+ }
+ cpu->die_id = topo_ids.die_id;
+
+ if (cpu->core_id != -1 && cpu->core_id != topo_ids.core_id) {
+ error_setg(errp, "property core-id: %u doesn't match set apic-id:"
+ " 0x%x (core-id: %u)", cpu->core_id, cpu->apic_id,
+ topo_ids.core_id);
+ return;
+ }
+ cpu->core_id = topo_ids.core_id;
+
+ if (cpu->thread_id != -1 && cpu->thread_id != topo_ids.smt_id) {
+ error_setg(errp, "property thread-id: %u doesn't match set apic-id:"
+ " 0x%x (thread-id: %u)", cpu->thread_id, cpu->apic_id,
+ topo_ids.smt_id);
+ return;
+ }
+ cpu->thread_id = topo_ids.smt_id;
+
+ if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) &&
+ !kvm_hv_vpindex_settable()) {
+ error_setg(errp, "kernel doesn't allow setting HyperV VP_INDEX");
+ return;
+ }
+
+ cs = CPU(cpu);
+ cs->cpu_index = idx;
+
+ numa_cpu_pre_plug(cpu_slot, dev, errp);
+}
+
+CpuInstanceProperties
+x86_cpu_index_to_props(MachineState *ms, unsigned cpu_index)
+{
+ MachineClass *mc = MACHINE_GET_CLASS(ms);
+ const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms);
+
+ assert(cpu_index < possible_cpus->len);
+ return possible_cpus->cpus[cpu_index].props;
+}
+
+int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx)
+{
+ X86CPUTopoIDs topo_ids;
+ X86MachineState *x86ms = X86_MACHINE(ms);
+ X86CPUTopoInfo topo_info;
+
+ init_topo_info(&topo_info, x86ms);
+
+ assert(idx < ms->possible_cpus->len);
+ x86_topo_ids_from_apicid(ms->possible_cpus->cpus[idx].arch_id,
+ &topo_info, &topo_ids);
+ return topo_ids.pkg_id % ms->numa_state->num_nodes;
+}
+
+const CPUArchIdList *x86_possible_cpu_arch_ids(MachineState *ms)
+{
+ X86MachineState *x86ms = X86_MACHINE(ms);
+ unsigned int max_cpus = ms->smp.max_cpus;
+ X86CPUTopoInfo topo_info;
+ int i;
+
+ if (ms->possible_cpus) {
+ /*
+ * make sure that max_cpus hasn't changed since the first use, i.e.
+ * -smp hasn't been parsed after it
+ */
+ assert(ms->possible_cpus->len == max_cpus);
+ return ms->possible_cpus;
+ }
+
+ ms->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
+ sizeof(CPUArchId) * max_cpus);
+ ms->possible_cpus->len = max_cpus;
+
+ init_topo_info(&topo_info, x86ms);
+
+ for (i = 0; i < ms->possible_cpus->len; i++) {
+ X86CPUTopoIDs topo_ids;
+
+ ms->possible_cpus->cpus[i].type = ms->cpu_type;
+ ms->possible_cpus->cpus[i].vcpus_count = 1;
+ ms->possible_cpus->cpus[i].arch_id =
+ x86_cpu_apic_id_from_index(x86ms, i);
+ x86_topo_ids_from_apicid(ms->possible_cpus->cpus[i].arch_id,
+ &topo_info, &topo_ids);
+ ms->possible_cpus->cpus[i].props.has_socket_id = true;
+ ms->possible_cpus->cpus[i].props.socket_id = topo_ids.pkg_id;
+ if (ms->smp.dies > 1) {
+ ms->possible_cpus->cpus[i].props.has_die_id = true;
+ ms->possible_cpus->cpus[i].props.die_id = topo_ids.die_id;
+ }
+ ms->possible_cpus->cpus[i].props.has_core_id = true;
+ ms->possible_cpus->cpus[i].props.core_id = topo_ids.core_id;
+ ms->possible_cpus->cpus[i].props.has_thread_id = true;
+ ms->possible_cpus->cpus[i].props.thread_id = topo_ids.smt_id;
+ }
+ return ms->possible_cpus;
+}
+
+static void x86_nmi(NMIState *n, int cpu_index, Error **errp)
+{
+ /* cpu index isn't used */
+ CPUState *cs;
+
+ CPU_FOREACH(cs) {
+ X86CPU *cpu = X86_CPU(cs);
+
+ if (!cpu->apic_state) {
+ cpu_interrupt(cs, CPU_INTERRUPT_NMI);
+ } else {
+ apic_deliver_nmi(cpu->apic_state);
+ }
+ }
+}
+
+static long get_file_size(FILE *f)
+{
+ long where, size;
+
+ /* XXX: on Unix systems, using fstat() probably makes more sense */
+
+ where = ftell(f);
+ fseek(f, 0, SEEK_END);
+ size = ftell(f);
+ fseek(f, where, SEEK_SET);
+
+ return size;
+}
+
+/* TSC handling */
+uint64_t cpu_get_tsc(CPUX86State *env)
+{
+ return cpus_get_elapsed_ticks();
+}
+
+/* IRQ handling */
+static void pic_irq_request(void *opaque, int irq, int level)
+{
+ CPUState *cs = first_cpu;
+ X86CPU *cpu = X86_CPU(cs);
+
+ trace_x86_pic_interrupt(irq, level);
+ if (cpu->apic_state && !kvm_irqchip_in_kernel() &&
+ !whpx_apic_in_platform()) {
+ CPU_FOREACH(cs) {
+ cpu = X86_CPU(cs);
+ if (apic_accept_pic_intr(cpu->apic_state)) {
+ apic_deliver_pic_intr(cpu->apic_state, level);
+ }
+ }
+ } else {
+ if (level) {
+ cpu_interrupt(cs, CPU_INTERRUPT_HARD);
+ } else {
+ cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
+ }
+ }
+}
+
+qemu_irq x86_allocate_cpu_irq(void)
+{
+ return qemu_allocate_irq(pic_irq_request, NULL, 0);
+}
+
+int cpu_get_pic_interrupt(CPUX86State *env)
+{
+ X86CPU *cpu = env_archcpu(env);
+ int intno;
+
+ if (!kvm_irqchip_in_kernel() && !whpx_apic_in_platform()) {
+ intno = apic_get_interrupt(cpu->apic_state);
+ if (intno >= 0) {
+ return intno;
+ }
+ /* read the irq from the PIC */
+ if (!apic_accept_pic_intr(cpu->apic_state)) {
+ return -1;
+ }
+ }
+
+ intno = pic_read_irq(isa_pic);
+ return intno;
+}
+
+DeviceState *cpu_get_current_apic(void)
+{
+ if (current_cpu) {
+ X86CPU *cpu = X86_CPU(current_cpu);
+ return cpu->apic_state;
+ } else {
+ return NULL;
+ }
+}
+
+void gsi_handler(void *opaque, int n, int level)
+{
+ GSIState *s = opaque;
+
+ trace_x86_gsi_interrupt(n, level);
+ switch (n) {
+ case 0 ... ISA_NUM_IRQS - 1:
+ if (s->i8259_irq[n]) {
+ /* Under KVM, Kernel will forward to both PIC and IOAPIC */
+ qemu_set_irq(s->i8259_irq[n], level);
+ }
+ /* fall through */
+ case ISA_NUM_IRQS ... IOAPIC_NUM_PINS - 1:
+ qemu_set_irq(s->ioapic_irq[n], level);
+ break;
+ case IO_APIC_SECONDARY_IRQBASE
+ ... IO_APIC_SECONDARY_IRQBASE + IOAPIC_NUM_PINS - 1:
+ qemu_set_irq(s->ioapic2_irq[n - IO_APIC_SECONDARY_IRQBASE], level);
+ break;
+ }
+}
+
+void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name)
+{
+ DeviceState *dev;
+ SysBusDevice *d;
+ unsigned int i;
+
+ assert(parent_name);
+ if (kvm_ioapic_in_kernel()) {
+ dev = qdev_new(TYPE_KVM_IOAPIC);
+ } else {
+ dev = qdev_new(TYPE_IOAPIC);
+ }
+ object_property_add_child(object_resolve_path(parent_name, NULL),
+ "ioapic", OBJECT(dev));
+ d = SYS_BUS_DEVICE(dev);
+ sysbus_realize_and_unref(d, &error_fatal);
+ sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS);
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ gsi_state->ioapic_irq[i] = qdev_get_gpio_in(dev, i);
+ }
+}
+
+DeviceState *ioapic_init_secondary(GSIState *gsi_state)
+{
+ DeviceState *dev;
+ SysBusDevice *d;
+ unsigned int i;
+
+ dev = qdev_new(TYPE_IOAPIC);
+ d = SYS_BUS_DEVICE(dev);
+ sysbus_realize_and_unref(d, &error_fatal);
+ sysbus_mmio_map(d, 0, IO_APIC_SECONDARY_ADDRESS);
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ gsi_state->ioapic2_irq[i] = qdev_get_gpio_in(dev, i);
+ }
+ return dev;
+}
+
+struct setup_data {
+ uint64_t next;
+ uint32_t type;
+ uint32_t len;
+ uint8_t data[];
+} __attribute__((packed));
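+
+/*
+ * A minimal sketch of how this chain is used below for SETUP_DTB: the dtb
+ * blob is appended to the in-memory kernel image behind a setup_data header,
+ * and the boot protocol's setup_data pointer (header offset 0x250) is made
+ * to point at it:
+ *
+ *     [ kernel image | pad to 16 | setup_data { next = 0, type = SETUP_DTB,
+ *                                               len = dtb_size } | dtb blob ]
+ */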
+
+
+/*
+ * The entry point into the kernel for PVH boot is different from
+ * the native entry point. The PVH entry is defined by the x86/HVM
+ * direct boot ABI and is available in an ELFNOTE in the kernel binary.
+ *
+ * This function is passed to load_elf() when it is called from
+ * load_elfboot() which then additionally checks for an ELF Note of
+ * type XEN_ELFNOTE_PHYS32_ENTRY and passes it to this function to
+ * parse the PVH entry address from the ELF Note.
+ *
+ * Due to trickery in elf_opts.h, load_elf() is actually available as
+ * load_elf32() or load_elf64() and this routine needs to be able
+ * to deal with being called as 32 or 64 bit.
+ *
+ * The address of the PVH entry point is saved to the 'pvh_start_addr'
+ * global variable. (Although the entry point is 32-bit, the kernel
+ * binary can be either 32-bit or 64-bit.)
+ */
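+/*
+ * A rough sketch of the ELF note layout being parsed here; the descriptor
+ * that follows the (padded) name is read back as the PVH entry address:
+ *
+ *     | n_namesz | n_descsz | n_type | name (e.g. "Xen", padded) | desc (entry) |
+ */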
+static uint64_t read_pvh_start_addr(void *arg1, void *arg2, bool is64)
+{
+ size_t *elf_note_data_addr;
+
+ /* Check if ELF Note header passed in is valid */
+ if (arg1 == NULL) {
+ return 0;
+ }
+
+ if (is64) {
+ struct elf64_note *nhdr64 = (struct elf64_note *)arg1;
+ uint64_t nhdr_size64 = sizeof(struct elf64_note);
+ uint64_t phdr_align = *(uint64_t *)arg2;
+ uint64_t nhdr_namesz = nhdr64->n_namesz;
+
+ elf_note_data_addr =
+ ((void *)nhdr64) + nhdr_size64 +
+ QEMU_ALIGN_UP(nhdr_namesz, phdr_align);
+
+ pvh_start_addr = *elf_note_data_addr;
+ } else {
+ struct elf32_note *nhdr32 = (struct elf32_note *)arg1;
+ uint32_t nhdr_size32 = sizeof(struct elf32_note);
+ uint32_t phdr_align = *(uint32_t *)arg2;
+ uint32_t nhdr_namesz = nhdr32->n_namesz;
+
+ elf_note_data_addr =
+ ((void *)nhdr32) + nhdr_size32 +
+ QEMU_ALIGN_UP(nhdr_namesz, phdr_align);
+
+ pvh_start_addr = *(uint32_t *)elf_note_data_addr;
+ }
+
+ return pvh_start_addr;
+}
+
+static bool load_elfboot(const char *kernel_filename,
+ int kernel_file_size,
+ uint8_t *header,
+ size_t pvh_xen_start_addr,
+ FWCfgState *fw_cfg)
+{
+ uint32_t flags = 0;
+ uint32_t mh_load_addr = 0;
+ uint32_t elf_kernel_size = 0;
+ uint64_t elf_entry;
+ uint64_t elf_low, elf_high;
+ int kernel_size;
+
+ if (ldl_p(header) != 0x464c457f) {
+ return false; /* no elfboot */
+ }
+
+ bool elf_is64 = header[EI_CLASS] == ELFCLASS64;
+ flags = elf_is64 ?
+ ((Elf64_Ehdr *)header)->e_flags : ((Elf32_Ehdr *)header)->e_flags;
+
+ if (flags & 0x00010004) { /* LOAD_ELF_HEADER_HAS_ADDR */
+ error_report("elfboot unsupported flags = %x", flags);
+ exit(1);
+ }
+
+ uint64_t elf_note_type = XEN_ELFNOTE_PHYS32_ENTRY;
+ kernel_size = load_elf(kernel_filename, read_pvh_start_addr,
+ NULL, &elf_note_type, &elf_entry,
+ &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE,
+ 0, 0);
+
+ if (kernel_size < 0) {
+ error_report("Error while loading elf kernel");
+ exit(1);
+ }
+ mh_load_addr = elf_low;
+ elf_kernel_size = elf_high - elf_low;
+
+ if (pvh_start_addr == 0) {
+ error_report("Error loading uncompressed kernel without PVH ELF Note");
+ exit(1);
+ }
+ fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, pvh_start_addr);
+ fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr);
+ fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, elf_kernel_size);
+
+ return true;
+}
+
+void x86_load_linux(X86MachineState *x86ms,
+ FWCfgState *fw_cfg,
+ int acpi_data_size,
+ bool pvh_enabled)
+{
+ bool linuxboot_dma_enabled = X86_MACHINE_GET_CLASS(x86ms)->fwcfg_dma_enabled;
+ uint16_t protocol;
+ int setup_size, kernel_size, cmdline_size;
+ int dtb_size, setup_data_offset;
+ uint32_t initrd_max;
+ uint8_t header[8192], *setup, *kernel;
+ hwaddr real_addr, prot_addr, cmdline_addr, initrd_addr = 0;
+ FILE *f;
+ char *vmode;
+ MachineState *machine = MACHINE(x86ms);
+ struct setup_data *setup_data;
+ const char *kernel_filename = machine->kernel_filename;
+ const char *initrd_filename = machine->initrd_filename;
+ const char *dtb_filename = machine->dtb;
+ const char *kernel_cmdline = machine->kernel_cmdline;
+ SevKernelLoaderContext sev_load_ctx = {};
+
+ /* Align to 16 bytes as a paranoia measure */
+ cmdline_size = (strlen(kernel_cmdline) + 16) & ~15;
+
+ /* load the kernel header */
+ f = fopen(kernel_filename, "rb");
+ if (!f) {
+ fprintf(stderr, "qemu: could not open kernel file '%s': %s\n",
+ kernel_filename, strerror(errno));
+ exit(1);
+ }
+
+ kernel_size = get_file_size(f);
+ if (!kernel_size ||
+ fread(header, 1, MIN(ARRAY_SIZE(header), kernel_size), f) !=
+ MIN(ARRAY_SIZE(header), kernel_size)) {
+ fprintf(stderr, "qemu: could not load kernel '%s': %s\n",
+ kernel_filename, strerror(errno));
+ exit(1);
+ }
+
+ /* kernel protocol version */
+ if (ldl_p(header + 0x202) == 0x53726448) {
+ protocol = lduw_p(header + 0x206);
+ } else {
+ /*
+ * This could be a multiboot kernel. If it is, let's stop treating it
+ * like a Linux kernel.
+ * Note: some multiboot images could be in ELF format (the same as
+ * PVH), so we try multiboot first, since we check the multiboot magic
+ * header before loading it.
+ */
+ if (load_multiboot(x86ms, fw_cfg, f, kernel_filename, initrd_filename,
+ kernel_cmdline, kernel_size, header)) {
+ return;
+ }
+ /*
+ * Check if the file is an uncompressed kernel file (ELF) and load it,
+ * saving the PVH entry point used by the x86/HVM direct boot ABI.
+ * If load_elfboot() is successful, populate the fw_cfg info.
+ */
+ if (pvh_enabled &&
+ load_elfboot(kernel_filename, kernel_size,
+ header, pvh_start_addr, fw_cfg)) {
+ fclose(f);
+
+ fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE,
+ strlen(kernel_cmdline) + 1);
+ fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline);
+
+ fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, sizeof(header));
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA,
+ header, sizeof(header));
+
+ /* load initrd */
+ if (initrd_filename) {
+ GMappedFile *mapped_file;
+ gsize initrd_size;
+ gchar *initrd_data;
+ GError *gerr = NULL;
+
+ mapped_file = g_mapped_file_new(initrd_filename, false, &gerr);
+ if (!mapped_file) {
+ fprintf(stderr, "qemu: error reading initrd %s: %s\n",
+ initrd_filename, gerr->message);
+ exit(1);
+ }
+ x86ms->initrd_mapped_file = mapped_file;
+
+ initrd_data = g_mapped_file_get_contents(mapped_file);
+ initrd_size = g_mapped_file_get_length(mapped_file);
+ initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;
+ if (initrd_size >= initrd_max) {
+ fprintf(stderr, "qemu: initrd is too large, cannot support."
+ "(max: %"PRIu32", need %"PRId64")\n",
+ initrd_max, (uint64_t)initrd_size);
+ exit(1);
+ }
+
+ initrd_addr = (initrd_max - initrd_size) & ~4095;
+
+ fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr);
+ fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size);
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data,
+ initrd_size);
+ }
+
+ option_rom[nb_option_roms].bootindex = 0;
+ option_rom[nb_option_roms].name = "pvh.bin";
+ nb_option_roms++;
+
+ return;
+ }
+ protocol = 0;
+ }
+
+ if (protocol < 0x200 || !(header[0x211] & 0x01)) {
+ /* Low kernel */
+ real_addr = 0x90000;
+ cmdline_addr = 0x9a000 - cmdline_size;
+ prot_addr = 0x10000;
+ } else if (protocol < 0x202) {
+ /* High but ancient kernel */
+ real_addr = 0x90000;
+ cmdline_addr = 0x9a000 - cmdline_size;
+ prot_addr = 0x100000;
+ } else {
+ /* High and recent kernel */
+ real_addr = 0x10000;
+ cmdline_addr = 0x20000;
+ prot_addr = 0x100000;
+ }
+
+ /* highest address for loading the initrd */
+ if (protocol >= 0x20c &&
+ lduw_p(header + 0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) {
+ /*
+ * Linux has supported an initrd of up to 4 GB for a very long time
+ * (since 2007, long before XLF_CAN_BE_LOADED_ABOVE_4G, which was added
+ * in 2013), though it only sets initrd_max to 2 GB to "work around
+ * bootloader bugs". Luckily, QEMU's firmware (which plays the role of
+ * the bootloader here) has supported this all along.
+ *
+ * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, the initrd
+ * can be loaded at any address.
+ *
+ * In addition, initrd_max is uint32_t simply because QEMU doesn't
+ * support the 64-bit boot protocol (specifically the ext_ramdisk_image
+ * field).
+ *
+ * Therefore, simply limit initrd_max to UINT32_MAX here as well.
+ */
+ initrd_max = UINT32_MAX;
+ } else if (protocol >= 0x203) {
+ initrd_max = ldl_p(header + 0x22c);
+ } else {
+ initrd_max = 0x37ffffff;
+ }
+
+ if (initrd_max >= x86ms->below_4g_mem_size - acpi_data_size) {
+ initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;
+ }
+
+ fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr);
+ fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(kernel_cmdline) + 1);
+ fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline);
+ sev_load_ctx.cmdline_data = (char *)kernel_cmdline;
+ sev_load_ctx.cmdline_size = strlen(kernel_cmdline) + 1;
+
+ if (protocol >= 0x202) {
+ stl_p(header + 0x228, cmdline_addr);
+ } else {
+ stw_p(header + 0x20, 0xA33F);
+ stw_p(header + 0x22, cmdline_addr - real_addr);
+ }
+
+ /* handle vga= parameter */
+ vmode = strstr(kernel_cmdline, "vga=");
+ if (vmode) {
+ unsigned int video_mode;
+ const char *end;
+ int ret;
+ /* skip "vga=" */
+ vmode += 4;
+ if (!strncmp(vmode, "normal", 6)) {
+ video_mode = 0xffff;
+ } else if (!strncmp(vmode, "ext", 3)) {
+ video_mode = 0xfffe;
+ } else if (!strncmp(vmode, "ask", 3)) {
+ video_mode = 0xfffd;
+ } else {
+ ret = qemu_strtoui(vmode, &end, 0, &video_mode);
+ if (ret != 0 || (*end && *end != ' ')) {
+ fprintf(stderr, "qemu: invalid 'vga=' kernel parameter.\n");
+ exit(1);
+ }
+ }
+ stw_p(header + 0x1fa, video_mode);
+ }
+
+ /* loader type */
+ /*
+ * High nybble = B reserved for QEMU; low nybble is revision number.
+ * If this code is substantially changed, you may want to consider
+ * incrementing the revision.
+ */
+ if (protocol >= 0x200) {
+ header[0x210] = 0xB0;
+ }
+ /* heap */
+ if (protocol >= 0x201) {
+ header[0x211] |= 0x80; /* CAN_USE_HEAP */
+ stw_p(header + 0x224, cmdline_addr - real_addr - 0x200);
+ }
+
+ /* load initrd */
+ if (initrd_filename) {
+ GMappedFile *mapped_file;
+ gsize initrd_size;
+ gchar *initrd_data;
+ GError *gerr = NULL;
+
+ if (protocol < 0x200) {
+ fprintf(stderr, "qemu: linux kernel too old to load a ram disk\n");
+ exit(1);
+ }
+
+ mapped_file = g_mapped_file_new(initrd_filename, false, &gerr);
+ if (!mapped_file) {
+ fprintf(stderr, "qemu: error reading initrd %s: %s\n",
+ initrd_filename, gerr->message);
+ exit(1);
+ }
+ x86ms->initrd_mapped_file = mapped_file;
+
+ initrd_data = g_mapped_file_get_contents(mapped_file);
+ initrd_size = g_mapped_file_get_length(mapped_file);
+ if (initrd_size >= initrd_max) {
+ fprintf(stderr, "qemu: initrd is too large, cannot support."
+ "(max: %"PRIu32", need %"PRId64")\n",
+ initrd_max, (uint64_t)initrd_size);
+ exit(1);
+ }
+
+ initrd_addr = (initrd_max - initrd_size) & ~4095;
+
+ fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr);
+ fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size);
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, initrd_size);
+ sev_load_ctx.initrd_data = initrd_data;
+ sev_load_ctx.initrd_size = initrd_size;
+
+ stl_p(header + 0x218, initrd_addr);
+ stl_p(header + 0x21c, initrd_size);
+ }
+
+ /* load kernel and setup */
+ setup_size = header[0x1f1];
+ if (setup_size == 0) {
+ setup_size = 4;
+ }
+ setup_size = (setup_size + 1) * 512;
+ if (setup_size > kernel_size) {
+ fprintf(stderr, "qemu: invalid kernel header\n");
+ exit(1);
+ }
+ kernel_size -= setup_size;
+
+ setup = g_malloc(setup_size);
+ kernel = g_malloc(kernel_size);
+ fseek(f, 0, SEEK_SET);
+ if (fread(setup, 1, setup_size, f) != setup_size) {
+ fprintf(stderr, "fread() failed\n");
+ exit(1);
+ }
+ if (fread(kernel, 1, kernel_size, f) != kernel_size) {
+ fprintf(stderr, "fread() failed\n");
+ exit(1);
+ }
+ fclose(f);
+
+ /* append dtb to kernel */
+ if (dtb_filename) {
+ if (protocol < 0x209) {
+ fprintf(stderr, "qemu: Linux kernel too old to load a dtb\n");
+ exit(1);
+ }
+
+ dtb_size = get_image_size(dtb_filename);
+ if (dtb_size <= 0) {
+ fprintf(stderr, "qemu: error reading dtb %s: %s\n",
+ dtb_filename, strerror(errno));
+ exit(1);
+ }
+
+ setup_data_offset = QEMU_ALIGN_UP(kernel_size, 16);
+ kernel_size = setup_data_offset + sizeof(struct setup_data) + dtb_size;
+ kernel = g_realloc(kernel, kernel_size);
+
+ stq_p(header + 0x250, prot_addr + setup_data_offset);
+
+ setup_data = (struct setup_data *)(kernel + setup_data_offset);
+ setup_data->next = 0;
+ setup_data->type = cpu_to_le32(SETUP_DTB);
+ setup_data->len = cpu_to_le32(dtb_size);
+
+ load_image_size(dtb_filename, setup_data->data, dtb_size);
+ }
+
+ /*
+ * If we're starting an encrypted VM, it will be OVMF based, which uses the
+ * efi stub for booting and doesn't require any values to be placed in the
+ * kernel header. We therefore don't update the header so the hash of the
+ * kernel on the other side of the fw_cfg interface matches the hash of the
+ * file the user passed in.
+ */
+ if (!sev_enabled()) {
+ memcpy(setup, header, MIN(sizeof(header), setup_size));
+ }
+
+ fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, prot_addr);
+ fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, kernel_size);
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, kernel, kernel_size);
+ sev_load_ctx.kernel_data = (char *)kernel;
+ sev_load_ctx.kernel_size = kernel_size;
+
+ fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_ADDR, real_addr);
+ fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, setup_size);
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, setup, setup_size);
+ sev_load_ctx.setup_data = (char *)setup;
+ sev_load_ctx.setup_size = setup_size;
+
+ if (sev_enabled()) {
+ sev_add_kernel_loader_hashes(&sev_load_ctx, &error_fatal);
+ }
+
+ option_rom[nb_option_roms].bootindex = 0;
+ option_rom[nb_option_roms].name = "linuxboot.bin";
+ if (linuxboot_dma_enabled && fw_cfg_dma_enabled(fw_cfg)) {
+ option_rom[nb_option_roms].name = "linuxboot_dma.bin";
+ }
+ nb_option_roms++;
+}
+
+void x86_bios_rom_init(MachineState *ms, const char *default_firmware,
+ MemoryRegion *rom_memory, bool isapc_ram_fw)
+{
+ const char *bios_name;
+ char *filename;
+ MemoryRegion *bios, *isa_bios;
+ int bios_size, isa_bios_size;
+ int ret;
+
+ /* BIOS load */
+ bios_name = ms->firmware ?: default_firmware;
+ filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
+ if (filename) {
+ bios_size = get_image_size(filename);
+ } else {
+ bios_size = -1;
+ }
+ if (bios_size <= 0 ||
+ (bios_size % 65536) != 0) {
+ goto bios_error;
+ }
+ bios = g_malloc(sizeof(*bios));
+ memory_region_init_ram(bios, NULL, "pc.bios", bios_size, &error_fatal);
+ if (!isapc_ram_fw) {
+ memory_region_set_readonly(bios, true);
+ }
+ ret = rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1);
+ if (ret != 0) {
+ bios_error:
+ fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name);
+ exit(1);
+ }
+ g_free(filename);
+
+ /* map the last 128KB of the BIOS in ISA space */
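+ /*
+ * For example, with a 256 KiB pc.bios image the full ROM ends up at
+ * 0xFFFC0000..0xFFFFFFFF (just below 4 GiB), while this alias exposes
+ * its last 128 KiB at 0xE0000..0xFFFFF so that legacy entry points
+ * remain reachable in real mode.
+ */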
+ isa_bios_size = MIN(bios_size, 128 * KiB);
+ isa_bios = g_malloc(sizeof(*isa_bios));
+ memory_region_init_alias(isa_bios, NULL, "isa-bios", bios,
+ bios_size - isa_bios_size, isa_bios_size);
+ memory_region_add_subregion_overlap(rom_memory,
+ 0x100000 - isa_bios_size,
+ isa_bios,
+ 1);
+ if (!isapc_ram_fw) {
+ memory_region_set_readonly(isa_bios, true);
+ }
+
+ /* map all the bios at the top of memory */
+ memory_region_add_subregion(rom_memory,
+ (uint32_t)(-bios_size),
+ bios);
+}
+
+bool x86_machine_is_smm_enabled(const X86MachineState *x86ms)
+{
+ bool smm_available = false;
+
+ if (x86ms->smm == ON_OFF_AUTO_OFF) {
+ return false;
+ }
+
+ if (tcg_enabled() || qtest_enabled()) {
+ smm_available = true;
+ } else if (kvm_enabled()) {
+ smm_available = kvm_has_smm();
+ }
+
+ if (smm_available) {
+ return true;
+ }
+
+ if (x86ms->smm == ON_OFF_AUTO_ON) {
+ error_report("System Management Mode not supported by this hypervisor.");
+ exit(1);
+ }
+ return false;
+}
+
+static void x86_machine_get_smm(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+ OnOffAuto smm = x86ms->smm;
+
+ visit_type_OnOffAuto(v, name, &smm, errp);
+}
+
+static void x86_machine_set_smm(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+
+ visit_type_OnOffAuto(v, name, &x86ms->smm, errp);
+}
+
+bool x86_machine_is_acpi_enabled(const X86MachineState *x86ms)
+{
+ if (x86ms->acpi == ON_OFF_AUTO_OFF) {
+ return false;
+ }
+ return true;
+}
+
+static void x86_machine_get_acpi(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+ OnOffAuto acpi = x86ms->acpi;
+
+ visit_type_OnOffAuto(v, name, &acpi, errp);
+}
+
+static void x86_machine_set_acpi(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+
+ visit_type_OnOffAuto(v, name, &x86ms->acpi, errp);
+}
+
+static char *x86_machine_get_oem_id(Object *obj, Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+
+ return g_strdup(x86ms->oem_id);
+}
+
+static void x86_machine_set_oem_id(Object *obj, const char *value, Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+ size_t len = strlen(value);
+
+ if (len > 6) {
+ error_setg(errp,
+ "User specified "X86_MACHINE_OEM_ID" value is bigger than "
+ "6 bytes in size");
+ return;
+ }
+
+ strncpy(x86ms->oem_id, value, 6);
+}
+
+static char *x86_machine_get_oem_table_id(Object *obj, Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+
+ return g_strdup(x86ms->oem_table_id);
+}
+
+static void x86_machine_set_oem_table_id(Object *obj, const char *value,
+ Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+ size_t len = strlen(value);
+
+ if (len > 8) {
+ error_setg(errp,
+ "User specified "X86_MACHINE_OEM_TABLE_ID
+ " value is bigger than "
+ "8 bytes in size");
+ return;
+ }
+ strncpy(x86ms->oem_table_id, value, 8);
+}
+
+static void x86_machine_get_bus_lock_ratelimit(Object *obj, Visitor *v,
+ const char *name, void *opaque, Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+ uint64_t bus_lock_ratelimit = x86ms->bus_lock_ratelimit;
+
+ visit_type_uint64(v, name, &bus_lock_ratelimit, errp);
+}
+
+static void x86_machine_set_bus_lock_ratelimit(Object *obj, Visitor *v,
+ const char *name, void *opaque, Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+
+ visit_type_uint64(v, name, &x86ms->bus_lock_ratelimit, errp);
+}
+
+static void machine_get_sgx_epc(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+ SgxEPCList *list = x86ms->sgx_epc_list;
+
+ visit_type_SgxEPCList(v, name, &list, errp);
+}
+
+static void machine_set_sgx_epc(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+ SgxEPCList *list;
+
+ list = x86ms->sgx_epc_list;
+ visit_type_SgxEPCList(v, name, &x86ms->sgx_epc_list, errp);
+
+ qapi_free_SgxEPCList(list);
+}
+
+static void x86_machine_initfn(Object *obj)
+{
+ X86MachineState *x86ms = X86_MACHINE(obj);
+
+ x86ms->smm = ON_OFF_AUTO_AUTO;
+ x86ms->acpi = ON_OFF_AUTO_AUTO;
+ x86ms->pci_irq_mask = ACPI_BUILD_PCI_IRQS;
+ x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6);
+ x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8);
+ x86ms->bus_lock_ratelimit = 0;
+}
+
+static void x86_machine_class_init(ObjectClass *oc, void *data)
+{
+ MachineClass *mc = MACHINE_CLASS(oc);
+ X86MachineClass *x86mc = X86_MACHINE_CLASS(oc);
+ NMIClass *nc = NMI_CLASS(oc);
+
+ mc->cpu_index_to_instance_props = x86_cpu_index_to_props;
+ mc->get_default_cpu_node_id = x86_get_default_cpu_node_id;
+ mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids;
+ x86mc->compat_apic_id_mode = false;
+ x86mc->save_tsc_khz = true;
+ x86mc->fwcfg_dma_enabled = true;
+ nc->nmi_monitor_handler = x86_nmi;
+
+ object_class_property_add(oc, X86_MACHINE_SMM, "OnOffAuto",
+ x86_machine_get_smm, x86_machine_set_smm,
+ NULL, NULL);
+ object_class_property_set_description(oc, X86_MACHINE_SMM,
+ "Enable SMM");
+
+ object_class_property_add(oc, X86_MACHINE_ACPI, "OnOffAuto",
+ x86_machine_get_acpi, x86_machine_set_acpi,
+ NULL, NULL);
+ object_class_property_set_description(oc, X86_MACHINE_ACPI,
+ "Enable ACPI");
+
+ object_class_property_add_str(oc, X86_MACHINE_OEM_ID,
+ x86_machine_get_oem_id,
+ x86_machine_set_oem_id);
+ object_class_property_set_description(oc, X86_MACHINE_OEM_ID,
+ "Override the default value of field OEMID "
+ "in ACPI table header."
+ "The string may be up to 6 bytes in size");
+
+
+ object_class_property_add_str(oc, X86_MACHINE_OEM_TABLE_ID,
+ x86_machine_get_oem_table_id,
+ x86_machine_set_oem_table_id);
+ object_class_property_set_description(oc, X86_MACHINE_OEM_TABLE_ID,
+ "Override the default value of field OEM Table ID "
+ "in ACPI table header."
+ "The string may be up to 8 bytes in size");
+
+ object_class_property_add(oc, X86_MACHINE_BUS_LOCK_RATELIMIT, "uint64_t",
+ x86_machine_get_bus_lock_ratelimit,
+ x86_machine_set_bus_lock_ratelimit, NULL, NULL);
+ object_class_property_set_description(oc, X86_MACHINE_BUS_LOCK_RATELIMIT,
+ "Set the ratelimit for the bus locks acquired in VMs");
+
+ object_class_property_add(oc, "sgx-epc", "SgxEPC",
+ machine_get_sgx_epc, machine_set_sgx_epc,
+ NULL, NULL);
+ object_class_property_set_description(oc, "sgx-epc",
+ "SGX EPC device");
+}
+
+static const TypeInfo x86_machine_info = {
+ .name = TYPE_X86_MACHINE,
+ .parent = TYPE_MACHINE,
+ .abstract = true,
+ .instance_size = sizeof(X86MachineState),
+ .instance_init = x86_machine_initfn,
+ .class_size = sizeof(X86MachineClass),
+ .class_init = x86_machine_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_NMI },
+ { }
+ },
+};
+
+static void x86_machine_register_types(void)
+{
+ type_register_static(&x86_machine_info);
+}
+
+type_init(x86_machine_register_types)
diff --git a/hw/i386/xen/meson.build b/hw/i386/xen/meson.build
new file mode 100644
index 000000000..be8413030
--- /dev/null
+++ b/hw/i386/xen/meson.build
@@ -0,0 +1,7 @@
+i386_ss.add(when: 'CONFIG_XEN', if_true: files(
+ 'xen-hvm.c',
+ 'xen-mapcache.c',
+ 'xen_apic.c',
+ 'xen_platform.c',
+ 'xen_pvdevice.c',
+))
diff --git a/hw/i386/xen/trace-events b/hw/i386/xen/trace-events
new file mode 100644
index 000000000..5d6be6109
--- /dev/null
+++ b/hw/i386/xen/trace-events
@@ -0,0 +1,28 @@
+# See docs/devel/tracing.rst for syntax documentation.
+
+# xen_platform.c
+xen_platform_log(char *s) "xen platform: %s"
+
+# xen_pvdevice.c
+xen_pv_mmio_read(uint64_t addr) "WARNING: read from Xen PV Device MMIO space (address 0x%"PRIx64")"
+xen_pv_mmio_write(uint64_t addr) "WARNING: write to Xen PV Device MMIO space (address 0x%"PRIx64")"
+
+# xen-hvm.c
+xen_ram_alloc(unsigned long ram_addr, unsigned long size) "requested: 0x%lx, size 0x%lx"
+xen_client_set_memory(uint64_t start_addr, unsigned long size, bool log_dirty) "0x%"PRIx64" size 0x%lx, log_dirty %i"
+handle_ioreq(void *req, uint32_t type, uint32_t dir, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p type=%d dir=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d"
+handle_ioreq_read(void *req, uint32_t type, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p read type=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d"
+handle_ioreq_write(void *req, uint32_t type, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p write type=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d"
+cpu_ioreq_pio(void *req, uint32_t dir, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p pio dir=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d"
+cpu_ioreq_pio_read_reg(void *req, uint64_t data, uint64_t addr, uint32_t size) "I/O=%p pio read reg data=0x%"PRIx64" port=0x%"PRIx64" size=%d"
+cpu_ioreq_pio_write_reg(void *req, uint64_t data, uint64_t addr, uint32_t size) "I/O=%p pio write reg data=0x%"PRIx64" port=0x%"PRIx64" size=%d"
+cpu_ioreq_move(void *req, uint32_t dir, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p copy dir=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d"
+xen_map_resource_ioreq(uint32_t id, void *addr) "id: %u addr: %p"
+cpu_ioreq_config_read(void *req, uint32_t sbdf, uint32_t reg, uint32_t size, uint32_t data) "I/O=%p sbdf=0x%x reg=%u size=%u data=0x%x"
+cpu_ioreq_config_write(void *req, uint32_t sbdf, uint32_t reg, uint32_t size, uint32_t data) "I/O=%p sbdf=0x%x reg=%u size=%u data=0x%x"
+
+# xen-mapcache.c
+xen_map_cache(uint64_t phys_addr) "want 0x%"PRIx64
+xen_remap_bucket(uint64_t index) "index 0x%"PRIx64
+xen_map_cache_return(void* ptr) "%p"
+
diff --git a/hw/i386/xen/trace.h b/hw/i386/xen/trace.h
new file mode 100644
index 000000000..a02bf755d
--- /dev/null
+++ b/hw/i386/xen/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-hw_i386_xen.h"
diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
new file mode 100644
index 000000000..482be9541
--- /dev/null
+++ b/hw/i386/xen/xen-hvm.c
@@ -0,0 +1,1620 @@
+/*
+ * Copyright (C) 2010 Citrix Ltd.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/units.h"
+
+#include "cpu.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_host.h"
+#include "hw/i386/pc.h"
+#include "hw/southbridge/piix.h"
+#include "hw/irq.h"
+#include "hw/hw.h"
+#include "hw/i386/apic-msidef.h"
+#include "hw/xen/xen_common.h"
+#include "hw/xen/xen-legacy-backend.h"
+#include "hw/xen/xen-bus.h"
+#include "hw/xen/xen-x86.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-migration.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "qemu/range.h"
+#include "sysemu/runstate.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/xen.h"
+#include "sysemu/xen-mapcache.h"
+#include "trace.h"
+
+#include <xen/hvm/ioreq.h>
+#include <xen/hvm/e820.h>
+
+//#define DEBUG_XEN_HVM
+
+#ifdef DEBUG_XEN_HVM
+#define DPRINTF(fmt, ...) \
+ do { fprintf(stderr, "xen: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+ do { } while (0)
+#endif
+
+static MemoryRegion ram_memory, ram_640k, ram_lo, ram_hi;
+static MemoryRegion *framebuffer;
+static bool xen_in_migration;
+
+/* Compatibility with older version */
+
+/*
+ * This allows QEMU to build on a system that has Xen 4.5 or earlier
+ * installed. It lives here (not in hw/xen/xen_common.h) because
+ * xen/hvm/ioreq.h needs to be included before this block, and
+ * hw/xen/xen_common.h needs to be included before xen/hvm/ioreq.h.
+ */
+#ifndef IOREQ_TYPE_VMWARE_PORT
+#define IOREQ_TYPE_VMWARE_PORT 3
+struct vmware_regs {
+ uint32_t esi;
+ uint32_t edi;
+ uint32_t ebx;
+ uint32_t ecx;
+ uint32_t edx;
+};
+typedef struct vmware_regs vmware_regs_t;
+
+struct shared_vmport_iopage {
+ struct vmware_regs vcpu_vmport_regs[1];
+};
+typedef struct shared_vmport_iopage shared_vmport_iopage_t;
+#endif
+
+static inline uint32_t xen_vcpu_eport(shared_iopage_t *shared_page, int i)
+{
+ return shared_page->vcpu_ioreq[i].vp_eport;
+}
+static inline ioreq_t *xen_vcpu_ioreq(shared_iopage_t *shared_page, int vcpu)
+{
+ return &shared_page->vcpu_ioreq[vcpu];
+}
+
+#define BUFFER_IO_MAX_DELAY 100
+
+typedef struct XenPhysmap {
+ hwaddr start_addr;
+ ram_addr_t size;
+ const char *name;
+ hwaddr phys_offset;
+
+ QLIST_ENTRY(XenPhysmap) list;
+} XenPhysmap;
+
+static QLIST_HEAD(, XenPhysmap) xen_physmap;
+
+typedef struct XenPciDevice {
+ PCIDevice *pci_dev;
+ uint32_t sbdf;
+ QLIST_ENTRY(XenPciDevice) entry;
+} XenPciDevice;
+
+typedef struct XenIOState {
+ ioservid_t ioservid;
+ shared_iopage_t *shared_page;
+ shared_vmport_iopage_t *shared_vmport_page;
+ buffered_iopage_t *buffered_io_page;
+ xenforeignmemory_resource_handle *fres;
+ QEMUTimer *buffered_io_timer;
+ CPUState **cpu_by_vcpu_id;
+    /* the evtchn port for polling the notification */
+ evtchn_port_t *ioreq_local_port;
+ /* evtchn remote and local ports for buffered io */
+ evtchn_port_t bufioreq_remote_port;
+ evtchn_port_t bufioreq_local_port;
+ /* the evtchn fd for polling */
+ xenevtchn_handle *xce_handle;
+ /* which vcpu we are serving */
+ int send_vcpu;
+
+ struct xs_handle *xenstore;
+ MemoryListener memory_listener;
+ MemoryListener io_listener;
+ QLIST_HEAD(, XenPciDevice) dev_list;
+ DeviceListener device_listener;
+ hwaddr free_phys_offset;
+ const XenPhysmap *log_for_dirtybit;
+ /* Buffer used by xen_sync_dirty_bitmap */
+ unsigned long *dirty_bitmap;
+
+ Notifier exit;
+ Notifier suspend;
+ Notifier wakeup;
+} XenIOState;
+
+/* Xen specific function for piix pci */
+
+int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num)
+{
+ return irq_num + (PCI_SLOT(pci_dev->devfn) << 2);
+}
+
+void xen_piix3_set_irq(void *opaque, int irq_num, int level)
+{
+ xen_set_pci_intx_level(xen_domid, 0, 0, irq_num >> 2,
+ irq_num & 3, level);
+}
+
+void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len)
+{
+ int i;
+
+ /* Scan for updates to PCI link routes (0x60-0x63). */
+ for (i = 0; i < len; i++) {
+ uint8_t v = (val >> (8 * i)) & 0xff;
+ if (v & 0x80) {
+ v = 0;
+ }
+ v &= 0xf;
+ if (((address + i) >= PIIX_PIRQCA) && ((address + i) <= PIIX_PIRQCD)) {
+ xen_set_pci_link_route(xen_domid, address + i - PIIX_PIRQCA, v);
+ }
+ }
+}
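+
+/*
+ * Illustrative example of the scan above (assumed values, for clarity only):
+ * a guest write of 0x0b to config offset 0x61 falls inside the 0x60-0x63
+ * window, so it is treated as the PIRQB route; bit 7 is clear (link enabled),
+ * the low nibble is 0xb, and xen_set_pci_link_route() points link 1 at ISA
+ * IRQ 11. A value with bit 7 set (e.g. 0x80) is forced to 0, i.e. "no route".
+ */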
+
+int xen_is_pirq_msi(uint32_t msi_data)
+{
+    /* If the vector is 0, the MSI is remapped into a PIRQ, which is passed
+     * as dest_id.
+     */
+ return ((msi_data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT) == 0;
+}
+
+void xen_hvm_inject_msi(uint64_t addr, uint32_t data)
+{
+ xen_inject_msi(xen_domid, addr, data);
+}
+
+static void xen_suspend_notifier(Notifier *notifier, void *data)
+{
+ xc_set_hvm_param(xen_xc, xen_domid, HVM_PARAM_ACPI_S_STATE, 3);
+}
+
+/* Xen Interrupt Controller */
+
+static void xen_set_irq(void *opaque, int irq, int level)
+{
+ xen_set_isa_irq_level(xen_domid, irq, level);
+}
+
+qemu_irq *xen_interrupt_controller_init(void)
+{
+ return qemu_allocate_irqs(xen_set_irq, NULL, 16);
+}
+
+/* Memory Ops */
+
+static void xen_ram_init(PCMachineState *pcms,
+ ram_addr_t ram_size, MemoryRegion **ram_memory_p)
+{
+ X86MachineState *x86ms = X86_MACHINE(pcms);
+ MemoryRegion *sysmem = get_system_memory();
+ ram_addr_t block_len;
+ uint64_t user_lowmem =
+ object_property_get_uint(qdev_get_machine(),
+ PC_MACHINE_MAX_RAM_BELOW_4G,
+ &error_abort);
+
+    /* Handle the machine option max-ram-below-4g: the effective limit is
+     * min(Xen limit, user limit).
+     */
+ if (!user_lowmem) {
+ user_lowmem = HVM_BELOW_4G_RAM_END; /* default */
+ }
+ if (HVM_BELOW_4G_RAM_END <= user_lowmem) {
+ user_lowmem = HVM_BELOW_4G_RAM_END;
+ }
+
+ if (ram_size >= user_lowmem) {
+ x86ms->above_4g_mem_size = ram_size - user_lowmem;
+ x86ms->below_4g_mem_size = user_lowmem;
+ } else {
+ x86ms->above_4g_mem_size = 0;
+ x86ms->below_4g_mem_size = ram_size;
+ }
+ if (!x86ms->above_4g_mem_size) {
+ block_len = ram_size;
+ } else {
+        /*
+         * Xen does not allocate the memory contiguously; it keeps a
+         * hole of the size computed above or passed in.
+         */
+ block_len = (4 * GiB) + x86ms->above_4g_mem_size;
+ }
+ memory_region_init_ram(&ram_memory, NULL, "xen.ram", block_len,
+ &error_fatal);
+ *ram_memory_p = &ram_memory;
+
+ memory_region_init_alias(&ram_640k, NULL, "xen.ram.640k",
+ &ram_memory, 0, 0xa0000);
+ memory_region_add_subregion(sysmem, 0, &ram_640k);
+    /* Skip the VGA IO memory space; it will be registered later by the
+     * emulated VGA device.
+     *
+     * The area between 0xc0000 and 0x100000 will be used by SeaBIOS to load
+     * option ROMs, so it is registered here as RAM.
+     */
+ memory_region_init_alias(&ram_lo, NULL, "xen.ram.lo",
+ &ram_memory, 0xc0000,
+ x86ms->below_4g_mem_size - 0xc0000);
+ memory_region_add_subregion(sysmem, 0xc0000, &ram_lo);
+ if (x86ms->above_4g_mem_size > 0) {
+ memory_region_init_alias(&ram_hi, NULL, "xen.ram.hi",
+ &ram_memory, 0x100000000ULL,
+ x86ms->above_4g_mem_size);
+ memory_region_add_subregion(sysmem, 0x100000000ULL, &ram_hi);
+ }
+}
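+
+/*
+ * Worked example of the split above (illustrative, assuming the usual Xen
+ * value HVM_BELOW_4G_RAM_END == 0xf0000000, i.e. 3840 MiB): a guest with
+ * 6 GiB of RAM and no max-ram-below-4g option gets below_4g_mem_size ==
+ * 3840 MiB and above_4g_mem_size == 2304 MiB, and block_len == 4 GiB +
+ * 2304 MiB, so the MMIO hole between 3840 MiB and 4 GiB stays unbacked.
+ */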
+
+void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size, MemoryRegion *mr,
+ Error **errp)
+{
+ unsigned long nr_pfn;
+ xen_pfn_t *pfn_list;
+ int i;
+
+ if (runstate_check(RUN_STATE_INMIGRATE)) {
+ /* RAM already populated in Xen */
+ fprintf(stderr, "%s: do not alloc "RAM_ADDR_FMT
+ " bytes of ram at "RAM_ADDR_FMT" when runstate is INMIGRATE\n",
+ __func__, size, ram_addr);
+ return;
+ }
+
+ if (mr == &ram_memory) {
+ return;
+ }
+
+ trace_xen_ram_alloc(ram_addr, size);
+
+ nr_pfn = size >> TARGET_PAGE_BITS;
+ pfn_list = g_malloc(sizeof (*pfn_list) * nr_pfn);
+
+ for (i = 0; i < nr_pfn; i++) {
+ pfn_list[i] = (ram_addr >> TARGET_PAGE_BITS) + i;
+ }
+
+ if (xc_domain_populate_physmap_exact(xen_xc, xen_domid, nr_pfn, 0, 0, pfn_list)) {
+ error_setg(errp, "xen: failed to populate ram at " RAM_ADDR_FMT,
+ ram_addr);
+ }
+
+ g_free(pfn_list);
+}
+
+static XenPhysmap *get_physmapping(hwaddr start_addr, ram_addr_t size)
+{
+ XenPhysmap *physmap = NULL;
+
+ start_addr &= TARGET_PAGE_MASK;
+
+ QLIST_FOREACH(physmap, &xen_physmap, list) {
+ if (range_covers_byte(physmap->start_addr, physmap->size, start_addr)) {
+ return physmap;
+ }
+ }
+ return NULL;
+}
+
+static hwaddr xen_phys_offset_to_gaddr(hwaddr phys_offset, ram_addr_t size)
+{
+ hwaddr addr = phys_offset & TARGET_PAGE_MASK;
+ XenPhysmap *physmap = NULL;
+
+ QLIST_FOREACH(physmap, &xen_physmap, list) {
+ if (range_covers_byte(physmap->phys_offset, physmap->size, addr)) {
+ return physmap->start_addr + (phys_offset - physmap->phys_offset);
+ }
+ }
+
+ return phys_offset;
+}
+
+#ifdef XEN_COMPAT_PHYSMAP
+static int xen_save_physmap(XenIOState *state, XenPhysmap *physmap)
+{
+ char path[80], value[17];
+
+ snprintf(path, sizeof(path),
+ "/local/domain/0/device-model/%d/physmap/%"PRIx64"/start_addr",
+ xen_domid, (uint64_t)physmap->phys_offset);
+ snprintf(value, sizeof(value), "%"PRIx64, (uint64_t)physmap->start_addr);
+ if (!xs_write(state->xenstore, 0, path, value, strlen(value))) {
+ return -1;
+ }
+ snprintf(path, sizeof(path),
+ "/local/domain/0/device-model/%d/physmap/%"PRIx64"/size",
+ xen_domid, (uint64_t)physmap->phys_offset);
+ snprintf(value, sizeof(value), "%"PRIx64, (uint64_t)physmap->size);
+ if (!xs_write(state->xenstore, 0, path, value, strlen(value))) {
+ return -1;
+ }
+ if (physmap->name) {
+ snprintf(path, sizeof(path),
+ "/local/domain/0/device-model/%d/physmap/%"PRIx64"/name",
+ xen_domid, (uint64_t)physmap->phys_offset);
+ if (!xs_write(state->xenstore, 0, path,
+ physmap->name, strlen(physmap->name))) {
+ return -1;
+ }
+ }
+ return 0;
+}
+#else
+static int xen_save_physmap(XenIOState *state, XenPhysmap *physmap)
+{
+ return 0;
+}
+#endif
+
+static int xen_add_to_physmap(XenIOState *state,
+ hwaddr start_addr,
+ ram_addr_t size,
+ MemoryRegion *mr,
+ hwaddr offset_within_region)
+{
+ unsigned long nr_pages;
+ int rc = 0;
+ XenPhysmap *physmap = NULL;
+ hwaddr pfn, start_gpfn;
+ hwaddr phys_offset = memory_region_get_ram_addr(mr);
+ const char *mr_name;
+
+ if (get_physmapping(start_addr, size)) {
+ return 0;
+ }
+ if (size <= 0) {
+ return -1;
+ }
+
+    /* Xen can only handle a single dirty log region for now and we want
+     * the linear framebuffer to be that region.
+     * Avoid tracking any region that is not video RAM, and avoid tracking
+     * the legacy VGA region. */
+ if (mr == framebuffer && start_addr > 0xbffff) {
+ goto go_physmap;
+ }
+ return -1;
+
+go_physmap:
+ DPRINTF("mapping vram to %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
+ start_addr, start_addr + size);
+
+ mr_name = memory_region_name(mr);
+
+ physmap = g_malloc(sizeof(XenPhysmap));
+
+ physmap->start_addr = start_addr;
+ physmap->size = size;
+ physmap->name = mr_name;
+ physmap->phys_offset = phys_offset;
+
+ QLIST_INSERT_HEAD(&xen_physmap, physmap, list);
+
+ if (runstate_check(RUN_STATE_INMIGRATE)) {
+        /* Now that we have a physmap entry, we can replace the dummy mapping
+         * with a real mapping of guest foreign memory. */
+ uint8_t *p = xen_replace_cache_entry(phys_offset, start_addr, size);
+ assert(p && p == memory_region_get_ram_ptr(mr));
+
+ return 0;
+ }
+
+ pfn = phys_offset >> TARGET_PAGE_BITS;
+ start_gpfn = start_addr >> TARGET_PAGE_BITS;
+ nr_pages = size >> TARGET_PAGE_BITS;
+ rc = xendevicemodel_relocate_memory(xen_dmod, xen_domid, nr_pages, pfn,
+ start_gpfn);
+ if (rc) {
+ int saved_errno = errno;
+
+ error_report("relocate_memory %lu pages from GFN %"HWADDR_PRIx
+ " to GFN %"HWADDR_PRIx" failed: %s",
+ nr_pages, pfn, start_gpfn, strerror(saved_errno));
+ errno = saved_errno;
+ return -1;
+ }
+
+ rc = xendevicemodel_pin_memory_cacheattr(xen_dmod, xen_domid,
+ start_addr >> TARGET_PAGE_BITS,
+ (start_addr + size - 1) >> TARGET_PAGE_BITS,
+ XEN_DOMCTL_MEM_CACHEATTR_WB);
+ if (rc) {
+ error_report("pin_memory_cacheattr failed: %s", strerror(errno));
+ }
+ return xen_save_physmap(state, physmap);
+}
+
+static int xen_remove_from_physmap(XenIOState *state,
+ hwaddr start_addr,
+ ram_addr_t size)
+{
+ int rc = 0;
+ XenPhysmap *physmap = NULL;
+ hwaddr phys_offset = 0;
+
+ physmap = get_physmapping(start_addr, size);
+ if (physmap == NULL) {
+ return -1;
+ }
+
+ phys_offset = physmap->phys_offset;
+ size = physmap->size;
+
+ DPRINTF("unmapping vram to %"HWADDR_PRIx" - %"HWADDR_PRIx", at "
+ "%"HWADDR_PRIx"\n", start_addr, start_addr + size, phys_offset);
+
+ size >>= TARGET_PAGE_BITS;
+ start_addr >>= TARGET_PAGE_BITS;
+ phys_offset >>= TARGET_PAGE_BITS;
+ rc = xendevicemodel_relocate_memory(xen_dmod, xen_domid, size, start_addr,
+ phys_offset);
+ if (rc) {
+ int saved_errno = errno;
+
+ error_report("relocate_memory "RAM_ADDR_FMT" pages"
+ " from GFN %"HWADDR_PRIx
+ " to GFN %"HWADDR_PRIx" failed: %s",
+ size, start_addr, phys_offset, strerror(saved_errno));
+ errno = saved_errno;
+ return -1;
+ }
+
+ QLIST_REMOVE(physmap, list);
+ if (state->log_for_dirtybit == physmap) {
+ state->log_for_dirtybit = NULL;
+ g_free(state->dirty_bitmap);
+ state->dirty_bitmap = NULL;
+ }
+ g_free(physmap);
+
+ return 0;
+}
+
+static void xen_set_memory(struct MemoryListener *listener,
+ MemoryRegionSection *section,
+ bool add)
+{
+ XenIOState *state = container_of(listener, XenIOState, memory_listener);
+ hwaddr start_addr = section->offset_within_address_space;
+ ram_addr_t size = int128_get64(section->size);
+ bool log_dirty = memory_region_is_logging(section->mr, DIRTY_MEMORY_VGA);
+ hvmmem_type_t mem_type;
+
+ if (section->mr == &ram_memory) {
+ return;
+ } else {
+ if (add) {
+ xen_map_memory_section(xen_domid, state->ioservid,
+ section);
+ } else {
+ xen_unmap_memory_section(xen_domid, state->ioservid,
+ section);
+ }
+ }
+
+ if (!memory_region_is_ram(section->mr)) {
+ return;
+ }
+
+ if (log_dirty != add) {
+ return;
+ }
+
+ trace_xen_client_set_memory(start_addr, size, log_dirty);
+
+ start_addr &= TARGET_PAGE_MASK;
+ size = TARGET_PAGE_ALIGN(size);
+
+ if (add) {
+ if (!memory_region_is_rom(section->mr)) {
+ xen_add_to_physmap(state, start_addr, size,
+ section->mr, section->offset_within_region);
+ } else {
+ mem_type = HVMMEM_ram_ro;
+ if (xen_set_mem_type(xen_domid, mem_type,
+ start_addr >> TARGET_PAGE_BITS,
+ size >> TARGET_PAGE_BITS)) {
+ DPRINTF("xen_set_mem_type error, addr: "TARGET_FMT_plx"\n",
+ start_addr);
+ }
+ }
+ } else {
+ if (xen_remove_from_physmap(state, start_addr, size) < 0) {
+ DPRINTF("physmapping does not exist at "TARGET_FMT_plx"\n", start_addr);
+ }
+ }
+}
+
+static void xen_region_add(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ memory_region_ref(section->mr);
+ xen_set_memory(listener, section, true);
+}
+
+static void xen_region_del(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ xen_set_memory(listener, section, false);
+ memory_region_unref(section->mr);
+}
+
+static void xen_io_add(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ XenIOState *state = container_of(listener, XenIOState, io_listener);
+ MemoryRegion *mr = section->mr;
+
+ if (mr->ops == &unassigned_io_ops) {
+ return;
+ }
+
+ memory_region_ref(mr);
+
+ xen_map_io_section(xen_domid, state->ioservid, section);
+}
+
+static void xen_io_del(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ XenIOState *state = container_of(listener, XenIOState, io_listener);
+ MemoryRegion *mr = section->mr;
+
+ if (mr->ops == &unassigned_io_ops) {
+ return;
+ }
+
+ xen_unmap_io_section(xen_domid, state->ioservid, section);
+
+ memory_region_unref(mr);
+}
+
+static void xen_device_realize(DeviceListener *listener,
+ DeviceState *dev)
+{
+ XenIOState *state = container_of(listener, XenIOState, device_listener);
+
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+ PCIDevice *pci_dev = PCI_DEVICE(dev);
+ XenPciDevice *xendev = g_new(XenPciDevice, 1);
+
+ xendev->pci_dev = pci_dev;
+ xendev->sbdf = PCI_BUILD_BDF(pci_dev_bus_num(pci_dev),
+ pci_dev->devfn);
+ QLIST_INSERT_HEAD(&state->dev_list, xendev, entry);
+
+ xen_map_pcidev(xen_domid, state->ioservid, pci_dev);
+ }
+}
+
+static void xen_device_unrealize(DeviceListener *listener,
+ DeviceState *dev)
+{
+ XenIOState *state = container_of(listener, XenIOState, device_listener);
+
+ if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+ PCIDevice *pci_dev = PCI_DEVICE(dev);
+ XenPciDevice *xendev, *next;
+
+ xen_unmap_pcidev(xen_domid, state->ioservid, pci_dev);
+
+ QLIST_FOREACH_SAFE(xendev, &state->dev_list, entry, next) {
+ if (xendev->pci_dev == pci_dev) {
+ QLIST_REMOVE(xendev, entry);
+ g_free(xendev);
+ break;
+ }
+ }
+ }
+}
+
+static void xen_sync_dirty_bitmap(XenIOState *state,
+ hwaddr start_addr,
+ ram_addr_t size)
+{
+ hwaddr npages = size >> TARGET_PAGE_BITS;
+ const int width = sizeof(unsigned long) * 8;
+ size_t bitmap_size = DIV_ROUND_UP(npages, width);
+ int rc, i, j;
+ const XenPhysmap *physmap = NULL;
+
+ physmap = get_physmapping(start_addr, size);
+ if (physmap == NULL) {
+ /* not handled */
+ return;
+ }
+
+ if (state->log_for_dirtybit == NULL) {
+ state->log_for_dirtybit = physmap;
+ state->dirty_bitmap = g_new(unsigned long, bitmap_size);
+ } else if (state->log_for_dirtybit != physmap) {
+ /* Only one range for dirty bitmap can be tracked. */
+ return;
+ }
+
+ rc = xen_track_dirty_vram(xen_domid, start_addr >> TARGET_PAGE_BITS,
+ npages, state->dirty_bitmap);
+ if (rc < 0) {
+#ifndef ENODATA
+#define ENODATA ENOENT
+#endif
+ if (errno == ENODATA) {
+ memory_region_set_dirty(framebuffer, 0, size);
+ DPRINTF("xen: track_dirty_vram failed (0x" TARGET_FMT_plx
+ ", 0x" TARGET_FMT_plx "): %s\n",
+ start_addr, start_addr + size, strerror(errno));
+ }
+ return;
+ }
+
+ for (i = 0; i < bitmap_size; i++) {
+ unsigned long map = state->dirty_bitmap[i];
+ while (map != 0) {
+ j = ctzl(map);
+ map &= ~(1ul << j);
+ memory_region_set_dirty(framebuffer,
+ (i * width + j) * TARGET_PAGE_SIZE,
+ TARGET_PAGE_SIZE);
+        }
+ }
+}
+
+static void xen_log_start(MemoryListener *listener,
+ MemoryRegionSection *section,
+ int old, int new)
+{
+ XenIOState *state = container_of(listener, XenIOState, memory_listener);
+
+ if (new & ~old & (1 << DIRTY_MEMORY_VGA)) {
+ xen_sync_dirty_bitmap(state, section->offset_within_address_space,
+ int128_get64(section->size));
+ }
+}
+
+static void xen_log_stop(MemoryListener *listener, MemoryRegionSection *section,
+ int old, int new)
+{
+ XenIOState *state = container_of(listener, XenIOState, memory_listener);
+
+ if (old & ~new & (1 << DIRTY_MEMORY_VGA)) {
+ state->log_for_dirtybit = NULL;
+ g_free(state->dirty_bitmap);
+ state->dirty_bitmap = NULL;
+ /* Disable dirty bit tracking */
+ xen_track_dirty_vram(xen_domid, 0, 0, NULL);
+ }
+}
+
+static void xen_log_sync(MemoryListener *listener, MemoryRegionSection *section)
+{
+ XenIOState *state = container_of(listener, XenIOState, memory_listener);
+
+ xen_sync_dirty_bitmap(state, section->offset_within_address_space,
+ int128_get64(section->size));
+}
+
+static void xen_log_global_start(MemoryListener *listener)
+{
+ if (xen_enabled()) {
+ xen_in_migration = true;
+ }
+}
+
+static void xen_log_global_stop(MemoryListener *listener)
+{
+ xen_in_migration = false;
+}
+
+static MemoryListener xen_memory_listener = {
+ .name = "xen-memory",
+ .region_add = xen_region_add,
+ .region_del = xen_region_del,
+ .log_start = xen_log_start,
+ .log_stop = xen_log_stop,
+ .log_sync = xen_log_sync,
+ .log_global_start = xen_log_global_start,
+ .log_global_stop = xen_log_global_stop,
+ .priority = 10,
+};
+
+static MemoryListener xen_io_listener = {
+ .name = "xen-io",
+ .region_add = xen_io_add,
+ .region_del = xen_io_del,
+ .priority = 10,
+};
+
+static DeviceListener xen_device_listener = {
+ .realize = xen_device_realize,
+ .unrealize = xen_device_unrealize,
+};
+
+/* get the ioreq packets from shared memory */
+static ioreq_t *cpu_get_ioreq_from_shared_memory(XenIOState *state, int vcpu)
+{
+ ioreq_t *req = xen_vcpu_ioreq(state->shared_page, vcpu);
+
+ if (req->state != STATE_IOREQ_READY) {
+ DPRINTF("I/O request not ready: "
+ "%x, ptr: %x, port: %"PRIx64", "
+ "data: %"PRIx64", count: %u, size: %u\n",
+ req->state, req->data_is_ptr, req->addr,
+ req->data, req->count, req->size);
+ return NULL;
+ }
+
+ xen_rmb(); /* see IOREQ_READY /then/ read contents of ioreq */
+
+ req->state = STATE_IOREQ_INPROCESS;
+ return req;
+}
+
+/*
+ * Use poll to get the port notification.
+ * Returns the pending ioreq for the signalling vcpu, or NULL if there is
+ * nothing to handle (buffered I/O, read error, or no event pending).
+ */
+static ioreq_t *cpu_get_ioreq(XenIOState *state)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ unsigned int max_cpus = ms->smp.max_cpus;
+ int i;
+ evtchn_port_t port;
+
+ port = xenevtchn_pending(state->xce_handle);
+ if (port == state->bufioreq_local_port) {
+ timer_mod(state->buffered_io_timer,
+ BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
+ return NULL;
+ }
+
+ if (port != -1) {
+ for (i = 0; i < max_cpus; i++) {
+ if (state->ioreq_local_port[i] == port) {
+ break;
+ }
+ }
+
+ if (i == max_cpus) {
+ hw_error("Fatal error while trying to get io event!\n");
+ }
+
+ /* unmask the wanted port again */
+ xenevtchn_unmask(state->xce_handle, port);
+
+ /* get the io packet from shared memory */
+ state->send_vcpu = i;
+ return cpu_get_ioreq_from_shared_memory(state, i);
+ }
+
+ /* read error or read nothing */
+ return NULL;
+}
+
+static uint32_t do_inp(uint32_t addr, unsigned long size)
+{
+ switch (size) {
+ case 1:
+ return cpu_inb(addr);
+ case 2:
+ return cpu_inw(addr);
+ case 4:
+ return cpu_inl(addr);
+ default:
+ hw_error("inp: bad size: %04x %lx", addr, size);
+ }
+}
+
+static void do_outp(uint32_t addr,
+ unsigned long size, uint32_t val)
+{
+ switch (size) {
+ case 1:
+ return cpu_outb(addr, val);
+ case 2:
+ return cpu_outw(addr, val);
+ case 4:
+ return cpu_outl(addr, val);
+ default:
+ hw_error("outp: bad size: %04x %lx", addr, size);
+ }
+}
+
+/*
+ * Helper functions which read/write an object from/to physical guest
+ * memory, as part of the implementation of an ioreq.
+ *
+ * Equivalent to
+ * cpu_physical_memory_rw(addr + (req->df ? -1 : +1) * req->size * i,
+ * val, req->size, 0/1)
+ * except without the integer overflow problems.
+ */
+static void rw_phys_req_item(hwaddr addr,
+ ioreq_t *req, uint32_t i, void *val, int rw)
+{
+ /* Do everything unsigned so overflow just results in a truncated result
+ * and accesses to undesired parts of guest memory, which is up
+ * to the guest */
+ hwaddr offset = (hwaddr)req->size * i;
+ if (req->df) {
+ addr -= offset;
+ } else {
+ addr += offset;
+ }
+ cpu_physical_memory_rw(addr, val, req->size, rw);
+}
+
+static inline void read_phys_req_item(hwaddr addr,
+ ioreq_t *req, uint32_t i, void *val)
+{
+ rw_phys_req_item(addr, req, i, val, 0);
+}
+static inline void write_phys_req_item(hwaddr addr,
+ ioreq_t *req, uint32_t i, void *val)
+{
+ rw_phys_req_item(addr, req, i, val, 1);
+}
+
+
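+/*
+ * Illustrative mapping of a string PIO request onto the helpers above: a
+ * "rep outsw" with count == 4, size == 2 and df == 0 arrives as an
+ * IOREQ_WRITE with data_is_ptr set, so cpu_ioreq_pio() below reads the
+ * items from guest memory at data, data + 2, data + 4 and data + 6 and
+ * writes each one to the port; with df == 1 the offsets are subtracted
+ * instead, matching a descending string operation.
+ */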
+static void cpu_ioreq_pio(ioreq_t *req)
+{
+ uint32_t i;
+
+ trace_cpu_ioreq_pio(req, req->dir, req->df, req->data_is_ptr, req->addr,
+ req->data, req->count, req->size);
+
+ if (req->size > sizeof(uint32_t)) {
+ hw_error("PIO: bad size (%u)", req->size);
+ }
+
+ if (req->dir == IOREQ_READ) {
+ if (!req->data_is_ptr) {
+ req->data = do_inp(req->addr, req->size);
+ trace_cpu_ioreq_pio_read_reg(req, req->data, req->addr,
+ req->size);
+ } else {
+ uint32_t tmp;
+
+ for (i = 0; i < req->count; i++) {
+ tmp = do_inp(req->addr, req->size);
+ write_phys_req_item(req->data, req, i, &tmp);
+ }
+ }
+ } else if (req->dir == IOREQ_WRITE) {
+ if (!req->data_is_ptr) {
+ trace_cpu_ioreq_pio_write_reg(req, req->data, req->addr,
+ req->size);
+ do_outp(req->addr, req->size, req->data);
+ } else {
+ for (i = 0; i < req->count; i++) {
+ uint32_t tmp = 0;
+
+ read_phys_req_item(req->data, req, i, &tmp);
+ do_outp(req->addr, req->size, tmp);
+ }
+ }
+ }
+}
+
+static void cpu_ioreq_move(ioreq_t *req)
+{
+ uint32_t i;
+
+ trace_cpu_ioreq_move(req, req->dir, req->df, req->data_is_ptr, req->addr,
+ req->data, req->count, req->size);
+
+ if (req->size > sizeof(req->data)) {
+ hw_error("MMIO: bad size (%u)", req->size);
+ }
+
+ if (!req->data_is_ptr) {
+ if (req->dir == IOREQ_READ) {
+ for (i = 0; i < req->count; i++) {
+ read_phys_req_item(req->addr, req, i, &req->data);
+ }
+ } else if (req->dir == IOREQ_WRITE) {
+ for (i = 0; i < req->count; i++) {
+ write_phys_req_item(req->addr, req, i, &req->data);
+ }
+ }
+ } else {
+ uint64_t tmp;
+
+ if (req->dir == IOREQ_READ) {
+ for (i = 0; i < req->count; i++) {
+ read_phys_req_item(req->addr, req, i, &tmp);
+ write_phys_req_item(req->data, req, i, &tmp);
+ }
+ } else if (req->dir == IOREQ_WRITE) {
+ for (i = 0; i < req->count; i++) {
+ read_phys_req_item(req->data, req, i, &tmp);
+ write_phys_req_item(req->addr, req, i, &tmp);
+ }
+ }
+ }
+}
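+
+/*
+ * Note on the IOREQ_TYPE_PCI_CONFIG encoding handled below (illustrative
+ * example): req->addr packs the SBDF in the upper 32 bits and the
+ * config-space offset in the lower 32 bits. A read of register 0x10 (BAR0)
+ * on bus 0, device 2, function 0 therefore arrives with
+ * addr == ((uint64_t)0x0010 << 32) | 0x10, since
+ * PCI_BUILD_BDF(0, PCI_DEVFN(2, 0)) == 0x0010.
+ */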
+
+static void cpu_ioreq_config(XenIOState *state, ioreq_t *req)
+{
+ uint32_t sbdf = req->addr >> 32;
+ uint32_t reg = req->addr;
+ XenPciDevice *xendev;
+
+ if (req->size != sizeof(uint8_t) && req->size != sizeof(uint16_t) &&
+ req->size != sizeof(uint32_t)) {
+ hw_error("PCI config access: bad size (%u)", req->size);
+ }
+
+ if (req->count != 1) {
+ hw_error("PCI config access: bad count (%u)", req->count);
+ }
+
+ QLIST_FOREACH(xendev, &state->dev_list, entry) {
+ if (xendev->sbdf != sbdf) {
+ continue;
+ }
+
+ if (!req->data_is_ptr) {
+ if (req->dir == IOREQ_READ) {
+ req->data = pci_host_config_read_common(
+ xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE,
+ req->size);
+ trace_cpu_ioreq_config_read(req, xendev->sbdf, reg,
+ req->size, req->data);
+ } else if (req->dir == IOREQ_WRITE) {
+ trace_cpu_ioreq_config_write(req, xendev->sbdf, reg,
+ req->size, req->data);
+ pci_host_config_write_common(
+ xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE,
+ req->data, req->size);
+ }
+ } else {
+ uint32_t tmp;
+
+ if (req->dir == IOREQ_READ) {
+ tmp = pci_host_config_read_common(
+ xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE,
+ req->size);
+ trace_cpu_ioreq_config_read(req, xendev->sbdf, reg,
+ req->size, tmp);
+ write_phys_req_item(req->data, req, 0, &tmp);
+ } else if (req->dir == IOREQ_WRITE) {
+ read_phys_req_item(req->data, req, 0, &tmp);
+ trace_cpu_ioreq_config_write(req, xendev->sbdf, reg,
+ req->size, tmp);
+ pci_host_config_write_common(
+ xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE,
+ tmp, req->size);
+ }
+ }
+ }
+}
+
+static void regs_to_cpu(vmware_regs_t *vmport_regs, ioreq_t *req)
+{
+ X86CPU *cpu;
+ CPUX86State *env;
+
+ cpu = X86_CPU(current_cpu);
+ env = &cpu->env;
+ env->regs[R_EAX] = req->data;
+ env->regs[R_EBX] = vmport_regs->ebx;
+ env->regs[R_ECX] = vmport_regs->ecx;
+ env->regs[R_EDX] = vmport_regs->edx;
+ env->regs[R_ESI] = vmport_regs->esi;
+ env->regs[R_EDI] = vmport_regs->edi;
+}
+
+static void regs_from_cpu(vmware_regs_t *vmport_regs)
+{
+ X86CPU *cpu = X86_CPU(current_cpu);
+ CPUX86State *env = &cpu->env;
+
+ vmport_regs->ebx = env->regs[R_EBX];
+ vmport_regs->ecx = env->regs[R_ECX];
+ vmport_regs->edx = env->regs[R_EDX];
+ vmport_regs->esi = env->regs[R_ESI];
+ vmport_regs->edi = env->regs[R_EDI];
+}
+
+static void handle_vmport_ioreq(XenIOState *state, ioreq_t *req)
+{
+ vmware_regs_t *vmport_regs;
+
+ assert(state->shared_vmport_page);
+ vmport_regs =
+ &state->shared_vmport_page->vcpu_vmport_regs[state->send_vcpu];
+ QEMU_BUILD_BUG_ON(sizeof(*req) < sizeof(*vmport_regs));
+
+ current_cpu = state->cpu_by_vcpu_id[state->send_vcpu];
+ regs_to_cpu(vmport_regs, req);
+ cpu_ioreq_pio(req);
+ regs_from_cpu(vmport_regs);
+ current_cpu = NULL;
+}
+
+static void handle_ioreq(XenIOState *state, ioreq_t *req)
+{
+ trace_handle_ioreq(req, req->type, req->dir, req->df, req->data_is_ptr,
+ req->addr, req->data, req->count, req->size);
+
+ if (!req->data_is_ptr && (req->dir == IOREQ_WRITE) &&
+ (req->size < sizeof (target_ulong))) {
+ req->data &= ((target_ulong) 1 << (8 * req->size)) - 1;
+ }
+
+ if (req->dir == IOREQ_WRITE)
+ trace_handle_ioreq_write(req, req->type, req->df, req->data_is_ptr,
+ req->addr, req->data, req->count, req->size);
+
+ switch (req->type) {
+ case IOREQ_TYPE_PIO:
+ cpu_ioreq_pio(req);
+ break;
+ case IOREQ_TYPE_COPY:
+ cpu_ioreq_move(req);
+ break;
+ case IOREQ_TYPE_VMWARE_PORT:
+ handle_vmport_ioreq(state, req);
+ break;
+ case IOREQ_TYPE_TIMEOFFSET:
+ break;
+ case IOREQ_TYPE_INVALIDATE:
+ xen_invalidate_map_cache();
+ break;
+ case IOREQ_TYPE_PCI_CONFIG:
+ cpu_ioreq_config(state, req);
+ break;
+ default:
+ hw_error("Invalid ioreq type 0x%x\n", req->type);
+ }
+ if (req->dir == IOREQ_READ) {
+ trace_handle_ioreq_read(req, req->type, req->df, req->data_is_ptr,
+ req->addr, req->data, req->count, req->size);
+ }
+}
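+
+/*
+ * Sketch of the buffered-ioreq ring consumption implemented below
+ * (descriptive only): snapshot read_pointer, issue a read barrier, read
+ * write_pointer, and retry if read_pointer moved in between; stop when the
+ * ring is empty (rdptr == wrptr). Each slot holds a write-only request;
+ * a "quad word" request (size 8) occupies two consecutive slots, the second
+ * carrying the upper 32 bits of data. After handling a request, the read
+ * pointer is advanced atomically by one or two slots.
+ */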
+
+static int handle_buffered_iopage(XenIOState *state)
+{
+ buffered_iopage_t *buf_page = state->buffered_io_page;
+ buf_ioreq_t *buf_req = NULL;
+ ioreq_t req;
+ int qw;
+
+ if (!buf_page) {
+ return 0;
+ }
+
+ memset(&req, 0x00, sizeof(req));
+ req.state = STATE_IOREQ_READY;
+ req.count = 1;
+ req.dir = IOREQ_WRITE;
+
+ for (;;) {
+ uint32_t rdptr = buf_page->read_pointer, wrptr;
+
+ xen_rmb();
+ wrptr = buf_page->write_pointer;
+ xen_rmb();
+ if (rdptr != buf_page->read_pointer) {
+ continue;
+ }
+ if (rdptr == wrptr) {
+ break;
+ }
+ buf_req = &buf_page->buf_ioreq[rdptr % IOREQ_BUFFER_SLOT_NUM];
+ req.size = 1U << buf_req->size;
+ req.addr = buf_req->addr;
+ req.data = buf_req->data;
+ req.type = buf_req->type;
+ xen_rmb();
+ qw = (req.size == 8);
+ if (qw) {
+ if (rdptr + 1 == wrptr) {
+ hw_error("Incomplete quad word buffered ioreq");
+ }
+ buf_req = &buf_page->buf_ioreq[(rdptr + 1) %
+ IOREQ_BUFFER_SLOT_NUM];
+ req.data |= ((uint64_t)buf_req->data) << 32;
+ xen_rmb();
+ }
+
+ handle_ioreq(state, &req);
+
+ /* Only req.data may get updated by handle_ioreq(), albeit even that
+ * should not happen as such data would never make it to the guest (we
+ * can only usefully see writes here after all).
+ */
+ assert(req.state == STATE_IOREQ_READY);
+ assert(req.count == 1);
+ assert(req.dir == IOREQ_WRITE);
+ assert(!req.data_is_ptr);
+
+ qatomic_add(&buf_page->read_pointer, qw + 1);
+ }
+
+ return req.count;
+}
+
+static void handle_buffered_io(void *opaque)
+{
+ XenIOState *state = opaque;
+
+ if (handle_buffered_iopage(state)) {
+ timer_mod(state->buffered_io_timer,
+ BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
+ } else {
+ timer_del(state->buffered_io_timer);
+ xenevtchn_unmask(state->xce_handle, state->bufioreq_local_port);
+ }
+}
+
+static void cpu_handle_ioreq(void *opaque)
+{
+ XenIOState *state = opaque;
+ ioreq_t *req = cpu_get_ioreq(state);
+
+ handle_buffered_iopage(state);
+ if (req) {
+ ioreq_t copy = *req;
+
+ xen_rmb();
+ handle_ioreq(state, &copy);
+ req->data = copy.data;
+
+ if (req->state != STATE_IOREQ_INPROCESS) {
+ fprintf(stderr, "Badness in I/O request ... not in service?!: "
+ "%x, ptr: %x, port: %"PRIx64", "
+ "data: %"PRIx64", count: %u, size: %u, type: %u\n",
+ req->state, req->data_is_ptr, req->addr,
+ req->data, req->count, req->size, req->type);
+ destroy_hvm_domain(false);
+ return;
+ }
+
+ xen_wmb(); /* Update ioreq contents /then/ update state. */
+
+ /*
+ * We do this before we send the response so that the tools
+ * have the opportunity to pick up on the reset before the
+ * guest resumes and does a hlt with interrupts disabled which
+ * causes Xen to powerdown the domain.
+ */
+ if (runstate_is_running()) {
+ ShutdownCause request;
+
+ if (qemu_shutdown_requested_get()) {
+ destroy_hvm_domain(false);
+ }
+ request = qemu_reset_requested_get();
+ if (request) {
+ qemu_system_reset(request);
+ destroy_hvm_domain(true);
+ }
+ }
+
+ req->state = STATE_IORESP_READY;
+ xenevtchn_notify(state->xce_handle,
+ state->ioreq_local_port[state->send_vcpu]);
+ }
+}
+
+static void xen_main_loop_prepare(XenIOState *state)
+{
+ int evtchn_fd = -1;
+
+ if (state->xce_handle != NULL) {
+ evtchn_fd = xenevtchn_fd(state->xce_handle);
+ }
+
+ state->buffered_io_timer = timer_new_ms(QEMU_CLOCK_REALTIME, handle_buffered_io,
+ state);
+
+ if (evtchn_fd != -1) {
+ CPUState *cpu_state;
+
+ DPRINTF("%s: Init cpu_by_vcpu_id\n", __func__);
+ CPU_FOREACH(cpu_state) {
+ DPRINTF("%s: cpu_by_vcpu_id[%d]=%p\n",
+ __func__, cpu_state->cpu_index, cpu_state);
+ state->cpu_by_vcpu_id[cpu_state->cpu_index] = cpu_state;
+ }
+ qemu_set_fd_handler(evtchn_fd, cpu_handle_ioreq, NULL, state);
+ }
+}
+
+
+static void xen_hvm_change_state_handler(void *opaque, bool running,
+ RunState rstate)
+{
+ XenIOState *state = opaque;
+
+ if (running) {
+ xen_main_loop_prepare(state);
+ }
+
+ xen_set_ioreq_server_state(xen_domid,
+ state->ioservid,
+ (rstate == RUN_STATE_RUNNING));
+}
+
+static void xen_exit_notifier(Notifier *n, void *data)
+{
+ XenIOState *state = container_of(n, XenIOState, exit);
+
+ xen_destroy_ioreq_server(xen_domid, state->ioservid);
+ if (state->fres != NULL) {
+ xenforeignmemory_unmap_resource(xen_fmem, state->fres);
+ }
+
+ xenevtchn_close(state->xce_handle);
+ xs_daemon_close(state->xenstore);
+}
+
+#ifdef XEN_COMPAT_PHYSMAP
+static void xen_read_physmap(XenIOState *state)
+{
+ XenPhysmap *physmap = NULL;
+ unsigned int len, num, i;
+ char path[80], *value = NULL;
+ char **entries = NULL;
+
+ snprintf(path, sizeof(path),
+ "/local/domain/0/device-model/%d/physmap", xen_domid);
+ entries = xs_directory(state->xenstore, 0, path, &num);
+    if (entries == NULL) {
+        return;
+    }
+
+ for (i = 0; i < num; i++) {
+ physmap = g_malloc(sizeof (XenPhysmap));
+ physmap->phys_offset = strtoull(entries[i], NULL, 16);
+ snprintf(path, sizeof(path),
+ "/local/domain/0/device-model/%d/physmap/%s/start_addr",
+ xen_domid, entries[i]);
+ value = xs_read(state->xenstore, 0, path, &len);
+ if (value == NULL) {
+ g_free(physmap);
+ continue;
+ }
+ physmap->start_addr = strtoull(value, NULL, 16);
+ free(value);
+
+ snprintf(path, sizeof(path),
+ "/local/domain/0/device-model/%d/physmap/%s/size",
+ xen_domid, entries[i]);
+ value = xs_read(state->xenstore, 0, path, &len);
+ if (value == NULL) {
+ g_free(physmap);
+ continue;
+ }
+ physmap->size = strtoull(value, NULL, 16);
+ free(value);
+
+ snprintf(path, sizeof(path),
+ "/local/domain/0/device-model/%d/physmap/%s/name",
+ xen_domid, entries[i]);
+ physmap->name = xs_read(state->xenstore, 0, path, &len);
+
+ QLIST_INSERT_HEAD(&xen_physmap, physmap, list);
+ }
+ free(entries);
+}
+#else
+static void xen_read_physmap(XenIOState *state)
+{
+}
+#endif
+
+static void xen_wakeup_notifier(Notifier *notifier, void *data)
+{
+ xc_set_hvm_param(xen_xc, xen_domid, HVM_PARAM_ACPI_S_STATE, 0);
+}
+
+static int xen_map_ioreq_server(XenIOState *state)
+{
+ void *addr = NULL;
+ xen_pfn_t ioreq_pfn;
+ xen_pfn_t bufioreq_pfn;
+ evtchn_port_t bufioreq_evtchn;
+ int rc;
+
+ /*
+ * Attempt to map using the resource API and fall back to normal
+ * foreign mapping if this is not supported.
+ */
+ QEMU_BUILD_BUG_ON(XENMEM_resource_ioreq_server_frame_bufioreq != 0);
+ QEMU_BUILD_BUG_ON(XENMEM_resource_ioreq_server_frame_ioreq(0) != 1);
+ state->fres = xenforeignmemory_map_resource(xen_fmem, xen_domid,
+ XENMEM_resource_ioreq_server,
+ state->ioservid, 0, 2,
+ &addr,
+ PROT_READ | PROT_WRITE, 0);
+ if (state->fres != NULL) {
+ trace_xen_map_resource_ioreq(state->ioservid, addr);
+ state->buffered_io_page = addr;
+ state->shared_page = addr + TARGET_PAGE_SIZE;
+ } else if (errno != EOPNOTSUPP) {
+ error_report("failed to map ioreq server resources: error %d handle=%p",
+ errno, xen_xc);
+ return -1;
+ }
+
+ rc = xen_get_ioreq_server_info(xen_domid, state->ioservid,
+ (state->shared_page == NULL) ?
+ &ioreq_pfn : NULL,
+ (state->buffered_io_page == NULL) ?
+ &bufioreq_pfn : NULL,
+ &bufioreq_evtchn);
+ if (rc < 0) {
+ error_report("failed to get ioreq server info: error %d handle=%p",
+ errno, xen_xc);
+ return rc;
+ }
+
+ if (state->shared_page == NULL) {
+ DPRINTF("shared page at pfn %lx\n", ioreq_pfn);
+
+ state->shared_page = xenforeignmemory_map(xen_fmem, xen_domid,
+ PROT_READ | PROT_WRITE,
+ 1, &ioreq_pfn, NULL);
+ if (state->shared_page == NULL) {
+ error_report("map shared IO page returned error %d handle=%p",
+ errno, xen_xc);
+ }
+ }
+
+ if (state->buffered_io_page == NULL) {
+ DPRINTF("buffered io page at pfn %lx\n", bufioreq_pfn);
+
+ state->buffered_io_page = xenforeignmemory_map(xen_fmem, xen_domid,
+ PROT_READ | PROT_WRITE,
+ 1, &bufioreq_pfn,
+ NULL);
+ if (state->buffered_io_page == NULL) {
+ error_report("map buffered IO page returned error %d", errno);
+ return -1;
+ }
+ }
+
+ if (state->shared_page == NULL || state->buffered_io_page == NULL) {
+ return -1;
+ }
+
+ DPRINTF("buffered io evtchn is %x\n", bufioreq_evtchn);
+
+ state->bufioreq_remote_port = bufioreq_evtchn;
+
+ return 0;
+}
+
+void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory)
+{
+ MachineState *ms = MACHINE(pcms);
+ unsigned int max_cpus = ms->smp.max_cpus;
+ int i, rc;
+ xen_pfn_t ioreq_pfn;
+ XenIOState *state;
+
+ state = g_malloc0(sizeof (XenIOState));
+
+ state->xce_handle = xenevtchn_open(NULL, 0);
+ if (state->xce_handle == NULL) {
+ perror("xen: event channel open");
+ goto err;
+ }
+
+ state->xenstore = xs_daemon_open();
+ if (state->xenstore == NULL) {
+ perror("xen: xenstore open");
+ goto err;
+ }
+
+ xen_create_ioreq_server(xen_domid, &state->ioservid);
+
+ state->exit.notify = xen_exit_notifier;
+ qemu_add_exit_notifier(&state->exit);
+
+ state->suspend.notify = xen_suspend_notifier;
+ qemu_register_suspend_notifier(&state->suspend);
+
+ state->wakeup.notify = xen_wakeup_notifier;
+ qemu_register_wakeup_notifier(&state->wakeup);
+
+ /*
+ * Register wake-up support in QMP query-current-machine API
+ */
+ qemu_register_wakeup_support();
+
+ rc = xen_map_ioreq_server(state);
+ if (rc < 0) {
+ goto err;
+ }
+
+ rc = xen_get_vmport_regs_pfn(xen_xc, xen_domid, &ioreq_pfn);
+ if (!rc) {
+ DPRINTF("shared vmport page at pfn %lx\n", ioreq_pfn);
+ state->shared_vmport_page =
+ xenforeignmemory_map(xen_fmem, xen_domid, PROT_READ|PROT_WRITE,
+ 1, &ioreq_pfn, NULL);
+ if (state->shared_vmport_page == NULL) {
+ error_report("map shared vmport IO page returned error %d handle=%p",
+ errno, xen_xc);
+ goto err;
+ }
+ } else if (rc != -ENOSYS) {
+ error_report("get vmport regs pfn returned error %d, rc=%d",
+ errno, rc);
+ goto err;
+ }
+
+ /* Note: cpus is empty at this point in init */
+ state->cpu_by_vcpu_id = g_malloc0(max_cpus * sizeof(CPUState *));
+
+ rc = xen_set_ioreq_server_state(xen_domid, state->ioservid, true);
+ if (rc < 0) {
+ error_report("failed to enable ioreq server info: error %d handle=%p",
+ errno, xen_xc);
+ goto err;
+ }
+
+ state->ioreq_local_port = g_malloc0(max_cpus * sizeof (evtchn_port_t));
+
+    /* FIXME: what if we overflow the page here? */
+ for (i = 0; i < max_cpus; i++) {
+ rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid,
+ xen_vcpu_eport(state->shared_page, i));
+ if (rc == -1) {
+ error_report("shared evtchn %d bind error %d", i, errno);
+ goto err;
+ }
+ state->ioreq_local_port[i] = rc;
+ }
+
+ rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid,
+ state->bufioreq_remote_port);
+ if (rc == -1) {
+ error_report("buffered evtchn bind error %d", errno);
+ goto err;
+ }
+ state->bufioreq_local_port = rc;
+
+ /* Init RAM management */
+#ifdef XEN_COMPAT_PHYSMAP
+ xen_map_cache_init(xen_phys_offset_to_gaddr, state);
+#else
+ xen_map_cache_init(NULL, state);
+#endif
+ xen_ram_init(pcms, ms->ram_size, ram_memory);
+
+ qemu_add_vm_change_state_handler(xen_hvm_change_state_handler, state);
+
+ state->memory_listener = xen_memory_listener;
+ memory_listener_register(&state->memory_listener, &address_space_memory);
+ state->log_for_dirtybit = NULL;
+
+ state->io_listener = xen_io_listener;
+ memory_listener_register(&state->io_listener, &address_space_io);
+
+ state->device_listener = xen_device_listener;
+ QLIST_INIT(&state->dev_list);
+ device_listener_register(&state->device_listener);
+
+ xen_bus_init();
+
+ /* Initialize backend core & drivers */
+ if (xen_be_init() != 0) {
+ error_report("xen backend core setup failed");
+ goto err;
+ }
+ xen_be_register_common();
+
+ QLIST_INIT(&xen_physmap);
+ xen_read_physmap(state);
+
+ /* Disable ACPI build because Xen handles it */
+ pcms->acpi_build_enabled = false;
+
+ return;
+
+err:
+ error_report("xen hardware virtual machine initialisation failed");
+ exit(1);
+}
+
+void destroy_hvm_domain(bool reboot)
+{
+ xc_interface *xc_handle;
+ int sts;
+ int rc;
+
+ unsigned int reason = reboot ? SHUTDOWN_reboot : SHUTDOWN_poweroff;
+
+ if (xen_dmod) {
+ rc = xendevicemodel_shutdown(xen_dmod, xen_domid, reason);
+ if (!rc) {
+ return;
+ }
+ if (errno != ENOTTY /* old Xen */) {
+ perror("xendevicemodel_shutdown failed");
+ }
+ /* well, try the old thing then */
+ }
+
+ xc_handle = xc_interface_open(0, 0, 0);
+ if (xc_handle == NULL) {
+ fprintf(stderr, "Cannot acquire xenctrl handle\n");
+ } else {
+ sts = xc_domain_shutdown(xc_handle, xen_domid, reason);
+ if (sts != 0) {
+ fprintf(stderr, "xc_domain_shutdown failed to issue %s, "
+ "sts %d, %s\n", reboot ? "reboot" : "poweroff",
+ sts, strerror(errno));
+ } else {
+ fprintf(stderr, "Issued domain %d %s\n", xen_domid,
+ reboot ? "reboot" : "poweroff");
+ }
+ xc_interface_close(xc_handle);
+ }
+}
+
+void xen_register_framebuffer(MemoryRegion *mr)
+{
+ framebuffer = mr;
+}
+
+void xen_shutdown_fatal_error(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ fprintf(stderr, "Will destroy the domain.\n");
+ /* destroy the domain */
+ qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR);
+}
+
+void xen_hvm_modified_memory(ram_addr_t start, ram_addr_t length)
+{
+ if (unlikely(xen_in_migration)) {
+ int rc;
+ ram_addr_t start_pfn, nb_pages;
+
+ start = xen_phys_offset_to_gaddr(start, length);
+
+ if (length == 0) {
+ length = TARGET_PAGE_SIZE;
+ }
+ start_pfn = start >> TARGET_PAGE_BITS;
+ nb_pages = ((start + length + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS)
+ - start_pfn;
+ rc = xen_modified_memory(xen_domid, start_pfn, nb_pages);
+ if (rc) {
+ fprintf(stderr,
+ "%s failed for "RAM_ADDR_FMT" ("RAM_ADDR_FMT"): %i, %s\n",
+ __func__, start, nb_pages, errno, strerror(errno));
+ }
+ }
+}
+
+void qmp_xen_set_global_dirty_log(bool enable, Error **errp)
+{
+ if (enable) {
+ memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
+ } else {
+ memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
+ }
+}
diff --git a/hw/i386/xen/xen-mapcache.c b/hw/i386/xen/xen-mapcache.c
new file mode 100644
index 000000000..bd47c3d67
--- /dev/null
+++ b/hw/i386/xen/xen-mapcache.c
@@ -0,0 +1,593 @@
+/*
+ * Copyright (C) 2011 Citrix Ltd.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/units.h"
+#include "qemu/error-report.h"
+
+#include <sys/resource.h>
+
+#include "hw/xen/xen-legacy-backend.h"
+#include "qemu/bitmap.h"
+
+#include "sysemu/runstate.h"
+#include "sysemu/xen-mapcache.h"
+#include "trace.h"
+
+
+//#define MAPCACHE_DEBUG
+
+#ifdef MAPCACHE_DEBUG
+# define DPRINTF(fmt, ...) do { \
+ fprintf(stderr, "xen_mapcache: " fmt, ## __VA_ARGS__); \
+} while (0)
+#else
+# define DPRINTF(fmt, ...) do { } while (0)
+#endif
+
+#if HOST_LONG_BITS == 32
+# define MCACHE_BUCKET_SHIFT 16
+# define MCACHE_MAX_SIZE (1UL<<31) /* 2GB Cap */
+#else
+# define MCACHE_BUCKET_SHIFT 20
+# define MCACHE_MAX_SIZE (1UL<<35) /* 32GB Cap */
+#endif
+#define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT)
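+
+/*
+ * Illustrative geometry (assuming 4 KiB Xen pages, i.e. XC_PAGE_SHIFT == 12):
+ * on a 64-bit host each bucket covers 1 MiB of guest-physical space and the
+ * cache is capped at 32 GiB, so xen_map_cache_init() below ends up with
+ * ((1UL << 35 >> 12) + 255) >> 8 == 32768 MapCacheEntry buckets.
+ */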
+
+/* This is the size of the virtual address space reserved for QEMU itself that
+ * will not be used by the mapcache.
+ * Empirical tests show that QEMU uses about 75MB more than max_mcache_size.
+ */
+#define NON_MCACHE_MEMORY_SIZE (80 * MiB)
+
+typedef struct MapCacheEntry {
+ hwaddr paddr_index;
+ uint8_t *vaddr_base;
+ unsigned long *valid_mapping;
+ uint8_t lock;
+#define XEN_MAPCACHE_ENTRY_DUMMY (1 << 0)
+ uint8_t flags;
+ hwaddr size;
+ struct MapCacheEntry *next;
+} MapCacheEntry;
+
+typedef struct MapCacheRev {
+ uint8_t *vaddr_req;
+ hwaddr paddr_index;
+ hwaddr size;
+ QTAILQ_ENTRY(MapCacheRev) next;
+ bool dma;
+} MapCacheRev;
+
+typedef struct MapCache {
+ MapCacheEntry *entry;
+ unsigned long nr_buckets;
+ QTAILQ_HEAD(, MapCacheRev) locked_entries;
+
+ /* For most cases (>99.9%), the page address is the same. */
+ MapCacheEntry *last_entry;
+ unsigned long max_mcache_size;
+ unsigned int mcache_bucket_shift;
+
+ phys_offset_to_gaddr_t phys_offset_to_gaddr;
+ QemuMutex lock;
+ void *opaque;
+} MapCache;
+
+static MapCache *mapcache;
+
+static inline void mapcache_lock(void)
+{
+ qemu_mutex_lock(&mapcache->lock);
+}
+
+static inline void mapcache_unlock(void)
+{
+ qemu_mutex_unlock(&mapcache->lock);
+}
+
+static inline int test_bits(int nr, int size, const unsigned long *addr)
+{
+ unsigned long res = find_next_zero_bit(addr, size + nr, nr);
+
+    return res >= nr + size;
+}
+
+void xen_map_cache_init(phys_offset_to_gaddr_t f, void *opaque)
+{
+ unsigned long size;
+ struct rlimit rlimit_as;
+
+ mapcache = g_malloc0(sizeof (MapCache));
+
+ mapcache->phys_offset_to_gaddr = f;
+ mapcache->opaque = opaque;
+ qemu_mutex_init(&mapcache->lock);
+
+ QTAILQ_INIT(&mapcache->locked_entries);
+
+ if (geteuid() == 0) {
+ rlimit_as.rlim_cur = RLIM_INFINITY;
+ rlimit_as.rlim_max = RLIM_INFINITY;
+ mapcache->max_mcache_size = MCACHE_MAX_SIZE;
+ } else {
+ getrlimit(RLIMIT_AS, &rlimit_as);
+ rlimit_as.rlim_cur = rlimit_as.rlim_max;
+
+ if (rlimit_as.rlim_max != RLIM_INFINITY) {
+ warn_report("QEMU's maximum size of virtual"
+ " memory is not infinity");
+ }
+ if (rlimit_as.rlim_max < MCACHE_MAX_SIZE + NON_MCACHE_MEMORY_SIZE) {
+ mapcache->max_mcache_size = rlimit_as.rlim_max -
+ NON_MCACHE_MEMORY_SIZE;
+ } else {
+ mapcache->max_mcache_size = MCACHE_MAX_SIZE;
+ }
+ }
+
+ setrlimit(RLIMIT_AS, &rlimit_as);
+
+ mapcache->nr_buckets =
+ (((mapcache->max_mcache_size >> XC_PAGE_SHIFT) +
+ (1UL << (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)) - 1) >>
+ (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT));
+
+ size = mapcache->nr_buckets * sizeof (MapCacheEntry);
+ size = (size + XC_PAGE_SIZE - 1) & ~(XC_PAGE_SIZE - 1);
+ DPRINTF("%s, nr_buckets = %lx size %lu\n", __func__,
+ mapcache->nr_buckets, size);
+ mapcache->entry = g_malloc0(size);
+}
+
+static void xen_remap_bucket(MapCacheEntry *entry,
+ void *vaddr,
+ hwaddr size,
+ hwaddr address_index,
+ bool dummy)
+{
+ uint8_t *vaddr_base;
+ xen_pfn_t *pfns;
+ int *err;
+ unsigned int i;
+ hwaddr nb_pfn = size >> XC_PAGE_SHIFT;
+
+ trace_xen_remap_bucket(address_index);
+
+ pfns = g_malloc0(nb_pfn * sizeof (xen_pfn_t));
+ err = g_malloc0(nb_pfn * sizeof (int));
+
+ if (entry->vaddr_base != NULL) {
+ if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) {
+ ram_block_notify_remove(entry->vaddr_base, entry->size,
+ entry->size);
+ }
+
+        /*
+         * If an entry is being replaced by another mapping and we're using
+         * the MAP_FIXED flag for it, there is a possibility of a race for the
+         * vaddr address with another thread doing an mmap call itself
+         * (see man 2 mmap). To avoid that, we skip the explicit unmapping here
+         * and let the kernel destroy the previous mappings by replacing
+         * them in the later mmap call.
+         *
+         * Therefore, non-identical replacements are not allowed.
+         */
+ assert(!vaddr || (entry->vaddr_base == vaddr && entry->size == size));
+
+ if (!vaddr && munmap(entry->vaddr_base, entry->size) != 0) {
+ perror("unmap fails");
+ exit(-1);
+ }
+ }
+ g_free(entry->valid_mapping);
+ entry->valid_mapping = NULL;
+
+ for (i = 0; i < nb_pfn; i++) {
+ pfns[i] = (address_index << (MCACHE_BUCKET_SHIFT-XC_PAGE_SHIFT)) + i;
+ }
+
+ /*
+ * If the caller has requested the mapping at a specific address use
+ * MAP_FIXED to make sure it's honored.
+ */
+ if (!dummy) {
+ vaddr_base = xenforeignmemory_map2(xen_fmem, xen_domid, vaddr,
+ PROT_READ | PROT_WRITE,
+ vaddr ? MAP_FIXED : 0,
+ nb_pfn, pfns, err);
+ if (vaddr_base == NULL) {
+ perror("xenforeignmemory_map2");
+ exit(-1);
+ }
+ } else {
+        /*
+         * We create dummy mappings where we are unable to create a foreign
+         * mapping immediately due to certain circumstances (e.g. on resume).
+         */
+ vaddr_base = mmap(vaddr, size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_SHARED | (vaddr ? MAP_FIXED : 0),
+ -1, 0);
+ if (vaddr_base == MAP_FAILED) {
+ perror("mmap");
+ exit(-1);
+ }
+ }
+
+ if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) {
+ ram_block_notify_add(vaddr_base, size, size);
+ }
+
+ entry->vaddr_base = vaddr_base;
+ entry->paddr_index = address_index;
+ entry->size = size;
+ entry->valid_mapping = (unsigned long *) g_malloc0(sizeof(unsigned long) *
+ BITS_TO_LONGS(size >> XC_PAGE_SHIFT));
+
+ if (dummy) {
+ entry->flags |= XEN_MAPCACHE_ENTRY_DUMMY;
+ } else {
+ entry->flags &= ~(XEN_MAPCACHE_ENTRY_DUMMY);
+ }
+
+ bitmap_zero(entry->valid_mapping, nb_pfn);
+ for (i = 0; i < nb_pfn; i++) {
+ if (!err[i]) {
+ bitmap_set(entry->valid_mapping, i, 1);
+ }
+ }
+
+ g_free(pfns);
+ g_free(err);
+}
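+
+/*
+ * Lifecycle note (descriptive only): during an incoming migration the
+ * mapcache may not be able to create foreign mappings yet, so
+ * xen_map_cache_unlocked() retries with dummy == true and this function
+ * backs the bucket with anonymous memory instead. Once the physmap entry is
+ * known, xen_replace_cache_entry() remaps the same virtual range (using
+ * MAP_FIXED via the non-NULL vaddr argument) onto real guest foreign memory.
+ */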
+
+static uint8_t *xen_map_cache_unlocked(hwaddr phys_addr, hwaddr size,
+ uint8_t lock, bool dma)
+{
+ MapCacheEntry *entry, *pentry = NULL,
+ *free_entry = NULL, *free_pentry = NULL;
+ hwaddr address_index;
+ hwaddr address_offset;
+ hwaddr cache_size = size;
+ hwaddr test_bit_size;
+ bool translated G_GNUC_UNUSED = false;
+ bool dummy = false;
+
+tryagain:
+ address_index = phys_addr >> MCACHE_BUCKET_SHIFT;
+ address_offset = phys_addr & (MCACHE_BUCKET_SIZE - 1);
+
+ trace_xen_map_cache(phys_addr);
+
+ /* test_bit_size is always a multiple of XC_PAGE_SIZE */
+ if (size) {
+ test_bit_size = size + (phys_addr & (XC_PAGE_SIZE - 1));
+
+ if (test_bit_size % XC_PAGE_SIZE) {
+ test_bit_size += XC_PAGE_SIZE - (test_bit_size % XC_PAGE_SIZE);
+ }
+ } else {
+ test_bit_size = XC_PAGE_SIZE;
+ }
+
+ if (mapcache->last_entry != NULL &&
+ mapcache->last_entry->paddr_index == address_index &&
+ !lock && !size &&
+ test_bits(address_offset >> XC_PAGE_SHIFT,
+ test_bit_size >> XC_PAGE_SHIFT,
+ mapcache->last_entry->valid_mapping)) {
+ trace_xen_map_cache_return(mapcache->last_entry->vaddr_base + address_offset);
+ return mapcache->last_entry->vaddr_base + address_offset;
+ }
+
+    /* cache_size is always a multiple of MCACHE_BUCKET_SIZE */
+ if (size) {
+ cache_size = size + address_offset;
+ if (cache_size % MCACHE_BUCKET_SIZE) {
+ cache_size += MCACHE_BUCKET_SIZE - (cache_size % MCACHE_BUCKET_SIZE);
+ }
+ } else {
+ cache_size = MCACHE_BUCKET_SIZE;
+ }
+
+ entry = &mapcache->entry[address_index % mapcache->nr_buckets];
+
+ while (entry && (lock || entry->lock) && entry->vaddr_base &&
+ (entry->paddr_index != address_index || entry->size != cache_size ||
+ !test_bits(address_offset >> XC_PAGE_SHIFT,
+ test_bit_size >> XC_PAGE_SHIFT,
+ entry->valid_mapping))) {
+ if (!free_entry && !entry->lock) {
+ free_entry = entry;
+ free_pentry = pentry;
+ }
+ pentry = entry;
+ entry = entry->next;
+ }
+ if (!entry && free_entry) {
+ entry = free_entry;
+ pentry = free_pentry;
+ }
+ if (!entry) {
+ entry = g_malloc0(sizeof (MapCacheEntry));
+ pentry->next = entry;
+ xen_remap_bucket(entry, NULL, cache_size, address_index, dummy);
+ } else if (!entry->lock) {
+ if (!entry->vaddr_base || entry->paddr_index != address_index ||
+ entry->size != cache_size ||
+ !test_bits(address_offset >> XC_PAGE_SHIFT,
+ test_bit_size >> XC_PAGE_SHIFT,
+ entry->valid_mapping)) {
+ xen_remap_bucket(entry, NULL, cache_size, address_index, dummy);
+ }
+ }
+
+    if (!test_bits(address_offset >> XC_PAGE_SHIFT,
+                   test_bit_size >> XC_PAGE_SHIFT,
+                   entry->valid_mapping)) {
+ mapcache->last_entry = NULL;
+#ifdef XEN_COMPAT_PHYSMAP
+ if (!translated && mapcache->phys_offset_to_gaddr) {
+ phys_addr = mapcache->phys_offset_to_gaddr(phys_addr, size);
+ translated = true;
+ goto tryagain;
+ }
+#endif
+ if (!dummy && runstate_check(RUN_STATE_INMIGRATE)) {
+ dummy = true;
+ goto tryagain;
+ }
+ trace_xen_map_cache_return(NULL);
+ return NULL;
+ }
+
+ mapcache->last_entry = entry;
+ if (lock) {
+ MapCacheRev *reventry = g_malloc0(sizeof(MapCacheRev));
+ entry->lock++;
+ reventry->dma = dma;
+ reventry->vaddr_req = mapcache->last_entry->vaddr_base + address_offset;
+ reventry->paddr_index = mapcache->last_entry->paddr_index;
+ reventry->size = entry->size;
+ QTAILQ_INSERT_HEAD(&mapcache->locked_entries, reventry, next);
+ }
+
+ trace_xen_map_cache_return(mapcache->last_entry->vaddr_base + address_offset);
+ return mapcache->last_entry->vaddr_base + address_offset;
+}
+
+uint8_t *xen_map_cache(hwaddr phys_addr, hwaddr size,
+ uint8_t lock, bool dma)
+{
+ uint8_t *p;
+
+ mapcache_lock();
+ p = xen_map_cache_unlocked(phys_addr, size, lock, dma);
+ mapcache_unlock();
+ return p;
+}
+
+ram_addr_t xen_ram_addr_from_mapcache(void *ptr)
+{
+ MapCacheEntry *entry = NULL;
+ MapCacheRev *reventry;
+ hwaddr paddr_index;
+ hwaddr size;
+ ram_addr_t raddr;
+ int found = 0;
+
+ mapcache_lock();
+ QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+ if (reventry->vaddr_req == ptr) {
+ paddr_index = reventry->paddr_index;
+ size = reventry->size;
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ fprintf(stderr, "%s, could not find %p\n", __func__, ptr);
+ QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+ DPRINTF(" "TARGET_FMT_plx" -> %p is present\n", reventry->paddr_index,
+ reventry->vaddr_req);
+ }
+ abort();
+ return 0;
+ }
+
+ entry = &mapcache->entry[paddr_index % mapcache->nr_buckets];
+ while (entry && (entry->paddr_index != paddr_index || entry->size != size)) {
+ entry = entry->next;
+ }
+ if (!entry) {
+ DPRINTF("Trying to find address %p that is not in the mapcache!\n", ptr);
+ raddr = 0;
+ } else {
+ raddr = (reventry->paddr_index << MCACHE_BUCKET_SHIFT) +
+ ((unsigned long) ptr - (unsigned long) entry->vaddr_base);
+ }
+ mapcache_unlock();
+ return raddr;
+}
+
+static void xen_invalidate_map_cache_entry_unlocked(uint8_t *buffer)
+{
+ MapCacheEntry *entry = NULL, *pentry = NULL;
+ MapCacheRev *reventry;
+ hwaddr paddr_index;
+ hwaddr size;
+ int found = 0;
+
+ QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+ if (reventry->vaddr_req == buffer) {
+ paddr_index = reventry->paddr_index;
+ size = reventry->size;
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ DPRINTF("%s, could not find %p\n", __func__, buffer);
+ QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+ DPRINTF(" "TARGET_FMT_plx" -> %p is present\n", reventry->paddr_index, reventry->vaddr_req);
+ }
+ return;
+ }
+ QTAILQ_REMOVE(&mapcache->locked_entries, reventry, next);
+ g_free(reventry);
+
+ if (mapcache->last_entry != NULL &&
+ mapcache->last_entry->paddr_index == paddr_index) {
+ mapcache->last_entry = NULL;
+ }
+
+ entry = &mapcache->entry[paddr_index % mapcache->nr_buckets];
+ while (entry && (entry->paddr_index != paddr_index || entry->size != size)) {
+ pentry = entry;
+ entry = entry->next;
+ }
+ if (!entry) {
+ DPRINTF("Trying to unmap address %p that is not in the mapcache!\n", buffer);
+ return;
+ }
+ entry->lock--;
+ if (entry->lock > 0 || pentry == NULL) {
+ return;
+ }
+
+ pentry->next = entry->next;
+ ram_block_notify_remove(entry->vaddr_base, entry->size, entry->size);
+ if (munmap(entry->vaddr_base, entry->size) != 0) {
+ perror("unmap fails");
+ exit(-1);
+ }
+ g_free(entry->valid_mapping);
+ g_free(entry);
+}
+
+void xen_invalidate_map_cache_entry(uint8_t *buffer)
+{
+ mapcache_lock();
+ xen_invalidate_map_cache_entry_unlocked(buffer);
+ mapcache_unlock();
+}
+
+void xen_invalidate_map_cache(void)
+{
+ unsigned long i;
+ MapCacheRev *reventry;
+
+ /* Flush pending AIO before destroying the mapcache */
+ bdrv_drain_all();
+
+ mapcache_lock();
+
+ QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+ if (!reventry->dma) {
+ continue;
+ }
+ fprintf(stderr, "Locked DMA mapping while invalidating mapcache!"
+ " "TARGET_FMT_plx" -> %p is present\n",
+ reventry->paddr_index, reventry->vaddr_req);
+ }
+
+ for (i = 0; i < mapcache->nr_buckets; i++) {
+ MapCacheEntry *entry = &mapcache->entry[i];
+
+ if (entry->vaddr_base == NULL) {
+ continue;
+ }
+ if (entry->lock > 0) {
+ continue;
+ }
+
+ if (munmap(entry->vaddr_base, entry->size) != 0) {
+ perror("unmap fails");
+ exit(-1);
+ }
+
+ entry->paddr_index = 0;
+ entry->vaddr_base = NULL;
+ entry->size = 0;
+ g_free(entry->valid_mapping);
+ entry->valid_mapping = NULL;
+ }
+
+ mapcache->last_entry = NULL;
+
+ mapcache_unlock();
+}
+
+static uint8_t *xen_replace_cache_entry_unlocked(hwaddr old_phys_addr,
+ hwaddr new_phys_addr,
+ hwaddr size)
+{
+ MapCacheEntry *entry;
+ hwaddr address_index, address_offset;
+ hwaddr test_bit_size, cache_size = size;
+
+ address_index = old_phys_addr >> MCACHE_BUCKET_SHIFT;
+ address_offset = old_phys_addr & (MCACHE_BUCKET_SIZE - 1);
+
+ assert(size);
+ /* test_bit_size is always a multiple of XC_PAGE_SIZE */
+ test_bit_size = size + (old_phys_addr & (XC_PAGE_SIZE - 1));
+ if (test_bit_size % XC_PAGE_SIZE) {
+ test_bit_size += XC_PAGE_SIZE - (test_bit_size % XC_PAGE_SIZE);
+ }
+ cache_size = size + address_offset;
+ if (cache_size % MCACHE_BUCKET_SIZE) {
+ cache_size += MCACHE_BUCKET_SIZE - (cache_size % MCACHE_BUCKET_SIZE);
+ }
+
+ entry = &mapcache->entry[address_index % mapcache->nr_buckets];
+ while (entry && !(entry->paddr_index == address_index &&
+ entry->size == cache_size)) {
+ entry = entry->next;
+ }
+ if (!entry) {
+ DPRINTF("Trying to update an entry for "TARGET_FMT_plx \
+ "that is not in the mapcache!\n", old_phys_addr);
+ return NULL;
+ }
+
+ address_index = new_phys_addr >> MCACHE_BUCKET_SHIFT;
+ address_offset = new_phys_addr & (MCACHE_BUCKET_SIZE - 1);
+
+ fprintf(stderr, "Replacing a dummy mapcache entry for "TARGET_FMT_plx \
+ " with "TARGET_FMT_plx"\n", old_phys_addr, new_phys_addr);
+
+ xen_remap_bucket(entry, entry->vaddr_base,
+ cache_size, address_index, false);
+ if (!test_bits(address_offset >> XC_PAGE_SHIFT,
+ test_bit_size >> XC_PAGE_SHIFT,
+ entry->valid_mapping)) {
+ DPRINTF("Unable to update a mapcache entry for "TARGET_FMT_plx"!\n",
+ old_phys_addr);
+ return NULL;
+ }
+
+ return entry->vaddr_base + address_offset;
+}
+
+uint8_t *xen_replace_cache_entry(hwaddr old_phys_addr,
+ hwaddr new_phys_addr,
+ hwaddr size)
+{
+ uint8_t *p;
+
+ mapcache_lock();
+ p = xen_replace_cache_entry_unlocked(old_phys_addr, new_phys_addr, size);
+ mapcache_unlock();
+ return p;
+}
diff --git a/hw/i386/xen/xen_apic.c b/hw/i386/xen/xen_apic.c
new file mode 100644
index 000000000..7c7a60b16
--- /dev/null
+++ b/hw/i386/xen/xen_apic.c
@@ -0,0 +1,103 @@
+/*
+ * Xen basic APIC support
+ *
+ * Copyright (c) 2012 Citrix
+ *
+ * Authors:
+ * Wei Liu <wei.liu2@citrix.com>
+ *
+ * This work is licensed under the terms of the GNU GPL version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/i386/apic_internal.h"
+#include "hw/pci/msi.h"
+#include "hw/xen/xen.h"
+#include "qemu/module.h"
+
+static uint64_t xen_apic_mem_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+ return ~(uint64_t)0;
+}
+
+static void xen_apic_mem_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+ if (size != sizeof(uint32_t)) {
+ fprintf(stderr, "Xen: APIC write data size = %d, invalid\n", size);
+ return;
+ }
+
+ xen_hvm_inject_msi(addr, data);
+}
+
+static const MemoryRegionOps xen_apic_io_ops = {
+ .read = xen_apic_mem_read,
+ .write = xen_apic_mem_write,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static void xen_apic_realize(DeviceState *dev, Error **errp)
+{
+ APICCommonState *s = APIC_COMMON(dev);
+
+ s->vapic_control = 0;
+ memory_region_init_io(&s->io_memory, OBJECT(s), &xen_apic_io_ops, s,
+ "xen-apic-msi", APIC_SPACE_SIZE);
+ msi_nonbroken = true;
+}
+
+static void xen_apic_set_base(APICCommonState *s, uint64_t val)
+{
+}
+
+static void xen_apic_set_tpr(APICCommonState *s, uint8_t val)
+{
+}
+
+static uint8_t xen_apic_get_tpr(APICCommonState *s)
+{
+ return 0;
+}
+
+static void xen_apic_vapic_base_update(APICCommonState *s)
+{
+}
+
+static void xen_apic_external_nmi(APICCommonState *s)
+{
+}
+
+static void xen_send_msi(MSIMessage *msi)
+{
+ xen_hvm_inject_msi(msi->address, msi->data);
+}
+
+static void xen_apic_class_init(ObjectClass *klass, void *data)
+{
+ APICCommonClass *k = APIC_COMMON_CLASS(klass);
+
+ k->realize = xen_apic_realize;
+ k->set_base = xen_apic_set_base;
+ k->set_tpr = xen_apic_set_tpr;
+ k->get_tpr = xen_apic_get_tpr;
+ k->vapic_base_update = xen_apic_vapic_base_update;
+ k->external_nmi = xen_apic_external_nmi;
+ k->send_msi = xen_send_msi;
+}
+
+static const TypeInfo xen_apic_info = {
+ .name = "xen-apic",
+ .parent = TYPE_APIC_COMMON,
+ .instance_size = sizeof(APICCommonState),
+ .class_init = xen_apic_class_init,
+};
+
+static void xen_apic_register_types(void)
+{
+ type_register_static(&xen_apic_info);
+}
+
+type_init(xen_apic_register_types)
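
xen_apic_mem_write() above treats any aligned 32-bit store into the APIC MMIO window as an MSI and forwards the (address, data) pair to xen_hvm_inject_msi(); the remaining callbacks are stubs because the local APIC itself is emulated by Xen, not by QEMU. As background only, the sketch below decodes the architecturally defined x86 MSI address/data fields (per the Intel SDM layout); it is an illustration, not code taken from QEMU or Xen.

    /* Illustrative sketch (not part of the patch): standard x86 MSI
     * address/data field layout.  Field names are descriptive assumptions. */
    #include <stdint.h>
    #include <stdio.h>

    static void decode_msi(uint64_t address, uint32_t data)
    {
        unsigned dest_id    = (address >> 12) & 0xff; /* destination APIC ID       */
        unsigned redir_hint = (address >> 3) & 1;     /* redirection hint          */
        unsigned dest_mode  = (address >> 2) & 1;     /* 0 = physical, 1 = logical */

        unsigned vector     = data & 0xff;            /* interrupt vector          */
        unsigned delivery   = (data >> 8) & 0x7;      /* fixed, lowest-prio, ...   */
        unsigned trigger    = (data >> 15) & 1;       /* 0 = edge, 1 = level       */

        printf("MSI to APIC %u (mode %u, hint %u): vector 0x%02x, "
               "delivery %u, trigger %u\n",
               dest_id, dest_mode, redir_hint, vector, delivery, trigger);
    }

    int main(void)
    {
        decode_msi(0xfee01000, 0x4041); /* destination APIC 1, vector 0x41 */
        return 0;
    }
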
diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c
new file mode 100644
index 000000000..72028449b
--- /dev/null
+++ b/hw/i386/xen/xen_platform.c
@@ -0,0 +1,519 @@
+/*
+ * XEN platform pci device, formerly known as the event channel device
+ *
+ * Copyright (c) 2003-2004 Intel Corp.
+ * Copyright (c) 2006 XenSource
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/ide.h"
+#include "hw/pci/pci.h"
+#include "hw/xen/xen_common.h"
+#include "migration/vmstate.h"
+#include "hw/xen/xen-legacy-backend.h"
+#include "trace.h"
+#include "sysemu/xen.h"
+#include "sysemu/block-backend.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "qom/object.h"
+
+//#define DEBUG_PLATFORM
+
+#ifdef DEBUG_PLATFORM
+#define DPRINTF(fmt, ...) do { \
+ fprintf(stderr, "xen_platform: " fmt, ## __VA_ARGS__); \
+} while (0)
+#else
+#define DPRINTF(fmt, ...) do { } while (0)
+#endif
+
+#define PFFLAG_ROM_LOCK 1 /* Sets whether ROM memory area is RW or RO */
+
+struct PCIXenPlatformState {
+ /*< private >*/
+ PCIDevice parent_obj;
+ /*< public >*/
+
+ MemoryRegion fixed_io;
+ MemoryRegion bar;
+ MemoryRegion mmio_bar;
+ uint8_t flags; /* used only for version_id == 2 */
+ uint16_t driver_product_version;
+
+ /* Log from guest drivers */
+ char log_buffer[4096];
+ int log_buffer_off;
+};
+
+#define TYPE_XEN_PLATFORM "xen-platform"
+OBJECT_DECLARE_SIMPLE_TYPE(PCIXenPlatformState, XEN_PLATFORM)
+
+#define XEN_PLATFORM_IOPORT 0x10
+
+/* Send bytes to syslog */
+static void log_writeb(PCIXenPlatformState *s, char val)
+{
+ if (val == '\n' || s->log_buffer_off == sizeof(s->log_buffer) - 1) {
+ /* Flush buffer */
+ s->log_buffer[s->log_buffer_off] = 0;
+ trace_xen_platform_log(s->log_buffer);
+ s->log_buffer_off = 0;
+ } else {
+ s->log_buffer[s->log_buffer_off++] = val;
+ }
+}
+
+/*
+ * Unplug device flags.
+ *
+ * The logic got a little confused at some point in the past but this is
+ * what they do now.
+ *
+ * bit 0: Unplug all IDE and SCSI disks.
+ * bit 1: Unplug all NICs.
+ * bit 2: Unplug IDE disks except primary master. This is overridden if
+ * bit 0 is also present in the mask.
+ * bit 3: Unplug all NVMe disks.
+ *
+ */
+#define _UNPLUG_IDE_SCSI_DISKS 0
+#define UNPLUG_IDE_SCSI_DISKS (1u << _UNPLUG_IDE_SCSI_DISKS)
+
+#define _UNPLUG_ALL_NICS 1
+#define UNPLUG_ALL_NICS (1u << _UNPLUG_ALL_NICS)
+
+#define _UNPLUG_AUX_IDE_DISKS 2
+#define UNPLUG_AUX_IDE_DISKS (1u << _UNPLUG_AUX_IDE_DISKS)
+
+#define _UNPLUG_NVME_DISKS 3
+#define UNPLUG_NVME_DISKS (1u << _UNPLUG_NVME_DISKS)
+
+static void unplug_nic(PCIBus *b, PCIDevice *d, void *o)
+{
+ /* We have to ignore passthrough devices */
+ if (pci_get_word(d->config + PCI_CLASS_DEVICE) ==
+ PCI_CLASS_NETWORK_ETHERNET
+ && strcmp(d->name, "xen-pci-passthrough") != 0) {
+ object_unparent(OBJECT(d));
+ }
+}
+
+/* Remove the peer of the NIC device. Normally, this would be a tap device. */
+static void del_nic_peer(NICState *nic, void *opaque)
+{
+ NetClientState *nc;
+
+ nc = qemu_get_queue(nic);
+ if (nc->peer)
+ qemu_del_net_client(nc->peer);
+}
+
+static void pci_unplug_nics(PCIBus *bus)
+{
+ qemu_foreach_nic(del_nic_peer, NULL);
+ pci_for_each_device(bus, 0, unplug_nic, NULL);
+}
+
+static void unplug_disks(PCIBus *b, PCIDevice *d, void *opaque)
+{
+ uint32_t flags = *(uint32_t *)opaque;
+ bool aux = (flags & UNPLUG_AUX_IDE_DISKS) &&
+ !(flags & UNPLUG_IDE_SCSI_DISKS);
+
+ /* We have to ignore passthrough devices */
+ if (!strcmp(d->name, "xen-pci-passthrough")) {
+ return;
+ }
+
+ switch (pci_get_word(d->config + PCI_CLASS_DEVICE)) {
+ case PCI_CLASS_STORAGE_IDE:
+ pci_piix3_xen_ide_unplug(DEVICE(d), aux);
+ break;
+
+ case PCI_CLASS_STORAGE_SCSI:
+ if (!aux) {
+ object_unparent(OBJECT(d));
+ }
+ break;
+
+ case PCI_CLASS_STORAGE_EXPRESS:
+ if (flags & UNPLUG_NVME_DISKS) {
+ object_unparent(OBJECT(d));
+ }
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void pci_unplug_disks(PCIBus *bus, uint32_t flags)
+{
+ pci_for_each_device(bus, 0, unplug_disks, &flags);
+}
+
+static void platform_fixed_ioport_writew(void *opaque, uint32_t addr, uint32_t val)
+{
+ PCIXenPlatformState *s = opaque;
+
+ switch (addr) {
+ case 0: {
+ PCIDevice *pci_dev = PCI_DEVICE(s);
+ /* Unplug devices. See comment above flag definitions */
+ if (val & (UNPLUG_IDE_SCSI_DISKS | UNPLUG_AUX_IDE_DISKS |
+ UNPLUG_NVME_DISKS)) {
+ DPRINTF("unplug disks\n");
+ pci_unplug_disks(pci_get_bus(pci_dev), val);
+ }
+ if (val & UNPLUG_ALL_NICS) {
+ DPRINTF("unplug nics\n");
+ pci_unplug_nics(pci_get_bus(pci_dev));
+ }
+ break;
+ }
+ case 2:
+ switch (val) {
+ case 1:
+ DPRINTF("Citrix Windows PV drivers loaded in guest\n");
+ break;
+ case 0:
+ DPRINTF("Guest claimed to be running PV product 0?\n");
+ break;
+ default:
+ DPRINTF("Unknown PV product %d loaded in guest\n", val);
+ break;
+ }
+ s->driver_product_version = val;
+ break;
+ }
+}
+
+static void platform_fixed_ioport_writel(void *opaque, uint32_t addr,
+ uint32_t val)
+{
+ switch (addr) {
+ case 0:
+ /* PV driver version */
+ break;
+ }
+}
+
+static void platform_fixed_ioport_writeb(void *opaque, uint32_t addr, uint32_t val)
+{
+ PCIXenPlatformState *s = opaque;
+
+ switch (addr) {
+ case 0: /* Platform flags */ {
+ hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ?
+ HVMMEM_ram_ro : HVMMEM_ram_rw;
+ if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) {
+ DPRINTF("unable to change ro/rw state of ROM memory area!\n");
+ } else {
+ s->flags = val & PFFLAG_ROM_LOCK;
+ DPRINTF("changed ro/rw state of ROM memory area. now is %s state.\n",
+ (mem_type == HVMMEM_ram_ro ? "ro":"rw"));
+ }
+ break;
+ }
+ case 2:
+ log_writeb(s, val);
+ break;
+ }
+}
+
+static uint32_t platform_fixed_ioport_readw(void *opaque, uint32_t addr)
+{
+ switch (addr) {
+ case 0:
+ /* Magic value so that you can identify the interface. */
+ return 0x49d2;
+ default:
+ return 0xffff;
+ }
+}
+
+static uint32_t platform_fixed_ioport_readb(void *opaque, uint32_t addr)
+{
+ PCIXenPlatformState *s = opaque;
+
+ switch (addr) {
+ case 0:
+ /* Platform flags */
+ return s->flags;
+ case 2:
+ /* Version number */
+ return 1;
+ default:
+ return 0xff;
+ }
+}
+
+static void platform_fixed_ioport_reset(void *opaque)
+{
+ PCIXenPlatformState *s = opaque;
+
+ platform_fixed_ioport_writeb(s, 0, 0);
+}
+
+static uint64_t platform_fixed_ioport_read(void *opaque,
+ hwaddr addr,
+ unsigned size)
+{
+ switch (size) {
+ case 1:
+ return platform_fixed_ioport_readb(opaque, addr);
+ case 2:
+ return platform_fixed_ioport_readw(opaque, addr);
+ default:
+ return -1;
+ }
+}
+
+static void platform_fixed_ioport_write(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+ switch (size) {
+ case 1:
+ platform_fixed_ioport_writeb(opaque, addr, val);
+ break;
+ case 2:
+ platform_fixed_ioport_writew(opaque, addr, val);
+ break;
+ case 4:
+ platform_fixed_ioport_writel(opaque, addr, val);
+ break;
+ }
+}
+
+
+static const MemoryRegionOps platform_fixed_io_ops = {
+ .read = platform_fixed_ioport_read,
+ .write = platform_fixed_ioport_write,
+ .valid = {
+ .unaligned = true,
+ },
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 4,
+ .unaligned = true,
+ },
+ .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void platform_fixed_ioport_init(PCIXenPlatformState* s)
+{
+ memory_region_init_io(&s->fixed_io, OBJECT(s), &platform_fixed_io_ops, s,
+ "xen-fixed", 16);
+ memory_region_add_subregion(get_system_io(), XEN_PLATFORM_IOPORT,
+ &s->fixed_io);
+}
+
+/* Xen Platform PCI Device */
+
+static uint64_t xen_platform_ioport_readb(void *opaque, hwaddr addr,
+ unsigned int size)
+{
+ if (addr == 0) {
+ return platform_fixed_ioport_readb(opaque, 0);
+ } else {
+ return ~0u;
+ }
+}
+
+static void xen_platform_ioport_writeb(void *opaque, hwaddr addr,
+ uint64_t val, unsigned int size)
+{
+ PCIXenPlatformState *s = opaque;
+ PCIDevice *pci_dev = PCI_DEVICE(s);
+
+ switch (addr) {
+ case 0: /* Platform flags */
+ platform_fixed_ioport_writeb(opaque, 0, (uint32_t)val);
+ break;
+ case 4:
+ if (val == 1) {
+ /*
+ * SUSE unplug for Xenlinux
+ * xen-kmp used this since xen-3.0.4, instead the official protocol
+ * from xen-3.3+ It did an unconditional "outl(1, (ioaddr + 4));"
+ * Pre VMDP 1.7 used 4 and 8 depending on how VMDP was configured.
+ * If VMDP was to control both disk and LAN it would use 4.
+ * If it controlled just disk or just LAN, it would use 8 below.
+ */
+ pci_unplug_disks(pci_get_bus(pci_dev), UNPLUG_IDE_SCSI_DISKS);
+ pci_unplug_nics(pci_get_bus(pci_dev));
+ }
+ break;
+ case 8:
+ switch (val) {
+ case 1:
+ pci_unplug_disks(pci_get_bus(pci_dev), UNPLUG_IDE_SCSI_DISKS);
+ break;
+ case 2:
+ pci_unplug_nics(pci_get_bus(pci_dev));
+ break;
+ default:
+ log_writeb(s, (uint32_t)val);
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static const MemoryRegionOps xen_pci_io_ops = {
+ .read = xen_platform_ioport_readb,
+ .write = xen_platform_ioport_writeb,
+ .impl.min_access_size = 1,
+ .impl.max_access_size = 1,
+};
+
+static void platform_ioport_bar_setup(PCIXenPlatformState *d)
+{
+ memory_region_init_io(&d->bar, OBJECT(d), &xen_pci_io_ops, d,
+ "xen-pci", 0x100);
+}
+
+static uint64_t platform_mmio_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+ DPRINTF("Warning: attempted read from physical address "
+ "0x" TARGET_FMT_plx " in xen platform mmio space\n", addr);
+
+ return 0;
+}
+
+static void platform_mmio_write(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+ DPRINTF("Warning: attempted write of 0x%"PRIx64" to physical "
+ "address 0x" TARGET_FMT_plx " in xen platform mmio space\n",
+ val, addr);
+}
+
+static const MemoryRegionOps platform_mmio_handler = {
+ .read = &platform_mmio_read,
+ .write = &platform_mmio_write,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static void platform_mmio_setup(PCIXenPlatformState *d)
+{
+ memory_region_init_io(&d->mmio_bar, OBJECT(d), &platform_mmio_handler, d,
+ "xen-mmio", 0x1000000);
+}
+
+static int xen_platform_post_load(void *opaque, int version_id)
+{
+ PCIXenPlatformState *s = opaque;
+
+ platform_fixed_ioport_writeb(s, 0, s->flags);
+
+ return 0;
+}
+
+static const VMStateDescription vmstate_xen_platform = {
+ .name = "platform",
+ .version_id = 4,
+ .minimum_version_id = 4,
+ .post_load = xen_platform_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_PCI_DEVICE(parent_obj, PCIXenPlatformState),
+ VMSTATE_UINT8(flags, PCIXenPlatformState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static void xen_platform_realize(PCIDevice *dev, Error **errp)
+{
+ PCIXenPlatformState *d = XEN_PLATFORM(dev);
+ uint8_t *pci_conf;
+
+ /* Device will crash on reset if xen is not initialized */
+ if (!xen_enabled()) {
+ error_setg(errp, "xen-platform device requires the Xen accelerator");
+ return;
+ }
+
+ pci_conf = dev->config;
+
+ pci_set_word(pci_conf + PCI_COMMAND, PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
+
+ pci_config_set_prog_interface(pci_conf, 0);
+
+ pci_conf[PCI_INTERRUPT_PIN] = 1;
+
+ platform_ioport_bar_setup(d);
+ pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_IO, &d->bar);
+
+ /* Reserve 16MB of MMIO address space for shared memory. */
+ platform_mmio_setup(d);
+ pci_register_bar(dev, 1, PCI_BASE_ADDRESS_MEM_PREFETCH,
+ &d->mmio_bar);
+
+ platform_fixed_ioport_init(d);
+}
+
+static void platform_reset(DeviceState *dev)
+{
+ PCIXenPlatformState *s = XEN_PLATFORM(dev);
+
+ platform_fixed_ioport_reset(s);
+}
+
+static void xen_platform_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+ k->realize = xen_platform_realize;
+ k->vendor_id = PCI_VENDOR_ID_XEN;
+ k->device_id = PCI_DEVICE_ID_XEN_PLATFORM;
+ k->class_id = PCI_CLASS_OTHERS << 8 | 0x80;
+ k->subsystem_vendor_id = PCI_VENDOR_ID_XEN;
+ k->subsystem_id = PCI_DEVICE_ID_XEN_PLATFORM;
+ k->revision = 1;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ dc->desc = "XEN platform pci device";
+ dc->reset = platform_reset;
+ dc->vmsd = &vmstate_xen_platform;
+}
+
+static const TypeInfo xen_platform_info = {
+ .name = TYPE_XEN_PLATFORM,
+ .parent = TYPE_PCI_DEVICE,
+ .instance_size = sizeof(PCIXenPlatformState),
+ .class_init = xen_platform_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+ { },
+ },
+};
+
+static void xen_platform_register_types(void)
+{
+ type_register_static(&xen_platform_info);
+}
+
+type_init(xen_platform_register_types)
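
Pulling the pieces of xen_platform.c together: a guest PV driver talks to this device through the fixed I/O port range at XEN_PLATFORM_IOPORT (0x10). A 16-bit read at offset 0 returns the magic 0x49d2, a 16-bit write at offset 2 reports the driver's product version, and a 16-bit write at offset 0 carries the unplug mask described in the flag comment above. The guest-side sketch below is derived only from those handlers and is illustrative; inw()/outw() are assumed port-I/O helpers, and the full handshake (product/build-number checks and driver blacklisting) is specified separately in the Xen documentation.

    /* Illustrative guest-side sketch (not part of the patch) of the
     * emulated-device unplug sequence, based on the fixed-ioport handlers
     * above. */
    #include <stdbool.h>
    #include <stdint.h>

    #define XEN_IOPORT            0x10        /* XEN_PLATFORM_IOPORT         */
    #define XEN_IOPORT_MAGIC      0x49d2      /* value read back at offset 0 */
    #define UNPLUG_IDE_SCSI_DISKS (1u << 0)
    #define UNPLUG_ALL_NICS       (1u << 1)

    /* Assumed x86 port-I/O helpers, declared here only for the sketch. */
    extern uint16_t inw(uint16_t port);
    extern void outw(uint16_t port, uint16_t value);

    static bool xen_unplug_emulated_devices(void)
    {
        /* Offset 0, 16-bit read: magic identifying the interface. */
        if (inw(XEN_IOPORT) != XEN_IOPORT_MAGIC) {
            return false;               /* no xen-platform device present */
        }

        /* Offset 2, 16-bit write: announce a PV driver product version. */
        outw(XEN_IOPORT + 2, 1);

        /* Offset 0, 16-bit write: unplug emulated disks and NICs. */
        outw(XEN_IOPORT + 0, UNPLUG_IDE_SCSI_DISKS | UNPLUG_ALL_NICS);
        return true;
    }
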
diff --git a/hw/i386/xen/xen_pvdevice.c b/hw/i386/xen/xen_pvdevice.c
new file mode 100644
index 000000000..1ea95fa60
--- /dev/null
+++ b/hw/i386/xen/xen_pvdevice.c
@@ -0,0 +1,154 @@
+/* Copyright (c) Citrix Systems Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms,
+ * with or without modification, are permitted provided
+ * that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the
+ * following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the
+ * following disclaimer in the documentation and/or other
+ * materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "migration/vmstate.h"
+#include "trace.h"
+#include "qom/object.h"
+
+#define TYPE_XEN_PV_DEVICE "xen-pvdevice"
+
+OBJECT_DECLARE_SIMPLE_TYPE(XenPVDevice, XEN_PV_DEVICE)
+
+struct XenPVDevice {
+ /*< private >*/
+ PCIDevice parent_obj;
+ /*< public >*/
+ uint16_t vendor_id;
+ uint16_t device_id;
+ uint8_t revision;
+ uint32_t size;
+ MemoryRegion mmio;
+};
+
+static uint64_t xen_pv_mmio_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+ trace_xen_pv_mmio_read(addr);
+
+ return ~(uint64_t)0;
+}
+
+static void xen_pv_mmio_write(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+ trace_xen_pv_mmio_write(addr);
+}
+
+static const MemoryRegionOps xen_pv_mmio_ops = {
+ .read = &xen_pv_mmio_read,
+ .write = &xen_pv_mmio_write,
+ .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static const VMStateDescription vmstate_xen_pvdevice = {
+ .name = "xen-pvdevice",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_PCI_DEVICE(parent_obj, XenPVDevice),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static void xen_pv_realize(PCIDevice *pci_dev, Error **errp)
+{
+ XenPVDevice *d = XEN_PV_DEVICE(pci_dev);
+ uint8_t *pci_conf;
+
+ /* device-id property must always be supplied */
+ if (d->device_id == 0xffff) {
+ error_setg(errp, "Device ID invalid, it must always be supplied");
+ return;
+ }
+
+ pci_conf = pci_dev->config;
+
+ pci_set_word(pci_conf + PCI_VENDOR_ID, d->vendor_id);
+ pci_set_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID, d->vendor_id);
+ pci_set_word(pci_conf + PCI_DEVICE_ID, d->device_id);
+ pci_set_word(pci_conf + PCI_SUBSYSTEM_ID, d->device_id);
+ pci_set_byte(pci_conf + PCI_REVISION_ID, d->revision);
+
+ pci_set_word(pci_conf + PCI_COMMAND, PCI_COMMAND_MEMORY);
+
+ pci_config_set_prog_interface(pci_conf, 0);
+
+ pci_conf[PCI_INTERRUPT_PIN] = 1;
+
+ memory_region_init_io(&d->mmio, NULL, &xen_pv_mmio_ops, d,
+ "mmio", d->size);
+
+ pci_register_bar(pci_dev, 1, PCI_BASE_ADDRESS_MEM_PREFETCH,
+ &d->mmio);
+}
+
+static Property xen_pv_props[] = {
+ DEFINE_PROP_UINT16("vendor-id", XenPVDevice, vendor_id, PCI_VENDOR_ID_XEN),
+ DEFINE_PROP_UINT16("device-id", XenPVDevice, device_id, 0xffff),
+ DEFINE_PROP_UINT8("revision", XenPVDevice, revision, 0x01),
+ DEFINE_PROP_UINT32("size", XenPVDevice, size, 0x400000),
+ DEFINE_PROP_END_OF_LIST()
+};
+
+static void xen_pv_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+ k->realize = xen_pv_realize;
+ k->class_id = PCI_CLASS_SYSTEM_OTHER;
+ dc->desc = "Xen PV Device";
+ device_class_set_props(dc, xen_pv_props);
+ dc->vmsd = &vmstate_xen_pvdevice;
+}
+
+static const TypeInfo xen_pv_type_info = {
+ .name = TYPE_XEN_PV_DEVICE,
+ .parent = TYPE_PCI_DEVICE,
+ .instance_size = sizeof(XenPVDevice),
+ .class_init = xen_pv_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+ { },
+ },
+};
+
+static void xen_pv_register_types(void)
+{
+ type_register_static(&xen_pv_type_info);
+}
+
+type_init(xen_pv_register_types)
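
Given the property list above, the xen-pvdevice is intended to be instantiated from the command line; device-id is mandatory (realize rejects the 0xffff default), while vendor-id, revision and size fall back to PCI_VENDOR_ID_XEN, 0x01 and 4 MiB respectively. A hypothetical invocation, with an illustrative device ID, might look like:

    qemu-system-x86_64 -machine xenfv -device xen-pvdevice,device-id=0xc110
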