Diffstat (limited to 'hw/i386')
54 files changed, 23847 insertions, 0 deletions
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
new file mode 100644
index 000000000..d22ac4a4b
--- /dev/null
+++ b/hw/i386/Kconfig
@@ -0,0 +1,139 @@
+config X86_FW_OVMF
+    bool
+
+config SEV
+    bool
+    select X86_FW_OVMF
+    depends on KVM
+
+config SGX
+    bool
+    depends on KVM
+
+config PC
+    bool
+    imply APPLESMC
+    imply HYPERV
+    imply ISA_IPMI_KCS
+    imply ISA_IPMI_BT
+    imply PCI_IPMI_KCS
+    imply PCI_IPMI_BT
+    imply IPMI_SSIF
+    imply ISA_DEBUG
+    imply PARALLEL
+    imply PCI_DEVICES
+    imply PVPANIC_ISA
+    imply QXL
+    imply SEV
+    imply SGX
+    imply SGA
+    imply TEST_DEVICES
+    imply TPM_CRB
+    imply TPM_TIS_ISA
+    imply VGA_PCI
+    imply VIRTIO_VGA
+    imply NVDIMM
+    select FDC_ISA
+    select I8259
+    select I8254
+    select PCKBD
+    select PCSPK
+    select I8257
+    select MC146818RTC
+    # For ACPI builder:
+    select SERIAL_ISA
+    select ACPI_PCI
+    select ACPI_VMGENID
+    select VIRTIO_PMEM_SUPPORTED
+    select VIRTIO_MEM_SUPPORTED
+
+config PC_PCI
+    bool
+    select APIC
+    select IOAPIC
+    select APM
+    select PC
+
+config PC_ACPI
+    bool
+    select ACPI_X86
+    select ACPI_CPU_HOTPLUG
+    select ACPI_MEMORY_HOTPLUG
+    select ACPI_VIOT
+    select SMBUS_EEPROM
+    select PFLASH_CFI01
+    depends on ACPI_SMBUS
+
+config I440FX
+    bool
+    imply E1000_PCI
+    imply VMPORT
+    imply VMMOUSE
+    select PC_PCI
+    select PC_ACPI
+    select ACPI_SMBUS
+    select PCI_I440FX
+    select PIIX3
+    select IDE_PIIX
+    select DIMM
+    select SMBIOS
+    select FW_CFG_DMA
+
+config ISAPC
+    bool
+    select ISA_BUS
+    select PC
+    select IDE_ISA
+    select VGA_ISA
+    # FIXME: it is in the same file as i440fx, and does not compile
+    # if separated
+    depends on I440FX
+
+config Q35
+    bool
+    imply VTD
+    imply AMD_IOMMU
+    imply E1000E_PCI_EXPRESS
+    imply VMPORT
+    imply VMMOUSE
+    select PC_PCI
+    select PC_ACPI
+    select PCI_EXPRESS_Q35
+    select LPC_ICH9
+    select AHCI_ICH9
+    select DIMM
+    select SMBIOS
+    select FW_CFG_DMA
+
+config MICROVM
+    bool
+    select SERIAL_ISA # for serial_hds_isa_init()
+    select ISA_BUS
+    select APIC
+    select IOAPIC
+    select I8259
+    select MC146818RTC
+    select VIRTIO_MMIO
+    select ACPI_HW_REDUCED
+    select PCI_EXPRESS_GENERIC_BRIDGE
+    select USB_XHCI_SYSBUS
+    select I8254
+
+config X86_IOMMU
+    bool
+    depends on PC
+
+config VTD
+    bool
+    select X86_IOMMU
+
+config AMD_IOMMU
+    bool
+    select X86_IOMMU
+
+config VMPORT
+    bool
+
+config VMMOUSE
+    bool
+    depends on VMPORT
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
new file mode 100644
index 000000000..a99c6e4fe
--- /dev/null
+++ b/hw/i386/acpi-build.c
@@ -0,0 +1,2873 @@
+/* Support for generating ACPI tables and passing them to Guests
+ *
+ * Copyright (C) 2008-2010 Kevin O'Connor <kevin@koconnor.net>
+ * Copyright (C) 2006 Fabrice Bellard
+ * Copyright (C) 2013 Red Hat Inc
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qmp/qnum.h" +#include "acpi-build.h" +#include "acpi-common.h" +#include "qemu/bitmap.h" +#include "qemu/error-report.h" +#include "hw/pci/pci.h" +#include "hw/core/cpu.h" +#include "target/i386/cpu.h" +#include "hw/misc/pvpanic.h" +#include "hw/timer/hpet.h" +#include "hw/acpi/acpi-defs.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/cpu.h" +#include "hw/nvram/fw_cfg.h" +#include "hw/acpi/bios-linker-loader.h" +#include "hw/isa/isa.h" +#include "hw/block/fdc.h" +#include "hw/acpi/memory_hotplug.h" +#include "sysemu/tpm.h" +#include "hw/acpi/tpm.h" +#include "hw/acpi/vmgenid.h" +#include "sysemu/tpm_backend.h" +#include "hw/rtc/mc146818rtc_regs.h" +#include "migration/vmstate.h" +#include "hw/mem/memory-device.h" +#include "hw/mem/nvdimm.h" +#include "sysemu/numa.h" +#include "sysemu/reset.h" +#include "hw/hyperv/vmbus-bridge.h" + +/* Supported chipsets: */ +#include "hw/southbridge/piix.h" +#include "hw/acpi/pcihp.h" +#include "hw/i386/fw_cfg.h" +#include "hw/i386/ich9.h" +#include "hw/pci/pci_bus.h" +#include "hw/pci-host/q35.h" +#include "hw/i386/x86-iommu.h" + +#include "hw/acpi/aml-build.h" +#include "hw/acpi/utils.h" +#include "hw/acpi/pci.h" + +#include "qom/qom-qobject.h" +#include "hw/i386/amd_iommu.h" +#include "hw/i386/intel_iommu.h" +#include "hw/virtio/virtio-iommu.h" + +#include "hw/acpi/ipmi.h" +#include "hw/acpi/hmat.h" +#include "hw/acpi/viot.h" + +/* These are used to size the ACPI tables for -M pc-i440fx-1.7 and + * -M pc-i440fx-2.0. Even if the actual amount of AML generated grows + * a little bit, there should be plenty of free space since the DSDT + * shrunk by ~1.5k between QEMU 2.0 and QEMU 2.1. + */ +#define ACPI_BUILD_LEGACY_CPU_AML_SIZE 97 +#define ACPI_BUILD_ALIGN_SIZE 0x1000 + +#define ACPI_BUILD_TABLE_SIZE 0x20000 + +/* #define DEBUG_ACPI_BUILD */ +#ifdef DEBUG_ACPI_BUILD +#define ACPI_BUILD_DPRINTF(fmt, ...) \ + do {printf("ACPI_BUILD: " fmt, ## __VA_ARGS__); } while (0) +#else +#define ACPI_BUILD_DPRINTF(fmt, ...) +#endif + +typedef struct AcpiPmInfo { + bool s3_disabled; + bool s4_disabled; + bool pcihp_bridge_en; + bool smi_on_cpuhp; + bool smi_on_cpu_unplug; + bool pcihp_root_en; + uint8_t s4_val; + AcpiFadtData fadt; + uint16_t cpu_hp_io_base; + uint16_t pcihp_io_base; + uint16_t pcihp_io_len; +} AcpiPmInfo; + +typedef struct AcpiMiscInfo { + bool is_piix4; + bool has_hpet; +#ifdef CONFIG_TPM + TPMVersion tpm_version; +#endif + const unsigned char *dsdt_code; + unsigned dsdt_size; + uint16_t pvpanic_port; + uint16_t applesmc_io_base; +} AcpiMiscInfo; + +typedef struct AcpiBuildPciBusHotplugState { + GArray *device_table; + GArray *notify_table; + struct AcpiBuildPciBusHotplugState *parent; + bool pcihp_bridge_en; +} AcpiBuildPciBusHotplugState; + +typedef struct FwCfgTPMConfig { + uint32_t tpmppi_address; + uint8_t tpm_version; + uint8_t tpmppi_version; +} QEMU_PACKED FwCfgTPMConfig; + +static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg); + +const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio = { + .space_id = AML_AS_SYSTEM_IO, + .address = NVDIMM_ACPI_IO_BASE, + .bit_width = NVDIMM_ACPI_IO_LEN << 3 +}; + +static void init_common_fadt_data(MachineState *ms, Object *o, + AcpiFadtData *data) +{ + X86MachineState *x86ms = X86_MACHINE(ms); + /* + * "ICH9-LPC" or "PIIX4_PM" has "smm-compat" property to keep the old + * behavior for compatibility irrelevant to smm_enabled, which doesn't + * comforms to ACPI spec. + */ + bool smm_enabled = object_property_get_bool(o, "smm-compat", NULL) ? 
+ true : x86_machine_is_smm_enabled(x86ms); + uint32_t io = object_property_get_uint(o, ACPI_PM_PROP_PM_IO_BASE, NULL); + AmlAddressSpace as = AML_AS_SYSTEM_IO; + AcpiFadtData fadt = { + .rev = 3, + .flags = + (1 << ACPI_FADT_F_WBINVD) | + (1 << ACPI_FADT_F_PROC_C1) | + (1 << ACPI_FADT_F_SLP_BUTTON) | + (1 << ACPI_FADT_F_RTC_S4) | + (1 << ACPI_FADT_F_USE_PLATFORM_CLOCK) | + /* APIC destination mode ("Flat Logical") has an upper limit of 8 + * CPUs for more than 8 CPUs, "Clustered Logical" mode has to be + * used + */ + ((ms->smp.max_cpus > 8) ? + (1 << ACPI_FADT_F_FORCE_APIC_CLUSTER_MODEL) : 0), + .int_model = 1 /* Multiple APIC */, + .rtc_century = RTC_CENTURY, + .plvl2_lat = 0xfff /* C2 state not supported */, + .plvl3_lat = 0xfff /* C3 state not supported */, + .smi_cmd = smm_enabled ? ACPI_PORT_SMI_CMD : 0, + .sci_int = object_property_get_uint(o, ACPI_PM_PROP_SCI_INT, NULL), + .acpi_enable_cmd = + smm_enabled ? + object_property_get_uint(o, ACPI_PM_PROP_ACPI_ENABLE_CMD, NULL) : + 0, + .acpi_disable_cmd = + smm_enabled ? + object_property_get_uint(o, ACPI_PM_PROP_ACPI_DISABLE_CMD, NULL) : + 0, + .pm1a_evt = { .space_id = as, .bit_width = 4 * 8, .address = io }, + .pm1a_cnt = { .space_id = as, .bit_width = 2 * 8, + .address = io + 0x04 }, + .pm_tmr = { .space_id = as, .bit_width = 4 * 8, .address = io + 0x08 }, + .gpe0_blk = { .space_id = as, .bit_width = + object_property_get_uint(o, ACPI_PM_PROP_GPE0_BLK_LEN, NULL) * 8, + .address = object_property_get_uint(o, ACPI_PM_PROP_GPE0_BLK, NULL) + }, + }; + *data = fadt; +} + +static Object *object_resolve_type_unambiguous(const char *typename) +{ + bool ambig; + Object *o = object_resolve_path_type("", typename, &ambig); + + if (ambig || !o) { + return NULL; + } + return o; +} + +static void acpi_get_pm_info(MachineState *machine, AcpiPmInfo *pm) +{ + Object *piix = object_resolve_type_unambiguous(TYPE_PIIX4_PM); + Object *lpc = object_resolve_type_unambiguous(TYPE_ICH9_LPC_DEVICE); + Object *obj = piix ? piix : lpc; + QObject *o; + pm->cpu_hp_io_base = 0; + pm->pcihp_io_base = 0; + pm->pcihp_io_len = 0; + pm->smi_on_cpuhp = false; + pm->smi_on_cpu_unplug = false; + + assert(obj); + init_common_fadt_data(machine, obj, &pm->fadt); + if (piix) { + /* w2k requires FADT(rev1) or it won't boot, keep PC compatible */ + pm->fadt.rev = 1; + pm->cpu_hp_io_base = PIIX4_CPU_HOTPLUG_IO_BASE; + } + if (lpc) { + uint64_t smi_features = object_property_get_uint(lpc, + ICH9_LPC_SMI_NEGOTIATED_FEAT_PROP, NULL); + struct AcpiGenericAddress r = { .space_id = AML_AS_SYSTEM_IO, + .bit_width = 8, .address = ICH9_RST_CNT_IOPORT }; + pm->fadt.reset_reg = r; + pm->fadt.reset_val = 0xf; + pm->fadt.flags |= 1 << ACPI_FADT_F_RESET_REG_SUP; + pm->cpu_hp_io_base = ICH9_CPU_HOTPLUG_IO_BASE; + pm->smi_on_cpuhp = + !!(smi_features & BIT_ULL(ICH9_LPC_SMI_F_CPU_HOTPLUG_BIT)); + pm->smi_on_cpu_unplug = + !!(smi_features & BIT_ULL(ICH9_LPC_SMI_F_CPU_HOT_UNPLUG_BIT)); + } + pm->pcihp_io_base = + object_property_get_uint(obj, ACPI_PCIHP_IO_BASE_PROP, NULL); + pm->pcihp_io_len = + object_property_get_uint(obj, ACPI_PCIHP_IO_LEN_PROP, NULL); + + /* The above need not be conditional on machine type because the reset port + * happens to be the same on PIIX (pc) and ICH9 (q35). 
*/ + QEMU_BUILD_BUG_ON(ICH9_RST_CNT_IOPORT != PIIX_RCR_IOPORT); + + /* Fill in optional s3/s4 related properties */ + o = object_property_get_qobject(obj, ACPI_PM_PROP_S3_DISABLED, NULL); + if (o) { + pm->s3_disabled = qnum_get_uint(qobject_to(QNum, o)); + } else { + pm->s3_disabled = false; + } + qobject_unref(o); + o = object_property_get_qobject(obj, ACPI_PM_PROP_S4_DISABLED, NULL); + if (o) { + pm->s4_disabled = qnum_get_uint(qobject_to(QNum, o)); + } else { + pm->s4_disabled = false; + } + qobject_unref(o); + o = object_property_get_qobject(obj, ACPI_PM_PROP_S4_VAL, NULL); + if (o) { + pm->s4_val = qnum_get_uint(qobject_to(QNum, o)); + } else { + pm->s4_val = false; + } + qobject_unref(o); + + pm->pcihp_bridge_en = + object_property_get_bool(obj, ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, + NULL); + pm->pcihp_root_en = + object_property_get_bool(obj, ACPI_PM_PROP_ACPI_PCI_ROOTHP, + NULL); +} + +static void acpi_get_misc_info(AcpiMiscInfo *info) +{ + Object *piix = object_resolve_type_unambiguous(TYPE_PIIX4_PM); + Object *lpc = object_resolve_type_unambiguous(TYPE_ICH9_LPC_DEVICE); + assert(!!piix != !!lpc); + + if (piix) { + info->is_piix4 = true; + } + if (lpc) { + info->is_piix4 = false; + } + + info->has_hpet = hpet_find(); +#ifdef CONFIG_TPM + info->tpm_version = tpm_get_version(tpm_find()); +#endif + info->pvpanic_port = pvpanic_port(); + info->applesmc_io_base = applesmc_port(); +} + +/* + * Because of the PXB hosts we cannot simply query TYPE_PCI_HOST_BRIDGE. + * On i386 arch we only have two pci hosts, so we can look only for them. + */ +Object *acpi_get_i386_pci_host(void) +{ + PCIHostState *host; + + host = PCI_HOST_BRIDGE(object_resolve_path("/machine/i440fx", NULL)); + if (!host) { + host = PCI_HOST_BRIDGE(object_resolve_path("/machine/q35", NULL)); + } + + return OBJECT(host); +} + +static void acpi_get_pci_holes(Range *hole, Range *hole64) +{ + Object *pci_host; + + pci_host = acpi_get_i386_pci_host(); + + if (!pci_host) { + return; + } + + range_set_bounds1(hole, + object_property_get_uint(pci_host, + PCI_HOST_PROP_PCI_HOLE_START, + NULL), + object_property_get_uint(pci_host, + PCI_HOST_PROP_PCI_HOLE_END, + NULL)); + range_set_bounds1(hole64, + object_property_get_uint(pci_host, + PCI_HOST_PROP_PCI_HOLE64_START, + NULL), + object_property_get_uint(pci_host, + PCI_HOST_PROP_PCI_HOLE64_END, + NULL)); +} + +static void acpi_align_size(GArray *blob, unsigned align) +{ + /* Align size to multiple of given size. This reduces the chance + * we need to change size in the future (breaking cross version migration). 
+ */ + g_array_set_size(blob, ROUND_UP(acpi_data_len(blob), align)); +} + +/* + * ACPI spec 1.0b, + * 5.2.6 Firmware ACPI Control Structure + */ +static void +build_facs(GArray *table_data) +{ + const char *sig = "FACS"; + const uint8_t reserved[40] = {}; + + g_array_append_vals(table_data, sig, 4); /* Signature */ + build_append_int_noprefix(table_data, 64, 4); /* Length */ + build_append_int_noprefix(table_data, 0, 4); /* Hardware Signature */ + build_append_int_noprefix(table_data, 0, 4); /* Firmware Waking Vector */ + build_append_int_noprefix(table_data, 0, 4); /* Global Lock */ + build_append_int_noprefix(table_data, 0, 4); /* Flags */ + g_array_append_vals(table_data, reserved, 40); /* Reserved */ +} + +static void build_append_pcihp_notify_entry(Aml *method, int slot) +{ + Aml *if_ctx; + int32_t devfn = PCI_DEVFN(slot, 0); + + if_ctx = aml_if(aml_and(aml_arg(0), aml_int(0x1U << slot), NULL)); + aml_append(if_ctx, aml_notify(aml_name("S%.02X", devfn), aml_arg(1))); + aml_append(method, if_ctx); +} + +static void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus, + bool pcihp_bridge_en) +{ + Aml *dev, *notify_method = NULL, *method; + QObject *bsel; + PCIBus *sec; + int devfn; + + bsel = object_property_get_qobject(OBJECT(bus), ACPI_PCIHP_PROP_BSEL, NULL); + if (bsel) { + uint64_t bsel_val = qnum_get_uint(qobject_to(QNum, bsel)); + + aml_append(parent_scope, aml_name_decl("BSEL", aml_int(bsel_val))); + notify_method = aml_method("DVNT", 2, AML_NOTSERIALIZED); + } + + for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) { + DeviceClass *dc; + PCIDeviceClass *pc; + PCIDevice *pdev = bus->devices[devfn]; + int slot = PCI_SLOT(devfn); + int func = PCI_FUNC(devfn); + /* ACPI spec: 1.0b: Table 6-2 _ADR Object Bus Types, PCI type */ + int adr = slot << 16 | func; + bool hotplug_enabled_dev; + bool bridge_in_acpi; + bool cold_plugged_bridge; + + if (!pdev) { + /* + * add hotplug slots for non present devices. + * hotplug is supported only for non-multifunction device + * so generate device description only for function 0 + */ + if (bsel && !func) { + if (pci_bus_is_express(bus) && slot > 0) { + break; + } + dev = aml_device("S%.02X", devfn); + aml_append(dev, aml_name_decl("_SUN", aml_int(slot))); + aml_append(dev, aml_name_decl("_ADR", aml_int(adr))); + method = aml_method("_EJ0", 1, AML_NOTSERIALIZED); + aml_append(method, + aml_call2("PCEJ", aml_name("BSEL"), aml_name("_SUN")) + ); + aml_append(dev, method); + method = aml_method("_DSM", 4, AML_SERIALIZED); + aml_append(method, + aml_return(aml_call6("PDSM", aml_arg(0), aml_arg(1), + aml_arg(2), aml_arg(3), + aml_name("BSEL"), aml_name("_SUN"))) + ); + aml_append(dev, method); + aml_append(parent_scope, dev); + + build_append_pcihp_notify_entry(notify_method, slot); + } + continue; + } + + pc = PCI_DEVICE_GET_CLASS(pdev); + dc = DEVICE_GET_CLASS(pdev); + + /* + * Cold plugged bridges aren't themselves hot-pluggable. + * Hotplugged bridges *are* hot-pluggable. 
+ */ + cold_plugged_bridge = pc->is_bridge && !DEVICE(pdev)->hotplugged; + bridge_in_acpi = cold_plugged_bridge && pcihp_bridge_en; + + hotplug_enabled_dev = bsel && dc->hotpluggable && !cold_plugged_bridge; + + if (pc->class_id == PCI_CLASS_BRIDGE_ISA) { + continue; + } + + /* + * allow describing coldplugged bridges in ACPI even if they are not + * on function 0, as they are not unpluggable, for all other devices + * generate description only for function 0 per slot + */ + if (func && !bridge_in_acpi) { + continue; + } + + /* start to compose PCI device descriptor */ + dev = aml_device("S%.02X", devfn); + aml_append(dev, aml_name_decl("_ADR", aml_int(adr))); + + if (bsel) { + /* + * Can't declare _SUN here for every device as it changes 'slot' + * enumeration order in linux kernel, so use another variable for it + */ + aml_append(dev, aml_name_decl("ASUN", aml_int(slot))); + method = aml_method("_DSM", 4, AML_SERIALIZED); + aml_append(method, aml_return( + aml_call6("PDSM", aml_arg(0), aml_arg(1), aml_arg(2), + aml_arg(3), aml_name("BSEL"), aml_name("ASUN")) + )); + aml_append(dev, method); + } + + if (pc->class_id == PCI_CLASS_DISPLAY_VGA) { + /* add VGA specific AML methods */ + int s3d; + + if (object_dynamic_cast(OBJECT(pdev), "qxl-vga")) { + s3d = 3; + } else { + s3d = 0; + } + + method = aml_method("_S1D", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_int(0))); + aml_append(dev, method); + + method = aml_method("_S2D", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_int(0))); + aml_append(dev, method); + + method = aml_method("_S3D", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_int(s3d))); + aml_append(dev, method); + } else if (hotplug_enabled_dev) { + aml_append(dev, aml_name_decl("_SUN", aml_int(slot))); + /* add _EJ0 to make slot hotpluggable */ + method = aml_method("_EJ0", 1, AML_NOTSERIALIZED); + aml_append(method, + aml_call2("PCEJ", aml_name("BSEL"), aml_name("_SUN")) + ); + aml_append(dev, method); + + if (bsel) { + build_append_pcihp_notify_entry(notify_method, slot); + } + } else if (bridge_in_acpi) { + /* + * device is coldplugged bridge, + * add child device descriptions into its scope + */ + PCIBus *sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev)); + + build_append_pci_bus_devices(dev, sec_bus, pcihp_bridge_en); + } + /* device descriptor has been composed, add it into parent context */ + aml_append(parent_scope, dev); + } + + if (bsel) { + aml_append(parent_scope, notify_method); + } + + /* Append PCNT method to notify about events on local and child buses. + * Add this method for root bus only when hotplug is enabled since DSDT + * expects it. 
+ */ + if (bsel || pcihp_bridge_en) { + method = aml_method("PCNT", 0, AML_NOTSERIALIZED); + + /* If bus supports hotplug select it and notify about local events */ + if (bsel) { + uint64_t bsel_val = qnum_get_uint(qobject_to(QNum, bsel)); + + aml_append(method, aml_store(aml_int(bsel_val), aml_name("BNUM"))); + aml_append(method, aml_call2("DVNT", aml_name("PCIU"), + aml_int(1))); /* Device Check */ + aml_append(method, aml_call2("DVNT", aml_name("PCID"), + aml_int(3))); /* Eject Request */ + } + + /* Notify about child bus events in any case */ + if (pcihp_bridge_en) { + QLIST_FOREACH(sec, &bus->child, sibling) { + if (pci_bus_is_root(sec)) { + continue; + } + + aml_append(method, aml_name("^S%.02X.PCNT", + sec->parent_dev->devfn)); + } + } + + aml_append(parent_scope, method); + } + qobject_unref(bsel); +} + +Aml *aml_pci_device_dsm(void) +{ + Aml *method, *UUID, *ifctx, *ifctx1, *ifctx2, *ifctx3, *elsectx; + Aml *acpi_index = aml_local(0); + Aml *zero = aml_int(0); + Aml *bnum = aml_arg(4); + Aml *func = aml_arg(2); + Aml *rev = aml_arg(1); + Aml *sun = aml_arg(5); + + method = aml_method("PDSM", 6, AML_SERIALIZED); + + /* + * PCI Firmware Specification 3.1 + * 4.6. _DSM Definitions for PCI + */ + UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); + ifctx = aml_if(aml_equal(aml_arg(0), UUID)); + { + aml_append(ifctx, aml_store(aml_call2("AIDX", bnum, sun), acpi_index)); + ifctx1 = aml_if(aml_equal(func, zero)); + { + uint8_t byte_list[1]; + + ifctx2 = aml_if(aml_equal(rev, aml_int(2))); + { + /* + * advertise function 7 if device has acpi-index + * acpi_index values: + * 0: not present (default value) + * FFFFFFFF: not supported (old QEMU without PIDX reg) + * other: device's acpi-index + */ + ifctx3 = aml_if(aml_lnot( + aml_or(aml_equal(acpi_index, zero), + aml_equal(acpi_index, aml_int(0xFFFFFFFF)), NULL) + )); + { + byte_list[0] = + 1 /* have supported functions */ | + 1 << 7 /* support for function 7 */ + ; + aml_append(ifctx3, aml_return(aml_buffer(1, byte_list))); + } + aml_append(ifctx2, ifctx3); + } + aml_append(ifctx1, ifctx2); + + byte_list[0] = 0; /* nothing supported */ + aml_append(ifctx1, aml_return(aml_buffer(1, byte_list))); + } + aml_append(ifctx, ifctx1); + elsectx = aml_else(); + /* + * PCI Firmware Specification 3.1 + * 4.6.7. _DSM for Naming a PCI or PCI Express Device Under + * Operating Systems + */ + ifctx1 = aml_if(aml_equal(func, aml_int(7))); + { + Aml *pkg = aml_package(2); + Aml *ret = aml_local(1); + + aml_append(pkg, zero); + /* + * optional, if not impl. 
should return null string + */ + aml_append(pkg, aml_string("%s", "")); + aml_append(ifctx1, aml_store(pkg, ret)); + /* + * update acpi-index to actual value + */ + aml_append(ifctx1, aml_store(acpi_index, aml_index(ret, zero))); + aml_append(ifctx1, aml_return(ret)); + } + aml_append(elsectx, ifctx1); + aml_append(ifctx, elsectx); + } + aml_append(method, ifctx); + return method; +} + +/** + * build_prt_entry: + * @link_name: link name for PCI route entry + * + * build AML package containing a PCI route entry for @link_name + */ +static Aml *build_prt_entry(const char *link_name) +{ + Aml *a_zero = aml_int(0); + Aml *pkg = aml_package(4); + aml_append(pkg, a_zero); + aml_append(pkg, a_zero); + aml_append(pkg, aml_name("%s", link_name)); + aml_append(pkg, a_zero); + return pkg; +} + +/* + * initialize_route - Initialize the interrupt routing rule + * through a specific LINK: + * if (lnk_idx == idx) + * route using link 'link_name' + */ +static Aml *initialize_route(Aml *route, const char *link_name, + Aml *lnk_idx, int idx) +{ + Aml *if_ctx = aml_if(aml_equal(lnk_idx, aml_int(idx))); + Aml *pkg = build_prt_entry(link_name); + + aml_append(if_ctx, aml_store(pkg, route)); + + return if_ctx; +} + +/* + * build_prt - Define interrupt rounting rules + * + * Returns an array of 128 routes, one for each device, + * based on device location. + * The main goal is to equaly distribute the interrupts + * over the 4 existing ACPI links (works only for i440fx). + * The hash function is (slot + pin) & 3 -> "LNK[D|A|B|C]". + * + */ +static Aml *build_prt(bool is_pci0_prt) +{ + Aml *method, *while_ctx, *pin, *res; + + method = aml_method("_PRT", 0, AML_NOTSERIALIZED); + res = aml_local(0); + pin = aml_local(1); + aml_append(method, aml_store(aml_package(128), res)); + aml_append(method, aml_store(aml_int(0), pin)); + + /* while (pin < 128) */ + while_ctx = aml_while(aml_lless(pin, aml_int(128))); + { + Aml *slot = aml_local(2); + Aml *lnk_idx = aml_local(3); + Aml *route = aml_local(4); + + /* slot = pin >> 2 */ + aml_append(while_ctx, + aml_store(aml_shiftright(pin, aml_int(2), NULL), slot)); + /* lnk_idx = (slot + pin) & 3 */ + aml_append(while_ctx, + aml_store(aml_and(aml_add(pin, slot, NULL), aml_int(3), NULL), + lnk_idx)); + + /* route[2] = "LNK[D|A|B|C]", selection based on pin % 3 */ + aml_append(while_ctx, initialize_route(route, "LNKD", lnk_idx, 0)); + if (is_pci0_prt) { + Aml *if_device_1, *if_pin_4, *else_pin_4; + + /* device 1 is the power-management device, needs SCI */ + if_device_1 = aml_if(aml_equal(lnk_idx, aml_int(1))); + { + if_pin_4 = aml_if(aml_equal(pin, aml_int(4))); + { + aml_append(if_pin_4, + aml_store(build_prt_entry("LNKS"), route)); + } + aml_append(if_device_1, if_pin_4); + else_pin_4 = aml_else(); + { + aml_append(else_pin_4, + aml_store(build_prt_entry("LNKA"), route)); + } + aml_append(if_device_1, else_pin_4); + } + aml_append(while_ctx, if_device_1); + } else { + aml_append(while_ctx, initialize_route(route, "LNKA", lnk_idx, 1)); + } + aml_append(while_ctx, initialize_route(route, "LNKB", lnk_idx, 2)); + aml_append(while_ctx, initialize_route(route, "LNKC", lnk_idx, 3)); + + /* route[0] = 0x[slot]FFFF */ + aml_append(while_ctx, + aml_store(aml_or(aml_shiftleft(slot, aml_int(16)), aml_int(0xFFFF), + NULL), + aml_index(route, aml_int(0)))); + /* route[1] = pin & 3 */ + aml_append(while_ctx, + aml_store(aml_and(pin, aml_int(3), NULL), + aml_index(route, aml_int(1)))); + /* res[pin] = route */ + aml_append(while_ctx, aml_store(route, aml_index(res, pin))); + /* pin++ */ + 
aml_append(while_ctx, aml_increment(pin)); + } + aml_append(method, while_ctx); + /* return res*/ + aml_append(method, aml_return(res)); + + return method; +} + +static void build_hpet_aml(Aml *table) +{ + Aml *crs; + Aml *field; + Aml *method; + Aml *if_ctx; + Aml *scope = aml_scope("_SB"); + Aml *dev = aml_device("HPET"); + Aml *zero = aml_int(0); + Aml *id = aml_local(0); + Aml *period = aml_local(1); + + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0103"))); + aml_append(dev, aml_name_decl("_UID", zero)); + + aml_append(dev, + aml_operation_region("HPTM", AML_SYSTEM_MEMORY, aml_int(HPET_BASE), + HPET_LEN)); + field = aml_field("HPTM", AML_DWORD_ACC, AML_LOCK, AML_PRESERVE); + aml_append(field, aml_named_field("VEND", 32)); + aml_append(field, aml_named_field("PRD", 32)); + aml_append(dev, field); + + method = aml_method("_STA", 0, AML_NOTSERIALIZED); + aml_append(method, aml_store(aml_name("VEND"), id)); + aml_append(method, aml_store(aml_name("PRD"), period)); + aml_append(method, aml_shiftright(id, aml_int(16), id)); + if_ctx = aml_if(aml_lor(aml_equal(id, zero), + aml_equal(id, aml_int(0xffff)))); + { + aml_append(if_ctx, aml_return(zero)); + } + aml_append(method, if_ctx); + + if_ctx = aml_if(aml_lor(aml_equal(period, zero), + aml_lgreater(period, aml_int(100000000)))); + { + aml_append(if_ctx, aml_return(zero)); + } + aml_append(method, if_ctx); + + aml_append(method, aml_return(aml_int(0x0F))); + aml_append(dev, method); + + crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(HPET_BASE, HPET_LEN, AML_READ_ONLY)); + aml_append(dev, aml_name_decl("_CRS", crs)); + + aml_append(scope, dev); + aml_append(table, scope); +} + +static Aml *build_vmbus_device_aml(VMBusBridge *vmbus_bridge) +{ + Aml *dev; + Aml *method; + Aml *crs; + + dev = aml_device("VMBS"); + aml_append(dev, aml_name_decl("STA", aml_int(0xF))); + aml_append(dev, aml_name_decl("_HID", aml_string("VMBus"))); + aml_append(dev, aml_name_decl("_UID", aml_int(0x0))); + aml_append(dev, aml_name_decl("_DDN", aml_string("VMBUS"))); + + method = aml_method("_DIS", 0, AML_NOTSERIALIZED); + aml_append(method, aml_store(aml_and(aml_name("STA"), aml_int(0xD), NULL), + aml_name("STA"))); + aml_append(dev, method); + + method = aml_method("_PS0", 0, AML_NOTSERIALIZED); + aml_append(method, aml_store(aml_or(aml_name("STA"), aml_int(0xF), NULL), + aml_name("STA"))); + aml_append(dev, method); + + method = aml_method("_STA", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_name("STA"))); + aml_append(dev, method); + + aml_append(dev, aml_name_decl("_PS3", aml_int(0x0))); + + crs = aml_resource_template(); + aml_append(crs, aml_irq_no_flags(vmbus_bridge->irq)); + aml_append(dev, aml_name_decl("_CRS", crs)); + + return dev; +} + +static void build_isa_devices_aml(Aml *table) +{ + bool ambiguous; + Object *obj = object_resolve_path_type("", TYPE_ISA_BUS, &ambiguous); + Aml *scope; + + assert(obj && !ambiguous); + + scope = aml_scope("_SB.PCI0.ISA"); + build_acpi_ipmi_devices(scope, BUS(obj), "\\_SB.PCI0.ISA"); + isa_build_aml(ISA_BUS(obj), scope); + + aml_append(table, scope); +} + +static void build_dbg_aml(Aml *table) +{ + Aml *field; + Aml *method; + Aml *while_ctx; + Aml *scope = aml_scope("\\"); + Aml *buf = aml_local(0); + Aml *len = aml_local(1); + Aml *idx = aml_local(2); + + aml_append(scope, + aml_operation_region("DBG", AML_SYSTEM_IO, aml_int(0x0402), 0x01)); + field = aml_field("DBG", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE); + aml_append(field, aml_named_field("DBGB", 8)); + aml_append(scope, 
field); + + method = aml_method("DBUG", 1, AML_NOTSERIALIZED); + + aml_append(method, aml_to_hexstring(aml_arg(0), buf)); + aml_append(method, aml_to_buffer(buf, buf)); + aml_append(method, aml_subtract(aml_sizeof(buf), aml_int(1), len)); + aml_append(method, aml_store(aml_int(0), idx)); + + while_ctx = aml_while(aml_lless(idx, len)); + aml_append(while_ctx, + aml_store(aml_derefof(aml_index(buf, idx)), aml_name("DBGB"))); + aml_append(while_ctx, aml_increment(idx)); + aml_append(method, while_ctx); + + aml_append(method, aml_store(aml_int(0x0A), aml_name("DBGB"))); + aml_append(scope, method); + + aml_append(table, scope); +} + +static Aml *build_link_dev(const char *name, uint8_t uid, Aml *reg) +{ + Aml *dev; + Aml *crs; + Aml *method; + uint32_t irqs[] = {5, 10, 11}; + + dev = aml_device("%s", name); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0C0F"))); + aml_append(dev, aml_name_decl("_UID", aml_int(uid))); + + crs = aml_resource_template(); + aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH, + AML_SHARED, irqs, ARRAY_SIZE(irqs))); + aml_append(dev, aml_name_decl("_PRS", crs)); + + method = aml_method("_STA", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_call1("IQST", reg))); + aml_append(dev, method); + + method = aml_method("_DIS", 0, AML_NOTSERIALIZED); + aml_append(method, aml_or(reg, aml_int(0x80), reg)); + aml_append(dev, method); + + method = aml_method("_CRS", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_call1("IQCR", reg))); + aml_append(dev, method); + + method = aml_method("_SRS", 1, AML_NOTSERIALIZED); + aml_append(method, aml_create_dword_field(aml_arg(0), aml_int(5), "PRRI")); + aml_append(method, aml_store(aml_name("PRRI"), reg)); + aml_append(dev, method); + + return dev; + } + +static Aml *build_gsi_link_dev(const char *name, uint8_t uid, uint8_t gsi) +{ + Aml *dev; + Aml *crs; + Aml *method; + uint32_t irqs; + + dev = aml_device("%s", name); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0C0F"))); + aml_append(dev, aml_name_decl("_UID", aml_int(uid))); + + crs = aml_resource_template(); + irqs = gsi; + aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH, + AML_SHARED, &irqs, 1)); + aml_append(dev, aml_name_decl("_PRS", crs)); + + aml_append(dev, aml_name_decl("_CRS", crs)); + + /* + * _DIS can be no-op because the interrupt cannot be disabled. 
+ */ + method = aml_method("_DIS", 0, AML_NOTSERIALIZED); + aml_append(dev, method); + + method = aml_method("_SRS", 1, AML_NOTSERIALIZED); + aml_append(dev, method); + + return dev; +} + +/* _CRS method - get current settings */ +static Aml *build_iqcr_method(bool is_piix4) +{ + Aml *if_ctx; + uint32_t irqs; + Aml *method = aml_method("IQCR", 1, AML_SERIALIZED); + Aml *crs = aml_resource_template(); + + irqs = 0; + aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, + AML_ACTIVE_HIGH, AML_SHARED, &irqs, 1)); + aml_append(method, aml_name_decl("PRR0", crs)); + + aml_append(method, + aml_create_dword_field(aml_name("PRR0"), aml_int(5), "PRRI")); + + if (is_piix4) { + if_ctx = aml_if(aml_lless(aml_arg(0), aml_int(0x80))); + aml_append(if_ctx, aml_store(aml_arg(0), aml_name("PRRI"))); + aml_append(method, if_ctx); + } else { + aml_append(method, + aml_store(aml_and(aml_arg(0), aml_int(0xF), NULL), + aml_name("PRRI"))); + } + + aml_append(method, aml_return(aml_name("PRR0"))); + return method; +} + +/* _STA method - get status */ +static Aml *build_irq_status_method(void) +{ + Aml *if_ctx; + Aml *method = aml_method("IQST", 1, AML_NOTSERIALIZED); + + if_ctx = aml_if(aml_and(aml_int(0x80), aml_arg(0), NULL)); + aml_append(if_ctx, aml_return(aml_int(0x09))); + aml_append(method, if_ctx); + aml_append(method, aml_return(aml_int(0x0B))); + return method; +} + +static void build_piix4_pci0_int(Aml *table) +{ + Aml *dev; + Aml *crs; + Aml *field; + Aml *method; + uint32_t irqs; + Aml *sb_scope = aml_scope("_SB"); + Aml *pci0_scope = aml_scope("PCI0"); + + aml_append(pci0_scope, build_prt(true)); + aml_append(sb_scope, pci0_scope); + + field = aml_field("PCI0.ISA.P40C", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE); + aml_append(field, aml_named_field("PRQ0", 8)); + aml_append(field, aml_named_field("PRQ1", 8)); + aml_append(field, aml_named_field("PRQ2", 8)); + aml_append(field, aml_named_field("PRQ3", 8)); + aml_append(sb_scope, field); + + aml_append(sb_scope, build_irq_status_method()); + aml_append(sb_scope, build_iqcr_method(true)); + + aml_append(sb_scope, build_link_dev("LNKA", 0, aml_name("PRQ0"))); + aml_append(sb_scope, build_link_dev("LNKB", 1, aml_name("PRQ1"))); + aml_append(sb_scope, build_link_dev("LNKC", 2, aml_name("PRQ2"))); + aml_append(sb_scope, build_link_dev("LNKD", 3, aml_name("PRQ3"))); + + dev = aml_device("LNKS"); + { + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0C0F"))); + aml_append(dev, aml_name_decl("_UID", aml_int(4))); + + crs = aml_resource_template(); + irqs = 9; + aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, + AML_ACTIVE_HIGH, AML_SHARED, + &irqs, 1)); + aml_append(dev, aml_name_decl("_PRS", crs)); + + /* The SCI cannot be disabled and is always attached to GSI 9, + * so these are no-ops. We only need this link to override the + * polarity to active high and match the content of the MADT. + */ + method = aml_method("_STA", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_int(0x0b))); + aml_append(dev, method); + + method = aml_method("_DIS", 0, AML_NOTSERIALIZED); + aml_append(dev, method); + + method = aml_method("_CRS", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_name("_PRS"))); + aml_append(dev, method); + + method = aml_method("_SRS", 1, AML_NOTSERIALIZED); + aml_append(dev, method); + } + aml_append(sb_scope, dev); + + aml_append(table, sb_scope); +} + +static void append_q35_prt_entry(Aml *ctx, uint32_t nr, const char *name) +{ + int i; + int head; + Aml *pkg; + char base = name[3] < 'E' ? 
'A' : 'E'; + char *s = g_strdup(name); + Aml *a_nr = aml_int((nr << 16) | 0xffff); + + assert(strlen(s) == 4); + + head = name[3] - base; + for (i = 0; i < 4; i++) { + if (head + i > 3) { + head = i * -1; + } + s[3] = base + head + i; + pkg = aml_package(4); + aml_append(pkg, a_nr); + aml_append(pkg, aml_int(i)); + aml_append(pkg, aml_name("%s", s)); + aml_append(pkg, aml_int(0)); + aml_append(ctx, pkg); + } + g_free(s); +} + +static Aml *build_q35_routing_table(const char *str) +{ + int i; + Aml *pkg; + char *name = g_strdup_printf("%s ", str); + + pkg = aml_package(128); + for (i = 0; i < 0x18; i++) { + name[3] = 'E' + (i & 0x3); + append_q35_prt_entry(pkg, i, name); + } + + name[3] = 'E'; + append_q35_prt_entry(pkg, 0x18, name); + + /* INTA -> PIRQA for slot 25 - 31, see the default value of D<N>IR */ + for (i = 0x0019; i < 0x1e; i++) { + name[3] = 'A'; + append_q35_prt_entry(pkg, i, name); + } + + /* PCIe->PCI bridge. use PIRQ[E-H] */ + name[3] = 'E'; + append_q35_prt_entry(pkg, 0x1e, name); + name[3] = 'A'; + append_q35_prt_entry(pkg, 0x1f, name); + + g_free(name); + return pkg; +} + +static void build_q35_pci0_int(Aml *table) +{ + Aml *field; + Aml *method; + Aml *sb_scope = aml_scope("_SB"); + Aml *pci0_scope = aml_scope("PCI0"); + + /* Zero => PIC mode, One => APIC Mode */ + aml_append(table, aml_name_decl("PICF", aml_int(0))); + method = aml_method("_PIC", 1, AML_NOTSERIALIZED); + { + aml_append(method, aml_store(aml_arg(0), aml_name("PICF"))); + } + aml_append(table, method); + + aml_append(pci0_scope, + aml_name_decl("PRTP", build_q35_routing_table("LNK"))); + aml_append(pci0_scope, + aml_name_decl("PRTA", build_q35_routing_table("GSI"))); + + method = aml_method("_PRT", 0, AML_NOTSERIALIZED); + { + Aml *if_ctx; + Aml *else_ctx; + + /* PCI IRQ routing table, example from ACPI 2.0a specification, + section 6.2.8.1 */ + /* Note: we provide the same info as the PCI routing + table of the Bochs BIOS */ + if_ctx = aml_if(aml_equal(aml_name("PICF"), aml_int(0))); + aml_append(if_ctx, aml_return(aml_name("PRTP"))); + aml_append(method, if_ctx); + else_ctx = aml_else(); + aml_append(else_ctx, aml_return(aml_name("PRTA"))); + aml_append(method, else_ctx); + } + aml_append(pci0_scope, method); + aml_append(sb_scope, pci0_scope); + + field = aml_field("PCI0.ISA.PIRQ", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE); + aml_append(field, aml_named_field("PRQA", 8)); + aml_append(field, aml_named_field("PRQB", 8)); + aml_append(field, aml_named_field("PRQC", 8)); + aml_append(field, aml_named_field("PRQD", 8)); + aml_append(field, aml_reserved_field(0x20)); + aml_append(field, aml_named_field("PRQE", 8)); + aml_append(field, aml_named_field("PRQF", 8)); + aml_append(field, aml_named_field("PRQG", 8)); + aml_append(field, aml_named_field("PRQH", 8)); + aml_append(sb_scope, field); + + aml_append(sb_scope, build_irq_status_method()); + aml_append(sb_scope, build_iqcr_method(false)); + + aml_append(sb_scope, build_link_dev("LNKA", 0, aml_name("PRQA"))); + aml_append(sb_scope, build_link_dev("LNKB", 1, aml_name("PRQB"))); + aml_append(sb_scope, build_link_dev("LNKC", 2, aml_name("PRQC"))); + aml_append(sb_scope, build_link_dev("LNKD", 3, aml_name("PRQD"))); + aml_append(sb_scope, build_link_dev("LNKE", 4, aml_name("PRQE"))); + aml_append(sb_scope, build_link_dev("LNKF", 5, aml_name("PRQF"))); + aml_append(sb_scope, build_link_dev("LNKG", 6, aml_name("PRQG"))); + aml_append(sb_scope, build_link_dev("LNKH", 7, aml_name("PRQH"))); + + aml_append(sb_scope, build_gsi_link_dev("GSIA", 0x10, 0x10)); + 
aml_append(sb_scope, build_gsi_link_dev("GSIB", 0x11, 0x11)); + aml_append(sb_scope, build_gsi_link_dev("GSIC", 0x12, 0x12)); + aml_append(sb_scope, build_gsi_link_dev("GSID", 0x13, 0x13)); + aml_append(sb_scope, build_gsi_link_dev("GSIE", 0x14, 0x14)); + aml_append(sb_scope, build_gsi_link_dev("GSIF", 0x15, 0x15)); + aml_append(sb_scope, build_gsi_link_dev("GSIG", 0x16, 0x16)); + aml_append(sb_scope, build_gsi_link_dev("GSIH", 0x17, 0x17)); + + aml_append(table, sb_scope); +} + +static Aml *build_q35_dram_controller(const AcpiMcfgInfo *mcfg) +{ + Aml *dev; + Aml *resource_template; + + /* DRAM controller */ + dev = aml_device("DRAC"); + aml_append(dev, aml_name_decl("_HID", aml_string("PNP0C01"))); + + resource_template = aml_resource_template(); + if (mcfg->base + mcfg->size - 1 >= (1ULL << 32)) { + aml_append(resource_template, + aml_qword_memory(AML_POS_DECODE, + AML_MIN_FIXED, + AML_MAX_FIXED, + AML_NON_CACHEABLE, + AML_READ_WRITE, + 0x0000000000000000, + mcfg->base, + mcfg->base + mcfg->size - 1, + 0x0000000000000000, + mcfg->size)); + } else { + aml_append(resource_template, + aml_dword_memory(AML_POS_DECODE, + AML_MIN_FIXED, + AML_MAX_FIXED, + AML_NON_CACHEABLE, + AML_READ_WRITE, + 0x0000000000000000, + mcfg->base, + mcfg->base + mcfg->size - 1, + 0x0000000000000000, + mcfg->size)); + } + aml_append(dev, aml_name_decl("_CRS", resource_template)); + + return dev; +} + +static void build_q35_isa_bridge(Aml *table) +{ + Aml *dev; + Aml *scope; + + scope = aml_scope("_SB.PCI0"); + dev = aml_device("ISA"); + aml_append(dev, aml_name_decl("_ADR", aml_int(0x001F0000))); + + /* ICH9 PCI to ISA irq remapping */ + aml_append(dev, aml_operation_region("PIRQ", AML_PCI_CONFIG, + aml_int(0x60), 0x0C)); + + aml_append(scope, dev); + aml_append(table, scope); +} + +static void build_piix4_isa_bridge(Aml *table) +{ + Aml *dev; + Aml *scope; + + scope = aml_scope("_SB.PCI0"); + dev = aml_device("ISA"); + aml_append(dev, aml_name_decl("_ADR", aml_int(0x00010000))); + + /* PIIX PCI to ISA irq remapping */ + aml_append(dev, aml_operation_region("P40C", AML_PCI_CONFIG, + aml_int(0x60), 0x04)); + + aml_append(scope, dev); + aml_append(table, scope); +} + +static void build_x86_acpi_pci_hotplug(Aml *table, uint64_t pcihp_addr) +{ + Aml *scope; + Aml *field; + Aml *method; + + scope = aml_scope("_SB.PCI0"); + + aml_append(scope, + aml_operation_region("PCST", AML_SYSTEM_IO, aml_int(pcihp_addr), 0x08)); + field = aml_field("PCST", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS); + aml_append(field, aml_named_field("PCIU", 32)); + aml_append(field, aml_named_field("PCID", 32)); + aml_append(scope, field); + + aml_append(scope, + aml_operation_region("SEJ", AML_SYSTEM_IO, + aml_int(pcihp_addr + ACPI_PCIHP_SEJ_BASE), 0x04)); + field = aml_field("SEJ", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS); + aml_append(field, aml_named_field("B0EJ", 32)); + aml_append(scope, field); + + aml_append(scope, + aml_operation_region("BNMR", AML_SYSTEM_IO, + aml_int(pcihp_addr + ACPI_PCIHP_BNMR_BASE), 0x08)); + field = aml_field("BNMR", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS); + aml_append(field, aml_named_field("BNUM", 32)); + aml_append(field, aml_named_field("PIDX", 32)); + aml_append(scope, field); + + aml_append(scope, aml_mutex("BLCK", 0)); + + method = aml_method("PCEJ", 2, AML_NOTSERIALIZED); + aml_append(method, aml_acquire(aml_name("BLCK"), 0xFFFF)); + aml_append(method, aml_store(aml_arg(0), aml_name("BNUM"))); + aml_append(method, + aml_store(aml_shiftleft(aml_int(1), aml_arg(1)), aml_name("B0EJ"))); + 
aml_append(method, aml_release(aml_name("BLCK"))); + aml_append(method, aml_return(aml_int(0))); + aml_append(scope, method); + + method = aml_method("AIDX", 2, AML_NOTSERIALIZED); + aml_append(method, aml_acquire(aml_name("BLCK"), 0xFFFF)); + aml_append(method, aml_store(aml_arg(0), aml_name("BNUM"))); + aml_append(method, + aml_store(aml_shiftleft(aml_int(1), aml_arg(1)), aml_name("PIDX"))); + aml_append(method, aml_store(aml_name("PIDX"), aml_local(0))); + aml_append(method, aml_release(aml_name("BLCK"))); + aml_append(method, aml_return(aml_local(0))); + aml_append(scope, method); + + aml_append(scope, aml_pci_device_dsm()); + + aml_append(table, scope); +} + +static Aml *build_q35_osc_method(bool enable_native_pcie_hotplug) +{ + Aml *if_ctx; + Aml *if_ctx2; + Aml *else_ctx; + Aml *method; + Aml *a_cwd1 = aml_name("CDW1"); + Aml *a_ctrl = aml_local(0); + + method = aml_method("_OSC", 4, AML_NOTSERIALIZED); + aml_append(method, aml_create_dword_field(aml_arg(3), aml_int(0), "CDW1")); + + if_ctx = aml_if(aml_equal( + aml_arg(0), aml_touuid("33DB4D5B-1FF7-401C-9657-7441C03DD766"))); + aml_append(if_ctx, aml_create_dword_field(aml_arg(3), aml_int(4), "CDW2")); + aml_append(if_ctx, aml_create_dword_field(aml_arg(3), aml_int(8), "CDW3")); + + aml_append(if_ctx, aml_store(aml_name("CDW3"), a_ctrl)); + + /* + * Always allow native PME, AER (no dependencies) + * Allow SHPC (PCI bridges can have SHPC controller) + * Disable PCIe Native Hot-plug if ACPI PCI Hot-plug is enabled. + */ + aml_append(if_ctx, aml_and(a_ctrl, + aml_int(0x1E | (enable_native_pcie_hotplug ? 0x1 : 0x0)), a_ctrl)); + + if_ctx2 = aml_if(aml_lnot(aml_equal(aml_arg(1), aml_int(1)))); + /* Unknown revision */ + aml_append(if_ctx2, aml_or(a_cwd1, aml_int(0x08), a_cwd1)); + aml_append(if_ctx, if_ctx2); + + if_ctx2 = aml_if(aml_lnot(aml_equal(aml_name("CDW3"), a_ctrl))); + /* Capabilities bits were masked */ + aml_append(if_ctx2, aml_or(a_cwd1, aml_int(0x10), a_cwd1)); + aml_append(if_ctx, if_ctx2); + + /* Update DWORD3 in the buffer */ + aml_append(if_ctx, aml_store(a_ctrl, aml_name("CDW3"))); + aml_append(method, if_ctx); + + else_ctx = aml_else(); + /* Unrecognized UUID */ + aml_append(else_ctx, aml_or(a_cwd1, aml_int(4), a_cwd1)); + aml_append(method, else_ctx); + + aml_append(method, aml_return(aml_arg(3))); + return method; +} + +static void build_smb0(Aml *table, I2CBus *smbus, int devnr, int func) +{ + Aml *scope = aml_scope("_SB.PCI0"); + Aml *dev = aml_device("SMB0"); + + aml_append(dev, aml_name_decl("_ADR", aml_int(devnr << 16 | func))); + build_acpi_ipmi_devices(dev, BUS(smbus), "\\_SB.PCI0.SMB0"); + aml_append(scope, dev); + aml_append(table, scope); +} + +static void +build_dsdt(GArray *table_data, BIOSLinker *linker, + AcpiPmInfo *pm, AcpiMiscInfo *misc, + Range *pci_hole, Range *pci_hole64, MachineState *machine) +{ + CrsRangeEntry *entry; + Aml *dsdt, *sb_scope, *scope, *dev, *method, *field, *pkg, *crs; + CrsRangeSet crs_range_set; + PCMachineState *pcms = PC_MACHINE(machine); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(machine); + X86MachineState *x86ms = X86_MACHINE(machine); + AcpiMcfgInfo mcfg; + bool mcfg_valid = !!acpi_get_mcfg(&mcfg); + uint32_t nr_mem = machine->ram_slots; + int root_bus_limit = 0xFF; + PCIBus *bus = NULL; +#ifdef CONFIG_TPM + TPMIf *tpm = tpm_find(); +#endif + int i; + VMBusBridge *vmbus_bridge = vmbus_bridge_find(); + AcpiTable table = { .sig = "DSDT", .rev = 1, .oem_id = x86ms->oem_id, + .oem_table_id = x86ms->oem_table_id }; + + acpi_table_begin(&table, table_data); + dsdt = 
init_aml_allocator(); + + build_dbg_aml(dsdt); + if (misc->is_piix4) { + sb_scope = aml_scope("_SB"); + dev = aml_device("PCI0"); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A03"))); + aml_append(dev, aml_name_decl("_ADR", aml_int(0))); + aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid))); + aml_append(sb_scope, dev); + aml_append(dsdt, sb_scope); + + if (misc->has_hpet) { + build_hpet_aml(dsdt); + } + build_piix4_isa_bridge(dsdt); + build_isa_devices_aml(dsdt); + if (pm->pcihp_bridge_en || pm->pcihp_root_en) { + build_x86_acpi_pci_hotplug(dsdt, pm->pcihp_io_base); + } + build_piix4_pci0_int(dsdt); + } else { + sb_scope = aml_scope("_SB"); + dev = aml_device("PCI0"); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A08"))); + aml_append(dev, aml_name_decl("_CID", aml_eisaid("PNP0A03"))); + aml_append(dev, aml_name_decl("_ADR", aml_int(0))); + aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid))); + aml_append(dev, build_q35_osc_method(!pm->pcihp_bridge_en)); + aml_append(sb_scope, dev); + if (mcfg_valid) { + aml_append(sb_scope, build_q35_dram_controller(&mcfg)); + } + + if (pm->smi_on_cpuhp) { + /* reserve SMI block resources, IO ports 0xB2, 0xB3 */ + dev = aml_device("PCI0.SMI0"); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A06"))); + aml_append(dev, aml_name_decl("_UID", aml_string("SMI resources"))); + crs = aml_resource_template(); + aml_append(crs, + aml_io( + AML_DECODE16, + ACPI_PORT_SMI_CMD, + ACPI_PORT_SMI_CMD, + 1, + 2) + ); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(dev, aml_operation_region("SMIR", AML_SYSTEM_IO, + aml_int(ACPI_PORT_SMI_CMD), 2)); + field = aml_field("SMIR", AML_BYTE_ACC, AML_NOLOCK, + AML_WRITE_AS_ZEROS); + aml_append(field, aml_named_field("SMIC", 8)); + aml_append(field, aml_reserved_field(8)); + aml_append(dev, field); + aml_append(sb_scope, dev); + } + + aml_append(dsdt, sb_scope); + + if (misc->has_hpet) { + build_hpet_aml(dsdt); + } + build_q35_isa_bridge(dsdt); + build_isa_devices_aml(dsdt); + if (pm->pcihp_bridge_en) { + build_x86_acpi_pci_hotplug(dsdt, pm->pcihp_io_base); + } + build_q35_pci0_int(dsdt); + if (pcms->smbus && !pcmc->do_not_add_smb_acpi) { + build_smb0(dsdt, pcms->smbus, ICH9_SMB_DEV, ICH9_SMB_FUNC); + } + } + + if (vmbus_bridge) { + sb_scope = aml_scope("_SB"); + aml_append(sb_scope, build_vmbus_device_aml(vmbus_bridge)); + aml_append(dsdt, sb_scope); + } + + if (pcmc->legacy_cpu_hotplug) { + build_legacy_cpu_hotplug_aml(dsdt, machine, pm->cpu_hp_io_base); + } else { + CPUHotplugFeatures opts = { + .acpi_1_compatible = true, .has_legacy_cphp = true, + .smi_path = pm->smi_on_cpuhp ? 
"\\_SB.PCI0.SMI0.SMIC" : NULL, + .fw_unplugs_cpu = pm->smi_on_cpu_unplug, + }; + build_cpus_aml(dsdt, machine, opts, pm->cpu_hp_io_base, + "\\_SB.PCI0", "\\_GPE._E02"); + } + + if (pcms->memhp_io_base && nr_mem) { + build_memory_hotplug_aml(dsdt, nr_mem, "\\_SB.PCI0", + "\\_GPE._E03", AML_SYSTEM_IO, + pcms->memhp_io_base); + } + + scope = aml_scope("_GPE"); + { + aml_append(scope, aml_name_decl("_HID", aml_string("ACPI0006"))); + + if (pm->pcihp_bridge_en || pm->pcihp_root_en) { + method = aml_method("_E01", 0, AML_NOTSERIALIZED); + aml_append(method, + aml_acquire(aml_name("\\_SB.PCI0.BLCK"), 0xFFFF)); + aml_append(method, aml_call0("\\_SB.PCI0.PCNT")); + aml_append(method, aml_release(aml_name("\\_SB.PCI0.BLCK"))); + aml_append(scope, method); + } + + if (machine->nvdimms_state->is_enabled) { + method = aml_method("_E04", 0, AML_NOTSERIALIZED); + aml_append(method, aml_notify(aml_name("\\_SB.NVDR"), + aml_int(0x80))); + aml_append(scope, method); + } + } + aml_append(dsdt, scope); + + crs_range_set_init(&crs_range_set); + bus = PC_MACHINE(machine)->bus; + if (bus) { + QLIST_FOREACH(bus, &bus->child, sibling) { + uint8_t bus_num = pci_bus_num(bus); + uint8_t numa_node = pci_bus_numa_node(bus); + + /* look only for expander root buses */ + if (!pci_bus_is_root(bus)) { + continue; + } + + if (bus_num < root_bus_limit) { + root_bus_limit = bus_num - 1; + } + + scope = aml_scope("\\_SB"); + dev = aml_device("PC%.02X", bus_num); + aml_append(dev, aml_name_decl("_UID", aml_int(bus_num))); + aml_append(dev, aml_name_decl("_BBN", aml_int(bus_num))); + if (pci_bus_is_express(bus)) { + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A08"))); + aml_append(dev, aml_name_decl("_CID", aml_eisaid("PNP0A03"))); + + /* Expander bridges do not have ACPI PCI Hot-plug enabled */ + aml_append(dev, build_q35_osc_method(true)); + } else { + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A03"))); + } + + if (numa_node != NUMA_NODE_UNASSIGNED) { + aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node))); + } + + aml_append(dev, build_prt(false)); + crs = build_crs(PCI_HOST_BRIDGE(BUS(bus)->parent), &crs_range_set, + 0, 0, 0, 0); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + aml_append(dsdt, scope); + } + } + + /* + * At this point crs_range_set has all the ranges used by pci + * busses *other* than PCI0. These ranges will be excluded from + * the PCI0._CRS. Add mmconfig to the set so it will be excluded + * too. 
+ */ + if (mcfg_valid) { + crs_range_insert(crs_range_set.mem_ranges, + mcfg.base, mcfg.base + mcfg.size - 1); + } + + scope = aml_scope("\\_SB.PCI0"); + /* build PCI0._CRS */ + crs = aml_resource_template(); + aml_append(crs, + aml_word_bus_number(AML_MIN_FIXED, AML_MAX_FIXED, AML_POS_DECODE, + 0x0000, 0x0, root_bus_limit, + 0x0000, root_bus_limit + 1)); + aml_append(crs, aml_io(AML_DECODE16, 0x0CF8, 0x0CF8, 0x01, 0x08)); + + aml_append(crs, + aml_word_io(AML_MIN_FIXED, AML_MAX_FIXED, + AML_POS_DECODE, AML_ENTIRE_RANGE, + 0x0000, 0x0000, 0x0CF7, 0x0000, 0x0CF8)); + + crs_replace_with_free_ranges(crs_range_set.io_ranges, 0x0D00, 0xFFFF); + for (i = 0; i < crs_range_set.io_ranges->len; i++) { + entry = g_ptr_array_index(crs_range_set.io_ranges, i); + aml_append(crs, + aml_word_io(AML_MIN_FIXED, AML_MAX_FIXED, + AML_POS_DECODE, AML_ENTIRE_RANGE, + 0x0000, entry->base, entry->limit, + 0x0000, entry->limit - entry->base + 1)); + } + + aml_append(crs, + aml_dword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED, + AML_CACHEABLE, AML_READ_WRITE, + 0, 0x000A0000, 0x000BFFFF, 0, 0x00020000)); + + crs_replace_with_free_ranges(crs_range_set.mem_ranges, + range_lob(pci_hole), + range_upb(pci_hole)); + for (i = 0; i < crs_range_set.mem_ranges->len; i++) { + entry = g_ptr_array_index(crs_range_set.mem_ranges, i); + aml_append(crs, + aml_dword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED, + AML_NON_CACHEABLE, AML_READ_WRITE, + 0, entry->base, entry->limit, + 0, entry->limit - entry->base + 1)); + } + + if (!range_is_empty(pci_hole64)) { + crs_replace_with_free_ranges(crs_range_set.mem_64bit_ranges, + range_lob(pci_hole64), + range_upb(pci_hole64)); + for (i = 0; i < crs_range_set.mem_64bit_ranges->len; i++) { + entry = g_ptr_array_index(crs_range_set.mem_64bit_ranges, i); + aml_append(crs, + aml_qword_memory(AML_POS_DECODE, AML_MIN_FIXED, + AML_MAX_FIXED, + AML_CACHEABLE, AML_READ_WRITE, + 0, entry->base, entry->limit, + 0, entry->limit - entry->base + 1)); + } + } + +#ifdef CONFIG_TPM + if (TPM_IS_TIS_ISA(tpm_find())) { + aml_append(crs, aml_memory32_fixed(TPM_TIS_ADDR_BASE, + TPM_TIS_ADDR_SIZE, AML_READ_WRITE)); + } +#endif + aml_append(scope, aml_name_decl("_CRS", crs)); + + /* reserve GPE0 block resources */ + dev = aml_device("GPE0"); + aml_append(dev, aml_name_decl("_HID", aml_string("PNP0A06"))); + aml_append(dev, aml_name_decl("_UID", aml_string("GPE0 resources"))); + /* device present, functioning, decoding, not shown in UI */ + aml_append(dev, aml_name_decl("_STA", aml_int(0xB))); + crs = aml_resource_template(); + aml_append(crs, + aml_io( + AML_DECODE16, + pm->fadt.gpe0_blk.address, + pm->fadt.gpe0_blk.address, + 1, + pm->fadt.gpe0_blk.bit_width / 8) + ); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + + crs_range_set_free(&crs_range_set); + + /* reserve PCIHP resources */ + if (pm->pcihp_io_len && (pm->pcihp_bridge_en || pm->pcihp_root_en)) { + dev = aml_device("PHPR"); + aml_append(dev, aml_name_decl("_HID", aml_string("PNP0A06"))); + aml_append(dev, + aml_name_decl("_UID", aml_string("PCI Hotplug resources"))); + /* device present, functioning, decoding, not shown in UI */ + aml_append(dev, aml_name_decl("_STA", aml_int(0xB))); + crs = aml_resource_template(); + aml_append(crs, + aml_io(AML_DECODE16, pm->pcihp_io_base, pm->pcihp_io_base, 1, + pm->pcihp_io_len) + ); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + } + aml_append(dsdt, scope); + + /* create S3_ / S4_ / S5_ packages if necessary */ + scope = aml_scope("\\"); + if 
(!pm->s3_disabled) { + pkg = aml_package(4); + aml_append(pkg, aml_int(1)); /* PM1a_CNT.SLP_TYP */ + aml_append(pkg, aml_int(1)); /* PM1b_CNT.SLP_TYP, FIXME: not impl. */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(scope, aml_name_decl("_S3", pkg)); + } + + if (!pm->s4_disabled) { + pkg = aml_package(4); + aml_append(pkg, aml_int(pm->s4_val)); /* PM1a_CNT.SLP_TYP */ + /* PM1b_CNT.SLP_TYP, FIXME: not impl. */ + aml_append(pkg, aml_int(pm->s4_val)); + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(scope, aml_name_decl("_S4", pkg)); + } + + pkg = aml_package(4); + aml_append(pkg, aml_int(0)); /* PM1a_CNT.SLP_TYP */ + aml_append(pkg, aml_int(0)); /* PM1b_CNT.SLP_TYP not impl. */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(scope, aml_name_decl("_S5", pkg)); + aml_append(dsdt, scope); + + /* create fw_cfg node, unconditionally */ + { + scope = aml_scope("\\_SB.PCI0"); + fw_cfg_add_acpi_dsdt(scope, x86ms->fw_cfg); + aml_append(dsdt, scope); + } + + if (misc->applesmc_io_base) { + scope = aml_scope("\\_SB.PCI0.ISA"); + dev = aml_device("SMC"); + + aml_append(dev, aml_name_decl("_HID", aml_eisaid("APP0001"))); + /* device present, functioning, decoding, not shown in UI */ + aml_append(dev, aml_name_decl("_STA", aml_int(0xB))); + + crs = aml_resource_template(); + aml_append(crs, + aml_io(AML_DECODE16, misc->applesmc_io_base, misc->applesmc_io_base, + 0x01, APPLESMC_MAX_DATA_LENGTH) + ); + aml_append(crs, aml_irq_no_flags(6)); + aml_append(dev, aml_name_decl("_CRS", crs)); + + aml_append(scope, dev); + aml_append(dsdt, scope); + } + + if (misc->pvpanic_port) { + scope = aml_scope("\\_SB.PCI0.ISA"); + + dev = aml_device("PEVT"); + aml_append(dev, aml_name_decl("_HID", aml_string("QEMU0001"))); + + crs = aml_resource_template(); + aml_append(crs, + aml_io(AML_DECODE16, misc->pvpanic_port, misc->pvpanic_port, 1, 1) + ); + aml_append(dev, aml_name_decl("_CRS", crs)); + + aml_append(dev, aml_operation_region("PEOR", AML_SYSTEM_IO, + aml_int(misc->pvpanic_port), 1)); + field = aml_field("PEOR", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE); + aml_append(field, aml_named_field("PEPT", 8)); + aml_append(dev, field); + + /* device present, functioning, decoding, shown in UI */ + aml_append(dev, aml_name_decl("_STA", aml_int(0xF))); + + method = aml_method("RDPT", 0, AML_NOTSERIALIZED); + aml_append(method, aml_store(aml_name("PEPT"), aml_local(0))); + aml_append(method, aml_return(aml_local(0))); + aml_append(dev, method); + + method = aml_method("WRPT", 1, AML_NOTSERIALIZED); + aml_append(method, aml_store(aml_arg(0), aml_name("PEPT"))); + aml_append(dev, method); + + aml_append(scope, dev); + aml_append(dsdt, scope); + } + + sb_scope = aml_scope("\\_SB"); + { + Object *pci_host; + PCIBus *bus = NULL; + + pci_host = acpi_get_i386_pci_host(); + + if (pci_host) { + bus = PCI_HOST_BRIDGE(pci_host)->bus; + } + + if (bus) { + Aml *scope = aml_scope("PCI0"); + /* Scan all PCI buses. Generate tables to support hotplug. 
*/ + build_append_pci_bus_devices(scope, bus, pm->pcihp_bridge_en); + +#ifdef CONFIG_TPM + if (TPM_IS_TIS_ISA(tpm)) { + if (misc->tpm_version == TPM_VERSION_2_0) { + dev = aml_device("TPM"); + aml_append(dev, aml_name_decl("_HID", + aml_string("MSFT0101"))); + } else { + dev = aml_device("ISA.TPM"); + aml_append(dev, aml_name_decl("_HID", + aml_eisaid("PNP0C31"))); + } + + aml_append(dev, aml_name_decl("_STA", aml_int(0xF))); + crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(TPM_TIS_ADDR_BASE, + TPM_TIS_ADDR_SIZE, AML_READ_WRITE)); + /* + FIXME: TPM_TIS_IRQ=5 conflicts with PNP0C0F irqs, + Rewrite to take IRQ from TPM device model and + fix default IRQ value there to use some unused IRQ + */ + /* aml_append(crs, aml_irq_no_flags(TPM_TIS_IRQ)); */ + aml_append(dev, aml_name_decl("_CRS", crs)); + + tpm_build_ppi_acpi(tpm, dev); + + aml_append(scope, dev); + } +#endif + + aml_append(sb_scope, scope); + } + } + +#ifdef CONFIG_TPM + if (TPM_IS_CRB(tpm)) { + dev = aml_device("TPM"); + aml_append(dev, aml_name_decl("_HID", aml_string("MSFT0101"))); + crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(TPM_CRB_ADDR_BASE, + TPM_CRB_ADDR_SIZE, AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + + aml_append(dev, aml_name_decl("_STA", aml_int(0xf))); + + tpm_build_ppi_acpi(tpm, dev); + + aml_append(sb_scope, dev); + } +#endif + + if (pcms->sgx_epc.size != 0) { + uint64_t epc_base = pcms->sgx_epc.base; + uint64_t epc_size = pcms->sgx_epc.size; + + dev = aml_device("EPC"); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("INT0E0C"))); + aml_append(dev, aml_name_decl("_STR", + aml_unicode("Enclave Page Cache 1.0"))); + crs = aml_resource_template(); + aml_append(crs, + aml_qword_memory(AML_POS_DECODE, AML_MIN_FIXED, + AML_MAX_FIXED, AML_NON_CACHEABLE, + AML_READ_WRITE, 0, epc_base, + epc_base + epc_size - 1, 0, epc_size)); + aml_append(dev, aml_name_decl("_CRS", crs)); + + method = aml_method("_STA", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_int(0x0f))); + aml_append(dev, method); + + aml_append(sb_scope, dev); + } + aml_append(dsdt, sb_scope); + + /* copy AML table into ACPI tables blob and patch header there */ + g_array_append_vals(table_data, dsdt->buf->data, dsdt->buf->len); + acpi_table_end(linker, &table); + free_aml_allocator(); +} + +/* + * IA-PC HPET (High Precision Event Timers) Specification (Revision: 1.0a) + * 3.2.4The ACPI 2.0 HPET Description Table (HPET) + */ +static void +build_hpet(GArray *table_data, BIOSLinker *linker, const char *oem_id, + const char *oem_table_id) +{ + AcpiTable table = { .sig = "HPET", .rev = 1, + .oem_id = oem_id, .oem_table_id = oem_table_id }; + + acpi_table_begin(&table, table_data); + /* Note timer_block_id value must be kept in sync with value advertised by + * emulated hpet + */ + /* Event Timer Block ID */ + build_append_int_noprefix(table_data, 0x8086a201, 4); + /* BASE_ADDRESS */ + build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 0, 0, 0, HPET_BASE); + /* HPET Number */ + build_append_int_noprefix(table_data, 0, 1); + /* Main Counter Minimum Clock_tick in Periodic Mode */ + build_append_int_noprefix(table_data, 0, 2); + /* Page Protection And OEM Attribute */ + build_append_int_noprefix(table_data, 0, 1); + acpi_table_end(linker, &table); +} + +#ifdef CONFIG_TPM +/* + * TCPA Description Table + * + * Following Level 00, Rev 00.37 of specs: + * http://www.trustedcomputinggroup.org/resources/tcg_acpi_specification + * 7.1.2 ACPI Table Layout + */ +static void +build_tpm_tcpa(GArray 
*table_data, BIOSLinker *linker, GArray *tcpalog, + const char *oem_id, const char *oem_table_id) +{ + unsigned log_addr_offset; + AcpiTable table = { .sig = "TCPA", .rev = 2, + .oem_id = oem_id, .oem_table_id = oem_table_id }; + + acpi_table_begin(&table, table_data); + /* Platform Class */ + build_append_int_noprefix(table_data, TPM_TCPA_ACPI_CLASS_CLIENT, 2); + /* Log Area Minimum Length (LAML) */ + build_append_int_noprefix(table_data, TPM_LOG_AREA_MINIMUM_SIZE, 4); + /* Log Area Start Address (LASA) */ + log_addr_offset = table_data->len; + build_append_int_noprefix(table_data, 0, 8); + + /* allocate/reserve space for TPM log area */ + acpi_data_push(tcpalog, TPM_LOG_AREA_MINIMUM_SIZE); + bios_linker_loader_alloc(linker, ACPI_BUILD_TPMLOG_FILE, tcpalog, 1, + false /* high memory */); + /* log area start address to be filled by Guest linker */ + bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE, + log_addr_offset, 8, ACPI_BUILD_TPMLOG_FILE, 0); + + acpi_table_end(linker, &table); +} +#endif + +#define HOLE_640K_START (640 * KiB) +#define HOLE_640K_END (1 * MiB) + +/* + * ACPI spec, Revision 3.0 + * 5.2.15 System Resource Affinity Table (SRAT) + */ +static void +build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) +{ + int i; + int numa_mem_start, slots; + uint64_t mem_len, mem_base, next_base; + MachineClass *mc = MACHINE_GET_CLASS(machine); + X86MachineState *x86ms = X86_MACHINE(machine); + const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(machine); + PCMachineState *pcms = PC_MACHINE(machine); + int nb_numa_nodes = machine->numa_state->num_nodes; + NodeInfo *numa_info = machine->numa_state->nodes; + ram_addr_t hotpluggable_address_space_size = + object_property_get_int(OBJECT(pcms), PC_MACHINE_DEVMEM_REGION_SIZE, + NULL); + AcpiTable table = { .sig = "SRAT", .rev = 1, .oem_id = x86ms->oem_id, + .oem_table_id = x86ms->oem_table_id }; + + acpi_table_begin(&table, table_data); + build_append_int_noprefix(table_data, 1, 4); /* Reserved */ + build_append_int_noprefix(table_data, 0, 8); /* Reserved */ + + for (i = 0; i < apic_ids->len; i++) { + int node_id = apic_ids->cpus[i].props.node_id; + uint32_t apic_id = apic_ids->cpus[i].arch_id; + + if (apic_id < 255) { + /* 5.2.15.1 Processor Local APIC/SAPIC Affinity Structure */ + build_append_int_noprefix(table_data, 0, 1); /* Type */ + build_append_int_noprefix(table_data, 16, 1); /* Length */ + /* Proximity Domain [7:0] */ + build_append_int_noprefix(table_data, node_id, 1); + build_append_int_noprefix(table_data, apic_id, 1); /* APIC ID */ + /* Flags, Table 5-36 */ + build_append_int_noprefix(table_data, 1, 4); + build_append_int_noprefix(table_data, 0, 1); /* Local SAPIC EID */ + /* Proximity Domain [31:8] */ + build_append_int_noprefix(table_data, 0, 3); + build_append_int_noprefix(table_data, 0, 4); /* Reserved */ + } else { + /* + * ACPI spec, Revision 4.0 + * 5.2.16.3 Processor Local x2APIC Affinity Structure + */ + build_append_int_noprefix(table_data, 2, 1); /* Type */ + build_append_int_noprefix(table_data, 24, 1); /* Length */ + build_append_int_noprefix(table_data, 0, 2); /* Reserved */ + /* Proximity Domain */ + build_append_int_noprefix(table_data, node_id, 4); + build_append_int_noprefix(table_data, apic_id, 4); /* X2APIC ID */ + /* Flags, Table 5-39 */ + build_append_int_noprefix(table_data, 1 /* Enabled */, 4); + build_append_int_noprefix(table_data, 0, 4); /* Clock Domain */ + build_append_int_noprefix(table_data, 0, 4); /* Reserved */ + } + } + + /* the memory map is a bit tricky, it 
contains at least one hole + * from 640k-1M and possibly another one from 3.5G-4G. + */ + next_base = 0; + numa_mem_start = table_data->len; + + for (i = 1; i < nb_numa_nodes + 1; ++i) { + mem_base = next_base; + mem_len = numa_info[i - 1].node_mem; + next_base = mem_base + mem_len; + + /* Cut out the 640K hole */ + if (mem_base <= HOLE_640K_START && + next_base > HOLE_640K_START) { + mem_len -= next_base - HOLE_640K_START; + if (mem_len > 0) { + build_srat_memory(table_data, mem_base, mem_len, i - 1, + MEM_AFFINITY_ENABLED); + } + + /* Check for the rare case: 640K < RAM < 1M */ + if (next_base <= HOLE_640K_END) { + next_base = HOLE_640K_END; + continue; + } + mem_base = HOLE_640K_END; + mem_len = next_base - HOLE_640K_END; + } + + /* Cut out the ACPI_PCI hole */ + if (mem_base <= x86ms->below_4g_mem_size && + next_base > x86ms->below_4g_mem_size) { + mem_len -= next_base - x86ms->below_4g_mem_size; + if (mem_len > 0) { + build_srat_memory(table_data, mem_base, mem_len, i - 1, + MEM_AFFINITY_ENABLED); + } + mem_base = 1ULL << 32; + mem_len = next_base - x86ms->below_4g_mem_size; + next_base = mem_base + mem_len; + } + + if (mem_len > 0) { + build_srat_memory(table_data, mem_base, mem_len, i - 1, + MEM_AFFINITY_ENABLED); + } + } + + if (machine->nvdimms_state->is_enabled) { + nvdimm_build_srat(table_data); + } + + /* + * TODO: this part is not in ACPI spec and current linux kernel boots fine + * without these entries. But I recall there were issues the last time I + * tried to remove it with some ancient guest OS, however I can't remember + * what that was so keep this around for now + */ + slots = (table_data->len - numa_mem_start) / 40 /* mem affinity len */; + for (; slots < nb_numa_nodes + 2; slots++) { + build_srat_memory(table_data, 0, 0, 0, MEM_AFFINITY_NOFLAGS); + } + + /* + * Entry is required for Windows to enable memory hotplug in OS + * and for Linux to enable SWIOTLB when booted with less than + * 4G of RAM. Windows works better if the entry sets proximity + * to the highest NUMA node in the machine. + * Memory devices may override proximity set by this entry, + * providing _PXM method if necessary. 
+ */ + if (hotpluggable_address_space_size) { + build_srat_memory(table_data, machine->device_memory->base, + hotpluggable_address_space_size, nb_numa_nodes - 1, + MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); + } + + acpi_table_end(linker, &table); +} + +/* + * Insert DMAR scope for PCI bridges and endpoint devices + */ +static void +insert_scope(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + const size_t device_scope_size = 6 /* device scope structure */ + + 2 /* 1 path entry */; + GArray *scope_blob = opaque; + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { + /* Dmar Scope Type: 0x02 for PCI Bridge */ + build_append_int_noprefix(scope_blob, 0x02, 1); + } else { + /* Dmar Scope Type: 0x01 for PCI Endpoint Device */ + build_append_int_noprefix(scope_blob, 0x01, 1); + } + + /* length */ + build_append_int_noprefix(scope_blob, device_scope_size, 1); + /* reserved */ + build_append_int_noprefix(scope_blob, 0, 2); + /* enumeration_id */ + build_append_int_noprefix(scope_blob, 0, 1); + /* bus */ + build_append_int_noprefix(scope_blob, pci_bus_num(bus), 1); + /* device */ + build_append_int_noprefix(scope_blob, PCI_SLOT(dev->devfn), 1); + /* function */ + build_append_int_noprefix(scope_blob, PCI_FUNC(dev->devfn), 1); +} + +/* For a given PCI host bridge, walk and insert DMAR scope */ +static int +dmar_host_bridges(Object *obj, void *opaque) +{ + GArray *scope_blob = opaque; + + if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { + PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; + + if (bus && !pci_bus_bypass_iommu(bus)) { + pci_for_each_device_under_bus(bus, insert_scope, scope_blob); + } + } + + return 0; +} + +/* + * Intel ® Virtualization Technology for Directed I/O + * Architecture Specification. Revision 3.3 + * 8.1 DMA Remapping Reporting Structure + */ +static void +build_dmar_q35(GArray *table_data, BIOSLinker *linker, const char *oem_id, + const char *oem_table_id) +{ + uint8_t dmar_flags = 0; + uint8_t rsvd10[10] = {}; + /* Root complex IOAPIC uses one path only */ + const size_t ioapic_scope_size = 6 /* device scope structure */ + + 2 /* 1 path entry */; + X86IOMMUState *iommu = x86_iommu_get_default(); + IntelIOMMUState *intel_iommu = INTEL_IOMMU_DEVICE(iommu); + GArray *scope_blob = g_array_new(false, true, 1); + + AcpiTable table = { .sig = "DMAR", .rev = 1, .oem_id = oem_id, + .oem_table_id = oem_table_id }; + + /* + * A PCI bus walk, for each PCI host bridge. + * Insert scope for each PCI bridge and endpoint device which + * is attached to a bus with iommu enabled.
+ */ + object_child_foreach_recursive(object_get_root(), + dmar_host_bridges, scope_blob); + + assert(iommu); + if (x86_iommu_ir_supported(iommu)) { + dmar_flags |= 0x1; /* Flags: 0x1: INT_REMAP */ + } + + acpi_table_begin(&table, table_data); + /* Host Address Width */ + build_append_int_noprefix(table_data, intel_iommu->aw_bits - 1, 1); + build_append_int_noprefix(table_data, dmar_flags, 1); /* Flags */ + g_array_append_vals(table_data, rsvd10, sizeof(rsvd10)); /* Reserved */ + + /* 8.3 DMAR Remapping Hardware Unit Definition structure */ + build_append_int_noprefix(table_data, 0, 2); /* Type */ + /* Length */ + build_append_int_noprefix(table_data, + 16 + ioapic_scope_size + scope_blob->len, 2); + /* Flags */ + build_append_int_noprefix(table_data, 0 /* Don't include all pci device */ , + 1); + build_append_int_noprefix(table_data, 0 , 1); /* Reserved */ + build_append_int_noprefix(table_data, 0 , 2); /* Segment Number */ + /* Register Base Address */ + build_append_int_noprefix(table_data, Q35_HOST_BRIDGE_IOMMU_ADDR , 8); + + /* Scope definition for the root-complex IOAPIC. See VT-d spec + * 8.3.1 (version Oct. 2014 or later). */ + build_append_int_noprefix(table_data, 0x03 /* IOAPIC */, 1); /* Type */ + build_append_int_noprefix(table_data, ioapic_scope_size, 1); /* Length */ + build_append_int_noprefix(table_data, 0, 2); /* Reserved */ + /* Enumeration ID */ + build_append_int_noprefix(table_data, ACPI_BUILD_IOAPIC_ID, 1); + /* Start Bus Number */ + build_append_int_noprefix(table_data, Q35_PSEUDO_BUS_PLATFORM, 1); + /* Path, {Device, Function} pair */ + build_append_int_noprefix(table_data, PCI_SLOT(Q35_PSEUDO_DEVFN_IOAPIC), 1); + build_append_int_noprefix(table_data, PCI_FUNC(Q35_PSEUDO_DEVFN_IOAPIC), 1); + + /* Add scope found above */ + g_array_append_vals(table_data, scope_blob->data, scope_blob->len); + g_array_free(scope_blob, true); + + if (iommu->dt_supported) { + /* 8.5 Root Port ATS Capability Reporting Structure */ + build_append_int_noprefix(table_data, 2, 2); /* Type */ + build_append_int_noprefix(table_data, 8, 2); /* Length */ + build_append_int_noprefix(table_data, 1 /* ALL_PORTS */, 1); /* Flags */ + build_append_int_noprefix(table_data, 0, 1); /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); /* Segment Number */ + } + + acpi_table_end(linker, &table); +} + +/* + * Windows ACPI Emulated Devices Table + * (Version 1.0 - April 6, 2009) + * Spec: http://download.microsoft.com/download/7/E/7/7E7662CF-CBEA-470B-A97E-CE7CE0D98DC2/WAET.docx + * + * Helpful to speedup Windows guests and ignored by others. + */ +static void +build_waet(GArray *table_data, BIOSLinker *linker, const char *oem_id, + const char *oem_table_id) +{ + AcpiTable table = { .sig = "WAET", .rev = 1, .oem_id = oem_id, + .oem_table_id = oem_table_id }; + + acpi_table_begin(&table, table_data); + /* + * Set "ACPI PM timer good" flag. + * + * Tells Windows guests that our ACPI PM timer is reliable in the + * sense that guest can read it only once to obtain a reliable value. + * Which avoids costly VMExits caused by guest re-reading it unnecessarily. 
+ */ + build_append_int_noprefix(table_data, 1 << 1 /* ACPI PM timer good */, 4); + acpi_table_end(linker, &table); +} + +/* + * IVRS table as specified in AMD IOMMU Specification v2.62, Section 5.2 + * accessible here http://support.amd.com/TechDocs/48882_IOMMU.pdf + */ +#define IOAPIC_SB_DEVID (uint64_t)PCI_BUILD_BDF(0, PCI_DEVFN(0x14, 0)) + +/* + * Insert IVHD entry for device and recurse, insert alias, or insert range as + * necessary for the PCI topology. + */ +static void +insert_ivhd(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + GArray *table_data = opaque; + uint32_t entry; + + /* "Select" IVHD entry, type 0x2 */ + entry = PCI_BUILD_BDF(pci_bus_num(bus), dev->devfn) << 8 | 0x2; + build_append_int_noprefix(table_data, entry, 4); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { + PCIBus *sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(dev)); + uint8_t sec = pci_bus_num(sec_bus); + uint8_t sub = dev->config[PCI_SUBORDINATE_BUS]; + + if (pci_bus_is_express(sec_bus)) { + /* + * Walk the bus if there are subordinates, otherwise use a range + * to cover an entire leaf bus. We could potentially also use a + * range for traversed buses, but we'd need to take care not to + * create both Select and Range entries covering the same device. + * This is easier and potentially more compact. + * + * An example bare metal system seems to use Select entries for + * root ports without a slot (ie. built-ins) and Range entries + * when there is a slot. The same system also only hard-codes + * the alias range for an onboard PCIe-to-PCI bridge, apparently + * making no effort to support nested bridges. We attempt to + * be more thorough here. + */ + if (sec == sub) { /* leaf bus */ + /* "Start of Range" IVHD entry, type 0x3 */ + entry = PCI_BUILD_BDF(sec, PCI_DEVFN(0, 0)) << 8 | 0x3; + build_append_int_noprefix(table_data, entry, 4); + /* "End of Range" IVHD entry, type 0x4 */ + entry = PCI_BUILD_BDF(sub, PCI_DEVFN(31, 7)) << 8 | 0x4; + build_append_int_noprefix(table_data, entry, 4); + } else { + pci_for_each_device(sec_bus, sec, insert_ivhd, table_data); + } + } else { + /* + * If the secondary bus is conventional, then we need to create an + * Alias range for everything downstream. The range covers the + * first devfn on the secondary bus to the last devfn on the + * subordinate bus. The alias target depends on legacy versus + * express bridges, just as in pci_device_iommu_address_space(). + * DeviceIDa vs DeviceIDb as per the AMD IOMMU spec. 
+ */ + uint16_t dev_id_a, dev_id_b; + + dev_id_a = PCI_BUILD_BDF(sec, PCI_DEVFN(0, 0)); + + if (pci_is_express(dev) && + pcie_cap_get_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE) { + dev_id_b = dev_id_a; + } else { + dev_id_b = PCI_BUILD_BDF(pci_bus_num(bus), dev->devfn); + } + + /* "Alias Start of Range" IVHD entry, type 0x43, 8 bytes */ + build_append_int_noprefix(table_data, dev_id_a << 8 | 0x43, 4); + build_append_int_noprefix(table_data, dev_id_b << 8 | 0x0, 4); + + /* "End of Range" IVHD entry, type 0x4 */ + entry = PCI_BUILD_BDF(sub, PCI_DEVFN(31, 7)) << 8 | 0x4; + build_append_int_noprefix(table_data, entry, 4); + } + } +} + +/* For all PCI host bridges, walk and insert IVHD entries */ +static int +ivrs_host_bridges(Object *obj, void *opaque) +{ + GArray *ivhd_blob = opaque; + + if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { + PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; + + if (bus && !pci_bus_bypass_iommu(bus)) { + pci_for_each_device_under_bus(bus, insert_ivhd, ivhd_blob); + } + } + + return 0; +} + +static void +build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, + const char *oem_table_id) +{ + int ivhd_table_len = 24; + AMDVIState *s = AMD_IOMMU_DEVICE(x86_iommu_get_default()); + GArray *ivhd_blob = g_array_new(false, true, 1); + AcpiTable table = { .sig = "IVRS", .rev = 1, .oem_id = oem_id, + .oem_table_id = oem_table_id }; + + acpi_table_begin(&table, table_data); + /* IVinfo - IO virtualization information common to all + * IOMMU units in a system + */ + build_append_int_noprefix(table_data, 40UL << 8/* PASize */, 4); + /* reserved */ + build_append_int_noprefix(table_data, 0, 8); + + /* IVHD definition - type 10h */ + build_append_int_noprefix(table_data, 0x10, 1); + /* virtualization flags */ + build_append_int_noprefix(table_data, + (1UL << 0) | /* HtTunEn */ + (1UL << 4) | /* iotblSup */ + (1UL << 6) | /* PrefSup */ + (1UL << 7), /* PPRSup */ + 1); + + /* + * A PCI bus walk, for each PCI host bridge, is necessary to create a + * complete set of IVHD entries. Do this into a separate blob so that we + * can calculate the total IVRS table length here and then append the new + * blob further below. Fall back to an entry covering all devices, which + * is sufficient when no aliases are present. + */ + object_child_foreach_recursive(object_get_root(), + ivrs_host_bridges, ivhd_blob); + + if (!ivhd_blob->len) { + /* + * Type 1 device entry reporting all devices + * These are 4-byte device entries currently reporting the range of + * Refer to Spec - Table 95:IVHD Device Entry Type Codes(4-byte) + */ + build_append_int_noprefix(ivhd_blob, 0x0000001, 4); + } + + ivhd_table_len += ivhd_blob->len; + + /* + * When interrupt remapping is supported, we add a special IVHD device + * for type IO-APIC. 
+ */ + if (x86_iommu_ir_supported(x86_iommu_get_default())) { + ivhd_table_len += 8; + } + + /* IVHD length */ + build_append_int_noprefix(table_data, ivhd_table_len, 2); + /* DeviceID */ + build_append_int_noprefix(table_data, s->devid, 2); + /* Capability offset */ + build_append_int_noprefix(table_data, s->capab_offset, 2); + /* IOMMU base address */ + build_append_int_noprefix(table_data, s->mmio.addr, 8); + /* PCI Segment Group */ + build_append_int_noprefix(table_data, 0, 2); + /* IOMMU info */ + build_append_int_noprefix(table_data, 0, 2); + /* IOMMU Feature Reporting */ + build_append_int_noprefix(table_data, + (48UL << 30) | /* HATS */ + (48UL << 28) | /* GATS */ + (1UL << 2) | /* GTSup */ + (1UL << 6), /* GASup */ + 4); + + /* IVHD entries as found above */ + g_array_append_vals(table_data, ivhd_blob->data, ivhd_blob->len); + g_array_free(ivhd_blob, TRUE); + + /* + * Add a special IVHD device type. + * Refer to spec - Table 95: IVHD device entry type codes + * + * Linux IOMMU driver checks for the special IVHD device (type IO-APIC). + * See Linux kernel commit 'c2ff5cf5294bcbd7fa50f7d860e90a66db7e5059' + */ + if (x86_iommu_ir_supported(x86_iommu_get_default())) { + build_append_int_noprefix(table_data, + (0x1ull << 56) | /* type IOAPIC */ + (IOAPIC_SB_DEVID << 40) | /* IOAPIC devid */ + 0x48, /* special device */ + 8); + } + acpi_table_end(linker, &table); +} + +typedef +struct AcpiBuildState { + /* Copy of table in RAM (for patching). */ + MemoryRegion *table_mr; + /* Is table patched? */ + uint8_t patched; + void *rsdp; + MemoryRegion *rsdp_mr; + MemoryRegion *linker_mr; +} AcpiBuildState; + +static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) +{ + Object *pci_host; + QObject *o; + + pci_host = acpi_get_i386_pci_host(); + if (!pci_host) { + return false; + } + + o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL); + if (!o) { + return false; + } + mcfg->base = qnum_get_uint(qobject_to(QNum, o)); + qobject_unref(o); + if (mcfg->base == PCIE_BASE_ADDR_UNMAPPED) { + return false; + } + + o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL); + assert(o); + mcfg->size = qnum_get_uint(qobject_to(QNum, o)); + qobject_unref(o); + return true; +} + +static +void acpi_build(AcpiBuildTables *tables, MachineState *machine) +{ + PCMachineState *pcms = PC_MACHINE(machine); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(machine); + DeviceState *iommu = pcms->iommu; + GArray *table_offsets; + unsigned facs, dsdt, rsdt, fadt; + AcpiPmInfo pm; + AcpiMiscInfo misc; + AcpiMcfgInfo mcfg; + Range pci_hole = {}, pci_hole64 = {}; + uint8_t *u; + size_t aml_len = 0; + GArray *tables_blob = tables->table_data; + AcpiSlicOem slic_oem = { .id = NULL, .table_id = NULL }; + Object *vmgenid_dev; + char *oem_id; + char *oem_table_id; + + acpi_get_pm_info(machine, &pm); + acpi_get_misc_info(&misc); + acpi_get_pci_holes(&pci_hole, &pci_hole64); + acpi_get_slic_oem(&slic_oem); + + if (slic_oem.id) { + oem_id = slic_oem.id; + } else { + oem_id = x86ms->oem_id; + } + + if (slic_oem.table_id) { + oem_table_id = slic_oem.table_id; + } else { + oem_table_id = x86ms->oem_table_id; + } + + table_offsets = g_array_new(false, true /* clear */, + sizeof(uint32_t)); + ACPI_BUILD_DPRINTF("init ACPI tables\n"); + + bios_linker_loader_alloc(tables->linker, + ACPI_BUILD_TABLE_FILE, tables_blob, + 64 /* Ensure FACS is aligned */, + false /* high memory */); + + /* + * FACS is pointed to by FADT. 
+ * We place it first since it's the only table that has alignment + * requirements. + */ + facs = tables_blob->len; + build_facs(tables_blob); + + /* DSDT is pointed to by FADT */ + dsdt = tables_blob->len; + build_dsdt(tables_blob, tables->linker, &pm, &misc, + &pci_hole, &pci_hole64, machine); + + /* Count the size of the DSDT and SSDT, we will need it for legacy + * sizing of ACPI tables. + */ + aml_len += tables_blob->len - dsdt; + + /* ACPI tables pointed to by RSDT */ + fadt = tables_blob->len; + acpi_add_table(table_offsets, tables_blob); + pm.fadt.facs_tbl_offset = &facs; + pm.fadt.dsdt_tbl_offset = &dsdt; + pm.fadt.xdsdt_tbl_offset = &dsdt; + build_fadt(tables_blob, tables->linker, &pm.fadt, oem_id, oem_table_id); + aml_len += tables_blob->len - fadt; + + acpi_add_table(table_offsets, tables_blob); + acpi_build_madt(tables_blob, tables->linker, x86ms, + ACPI_DEVICE_IF(x86ms->acpi_dev), x86ms->oem_id, + x86ms->oem_table_id); + + vmgenid_dev = find_vmgenid_dev(); + if (vmgenid_dev) { + acpi_add_table(table_offsets, tables_blob); + vmgenid_build_acpi(VMGENID(vmgenid_dev), tables_blob, + tables->vmgenid, tables->linker, x86ms->oem_id); + } + + if (misc.has_hpet) { + acpi_add_table(table_offsets, tables_blob); + build_hpet(tables_blob, tables->linker, x86ms->oem_id, + x86ms->oem_table_id); + } +#ifdef CONFIG_TPM + if (misc.tpm_version != TPM_VERSION_UNSPEC) { + if (misc.tpm_version == TPM_VERSION_1_2) { + acpi_add_table(table_offsets, tables_blob); + build_tpm_tcpa(tables_blob, tables->linker, tables->tcpalog, + x86ms->oem_id, x86ms->oem_table_id); + } else { /* TPM_VERSION_2_0 */ + acpi_add_table(table_offsets, tables_blob); + build_tpm2(tables_blob, tables->linker, tables->tcpalog, + x86ms->oem_id, x86ms->oem_table_id); + } + } +#endif + if (machine->numa_state->num_nodes) { + acpi_add_table(table_offsets, tables_blob); + build_srat(tables_blob, tables->linker, machine); + if (machine->numa_state->have_numa_distance) { + acpi_add_table(table_offsets, tables_blob); + build_slit(tables_blob, tables->linker, machine, x86ms->oem_id, + x86ms->oem_table_id); + } + if (machine->numa_state->hmat_enabled) { + acpi_add_table(table_offsets, tables_blob); + build_hmat(tables_blob, tables->linker, machine->numa_state, + x86ms->oem_id, x86ms->oem_table_id); + } + } + if (acpi_get_mcfg(&mcfg)) { + acpi_add_table(table_offsets, tables_blob); + build_mcfg(tables_blob, tables->linker, &mcfg, x86ms->oem_id, + x86ms->oem_table_id); + } + if (object_dynamic_cast(OBJECT(iommu), TYPE_AMD_IOMMU_DEVICE)) { + acpi_add_table(table_offsets, tables_blob); + build_amd_iommu(tables_blob, tables->linker, x86ms->oem_id, + x86ms->oem_table_id); + } else if (object_dynamic_cast(OBJECT(iommu), TYPE_INTEL_IOMMU_DEVICE)) { + acpi_add_table(table_offsets, tables_blob); + build_dmar_q35(tables_blob, tables->linker, x86ms->oem_id, + x86ms->oem_table_id); + } else if (object_dynamic_cast(OBJECT(iommu), TYPE_VIRTIO_IOMMU_PCI)) { + PCIDevice *pdev = PCI_DEVICE(iommu); + + acpi_add_table(table_offsets, tables_blob); + build_viot(machine, tables_blob, tables->linker, pci_get_bdf(pdev), + x86ms->oem_id, x86ms->oem_table_id); + } + if (machine->nvdimms_state->is_enabled) { + nvdimm_build_acpi(table_offsets, tables_blob, tables->linker, + machine->nvdimms_state, machine->ram_slots, + x86ms->oem_id, x86ms->oem_table_id); + } + + acpi_add_table(table_offsets, tables_blob); + build_waet(tables_blob, tables->linker, x86ms->oem_id, x86ms->oem_table_id); + + /* Add tables supplied by user (if any) */ + for (u = acpi_table_first(); u; u 
= acpi_table_next(u)) { + unsigned len = acpi_table_len(u); + + acpi_add_table(table_offsets, tables_blob); + g_array_append_vals(tables_blob, u, len); + } + + /* RSDT is pointed to by RSDP */ + rsdt = tables_blob->len; + build_rsdt(tables_blob, tables->linker, table_offsets, + oem_id, oem_table_id); + + /* RSDP is in FSEG memory, so allocate it separately */ + { + AcpiRsdpData rsdp_data = { + .revision = 0, + .oem_id = x86ms->oem_id, + .xsdt_tbl_offset = NULL, + .rsdt_tbl_offset = &rsdt, + }; + build_rsdp(tables->rsdp, tables->linker, &rsdp_data); + if (!pcmc->rsdp_in_ram) { + /* We used to allocate some extra space for RSDP revision 2 but + * only used the RSDP revision 0 space. The extra bytes were + * zeroed out and not used. + * Here we continue wasting those extra 16 bytes to make sure we + * don't break migration for machine types 2.2 and older due to + * RSDP blob size mismatch. + */ + build_append_int_noprefix(tables->rsdp, 0, 16); + } + } + + /* We'll expose it all to Guest so we want to reduce + * chance of size changes. + * + * We used to align the tables to 4k, but of course this would + * too simple to be enough. 4k turned out to be too small an + * alignment very soon, and in fact it is almost impossible to + * keep the table size stable for all (max_cpus, max_memory_slots) + * combinations. So the table size is always 64k for pc-i440fx-2.1 + * and we give an error if the table grows beyond that limit. + * + * We still have the problem of migrating from "-M pc-i440fx-2.0". For + * that, we exploit the fact that QEMU 2.1 generates _smaller_ tables + * than 2.0 and we can always pad the smaller tables with zeros. We can + * then use the exact size of the 2.0 tables. + * + * All this is for PIIX4, since QEMU 2.0 didn't support Q35 migration. + */ + if (pcmc->legacy_acpi_table_size) { + /* Subtracting aml_len gives the size of fixed tables. Then add the + * size of the PIIX4 DSDT/SSDT in QEMU 2.0. + */ + int legacy_aml_len = + pcmc->legacy_acpi_table_size + + ACPI_BUILD_LEGACY_CPU_AML_SIZE * x86ms->apic_id_limit; + int legacy_table_size = + ROUND_UP(tables_blob->len - aml_len + legacy_aml_len, + ACPI_BUILD_ALIGN_SIZE); + if (tables_blob->len > legacy_table_size) { + /* Should happen only with PCI bridges and -M pc-i440fx-2.0. */ + warn_report("ACPI table size %u exceeds %d bytes," + " migration may not work", + tables_blob->len, legacy_table_size); + error_printf("Try removing CPUs, NUMA nodes, memory slots" + " or PCI bridges."); + } + g_array_set_size(tables_blob, legacy_table_size); + } else { + /* Make sure we have a buffer in case we need to resize the tables. */ + if (tables_blob->len > ACPI_BUILD_TABLE_SIZE / 2) { + /* As of QEMU 2.1, this fires with 160 VCPUs and 255 memory slots. */ + warn_report("ACPI table size %u exceeds %d bytes," + " migration may not work", + tables_blob->len, ACPI_BUILD_TABLE_SIZE / 2); + error_printf("Try removing CPUs, NUMA nodes, memory slots" + " or PCI bridges."); + } + acpi_align_size(tables_blob, ACPI_BUILD_TABLE_SIZE); + } + + acpi_align_size(tables->linker->cmd_blob, ACPI_BUILD_ALIGN_SIZE); + + /* Cleanup memory that's no longer used. */ + g_array_free(table_offsets, true); +} + +static void acpi_ram_update(MemoryRegion *mr, GArray *data) +{ + uint32_t size = acpi_data_len(data); + + /* Make sure RAM size is correct - in case it got changed e.g. 
by migration */ + memory_region_ram_resize(mr, size, &error_abort); + + memcpy(memory_region_get_ram_ptr(mr), data->data, size); + memory_region_set_dirty(mr, 0, size); +} + +static void acpi_build_update(void *build_opaque) +{ + AcpiBuildState *build_state = build_opaque; + AcpiBuildTables tables; + + /* No state to update or already patched? Nothing to do. */ + if (!build_state || build_state->patched) { + return; + } + build_state->patched = 1; + + acpi_build_tables_init(&tables); + + acpi_build(&tables, MACHINE(qdev_get_machine())); + + acpi_ram_update(build_state->table_mr, tables.table_data); + + if (build_state->rsdp) { + memcpy(build_state->rsdp, tables.rsdp->data, acpi_data_len(tables.rsdp)); + } else { + acpi_ram_update(build_state->rsdp_mr, tables.rsdp); + } + + acpi_ram_update(build_state->linker_mr, tables.linker->cmd_blob); + acpi_build_tables_cleanup(&tables, true); +} + +static void acpi_build_reset(void *build_opaque) +{ + AcpiBuildState *build_state = build_opaque; + build_state->patched = 0; +} + +static const VMStateDescription vmstate_acpi_build = { + .name = "acpi_build", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT8(patched, AcpiBuildState), + VMSTATE_END_OF_LIST() + }, +}; + +void acpi_setup(void) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(pcms); + AcpiBuildTables tables; + AcpiBuildState *build_state; + Object *vmgenid_dev; +#ifdef CONFIG_TPM + TPMIf *tpm; + static FwCfgTPMConfig tpm_config; +#endif + + if (!x86ms->fw_cfg) { + ACPI_BUILD_DPRINTF("No fw cfg. Bailing out.\n"); + return; + } + + if (!pcms->acpi_build_enabled) { + ACPI_BUILD_DPRINTF("ACPI build disabled. Bailing out.\n"); + return; + } + + if (!x86_machine_is_acpi_enabled(X86_MACHINE(pcms))) { + ACPI_BUILD_DPRINTF("ACPI disabled. Bailing out.\n"); + return; + } + + build_state = g_malloc0(sizeof *build_state); + + acpi_build_tables_init(&tables); + acpi_build(&tables, MACHINE(pcms)); + + /* Now expose it all to Guest */ + build_state->table_mr = acpi_add_rom_blob(acpi_build_update, + build_state, tables.table_data, + ACPI_BUILD_TABLE_FILE); + assert(build_state->table_mr != NULL); + + build_state->linker_mr = + acpi_add_rom_blob(acpi_build_update, build_state, + tables.linker->cmd_blob, ACPI_BUILD_LOADER_FILE); + +#ifdef CONFIG_TPM + fw_cfg_add_file(x86ms->fw_cfg, ACPI_BUILD_TPMLOG_FILE, + tables.tcpalog->data, acpi_data_len(tables.tcpalog)); + + tpm = tpm_find(); + if (tpm && object_property_get_bool(OBJECT(tpm), "ppi", &error_abort)) { + tpm_config = (FwCfgTPMConfig) { + .tpmppi_address = cpu_to_le32(TPM_PPI_ADDR_BASE), + .tpm_version = tpm_get_version(tpm), + .tpmppi_version = TPM_PPI_VERSION_1_30 + }; + fw_cfg_add_file(x86ms->fw_cfg, "etc/tpm/config", + &tpm_config, sizeof tpm_config); + } +#endif + + vmgenid_dev = find_vmgenid_dev(); + if (vmgenid_dev) { + vmgenid_add_fw_cfg(VMGENID(vmgenid_dev), x86ms->fw_cfg, + tables.vmgenid); + } + + if (!pcmc->rsdp_in_ram) { + /* + * Keep for compatibility with old machine types. + * Though RSDP is small, its contents isn't immutable, so + * we'll update it along with the rest of tables on guest access. 
+ */ + uint32_t rsdp_size = acpi_data_len(tables.rsdp); + + build_state->rsdp = g_memdup(tables.rsdp->data, rsdp_size); + fw_cfg_add_file_callback(x86ms->fw_cfg, ACPI_BUILD_RSDP_FILE, + acpi_build_update, NULL, build_state, + build_state->rsdp, rsdp_size, true); + build_state->rsdp_mr = NULL; + } else { + build_state->rsdp = NULL; + build_state->rsdp_mr = acpi_add_rom_blob(acpi_build_update, + build_state, tables.rsdp, + ACPI_BUILD_RSDP_FILE); + } + + qemu_register_reset(acpi_build_reset, build_state); + acpi_build_reset(build_state); + vmstate_register(NULL, 0, &vmstate_acpi_build, build_state); + + /* Cleanup tables but don't free the memory: we track it + * in build_state. + */ + acpi_build_tables_cleanup(&tables, false); +} diff --git a/hw/i386/acpi-build.h b/hw/i386/acpi-build.h new file mode 100644 index 000000000..0dce155c8 --- /dev/null +++ b/hw/i386/acpi-build.h @@ -0,0 +1,15 @@ + +#ifndef HW_I386_ACPI_BUILD_H +#define HW_I386_ACPI_BUILD_H +#include "hw/acpi/acpi-defs.h" + +extern const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio; + +/* PCI Hot-plug registers bases. See docs/spec/acpi_pci_hotplug.txt */ +#define ACPI_PCIHP_SEJ_BASE 0x8 +#define ACPI_PCIHP_BNMR_BASE 0x10 + +void acpi_setup(void); +Object *acpi_get_i386_pci_host(void); + +#endif diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c new file mode 100644 index 000000000..4aaafbdd7 --- /dev/null +++ b/hw/i386/acpi-common.c @@ -0,0 +1,165 @@ +/* Support for generating ACPI tables and passing them to Guests + * + * Copyright (C) 2008-2010 Kevin O'Connor <kevin@koconnor.net> + * Copyright (C) 2006 Fabrice Bellard + * Copyright (C) 2013 Red Hat Inc + * + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" + +#include "exec/memory.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/aml-build.h" +#include "hw/acpi/utils.h" +#include "hw/i386/pc.h" +#include "target/i386/cpu.h" + +#include "acpi-build.h" +#include "acpi-common.h" + +void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid, + const CPUArchIdList *apic_ids, GArray *entry, + bool force_enabled) +{ + uint32_t apic_id = apic_ids->cpus[uid].arch_id; + /* Flags – Local APIC Flags */ + uint32_t flags = apic_ids->cpus[uid].cpu != NULL || force_enabled ? + 1 /* Enabled */ : 0; + + /* ACPI spec says that LAPIC entry for non present + * CPU may be omitted from MADT or it must be marked + * as disabled. However omitting non present CPU from + * MADT breaks hotplug on linux. So possible CPUs + * should be put in MADT but kept disabled. 
+ */ + if (apic_id < 255) { + /* Rev 1.0b, Table 5-13 Processor Local APIC Structure */ + build_append_int_noprefix(entry, 0, 1); /* Type */ + build_append_int_noprefix(entry, 8, 1); /* Length */ + build_append_int_noprefix(entry, uid, 1); /* ACPI Processor ID */ + build_append_int_noprefix(entry, apic_id, 1); /* APIC ID */ + build_append_int_noprefix(entry, flags, 4); /* Flags */ + } else { + /* Rev 4.0, 5.2.12.12 Processor Local x2APIC Structure */ + build_append_int_noprefix(entry, 9, 1); /* Type */ + build_append_int_noprefix(entry, 16, 1); /* Length */ + build_append_int_noprefix(entry, 0, 2); /* Reserved */ + build_append_int_noprefix(entry, apic_id, 4); /* X2APIC ID */ + build_append_int_noprefix(entry, flags, 4); /* Flags */ + build_append_int_noprefix(entry, uid, 4); /* ACPI Processor UID */ + } +} + +static void build_ioapic(GArray *entry, uint8_t id, uint32_t addr, uint32_t irq) +{ + /* Rev 1.0b, 5.2.8.2 IO APIC */ + build_append_int_noprefix(entry, 1, 1); /* Type */ + build_append_int_noprefix(entry, 12, 1); /* Length */ + build_append_int_noprefix(entry, id, 1); /* IO APIC ID */ + build_append_int_noprefix(entry, 0, 1); /* Reserved */ + build_append_int_noprefix(entry, addr, 4); /* IO APIC Address */ + build_append_int_noprefix(entry, irq, 4); /* System Vector Base */ +} + +static void +build_xrupt_override(GArray *entry, uint8_t src, uint32_t gsi, uint16_t flags) +{ + /* Rev 1.0b, 5.2.8.3.1 Interrupt Source Overrides */ + build_append_int_noprefix(entry, 2, 1); /* Type */ + build_append_int_noprefix(entry, 10, 1); /* Length */ + build_append_int_noprefix(entry, 0, 1); /* Bus */ + build_append_int_noprefix(entry, src, 1); /* Source */ + /* Global System Interrupt Vector */ + build_append_int_noprefix(entry, gsi, 4); + build_append_int_noprefix(entry, flags, 2); /* Flags */ +} + +/* + * ACPI spec, Revision 1.0b + * 5.2.8 Multiple APIC Description Table + */ +void acpi_build_madt(GArray *table_data, BIOSLinker *linker, + X86MachineState *x86ms, AcpiDeviceIf *adev, + const char *oem_id, const char *oem_table_id) +{ + int i; + bool x2apic_mode = false; + MachineClass *mc = MACHINE_GET_CLASS(x86ms); + const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(x86ms)); + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(adev); + AcpiTable table = { .sig = "APIC", .rev = 1, .oem_id = oem_id, + .oem_table_id = oem_table_id }; + + acpi_table_begin(&table, table_data); + /* Local APIC Address */ + build_append_int_noprefix(table_data, APIC_DEFAULT_ADDRESS, 4); + build_append_int_noprefix(table_data, 1 /* PCAT_COMPAT */, 4); /* Flags */ + + for (i = 0; i < apic_ids->len; i++) { + adevc->madt_cpu(adev, i, apic_ids, table_data, false); + if (apic_ids->cpus[i].arch_id > 254) { + x2apic_mode = true; + } + } + + build_ioapic(table_data, ACPI_BUILD_IOAPIC_ID, IO_APIC_DEFAULT_ADDRESS, 0); + if (x86ms->ioapic2) { + build_ioapic(table_data, ACPI_BUILD_IOAPIC_ID + 1, + IO_APIC_SECONDARY_ADDRESS, IO_APIC_SECONDARY_IRQBASE); + } + + if (x86ms->apic_xrupt_override) { + build_xrupt_override(table_data, 0, 2, + 0 /* Flags: Conforms to the specifications of the bus */); + } + + for (i = 1; i < 16; i++) { + if (!(x86ms->pci_irq_mask & (1 << i))) { + /* No need for a INT source override structure. 
*/ + continue; + } + build_xrupt_override(table_data, i, i, + 0xd /* Flags: Active high, Level Triggered */); + } + + if (x2apic_mode) { + /* Rev 4.0, 5.2.12.13 Local x2APIC NMI Structure*/ + build_append_int_noprefix(table_data, 0xA, 1); /* Type */ + build_append_int_noprefix(table_data, 12, 1); /* Length */ + build_append_int_noprefix(table_data, 0, 2); /* Flags */ + /* ACPI Processor UID */ + build_append_int_noprefix(table_data, 0xFFFFFFFF /* all processors */, + 4); + /* Local x2APIC LINT# */ + build_append_int_noprefix(table_data, 1 /* ACPI_LINT1 */, 1); + build_append_int_noprefix(table_data, 0, 3); /* Reserved */ + } else { + /* Rev 1.0b, 5.2.8.3.3 Local APIC NMI */ + build_append_int_noprefix(table_data, 4, 1); /* Type */ + build_append_int_noprefix(table_data, 6, 1); /* Length */ + /* ACPI Processor ID */ + build_append_int_noprefix(table_data, 0xFF /* all processors */, 1); + build_append_int_noprefix(table_data, 0, 2); /* Flags */ + /* Local APIC INTI# */ + build_append_int_noprefix(table_data, 1 /* ACPI_LINT1 */, 1); + } + + acpi_table_end(linker, &table); +} + diff --git a/hw/i386/acpi-common.h b/hw/i386/acpi-common.h new file mode 100644 index 000000000..a68825acf --- /dev/null +++ b/hw/i386/acpi-common.h @@ -0,0 +1,15 @@ +#ifndef HW_I386_ACPI_COMMON_H +#define HW_I386_ACPI_COMMON_H + +#include "hw/acpi/acpi_dev_interface.h" +#include "hw/acpi/bios-linker-loader.h" +#include "hw/i386/x86.h" + +/* Default IOAPIC ID */ +#define ACPI_BUILD_IOAPIC_ID 0x0 + +void acpi_build_madt(GArray *table_data, BIOSLinker *linker, + X86MachineState *x86ms, AcpiDeviceIf *adev, + const char *oem_id, const char *oem_table_id); + +#endif diff --git a/hw/i386/acpi-microvm.c b/hw/i386/acpi-microvm.c new file mode 100644 index 000000000..196d31849 --- /dev/null +++ b/hw/i386/acpi-microvm.c @@ -0,0 +1,258 @@ +/* Support for generating ACPI tables and passing them to Guests + * + * Copyright (C) 2008-2010 Kevin O'Connor <kevin@koconnor.net> + * Copyright (C) 2006 Fabrice Bellard + * Copyright (C) 2013 Red Hat Inc + * + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "qapi/error.h" + +#include "exec/memory.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/aml-build.h" +#include "hw/acpi/bios-linker-loader.h" +#include "hw/acpi/generic_event_device.h" +#include "hw/acpi/utils.h" +#include "hw/i386/fw_cfg.h" +#include "hw/i386/microvm.h" +#include "hw/pci/pci.h" +#include "hw/pci/pcie_host.h" +#include "hw/usb/xhci.h" +#include "hw/virtio/virtio-mmio.h" + +#include "acpi-common.h" +#include "acpi-microvm.h" + +static void acpi_dsdt_add_virtio(Aml *scope, + MicrovmMachineState *mms) +{ + gchar *separator; + long int index; + BusState *bus; + BusChild *kid; + + bus = sysbus_get_default(); + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + Object *obj = object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MMIO); + + if (obj) { + VirtIOMMIOProxy *mmio = VIRTIO_MMIO(obj); + VirtioBusState *mmio_virtio_bus = &mmio->bus; + BusState *mmio_bus = &mmio_virtio_bus->parent_obj; + + if (QTAILQ_EMPTY(&mmio_bus->children)) { + continue; + } + separator = g_strrstr(mmio_bus->name, "."); + if (!separator) { + continue; + } + if (qemu_strtol(separator + 1, NULL, 10, &index) != 0) { + continue; + } + + uint32_t irq = mms->virtio_irq_base + index; + hwaddr base = VIRTIO_MMIO_BASE + index * 512; + hwaddr size = 512; + + Aml *dev = aml_device("VR%02u", (unsigned)index); + aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0005"))); + aml_append(dev, aml_name_decl("_UID", aml_int(index))); + aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + + Aml *crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(base, size, AML_READ_WRITE)); + aml_append(crs, + aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH, + AML_EXCLUSIVE, &irq, 1)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + } + } +} + +static void acpi_dsdt_add_xhci(Aml *scope, MicrovmMachineState *mms) +{ + if (machine_usb(MACHINE(mms))) { + xhci_sysbus_build_aml(scope, MICROVM_XHCI_BASE, MICROVM_XHCI_IRQ); + } +} + +static void acpi_dsdt_add_pci(Aml *scope, MicrovmMachineState *mms) +{ + if (mms->pcie != ON_OFF_AUTO_ON) { + return; + } + + acpi_dsdt_add_gpex(scope, &mms->gpex); +} + +static void +build_dsdt_microvm(GArray *table_data, BIOSLinker *linker, + MicrovmMachineState *mms) +{ + X86MachineState *x86ms = X86_MACHINE(mms); + Aml *dsdt, *sb_scope, *scope, *pkg; + bool ambiguous; + Object *isabus; + AcpiTable table = { .sig = "DSDT", .rev = 2, .oem_id = x86ms->oem_id, + .oem_table_id = x86ms->oem_table_id }; + + isabus = object_resolve_path_type("", TYPE_ISA_BUS, &ambiguous); + assert(isabus); + assert(!ambiguous); + + acpi_table_begin(&table, table_data); + dsdt = init_aml_allocator(); + + sb_scope = aml_scope("_SB"); + fw_cfg_add_acpi_dsdt(sb_scope, x86ms->fw_cfg); + isa_build_aml(ISA_BUS(isabus), sb_scope); + build_ged_aml(sb_scope, GED_DEVICE, x86ms->acpi_dev, + GED_MMIO_IRQ, AML_SYSTEM_MEMORY, GED_MMIO_BASE); + acpi_dsdt_add_power_button(sb_scope); + acpi_dsdt_add_virtio(sb_scope, mms); + acpi_dsdt_add_xhci(sb_scope, mms); + acpi_dsdt_add_pci(sb_scope, mms); + aml_append(dsdt, sb_scope); + + /* ACPI 5.0: Table 7-209 System State Package */ + scope = aml_scope("\\"); + pkg = aml_package(4); + aml_append(pkg, aml_int(ACPI_GED_SLP_TYP_S5)); + aml_append(pkg, aml_int(0)); /* ignored */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(scope, aml_name_decl("_S5", pkg)); + aml_append(dsdt, scope); + + /* copy AML bytecode 
into ACPI tables blob */ + g_array_append_vals(table_data, dsdt->buf->data, dsdt->buf->len); + + acpi_table_end(linker, &table); + free_aml_allocator(); +} + +static void acpi_build_microvm(AcpiBuildTables *tables, + MicrovmMachineState *mms) +{ + MachineState *machine = MACHINE(mms); + X86MachineState *x86ms = X86_MACHINE(mms); + GArray *table_offsets; + GArray *tables_blob = tables->table_data; + unsigned dsdt, xsdt; + AcpiFadtData pmfadt = { + /* ACPI 5.0: 4.1 Hardware-Reduced ACPI */ + .rev = 5, + .flags = ((1 << ACPI_FADT_F_HW_REDUCED_ACPI) | + (1 << ACPI_FADT_F_RESET_REG_SUP)), + + /* ACPI 5.0: 4.8.3.7 Sleep Control and Status Registers */ + .sleep_ctl = { + .space_id = AML_AS_SYSTEM_MEMORY, + .bit_width = 8, + .address = GED_MMIO_BASE_REGS + ACPI_GED_REG_SLEEP_CTL, + }, + .sleep_sts = { + .space_id = AML_AS_SYSTEM_MEMORY, + .bit_width = 8, + .address = GED_MMIO_BASE_REGS + ACPI_GED_REG_SLEEP_STS, + }, + + /* ACPI 5.0: 4.8.3.6 Reset Register */ + .reset_reg = { + .space_id = AML_AS_SYSTEM_MEMORY, + .bit_width = 8, + .address = GED_MMIO_BASE_REGS + ACPI_GED_REG_RESET, + }, + .reset_val = ACPI_GED_RESET_VALUE, + }; + + table_offsets = g_array_new(false, true /* clear */, + sizeof(uint32_t)); + bios_linker_loader_alloc(tables->linker, + ACPI_BUILD_TABLE_FILE, tables_blob, + 64 /* Ensure FACS is aligned */, + false /* high memory */); + + dsdt = tables_blob->len; + build_dsdt_microvm(tables_blob, tables->linker, mms); + + pmfadt.dsdt_tbl_offset = &dsdt; + pmfadt.xdsdt_tbl_offset = &dsdt; + acpi_add_table(table_offsets, tables_blob); + build_fadt(tables_blob, tables->linker, &pmfadt, x86ms->oem_id, + x86ms->oem_table_id); + + acpi_add_table(table_offsets, tables_blob); + acpi_build_madt(tables_blob, tables->linker, X86_MACHINE(machine), + ACPI_DEVICE_IF(x86ms->acpi_dev), x86ms->oem_id, + x86ms->oem_table_id); + + xsdt = tables_blob->len; + build_xsdt(tables_blob, tables->linker, table_offsets, x86ms->oem_id, + x86ms->oem_table_id); + + /* RSDP is in FSEG memory, so allocate it separately */ + { + AcpiRsdpData rsdp_data = { + /* ACPI 2.0: 5.2.4.3 RSDP Structure */ + .revision = 2, /* xsdt needs v2 */ + .oem_id = x86ms->oem_id, + .xsdt_tbl_offset = &xsdt, + .rsdt_tbl_offset = NULL, + }; + build_rsdp(tables->rsdp, tables->linker, &rsdp_data); + } + + /* Cleanup memory that's no longer used. 
*/ + g_array_free(table_offsets, true); +} + +static void acpi_build_no_update(void *build_opaque) +{ + /* nothing, microvm tables don't change at runtime */ +} + +void acpi_setup_microvm(MicrovmMachineState *mms) +{ + X86MachineState *x86ms = X86_MACHINE(mms); + AcpiBuildTables tables; + + assert(x86ms->fw_cfg); + + if (!x86_machine_is_acpi_enabled(x86ms)) { + return; + } + + acpi_build_tables_init(&tables); + acpi_build_microvm(&tables, mms); + + /* Now expose it all to Guest */ + acpi_add_rom_blob(acpi_build_no_update, NULL, tables.table_data, + ACPI_BUILD_TABLE_FILE); + acpi_add_rom_blob(acpi_build_no_update, NULL, tables.linker->cmd_blob, + ACPI_BUILD_LOADER_FILE); + acpi_add_rom_blob(acpi_build_no_update, NULL, tables.rsdp, + ACPI_BUILD_RSDP_FILE); + + acpi_build_tables_cleanup(&tables, false); +} diff --git a/hw/i386/acpi-microvm.h b/hw/i386/acpi-microvm.h new file mode 100644 index 000000000..dfe853690 --- /dev/null +++ b/hw/i386/acpi-microvm.h @@ -0,0 +1,8 @@ +#ifndef HW_I386_ACPI_MICROVM_H +#define HW_I386_ACPI_MICROVM_H + +#include "hw/i386/microvm.h" + +void acpi_setup_microvm(MicrovmMachineState *mms); + +#endif diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c new file mode 100644 index 000000000..91fe34ae5 --- /dev/null +++ b/hw/i386/amd_iommu.c @@ -0,0 +1,1662 @@ +/* + * QEMU emulation of AMD IOMMU (AMD-Vi) + * + * Copyright (C) 2011 Eduard - Gabriel Munteanu + * Copyright (C) 2015, 2016 David Kiarie Kahurani + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. 
+ * + * Cache implementation inspired by hw/i386/intel_iommu.c + */ + +#include "qemu/osdep.h" +#include "hw/i386/pc.h" +#include "hw/pci/msi.h" +#include "hw/pci/pci_bus.h" +#include "migration/vmstate.h" +#include "amd_iommu.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "hw/i386/apic_internal.h" +#include "trace.h" +#include "hw/i386/apic-msidef.h" + +/* used AMD-Vi MMIO registers */ +const char *amdvi_mmio_low[] = { + "AMDVI_MMIO_DEVTAB_BASE", + "AMDVI_MMIO_CMDBUF_BASE", + "AMDVI_MMIO_EVTLOG_BASE", + "AMDVI_MMIO_CONTROL", + "AMDVI_MMIO_EXCL_BASE", + "AMDVI_MMIO_EXCL_LIMIT", + "AMDVI_MMIO_EXT_FEATURES", + "AMDVI_MMIO_PPR_BASE", + "UNHANDLED" +}; +const char *amdvi_mmio_high[] = { + "AMDVI_MMIO_COMMAND_HEAD", + "AMDVI_MMIO_COMMAND_TAIL", + "AMDVI_MMIO_EVTLOG_HEAD", + "AMDVI_MMIO_EVTLOG_TAIL", + "AMDVI_MMIO_STATUS", + "AMDVI_MMIO_PPR_HEAD", + "AMDVI_MMIO_PPR_TAIL", + "UNHANDLED" +}; + +struct AMDVIAddressSpace { + uint8_t bus_num; /* bus number */ + uint8_t devfn; /* device function */ + AMDVIState *iommu_state; /* AMDVI - one per machine */ + MemoryRegion root; /* AMDVI Root memory map region */ + IOMMUMemoryRegion iommu; /* Device's address translation region */ + MemoryRegion iommu_ir; /* Device's interrupt remapping region */ + AddressSpace as; /* device's corresponding address space */ +}; + +/* AMDVI cache entry */ +typedef struct AMDVIIOTLBEntry { + uint16_t domid; /* assigned domain id */ + uint16_t devid; /* device owning entry */ + uint64_t perms; /* access permissions */ + uint64_t translated_addr; /* translated address */ + uint64_t page_mask; /* physical page size */ +} AMDVIIOTLBEntry; + +/* configure MMIO registers at startup/reset */ +static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val, + uint64_t romask, uint64_t w1cmask) +{ + stq_le_p(&s->mmior[addr], val); + stq_le_p(&s->romask[addr], romask); + stq_le_p(&s->w1cmask[addr], w1cmask); +} + +static uint16_t amdvi_readw(AMDVIState *s, hwaddr addr) +{ + return lduw_le_p(&s->mmior[addr]); +} + +static uint32_t amdvi_readl(AMDVIState *s, hwaddr addr) +{ + return ldl_le_p(&s->mmior[addr]); +} + +static uint64_t amdvi_readq(AMDVIState *s, hwaddr addr) +{ + return ldq_le_p(&s->mmior[addr]); +} + +/* internal write */ +static void amdvi_writeq_raw(AMDVIState *s, hwaddr addr, uint64_t val) +{ + stq_le_p(&s->mmior[addr], val); +} + +/* external write */ +static void amdvi_writew(AMDVIState *s, hwaddr addr, uint16_t val) +{ + uint16_t romask = lduw_le_p(&s->romask[addr]); + uint16_t w1cmask = lduw_le_p(&s->w1cmask[addr]); + uint16_t oldval = lduw_le_p(&s->mmior[addr]); + stw_le_p(&s->mmior[addr], + ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask)); +} + +static void amdvi_writel(AMDVIState *s, hwaddr addr, uint32_t val) +{ + uint32_t romask = ldl_le_p(&s->romask[addr]); + uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); + uint32_t oldval = ldl_le_p(&s->mmior[addr]); + stl_le_p(&s->mmior[addr], + ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask)); +} + +static void amdvi_writeq(AMDVIState *s, hwaddr addr, uint64_t val) +{ + uint64_t romask = ldq_le_p(&s->romask[addr]); + uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); + uint32_t oldval = ldq_le_p(&s->mmior[addr]); + stq_le_p(&s->mmior[addr], + ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask)); +} + +/* OR a 64-bit register with a 64-bit value */ +static bool amdvi_test_mask(AMDVIState *s, hwaddr addr, uint64_t val) +{ + return amdvi_readq(s, addr) | val; +} + +/* OR a 64-bit register with a 64-bit value storing result in the 
register */ +static void amdvi_assign_orq(AMDVIState *s, hwaddr addr, uint64_t val) +{ + amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) | val); +} + +/* AND a 64-bit register with a 64-bit value storing result in the register */ +static void amdvi_assign_andq(AMDVIState *s, hwaddr addr, uint64_t val) +{ + amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) & val); +} + +static void amdvi_generate_msi_interrupt(AMDVIState *s) +{ + MSIMessage msg = {}; + MemTxAttrs attrs = { + .requester_id = pci_requester_id(&s->pci.dev) + }; + + if (msi_enabled(&s->pci.dev)) { + msg = msi_get_message(&s->pci.dev, 0); + address_space_stl_le(&address_space_memory, msg.address, msg.data, + attrs, NULL); + } +} + +static void amdvi_log_event(AMDVIState *s, uint64_t *evt) +{ + /* event logging not enabled */ + if (!s->evtlog_enabled || amdvi_test_mask(s, AMDVI_MMIO_STATUS, + AMDVI_MMIO_STATUS_EVT_OVF)) { + return; + } + + /* event log buffer full */ + if (s->evtlog_tail >= s->evtlog_len) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF); + /* generate interrupt */ + amdvi_generate_msi_interrupt(s); + return; + } + + if (dma_memory_write(&address_space_memory, s->evtlog + s->evtlog_tail, + evt, AMDVI_EVENT_LEN)) { + trace_amdvi_evntlog_fail(s->evtlog, s->evtlog_tail); + } + + s->evtlog_tail += AMDVI_EVENT_LEN; + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT); + amdvi_generate_msi_interrupt(s); +} + +static void amdvi_setevent_bits(uint64_t *buffer, uint64_t value, int start, + int length) +{ + int index = start / 64, bitpos = start % 64; + uint64_t mask = MAKE_64BIT_MASK(start, length); + buffer[index] &= ~mask; + buffer[index] |= (value << bitpos) & mask; +} +/* + * AMDVi event structure + * 0:15 -> DeviceID + * 55:63 -> event type + miscellaneous info + * 63:127 -> related address + */ +static void amdvi_encode_event(uint64_t *evt, uint16_t devid, uint64_t addr, + uint16_t info) +{ + amdvi_setevent_bits(evt, devid, 0, 16); + amdvi_setevent_bits(evt, info, 55, 8); + amdvi_setevent_bits(evt, addr, 63, 64); +} +/* log an error encountered during a page walk + * + * @addr: virtual address in translation request + */ +static void amdvi_page_fault(AMDVIState *s, uint16_t devid, + hwaddr addr, uint16_t info) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF; + amdvi_encode_event(evt, devid, addr, info); + amdvi_log_event(s, evt); + pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); +} +/* + * log a master abort accessing device table + * @devtab : address of device table entry + * @info : error flags + */ +static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid, + hwaddr devtab, uint16_t info) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_DEV_TAB_HW_ERROR; + + amdvi_encode_event(evt, devid, devtab, info); + amdvi_log_event(s, evt); + pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); +} +/* log an event trying to access command buffer + * @addr : address that couldn't be accessed + */ +static void amdvi_log_command_error(AMDVIState *s, hwaddr addr) +{ + uint64_t evt[4], info = AMDVI_EVENT_COMMAND_HW_ERROR; + + amdvi_encode_event(evt, 0, addr, info); + amdvi_log_event(s, evt); + pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); +} +/* log an illegal command event + * @addr : address of illegal command + */ +static void amdvi_log_illegalcom_error(AMDVIState *s, uint16_t info, + hwaddr addr) +{ + uint64_t evt[4]; + + info |=
AMDVI_EVENT_ILLEGAL_COMMAND_ERROR; + amdvi_encode_event(evt, 0, addr, info); + amdvi_log_event(s, evt); +} +/* log an error accessing device table + * + * @devid : device owning the table entry + * @devtab : address of device table entry + * @info : error flags + */ +static void amdvi_log_illegaldevtab_error(AMDVIState *s, uint16_t devid, + hwaddr addr, uint16_t info) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY; + amdvi_encode_event(evt, devid, addr, info); + amdvi_log_event(s, evt); +} +/* log an error accessing a PTE entry + * @addr : address that couldn't be accessed + */ +static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid, + hwaddr addr, uint16_t info) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR; + amdvi_encode_event(evt, devid, addr, info); + amdvi_log_event(s, evt); + pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); +} + +static gboolean amdvi_uint64_equal(gconstpointer v1, gconstpointer v2) +{ + return *((const uint64_t *)v1) == *((const uint64_t *)v2); +} + +static guint amdvi_uint64_hash(gconstpointer v) +{ + return (guint)*(const uint64_t *)v; +} + +static AMDVIIOTLBEntry *amdvi_iotlb_lookup(AMDVIState *s, hwaddr addr, + uint64_t devid) +{ + uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) | + ((uint64_t)(devid) << AMDVI_DEVID_SHIFT); + return g_hash_table_lookup(s->iotlb, &key); +} + +static void amdvi_iotlb_reset(AMDVIState *s) +{ + assert(s->iotlb); + trace_amdvi_iotlb_reset(); + g_hash_table_remove_all(s->iotlb); +} + +static gboolean amdvi_iotlb_remove_by_devid(gpointer key, gpointer value, + gpointer user_data) +{ + AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value; + uint16_t devid = *(uint16_t *)user_data; + return entry->devid == devid; +} + +static void amdvi_iotlb_remove_page(AMDVIState *s, hwaddr addr, + uint64_t devid) +{ + uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) | + ((uint64_t)(devid) << AMDVI_DEVID_SHIFT); + g_hash_table_remove(s->iotlb, &key); +} + +static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid, + uint64_t gpa, IOMMUTLBEntry to_cache, + uint16_t domid) +{ + AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1); + uint64_t *key = g_new(uint64_t, 1); + uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K; + + /* don't cache erroneous translations */ + if (to_cache.perm != IOMMU_NONE) { + trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid), + PCI_FUNC(devid), gpa, to_cache.translated_addr); + + if (g_hash_table_size(s->iotlb) >= AMDVI_IOTLB_MAX_SIZE) { + amdvi_iotlb_reset(s); + } + + entry->domid = domid; + entry->perms = to_cache.perm; + entry->translated_addr = to_cache.translated_addr; + entry->page_mask = to_cache.addr_mask; + *key = gfn | ((uint64_t)(devid) << AMDVI_DEVID_SHIFT); + g_hash_table_replace(s->iotlb, key, entry); + } +} + +static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd) +{ + /* pad the last 3 bits */ + hwaddr addr = cpu_to_le64(extract64(cmd[0], 3, 49)) << 3; + uint64_t data = cpu_to_le64(cmd[1]); + + if (extract64(cmd[0], 52, 8)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + if (extract64(cmd[0], 0, 1)) { + if (dma_memory_write(&address_space_memory, addr, &data, + AMDVI_COMPLETION_DATA_SIZE)) { + trace_amdvi_completion_wait_fail(addr); + } + } + /* set completion interrupt */ + if (extract64(cmd[0], 1, 1)) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT); + /* generate interrupt */ + amdvi_generate_msi_interrupt(s); + } + 
trace_amdvi_completion_wait(addr, data); +} + +/* log error without aborting since linux seems to be using reserved bits */ +static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd) +{ + uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16)); + + /* This command should invalidate internal caches of which there isn't */ + if (extract64(cmd[0], 16, 44) || cmd[1]) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid), + PCI_FUNC(devid)); +} + +static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd) +{ + if (extract64(cmd[0], 16, 16) || extract64(cmd[0], 52, 8) || + extract64(cmd[1], 0, 2) || extract64(cmd[1], 3, 29) + || extract64(cmd[1], 48, 16)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + trace_amdvi_ppr_exec(); +} + +static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd) +{ + if (extract64(cmd[0], 0, 60) || cmd[1]) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + + amdvi_iotlb_reset(s); + trace_amdvi_all_inval(); +} + +static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value, + gpointer user_data) +{ + AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value; + uint16_t domid = *(uint16_t *)user_data; + return entry->domid == domid; +} + +/* we don't have devid - we can't remove pages by address */ +static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd) +{ + uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16)); + + if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 48, 12) || + extract64(cmd[1], 3, 9)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + + g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid, + &domid); + trace_amdvi_pages_inval(domid); +} + +static void amdvi_prefetch_pages(AMDVIState *s, uint64_t *cmd) +{ + if (extract64(cmd[0], 16, 8) || extract64(cmd[0], 52, 8) || + extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) || + extract64(cmd[1], 5, 7)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + + trace_amdvi_prefetch_pages(); +} + +static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd) +{ + if (extract64(cmd[0], 16, 44) || cmd[1]) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + return; + } + + trace_amdvi_intr_inval(); +} + +/* FIXME: Try to work with the specified size instead of all the pages + * when the S bit is on + */ +static void iommu_inval_iotlb(AMDVIState *s, uint64_t *cmd) +{ + + uint16_t devid = extract64(cmd[0], 0, 16); + if (extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) || + extract64(cmd[1], 6, 6)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + return; + } + + if (extract64(cmd[1], 0, 1)) { + g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_devid, + &devid); + } else { + amdvi_iotlb_remove_page(s, cpu_to_le64(extract64(cmd[1], 12, 52)) << 12, + cpu_to_le16(extract64(cmd[1], 0, 16))); + } + trace_amdvi_iotlb_inval(); +} + +/* not honouring reserved bits is regarded as an illegal command */ +static void amdvi_cmdbuf_exec(AMDVIState *s) +{ + uint64_t cmd[2]; + + if (dma_memory_read(&address_space_memory, s->cmdbuf + s->cmdbuf_head, + cmd, AMDVI_COMMAND_SIZE)) { + trace_amdvi_command_read_fail(s->cmdbuf, s->cmdbuf_head); + amdvi_log_command_error(s, s->cmdbuf + s->cmdbuf_head); + return; 
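Illustrative sketch (not part of the patch above): the command buffer is a ring of 16-byte commands in guest memory; amdvi_cmdbuf_run() below consumes entries from the MMIO head toward the guest-written tail and wraps the head after cmdbuf_len entries. A minimal model of the head advance, with the constant taken from AMDVI_COMMAND_SIZE:

#include <stdint.h>

#define CMD_SIZE 16u   /* AMDVI_COMMAND_SIZE: two uint64_t words per command */

/* Advance the ring head past one command, wrapping at n_entries commands. */
static uint32_t cmd_ring_next_head(uint32_t head, uint32_t n_entries)
{
    head += CMD_SIZE;
    if (head >= n_entries * CMD_SIZE) {
        head = 0;
    }
    return head;
}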
+ } + + switch (extract64(cmd[0], 60, 4)) { + case AMDVI_CMD_COMPLETION_WAIT: + amdvi_completion_wait(s, cmd); + break; + case AMDVI_CMD_INVAL_DEVTAB_ENTRY: + amdvi_inval_devtab_entry(s, cmd); + break; + case AMDVI_CMD_INVAL_AMDVI_PAGES: + amdvi_inval_pages(s, cmd); + break; + case AMDVI_CMD_INVAL_IOTLB_PAGES: + iommu_inval_iotlb(s, cmd); + break; + case AMDVI_CMD_INVAL_INTR_TABLE: + amdvi_inval_inttable(s, cmd); + break; + case AMDVI_CMD_PREFETCH_AMDVI_PAGES: + amdvi_prefetch_pages(s, cmd); + break; + case AMDVI_CMD_COMPLETE_PPR_REQUEST: + amdvi_complete_ppr(s, cmd); + break; + case AMDVI_CMD_INVAL_AMDVI_ALL: + amdvi_inval_all(s, cmd); + break; + default: + trace_amdvi_unhandled_command(extract64(cmd[1], 60, 4)); + /* log illegal command */ + amdvi_log_illegalcom_error(s, extract64(cmd[1], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } +} + +static void amdvi_cmdbuf_run(AMDVIState *s) +{ + if (!s->cmdbuf_enabled) { + trace_amdvi_command_error(amdvi_readq(s, AMDVI_MMIO_CONTROL)); + return; + } + + /* check if there is work to do. */ + while (s->cmdbuf_head != s->cmdbuf_tail) { + trace_amdvi_command_exec(s->cmdbuf_head, s->cmdbuf_tail, s->cmdbuf); + amdvi_cmdbuf_exec(s); + s->cmdbuf_head += AMDVI_COMMAND_SIZE; + amdvi_writeq_raw(s, AMDVI_MMIO_COMMAND_HEAD, s->cmdbuf_head); + + /* wrap head pointer */ + if (s->cmdbuf_head >= s->cmdbuf_len * AMDVI_COMMAND_SIZE) { + s->cmdbuf_head = 0; + } + } +} + +static void amdvi_mmio_trace(hwaddr addr, unsigned size) +{ + uint8_t index = (addr & ~0x2000) / 8; + + if ((addr & 0x2000)) { + /* high table */ + index = index >= AMDVI_MMIO_REGS_HIGH ? AMDVI_MMIO_REGS_HIGH : index; + trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07); + } else { + index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index; + trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07); + } +} + +static uint64_t amdvi_mmio_read(void *opaque, hwaddr addr, unsigned size) +{ + AMDVIState *s = opaque; + + uint64_t val = -1; + if (addr + size > AMDVI_MMIO_SIZE) { + trace_amdvi_mmio_read_invalid(AMDVI_MMIO_SIZE, addr, size); + return (uint64_t)-1; + } + + if (size == 2) { + val = amdvi_readw(s, addr); + } else if (size == 4) { + val = amdvi_readl(s, addr); + } else if (size == 8) { + val = amdvi_readq(s, addr); + } + amdvi_mmio_trace(addr, size); + + return val; +} + +static void amdvi_handle_control_write(AMDVIState *s) +{ + unsigned long control = amdvi_readq(s, AMDVI_MMIO_CONTROL); + s->enabled = !!(control & AMDVI_MMIO_CONTROL_AMDVIEN); + + s->ats_enabled = !!(control & AMDVI_MMIO_CONTROL_HTTUNEN); + s->evtlog_enabled = s->enabled && !!(control & + AMDVI_MMIO_CONTROL_EVENTLOGEN); + + s->evtlog_intr = !!(control & AMDVI_MMIO_CONTROL_EVENTINTEN); + s->completion_wait_intr = !!(control & AMDVI_MMIO_CONTROL_COMWAITINTEN); + s->cmdbuf_enabled = s->enabled && !!(control & + AMDVI_MMIO_CONTROL_CMDBUFLEN); + s->ga_enabled = !!(control & AMDVI_MMIO_CONTROL_GAEN); + + /* update the flags depending on the control register */ + if (s->cmdbuf_enabled) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_CMDBUF_RUN); + } else { + amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_CMDBUF_RUN); + } + if (s->evtlog_enabled) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_RUN); + } else { + amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_EVT_RUN); + } + + trace_amdvi_control_status(control); + amdvi_cmdbuf_run(s); +} + +static inline void amdvi_handle_devtab_write(AMDVIState *s) + +{ + uint64_t val = amdvi_readq(s, 
AMDVI_MMIO_DEVICE_TABLE); + s->devtab = (val & AMDVI_MMIO_DEVTAB_BASE_MASK); + + /* set device table length */ + s->devtab_len = ((val & AMDVI_MMIO_DEVTAB_SIZE_MASK) + 1 * + (AMDVI_MMIO_DEVTAB_SIZE_UNIT / + AMDVI_MMIO_DEVTAB_ENTRY_SIZE)); +} + +static inline void amdvi_handle_cmdhead_write(AMDVIState *s) +{ + s->cmdbuf_head = amdvi_readq(s, AMDVI_MMIO_COMMAND_HEAD) + & AMDVI_MMIO_CMDBUF_HEAD_MASK; + amdvi_cmdbuf_run(s); +} + +static inline void amdvi_handle_cmdbase_write(AMDVIState *s) +{ + s->cmdbuf = amdvi_readq(s, AMDVI_MMIO_COMMAND_BASE) + & AMDVI_MMIO_CMDBUF_BASE_MASK; + s->cmdbuf_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_CMDBUF_SIZE_BYTE) + & AMDVI_MMIO_CMDBUF_SIZE_MASK); + s->cmdbuf_head = s->cmdbuf_tail = 0; +} + +static inline void amdvi_handle_cmdtail_write(AMDVIState *s) +{ + s->cmdbuf_tail = amdvi_readq(s, AMDVI_MMIO_COMMAND_TAIL) + & AMDVI_MMIO_CMDBUF_TAIL_MASK; + amdvi_cmdbuf_run(s); +} + +static inline void amdvi_handle_excllim_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_EXCL_LIMIT); + s->excl_limit = (val & AMDVI_MMIO_EXCL_LIMIT_MASK) | + AMDVI_MMIO_EXCL_LIMIT_LOW; +} + +static inline void amdvi_handle_evtbase_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_BASE); + s->evtlog = val & AMDVI_MMIO_EVTLOG_BASE_MASK; + s->evtlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_EVTLOG_SIZE_BYTE) + & AMDVI_MMIO_EVTLOG_SIZE_MASK); +} + +static inline void amdvi_handle_evttail_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_TAIL); + s->evtlog_tail = val & AMDVI_MMIO_EVTLOG_TAIL_MASK; +} + +static inline void amdvi_handle_evthead_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_HEAD); + s->evtlog_head = val & AMDVI_MMIO_EVTLOG_HEAD_MASK; +} + +static inline void amdvi_handle_pprbase_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_BASE); + s->ppr_log = val & AMDVI_MMIO_PPRLOG_BASE_MASK; + s->pprlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_PPRLOG_SIZE_BYTE) + & AMDVI_MMIO_PPRLOG_SIZE_MASK); +} + +static inline void amdvi_handle_pprhead_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_HEAD); + s->pprlog_head = val & AMDVI_MMIO_PPRLOG_HEAD_MASK; +} + +static inline void amdvi_handle_pprtail_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_TAIL); + s->pprlog_tail = val & AMDVI_MMIO_PPRLOG_TAIL_MASK; +} + +/* FIXME: something might go wrong if System Software writes in chunks + * of one byte but linux writes in chunks of 4 bytes so currently it + * works correctly with linux but will definitely be busted if software + * reads/writes 8 bytes + */ +static void amdvi_mmio_reg_write(AMDVIState *s, unsigned size, uint64_t val, + hwaddr addr) +{ + if (size == 2) { + amdvi_writew(s, addr, val); + } else if (size == 4) { + amdvi_writel(s, addr, val); + } else if (size == 8) { + amdvi_writeq(s, addr, val); + } +} + +static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ + AMDVIState *s = opaque; + unsigned long offset = addr & 0x07; + + if (addr + size > AMDVI_MMIO_SIZE) { + trace_amdvi_mmio_write("error: addr outside region: max ", + (uint64_t)AMDVI_MMIO_SIZE, size, val, offset); + return; + } + + amdvi_mmio_trace(addr, size); + switch (addr & ~0x07) { + case AMDVI_MMIO_CONTROL: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_control_write(s); + break; + case AMDVI_MMIO_DEVICE_TABLE: + amdvi_mmio_reg_write(s, size, val, addr); + /* set device table address + * This also suffers from inability to tell 
whether software + * is done writing + */ + if (offset || (size == 8)) { + amdvi_handle_devtab_write(s); + } + break; + case AMDVI_MMIO_COMMAND_HEAD: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_cmdhead_write(s); + break; + case AMDVI_MMIO_COMMAND_BASE: + amdvi_mmio_reg_write(s, size, val, addr); + /* FIXME - make sure System Software has finished writing incase + * it writes in chucks less than 8 bytes in a robust way.As for + * now, this hacks works for the linux driver + */ + if (offset || (size == 8)) { + amdvi_handle_cmdbase_write(s); + } + break; + case AMDVI_MMIO_COMMAND_TAIL: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_cmdtail_write(s); + break; + case AMDVI_MMIO_EVENT_BASE: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_evtbase_write(s); + break; + case AMDVI_MMIO_EVENT_HEAD: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_evthead_write(s); + break; + case AMDVI_MMIO_EVENT_TAIL: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_evttail_write(s); + break; + case AMDVI_MMIO_EXCL_LIMIT: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_excllim_write(s); + break; + /* PPR log base - unused for now */ + case AMDVI_MMIO_PPR_BASE: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_pprbase_write(s); + break; + /* PPR log head - also unused for now */ + case AMDVI_MMIO_PPR_HEAD: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_pprhead_write(s); + break; + /* PPR log tail - unused for now */ + case AMDVI_MMIO_PPR_TAIL: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_pprtail_write(s); + break; + } +} + +static inline uint64_t amdvi_get_perms(uint64_t entry) +{ + return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >> + AMDVI_DEV_PERM_SHIFT; +} + +/* validate that reserved bits are honoured */ +static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid, + uint64_t *dte) +{ + if ((dte[0] & AMDVI_DTE_LOWER_QUAD_RESERVED) + || (dte[1] & AMDVI_DTE_MIDDLE_QUAD_RESERVED) + || (dte[2] & AMDVI_DTE_UPPER_QUAD_RESERVED) || dte[3]) { + amdvi_log_illegaldevtab_error(s, devid, + s->devtab + + devid * AMDVI_DEVTAB_ENTRY_SIZE, 0); + return false; + } + + return true; +} + +/* get a device table entry given the devid */ +static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry) +{ + uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE; + + if (dma_memory_read(&address_space_memory, s->devtab + offset, entry, + AMDVI_DEVTAB_ENTRY_SIZE)) { + trace_amdvi_dte_get_fail(s->devtab, offset); + /* log error accessing dte */ + amdvi_log_devtab_error(s, devid, s->devtab + offset, 0); + return false; + } + + *entry = le64_to_cpu(*entry); + if (!amdvi_validate_dte(s, devid, entry)) { + trace_amdvi_invalid_dte(entry[0]); + return false; + } + + return true; +} + +/* get pte translation mode */ +static inline uint8_t get_pte_translation_mode(uint64_t pte) +{ + return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK; +} + +static inline uint64_t pte_override_page_mask(uint64_t pte) +{ + uint8_t page_mask = 13; + uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) >> 12; + /* find the first zero bit */ + while (addr & 1) { + page_mask++; + addr = addr >> 1; + } + + return ~((1ULL << page_mask) - 1); +} + +static inline uint64_t pte_get_page_mask(uint64_t oldlevel) +{ + return ~((1UL << ((oldlevel * 9) + 3)) - 1); +} + +static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr, + uint16_t devid) +{ + uint64_t pte; + + if (dma_memory_read(&address_space_memory, pte_addr, &pte, sizeof(pte))) { + 
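A quick worked example of the mask arithmetic above (editor's illustration, not part of the patch): pte_get_page_mask() maps the level at which the page walk stopped to an address mask of level*9 + 3 low bits, i.e. 4KiB at level 1, 2MiB at level 2, 1GiB at level 3.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Same computation as pte_get_page_mask() above. */
static uint64_t page_mask_for_level(unsigned level)
{
    return ~((1ULL << (level * 9 + 3)) - 1);
}

int main(void)
{
    for (unsigned l = 1; l <= 3; l++) {
        /* prints ...fff000 (4KiB), ...e00000 (2MiB), ...c0000000 (1GiB) */
        printf("level %u: mask 0x%016" PRIx64 "\n", l, page_mask_for_level(l));
    }
    return 0;
}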
trace_amdvi_get_pte_hwerror(pte_addr); + amdvi_log_pagetab_error(s, devid, pte_addr, 0); + pte = 0; + return pte; + } + + pte = le64_to_cpu(pte); + return pte; +} + +static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte, + IOMMUTLBEntry *ret, unsigned perms, + hwaddr addr) +{ + unsigned level, present, pte_perms, oldlevel; + uint64_t pte = dte[0], pte_addr, page_mask; + + /* make sure the DTE has TV = 1 */ + if (pte & AMDVI_DEV_TRANSLATION_VALID) { + level = get_pte_translation_mode(pte); + if (level >= 7) { + trace_amdvi_mode_invalid(level, addr); + return; + } + if (level == 0) { + goto no_remap; + } + + /* we are at the leaf page table or page table encodes a huge page */ + while (level > 0) { + pte_perms = amdvi_get_perms(pte); + present = pte & 1; + if (!present || perms != (perms & pte_perms)) { + amdvi_page_fault(as->iommu_state, as->devfn, addr, perms); + trace_amdvi_page_fault(addr); + return; + } + + /* go to the next lower level */ + pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK; + /* add offset and load pte */ + pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3; + pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn); + if (!pte) { + return; + } + oldlevel = level; + level = get_pte_translation_mode(pte); + if (level == 0x7) { + break; + } + } + + if (level == 0x7) { + page_mask = pte_override_page_mask(pte); + } else { + page_mask = pte_get_page_mask(oldlevel); + } + + /* get access permissions from pte */ + ret->iova = addr & page_mask; + ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask; + ret->addr_mask = ~page_mask; + ret->perm = amdvi_get_perms(pte); + return; + } +no_remap: + ret->iova = addr & AMDVI_PAGE_MASK_4K; + ret->translated_addr = addr & AMDVI_PAGE_MASK_4K; + ret->addr_mask = ~AMDVI_PAGE_MASK_4K; + ret->perm = amdvi_get_perms(pte); +} + +static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, + bool is_write, IOMMUTLBEntry *ret) +{ + AMDVIState *s = as->iommu_state; + uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn); + AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid); + uint64_t entry[4]; + + if (iotlb_entry) { + trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid), + PCI_FUNC(devid), addr, iotlb_entry->translated_addr); + ret->iova = addr & ~iotlb_entry->page_mask; + ret->translated_addr = iotlb_entry->translated_addr; + ret->addr_mask = iotlb_entry->page_mask; + ret->perm = iotlb_entry->perms; + return; + } + + if (!amdvi_get_dte(s, devid, entry)) { + return; + } + + /* devices with V = 0 are not translated */ + if (!(entry[0] & AMDVI_DEV_VALID)) { + goto out; + } + + amdvi_page_walk(as, entry, ret, + is_write ? 
AMDVI_PERM_WRITE : AMDVI_PERM_READ, addr); + + amdvi_update_iotlb(s, devid, addr, *ret, + entry[1] & AMDVI_DEV_DOMID_ID_MASK); + return; + +out: + ret->iova = addr & AMDVI_PAGE_MASK_4K; + ret->translated_addr = addr & AMDVI_PAGE_MASK_4K; + ret->addr_mask = ~AMDVI_PAGE_MASK_4K; + ret->perm = IOMMU_RW; +} + +static inline bool amdvi_is_interrupt_addr(hwaddr addr) +{ + return addr >= AMDVI_INT_ADDR_FIRST && addr <= AMDVI_INT_ADDR_LAST; +} + +static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr, + IOMMUAccessFlags flag, int iommu_idx) +{ + AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); + AMDVIState *s = as->iommu_state; + IOMMUTLBEntry ret = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = 0, + .addr_mask = ~(hwaddr)0, + .perm = IOMMU_NONE + }; + + if (!s->enabled) { + /* AMDVI disabled - corresponds to iommu=off not + * failure to provide any parameter + */ + ret.iova = addr & AMDVI_PAGE_MASK_4K; + ret.translated_addr = addr & AMDVI_PAGE_MASK_4K; + ret.addr_mask = ~AMDVI_PAGE_MASK_4K; + ret.perm = IOMMU_RW; + return ret; + } else if (amdvi_is_interrupt_addr(addr)) { + ret.iova = addr & AMDVI_PAGE_MASK_4K; + ret.translated_addr = addr & AMDVI_PAGE_MASK_4K; + ret.addr_mask = ~AMDVI_PAGE_MASK_4K; + ret.perm = IOMMU_WO; + return ret; + } + + amdvi_do_translate(as, addr, flag & IOMMU_WO, &ret); + trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn), + PCI_FUNC(as->devfn), addr, ret.translated_addr); + return ret; +} + +static int amdvi_get_irte(AMDVIState *s, MSIMessage *origin, uint64_t *dte, + union irte *irte, uint16_t devid) +{ + uint64_t irte_root, offset; + + irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK; + offset = (origin->data & AMDVI_IRTE_OFFSET) << 2; + + trace_amdvi_ir_irte(irte_root, offset); + + if (dma_memory_read(&address_space_memory, irte_root + offset, + irte, sizeof(*irte))) { + trace_amdvi_ir_err("failed to get irte"); + return -AMDVI_IR_GET_IRTE; + } + + trace_amdvi_ir_irte_val(irte->val); + + return 0; +} + +static int amdvi_int_remap_legacy(AMDVIState *iommu, + MSIMessage *origin, + MSIMessage *translated, + uint64_t *dte, + X86IOMMUIrq *irq, + uint16_t sid) +{ + int ret; + union irte irte; + + /* get interrupt remapping table */ + ret = amdvi_get_irte(iommu, origin, dte, &irte, sid); + if (ret < 0) { + return ret; + } + + if (!irte.fields.valid) { + trace_amdvi_ir_target_abort("RemapEn is disabled"); + return -AMDVI_IR_TARGET_ABORT; + } + + if (irte.fields.guest_mode) { + error_report_once("guest mode is not zero"); + return -AMDVI_IR_ERR; + } + + if (irte.fields.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) { + error_report_once("reserved int_type"); + return -AMDVI_IR_ERR; + } + + irq->delivery_mode = irte.fields.int_type; + irq->vector = irte.fields.vector; + irq->dest_mode = irte.fields.dm; + irq->redir_hint = irte.fields.rq_eoi; + irq->dest = irte.fields.destination; + + return 0; +} + +static int amdvi_get_irte_ga(AMDVIState *s, MSIMessage *origin, uint64_t *dte, + struct irte_ga *irte, uint16_t devid) +{ + uint64_t irte_root, offset; + + irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK; + offset = (origin->data & AMDVI_IRTE_OFFSET) << 4; + trace_amdvi_ir_irte(irte_root, offset); + + if (dma_memory_read(&address_space_memory, irte_root + offset, + irte, sizeof(*irte))) { + trace_amdvi_ir_err("failed to get irte_ga"); + return -AMDVI_IR_GET_IRTE; + } + + trace_amdvi_ir_irte_ga_val(irte->hi.val, irte->lo.val); + return 0; +} + +static int amdvi_int_remap_ga(AMDVIState *iommu, + MSIMessage 
*origin, + MSIMessage *translated, + uint64_t *dte, + X86IOMMUIrq *irq, + uint16_t sid) +{ + int ret; + struct irte_ga irte; + + /* get interrupt remapping table */ + ret = amdvi_get_irte_ga(iommu, origin, dte, &irte, sid); + if (ret < 0) { + return ret; + } + + if (!irte.lo.fields_remap.valid) { + trace_amdvi_ir_target_abort("RemapEn is disabled"); + return -AMDVI_IR_TARGET_ABORT; + } + + if (irte.lo.fields_remap.guest_mode) { + error_report_once("guest mode is not zero"); + return -AMDVI_IR_ERR; + } + + if (irte.lo.fields_remap.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) { + error_report_once("reserved int_type is set"); + return -AMDVI_IR_ERR; + } + + irq->delivery_mode = irte.lo.fields_remap.int_type; + irq->vector = irte.hi.fields.vector; + irq->dest_mode = irte.lo.fields_remap.dm; + irq->redir_hint = irte.lo.fields_remap.rq_eoi; + irq->dest = irte.lo.fields_remap.destination; + + return 0; +} + +static int __amdvi_int_remap_msi(AMDVIState *iommu, + MSIMessage *origin, + MSIMessage *translated, + uint64_t *dte, + X86IOMMUIrq *irq, + uint16_t sid) +{ + int ret; + uint8_t int_ctl; + + int_ctl = (dte[2] >> AMDVI_IR_INTCTL_SHIFT) & 3; + trace_amdvi_ir_intctl(int_ctl); + + switch (int_ctl) { + case AMDVI_IR_INTCTL_PASS: + memcpy(translated, origin, sizeof(*origin)); + return 0; + case AMDVI_IR_INTCTL_REMAP: + break; + case AMDVI_IR_INTCTL_ABORT: + trace_amdvi_ir_target_abort("int_ctl abort"); + return -AMDVI_IR_TARGET_ABORT; + default: + trace_amdvi_ir_err("int_ctl reserved"); + return -AMDVI_IR_ERR; + } + + if (iommu->ga_enabled) { + ret = amdvi_int_remap_ga(iommu, origin, translated, dte, irq, sid); + } else { + ret = amdvi_int_remap_legacy(iommu, origin, translated, dte, irq, sid); + } + + return ret; +} + +/* Interrupt remapping for MSI/MSI-X entry */ +static int amdvi_int_remap_msi(AMDVIState *iommu, + MSIMessage *origin, + MSIMessage *translated, + uint16_t sid) +{ + int ret = 0; + uint64_t pass = 0; + uint64_t dte[4] = { 0 }; + X86IOMMUIrq irq = { 0 }; + uint8_t dest_mode, delivery_mode; + + assert(origin && translated); + + /* + * When IOMMU is enabled, interrupt remap request will come either from + * IO-APIC or PCI device. If interrupt is from PCI device then it will + * have a valid requester id but if the interrupt is from IO-APIC + * then requester id will be invalid. + */ + if (sid == X86_IOMMU_SID_INVALID) { + sid = AMDVI_IOAPIC_SB_DEVID; + } + + trace_amdvi_ir_remap_msi_req(origin->address, origin->data, sid); + + /* check if device table entry is set before we go further. */ + if (!iommu || !iommu->devtab_len) { + memcpy(translated, origin, sizeof(*origin)); + goto out; + } + + if (!amdvi_get_dte(iommu, sid, dte)) { + return -AMDVI_IR_ERR; + } + + /* Check if IR is enabled in DTE */ + if (!(dte[2] & AMDVI_IR_REMAP_ENABLE)) { + memcpy(translated, origin, sizeof(*origin)); + goto out; + } + + /* validate that we are configure with intremap=on */ + if (!x86_iommu_ir_supported(X86_IOMMU_DEVICE(iommu))) { + trace_amdvi_err("Interrupt remapping is enabled in the guest but " + "not in the host. 
Use intremap=on to enable interrupt " + "remapping in amd-iommu."); + return -AMDVI_IR_ERR; + } + + if (origin->address & AMDVI_MSI_ADDR_HI_MASK) { + trace_amdvi_err("MSI address high 32 bits non-zero when " + "Interrupt Remapping enabled."); + return -AMDVI_IR_ERR; + } + + if ((origin->address & AMDVI_MSI_ADDR_LO_MASK) != APIC_DEFAULT_ADDRESS) { + trace_amdvi_err("MSI is not from IOAPIC."); + return -AMDVI_IR_ERR; + } + + /* + * The MSI data register [10:8] are used to get the upstream interrupt type. + * + * See MSI/MSI-X format: + * https://pdfs.semanticscholar.org/presentation/9420/c279e942eca568157711ef5c92b800c40a79.pdf + * (page 5) + */ + delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7; + + switch (delivery_mode) { + case AMDVI_IOAPIC_INT_TYPE_FIXED: + case AMDVI_IOAPIC_INT_TYPE_ARBITRATED: + trace_amdvi_ir_delivery_mode("fixed/arbitrated"); + ret = __amdvi_int_remap_msi(iommu, origin, translated, dte, &irq, sid); + if (ret < 0) { + goto remap_fail; + } else { + /* Translate IRQ to MSI messages */ + x86_iommu_irq_to_msi_message(&irq, translated); + goto out; + } + break; + case AMDVI_IOAPIC_INT_TYPE_SMI: + error_report("SMI is not supported!"); + ret = -AMDVI_IR_ERR; + break; + case AMDVI_IOAPIC_INT_TYPE_NMI: + pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK; + trace_amdvi_ir_delivery_mode("nmi"); + break; + case AMDVI_IOAPIC_INT_TYPE_INIT: + pass = dte[3] & AMDVI_DEV_INT_PASS_MASK; + trace_amdvi_ir_delivery_mode("init"); + break; + case AMDVI_IOAPIC_INT_TYPE_EINT: + pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK; + trace_amdvi_ir_delivery_mode("eint"); + break; + default: + trace_amdvi_ir_delivery_mode("unsupported delivery_mode"); + ret = -AMDVI_IR_ERR; + break; + } + + if (ret < 0) { + goto remap_fail; + } + + /* + * The MSI address register bit[2] is used to get the destination + * mode. The dest_mode 1 is valid for fixed and arbitrated interrupts + * only. 
+ */ + dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1; + if (dest_mode) { + trace_amdvi_ir_err("invalid dest_mode"); + ret = -AMDVI_IR_ERR; + goto remap_fail; + } + + if (pass) { + memcpy(translated, origin, sizeof(*origin)); + } else { + trace_amdvi_ir_err("passthrough is not enabled"); + ret = -AMDVI_IR_ERR; + goto remap_fail; + } + +out: + trace_amdvi_ir_remap_msi(origin->address, origin->data, + translated->address, translated->data); + return 0; + +remap_fail: + return ret; +} + +static int amdvi_int_remap(X86IOMMUState *iommu, + MSIMessage *origin, + MSIMessage *translated, + uint16_t sid) +{ + return amdvi_int_remap_msi(AMD_IOMMU_DEVICE(iommu), origin, + translated, sid); +} + +static MemTxResult amdvi_mem_ir_write(void *opaque, hwaddr addr, + uint64_t value, unsigned size, + MemTxAttrs attrs) +{ + int ret; + MSIMessage from = { 0, 0 }, to = { 0, 0 }; + uint16_t sid = AMDVI_IOAPIC_SB_DEVID; + + from.address = (uint64_t) addr + AMDVI_INT_ADDR_FIRST; + from.data = (uint32_t) value; + + trace_amdvi_mem_ir_write_req(addr, value, size); + + if (!attrs.unspecified) { + /* We have explicit Source ID */ + sid = attrs.requester_id; + } + + ret = amdvi_int_remap_msi(opaque, &from, &to, sid); + if (ret < 0) { + /* TODO: log the event using IOMMU log event interface */ + error_report_once("failed to remap interrupt from devid 0x%x", sid); + return MEMTX_ERROR; + } + + apic_get_class()->send_msi(&to); + + trace_amdvi_mem_ir_write(to.address, to.data); + return MEMTX_OK; +} + +static MemTxResult amdvi_mem_ir_read(void *opaque, hwaddr addr, + uint64_t *data, unsigned size, + MemTxAttrs attrs) +{ + return MEMTX_OK; +} + +static const MemoryRegionOps amdvi_ir_ops = { + .read_with_attrs = amdvi_mem_ir_read, + .write_with_attrs = amdvi_mem_ir_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, + .valid = { + .min_access_size = 4, + .max_access_size = 4, + } +}; + +static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) +{ + char name[128]; + AMDVIState *s = opaque; + AMDVIAddressSpace **iommu_as, *amdvi_dev_as; + int bus_num = pci_bus_num(bus); + + iommu_as = s->address_spaces[bus_num]; + + /* allocate memory during the first run */ + if (!iommu_as) { + iommu_as = g_malloc0(sizeof(AMDVIAddressSpace *) * PCI_DEVFN_MAX); + s->address_spaces[bus_num] = iommu_as; + } + + /* set up AMD-Vi region */ + if (!iommu_as[devfn]) { + snprintf(name, sizeof(name), "amd_iommu_devfn_%d", devfn); + + iommu_as[devfn] = g_malloc0(sizeof(AMDVIAddressSpace)); + iommu_as[devfn]->bus_num = (uint8_t)bus_num; + iommu_as[devfn]->devfn = (uint8_t)devfn; + iommu_as[devfn]->iommu_state = s; + + amdvi_dev_as = iommu_as[devfn]; + + /* + * Memory region relationships looks like (Address range shows + * only lower 32 bits to make it short in length...): + * + * |-----------------+-------------------+----------| + * | Name | Address range | Priority | + * |-----------------+-------------------+----------+ + * | amdvi_root | 00000000-ffffffff | 0 | + * | amdvi_iommu | 00000000-ffffffff | 1 | + * | amdvi_iommu_ir | fee00000-feefffff | 64 | + * |-----------------+-------------------+----------| + */ + memory_region_init_iommu(&amdvi_dev_as->iommu, + sizeof(amdvi_dev_as->iommu), + TYPE_AMD_IOMMU_MEMORY_REGION, + OBJECT(s), + "amd_iommu", UINT64_MAX); + memory_region_init(&amdvi_dev_as->root, OBJECT(s), + "amdvi_root", UINT64_MAX); + address_space_init(&amdvi_dev_as->as, &amdvi_dev_as->root, name); + 
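Illustrative sketch (not part of the patch above): the priority column in the comment table follows the usual overlap rule, i.e. for a given DMA address the highest-priority subregion covering it wins, which is how accesses to the 0xfee00000 window reach amdvi_ir_ops while everything else goes through the IOMMU translate region. A self-contained model of that dispatch:

#include <stddef.h>
#include <stdint.h>

struct region { const char *name; uint64_t base, size; int prio; };

/* Pick the highest-priority region containing addr, NULL if none covers it. */
static const struct region *dispatch(const struct region *r, int n, uint64_t addr)
{
    const struct region *best = NULL;
    for (int i = 0; i < n; i++) {
        if (addr >= r[i].base && addr - r[i].base < r[i].size &&
            (!best || r[i].prio > best->prio)) {
            best = &r[i];
        }
    }
    return best;
}

/*
 * With { "amdvi_iommu", 0, UINT64_MAX, 1 } and
 *      { "amdvi_iommu_ir", 0xfee00000, 0x100000, 64 },
 * dispatch() on 0xfee00020 picks "amdvi_iommu_ir".
 */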
memory_region_init_io(&amdvi_dev_as->iommu_ir, OBJECT(s), + &amdvi_ir_ops, s, "amd_iommu_ir", + AMDVI_INT_ADDR_SIZE); + memory_region_add_subregion_overlap(&amdvi_dev_as->root, + AMDVI_INT_ADDR_FIRST, + &amdvi_dev_as->iommu_ir, + 64); + memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0, + MEMORY_REGION(&amdvi_dev_as->iommu), + 1); + } + return &iommu_as[devfn]->as; +} + +static const MemoryRegionOps mmio_mem_ops = { + .read = amdvi_mmio_read, + .write = amdvi_mmio_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = false, + }, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + } +}; + +static int amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new, + Error **errp) +{ + AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); + + if (new & IOMMU_NOTIFIER_MAP) { + error_setg(errp, + "device %02x.%02x.%x requires iommu notifier which is not " + "currently supported", as->bus_num, PCI_SLOT(as->devfn), + PCI_FUNC(as->devfn)); + return -EINVAL; + } + return 0; +} + +static void amdvi_init(AMDVIState *s) +{ + amdvi_iotlb_reset(s); + + s->devtab_len = 0; + s->cmdbuf_len = 0; + s->cmdbuf_head = 0; + s->cmdbuf_tail = 0; + s->evtlog_head = 0; + s->evtlog_tail = 0; + s->excl_enabled = false; + s->excl_allow = false; + s->mmio_enabled = false; + s->enabled = false; + s->ats_enabled = false; + s->cmdbuf_enabled = false; + + /* reset MMIO */ + memset(s->mmior, 0, AMDVI_MMIO_SIZE); + amdvi_set_quad(s, AMDVI_MMIO_EXT_FEATURES, AMDVI_EXT_FEATURES, + 0xffffffffffffffef, 0); + amdvi_set_quad(s, AMDVI_MMIO_STATUS, 0, 0x98, 0x67); + + /* reset device ident */ + pci_config_set_vendor_id(s->pci.dev.config, PCI_VENDOR_ID_AMD); + pci_config_set_prog_interface(s->pci.dev.config, 00); + pci_config_set_device_id(s->pci.dev.config, s->devid); + pci_config_set_class(s->pci.dev.config, 0x0806); + + /* reset AMDVI specific capabilities, all r/o */ + pci_set_long(s->pci.dev.config + s->capab_offset, AMDVI_CAPAB_FEATURES); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_LOW, + s->mmio.addr & ~(0xffff0000)); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH, + (s->mmio.addr & ~(0xffff)) >> 16); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_RANGE, + 0xff000000); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC, 0); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC, + AMDVI_MAX_PH_ADDR | AMDVI_MAX_GVA_ADDR | AMDVI_MAX_VA_ADDR); +} + +static void amdvi_sysbus_reset(DeviceState *dev) +{ + AMDVIState *s = AMD_IOMMU_DEVICE(dev); + + msi_reset(&s->pci.dev); + amdvi_init(s); +} + +static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) +{ + int ret = 0; + AMDVIState *s = AMD_IOMMU_DEVICE(dev); + MachineState *ms = MACHINE(qdev_get_machine()); + PCMachineState *pcms = PC_MACHINE(ms); + X86MachineState *x86ms = X86_MACHINE(ms); + PCIBus *bus = pcms->bus; + + s->iotlb = g_hash_table_new_full(amdvi_uint64_hash, + amdvi_uint64_equal, g_free, g_free); + + /* This device should take care of IOMMU PCI properties */ + if (!qdev_realize(DEVICE(&s->pci), &bus->qbus, errp)) { + return; + } + ret = pci_add_capability(&s->pci.dev, AMDVI_CAPAB_ID_SEC, 0, + AMDVI_CAPAB_SIZE, errp); + if (ret < 0) { + return; + } + s->capab_offset = ret; + + ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_MSI, 0, + AMDVI_CAPAB_REG_SIZE, errp); + if (ret < 0) { + return; + } + ret = 
pci_add_capability(&s->pci.dev, PCI_CAP_ID_HT, 0, + AMDVI_CAPAB_REG_SIZE, errp); + if (ret < 0) { + return; + } + + /* Pseudo address space under root PCI bus. */ + x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID); + + /* set up MMIO */ + memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio", + AMDVI_MMIO_SIZE); + + sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio); + sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR); + pci_setup_iommu(bus, amdvi_host_dma_iommu, s); + s->devid = object_property_get_int(OBJECT(&s->pci), "addr", &error_abort); + msi_init(&s->pci.dev, 0, 1, true, false, errp); + amdvi_init(s); +} + +static const VMStateDescription vmstate_amdvi_sysbus = { + .name = "amd-iommu", + .unmigratable = 1 +}; + +static void amdvi_sysbus_instance_init(Object *klass) +{ + AMDVIState *s = AMD_IOMMU_DEVICE(klass); + + object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI); +} + +static void amdvi_sysbus_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + X86IOMMUClass *dc_class = X86_IOMMU_DEVICE_CLASS(klass); + + dc->reset = amdvi_sysbus_reset; + dc->vmsd = &vmstate_amdvi_sysbus; + dc->hotpluggable = false; + dc_class->realize = amdvi_sysbus_realize; + dc_class->int_remap = amdvi_int_remap; + /* Supported by the pc-q35-* machine types */ + dc->user_creatable = true; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device"; +} + +static const TypeInfo amdvi_sysbus = { + .name = TYPE_AMD_IOMMU_DEVICE, + .parent = TYPE_X86_IOMMU_DEVICE, + .instance_size = sizeof(AMDVIState), + .instance_init = amdvi_sysbus_instance_init, + .class_init = amdvi_sysbus_class_init +}; + +static void amdvi_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device"; +} + +static const TypeInfo amdvi_pci = { + .name = TYPE_AMD_IOMMU_PCI, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(AMDVIPCIState), + .class_init = amdvi_pci_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, + }, +}; + +static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, void *data) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); + + imrc->translate = amdvi_translate; + imrc->notify_flag_changed = amdvi_iommu_notify_flag_changed; +} + +static const TypeInfo amdvi_iommu_memory_region_info = { + .parent = TYPE_IOMMU_MEMORY_REGION, + .name = TYPE_AMD_IOMMU_MEMORY_REGION, + .class_init = amdvi_iommu_memory_region_class_init, +}; + +static void amdvi_register_types(void) +{ + type_register_static(&amdvi_pci); + type_register_static(&amdvi_sysbus); + type_register_static(&amdvi_iommu_memory_region_info); +} + +type_init(amdvi_register_types); diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h new file mode 100644 index 000000000..79d38a3e4 --- /dev/null +++ b/hw/i386/amd_iommu.h @@ -0,0 +1,372 @@ +/* + * QEMU emulation of an AMD IOMMU (AMD-Vi) + * + * Copyright (C) 2011 Eduard - Gabriel Munteanu + * Copyright (C) 2015, 2016 David Kiarie Kahurani + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef AMD_IOMMU_H +#define AMD_IOMMU_H + +#include "hw/pci/pci.h" +#include "hw/i386/x86-iommu.h" +#include "qom/object.h" + +/* Capability registers */ +#define AMDVI_CAPAB_BAR_LOW 0x04 +#define AMDVI_CAPAB_BAR_HIGH 0x08 +#define AMDVI_CAPAB_RANGE 0x0C +#define AMDVI_CAPAB_MISC 0x10 + +#define AMDVI_CAPAB_SIZE 0x18 +#define AMDVI_CAPAB_REG_SIZE 0x04 + +/* Capability header data */ +#define AMDVI_CAPAB_ID_SEC 0xf +#define AMDVI_CAPAB_FLAT_EXT (1 << 28) +#define AMDVI_CAPAB_EFR_SUP (1 << 27) +#define AMDVI_CAPAB_FLAG_NPCACHE (1 << 26) +#define AMDVI_CAPAB_FLAG_HTTUNNEL (1 << 25) +#define AMDVI_CAPAB_FLAG_IOTLBSUP (1 << 24) +#define AMDVI_CAPAB_INIT_TYPE (3 << 16) + +/* No. of used MMIO registers */ +#define AMDVI_MMIO_REGS_HIGH 7 +#define AMDVI_MMIO_REGS_LOW 8 + +/* MMIO registers */ +#define AMDVI_MMIO_DEVICE_TABLE 0x0000 +#define AMDVI_MMIO_COMMAND_BASE 0x0008 +#define AMDVI_MMIO_EVENT_BASE 0x0010 +#define AMDVI_MMIO_CONTROL 0x0018 +#define AMDVI_MMIO_EXCL_BASE 0x0020 +#define AMDVI_MMIO_EXCL_LIMIT 0x0028 +#define AMDVI_MMIO_EXT_FEATURES 0x0030 +#define AMDVI_MMIO_COMMAND_HEAD 0x2000 +#define AMDVI_MMIO_COMMAND_TAIL 0x2008 +#define AMDVI_MMIO_EVENT_HEAD 0x2010 +#define AMDVI_MMIO_EVENT_TAIL 0x2018 +#define AMDVI_MMIO_STATUS 0x2020 +#define AMDVI_MMIO_PPR_BASE 0x0038 +#define AMDVI_MMIO_PPR_HEAD 0x2030 +#define AMDVI_MMIO_PPR_TAIL 0x2038 + +#define AMDVI_MMIO_SIZE 0x4000 + +#define AMDVI_MMIO_DEVTAB_SIZE_MASK ((1ULL << 12) - 1) +#define AMDVI_MMIO_DEVTAB_BASE_MASK (((1ULL << 52) - 1) & ~ \ + AMDVI_MMIO_DEVTAB_SIZE_MASK) +#define AMDVI_MMIO_DEVTAB_ENTRY_SIZE 32 +#define AMDVI_MMIO_DEVTAB_SIZE_UNIT 4096 + +/* some of this are similar but just for readability */ +#define AMDVI_MMIO_CMDBUF_SIZE_BYTE (AMDVI_MMIO_COMMAND_BASE + 7) +#define AMDVI_MMIO_CMDBUF_SIZE_MASK 0x0f +#define AMDVI_MMIO_CMDBUF_BASE_MASK AMDVI_MMIO_DEVTAB_BASE_MASK +#define AMDVI_MMIO_CMDBUF_HEAD_MASK (((1ULL << 19) - 1) & ~0x0f) +#define AMDVI_MMIO_CMDBUF_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK + +#define AMDVI_MMIO_EVTLOG_SIZE_BYTE (AMDVI_MMIO_EVENT_BASE + 7) +#define AMDVI_MMIO_EVTLOG_SIZE_MASK AMDVI_MMIO_CMDBUF_SIZE_MASK +#define AMDVI_MMIO_EVTLOG_BASE_MASK AMDVI_MMIO_CMDBUF_BASE_MASK +#define AMDVI_MMIO_EVTLOG_HEAD_MASK (((1ULL << 19) - 1) & ~0x0f) +#define AMDVI_MMIO_EVTLOG_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK + +#define AMDVI_MMIO_PPRLOG_SIZE_BYTE (AMDVI_MMIO_EVENT_BASE + 7) +#define AMDVI_MMIO_PPRLOG_HEAD_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK +#define AMDVI_MMIO_PPRLOG_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK +#define AMDVI_MMIO_PPRLOG_BASE_MASK AMDVI_MMIO_EVTLOG_BASE_MASK +#define AMDVI_MMIO_PPRLOG_SIZE_MASK AMDVI_MMIO_EVTLOG_SIZE_MASK + +#define AMDVI_MMIO_EXCL_ENABLED_MASK (1ULL << 0) +#define AMDVI_MMIO_EXCL_ALLOW_MASK (1ULL << 1) +#define AMDVI_MMIO_EXCL_LIMIT_MASK AMDVI_MMIO_DEVTAB_BASE_MASK +#define AMDVI_MMIO_EXCL_LIMIT_LOW 0xfff + +/* mmio control register flags */ +#define AMDVI_MMIO_CONTROL_AMDVIEN (1ULL << 0) +#define AMDVI_MMIO_CONTROL_HTTUNEN (1ULL << 1) +#define AMDVI_MMIO_CONTROL_EVENTLOGEN (1ULL << 2) +#define AMDVI_MMIO_CONTROL_EVENTINTEN (1ULL << 3) +#define AMDVI_MMIO_CONTROL_COMWAITINTEN (1ULL << 4) 
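Editor's illustration (not part of the patch above): the *_SIZE_BYTE offsets point at byte 7 of the base registers, and the low four bits of that byte give a power of two; amdvi_handle_cmdbase_write() and amdvi_handle_evtbase_write() earlier in this diff expand that value n into a ring of 2^n entries. For the command ring:

#include <stdint.h>

#define CMDBUF_SIZE_MASK 0x0f   /* AMDVI_MMIO_CMDBUF_SIZE_MASK */
#define COMMAND_SIZE     16     /* AMDVI_COMMAND_SIZE, bytes per entry */

/* Bytes of guest memory backing the command ring for a given size byte. */
static uint64_t cmdbuf_bytes(uint8_t size_byte)
{
    uint64_t entries = 1ULL << (size_byte & CMDBUF_SIZE_MASK);
    return entries * COMMAND_SIZE;   /* e.g. size byte 8 -> 256 * 16 = 4096 */
}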
+#define AMDVI_MMIO_CONTROL_CMDBUFLEN (1ULL << 12) +#define AMDVI_MMIO_CONTROL_GAEN (1ULL << 17) + +/* MMIO status register bits */ +#define AMDVI_MMIO_STATUS_CMDBUF_RUN (1 << 4) +#define AMDVI_MMIO_STATUS_EVT_RUN (1 << 3) +#define AMDVI_MMIO_STATUS_COMP_INT (1 << 2) +#define AMDVI_MMIO_STATUS_EVT_OVF (1 << 0) + +#define AMDVI_CMDBUF_ID_BYTE 0x07 +#define AMDVI_CMDBUF_ID_RSHIFT 4 + +#define AMDVI_CMD_COMPLETION_WAIT 0x01 +#define AMDVI_CMD_INVAL_DEVTAB_ENTRY 0x02 +#define AMDVI_CMD_INVAL_AMDVI_PAGES 0x03 +#define AMDVI_CMD_INVAL_IOTLB_PAGES 0x04 +#define AMDVI_CMD_INVAL_INTR_TABLE 0x05 +#define AMDVI_CMD_PREFETCH_AMDVI_PAGES 0x06 +#define AMDVI_CMD_COMPLETE_PPR_REQUEST 0x07 +#define AMDVI_CMD_INVAL_AMDVI_ALL 0x08 + +#define AMDVI_DEVTAB_ENTRY_SIZE 32 + +/* Device table entry bits 0:63 */ +#define AMDVI_DEV_VALID (1ULL << 0) +#define AMDVI_DEV_TRANSLATION_VALID (1ULL << 1) +#define AMDVI_DEV_MODE_MASK 0x7 +#define AMDVI_DEV_MODE_RSHIFT 9 +#define AMDVI_DEV_PT_ROOT_MASK 0xffffffffff000 +#define AMDVI_DEV_PT_ROOT_RSHIFT 12 +#define AMDVI_DEV_PERM_SHIFT 61 +#define AMDVI_DEV_PERM_READ (1ULL << 61) +#define AMDVI_DEV_PERM_WRITE (1ULL << 62) + +/* Device table entry bits 64:127 */ +#define AMDVI_DEV_DOMID_ID_MASK ((1ULL << 16) - 1) + +/* Event codes and flags, as stored in the info field */ +#define AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY (0x1U << 12) +#define AMDVI_EVENT_IOPF (0x2U << 12) +#define AMDVI_EVENT_IOPF_I (1U << 3) +#define AMDVI_EVENT_DEV_TAB_HW_ERROR (0x3U << 12) +#define AMDVI_EVENT_PAGE_TAB_HW_ERROR (0x4U << 12) +#define AMDVI_EVENT_ILLEGAL_COMMAND_ERROR (0x5U << 12) +#define AMDVI_EVENT_COMMAND_HW_ERROR (0x6U << 12) + +#define AMDVI_EVENT_LEN 16 +#define AMDVI_PERM_READ (1 << 0) +#define AMDVI_PERM_WRITE (1 << 1) + +#define AMDVI_FEATURE_PREFETCH (1ULL << 0) /* page prefetch */ +#define AMDVI_FEATURE_PPR (1ULL << 1) /* PPR Support */ +#define AMDVI_FEATURE_GT (1ULL << 4) /* Guest Translation */ +#define AMDVI_FEATURE_IA (1ULL << 6) /* inval all support */ +#define AMDVI_FEATURE_GA (1ULL << 7) /* guest VAPIC support */ +#define AMDVI_FEATURE_HE (1ULL << 8) /* hardware error regs */ +#define AMDVI_FEATURE_PC (1ULL << 9) /* Perf counters */ + +/* reserved DTE bits */ +#define AMDVI_DTE_LOWER_QUAD_RESERVED 0x80300000000000fc +#define AMDVI_DTE_MIDDLE_QUAD_RESERVED 0x0000000000000100 +#define AMDVI_DTE_UPPER_QUAD_RESERVED 0x08f0000000000000 + +/* AMDVI paging mode */ +#define AMDVI_GATS_MODE (2ULL << 12) +#define AMDVI_HATS_MODE (2ULL << 10) + +/* IOTLB */ +#define AMDVI_IOTLB_MAX_SIZE 1024 +#define AMDVI_DEVID_SHIFT 36 + +/* extended feature support */ +#define AMDVI_EXT_FEATURES (AMDVI_FEATURE_PREFETCH | AMDVI_FEATURE_PPR | \ + AMDVI_FEATURE_IA | AMDVI_FEATURE_GT | AMDVI_FEATURE_HE | \ + AMDVI_GATS_MODE | AMDVI_HATS_MODE | AMDVI_FEATURE_GA) + +/* capabilities header */ +#define AMDVI_CAPAB_FEATURES (AMDVI_CAPAB_FLAT_EXT | \ + AMDVI_CAPAB_FLAG_NPCACHE | AMDVI_CAPAB_FLAG_IOTLBSUP \ + | AMDVI_CAPAB_ID_SEC | AMDVI_CAPAB_INIT_TYPE | \ + AMDVI_CAPAB_FLAG_HTTUNNEL | AMDVI_CAPAB_EFR_SUP) + +/* AMDVI default address */ +#define AMDVI_BASE_ADDR 0xfed80000 + +/* page management constants */ +#define AMDVI_PAGE_SHIFT 12 +#define AMDVI_PAGE_SIZE (1ULL << AMDVI_PAGE_SHIFT) + +#define AMDVI_PAGE_SHIFT_4K 12 +#define AMDVI_PAGE_MASK_4K (~((1ULL << AMDVI_PAGE_SHIFT_4K) - 1)) + +#define AMDVI_MAX_VA_ADDR (48UL << 5) +#define AMDVI_MAX_PH_ADDR (40UL << 8) +#define AMDVI_MAX_GVA_ADDR (48UL << 15) + +/* Completion Wait data size */ +#define AMDVI_COMPLETION_DATA_SIZE 8 + +#define AMDVI_COMMAND_SIZE 16 +/* 
Completion Wait data size */ +#define AMDVI_COMPLETION_DATA_SIZE 8 + +#define AMDVI_COMMAND_SIZE 16 + +#define AMDVI_INT_ADDR_FIRST 0xfee00000 +#define AMDVI_INT_ADDR_LAST 0xfeefffff +#define AMDVI_INT_ADDR_SIZE (AMDVI_INT_ADDR_LAST - AMDVI_INT_ADDR_FIRST + 1) +#define AMDVI_MSI_ADDR_HI_MASK (0xffffffff00000000ULL) +#define AMDVI_MSI_ADDR_LO_MASK (0x00000000ffffffffULL) + +/* SB IOAPIC is always on this device in AMD systems */ +#define AMDVI_IOAPIC_SB_DEVID PCI_BUILD_BDF(0, PCI_DEVFN(0x14, 0)) + +/* Interrupt remapping errors */ +#define AMDVI_IR_ERR 0x1 +#define AMDVI_IR_GET_IRTE 0x2 +#define AMDVI_IR_TARGET_ABORT 0x3 + +/* Interrupt remapping */ +#define AMDVI_IR_REMAP_ENABLE 1ULL +#define AMDVI_IR_INTCTL_SHIFT 60 +#define AMDVI_IR_INTCTL_ABORT 0 +#define AMDVI_IR_INTCTL_PASS 1 +#define AMDVI_IR_INTCTL_REMAP 2 + +#define AMDVI_IR_PHYS_ADDR_MASK (((1ULL << 45) - 1) << 6) + +/* MSI data 10:0 bits (section 2.2.5.1 Fig 14) */ +#define AMDVI_IRTE_OFFSET 0x7ff + +/* Delivery mode of MSI data (same as IOAPIC deilver mode encoding) */ +#define AMDVI_IOAPIC_INT_TYPE_FIXED 0x0 +#define AMDVI_IOAPIC_INT_TYPE_ARBITRATED 0x1 +#define AMDVI_IOAPIC_INT_TYPE_SMI 0x2 +#define AMDVI_IOAPIC_INT_TYPE_NMI 0x4 +#define AMDVI_IOAPIC_INT_TYPE_INIT 0x5 +#define AMDVI_IOAPIC_INT_TYPE_EINT 0x7 + +/* Pass through interrupt */ +#define AMDVI_DEV_INT_PASS_MASK (1ULL << 56) +#define AMDVI_DEV_EINT_PASS_MASK (1ULL << 57) +#define AMDVI_DEV_NMI_PASS_MASK (1ULL << 58) +#define AMDVI_DEV_LINT0_PASS_MASK (1ULL << 62) +#define AMDVI_DEV_LINT1_PASS_MASK (1ULL << 63) + +/* Interrupt remapping table fields (Guest VAPIC not enabled) */ +union irte { + uint32_t val; + struct { + uint32_t valid:1, + no_fault:1, + int_type:3, + rq_eoi:1, + dm:1, + guest_mode:1, + destination:8, + vector:8, + rsvd:8; + } fields; +}; + +/* Interrupt remapping table fields (Guest VAPIC is enabled) */ +union irte_ga_lo { + uint64_t val; + + /* For int remapping */ + struct { + uint64_t valid:1, + no_fault:1, + /* ------ */ + int_type:3, + rq_eoi:1, + dm:1, + /* ------ */ + guest_mode:1, + destination:8, + rsvd_1:48; + } fields_remap; +}; + +union irte_ga_hi { + uint64_t val; + struct { + uint64_t vector:8, + rsvd_2:56; + } fields; +}; + +struct irte_ga { + union irte_ga_lo lo; + union irte_ga_hi hi; +}; + +#define TYPE_AMD_IOMMU_DEVICE "amd-iommu" +OBJECT_DECLARE_SIMPLE_TYPE(AMDVIState, AMD_IOMMU_DEVICE) + +#define TYPE_AMD_IOMMU_PCI "AMDVI-PCI" + +#define TYPE_AMD_IOMMU_MEMORY_REGION "amd-iommu-iommu-memory-region" + +typedef struct AMDVIAddressSpace AMDVIAddressSpace; + +/* functions to steal PCI config space */ +typedef struct AMDVIPCIState { + PCIDevice dev; /* The PCI device itself */ +} AMDVIPCIState; + +struct AMDVIState { + X86IOMMUState iommu; /* IOMMU bus device */ + AMDVIPCIState pci; /* IOMMU PCI device */ + + uint32_t version; + uint32_t capab_offset; /* capability offset pointer */ + + uint64_t mmio_addr; + + uint32_t devid; /* auto-assigned devid */ + + bool enabled; /* IOMMU enabled */ + bool ats_enabled; /* address translation enabled */ + bool cmdbuf_enabled; /* command buffer enabled */ + bool evtlog_enabled; /* event log enabled */ + bool excl_enabled; + + hwaddr devtab; /* base address device table */ + size_t devtab_len; /* device table length */ + + hwaddr cmdbuf; /* command buffer base address */ + uint64_t cmdbuf_len; /* command buffer length */ + uint32_t cmdbuf_head; /* current IOMMU read position */ + uint32_t cmdbuf_tail; /* next Software write position */ + bool completion_wait_intr; + + hwaddr evtlog; /* base address 
event log */ + bool evtlog_intr; + uint32_t evtlog_len; /* event log length */ + uint32_t evtlog_head; /* current IOMMU write position */ + uint32_t evtlog_tail; /* current Software read position */ + + /* unused for now */ + hwaddr excl_base; /* base DVA - IOMMU exclusion range */ + hwaddr excl_limit; /* limit of IOMMU exclusion range */ + bool excl_allow; /* translate accesses to the exclusion range */ + bool excl_enable; /* exclusion range enabled */ + + hwaddr ppr_log; /* base address ppr log */ + uint32_t pprlog_len; /* ppr log len */ + uint32_t pprlog_head; /* ppr log head */ + uint32_t pprlog_tail; /* ppr log tail */ + + MemoryRegion mmio; /* MMIO region */ + uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */ + uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */ + uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */ + bool mmio_enabled; + + /* for each served device */ + AMDVIAddressSpace **address_spaces[PCI_BUS_MAX]; + + /* IOTLB */ + GHashTable *iotlb; + + /* Interrupt remapping */ + bool ga_enabled; +}; + +#endif diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c new file mode 100644 index 000000000..bcf9eaf83 --- /dev/null +++ b/hw/i386/e820_memory_layout.c @@ -0,0 +1,59 @@ +/* + * QEMU BIOS e820 routines + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * SPDX-License-Identifier: MIT + */ + +#include "qemu/osdep.h" +#include "qemu/bswap.h" +#include "e820_memory_layout.h" + +static size_t e820_entries; +struct e820_table e820_reserve; +struct e820_entry *e820_table; + +int e820_add_entry(uint64_t address, uint64_t length, uint32_t type) +{ + int index = le32_to_cpu(e820_reserve.count); + struct e820_entry *entry; + + if (type != E820_RAM) { + /* old FW_CFG_E820_TABLE entry -- reservations only */ + if (index >= E820_NR_ENTRIES) { + return -EBUSY; + } + entry = &e820_reserve.entry[index++]; + + entry->address = cpu_to_le64(address); + entry->length = cpu_to_le64(length); + entry->type = cpu_to_le32(type); + + e820_reserve.count = cpu_to_le32(index); + } + + /* new "etc/e820" file -- include ram too */ + e820_table = g_renew(struct e820_entry, e820_table, e820_entries + 1); + e820_table[e820_entries].address = cpu_to_le64(address); + e820_table[e820_entries].length = cpu_to_le64(length); + e820_table[e820_entries].type = cpu_to_le32(type); + e820_entries++; + + return e820_entries; +} + +int e820_get_num_entries(void) +{ + return e820_entries; +} + +bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length) +{ + if (idx < e820_entries && e820_table[idx].type == cpu_to_le32(type)) { + *address = le64_to_cpu(e820_table[idx].address); + *length = le64_to_cpu(e820_table[idx].length); + return true; + } + return false; +} diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h new file mode 100644 index 000000000..2a0ceb8b9 --- /dev/null +++ b/hw/i386/e820_memory_layout.h @@ -0,0 +1,42 @@ +/* + * QEMU BIOS e820 routines + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * SPDX-License-Identifier: MIT + */ + +#ifndef HW_I386_E820_H +#define HW_I386_E820_H + +/* e820 types */ +#define E820_RAM 1 +#define E820_RESERVED 2 +#define E820_ACPI 3 +#define E820_NVS 4 +#define E820_UNUSABLE 5 + +#define E820_NR_ENTRIES 16 + +struct e820_entry { + uint64_t address; + uint64_t length; + uint32_t type; +} QEMU_PACKED __attribute((__aligned__(4))); + +struct e820_table { + uint32_t count; + struct e820_entry entry[E820_NR_ENTRIES]; +} QEMU_PACKED __attribute((__aligned__(4))); + +extern struct e820_table 
e820_reserve; +extern struct e820_entry *e820_table; + +int e820_add_entry(uint64_t address, uint64_t length, uint32_t type); +int e820_get_num_entries(void); +bool e820_get_entry(int index, uint32_t type, + uint64_t *address, uint64_t *length); + + + +#endif diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c new file mode 100644 index 000000000..a283785a8 --- /dev/null +++ b/hw/i386/fw_cfg.c @@ -0,0 +1,221 @@ +/* + * QEMU fw_cfg helpers (X86 specific) + * + * Copyright (c) 2019 Red Hat, Inc. + * + * Author: + * Philippe Mathieu-Daudé <philmd@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "sysemu/numa.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/aml-build.h" +#include "hw/firmware/smbios.h" +#include "hw/i386/fw_cfg.h" +#include "hw/timer/hpet.h" +#include "hw/nvram/fw_cfg.h" +#include "e820_memory_layout.h" +#include "kvm/kvm_i386.h" +#include "qapi/error.h" +#include CONFIG_DEVICES + +struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX}; + +const char *fw_cfg_arch_key_name(uint16_t key) +{ + static const struct { + uint16_t key; + const char *name; + } fw_cfg_arch_wellknown_keys[] = { + {FW_CFG_ACPI_TABLES, "acpi_tables"}, + {FW_CFG_SMBIOS_ENTRIES, "smbios_entries"}, + {FW_CFG_IRQ0_OVERRIDE, "irq0_override"}, + {FW_CFG_E820_TABLE, "e820_table"}, + {FW_CFG_HPET, "hpet"}, + }; + + for (size_t i = 0; i < ARRAY_SIZE(fw_cfg_arch_wellknown_keys); i++) { + if (fw_cfg_arch_wellknown_keys[i].key == key) { + return fw_cfg_arch_wellknown_keys[i].name; + } + } + return NULL; +} + +void fw_cfg_build_smbios(MachineState *ms, FWCfgState *fw_cfg) +{ +#ifdef CONFIG_SMBIOS + uint8_t *smbios_tables, *smbios_anchor; + size_t smbios_tables_len, smbios_anchor_len; + struct smbios_phys_mem_area *mem_array; + unsigned i, array_count; + X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu); + + /* tell smbios about cpuid version and features */ + smbios_set_cpuid(cpu->env.cpuid_version, cpu->env.features[FEAT_1_EDX]); + + smbios_tables = smbios_get_table_legacy(ms, &smbios_tables_len); + if (smbios_tables) { + fw_cfg_add_bytes(fw_cfg, FW_CFG_SMBIOS_ENTRIES, + smbios_tables, smbios_tables_len); + } + + /* build the array of physical mem area from e820 table */ + mem_array = g_malloc0(sizeof(*mem_array) * e820_get_num_entries()); + for (i = 0, array_count = 0; i < e820_get_num_entries(); i++) { + uint64_t addr, len; + + if (e820_get_entry(i, E820_RAM, &addr, &len)) { + mem_array[array_count].address = addr; + mem_array[array_count].length = len; + array_count++; + } + } + smbios_get_tables(ms, mem_array, array_count, + &smbios_tables, &smbios_tables_len, + &smbios_anchor, &smbios_anchor_len, + &error_fatal); + g_free(mem_array); + + if (smbios_anchor) { + fw_cfg_add_file(fw_cfg, "etc/smbios/smbios-tables", + smbios_tables, smbios_tables_len); + fw_cfg_add_file(fw_cfg, "etc/smbios/smbios-anchor", + smbios_anchor, smbios_anchor_len); + } +#endif +} + +FWCfgState *fw_cfg_arch_create(MachineState *ms, + uint16_t boot_cpus, + uint16_t apic_id_limit) +{ + FWCfgState *fw_cfg; + uint64_t *numa_fw_cfg; + int i; + MachineClass *mc = MACHINE_GET_CLASS(ms); + const CPUArchIdList *cpus = mc->possible_cpu_arch_ids(ms); + int nb_numa_nodes = ms->numa_state->num_nodes; + + fw_cfg = fw_cfg_init_io_dma(FW_CFG_IO_BASE, FW_CFG_IO_BASE + 4, + &address_space_memory); + fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, boot_cpus); + + /* FW_CFG_MAX_CPUS is 
a bit confusing/problematic on x86: + * + * For machine types prior to 1.8, SeaBIOS needs FW_CFG_MAX_CPUS for + * building MPTable, ACPI MADT, ACPI CPU hotplug and ACPI SRAT table, + * that tables are based on xAPIC ID and QEMU<->SeaBIOS interface + * for CPU hotplug also uses APIC ID and not "CPU index". + * This means that FW_CFG_MAX_CPUS is not the "maximum number of CPUs", + * but the "limit to the APIC ID values SeaBIOS may see". + * + * So for compatibility reasons with old BIOSes we are stuck with + * "etc/max-cpus" actually being apic_id_limit + */ + fw_cfg_add_i16(fw_cfg, FW_CFG_MAX_CPUS, apic_id_limit); + fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, ms->ram_size); +#ifdef CONFIG_ACPI + fw_cfg_add_bytes(fw_cfg, FW_CFG_ACPI_TABLES, + acpi_tables, acpi_tables_len); +#endif + fw_cfg_add_i32(fw_cfg, FW_CFG_IRQ0_OVERRIDE, 1); + + fw_cfg_add_bytes(fw_cfg, FW_CFG_E820_TABLE, + &e820_reserve, sizeof(e820_reserve)); + fw_cfg_add_file(fw_cfg, "etc/e820", e820_table, + sizeof(struct e820_entry) * e820_get_num_entries()); + + fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, &hpet_cfg, sizeof(hpet_cfg)); + /* allocate memory for the NUMA channel: one (64bit) word for the number + * of nodes, one word for each VCPU->node and one word for each node to + * hold the amount of memory. + */ + numa_fw_cfg = g_new0(uint64_t, 1 + apic_id_limit + nb_numa_nodes); + numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes); + for (i = 0; i < cpus->len; i++) { + unsigned int apic_id = cpus->cpus[i].arch_id; + assert(apic_id < apic_id_limit); + numa_fw_cfg[apic_id + 1] = cpu_to_le64(cpus->cpus[i].props.node_id); + } + for (i = 0; i < nb_numa_nodes; i++) { + numa_fw_cfg[apic_id_limit + 1 + i] = + cpu_to_le64(ms->numa_state->nodes[i].node_mem); + } + fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg, + (1 + apic_id_limit + nb_numa_nodes) * + sizeof(*numa_fw_cfg)); + + return fw_cfg; +} + +void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg) +{ + X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu); + CPUX86State *env = &cpu->env; + uint32_t unused, ebx, ecx, edx; + uint64_t feature_control_bits = 0; + uint64_t *val; + + cpu_x86_cpuid(env, 1, 0, &unused, &unused, &ecx, &edx); + if (ecx & CPUID_EXT_VMX) { + feature_control_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + } + + if ((edx & (CPUID_EXT2_MCE | CPUID_EXT2_MCA)) == + (CPUID_EXT2_MCE | CPUID_EXT2_MCA) && + (env->mcg_cap & MCG_LMCE_P)) { + feature_control_bits |= FEATURE_CONTROL_LMCE; + } + + if (env->cpuid_level >= 7) { + cpu_x86_cpuid(env, 0x7, 0, &unused, &ebx, &ecx, &unused); + if (ebx & CPUID_7_0_EBX_SGX) { + feature_control_bits |= FEATURE_CONTROL_SGX; + } + if (ecx & CPUID_7_0_ECX_SGX_LC) { + feature_control_bits |= FEATURE_CONTROL_SGX_LC; + } + } + + if (!feature_control_bits) { + return; + } + + val = g_malloc(sizeof(*val)); + *val = cpu_to_le64(feature_control_bits | FEATURE_CONTROL_LOCKED); + fw_cfg_add_file(fw_cfg, "etc/msr_feature_control", val, sizeof(*val)); +} + +void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg) +{ + /* + * when using port i/o, the 8-bit data register *always* overlaps + * with half of the 16-bit control register. Hence, the total size + * of the i/o region used is FW_CFG_CTL_SIZE; when using DMA, the + * DMA control register is located at FW_CFG_DMA_IO_BASE + 4 + */ + Object *obj = OBJECT(fw_cfg); + uint8_t io_size = object_property_get_bool(obj, "dma_enabled", NULL) ? 
+ ROUND_UP(FW_CFG_CTL_SIZE, 4) + sizeof(dma_addr_t) : + FW_CFG_CTL_SIZE; + Aml *dev = aml_device("FWCF"); + Aml *crs = aml_resource_template(); + + aml_append(dev, aml_name_decl("_HID", aml_string("QEMU0002"))); + + /* device present, functioning, decoding, not shown in UI */ + aml_append(dev, aml_name_decl("_STA", aml_int(0xB))); + + aml_append(crs, + aml_io(AML_DECODE16, FW_CFG_IO_BASE, FW_CFG_IO_BASE, 0x01, io_size)); + + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); +} diff --git a/hw/i386/fw_cfg.h b/hw/i386/fw_cfg.h new file mode 100644 index 000000000..275f15c1c --- /dev/null +++ b/hw/i386/fw_cfg.h @@ -0,0 +1,30 @@ +/* + * QEMU fw_cfg helpers (X86 specific) + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * SPDX-License-Identifier: MIT + */ + +#ifndef HW_I386_FW_CFG_H +#define HW_I386_FW_CFG_H + +#include "hw/boards.h" +#include "hw/nvram/fw_cfg.h" + +#define FW_CFG_IO_BASE 0x510 + +#define FW_CFG_ACPI_TABLES (FW_CFG_ARCH_LOCAL + 0) +#define FW_CFG_SMBIOS_ENTRIES (FW_CFG_ARCH_LOCAL + 1) +#define FW_CFG_IRQ0_OVERRIDE (FW_CFG_ARCH_LOCAL + 2) +#define FW_CFG_E820_TABLE (FW_CFG_ARCH_LOCAL + 3) +#define FW_CFG_HPET (FW_CFG_ARCH_LOCAL + 4) + +FWCfgState *fw_cfg_arch_create(MachineState *ms, + uint16_t boot_cpus, + uint16_t apic_id_limit); +void fw_cfg_build_smbios(MachineState *ms, FWCfgState *fw_cfg); +void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg); +void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg); + +#endif diff --git a/hw/i386/generic_event_device_x86.c b/hw/i386/generic_event_device_x86.c new file mode 100644 index 000000000..e26fb02a2 --- /dev/null +++ b/hw/i386/generic_event_device_x86.c @@ -0,0 +1,36 @@ +/* + * x86 variant of the generic event device for hw reduced acpi + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2 or later, as published by the Free Software Foundation. + */ + +#include "qemu/osdep.h" +#include "hw/acpi/generic_event_device.h" +#include "hw/i386/pc.h" + +static void acpi_ged_x86_class_init(ObjectClass *class, void *data) +{ + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_CLASS(class); + + adevc->madt_cpu = pc_madt_cpu_entry; +} + +static const TypeInfo acpi_ged_x86_info = { + .name = TYPE_ACPI_GED_X86, + .parent = TYPE_ACPI_GED, + .class_init = acpi_ged_x86_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_HOTPLUG_HANDLER }, + { TYPE_ACPI_DEVICE_IF }, + { } + } +}; + +static void acpi_ged_x86_register_types(void) +{ + type_register_static(&acpi_ged_x86_info); +} + +type_init(acpi_ged_x86_register_types) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c new file mode 100644 index 000000000..f584449d8 --- /dev/null +++ b/hw/i386/intel_iommu.c @@ -0,0 +1,3900 @@ +/* + * QEMU emulation of an Intel IOMMU (VT-d) + * (DMA Remapping device) + * + * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com> + * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "qapi/error.h" +#include "hw/sysbus.h" +#include "intel_iommu_internal.h" +#include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" +#include "hw/qdev-properties.h" +#include "hw/i386/pc.h" +#include "hw/i386/apic-msidef.h" +#include "hw/i386/x86-iommu.h" +#include "hw/pci-host/q35.h" +#include "sysemu/kvm.h" +#include "sysemu/dma.h" +#include "sysemu/sysemu.h" +#include "hw/i386/apic_internal.h" +#include "kvm/kvm_i386.h" +#include "migration/vmstate.h" +#include "trace.h" + +/* context entry operations */ +#define VTD_CE_GET_RID2PASID(ce) \ + ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK) +#define VTD_CE_GET_PASID_DIR_TABLE(ce) \ + ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK) + +/* pe operations */ +#define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) +#define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW)) +#define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\ + if (ret_fr) { \ + ret_fr = -ret_fr; \ + if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { \ + trace_vtd_fault_disabled(); \ + } else { \ + vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); \ + } \ + goto error; \ + } \ +} + +static void vtd_address_space_refresh_all(IntelIOMMUState *s); +static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); + +static void vtd_panic_require_caching_mode(void) +{ + error_report("We need to set caching-mode=on for intel-iommu to enable " + "device assignment with IOMMU protection."); + exit(1); +} + +static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, + uint64_t wmask, uint64_t w1cmask) +{ + stq_le_p(&s->csr[addr], val); + stq_le_p(&s->wmask[addr], wmask); + stq_le_p(&s->w1cmask[addr], w1cmask); +} + +static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask) +{ + stq_le_p(&s->womask[addr], mask); +} + +static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val, + uint32_t wmask, uint32_t w1cmask) +{ + stl_le_p(&s->csr[addr], val); + stl_le_p(&s->wmask[addr], wmask); + stl_le_p(&s->w1cmask[addr], w1cmask); +} + +static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask) +{ + stl_le_p(&s->womask[addr], mask); +} + +/* "External" get/set operations */ +static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val) +{ + uint64_t oldval = ldq_le_p(&s->csr[addr]); + uint64_t wmask = ldq_le_p(&s->wmask[addr]); + uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); + stq_le_p(&s->csr[addr], + ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); +} + +static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val) +{ + uint32_t oldval = ldl_le_p(&s->csr[addr]); + uint32_t wmask = ldl_le_p(&s->wmask[addr]); + uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); + stl_le_p(&s->csr[addr], + ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); +} + +static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr) +{ + uint64_t val = ldq_le_p(&s->csr[addr]); + uint64_t womask = ldq_le_p(&s->womask[addr]); + return val & ~womask; +} + +static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr) +{ + uint32_t val = ldl_le_p(&s->csr[addr]); + uint32_t womask = ldl_le_p(&s->womask[addr]); + return val & ~womask; +} + +/* "Internal" get/set operations */ +static uint64_t 
vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr) +{ + return ldq_le_p(&s->csr[addr]); +} + +static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr) +{ + return ldl_le_p(&s->csr[addr]); +} + +static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val) +{ + stq_le_p(&s->csr[addr], val); +} + +static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, + uint32_t clear, uint32_t mask) +{ + uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask; + stl_le_p(&s->csr[addr], new_val); + return new_val; +} + +static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, + uint64_t clear, uint64_t mask) +{ + uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask; + stq_le_p(&s->csr[addr], new_val); + return new_val; +} + +static inline void vtd_iommu_lock(IntelIOMMUState *s) +{ + qemu_mutex_lock(&s->iommu_lock); +} + +static inline void vtd_iommu_unlock(IntelIOMMUState *s) +{ + qemu_mutex_unlock(&s->iommu_lock); +} + +static void vtd_update_scalable_state(IntelIOMMUState *s) +{ + uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); + + if (s->scalable_mode) { + s->root_scalable = val & VTD_RTADDR_SMT; + } +} + +/* Whether the address space needs to notify new mappings */ +static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) +{ + return as->notifier_flags & IOMMU_NOTIFIER_MAP; +} + +/* GHashTable functions */ +static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) +{ + return *((const uint64_t *)v1) == *((const uint64_t *)v2); +} + +static guint vtd_uint64_hash(gconstpointer v) +{ + return (guint)*(const uint64_t *)v; +} + +static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, + gpointer user_data) +{ + VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; + uint16_t domain_id = *(uint16_t *)user_data; + return entry->domain_id == domain_id; +} + +/* The shift of an addr for a certain level of paging structure */ +static inline uint32_t vtd_slpt_level_shift(uint32_t level) +{ + assert(level != 0); + return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS; +} + +static inline uint64_t vtd_slpt_level_page_mask(uint32_t level) +{ + return ~((1ULL << vtd_slpt_level_shift(level)) - 1); +} + +static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, + gpointer user_data) +{ + VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; + VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data; + uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask; + uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K; + return (entry->domain_id == info->domain_id) && + (((entry->gfn & info->mask) == gfn) || + (entry->gfn == gfn_tlb)); +} + +/* Reset all the gen of VTDAddressSpace to zero and set the gen of + * IntelIOMMUState to 1. Must be called with IOMMU lock held. + */ +static void vtd_reset_context_cache_locked(IntelIOMMUState *s) +{ + VTDAddressSpace *vtd_as; + VTDBus *vtd_bus; + GHashTableIter bus_it; + uint32_t devfn_it; + + trace_vtd_context_cache_reset(); + + g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr); + + while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) { + for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { + vtd_as = vtd_bus->dev_as[devfn_it]; + if (!vtd_as) { + continue; + } + vtd_as->context_cache_entry.context_cache_gen = 0; + } + } + s->context_cache_gen = 1; +} + +/* Must be called with IOMMU lock held. 
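+ * (The lock protects the IOTLB and context-cache state that the translation path reads concurrently.)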
*/ +static void vtd_reset_iotlb_locked(IntelIOMMUState *s) +{ + assert(s->iotlb); + g_hash_table_remove_all(s->iotlb); +} + +static void vtd_reset_iotlb(IntelIOMMUState *s) +{ + vtd_iommu_lock(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); +} + +static void vtd_reset_caches(IntelIOMMUState *s) +{ + vtd_iommu_lock(s); + vtd_reset_iotlb_locked(s); + vtd_reset_context_cache_locked(s); + vtd_iommu_unlock(s); +} + +static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, + uint32_t level) +{ + return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) | + ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT); +} + +static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) +{ + return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; +} + +/* Must be called with IOMMU lock held */ +static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, + hwaddr addr) +{ + VTDIOTLBEntry *entry; + uint64_t key; + int level; + + for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { + key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level), + source_id, level); + entry = g_hash_table_lookup(s->iotlb, &key); + if (entry) { + goto out; + } + } + +out: + return entry; +} + +/* Must be with IOMMU lock held */ +static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, + uint16_t domain_id, hwaddr addr, uint64_t slpte, + uint8_t access_flags, uint32_t level) +{ + VTDIOTLBEntry *entry = g_malloc(sizeof(*entry)); + uint64_t *key = g_malloc(sizeof(*key)); + uint64_t gfn = vtd_get_iotlb_gfn(addr, level); + + trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); + if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { + trace_vtd_iotlb_reset("iotlb exceeds size limit"); + vtd_reset_iotlb_locked(s); + } + + entry->gfn = gfn; + entry->domain_id = domain_id; + entry->slpte = slpte; + entry->access_flags = access_flags; + entry->mask = vtd_slpt_level_page_mask(level); + *key = vtd_get_iotlb_key(gfn, source_id, level); + g_hash_table_replace(s->iotlb, key, entry); +} + +/* Given the reg addr of both the message data and address, generate an + * interrupt via MSI. + */ +static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, + hwaddr mesg_data_reg) +{ + MSIMessage msi; + + assert(mesg_data_reg < DMAR_REG_SIZE); + assert(mesg_addr_reg < DMAR_REG_SIZE); + + msi.address = vtd_get_long_raw(s, mesg_addr_reg); + msi.data = vtd_get_long_raw(s, mesg_data_reg); + + trace_vtd_irq_generate(msi.address, msi.data); + + apic_get_class()->send_msi(&msi); +} + +/* Generate a fault event to software via MSI if conditions are met. + * Notice that the value of FSTS_REG being passed to it should be the one + * before any update. + */ +static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts) +{ + if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO || + pre_fsts & VTD_FSTS_IQE) { + error_report_once("There are previous interrupt conditions " + "to be serviced by software, fault event " + "is not generated"); + return; + } + vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP); + if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) { + error_report_once("Interrupt Mask set, irq is not generated"); + } else { + vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); + vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); + } +} + +/* Check if the Fault (F) field of the Fault Recording Register referenced by + * @index is Set. 
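+ * Each FRCD register is 128 bits (16 bytes), so the register for @index starts at DMAR_FRCD_REG_OFFSET + (index << 4); the F bit lives in the upper 64-bit half, hence the extra offset of 8 below (e.g. index 1 reads at DMAR_FRCD_REG_OFFSET + 0x18).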
+ */ +static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index) +{ + /* Each reg is 128-bit */ + hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); + addr += 8; /* Access the high 64-bit half */ + + assert(index < DMAR_FRCD_REG_NR); + + return vtd_get_quad_raw(s, addr) & VTD_FRCD_F; +} + +/* Update the PPF field of the Fault Status Register. + * Should be called whenever the F field of any fault recording + * register is changed. + */ +static void vtd_update_fsts_ppf(IntelIOMMUState *s) +{ + uint32_t i; + uint32_t ppf_mask = 0; + + for (i = 0; i < DMAR_FRCD_REG_NR; i++) { + if (vtd_is_frcd_set(s, i)) { + ppf_mask = VTD_FSTS_PPF; + break; + } + } + vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask); + trace_vtd_fsts_ppf(!!ppf_mask); +} + +static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index) +{ + /* Each reg is 128-bit */ + hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); + addr += 8; /* Access the high 64-bit half */ + + assert(index < DMAR_FRCD_REG_NR); + + vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F); + vtd_update_fsts_ppf(s); +} + +/* Must not update the F field now; that should be done later */ +static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index, + uint16_t source_id, hwaddr addr, + VTDFaultReason fault, bool is_write) +{ + uint64_t hi = 0, lo; + hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); + + assert(index < DMAR_FRCD_REG_NR); + + lo = VTD_FRCD_FI(addr); + hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault); + if (!is_write) { + hi |= VTD_FRCD_T; + } + vtd_set_quad_raw(s, frcd_reg_addr, lo); + vtd_set_quad_raw(s, frcd_reg_addr + 8, hi); + + trace_vtd_frr_new(index, hi, lo); +} + +/* Try to collapse multiple pending faults from the same requester */ +static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id) +{ + uint32_t i; + uint64_t frcd_reg; + hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */ + + for (i = 0; i < DMAR_FRCD_REG_NR; i++) { + frcd_reg = vtd_get_quad_raw(s, addr); + if ((frcd_reg & VTD_FRCD_F) && + ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) { + return true; + } + addr += 16; /* 128-bit for each */ + } + return false; +} + +/* Log and report a DMAR (address translation) fault to software */ +static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, + hwaddr addr, VTDFaultReason fault, + bool is_write) +{ + uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); + + assert(fault < VTD_FR_MAX); + + if (fault == VTD_FR_RESERVED_ERR) { + /* This is not a normal fault reason case. Drop it.
*/ + return; + } + + trace_vtd_dmar_fault(source_id, fault, addr, is_write); + + if (fsts_reg & VTD_FSTS_PFO) { + error_report_once("New fault is not recorded due to " + "Primary Fault Overflow"); + return; + } + + if (vtd_try_collapse_fault(s, source_id)) { + error_report_once("New fault is not recorded due to " + "compression of faults"); + return; + } + + if (vtd_is_frcd_set(s, s->next_frcd_reg)) { + error_report_once("Next Fault Recording Reg is used, " + "new fault is not recorded, set PFO field"); + vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO); + return; + } + + vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write); + + if (fsts_reg & VTD_FSTS_PPF) { + error_report_once("There are pending faults already, " + "fault event is not generated"); + vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); + s->next_frcd_reg++; + if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { + s->next_frcd_reg = 0; + } + } else { + vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK, + VTD_FSTS_FRI(s->next_frcd_reg)); + vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */ + s->next_frcd_reg++; + if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { + s->next_frcd_reg = 0; + } + /* This case actually cause the PPF to be Set. + * So generate fault event (interrupt). + */ + vtd_generate_fault_event(s, fsts_reg); + } +} + +/* Handle Invalidation Queue Errors of queued invalidation interface error + * conditions. + */ +static void vtd_handle_inv_queue_error(IntelIOMMUState *s) +{ + uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); + + vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE); + vtd_generate_fault_event(s, fsts_reg); +} + +/* Set the IWC field and try to generate an invalidation completion interrupt */ +static void vtd_generate_completion_event(IntelIOMMUState *s) +{ + if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) { + trace_vtd_inv_desc_wait_irq("One pending, skip current"); + return; + } + vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC); + vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP); + if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) { + trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, " + "new event not generated"); + return; + } else { + /* Generate the interrupt event */ + trace_vtd_inv_desc_wait_irq("Generating complete event"); + vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); + vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); + } +} + +static inline bool vtd_root_entry_present(IntelIOMMUState *s, + VTDRootEntry *re, + uint8_t devfn) +{ + if (s->root_scalable && devfn > UINT8_MAX / 2) { + return re->hi & VTD_ROOT_ENTRY_P; + } + + return re->lo & VTD_ROOT_ENTRY_P; +} + +static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, + VTDRootEntry *re) +{ + dma_addr_t addr; + + addr = s->root + index * sizeof(*re); + if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) { + re->lo = 0; + return -VTD_FR_ROOT_TABLE_INV; + } + re->lo = le64_to_cpu(re->lo); + re->hi = le64_to_cpu(re->hi); + return 0; +} + +static inline bool vtd_ce_present(VTDContextEntry *context) +{ + return context->lo & VTD_CONTEXT_ENTRY_P; +} + +static int vtd_get_context_entry_from_root(IntelIOMMUState *s, + VTDRootEntry *re, + uint8_t index, + VTDContextEntry *ce) +{ + dma_addr_t addr, ce_size; + + /* we have checked that root entry is present */ + ce_size = s->root_scalable ? 
VTD_CTX_ENTRY_SCALABLE_SIZE : + VTD_CTX_ENTRY_LEGACY_SIZE; + + if (s->root_scalable && index > UINT8_MAX / 2) { + index = index & (~VTD_DEVFN_CHECK_MASK); + addr = re->hi & VTD_ROOT_ENTRY_CTP; + } else { + addr = re->lo & VTD_ROOT_ENTRY_CTP; + } + + addr = addr + index * ce_size; + if (dma_memory_read(&address_space_memory, addr, ce, ce_size)) { + return -VTD_FR_CONTEXT_TABLE_INV; + } + + ce->lo = le64_to_cpu(ce->lo); + ce->hi = le64_to_cpu(ce->hi); + if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) { + ce->val[2] = le64_to_cpu(ce->val[2]); + ce->val[3] = le64_to_cpu(ce->val[3]); + } + return 0; +} + +static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce) +{ + return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR; +} + +static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw) +{ + return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw); +} + +/* Whether the pte indicates the address of the page frame */ +static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level) +{ + return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK); +} + +/* Get the content of a spte located in @base_addr[@index] */ +static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index) +{ + uint64_t slpte; + + assert(index < VTD_SL_PT_ENTRY_NR); + + if (dma_memory_read(&address_space_memory, + base_addr + index * sizeof(slpte), &slpte, + sizeof(slpte))) { + slpte = (uint64_t)-1; + return slpte; + } + slpte = le64_to_cpu(slpte); + return slpte; +} + +/* Given an iova and the level of paging structure, return the offset + * of current level. + */ +static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level) +{ + return (iova >> vtd_slpt_level_shift(level)) & + ((1ULL << VTD_SL_LEVEL_BITS) - 1); +} + +/* Check Capability Register to see if the @level of page-table is supported */ +static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level) +{ + return VTD_CAP_SAGAW_MASK & s->cap & + (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT)); +} + +/* Return true if check passed, otherwise false */ +static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu, + VTDPASIDEntry *pe) +{ + switch (VTD_PE_GET_TYPE(pe)) { + case VTD_SM_PASID_ENTRY_FLT: + case VTD_SM_PASID_ENTRY_SLT: + case VTD_SM_PASID_ENTRY_NESTED: + break; + case VTD_SM_PASID_ENTRY_PT: + if (!x86_iommu->pt_supported) { + return false; + } + break; + default: + /* Unknown type */ + return false; + } + return true; +} + +static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire) +{ + return pdire->val & 1; +} + +/** + * Caller of this function should check present bit if wants + * to use pdir entry for further usage except for fpd bit check. 
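+ * In other words, only the FPD bit may be trusted in a non-present entry; any other use must first be gated on vtd_pdire_present().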
+ */ +static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base, + uint32_t pasid, + VTDPASIDDirEntry *pdire) +{ + uint32_t index; + dma_addr_t addr, entry_size; + + index = VTD_PASID_DIR_INDEX(pasid); + entry_size = VTD_PASID_DIR_ENTRY_SIZE; + addr = pasid_dir_base + index * entry_size; + if (dma_memory_read(&address_space_memory, addr, pdire, entry_size)) { + return -VTD_FR_PASID_TABLE_INV; + } + + return 0; +} + +static inline bool vtd_pe_present(VTDPASIDEntry *pe) +{ + return pe->val[0] & VTD_PASID_ENTRY_P; +} + +static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s, + uint32_t pasid, + dma_addr_t addr, + VTDPASIDEntry *pe) +{ + uint32_t index; + dma_addr_t entry_size; + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + index = VTD_PASID_TABLE_INDEX(pasid); + entry_size = VTD_PASID_ENTRY_SIZE; + addr = addr + index * entry_size; + if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) { + return -VTD_FR_PASID_TABLE_INV; + } + + /* Do translation type check */ + if (!vtd_pe_type_check(x86_iommu, pe)) { + return -VTD_FR_PASID_TABLE_INV; + } + + if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) { + return -VTD_FR_PASID_TABLE_INV; + } + + return 0; +} + +/** + * Caller of this function should check present bit if wants + * to use pasid entry for further usage except for fpd bit check. + */ +static int vtd_get_pe_from_pdire(IntelIOMMUState *s, + uint32_t pasid, + VTDPASIDDirEntry *pdire, + VTDPASIDEntry *pe) +{ + dma_addr_t addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK; + + return vtd_get_pe_in_pasid_leaf_table(s, pasid, addr, pe); +} + +/** + * This function gets a pasid entry from a specified pasid + * table (includes dir and leaf table) with a specified pasid. + * Sanity check should be done to ensure return a present + * pasid entry to caller. + */ +static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s, + dma_addr_t pasid_dir_base, + uint32_t pasid, + VTDPASIDEntry *pe) +{ + int ret; + VTDPASIDDirEntry pdire; + + ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, + pasid, &pdire); + if (ret) { + return ret; + } + + if (!vtd_pdire_present(&pdire)) { + return -VTD_FR_PASID_TABLE_INV; + } + + ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe); + if (ret) { + return ret; + } + + if (!vtd_pe_present(pe)) { + return -VTD_FR_PASID_TABLE_INV; + } + + return 0; +} + +static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, + VTDContextEntry *ce, + VTDPASIDEntry *pe) +{ + uint32_t pasid; + dma_addr_t pasid_dir_base; + int ret = 0; + + pasid = VTD_CE_GET_RID2PASID(ce); + pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); + ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe); + + return ret; +} + +static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, + VTDContextEntry *ce, + bool *pe_fpd_set) +{ + int ret; + uint32_t pasid; + dma_addr_t pasid_dir_base; + VTDPASIDDirEntry pdire; + VTDPASIDEntry pe; + + pasid = VTD_CE_GET_RID2PASID(ce); + pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); + + /* + * No present bit check since fpd is meaningful even + * if the present bit is clear. + */ + ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, pasid, &pdire); + if (ret) { + return ret; + } + + if (pdire.val & VTD_PASID_DIR_FPD) { + *pe_fpd_set = true; + return 0; + } + + if (!vtd_pdire_present(&pdire)) { + return -VTD_FR_PASID_TABLE_INV; + } + + /* + * No present bit check since fpd is meaningful even + * if the present bit is clear. 
+ */ + ret = vtd_get_pe_from_pdire(s, pasid, &pdire, &pe); + if (ret) { + return ret; + } + + if (pe.val[0] & VTD_PASID_ENTRY_FPD) { + *pe_fpd_set = true; + } + + return 0; +} + +/* Get the page-table level that hardware should use for the second-level + * page-table walk from the Address Width field of context-entry. + */ +static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce) +{ + return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW); +} + +static uint32_t vtd_get_iova_level(IntelIOMMUState *s, + VTDContextEntry *ce) +{ + VTDPASIDEntry pe; + + if (s->root_scalable) { + vtd_ce_get_rid2pasid_entry(s, ce, &pe); + return VTD_PE_GET_LEVEL(&pe); + } + + return vtd_ce_get_level(ce); +} + +static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce) +{ + return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9; +} + +static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s, + VTDContextEntry *ce) +{ + VTDPASIDEntry pe; + + if (s->root_scalable) { + vtd_ce_get_rid2pasid_entry(s, ce, &pe); + return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9; + } + + return vtd_ce_get_agaw(ce); +} + +static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce) +{ + return ce->lo & VTD_CONTEXT_ENTRY_TT; +} + +/* Only for Legacy Mode. Return true if check passed, otherwise false */ +static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu, + VTDContextEntry *ce) +{ + switch (vtd_ce_get_type(ce)) { + case VTD_CONTEXT_TT_MULTI_LEVEL: + /* Always supported */ + break; + case VTD_CONTEXT_TT_DEV_IOTLB: + if (!x86_iommu->dt_supported) { + error_report_once("%s: DT specified but not supported", __func__); + return false; + } + break; + case VTD_CONTEXT_TT_PASS_THROUGH: + if (!x86_iommu->pt_supported) { + error_report_once("%s: PT specified but not supported", __func__); + return false; + } + break; + default: + /* Unknown type */ + error_report_once("%s: unknown ce type: %"PRIu32, __func__, + vtd_ce_get_type(ce)); + return false; + } + return true; +} + +static inline uint64_t vtd_iova_limit(IntelIOMMUState *s, + VTDContextEntry *ce, uint8_t aw) +{ + uint32_t ce_agaw = vtd_get_iova_agaw(s, ce); + return 1ULL << MIN(ce_agaw, aw); +} + +/* Return true if IOVA passes range check, otherwise false. */ +static inline bool vtd_iova_range_check(IntelIOMMUState *s, + uint64_t iova, VTDContextEntry *ce, + uint8_t aw) +{ + /* + * Check if @iova is above 2^X-1, where X is the minimum of MGAW + * in CAP_REG and AW in context-entry. 
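+ * E.g. with an effective width of 48 bits, vtd_iova_limit() below is 1ULL << 48, so any IOVA with bit 48 or above set fails the check.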
+ */ + return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1)); +} + +static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s, + VTDContextEntry *ce) +{ + VTDPASIDEntry pe; + + if (s->root_scalable) { + vtd_ce_get_rid2pasid_entry(s, ce, &pe); + return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR; + } + + return vtd_ce_get_slpt_base(ce); +} + +/* + * Rsvd field masks for spte: + * vtd_spte_rsvd 4k pages + * vtd_spte_rsvd_large large pages + */ +static uint64_t vtd_spte_rsvd[5]; +static uint64_t vtd_spte_rsvd_large[5]; + +static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) +{ + uint64_t rsvd_mask = vtd_spte_rsvd[level]; + + if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) && + (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) { + /* large page */ + rsvd_mask = vtd_spte_rsvd_large[level]; + } + + return slpte & rsvd_mask; +} + +/* Find the VTD address space associated with a given bus number */ +static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) +{ + VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; + GHashTableIter iter; + + if (vtd_bus) { + return vtd_bus; + } + + /* + * Iterate over the registered buses to find the one which + * currently holds this bus number and update the bus_num + * lookup table. + */ + g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { + if (pci_bus_num(vtd_bus->bus) == bus_num) { + s->vtd_as_by_bus_num[bus_num] = vtd_bus; + return vtd_bus; + } + } + + return NULL; +} + +/* Given the @iova, get relevant @slptep. @slpte_level will be the last level + * of the translation, can be used for deciding the size of large page. + */ +static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, + uint64_t iova, bool is_write, + uint64_t *slptep, uint32_t *slpte_level, + bool *reads, bool *writes, uint8_t aw_bits) +{ + dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); + uint32_t level = vtd_get_iova_level(s, ce); + uint32_t offset; + uint64_t slpte; + uint64_t access_right_check; + + if (!vtd_iova_range_check(s, iova, ce, aw_bits)) { + error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")", + __func__, iova); + return -VTD_FR_ADDR_BEYOND_MGAW; + } + + /* FIXME: what is the Atomics request here? */ + access_right_check = is_write ? VTD_SL_W : VTD_SL_R; + + while (true) { + offset = vtd_iova_level_offset(iova, level); + slpte = vtd_get_slpte(addr, offset); + + if (slpte == (uint64_t)-1) { + error_report_once("%s: detected read error on DMAR slpte " + "(iova=0x%" PRIx64 ")", __func__, iova); + if (level == vtd_get_iova_level(s, ce)) { + /* Invalid programming of context-entry */ + return -VTD_FR_CONTEXT_ENTRY_INV; + } else { + return -VTD_FR_PAGING_ENTRY_INV; + } + } + *reads = (*reads) && (slpte & VTD_SL_R); + *writes = (*writes) && (slpte & VTD_SL_W); + if (!(slpte & access_right_check)) { + error_report_once("%s: detected slpte permission error " + "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " + "slpte=0x%" PRIx64 ", write=%d)", __func__, + iova, level, slpte, is_write); + return is_write ? 
-VTD_FR_WRITE : -VTD_FR_READ; + } + if (vtd_slpte_nonzero_rsvd(slpte, level)) { + error_report_once("%s: detected splte reserve non-zero " + "iova=0x%" PRIx64 ", level=0x%" PRIx32 + "slpte=0x%" PRIx64 ")", __func__, iova, + level, slpte); + return -VTD_FR_PAGING_ENTRY_RSVD; + } + + if (vtd_is_last_slpte(slpte, level)) { + *slptep = slpte; + *slpte_level = level; + return 0; + } + addr = vtd_get_slpte_addr(slpte, aw_bits); + level--; + } +} + +typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private); + +/** + * Constant information used during page walking + * + * @hook_fn: hook func to be called when detected page + * @private: private data to be passed into hook func + * @notify_unmap: whether we should notify invalid entries + * @as: VT-d address space of the device + * @aw: maximum address width + * @domain: domain ID of the page walk + */ +typedef struct { + VTDAddressSpace *as; + vtd_page_walk_hook hook_fn; + void *private; + bool notify_unmap; + uint8_t aw; + uint16_t domain_id; +} vtd_page_walk_info; + +static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info) +{ + VTDAddressSpace *as = info->as; + vtd_page_walk_hook hook_fn = info->hook_fn; + void *private = info->private; + IOMMUTLBEntry *entry = &event->entry; + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, + .translated_addr = entry->translated_addr, + .perm = entry->perm, + }; + const DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + + if (event->type == IOMMU_NOTIFIER_UNMAP && !info->notify_unmap) { + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + + assert(hook_fn); + + /* Update local IOVA mapped ranges */ + if (event->type == IOMMU_NOTIFIER_MAP) { + if (mapped) { + /* If it's exactly the same translation, skip */ + if (!memcmp(mapped, &target, sizeof(target))) { + trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, + entry->translated_addr); + return 0; + } else { + /* + * Translation changed. Normally this should not + * happen, but it can happen when with buggy guest + * OSes. Note that there will be a small window that + * we don't have map at all. But that's the best + * effort we can do. The ideal way to emulate this is + * atomically modify the PTE to follow what has + * changed, but we can't. One example is that vfio + * driver only has VFIO_IOMMU_[UN]MAP_DMA but no + * interface to modify a mapping (meanwhile it seems + * meaningless to even provide one). Anyway, let's + * mark this as a TODO in case one day we'll have + * a better solution. 
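+ * What the code below does instead is replay the change as an explicit UNMAP of the old range followed by a MAP carrying the new translation.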
+ */ + IOMMUAccessFlags cache_perm = entry->perm; + int ret; + + /* Emulate an UNMAP */ + event->type = IOMMU_NOTIFIER_UNMAP; + entry->perm = IOMMU_NONE; + trace_vtd_page_walk_one(info->domain_id, + entry->iova, + entry->translated_addr, + entry->addr_mask, + entry->perm); + ret = hook_fn(event, private); + if (ret) { + return ret; + } + /* Drop any existing mapping */ + iova_tree_remove(as->iova_tree, &target); + /* Recover the correct type */ + event->type = IOMMU_NOTIFIER_MAP; + entry->perm = cache_perm; + } + } + iova_tree_insert(as->iova_tree, &target); + } else { + if (!mapped) { + /* Skip since we didn't map this range at all */ + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + iova_tree_remove(as->iova_tree, &target); + } + + trace_vtd_page_walk_one(info->domain_id, entry->iova, + entry->translated_addr, entry->addr_mask, + entry->perm); + return hook_fn(event, private); +} + +/** + * vtd_page_walk_level - walk over specific level for IOVA range + * + * @addr: base GPA addr to start the walk + * @start: IOVA range start address + * @end: IOVA range end address (start <= addr < end) + * @read: whether parent level has read permission + * @write: whether parent level has write permission + * @info: constant information for the page walk + */ +static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, + uint64_t end, uint32_t level, bool read, + bool write, vtd_page_walk_info *info) +{ + bool read_cur, write_cur, entry_valid; + uint32_t offset; + uint64_t slpte; + uint64_t subpage_size, subpage_mask; + IOMMUTLBEvent event; + uint64_t iova = start; + uint64_t iova_next; + int ret = 0; + + trace_vtd_page_walk_level(addr, level, start, end); + + subpage_size = 1ULL << vtd_slpt_level_shift(level); + subpage_mask = vtd_slpt_level_page_mask(level); + + while (iova < end) { + iova_next = (iova & subpage_mask) + subpage_size; + + offset = vtd_iova_level_offset(iova, level); + slpte = vtd_get_slpte(addr, offset); + + if (slpte == (uint64_t)-1) { + trace_vtd_page_walk_skip_read(iova, iova_next); + goto next; + } + + if (vtd_slpte_nonzero_rsvd(slpte, level)) { + trace_vtd_page_walk_skip_reserve(iova, iova_next); + goto next; + } + + /* Permissions are stacked with parents' */ + read_cur = read && (slpte & VTD_SL_R); + write_cur = write && (slpte & VTD_SL_W); + + /* + * As long as we have either read/write permission, this is a + * valid entry. The rule works for both page entries and page + * table entries. + */ + entry_valid = read_cur | write_cur; + + if (!vtd_is_last_slpte(slpte, level) && entry_valid) { + /* + * This is a valid PDE (or even bigger than PDE). We need + * to walk one further level. + */ + ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), + iova, MIN(iova_next, end), level - 1, + read_cur, write_cur, info); + } else { + /* + * This means we are either: + * + * (1) the real page entry (either 4K page, or huge page) + * (2) the whole range is invalid + * + * In either case, we send an IOTLB notification down. + */ + event.entry.target_as = &address_space_memory; + event.entry.iova = iova & subpage_mask; + event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); + event.entry.addr_mask = ~subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ + event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + event.type = event.entry.perm ? 
IOMMU_NOTIFIER_MAP : + IOMMU_NOTIFIER_UNMAP; + ret = vtd_page_walk_one(&event, info); + } + + if (ret < 0) { + return ret; + } + +next: + iova = iova_next; + } + + return 0; +} + +/** + * vtd_page_walk - walk specific IOVA range, and call the hook + * + * @s: intel iommu state + * @ce: context entry to walk upon + * @start: IOVA address to start the walk + * @end: IOVA range end address (start <= addr < end) + * @info: page walking information struct + */ +static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce, + uint64_t start, uint64_t end, + vtd_page_walk_info *info) +{ + dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); + uint32_t level = vtd_get_iova_level(s, ce); + + if (!vtd_iova_range_check(s, start, ce, info->aw)) { + return -VTD_FR_ADDR_BEYOND_MGAW; + } + + if (!vtd_iova_range_check(s, end, ce, info->aw)) { + /* Fix end so that it reaches the maximum */ + end = vtd_iova_limit(s, ce, info->aw); + } + + return vtd_page_walk_level(addr, start, end, level, true, true, info); +} + +static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s, + VTDRootEntry *re) +{ + /* Legacy Mode reserved bits check */ + if (!s->root_scalable && + (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) + goto rsvd_err; + + /* Scalable Mode reserved bits check */ + if (s->root_scalable && + ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) || + (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) + goto rsvd_err; + + return 0; + +rsvd_err: + error_report_once("%s: invalid root entry: hi=0x%"PRIx64 + ", lo=0x%"PRIx64, + __func__, re->hi, re->lo); + return -VTD_FR_ROOT_ENTRY_RSVD; +} + +static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s, + VTDContextEntry *ce) +{ + if (!s->root_scalable && + (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI || + ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) { + error_report_once("%s: invalid context entry: hi=%"PRIx64 + ", lo=%"PRIx64" (reserved nonzero)", + __func__, ce->hi, ce->lo); + return -VTD_FR_CONTEXT_ENTRY_RSVD; + } + + if (s->root_scalable && + (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) || + ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 || + ce->val[2] || + ce->val[3])) { + error_report_once("%s: invalid context entry: val[3]=%"PRIx64 + ", val[2]=%"PRIx64 + ", val[1]=%"PRIx64 + ", val[0]=%"PRIx64" (reserved nonzero)", + __func__, ce->val[3], ce->val[2], + ce->val[1], ce->val[0]); + return -VTD_FR_CONTEXT_ENTRY_RSVD; + } + + return 0; +} + +static int vtd_ce_rid2pasid_check(IntelIOMMUState *s, + VTDContextEntry *ce) +{ + VTDPASIDEntry pe; + + /* + * Make sure in Scalable Mode, a present context entry + * has valid rid2pasid setting, which includes valid + * rid2pasid field and corresponding pasid entry setting + */ + return vtd_ce_get_rid2pasid_entry(s, ce, &pe); +} + +/* Map a device to its corresponding domain (context-entry) */ +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, + uint8_t devfn, VTDContextEntry *ce) +{ + VTDRootEntry re; + int ret_fr; + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + ret_fr = vtd_get_root_entry(s, bus_num, &re); + if (ret_fr) { + return ret_fr; + } + + if (!vtd_root_entry_present(s, &re, devfn)) { + /* Not error - it's okay we don't have root entry. 
*/ + trace_vtd_re_not_present(bus_num); + return -VTD_FR_ROOT_ENTRY_P; + } + + ret_fr = vtd_root_entry_rsvd_bits_check(s, &re); + if (ret_fr) { + return ret_fr; + } + + ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce); + if (ret_fr) { + return ret_fr; + } + + if (!vtd_ce_present(ce)) { + /* Not error - it's okay we don't have context entry. */ + trace_vtd_ce_not_present(bus_num, devfn); + return -VTD_FR_CONTEXT_ENTRY_P; + } + + ret_fr = vtd_context_entry_rsvd_bits_check(s, ce); + if (ret_fr) { + return ret_fr; + } + + /* Check if the programming of context-entry is valid */ + if (!s->root_scalable && + !vtd_is_level_supported(s, vtd_ce_get_level(ce))) { + error_report_once("%s: invalid context entry: hi=%"PRIx64 + ", lo=%"PRIx64" (level %d not supported)", + __func__, ce->hi, ce->lo, + vtd_ce_get_level(ce)); + return -VTD_FR_CONTEXT_ENTRY_INV; + } + + if (!s->root_scalable) { + /* Do translation type check */ + if (!vtd_ce_type_check(x86_iommu, ce)) { + /* Errors dumped in vtd_ce_type_check() */ + return -VTD_FR_CONTEXT_ENTRY_INV; + } + } else { + /* + * Check if the programming of context-entry.rid2pasid + * and corresponding pasid setting is valid, and thus + * avoids to check pasid entry fetching result in future + * helper function calling. + */ + ret_fr = vtd_ce_rid2pasid_check(s, ce); + if (ret_fr) { + return ret_fr; + } + } + + return 0; +} + +static int vtd_sync_shadow_page_hook(IOMMUTLBEvent *event, + void *private) +{ + memory_region_notify_iommu(private, 0, *event); + return 0; +} + +static uint16_t vtd_get_domain_id(IntelIOMMUState *s, + VTDContextEntry *ce) +{ + VTDPASIDEntry pe; + + if (s->root_scalable) { + vtd_ce_get_rid2pasid_entry(s, ce, &pe); + return VTD_SM_PASID_ENTRY_DID(pe.val[1]); + } + + return VTD_CONTEXT_ENTRY_DID(ce->hi); +} + +static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, + VTDContextEntry *ce, + hwaddr addr, hwaddr size) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + vtd_page_walk_info info = { + .hook_fn = vtd_sync_shadow_page_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + .as = vtd_as, + .domain_id = vtd_get_domain_id(s, ce), + }; + + return vtd_page_walk(s, ce, addr, addr + size, &info); +} + +static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) +{ + int ret; + VTDContextEntry ce; + IOMMUNotifier *n; + + if (!(vtd_as->iommu.iommu_notify_flags & IOMMU_NOTIFIER_IOTLB_EVENTS)) { + return 0; + } + + ret = vtd_dev_to_context_entry(vtd_as->iommu_state, + pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce); + if (ret) { + if (ret == -VTD_FR_CONTEXT_ENTRY_P) { + /* + * It's a valid scenario to have a context entry that is + * not present. For example, when a device is removed + * from an existing domain then the context entry will be + * zeroed by the guest before it was put into another + * domain. When this happens, instead of synchronizing + * the shadow pages we should invalidate all existing + * mappings and notify the backends. + */ + IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { + vtd_address_space_unmap(vtd_as, n); + } + ret = 0; + } + return ret; + } + + return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX); +} + +/* + * Check if specific device is configured to bypass address + * translation for DMA requests. In Scalable Mode, bypass + * 1st-level translation or 2nd-level translation, it depends + * on PGTT setting. 
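+ * Concretely, the helper below reports pass-through when the scalable-mode PASID entry type is VTD_SM_PASID_ENTRY_PT or, in legacy mode, when the context entry translation type is VTD_CONTEXT_TT_PASS_THROUGH.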
+ */ +static bool vtd_dev_pt_enabled(VTDAddressSpace *as) +{ + IntelIOMMUState *s; + VTDContextEntry ce; + VTDPASIDEntry pe; + int ret; + + assert(as); + + s = as->iommu_state; + ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), + as->devfn, &ce); + if (ret) { + /* + * Possibly failed to parse the context entry for some reason + * (e.g., during init, or any guest configuration errors on + * context entries). We should assume PT not enabled for + * safety. + */ + return false; + } + + if (s->root_scalable) { + ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe); + if (ret) { + error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32, + __func__, ret); + return false; + } + return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT); + } + + return (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH); +} + +/* Return whether the device is using IOMMU translation. */ +static bool vtd_switch_address_space(VTDAddressSpace *as) +{ + bool use_iommu; + /* Whether we need to take the BQL on our own */ + bool take_bql = !qemu_mutex_iothread_locked(); + + assert(as); + + use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as); + + trace_vtd_switch_address_space(pci_bus_num(as->bus), + VTD_PCI_SLOT(as->devfn), + VTD_PCI_FUNC(as->devfn), + use_iommu); + + /* + * It's possible that we reach here without BQL, e.g., when called + * from vtd_pt_enable_fast_path(). However the memory APIs need + * it. We'd better make sure we have had it already, or, take it. + */ + if (take_bql) { + qemu_mutex_lock_iothread(); + } + + /* Turn off first then on the other */ + if (use_iommu) { + memory_region_set_enabled(&as->nodmar, false); + memory_region_set_enabled(MEMORY_REGION(&as->iommu), true); + } else { + memory_region_set_enabled(MEMORY_REGION(&as->iommu), false); + memory_region_set_enabled(&as->nodmar, true); + } + + if (take_bql) { + qemu_mutex_unlock_iothread(); + } + + return use_iommu; +} + +static void vtd_switch_address_space_all(IntelIOMMUState *s) +{ + GHashTableIter iter; + VTDBus *vtd_bus; + int i; + + g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { + for (i = 0; i < PCI_DEVFN_MAX; i++) { + if (!vtd_bus->dev_as[i]) { + continue; + } + vtd_switch_address_space(vtd_bus->dev_as[i]); + } + } +} + +static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) +{ + return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); +} + +static const bool vtd_qualified_faults[] = { + [VTD_FR_RESERVED] = false, + [VTD_FR_ROOT_ENTRY_P] = false, + [VTD_FR_CONTEXT_ENTRY_P] = true, + [VTD_FR_CONTEXT_ENTRY_INV] = true, + [VTD_FR_ADDR_BEYOND_MGAW] = true, + [VTD_FR_WRITE] = true, + [VTD_FR_READ] = true, + [VTD_FR_PAGING_ENTRY_INV] = true, + [VTD_FR_ROOT_TABLE_INV] = false, + [VTD_FR_CONTEXT_TABLE_INV] = false, + [VTD_FR_ROOT_ENTRY_RSVD] = false, + [VTD_FR_PAGING_ENTRY_RSVD] = true, + [VTD_FR_CONTEXT_ENTRY_TT] = true, + [VTD_FR_PASID_TABLE_INV] = false, + [VTD_FR_RESERVED_ERR] = false, + [VTD_FR_MAX] = false, +}; + +/* To see if a fault condition is "qualified", which is reported to software + * only if the FPD field in the context-entry used to process the faulting + * request is 0. 
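+ * Faults that are not qualified (see vtd_qualified_faults[] above) are reported to software regardless of the FPD setting.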
+ */ +static inline bool vtd_is_qualified_fault(VTDFaultReason fault) +{ + return vtd_qualified_faults[fault]; +} + +static inline bool vtd_is_interrupt_addr(hwaddr addr) +{ + return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; +} + +static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) +{ + VTDBus *vtd_bus; + VTDAddressSpace *vtd_as; + bool success = false; + + vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); + if (!vtd_bus) { + goto out; + } + + vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; + if (!vtd_as) { + goto out; + } + + if (vtd_switch_address_space(vtd_as) == false) { + /* We switched off IOMMU region successfully. */ + success = true; + } + +out: + trace_vtd_pt_enable_fast_path(source_id, success); +} + +/* Map dev to context-entry then do a paging-structures walk to do a iommu + * translation. + * + * Called from RCU critical section. + * + * @bus_num: The bus number + * @devfn: The devfn, which is the combined of device and function number + * @is_write: The access is a write operation + * @entry: IOMMUTLBEntry that contain the addr to be translated and result + * + * Returns true if translation is successful, otherwise false. + */ +static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, + uint8_t devfn, hwaddr addr, bool is_write, + IOMMUTLBEntry *entry) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + VTDContextEntry ce; + uint8_t bus_num = pci_bus_num(bus); + VTDContextCacheEntry *cc_entry; + uint64_t slpte, page_mask; + uint32_t level; + uint16_t source_id = vtd_make_source_id(bus_num, devfn); + int ret_fr; + bool is_fpd_set = false; + bool reads = true; + bool writes = true; + uint8_t access_flags; + VTDIOTLBEntry *iotlb_entry; + + /* + * We have standalone memory region for interrupt addresses, we + * should never receive translation requests in this region. + */ + assert(!vtd_is_interrupt_addr(addr)); + + vtd_iommu_lock(s); + + cc_entry = &vtd_as->context_cache_entry; + + /* Try to fetch slpte form IOTLB */ + iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); + if (iotlb_entry) { + trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, + iotlb_entry->domain_id); + slpte = iotlb_entry->slpte; + access_flags = iotlb_entry->access_flags; + page_mask = iotlb_entry->mask; + goto out; + } + + /* Try to fetch context-entry from cache first */ + if (cc_entry->context_cache_gen == s->context_cache_gen) { + trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi, + cc_entry->context_entry.lo, + cc_entry->context_cache_gen); + ce = cc_entry->context_entry; + is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; + if (!is_fpd_set && s->root_scalable) { + ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); + VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); + } + } else { + ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); + is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; + if (!ret_fr && !is_fpd_set && s->root_scalable) { + ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); + } + VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); + /* Update context-cache */ + trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo, + cc_entry->context_cache_gen, + s->context_cache_gen); + cc_entry->context_entry = ce; + cc_entry->context_cache_gen = s->context_cache_gen; + } + + /* + * We don't need to translate for pass-through context entries. + * Also, let's ignore IOTLB caching as well for PT devices. 
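+ * Instead, the branch below returns an identity mapping over the 4K page containing the request and enables the pass-through fast path so later accesses bypass the IOMMU memory region entirely.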
+ */ + if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { + entry->iova = addr & VTD_PAGE_MASK_4K; + entry->translated_addr = entry->iova; + entry->addr_mask = ~VTD_PAGE_MASK_4K; + entry->perm = IOMMU_RW; + trace_vtd_translate_pt(source_id, entry->iova); + + /* + * When this happens, it means that caching-mode is not + * enabled, and this is the first passthrough translation for + * the device. Let's enable the fast path for passthrough. + * + * When passthrough is disabled again for the device, we can + * capture it via the context entry invalidation, then the + * IOMMU region can be swapped back. + */ + vtd_pt_enable_fast_path(s, source_id); + vtd_iommu_unlock(s); + return true; + } + + ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level, + &reads, &writes, s->aw_bits); + VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); + + page_mask = vtd_slpt_level_page_mask(level); + access_flags = IOMMU_ACCESS_FLAG(reads, writes); + vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte, + access_flags, level); +out: + vtd_iommu_unlock(s); + entry->iova = addr & page_mask; + entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; + entry->addr_mask = ~page_mask; + entry->perm = access_flags; + return true; + +error: + vtd_iommu_unlock(s); + entry->iova = 0; + entry->translated_addr = 0; + entry->addr_mask = 0; + entry->perm = IOMMU_NONE; + return false; +} + +static void vtd_root_table_setup(IntelIOMMUState *s) +{ + s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG); + s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits); + + vtd_update_scalable_state(s); + + trace_vtd_reg_dmar_root(s->root, s->root_scalable); +} + +static void vtd_iec_notify_all(IntelIOMMUState *s, bool global, + uint32_t index, uint32_t mask) +{ + x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask); +} + +static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) +{ + uint64_t value = 0; + value = vtd_get_quad_raw(s, DMAR_IRTA_REG); + s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1); + s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits); + s->intr_eime = value & VTD_IRTA_EIME; + + /* Notify global invalidation */ + vtd_iec_notify_all(s, true, 0, 0); + + trace_vtd_reg_ir_root(s->intr_root, s->intr_size); +} + +static void vtd_iommu_replay_all(IntelIOMMUState *s) +{ + VTDAddressSpace *vtd_as; + + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + vtd_sync_shadow_page_table(vtd_as); + } +} + +static void vtd_context_global_invalidate(IntelIOMMUState *s) +{ + trace_vtd_inv_desc_cc_global(); + /* Protects context cache */ + vtd_iommu_lock(s); + s->context_cache_gen++; + if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { + vtd_reset_context_cache_locked(s); + } + vtd_iommu_unlock(s); + vtd_address_space_refresh_all(s); + /* + * From VT-d spec 6.5.2.1, a global context entry invalidation + * should be followed by an IOTLB global invalidation, so we should + * be safe even without this. However, let's replay the region as + * well to be safer, and come back to this when the VT-d emulation + * code needs finer tuning. + */ + vtd_iommu_replay_all(s); +} + +/* Do a context-cache device-selective invalidation.
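+ * Only entries whose source-id matches @source_id under the function mask are dropped (e.g. an FM of 2 masks function bits 2:1, covering four functions of the same device).
+ * @source_id: requester ID (bus/devfn) taken from the SID field of CCMD_REG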
+ * @func_mask: FM field after shifting + */ +static void vtd_context_device_invalidate(IntelIOMMUState *s, + uint16_t source_id, + uint16_t func_mask) +{ + uint16_t mask; + VTDBus *vtd_bus; + VTDAddressSpace *vtd_as; + uint8_t bus_n, devfn; + uint16_t devfn_it; + + trace_vtd_inv_desc_cc_devices(source_id, func_mask); + + switch (func_mask & 3) { + case 0: + mask = 0; /* No bits in the SID field masked */ + break; + case 1: + mask = 4; /* Mask bit 2 in the SID field */ + break; + case 2: + mask = 6; /* Mask bit 2:1 in the SID field */ + break; + case 3: + mask = 7; /* Mask bit 2:0 in the SID field */ + break; + default: + g_assert_not_reached(); + } + mask = ~mask; + + bus_n = VTD_SID_TO_BUS(source_id); + vtd_bus = vtd_find_as_from_bus_num(s, bus_n); + if (vtd_bus) { + devfn = VTD_SID_TO_DEVFN(source_id); + for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { + vtd_as = vtd_bus->dev_as[devfn_it]; + if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { + trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), + VTD_PCI_FUNC(devfn_it)); + vtd_iommu_lock(s); + vtd_as->context_cache_entry.context_cache_gen = 0; + vtd_iommu_unlock(s); + /* + * Switch the address space when needed, in case the + * device passthrough bit has been switched. + */ + vtd_switch_address_space(vtd_as); + /* + * A device is moving out of (or into) a domain, so + * resync the shadow page table. + * This does no harm even if no such notifier is + * registered - the IOMMU notification framework will + * skip MAP notifications in that case. + */ + vtd_sync_shadow_page_table(vtd_as); + } + } + } +} + +/* Context-cache invalidation + * Returns the Context Actual Invalidation Granularity. + * @val: the content of the CCMD_REG + */ +static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) +{ + uint64_t caig; + uint64_t type = val & VTD_CCMD_CIRG_MASK; + + switch (type) { + case VTD_CCMD_DOMAIN_INVL: + /* Fall through */ + case VTD_CCMD_GLOBAL_INVL: + caig = VTD_CCMD_GLOBAL_INVL_A; + vtd_context_global_invalidate(s); + break; + + case VTD_CCMD_DEVICE_INVL: + caig = VTD_CCMD_DEVICE_INVL_A; + vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val)); + break; + + default: + error_report_once("%s: invalid context: 0x%" PRIx64, + __func__, val); + caig = 0; + } + return caig; +} + +static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) +{ + trace_vtd_inv_desc_iotlb_global(); + vtd_reset_iotlb(s); + vtd_iommu_replay_all(s); +} + +static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) +{ + VTDContextEntry ce; + VTDAddressSpace *vtd_as; + + trace_vtd_inv_desc_iotlb_domain(domain_id); + + vtd_iommu_lock(s); + g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, + &domain_id); + vtd_iommu_unlock(s); + + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce) && + domain_id == vtd_get_domain_id(s, &ce)) { + vtd_sync_shadow_page_table(vtd_as); + } + } +} + +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, + uint16_t domain_id, hwaddr addr, + uint8_t am) +{ + VTDAddressSpace *vtd_as; + VTDContextEntry ce; + int ret; + hwaddr size = (1 << am) * VTD_PAGE_SIZE; + + QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce); + if (!ret && domain_id == vtd_get_domain_id(s, &ce)) { + if (vtd_as_has_map_notifier(vtd_as)) { + /* + * As long as we have MAP
notifications registered in + * any of our IOMMU notifiers, we need to sync the + * shadow page table. + */ + vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); + } else { + /* + * For UNMAP-only notifiers, we don't need to walk the + * page tables. We just deliver the PSI down to + * invalidate caches. + */ + IOMMUTLBEvent event = { + .type = IOMMU_NOTIFIER_UNMAP, + .entry = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = 0, + .addr_mask = size - 1, + .perm = IOMMU_NONE, + }, + }; + memory_region_notify_iommu(&vtd_as->iommu, 0, event); + } + } + } +} + +static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, + hwaddr addr, uint8_t am) +{ + VTDIOTLBPageInvInfo info; + + trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); + + assert(am <= VTD_MAMV); + info.domain_id = domain_id; + info.addr = addr; + info.mask = ~((1 << am) - 1); + vtd_iommu_lock(s); + g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); + vtd_iommu_unlock(s); + vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); +} + +/* Flush IOTLB + * Returns the IOTLB Actual Invalidation Granularity. + * @val: the content of the IOTLB_REG + */ +static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val) +{ + uint64_t iaig; + uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK; + uint16_t domain_id; + hwaddr addr; + uint8_t am; + + switch (type) { + case VTD_TLB_GLOBAL_FLUSH: + iaig = VTD_TLB_GLOBAL_FLUSH_A; + vtd_iotlb_global_invalidate(s); + break; + + case VTD_TLB_DSI_FLUSH: + domain_id = VTD_TLB_DID(val); + iaig = VTD_TLB_DSI_FLUSH_A; + vtd_iotlb_domain_invalidate(s, domain_id); + break; + + case VTD_TLB_PSI_FLUSH: + domain_id = VTD_TLB_DID(val); + addr = vtd_get_quad_raw(s, DMAR_IVA_REG); + am = VTD_IVA_AM(addr); + addr = VTD_IVA_ADDR(addr); + if (am > VTD_MAMV) { + error_report_once("%s: address mask overflow: 0x%" PRIx64, + __func__, vtd_get_quad_raw(s, DMAR_IVA_REG)); + iaig = 0; + break; + } + iaig = VTD_TLB_PSI_FLUSH_A; + vtd_iotlb_page_invalidate(s, domain_id, addr, am); + break; + + default: + error_report_once("%s: invalid granularity: 0x%" PRIx64, + __func__, val); + iaig = 0; + } + return iaig; +} + +static void vtd_fetch_inv_desc(IntelIOMMUState *s); + +static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s) +{ + return s->qi_enabled && (s->iq_tail == s->iq_head) && + (s->iq_last_desc_type == VTD_INV_DESC_WAIT); +} + +static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) +{ + uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG); + + trace_vtd_inv_qi_enable(en); + + if (en) { + s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits); + /* 2^(x+8) entries */ + s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0)); + s->qi_enabled = true; + trace_vtd_inv_qi_setup(s->iq, s->iq_size); + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES); + + if (s->iq_tail != 0) { + /* + * This is a spec violation but Windows guests are known to set up + * Queued Invalidation this way so we allow the write and process + * Invalidation Descriptors right away. 
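+ * For example, a guest that programs DMAR_IQA_REG and a non-zero + * DMAR_IQT_REG before setting GCMD.QIE ends up here; the fetch below + * then consumes the descriptors that are already queued.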
+ */ + trace_vtd_warn_invalid_qi_tail(s->iq_tail); + if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { + vtd_fetch_inv_desc(s); + } + } + } else { + if (vtd_queued_inv_disable_check(s)) { + /* disable Queued Invalidation */ + vtd_set_quad_raw(s, DMAR_IQH_REG, 0); + s->iq_head = 0; + s->qi_enabled = false; + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0); + } else { + error_report_once("%s: detected improper state when disable QI " + "(head=0x%x, tail=0x%x, last_type=%d)", + __func__, + s->iq_head, s->iq_tail, s->iq_last_desc_type); + } + } +} + +/* Set Root Table Pointer */ +static void vtd_handle_gcmd_srtp(IntelIOMMUState *s) +{ + vtd_root_table_setup(s); + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS); + vtd_reset_caches(s); + vtd_address_space_refresh_all(s); +} + +/* Set Interrupt Remap Table Pointer */ +static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) +{ + vtd_interrupt_remap_table_setup(s); + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); +} + +/* Handle Translation Enable/Disable */ +static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) +{ + if (s->dmar_enabled == en) { + return; + } + + trace_vtd_dmar_enable(en); + + if (en) { + s->dmar_enabled = true; + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES); + } else { + s->dmar_enabled = false; + + /* Clear the index of Fault Recording Register */ + s->next_frcd_reg = 0; + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0); + } + + vtd_reset_caches(s); + vtd_address_space_refresh_all(s); +} + +/* Handle Interrupt Remap Enable/Disable */ +static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en) +{ + trace_vtd_ir_enable(en); + + if (en) { + s->intr_enabled = true; + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES); + } else { + s->intr_enabled = false; + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0); + } +} + +/* Handle write to Global Command Register */ +static void vtd_handle_gcmd_write(IntelIOMMUState *s) +{ + uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG); + uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG); + uint32_t changed = status ^ val; + + trace_vtd_reg_write_gcmd(status, val); + if (changed & VTD_GCMD_TE) { + /* Translation enable/disable */ + vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); + } + if (val & VTD_GCMD_SRTP) { + /* Set/update the root-table pointer */ + vtd_handle_gcmd_srtp(s); + } + if (changed & VTD_GCMD_QIE) { + /* Queued Invalidation Enable */ + vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE); + } + if (val & VTD_GCMD_SIRTP) { + /* Set/update the interrupt remapping root-table pointer */ + vtd_handle_gcmd_sirtp(s); + } + if (changed & VTD_GCMD_IRE) { + /* Interrupt remap enable/disable */ + vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE); + } +} + +/* Handle write to Context Command Register */ +static void vtd_handle_ccmd_write(IntelIOMMUState *s) +{ + uint64_t ret; + uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG); + + /* Context-cache invalidation request */ + if (val & VTD_CCMD_ICC) { + if (s->qi_enabled) { + error_report_once("Queued Invalidation enabled, " + "should not use register-based invalidation"); + return; + } + ret = vtd_context_cache_invalidate(s, val); + /* Invalidation completed. 
Change something to show */ + vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL); + ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK, + ret); + } +} + +/* Handle write to IOTLB Invalidation Register */ +static void vtd_handle_iotlb_write(IntelIOMMUState *s) +{ + uint64_t ret; + uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG); + + /* IOTLB invalidation request */ + if (val & VTD_TLB_IVT) { + if (s->qi_enabled) { + error_report_once("Queued Invalidation enabled, " + "should not use register-based invalidation"); + return; + } + ret = vtd_iotlb_flush(s, val); + /* Invalidation completed. Change something to show */ + vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL); + ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, + VTD_TLB_FLUSH_GRANU_MASK_A, ret); + } +} + +/* Fetch an Invalidation Descriptor from the Invalidation Queue */ +static bool vtd_get_inv_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + dma_addr_t base_addr = s->iq; + uint32_t offset = s->iq_head; + uint32_t dw = s->iq_dw ? 32 : 16; + dma_addr_t addr = base_addr + offset * dw; + + if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) { + error_report_once("Read INV DESC failed."); + return false; + } + inv_desc->lo = le64_to_cpu(inv_desc->lo); + inv_desc->hi = le64_to_cpu(inv_desc->hi); + if (dw == 32) { + inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]); + inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]); + } + return true; +} + +static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) +{ + if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) || + (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) { + error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 + " (reserved nonzero)", __func__, inv_desc->hi, + inv_desc->lo); + return false; + } + if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) { + /* Status Write */ + uint32_t status_data = (uint32_t)(inv_desc->lo >> + VTD_INV_DESC_WAIT_DATA_SHIFT); + + assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF)); + + /* FIXME: need to be masked with HAW? 
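+ * If so, masking with VTD_HAW_MASK(s->aw_bits) would be the obvious + * candidate; for now status_addr is taken from the descriptor as-is.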
*/ + dma_addr_t status_addr = inv_desc->hi; + trace_vtd_inv_desc_wait_sw(status_addr, status_data); + status_data = cpu_to_le32(status_data); + if (dma_memory_write(&address_space_memory, status_addr, &status_data, + sizeof(status_data))) { + trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); + return false; + } + } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { + /* Interrupt flag */ + vtd_generate_completion_event(s); + } else { + error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 + " (unknown type)", __func__, inv_desc->hi, + inv_desc->lo); + return false; + } + return true; +} + +static bool vtd_process_context_cache_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + uint16_t sid, fmask; + + if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) { + error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 + " (reserved nonzero)", __func__, inv_desc->hi, + inv_desc->lo); + return false; + } + switch (inv_desc->lo & VTD_INV_DESC_CC_G) { + case VTD_INV_DESC_CC_DOMAIN: + trace_vtd_inv_desc_cc_domain( + (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); + /* Fall through */ + case VTD_INV_DESC_CC_GLOBAL: + vtd_context_global_invalidate(s); + break; + + case VTD_INV_DESC_CC_DEVICE: + sid = VTD_INV_DESC_CC_SID(inv_desc->lo); + fmask = VTD_INV_DESC_CC_FM(inv_desc->lo); + vtd_context_device_invalidate(s, sid, fmask); + break; + + default: + error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 + " (invalid type)", __func__, inv_desc->hi, + inv_desc->lo); + return false; + } + return true; +} + +static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) +{ + uint16_t domain_id; + uint8_t am; + hwaddr addr; + + if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) || + (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) { + error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 + ", lo=0x%"PRIx64" (reserved bits unzero)", + __func__, inv_desc->hi, inv_desc->lo); + return false; + } + + switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) { + case VTD_INV_DESC_IOTLB_GLOBAL: + vtd_iotlb_global_invalidate(s); + break; + + case VTD_INV_DESC_IOTLB_DOMAIN: + domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); + vtd_iotlb_domain_invalidate(s, domain_id); + break; + + case VTD_INV_DESC_IOTLB_PAGE: + domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); + addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi); + am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi); + if (am > VTD_MAMV) { + error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 + ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)", + __func__, inv_desc->hi, inv_desc->lo, + am, (unsigned)VTD_MAMV); + return false; + } + vtd_iotlb_page_invalidate(s, domain_id, addr, am); + break; + + default: + error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 + ", lo=0x%"PRIx64" (type mismatch: 0x%llx)", + __func__, inv_desc->hi, inv_desc->lo, + inv_desc->lo & VTD_INV_DESC_IOTLB_G); + return false; + } + return true; +} + +static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + trace_vtd_inv_desc_iec(inv_desc->iec.granularity, + inv_desc->iec.index, + inv_desc->iec.index_mask); + + vtd_iec_notify_all(s, !inv_desc->iec.granularity, + inv_desc->iec.index, + inv_desc->iec.index_mask); + return true; +} + +static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + VTDAddressSpace *vtd_dev_as; + IOMMUTLBEvent event; + struct VTDBus *vtd_bus; + hwaddr addr; + uint64_t sz; + uint16_t sid; + uint8_t devfn; + bool size; + uint8_t bus_num; + + addr = 
VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi); + sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo); + devfn = sid & 0xff; + bus_num = sid >> 8; + size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi); + + if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) || + (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) { + error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64 + ", lo=%"PRIx64" (reserved nonzero)", __func__, + inv_desc->hi, inv_desc->lo); + return false; + } + + vtd_bus = vtd_find_as_from_bus_num(s, bus_num); + if (!vtd_bus) { + goto done; + } + + vtd_dev_as = vtd_bus->dev_as[devfn]; + if (!vtd_dev_as) { + goto done; + } + + /* According to ATS spec table 2.4: + * S = 0, bits 15:12 = xxxx range size: 4K + * S = 1, bits 15:12 = xxx0 range size: 8K + * S = 1, bits 15:12 = xx01 range size: 16K + * S = 1, bits 15:12 = x011 range size: 32K + * S = 1, bits 15:12 = 0111 range size: 64K + * ... + */ + if (size) { + sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT); + addr &= ~(sz - 1); + } else { + sz = VTD_PAGE_SIZE; + } + + event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP; + event.entry.target_as = &vtd_dev_as->as; + event.entry.addr_mask = sz - 1; + event.entry.iova = addr; + event.entry.perm = IOMMU_NONE; + event.entry.translated_addr = 0; + memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event); + +done: + return true; +} + +static bool vtd_process_inv_desc(IntelIOMMUState *s) +{ + VTDInvDesc inv_desc; + uint8_t desc_type; + + trace_vtd_inv_qi_head(s->iq_head); + if (!vtd_get_inv_desc(s, &inv_desc)) { + s->iq_last_desc_type = VTD_INV_DESC_NONE; + return false; + } + + desc_type = inv_desc.lo & VTD_INV_DESC_TYPE; + /* FIXME: should update at first or at last? */ + s->iq_last_desc_type = desc_type; + + switch (desc_type) { + case VTD_INV_DESC_CC: + trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo); + if (!vtd_process_context_cache_desc(s, &inv_desc)) { + return false; + } + break; + + case VTD_INV_DESC_IOTLB: + trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo); + if (!vtd_process_iotlb_desc(s, &inv_desc)) { + return false; + } + break; + + /* + * TODO: the entity of below two cases will be implemented in future series. + * To make guest (which integrates scalable mode support patch set in + * iommu driver) work, just return true is enough so far. + */ + case VTD_INV_DESC_PC: + break; + + case VTD_INV_DESC_PIOTLB: + break; + + case VTD_INV_DESC_WAIT: + trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo); + if (!vtd_process_wait_desc(s, &inv_desc)) { + return false; + } + break; + + case VTD_INV_DESC_IEC: + trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo); + if (!vtd_process_inv_iec_desc(s, &inv_desc)) { + return false; + } + break; + + case VTD_INV_DESC_DEVICE: + trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo); + if (!vtd_process_device_iotlb_desc(s, &inv_desc)) { + return false; + } + break; + + default: + error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64 + " (unknown type)", __func__, inv_desc.hi, + inv_desc.lo); + return false; + } + s->iq_head++; + if (s->iq_head == s->iq_size) { + s->iq_head = 0; + } + return true; +} + +/* Try to fetch and process more Invalidation Descriptors */ +static void vtd_fetch_inv_desc(IntelIOMMUState *s) +{ + int qi_shift; + + /* Refer to 10.4.23 of VT-d spec 3.0 */ + qi_shift = s->iq_dw ? 
VTD_IQH_QH_SHIFT_5 : VTD_IQH_QH_SHIFT_4; + + trace_vtd_inv_qi_fetch(); + + if (s->iq_tail >= s->iq_size) { + /* Detects an invalid Tail pointer */ + error_report_once("%s: detected invalid QI tail " + "(tail=0x%x, size=0x%x)", + __func__, s->iq_tail, s->iq_size); + vtd_handle_inv_queue_error(s); + return; + } + while (s->iq_head != s->iq_tail) { + if (!vtd_process_inv_desc(s)) { + /* Invalidation Queue Errors */ + vtd_handle_inv_queue_error(s); + break; + } + /* Must update the IQH_REG in time */ + vtd_set_quad_raw(s, DMAR_IQH_REG, + (((uint64_t)(s->iq_head)) << qi_shift) & + VTD_IQH_QH_MASK); + } +} + +/* Handle write to Invalidation Queue Tail Register */ +static void vtd_handle_iqt_write(IntelIOMMUState *s) +{ + uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG); + + if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) { + error_report_once("%s: RSV bit is set: val=0x%"PRIx64, + __func__, val); + return; + } + s->iq_tail = VTD_IQT_QT(s->iq_dw, val); + trace_vtd_inv_qi_tail(s->iq_tail); + + if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { + /* Process Invalidation Queue here */ + vtd_fetch_inv_desc(s); + } +} + +static void vtd_handle_fsts_write(IntelIOMMUState *s) +{ + uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); + uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); + uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE; + + if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) { + vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); + trace_vtd_fsts_clear_ip(); + } + /* FIXME: when IQE is Clear, should we try to fetch some Invalidation + * Descriptors if there are any when Queued Invalidation is enabled? + */ +} + +static void vtd_handle_fectl_write(IntelIOMMUState *s) +{ + uint32_t fectl_reg; + /* FIXME: when software clears the IM field, check the IP field. But do we + * need to compare the old value and the new value to conclude that + * software clears the IM field? Or just check if the IM field is zero? + */ + fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); + + trace_vtd_reg_write_fectl(fectl_reg); + + if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) { + vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); + vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); + } +} + +static void vtd_handle_ics_write(IntelIOMMUState *s) +{ + uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG); + uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); + + if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) { + trace_vtd_reg_ics_clear_ip(); + vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); + } +} + +static void vtd_handle_iectl_write(IntelIOMMUState *s) +{ + uint32_t iectl_reg; + /* FIXME: when software clears the IM field, check the IP field. But do we + * need to compare the old value and the new value to conclude that + * software clears the IM field? Or just check if the IM field is zero? 
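+ * The check below takes the simpler reading: the interrupt is generated + * whenever IP is set while IM reads as zero, without comparing the old + * and new register values.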
+ */ + iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); + + trace_vtd_reg_write_iectl(iectl_reg); + + if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) { + vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); + vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); + } +} + +static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) +{ + IntelIOMMUState *s = opaque; + uint64_t val; + + trace_vtd_reg_read(addr, size); + + if (addr + size > DMAR_REG_SIZE) { + error_report_once("%s: MMIO over range: addr=0x%" PRIx64 + " size=0x%x", __func__, addr, size); + return (uint64_t)-1; + } + + switch (addr) { + /* Root Table Address Register, 64-bit */ + case DMAR_RTADDR_REG: + val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); + if (size == 4) { + val = val & ((1ULL << 32) - 1); + } + break; + + case DMAR_RTADDR_REG_HI: + assert(size == 4); + val = vtd_get_quad_raw(s, DMAR_RTADDR_REG) >> 32; + break; + + /* Invalidation Queue Address Register, 64-bit */ + case DMAR_IQA_REG: + val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS); + if (size == 4) { + val = val & ((1ULL << 32) - 1); + } + break; + + case DMAR_IQA_REG_HI: + assert(size == 4); + val = s->iq >> 32; + break; + + default: + if (size == 4) { + val = vtd_get_long(s, addr); + } else { + val = vtd_get_quad(s, addr); + } + } + + return val; +} + +static void vtd_mem_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + IntelIOMMUState *s = opaque; + + trace_vtd_reg_write(addr, size, val); + + if (addr + size > DMAR_REG_SIZE) { + error_report_once("%s: MMIO over range: addr=0x%" PRIx64 + " size=0x%x", __func__, addr, size); + return; + } + + switch (addr) { + /* Global Command Register, 32-bit */ + case DMAR_GCMD_REG: + vtd_set_long(s, addr, val); + vtd_handle_gcmd_write(s); + break; + + /* Context Command Register, 64-bit */ + case DMAR_CCMD_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + vtd_handle_ccmd_write(s); + } + break; + + case DMAR_CCMD_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_ccmd_write(s); + break; + + /* IOTLB Invalidation Register, 64-bit */ + case DMAR_IOTLB_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + vtd_handle_iotlb_write(s); + } + break; + + case DMAR_IOTLB_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_iotlb_write(s); + break; + + /* Invalidate Address Register, 64-bit */ + case DMAR_IVA_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + break; + + case DMAR_IVA_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Fault Status Register, 32-bit */ + case DMAR_FSTS_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_fsts_write(s); + break; + + /* Fault Event Control Register, 32-bit */ + case DMAR_FECTL_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_fectl_write(s); + break; + + /* Fault Event Data Register, 32-bit */ + case DMAR_FEDATA_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Fault Event Address Register, 32-bit */ + case DMAR_FEADDR_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + /* + * While the register is 32-bit only, some guests (Xen...) write to + * it with 64-bit. 
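+ * Accepting the oversized access here keeps such guests running instead + * of tripping the size assertion used for the other 32-bit registers.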
+ */ + vtd_set_quad(s, addr, val); + } + break; + + /* Fault Event Upper Address Register, 32-bit */ + case DMAR_FEUADDR_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Protected Memory Enable Register, 32-bit */ + case DMAR_PMEN_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Root Table Address Register, 64-bit */ + case DMAR_RTADDR_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + break; + + case DMAR_RTADDR_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Invalidation Queue Tail Register, 64-bit */ + case DMAR_IQT_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + vtd_handle_iqt_write(s); + break; + + case DMAR_IQT_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + /* 19:63 of IQT_REG is RsvdZ, do nothing here */ + break; + + /* Invalidation Queue Address Register, 64-bit */ + case DMAR_IQA_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + if (s->ecap & VTD_ECAP_SMTS && + val & VTD_IQA_DW_MASK) { + s->iq_dw = true; + } else { + s->iq_dw = false; + } + break; + + case DMAR_IQA_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Invalidation Completion Status Register, 32-bit */ + case DMAR_ICS_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_ics_write(s); + break; + + /* Invalidation Event Control Register, 32-bit */ + case DMAR_IECTL_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_iectl_write(s); + break; + + /* Invalidation Event Data Register, 32-bit */ + case DMAR_IEDATA_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Invalidation Event Address Register, 32-bit */ + case DMAR_IEADDR_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Invalidation Event Upper Address Register, 32-bit */ + case DMAR_IEUADDR_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Fault Recording Registers, 128-bit */ + case DMAR_FRCD_REG_0_0: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + break; + + case DMAR_FRCD_REG_0_1: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + case DMAR_FRCD_REG_0_2: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + /* May clear bit 127 (Fault), update PPF */ + vtd_update_fsts_ppf(s); + } + break; + + case DMAR_FRCD_REG_0_3: + assert(size == 4); + vtd_set_long(s, addr, val); + /* May clear bit 127 (Fault), update PPF */ + vtd_update_fsts_ppf(s); + break; + + case DMAR_IRTA_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + break; + + case DMAR_IRTA_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + default: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + } +} + +static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr, + IOMMUAccessFlags flag, int iommu_idx) +{ + VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); + IntelIOMMUState *s = vtd_as->iommu_state; + IOMMUTLBEntry iotlb = { + /* We'll fill in the rest later. 
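+ * (iova, translated_addr, addr_mask and perm are filled in either by the + * translation below or by the DMAR-disabled fallback)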
*/ + .target_as = &address_space_memory, + }; + bool success; + + if (likely(s->dmar_enabled)) { + success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, + addr, flag & IOMMU_WO, &iotlb); + } else { + /* DMAR disabled, passthrough, use 4k-page*/ + iotlb.iova = addr & VTD_PAGE_MASK_4K; + iotlb.translated_addr = addr & VTD_PAGE_MASK_4K; + iotlb.addr_mask = ~VTD_PAGE_MASK_4K; + iotlb.perm = IOMMU_RW; + success = true; + } + + if (likely(success)) { + trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus), + VTD_PCI_SLOT(vtd_as->devfn), + VTD_PCI_FUNC(vtd_as->devfn), + iotlb.iova, iotlb.translated_addr, + iotlb.addr_mask); + } else { + error_report_once("%s: detected translation failure " + "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")", + __func__, pci_bus_num(vtd_as->bus), + VTD_PCI_SLOT(vtd_as->devfn), + VTD_PCI_FUNC(vtd_as->devfn), + addr); + } + + return iotlb; +} + +static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new, + Error **errp) +{ + VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); + IntelIOMMUState *s = vtd_as->iommu_state; + + /* Update per-address-space notifier flags */ + vtd_as->notifier_flags = new; + + if (old == IOMMU_NOTIFIER_NONE) { + QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(vtd_as, next); + } + return 0; +} + +static int vtd_post_load(void *opaque, int version_id) +{ + IntelIOMMUState *iommu = opaque; + + /* + * Memory regions are dynamically turned on/off depending on + * context entry configurations from the guest. After migration, + * we need to make sure the memory regions are still correct. + */ + vtd_switch_address_space_all(iommu); + + /* + * We don't need to migrate the root_scalable because we can + * simply do the calculation after the loading is complete. We + * can actually do similar things with root, dmar_enabled, etc. + * however since we've had them already so we'd better keep them + * for compatibility of migration. 
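+ * root_scalable is therefore recomputed below from the migrated root + * table pointer rather than being carried in the migration stream.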
+ */ + vtd_update_scalable_state(iommu); + + return 0; +} + +static const VMStateDescription vtd_vmstate = { + .name = "iommu-intel", + .version_id = 1, + .minimum_version_id = 1, + .priority = MIG_PRI_IOMMU, + .post_load = vtd_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT64(root, IntelIOMMUState), + VMSTATE_UINT64(intr_root, IntelIOMMUState), + VMSTATE_UINT64(iq, IntelIOMMUState), + VMSTATE_UINT32(intr_size, IntelIOMMUState), + VMSTATE_UINT16(iq_head, IntelIOMMUState), + VMSTATE_UINT16(iq_tail, IntelIOMMUState), + VMSTATE_UINT16(iq_size, IntelIOMMUState), + VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState), + VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE), + VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState), + VMSTATE_UNUSED(1), /* bool root_extended is obsolete by VT-d */ + VMSTATE_BOOL(dmar_enabled, IntelIOMMUState), + VMSTATE_BOOL(qi_enabled, IntelIOMMUState), + VMSTATE_BOOL(intr_enabled, IntelIOMMUState), + VMSTATE_BOOL(intr_eime, IntelIOMMUState), + VMSTATE_END_OF_LIST() + } +}; + +static const MemoryRegionOps vtd_mem_ops = { + .read = vtd_mem_read, + .write = vtd_mem_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 8, + }, + .valid = { + .min_access_size = 4, + .max_access_size = 8, + }, +}; + +static Property vtd_properties[] = { + DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0), + DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, + ON_OFF_AUTO_AUTO), + DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), + DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits, + VTD_HOST_ADDRESS_WIDTH), + DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), + DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE), + DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), + DEFINE_PROP_END_OF_LIST(), +}; + +/* Read IRTE entry with specific index */ +static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, + VTD_IR_TableEntry *entry, uint16_t sid) +{ + static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \ + {0xffff, 0xfffb, 0xfff9, 0xfff8}; + dma_addr_t addr = 0x00; + uint16_t mask, source_id; + uint8_t bus, bus_max, bus_min; + + if (index >= iommu->intr_size) { + error_report_once("%s: index too large: ind=0x%x", + __func__, index); + return -VTD_FR_IR_INDEX_OVER; + } + + addr = iommu->intr_root + index * sizeof(*entry); + if (dma_memory_read(&address_space_memory, addr, entry, + sizeof(*entry))) { + error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64, + __func__, index, addr); + return -VTD_FR_IR_ROOT_INVAL; + } + + trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]), + le64_to_cpu(entry->data[0])); + + if (!entry->irte.present) { + error_report_once("%s: detected non-present IRTE " + "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", + __func__, index, le64_to_cpu(entry->data[1]), + le64_to_cpu(entry->data[0])); + return -VTD_FR_IR_ENTRY_P; + } + + if (entry->irte.__reserved_0 || entry->irte.__reserved_1 || + entry->irte.__reserved_2) { + error_report_once("%s: detected non-zero reserved IRTE " + "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", + __func__, index, le64_to_cpu(entry->data[1]), + le64_to_cpu(entry->data[0])); + return -VTD_FR_IR_IRTE_RSVD; + } + + if (sid != X86_IOMMU_SID_INVALID) { + /* Validate IRTE SID */ + source_id = le32_to_cpu(entry->irte.source_id); + switch (entry->irte.sid_vtype) { + case VTD_SVT_NONE: + break; + + case VTD_SVT_ALL: + mask = vtd_svt_mask[entry->irte.sid_q]; + if ((source_id & mask) 
!= (sid & mask)) { + error_report_once("%s: invalid IRTE SID " + "(index=%u, sid=%u, source_id=%u)", + __func__, index, sid, source_id); + return -VTD_FR_IR_SID_ERR; + } + break; + + case VTD_SVT_BUS: + bus_max = source_id >> 8; + bus_min = source_id & 0xff; + bus = sid >> 8; + if (bus > bus_max || bus < bus_min) { + error_report_once("%s: invalid SVT_BUS " + "(index=%u, bus=%u, min=%u, max=%u)", + __func__, index, bus, bus_min, bus_max); + return -VTD_FR_IR_SID_ERR; + } + break; + + default: + error_report_once("%s: detected invalid IRTE SVT " + "(index=%u, type=%d)", __func__, + index, entry->irte.sid_vtype); + /* Take this as verification failure. */ + return -VTD_FR_IR_SID_ERR; + } + } + + return 0; +} + +/* Fetch IRQ information of specific IR index */ +static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, + X86IOMMUIrq *irq, uint16_t sid) +{ + VTD_IR_TableEntry irte = {}; + int ret = 0; + + ret = vtd_irte_get(iommu, index, &irte, sid); + if (ret) { + return ret; + } + + irq->trigger_mode = irte.irte.trigger_mode; + irq->vector = irte.irte.vector; + irq->delivery_mode = irte.irte.delivery_mode; + irq->dest = le32_to_cpu(irte.irte.dest_id); + if (!iommu->intr_eime) { +#define VTD_IR_APIC_DEST_MASK (0xff00ULL) +#define VTD_IR_APIC_DEST_SHIFT (8) + irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >> + VTD_IR_APIC_DEST_SHIFT; + } + irq->dest_mode = irte.irte.dest_mode; + irq->redir_hint = irte.irte.redir_hint; + + trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector, + irq->delivery_mode, irq->dest, irq->dest_mode); + + return 0; +} + +/* Interrupt remapping for MSI/MSI-X entry */ +static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, + MSIMessage *origin, + MSIMessage *translated, + uint16_t sid) +{ + int ret = 0; + VTD_IR_MSIAddress addr; + uint16_t index; + X86IOMMUIrq irq = {}; + + assert(origin && translated); + + trace_vtd_ir_remap_msi_req(origin->address, origin->data); + + if (!iommu || !iommu->intr_enabled) { + memcpy(translated, origin, sizeof(*origin)); + goto out; + } + + if (origin->address & VTD_MSI_ADDR_HI_MASK) { + error_report_once("%s: MSI address high 32 bits non-zero detected: " + "address=0x%" PRIx64, __func__, origin->address); + return -VTD_FR_IR_REQ_RSVD; + } + + addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; + if (addr.addr.__head != 0xfee) { + error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32, + __func__, addr.data); + return -VTD_FR_IR_REQ_RSVD; + } + + /* This is compatible mode. 
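+ * Requests in compatibility format are not remapped; they are passed + * through to the destination unchanged.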
*/ + if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) { + memcpy(translated, origin, sizeof(*origin)); + goto out; + } + + index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l); + +#define VTD_IR_MSI_DATA_SUBHANDLE (0x0000ffff) +#define VTD_IR_MSI_DATA_RESERVED (0xffff0000) + + if (addr.addr.sub_valid) { + /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */ + index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE; + } + + ret = vtd_remap_irq_get(iommu, index, &irq, sid); + if (ret) { + return ret; + } + + if (addr.addr.sub_valid) { + trace_vtd_ir_remap_type("MSI"); + if (origin->data & VTD_IR_MSI_DATA_RESERVED) { + error_report_once("%s: invalid IR MSI " + "(sid=%u, address=0x%" PRIx64 + ", data=0x%" PRIx32 ")", + __func__, sid, origin->address, origin->data); + return -VTD_FR_IR_REQ_RSVD; + } + } else { + uint8_t vector = origin->data & 0xff; + uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; + + trace_vtd_ir_remap_type("IOAPIC"); + /* IOAPIC entry vector should be aligned with IRTE vector + * (see vt-d spec 5.1.5.1). */ + if (vector != irq.vector) { + trace_vtd_warn_ir_vector(sid, index, vector, irq.vector); + } + + /* The Trigger Mode field must match the Trigger Mode in the IRTE. + * (see vt-d spec 5.1.5.1). */ + if (trigger_mode != irq.trigger_mode) { + trace_vtd_warn_ir_trigger(sid, index, trigger_mode, + irq.trigger_mode); + } + } + + /* + * We'd better keep the last two bits, assuming that guest OS + * might modify it. Keep it does not hurt after all. + */ + irq.msi_addr_last_bits = addr.addr.__not_care; + + /* Translate X86IOMMUIrq to MSI message */ + x86_iommu_irq_to_msi_message(&irq, translated); + +out: + trace_vtd_ir_remap_msi(origin->address, origin->data, + translated->address, translated->data); + return 0; +} + +static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src, + MSIMessage *dst, uint16_t sid) +{ + return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), + src, dst, sid); +} + +static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr, + uint64_t *data, unsigned size, + MemTxAttrs attrs) +{ + return MEMTX_OK; +} + +static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, + uint64_t value, unsigned size, + MemTxAttrs attrs) +{ + int ret = 0; + MSIMessage from = {}, to = {}; + uint16_t sid = X86_IOMMU_SID_INVALID; + + from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST; + from.data = (uint32_t) value; + + if (!attrs.unspecified) { + /* We have explicit Source ID */ + sid = attrs.requester_id; + } + + ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid); + if (ret) { + /* TODO: report error */ + /* Drop this interrupt */ + return MEMTX_ERROR; + } + + apic_get_class()->send_msi(&to); + + return MEMTX_OK; +} + +static const MemoryRegionOps vtd_mem_ir_ops = { + .read_with_attrs = vtd_mem_ir_read, + .write_with_attrs = vtd_mem_ir_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, + .valid = { + .min_access_size = 4, + .max_access_size = 4, + }, +}; + +VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) +{ + uintptr_t key = (uintptr_t)bus; + VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key); + VTDAddressSpace *vtd_dev_as; + char name[128]; + + if (!vtd_bus) { + uintptr_t *new_key = g_malloc(sizeof(*new_key)); + *new_key = (uintptr_t)bus; + /* No corresponding free() */ + vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \ + PCI_DEVFN_MAX); + vtd_bus->bus = bus; + g_hash_table_insert(s->vtd_as_by_busptr, 
new_key, vtd_bus); + } + + vtd_dev_as = vtd_bus->dev_as[devfn]; + + if (!vtd_dev_as) { + snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn), + PCI_FUNC(devfn)); + vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace)); + + vtd_dev_as->bus = bus; + vtd_dev_as->devfn = (uint8_t)devfn; + vtd_dev_as->iommu_state = s; + vtd_dev_as->context_cache_entry.context_cache_gen = 0; + vtd_dev_as->iova_tree = iova_tree_new(); + + memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX); + address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root"); + + /* + * Build the DMAR-disabled container with aliases to the + * shared MRs. Note that aliasing to a shared memory region + * could help the memory API to detect same FlatViews so we + * can have devices to share the same FlatView when DMAR is + * disabled (either by not providing "intel_iommu=on" or with + * "iommu=pt"). It will greatly reduce the total number of + * FlatViews of the system hence VM runs faster. + */ + memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s), + "vtd-nodmar", &s->mr_nodmar, 0, + memory_region_size(&s->mr_nodmar)); + + /* + * Build the per-device DMAR-enabled container. + * + * TODO: currently we have per-device IOMMU memory region only + * because we have per-device IOMMU notifiers for devices. If + * one day we can abstract the IOMMU notifiers out of the + * memory regions then we can also share the same memory + * region here just like what we've done above with the nodmar + * region. + */ + strcat(name, "-dmar"); + memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu), + TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s), + name, UINT64_MAX); + memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir", + &s->mr_ir, 0, memory_region_size(&s->mr_ir)); + memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu), + VTD_INTERRUPT_ADDR_FIRST, + &vtd_dev_as->iommu_ir, 1); + + /* + * Hook both the containers under the root container, we + * switch between DMAR & noDMAR by enable/disable + * corresponding sub-containers + */ + memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, + MEMORY_REGION(&vtd_dev_as->iommu), + 0); + memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, + &vtd_dev_as->nodmar, 0); + + vtd_switch_address_space(vtd_dev_as); + } + return vtd_dev_as; +} + +/* Unmap the whole range in the notifier's scope. */ +static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) +{ + hwaddr size, remain; + hwaddr start = n->start; + hwaddr end = n->end; + IntelIOMMUState *s = as->iommu_state; + DMAMap map; + + /* + * Note: all the codes in this function has a assumption that IOVA + * bits are no more than VTD_MGAW bits (which is restricted by + * VT-d spec), otherwise we need to consider overflow of 64 bits. 
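+ * As a concrete case, with aw_bits == 48 the clamp below caps 'end' at + * the top of the supported address space (2^48 - 1), so 'end - start + 1' + * cannot wrap around 64 bits.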
+ */ + + if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) { + /* + * Don't need to unmap regions that is bigger than the whole + * VT-d supported address space size + */ + end = VTD_ADDRESS_SIZE(s->aw_bits) - 1; + } + + assert(start <= end); + size = remain = end - start + 1; + + while (remain >= VTD_PAGE_SIZE) { + IOMMUTLBEvent event; + uint64_t mask = dma_aligned_pow2_mask(start, end, s->aw_bits); + uint64_t size = mask + 1; + + assert(size); + + event.type = IOMMU_NOTIFIER_UNMAP; + event.entry.iova = start; + event.entry.addr_mask = mask; + event.entry.target_as = &address_space_memory; + event.entry.perm = IOMMU_NONE; + /* This field is meaningless for unmap */ + event.entry.translated_addr = 0; + + memory_region_notify_iommu_one(n, &event); + + start += size; + remain -= size; + } + + assert(!remain); + + trace_vtd_as_unmap_whole(pci_bus_num(as->bus), + VTD_PCI_SLOT(as->devfn), + VTD_PCI_FUNC(as->devfn), + n->start, size); + + map.iova = n->start; + map.size = size; + iova_tree_remove(as->iova_tree, &map); +} + +static void vtd_address_space_unmap_all(IntelIOMMUState *s) +{ + VTDAddressSpace *vtd_as; + IOMMUNotifier *n; + + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { + vtd_address_space_unmap(vtd_as, n); + } + } +} + +static void vtd_address_space_refresh_all(IntelIOMMUState *s) +{ + vtd_address_space_unmap_all(s); + vtd_switch_address_space_all(s); +} + +static int vtd_replay_hook(IOMMUTLBEvent *event, void *private) +{ + memory_region_notify_iommu_one(private, event); + return 0; +} + +static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) +{ + VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu); + IntelIOMMUState *s = vtd_as->iommu_state; + uint8_t bus_n = pci_bus_num(vtd_as->bus); + VTDContextEntry ce; + + /* + * The replay can be triggered by either a invalidation or a newly + * created entry. No matter what, we release existing mappings + * (it means flushing caches for UNMAP-only registers). + */ + vtd_address_space_unmap(vtd_as, n); + + if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) { + trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" : + "legacy mode", + bus_n, PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn), + vtd_get_domain_id(s, &ce), + ce.hi, ce.lo); + if (vtd_as_has_map_notifier(vtd_as)) { + /* This is required only for MAP typed notifiers */ + vtd_page_walk_info info = { + .hook_fn = vtd_replay_hook, + .private = (void *)n, + .notify_unmap = false, + .aw = s->aw_bits, + .as = vtd_as, + .domain_id = vtd_get_domain_id(s, &ce), + }; + + vtd_page_walk(s, &ce, 0, ~0ULL, &info); + } + } else { + trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn)); + } + + return; +} + +/* Do the initialization. It will also be called when reset, so pay + * attention when adding new initialization stuff. 
+ */ +static void vtd_init(IntelIOMMUState *s) +{ + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + memset(s->csr, 0, DMAR_REG_SIZE); + memset(s->wmask, 0, DMAR_REG_SIZE); + memset(s->w1cmask, 0, DMAR_REG_SIZE); + memset(s->womask, 0, DMAR_REG_SIZE); + + s->root = 0; + s->root_scalable = false; + s->dmar_enabled = false; + s->intr_enabled = false; + s->iq_head = 0; + s->iq_tail = 0; + s->iq = 0; + s->iq_size = 0; + s->qi_enabled = false; + s->iq_last_desc_type = VTD_INV_DESC_NONE; + s->iq_dw = false; + s->next_frcd_reg = 0; + s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | + VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | + VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits); + if (s->dma_drain) { + s->cap |= VTD_CAP_DRAIN; + } + if (s->aw_bits == VTD_HOST_AW_48BIT) { + s->cap |= VTD_CAP_SAGAW_48bit; + } + s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; + + /* + * Rsvd field masks for spte + */ + vtd_spte_rsvd[0] = ~0ULL; + vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); + vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); + vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); + + vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + + if (s->scalable_mode) { + vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; + } + + if (x86_iommu_ir_supported(x86_iommu)) { + s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; + if (s->intr_eim == ON_OFF_AUTO_ON) { + s->ecap |= VTD_ECAP_EIM; + } + assert(s->intr_eim != ON_OFF_AUTO_AUTO); + } + + if (x86_iommu->dt_supported) { + s->ecap |= VTD_ECAP_DT; + } + + if (x86_iommu->pt_supported) { + s->ecap |= VTD_ECAP_PT; + } + + if (s->caching_mode) { + s->cap |= VTD_CAP_CM; + } + + /* TODO: read cap/ecap from host to decide which cap to be exposed. */ + if (s->scalable_mode) { + s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS; + } + + vtd_reset_caches(s); + + /* Define registers with default values and bit semantics */ + vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); + vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); + vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); + vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); + vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); + vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); + vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0); + vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0); + vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL); + + /* Advanced Fault Logging not supported */ + vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL); + vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0); + vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0); + vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0); + + /* Treated as RsvdZ when EIM in ECAP_REG is not supported + * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0); + */ + vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0); + + /* Treated as RO for implementations that PLMR and PHMR fields reported + * as Clear in the CAP_REG. 
+ * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0); + */ + vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0); + + vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0); + vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0); + vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0); + vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL); + vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0); + vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0); + vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0); + /* Treadted as RsvdZ when EIM in ECAP_REG is not supported */ + vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0); + + /* IOTLB registers */ + vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0Xb003ffff00000000ULL, 0); + vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0); + vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL); + + /* Fault Recording Registers, 128-bit */ + vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0); + vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL); + + /* + * Interrupt remapping registers. + */ + vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0); +} + +/* Should not reset address_spaces when reset because devices will still use + * the address space they got at first (won't ask the bus again). + */ +static void vtd_reset(DeviceState *dev) +{ + IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); + + vtd_init(s); + vtd_address_space_refresh_all(s); +} + +static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + assert(0 <= devfn && devfn < PCI_DEVFN_MAX); + + vtd_as = vtd_find_add_as(s, bus, devfn); + return &vtd_as->as; +} + +static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) +{ + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) { + error_setg(errp, "eim=on cannot be selected without intremap=on"); + return false; + } + + if (s->intr_eim == ON_OFF_AUTO_AUTO) { + s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim) + && x86_iommu_ir_supported(x86_iommu) ? + ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; + } + if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { + if (!kvm_irqchip_in_kernel()) { + error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"); + return false; + } + if (!kvm_enable_x2apic()) { + error_setg(errp, "eim=on requires support on the KVM side" + "(X2APIC_API, first shipped in v4.7)"); + return false; + } + } + + /* Currently only address widths supported are 39 and 48 bits */ + if ((s->aw_bits != VTD_HOST_AW_39BIT) && + (s->aw_bits != VTD_HOST_AW_48BIT)) { + error_setg(errp, "Supported values for aw-bits are: %d, %d", + VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT); + return false; + } + + if (s->scalable_mode && !s->dma_drain) { + error_setg(errp, "Need to set dma_drain for scalable mode"); + return false; + } + + return true; +} + +static int vtd_machine_done_notify_one(Object *child, void *unused) +{ + IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); + + /* + * We hard-coded here because vfio-pci is the only special case + * here. Let's be more elegant in the future when we can, but so + * far there seems to be no better way. 
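+ * The reason for the check: assigned vfio-pci devices rely on + * caching-mode=on so that guest mapping updates are propagated to the + * host IOMMU; without it the hook below refuses to continue.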
+ */ + if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) { + vtd_panic_require_caching_mode(); + } + + return 0; +} + +static void vtd_machine_done_hook(Notifier *notifier, void *unused) +{ + object_child_foreach_recursive(object_get_root(), + vtd_machine_done_notify_one, NULL); +} + +static Notifier vtd_machine_done_notify = { + .notify = vtd_machine_done_hook, +}; + +static void vtd_realize(DeviceState *dev, Error **errp) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + PCMachineState *pcms = PC_MACHINE(ms); + X86MachineState *x86ms = X86_MACHINE(ms); + PCIBus *bus = pcms->bus; + IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); + + if (!vtd_decide_config(s, errp)) { + return; + } + + QLIST_INIT(&s->vtd_as_with_notifiers); + qemu_mutex_init(&s->iommu_lock); + memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); + memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, + "intel_iommu", DMAR_REG_SIZE); + + /* Create the shared memory regions by all devices */ + memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar", + UINT64_MAX); + memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops, + s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE); + memory_region_init_alias(&s->mr_sys_alias, OBJECT(s), + "vtd-sys-alias", get_system_memory(), 0, + memory_region_size(get_system_memory())); + memory_region_add_subregion_overlap(&s->mr_nodmar, 0, + &s->mr_sys_alias, 0); + memory_region_add_subregion_overlap(&s->mr_nodmar, + VTD_INTERRUPT_ADDR_FIRST, + &s->mr_ir, 1); + + sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem); + /* No corresponding destroy */ + s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, + g_free, g_free); + s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, + g_free, g_free); + vtd_init(s); + sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); + pci_setup_iommu(bus, vtd_host_dma_iommu, dev); + /* Pseudo address space under root PCI bus. 
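+ * This gives the IOAPIC its own VT-d address space so that the interrupt + * messages it generates can be remapped with a proper source-id.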
*/ + x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); + qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); +} + +static void vtd_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass); + + dc->reset = vtd_reset; + dc->vmsd = &vtd_vmstate; + device_class_set_props(dc, vtd_properties); + dc->hotpluggable = false; + x86_class->realize = vtd_realize; + x86_class->int_remap = vtd_int_remap; + /* Supported by the pc-q35-* machine types */ + dc->user_creatable = true; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + dc->desc = "Intel IOMMU (VT-d) DMA Remapping device"; +} + +static const TypeInfo vtd_info = { + .name = TYPE_INTEL_IOMMU_DEVICE, + .parent = TYPE_X86_IOMMU_DEVICE, + .instance_size = sizeof(IntelIOMMUState), + .class_init = vtd_class_init, +}; + +static void vtd_iommu_memory_region_class_init(ObjectClass *klass, + void *data) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); + + imrc->translate = vtd_iommu_translate; + imrc->notify_flag_changed = vtd_iommu_notify_flag_changed; + imrc->replay = vtd_iommu_replay; +} + +static const TypeInfo vtd_iommu_memory_region_info = { + .parent = TYPE_IOMMU_MEMORY_REGION, + .name = TYPE_INTEL_IOMMU_MEMORY_REGION, + .class_init = vtd_iommu_memory_region_class_init, +}; + +static void vtd_register_types(void) +{ + type_register_static(&vtd_info); + type_register_static(&vtd_iommu_memory_region_info); +} + +type_init(vtd_register_types) diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h new file mode 100644 index 000000000..a6c788049 --- /dev/null +++ b/hw/i386/intel_iommu_internal.h @@ -0,0 +1,518 @@ +/* + * QEMU emulation of an Intel IOMMU (VT-d) + * (DMA Remapping device) + * + * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com> + * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. 
+ * + * Lots of defines copied from kernel/include/linux/intel-iommu.h: + * Copyright (C) 2006-2008 Intel Corporation + * Author: Ashok Raj <ashok.raj@intel.com> + * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> + * + */ + +#ifndef HW_I386_INTEL_IOMMU_INTERNAL_H +#define HW_I386_INTEL_IOMMU_INTERNAL_H +#include "hw/i386/intel_iommu.h" + +/* + * Intel IOMMU register specification + */ +#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */ +#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */ +#define DMAR_CAP_REG_HI 0xc /* High 32-bit of DMAR_CAP_REG */ +#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */ +#define DMAR_ECAP_REG_HI 0X14 +#define DMAR_GCMD_REG 0x18 /* Global command */ +#define DMAR_GSTS_REG 0x1c /* Global status */ +#define DMAR_RTADDR_REG 0x20 /* Root entry table */ +#define DMAR_RTADDR_REG_HI 0X24 +#define DMAR_CCMD_REG 0x28 /* Context command */ +#define DMAR_CCMD_REG_HI 0x2c +#define DMAR_FSTS_REG 0x34 /* Fault status */ +#define DMAR_FECTL_REG 0x38 /* Fault control */ +#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data */ +#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr */ +#define DMAR_FEUADDR_REG 0x44 /* Upper address */ +#define DMAR_AFLOG_REG 0x58 /* Advanced fault control */ +#define DMAR_AFLOG_REG_HI 0X5c +#define DMAR_PMEN_REG 0x64 /* Enable protected memory region */ +#define DMAR_PLMBASE_REG 0x68 /* PMRR low addr */ +#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ +#define DMAR_PHMBASE_REG 0x70 /* PMRR high base addr */ +#define DMAR_PHMBASE_REG_HI 0X74 +#define DMAR_PHMLIMIT_REG 0x78 /* PMRR high limit */ +#define DMAR_PHMLIMIT_REG_HI 0x7c +#define DMAR_IQH_REG 0x80 /* Invalidation queue head */ +#define DMAR_IQH_REG_HI 0X84 +#define DMAR_IQT_REG 0x88 /* Invalidation queue tail */ +#define DMAR_IQT_REG_HI 0X8c +#define DMAR_IQA_REG 0x90 /* Invalidation queue addr */ +#define DMAR_IQA_REG_HI 0x94 +#define DMAR_ICS_REG 0x9c /* Invalidation complete status */ +#define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr */ +#define DMAR_IRTA_REG_HI 0xbc +#define DMAR_IECTL_REG 0xa0 /* Invalidation event control */ +#define DMAR_IEDATA_REG 0xa4 /* Invalidation event data */ +#define DMAR_IEADDR_REG 0xa8 /* Invalidation event address */ +#define DMAR_IEUADDR_REG 0xac /* Invalidation event address */ +#define DMAR_PQH_REG 0xc0 /* Page request queue head */ +#define DMAR_PQH_REG_HI 0xc4 +#define DMAR_PQT_REG 0xc8 /* Page request queue tail*/ +#define DMAR_PQT_REG_HI 0xcc +#define DMAR_PQA_REG 0xd0 /* Page request queue address */ +#define DMAR_PQA_REG_HI 0xd4 +#define DMAR_PRS_REG 0xdc /* Page request status */ +#define DMAR_PECTL_REG 0xe0 /* Page request event control */ +#define DMAR_PEDATA_REG 0xe4 /* Page request event data */ +#define DMAR_PEADDR_REG 0xe8 /* Page request event address */ +#define DMAR_PEUADDR_REG 0xec /* Page event upper address */ +#define DMAR_MTRRCAP_REG 0x100 /* MTRR capability */ +#define DMAR_MTRRCAP_REG_HI 0x104 +#define DMAR_MTRRDEF_REG 0x108 /* MTRR default type */ +#define DMAR_MTRRDEF_REG_HI 0x10c + +/* IOTLB registers */ +#define DMAR_IOTLB_REG_OFFSET 0xf0 /* Offset to the IOTLB registers */ +#define DMAR_IVA_REG DMAR_IOTLB_REG_OFFSET /* Invalidate address */ +#define DMAR_IVA_REG_HI (DMAR_IVA_REG + 4) +/* IOTLB invalidate register */ +#define DMAR_IOTLB_REG (DMAR_IOTLB_REG_OFFSET + 0x8) +#define DMAR_IOTLB_REG_HI (DMAR_IOTLB_REG + 4) + +/* FRCD */ +#define DMAR_FRCD_REG_OFFSET 0x220 /* Offset to the fault recording regs */ +/* NOTICE: If you change the 
DMAR_FRCD_REG_NR, please remember to change the + * DMAR_REG_SIZE in include/hw/i386/intel_iommu.h. + * #define DMAR_REG_SIZE (DMAR_FRCD_REG_OFFSET + 16 * DMAR_FRCD_REG_NR) + */ +#define DMAR_FRCD_REG_NR 1ULL /* Num of fault recording regs */ + +#define DMAR_FRCD_REG_0_0 0x220 /* The 0th fault recording regs */ +#define DMAR_FRCD_REG_0_1 0x224 +#define DMAR_FRCD_REG_0_2 0x228 +#define DMAR_FRCD_REG_0_3 0x22c + +/* Interrupt Address Range */ +#define VTD_INTERRUPT_ADDR_FIRST 0xfee00000ULL +#define VTD_INTERRUPT_ADDR_LAST 0xfeefffffULL +#define VTD_INTERRUPT_ADDR_SIZE (VTD_INTERRUPT_ADDR_LAST - \ + VTD_INTERRUPT_ADDR_FIRST + 1) + +/* The shift of source_id in the key of IOTLB hash table */ +#define VTD_IOTLB_SID_SHIFT 36 +#define VTD_IOTLB_LVL_SHIFT 52 +#define VTD_IOTLB_MAX_SIZE 1024 /* Max size of the hash table */ + +/* IOTLB_REG */ +#define VTD_TLB_GLOBAL_FLUSH (1ULL << 60) /* Global invalidation */ +#define VTD_TLB_DSI_FLUSH (2ULL << 60) /* Domain-selective */ +#define VTD_TLB_PSI_FLUSH (3ULL << 60) /* Page-selective */ +#define VTD_TLB_FLUSH_GRANU_MASK (3ULL << 60) +#define VTD_TLB_GLOBAL_FLUSH_A (1ULL << 57) +#define VTD_TLB_DSI_FLUSH_A (2ULL << 57) +#define VTD_TLB_PSI_FLUSH_A (3ULL << 57) +#define VTD_TLB_FLUSH_GRANU_MASK_A (3ULL << 57) +#define VTD_TLB_IVT (1ULL << 63) +#define VTD_TLB_DID(val) (((val) >> 32) & VTD_DOMAIN_ID_MASK) + +/* IVA_REG */ +#define VTD_IVA_ADDR(val) ((val) & ~0xfffULL) +#define VTD_IVA_AM(val) ((val) & 0x3fULL) + +/* GCMD_REG */ +#define VTD_GCMD_TE (1UL << 31) +#define VTD_GCMD_SRTP (1UL << 30) +#define VTD_GCMD_SFL (1UL << 29) +#define VTD_GCMD_EAFL (1UL << 28) +#define VTD_GCMD_WBF (1UL << 27) +#define VTD_GCMD_QIE (1UL << 26) +#define VTD_GCMD_IRE (1UL << 25) +#define VTD_GCMD_SIRTP (1UL << 24) +#define VTD_GCMD_CFI (1UL << 23) + +/* GSTS_REG */ +#define VTD_GSTS_TES (1UL << 31) +#define VTD_GSTS_RTPS (1UL << 30) +#define VTD_GSTS_FLS (1UL << 29) +#define VTD_GSTS_AFLS (1UL << 28) +#define VTD_GSTS_WBFS (1UL << 27) +#define VTD_GSTS_QIES (1UL << 26) +#define VTD_GSTS_IRES (1UL << 25) +#define VTD_GSTS_IRTPS (1UL << 24) +#define VTD_GSTS_CFIS (1UL << 23) + +/* CCMD_REG */ +#define VTD_CCMD_ICC (1ULL << 63) +#define VTD_CCMD_GLOBAL_INVL (1ULL << 61) +#define VTD_CCMD_DOMAIN_INVL (2ULL << 61) +#define VTD_CCMD_DEVICE_INVL (3ULL << 61) +#define VTD_CCMD_CIRG_MASK (3ULL << 61) +#define VTD_CCMD_GLOBAL_INVL_A (1ULL << 59) +#define VTD_CCMD_DOMAIN_INVL_A (2ULL << 59) +#define VTD_CCMD_DEVICE_INVL_A (3ULL << 59) +#define VTD_CCMD_CAIG_MASK (3ULL << 59) +#define VTD_CCMD_DID(val) ((val) & VTD_DOMAIN_ID_MASK) +#define VTD_CCMD_SID(val) (((val) >> 16) & 0xffffULL) +#define VTD_CCMD_FM(val) (((val) >> 32) & 3ULL) + +/* RTADDR_REG */ +#define VTD_RTADDR_SMT (1ULL << 10) +#define VTD_RTADDR_ADDR_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL) + +/* IRTA_REG */ +#define VTD_IRTA_ADDR_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL) +#define VTD_IRTA_EIME (1ULL << 11) +#define VTD_IRTA_SIZE_MASK (0xfULL) + +/* ECAP_REG */ +/* (offset >> 4) << 8 */ +#define VTD_ECAP_IRO (DMAR_IOTLB_REG_OFFSET << 4) +#define VTD_ECAP_QI (1ULL << 1) +#define VTD_ECAP_DT (1ULL << 2) +/* Interrupt Remapping support */ +#define VTD_ECAP_IR (1ULL << 3) +#define VTD_ECAP_EIM (1ULL << 4) +#define VTD_ECAP_PT (1ULL << 6) +#define VTD_ECAP_MHMV (15ULL << 20) +#define VTD_ECAP_SRS (1ULL << 31) +#define VTD_ECAP_SMTS (1ULL << 43) +#define VTD_ECAP_SLTS (1ULL << 46) + +/* CAP_REG */ +/* (offset >> 4) << 24 */ +#define VTD_CAP_FRO (DMAR_FRCD_REG_OFFSET << 20) +#define VTD_CAP_NFR ((DMAR_FRCD_REG_NR - 1) << 40) +#define 
VTD_DOMAIN_ID_SHIFT 16 /* 16-bit domain id for 64K domains */ +#define VTD_DOMAIN_ID_MASK ((1UL << VTD_DOMAIN_ID_SHIFT) - 1) +#define VTD_CAP_ND (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL) +#define VTD_ADDRESS_SIZE(aw) (1ULL << (aw)) +#define VTD_CAP_MGAW(aw) ((((aw) - 1) & 0x3fULL) << 16) +#define VTD_MAMV 18ULL +#define VTD_CAP_MAMV (VTD_MAMV << 48) +#define VTD_CAP_PSI (1ULL << 39) +#define VTD_CAP_SLLPS ((1ULL << 34) | (1ULL << 35)) +#define VTD_CAP_DRAIN_WRITE (1ULL << 54) +#define VTD_CAP_DRAIN_READ (1ULL << 55) +#define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE) +#define VTD_CAP_CM (1ULL << 7) + +/* Supported Adjusted Guest Address Widths */ +#define VTD_CAP_SAGAW_SHIFT 8 +#define VTD_CAP_SAGAW_MASK (0x1fULL << VTD_CAP_SAGAW_SHIFT) + /* 39-bit AGAW, 3-level page-table */ +#define VTD_CAP_SAGAW_39bit (0x2ULL << VTD_CAP_SAGAW_SHIFT) + /* 48-bit AGAW, 4-level page-table */ +#define VTD_CAP_SAGAW_48bit (0x4ULL << VTD_CAP_SAGAW_SHIFT) + +/* IQT_REG */ +#define VTD_IQT_QT(dw_bit, val) (dw_bit ? (((val) >> 5) & 0x3fffULL) : \ + (((val) >> 4) & 0x7fffULL)) +#define VTD_IQT_QT_256_RSV_BIT 0x10 + +/* IQA_REG */ +#define VTD_IQA_IQA_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL) +#define VTD_IQA_QS 0x7ULL +#define VTD_IQA_DW_MASK 0x800 + +/* IQH_REG */ +#define VTD_IQH_QH_SHIFT_4 4 +#define VTD_IQH_QH_SHIFT_5 5 +#define VTD_IQH_QH_MASK 0x7fff0ULL + +/* ICS_REG */ +#define VTD_ICS_IWC 1UL + +/* IECTL_REG */ +#define VTD_IECTL_IM (1UL << 31) +#define VTD_IECTL_IP (1UL << 30) + +/* FSTS_REG */ +#define VTD_FSTS_FRI_MASK 0xff00UL +#define VTD_FSTS_FRI(val) ((((uint32_t)(val)) << 8) & VTD_FSTS_FRI_MASK) +#define VTD_FSTS_IQE (1UL << 4) +#define VTD_FSTS_PPF (1UL << 1) +#define VTD_FSTS_PFO 1UL + +/* FECTL_REG */ +#define VTD_FECTL_IM (1UL << 31) +#define VTD_FECTL_IP (1UL << 30) + +/* Fault Recording Register */ +/* For the high 64-bit of 128-bit */ +#define VTD_FRCD_F (1ULL << 63) +#define VTD_FRCD_T (1ULL << 62) +#define VTD_FRCD_FR(val) (((val) & 0xffULL) << 32) +#define VTD_FRCD_SID_MASK 0xffffULL +#define VTD_FRCD_SID(val) ((val) & VTD_FRCD_SID_MASK) +/* For the low 64-bit of 128-bit */ +#define VTD_FRCD_FI(val) ((val) & ~0xfffULL) + +/* DMA Remapping Fault Conditions */ +typedef enum VTDFaultReason { + VTD_FR_RESERVED = 0, /* Reserved for Advanced Fault logging */ + VTD_FR_ROOT_ENTRY_P = 1, /* The Present(P) field of root-entry is 0 */ + VTD_FR_CONTEXT_ENTRY_P, /* The Present(P) field of context-entry is 0 */ + VTD_FR_CONTEXT_ENTRY_INV, /* Invalid programming of a context-entry */ + VTD_FR_ADDR_BEYOND_MGAW, /* Input-address above (2^x-1) */ + VTD_FR_WRITE, /* No write permission */ + VTD_FR_READ, /* No read permission */ + /* Fail to access a second-level paging entry (not SL_PML4E) */ + VTD_FR_PAGING_ENTRY_INV, + VTD_FR_ROOT_TABLE_INV, /* Fail to access a root-entry */ + VTD_FR_CONTEXT_TABLE_INV, /* Fail to access a context-entry */ + /* Non-zero reserved field in a present root-entry */ + VTD_FR_ROOT_ENTRY_RSVD, + /* Non-zero reserved field in a present context-entry */ + VTD_FR_CONTEXT_ENTRY_RSVD, + /* Non-zero reserved field in a second-level paging entry with at lease one + * Read(R) and Write(W) or Execute(E) field is Set. + */ + VTD_FR_PAGING_ENTRY_RSVD, + /* Translation request or translated request explicitly blocked dut to the + * programming of the Translation Type (T) field in the present + * context-entry. 
+ */ + VTD_FR_CONTEXT_ENTRY_TT, + + /* Interrupt remapping transition faults */ + VTD_FR_IR_REQ_RSVD = 0x20, /* One or more IR request reserved + * fields set */ + VTD_FR_IR_INDEX_OVER = 0x21, /* Index value greater than max */ + VTD_FR_IR_ENTRY_P = 0x22, /* Present (P) not set in IRTE */ + VTD_FR_IR_ROOT_INVAL = 0x23, /* IR Root table invalid */ + VTD_FR_IR_IRTE_RSVD = 0x24, /* IRTE Rsvd field non-zero with + * Present flag set */ + VTD_FR_IR_REQ_COMPAT = 0x25, /* Encountered compatible IR + * request while disabled */ + VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */ + + VTD_FR_PASID_TABLE_INV = 0x58, /*Invalid PASID table entry */ + + /* This is not a normal fault reason. We use this to indicate some faults + * that are not referenced by the VT-d specification. + * Fault event with such reason should not be recorded. + */ + VTD_FR_RESERVED_ERR, + VTD_FR_MAX, /* Guard */ +} VTDFaultReason; + +#define VTD_CONTEXT_CACHE_GEN_MAX 0xffffffffUL + +/* Interrupt Entry Cache Invalidation Descriptor: VT-d 6.5.2.7. */ +struct VTDInvDescIEC { + uint32_t type:4; /* Should always be 0x4 */ + uint32_t granularity:1; /* If set, it's global IR invalidation */ + uint32_t resved_1:22; + uint32_t index_mask:5; /* 2^N for continuous int invalidation */ + uint32_t index:16; /* Start index to invalidate */ + uint32_t reserved_2:16; +}; +typedef struct VTDInvDescIEC VTDInvDescIEC; + +/* Queued Invalidation Descriptor */ +union VTDInvDesc { + struct { + uint64_t lo; + uint64_t hi; + }; + struct { + uint64_t val[4]; + }; + union { + VTDInvDescIEC iec; + }; +}; +typedef union VTDInvDesc VTDInvDesc; + +/* Masks for struct VTDInvDesc */ +#define VTD_INV_DESC_TYPE 0xf +#define VTD_INV_DESC_CC 0x1 /* Context-cache Invalidate Desc */ +#define VTD_INV_DESC_IOTLB 0x2 +#define VTD_INV_DESC_DEVICE 0x3 +#define VTD_INV_DESC_IEC 0x4 /* Interrupt Entry Cache + Invalidate Descriptor */ +#define VTD_INV_DESC_WAIT 0x5 /* Invalidation Wait Descriptor */ +#define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */ +#define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */ +#define VTD_INV_DESC_NONE 0 /* Not an Invalidate Descriptor */ + +/* Masks for Invalidation Wait Descriptor*/ +#define VTD_INV_DESC_WAIT_SW (1ULL << 5) +#define VTD_INV_DESC_WAIT_IF (1ULL << 4) +#define VTD_INV_DESC_WAIT_FN (1ULL << 6) +#define VTD_INV_DESC_WAIT_DATA_SHIFT 32 +#define VTD_INV_DESC_WAIT_RSVD_LO 0Xffffff80ULL +#define VTD_INV_DESC_WAIT_RSVD_HI 3ULL + +/* Masks for Context-cache Invalidation Descriptor */ +#define VTD_INV_DESC_CC_G (3ULL << 4) +#define VTD_INV_DESC_CC_GLOBAL (1ULL << 4) +#define VTD_INV_DESC_CC_DOMAIN (2ULL << 4) +#define VTD_INV_DESC_CC_DEVICE (3ULL << 4) +#define VTD_INV_DESC_CC_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK) +#define VTD_INV_DESC_CC_SID(val) (((val) >> 32) & 0xffffUL) +#define VTD_INV_DESC_CC_FM(val) (((val) >> 48) & 3UL) +#define VTD_INV_DESC_CC_RSVD 0xfffc00000000ffc0ULL + +/* Masks for IOTLB Invalidate Descriptor */ +#define VTD_INV_DESC_IOTLB_G (3ULL << 4) +#define VTD_INV_DESC_IOTLB_GLOBAL (1ULL << 4) +#define VTD_INV_DESC_IOTLB_DOMAIN (2ULL << 4) +#define VTD_INV_DESC_IOTLB_PAGE (3ULL << 4) +#define VTD_INV_DESC_IOTLB_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK) +#define VTD_INV_DESC_IOTLB_ADDR(val) ((val) & ~0xfffULL) +#define VTD_INV_DESC_IOTLB_AM(val) ((val) & 0x3fULL) +#define VTD_INV_DESC_IOTLB_RSVD_LO 0xffffffff0000ff00ULL +#define VTD_INV_DESC_IOTLB_RSVD_HI 0xf80ULL + +/* Mask for Device IOTLB Invalidate Descriptor */ +#define VTD_INV_DESC_DEVICE_IOTLB_ADDR(val) ((val) & 
0xfffffffffffff000ULL) +#define VTD_INV_DESC_DEVICE_IOTLB_SIZE(val) ((val) & 0x1) +#define VTD_INV_DESC_DEVICE_IOTLB_SID(val) (((val) >> 32) & 0xFFFFULL) +#define VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI 0xffeULL +#define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffff0000ffe0fff8 + +/* Rsvd field masks for spte */ +#define VTD_SPTE_SNP 0x800ULL + +#define VTD_SPTE_PAGE_L1_RSVD_MASK(aw, dt_supported) \ + dt_supported ? \ + (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \ + (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_PAGE_L2_RSVD_MASK(aw) \ + (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_PAGE_L3_RSVD_MASK(aw) \ + (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_PAGE_L4_RSVD_MASK(aw) \ + (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) + +#define VTD_SPTE_LPAGE_L2_RSVD_MASK(aw, dt_supported) \ + dt_supported ? \ + (0x1ff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \ + (0x1ff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_LPAGE_L3_RSVD_MASK(aw, dt_supported) \ + dt_supported ? \ + (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \ + (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) + +/* Information about page-selective IOTLB invalidate */ +struct VTDIOTLBPageInvInfo { + uint16_t domain_id; + uint64_t addr; + uint8_t mask; +}; +typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo; + +/* Pagesize of VTD paging structures, including root and context tables */ +#define VTD_PAGE_SHIFT 12 +#define VTD_PAGE_SIZE (1ULL << VTD_PAGE_SHIFT) + +#define VTD_PAGE_SHIFT_4K 12 +#define VTD_PAGE_MASK_4K (~((1ULL << VTD_PAGE_SHIFT_4K) - 1)) +#define VTD_PAGE_SHIFT_2M 21 +#define VTD_PAGE_MASK_2M (~((1ULL << VTD_PAGE_SHIFT_2M) - 1)) +#define VTD_PAGE_SHIFT_1G 30 +#define VTD_PAGE_MASK_1G (~((1ULL << VTD_PAGE_SHIFT_1G) - 1)) + +struct VTDRootEntry { + uint64_t lo; + uint64_t hi; +}; +typedef struct VTDRootEntry VTDRootEntry; + +/* Masks for struct VTDRootEntry */ +#define VTD_ROOT_ENTRY_P 1ULL +#define VTD_ROOT_ENTRY_CTP (~0xfffULL) + +#define VTD_ROOT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(VTDRootEntry)) +#define VTD_ROOT_ENTRY_RSVD(aw) (0xffeULL | ~VTD_HAW_MASK(aw)) + +#define VTD_DEVFN_CHECK_MASK 0x80 + +/* Masks for struct VTDContextEntry */ +/* lo */ +#define VTD_CONTEXT_ENTRY_P (1ULL << 0) +#define VTD_CONTEXT_ENTRY_FPD (1ULL << 1) /* Fault Processing Disable */ +#define VTD_CONTEXT_ENTRY_TT (3ULL << 2) /* Translation Type */ +#define VTD_CONTEXT_TT_MULTI_LEVEL 0 +#define VTD_CONTEXT_TT_DEV_IOTLB (1ULL << 2) +#define VTD_CONTEXT_TT_PASS_THROUGH (2ULL << 2) +/* Second Level Page Translation Pointer*/ +#define VTD_CONTEXT_ENTRY_SLPTPTR (~0xfffULL) +#define VTD_CONTEXT_ENTRY_RSVD_LO(aw) (0xff0ULL | ~VTD_HAW_MASK(aw)) +/* hi */ +#define VTD_CONTEXT_ENTRY_AW 7ULL /* Adjusted guest-address-width */ +#define VTD_CONTEXT_ENTRY_DID(val) (((val) >> 8) & VTD_DOMAIN_ID_MASK) +#define VTD_CONTEXT_ENTRY_RSVD_HI 0xffffffffff000080ULL + +#define VTD_CONTEXT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(VTDContextEntry)) + +#define VTD_CTX_ENTRY_LEGACY_SIZE 16 +#define VTD_CTX_ENTRY_SCALABLE_SIZE 32 + +#define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff +#define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw) (0x1e0ULL | ~VTD_HAW_MASK(aw)) +#define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 0xffffffffffe00000ULL + +/* PASID Table Related Definitions */ +#define VTD_PASID_DIR_BASE_ADDR_MASK (~0xfffULL) +#define VTD_PASID_TABLE_BASE_ADDR_MASK (~0xfffULL) +#define VTD_PASID_DIR_ENTRY_SIZE 8 +#define VTD_PASID_ENTRY_SIZE 64 +#define 
VTD_PASID_DIR_BITS_MASK (0x3fffULL) +#define VTD_PASID_DIR_INDEX(pasid) (((pasid) >> 6) & VTD_PASID_DIR_BITS_MASK) +#define VTD_PASID_DIR_FPD (1ULL << 1) /* Fault Processing Disable */ +#define VTD_PASID_TABLE_BITS_MASK (0x3fULL) +#define VTD_PASID_TABLE_INDEX(pasid) ((pasid) & VTD_PASID_TABLE_BITS_MASK) +#define VTD_PASID_ENTRY_FPD (1ULL << 1) /* Fault Processing Disable */ + +/* PASID Granular Translation Type Mask */ +#define VTD_PASID_ENTRY_P 1ULL +#define VTD_SM_PASID_ENTRY_PGTT (7ULL << 6) +#define VTD_SM_PASID_ENTRY_FLT (1ULL << 6) +#define VTD_SM_PASID_ENTRY_SLT (2ULL << 6) +#define VTD_SM_PASID_ENTRY_NESTED (3ULL << 6) +#define VTD_SM_PASID_ENTRY_PT (4ULL << 6) + +#define VTD_SM_PASID_ENTRY_AW 7ULL /* Adjusted guest-address-width */ +#define VTD_SM_PASID_ENTRY_DID(val) ((val) & VTD_DOMAIN_ID_MASK) + +/* Second Level Page Translation Pointer*/ +#define VTD_SM_PASID_ENTRY_SLPTPTR (~0xfffULL) + +/* Paging Structure common */ +#define VTD_SL_PT_PAGE_SIZE_MASK (1ULL << 7) +/* Bits to decide the offset for each level */ +#define VTD_SL_LEVEL_BITS 9 + +/* Second Level Paging Structure */ +#define VTD_SL_PML4_LEVEL 4 +#define VTD_SL_PDP_LEVEL 3 +#define VTD_SL_PD_LEVEL 2 +#define VTD_SL_PT_LEVEL 1 +#define VTD_SL_PT_ENTRY_NR 512 + +/* Masks for Second Level Paging Entry */ +#define VTD_SL_RW_MASK 3ULL +#define VTD_SL_R 1ULL +#define VTD_SL_W (1ULL << 1) +#define VTD_SL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw)) +#define VTD_SL_IGN_COM 0xbff0000000000000ULL +#define VTD_SL_TM (1ULL << 62) + +#endif diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c new file mode 100644 index 000000000..1e89ca089 --- /dev/null +++ b/hw/i386/kvm/apic.c @@ -0,0 +1,271 @@ +/* + * KVM in-kernel APIC support + * + * Copyright (c) 2011 Siemens AG + * + * Authors: + * Jan Kiszka <jan.kiszka@siemens.com> + * + * This work is licensed under the terms of the GNU GPL version 2. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/module.h" +#include "hw/i386/apic_internal.h" +#include "hw/pci/msi.h" +#include "sysemu/hw_accel.h" +#include "sysemu/kvm.h" +#include "kvm/kvm_i386.h" + +static inline void kvm_apic_set_reg(struct kvm_lapic_state *kapic, + int reg_id, uint32_t val) +{ + *((uint32_t *)(kapic->regs + (reg_id << 4))) = val; +} + +static inline uint32_t kvm_apic_get_reg(struct kvm_lapic_state *kapic, + int reg_id) +{ + return *((uint32_t *)(kapic->regs + (reg_id << 4))); +} + +static void kvm_put_apic_state(APICCommonState *s, struct kvm_lapic_state *kapic) +{ + int i; + + memset(kapic, 0, sizeof(*kapic)); + if (kvm_has_x2apic_api() && s->apicbase & MSR_IA32_APICBASE_EXTD) { + kvm_apic_set_reg(kapic, 0x2, s->initial_apic_id); + } else { + kvm_apic_set_reg(kapic, 0x2, s->id << 24); + } + kvm_apic_set_reg(kapic, 0x8, s->tpr); + kvm_apic_set_reg(kapic, 0xd, s->log_dest << 24); + kvm_apic_set_reg(kapic, 0xe, s->dest_mode << 28 | 0x0fffffff); + kvm_apic_set_reg(kapic, 0xf, s->spurious_vec); + for (i = 0; i < 8; i++) { + kvm_apic_set_reg(kapic, 0x10 + i, s->isr[i]); + kvm_apic_set_reg(kapic, 0x18 + i, s->tmr[i]); + kvm_apic_set_reg(kapic, 0x20 + i, s->irr[i]); + } + kvm_apic_set_reg(kapic, 0x28, s->esr); + kvm_apic_set_reg(kapic, 0x30, s->icr[0]); + kvm_apic_set_reg(kapic, 0x31, s->icr[1]); + for (i = 0; i < APIC_LVT_NB; i++) { + kvm_apic_set_reg(kapic, 0x32 + i, s->lvt[i]); + } + kvm_apic_set_reg(kapic, 0x38, s->initial_count); + kvm_apic_set_reg(kapic, 0x3e, s->divide_conf); +} + +void kvm_get_apic_state(DeviceState *dev, struct kvm_lapic_state *kapic) +{ + APICCommonState *s = APIC_COMMON(dev); + int i, v; + + if (kvm_has_x2apic_api() && s->apicbase & MSR_IA32_APICBASE_EXTD) { + assert(kvm_apic_get_reg(kapic, 0x2) == s->initial_apic_id); + } else { + s->id = kvm_apic_get_reg(kapic, 0x2) >> 24; + } + s->tpr = kvm_apic_get_reg(kapic, 0x8); + s->arb_id = kvm_apic_get_reg(kapic, 0x9); + s->log_dest = kvm_apic_get_reg(kapic, 0xd) >> 24; + s->dest_mode = kvm_apic_get_reg(kapic, 0xe) >> 28; + s->spurious_vec = kvm_apic_get_reg(kapic, 0xf); + for (i = 0; i < 8; i++) { + s->isr[i] = kvm_apic_get_reg(kapic, 0x10 + i); + s->tmr[i] = kvm_apic_get_reg(kapic, 0x18 + i); + s->irr[i] = kvm_apic_get_reg(kapic, 0x20 + i); + } + s->esr = kvm_apic_get_reg(kapic, 0x28); + s->icr[0] = kvm_apic_get_reg(kapic, 0x30); + s->icr[1] = kvm_apic_get_reg(kapic, 0x31); + for (i = 0; i < APIC_LVT_NB; i++) { + s->lvt[i] = kvm_apic_get_reg(kapic, 0x32 + i); + } + s->initial_count = kvm_apic_get_reg(kapic, 0x38); + s->divide_conf = kvm_apic_get_reg(kapic, 0x3e); + + v = (s->divide_conf & 3) | ((s->divide_conf >> 1) & 4); + s->count_shift = (v + 1) & 7; + + s->initial_count_load_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + apic_next_timer(s, s->initial_count_load_time); +} + +static void kvm_apic_set_base(APICCommonState *s, uint64_t val) +{ + s->apicbase = val; +} + +static void kvm_apic_set_tpr(APICCommonState *s, uint8_t val) +{ + s->tpr = (val & 0x0f) << 4; +} + +static uint8_t kvm_apic_get_tpr(APICCommonState *s) +{ + return s->tpr >> 4; +} + +static void kvm_apic_enable_tpr_reporting(APICCommonState *s, bool enable) +{ + struct kvm_tpr_access_ctl ctl = { + .enabled = enable + }; + + kvm_vcpu_ioctl(CPU(s->cpu), KVM_TPR_ACCESS_REPORTING, &ctl); +} + +static void kvm_apic_vapic_base_update(APICCommonState *s) +{ + struct kvm_vapic_addr vapid_addr = { + .vapic_addr = s->vapic_paddr, + }; + int ret; + + ret = kvm_vcpu_ioctl(CPU(s->cpu), KVM_SET_VAPIC_ADDR, &vapid_addr); + if (ret < 0) { + 
fprintf(stderr, "KVM: setting VAPIC address failed (%s)\n", + strerror(-ret)); + abort(); + } +} + +static void kvm_apic_put(CPUState *cs, run_on_cpu_data data) +{ + APICCommonState *s = data.host_ptr; + struct kvm_lapic_state kapic; + int ret; + + kvm_put_apicbase(s->cpu, s->apicbase); + kvm_put_apic_state(s, &kapic); + + ret = kvm_vcpu_ioctl(CPU(s->cpu), KVM_SET_LAPIC, &kapic); + if (ret < 0) { + fprintf(stderr, "KVM_SET_LAPIC failed: %s\n", strerror(-ret)); + abort(); + } +} + +static void kvm_apic_post_load(APICCommonState *s) +{ + run_on_cpu(CPU(s->cpu), kvm_apic_put, RUN_ON_CPU_HOST_PTR(s)); +} + +static void do_inject_external_nmi(CPUState *cpu, run_on_cpu_data data) +{ + APICCommonState *s = data.host_ptr; + uint32_t lvt; + int ret; + + cpu_synchronize_state(cpu); + + lvt = s->lvt[APIC_LVT_LINT1]; + if (!(lvt & APIC_LVT_MASKED) && ((lvt >> 8) & 7) == APIC_DM_NMI) { + ret = kvm_vcpu_ioctl(cpu, KVM_NMI); + if (ret < 0) { + fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n", + strerror(-ret)); + } + } +} + +static void kvm_apic_external_nmi(APICCommonState *s) +{ + run_on_cpu(CPU(s->cpu), do_inject_external_nmi, RUN_ON_CPU_HOST_PTR(s)); +} + +static void kvm_send_msi(MSIMessage *msg) +{ + int ret; + + /* + * The message has already passed through interrupt remapping if enabled, + * but the legacy extended destination ID in low bits still needs to be + * handled. + */ + msg->address = kvm_swizzle_msi_ext_dest_id(msg->address); + + ret = kvm_irqchip_send_msi(kvm_state, *msg); + if (ret < 0) { + fprintf(stderr, "KVM: injection failed, MSI lost (%s)\n", + strerror(-ret)); + } +} + +static uint64_t kvm_apic_mem_read(void *opaque, hwaddr addr, + unsigned size) +{ + return ~(uint64_t)0; +} + +static void kvm_apic_mem_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + MSIMessage msg = { .address = addr, .data = data }; + + kvm_send_msi(&msg); +} + +static const MemoryRegionOps kvm_apic_io_ops = { + .read = kvm_apic_mem_read, + .write = kvm_apic_mem_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void kvm_apic_reset(APICCommonState *s) +{ + /* Not used by KVM, which uses the CPU mp_state instead. 
*/ + s->wait_for_sipi = 0; + + run_on_cpu(CPU(s->cpu), kvm_apic_put, RUN_ON_CPU_HOST_PTR(s)); +} + +static void kvm_apic_realize(DeviceState *dev, Error **errp) +{ + APICCommonState *s = APIC_COMMON(dev); + + memory_region_init_io(&s->io_memory, OBJECT(s), &kvm_apic_io_ops, s, + "kvm-apic-msi", APIC_SPACE_SIZE); + + assert(kvm_has_gsi_routing()); + msi_nonbroken = true; +} + +static void kvm_apic_unrealize(DeviceState *dev) +{ +} + +static void kvm_apic_class_init(ObjectClass *klass, void *data) +{ + APICCommonClass *k = APIC_COMMON_CLASS(klass); + + k->realize = kvm_apic_realize; + k->unrealize = kvm_apic_unrealize; + k->reset = kvm_apic_reset; + k->set_base = kvm_apic_set_base; + k->set_tpr = kvm_apic_set_tpr; + k->get_tpr = kvm_apic_get_tpr; + k->post_load = kvm_apic_post_load; + k->enable_tpr_reporting = kvm_apic_enable_tpr_reporting; + k->vapic_base_update = kvm_apic_vapic_base_update; + k->external_nmi = kvm_apic_external_nmi; + k->send_msi = kvm_send_msi; +} + +static const TypeInfo kvm_apic_info = { + .name = "kvm-apic", + .parent = TYPE_APIC_COMMON, + .instance_size = sizeof(APICCommonState), + .class_init = kvm_apic_class_init, +}; + +static void kvm_apic_register_types(void) +{ + type_register_static(&kvm_apic_info); +} + +type_init(kvm_apic_register_types) diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c new file mode 100644 index 000000000..df70b4a03 --- /dev/null +++ b/hw/i386/kvm/clock.c @@ -0,0 +1,350 @@ +/* + * QEMU KVM support, paravirtual clock device + * + * Copyright (C) 2011 Siemens AG + * + * Authors: + * Jan Kiszka <jan.kiszka@siemens.com> + * + * This work is licensed under the terms of the GNU GPL version 2. + * See the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/host-utils.h" +#include "qemu/module.h" +#include "sysemu/kvm.h" +#include "sysemu/runstate.h" +#include "sysemu/hw_accel.h" +#include "kvm/kvm_i386.h" +#include "migration/vmstate.h" +#include "hw/sysbus.h" +#include "hw/kvm/clock.h" +#include "hw/qdev-properties.h" +#include "qapi/error.h" + +#include <linux/kvm.h> +#include "standard-headers/asm-x86/kvm_para.h" +#include "qom/object.h" + +#define TYPE_KVM_CLOCK "kvmclock" +OBJECT_DECLARE_SIMPLE_TYPE(KVMClockState, KVM_CLOCK) + +struct KVMClockState { + /*< private >*/ + SysBusDevice busdev; + /*< public >*/ + + uint64_t clock; + bool clock_valid; + + /* whether the 'clock' value was obtained in the 'paused' state */ + bool runstate_paused; + + /* whether machine type supports reliable KVM_GET_CLOCK */ + bool mach_use_reliable_get_clock; + + /* whether the 'clock' value was obtained in a host with + * reliable KVM_GET_CLOCK */ + bool clock_is_reliable; +}; + +struct pvclock_vcpu_time_info { + uint32_t version; + uint32_t pad0; + uint64_t tsc_timestamp; + uint64_t system_time; + uint32_t tsc_to_system_mul; + int8_t tsc_shift; + uint8_t flags; + uint8_t pad[2]; +} __attribute__((__packed__)); /* 32 bytes */ + +static uint64_t kvmclock_current_nsec(KVMClockState *s) +{ + CPUState *cpu = first_cpu; + CPUX86State *env = cpu->env_ptr; + hwaddr kvmclock_struct_pa; + uint64_t migration_tsc = env->tsc; + struct pvclock_vcpu_time_info time; + uint64_t delta; + uint64_t nsec_lo; + uint64_t nsec_hi; + uint64_t nsec; + + cpu_synchronize_state(cpu); + + if (!(env->system_time_msr & 1ULL)) { + /* KVM clock not active */ + return 0; + } + + kvmclock_struct_pa = env->system_time_msr & ~1ULL; + cpu_physical_memory_read(kvmclock_struct_pa, &time, sizeof(time)); + + assert(time.tsc_timestamp <= migration_tsc); + delta = migration_tsc - time.tsc_timestamp; + if (time.tsc_shift < 0) { + delta >>= -time.tsc_shift; + } else { + delta <<= time.tsc_shift; + } + + mulu64(&nsec_lo, &nsec_hi, delta, time.tsc_to_system_mul); + nsec = (nsec_lo >> 32) | (nsec_hi << 32); + return nsec + time.system_time; +} + +static void kvm_update_clock(KVMClockState *s) +{ + struct kvm_clock_data data; + int ret; + + ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data); + if (ret < 0) { + fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(-ret)); + abort(); + } + s->clock = data.clock; + + /* If kvm_has_adjust_clock_stable() is false, KVM_GET_CLOCK returns + * essentially CLOCK_MONOTONIC plus a guest-specific adjustment. This + * can drift from the TSC-based value that is computed by the guest, + * so we need to go through kvmclock_current_nsec(). If + * kvm_has_adjust_clock_stable() is true, and the flags contain + * KVM_CLOCK_TSC_STABLE, then KVM_GET_CLOCK returns a TSC-based value + * and kvmclock_current_nsec() is not necessary. + * + * Here, however, we need not check KVM_CLOCK_TSC_STABLE. This is because: + * + * - if the host has disabled the kvmclock master clock, the guest already + * has protection against time going backwards. This "safety net" is only + * absent when kvmclock is stable; + * + * - therefore, we can replace a check like + * + * if last KVM_GET_CLOCK was not reliable then + * read from memory + * + * with + * + * if last KVM_GET_CLOCK was not reliable && masterclock is enabled + * read from memory + * + * However: + * + * - if kvm_has_adjust_clock_stable() returns false, the left side is + * always true (KVM_GET_CLOCK is never reliable), and the right side is + * unknown (because we don't have data.flags). 
We must assume it's true + * and read from memory. + * + * - if kvm_has_adjust_clock_stable() returns true, the result of the && + * is always false (masterclock is enabled iff KVM_GET_CLOCK is reliable) + * + * So we can just use this instead: + * + * if !kvm_has_adjust_clock_stable() then + * read from memory + */ + s->clock_is_reliable = kvm_has_adjust_clock_stable(); +} + +static void do_kvmclock_ctrl(CPUState *cpu, run_on_cpu_data data) +{ + int ret = kvm_vcpu_ioctl(cpu, KVM_KVMCLOCK_CTRL, 0); + + if (ret && ret != -EINVAL) { + fprintf(stderr, "%s: %s\n", __func__, strerror(-ret)); + } +} + +static void kvmclock_vm_state_change(void *opaque, bool running, + RunState state) +{ + KVMClockState *s = opaque; + CPUState *cpu; + int cap_clock_ctrl = kvm_check_extension(kvm_state, KVM_CAP_KVMCLOCK_CTRL); + int ret; + + if (running) { + struct kvm_clock_data data = {}; + + /* + * If the host where s->clock was read did not support reliable + * KVM_GET_CLOCK, read kvmclock value from memory. + */ + if (!s->clock_is_reliable) { + uint64_t pvclock_via_mem = kvmclock_current_nsec(s); + /* We can't rely on the saved clock value, just discard it */ + if (pvclock_via_mem) { + s->clock = pvclock_via_mem; + } + } + + s->clock_valid = false; + + data.clock = s->clock; + ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data); + if (ret < 0) { + fprintf(stderr, "KVM_SET_CLOCK failed: %s\n", strerror(-ret)); + abort(); + } + + if (!cap_clock_ctrl) { + return; + } + CPU_FOREACH(cpu) { + run_on_cpu(cpu, do_kvmclock_ctrl, RUN_ON_CPU_NULL); + } + } else { + + if (s->clock_valid) { + return; + } + + s->runstate_paused = runstate_check(RUN_STATE_PAUSED); + + kvm_synchronize_all_tsc(); + + kvm_update_clock(s); + /* + * If the VM is stopped, declare the clock state valid to + * avoid re-reading it on next vmsave (which would return + * a different value). Will be reset when the VM is continued. + */ + s->clock_valid = true; + } +} + +static void kvmclock_realize(DeviceState *dev, Error **errp) +{ + KVMClockState *s = KVM_CLOCK(dev); + + if (!kvm_enabled()) { + error_setg(errp, "kvmclock device requires KVM"); + return; + } + + kvm_update_clock(s); + + qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s); +} + +static bool kvmclock_clock_is_reliable_needed(void *opaque) +{ + KVMClockState *s = opaque; + + return s->mach_use_reliable_get_clock; +} + +static const VMStateDescription kvmclock_reliable_get_clock = { + .name = "kvmclock/clock_is_reliable", + .version_id = 1, + .minimum_version_id = 1, + .needed = kvmclock_clock_is_reliable_needed, + .fields = (VMStateField[]) { + VMSTATE_BOOL(clock_is_reliable, KVMClockState), + VMSTATE_END_OF_LIST() + } +}; + +/* + * When migrating, assume the source has an unreliable + * KVM_GET_CLOCK unless told otherwise. + */ +static int kvmclock_pre_load(void *opaque) +{ + KVMClockState *s = opaque; + + s->clock_is_reliable = false; + + return 0; +} + +/* + * When migrating a running guest, read the clock just + * before migration, so that the guest clock counts + * during the events between: + * + * * vm_stop() + * * + * * pre_save() + * + * This reduces kvmclock difference on migration from 5s + * to 0.1s (when max_downtime == 5s), because sending the + * final pages of memory (which happens between vm_stop() + * and pre_save()) takes max_downtime. 
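+ *
+ * If the guest was explicitly paused, the clock value captured at pause
+ * time is kept (see runstate_paused below) instead of being re-read here.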
+ */ +static int kvmclock_pre_save(void *opaque) +{ + KVMClockState *s = opaque; + + if (!s->runstate_paused) { + kvm_update_clock(s); + } + + return 0; +} + +static const VMStateDescription kvmclock_vmsd = { + .name = "kvmclock", + .version_id = 1, + .minimum_version_id = 1, + .pre_load = kvmclock_pre_load, + .pre_save = kvmclock_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT64(clock, KVMClockState), + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * []) { + &kvmclock_reliable_get_clock, + NULL + } +}; + +static Property kvmclock_properties[] = { + DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState, + mach_use_reliable_get_clock, true), + DEFINE_PROP_END_OF_LIST(), +}; + +static void kvmclock_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = kvmclock_realize; + dc->vmsd = &kvmclock_vmsd; + device_class_set_props(dc, kvmclock_properties); +} + +static const TypeInfo kvmclock_info = { + .name = TYPE_KVM_CLOCK, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(KVMClockState), + .class_init = kvmclock_class_init, +}; + +/* Note: Must be called after VCPU initialization. */ +void kvmclock_create(bool create_always) +{ + X86CPU *cpu = X86_CPU(first_cpu); + + if (!kvm_enabled() || !kvm_has_adjust_clock()) + return; + + if (create_always || + cpu->env.features[FEAT_KVM] & ((1ULL << KVM_FEATURE_CLOCKSOURCE) | + (1ULL << KVM_FEATURE_CLOCKSOURCE2))) { + sysbus_create_simple(TYPE_KVM_CLOCK, -1, NULL); + } +} + +static void kvmclock_register_types(void) +{ + type_register_static(&kvmclock_info); +} + +type_init(kvmclock_register_types) diff --git a/hw/i386/kvm/i8254.c b/hw/i386/kvm/i8254.c new file mode 100644 index 000000000..191a26fa5 --- /dev/null +++ b/hw/i386/kvm/i8254.c @@ -0,0 +1,337 @@ +/* + * KVM in-kernel PIT (i8254) support + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2012 Jan Kiszka, Siemens AG + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include <linux/kvm.h> +#include "qapi/qapi-types-machine.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "qemu/timer.h" +#include "sysemu/runstate.h" +#include "hw/timer/i8254.h" +#include "hw/timer/i8254_internal.h" +#include "hw/qdev-properties-system.h" +#include "sysemu/kvm.h" +#include "qom/object.h" + +#define KVM_PIT_REINJECT_BIT 0 + +#define CALIBRATION_ROUNDS 3 + +typedef struct KVMPITClass KVMPITClass; +typedef struct KVMPITState KVMPITState; +DECLARE_OBJ_CHECKERS(KVMPITState, KVMPITClass, + KVM_PIT, TYPE_KVM_I8254) + +struct KVMPITState { + PITCommonState parent_obj; + + LostTickPolicy lost_tick_policy; + bool vm_stopped; + int64_t kernel_clock_offset; +}; + +struct KVMPITClass { + PITCommonClass parent_class; + + DeviceRealize parent_realize; +}; + +static void kvm_pit_update_clock_offset(KVMPITState *s) +{ + int64_t offset, clock_offset; + struct timespec ts; + int i; + + /* + * Measure the delta between CLOCK_MONOTONIC, the base used for + * kvm_pit_channel_state::count_load_time, and QEMU_CLOCK_VIRTUAL. Take the + * minimum of several samples to filter out scheduling noise. + */ + clock_offset = INT64_MAX; + for (i = 0; i < CALIBRATION_ROUNDS; i++) { + offset = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + clock_gettime(CLOCK_MONOTONIC, &ts); + offset -= ts.tv_nsec; + offset -= (int64_t)ts.tv_sec * 1000000000; + if (uabs64(offset) < uabs64(clock_offset)) { + clock_offset = offset; + } + } + s->kernel_clock_offset = clock_offset; +} + +static void kvm_pit_get(PITCommonState *pit) +{ + KVMPITState *s = KVM_PIT(pit); + struct kvm_pit_state2 kpit; + struct kvm_pit_channel_state *kchan; + struct PITChannelState *sc; + int i, ret; + + /* No need to re-read the state if VM is stopped. */ + if (s->vm_stopped) { + return; + } + + if (kvm_has_pit_state2()) { + ret = kvm_vm_ioctl(kvm_state, KVM_GET_PIT2, &kpit); + if (ret < 0) { + fprintf(stderr, "KVM_GET_PIT2 failed: %s\n", strerror(-ret)); + abort(); + } + pit->channels[0].irq_disabled = kpit.flags & KVM_PIT_FLAGS_HPET_LEGACY; + } else { + /* + * kvm_pit_state2 is superset of kvm_pit_state struct, + * so we can use it for KVM_GET_PIT as well. + */ + ret = kvm_vm_ioctl(kvm_state, KVM_GET_PIT, &kpit); + if (ret < 0) { + fprintf(stderr, "KVM_GET_PIT failed: %s\n", strerror(-ret)); + abort(); + } + } + for (i = 0; i < 3; i++) { + kchan = &kpit.channels[i]; + sc = &pit->channels[i]; + sc->count = kchan->count; + sc->latched_count = kchan->latched_count; + sc->count_latched = kchan->count_latched; + sc->status_latched = kchan->status_latched; + sc->status = kchan->status; + sc->read_state = kchan->read_state; + sc->write_state = kchan->write_state; + sc->write_latch = kchan->write_latch; + sc->rw_mode = kchan->rw_mode; + sc->mode = kchan->mode; + sc->bcd = kchan->bcd; + sc->gate = kchan->gate; + sc->count_load_time = kchan->count_load_time + s->kernel_clock_offset; + } + + sc = &pit->channels[0]; + sc->next_transition_time = + pit_get_next_transition_time(sc, sc->count_load_time); +} + +static void kvm_pit_put(PITCommonState *pit) +{ + KVMPITState *s = KVM_PIT(pit); + struct kvm_pit_state2 kpit = {}; + struct kvm_pit_channel_state *kchan; + struct PITChannelState *sc; + int i, ret; + + /* The offset keeps changing as long as the VM is stopped. */ + if (s->vm_stopped) { + kvm_pit_update_clock_offset(s); + } + + kpit.flags = pit->channels[0].irq_disabled ? 
KVM_PIT_FLAGS_HPET_LEGACY : 0; + for (i = 0; i < 3; i++) { + kchan = &kpit.channels[i]; + sc = &pit->channels[i]; + kchan->count = sc->count; + kchan->latched_count = sc->latched_count; + kchan->count_latched = sc->count_latched; + kchan->status_latched = sc->status_latched; + kchan->status = sc->status; + kchan->read_state = sc->read_state; + kchan->write_state = sc->write_state; + kchan->write_latch = sc->write_latch; + kchan->rw_mode = sc->rw_mode; + kchan->mode = sc->mode; + kchan->bcd = sc->bcd; + kchan->gate = sc->gate; + kchan->count_load_time = sc->count_load_time - s->kernel_clock_offset; + } + + ret = kvm_vm_ioctl(kvm_state, + kvm_has_pit_state2() ? KVM_SET_PIT2 : KVM_SET_PIT, + &kpit); + if (ret < 0) { + fprintf(stderr, "%s failed: %s\n", + kvm_has_pit_state2() ? "KVM_SET_PIT2" : "KVM_SET_PIT", + strerror(-ret)); + abort(); + } +} + +static void kvm_pit_set_gate(PITCommonState *s, PITChannelState *sc, int val) +{ + kvm_pit_get(s); + + switch (sc->mode) { + default: + case 0: + case 4: + /* XXX: just disable/enable counting */ + break; + case 1: + case 2: + case 3: + case 5: + if (sc->gate < val) { + /* restart counting on rising edge */ + sc->count_load_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + } + break; + } + sc->gate = val; + + kvm_pit_put(s); +} + +static void kvm_pit_get_channel_info(PITCommonState *s, PITChannelState *sc, + PITChannelInfo *info) +{ + kvm_pit_get(s); + + pit_get_channel_info_common(s, sc, info); +} + +static void kvm_pit_reset(DeviceState *dev) +{ + PITCommonState *s = PIT_COMMON(dev); + + pit_reset_common(s); + + kvm_pit_put(s); +} + +static void kvm_pit_irq_control(void *opaque, int n, int enable) +{ + PITCommonState *pit = opaque; + PITChannelState *s = &pit->channels[0]; + + kvm_pit_get(pit); + + s->irq_disabled = !enable; + + kvm_pit_put(pit); +} + +static void kvm_pit_vm_state_change(void *opaque, bool running, + RunState state) +{ + KVMPITState *s = opaque; + + if (running) { + kvm_pit_update_clock_offset(s); + kvm_pit_put(PIT_COMMON(s)); + s->vm_stopped = false; + } else { + kvm_pit_update_clock_offset(s); + kvm_pit_get(PIT_COMMON(s)); + s->vm_stopped = true; + } +} + +static void kvm_pit_realizefn(DeviceState *dev, Error **errp) +{ + PITCommonState *pit = PIT_COMMON(dev); + KVMPITClass *kpc = KVM_PIT_GET_CLASS(dev); + KVMPITState *s = KVM_PIT(pit); + struct kvm_pit_config config = { + .flags = 0, + }; + int ret; + + if (kvm_check_extension(kvm_state, KVM_CAP_PIT2)) { + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_PIT2, &config); + } else { + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_PIT); + } + if (ret < 0) { + error_setg(errp, "Create kernel PIC irqchip failed: %s", + strerror(-ret)); + return; + } + switch (s->lost_tick_policy) { + case LOST_TICK_POLICY_DELAY: + break; /* enabled by default */ + case LOST_TICK_POLICY_DISCARD: + if (kvm_check_extension(kvm_state, KVM_CAP_REINJECT_CONTROL)) { + struct kvm_reinject_control control = { .pit_reinject = 0 }; + + ret = kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control); + if (ret < 0) { + error_setg(errp, + "Can't disable in-kernel PIT reinjection: %s", + strerror(-ret)); + return; + } + } + break; + default: + error_setg(errp, "Lost tick policy not supported."); + return; + } + + memory_region_init_io(&pit->ioports, OBJECT(dev), NULL, NULL, "kvm-pit", 4); + + qdev_init_gpio_in(dev, kvm_pit_irq_control, 1); + + qemu_add_vm_change_state_handler(kvm_pit_vm_state_change, s); + + kpc->parent_realize(dev, errp); +} + +static Property kvm_pit_properties[] = { + DEFINE_PROP_UINT32("iobase", 
PITCommonState, iobase, -1), + DEFINE_PROP_LOSTTICKPOLICY("lost_tick_policy", KVMPITState, + lost_tick_policy, LOST_TICK_POLICY_DELAY), + DEFINE_PROP_END_OF_LIST(), +}; + +static void kvm_pit_class_init(ObjectClass *klass, void *data) +{ + KVMPITClass *kpc = KVM_PIT_CLASS(klass); + PITCommonClass *k = PIT_COMMON_CLASS(klass); + DeviceClass *dc = DEVICE_CLASS(klass); + + device_class_set_parent_realize(dc, kvm_pit_realizefn, + &kpc->parent_realize); + k->set_channel_gate = kvm_pit_set_gate; + k->get_channel_info = kvm_pit_get_channel_info; + dc->reset = kvm_pit_reset; + device_class_set_props(dc, kvm_pit_properties); +} + +static const TypeInfo kvm_pit_info = { + .name = TYPE_KVM_I8254, + .parent = TYPE_PIT_COMMON, + .instance_size = sizeof(KVMPITState), + .class_init = kvm_pit_class_init, + .class_size = sizeof(KVMPITClass), +}; + +static void kvm_pit_register(void) +{ + type_register_static(&kvm_pit_info); +} + +type_init(kvm_pit_register) diff --git a/hw/i386/kvm/i8259.c b/hw/i386/kvm/i8259.c new file mode 100644 index 000000000..d61bae4dc --- /dev/null +++ b/hw/i386/kvm/i8259.c @@ -0,0 +1,167 @@ +/* + * KVM in-kernel PIC (i8259) support + * + * Copyright (c) 2011 Siemens AG + * + * Authors: + * Jan Kiszka <jan.kiszka@siemens.com> + * + * This work is licensed under the terms of the GNU GPL version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "hw/isa/i8259_internal.h" +#include "hw/intc/i8259.h" +#include "qemu/module.h" +#include "hw/i386/apic_internal.h" +#include "hw/irq.h" +#include "sysemu/kvm.h" +#include "qom/object.h" + +#define TYPE_KVM_I8259 "kvm-i8259" +typedef struct KVMPICClass KVMPICClass; +DECLARE_CLASS_CHECKERS(KVMPICClass, KVM_PIC, + TYPE_KVM_I8259) + +/** + * KVMPICClass: + * @parent_realize: The parent's realizefn. + */ +struct KVMPICClass { + PICCommonClass parent_class; + + DeviceRealize parent_realize; +}; + +static void kvm_pic_get(PICCommonState *s) +{ + struct kvm_irqchip chip; + struct kvm_pic_state *kpic; + int ret; + + chip.chip_id = s->master ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE; + ret = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, &chip); + if (ret < 0) { + fprintf(stderr, "KVM_GET_IRQCHIP failed: %s\n", strerror(-ret)); + abort(); + } + + kpic = &chip.chip.pic; + + s->last_irr = kpic->last_irr; + s->irr = kpic->irr; + s->imr = kpic->imr; + s->isr = kpic->isr; + s->priority_add = kpic->priority_add; + s->irq_base = kpic->irq_base; + s->read_reg_select = kpic->read_reg_select; + s->poll = kpic->poll; + s->special_mask = kpic->special_mask; + s->init_state = kpic->init_state; + s->auto_eoi = kpic->auto_eoi; + s->rotate_on_auto_eoi = kpic->rotate_on_auto_eoi; + s->special_fully_nested_mode = kpic->special_fully_nested_mode; + s->init4 = kpic->init4; + s->elcr = kpic->elcr; + s->elcr_mask = kpic->elcr_mask; +} + +static void kvm_pic_put(PICCommonState *s) +{ + struct kvm_irqchip chip; + struct kvm_pic_state *kpic; + int ret; + + chip.chip_id = s->master ? 
KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE; + + kpic = &chip.chip.pic; + + kpic->last_irr = s->last_irr; + kpic->irr = s->irr; + kpic->imr = s->imr; + kpic->isr = s->isr; + kpic->priority_add = s->priority_add; + kpic->irq_base = s->irq_base; + kpic->read_reg_select = s->read_reg_select; + kpic->poll = s->poll; + kpic->special_mask = s->special_mask; + kpic->init_state = s->init_state; + kpic->auto_eoi = s->auto_eoi; + kpic->rotate_on_auto_eoi = s->rotate_on_auto_eoi; + kpic->special_fully_nested_mode = s->special_fully_nested_mode; + kpic->init4 = s->init4; + kpic->elcr = s->elcr; + kpic->elcr_mask = s->elcr_mask; + + ret = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, &chip); + if (ret < 0) { + fprintf(stderr, "KVM_SET_IRQCHIP failed: %s\n", strerror(-ret)); + abort(); + } +} + +static void kvm_pic_reset(DeviceState *dev) +{ + PICCommonState *s = PIC_COMMON(dev); + + s->elcr = 0; + pic_reset_common(s); + + kvm_pic_put(s); +} + +static void kvm_pic_set_irq(void *opaque, int irq, int level) +{ + int delivered; + + pic_stat_update_irq(irq, level); + delivered = kvm_set_irq(kvm_state, irq, level); + apic_report_irq_delivered(delivered); +} + +static void kvm_pic_realize(DeviceState *dev, Error **errp) +{ + PICCommonState *s = PIC_COMMON(dev); + KVMPICClass *kpc = KVM_PIC_GET_CLASS(dev); + + memory_region_init_io(&s->base_io, OBJECT(dev), NULL, NULL, "kvm-pic", 2); + memory_region_init_io(&s->elcr_io, OBJECT(dev), NULL, NULL, "kvm-elcr", 1); + + kpc->parent_realize(dev, errp); +} + +qemu_irq *kvm_i8259_init(ISABus *bus) +{ + i8259_init_chip(TYPE_KVM_I8259, bus, true); + i8259_init_chip(TYPE_KVM_I8259, bus, false); + + return qemu_allocate_irqs(kvm_pic_set_irq, NULL, ISA_NUM_IRQS); +} + +static void kvm_i8259_class_init(ObjectClass *klass, void *data) +{ + KVMPICClass *kpc = KVM_PIC_CLASS(klass); + PICCommonClass *k = PIC_COMMON_CLASS(klass); + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->reset = kvm_pic_reset; + device_class_set_parent_realize(dc, kvm_pic_realize, &kpc->parent_realize); + k->pre_save = kvm_pic_get; + k->post_load = kvm_pic_put; +} + +static const TypeInfo kvm_i8259_info = { + .name = TYPE_KVM_I8259, + .parent = TYPE_PIC_COMMON, + .instance_size = sizeof(PICCommonState), + .class_init = kvm_i8259_class_init, + .class_size = sizeof(KVMPICClass), +}; + +static void kvm_pic_register_types(void) +{ + type_register_static(&kvm_i8259_info); +} + +type_init(kvm_pic_register_types) diff --git a/hw/i386/kvm/ioapic.c b/hw/i386/kvm/ioapic.c new file mode 100644 index 000000000..ee7c8ef68 --- /dev/null +++ b/hw/i386/kvm/ioapic.c @@ -0,0 +1,165 @@ +/* + * KVM in-kernel IOPIC support + * + * Copyright (c) 2011 Siemens AG + * + * Authors: + * Jan Kiszka <jan.kiszka@siemens.com> + * + * This work is licensed under the terms of the GNU GPL version 2. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "monitor/monitor.h" +#include "hw/i386/x86.h" +#include "hw/qdev-properties.h" +#include "hw/i386/ioapic_internal.h" +#include "hw/i386/apic_internal.h" +#include "sysemu/kvm.h" + +/* PC Utility function */ +void kvm_pc_setup_irq_routing(bool pci_enabled) +{ + KVMState *s = kvm_state; + int i; + + assert(kvm_has_gsi_routing()); + for (i = 0; i < 8; ++i) { + if (i == 2) { + continue; + } + kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_PIC_MASTER, i); + } + for (i = 8; i < 16; ++i) { + kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_PIC_SLAVE, i - 8); + } + if (pci_enabled) { + for (i = 0; i < 24; ++i) { + if (i == 0) { + kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_IOAPIC, 2); + } else if (i != 2) { + kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_IOAPIC, i); + } + } + } + kvm_irqchip_commit_routes(s); +} + +typedef struct KVMIOAPICState KVMIOAPICState; + +struct KVMIOAPICState { + IOAPICCommonState ioapic; + uint32_t kvm_gsi_base; +}; + +static void kvm_ioapic_get(IOAPICCommonState *s) +{ + struct kvm_irqchip chip; + struct kvm_ioapic_state *kioapic; + int ret, i; + + chip.chip_id = KVM_IRQCHIP_IOAPIC; + ret = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, &chip); + if (ret < 0) { + fprintf(stderr, "KVM_GET_IRQCHIP failed: %s\n", strerror(-ret)); + abort(); + } + + kioapic = &chip.chip.ioapic; + + s->id = kioapic->id; + s->ioregsel = kioapic->ioregsel; + s->irr = kioapic->irr; + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + s->ioredtbl[i] = kioapic->redirtbl[i].bits; + } +} + +static void kvm_ioapic_put(IOAPICCommonState *s) +{ + struct kvm_irqchip chip; + struct kvm_ioapic_state *kioapic; + int ret, i; + + chip.chip_id = KVM_IRQCHIP_IOAPIC; + kioapic = &chip.chip.ioapic; + + kioapic->id = s->id; + kioapic->ioregsel = s->ioregsel; + kioapic->base_address = s->busdev.mmio[0].addr; + kioapic->irr = s->irr; + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + kioapic->redirtbl[i].bits = s->ioredtbl[i]; + } + + ret = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, &chip); + if (ret < 0) { + fprintf(stderr, "KVM_SET_IRQCHIP failed: %s\n", strerror(-ret)); + abort(); + } +} + +static void kvm_ioapic_reset(DeviceState *dev) +{ + IOAPICCommonState *s = IOAPIC_COMMON(dev); + + ioapic_reset_common(dev); + kvm_ioapic_put(s); +} + +static void kvm_ioapic_set_irq(void *opaque, int irq, int level) +{ + KVMIOAPICState *s = opaque; + IOAPICCommonState *common = IOAPIC_COMMON(s); + int delivered; + + ioapic_stat_update_irq(common, irq, level); + delivered = kvm_set_irq(kvm_state, s->kvm_gsi_base + irq, level); + apic_report_irq_delivered(delivered); +} + +static void kvm_ioapic_realize(DeviceState *dev, Error **errp) +{ + IOAPICCommonState *s = IOAPIC_COMMON(dev); + + memory_region_init_io(&s->io_memory, OBJECT(dev), NULL, NULL, "kvm-ioapic", 0x1000); + /* + * KVM ioapic only supports 0x11 now. This will only be used when + * we want to dump ioapic version. 
+ */ + s->version = 0x11; + + qdev_init_gpio_in(dev, kvm_ioapic_set_irq, IOAPIC_NUM_PINS); +} + +static Property kvm_ioapic_properties[] = { + DEFINE_PROP_UINT32("gsi_base", KVMIOAPICState, kvm_gsi_base, 0), + DEFINE_PROP_END_OF_LIST() +}; + +static void kvm_ioapic_class_init(ObjectClass *klass, void *data) +{ + IOAPICCommonClass *k = IOAPIC_COMMON_CLASS(klass); + DeviceClass *dc = DEVICE_CLASS(klass); + + k->realize = kvm_ioapic_realize; + k->pre_save = kvm_ioapic_get; + k->post_load = kvm_ioapic_put; + dc->reset = kvm_ioapic_reset; + device_class_set_props(dc, kvm_ioapic_properties); +} + +static const TypeInfo kvm_ioapic_info = { + .name = TYPE_KVM_IOAPIC, + .parent = TYPE_IOAPIC_COMMON, + .instance_size = sizeof(KVMIOAPICState), + .class_init = kvm_ioapic_class_init, +}; + +static void kvm_ioapic_register_types(void) +{ + type_register_static(&kvm_ioapic_info); +} + +type_init(kvm_ioapic_register_types) diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build new file mode 100644 index 000000000..95467f1de --- /dev/null +++ b/hw/i386/kvm/meson.build @@ -0,0 +1,8 @@ +i386_kvm_ss = ss.source_set() +i386_kvm_ss.add(files('clock.c')) +i386_kvm_ss.add(when: 'CONFIG_APIC', if_true: files('apic.c')) +i386_kvm_ss.add(when: 'CONFIG_I8254', if_true: files('i8254.c')) +i386_kvm_ss.add(when: 'CONFIG_I8259', if_true: files('i8259.c')) +i386_kvm_ss.add(when: 'CONFIG_IOAPIC', if_true: files('ioapic.c')) + +i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss) diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c new file mode 100644 index 000000000..43f8a8f67 --- /dev/null +++ b/hw/i386/kvmvapic.c @@ -0,0 +1,871 @@ +/* + * TPR optimization for 32-bit Windows guests (XP and Server 2003) + * + * Copyright (C) 2007-2008 Qumranet Technologies + * Copyright (C) 2012 Jan Kiszka, Siemens AG + * + * This work is licensed under the terms of the GNU GPL version 2, or + * (at your option) any later version. See the COPYING file in the + * top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/module.h" +#include "sysemu/sysemu.h" +#include "sysemu/cpus.h" +#include "sysemu/hw_accel.h" +#include "sysemu/kvm.h" +#include "sysemu/runstate.h" +#include "hw/i386/apic_internal.h" +#include "hw/sysbus.h" +#include "hw/boards.h" +#include "migration/vmstate.h" +#include "qom/object.h" + +#define VAPIC_IO_PORT 0x7e + +#define VAPIC_CPU_SHIFT 7 + +#define ROM_BLOCK_SIZE 512 +#define ROM_BLOCK_MASK (~(ROM_BLOCK_SIZE - 1)) + +typedef enum VAPICMode { + VAPIC_INACTIVE = 0, + VAPIC_ACTIVE = 1, + VAPIC_STANDBY = 2, +} VAPICMode; + +typedef struct VAPICHandlers { + uint32_t set_tpr; + uint32_t set_tpr_eax; + uint32_t get_tpr[8]; + uint32_t get_tpr_stack; +} QEMU_PACKED VAPICHandlers; + +typedef struct GuestROMState { + char signature[8]; + uint32_t vaddr; + uint32_t fixup_start; + uint32_t fixup_end; + uint32_t vapic_vaddr; + uint32_t vapic_size; + uint32_t vcpu_shift; + uint32_t real_tpr_addr; + VAPICHandlers up; + VAPICHandlers mp; +} QEMU_PACKED GuestROMState; + +struct VAPICROMState { + SysBusDevice busdev; + MemoryRegion io; + MemoryRegion rom; + uint32_t state; + uint32_t rom_state_paddr; + uint32_t rom_state_vaddr; + uint32_t vapic_paddr; + uint32_t real_tpr_addr; + GuestROMState rom_state; + size_t rom_size; + bool rom_mapped_writable; + VMChangeStateEntry *vmsentry; +}; + +#define TYPE_VAPIC "kvmvapic" +OBJECT_DECLARE_SIMPLE_TYPE(VAPICROMState, VAPIC) + +#define TPR_INSTR_ABS_MODRM 0x1 +#define TPR_INSTR_MATCH_MODRM_REG 0x2 + +typedef struct TPRInstruction { + uint8_t opcode; + uint8_t modrm_reg; + unsigned int flags; + TPRAccess access; + size_t length; + off_t addr_offset; +} TPRInstruction; + +/* must be sorted by length, shortest first */ +static const TPRInstruction tpr_instr[] = { + { /* mov abs to eax */ + .opcode = 0xa1, + .access = TPR_ACCESS_READ, + .length = 5, + .addr_offset = 1, + }, + { /* mov eax to abs */ + .opcode = 0xa3, + .access = TPR_ACCESS_WRITE, + .length = 5, + .addr_offset = 1, + }, + { /* mov r32 to r/m32 */ + .opcode = 0x89, + .flags = TPR_INSTR_ABS_MODRM, + .access = TPR_ACCESS_WRITE, + .length = 6, + .addr_offset = 2, + }, + { /* mov r/m32 to r32 */ + .opcode = 0x8b, + .flags = TPR_INSTR_ABS_MODRM, + .access = TPR_ACCESS_READ, + .length = 6, + .addr_offset = 2, + }, + { /* push r/m32 */ + .opcode = 0xff, + .modrm_reg = 6, + .flags = TPR_INSTR_ABS_MODRM | TPR_INSTR_MATCH_MODRM_REG, + .access = TPR_ACCESS_READ, + .length = 6, + .addr_offset = 2, + }, + { /* mov imm32, r/m32 (c7/0) */ + .opcode = 0xc7, + .modrm_reg = 0, + .flags = TPR_INSTR_ABS_MODRM | TPR_INSTR_MATCH_MODRM_REG, + .access = TPR_ACCESS_WRITE, + .length = 10, + .addr_offset = 2, + }, +}; + +static void read_guest_rom_state(VAPICROMState *s) +{ + cpu_physical_memory_read(s->rom_state_paddr, &s->rom_state, + sizeof(GuestROMState)); +} + +static void write_guest_rom_state(VAPICROMState *s) +{ + cpu_physical_memory_write(s->rom_state_paddr, &s->rom_state, + sizeof(GuestROMState)); +} + +static void update_guest_rom_state(VAPICROMState *s) +{ + read_guest_rom_state(s); + + s->rom_state.real_tpr_addr = cpu_to_le32(s->real_tpr_addr); + s->rom_state.vcpu_shift = cpu_to_le32(VAPIC_CPU_SHIFT); + + write_guest_rom_state(s); +} + +static int find_real_tpr_addr(VAPICROMState *s, CPUX86State *env) +{ + CPUState *cs = env_cpu(env); + hwaddr paddr; + target_ulong addr; + + if (s->state == VAPIC_ACTIVE) { + return 0; + } + /* + * If there is no prior TPR access instruction we could analyze (which is + * the case after resume from hibernation), we need to scan the 
possible + * virtual address space for the APIC mapping. + */ + for (addr = 0xfffff000; addr >= 0x80000000; addr -= TARGET_PAGE_SIZE) { + paddr = cpu_get_phys_page_debug(cs, addr); + if (paddr != APIC_DEFAULT_ADDRESS) { + continue; + } + s->real_tpr_addr = addr + 0x80; + update_guest_rom_state(s); + return 0; + } + return -1; +} + +static uint8_t modrm_reg(uint8_t modrm) +{ + return (modrm >> 3) & 7; +} + +static bool is_abs_modrm(uint8_t modrm) +{ + return (modrm & 0xc7) == 0x05; +} + +static bool opcode_matches(uint8_t *opcode, const TPRInstruction *instr) +{ + return opcode[0] == instr->opcode && + (!(instr->flags & TPR_INSTR_ABS_MODRM) || is_abs_modrm(opcode[1])) && + (!(instr->flags & TPR_INSTR_MATCH_MODRM_REG) || + modrm_reg(opcode[1]) == instr->modrm_reg); +} + +static int evaluate_tpr_instruction(VAPICROMState *s, X86CPU *cpu, + target_ulong *pip, TPRAccess access) +{ + CPUState *cs = CPU(cpu); + const TPRInstruction *instr; + target_ulong ip = *pip; + uint8_t opcode[2]; + uint32_t real_tpr_addr; + int i; + + if ((ip & 0xf0000000ULL) != 0x80000000ULL && + (ip & 0xf0000000ULL) != 0xe0000000ULL) { + return -1; + } + + /* + * Early Windows 2003 SMP initialization contains a + * + * mov imm32, r/m32 + * + * instruction that is patched by TPR optimization. The problem is that + * RSP, used by the patched instruction, is zero, so the guest gets a + * double fault and dies. + */ + if (cpu->env.regs[R_ESP] == 0) { + return -1; + } + + if (kvm_enabled() && !kvm_irqchip_in_kernel()) { + /* + * KVM without kernel-based TPR access reporting will pass an IP that + * points after the accessing instruction. So we need to look backward + * to find the reason. + */ + for (i = 0; i < ARRAY_SIZE(tpr_instr); i++) { + instr = &tpr_instr[i]; + if (instr->access != access) { + continue; + } + if (cpu_memory_rw_debug(cs, ip - instr->length, opcode, + sizeof(opcode), 0) < 0) { + return -1; + } + if (opcode_matches(opcode, instr)) { + ip -= instr->length; + goto instruction_ok; + } + } + return -1; + } else { + if (cpu_memory_rw_debug(cs, ip, opcode, sizeof(opcode), 0) < 0) { + return -1; + } + for (i = 0; i < ARRAY_SIZE(tpr_instr); i++) { + instr = &tpr_instr[i]; + if (opcode_matches(opcode, instr)) { + goto instruction_ok; + } + } + return -1; + } + +instruction_ok: + /* + * Grab the virtual TPR address from the instruction + * and update the cached values. + */ + if (cpu_memory_rw_debug(cs, ip + instr->addr_offset, + (void *)&real_tpr_addr, + sizeof(real_tpr_addr), 0) < 0) { + return -1; + } + real_tpr_addr = le32_to_cpu(real_tpr_addr); + if ((real_tpr_addr & 0xfff) != 0x80) { + return -1; + } + s->real_tpr_addr = real_tpr_addr; + update_guest_rom_state(s); + + *pip = ip; + return 0; +} + +static int update_rom_mapping(VAPICROMState *s, CPUX86State *env, target_ulong ip) +{ + CPUState *cs = env_cpu(env); + hwaddr paddr; + uint32_t rom_state_vaddr; + uint32_t pos, patch, offset; + + /* nothing to do if already activated */ + if (s->state == VAPIC_ACTIVE) { + return 0; + } + + /* bail out if ROM init code was not executed (missing ROM?) 
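+     * (vapic_reset() leaves the device in VAPIC_INACTIVE with
+     * rom_state_paddr cleared, so there is no guest state area to map yet.)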
*/ + if (s->state == VAPIC_INACTIVE) { + return -1; + } + + /* find out virtual address of the ROM */ + rom_state_vaddr = s->rom_state_paddr + (ip & 0xf0000000); + paddr = cpu_get_phys_page_debug(cs, rom_state_vaddr); + if (paddr == -1) { + return -1; + } + paddr += rom_state_vaddr & ~TARGET_PAGE_MASK; + if (paddr != s->rom_state_paddr) { + return -1; + } + read_guest_rom_state(s); + if (memcmp(s->rom_state.signature, "kvm aPiC", 8) != 0) { + return -1; + } + s->rom_state_vaddr = rom_state_vaddr; + + /* fixup addresses in ROM if needed */ + if (rom_state_vaddr == le32_to_cpu(s->rom_state.vaddr)) { + return 0; + } + for (pos = le32_to_cpu(s->rom_state.fixup_start); + pos < le32_to_cpu(s->rom_state.fixup_end); + pos += 4) { + cpu_physical_memory_read(paddr + pos - s->rom_state.vaddr, + &offset, sizeof(offset)); + offset = le32_to_cpu(offset); + cpu_physical_memory_read(paddr + offset, &patch, sizeof(patch)); + patch = le32_to_cpu(patch); + patch += rom_state_vaddr - le32_to_cpu(s->rom_state.vaddr); + patch = cpu_to_le32(patch); + cpu_physical_memory_write(paddr + offset, &patch, sizeof(patch)); + } + read_guest_rom_state(s); + s->vapic_paddr = paddr + le32_to_cpu(s->rom_state.vapic_vaddr) - + le32_to_cpu(s->rom_state.vaddr); + + return 0; +} + +/* + * Tries to read the unique processor number from the Kernel Processor Control + * Region (KPCR) of 32-bit Windows XP and Server 2003. Returns -1 if the KPCR + * cannot be accessed or is considered invalid. This also ensures that we are + * not patching the wrong guest. + */ +static int get_kpcr_number(X86CPU *cpu) +{ + CPUX86State *env = &cpu->env; + struct kpcr { + uint8_t fill1[0x1c]; + uint32_t self; + uint8_t fill2[0x31]; + uint8_t number; + } QEMU_PACKED kpcr; + + if (cpu_memory_rw_debug(CPU(cpu), env->segs[R_FS].base, + (void *)&kpcr, sizeof(kpcr), 0) < 0 || + kpcr.self != env->segs[R_FS].base) { + return -1; + } + return kpcr.number; +} + +static int vapic_enable(VAPICROMState *s, X86CPU *cpu) +{ + int cpu_number = get_kpcr_number(cpu); + hwaddr vapic_paddr; + static const uint8_t enabled = 1; + + if (cpu_number < 0) { + return -1; + } + vapic_paddr = s->vapic_paddr + + (((hwaddr)cpu_number) << VAPIC_CPU_SHIFT); + cpu_physical_memory_write(vapic_paddr + offsetof(VAPICState, enabled), + &enabled, sizeof(enabled)); + apic_enable_vapic(cpu->apic_state, vapic_paddr); + + s->state = VAPIC_ACTIVE; + + return 0; +} + +static void patch_byte(X86CPU *cpu, target_ulong addr, uint8_t byte) +{ + cpu_memory_rw_debug(CPU(cpu), addr, &byte, 1, 1); +} + +static void patch_call(X86CPU *cpu, target_ulong ip, uint32_t target) +{ + uint32_t offset; + + offset = cpu_to_le32(target - ip - 5); + patch_byte(cpu, ip, 0xe8); /* call near */ + cpu_memory_rw_debug(CPU(cpu), ip + 1, (void *)&offset, sizeof(offset), 1); +} + +typedef struct PatchInfo { + VAPICHandlers *handler; + target_ulong ip; +} PatchInfo; + +static void do_patch_instruction(CPUState *cs, run_on_cpu_data data) +{ + X86CPU *x86_cpu = X86_CPU(cs); + PatchInfo *info = (PatchInfo *) data.host_ptr; + VAPICHandlers *handlers = info->handler; + target_ulong ip = info->ip; + uint8_t opcode[2]; + uint32_t imm32 = 0; + + cpu_memory_rw_debug(cs, ip, opcode, sizeof(opcode), 0); + + switch (opcode[0]) { + case 0x89: /* mov r32 to r/m32 */ + patch_byte(x86_cpu, ip, 0x50 + modrm_reg(opcode[1])); /* push reg */ + patch_call(x86_cpu, ip + 1, handlers->set_tpr); + break; + case 0x8b: /* mov r/m32 to r32 */ + patch_byte(x86_cpu, ip, 0x90); + patch_call(x86_cpu, ip + 1, handlers->get_tpr[modrm_reg(opcode[1])]); + 
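/* the 6-byte mov is rewritten as a 1-byte nop plus a 5-byte near call; modrm_reg() picks the get_tpr handler matching the original destination register */ + 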
break; + case 0xa1: /* mov abs to eax */ + patch_call(x86_cpu, ip, handlers->get_tpr[0]); + break; + case 0xa3: /* mov eax to abs */ + patch_call(x86_cpu, ip, handlers->set_tpr_eax); + break; + case 0xc7: /* mov imm32, r/m32 (c7/0) */ + patch_byte(x86_cpu, ip, 0x68); /* push imm32 */ + cpu_memory_rw_debug(cs, ip + 6, (void *)&imm32, sizeof(imm32), 0); + cpu_memory_rw_debug(cs, ip + 1, (void *)&imm32, sizeof(imm32), 1); + patch_call(x86_cpu, ip + 5, handlers->set_tpr); + break; + case 0xff: /* push r/m32 */ + patch_byte(x86_cpu, ip, 0x50); /* push eax */ + patch_call(x86_cpu, ip + 1, handlers->get_tpr_stack); + break; + default: + abort(); + } + + g_free(info); +} + +static void patch_instruction(VAPICROMState *s, X86CPU *cpu, target_ulong ip) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + CPUState *cs = CPU(cpu); + VAPICHandlers *handlers; + PatchInfo *info; + + if (ms->smp.cpus == 1) { + handlers = &s->rom_state.up; + } else { + handlers = &s->rom_state.mp; + } + + info = g_new(PatchInfo, 1); + info->handler = handlers; + info->ip = ip; + + async_safe_run_on_cpu(cs, do_patch_instruction, RUN_ON_CPU_HOST_PTR(info)); +} + +void vapic_report_tpr_access(DeviceState *dev, CPUState *cs, target_ulong ip, + TPRAccess access) +{ + VAPICROMState *s = VAPIC(dev); + X86CPU *cpu = X86_CPU(cs); + CPUX86State *env = &cpu->env; + + cpu_synchronize_state(cs); + + if (evaluate_tpr_instruction(s, cpu, &ip, access) < 0) { + if (s->state == VAPIC_ACTIVE) { + vapic_enable(s, cpu); + } + return; + } + if (update_rom_mapping(s, env, ip) < 0) { + return; + } + if (vapic_enable(s, cpu) < 0) { + return; + } + patch_instruction(s, cpu, ip); +} + +typedef struct VAPICEnableTPRReporting { + DeviceState *apic; + bool enable; +} VAPICEnableTPRReporting; + +static void vapic_do_enable_tpr_reporting(CPUState *cpu, run_on_cpu_data data) +{ + VAPICEnableTPRReporting *info = data.host_ptr; + apic_enable_tpr_access_reporting(info->apic, info->enable); +} + +static void vapic_enable_tpr_reporting(bool enable) +{ + VAPICEnableTPRReporting info = { + .enable = enable, + }; + CPUState *cs; + X86CPU *cpu; + + CPU_FOREACH(cs) { + cpu = X86_CPU(cs); + info.apic = cpu->apic_state; + run_on_cpu(cs, vapic_do_enable_tpr_reporting, RUN_ON_CPU_HOST_PTR(&info)); + } +} + +static void vapic_reset(DeviceState *dev) +{ + VAPICROMState *s = VAPIC(dev); + + s->state = VAPIC_INACTIVE; + s->rom_state_paddr = 0; + vapic_enable_tpr_reporting(false); +} + +/* + * Set the IRQ polling hypercalls to the supported variant: + * - vmcall if using KVM in-kernel irqchip + * - 32-bit VAPIC port write otherwise + */ +static int patch_hypercalls(VAPICROMState *s) +{ + hwaddr rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK; + static const uint8_t vmcall_pattern[] = { /* vmcall */ + 0xb8, 0x1, 0, 0, 0, 0xf, 0x1, 0xc1 + }; + static const uint8_t outl_pattern[] = { /* nop; outl %eax,0x7e */ + 0xb8, 0x1, 0, 0, 0, 0x90, 0xe7, 0x7e + }; + uint8_t alternates[2]; + const uint8_t *pattern; + const uint8_t *patch; + off_t pos; + uint8_t *rom; + + rom = g_malloc(s->rom_size); + cpu_physical_memory_read(rom_paddr, rom, s->rom_size); + + for (pos = 0; pos < s->rom_size - sizeof(vmcall_pattern); pos++) { + if (kvm_irqchip_in_kernel()) { + pattern = outl_pattern; + alternates[0] = outl_pattern[7]; + alternates[1] = outl_pattern[7]; + patch = &vmcall_pattern[5]; + } else { + pattern = vmcall_pattern; + alternates[0] = vmcall_pattern[7]; + alternates[1] = 0xd9; /* AMD's VMMCALL */ + patch = &outl_pattern[5]; + } + if (memcmp(rom + pos, pattern, 7) == 0 && + (rom[pos + 
7] == alternates[0] || rom[pos + 7] == alternates[1])) { + cpu_physical_memory_write(rom_paddr + pos + 5, patch, 3); + /* + * Don't flush the tb here. Under ordinary conditions, the patched + * calls are miles away from the current IP. Under malicious + * conditions, the guest could trick us into crashing. + */ + } + } + + g_free(rom); + return 0; +} + +/* + * For TCG mode, or for when KVM honors read-only memory regions, we need to + * enable write access to the option ROM so that variables can be updated by + * the guest. + */ +static int vapic_map_rom_writable(VAPICROMState *s) +{ + hwaddr rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK; + MemoryRegionSection section; + MemoryRegion *as; + size_t rom_size; + uint8_t *ram; + + as = sysbus_address_space(&s->busdev); + + if (s->rom_mapped_writable) { + memory_region_del_subregion(as, &s->rom); + object_unparent(OBJECT(&s->rom)); + } + + /* grab RAM memory region (region @rom_paddr may still be pc.rom) */ + section = memory_region_find(as, 0, 1); + + /* read ROM size from RAM region */ + if (rom_paddr + 2 >= memory_region_size(section.mr)) { + return -1; + } + ram = memory_region_get_ram_ptr(section.mr); + rom_size = ram[rom_paddr + 2] * ROM_BLOCK_SIZE; + if (rom_size == 0) { + return -1; + } + s->rom_size = rom_size; + + /* We need to round to avoid creating subpages + * from which we cannot run code. */ + rom_size += rom_paddr & ~TARGET_PAGE_MASK; + rom_paddr &= TARGET_PAGE_MASK; + rom_size = TARGET_PAGE_ALIGN(rom_size); + + memory_region_init_alias(&s->rom, OBJECT(s), "kvmvapic-rom", section.mr, + rom_paddr, rom_size); + memory_region_add_subregion_overlap(as, rom_paddr, &s->rom, 1000); + s->rom_mapped_writable = true; + memory_region_unref(section.mr); + + return 0; +} + +static int vapic_prepare(VAPICROMState *s) +{ + if (vapic_map_rom_writable(s) < 0) { + return -1; + } + + if (patch_hypercalls(s) < 0) { + return -1; + } + + vapic_enable_tpr_reporting(true); + + return 0; +} + +static void vapic_write(void *opaque, hwaddr addr, uint64_t data, + unsigned int size) +{ + VAPICROMState *s = opaque; + X86CPU *cpu; + CPUX86State *env; + hwaddr rom_paddr; + + if (!current_cpu) { + return; + } + + cpu_synchronize_state(current_cpu); + cpu = X86_CPU(current_cpu); + env = &cpu->env; + + /* + * The VAPIC supports three PIO-based hypercalls, all via port 0x7E. + * o 16-bit write access: + * Reports the option ROM initialization to the hypervisor. Written + * value is the offset of the state structure in the ROM. + * o 8-bit write access: + * Reactivates the VAPIC after a guest hibernation, i.e. after the + * option ROM content has been re-initialized by a guest power cycle. + * o 32-bit write access: + * Poll for pending IRQs, considering the current VAPIC state. + */ + switch (size) { + case 2: + if (s->state == VAPIC_INACTIVE) { + rom_paddr = (env->segs[R_CS].base + env->eip) & ROM_BLOCK_MASK; + s->rom_state_paddr = rom_paddr + data; + + s->state = VAPIC_STANDBY; + } + if (vapic_prepare(s) < 0) { + s->state = VAPIC_INACTIVE; + s->rom_state_paddr = 0; + break; + } + break; + case 1: + if (kvm_enabled()) { + /* + * Disable triggering instruction in ROM by writing a NOP. + * + * We cannot do this in TCG mode as the reported IP is not + * accurate.
+ */ + pause_all_vcpus(); + patch_byte(cpu, env->eip - 2, 0x66); + patch_byte(cpu, env->eip - 1, 0x90); + resume_all_vcpus(); + } + + if (s->state == VAPIC_ACTIVE) { + break; + } + if (update_rom_mapping(s, env, env->eip) < 0) { + break; + } + if (find_real_tpr_addr(s, env) < 0) { + break; + } + vapic_enable(s, cpu); + break; + default: + case 4: + if (!kvm_irqchip_in_kernel()) { + apic_poll_irq(cpu->apic_state); + } + break; + } +} + +static uint64_t vapic_read(void *opaque, hwaddr addr, unsigned size) +{ + return 0xffffffff; +} + +static const MemoryRegionOps vapic_ops = { + .write = vapic_write, + .read = vapic_read, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void vapic_realize(DeviceState *dev, Error **errp) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + VAPICROMState *s = VAPIC(dev); + + memory_region_init_io(&s->io, OBJECT(s), &vapic_ops, s, "kvmvapic", 2); + sysbus_add_io(sbd, VAPIC_IO_PORT, &s->io); + sysbus_init_ioports(sbd, VAPIC_IO_PORT, 2); + + option_rom[nb_option_roms].name = "kvmvapic.bin"; + option_rom[nb_option_roms].bootindex = -1; + nb_option_roms++; +} + +static void do_vapic_enable(CPUState *cs, run_on_cpu_data data) +{ + VAPICROMState *s = data.host_ptr; + X86CPU *cpu = X86_CPU(cs); + + static const uint8_t enabled = 1; + cpu_physical_memory_write(s->vapic_paddr + offsetof(VAPICState, enabled), + &enabled, sizeof(enabled)); + apic_enable_vapic(cpu->apic_state, s->vapic_paddr); + s->state = VAPIC_ACTIVE; +} + +static void kvmvapic_vm_state_change(void *opaque, bool running, + RunState state) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + VAPICROMState *s = opaque; + uint8_t *zero; + + if (!running) { + return; + } + + if (s->state == VAPIC_ACTIVE) { + if (ms->smp.cpus == 1) { + run_on_cpu(first_cpu, do_vapic_enable, RUN_ON_CPU_HOST_PTR(s)); + } else { + zero = g_malloc0(s->rom_state.vapic_size); + cpu_physical_memory_write(s->vapic_paddr, zero, + s->rom_state.vapic_size); + g_free(zero); + } + } + + qemu_del_vm_change_state_handler(s->vmsentry); + s->vmsentry = NULL; +} + +static int vapic_post_load(void *opaque, int version_id) +{ + VAPICROMState *s = opaque; + + /* + * The old implementation of qemu-kvm did not provide the state + * VAPIC_STANDBY. Reconstruct it. 
+ */ + if (s->state == VAPIC_INACTIVE && s->rom_state_paddr != 0) { + s->state = VAPIC_STANDBY; + } + + if (s->state != VAPIC_INACTIVE) { + if (vapic_prepare(s) < 0) { + return -1; + } + } + + if (!s->vmsentry) { + s->vmsentry = + qemu_add_vm_change_state_handler(kvmvapic_vm_state_change, s); + } + return 0; +} + +static const VMStateDescription vmstate_handlers = { + .name = "kvmvapic-handlers", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(set_tpr, VAPICHandlers), + VMSTATE_UINT32(set_tpr_eax, VAPICHandlers), + VMSTATE_UINT32_ARRAY(get_tpr, VAPICHandlers, 8), + VMSTATE_UINT32(get_tpr_stack, VAPICHandlers), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_guest_rom = { + .name = "kvmvapic-guest-rom", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UNUSED(8), /* signature */ + VMSTATE_UINT32(vaddr, GuestROMState), + VMSTATE_UINT32(fixup_start, GuestROMState), + VMSTATE_UINT32(fixup_end, GuestROMState), + VMSTATE_UINT32(vapic_vaddr, GuestROMState), + VMSTATE_UINT32(vapic_size, GuestROMState), + VMSTATE_UINT32(vcpu_shift, GuestROMState), + VMSTATE_UINT32(real_tpr_addr, GuestROMState), + VMSTATE_STRUCT(up, GuestROMState, 0, vmstate_handlers, VAPICHandlers), + VMSTATE_STRUCT(mp, GuestROMState, 0, vmstate_handlers, VAPICHandlers), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_vapic = { + .name = "kvm-tpr-opt", /* compatible with qemu-kvm VAPIC */ + .version_id = 1, + .minimum_version_id = 1, + .post_load = vapic_post_load, + .fields = (VMStateField[]) { + VMSTATE_STRUCT(rom_state, VAPICROMState, 0, vmstate_guest_rom, + GuestROMState), + VMSTATE_UINT32(state, VAPICROMState), + VMSTATE_UINT32(real_tpr_addr, VAPICROMState), + VMSTATE_UINT32(rom_state_vaddr, VAPICROMState), + VMSTATE_UINT32(vapic_paddr, VAPICROMState), + VMSTATE_UINT32(rom_state_paddr, VAPICROMState), + VMSTATE_END_OF_LIST() + } +}; + +static void vapic_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->reset = vapic_reset; + dc->vmsd = &vmstate_vapic; + dc->realize = vapic_realize; +} + +static const TypeInfo vapic_type = { + .name = TYPE_VAPIC, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(VAPICROMState), + .class_init = vapic_class_init, +}; + +static void vapic_register(void) +{ + type_register_static(&vapic_type); +} + +type_init(vapic_register); diff --git a/hw/i386/meson.build b/hw/i386/meson.build new file mode 100644 index 000000000..213e2e82b --- /dev/null +++ b/hw/i386/meson.build @@ -0,0 +1,37 @@ +i386_ss = ss.source_set() +i386_ss.add(files( + 'fw_cfg.c', + 'kvmvapic.c', + 'e820_memory_layout.c', + 'multiboot.c', + 'x86.c', +)) + +i386_ss.add(when: 'CONFIG_X86_IOMMU', if_true: files('x86-iommu.c'), + if_false: files('x86-iommu-stub.c')) +i386_ss.add(when: 'CONFIG_AMD_IOMMU', if_true: files('amd_iommu.c')) +i386_ss.add(when: 'CONFIG_I440FX', if_true: files('pc_piix.c')) +i386_ss.add(when: 'CONFIG_MICROVM', if_true: files('microvm.c', 'acpi-microvm.c', 'microvm-dt.c')) +i386_ss.add(when: 'CONFIG_Q35', if_true: files('pc_q35.c')) +i386_ss.add(when: 'CONFIG_VMMOUSE', if_true: files('vmmouse.c')) +i386_ss.add(when: 'CONFIG_VMPORT', if_true: files('vmport.c')) +i386_ss.add(when: 'CONFIG_VTD', if_true: files('intel_iommu.c')) +i386_ss.add(when: 'CONFIG_SGX', if_true: files('sgx-epc.c','sgx.c'), + if_false: files('sgx-stub.c')) + +i386_ss.add(when: 'CONFIG_ACPI', if_true: files('acpi-common.c')) +i386_ss.add(when: 
'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device_x86.c')) +i386_ss.add(when: 'CONFIG_PC', if_true: files( + 'pc.c', + 'pc_sysfw.c', + 'acpi-build.c', + 'port92.c')) +i386_ss.add(when: 'CONFIG_X86_FW_OVMF', if_true: files('pc_sysfw_ovmf.c'), + if_false: files('pc_sysfw_ovmf-stubs.c')) + +subdir('kvm') +subdir('xen') + +i386_ss.add_all(xenpv_ss) + +hw_arch += {'i386': i386_ss} diff --git a/hw/i386/microvm-dt.c b/hw/i386/microvm-dt.c new file mode 100644 index 000000000..9c3c4995b --- /dev/null +++ b/hw/i386/microvm-dt.c @@ -0,0 +1,348 @@ +/* + * microvm device tree support + * + * This generates a device tree for microvm and exports it via fw_cfg + * as "etc/fdt" to the firmware (edk2 specifically). + * + * The use case is to allow edk2 to find the pcie ecam and the virtio + * devices, without adding an ACPI parser, reusing the fdt parser + * which is needed anyway for the arm platform. + * + * Note 1: The device tree is incomplete. CPUs and memory are missing, + * for example; those can be detected using other fw_cfg files. + * Also pci ecam irq routing is not there; edk2 doesn't use + * interrupts. + * + * Note 2: This is for firmware only. OSes should use the more + * complete ACPI tables for hardware discovery. + * + * ---------------------------------------------------------------------- + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2 or later, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>.
+ */ +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "sysemu/device_tree.h" +#include "hw/char/serial.h" +#include "hw/i386/fw_cfg.h" +#include "hw/rtc/mc146818rtc.h" +#include "hw/sysbus.h" +#include "hw/virtio/virtio-mmio.h" +#include "hw/usb/xhci.h" + +#include "microvm-dt.h" + +static bool debug; + +static void dt_add_microvm_irq(MicrovmMachineState *mms, + const char *nodename, uint32_t irq) +{ + int index = 0; + + if (irq >= IO_APIC_SECONDARY_IRQBASE) { + irq -= IO_APIC_SECONDARY_IRQBASE; + index++; + } + + qemu_fdt_setprop_cell(mms->fdt, nodename, "interrupt-parent", + mms->ioapic_phandle[index]); + qemu_fdt_setprop_cells(mms->fdt, nodename, "interrupts", irq, 0); +} + +static void dt_add_virtio(MicrovmMachineState *mms, VirtIOMMIOProxy *mmio) +{ + SysBusDevice *dev = SYS_BUS_DEVICE(mmio); + VirtioBusState *mmio_virtio_bus = &mmio->bus; + BusState *mmio_bus = &mmio_virtio_bus->parent_obj; + char *nodename; + + if (QTAILQ_EMPTY(&mmio_bus->children)) { + return; + } + + hwaddr base = dev->mmio[0].addr; + hwaddr size = 512; + unsigned index = (base - VIRTIO_MMIO_BASE) / size; + uint32_t irq = mms->virtio_irq_base + index; + + nodename = g_strdup_printf("/virtio_mmio@%" PRIx64, base); + qemu_fdt_add_subnode(mms->fdt, nodename); + qemu_fdt_setprop_string(mms->fdt, nodename, "compatible", "virtio,mmio"); + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", 2, base, 2, size); + qemu_fdt_setprop(mms->fdt, nodename, "dma-coherent", NULL, 0); + dt_add_microvm_irq(mms, nodename, irq); + g_free(nodename); +} + +static void dt_add_xhci(MicrovmMachineState *mms) +{ + const char compat[] = "generic-xhci"; + uint32_t irq = MICROVM_XHCI_IRQ; + hwaddr base = MICROVM_XHCI_BASE; + hwaddr size = XHCI_LEN_REGS; + char *nodename; + + nodename = g_strdup_printf("/usb@%" PRIx64, base); + qemu_fdt_add_subnode(mms->fdt, nodename); + qemu_fdt_setprop(mms->fdt, nodename, "compatible", compat, sizeof(compat)); + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", 2, base, 2, size); + qemu_fdt_setprop(mms->fdt, nodename, "dma-coherent", NULL, 0); + dt_add_microvm_irq(mms, nodename, irq); + g_free(nodename); +} + +static void dt_add_pcie(MicrovmMachineState *mms) +{ + hwaddr base = PCIE_MMIO_BASE; + int nr_pcie_buses; + char *nodename; + + nodename = g_strdup_printf("/pcie@%" PRIx64, base); + qemu_fdt_add_subnode(mms->fdt, nodename); + qemu_fdt_setprop_string(mms->fdt, nodename, + "compatible", "pci-host-ecam-generic"); + qemu_fdt_setprop_string(mms->fdt, nodename, "device_type", "pci"); + qemu_fdt_setprop_cell(mms->fdt, nodename, "#address-cells", 3); + qemu_fdt_setprop_cell(mms->fdt, nodename, "#size-cells", 2); + qemu_fdt_setprop_cell(mms->fdt, nodename, "linux,pci-domain", 0); + qemu_fdt_setprop(mms->fdt, nodename, "dma-coherent", NULL, 0); + + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", + 2, PCIE_ECAM_BASE, 2, PCIE_ECAM_SIZE); + if (mms->gpex.mmio64.size) { + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "ranges", + + 1, FDT_PCI_RANGE_MMIO, + 2, mms->gpex.mmio32.base, + 2, mms->gpex.mmio32.base, + 2, mms->gpex.mmio32.size, + + 1, FDT_PCI_RANGE_MMIO_64BIT, + 2, mms->gpex.mmio64.base, + 2, mms->gpex.mmio64.base, + 2, mms->gpex.mmio64.size); + } else { + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "ranges", + + 1, FDT_PCI_RANGE_MMIO, + 2, mms->gpex.mmio32.base, + 2, mms->gpex.mmio32.base, + 2, mms->gpex.mmio32.size); + } + + nr_pcie_buses = PCIE_ECAM_SIZE / PCIE_MMCFG_SIZE_MIN; + qemu_fdt_setprop_cells(mms->fdt, nodename, "bus-range", 0, + nr_pcie_buses - 1); + + 
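/* bus-range is inclusive, hence nr_pcie_buses - 1; each bus needs one PCIE_MMCFG_SIZE_MIN slice of the ECAM window */ + 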
g_free(nodename); +} + +static void dt_add_ioapic(MicrovmMachineState *mms, SysBusDevice *dev) +{ + hwaddr base = dev->mmio[0].addr; + char *nodename; + uint32_t ph; + int index; + + switch (base) { + case IO_APIC_DEFAULT_ADDRESS: + index = 0; + break; + case IO_APIC_SECONDARY_ADDRESS: + index = 1; + break; + default: + fprintf(stderr, "unknown ioapic @ %" PRIx64 "\n", base); + return; + } + + nodename = g_strdup_printf("/ioapic%d@%" PRIx64, index + 1, base); + qemu_fdt_add_subnode(mms->fdt, nodename); + qemu_fdt_setprop_string(mms->fdt, nodename, + "compatible", "intel,ce4100-ioapic"); + qemu_fdt_setprop(mms->fdt, nodename, "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(mms->fdt, nodename, "#interrupt-cells", 0x2); + qemu_fdt_setprop_cell(mms->fdt, nodename, "#address-cells", 0x2); + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", + 2, base, 2, 0x1000); + + ph = qemu_fdt_alloc_phandle(mms->fdt); + qemu_fdt_setprop_cell(mms->fdt, nodename, "phandle", ph); + qemu_fdt_setprop_cell(mms->fdt, nodename, "linux,phandle", ph); + mms->ioapic_phandle[index] = ph; + + g_free(nodename); +} + +static void dt_add_isa_serial(MicrovmMachineState *mms, ISADevice *dev) +{ + const char compat[] = "ns16550"; + uint32_t irq = object_property_get_int(OBJECT(dev), "irq", NULL); + hwaddr base = object_property_get_int(OBJECT(dev), "iobase", NULL); + hwaddr size = 8; + char *nodename; + + nodename = g_strdup_printf("/serial@%" PRIx64, base); + qemu_fdt_add_subnode(mms->fdt, nodename); + qemu_fdt_setprop(mms->fdt, nodename, "compatible", compat, sizeof(compat)); + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", 2, base, 2, size); + dt_add_microvm_irq(mms, nodename, irq); + + if (base == 0x3f8 /* com1 */) { + qemu_fdt_setprop_string(mms->fdt, "/chosen", "stdout-path", nodename); + } + + g_free(nodename); +} + +static void dt_add_isa_rtc(MicrovmMachineState *mms, ISADevice *dev) +{ + const char compat[] = "motorola,mc146818"; + uint32_t irq = RTC_ISA_IRQ; + hwaddr base = RTC_ISA_BASE; + hwaddr size = 8; + char *nodename; + + nodename = g_strdup_printf("/rtc@%" PRIx64, base); + qemu_fdt_add_subnode(mms->fdt, nodename); + qemu_fdt_setprop(mms->fdt, nodename, "compatible", compat, sizeof(compat)); + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", 2, base, 2, size); + dt_add_microvm_irq(mms, nodename, irq); + g_free(nodename); +} + +static void dt_setup_isa_bus(MicrovmMachineState *mms, DeviceState *bridge) +{ + BusState *bus = qdev_get_child_bus(bridge, "isa.0"); + BusChild *kid; + Object *obj; + + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + + /* serial */ + obj = object_dynamic_cast(OBJECT(dev), TYPE_ISA_SERIAL); + if (obj) { + dt_add_isa_serial(mms, ISA_DEVICE(obj)); + continue; + } + + /* rtc */ + obj = object_dynamic_cast(OBJECT(dev), TYPE_MC146818_RTC); + if (obj) { + dt_add_isa_rtc(mms, ISA_DEVICE(obj)); + continue; + } + + if (debug) { + fprintf(stderr, "%s: unhandled: %s\n", __func__, + object_get_typename(OBJECT(dev))); + } + } +} + +static void dt_setup_sys_bus(MicrovmMachineState *mms) +{ + BusState *bus; + BusChild *kid; + Object *obj; + + /* sysbus devices */ + bus = sysbus_get_default(); + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + + /* ioapic */ + obj = object_dynamic_cast(OBJECT(dev), TYPE_IOAPIC); + if (obj) { + dt_add_ioapic(mms, SYS_BUS_DEVICE(obj)); + continue; + } + } + + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + + /* virtio */ + obj = 
object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MMIO); + if (obj) { + dt_add_virtio(mms, VIRTIO_MMIO(obj)); + continue; + } + + /* xhci */ + obj = object_dynamic_cast(OBJECT(dev), TYPE_XHCI_SYSBUS); + if (obj) { + dt_add_xhci(mms); + continue; + } + + /* pcie */ + obj = object_dynamic_cast(OBJECT(dev), TYPE_GPEX_HOST); + if (obj) { + dt_add_pcie(mms); + continue; + } + + /* isa */ + obj = object_dynamic_cast(OBJECT(dev), "isabus-bridge"); + if (obj) { + dt_setup_isa_bus(mms, DEVICE(obj)); + continue; + } + + if (debug) { + obj = object_dynamic_cast(OBJECT(dev), TYPE_IOAPIC); + if (obj) { + /* ioapic already added in first pass */ + continue; + } + fprintf(stderr, "%s: unhandled: %s\n", __func__, + object_get_typename(OBJECT(dev))); + } + } +} + +void dt_setup_microvm(MicrovmMachineState *mms) +{ + X86MachineState *x86ms = X86_MACHINE(mms); + int size = 0; + + mms->fdt = create_device_tree(&size); + + /* root node */ + qemu_fdt_setprop_string(mms->fdt, "/", "compatible", "linux,microvm"); + qemu_fdt_setprop_cell(mms->fdt, "/", "#address-cells", 0x2); + qemu_fdt_setprop_cell(mms->fdt, "/", "#size-cells", 0x2); + + qemu_fdt_add_subnode(mms->fdt, "/chosen"); + dt_setup_sys_bus(mms); + + /* add to fw_cfg */ + if (debug) { + fprintf(stderr, "%s: add etc/fdt to fw_cfg\n", __func__); + } + fw_cfg_add_file(x86ms->fw_cfg, "etc/fdt", mms->fdt, size); + + if (debug) { + fprintf(stderr, "%s: writing microvm.fdt\n", __func__); + if (!g_file_set_contents("microvm.fdt", mms->fdt, size, NULL)) { + fprintf(stderr, "%s: writing microvm.fdt failed\n", __func__); + return; + } + int ret = system("dtc -I dtb -O dts microvm.fdt"); + if (ret != 0) { + fprintf(stderr, "%s: oops, dtc not installed?\n", __func__); + } + } +} diff --git a/hw/i386/microvm-dt.h b/hw/i386/microvm-dt.h new file mode 100644 index 000000000..77c79cbdd --- /dev/null +++ b/hw/i386/microvm-dt.h @@ -0,0 +1,8 @@ +#ifndef HW_I386_MICROVM_DT_H +#define HW_I386_MICROVM_DT_H + +#include "hw/i386/microvm.h" + +void dt_setup_microvm(MicrovmMachineState *mms); + +#endif diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c new file mode 100644 index 000000000..4b3b1dd26 --- /dev/null +++ b/hw/i386/microvm.c @@ -0,0 +1,779 @@ +/* + * Copyright (c) 2018 Intel Corporation + * Copyright (c) 2019 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2 or later, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qemu/cutils.h" +#include "qemu/units.h" +#include "qapi/error.h" +#include "qapi/visitor.h" +#include "qapi/qapi-visit-common.h" +#include "sysemu/sysemu.h" +#include "sysemu/cpus.h" +#include "sysemu/numa.h" +#include "sysemu/reset.h" +#include "sysemu/runstate.h" +#include "acpi-microvm.h" +#include "microvm-dt.h" + +#include "hw/loader.h" +#include "hw/irq.h" +#include "hw/kvm/clock.h" +#include "hw/i386/microvm.h" +#include "hw/i386/x86.h" +#include "target/i386/cpu.h" +#include "hw/intc/i8259.h" +#include "hw/timer/i8254.h" +#include "hw/rtc/mc146818rtc.h" +#include "hw/char/serial.h" +#include "hw/display/ramfb.h" +#include "hw/i386/topology.h" +#include "hw/i386/e820_memory_layout.h" +#include "hw/i386/fw_cfg.h" +#include "hw/virtio/virtio-mmio.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/generic_event_device.h" +#include "hw/pci-host/gpex.h" +#include "hw/usb/xhci.h" + +#include "elf.h" +#include "kvm/kvm_i386.h" +#include "hw/xen/start_info.h" + +#define MICROVM_QBOOT_FILENAME "qboot.rom" +#define MICROVM_BIOS_FILENAME "bios-microvm.bin" + +static void microvm_set_rtc(MicrovmMachineState *mms, ISADevice *s) +{ + X86MachineState *x86ms = X86_MACHINE(mms); + int val; + + val = MIN(x86ms->below_4g_mem_size / KiB, 640); + rtc_set_memory(s, 0x15, val); + rtc_set_memory(s, 0x16, val >> 8); + /* extended memory (next 64MiB) */ + if (x86ms->below_4g_mem_size > 1 * MiB) { + val = (x86ms->below_4g_mem_size - 1 * MiB) / KiB; + } else { + val = 0; + } + if (val > 65535) { + val = 65535; + } + rtc_set_memory(s, 0x17, val); + rtc_set_memory(s, 0x18, val >> 8); + rtc_set_memory(s, 0x30, val); + rtc_set_memory(s, 0x31, val >> 8); + /* memory between 16MiB and 4GiB */ + if (x86ms->below_4g_mem_size > 16 * MiB) { + val = (x86ms->below_4g_mem_size - 16 * MiB) / (64 * KiB); + } else { + val = 0; + } + if (val > 65535) { + val = 65535; + } + rtc_set_memory(s, 0x34, val); + rtc_set_memory(s, 0x35, val >> 8); + /* memory above 4GiB */ + val = x86ms->above_4g_mem_size / 65536; + rtc_set_memory(s, 0x5b, val); + rtc_set_memory(s, 0x5c, val >> 8); + rtc_set_memory(s, 0x5d, val >> 16); +} + +static void create_gpex(MicrovmMachineState *mms) +{ + X86MachineState *x86ms = X86_MACHINE(mms); + MemoryRegion *mmio32_alias; + MemoryRegion *mmio64_alias; + MemoryRegion *mmio_reg; + MemoryRegion *ecam_alias; + MemoryRegion *ecam_reg; + DeviceState *dev; + int i; + + dev = qdev_new(TYPE_GPEX_HOST); + sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); + + /* Map only the first size_ecam bytes of ECAM space */ + ecam_alias = g_new0(MemoryRegion, 1); + ecam_reg = sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 0); + memory_region_init_alias(ecam_alias, OBJECT(dev), "pcie-ecam", + ecam_reg, 0, mms->gpex.ecam.size); + memory_region_add_subregion(get_system_memory(), + mms->gpex.ecam.base, ecam_alias); + + /* Map the MMIO window into system address space so as to expose + * the section of PCI MMIO space which starts at the same base address + * (ie 1:1 mapping for that part of PCI MMIO space visible through + * the window). 
+ */ + mmio_reg = sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 1); + if (mms->gpex.mmio32.size) { + mmio32_alias = g_new0(MemoryRegion, 1); + memory_region_init_alias(mmio32_alias, OBJECT(dev), "pcie-mmio32", mmio_reg, + mms->gpex.mmio32.base, mms->gpex.mmio32.size); + memory_region_add_subregion(get_system_memory(), + mms->gpex.mmio32.base, mmio32_alias); + } + if (mms->gpex.mmio64.size) { + mmio64_alias = g_new0(MemoryRegion, 1); + memory_region_init_alias(mmio64_alias, OBJECT(dev), "pcie-mmio64", mmio_reg, + mms->gpex.mmio64.base, mms->gpex.mmio64.size); + memory_region_add_subregion(get_system_memory(), + mms->gpex.mmio64.base, mmio64_alias); + } + + for (i = 0; i < GPEX_NUM_IRQS; i++) { + sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, + x86ms->gsi[mms->gpex.irq + i]); + } +} + +static int microvm_ioapics(MicrovmMachineState *mms) +{ + if (!x86_machine_is_acpi_enabled(X86_MACHINE(mms))) { + return 1; + } + if (mms->ioapic2 == ON_OFF_AUTO_OFF) { + return 1; + } + return 2; +} + +static void microvm_devices_init(MicrovmMachineState *mms) +{ + const char *default_firmware; + X86MachineState *x86ms = X86_MACHINE(mms); + ISABus *isa_bus; + ISADevice *rtc_state; + GSIState *gsi_state; + int ioapics; + int i; + + /* Core components */ + ioapics = microvm_ioapics(mms); + gsi_state = g_malloc0(sizeof(*gsi_state)); + x86ms->gsi = qemu_allocate_irqs(gsi_handler, gsi_state, + IOAPIC_NUM_PINS * ioapics); + + isa_bus = isa_bus_new(NULL, get_system_memory(), get_system_io(), + &error_abort); + isa_bus_irqs(isa_bus, x86ms->gsi); + + ioapic_init_gsi(gsi_state, "machine"); + if (ioapics > 1) { + x86ms->ioapic2 = ioapic_init_secondary(gsi_state); + } + + kvmclock_create(true); + + mms->virtio_irq_base = 5; + mms->virtio_num_transports = 8; + if (x86ms->ioapic2) { + mms->pcie_irq_base = 16; /* 16 -> 19 */ + /* use second ioapic (24 -> 47) for virtio-mmio irq lines */ + mms->virtio_irq_base = IO_APIC_SECONDARY_IRQBASE; + mms->virtio_num_transports = IOAPIC_NUM_PINS; + } else if (x86_machine_is_acpi_enabled(x86ms)) { + mms->pcie_irq_base = 12; /* 12 -> 15 */ + mms->virtio_irq_base = 16; /* 16 -> 23 */ + } + + for (i = 0; i < mms->virtio_num_transports; i++) { + sysbus_create_simple("virtio-mmio", + VIRTIO_MMIO_BASE + i * 512, + x86ms->gsi[mms->virtio_irq_base + i]); + } + + /* Optional and legacy devices */ + if (x86_machine_is_acpi_enabled(x86ms)) { + DeviceState *dev = qdev_new(TYPE_ACPI_GED_X86); + qdev_prop_set_uint32(dev, "ged-event", ACPI_GED_PWR_DOWN_EVT); + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, GED_MMIO_BASE); + /* sysbus_mmio_map(SYS_BUS_DEVICE(dev), 1, GED_MMIO_BASE_MEMHP); */ + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 2, GED_MMIO_BASE_REGS); + sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, + x86ms->gsi[GED_MMIO_IRQ]); + sysbus_realize(SYS_BUS_DEVICE(dev), &error_fatal); + x86ms->acpi_dev = HOTPLUG_HANDLER(dev); + } + + if (x86_machine_is_acpi_enabled(x86ms) && machine_usb(MACHINE(mms))) { + DeviceState *dev = qdev_new(TYPE_XHCI_SYSBUS); + qdev_prop_set_uint32(dev, "intrs", 1); + qdev_prop_set_uint32(dev, "slots", XHCI_MAXSLOTS); + qdev_prop_set_uint32(dev, "p2", 8); + qdev_prop_set_uint32(dev, "p3", 8); + sysbus_realize(SYS_BUS_DEVICE(dev), &error_fatal); + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, MICROVM_XHCI_BASE); + sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, + x86ms->gsi[MICROVM_XHCI_IRQ]); + } + + if (x86_machine_is_acpi_enabled(x86ms) && mms->pcie == ON_OFF_AUTO_ON) { + /* use topmost 25% of the address space available */ + hwaddr phys_size = (hwaddr)1 << X86_CPU(first_cpu)->phys_bits; + if (phys_size > 
0x1000000ll) { + mms->gpex.mmio64.size = phys_size / 4; + mms->gpex.mmio64.base = phys_size - mms->gpex.mmio64.size; + } + mms->gpex.mmio32.base = PCIE_MMIO_BASE; + mms->gpex.mmio32.size = PCIE_MMIO_SIZE; + mms->gpex.ecam.base = PCIE_ECAM_BASE; + mms->gpex.ecam.size = PCIE_ECAM_SIZE; + mms->gpex.irq = mms->pcie_irq_base; + create_gpex(mms); + x86ms->pci_irq_mask = ((1 << (mms->pcie_irq_base + 0)) | + (1 << (mms->pcie_irq_base + 1)) | + (1 << (mms->pcie_irq_base + 2)) | + (1 << (mms->pcie_irq_base + 3))); + } else { + x86ms->pci_irq_mask = 0; + } + + if (mms->pic == ON_OFF_AUTO_ON || mms->pic == ON_OFF_AUTO_AUTO) { + qemu_irq *i8259; + + i8259 = i8259_init(isa_bus, x86_allocate_cpu_irq()); + for (i = 0; i < ISA_NUM_IRQS; i++) { + gsi_state->i8259_irq[i] = i8259[i]; + } + g_free(i8259); + } + + if (mms->pit == ON_OFF_AUTO_ON || mms->pit == ON_OFF_AUTO_AUTO) { + if (kvm_pit_in_kernel()) { + kvm_pit_init(isa_bus, 0x40); + } else { + i8254_pit_init(isa_bus, 0x40, 0, NULL); + } + } + + if (mms->rtc == ON_OFF_AUTO_ON || + (mms->rtc == ON_OFF_AUTO_AUTO && !kvm_enabled())) { + rtc_state = mc146818_rtc_init(isa_bus, 2000, NULL); + microvm_set_rtc(mms, rtc_state); + } + + if (mms->isa_serial) { + serial_hds_isa_init(isa_bus, 0, 1); + } + + default_firmware = x86_machine_is_acpi_enabled(x86ms) + ? MICROVM_BIOS_FILENAME + : MICROVM_QBOOT_FILENAME; + x86_bios_rom_init(MACHINE(mms), default_firmware, get_system_memory(), true); +} + +static void microvm_memory_init(MicrovmMachineState *mms) +{ + MachineState *machine = MACHINE(mms); + X86MachineState *x86ms = X86_MACHINE(mms); + MemoryRegion *ram_below_4g, *ram_above_4g; + MemoryRegion *system_memory = get_system_memory(); + FWCfgState *fw_cfg; + ram_addr_t lowmem = 0xc0000000; /* 3G */ + int i; + + if (machine->ram_size > lowmem) { + x86ms->above_4g_mem_size = machine->ram_size - lowmem; + x86ms->below_4g_mem_size = lowmem; + } else { + x86ms->above_4g_mem_size = 0; + x86ms->below_4g_mem_size = machine->ram_size; + } + + ram_below_4g = g_malloc(sizeof(*ram_below_4g)); + memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram, + 0, x86ms->below_4g_mem_size); + memory_region_add_subregion(system_memory, 0, ram_below_4g); + + e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM); + + if (x86ms->above_4g_mem_size > 0) { + ram_above_4g = g_malloc(sizeof(*ram_above_4g)); + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", + machine->ram, + x86ms->below_4g_mem_size, + x86ms->above_4g_mem_size); + memory_region_add_subregion(system_memory, 0x100000000ULL, + ram_above_4g); + e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM); + } + + fw_cfg = fw_cfg_init_io_dma(FW_CFG_IO_BASE, FW_CFG_IO_BASE + 4, + &address_space_memory); + + fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, machine->smp.cpus); + fw_cfg_add_i16(fw_cfg, FW_CFG_MAX_CPUS, machine->smp.max_cpus); + fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, (uint64_t)machine->ram_size); + fw_cfg_add_i32(fw_cfg, FW_CFG_IRQ0_OVERRIDE, 1); + fw_cfg_add_bytes(fw_cfg, FW_CFG_E820_TABLE, + &e820_reserve, sizeof(e820_reserve)); + fw_cfg_add_file(fw_cfg, "etc/e820", e820_table, + sizeof(struct e820_entry) * e820_get_num_entries()); + + rom_set_fw(fw_cfg); + + if (machine->kernel_filename != NULL) { + x86_load_linux(x86ms, fw_cfg, 0, true); + } + + if (mms->option_roms) { + for (i = 0; i < nb_option_roms; i++) { + rom_add_option(option_rom[i].name, option_rom[i].bootindex); + } + } + + x86ms->fw_cfg = fw_cfg; + x86ms->ioapic_as = &address_space_memory; +} + +static gchar 
*microvm_get_mmio_cmdline(gchar *name, uint32_t virtio_irq_base) +{ + gchar *cmdline; + gchar *separator; + long int index; + int ret; + + separator = g_strrstr(name, "."); + if (!separator) { + return NULL; + } + + if (qemu_strtol(separator + 1, NULL, 10, &index) != 0) { + return NULL; + } + + cmdline = g_malloc0(VIRTIO_CMDLINE_MAXLEN); + ret = g_snprintf(cmdline, VIRTIO_CMDLINE_MAXLEN, + " virtio_mmio.device=512@0x%lx:%ld", + VIRTIO_MMIO_BASE + index * 512, + virtio_irq_base + index); + if (ret < 0 || ret >= VIRTIO_CMDLINE_MAXLEN) { + g_free(cmdline); + return NULL; + } + + return cmdline; +} + +static void microvm_fix_kernel_cmdline(MachineState *machine) +{ + X86MachineState *x86ms = X86_MACHINE(machine); + MicrovmMachineState *mms = MICROVM_MACHINE(machine); + BusState *bus; + BusChild *kid; + char *cmdline; + + /* + * Find MMIO transports with attached devices, and add them to the kernel + * command line. + * + * Yes, this is a hack, but one that heavily improves the UX without + * introducing any significant issues. + */ + cmdline = g_strdup(machine->kernel_cmdline); + bus = sysbus_get_default(); + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + ObjectClass *class = object_get_class(OBJECT(dev)); + + if (class == object_class_by_name(TYPE_VIRTIO_MMIO)) { + VirtIOMMIOProxy *mmio = VIRTIO_MMIO(OBJECT(dev)); + VirtioBusState *mmio_virtio_bus = &mmio->bus; + BusState *mmio_bus = &mmio_virtio_bus->parent_obj; + + if (!QTAILQ_EMPTY(&mmio_bus->children)) { + gchar *mmio_cmdline = microvm_get_mmio_cmdline + (mmio_bus->name, mms->virtio_irq_base); + if (mmio_cmdline) { + char *newcmd = g_strjoin(NULL, cmdline, mmio_cmdline, NULL); + g_free(mmio_cmdline); + g_free(cmdline); + cmdline = newcmd; + } + } + } + } + + fw_cfg_modify_i32(x86ms->fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(cmdline) + 1); + fw_cfg_modify_string(x86ms->fw_cfg, FW_CFG_CMDLINE_DATA, cmdline); + + g_free(cmdline); +} + +static void microvm_device_pre_plug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + X86CPU *cpu = X86_CPU(dev); + + cpu->host_phys_bits = true; /* need reliable phys-bits */ + x86_cpu_pre_plug(hotplug_dev, dev, errp); +} + +static void microvm_device_plug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + x86_cpu_plug(hotplug_dev, dev, errp); +} + +static void microvm_device_unplug_request_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + error_setg(errp, "unplug not supported by microvm"); +} + +static void microvm_device_unplug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + error_setg(errp, "unplug not supported by microvm"); +} + +static HotplugHandler *microvm_get_hotplug_handler(MachineState *machine, + DeviceState *dev) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + return HOTPLUG_HANDLER(machine); + } + return NULL; +} + +static void microvm_machine_state_init(MachineState *machine) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(machine); + X86MachineState *x86ms = X86_MACHINE(machine); + + microvm_memory_init(mms); + + x86_cpus_init(x86ms, CPU_VERSION_LATEST); + + microvm_devices_init(mms); +} + +static void microvm_machine_reset(MachineState *machine) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(machine); + CPUState *cs; + X86CPU *cpu; + + if (!x86_machine_is_acpi_enabled(X86_MACHINE(machine)) && + machine->kernel_filename != NULL && + mms->auto_kernel_cmdline && !mms->kernel_cmdline_fixed) { + microvm_fix_kernel_cmdline(machine); + mms->kernel_cmdline_fixed = true; + } + 
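+ /* reset all devices, then put each vCPU's local APIC back into its power-on state */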
+ qemu_devices_reset(); + + CPU_FOREACH(cs) { + cpu = X86_CPU(cs); + + if (cpu->apic_state) { + device_legacy_reset(cpu->apic_state); + } + } +} + +static void microvm_machine_get_pic(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + OnOffAuto pic = mms->pic; + + visit_type_OnOffAuto(v, name, &pic, errp); +} + +static void microvm_machine_set_pic(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &mms->pic, errp); +} + +static void microvm_machine_get_pit(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + OnOffAuto pit = mms->pit; + + visit_type_OnOffAuto(v, name, &pit, errp); +} + +static void microvm_machine_set_pit(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &mms->pit, errp); +} + +static void microvm_machine_get_rtc(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + OnOffAuto rtc = mms->rtc; + + visit_type_OnOffAuto(v, name, &rtc, errp); +} + +static void microvm_machine_set_rtc(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &mms->rtc, errp); +} + +static void microvm_machine_get_pcie(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + OnOffAuto pcie = mms->pcie; + + visit_type_OnOffAuto(v, name, &pcie, errp); +} + +static void microvm_machine_set_pcie(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &mms->pcie, errp); +} + +static void microvm_machine_get_ioapic2(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + OnOffAuto ioapic2 = mms->ioapic2; + + visit_type_OnOffAuto(v, name, &ioapic2, errp); +} + +static void microvm_machine_set_ioapic2(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &mms->ioapic2, errp); +} + +static bool microvm_machine_get_isa_serial(Object *obj, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + return mms->isa_serial; +} + +static void microvm_machine_set_isa_serial(Object *obj, bool value, + Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + mms->isa_serial = value; +} + +static bool microvm_machine_get_option_roms(Object *obj, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + return mms->option_roms; +} + +static void microvm_machine_set_option_roms(Object *obj, bool value, + Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + mms->option_roms = value; +} + +static bool microvm_machine_get_auto_kernel_cmdline(Object *obj, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + return mms->auto_kernel_cmdline; +} + +static void microvm_machine_set_auto_kernel_cmdline(Object *obj, bool value, + Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + mms->auto_kernel_cmdline = value; +} + +static void 
microvm_machine_done(Notifier *notifier, void *data) +{ + MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState, + machine_done); + + acpi_setup_microvm(mms); + dt_setup_microvm(mms); +} + +static void microvm_powerdown_req(Notifier *notifier, void *data) +{ + MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState, + powerdown_req); + X86MachineState *x86ms = X86_MACHINE(mms); + + if (x86ms->acpi_dev) { + Object *obj = OBJECT(x86ms->acpi_dev); + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj); + adevc->send_event(ACPI_DEVICE_IF(x86ms->acpi_dev), + ACPI_POWER_DOWN_STATUS); + } +} + +static void microvm_machine_initfn(Object *obj) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + /* Configuration */ + mms->pic = ON_OFF_AUTO_AUTO; + mms->pit = ON_OFF_AUTO_AUTO; + mms->rtc = ON_OFF_AUTO_AUTO; + mms->pcie = ON_OFF_AUTO_AUTO; + mms->ioapic2 = ON_OFF_AUTO_AUTO; + mms->isa_serial = true; + mms->option_roms = true; + mms->auto_kernel_cmdline = true; + + /* State */ + mms->kernel_cmdline_fixed = false; + + mms->machine_done.notify = microvm_machine_done; + qemu_add_machine_init_done_notifier(&mms->machine_done); + mms->powerdown_req.notify = microvm_powerdown_req; + qemu_register_powerdown_notifier(&mms->powerdown_req); +} + +static void microvm_class_init(ObjectClass *oc, void *data) +{ + X86MachineClass *x86mc = X86_MACHINE_CLASS(oc); + MachineClass *mc = MACHINE_CLASS(oc); + HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); + + mc->init = microvm_machine_state_init; + + mc->family = "microvm_i386"; + mc->desc = "microvm (i386)"; + mc->units_per_default_bus = 1; + mc->no_floppy = 1; + mc->max_cpus = 288; + mc->has_hotpluggable_cpus = false; + mc->auto_enable_numa_with_memhp = false; + mc->auto_enable_numa_with_memdev = false; + mc->default_cpu_type = TARGET_DEFAULT_CPU_TYPE; + mc->nvdimm_supported = false; + mc->default_ram_id = "microvm.ram"; + + /* Avoid relying too much on kernel components */ + mc->default_kernel_irqchip_split = true; + + /* Machine class handlers */ + mc->reset = microvm_machine_reset; + + /* hotplug (for cpu coldplug) */ + mc->get_hotplug_handler = microvm_get_hotplug_handler; + hc->pre_plug = microvm_device_pre_plug_cb; + hc->plug = microvm_device_plug_cb; + hc->unplug_request = microvm_device_unplug_request_cb; + hc->unplug = microvm_device_unplug_cb; + + x86mc->fwcfg_dma_enabled = true; + + object_class_property_add(oc, MICROVM_MACHINE_PIC, "OnOffAuto", + microvm_machine_get_pic, + microvm_machine_set_pic, + NULL, NULL); + object_class_property_set_description(oc, MICROVM_MACHINE_PIC, + "Enable i8259 PIC"); + + object_class_property_add(oc, MICROVM_MACHINE_PIT, "OnOffAuto", + microvm_machine_get_pit, + microvm_machine_set_pit, + NULL, NULL); + object_class_property_set_description(oc, MICROVM_MACHINE_PIT, + "Enable i8254 PIT"); + + object_class_property_add(oc, MICROVM_MACHINE_RTC, "OnOffAuto", + microvm_machine_get_rtc, + microvm_machine_set_rtc, + NULL, NULL); + object_class_property_set_description(oc, MICROVM_MACHINE_RTC, + "Enable MC146818 RTC"); + + object_class_property_add(oc, MICROVM_MACHINE_PCIE, "OnOffAuto", + microvm_machine_get_pcie, + microvm_machine_set_pcie, + NULL, NULL); + object_class_property_set_description(oc, MICROVM_MACHINE_PCIE, + "Enable PCIe"); + + object_class_property_add(oc, MICROVM_MACHINE_IOAPIC2, "OnOffAuto", + microvm_machine_get_ioapic2, + microvm_machine_set_ioapic2, + NULL, NULL); + object_class_property_set_description(oc, MICROVM_MACHINE_IOAPIC2, + "Enable second IO-APIC"); + + 
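/* plain bool properties; all three default to on in microvm_machine_initfn() */ + 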
object_class_property_add_bool(oc, MICROVM_MACHINE_ISA_SERIAL, + microvm_machine_get_isa_serial, + microvm_machine_set_isa_serial); + object_class_property_set_description(oc, MICROVM_MACHINE_ISA_SERIAL, + "Set off to disable the instantiation of an ISA serial port"); + + object_class_property_add_bool(oc, MICROVM_MACHINE_OPTION_ROMS, + microvm_machine_get_option_roms, + microvm_machine_set_option_roms); + object_class_property_set_description(oc, MICROVM_MACHINE_OPTION_ROMS, + "Set off to disable loading option ROMs"); + + object_class_property_add_bool(oc, MICROVM_MACHINE_AUTO_KERNEL_CMDLINE, + microvm_machine_get_auto_kernel_cmdline, + microvm_machine_set_auto_kernel_cmdline); + object_class_property_set_description(oc, + MICROVM_MACHINE_AUTO_KERNEL_CMDLINE, + "Set off to disable adding virtio-mmio devices to the kernel cmdline"); + + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); +} + +static const TypeInfo microvm_machine_info = { + .name = TYPE_MICROVM_MACHINE, + .parent = TYPE_X86_MACHINE, + .instance_size = sizeof(MicrovmMachineState), + .instance_init = microvm_machine_initfn, + .class_size = sizeof(MicrovmMachineClass), + .class_init = microvm_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_HOTPLUG_HANDLER }, + { } + }, +}; + +static void microvm_machine_init(void) +{ + type_register_static(&microvm_machine_info); +} +type_init(microvm_machine_init); diff --git a/hw/i386/multiboot.c b/hw/i386/multiboot.c new file mode 100644 index 000000000..0a10089f1 --- /dev/null +++ b/hw/i386/multiboot.c @@ -0,0 +1,415 @@ +/* + * QEMU PC System Emulator + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/option.h" +#include "cpu.h" +#include "hw/nvram/fw_cfg.h" +#include "multiboot.h" +#include "hw/loader.h" +#include "elf.h" +#include "sysemu/sysemu.h" +#include "qemu/error-report.h" + +/* Show multiboot debug output */ +//#define DEBUG_MULTIBOOT + +#ifdef DEBUG_MULTIBOOT +#define mb_debug(a...) error_report(a) +#else +#define mb_debug(a...)
+#endif + +#define MULTIBOOT_STRUCT_ADDR 0x9000 + +#if MULTIBOOT_STRUCT_ADDR > 0xf0000 +#error multiboot struct needs to fit in 16 bit real mode +#endif + +enum { + /* Multiboot info */ + MBI_FLAGS = 0, + MBI_MEM_LOWER = 4, + MBI_MEM_UPPER = 8, + MBI_BOOT_DEVICE = 12, + MBI_CMDLINE = 16, + MBI_MODS_COUNT = 20, + MBI_MODS_ADDR = 24, + MBI_MMAP_ADDR = 48, + MBI_BOOTLOADER = 64, + + MBI_SIZE = 88, + + /* Multiboot modules */ + MB_MOD_START = 0, + MB_MOD_END = 4, + MB_MOD_CMDLINE = 8, + + MB_MOD_SIZE = 16, + + /* Region offsets */ + ADDR_E820_MAP = MULTIBOOT_STRUCT_ADDR + 0, + ADDR_MBI = ADDR_E820_MAP + 0x500, + + /* Multiboot flags */ + MULTIBOOT_FLAGS_MEMORY = 1 << 0, + MULTIBOOT_FLAGS_BOOT_DEVICE = 1 << 1, + MULTIBOOT_FLAGS_CMDLINE = 1 << 2, + MULTIBOOT_FLAGS_MODULES = 1 << 3, + MULTIBOOT_FLAGS_MMAP = 1 << 6, + MULTIBOOT_FLAGS_BOOTLOADER = 1 << 9, +}; + +typedef struct { + /* buffer holding kernel, cmdlines and mb_infos */ + void *mb_buf; + /* address in target */ + hwaddr mb_buf_phys; + /* size of mb_buf in bytes */ + unsigned mb_buf_size; + /* offset of mb-info's in bytes */ + hwaddr offset_mbinfo; + /* offset in buffer for cmdlines in bytes */ + hwaddr offset_cmdlines; + /* offset in buffer for bootloader name in bytes */ + hwaddr offset_bootloader; + /* offset of modules in bytes */ + hwaddr offset_mods; + /* available slots for mb modules infos */ + int mb_mods_avail; + /* currently used slots of mb modules */ + int mb_mods_count; +} MultibootState; + +const char *bootloader_name = "qemu"; + +static uint32_t mb_add_cmdline(MultibootState *s, const char *cmdline) +{ + hwaddr p = s->offset_cmdlines; + char *b = (char *)s->mb_buf + p; + + memcpy(b, cmdline, strlen(cmdline) + 1); + s->offset_cmdlines += strlen(b) + 1; + return s->mb_buf_phys + p; +} + +static uint32_t mb_add_bootloader(MultibootState *s, const char *bootloader) +{ + hwaddr p = s->offset_bootloader; + char *b = (char *)s->mb_buf + p; + + memcpy(b, bootloader, strlen(bootloader) + 1); + s->offset_bootloader += strlen(b) + 1; + return s->mb_buf_phys + p; +} + +static void mb_add_mod(MultibootState *s, + hwaddr start, hwaddr end, + hwaddr cmdline_phys) +{ + char *p; + assert(s->mb_mods_count < s->mb_mods_avail); + + p = (char *)s->mb_buf + s->offset_mbinfo + MB_MOD_SIZE * s->mb_mods_count; + + stl_p(p + MB_MOD_START, start); + stl_p(p + MB_MOD_END, end); + stl_p(p + MB_MOD_CMDLINE, cmdline_phys); + + mb_debug("mod%02d: "TARGET_FMT_plx" - "TARGET_FMT_plx, + s->mb_mods_count, start, end); + + s->mb_mods_count++; +} + +int load_multiboot(X86MachineState *x86ms, + FWCfgState *fw_cfg, + FILE *f, + const char *kernel_filename, + const char *initrd_filename, + const char *kernel_cmdline, + int kernel_file_size, + uint8_t *header) +{ + bool multiboot_dma_enabled = X86_MACHINE_GET_CLASS(x86ms)->fwcfg_dma_enabled; + int i, is_multiboot = 0; + uint32_t flags = 0; + uint32_t mh_entry_addr; + uint32_t mh_load_addr; + uint32_t mb_kernel_size; + MultibootState mbs; + uint8_t bootinfo[MBI_SIZE]; + uint8_t *mb_bootinfo_data; + uint32_t cmdline_len; + GList *mods = NULL; + + /* Ok, let's see if it is a multiboot image. + The header is 12x32bit long, so the latest entry may be 8192 - 48. 
*/ + for (i = 0; i < (8192 - 48); i += 4) { + if (ldl_p(header+i) == 0x1BADB002) { + uint32_t checksum = ldl_p(header+i+8); + flags = ldl_p(header+i+4); + checksum += flags; + checksum += (uint32_t)0x1BADB002; + if (!checksum) { + is_multiboot = 1; + break; + } + } + } + + if (!is_multiboot) + return 0; /* no multiboot */ + + mb_debug("I believe we found a multiboot image!"); + memset(bootinfo, 0, sizeof(bootinfo)); + memset(&mbs, 0, sizeof(mbs)); + + if (flags & 0x00000004) { /* MULTIBOOT_HEADER_HAS_VBE */ + error_report("multiboot knows VBE. we don't"); + } + if (!(flags & 0x00010000)) { /* MULTIBOOT_HEADER_HAS_ADDR */ + uint64_t elf_entry; + uint64_t elf_low, elf_high; + int kernel_size; + fclose(f); + + if (((struct elf64_hdr*)header)->e_machine == EM_X86_64) { + error_report("Cannot load x86-64 image, give a 32bit one."); + exit(1); + } + + kernel_size = load_elf(kernel_filename, NULL, NULL, NULL, &elf_entry, + &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE, + 0, 0); + if (kernel_size < 0) { + error_report("Error while loading elf kernel"); + exit(1); + } + mh_load_addr = elf_low; + mb_kernel_size = elf_high - elf_low; + mh_entry_addr = elf_entry; + + mbs.mb_buf = g_malloc(mb_kernel_size); + if (rom_copy(mbs.mb_buf, mh_load_addr, mb_kernel_size) != mb_kernel_size) { + error_report("Error while fetching elf kernel from rom"); + exit(1); + } + + mb_debug("loading multiboot-elf kernel " + "(%#x bytes) with entry %#zx", + mb_kernel_size, (size_t)mh_entry_addr); + } else { + /* Valid if mh_flags sets MULTIBOOT_HEADER_HAS_ADDR. */ + uint32_t mh_header_addr = ldl_p(header+i+12); + uint32_t mh_load_end_addr = ldl_p(header+i+20); + uint32_t mh_bss_end_addr = ldl_p(header+i+24); + + mh_load_addr = ldl_p(header+i+16); + if (mh_header_addr < mh_load_addr) { + error_report("invalid load_addr address"); + exit(1); + } + if (mh_header_addr - mh_load_addr > i) { + error_report("invalid header_addr address"); + exit(1); + } + + uint32_t mb_kernel_text_offset = i - (mh_header_addr - mh_load_addr); + uint32_t mb_load_size = 0; + mh_entry_addr = ldl_p(header+i+28); + + if (mh_load_end_addr) { + if (mh_load_end_addr < mh_load_addr) { + error_report("invalid load_end_addr address"); + exit(1); + } + mb_load_size = mh_load_end_addr - mh_load_addr; + } else { + if (kernel_file_size < mb_kernel_text_offset) { + error_report("invalid kernel_file_size"); + exit(1); + } + mb_load_size = kernel_file_size - mb_kernel_text_offset; + } + if (mb_load_size > UINT32_MAX - mh_load_addr) { + error_report("kernel does not fit in address space"); + exit(1); + } + if (mh_bss_end_addr) { + if (mh_bss_end_addr < (mh_load_addr + mb_load_size)) { + error_report("invalid bss_end_addr address"); + exit(1); + } + mb_kernel_size = mh_bss_end_addr - mh_load_addr; + } else { + mb_kernel_size = mb_load_size; + } + + mb_debug("multiboot: header_addr = %#x", mh_header_addr); + mb_debug("multiboot: load_addr = %#x", mh_load_addr); + mb_debug("multiboot: load_end_addr = %#x", mh_load_end_addr); + mb_debug("multiboot: bss_end_addr = %#x", mh_bss_end_addr); + mb_debug("loading multiboot kernel (%#x bytes) at %#x", + mb_load_size, mh_load_addr); + + mbs.mb_buf = g_malloc(mb_kernel_size); + fseek(f, mb_kernel_text_offset, SEEK_SET); + if (fread(mbs.mb_buf, 1, mb_load_size, f) != mb_load_size) { + error_report("fread() failed"); + exit(1); + } + memset(mbs.mb_buf + mb_load_size, 0, mb_kernel_size - mb_load_size); + fclose(f); + } + + mbs.mb_buf_phys = mh_load_addr; + + mbs.mb_buf_size = TARGET_PAGE_ALIGN(mb_kernel_size); + mbs.offset_mbinfo = 
mbs.mb_buf_size; + + /* Calculate space for cmdlines, bootloader name, and mb_mods */ + cmdline_len = strlen(kernel_filename) + 1; + cmdline_len += strlen(kernel_cmdline) + 1; + if (initrd_filename) { + const char *r = initrd_filename; + cmdline_len += strlen(initrd_filename) + 1; + while (*r) { + char *value; + r = get_opt_value(r, &value); + mbs.mb_mods_avail++; + mods = g_list_append(mods, value); + if (*r) { + r++; + } + } + } + + mbs.mb_buf_size += cmdline_len; + mbs.mb_buf_size += MB_MOD_SIZE * mbs.mb_mods_avail; + mbs.mb_buf_size += strlen(bootloader_name) + 1; + + mbs.mb_buf_size = TARGET_PAGE_ALIGN(mbs.mb_buf_size); + + /* enlarge mb_buf to hold cmdlines, bootloader, mb-info structs */ + mbs.mb_buf = g_realloc(mbs.mb_buf, mbs.mb_buf_size); + mbs.offset_cmdlines = mbs.offset_mbinfo + mbs.mb_mods_avail * MB_MOD_SIZE; + mbs.offset_bootloader = mbs.offset_cmdlines + cmdline_len; + + if (mods) { + GList *tmpl = mods; + mbs.offset_mods = mbs.mb_buf_size; + + while (tmpl) { + char *next_space; + int mb_mod_length; + uint32_t offs = mbs.mb_buf_size; + char *one_file = tmpl->data; + + /* if a space comes after the module filename, treat everything + after that as parameters */ + hwaddr c = mb_add_cmdline(&mbs, one_file); + next_space = strchr(one_file, ' '); + if (next_space) { + *next_space = '\0'; + } + mb_debug("multiboot loading module: %s", one_file); + mb_mod_length = get_image_size(one_file); + if (mb_mod_length < 0) { + error_report("Failed to open file '%s'", one_file); + exit(1); + } + + mbs.mb_buf_size = TARGET_PAGE_ALIGN(mb_mod_length + mbs.mb_buf_size); + mbs.mb_buf = g_realloc(mbs.mb_buf, mbs.mb_buf_size); + + if (load_image_size(one_file, (unsigned char *)mbs.mb_buf + offs, + mbs.mb_buf_size - offs) < 0) { + error_report("Error loading file '%s'", one_file); + exit(1); + } + mb_add_mod(&mbs, mbs.mb_buf_phys + offs, + mbs.mb_buf_phys + offs + mb_mod_length, c); + + mb_debug("mod_start: %p\nmod_end: %p\n cmdline: "TARGET_FMT_plx, + (char *)mbs.mb_buf + offs, + (char *)mbs.mb_buf + offs + mb_mod_length, c); + g_free(one_file); + tmpl = tmpl->next; + } + g_list_free(mods); + } + + /* Commandline support */ + char kcmdline[strlen(kernel_filename) + strlen(kernel_cmdline) + 2]; + snprintf(kcmdline, sizeof(kcmdline), "%s %s", + kernel_filename, kernel_cmdline); + stl_p(bootinfo + MBI_CMDLINE, mb_add_cmdline(&mbs, kcmdline)); + + stl_p(bootinfo + MBI_BOOTLOADER, mb_add_bootloader(&mbs, bootloader_name)); + + stl_p(bootinfo + MBI_MODS_ADDR, mbs.mb_buf_phys + mbs.offset_mbinfo); + stl_p(bootinfo + MBI_MODS_COUNT, mbs.mb_mods_count); /* mods_count */ + + /* the kernel is where we want it to be now */ + stl_p(bootinfo + MBI_FLAGS, MULTIBOOT_FLAGS_MEMORY + | MULTIBOOT_FLAGS_BOOT_DEVICE + | MULTIBOOT_FLAGS_CMDLINE + | MULTIBOOT_FLAGS_MODULES + | MULTIBOOT_FLAGS_MMAP + | MULTIBOOT_FLAGS_BOOTLOADER); + stl_p(bootinfo + MBI_BOOT_DEVICE, 0x8000ffff); /* XXX: use the -boot switch? 
*/ + stl_p(bootinfo + MBI_MMAP_ADDR, ADDR_E820_MAP); + + mb_debug("multiboot: entry_addr = %#x", mh_entry_addr); + mb_debug(" mb_buf_phys = "TARGET_FMT_plx, mbs.mb_buf_phys); + mb_debug(" mod_start = "TARGET_FMT_plx, + mbs.mb_buf_phys + mbs.offset_mods); + mb_debug(" mb_mods_count = %d", mbs.mb_mods_count); + + /* save bootinfo off the stack */ + mb_bootinfo_data = g_memdup(bootinfo, sizeof(bootinfo)); + + /* Pass variables to option rom */ + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, mh_entry_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, mbs.mb_buf_size); + fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, + mbs.mb_buf, mbs.mb_buf_size); + + fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, ADDR_MBI); + fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, sizeof(bootinfo)); + fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, mb_bootinfo_data, + sizeof(bootinfo)); + + if (multiboot_dma_enabled) { + option_rom[nb_option_roms].name = "multiboot_dma.bin"; + } else { + option_rom[nb_option_roms].name = "multiboot.bin"; + } + option_rom[nb_option_roms].bootindex = 0; + nb_option_roms++; + + return 1; /* yes, we are multiboot */ +} diff --git a/hw/i386/multiboot.h b/hw/i386/multiboot.h new file mode 100644 index 000000000..2b9182a8e --- /dev/null +++ b/hw/i386/multiboot.h @@ -0,0 +1,16 @@ +#ifndef QEMU_MULTIBOOT_H +#define QEMU_MULTIBOOT_H + +#include "hw/nvram/fw_cfg.h" +#include "hw/i386/x86.h" + +int load_multiboot(X86MachineState *x86ms, + FWCfgState *fw_cfg, + FILE *f, + const char *kernel_filename, + const char *initrd_filename, + const char *kernel_cmdline, + int kernel_file_size, + uint8_t *header); + +#endif diff --git a/hw/i386/pc.c b/hw/i386/pc.c new file mode 100644 index 000000000..a2ef40ecb --- /dev/null +++ b/hw/i386/pc.c @@ -0,0 +1,1777 @@ +/* + * QEMU PC System Emulator + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "hw/i386/x86.h" +#include "hw/i386/pc.h" +#include "hw/char/serial.h" +#include "hw/char/parallel.h" +#include "hw/i386/apic.h" +#include "hw/i386/topology.h" +#include "hw/i386/fw_cfg.h" +#include "hw/i386/vmport.h" +#include "sysemu/cpus.h" +#include "hw/block/fdc.h" +#include "hw/ide.h" +#include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" +#include "hw/nvram/fw_cfg.h" +#include "hw/timer/hpet.h" +#include "hw/firmware/smbios.h" +#include "hw/loader.h" +#include "elf.h" +#include "migration/vmstate.h" +#include "multiboot.h" +#include "hw/rtc/mc146818rtc.h" +#include "hw/intc/i8259.h" +#include "hw/dma/i8257.h" +#include "hw/timer/i8254.h" +#include "hw/input/i8042.h" +#include "hw/irq.h" +#include "hw/audio/pcspk.h" +#include "hw/pci/msi.h" +#include "hw/sysbus.h" +#include "sysemu/sysemu.h" +#include "sysemu/tcg.h" +#include "sysemu/numa.h" +#include "sysemu/kvm.h" +#include "sysemu/xen.h" +#include "sysemu/reset.h" +#include "sysemu/runstate.h" +#include "kvm/kvm_i386.h" +#include "hw/xen/xen.h" +#include "hw/xen/start_info.h" +#include "ui/qemu-spice.h" +#include "exec/memory.h" +#include "qemu/bitmap.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "qemu/option.h" +#include "qemu/cutils.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/cpu_hotplug.h" +#include "acpi-build.h" +#include "hw/mem/pc-dimm.h" +#include "hw/mem/nvdimm.h" +#include "qapi/error.h" +#include "qapi/qapi-visit-common.h" +#include "qapi/visitor.h" +#include "hw/core/cpu.h" +#include "hw/usb.h" +#include "hw/i386/intel_iommu.h" +#include "hw/net/ne2000-isa.h" +#include "standard-headers/asm-x86/bootparam.h" +#include "hw/virtio/virtio-iommu.h" +#include "hw/virtio/virtio-pmem-pci.h" +#include "hw/virtio/virtio-mem-pci.h" +#include "hw/mem/memory-device.h" +#include "sysemu/replay.h" +#include "qapi/qmp/qerror.h" +#include "e820_memory_layout.h" +#include "fw_cfg.h" +#include "trace.h" +#include CONFIG_DEVICES + +GlobalProperty pc_compat_6_1[] = { + { TYPE_X86_CPU, "hv-version-id-build", "0x1bbc" }, + { TYPE_X86_CPU, "hv-version-id-major", "0x0006" }, + { TYPE_X86_CPU, "hv-version-id-minor", "0x0001" }, + { "ICH9-LPC", "x-keep-pci-slot-hpc", "false" }, +}; +const size_t pc_compat_6_1_len = G_N_ELEMENTS(pc_compat_6_1); + +GlobalProperty pc_compat_6_0[] = { + { "qemu64" "-" TYPE_X86_CPU, "family", "6" }, + { "qemu64" "-" TYPE_X86_CPU, "model", "6" }, + { "qemu64" "-" TYPE_X86_CPU, "stepping", "3" }, + { TYPE_X86_CPU, "x-vendor-cpuid-only", "off" }, + { "ICH9-LPC", ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, "off" }, + { "ICH9-LPC", "x-keep-pci-slot-hpc", "true" }, +}; +const size_t pc_compat_6_0_len = G_N_ELEMENTS(pc_compat_6_0); + +GlobalProperty pc_compat_5_2[] = { + { "ICH9-LPC", "x-smi-cpu-hotunplug", "off" }, +}; +const size_t pc_compat_5_2_len = G_N_ELEMENTS(pc_compat_5_2); + +GlobalProperty pc_compat_5_1[] = { + { "ICH9-LPC", "x-smi-cpu-hotplug", "off" }, + { TYPE_X86_CPU, "kvm-msi-ext-dest-id", "off" }, +}; +const size_t pc_compat_5_1_len = G_N_ELEMENTS(pc_compat_5_1); + +GlobalProperty pc_compat_5_0[] = { +}; +const size_t pc_compat_5_0_len = G_N_ELEMENTS(pc_compat_5_0); + +GlobalProperty pc_compat_4_2[] = { + { "mch", "smbase-smram", "off" }, +}; +const size_t pc_compat_4_2_len = G_N_ELEMENTS(pc_compat_4_2); + +GlobalProperty pc_compat_4_1[] = {}; +const size_t pc_compat_4_1_len = G_N_ELEMENTS(pc_compat_4_1); + +GlobalProperty pc_compat_4_0[] = {}; +const size_t pc_compat_4_0_len = G_N_ELEMENTS(pc_compat_4_0); + 
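The pc_compat_* tables above and below are plain data: each GlobalProperty entry pins one QOM property of the named device type to the value an older machine version used to expose. Nothing in this file applies them; that is left to the versioned machine types. As a rough sketch of the expected consumer (the callback name here is made up for illustration, and the real registrations live in the *_machine_options() functions of pc_piix.c/pc_q35.c, which also layer the generic hw_compat_* lists from hw/core/machine.c), a 6.0-era machine would register the 6.0 table roughly like this:

    /* Illustrative only -- assumes hw/boards.h for compat_props_add() and the
     * hw_compat_6_0/pc_compat_6_0 tables declared elsewhere. */
    static void pc_example_6_0_machine_options(MachineClass *m)
    {
        /* inherit the newer machine's options first (call not shown), then
         * pin the 6.0 compatibility behaviour on top of it */
        compat_props_add(m->compat_props, hw_compat_6_0, hw_compat_6_0_len);
        compat_props_add(m->compat_props, pc_compat_6_0, pc_compat_6_0_len);
    }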
+GlobalProperty pc_compat_3_1[] = { + { "intel-iommu", "dma-drain", "off" }, + { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "off" }, + { "Opteron_G4" "-" TYPE_X86_CPU, "rdtscp", "off" }, + { "Opteron_G4" "-" TYPE_X86_CPU, "npt", "off" }, + { "Opteron_G4" "-" TYPE_X86_CPU, "nrip-save", "off" }, + { "Opteron_G5" "-" TYPE_X86_CPU, "rdtscp", "off" }, + { "Opteron_G5" "-" TYPE_X86_CPU, "npt", "off" }, + { "Opteron_G5" "-" TYPE_X86_CPU, "nrip-save", "off" }, + { "EPYC" "-" TYPE_X86_CPU, "npt", "off" }, + { "EPYC" "-" TYPE_X86_CPU, "nrip-save", "off" }, + { "EPYC-IBPB" "-" TYPE_X86_CPU, "npt", "off" }, + { "EPYC-IBPB" "-" TYPE_X86_CPU, "nrip-save", "off" }, + { "Skylake-Client" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Skylake-Client-IBRS" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Skylake-Server" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Skylake-Server-IBRS" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Cascadelake-Server" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Icelake-Client" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Icelake-Server" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Cascadelake-Server" "-" TYPE_X86_CPU, "stepping", "5" }, + { TYPE_X86_CPU, "x-intel-pt-auto-level", "off" }, +}; +const size_t pc_compat_3_1_len = G_N_ELEMENTS(pc_compat_3_1); + +GlobalProperty pc_compat_3_0[] = { + { TYPE_X86_CPU, "x-hv-synic-kvm-only", "on" }, + { "Skylake-Server" "-" TYPE_X86_CPU, "pku", "off" }, + { "Skylake-Server-IBRS" "-" TYPE_X86_CPU, "pku", "off" }, +}; +const size_t pc_compat_3_0_len = G_N_ELEMENTS(pc_compat_3_0); + +GlobalProperty pc_compat_2_12[] = { + { TYPE_X86_CPU, "legacy-cache", "on" }, + { TYPE_X86_CPU, "topoext", "off" }, + { "EPYC-" TYPE_X86_CPU, "xlevel", "0x8000000a" }, + { "EPYC-IBPB-" TYPE_X86_CPU, "xlevel", "0x8000000a" }, +}; +const size_t pc_compat_2_12_len = G_N_ELEMENTS(pc_compat_2_12); + +GlobalProperty pc_compat_2_11[] = { + { TYPE_X86_CPU, "x-migrate-smi-count", "off" }, + { "Skylake-Server" "-" TYPE_X86_CPU, "clflushopt", "off" }, +}; +const size_t pc_compat_2_11_len = G_N_ELEMENTS(pc_compat_2_11); + +GlobalProperty pc_compat_2_10[] = { + { TYPE_X86_CPU, "x-hv-max-vps", "0x40" }, + { "i440FX-pcihost", "x-pci-hole64-fix", "off" }, + { "q35-pcihost", "x-pci-hole64-fix", "off" }, +}; +const size_t pc_compat_2_10_len = G_N_ELEMENTS(pc_compat_2_10); + +GlobalProperty pc_compat_2_9[] = { + { "mch", "extended-tseg-mbytes", "0" }, +}; +const size_t pc_compat_2_9_len = G_N_ELEMENTS(pc_compat_2_9); + +GlobalProperty pc_compat_2_8[] = { + { TYPE_X86_CPU, "tcg-cpuid", "off" }, + { "kvmclock", "x-mach-use-reliable-get-clock", "off" }, + { "ICH9-LPC", "x-smi-broadcast", "off" }, + { TYPE_X86_CPU, "vmware-cpuid-freq", "off" }, + { "Haswell-" TYPE_X86_CPU, "stepping", "1" }, +}; +const size_t pc_compat_2_8_len = G_N_ELEMENTS(pc_compat_2_8); + +GlobalProperty pc_compat_2_7[] = { + { TYPE_X86_CPU, "l3-cache", "off" }, + { TYPE_X86_CPU, "full-cpuid-auto-level", "off" }, + { "Opteron_G3" "-" TYPE_X86_CPU, "family", "15" }, + { "Opteron_G3" "-" TYPE_X86_CPU, "model", "6" }, + { "Opteron_G3" "-" TYPE_X86_CPU, "stepping", "1" }, + { "isa-pcspk", "migrate", "off" }, +}; +const size_t pc_compat_2_7_len = G_N_ELEMENTS(pc_compat_2_7); + +GlobalProperty pc_compat_2_6[] = { + { TYPE_X86_CPU, "cpuid-0xb", "off" }, + { "vmxnet3", "romfile", "" }, + { TYPE_X86_CPU, "fill-mtrr-mask", "off" }, + { "apic-common", "legacy-instance-id", "on", } +}; +const size_t pc_compat_2_6_len = G_N_ELEMENTS(pc_compat_2_6); + +GlobalProperty pc_compat_2_5[] = {}; +const size_t pc_compat_2_5_len = G_N_ELEMENTS(pc_compat_2_5); + +GlobalProperty 
pc_compat_2_4[] = { + PC_CPU_MODEL_IDS("2.4.0") + { "Haswell-" TYPE_X86_CPU, "abm", "off" }, + { "Haswell-noTSX-" TYPE_X86_CPU, "abm", "off" }, + { "Broadwell-" TYPE_X86_CPU, "abm", "off" }, + { "Broadwell-noTSX-" TYPE_X86_CPU, "abm", "off" }, + { "host" "-" TYPE_X86_CPU, "host-cache-info", "on" }, + { TYPE_X86_CPU, "check", "off" }, + { "qemu64" "-" TYPE_X86_CPU, "sse4a", "on" }, + { "qemu64" "-" TYPE_X86_CPU, "abm", "on" }, + { "qemu64" "-" TYPE_X86_CPU, "popcnt", "on" }, + { "qemu32" "-" TYPE_X86_CPU, "popcnt", "on" }, + { "Opteron_G2" "-" TYPE_X86_CPU, "rdtscp", "on" }, + { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "on" }, + { "Opteron_G4" "-" TYPE_X86_CPU, "rdtscp", "on" }, + { "Opteron_G5" "-" TYPE_X86_CPU, "rdtscp", "on", } +}; +const size_t pc_compat_2_4_len = G_N_ELEMENTS(pc_compat_2_4); + +GlobalProperty pc_compat_2_3[] = { + PC_CPU_MODEL_IDS("2.3.0") + { TYPE_X86_CPU, "arat", "off" }, + { "qemu64" "-" TYPE_X86_CPU, "min-level", "4" }, + { "kvm64" "-" TYPE_X86_CPU, "min-level", "5" }, + { "pentium3" "-" TYPE_X86_CPU, "min-level", "2" }, + { "n270" "-" TYPE_X86_CPU, "min-level", "5" }, + { "Conroe" "-" TYPE_X86_CPU, "min-level", "4" }, + { "Penryn" "-" TYPE_X86_CPU, "min-level", "4" }, + { "Nehalem" "-" TYPE_X86_CPU, "min-level", "4" }, + { "n270" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Penryn" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Conroe" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Nehalem" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Westmere" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "SandyBridge" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "IvyBridge" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Haswell" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Haswell-noTSX" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Broadwell" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Broadwell-noTSX" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { TYPE_X86_CPU, "kvm-no-smi-migration", "on" }, +}; +const size_t pc_compat_2_3_len = G_N_ELEMENTS(pc_compat_2_3); + +GlobalProperty pc_compat_2_2[] = { + PC_CPU_MODEL_IDS("2.2.0") + { "kvm64" "-" TYPE_X86_CPU, "vme", "off" }, + { "kvm32" "-" TYPE_X86_CPU, "vme", "off" }, + { "Conroe" "-" TYPE_X86_CPU, "vme", "off" }, + { "Penryn" "-" TYPE_X86_CPU, "vme", "off" }, + { "Nehalem" "-" TYPE_X86_CPU, "vme", "off" }, + { "Westmere" "-" TYPE_X86_CPU, "vme", "off" }, + { "SandyBridge" "-" TYPE_X86_CPU, "vme", "off" }, + { "Haswell" "-" TYPE_X86_CPU, "vme", "off" }, + { "Broadwell" "-" TYPE_X86_CPU, "vme", "off" }, + { "Opteron_G1" "-" TYPE_X86_CPU, "vme", "off" }, + { "Opteron_G2" "-" TYPE_X86_CPU, "vme", "off" }, + { "Opteron_G3" "-" TYPE_X86_CPU, "vme", "off" }, + { "Opteron_G4" "-" TYPE_X86_CPU, "vme", "off" }, + { "Opteron_G5" "-" TYPE_X86_CPU, "vme", "off" }, + { "Haswell" "-" TYPE_X86_CPU, "f16c", "off" }, + { "Haswell" "-" TYPE_X86_CPU, "rdrand", "off" }, + { "Broadwell" "-" TYPE_X86_CPU, "f16c", "off" }, + { "Broadwell" "-" TYPE_X86_CPU, "rdrand", "off" }, +}; +const size_t pc_compat_2_2_len = G_N_ELEMENTS(pc_compat_2_2); + +GlobalProperty pc_compat_2_1[] = { + PC_CPU_MODEL_IDS("2.1.0") + { "coreduo" "-" TYPE_X86_CPU, "vmx", "on" }, + { "core2duo" "-" TYPE_X86_CPU, "vmx", "on" }, +}; +const size_t pc_compat_2_1_len = G_N_ELEMENTS(pc_compat_2_1); + +GlobalProperty pc_compat_2_0[] = { + PC_CPU_MODEL_IDS("2.0.0") + { "virtio-scsi-pci", "any_layout", "off" }, + { "PIIX4_PM", "memory-hotplug-support", "off" }, + { "apic", "version", "0x11" }, + { "nec-usb-xhci", 
"superspeed-ports-first", "off" }, + { "nec-usb-xhci", "force-pcie-endcap", "on" }, + { "pci-serial", "prog_if", "0" }, + { "pci-serial-2x", "prog_if", "0" }, + { "pci-serial-4x", "prog_if", "0" }, + { "virtio-net-pci", "guest_announce", "off" }, + { "ICH9-LPC", "memory-hotplug-support", "off" }, + { "xio3130-downstream", COMPAT_PROP_PCP, "off" }, + { "ioh3420", COMPAT_PROP_PCP, "off" }, +}; +const size_t pc_compat_2_0_len = G_N_ELEMENTS(pc_compat_2_0); + +GlobalProperty pc_compat_1_7[] = { + PC_CPU_MODEL_IDS("1.7.0") + { TYPE_USB_DEVICE, "msos-desc", "no" }, + { "PIIX4_PM", ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, "off" }, + { "hpet", HPET_INTCAP, "4" }, +}; +const size_t pc_compat_1_7_len = G_N_ELEMENTS(pc_compat_1_7); + +GlobalProperty pc_compat_1_6[] = { + PC_CPU_MODEL_IDS("1.6.0") + { "e1000", "mitigation", "off" }, + { "qemu64-" TYPE_X86_CPU, "model", "2" }, + { "qemu32-" TYPE_X86_CPU, "model", "3" }, + { "i440FX-pcihost", "short_root_bus", "1" }, + { "q35-pcihost", "short_root_bus", "1" }, +}; +const size_t pc_compat_1_6_len = G_N_ELEMENTS(pc_compat_1_6); + +GlobalProperty pc_compat_1_5[] = { + PC_CPU_MODEL_IDS("1.5.0") + { "Conroe-" TYPE_X86_CPU, "model", "2" }, + { "Conroe-" TYPE_X86_CPU, "min-level", "2" }, + { "Penryn-" TYPE_X86_CPU, "model", "2" }, + { "Penryn-" TYPE_X86_CPU, "min-level", "2" }, + { "Nehalem-" TYPE_X86_CPU, "model", "2" }, + { "Nehalem-" TYPE_X86_CPU, "min-level", "2" }, + { "virtio-net-pci", "any_layout", "off" }, + { TYPE_X86_CPU, "pmu", "on" }, + { "i440FX-pcihost", "short_root_bus", "0" }, + { "q35-pcihost", "short_root_bus", "0" }, +}; +const size_t pc_compat_1_5_len = G_N_ELEMENTS(pc_compat_1_5); + +GlobalProperty pc_compat_1_4[] = { + PC_CPU_MODEL_IDS("1.4.0") + { "scsi-hd", "discard_granularity", "0" }, + { "scsi-cd", "discard_granularity", "0" }, + { "ide-hd", "discard_granularity", "0" }, + { "ide-cd", "discard_granularity", "0" }, + { "virtio-blk-pci", "discard_granularity", "0" }, + /* DEV_NVECTORS_UNSPECIFIED as a uint32_t string: */ + { "virtio-serial-pci", "vectors", "0xFFFFFFFF" }, + { "virtio-net-pci", "ctrl_guest_offloads", "off" }, + { "e1000", "romfile", "pxe-e1000.rom" }, + { "ne2k_pci", "romfile", "pxe-ne2k_pci.rom" }, + { "pcnet", "romfile", "pxe-pcnet.rom" }, + { "rtl8139", "romfile", "pxe-rtl8139.rom" }, + { "virtio-net-pci", "romfile", "pxe-virtio.rom" }, + { "486-" TYPE_X86_CPU, "model", "0" }, + { "n270" "-" TYPE_X86_CPU, "movbe", "off" }, + { "Westmere" "-" TYPE_X86_CPU, "pclmulqdq", "off" }, +}; +const size_t pc_compat_1_4_len = G_N_ELEMENTS(pc_compat_1_4); + +GSIState *pc_gsi_create(qemu_irq **irqs, bool pci_enabled) +{ + GSIState *s; + + s = g_new0(GSIState, 1); + if (kvm_ioapic_in_kernel()) { + kvm_pc_setup_irq_routing(pci_enabled); + } + *irqs = qemu_allocate_irqs(gsi_handler, s, GSI_NUM_PINS); + + return s; +} + +static void ioport80_write(void *opaque, hwaddr addr, uint64_t data, + unsigned size) +{ +} + +static uint64_t ioport80_read(void *opaque, hwaddr addr, unsigned size) +{ + return 0xffffffffffffffffULL; +} + +/* MSDOS compatibility mode FPU exception support */ +static void ioportF0_write(void *opaque, hwaddr addr, uint64_t data, + unsigned size) +{ + if (tcg_enabled()) { + cpu_set_ignne(); + } +} + +static uint64_t ioportF0_read(void *opaque, hwaddr addr, unsigned size) +{ + return 0xffffffffffffffffULL; +} + +/* PC cmos mappings */ + +#define REG_EQUIPMENT_BYTE 0x14 + +static void cmos_init_hd(ISADevice *s, int type_ofs, int info_ofs, + int16_t cylinders, int8_t heads, int8_t sectors) +{ + rtc_set_memory(s, type_ofs, 47); + 
rtc_set_memory(s, info_ofs, cylinders); + rtc_set_memory(s, info_ofs + 1, cylinders >> 8); + rtc_set_memory(s, info_ofs + 2, heads); + rtc_set_memory(s, info_ofs + 3, 0xff); + rtc_set_memory(s, info_ofs + 4, 0xff); + rtc_set_memory(s, info_ofs + 5, 0xc0 | ((heads > 8) << 3)); + rtc_set_memory(s, info_ofs + 6, cylinders); + rtc_set_memory(s, info_ofs + 7, cylinders >> 8); + rtc_set_memory(s, info_ofs + 8, sectors); +} + +/* convert boot_device letter to something recognizable by the bios */ +static int boot_device2nibble(char boot_device) +{ + switch(boot_device) { + case 'a': + case 'b': + return 0x01; /* floppy boot */ + case 'c': + return 0x02; /* hard drive boot */ + case 'd': + return 0x03; /* CD-ROM boot */ + case 'n': + return 0x04; /* Network boot */ + } + return 0; +} + +static void set_boot_dev(ISADevice *s, const char *boot_device, Error **errp) +{ +#define PC_MAX_BOOT_DEVICES 3 + int nbds, bds[3] = { 0, }; + int i; + + nbds = strlen(boot_device); + if (nbds > PC_MAX_BOOT_DEVICES) { + error_setg(errp, "Too many boot devices for PC"); + return; + } + for (i = 0; i < nbds; i++) { + bds[i] = boot_device2nibble(boot_device[i]); + if (bds[i] == 0) { + error_setg(errp, "Invalid boot device for PC: '%c'", + boot_device[i]); + return; + } + } + rtc_set_memory(s, 0x3d, (bds[1] << 4) | bds[0]); + rtc_set_memory(s, 0x38, (bds[2] << 4) | (fd_bootchk ? 0x0 : 0x1)); +} + +static void pc_boot_set(void *opaque, const char *boot_device, Error **errp) +{ + set_boot_dev(opaque, boot_device, errp); +} + +static void pc_cmos_init_floppy(ISADevice *rtc_state, ISADevice *floppy) +{ + int val, nb, i; + FloppyDriveType fd_type[2] = { FLOPPY_DRIVE_TYPE_NONE, + FLOPPY_DRIVE_TYPE_NONE }; + + /* floppy type */ + if (floppy) { + for (i = 0; i < 2; i++) { + fd_type[i] = isa_fdc_get_drive_type(floppy, i); + } + } + val = (cmos_get_fd_drive_type(fd_type[0]) << 4) | + cmos_get_fd_drive_type(fd_type[1]); + rtc_set_memory(rtc_state, 0x10, val); + + val = rtc_get_memory(rtc_state, REG_EQUIPMENT_BYTE); + nb = 0; + if (fd_type[0] != FLOPPY_DRIVE_TYPE_NONE) { + nb++; + } + if (fd_type[1] != FLOPPY_DRIVE_TYPE_NONE) { + nb++; + } + switch (nb) { + case 0: + break; + case 1: + val |= 0x01; /* 1 drive, ready for boot */ + break; + case 2: + val |= 0x41; /* 2 drives, ready for boot */ + break; + } + rtc_set_memory(rtc_state, REG_EQUIPMENT_BYTE, val); +} + +typedef struct pc_cmos_init_late_arg { + ISADevice *rtc_state; + BusState *idebus[2]; +} pc_cmos_init_late_arg; + +typedef struct check_fdc_state { + ISADevice *floppy; + bool multiple; +} CheckFdcState; + +static int check_fdc(Object *obj, void *opaque) +{ + CheckFdcState *state = opaque; + Object *fdc; + uint32_t iobase; + Error *local_err = NULL; + + fdc = object_dynamic_cast(obj, TYPE_ISA_FDC); + if (!fdc) { + return 0; + } + + iobase = object_property_get_uint(obj, "iobase", &local_err); + if (local_err || iobase != 0x3f0) { + error_free(local_err); + return 0; + } + + if (state->floppy) { + state->multiple = true; + } else { + state->floppy = ISA_DEVICE(obj); + } + return 0; +} + +static const char * const fdc_container_path[] = { + "/unattached", "/peripheral", "/peripheral-anon" +}; + +/* + * Locate the FDC at IO address 0x3f0, in order to configure the CMOS registers + * and ACPI objects. 
+ */ +ISADevice *pc_find_fdc0(void) +{ + int i; + Object *container; + CheckFdcState state = { 0 }; + + for (i = 0; i < ARRAY_SIZE(fdc_container_path); i++) { + container = container_get(qdev_get_machine(), fdc_container_path[i]); + object_child_foreach(container, check_fdc, &state); + } + + if (state.multiple) { + warn_report("multiple floppy disk controllers with " + "iobase=0x3f0 have been found"); + error_printf("the one being picked for CMOS setup might not reflect " + "your intent"); + } + + return state.floppy; +} + +static void pc_cmos_init_late(void *opaque) +{ + pc_cmos_init_late_arg *arg = opaque; + ISADevice *s = arg->rtc_state; + int16_t cylinders; + int8_t heads, sectors; + int val; + int i, trans; + + val = 0; + if (arg->idebus[0] && ide_get_geometry(arg->idebus[0], 0, + &cylinders, &heads, &sectors) >= 0) { + cmos_init_hd(s, 0x19, 0x1b, cylinders, heads, sectors); + val |= 0xf0; + } + if (arg->idebus[0] && ide_get_geometry(arg->idebus[0], 1, + &cylinders, &heads, &sectors) >= 0) { + cmos_init_hd(s, 0x1a, 0x24, cylinders, heads, sectors); + val |= 0x0f; + } + rtc_set_memory(s, 0x12, val); + + val = 0; + for (i = 0; i < 4; i++) { + /* NOTE: ide_get_geometry() returns the physical + geometry. It is always such that: 1 <= sects <= 63, 1 + <= heads <= 16, 1 <= cylinders <= 16383. The BIOS + geometry can be different if a translation is done. */ + if (arg->idebus[i / 2] && + ide_get_geometry(arg->idebus[i / 2], i % 2, + &cylinders, &heads, &sectors) >= 0) { + trans = ide_get_bios_chs_trans(arg->idebus[i / 2], i % 2) - 1; + assert((trans & ~3) == 0); + val |= trans << (i * 2); + } + } + rtc_set_memory(s, 0x39, val); + + pc_cmos_init_floppy(s, pc_find_fdc0()); + + qemu_unregister_reset(pc_cmos_init_late, opaque); +} + +void pc_cmos_init(PCMachineState *pcms, + BusState *idebus0, BusState *idebus1, + ISADevice *s) +{ + int val; + static pc_cmos_init_late_arg arg; + X86MachineState *x86ms = X86_MACHINE(pcms); + + /* various important CMOS locations needed by PC/Bochs bios */ + + /* memory size */ + /* base memory (first MiB) */ + val = MIN(x86ms->below_4g_mem_size / KiB, 640); + rtc_set_memory(s, 0x15, val); + rtc_set_memory(s, 0x16, val >> 8); + /* extended memory (next 64MiB) */ + if (x86ms->below_4g_mem_size > 1 * MiB) { + val = (x86ms->below_4g_mem_size - 1 * MiB) / KiB; + } else { + val = 0; + } + if (val > 65535) + val = 65535; + rtc_set_memory(s, 0x17, val); + rtc_set_memory(s, 0x18, val >> 8); + rtc_set_memory(s, 0x30, val); + rtc_set_memory(s, 0x31, val >> 8); + /* memory between 16MiB and 4GiB */ + if (x86ms->below_4g_mem_size > 16 * MiB) { + val = (x86ms->below_4g_mem_size - 16 * MiB) / (64 * KiB); + } else { + val = 0; + } + if (val > 65535) + val = 65535; + rtc_set_memory(s, 0x34, val); + rtc_set_memory(s, 0x35, val >> 8); + /* memory above 4GiB */ + val = x86ms->above_4g_mem_size / 65536; + rtc_set_memory(s, 0x5b, val); + rtc_set_memory(s, 0x5c, val >> 8); + rtc_set_memory(s, 0x5d, val >> 16); + + object_property_add_link(OBJECT(pcms), "rtc_state", + TYPE_ISA_DEVICE, + (Object **)&x86ms->rtc, + object_property_allow_set_link, + OBJ_PROP_LINK_STRONG); + object_property_set_link(OBJECT(pcms), "rtc_state", OBJECT(s), + &error_abort); + + set_boot_dev(s, MACHINE(pcms)->boot_order, &error_fatal); + + val = 0; + val |= 0x02; /* FPU is there */ + val |= 0x04; /* PS/2 mouse installed */ + rtc_set_memory(s, REG_EQUIPMENT_BYTE, val); + + /* hard drives and FDC */ + arg.rtc_state = s; + arg.idebus[0] = idebus0; + arg.idebus[1] = idebus1; + qemu_register_reset(pc_cmos_init_late, &arg); +} + 
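As a worked example of the CMOS layout pc_cmos_init() writes: with 3 GiB of RAM below 4G and 2 GiB above it, offsets 0x15/0x16 read back 640 (base memory in KiB), 0x17/0x18 and 0x30/0x31 saturate at 0xffff (extended memory in KiB, capped at 65535), 0x34/0x35 hold (3 GiB - 16 MiB) / 64 KiB = 0xbf00, and 0x5b..0x5d hold 2 GiB / 64 KiB = 0x008000. The helper below is only an illustrative read-back sketch, not part of this patch; it reuses rtc_get_memory() (already used in this file) to reassemble the above-4G size the way firmware would:

    /* Illustrative only: decode the above-4G RAM size from CMOS 0x5b..0x5d,
     * which pc_cmos_init() stores in 64 KiB chunks, least significant first. */
    static uint64_t cmos_read_mem_above_4g(ISADevice *rtc)
    {
        uint64_t chunks = rtc_get_memory(rtc, 0x5b)
                          | (rtc_get_memory(rtc, 0x5c) << 8)
                          | ((uint64_t)rtc_get_memory(rtc, 0x5d) << 16);
        return chunks * 64 * KiB;   /* e.g. 0x008000 chunks -> 2 GiB */
    }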
+static void handle_a20_line_change(void *opaque, int irq, int level) +{ + X86CPU *cpu = opaque; + + /* XXX: send to all CPUs ? */ + /* XXX: add logic to handle multiple A20 line sources */ + x86_cpu_set_a20(cpu, level); +} + +#define NE2000_NB_MAX 6 + +static const int ne2000_io[NE2000_NB_MAX] = { 0x300, 0x320, 0x340, 0x360, + 0x280, 0x380 }; +static const int ne2000_irq[NE2000_NB_MAX] = { 9, 10, 11, 3, 4, 5 }; + +void pc_init_ne2k_isa(ISABus *bus, NICInfo *nd) +{ + static int nb_ne2k = 0; + + if (nb_ne2k == NE2000_NB_MAX) + return; + isa_ne2000_init(bus, ne2000_io[nb_ne2k], + ne2000_irq[nb_ne2k], nd); + nb_ne2k++; +} + +void pc_acpi_smi_interrupt(void *opaque, int irq, int level) +{ + X86CPU *cpu = opaque; + + if (level) { + cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI); + } +} + +static +void pc_machine_done(Notifier *notifier, void *data) +{ + PCMachineState *pcms = container_of(notifier, + PCMachineState, machine_done); + X86MachineState *x86ms = X86_MACHINE(pcms); + + /* set the number of CPUs */ + x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus); + + fw_cfg_add_extra_pci_roots(pcms->bus, x86ms->fw_cfg); + + acpi_setup(); + if (x86ms->fw_cfg) { + fw_cfg_build_smbios(MACHINE(pcms), x86ms->fw_cfg); + fw_cfg_build_feature_control(MACHINE(pcms), x86ms->fw_cfg); + /* update FW_CFG_NB_CPUS to account for -device added CPUs */ + fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus); + } + + + if (x86ms->apic_id_limit > 255 && !xen_enabled() && + !kvm_irqchip_in_kernel()) { + error_report("current -smp configuration requires kernel " + "irqchip support."); + exit(EXIT_FAILURE); + } +} + +void pc_guest_info_init(PCMachineState *pcms) +{ + X86MachineState *x86ms = X86_MACHINE(pcms); + + x86ms->apic_xrupt_override = true; + pcms->machine_done.notify = pc_machine_done; + qemu_add_machine_init_done_notifier(&pcms->machine_done); +} + +/* setup pci memory address space mapping into system address space */ +void pc_pci_as_mapping_init(Object *owner, MemoryRegion *system_memory, + MemoryRegion *pci_address_space) +{ + /* Set to lower priority than RAM */ + memory_region_add_subregion_overlap(system_memory, 0x0, + pci_address_space, -1); +} + +void xen_load_linux(PCMachineState *pcms) +{ + int i; + FWCfgState *fw_cfg; + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(pcms); + + assert(MACHINE(pcms)->kernel_filename != NULL); + + fw_cfg = fw_cfg_init_io(FW_CFG_IO_BASE); + fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus); + rom_set_fw(fw_cfg); + + x86_load_linux(x86ms, fw_cfg, pcmc->acpi_data_size, + pcmc->pvh_enabled); + for (i = 0; i < nb_option_roms; i++) { + assert(!strcmp(option_rom[i].name, "linuxboot.bin") || + !strcmp(option_rom[i].name, "linuxboot_dma.bin") || + !strcmp(option_rom[i].name, "pvh.bin") || + !strcmp(option_rom[i].name, "multiboot.bin") || + !strcmp(option_rom[i].name, "multiboot_dma.bin")); + rom_add_option(option_rom[i].name, option_rom[i].bootindex); + } + x86ms->fw_cfg = fw_cfg; +} + +#define PC_ROM_MIN_VGA 0xc0000 +#define PC_ROM_MIN_OPTION 0xc8000 +#define PC_ROM_MAX 0xe0000 +#define PC_ROM_ALIGN 0x800 +#define PC_ROM_SIZE (PC_ROM_MAX - PC_ROM_MIN_VGA) + +void pc_memory_init(PCMachineState *pcms, + MemoryRegion *system_memory, + MemoryRegion *rom_memory, + MemoryRegion **ram_memory) +{ + int linux_boot, i; + MemoryRegion *option_rom_mr; + MemoryRegion *ram_below_4g, *ram_above_4g; + FWCfgState *fw_cfg; + MachineState *machine = MACHINE(pcms); + MachineClass *mc = MACHINE_GET_CLASS(machine); + PCMachineClass 
*pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(pcms); + + assert(machine->ram_size == x86ms->below_4g_mem_size + + x86ms->above_4g_mem_size); + + linux_boot = (machine->kernel_filename != NULL); + + /* + * Split single memory region and use aliases to address portions of it, + * done for backwards compatibility with older qemus. + */ + *ram_memory = machine->ram; + ram_below_4g = g_malloc(sizeof(*ram_below_4g)); + memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram, + 0, x86ms->below_4g_mem_size); + memory_region_add_subregion(system_memory, 0, ram_below_4g); + e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM); + if (x86ms->above_4g_mem_size > 0) { + ram_above_4g = g_malloc(sizeof(*ram_above_4g)); + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", + machine->ram, + x86ms->below_4g_mem_size, + x86ms->above_4g_mem_size); + memory_region_add_subregion(system_memory, 0x100000000ULL, + ram_above_4g); + e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM); + } + + if (pcms->sgx_epc.size != 0) { + e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED); + } + + if (!pcmc->has_reserved_memory && + (machine->ram_slots || + (machine->maxram_size > machine->ram_size))) { + + error_report("\"-memory 'slots|maxmem'\" is not supported by: %s", + mc->name); + exit(EXIT_FAILURE); + } + + /* always allocate the device memory information */ + machine->device_memory = g_malloc0(sizeof(*machine->device_memory)); + + /* initialize device memory address space */ + if (pcmc->has_reserved_memory && + (machine->ram_size < machine->maxram_size)) { + ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size; + + if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) { + error_report("unsupported amount of memory slots: %"PRIu64, + machine->ram_slots); + exit(EXIT_FAILURE); + } + + if (QEMU_ALIGN_UP(machine->maxram_size, + TARGET_PAGE_SIZE) != machine->maxram_size) { + error_report("maximum memory size must be aligned to a multiple of " + "%d bytes", TARGET_PAGE_SIZE); + exit(EXIT_FAILURE); + } + + if (pcms->sgx_epc.size != 0) { + machine->device_memory->base = sgx_epc_above_4g_end(&pcms->sgx_epc); + } else { + machine->device_memory->base = + 0x100000000ULL + x86ms->above_4g_mem_size; + } + + machine->device_memory->base = + ROUND_UP(machine->device_memory->base, 1 * GiB); + + if (pcmc->enforce_aligned_dimm) { + /* size device region assuming 1G page max alignment per slot */ + device_mem_size += (1 * GiB) * machine->ram_slots; + } + + if ((machine->device_memory->base + device_mem_size) < + device_mem_size) { + error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT, + machine->maxram_size); + exit(EXIT_FAILURE); + } + + memory_region_init(&machine->device_memory->mr, OBJECT(pcms), + "device-memory", device_mem_size); + memory_region_add_subregion(system_memory, machine->device_memory->base, + &machine->device_memory->mr); + } + + /* Initialize PC system firmware */ + pc_system_firmware_init(pcms, rom_memory); + + option_rom_mr = g_malloc(sizeof(*option_rom_mr)); + memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE, + &error_fatal); + if (pcmc->pci_enabled) { + memory_region_set_readonly(option_rom_mr, true); + } + memory_region_add_subregion_overlap(rom_memory, + PC_ROM_MIN_VGA, + option_rom_mr, + 1); + + fw_cfg = fw_cfg_arch_create(machine, + x86ms->boot_cpus, x86ms->apic_id_limit); + + rom_set_fw(fw_cfg); + + if (pcmc->has_reserved_memory && machine->device_memory->base) { + uint64_t *val = 
g_malloc(sizeof(*val)); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + uint64_t res_mem_end = machine->device_memory->base; + + if (!pcmc->broken_reserved_end) { + res_mem_end += memory_region_size(&machine->device_memory->mr); + } + *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); + fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val)); + } + + if (linux_boot) { + x86_load_linux(x86ms, fw_cfg, pcmc->acpi_data_size, + pcmc->pvh_enabled); + } + + for (i = 0; i < nb_option_roms; i++) { + rom_add_option(option_rom[i].name, option_rom[i].bootindex); + } + x86ms->fw_cfg = fw_cfg; + + /* Init default IOAPIC address space */ + x86ms->ioapic_as = &address_space_memory; + + /* Init ACPI memory hotplug IO base address */ + pcms->memhp_io_base = ACPI_MEMORY_HOTPLUG_BASE; +} + +/* + * The 64bit pci hole starts after "above 4G RAM" and + * potentially the space reserved for memory hotplug. + */ +uint64_t pc_pci_hole64_start(void) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + MachineState *ms = MACHINE(pcms); + X86MachineState *x86ms = X86_MACHINE(pcms); + uint64_t hole64_start = 0; + + if (pcmc->has_reserved_memory && ms->device_memory->base) { + hole64_start = ms->device_memory->base; + if (!pcmc->broken_reserved_end) { + hole64_start += memory_region_size(&ms->device_memory->mr); + } + } else if (pcms->sgx_epc.size != 0) { + hole64_start = sgx_epc_above_4g_end(&pcms->sgx_epc); + } else { + hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size; + } + + return ROUND_UP(hole64_start, 1 * GiB); +} + +DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus) +{ + DeviceState *dev = NULL; + + rom_set_order_override(FW_CFG_ORDER_OVERRIDE_VGA); + if (pci_bus) { + PCIDevice *pcidev = pci_vga_init(pci_bus); + dev = pcidev ? &pcidev->qdev : NULL; + } else if (isa_bus) { + ISADevice *isadev = isa_vga_init(isa_bus); + dev = isadev ? 
DEVICE(isadev) : NULL; + } + rom_reset_order_override(); + return dev; +} + +static const MemoryRegionOps ioport80_io_ops = { + .write = ioport80_write, + .read = ioport80_read, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + .min_access_size = 1, + .max_access_size = 1, + }, +}; + +static const MemoryRegionOps ioportF0_io_ops = { + .write = ioportF0_write, + .read = ioportF0_read, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + .min_access_size = 1, + .max_access_size = 1, + }, +}; + +static void pc_superio_init(ISABus *isa_bus, bool create_fdctrl, bool no_vmport) +{ + int i; + DriveInfo *fd[MAX_FD]; + qemu_irq *a20_line; + ISADevice *fdc, *i8042, *port92, *vmmouse; + + serial_hds_isa_init(isa_bus, 0, MAX_ISA_SERIAL_PORTS); + parallel_hds_isa_init(isa_bus, MAX_PARALLEL_PORTS); + + for (i = 0; i < MAX_FD; i++) { + fd[i] = drive_get(IF_FLOPPY, 0, i); + create_fdctrl |= !!fd[i]; + } + if (create_fdctrl) { + fdc = isa_new(TYPE_ISA_FDC); + if (fdc) { + isa_realize_and_unref(fdc, isa_bus, &error_fatal); + isa_fdc_init_drives(fdc, fd); + } + } + + i8042 = isa_create_simple(isa_bus, "i8042"); + if (!no_vmport) { + isa_create_simple(isa_bus, TYPE_VMPORT); + vmmouse = isa_try_new("vmmouse"); + } else { + vmmouse = NULL; + } + if (vmmouse) { + object_property_set_link(OBJECT(vmmouse), "i8042", OBJECT(i8042), + &error_abort); + isa_realize_and_unref(vmmouse, isa_bus, &error_fatal); + } + port92 = isa_create_simple(isa_bus, TYPE_PORT92); + + a20_line = qemu_allocate_irqs(handle_a20_line_change, first_cpu, 2); + i8042_setup_a20_line(i8042, a20_line[0]); + qdev_connect_gpio_out_named(DEVICE(port92), + PORT92_A20_LINE, 0, a20_line[1]); + g_free(a20_line); +} + +void pc_basic_device_init(struct PCMachineState *pcms, + ISABus *isa_bus, qemu_irq *gsi, + ISADevice **rtc_state, + bool create_fdctrl, + uint32_t hpet_irqs) +{ + int i; + DeviceState *hpet = NULL; + int pit_isa_irq = 0; + qemu_irq pit_alt_irq = NULL; + qemu_irq rtc_irq = NULL; + ISADevice *pit = NULL; + MemoryRegion *ioport80_io = g_new(MemoryRegion, 1); + MemoryRegion *ioportF0_io = g_new(MemoryRegion, 1); + + memory_region_init_io(ioport80_io, NULL, &ioport80_io_ops, NULL, "ioport80", 1); + memory_region_add_subregion(isa_bus->address_space_io, 0x80, ioport80_io); + + memory_region_init_io(ioportF0_io, NULL, &ioportF0_io_ops, NULL, "ioportF0", 1); + memory_region_add_subregion(isa_bus->address_space_io, 0xf0, ioportF0_io); + + /* + * Check if an HPET shall be created. + * + * Without KVM_CAP_PIT_STATE2, we cannot switch off the in-kernel PIT + * when the HPET wants to take over. Thus we have to disable the latter. + */ + if (pcms->hpet_enabled && (!kvm_irqchip_in_kernel() || + kvm_has_pit_state2())) { + hpet = qdev_try_new(TYPE_HPET); + if (!hpet) { + error_report("couldn't create HPET device"); + exit(1); + } + /* + * For pc-piix-*, hpet's intcap is always IRQ2. For pc-q35-1.7 and + * earlier, use IRQ2 for compat. Otherwise, use IRQ16~23, IRQ8 and + * IRQ2. 
+ */ + uint8_t compat = object_property_get_uint(OBJECT(hpet), + HPET_INTCAP, NULL); + if (!compat) { + qdev_prop_set_uint32(hpet, HPET_INTCAP, hpet_irqs); + } + sysbus_realize_and_unref(SYS_BUS_DEVICE(hpet), &error_fatal); + sysbus_mmio_map(SYS_BUS_DEVICE(hpet), 0, HPET_BASE); + + for (i = 0; i < GSI_NUM_PINS; i++) { + sysbus_connect_irq(SYS_BUS_DEVICE(hpet), i, gsi[i]); + } + pit_isa_irq = -1; + pit_alt_irq = qdev_get_gpio_in(hpet, HPET_LEGACY_PIT_INT); + rtc_irq = qdev_get_gpio_in(hpet, HPET_LEGACY_RTC_INT); + } + *rtc_state = mc146818_rtc_init(isa_bus, 2000, rtc_irq); + + qemu_register_boot_set(pc_boot_set, *rtc_state); + + if (!xen_enabled() && pcms->pit_enabled) { + if (kvm_pit_in_kernel()) { + pit = kvm_pit_init(isa_bus, 0x40); + } else { + pit = i8254_pit_init(isa_bus, 0x40, pit_isa_irq, pit_alt_irq); + } + if (hpet) { + /* connect PIT to output control line of the HPET */ + qdev_connect_gpio_out(hpet, 0, qdev_get_gpio_in(DEVICE(pit), 0)); + } + pcspk_init(pcms->pcspk, isa_bus, pit); + } + + i8257_dma_init(isa_bus, 0); + + /* Super I/O */ + pc_superio_init(isa_bus, create_fdctrl, pcms->vmport != ON_OFF_AUTO_ON); +} + +void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus) +{ + int i; + + rom_set_order_override(FW_CFG_ORDER_OVERRIDE_NIC); + for (i = 0; i < nb_nics; i++) { + NICInfo *nd = &nd_table[i]; + const char *model = nd->model ? nd->model : pcmc->default_nic_model; + + if (g_str_equal(model, "ne2k_isa")) { + pc_init_ne2k_isa(isa_bus, nd); + } else { + pci_nic_init_nofail(nd, pci_bus, model, NULL); + } + } + rom_reset_order_override(); +} + +void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs) +{ + qemu_irq *i8259; + + if (kvm_pic_in_kernel()) { + i8259 = kvm_i8259_init(isa_bus); + } else if (xen_enabled()) { + i8259 = xen_interrupt_controller_init(); + } else { + i8259 = i8259_init(isa_bus, x86_allocate_cpu_irq()); + } + + for (size_t i = 0; i < ISA_NUM_IRQS; i++) { + i8259_irqs[i] = i8259[i]; + } + + g_free(i8259); +} + +static void pc_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) +{ + const PCMachineState *pcms = PC_MACHINE(hotplug_dev); + const X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + const PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + const MachineState *ms = MACHINE(hotplug_dev); + const bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM); + const uint64_t legacy_align = TARGET_PAGE_SIZE; + Error *local_err = NULL; + + /* + * When -no-acpi is used with Q35 machine type, no ACPI is built, + * but pcms->acpi_dev is still created. Check !acpi_enabled in + * addition to cover this case. + */ + if (!x86ms->acpi_dev || !x86_machine_is_acpi_enabled(x86ms)) { + error_setg(errp, + "memory hotplug is not enabled: missing acpi device or acpi disabled"); + return; + } + + if (is_nvdimm && !ms->nvdimms_state->is_enabled) { + error_setg(errp, "nvdimm is not enabled: missing 'nvdimm' in '-M'"); + return; + } + + hotplug_handler_pre_plug(x86ms->acpi_dev, dev, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + pc_dimm_pre_plug(PC_DIMM(dev), MACHINE(hotplug_dev), + pcmc->enforce_aligned_dimm ? 
NULL : &legacy_align, errp); +} + +static void pc_memory_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(hotplug_dev); + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + MachineState *ms = MACHINE(hotplug_dev); + bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM); + + pc_dimm_plug(PC_DIMM(dev), MACHINE(pcms)); + + if (is_nvdimm) { + nvdimm_plug(ms->nvdimms_state); + } + + hotplug_handler_plug(x86ms->acpi_dev, dev, &error_abort); +} + +static void pc_memory_unplug_request(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + + /* + * When -no-acpi is used with Q35 machine type, no ACPI is built, + * but pcms->acpi_dev is still created. Check !acpi_enabled in + * addition to cover this case. + */ + if (!x86ms->acpi_dev || !x86_machine_is_acpi_enabled(x86ms)) { + error_setg(errp, + "memory hotplug is not enabled: missing acpi device or acpi disabled"); + return; + } + + if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) { + error_setg(errp, "nvdimm device hot unplug is not supported yet."); + return; + } + + hotplug_handler_unplug_request(x86ms->acpi_dev, dev, + errp); +} + +static void pc_memory_unplug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(hotplug_dev); + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + Error *local_err = NULL; + + hotplug_handler_unplug(x86ms->acpi_dev, dev, &local_err); + if (local_err) { + goto out; + } + + pc_dimm_unplug(PC_DIMM(dev), MACHINE(pcms)); + qdev_unrealize(dev); + out: + error_propagate(errp, local_err); +} + +static void pc_virtio_md_pci_pre_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev); + Error *local_err = NULL; + + if (!hotplug_dev2 && dev->hotplugged) { + /* + * Without a bus hotplug handler, we cannot control the plug/unplug + * order. We should never reach this point when hotplugging on x86, + * however, better add a safety net. + */ + error_setg(errp, "hotplug of virtio based memory devices not supported" + " on this bus."); + return; + } + /* + * First, see if we can plug this memory device at all. If that + * succeeds, branch off to the actual hotplug handler. + */ + memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL, + &local_err); + if (!local_err && hotplug_dev2) { + hotplug_handler_pre_plug(hotplug_dev2, dev, &local_err); + } + error_propagate(errp, local_err); +} + +static void pc_virtio_md_pci_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev); + Error *local_err = NULL; + + /* + * Plug the memory device first and then branch off to the actual + * hotplug handler. If that one fails, we can easily undo the memory + * device bits. 
+ */ + memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev)); + if (hotplug_dev2) { + hotplug_handler_plug(hotplug_dev2, dev, &local_err); + if (local_err) { + memory_device_unplug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev)); + } + } + error_propagate(errp, local_err); +} + +static void pc_virtio_md_pci_unplug_request(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + /* We don't support hot unplug of virtio based memory devices */ + error_setg(errp, "virtio based memory devices cannot be unplugged."); +} + +static void pc_virtio_md_pci_unplug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + /* We don't support hot unplug of virtio based memory devices */ +} + +static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { + pc_memory_pre_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + x86_cpu_pre_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) { + pc_virtio_md_pci_pre_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) { + /* Declare the APIC range as the reserved MSI region */ + char *resv_prop_str = g_strdup_printf("0xfee00000:0xfeefffff:%d", + VIRTIO_IOMMU_RESV_MEM_T_MSI); + + object_property_set_uint(OBJECT(dev), "len-reserved-regions", 1, errp); + object_property_set_str(OBJECT(dev), "reserved-regions[0]", + resv_prop_str, errp); + g_free(resv_prop_str); + } + + if (object_dynamic_cast(OBJECT(dev), TYPE_X86_IOMMU_DEVICE) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) { + PCMachineState *pcms = PC_MACHINE(hotplug_dev); + + if (pcms->iommu) { + error_setg(errp, "QEMU does not support multiple vIOMMUs " + "for x86 yet."); + return; + } + pcms->iommu = dev; + } +} + +static void pc_machine_device_plug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { + pc_memory_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + x86_cpu_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) { + pc_virtio_md_pci_plug(hotplug_dev, dev, errp); + } +} + +static void pc_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { + pc_memory_unplug_request(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + x86_cpu_unplug_request_cb(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) { + pc_virtio_md_pci_unplug_request(hotplug_dev, dev, errp); + } else { + error_setg(errp, "acpi: device unplug request for not supported device" + " type: %s", object_get_typename(OBJECT(dev))); + } +} + +static void pc_machine_device_unplug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { + pc_memory_unplug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + x86_cpu_unplug_cb(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) { + 
pc_virtio_md_pci_unplug(hotplug_dev, dev, errp); + } else { + error_setg(errp, "acpi: device unplug for not supported device" + " type: %s", object_get_typename(OBJECT(dev))); + } +} + +static HotplugHandler *pc_get_hotplug_handler(MachineState *machine, + DeviceState *dev) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) || + object_dynamic_cast(OBJECT(dev), TYPE_CPU) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_X86_IOMMU_DEVICE)) { + return HOTPLUG_HANDLER(machine); + } + + return NULL; +} + +static void +pc_machine_get_device_memory_region_size(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + MachineState *ms = MACHINE(obj); + int64_t value = 0; + + if (ms->device_memory) { + value = memory_region_size(&ms->device_memory->mr); + } + + visit_type_int(v, name, &value, errp); +} + +static void pc_machine_get_vmport(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + OnOffAuto vmport = pcms->vmport; + + visit_type_OnOffAuto(v, name, &vmport, errp); +} + +static void pc_machine_set_vmport(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &pcms->vmport, errp); +} + +static bool pc_machine_get_smbus(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->smbus_enabled; +} + +static void pc_machine_set_smbus(Object *obj, bool value, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->smbus_enabled = value; +} + +static bool pc_machine_get_sata(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->sata_enabled; +} + +static void pc_machine_set_sata(Object *obj, bool value, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->sata_enabled = value; +} + +static bool pc_machine_get_pit(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->pit_enabled; +} + +static void pc_machine_set_pit(Object *obj, bool value, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->pit_enabled = value; +} + +static bool pc_machine_get_hpet(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->hpet_enabled; +} + +static void pc_machine_set_hpet(Object *obj, bool value, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->hpet_enabled = value; +} + +static bool pc_machine_get_default_bus_bypass_iommu(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->default_bus_bypass_iommu; +} + +static void pc_machine_set_default_bus_bypass_iommu(Object *obj, bool value, + Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->default_bus_bypass_iommu = value; +} + +static void pc_machine_get_max_ram_below_4g(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + uint64_t value = pcms->max_ram_below_4g; + + visit_type_size(v, name, &value, errp); +} + +static void pc_machine_set_max_ram_below_4g(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + uint64_t value; + + if (!visit_type_size(v, name, &value, errp)) { + return; + } + if (value > 4 * GiB) { + error_setg(errp, 
+ "Machine option 'max-ram-below-4g=%"PRIu64 + "' expects size less than or equal to 4G", value); + return; + } + + if (value < 1 * MiB) { + warn_report("Only %" PRIu64 " bytes of RAM below the 4GiB boundary," + "BIOS may not work with less than 1MiB", value); + } + + pcms->max_ram_below_4g = value; +} + +static void pc_machine_get_max_fw_size(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + uint64_t value = pcms->max_fw_size; + + visit_type_size(v, name, &value, errp); +} + +static void pc_machine_set_max_fw_size(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + Error *error = NULL; + uint64_t value; + + visit_type_size(v, name, &value, &error); + if (error) { + error_propagate(errp, error); + return; + } + + /* + * We don't have a theoretically justifiable exact lower bound on the base + * address of any flash mapping. In practice, the IO-APIC MMIO range is + * [0xFEE00000..0xFEE01000] -- see IO_APIC_DEFAULT_ADDRESS --, leaving free + * only 18MB-4KB below 4G. For now, restrict the cumulative mapping to 8MB in + * size. + */ + if (value > 16 * MiB) { + error_setg(errp, + "User specified max allowed firmware size %" PRIu64 " is " + "greater than 16MiB. If combined firwmare size exceeds " + "16MiB the system may not boot, or experience intermittent" + "stability issues.", + value); + return; + } + + pcms->max_fw_size = value; +} + + +static void pc_machine_initfn(Object *obj) +{ + PCMachineState *pcms = PC_MACHINE(obj); + +#ifdef CONFIG_VMPORT + pcms->vmport = ON_OFF_AUTO_AUTO; +#else + pcms->vmport = ON_OFF_AUTO_OFF; +#endif /* CONFIG_VMPORT */ + pcms->max_ram_below_4g = 0; /* use default */ + /* acpi build is enabled by default if machine supports it */ + pcms->acpi_build_enabled = PC_MACHINE_GET_CLASS(pcms)->has_acpi_build; + pcms->smbus_enabled = true; + pcms->sata_enabled = true; + pcms->pit_enabled = true; + pcms->max_fw_size = 8 * MiB; +#ifdef CONFIG_HPET + pcms->hpet_enabled = true; +#endif + pcms->default_bus_bypass_iommu = false; + + pc_system_flash_create(pcms); + pcms->pcspk = isa_new(TYPE_PC_SPEAKER); + object_property_add_alias(OBJECT(pcms), "pcspk-audiodev", + OBJECT(pcms->pcspk), "audiodev"); +} + +static void pc_machine_reset(MachineState *machine) +{ + CPUState *cs; + X86CPU *cpu; + + qemu_devices_reset(); + + /* Reset APIC after devices have been reset to cancel + * any changes that qemu_devices_reset() might have done. 
+ */ + CPU_FOREACH(cs) { + cpu = X86_CPU(cs); + + if (cpu->apic_state) { + device_legacy_reset(cpu->apic_state); + } + } +} + +static void pc_machine_wakeup(MachineState *machine) +{ + cpu_synchronize_all_states(); + pc_machine_reset(machine); + cpu_synchronize_all_post_reset(); +} + +static bool pc_hotplug_allowed(MachineState *ms, DeviceState *dev, Error **errp) +{ + X86IOMMUState *iommu = x86_iommu_get_default(); + IntelIOMMUState *intel_iommu; + + if (iommu && + object_dynamic_cast((Object *)iommu, TYPE_INTEL_IOMMU_DEVICE) && + object_dynamic_cast((Object *)dev, "vfio-pci")) { + intel_iommu = INTEL_IOMMU_DEVICE(iommu); + if (!intel_iommu->caching_mode) { + error_setg(errp, "Device assignment is not allowed without " + "enabling caching-mode=on for Intel IOMMU."); + return false; + } + } + + return true; +} + +static void pc_machine_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + PCMachineClass *pcmc = PC_MACHINE_CLASS(oc); + HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); + + pcmc->pci_enabled = true; + pcmc->has_acpi_build = true; + pcmc->rsdp_in_ram = true; + pcmc->smbios_defaults = true; + pcmc->smbios_uuid_encoded = true; + pcmc->gigabyte_align = true; + pcmc->has_reserved_memory = true; + pcmc->kvmclock_enabled = true; + pcmc->enforce_aligned_dimm = true; + /* BIOS ACPI tables: 128K. Other BIOS datastructures: less than 4K reported + * to be used at the moment, 32K should be enough for a while. */ + pcmc->acpi_data_size = 0x20000 + 0x8000; + pcmc->pvh_enabled = true; + pcmc->kvmclock_create_always = true; + assert(!mc->get_hotplug_handler); + mc->get_hotplug_handler = pc_get_hotplug_handler; + mc->hotplug_allowed = pc_hotplug_allowed; + mc->cpu_index_to_instance_props = x86_cpu_index_to_props; + mc->get_default_cpu_node_id = x86_get_default_cpu_node_id; + mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids; + mc->auto_enable_numa_with_memhp = true; + mc->auto_enable_numa_with_memdev = true; + mc->has_hotpluggable_cpus = true; + mc->default_boot_order = "cad"; + mc->block_default_type = IF_IDE; + mc->max_cpus = 255; + mc->reset = pc_machine_reset; + mc->wakeup = pc_machine_wakeup; + hc->pre_plug = pc_machine_device_pre_plug_cb; + hc->plug = pc_machine_device_plug_cb; + hc->unplug_request = pc_machine_device_unplug_request_cb; + hc->unplug = pc_machine_device_unplug_cb; + mc->default_cpu_type = TARGET_DEFAULT_CPU_TYPE; + mc->nvdimm_supported = true; + mc->smp_props.dies_supported = true; + mc->default_ram_id = "pc.ram"; + + object_class_property_add(oc, PC_MACHINE_MAX_RAM_BELOW_4G, "size", + pc_machine_get_max_ram_below_4g, pc_machine_set_max_ram_below_4g, + NULL, NULL); + object_class_property_set_description(oc, PC_MACHINE_MAX_RAM_BELOW_4G, + "Maximum ram below the 4G boundary (32bit boundary)"); + + object_class_property_add(oc, PC_MACHINE_DEVMEM_REGION_SIZE, "int", + pc_machine_get_device_memory_region_size, NULL, + NULL, NULL); + + object_class_property_add(oc, PC_MACHINE_VMPORT, "OnOffAuto", + pc_machine_get_vmport, pc_machine_set_vmport, + NULL, NULL); + object_class_property_set_description(oc, PC_MACHINE_VMPORT, + "Enable vmport (pc & q35)"); + + object_class_property_add_bool(oc, PC_MACHINE_SMBUS, + pc_machine_get_smbus, pc_machine_set_smbus); + + object_class_property_add_bool(oc, PC_MACHINE_SATA, + pc_machine_get_sata, pc_machine_set_sata); + + object_class_property_add_bool(oc, PC_MACHINE_PIT, + pc_machine_get_pit, pc_machine_set_pit); + + object_class_property_add_bool(oc, "hpet", + pc_machine_get_hpet, pc_machine_set_hpet); 
+ + object_class_property_add_bool(oc, "default-bus-bypass-iommu", + pc_machine_get_default_bus_bypass_iommu, + pc_machine_set_default_bus_bypass_iommu); + + object_class_property_add(oc, PC_MACHINE_MAX_FW_SIZE, "size", + pc_machine_get_max_fw_size, pc_machine_set_max_fw_size, + NULL, NULL); + object_class_property_set_description(oc, PC_MACHINE_MAX_FW_SIZE, + "Maximum combined firmware size"); +} + +static const TypeInfo pc_machine_info = { + .name = TYPE_PC_MACHINE, + .parent = TYPE_X86_MACHINE, + .abstract = true, + .instance_size = sizeof(PCMachineState), + .instance_init = pc_machine_initfn, + .class_size = sizeof(PCMachineClass), + .class_init = pc_machine_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_HOTPLUG_HANDLER }, + { } + }, +}; + +static void pc_machine_register_types(void) +{ + type_register_static(&pc_machine_info); +} + +type_init(pc_machine_register_types) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c new file mode 100644 index 000000000..223dd3e05 --- /dev/null +++ b/hw/i386/pc_piix.c @@ -0,0 +1,951 @@ +/* + * QEMU PC System Emulator + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include CONFIG_DEVICES + +#include "qemu/units.h" +#include "hw/loader.h" +#include "hw/i386/x86.h" +#include "hw/i386/pc.h" +#include "hw/i386/apic.h" +#include "hw/pci-host/i440fx.h" +#include "hw/southbridge/piix.h" +#include "hw/display/ramfb.h" +#include "hw/firmware/smbios.h" +#include "hw/pci/pci.h" +#include "hw/pci/pci_ids.h" +#include "hw/usb.h" +#include "net/net.h" +#include "hw/ide/pci.h" +#include "hw/irq.h" +#include "sysemu/kvm.h" +#include "hw/kvm/clock.h" +#include "hw/sysbus.h" +#include "hw/i2c/smbus_eeprom.h" +#include "hw/xen/xen-x86.h" +#include "exec/memory.h" +#include "hw/acpi/acpi.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "sysemu/xen.h" +#ifdef CONFIG_XEN +#include <xen/hvm/hvm_info_table.h> +#include "hw/xen/xen_pt.h" +#endif +#include "migration/global_state.h" +#include "migration/misc.h" +#include "sysemu/numa.h" +#include "hw/hyperv/vmbus-bridge.h" +#include "hw/mem/nvdimm.h" +#include "hw/i386/acpi-build.h" +#include "kvm/kvm-cpu.h" + +#define MAX_IDE_BUS 2 + +#ifdef CONFIG_IDE_ISA +static const int ide_iobase[MAX_IDE_BUS] = { 0x1f0, 0x170 }; +static const int ide_iobase2[MAX_IDE_BUS] = { 0x3f6, 0x376 }; +static const int ide_irq[MAX_IDE_BUS] = { 14, 15 }; +#endif + +/* PC hardware initialisation */ +static void pc_init1(MachineState *machine, + const char *host_type, const char *pci_type) +{ + PCMachineState *pcms = PC_MACHINE(machine); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(machine); + MemoryRegion *system_memory = get_system_memory(); + MemoryRegion *system_io = get_system_io(); + PCIBus *pci_bus; + ISABus *isa_bus; + PCII440FXState *i440fx_state; + int piix3_devfn = -1; + qemu_irq smi_irq; + GSIState *gsi_state; + BusState *idebus[MAX_IDE_BUS]; + ISADevice *rtc_state; + MemoryRegion *ram_memory; + MemoryRegion *pci_memory; + MemoryRegion *rom_memory; + ram_addr_t lowmem; + + /* + * Calculate ram split, for memory below and above 4G. It's a bit + * complicated for backward compatibility reasons ... + * + * - Traditional split is 3.5G (lowmem = 0xe0000000). This is the + * default value for max_ram_below_4g now. + * + * - Then, to gigabyte align the memory, we move the split to 3G + * (lowmem = 0xc0000000). But only in case we have to split in + * the first place, i.e. ram_size is larger than (traditional) + * lowmem. And for new machine types (gigabyte_align = true) + * only, for live migration compatibility reasons. + * + * - Next the max-ram-below-4g option was added, which allowed to + * reduce lowmem to a smaller value, to allow a larger PCI I/O + * window below 4G. qemu doesn't enforce gigabyte alignment here, + * but prints a warning. + * + * - Finally max-ram-below-4g got updated to also allow raising lowmem, + * so legacy non-PAE guests can get as much memory as possible in + * the 32bit address space below 4G. + * + * - Note that Xen has its own ram setup code in xen_ram_init(), + * called via xen_hvm_init_pc(). 
+ * + * Examples: + * qemu -M pc-1.7 -m 4G (old default) -> 3584M low, 512M high + * qemu -M pc -m 4G (new default) -> 3072M low, 1024M high + * qemu -M pc,max-ram-below-4g=2G -m 4G -> 2048M low, 2048M high + * qemu -M pc,max-ram-below-4g=4G -m 3968M -> 3968M low (=4G-128M) + */ + if (xen_enabled()) { + xen_hvm_init_pc(pcms, &ram_memory); + } else { + if (!pcms->max_ram_below_4g) { + pcms->max_ram_below_4g = 0xe0000000; /* default: 3.5G */ + } + lowmem = pcms->max_ram_below_4g; + if (machine->ram_size >= pcms->max_ram_below_4g) { + if (pcmc->gigabyte_align) { + if (lowmem > 0xc0000000) { + lowmem = 0xc0000000; + } + if (lowmem & (1 * GiB - 1)) { + warn_report("Large machine and max_ram_below_4g " + "(%" PRIu64 ") not a multiple of 1G; " + "possible bad performance.", + pcms->max_ram_below_4g); + } + } + } + + if (machine->ram_size >= lowmem) { + x86ms->above_4g_mem_size = machine->ram_size - lowmem; + x86ms->below_4g_mem_size = lowmem; + } else { + x86ms->above_4g_mem_size = 0; + x86ms->below_4g_mem_size = machine->ram_size; + } + } + + pc_machine_init_sgx_epc(pcms); + x86_cpus_init(x86ms, pcmc->default_cpu_version); + + if (pcmc->kvmclock_enabled) { + kvmclock_create(pcmc->kvmclock_create_always); + } + + if (pcmc->pci_enabled) { + pci_memory = g_new(MemoryRegion, 1); + memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); + rom_memory = pci_memory; + } else { + pci_memory = NULL; + rom_memory = system_memory; + } + + pc_guest_info_init(pcms); + + if (pcmc->smbios_defaults) { + MachineClass *mc = MACHINE_GET_CLASS(machine); + /* These values are guest ABI, do not change */ + smbios_set_defaults("QEMU", "Standard PC (i440FX + PIIX, 1996)", + mc->name, pcmc->smbios_legacy_mode, + pcmc->smbios_uuid_encoded, + SMBIOS_ENTRY_POINT_21); + } + + /* allocate ram and load rom/bios */ + if (!xen_enabled()) { + pc_memory_init(pcms, system_memory, + rom_memory, &ram_memory); + } else { + pc_system_flash_cleanup_unused(pcms); + if (machine->kernel_filename != NULL) { + /* For xen HVM direct kernel boot, load linux here */ + xen_load_linux(pcms); + } + } + + gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled); + + if (pcmc->pci_enabled) { + PIIX3State *piix3; + + pci_bus = i440fx_init(host_type, + pci_type, + &i440fx_state, + system_memory, system_io, machine->ram_size, + x86ms->below_4g_mem_size, + x86ms->above_4g_mem_size, + pci_memory, ram_memory); + pcms->bus = pci_bus; + + piix3 = piix3_create(pci_bus, &isa_bus); + piix3->pic = x86ms->gsi; + piix3_devfn = piix3->dev.devfn; + } else { + pci_bus = NULL; + i440fx_state = NULL; + isa_bus = isa_bus_new(NULL, get_system_memory(), system_io, + &error_abort); + pcms->hpet_enabled = false; + } + isa_bus_irqs(isa_bus, x86ms->gsi); + + pc_i8259_create(isa_bus, gsi_state->i8259_irq); + + if (pcmc->pci_enabled) { + ioapic_init_gsi(gsi_state, "i440fx"); + } + + if (tcg_enabled()) { + x86_register_ferr_irq(x86ms->gsi[13]); + } + + pc_vga_init(isa_bus, pcmc->pci_enabled ? pci_bus : NULL); + + assert(pcms->vmport != ON_OFF_AUTO__MAX); + if (pcms->vmport == ON_OFF_AUTO_AUTO) { + pcms->vmport = xen_enabled() ? ON_OFF_AUTO_OFF : ON_OFF_AUTO_ON; + } + + /* init basic PC hardware */ + pc_basic_device_init(pcms, isa_bus, x86ms->gsi, &rtc_state, true, + 0x4); + + pc_nic_init(pcmc, isa_bus, pci_bus); + + if (pcmc->pci_enabled) { + PCIDevice *dev; + + dev = pci_create_simple(pci_bus, piix3_devfn + 1, + xen_enabled() ? 
"piix3-ide-xen" : "piix3-ide"); + pci_ide_create_devs(dev); + idebus[0] = qdev_get_child_bus(&dev->qdev, "ide.0"); + idebus[1] = qdev_get_child_bus(&dev->qdev, "ide.1"); + pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state); + } +#ifdef CONFIG_IDE_ISA + else { + DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS]; + int i; + + ide_drive_get(hd, ARRAY_SIZE(hd)); + for (i = 0; i < MAX_IDE_BUS; i++) { + ISADevice *dev; + char busname[] = "ide.0"; + dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i], + ide_irq[i], + hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]); + /* + * The ide bus name is ide.0 for the first bus and ide.1 for the + * second one. + */ + busname[4] = '0' + i; + idebus[i] = qdev_get_child_bus(DEVICE(dev), busname); + } + pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state); + } +#endif + + if (pcmc->pci_enabled && machine_usb(machine)) { + pci_create_simple(pci_bus, piix3_devfn + 2, "piix3-usb-uhci"); + } + + if (pcmc->pci_enabled && x86_machine_is_acpi_enabled(X86_MACHINE(pcms))) { + DeviceState *piix4_pm; + + smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0); + /* TODO: Populate SPD eeprom data. */ + pcms->smbus = piix4_pm_init(pci_bus, piix3_devfn + 3, 0xb100, + x86ms->gsi[9], smi_irq, + x86_machine_is_smm_enabled(x86ms), + &piix4_pm); + smbus_eeprom_init(pcms->smbus, 8, NULL, 0); + + object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, + TYPE_HOTPLUG_HANDLER, + (Object **)&x86ms->acpi_dev, + object_property_allow_set_link, + OBJ_PROP_LINK_STRONG); + object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, + OBJECT(piix4_pm), &error_abort); + } + + if (machine->nvdimms_state->is_enabled) { + nvdimm_init_acpi_state(machine->nvdimms_state, system_io, + x86_nvdimm_acpi_dsmio, + x86ms->fw_cfg, OBJECT(pcms)); + } +} + +/* Looking for a pc_compat_2_4() function? It doesn't exist. + * pc_compat_*() functions that run on machine-init time and + * change global QEMU state are deprecated. Please don't create + * one, and implement any pc-*-2.4 (and newer) compat code in + * hw_compat_*, pc_compat_*, or * pc_*_machine_options(). + */ + +static void pc_compat_2_3_fn(MachineState *machine) +{ + X86MachineState *x86ms = X86_MACHINE(machine); + if (kvm_enabled()) { + x86ms->smm = ON_OFF_AUTO_OFF; + } +} + +static void pc_compat_2_2_fn(MachineState *machine) +{ + pc_compat_2_3_fn(machine); +} + +static void pc_compat_2_1_fn(MachineState *machine) +{ + pc_compat_2_2_fn(machine); + x86_cpu_change_kvm_default("svm", NULL); +} + +static void pc_compat_2_0_fn(MachineState *machine) +{ + pc_compat_2_1_fn(machine); +} + +static void pc_compat_1_7_fn(MachineState *machine) +{ + pc_compat_2_0_fn(machine); + x86_cpu_change_kvm_default("x2apic", NULL); +} + +static void pc_compat_1_6_fn(MachineState *machine) +{ + pc_compat_1_7_fn(machine); +} + +static void pc_compat_1_5_fn(MachineState *machine) +{ + pc_compat_1_6_fn(machine); +} + +static void pc_compat_1_4_fn(MachineState *machine) +{ + pc_compat_1_5_fn(machine); +} + +static void pc_init_isa(MachineState *machine) +{ + pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, TYPE_I440FX_PCI_DEVICE); +} + +#ifdef CONFIG_XEN +static void pc_xen_hvm_init_pci(MachineState *machine) +{ + const char *pci_type = xen_igd_gfx_pt_enabled() ? 
+ TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE : TYPE_I440FX_PCI_DEVICE; + + pc_init1(machine, + TYPE_I440FX_PCI_HOST_BRIDGE, + pci_type); +} + +static void pc_xen_hvm_init(MachineState *machine) +{ + PCMachineState *pcms = PC_MACHINE(machine); + + if (!xen_enabled()) { + error_report("xenfv machine requires the xen accelerator"); + exit(1); + } + + pc_xen_hvm_init_pci(machine); + pci_create_simple(pcms->bus, -1, "xen-platform"); +} +#endif + +#define DEFINE_I440FX_MACHINE(suffix, name, compatfn, optionfn) \ + static void pc_init_##suffix(MachineState *machine) \ + { \ + void (*compat)(MachineState *m) = (compatfn); \ + if (compat) { \ + compat(machine); \ + } \ + pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ + TYPE_I440FX_PCI_DEVICE); \ + } \ + DEFINE_PC_MACHINE(suffix, name, pc_init_##suffix, optionfn) + +static void pc_i440fx_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pcmc->default_nic_model = "e1000"; + pcmc->pci_root_uid = 0; + + m->family = "pc_piix"; + m->desc = "Standard PC (i440FX + PIIX, 1996)"; + m->default_machine_opts = "firmware=bios-256k.bin"; + m->default_display = "std"; + machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); +} + +static void pc_i440fx_6_2_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_i440fx_machine_options(m); + m->alias = "pc"; + m->is_default = true; + pcmc->default_cpu_version = 1; +} + +DEFINE_I440FX_MACHINE(v6_2, "pc-i440fx-6.2", NULL, + pc_i440fx_6_2_machine_options); + +static void pc_i440fx_6_1_machine_options(MachineClass *m) +{ + pc_i440fx_6_2_machine_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_6_1, hw_compat_6_1_len); + compat_props_add(m->compat_props, pc_compat_6_1, pc_compat_6_1_len); + m->smp_props.prefer_sockets = true; +} + +DEFINE_I440FX_MACHINE(v6_1, "pc-i440fx-6.1", NULL, + pc_i440fx_6_1_machine_options); + +static void pc_i440fx_6_0_machine_options(MachineClass *m) +{ + pc_i440fx_6_1_machine_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_6_0, hw_compat_6_0_len); + compat_props_add(m->compat_props, pc_compat_6_0, pc_compat_6_0_len); +} + +DEFINE_I440FX_MACHINE(v6_0, "pc-i440fx-6.0", NULL, + pc_i440fx_6_0_machine_options); + +static void pc_i440fx_5_2_machine_options(MachineClass *m) +{ + pc_i440fx_6_0_machine_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_5_2, hw_compat_5_2_len); + compat_props_add(m->compat_props, pc_compat_5_2, pc_compat_5_2_len); +} + +DEFINE_I440FX_MACHINE(v5_2, "pc-i440fx-5.2", NULL, + pc_i440fx_5_2_machine_options); + +static void pc_i440fx_5_1_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_5_2_machine_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_5_1, hw_compat_5_1_len); + compat_props_add(m->compat_props, pc_compat_5_1, pc_compat_5_1_len); + pcmc->kvmclock_create_always = false; + pcmc->pci_root_uid = 1; +} + +DEFINE_I440FX_MACHINE(v5_1, "pc-i440fx-5.1", NULL, + pc_i440fx_5_1_machine_options); + +static void pc_i440fx_5_0_machine_options(MachineClass *m) +{ + pc_i440fx_5_1_machine_options(m); + m->alias = NULL; + m->is_default = false; + m->numa_mem_supported = true; + compat_props_add(m->compat_props, hw_compat_5_0, hw_compat_5_0_len); + compat_props_add(m->compat_props, pc_compat_5_0, 
pc_compat_5_0_len); + m->auto_enable_numa_with_memdev = false; +} + +DEFINE_I440FX_MACHINE(v5_0, "pc-i440fx-5.0", NULL, + pc_i440fx_5_0_machine_options); + +static void pc_i440fx_4_2_machine_options(MachineClass *m) +{ + pc_i440fx_5_0_machine_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_4_2, hw_compat_4_2_len); + compat_props_add(m->compat_props, pc_compat_4_2, pc_compat_4_2_len); +} + +DEFINE_I440FX_MACHINE(v4_2, "pc-i440fx-4.2", NULL, + pc_i440fx_4_2_machine_options); + +static void pc_i440fx_4_1_machine_options(MachineClass *m) +{ + pc_i440fx_4_2_machine_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_4_1, hw_compat_4_1_len); + compat_props_add(m->compat_props, pc_compat_4_1, pc_compat_4_1_len); +} + +DEFINE_I440FX_MACHINE(v4_1, "pc-i440fx-4.1", NULL, + pc_i440fx_4_1_machine_options); + +static void pc_i440fx_4_0_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_i440fx_4_1_machine_options(m); + m->alias = NULL; + m->is_default = false; + pcmc->default_cpu_version = CPU_VERSION_LEGACY; + compat_props_add(m->compat_props, hw_compat_4_0, hw_compat_4_0_len); + compat_props_add(m->compat_props, pc_compat_4_0, pc_compat_4_0_len); +} + +DEFINE_I440FX_MACHINE(v4_0, "pc-i440fx-4.0", NULL, + pc_i440fx_4_0_machine_options); + +static void pc_i440fx_3_1_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_4_0_machine_options(m); + m->is_default = false; + pcmc->do_not_add_smb_acpi = true; + m->smbus_no_migration_support = true; + m->alias = NULL; + pcmc->pvh_enabled = false; + compat_props_add(m->compat_props, hw_compat_3_1, hw_compat_3_1_len); + compat_props_add(m->compat_props, pc_compat_3_1, pc_compat_3_1_len); +} + +DEFINE_I440FX_MACHINE(v3_1, "pc-i440fx-3.1", NULL, + pc_i440fx_3_1_machine_options); + +static void pc_i440fx_3_0_machine_options(MachineClass *m) +{ + pc_i440fx_3_1_machine_options(m); + compat_props_add(m->compat_props, hw_compat_3_0, hw_compat_3_0_len); + compat_props_add(m->compat_props, pc_compat_3_0, pc_compat_3_0_len); +} + +DEFINE_I440FX_MACHINE(v3_0, "pc-i440fx-3.0", NULL, + pc_i440fx_3_0_machine_options); + +static void pc_i440fx_2_12_machine_options(MachineClass *m) +{ + pc_i440fx_3_0_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_12, hw_compat_2_12_len); + compat_props_add(m->compat_props, pc_compat_2_12, pc_compat_2_12_len); +} + +DEFINE_I440FX_MACHINE(v2_12, "pc-i440fx-2.12", NULL, + pc_i440fx_2_12_machine_options); + +static void pc_i440fx_2_11_machine_options(MachineClass *m) +{ + pc_i440fx_2_12_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_11, hw_compat_2_11_len); + compat_props_add(m->compat_props, pc_compat_2_11, pc_compat_2_11_len); +} + +DEFINE_I440FX_MACHINE(v2_11, "pc-i440fx-2.11", NULL, + pc_i440fx_2_11_machine_options); + +static void pc_i440fx_2_10_machine_options(MachineClass *m) +{ + pc_i440fx_2_11_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_10, hw_compat_2_10_len); + compat_props_add(m->compat_props, pc_compat_2_10, pc_compat_2_10_len); + m->auto_enable_numa_with_memhp = false; +} + +DEFINE_I440FX_MACHINE(v2_10, "pc-i440fx-2.10", NULL, + pc_i440fx_2_10_machine_options); + +static void pc_i440fx_2_9_machine_options(MachineClass *m) +{ + pc_i440fx_2_10_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_9, hw_compat_2_9_len); + compat_props_add(m->compat_props, pc_compat_2_9, 
pc_compat_2_9_len); +} + +DEFINE_I440FX_MACHINE(v2_9, "pc-i440fx-2.9", NULL, + pc_i440fx_2_9_machine_options); + +static void pc_i440fx_2_8_machine_options(MachineClass *m) +{ + pc_i440fx_2_9_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_8, hw_compat_2_8_len); + compat_props_add(m->compat_props, pc_compat_2_8, pc_compat_2_8_len); +} + +DEFINE_I440FX_MACHINE(v2_8, "pc-i440fx-2.8", NULL, + pc_i440fx_2_8_machine_options); + +static void pc_i440fx_2_7_machine_options(MachineClass *m) +{ + pc_i440fx_2_8_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_7, hw_compat_2_7_len); + compat_props_add(m->compat_props, pc_compat_2_7, pc_compat_2_7_len); +} + +DEFINE_I440FX_MACHINE(v2_7, "pc-i440fx-2.7", NULL, + pc_i440fx_2_7_machine_options); + +static void pc_i440fx_2_6_machine_options(MachineClass *m) +{ + X86MachineClass *x86mc = X86_MACHINE_CLASS(m); + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_2_7_machine_options(m); + pcmc->legacy_cpu_hotplug = true; + x86mc->fwcfg_dma_enabled = false; + compat_props_add(m->compat_props, hw_compat_2_6, hw_compat_2_6_len); + compat_props_add(m->compat_props, pc_compat_2_6, pc_compat_2_6_len); +} + +DEFINE_I440FX_MACHINE(v2_6, "pc-i440fx-2.6", NULL, + pc_i440fx_2_6_machine_options); + +static void pc_i440fx_2_5_machine_options(MachineClass *m) +{ + X86MachineClass *x86mc = X86_MACHINE_CLASS(m); + + pc_i440fx_2_6_machine_options(m); + x86mc->save_tsc_khz = false; + m->legacy_fw_cfg_order = 1; + compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len); + compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len); +} + +DEFINE_I440FX_MACHINE(v2_5, "pc-i440fx-2.5", NULL, + pc_i440fx_2_5_machine_options); + +static void pc_i440fx_2_4_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_2_5_machine_options(m); + m->hw_version = "2.4.0"; + pcmc->broken_reserved_end = true; + compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len); + compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len); +} + +DEFINE_I440FX_MACHINE(v2_4, "pc-i440fx-2.4", NULL, + pc_i440fx_2_4_machine_options) + +static void pc_i440fx_2_3_machine_options(MachineClass *m) +{ + pc_i440fx_2_4_machine_options(m); + m->hw_version = "2.3.0"; + compat_props_add(m->compat_props, hw_compat_2_3, hw_compat_2_3_len); + compat_props_add(m->compat_props, pc_compat_2_3, pc_compat_2_3_len); +} + +DEFINE_I440FX_MACHINE(v2_3, "pc-i440fx-2.3", pc_compat_2_3_fn, + pc_i440fx_2_3_machine_options); + +static void pc_i440fx_2_2_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_2_3_machine_options(m); + m->hw_version = "2.2.0"; + m->default_machine_opts = "firmware=bios-256k.bin,suppress-vmdesc=on"; + compat_props_add(m->compat_props, hw_compat_2_2, hw_compat_2_2_len); + compat_props_add(m->compat_props, pc_compat_2_2, pc_compat_2_2_len); + pcmc->rsdp_in_ram = false; +} + +DEFINE_I440FX_MACHINE(v2_2, "pc-i440fx-2.2", pc_compat_2_2_fn, + pc_i440fx_2_2_machine_options); + +static void pc_i440fx_2_1_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_2_2_machine_options(m); + m->hw_version = "2.1.0"; + m->default_display = NULL; + compat_props_add(m->compat_props, hw_compat_2_1, hw_compat_2_1_len); + compat_props_add(m->compat_props, pc_compat_2_1, pc_compat_2_1_len); + pcmc->smbios_uuid_encoded = false; + pcmc->enforce_aligned_dimm = false; +} + +DEFINE_I440FX_MACHINE(v2_1, "pc-i440fx-2.1", 
pc_compat_2_1_fn, + pc_i440fx_2_1_machine_options); + +static void pc_i440fx_2_0_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_2_1_machine_options(m); + m->hw_version = "2.0.0"; + compat_props_add(m->compat_props, pc_compat_2_0, pc_compat_2_0_len); + pcmc->smbios_legacy_mode = true; + pcmc->has_reserved_memory = false; + /* This value depends on the actual DSDT and SSDT compiled into + * the source QEMU; unfortunately it depends on the binary and + * not on the machine type, so we cannot make pc-i440fx-1.7 work on + * both QEMU 1.7 and QEMU 2.0. + * + * Large variations cause migration to fail for more than one + * consecutive value of the "-smp" maxcpus option. + * + * For small variations of the kind caused by different iasl versions, + * the 4k rounding usually leaves slack. However, there could be still + * one or two values that break. For QEMU 1.7 and QEMU 2.0 the + * slack is only ~10 bytes before one "-smp maxcpus" value breaks! + * + * 6652 is valid for QEMU 2.0, the right value for pc-i440fx-1.7 on + * QEMU 1.7 it is 6414. For RHEL/CentOS 7.0 it is 6418. + */ + pcmc->legacy_acpi_table_size = 6652; + pcmc->acpi_data_size = 0x10000; +} + +DEFINE_I440FX_MACHINE(v2_0, "pc-i440fx-2.0", pc_compat_2_0_fn, + pc_i440fx_2_0_machine_options); + +static void pc_i440fx_1_7_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_2_0_machine_options(m); + m->hw_version = "1.7.0"; + m->default_machine_opts = NULL; + m->option_rom_has_mr = true; + compat_props_add(m->compat_props, pc_compat_1_7, pc_compat_1_7_len); + pcmc->smbios_defaults = false; + pcmc->gigabyte_align = false; + pcmc->legacy_acpi_table_size = 6414; +} + +DEFINE_I440FX_MACHINE(v1_7, "pc-i440fx-1.7", pc_compat_1_7_fn, + pc_i440fx_1_7_machine_options); + +static void pc_i440fx_1_6_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_1_7_machine_options(m); + m->hw_version = "1.6.0"; + m->rom_file_has_mr = false; + compat_props_add(m->compat_props, pc_compat_1_6, pc_compat_1_6_len); + pcmc->has_acpi_build = false; +} + +DEFINE_I440FX_MACHINE(v1_6, "pc-i440fx-1.6", pc_compat_1_6_fn, + pc_i440fx_1_6_machine_options); + +static void pc_i440fx_1_5_machine_options(MachineClass *m) +{ + pc_i440fx_1_6_machine_options(m); + m->hw_version = "1.5.0"; + compat_props_add(m->compat_props, pc_compat_1_5, pc_compat_1_5_len); +} + +DEFINE_I440FX_MACHINE(v1_5, "pc-i440fx-1.5", pc_compat_1_5_fn, + pc_i440fx_1_5_machine_options); + +static void pc_i440fx_1_4_machine_options(MachineClass *m) +{ + pc_i440fx_1_5_machine_options(m); + m->hw_version = "1.4.0"; + compat_props_add(m->compat_props, pc_compat_1_4, pc_compat_1_4_len); +} + +DEFINE_I440FX_MACHINE(v1_4, "pc-i440fx-1.4", pc_compat_1_4_fn, + pc_i440fx_1_4_machine_options); + +typedef struct { + uint16_t gpu_device_id; + uint16_t pch_device_id; + uint8_t pch_revision_id; +} IGDDeviceIDInfo; + +/* In real world different GPU should have different PCH. But actually + * the different PCH DIDs likely map to different PCH SKUs. We do the + * same thing for the GPU. For PCH, the different SKUs are going to be + * all the same silicon design and implementation, just different + * features turn on and off with fuses. The SW interfaces should be + * consistent across all SKUs in a given family (eg LPT). But just same + * features may not be supported. 
+ * + * Most of these different PCH features probably don't matter to the + * Gfx driver, but obviously any difference in display port connections + * will so it should be fine with any PCH in case of passthrough. + * + * So currently use one PCH version, 0x8c4e, to cover all HSW(Haswell) + * scenarios, 0x9cc3 for BDW(Broadwell). + */ +static const IGDDeviceIDInfo igd_combo_id_infos[] = { + /* HSW Classic */ + {0x0402, 0x8c4e, 0x04}, /* HSWGT1D, HSWD_w7 */ + {0x0406, 0x8c4e, 0x04}, /* HSWGT1M, HSWM_w7 */ + {0x0412, 0x8c4e, 0x04}, /* HSWGT2D, HSWD_w7 */ + {0x0416, 0x8c4e, 0x04}, /* HSWGT2M, HSWM_w7 */ + {0x041E, 0x8c4e, 0x04}, /* HSWGT15D, HSWD_w7 */ + /* HSW ULT */ + {0x0A06, 0x8c4e, 0x04}, /* HSWGT1UT, HSWM_w7 */ + {0x0A16, 0x8c4e, 0x04}, /* HSWGT2UT, HSWM_w7 */ + {0x0A26, 0x8c4e, 0x06}, /* HSWGT3UT, HSWM_w7 */ + {0x0A2E, 0x8c4e, 0x04}, /* HSWGT3UT28W, HSWM_w7 */ + {0x0A1E, 0x8c4e, 0x04}, /* HSWGT2UX, HSWM_w7 */ + {0x0A0E, 0x8c4e, 0x04}, /* HSWGT1ULX, HSWM_w7 */ + /* HSW CRW */ + {0x0D26, 0x8c4e, 0x04}, /* HSWGT3CW, HSWM_w7 */ + {0x0D22, 0x8c4e, 0x04}, /* HSWGT3CWDT, HSWD_w7 */ + /* HSW Server */ + {0x041A, 0x8c4e, 0x04}, /* HSWSVGT2, HSWD_w7 */ + /* HSW SRVR */ + {0x040A, 0x8c4e, 0x04}, /* HSWSVGT1, HSWD_w7 */ + /* BSW */ + {0x1606, 0x9cc3, 0x03}, /* BDWULTGT1, BDWM_w7 */ + {0x1616, 0x9cc3, 0x03}, /* BDWULTGT2, BDWM_w7 */ + {0x1626, 0x9cc3, 0x03}, /* BDWULTGT3, BDWM_w7 */ + {0x160E, 0x9cc3, 0x03}, /* BDWULXGT1, BDWM_w7 */ + {0x161E, 0x9cc3, 0x03}, /* BDWULXGT2, BDWM_w7 */ + {0x1602, 0x9cc3, 0x03}, /* BDWHALOGT1, BDWM_w7 */ + {0x1612, 0x9cc3, 0x03}, /* BDWHALOGT2, BDWM_w7 */ + {0x1622, 0x9cc3, 0x03}, /* BDWHALOGT3, BDWM_w7 */ + {0x162B, 0x9cc3, 0x03}, /* BDWHALO28W, BDWM_w7 */ + {0x162A, 0x9cc3, 0x03}, /* BDWGT3WRKS, BDWM_w7 */ + {0x162D, 0x9cc3, 0x03}, /* BDWGT3SRVR, BDWM_w7 */ +}; + +static void isa_bridge_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + dc->desc = "ISA bridge faked to support IGD PT"; + set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); + k->vendor_id = PCI_VENDOR_ID_INTEL; + k->class_id = PCI_CLASS_BRIDGE_ISA; +}; + +static TypeInfo isa_bridge_info = { + .name = "igd-passthrough-isa-bridge", + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(PCIDevice), + .class_init = isa_bridge_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, + }, +}; + +static void pt_graphics_register_types(void) +{ + type_register_static(&isa_bridge_info); +} +type_init(pt_graphics_register_types) + +void igd_passthrough_isa_bridge_create(PCIBus *bus, uint16_t gpu_dev_id) +{ + struct PCIDevice *bridge_dev; + int i, num; + uint16_t pch_dev_id = 0xffff; + uint8_t pch_rev_id = 0; + + num = ARRAY_SIZE(igd_combo_id_infos); + for (i = 0; i < num; i++) { + if (gpu_dev_id == igd_combo_id_infos[i].gpu_device_id) { + pch_dev_id = igd_combo_id_infos[i].pch_device_id; + pch_rev_id = igd_combo_id_infos[i].pch_revision_id; + } + } + + if (pch_dev_id == 0xffff) { + return; + } + + /* Currently IGD drivers always need to access PCH by 1f.0. */ + bridge_dev = pci_create_simple(bus, PCI_DEVFN(0x1f, 0), + "igd-passthrough-isa-bridge"); + + /* + * Note that vendor id is always PCI_VENDOR_ID_INTEL. 
+ */ + if (!bridge_dev) { + fprintf(stderr, "set igd-passthrough-isa-bridge failed!\n"); + return; + } + pci_config_set_device_id(bridge_dev->config, pch_dev_id); + pci_config_set_revision(bridge_dev->config, pch_rev_id); +} + +static void isapc_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + m->desc = "ISA-only PC"; + m->max_cpus = 1; + m->option_rom_has_mr = true; + m->rom_file_has_mr = false; + pcmc->pci_enabled = false; + pcmc->has_acpi_build = false; + pcmc->smbios_defaults = false; + pcmc->gigabyte_align = false; + pcmc->smbios_legacy_mode = true; + pcmc->has_reserved_memory = false; + pcmc->default_nic_model = "ne2k_isa"; + m->default_cpu_type = X86_CPU_TYPE_NAME("486"); +} + +DEFINE_PC_MACHINE(isapc, "isapc", pc_init_isa, + isapc_machine_options); + + +#ifdef CONFIG_XEN +static void xenfv_4_2_machine_options(MachineClass *m) +{ + pc_i440fx_4_2_machine_options(m); + m->desc = "Xen Fully-virtualized PC"; + m->max_cpus = HVM_MAX_VCPUS; + m->default_machine_opts = "accel=xen,suppress-vmdesc=on"; +} + +DEFINE_PC_MACHINE(xenfv_4_2, "xenfv-4.2", pc_xen_hvm_init, + xenfv_4_2_machine_options); + +static void xenfv_3_1_machine_options(MachineClass *m) +{ + pc_i440fx_3_1_machine_options(m); + m->desc = "Xen Fully-virtualized PC"; + m->alias = "xenfv"; + m->max_cpus = HVM_MAX_VCPUS; + m->default_machine_opts = "accel=xen,suppress-vmdesc=on"; +} + +DEFINE_PC_MACHINE(xenfv, "xenfv-3.1", pc_xen_hvm_init, + xenfv_3_1_machine_options); +#endif diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c new file mode 100644 index 000000000..e1e100316 --- /dev/null +++ b/hw/i386/pc_q35.c @@ -0,0 +1,620 @@ +/* + * Q35 chipset based pc system emulator + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2009, 2010 + * Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * Copyright (C) 2012 Jason Baron <jbaron@redhat.com> + * + * This is based on pc.c, but heavily modified. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "hw/loader.h" +#include "hw/i2c/smbus_eeprom.h" +#include "hw/rtc/mc146818rtc.h" +#include "sysemu/kvm.h" +#include "hw/kvm/clock.h" +#include "hw/pci-host/q35.h" +#include "hw/pci/pcie_port.h" +#include "hw/qdev-properties.h" +#include "hw/i386/x86.h" +#include "hw/i386/pc.h" +#include "hw/i386/ich9.h" +#include "hw/i386/amd_iommu.h" +#include "hw/i386/intel_iommu.h" +#include "hw/display/ramfb.h" +#include "hw/firmware/smbios.h" +#include "hw/ide/pci.h" +#include "hw/ide/ahci.h" +#include "hw/usb.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "sysemu/numa.h" +#include "hw/hyperv/vmbus-bridge.h" +#include "hw/mem/nvdimm.h" +#include "hw/i386/acpi-build.h" + +/* ICH9 AHCI has 6 ports */ +#define MAX_SATA_PORTS 6 + +struct ehci_companions { + const char *name; + int func; + int port; +}; + +static const struct ehci_companions ich9_1d[] = { + { .name = "ich9-usb-uhci1", .func = 0, .port = 0 }, + { .name = "ich9-usb-uhci2", .func = 1, .port = 2 }, + { .name = "ich9-usb-uhci3", .func = 2, .port = 4 }, +}; + +static const struct ehci_companions ich9_1a[] = { + { .name = "ich9-usb-uhci4", .func = 0, .port = 0 }, + { .name = "ich9-usb-uhci5", .func = 1, .port = 2 }, + { .name = "ich9-usb-uhci6", .func = 2, .port = 4 }, +}; + +static int ehci_create_ich9_with_companions(PCIBus *bus, int slot) +{ + const struct ehci_companions *comp; + PCIDevice *ehci, *uhci; + BusState *usbbus; + const char *name; + int i; + + switch (slot) { + case 0x1d: + name = "ich9-usb-ehci1"; + comp = ich9_1d; + break; + case 0x1a: + name = "ich9-usb-ehci2"; + comp = ich9_1a; + break; + default: + return -1; + } + + ehci = pci_new_multifunction(PCI_DEVFN(slot, 7), true, name); + pci_realize_and_unref(ehci, bus, &error_fatal); + usbbus = QLIST_FIRST(&ehci->qdev.child_bus); + + for (i = 0; i < 3; i++) { + uhci = pci_new_multifunction(PCI_DEVFN(slot, comp[i].func), true, + comp[i].name); + qdev_prop_set_string(&uhci->qdev, "masterbus", usbbus->name); + qdev_prop_set_uint32(&uhci->qdev, "firstport", comp[i].port); + pci_realize_and_unref(uhci, bus, &error_fatal); + } + return 0; +} + +/* PC hardware initialisation */ +static void pc_q35_init(MachineState *machine) +{ + PCMachineState *pcms = PC_MACHINE(machine); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(machine); + Q35PCIHost *q35_host; + PCIHostState *phb; + PCIBus *host_bus; + PCIDevice *lpc; + DeviceState *lpc_dev; + BusState *idebus[MAX_SATA_PORTS]; + ISADevice *rtc_state; + MemoryRegion *system_io = get_system_io(); + MemoryRegion *pci_memory; + MemoryRegion *rom_memory; + MemoryRegion *ram_memory; + GSIState *gsi_state; + ISABus *isa_bus; + int i; + ICH9LPCState *ich9_lpc; + PCIDevice *ahci; + ram_addr_t lowmem; + DriveInfo *hd[MAX_SATA_PORTS]; + MachineClass *mc = MACHINE_GET_CLASS(machine); + bool acpi_pcihp; + bool keep_pci_slot_hpc; + + /* Check whether RAM fits below 4G (leaving 1/2 GByte for IO memory + * and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping + * also known as MMCFG). + * If it doesn't, we need to split it in chunks below and above 4G. + * In any case, try to make sure that guest addresses aligned at + * 1G boundaries get mapped to host addresses aligned at 1G boundaries. + */ + if (machine->ram_size >= 0xb0000000) { + lowmem = 0x80000000; + } else { + lowmem = 0xb0000000; + } + + /* Handle the machine opt max-ram-below-4g. It is basically doing + * min(qemu limit, user limit). 
+ */ + if (!pcms->max_ram_below_4g) { + pcms->max_ram_below_4g = 4 * GiB; + } + if (lowmem > pcms->max_ram_below_4g) { + lowmem = pcms->max_ram_below_4g; + if (machine->ram_size - lowmem > lowmem && + lowmem & (1 * GiB - 1)) { + warn_report("There is possibly poor performance as the ram size " + " (0x%" PRIx64 ") is more then twice the size of" + " max-ram-below-4g (%"PRIu64") and" + " max-ram-below-4g is not a multiple of 1G.", + (uint64_t)machine->ram_size, pcms->max_ram_below_4g); + } + } + + if (machine->ram_size >= lowmem) { + x86ms->above_4g_mem_size = machine->ram_size - lowmem; + x86ms->below_4g_mem_size = lowmem; + } else { + x86ms->above_4g_mem_size = 0; + x86ms->below_4g_mem_size = machine->ram_size; + } + + pc_machine_init_sgx_epc(pcms); + x86_cpus_init(x86ms, pcmc->default_cpu_version); + + kvmclock_create(pcmc->kvmclock_create_always); + + /* pci enabled */ + if (pcmc->pci_enabled) { + pci_memory = g_new(MemoryRegion, 1); + memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); + rom_memory = pci_memory; + } else { + pci_memory = NULL; + rom_memory = get_system_memory(); + } + + pc_guest_info_init(pcms); + + if (pcmc->smbios_defaults) { + /* These values are guest ABI, do not change */ + smbios_set_defaults("QEMU", "Standard PC (Q35 + ICH9, 2009)", + mc->name, pcmc->smbios_legacy_mode, + pcmc->smbios_uuid_encoded, + SMBIOS_ENTRY_POINT_21); + } + + /* allocate ram and load rom/bios */ + pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory); + + /* create pci host bus */ + q35_host = Q35_HOST_DEVICE(qdev_new(TYPE_Q35_HOST_DEVICE)); + + object_property_add_child(qdev_get_machine(), "q35", OBJECT(q35_host)); + object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_RAM_MEM, + OBJECT(ram_memory), NULL); + object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_PCI_MEM, + OBJECT(pci_memory), NULL); + object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_SYSTEM_MEM, + OBJECT(get_system_memory()), NULL); + object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_IO_MEM, + OBJECT(system_io), NULL); + object_property_set_int(OBJECT(q35_host), PCI_HOST_BELOW_4G_MEM_SIZE, + x86ms->below_4g_mem_size, NULL); + object_property_set_int(OBJECT(q35_host), PCI_HOST_ABOVE_4G_MEM_SIZE, + x86ms->above_4g_mem_size, NULL); + /* pci */ + sysbus_realize_and_unref(SYS_BUS_DEVICE(q35_host), &error_fatal); + phb = PCI_HOST_BRIDGE(q35_host); + host_bus = phb->bus; + /* create ISA bus */ + lpc = pci_create_simple_multifunction(host_bus, PCI_DEVFN(ICH9_LPC_DEV, + ICH9_LPC_FUNC), true, + TYPE_ICH9_LPC_DEVICE); + + object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, + TYPE_HOTPLUG_HANDLER, + (Object **)&x86ms->acpi_dev, + object_property_allow_set_link, + OBJ_PROP_LINK_STRONG); + object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, + OBJECT(lpc), &error_abort); + + acpi_pcihp = object_property_get_bool(OBJECT(lpc), + ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, + NULL); + + keep_pci_slot_hpc = object_property_get_bool(OBJECT(lpc), + "x-keep-pci-slot-hpc", + NULL); + + if (!keep_pci_slot_hpc && acpi_pcihp) { + object_register_sugar_prop(TYPE_PCIE_SLOT, "x-native-hotplug", + "false", true); + } + + /* irq lines */ + gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled); + + ich9_lpc = ICH9_LPC_DEVICE(lpc); + lpc_dev = DEVICE(lpc); + for (i = 0; i < GSI_NUM_PINS; i++) { + qdev_connect_gpio_out_named(lpc_dev, ICH9_GPIO_GSI, i, x86ms->gsi[i]); + } + pci_bus_irqs(host_bus, ich9_lpc_set_irq, ich9_lpc_map_irq, ich9_lpc, + ICH9_LPC_NB_PIRQS); + 
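+    /*
+     * Sketch of the IRQ plumbing set up here (callback shapes assumed from
+     * the generic pci_map_irq_fn / pci_set_irq_fn typedefs):
+     *
+     *   int  ich9_lpc_map_irq(PCIDevice *pci_dev, int pin);  maps a device
+     *        INTx pin to one of the ICH9_LPC_NB_PIRQS PIRQ lines
+     *   void ich9_lpc_set_irq(void *opaque, int pirq, int level);  drives
+     *        the selected PIRQ line
+     *
+     * The route function installed below lets callers (e.g. KVM INTx
+     * routing) ask which GSI a given INTx pin ultimately reaches.
+     */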
pci_bus_set_route_irq_fn(host_bus, ich9_route_intx_pin_to_irq); + isa_bus = ich9_lpc->isa_bus; + + pc_i8259_create(isa_bus, gsi_state->i8259_irq); + + if (pcmc->pci_enabled) { + ioapic_init_gsi(gsi_state, "q35"); + } + + if (tcg_enabled()) { + x86_register_ferr_irq(x86ms->gsi[13]); + } + + assert(pcms->vmport != ON_OFF_AUTO__MAX); + if (pcms->vmport == ON_OFF_AUTO_AUTO) { + pcms->vmport = ON_OFF_AUTO_ON; + } + + /* init basic PC hardware */ + pc_basic_device_init(pcms, isa_bus, x86ms->gsi, &rtc_state, !mc->no_floppy, + 0xff0104); + + /* connect pm stuff to lpc */ + ich9_lpc_pm_init(lpc, x86_machine_is_smm_enabled(x86ms)); + + if (pcms->sata_enabled) { + /* ahci and SATA device, for q35 1 ahci controller is built-in */ + ahci = pci_create_simple_multifunction(host_bus, + PCI_DEVFN(ICH9_SATA1_DEV, + ICH9_SATA1_FUNC), + true, "ich9-ahci"); + idebus[0] = qdev_get_child_bus(&ahci->qdev, "ide.0"); + idebus[1] = qdev_get_child_bus(&ahci->qdev, "ide.1"); + g_assert(MAX_SATA_PORTS == ahci_get_num_ports(ahci)); + ide_drive_get(hd, ahci_get_num_ports(ahci)); + ahci_ide_create_devs(ahci, hd); + } else { + idebus[0] = idebus[1] = NULL; + } + + if (machine_usb(machine)) { + /* Should we create 6 UHCI according to ich9 spec? */ + ehci_create_ich9_with_companions(host_bus, 0x1d); + } + + if (pcms->smbus_enabled) { + /* TODO: Populate SPD eeprom data. */ + pcms->smbus = ich9_smb_init(host_bus, + PCI_DEVFN(ICH9_SMB_DEV, ICH9_SMB_FUNC), + 0xb100); + smbus_eeprom_init(pcms->smbus, 8, NULL, 0); + } + + pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state); + + /* the rest devices to which pci devfn is automatically assigned */ + pc_vga_init(isa_bus, host_bus); + pc_nic_init(pcmc, isa_bus, host_bus); + + if (machine->nvdimms_state->is_enabled) { + nvdimm_init_acpi_state(machine->nvdimms_state, system_io, + x86_nvdimm_acpi_dsmio, + x86ms->fw_cfg, OBJECT(pcms)); + } +} + +#define DEFINE_Q35_MACHINE(suffix, name, compatfn, optionfn) \ + static void pc_init_##suffix(MachineState *machine) \ + { \ + void (*compat)(MachineState *m) = (compatfn); \ + if (compat) { \ + compat(machine); \ + } \ + pc_q35_init(machine); \ + } \ + DEFINE_PC_MACHINE(suffix, name, pc_init_##suffix, optionfn) + + +static void pc_q35_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pcmc->default_nic_model = "e1000e"; + pcmc->pci_root_uid = 0; + + m->family = "pc_q35"; + m->desc = "Standard PC (Q35 + ICH9, 2009)"; + m->units_per_default_bus = 1; + m->default_machine_opts = "firmware=bios-256k.bin"; + m->default_display = "std"; + m->default_kernel_irqchip_split = false; + m->no_floppy = 1; + machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); + m->max_cpus = 288; +} + +static void pc_q35_6_2_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_q35_machine_options(m); + m->alias = "q35"; + pcmc->default_cpu_version = 1; +} + +DEFINE_Q35_MACHINE(v6_2, "pc-q35-6.2", NULL, + pc_q35_6_2_machine_options); + +static void pc_q35_6_1_machine_options(MachineClass *m) +{ + pc_q35_6_2_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_6_1, hw_compat_6_1_len); + compat_props_add(m->compat_props, pc_compat_6_1, pc_compat_6_1_len); + m->smp_props.prefer_sockets = true; +} + +DEFINE_Q35_MACHINE(v6_1, "pc-q35-6.1", NULL, + pc_q35_6_1_machine_options); + 
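+/*
+ * The versioned pc-q35 machine types follow one pattern: each
+ * pc_q35_X_Y_machine_options() first calls the next newer options function,
+ * then stacks extra hw_compat_X_Y and pc_compat_X_Y properties on top, so an
+ * older machine type accumulates every compatibility tweak added since its
+ * release.  A compat table is just an array of GlobalProperty overrides; a
+ * hypothetical entry (names invented purely for illustration) would look
+ * like:
+ *
+ *   static GlobalProperty pc_compat_example[] = {
+ *       { "example-device", "example-property", "off" },
+ *   };
+ *   compat_props_add(m->compat_props, pc_compat_example,
+ *                    ARRAY_SIZE(pc_compat_example));
+ */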
+static void pc_q35_6_0_machine_options(MachineClass *m) +{ + pc_q35_6_1_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_6_0, hw_compat_6_0_len); + compat_props_add(m->compat_props, pc_compat_6_0, pc_compat_6_0_len); +} + +DEFINE_Q35_MACHINE(v6_0, "pc-q35-6.0", NULL, + pc_q35_6_0_machine_options); + +static void pc_q35_5_2_machine_options(MachineClass *m) +{ + pc_q35_6_0_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_5_2, hw_compat_5_2_len); + compat_props_add(m->compat_props, pc_compat_5_2, pc_compat_5_2_len); +} + +DEFINE_Q35_MACHINE(v5_2, "pc-q35-5.2", NULL, + pc_q35_5_2_machine_options); + +static void pc_q35_5_1_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_q35_5_2_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_5_1, hw_compat_5_1_len); + compat_props_add(m->compat_props, pc_compat_5_1, pc_compat_5_1_len); + pcmc->kvmclock_create_always = false; + pcmc->pci_root_uid = 1; +} + +DEFINE_Q35_MACHINE(v5_1, "pc-q35-5.1", NULL, + pc_q35_5_1_machine_options); + +static void pc_q35_5_0_machine_options(MachineClass *m) +{ + pc_q35_5_1_machine_options(m); + m->alias = NULL; + m->numa_mem_supported = true; + compat_props_add(m->compat_props, hw_compat_5_0, hw_compat_5_0_len); + compat_props_add(m->compat_props, pc_compat_5_0, pc_compat_5_0_len); + m->auto_enable_numa_with_memdev = false; +} + +DEFINE_Q35_MACHINE(v5_0, "pc-q35-5.0", NULL, + pc_q35_5_0_machine_options); + +static void pc_q35_4_2_machine_options(MachineClass *m) +{ + pc_q35_5_0_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_4_2, hw_compat_4_2_len); + compat_props_add(m->compat_props, pc_compat_4_2, pc_compat_4_2_len); +} + +DEFINE_Q35_MACHINE(v4_2, "pc-q35-4.2", NULL, + pc_q35_4_2_machine_options); + +static void pc_q35_4_1_machine_options(MachineClass *m) +{ + pc_q35_4_2_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_4_1, hw_compat_4_1_len); + compat_props_add(m->compat_props, pc_compat_4_1, pc_compat_4_1_len); +} + +DEFINE_Q35_MACHINE(v4_1, "pc-q35-4.1", NULL, + pc_q35_4_1_machine_options); + +static void pc_q35_4_0_1_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_q35_4_1_machine_options(m); + m->alias = NULL; + pcmc->default_cpu_version = CPU_VERSION_LEGACY; + /* + * This is the default machine for the 4.0-stable branch. It is basically + * a 4.0 that doesn't use split irqchip by default. It MUST hence apply the + * 4.0 compat props. 
+ */ + compat_props_add(m->compat_props, hw_compat_4_0, hw_compat_4_0_len); + compat_props_add(m->compat_props, pc_compat_4_0, pc_compat_4_0_len); +} + +DEFINE_Q35_MACHINE(v4_0_1, "pc-q35-4.0.1", NULL, + pc_q35_4_0_1_machine_options); + +static void pc_q35_4_0_machine_options(MachineClass *m) +{ + pc_q35_4_0_1_machine_options(m); + m->default_kernel_irqchip_split = true; + m->alias = NULL; + /* Compat props are applied by the 4.0.1 machine */ +} + +DEFINE_Q35_MACHINE(v4_0, "pc-q35-4.0", NULL, + pc_q35_4_0_machine_options); + +static void pc_q35_3_1_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_q35_4_0_machine_options(m); + m->default_kernel_irqchip_split = false; + pcmc->do_not_add_smb_acpi = true; + m->smbus_no_migration_support = true; + m->alias = NULL; + pcmc->pvh_enabled = false; + compat_props_add(m->compat_props, hw_compat_3_1, hw_compat_3_1_len); + compat_props_add(m->compat_props, pc_compat_3_1, pc_compat_3_1_len); +} + +DEFINE_Q35_MACHINE(v3_1, "pc-q35-3.1", NULL, + pc_q35_3_1_machine_options); + +static void pc_q35_3_0_machine_options(MachineClass *m) +{ + pc_q35_3_1_machine_options(m); + compat_props_add(m->compat_props, hw_compat_3_0, hw_compat_3_0_len); + compat_props_add(m->compat_props, pc_compat_3_0, pc_compat_3_0_len); +} + +DEFINE_Q35_MACHINE(v3_0, "pc-q35-3.0", NULL, + pc_q35_3_0_machine_options); + +static void pc_q35_2_12_machine_options(MachineClass *m) +{ + pc_q35_3_0_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_12, hw_compat_2_12_len); + compat_props_add(m->compat_props, pc_compat_2_12, pc_compat_2_12_len); +} + +DEFINE_Q35_MACHINE(v2_12, "pc-q35-2.12", NULL, + pc_q35_2_12_machine_options); + +static void pc_q35_2_11_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_q35_2_12_machine_options(m); + pcmc->default_nic_model = "e1000"; + compat_props_add(m->compat_props, hw_compat_2_11, hw_compat_2_11_len); + compat_props_add(m->compat_props, pc_compat_2_11, pc_compat_2_11_len); +} + +DEFINE_Q35_MACHINE(v2_11, "pc-q35-2.11", NULL, + pc_q35_2_11_machine_options); + +static void pc_q35_2_10_machine_options(MachineClass *m) +{ + pc_q35_2_11_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_10, hw_compat_2_10_len); + compat_props_add(m->compat_props, pc_compat_2_10, pc_compat_2_10_len); + m->auto_enable_numa_with_memhp = false; +} + +DEFINE_Q35_MACHINE(v2_10, "pc-q35-2.10", NULL, + pc_q35_2_10_machine_options); + +static void pc_q35_2_9_machine_options(MachineClass *m) +{ + pc_q35_2_10_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_9, hw_compat_2_9_len); + compat_props_add(m->compat_props, pc_compat_2_9, pc_compat_2_9_len); +} + +DEFINE_Q35_MACHINE(v2_9, "pc-q35-2.9", NULL, + pc_q35_2_9_machine_options); + +static void pc_q35_2_8_machine_options(MachineClass *m) +{ + pc_q35_2_9_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_8, hw_compat_2_8_len); + compat_props_add(m->compat_props, pc_compat_2_8, pc_compat_2_8_len); +} + +DEFINE_Q35_MACHINE(v2_8, "pc-q35-2.8", NULL, + pc_q35_2_8_machine_options); + +static void pc_q35_2_7_machine_options(MachineClass *m) +{ + pc_q35_2_8_machine_options(m); + m->max_cpus = 255; + compat_props_add(m->compat_props, hw_compat_2_7, hw_compat_2_7_len); + compat_props_add(m->compat_props, pc_compat_2_7, pc_compat_2_7_len); +} + +DEFINE_Q35_MACHINE(v2_7, "pc-q35-2.7", NULL, + pc_q35_2_7_machine_options); + +static void pc_q35_2_6_machine_options(MachineClass *m) +{ + X86MachineClass 
*x86mc = X86_MACHINE_CLASS(m); + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_q35_2_7_machine_options(m); + pcmc->legacy_cpu_hotplug = true; + x86mc->fwcfg_dma_enabled = false; + compat_props_add(m->compat_props, hw_compat_2_6, hw_compat_2_6_len); + compat_props_add(m->compat_props, pc_compat_2_6, pc_compat_2_6_len); +} + +DEFINE_Q35_MACHINE(v2_6, "pc-q35-2.6", NULL, + pc_q35_2_6_machine_options); + +static void pc_q35_2_5_machine_options(MachineClass *m) +{ + X86MachineClass *x86mc = X86_MACHINE_CLASS(m); + + pc_q35_2_6_machine_options(m); + x86mc->save_tsc_khz = false; + m->legacy_fw_cfg_order = 1; + compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len); + compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len); +} + +DEFINE_Q35_MACHINE(v2_5, "pc-q35-2.5", NULL, + pc_q35_2_5_machine_options); + +static void pc_q35_2_4_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_q35_2_5_machine_options(m); + m->hw_version = "2.4.0"; + pcmc->broken_reserved_end = true; + compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len); + compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len); +} + +DEFINE_Q35_MACHINE(v2_4, "pc-q35-2.4", NULL, + pc_q35_2_4_machine_options); diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c new file mode 100644 index 000000000..c8b17af95 --- /dev/null +++ b/hw/i386/pc_sysfw.c @@ -0,0 +1,262 @@ +/* + * QEMU PC System Firmware + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2011-2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qapi/error.h" +#include "sysemu/block-backend.h" +#include "qemu/error-report.h" +#include "qemu/option.h" +#include "qemu/units.h" +#include "hw/sysbus.h" +#include "hw/i386/x86.h" +#include "hw/i386/pc.h" +#include "hw/loader.h" +#include "hw/qdev-properties.h" +#include "hw/block/flash.h" +#include "sysemu/kvm.h" +#include "sev.h" + +#define FLASH_SECTOR_SIZE 4096 + +static void pc_isa_bios_init(MemoryRegion *rom_memory, + MemoryRegion *flash_mem, + int ram_size) +{ + int isa_bios_size; + MemoryRegion *isa_bios; + uint64_t flash_size; + void *flash_ptr, *isa_bios_ptr; + + flash_size = memory_region_size(flash_mem); + + /* map the last 128KB of the BIOS in ISA space */ + isa_bios_size = MIN(flash_size, 128 * KiB); + isa_bios = g_malloc(sizeof(*isa_bios)); + memory_region_init_ram(isa_bios, NULL, "isa-bios", isa_bios_size, + &error_fatal); + memory_region_add_subregion_overlap(rom_memory, + 0x100000 - isa_bios_size, + isa_bios, + 1); + + /* copy ISA rom image from top of flash memory */ + flash_ptr = memory_region_get_ram_ptr(flash_mem); + isa_bios_ptr = memory_region_get_ram_ptr(isa_bios); + memcpy(isa_bios_ptr, + ((uint8_t*)flash_ptr) + (flash_size - isa_bios_size), + isa_bios_size); + + memory_region_set_readonly(isa_bios, true); +} + +static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms, + const char *name, + const char *alias_prop_name) +{ + DeviceState *dev = qdev_new(TYPE_PFLASH_CFI01); + + qdev_prop_set_uint64(dev, "sector-length", FLASH_SECTOR_SIZE); + qdev_prop_set_uint8(dev, "width", 1); + qdev_prop_set_string(dev, "name", name); + object_property_add_child(OBJECT(pcms), name, OBJECT(dev)); + object_property_add_alias(OBJECT(pcms), alias_prop_name, + OBJECT(dev), "drive"); + /* + * The returned reference is tied to the child property and + * will be removed with object_unparent. + */ + object_unref(OBJECT(dev)); + return PFLASH_CFI01(dev); +} + +void pc_system_flash_create(PCMachineState *pcms) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + + if (pcmc->pci_enabled) { + pcms->flash[0] = pc_pflash_create(pcms, "system.flash0", + "pflash0"); + pcms->flash[1] = pc_pflash_create(pcms, "system.flash1", + "pflash1"); + } +} + +void pc_system_flash_cleanup_unused(PCMachineState *pcms) +{ + char *prop_name; + int i; + Object *dev_obj; + + assert(PC_MACHINE_GET_CLASS(pcms)->pci_enabled); + + for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { + dev_obj = OBJECT(pcms->flash[i]); + if (!object_property_get_bool(dev_obj, "realized", &error_abort)) { + prop_name = g_strdup_printf("pflash%d", i); + object_property_del(OBJECT(pcms), prop_name); + g_free(prop_name); + object_unparent(dev_obj); + pcms->flash[i] = NULL; + } + } +} + +/* + * Map the pcms->flash[] from 4GiB downward, and realize. + * Map them in descending order, i.e. pcms->flash[0] at the top, + * without gaps. + * Stop at the first pcms->flash[0] lacking a block backend. + * Set each flash's size from its block backend. Fatal error if the + * size isn't a non-zero multiple of 4KiB, or the total size exceeds + * pcms->max_fw_size. + * + * If pcms->flash[0] has a block backend, its memory is passed to + * pc_isa_bios_init(). Merging several flash devices for isa-bios is + * not supported. 
+ */ +static void pc_system_flash_map(PCMachineState *pcms, + MemoryRegion *rom_memory) +{ + hwaddr total_size = 0; + int i; + BlockBackend *blk; + int64_t size; + PFlashCFI01 *system_flash; + MemoryRegion *flash_mem; + void *flash_ptr; + int flash_size; + int ret; + + assert(PC_MACHINE_GET_CLASS(pcms)->pci_enabled); + + for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { + system_flash = pcms->flash[i]; + blk = pflash_cfi01_get_blk(system_flash); + if (!blk) { + break; + } + size = blk_getlength(blk); + if (size < 0) { + error_report("can't get size of block device %s: %s", + blk_name(blk), strerror(-size)); + exit(1); + } + if (size == 0 || !QEMU_IS_ALIGNED(size, FLASH_SECTOR_SIZE)) { + error_report("system firmware block device %s has invalid size " + "%" PRId64, + blk_name(blk), size); + info_report("its size must be a non-zero multiple of 0x%x", + FLASH_SECTOR_SIZE); + exit(1); + } + if ((hwaddr)size != size + || total_size > HWADDR_MAX - size + || total_size + size > pcms->max_fw_size) { + error_report("combined size of system firmware exceeds " + "%" PRIu64 " bytes", + pcms->max_fw_size); + exit(1); + } + + total_size += size; + qdev_prop_set_uint32(DEVICE(system_flash), "num-blocks", + size / FLASH_SECTOR_SIZE); + sysbus_realize_and_unref(SYS_BUS_DEVICE(system_flash), &error_fatal); + sysbus_mmio_map(SYS_BUS_DEVICE(system_flash), 0, + 0x100000000ULL - total_size); + + if (i == 0) { + flash_mem = pflash_cfi01_get_memory(system_flash); + pc_isa_bios_init(rom_memory, flash_mem, size); + + /* Encrypt the pflash boot ROM */ + if (sev_enabled()) { + flash_ptr = memory_region_get_ram_ptr(flash_mem); + flash_size = memory_region_size(flash_mem); + /* + * OVMF places a GUIDed structures in the flash, so + * search for them + */ + pc_system_parse_ovmf_flash(flash_ptr, flash_size); + + ret = sev_es_save_reset_vector(flash_ptr, flash_size); + if (ret) { + error_report("failed to locate and/or save reset vector"); + exit(1); + } + + sev_encrypt_flash(flash_ptr, flash_size, &error_fatal); + } + } + } +} + +void pc_system_firmware_init(PCMachineState *pcms, + MemoryRegion *rom_memory) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + int i; + BlockBackend *pflash_blk[ARRAY_SIZE(pcms->flash)]; + + if (!pcmc->pci_enabled) { + x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, true); + return; + } + + /* Map legacy -drive if=pflash to machine properties */ + for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { + pflash_cfi01_legacy_drive(pcms->flash[i], + drive_get(IF_PFLASH, 0, i)); + pflash_blk[i] = pflash_cfi01_get_blk(pcms->flash[i]); + } + + /* Reject gaps */ + for (i = 1; i < ARRAY_SIZE(pcms->flash); i++) { + if (pflash_blk[i] && !pflash_blk[i - 1]) { + error_report("pflash%d requires pflash%d", i, i - 1); + exit(1); + } + } + + if (!pflash_blk[0]) { + /* Machine property pflash0 not set, use ROM mode */ + x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, false); + } else { + if (kvm_enabled() && !kvm_readonly_mem_enabled()) { + /* + * Older KVM cannot execute from device memory. So, flash + * memory cannot be used unless the readonly memory kvm + * capability is present. 
+ */ + error_report("pflash with kvm requires KVM readonly memory support"); + exit(1); + } + + pc_system_flash_map(pcms, rom_memory); + } + + pc_system_flash_cleanup_unused(pcms); +} diff --git a/hw/i386/pc_sysfw_ovmf-stubs.c b/hw/i386/pc_sysfw_ovmf-stubs.c new file mode 100644 index 000000000..aabe78b27 --- /dev/null +++ b/hw/i386/pc_sysfw_ovmf-stubs.c @@ -0,0 +1,26 @@ +/* + * QEMU PC System Firmware (OVMF stubs) + * + * Copyright (c) 2021 Red Hat, Inc. + * + * Author: + * Philippe Mathieu-Daudé <philmd@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "hw/i386/pc.h" + +bool pc_system_ovmf_table_find(const char *entry, uint8_t **data, int *data_len) +{ + g_assert_not_reached(); +} + +void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size) +{ + g_assert_not_reached(); +} diff --git a/hw/i386/pc_sysfw_ovmf.c b/hw/i386/pc_sysfw_ovmf.c new file mode 100644 index 000000000..f4dd92c58 --- /dev/null +++ b/hw/i386/pc_sysfw_ovmf.c @@ -0,0 +1,151 @@ +/* + * QEMU PC System Firmware (OVMF specific) + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2011-2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "hw/i386/pc.h" +#include "cpu.h" + +#define OVMF_TABLE_FOOTER_GUID "96b582de-1fb2-45f7-baea-a366c55a082d" + +static bool ovmf_flash_parsed; +static uint8_t *ovmf_table; +static int ovmf_table_len; + +void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size) +{ + uint8_t *ptr; + QemuUUID guid; + int tot_len; + + /* should only be called once */ + if (ovmf_flash_parsed) { + return; + } + + ovmf_flash_parsed = true; + + if (flash_size < TARGET_PAGE_SIZE) { + return; + } + + /* + * if this is OVMF there will be a table footer + * guid 48 bytes before the end of the flash file. If it's + * not found, silently abort the flash parsing. 
+ */ + qemu_uuid_parse(OVMF_TABLE_FOOTER_GUID, &guid); + guid = qemu_uuid_bswap(guid); /* guids are LE */ + ptr = flash_ptr + flash_size - 48; + if (!qemu_uuid_is_equal((QemuUUID *)ptr, &guid)) { + return; + } + + /* if found, just before is two byte table length */ + ptr -= sizeof(uint16_t); + tot_len = le16_to_cpu(*(uint16_t *)ptr) - sizeof(guid) - sizeof(uint16_t); + + if (tot_len <= 0) { + return; + } + + ovmf_table = g_malloc(tot_len); + ovmf_table_len = tot_len; + + /* + * ptr is the foot of the table, so copy it all to the newly + * allocated ovmf_table and then set the ovmf_table pointer + * to the table foot + */ + memcpy(ovmf_table, ptr - tot_len, tot_len); + ovmf_table += tot_len; +} + +/** + * pc_system_ovmf_table_find - Find the data associated with an entry in OVMF's + * reset vector GUIDed table. + * + * @entry: GUID string of the entry to lookup + * @data: Filled with a pointer to the entry's value (if not NULL) + * @data_len: Filled with the length of the entry's value (if not NULL). Pass + * NULL here if the length of data is known. + * + * Return: true if the entry was found in the OVMF table; false otherwise. + */ +bool pc_system_ovmf_table_find(const char *entry, uint8_t **data, + int *data_len) +{ + uint8_t *ptr = ovmf_table; + int tot_len = ovmf_table_len; + QemuUUID entry_guid; + + assert(ovmf_flash_parsed); + + if (qemu_uuid_parse(entry, &entry_guid) < 0) { + return false; + } + + if (!ptr) { + return false; + } + + entry_guid = qemu_uuid_bswap(entry_guid); /* guids are LE */ + while (tot_len >= sizeof(QemuUUID) + sizeof(uint16_t)) { + int len; + QemuUUID *guid; + + /* + * The data structure is + * arbitrary length data + * 2 byte length of entire entry + * 16 byte guid + */ + guid = (QemuUUID *)(ptr - sizeof(QemuUUID)); + len = le16_to_cpu(*(uint16_t *)(ptr - sizeof(QemuUUID) - + sizeof(uint16_t))); + + /* + * just in case the table is corrupt, wouldn't want to spin in + * the zero case + */ + if (len < sizeof(QemuUUID) + sizeof(uint16_t)) { + return false; + } else if (len > tot_len) { + return false; + } + + ptr -= len; + tot_len -= len; + if (qemu_uuid_is_equal(guid, &entry_guid)) { + if (data) { + *data = ptr; + } + if (data_len) { + *data_len = len - sizeof(QemuUUID) - sizeof(uint16_t); + } + return true; + } + } + return false; +} diff --git a/hw/i386/port92.c b/hw/i386/port92.c new file mode 100644 index 000000000..e1379a4f9 --- /dev/null +++ b/hw/i386/port92.c @@ -0,0 +1,127 @@ +/* + * QEMU I/O port 0x92 (System Control Port A, to handle Fast Gate A20) + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * SPDX-License-Identifier: MIT + */ + +#include "qemu/osdep.h" +#include "sysemu/runstate.h" +#include "migration/vmstate.h" +#include "hw/irq.h" +#include "hw/i386/pc.h" +#include "trace.h" +#include "qom/object.h" + +OBJECT_DECLARE_SIMPLE_TYPE(Port92State, PORT92) + +struct Port92State { + ISADevice parent_obj; + + MemoryRegion io; + uint8_t outport; + qemu_irq a20_out; +}; + +static void port92_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ + Port92State *s = opaque; + int oldval = s->outport; + + trace_port92_write(val); + s->outport = val; + qemu_set_irq(s->a20_out, (val >> 1) & 1); + if ((val & 1) && !(oldval & 1)) { + qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); + } +} + +static uint64_t port92_read(void *opaque, hwaddr addr, + unsigned size) +{ + Port92State *s = opaque; + uint32_t ret; + + ret = s->outport; + trace_port92_read(ret); + + return ret; +} + +static const VMStateDescription vmstate_port92_isa = { + .name 
= "port92", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT8(outport, Port92State), + VMSTATE_END_OF_LIST() + } +}; + +static void port92_reset(DeviceState *d) +{ + Port92State *s = PORT92(d); + + s->outport &= ~1; +} + +static const MemoryRegionOps port92_ops = { + .read = port92_read, + .write = port92_write, + .impl = { + .min_access_size = 1, + .max_access_size = 1, + }, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void port92_initfn(Object *obj) +{ + Port92State *s = PORT92(obj); + + memory_region_init_io(&s->io, OBJECT(s), &port92_ops, s, "port92", 1); + + s->outport = 0; + + qdev_init_gpio_out_named(DEVICE(obj), &s->a20_out, PORT92_A20_LINE, 1); +} + +static void port92_realizefn(DeviceState *dev, Error **errp) +{ + ISADevice *isadev = ISA_DEVICE(dev); + Port92State *s = PORT92(dev); + + isa_register_ioport(isadev, &s->io, 0x92); +} + +static void port92_class_initfn(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = port92_realizefn; + dc->reset = port92_reset; + dc->vmsd = &vmstate_port92_isa; + /* + * Reason: unlike ordinary ISA devices, this one needs additional + * wiring: its A20 output line needs to be wired up with + * qdev_connect_gpio_out_named(). + */ + dc->user_creatable = false; +} + +static const TypeInfo port92_info = { + .name = TYPE_PORT92, + .parent = TYPE_ISA_DEVICE, + .instance_size = sizeof(Port92State), + .instance_init = port92_initfn, + .class_init = port92_class_initfn, +}; + +static void port92_register_types(void) +{ + type_register_static(&port92_info); +} + +type_init(port92_register_types) diff --git a/hw/i386/sgx-epc.c b/hw/i386/sgx-epc.c new file mode 100644 index 000000000..e508827e7 --- /dev/null +++ b/hw/i386/sgx-epc.c @@ -0,0 +1,185 @@ +/* + * SGX EPC device + * + * Copyright (C) 2019 Intel Corporation + * + * Authors: + * Sean Christopherson <sean.j.christopherson@intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "hw/i386/pc.h" +#include "hw/i386/sgx-epc.h" +#include "hw/mem/memory-device.h" +#include "hw/qdev-properties.h" +#include "qapi/error.h" +#include "qapi/visitor.h" +#include "target/i386/cpu.h" +#include "exec/address-spaces.h" + +static Property sgx_epc_properties[] = { + DEFINE_PROP_UINT64(SGX_EPC_ADDR_PROP, SGXEPCDevice, addr, 0), + DEFINE_PROP_LINK(SGX_EPC_MEMDEV_PROP, SGXEPCDevice, hostmem, + TYPE_MEMORY_BACKEND_EPC, HostMemoryBackendEpc *), + DEFINE_PROP_END_OF_LIST(), +}; + +static void sgx_epc_get_size(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + Error *local_err = NULL; + uint64_t value; + + value = memory_device_get_region_size(MEMORY_DEVICE(obj), &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + visit_type_uint64(v, name, &value, errp); +} + +static void sgx_epc_init(Object *obj) +{ + object_property_add(obj, SGX_EPC_SIZE_PROP, "uint64", sgx_epc_get_size, + NULL, NULL, NULL); +} + +static void sgx_epc_realize(DeviceState *dev, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + X86MachineState *x86ms = X86_MACHINE(pcms); + MemoryDeviceState *md = MEMORY_DEVICE(dev); + SGXEPCState *sgx_epc = &pcms->sgx_epc; + SGXEPCDevice *epc = SGX_EPC(dev); + HostMemoryBackend *hostmem; + const char *path; + + if (x86ms->boot_cpus != 0) { + error_setg(errp, "'" TYPE_SGX_EPC "' can't be created after vCPUs," + "e.g. 
via -device"); + return; + } + + if (!epc->hostmem) { + error_setg(errp, "'" SGX_EPC_MEMDEV_PROP "' property is not set"); + return; + } + hostmem = MEMORY_BACKEND(epc->hostmem); + if (host_memory_backend_is_mapped(hostmem)) { + path = object_get_canonical_path_component(OBJECT(hostmem)); + error_setg(errp, "can't use already busy memdev: %s", path); + return; + } + + epc->addr = sgx_epc->base + sgx_epc->size; + + memory_region_add_subregion(&sgx_epc->mr, epc->addr - sgx_epc->base, + host_memory_backend_get_memory(hostmem)); + + host_memory_backend_set_mapped(hostmem, true); + + sgx_epc->sections = g_renew(SGXEPCDevice *, sgx_epc->sections, + sgx_epc->nr_sections + 1); + sgx_epc->sections[sgx_epc->nr_sections++] = epc; + + sgx_epc->size += memory_device_get_region_size(md, errp); +} + +static void sgx_epc_unrealize(DeviceState *dev) +{ + SGXEPCDevice *epc = SGX_EPC(dev); + HostMemoryBackend *hostmem = MEMORY_BACKEND(epc->hostmem); + + host_memory_backend_set_mapped(hostmem, false); +} + +static uint64_t sgx_epc_md_get_addr(const MemoryDeviceState *md) +{ + const SGXEPCDevice *epc = SGX_EPC(md); + + return epc->addr; +} + +static void sgx_epc_md_set_addr(MemoryDeviceState *md, uint64_t addr, + Error **errp) +{ + object_property_set_uint(OBJECT(md), SGX_EPC_ADDR_PROP, addr, errp); +} + +static uint64_t sgx_epc_md_get_plugged_size(const MemoryDeviceState *md, + Error **errp) +{ + return 0; +} + +static MemoryRegion *sgx_epc_md_get_memory_region(MemoryDeviceState *md, + Error **errp) +{ + SGXEPCDevice *epc = SGX_EPC(md); + HostMemoryBackend *hostmem; + + if (!epc->hostmem) { + error_setg(errp, "'" SGX_EPC_MEMDEV_PROP "' property must be set"); + return NULL; + } + + hostmem = MEMORY_BACKEND(epc->hostmem); + return host_memory_backend_get_memory(hostmem); +} + +static void sgx_epc_md_fill_device_info(const MemoryDeviceState *md, + MemoryDeviceInfo *info) +{ + SgxEPCDeviceInfo *se = g_new0(SgxEPCDeviceInfo, 1); + SGXEPCDevice *epc = SGX_EPC(md); + + se->memaddr = epc->addr; + se->size = object_property_get_uint(OBJECT(epc), SGX_EPC_SIZE_PROP, + NULL); + se->memdev = object_get_canonical_path(OBJECT(epc->hostmem)); + + info->u.sgx_epc.data = se; + info->type = MEMORY_DEVICE_INFO_KIND_SGX_EPC; +} + +static void sgx_epc_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(oc); + + dc->hotpluggable = false; + dc->realize = sgx_epc_realize; + dc->unrealize = sgx_epc_unrealize; + dc->desc = "SGX EPC section"; + dc->user_creatable = false; + device_class_set_props(dc, sgx_epc_properties); + + mdc->get_addr = sgx_epc_md_get_addr; + mdc->set_addr = sgx_epc_md_set_addr; + mdc->get_plugged_size = sgx_epc_md_get_plugged_size; + mdc->get_memory_region = sgx_epc_md_get_memory_region; + mdc->fill_device_info = sgx_epc_md_fill_device_info; +} + +static TypeInfo sgx_epc_info = { + .name = TYPE_SGX_EPC, + .parent = TYPE_DEVICE, + .instance_size = sizeof(SGXEPCDevice), + .instance_init = sgx_epc_init, + .class_init = sgx_epc_class_init, + .class_size = sizeof(DeviceClass), + .interfaces = (InterfaceInfo[]) { + { TYPE_MEMORY_DEVICE }, + { } + }, +}; + +static void sgx_epc_register_types(void) +{ + type_register_static(&sgx_epc_info); +} + +type_init(sgx_epc_register_types) diff --git a/hw/i386/sgx-stub.c b/hw/i386/sgx-stub.c new file mode 100644 index 000000000..c9b379e66 --- /dev/null +++ b/hw/i386/sgx-stub.c @@ -0,0 +1,34 @@ +#include "qemu/osdep.h" +#include "monitor/monitor.h" +#include "monitor/hmp-target.h" +#include "hw/i386/pc.h" 
+#include "hw/i386/sgx-epc.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-misc-target.h" + +SGXInfo *qmp_query_sgx(Error **errp) +{ + error_setg(errp, "SGX support is not compiled in"); + return NULL; +} + +SGXInfo *qmp_query_sgx_capabilities(Error **errp) +{ + error_setg(errp, "SGX support is not compiled in"); + return NULL; +} + +void hmp_info_sgx(Monitor *mon, const QDict *qdict) +{ + monitor_printf(mon, "SGX is not available in this QEMU\n"); +} + +void pc_machine_init_sgx_epc(PCMachineState *pcms) +{ + memset(&pcms->sgx_epc, 0, sizeof(SGXEPCState)); +} + +bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size) +{ + g_assert_not_reached(); +} diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c new file mode 100644 index 000000000..8fef3dd8f --- /dev/null +++ b/hw/i386/sgx.c @@ -0,0 +1,243 @@ +/* + * SGX common code + * + * Copyright (C) 2021 Intel Corporation + * + * Authors: + * Yang Zhong<yang.zhong@intel.com> + * Sean Christopherson <sean.j.christopherson@intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "hw/i386/pc.h" +#include "hw/i386/sgx-epc.h" +#include "hw/mem/memory-device.h" +#include "monitor/qdev.h" +#include "monitor/monitor.h" +#include "monitor/hmp-target.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-misc-target.h" +#include "exec/address-spaces.h" +#include "sysemu/hw_accel.h" +#include "sysemu/reset.h" +#include <sys/ioctl.h> + +#define SGX_MAX_EPC_SECTIONS 8 +#define SGX_CPUID_EPC_INVALID 0x0 + +/* A valid EPC section. */ +#define SGX_CPUID_EPC_SECTION 0x1 +#define SGX_CPUID_EPC_MASK 0xF + +#define SGX_MAGIC 0xA4 +#define SGX_IOC_VEPC_REMOVE_ALL _IO(SGX_MAGIC, 0x04) + +#define RETRY_NUM 2 + +static uint64_t sgx_calc_section_metric(uint64_t low, uint64_t high) +{ + return (low & MAKE_64BIT_MASK(12, 20)) + + ((high & MAKE_64BIT_MASK(0, 20)) << 32); +} + +static uint64_t sgx_calc_host_epc_section_size(void) +{ + uint32_t i, type; + uint32_t eax, ebx, ecx, edx; + uint64_t size = 0; + + for (i = 0; i < SGX_MAX_EPC_SECTIONS; i++) { + host_cpuid(0x12, i + 2, &eax, &ebx, &ecx, &edx); + + type = eax & SGX_CPUID_EPC_MASK; + if (type == SGX_CPUID_EPC_INVALID) { + break; + } + + if (type != SGX_CPUID_EPC_SECTION) { + break; + } + + size += sgx_calc_section_metric(ecx, edx); + } + + return size; +} + +static void sgx_epc_reset(void *opaque) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + HostMemoryBackend *hostmem; + SGXEPCDevice *epc; + int failures; + int fd, i, j, r; + static bool warned = false; + + /* + * The second pass is needed to remove SECS pages that could not + * be removed during the first. 
+ */ + for (i = 0; i < RETRY_NUM; i++) { + failures = 0; + for (j = 0; j < pcms->sgx_epc.nr_sections; j++) { + epc = pcms->sgx_epc.sections[j]; + hostmem = MEMORY_BACKEND(epc->hostmem); + fd = memory_region_get_fd(host_memory_backend_get_memory(hostmem)); + + r = ioctl(fd, SGX_IOC_VEPC_REMOVE_ALL); + if (r == -ENOTTY && !warned) { + warned = true; + warn_report("kernel does not support SGX_IOC_VEPC_REMOVE_ALL"); + warn_report("SGX might operate incorrectly in the guest after reset"); + break; + } else if (r > 0) { + /* SECS pages remain */ + failures++; + if (i == 1) { + error_report("cannot reset vEPC section %d", j); + } + } + } + if (!failures) { + break; + } + } +} + +SGXInfo *qmp_query_sgx_capabilities(Error **errp) +{ + SGXInfo *info = NULL; + uint32_t eax, ebx, ecx, edx; + + int fd = qemu_open_old("/dev/sgx_vepc", O_RDWR); + if (fd < 0) { + error_setg(errp, "SGX is not enabled in KVM"); + return NULL; + } + + info = g_new0(SGXInfo, 1); + host_cpuid(0x7, 0, &eax, &ebx, &ecx, &edx); + + info->sgx = ebx & (1U << 2) ? true : false; + info->flc = ecx & (1U << 30) ? true : false; + + host_cpuid(0x12, 0, &eax, &ebx, &ecx, &edx); + info->sgx1 = eax & (1U << 0) ? true : false; + info->sgx2 = eax & (1U << 1) ? true : false; + + info->section_size = sgx_calc_host_epc_section_size(); + + close(fd); + + return info; +} + +SGXInfo *qmp_query_sgx(Error **errp) +{ + SGXInfo *info = NULL; + X86MachineState *x86ms; + PCMachineState *pcms = + (PCMachineState *)object_dynamic_cast(qdev_get_machine(), + TYPE_PC_MACHINE); + if (!pcms) { + error_setg(errp, "SGX is only supported on PC machines"); + return NULL; + } + + x86ms = X86_MACHINE(pcms); + if (!x86ms->sgx_epc_list) { + error_setg(errp, "No EPC regions defined, SGX not available"); + return NULL; + } + + SGXEPCState *sgx_epc = &pcms->sgx_epc; + info = g_new0(SGXInfo, 1); + + info->sgx = true; + info->sgx1 = true; + info->sgx2 = true; + info->flc = true; + info->section_size = sgx_epc->size; + + return info; +} + +void hmp_info_sgx(Monitor *mon, const QDict *qdict) +{ + Error *err = NULL; + g_autoptr(SGXInfo) info = qmp_query_sgx(&err); + + if (err) { + error_report_err(err); + return; + } + monitor_printf(mon, "SGX support: %s\n", + info->sgx ? "enabled" : "disabled"); + monitor_printf(mon, "SGX1 support: %s\n", + info->sgx1 ? "enabled" : "disabled"); + monitor_printf(mon, "SGX2 support: %s\n", + info->sgx2 ? "enabled" : "disabled"); + monitor_printf(mon, "FLC support: %s\n", + info->flc ? 
"enabled" : "disabled"); + monitor_printf(mon, "size: %" PRIu64 "\n", + info->section_size); +} + +bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + SGXEPCDevice *epc; + + if (pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) { + return true; + } + + epc = pcms->sgx_epc.sections[section_nr]; + + *addr = epc->addr; + *size = memory_device_get_region_size(MEMORY_DEVICE(epc), &error_fatal); + + return false; +} + +void pc_machine_init_sgx_epc(PCMachineState *pcms) +{ + SGXEPCState *sgx_epc = &pcms->sgx_epc; + X86MachineState *x86ms = X86_MACHINE(pcms); + SgxEPCList *list = NULL; + Object *obj; + + memset(sgx_epc, 0, sizeof(SGXEPCState)); + if (!x86ms->sgx_epc_list) { + return; + } + + sgx_epc->base = 0x100000000ULL + x86ms->above_4g_mem_size; + + memory_region_init(&sgx_epc->mr, OBJECT(pcms), "sgx-epc", UINT64_MAX); + memory_region_add_subregion(get_system_memory(), sgx_epc->base, + &sgx_epc->mr); + + for (list = x86ms->sgx_epc_list; list; list = list->next) { + obj = object_new("sgx-epc"); + + /* set the memdev link with memory backend */ + object_property_parse(obj, SGX_EPC_MEMDEV_PROP, list->value->memdev, + &error_fatal); + object_property_set_bool(obj, "realized", true, &error_fatal); + object_unref(obj); + } + + if ((sgx_epc->base + sgx_epc->size) < sgx_epc->base) { + error_report("Size of all 'sgx-epc' =0x%"PRIu64" causes EPC to wrap", + sgx_epc->size); + exit(EXIT_FAILURE); + } + + memory_region_set_size(&sgx_epc->mr, sgx_epc->size); + + /* register the reset callback for sgx epc */ + qemu_register_reset(sgx_epc_reset, NULL); +} diff --git a/hw/i386/trace-events b/hw/i386/trace-events new file mode 100644 index 000000000..5bf7e52bf --- /dev/null +++ b/hw/i386/trace-events @@ -0,0 +1,121 @@ +# See docs/devel/tracing.rst for syntax documentation. 
+ +# x86-iommu.c +x86_iommu_iec_notify(bool global, uint32_t index, uint32_t mask) "Notify IEC invalidation: global=%d index=%" PRIu32 " mask=%" PRIu32 + +# intel_iommu.c +vtd_inv_desc(const char *type, uint64_t hi, uint64_t lo) "invalidate desc type %s high 0x%"PRIx64" low 0x%"PRIx64 +vtd_inv_desc_cc_domain(uint16_t domain) "context invalidate domain 0x%"PRIx16 +vtd_inv_desc_cc_global(void) "context invalidate globally" +vtd_inv_desc_cc_device(uint8_t bus, uint8_t dev, uint8_t fn) "context invalidate device %02"PRIx8":%02"PRIx8".%02"PRIx8 +vtd_inv_desc_cc_devices(uint16_t sid, uint16_t fmask) "context invalidate devices sid 0x%"PRIx16" fmask 0x%"PRIx16 +vtd_inv_desc_iotlb_global(void) "iotlb invalidate global" +vtd_inv_desc_iotlb_domain(uint16_t domain) "iotlb invalidate whole domain 0x%"PRIx16 +vtd_inv_desc_iotlb_pages(uint16_t domain, uint64_t addr, uint8_t mask) "iotlb invalidate domain 0x%"PRIx16" addr 0x%"PRIx64" mask 0x%"PRIx8 +vtd_inv_desc_wait_sw(uint64_t addr, uint32_t data) "wait invalidate status write addr 0x%"PRIx64" data 0x%"PRIx32 +vtd_inv_desc_wait_irq(const char *msg) "%s" +vtd_inv_desc_wait_write_fail(uint64_t hi, uint64_t lo) "write fail for wait desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_inv_desc_iec(uint32_t granularity, uint32_t index, uint32_t mask) "granularity 0x%"PRIx32" index 0x%"PRIx32" mask 0x%"PRIx32 +vtd_inv_qi_enable(bool enable) "enabled %d" +vtd_inv_qi_setup(uint64_t addr, int size) "addr 0x%"PRIx64" size %d" +vtd_inv_qi_head(uint16_t head) "read head %d" +vtd_inv_qi_tail(uint16_t head) "write tail %d" +vtd_inv_qi_fetch(void) "" +vtd_context_cache_reset(void) "" +vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present" +vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present" +vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16 +vtd_iotlb_page_update(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page update sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16 +vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen) "IOTLB context hit bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32 +vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32 +vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)" +vtd_fault_disabled(void) "Fault processing disabled for context entry" +vtd_replay_ce_valid(const char *mode, uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "%s: replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 +vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 +vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64 +vtd_page_walk_one_skip_unmap(uint64_t iova, 
uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64 +vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" +vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" +vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" +vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 +vtd_translate_pt(uint16_t sid, uint64_t addr) "source id 0x%"PRIu16", iova 0x%"PRIx64 +vtd_pt_enable_fast_path(uint16_t sid, bool success) "sid 0x%"PRIu16" %d" +vtd_irq_generate(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64 +vtd_reg_read(uint64_t addr, uint64_t size) "addr 0x%"PRIx64" size 0x%"PRIx64 +vtd_reg_write(uint64_t addr, uint64_t size, uint64_t val) "addr 0x%"PRIx64" size 0x%"PRIx64" value 0x%"PRIx64 +vtd_reg_dmar_root(uint64_t addr, bool scalable) "addr 0x%"PRIx64" scalable %d" +vtd_reg_ir_root(uint64_t addr, uint32_t size) "addr 0x%"PRIx64" size 0x%"PRIx32 +vtd_reg_write_gcmd(uint32_t status, uint32_t val) "status 0x%"PRIx32" value 0x%"PRIx32 +vtd_reg_write_fectl(uint32_t value) "value 0x%"PRIx32 +vtd_reg_write_iectl(uint32_t value) "value 0x%"PRIx32 +vtd_reg_ics_clear_ip(void) "" +vtd_dmar_translate(uint8_t bus, uint8_t slot, uint8_t func, uint64_t iova, uint64_t gpa, uint64_t mask) "dev %02x:%02x.%02x iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64 +vtd_dmar_enable(bool en) "enable %d" +vtd_dmar_fault(uint16_t sid, int fault, uint64_t addr, bool is_write) "sid 0x%"PRIx16" fault %d addr 0x%"PRIx64" write %d" +vtd_ir_enable(bool en) "enable %d" +vtd_ir_irte_get(int index, uint64_t lo, uint64_t hi) "index %d low 0x%"PRIx64" high 0x%"PRIx64 +vtd_ir_remap(int index, int tri, int vec, int deliver, uint32_t dest, int dest_mode) "index %d trigger %d vector %d deliver %d dest 0x%"PRIx32" mode %d" +vtd_ir_remap_type(const char *type) "%s" +vtd_ir_remap_msi(uint64_t addr, uint64_t data, uint64_t addr2, uint64_t data2) "(addr 0x%"PRIx64", data 0x%"PRIx64") -> (addr 0x%"PRIx64", data 0x%"PRIx64")" +vtd_ir_remap_msi_req(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64 +vtd_fsts_ppf(bool set) "FSTS PPF bit set to %d" +vtd_fsts_clear_ip(void) "" +vtd_frr_new(int index, uint64_t hi, uint64_t lo) "index %d high 0x%"PRIx64" low 0x%"PRIx64 +vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16 +vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)" +vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)" + +# amd_iommu.c +amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 +amdvi_cache_update(uint16_t domid, uint8_t bus, uint8_t slot, uint8_t func, uint64_t gpa, uint64_t txaddr) " update iotlb domid 0x%"PRIx16" devid: %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64 +amdvi_completion_wait_fail(uint64_t addr) "error: fail to write at address 0x%"PRIx64 +amdvi_mmio_write(const char *reg, uint64_t addr, unsigned size, uint64_t val, uint64_t offset) "%s write addr 0x%"PRIx64", size %u, val 0x%"PRIx64", offset 0x%"PRIx64 +amdvi_mmio_read(const char *reg, uint64_t addr, unsigned size, uint64_t offset) "%s read addr 0x%"PRIx64", size %u offset 0x%"PRIx64 +amdvi_mmio_read_invalid(int max, uint64_t addr, unsigned size) 
"error: addr outside region (max 0x%x): read addr 0x%" PRIx64 ", size %u" +amdvi_command_error(uint64_t status) "error: Executing commands with command buffer disabled 0x%"PRIx64 +amdvi_command_read_fail(uint64_t addr, uint32_t head) "error: fail to access memory at 0x%"PRIx64" + 0x%"PRIx32 +amdvi_command_exec(uint32_t head, uint32_t tail, uint64_t buf) "command buffer head at 0x%"PRIx32" command buffer tail at 0x%"PRIx32" command buffer base at 0x%"PRIx64 +amdvi_unhandled_command(uint8_t type) "unhandled command 0x%"PRIx8 +amdvi_intr_inval(void) "Interrupt table invalidated" +amdvi_iotlb_inval(void) "IOTLB pages invalidated" +amdvi_prefetch_pages(void) "Pre-fetch of AMD-Vi pages requested" +amdvi_pages_inval(uint16_t domid) "AMD-Vi pages for domain 0x%"PRIx16 " invalidated" +amdvi_all_inval(void) "Invalidation of all AMD-Vi cache requested " +amdvi_ppr_exec(void) "Execution of PPR queue requested " +amdvi_devtab_inval(uint8_t bus, uint8_t slot, uint8_t func) "device table entry for devid: %02x:%02x.%x invalidated" +amdvi_completion_wait(uint64_t addr, uint64_t data) "completion wait requested with store address 0x%"PRIx64" and store data 0x%"PRIx64 +amdvi_control_status(uint64_t val) "MMIO_STATUS state 0x%"PRIx64 +amdvi_iotlb_reset(void) "IOTLB exceed size limit - reset " +amdvi_dte_get_fail(uint64_t addr, uint32_t offset) "error: failed to access Device Entry devtab 0x%"PRIx64" offset 0x%"PRIx32 +amdvi_invalid_dte(uint64_t addr) "PTE entry at 0x%"PRIx64" is invalid " +amdvi_get_pte_hwerror(uint64_t addr) "hardware error eccessing PTE at addr 0x%"PRIx64 +amdvi_mode_invalid(uint8_t level, uint64_t addr)"error: translation level 0x%"PRIx8" translating addr 0x%"PRIx64 +amdvi_page_fault(uint64_t addr) "error: page fault accessing guest physical address 0x%"PRIx64 +amdvi_iotlb_hit(uint8_t bus, uint8_t slot, uint8_t func, uint64_t addr, uint64_t txaddr) "hit iotlb devid %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64 +amdvi_translation_result(uint8_t bus, uint8_t slot, uint8_t func, uint64_t addr, uint64_t txaddr) "devid: %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64 +amdvi_mem_ir_write_req(uint64_t addr, uint64_t val, uint32_t size) "addr 0x%"PRIx64" data 0x%"PRIx64" size 0x%"PRIx32 +amdvi_mem_ir_write(uint64_t addr, uint64_t val) "addr 0x%"PRIx64" data 0x%"PRIx64 +amdvi_ir_remap_msi_req(uint64_t addr, uint64_t data, uint8_t devid) "addr 0x%"PRIx64" data 0x%"PRIx64" devid 0x%"PRIx8 +amdvi_ir_remap_msi(uint64_t addr, uint64_t data, uint64_t addr2, uint64_t data2) "(addr 0x%"PRIx64", data 0x%"PRIx64") -> (addr 0x%"PRIx64", data 0x%"PRIx64")" +amdvi_err(const char *str) "%s" +amdvi_ir_irte(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" offset 0x%"PRIx64 +amdvi_ir_irte_val(uint32_t data) "data 0x%"PRIx32 +amdvi_ir_err(const char *str) "%s" +amdvi_ir_intctl(uint8_t val) "int_ctl 0x%"PRIx8 +amdvi_ir_target_abort(const char *str) "%s" +amdvi_ir_delivery_mode(const char *str) "%s" +amdvi_ir_irte_ga_val(uint64_t hi, uint64_t lo) "hi 0x%"PRIx64" lo 0x%"PRIx64 + +# vmport.c +vmport_register(unsigned char command, void *func, void *opaque) "command: 0x%02x func: %p opaque: %p" +vmport_command(unsigned char command) "command: 0x%02x" + +# x86.c +x86_gsi_interrupt(int irqn, int level) "GSI interrupt #%d level:%d" +x86_pic_interrupt(int irqn, int level) "PIC interrupt #%d level:%d" + +# port92.c +port92_read(uint8_t val) "port92: read 0x%02x" +port92_write(uint8_t val) "port92: write 0x%02x" diff --git a/hw/i386/trace.h b/hw/i386/trace.h new file mode 100644 index 000000000..37a9f67e5 --- /dev/null +++ 
b/hw/i386/trace.h @@ -0,0 +1 @@ +#include "trace/trace-hw_i386.h" diff --git a/hw/i386/vmmouse.c b/hw/i386/vmmouse.c new file mode 100644 index 000000000..3d6636828 --- /dev/null +++ b/hw/i386/vmmouse.c @@ -0,0 +1,327 @@ +/* + * QEMU VMMouse emulation + * + * Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "ui/console.h" +#include "hw/i386/vmport.h" +#include "hw/input/i8042.h" +#include "hw/qdev-properties.h" +#include "migration/vmstate.h" +#include "cpu.h" +#include "qom/object.h" + +/* debug only vmmouse */ +//#define DEBUG_VMMOUSE + +#define VMMOUSE_READ_ID 0x45414552 +#define VMMOUSE_DISABLE 0x000000f5 +#define VMMOUSE_REQUEST_RELATIVE 0x4c455252 +#define VMMOUSE_REQUEST_ABSOLUTE 0x53424152 + +#define VMMOUSE_QUEUE_SIZE 1024 + +#define VMMOUSE_VERSION 0x3442554a + +#ifdef DEBUG_VMMOUSE +#define DPRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__) +#else +#define DPRINTF(fmt, ...) 
do { } while (0) +#endif + +#define TYPE_VMMOUSE "vmmouse" +OBJECT_DECLARE_SIMPLE_TYPE(VMMouseState, VMMOUSE) + +struct VMMouseState { + ISADevice parent_obj; + + uint32_t queue[VMMOUSE_QUEUE_SIZE]; + int32_t queue_size; + uint16_t nb_queue; + uint16_t status; + uint8_t absolute; + QEMUPutMouseEntry *entry; + ISAKBDState *i8042; +}; + +static void vmmouse_get_data(uint32_t *data) +{ + X86CPU *cpu = X86_CPU(current_cpu); + CPUX86State *env = &cpu->env; + + data[0] = env->regs[R_EAX]; data[1] = env->regs[R_EBX]; + data[2] = env->regs[R_ECX]; data[3] = env->regs[R_EDX]; + data[4] = env->regs[R_ESI]; data[5] = env->regs[R_EDI]; +} + +static void vmmouse_set_data(const uint32_t *data) +{ + X86CPU *cpu = X86_CPU(current_cpu); + CPUX86State *env = &cpu->env; + + env->regs[R_EAX] = data[0]; env->regs[R_EBX] = data[1]; + env->regs[R_ECX] = data[2]; env->regs[R_EDX] = data[3]; + env->regs[R_ESI] = data[4]; env->regs[R_EDI] = data[5]; +} + +static uint32_t vmmouse_get_status(VMMouseState *s) +{ + DPRINTF("vmmouse_get_status()\n"); + return (s->status << 16) | s->nb_queue; +} + +static void vmmouse_mouse_event(void *opaque, int x, int y, int dz, int buttons_state) +{ + VMMouseState *s = opaque; + int buttons = 0; + + if (s->nb_queue > (VMMOUSE_QUEUE_SIZE - 4)) + return; + + DPRINTF("vmmouse_mouse_event(%d, %d, %d, %d)\n", + x, y, dz, buttons_state); + + if ((buttons_state & MOUSE_EVENT_LBUTTON)) + buttons |= 0x20; + if ((buttons_state & MOUSE_EVENT_RBUTTON)) + buttons |= 0x10; + if ((buttons_state & MOUSE_EVENT_MBUTTON)) + buttons |= 0x08; + + if (s->absolute) { + x <<= 1; + y <<= 1; + } + + s->queue[s->nb_queue++] = buttons; + s->queue[s->nb_queue++] = x; + s->queue[s->nb_queue++] = y; + s->queue[s->nb_queue++] = dz; + + /* need to still generate PS2 events to notify driver to + read from queue */ + i8042_isa_mouse_fake_event(s->i8042); +} + +static void vmmouse_remove_handler(VMMouseState *s) +{ + if (s->entry) { + qemu_remove_mouse_event_handler(s->entry); + s->entry = NULL; + } +} + +static void vmmouse_update_handler(VMMouseState *s, int absolute) +{ + if (s->status != 0) { + return; + } + if (s->absolute != absolute) { + s->absolute = absolute; + vmmouse_remove_handler(s); + } + if (s->entry == NULL) { + s->entry = qemu_add_mouse_event_handler(vmmouse_mouse_event, + s, s->absolute, + "vmmouse"); + qemu_activate_mouse_event_handler(s->entry); + } +} + +static void vmmouse_read_id(VMMouseState *s) +{ + DPRINTF("vmmouse_read_id()\n"); + + if (s->nb_queue == VMMOUSE_QUEUE_SIZE) + return; + + s->queue[s->nb_queue++] = VMMOUSE_VERSION; + s->status = 0; + vmmouse_update_handler(s, s->absolute); +} + +static void vmmouse_request_relative(VMMouseState *s) +{ + DPRINTF("vmmouse_request_relative()\n"); + vmmouse_update_handler(s, 0); +} + +static void vmmouse_request_absolute(VMMouseState *s) +{ + DPRINTF("vmmouse_request_absolute()\n"); + vmmouse_update_handler(s, 1); +} + +static void vmmouse_disable(VMMouseState *s) +{ + DPRINTF("vmmouse_disable()\n"); + s->status = 0xffff; + vmmouse_remove_handler(s); +} + +static void vmmouse_data(VMMouseState *s, uint32_t *data, uint32_t size) +{ + int i; + + DPRINTF("vmmouse_data(%d)\n", size); + + if (size == 0 || size > 6 || size > s->nb_queue) { + printf("vmmouse: driver requested too much data %d\n", size); + s->status = 0xffff; + vmmouse_remove_handler(s); + return; + } + + for (i = 0; i < size; i++) + data[i] = s->queue[i]; + + s->nb_queue -= size; + if (s->nb_queue) + memmove(s->queue, &s->queue[size], sizeof(s->queue[0]) * s->nb_queue); +} + +static uint32_t 
vmmouse_ioport_read(void *opaque, uint32_t addr) +{ + VMMouseState *s = opaque; + uint32_t data[6]; + uint16_t command; + + vmmouse_get_data(data); + + command = data[2] & 0xFFFF; + + switch (command) { + case VMPORT_CMD_VMMOUSE_STATUS: + data[0] = vmmouse_get_status(s); + break; + case VMPORT_CMD_VMMOUSE_COMMAND: + switch (data[1]) { + case VMMOUSE_DISABLE: + vmmouse_disable(s); + break; + case VMMOUSE_READ_ID: + vmmouse_read_id(s); + break; + case VMMOUSE_REQUEST_RELATIVE: + vmmouse_request_relative(s); + break; + case VMMOUSE_REQUEST_ABSOLUTE: + vmmouse_request_absolute(s); + break; + default: + printf("vmmouse: unknown command %x\n", data[1]); + break; + } + break; + case VMPORT_CMD_VMMOUSE_DATA: + vmmouse_data(s, data, data[1]); + break; + default: + printf("vmmouse: unknown command %x\n", command); + break; + } + + vmmouse_set_data(data); + return data[0]; +} + +static int vmmouse_post_load(void *opaque, int version_id) +{ + VMMouseState *s = opaque; + + vmmouse_remove_handler(s); + vmmouse_update_handler(s, s->absolute); + return 0; +} + +static const VMStateDescription vmstate_vmmouse = { + .name = "vmmouse", + .version_id = 0, + .minimum_version_id = 0, + .post_load = vmmouse_post_load, + .fields = (VMStateField[]) { + VMSTATE_INT32_EQUAL(queue_size, VMMouseState, NULL), + VMSTATE_UINT32_ARRAY(queue, VMMouseState, VMMOUSE_QUEUE_SIZE), + VMSTATE_UINT16(nb_queue, VMMouseState), + VMSTATE_UINT16(status, VMMouseState), + VMSTATE_UINT8(absolute, VMMouseState), + VMSTATE_END_OF_LIST() + } +}; + +static void vmmouse_reset(DeviceState *d) +{ + VMMouseState *s = VMMOUSE(d); + + s->queue_size = VMMOUSE_QUEUE_SIZE; + s->nb_queue = 0; + + vmmouse_disable(s); +} + +static void vmmouse_realizefn(DeviceState *dev, Error **errp) +{ + VMMouseState *s = VMMOUSE(dev); + + DPRINTF("vmmouse_init\n"); + + if (!object_resolve_path_type("", TYPE_VMPORT, NULL)) { + error_setg(errp, "vmmouse needs a machine with vmport"); + return; + } + + vmport_register(VMPORT_CMD_VMMOUSE_STATUS, vmmouse_ioport_read, s); + vmport_register(VMPORT_CMD_VMMOUSE_COMMAND, vmmouse_ioport_read, s); + vmport_register(VMPORT_CMD_VMMOUSE_DATA, vmmouse_ioport_read, s); +} + +static Property vmmouse_properties[] = { + DEFINE_PROP_LINK("i8042", VMMouseState, i8042, TYPE_I8042, ISAKBDState *), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vmmouse_class_initfn(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = vmmouse_realizefn; + dc->reset = vmmouse_reset; + dc->vmsd = &vmstate_vmmouse; + device_class_set_props(dc, vmmouse_properties); + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); +} + +static const TypeInfo vmmouse_info = { + .name = TYPE_VMMOUSE, + .parent = TYPE_ISA_DEVICE, + .instance_size = sizeof(VMMouseState), + .class_init = vmmouse_class_initfn, +}; + +static void vmmouse_register_types(void) +{ + type_register_static(&vmmouse_info); +} + +type_init(vmmouse_register_types) diff --git a/hw/i386/vmport.c b/hw/i386/vmport.c new file mode 100644 index 000000000..7cc75dbc6 --- /dev/null +++ b/hw/i386/vmport.c @@ -0,0 +1,313 @@ +/* + * QEMU VMPort emulation + * + * Copyright (C) 2007 Hervé Poussineau + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the 
Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/* + * Guest code that interacts with this virtual device can be found + * in VMware open-vm-tools open-source project: + * https://github.com/vmware/open-vm-tools + */ + +#include "qemu/osdep.h" +#include "hw/isa/isa.h" +#include "hw/i386/vmport.h" +#include "hw/qdev-properties.h" +#include "hw/boards.h" +#include "sysemu/sysemu.h" +#include "sysemu/hw_accel.h" +#include "sysemu/qtest.h" +#include "qemu/log.h" +#include "trace.h" +#include "qom/object.h" + +#define VMPORT_MAGIC 0x564D5868 + +/* Compatibility flags for migration */ +#define VMPORT_COMPAT_READ_SET_EAX_BIT 0 +#define VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD_BIT 1 +#define VMPORT_COMPAT_REPORT_VMX_TYPE_BIT 2 +#define VMPORT_COMPAT_CMDS_V2_BIT 3 +#define VMPORT_COMPAT_READ_SET_EAX \ + (1 << VMPORT_COMPAT_READ_SET_EAX_BIT) +#define VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD \ + (1 << VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD_BIT) +#define VMPORT_COMPAT_REPORT_VMX_TYPE \ + (1 << VMPORT_COMPAT_REPORT_VMX_TYPE_BIT) +#define VMPORT_COMPAT_CMDS_V2 \ + (1 << VMPORT_COMPAT_CMDS_V2_BIT) + +/* vCPU features reported by CMD_GET_VCPU_INFO */ +#define VCPU_INFO_SLC64_BIT 0 +#define VCPU_INFO_SYNC_VTSCS_BIT 1 +#define VCPU_INFO_HV_REPLAY_OK_BIT 2 +#define VCPU_INFO_LEGACY_X2APIC_BIT 3 +#define VCPU_INFO_RESERVED_BIT 31 + +OBJECT_DECLARE_SIMPLE_TYPE(VMPortState, VMPORT) + +struct VMPortState { + ISADevice parent_obj; + + MemoryRegion io; + VMPortReadFunc *func[VMPORT_ENTRIES]; + void *opaque[VMPORT_ENTRIES]; + + uint32_t vmware_vmx_version; + uint8_t vmware_vmx_type; + + uint32_t compat_flags; +}; + +static VMPortState *port_state; + +void vmport_register(VMPortCommand command, VMPortReadFunc *func, void *opaque) +{ + assert(command < VMPORT_ENTRIES); + assert(port_state); + + trace_vmport_register(command, func, opaque); + port_state->func[command] = func; + port_state->opaque[command] = opaque; +} + +static uint64_t vmport_ioport_read(void *opaque, hwaddr addr, + unsigned size) +{ + VMPortState *s = opaque; + CPUState *cs = current_cpu; + X86CPU *cpu = X86_CPU(cs); + CPUX86State *env; + unsigned char command; + uint32_t eax; + + if (qtest_enabled()) { + return -1; + } + env = &cpu->env; + cpu_synchronize_state(cs); + + eax = env->regs[R_EAX]; + if (eax != VMPORT_MAGIC) { + goto err; + } + + command = env->regs[R_ECX]; + trace_vmport_command(command); + if (command >= VMPORT_ENTRIES || !s->func[command]) { + qemu_log_mask(LOG_UNIMP, "vmport: unknown command %x\n", command); + goto err; + } + + eax = s->func[command](s->opaque[command], addr); + goto out; + +err: + if (s->compat_flags & VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD) { + eax = UINT32_MAX; + } + +out: + /* + * The call above to cpu_synchronize_state() gets vCPU registers values + * to QEMU but also cause QEMU to write QEMU vCPU registers values to + * vCPU implementation (e.g. 
Accelerator such as KVM) just before + * resuming guest. + * + * Therefore, in order to make IOPort return value propagate to + * guest EAX, we need to explicitly update QEMU EAX register value. + */ + if (s->compat_flags & VMPORT_COMPAT_READ_SET_EAX) { + cpu->env.regs[R_EAX] = eax; + } + + return eax; +} + +static void vmport_ioport_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + X86CPU *cpu = X86_CPU(current_cpu); + + if (qtest_enabled()) { + return; + } + cpu->env.regs[R_EAX] = vmport_ioport_read(opaque, addr, 4); +} + +static uint32_t vmport_cmd_get_version(void *opaque, uint32_t addr) +{ + X86CPU *cpu = X86_CPU(current_cpu); + + if (qtest_enabled()) { + return -1; + } + cpu->env.regs[R_EBX] = VMPORT_MAGIC; + if (port_state->compat_flags & VMPORT_COMPAT_REPORT_VMX_TYPE) { + cpu->env.regs[R_ECX] = port_state->vmware_vmx_type; + } + return port_state->vmware_vmx_version; +} + +static uint32_t vmport_cmd_get_bios_uuid(void *opaque, uint32_t addr) +{ + X86CPU *cpu = X86_CPU(current_cpu); + uint32_t *uuid_parts = (uint32_t *)(qemu_uuid.data); + + cpu->env.regs[R_EAX] = le32_to_cpu(uuid_parts[0]); + cpu->env.regs[R_EBX] = le32_to_cpu(uuid_parts[1]); + cpu->env.regs[R_ECX] = le32_to_cpu(uuid_parts[2]); + cpu->env.regs[R_EDX] = le32_to_cpu(uuid_parts[3]); + return cpu->env.regs[R_EAX]; +} + +static uint32_t vmport_cmd_ram_size(void *opaque, uint32_t addr) +{ + X86CPU *cpu = X86_CPU(current_cpu); + + if (qtest_enabled()) { + return -1; + } + cpu->env.regs[R_EBX] = 0x1177; + return current_machine->ram_size; +} + +static uint32_t vmport_cmd_get_hz(void *opaque, uint32_t addr) +{ + X86CPU *cpu = X86_CPU(current_cpu); + + if (cpu->env.tsc_khz && cpu->env.apic_bus_freq) { + uint64_t tsc_freq = (uint64_t)cpu->env.tsc_khz * 1000; + + cpu->env.regs[R_ECX] = cpu->env.apic_bus_freq; + cpu->env.regs[R_EBX] = (uint32_t)(tsc_freq >> 32); + cpu->env.regs[R_EAX] = (uint32_t)tsc_freq; + } else { + /* Signal cmd as not supported */ + cpu->env.regs[R_EBX] = UINT32_MAX; + } + + return cpu->env.regs[R_EAX]; +} + +static uint32_t vmport_cmd_get_vcpu_info(void *opaque, uint32_t addr) +{ + X86CPU *cpu = X86_CPU(current_cpu); + uint32_t ret = 0; + + if (cpu->env.features[FEAT_1_ECX] & CPUID_EXT_X2APIC) { + ret |= 1 << VCPU_INFO_LEGACY_X2APIC_BIT; + } + + return ret; +} + +static const MemoryRegionOps vmport_ops = { + .read = vmport_ioport_read, + .write = vmport_ioport_write, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vmport_realizefn(DeviceState *dev, Error **errp) +{ + ISADevice *isadev = ISA_DEVICE(dev); + VMPortState *s = VMPORT(dev); + + memory_region_init_io(&s->io, OBJECT(s), &vmport_ops, s, "vmport", 1); + isa_register_ioport(isadev, &s->io, 0x5658); + + port_state = s; + + /* Register some generic port commands */ + vmport_register(VMPORT_CMD_GETVERSION, vmport_cmd_get_version, NULL); + vmport_register(VMPORT_CMD_GETRAMSIZE, vmport_cmd_ram_size, NULL); + if (s->compat_flags & VMPORT_COMPAT_CMDS_V2) { + vmport_register(VMPORT_CMD_GETBIOSUUID, vmport_cmd_get_bios_uuid, NULL); + vmport_register(VMPORT_CMD_GETHZ, vmport_cmd_get_hz, NULL); + vmport_register(VMPORT_CMD_GET_VCPU_INFO, vmport_cmd_get_vcpu_info, + NULL); + } +} + +static Property vmport_properties[] = { + /* Used to enforce compatibility for migration */ + DEFINE_PROP_BIT("x-read-set-eax", VMPortState, compat_flags, + VMPORT_COMPAT_READ_SET_EAX_BIT, true), + DEFINE_PROP_BIT("x-signal-unsupported-cmd", VMPortState, compat_flags, + 
VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD_BIT, true), + DEFINE_PROP_BIT("x-report-vmx-type", VMPortState, compat_flags, + VMPORT_COMPAT_REPORT_VMX_TYPE_BIT, true), + DEFINE_PROP_BIT("x-cmds-v2", VMPortState, compat_flags, + VMPORT_COMPAT_CMDS_V2_BIT, true), + + /* Default value taken from open-vm-tools code VERSION_MAGIC definition */ + DEFINE_PROP_UINT32("vmware-vmx-version", VMPortState, + vmware_vmx_version, 6), + /* + * Value determines which VMware product type host report itself to guest. + * + * Most guests are fine with exposing host as VMware ESX server. + * Some legacy/proprietary guests hard-code a given type. + * + * For a complete list of values, refer to enum VMXType at open-vm-tools + * project (Defined at lib/include/vm_vmx_type.h). + * + * Reasonable options: + * 0 - Unset + * 1 - VMware Express (deprecated) + * 2 - VMware ESX Server + * 3 - VMware Server (Deprecated) + * 4 - VMware Workstation + * 5 - ACE 1.x (Deprecated) + */ + DEFINE_PROP_UINT8("vmware-vmx-type", VMPortState, vmware_vmx_type, 2), + + DEFINE_PROP_END_OF_LIST(), +}; + +static void vmport_class_initfn(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = vmport_realizefn; + /* Reason: realize sets global port_state */ + dc->user_creatable = false; + device_class_set_props(dc, vmport_properties); +} + +static const TypeInfo vmport_info = { + .name = TYPE_VMPORT, + .parent = TYPE_ISA_DEVICE, + .instance_size = sizeof(VMPortState), + .class_init = vmport_class_initfn, +}; + +static void vmport_register_types(void) +{ + type_register_static(&vmport_info); +} + +type_init(vmport_register_types) diff --git a/hw/i386/x86-iommu-stub.c b/hw/i386/x86-iommu-stub.c new file mode 100644 index 000000000..781b5ff92 --- /dev/null +++ b/hw/i386/x86-iommu-stub.c @@ -0,0 +1,38 @@ +/* + * Stubs for X86 IOMMU emulation + * + * Copyright (C) 2019 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "hw/i386/x86-iommu.h" + +void x86_iommu_iec_register_notifier(X86IOMMUState *iommu, + iec_notify_fn fn, void *data) +{ +} + +X86IOMMUState *x86_iommu_get_default(void) +{ + return NULL; +} + +bool x86_iommu_ir_supported(X86IOMMUState *s) +{ + return false; +} diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c new file mode 100644 index 000000000..01d11325a --- /dev/null +++ b/hw/i386/x86-iommu.c @@ -0,0 +1,162 @@ +/* + * QEMU emulation of common X86 IOMMU + * + * Copyright (C) 2016 Peter Xu, Red Hat <peterx@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "hw/sysbus.h" +#include "hw/i386/x86-iommu.h" +#include "hw/qdev-properties.h" +#include "hw/i386/pc.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "sysemu/kvm.h" + +void x86_iommu_iec_register_notifier(X86IOMMUState *iommu, + iec_notify_fn fn, void *data) +{ + IEC_Notifier *notifier = g_new0(IEC_Notifier, 1); + + notifier->iec_notify = fn; + notifier->private = data; + + QLIST_INSERT_HEAD(&iommu->iec_notifiers, notifier, list); +} + +void x86_iommu_iec_notify_all(X86IOMMUState *iommu, bool global, + uint32_t index, uint32_t mask) +{ + IEC_Notifier *notifier; + + trace_x86_iommu_iec_notify(global, index, mask); + + QLIST_FOREACH(notifier, &iommu->iec_notifiers, list) { + if (notifier->iec_notify) { + notifier->iec_notify(notifier->private, global, + index, mask); + } + } +} + +/* Generate one MSI message from VTDIrq info */ +void x86_iommu_irq_to_msi_message(X86IOMMUIrq *irq, MSIMessage *msg_out) +{ + X86IOMMU_MSIMessage msg = {}; + + /* Generate address bits */ + msg.dest_mode = irq->dest_mode; + msg.redir_hint = irq->redir_hint; + msg.dest = irq->dest; + msg.__addr_hi = irq->dest & 0xffffff00; + msg.__addr_head = cpu_to_le32(0xfee); + /* Keep this from original MSI address bits */ + msg.__not_used = irq->msi_addr_last_bits; + + /* Generate data bits */ + msg.vector = irq->vector; + msg.delivery_mode = irq->delivery_mode; + msg.level = 1; + msg.trigger_mode = irq->trigger_mode; + + msg_out->address = msg.msi_addr; + msg_out->data = msg.msi_data; +} + +X86IOMMUState *x86_iommu_get_default(void) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + PCMachineState *pcms = + PC_MACHINE(object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE)); + + if (pcms && + object_dynamic_cast(OBJECT(pcms->iommu), TYPE_X86_IOMMU_DEVICE)) { + return X86_IOMMU_DEVICE(pcms->iommu); + } + return NULL; +} + +static void x86_iommu_realize(DeviceState *dev, Error **errp) +{ + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev); + X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_GET_CLASS(dev); + MachineState *ms = MACHINE(qdev_get_machine()); + MachineClass *mc = MACHINE_GET_CLASS(ms); + PCMachineState *pcms = + PC_MACHINE(object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE)); + QLIST_INIT(&x86_iommu->iec_notifiers); + bool irq_all_kernel = kvm_irqchip_in_kernel() && !kvm_irqchip_is_split(); + + if (!pcms || !pcms->bus) { + error_setg(errp, "Machine-type '%s' not supported by IOMMU", + mc->name); + return; + } + + /* If the user didn't specify IR, choose a default value for it */ + if (x86_iommu->intr_supported == ON_OFF_AUTO_AUTO) { + x86_iommu->intr_supported = irq_all_kernel ? 
+ ON_OFF_AUTO_OFF : ON_OFF_AUTO_ON; + } + + /* Both Intel and AMD IOMMU IR only support "kernel-irqchip={off|split}" */ + if (x86_iommu_ir_supported(x86_iommu) && irq_all_kernel) { + error_setg(errp, "Interrupt Remapping cannot work with " + "kernel-irqchip=on, please use 'split|off'."); + return; + } + + if (x86_class->realize) { + x86_class->realize(dev, errp); + } +} + +static Property x86_iommu_properties[] = { + DEFINE_PROP_ON_OFF_AUTO("intremap", X86IOMMUState, + intr_supported, ON_OFF_AUTO_AUTO), + DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false), + DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true), + DEFINE_PROP_END_OF_LIST(), +}; + +static void x86_iommu_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + dc->realize = x86_iommu_realize; + device_class_set_props(dc, x86_iommu_properties); +} + +bool x86_iommu_ir_supported(X86IOMMUState *s) +{ + return s->intr_supported == ON_OFF_AUTO_ON; +} + +static const TypeInfo x86_iommu_info = { + .name = TYPE_X86_IOMMU_DEVICE, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(X86IOMMUState), + .class_init = x86_iommu_class_init, + .class_size = sizeof(X86IOMMUClass), + .abstract = true, +}; + +static void x86_iommu_register_types(void) +{ + type_register_static(&x86_iommu_info); +} + +type_init(x86_iommu_register_types) diff --git a/hw/i386/x86.c b/hw/i386/x86.c new file mode 100644 index 000000000..b84840a1b --- /dev/null +++ b/hw/i386/x86.c @@ -0,0 +1,1399 @@ +/* + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2019 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qemu/option.h" +#include "qemu/cutils.h" +#include "qemu/units.h" +#include "qemu-common.h" +#include "qemu/datadir.h" +#include "qapi/error.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qapi-visit-common.h" +#include "qapi/clone-visitor.h" +#include "qapi/qapi-visit-machine.h" +#include "qapi/visitor.h" +#include "sysemu/qtest.h" +#include "sysemu/whpx.h" +#include "sysemu/numa.h" +#include "sysemu/replay.h" +#include "sysemu/sysemu.h" +#include "sysemu/cpu-timers.h" +#include "trace.h" + +#include "hw/i386/x86.h" +#include "target/i386/cpu.h" +#include "hw/i386/topology.h" +#include "hw/i386/fw_cfg.h" +#include "hw/intc/i8259.h" +#include "hw/rtc/mc146818rtc.h" +#include "target/i386/sev.h" + +#include "hw/acpi/cpu_hotplug.h" +#include "hw/irq.h" +#include "hw/nmi.h" +#include "hw/loader.h" +#include "multiboot.h" +#include "elf.h" +#include "standard-headers/asm-x86/bootparam.h" +#include CONFIG_DEVICES +#include "kvm/kvm_i386.h" + +/* Physical Address of PVH entry point read from kernel ELF NOTE */ +static size_t pvh_start_addr; + +inline void init_topo_info(X86CPUTopoInfo *topo_info, + const X86MachineState *x86ms) +{ + MachineState *ms = MACHINE(x86ms); + + topo_info->dies_per_pkg = ms->smp.dies; + topo_info->cores_per_die = ms->smp.cores; + topo_info->threads_per_core = ms->smp.threads; +} + +/* + * Calculates initial APIC ID for a specific CPU index + * + * Currently we need to be able to calculate the APIC ID from the CPU index + * alone (without requiring a CPU object), as the QEMU<->Seabios interfaces have + * no concept of "CPU index", and the NUMA tables on fw_cfg need the APIC ID of + * all CPUs up to max_cpus. + */ +uint32_t x86_cpu_apic_id_from_index(X86MachineState *x86ms, + unsigned int cpu_index) +{ + X86MachineClass *x86mc = X86_MACHINE_GET_CLASS(x86ms); + X86CPUTopoInfo topo_info; + uint32_t correct_id; + static bool warned; + + init_topo_info(&topo_info, x86ms); + + correct_id = x86_apicid_from_cpu_idx(&topo_info, cpu_index); + if (x86mc->compat_apic_id_mode) { + if (cpu_index != correct_id && !warned && !qtest_enabled()) { + error_report("APIC IDs set in compatibility mode, " + "CPU topology won't match the configuration"); + warned = true; + } + return cpu_index; + } else { + return correct_id; + } +} + + +void x86_cpu_new(X86MachineState *x86ms, int64_t apic_id, Error **errp) +{ + Object *cpu = object_new(MACHINE(x86ms)->cpu_type); + + if (!object_property_set_uint(cpu, "apic-id", apic_id, errp)) { + goto out; + } + qdev_realize(DEVICE(cpu), NULL, errp); + +out: + object_unref(cpu); +} + +void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version) +{ + int i; + const CPUArchIdList *possible_cpus; + MachineState *ms = MACHINE(x86ms); + MachineClass *mc = MACHINE_GET_CLASS(x86ms); + + x86_cpu_set_default_version(default_cpu_version); + + /* + * Calculates the limit to CPU APIC ID values + * + * Limit for the APIC ID value, so that all + * CPU APIC IDs are < x86ms->apic_id_limit. + * + * This is used for FW_CFG_MAX_CPUS. See comments on fw_cfg_arch_create(). 
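+     *
+     * For instance, with "-smp 6,sockets=2,cores=3,threads=1" the core-id
+     * field of the APIC ID is rounded up to 2 bits, so the last CPU gets
+     * APIC ID 6 and apic_id_limit becomes 7 even though max_cpus is only 6.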
+ */ + x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms, + ms->smp.max_cpus - 1) + 1; + possible_cpus = mc->possible_cpu_arch_ids(ms); + for (i = 0; i < ms->smp.cpus; i++) { + x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal); + } +} + +void x86_rtc_set_cpus_count(ISADevice *rtc, uint16_t cpus_count) +{ + if (cpus_count > 0xff) { + /* + * If the number of CPUs can't be represented in 8 bits, the + * BIOS must use "FW_CFG_NB_CPUS". Set RTC field to 0 just + * to make old BIOSes fail more predictably. + */ + rtc_set_memory(rtc, 0x5f, 0); + } else { + rtc_set_memory(rtc, 0x5f, cpus_count - 1); + } +} + +static int x86_apic_cmp(const void *a, const void *b) +{ + CPUArchId *apic_a = (CPUArchId *)a; + CPUArchId *apic_b = (CPUArchId *)b; + + return apic_a->arch_id - apic_b->arch_id; +} + +/* + * returns pointer to CPUArchId descriptor that matches CPU's apic_id + * in ms->possible_cpus->cpus, if ms->possible_cpus->cpus has no + * entry corresponding to CPU's apic_id returns NULL. + */ +CPUArchId *x86_find_cpu_slot(MachineState *ms, uint32_t id, int *idx) +{ + CPUArchId apic_id, *found_cpu; + + apic_id.arch_id = id; + found_cpu = bsearch(&apic_id, ms->possible_cpus->cpus, + ms->possible_cpus->len, sizeof(*ms->possible_cpus->cpus), + x86_apic_cmp); + if (found_cpu && idx) { + *idx = found_cpu - ms->possible_cpus->cpus; + } + return found_cpu; +} + +void x86_cpu_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + CPUArchId *found_cpu; + Error *local_err = NULL; + X86CPU *cpu = X86_CPU(dev); + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + + if (x86ms->acpi_dev) { + hotplug_handler_plug(x86ms->acpi_dev, dev, &local_err); + if (local_err) { + goto out; + } + } + + /* increment the number of CPUs */ + x86ms->boot_cpus++; + if (x86ms->rtc) { + x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus); + } + if (x86ms->fw_cfg) { + fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus); + } + + found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL); + found_cpu->cpu = OBJECT(dev); +out: + error_propagate(errp, local_err); +} + +void x86_cpu_unplug_request_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + int idx = -1; + X86CPU *cpu = X86_CPU(dev); + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + + if (!x86ms->acpi_dev) { + error_setg(errp, "CPU hot unplug not supported without ACPI"); + return; + } + + x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx); + assert(idx != -1); + if (idx == 0) { + error_setg(errp, "Boot CPU is unpluggable"); + return; + } + + hotplug_handler_unplug_request(x86ms->acpi_dev, dev, + errp); +} + +void x86_cpu_unplug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + CPUArchId *found_cpu; + Error *local_err = NULL; + X86CPU *cpu = X86_CPU(dev); + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + + hotplug_handler_unplug(x86ms->acpi_dev, dev, &local_err); + if (local_err) { + goto out; + } + + found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL); + found_cpu->cpu = NULL; + qdev_unrealize(dev); + + /* decrement the number of CPUs */ + x86ms->boot_cpus--; + /* Update the number of CPUs in CMOS */ + x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus); + fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus); + out: + error_propagate(errp, local_err); +} + +void x86_cpu_pre_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + int idx; + CPUState *cs; + CPUArchId *cpu_slot; + X86CPUTopoIDs topo_ids; + X86CPU *cpu = 
X86_CPU(dev); + CPUX86State *env = &cpu->env; + MachineState *ms = MACHINE(hotplug_dev); + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + unsigned int smp_cores = ms->smp.cores; + unsigned int smp_threads = ms->smp.threads; + X86CPUTopoInfo topo_info; + + if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { + error_setg(errp, "Invalid CPU type, expected cpu type: '%s'", + ms->cpu_type); + return; + } + + if (x86ms->acpi_dev) { + Error *local_err = NULL; + + hotplug_handler_pre_plug(HOTPLUG_HANDLER(x86ms->acpi_dev), dev, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } + + init_topo_info(&topo_info, x86ms); + + env->nr_dies = ms->smp.dies; + + /* + * If APIC ID is not set, + * set it based on socket/die/core/thread properties. + */ + if (cpu->apic_id == UNASSIGNED_APIC_ID) { + int max_socket = (ms->smp.max_cpus - 1) / + smp_threads / smp_cores / ms->smp.dies; + + /* + * die-id was optional in QEMU 4.0 and older, so keep it optional + * if there's only one die per socket. + */ + if (cpu->die_id < 0 && ms->smp.dies == 1) { + cpu->die_id = 0; + } + + if (cpu->socket_id < 0) { + error_setg(errp, "CPU socket-id is not set"); + return; + } else if (cpu->socket_id > max_socket) { + error_setg(errp, "Invalid CPU socket-id: %u must be in range 0:%u", + cpu->socket_id, max_socket); + return; + } + if (cpu->die_id < 0) { + error_setg(errp, "CPU die-id is not set"); + return; + } else if (cpu->die_id > ms->smp.dies - 1) { + error_setg(errp, "Invalid CPU die-id: %u must be in range 0:%u", + cpu->die_id, ms->smp.dies - 1); + return; + } + if (cpu->core_id < 0) { + error_setg(errp, "CPU core-id is not set"); + return; + } else if (cpu->core_id > (smp_cores - 1)) { + error_setg(errp, "Invalid CPU core-id: %u must be in range 0:%u", + cpu->core_id, smp_cores - 1); + return; + } + if (cpu->thread_id < 0) { + error_setg(errp, "CPU thread-id is not set"); + return; + } else if (cpu->thread_id > (smp_threads - 1)) { + error_setg(errp, "Invalid CPU thread-id: %u must be in range 0:%u", + cpu->thread_id, smp_threads - 1); + return; + } + + topo_ids.pkg_id = cpu->socket_id; + topo_ids.die_id = cpu->die_id; + topo_ids.core_id = cpu->core_id; + topo_ids.smt_id = cpu->thread_id; + cpu->apic_id = x86_apicid_from_topo_ids(&topo_info, &topo_ids); + } + + cpu_slot = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx); + if (!cpu_slot) { + MachineState *ms = MACHINE(x86ms); + + x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids); + error_setg(errp, + "Invalid CPU [socket: %u, die: %u, core: %u, thread: %u] with" + " APIC ID %" PRIu32 ", valid index range 0:%d", + topo_ids.pkg_id, topo_ids.die_id, topo_ids.core_id, topo_ids.smt_id, + cpu->apic_id, ms->possible_cpus->len - 1); + return; + } + + if (cpu_slot->cpu) { + error_setg(errp, "CPU[%d] with APIC ID %" PRIu32 " exists", + idx, cpu->apic_id); + return; + } + + /* if 'address' properties socket-id/core-id/thread-id are not set, set them + * so that machine_query_hotpluggable_cpus would show correct values + */ + /* TODO: move socket_id/core_id/thread_id checks into x86_cpu_realizefn() + * once -smp refactoring is complete and there will be CPU private + * CPUState::nr_cores and CPUState::nr_threads fields instead of globals */ + x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids); + if (cpu->socket_id != -1 && cpu->socket_id != topo_ids.pkg_id) { + error_setg(errp, "property socket-id: %u doesn't match set apic-id:" + " 0x%x (socket-id: %u)", cpu->socket_id, cpu->apic_id, + topo_ids.pkg_id); + return; 
+ } + cpu->socket_id = topo_ids.pkg_id; + + if (cpu->die_id != -1 && cpu->die_id != topo_ids.die_id) { + error_setg(errp, "property die-id: %u doesn't match set apic-id:" + " 0x%x (die-id: %u)", cpu->die_id, cpu->apic_id, topo_ids.die_id); + return; + } + cpu->die_id = topo_ids.die_id; + + if (cpu->core_id != -1 && cpu->core_id != topo_ids.core_id) { + error_setg(errp, "property core-id: %u doesn't match set apic-id:" + " 0x%x (core-id: %u)", cpu->core_id, cpu->apic_id, + topo_ids.core_id); + return; + } + cpu->core_id = topo_ids.core_id; + + if (cpu->thread_id != -1 && cpu->thread_id != topo_ids.smt_id) { + error_setg(errp, "property thread-id: %u doesn't match set apic-id:" + " 0x%x (thread-id: %u)", cpu->thread_id, cpu->apic_id, + topo_ids.smt_id); + return; + } + cpu->thread_id = topo_ids.smt_id; + + if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && + !kvm_hv_vpindex_settable()) { + error_setg(errp, "kernel doesn't allow setting HyperV VP_INDEX"); + return; + } + + cs = CPU(cpu); + cs->cpu_index = idx; + + numa_cpu_pre_plug(cpu_slot, dev, errp); +} + +CpuInstanceProperties +x86_cpu_index_to_props(MachineState *ms, unsigned cpu_index) +{ + MachineClass *mc = MACHINE_GET_CLASS(ms); + const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms); + + assert(cpu_index < possible_cpus->len); + return possible_cpus->cpus[cpu_index].props; +} + +int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx) +{ + X86CPUTopoIDs topo_ids; + X86MachineState *x86ms = X86_MACHINE(ms); + X86CPUTopoInfo topo_info; + + init_topo_info(&topo_info, x86ms); + + assert(idx < ms->possible_cpus->len); + x86_topo_ids_from_apicid(ms->possible_cpus->cpus[idx].arch_id, + &topo_info, &topo_ids); + return topo_ids.pkg_id % ms->numa_state->num_nodes; +} + +const CPUArchIdList *x86_possible_cpu_arch_ids(MachineState *ms) +{ + X86MachineState *x86ms = X86_MACHINE(ms); + unsigned int max_cpus = ms->smp.max_cpus; + X86CPUTopoInfo topo_info; + int i; + + if (ms->possible_cpus) { + /* + * make sure that max_cpus hasn't changed since the first use, i.e. 
+ * -smp hasn't been parsed after it + */ + assert(ms->possible_cpus->len == max_cpus); + return ms->possible_cpus; + } + + ms->possible_cpus = g_malloc0(sizeof(CPUArchIdList) + + sizeof(CPUArchId) * max_cpus); + ms->possible_cpus->len = max_cpus; + + init_topo_info(&topo_info, x86ms); + + for (i = 0; i < ms->possible_cpus->len; i++) { + X86CPUTopoIDs topo_ids; + + ms->possible_cpus->cpus[i].type = ms->cpu_type; + ms->possible_cpus->cpus[i].vcpus_count = 1; + ms->possible_cpus->cpus[i].arch_id = + x86_cpu_apic_id_from_index(x86ms, i); + x86_topo_ids_from_apicid(ms->possible_cpus->cpus[i].arch_id, + &topo_info, &topo_ids); + ms->possible_cpus->cpus[i].props.has_socket_id = true; + ms->possible_cpus->cpus[i].props.socket_id = topo_ids.pkg_id; + if (ms->smp.dies > 1) { + ms->possible_cpus->cpus[i].props.has_die_id = true; + ms->possible_cpus->cpus[i].props.die_id = topo_ids.die_id; + } + ms->possible_cpus->cpus[i].props.has_core_id = true; + ms->possible_cpus->cpus[i].props.core_id = topo_ids.core_id; + ms->possible_cpus->cpus[i].props.has_thread_id = true; + ms->possible_cpus->cpus[i].props.thread_id = topo_ids.smt_id; + } + return ms->possible_cpus; +} + +static void x86_nmi(NMIState *n, int cpu_index, Error **errp) +{ + /* cpu index isn't used */ + CPUState *cs; + + CPU_FOREACH(cs) { + X86CPU *cpu = X86_CPU(cs); + + if (!cpu->apic_state) { + cpu_interrupt(cs, CPU_INTERRUPT_NMI); + } else { + apic_deliver_nmi(cpu->apic_state); + } + } +} + +static long get_file_size(FILE *f) +{ + long where, size; + + /* XXX: on Unix systems, using fstat() probably makes more sense */ + + where = ftell(f); + fseek(f, 0, SEEK_END); + size = ftell(f); + fseek(f, where, SEEK_SET); + + return size; +} + +/* TSC handling */ +uint64_t cpu_get_tsc(CPUX86State *env) +{ + return cpus_get_elapsed_ticks(); +} + +/* IRQ handling */ +static void pic_irq_request(void *opaque, int irq, int level) +{ + CPUState *cs = first_cpu; + X86CPU *cpu = X86_CPU(cs); + + trace_x86_pic_interrupt(irq, level); + if (cpu->apic_state && !kvm_irqchip_in_kernel() && + !whpx_apic_in_platform()) { + CPU_FOREACH(cs) { + cpu = X86_CPU(cs); + if (apic_accept_pic_intr(cpu->apic_state)) { + apic_deliver_pic_intr(cpu->apic_state, level); + } + } + } else { + if (level) { + cpu_interrupt(cs, CPU_INTERRUPT_HARD); + } else { + cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD); + } + } +} + +qemu_irq x86_allocate_cpu_irq(void) +{ + return qemu_allocate_irq(pic_irq_request, NULL, 0); +} + +int cpu_get_pic_interrupt(CPUX86State *env) +{ + X86CPU *cpu = env_archcpu(env); + int intno; + + if (!kvm_irqchip_in_kernel() && !whpx_apic_in_platform()) { + intno = apic_get_interrupt(cpu->apic_state); + if (intno >= 0) { + return intno; + } + /* read the irq from the PIC */ + if (!apic_accept_pic_intr(cpu->apic_state)) { + return -1; + } + } + + intno = pic_read_irq(isa_pic); + return intno; +} + +DeviceState *cpu_get_current_apic(void) +{ + if (current_cpu) { + X86CPU *cpu = X86_CPU(current_cpu); + return cpu->apic_state; + } else { + return NULL; + } +} + +void gsi_handler(void *opaque, int n, int level) +{ + GSIState *s = opaque; + + trace_x86_gsi_interrupt(n, level); + switch (n) { + case 0 ... ISA_NUM_IRQS - 1: + if (s->i8259_irq[n]) { + /* Under KVM, Kernel will forward to both PIC and IOAPIC */ + qemu_set_irq(s->i8259_irq[n], level); + } + /* fall through */ + case ISA_NUM_IRQS ... IOAPIC_NUM_PINS - 1: + qemu_set_irq(s->ioapic_irq[n], level); + break; + case IO_APIC_SECONDARY_IRQBASE + ... 
IO_APIC_SECONDARY_IRQBASE + IOAPIC_NUM_PINS - 1: + qemu_set_irq(s->ioapic2_irq[n - IO_APIC_SECONDARY_IRQBASE], level); + break; + } +} + +void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name) +{ + DeviceState *dev; + SysBusDevice *d; + unsigned int i; + + assert(parent_name); + if (kvm_ioapic_in_kernel()) { + dev = qdev_new(TYPE_KVM_IOAPIC); + } else { + dev = qdev_new(TYPE_IOAPIC); + } + object_property_add_child(object_resolve_path(parent_name, NULL), + "ioapic", OBJECT(dev)); + d = SYS_BUS_DEVICE(dev); + sysbus_realize_and_unref(d, &error_fatal); + sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS); + + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + gsi_state->ioapic_irq[i] = qdev_get_gpio_in(dev, i); + } +} + +DeviceState *ioapic_init_secondary(GSIState *gsi_state) +{ + DeviceState *dev; + SysBusDevice *d; + unsigned int i; + + dev = qdev_new(TYPE_IOAPIC); + d = SYS_BUS_DEVICE(dev); + sysbus_realize_and_unref(d, &error_fatal); + sysbus_mmio_map(d, 0, IO_APIC_SECONDARY_ADDRESS); + + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + gsi_state->ioapic2_irq[i] = qdev_get_gpio_in(dev, i); + } + return dev; +} + +struct setup_data { + uint64_t next; + uint32_t type; + uint32_t len; + uint8_t data[]; +} __attribute__((packed)); + + +/* + * The entry point into the kernel for PVH boot is different from + * the native entry point. The PVH entry is defined by the x86/HVM + * direct boot ABI and is available in an ELFNOTE in the kernel binary. + * + * This function is passed to load_elf() when it is called from + * load_elfboot() which then additionally checks for an ELF Note of + * type XEN_ELFNOTE_PHYS32_ENTRY and passes it to this function to + * parse the PVH entry address from the ELF Note. + * + * Due to trickery in elf_opts.h, load_elf() is actually available as + * load_elf32() or load_elf64() and this routine needs to be able + * to deal with being called as 32 or 64 bit. + * + * The address of the PVH entry point is saved to the 'pvh_start_addr' + * global variable. (although the entry point is 32-bit, the kernel + * binary can be either 32-bit or 64-bit). + */ +static uint64_t read_pvh_start_addr(void *arg1, void *arg2, bool is64) +{ + size_t *elf_note_data_addr; + + /* Check if ELF Note header passed in is valid */ + if (arg1 == NULL) { + return 0; + } + + if (is64) { + struct elf64_note *nhdr64 = (struct elf64_note *)arg1; + uint64_t nhdr_size64 = sizeof(struct elf64_note); + uint64_t phdr_align = *(uint64_t *)arg2; + uint64_t nhdr_namesz = nhdr64->n_namesz; + + elf_note_data_addr = + ((void *)nhdr64) + nhdr_size64 + + QEMU_ALIGN_UP(nhdr_namesz, phdr_align); + + pvh_start_addr = *elf_note_data_addr; + } else { + struct elf32_note *nhdr32 = (struct elf32_note *)arg1; + uint32_t nhdr_size32 = sizeof(struct elf32_note); + uint32_t phdr_align = *(uint32_t *)arg2; + uint32_t nhdr_namesz = nhdr32->n_namesz; + + elf_note_data_addr = + ((void *)nhdr32) + nhdr_size32 + + QEMU_ALIGN_UP(nhdr_namesz, phdr_align); + + pvh_start_addr = *(uint32_t *)elf_note_data_addr; + } + + return pvh_start_addr; +} + +static bool load_elfboot(const char *kernel_filename, + int kernel_file_size, + uint8_t *header, + size_t pvh_xen_start_addr, + FWCfgState *fw_cfg) +{ + uint32_t flags = 0; + uint32_t mh_load_addr = 0; + uint32_t elf_kernel_size = 0; + uint64_t elf_entry; + uint64_t elf_low, elf_high; + int kernel_size; + + if (ldl_p(header) != 0x464c457f) { + return false; /* no elfboot */ + } + + bool elf_is64 = header[EI_CLASS] == ELFCLASS64; + flags = elf_is64 ? 
+ ((Elf64_Ehdr *)header)->e_flags : ((Elf32_Ehdr *)header)->e_flags; + + if (flags & 0x00010004) { /* LOAD_ELF_HEADER_HAS_ADDR */ + error_report("elfboot unsupported flags = %x", flags); + exit(1); + } + + uint64_t elf_note_type = XEN_ELFNOTE_PHYS32_ENTRY; + kernel_size = load_elf(kernel_filename, read_pvh_start_addr, + NULL, &elf_note_type, &elf_entry, + &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE, + 0, 0); + + if (kernel_size < 0) { + error_report("Error while loading elf kernel"); + exit(1); + } + mh_load_addr = elf_low; + elf_kernel_size = elf_high - elf_low; + + if (pvh_start_addr == 0) { + error_report("Error loading uncompressed kernel without PVH ELF Note"); + exit(1); + } + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, pvh_start_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, elf_kernel_size); + + return true; +} + +void x86_load_linux(X86MachineState *x86ms, + FWCfgState *fw_cfg, + int acpi_data_size, + bool pvh_enabled) +{ + bool linuxboot_dma_enabled = X86_MACHINE_GET_CLASS(x86ms)->fwcfg_dma_enabled; + uint16_t protocol; + int setup_size, kernel_size, cmdline_size; + int dtb_size, setup_data_offset; + uint32_t initrd_max; + uint8_t header[8192], *setup, *kernel; + hwaddr real_addr, prot_addr, cmdline_addr, initrd_addr = 0; + FILE *f; + char *vmode; + MachineState *machine = MACHINE(x86ms); + struct setup_data *setup_data; + const char *kernel_filename = machine->kernel_filename; + const char *initrd_filename = machine->initrd_filename; + const char *dtb_filename = machine->dtb; + const char *kernel_cmdline = machine->kernel_cmdline; + SevKernelLoaderContext sev_load_ctx = {}; + + /* Align to 16 bytes as a paranoia measure */ + cmdline_size = (strlen(kernel_cmdline) + 16) & ~15; + + /* load the kernel header */ + f = fopen(kernel_filename, "rb"); + if (!f) { + fprintf(stderr, "qemu: could not open kernel file '%s': %s\n", + kernel_filename, strerror(errno)); + exit(1); + } + + kernel_size = get_file_size(f); + if (!kernel_size || + fread(header, 1, MIN(ARRAY_SIZE(header), kernel_size), f) != + MIN(ARRAY_SIZE(header), kernel_size)) { + fprintf(stderr, "qemu: could not load kernel '%s': %s\n", + kernel_filename, strerror(errno)); + exit(1); + } + + /* kernel protocol version */ + if (ldl_p(header + 0x202) == 0x53726448) { + protocol = lduw_p(header + 0x206); + } else { + /* + * This could be a multiboot kernel. If it is, let's stop treating it + * like a Linux kernel. + * Note: some multiboot images could be in the ELF format (the same of + * PVH), so we try multiboot first since we check the multiboot magic + * header before to load it. + */ + if (load_multiboot(x86ms, fw_cfg, f, kernel_filename, initrd_filename, + kernel_cmdline, kernel_size, header)) { + return; + } + /* + * Check if the file is an uncompressed kernel file (ELF) and load it, + * saving the PVH entry point used by the x86/HVM direct boot ABI. + * If load_elfboot() is successful, populate the fw_cfg info. 
+ */ + if (pvh_enabled && + load_elfboot(kernel_filename, kernel_size, + header, pvh_start_addr, fw_cfg)) { + fclose(f); + + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, + strlen(kernel_cmdline) + 1); + fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline); + + fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, sizeof(header)); + fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, + header, sizeof(header)); + + /* load initrd */ + if (initrd_filename) { + GMappedFile *mapped_file; + gsize initrd_size; + gchar *initrd_data; + GError *gerr = NULL; + + mapped_file = g_mapped_file_new(initrd_filename, false, &gerr); + if (!mapped_file) { + fprintf(stderr, "qemu: error reading initrd %s: %s\n", + initrd_filename, gerr->message); + exit(1); + } + x86ms->initrd_mapped_file = mapped_file; + + initrd_data = g_mapped_file_get_contents(mapped_file); + initrd_size = g_mapped_file_get_length(mapped_file); + initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; + if (initrd_size >= initrd_max) { + fprintf(stderr, "qemu: initrd is too large, cannot support." + "(max: %"PRIu32", need %"PRId64")\n", + initrd_max, (uint64_t)initrd_size); + exit(1); + } + + initrd_addr = (initrd_max - initrd_size) & ~4095; + + fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size); + fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, + initrd_size); + } + + option_rom[nb_option_roms].bootindex = 0; + option_rom[nb_option_roms].name = "pvh.bin"; + nb_option_roms++; + + return; + } + protocol = 0; + } + + if (protocol < 0x200 || !(header[0x211] & 0x01)) { + /* Low kernel */ + real_addr = 0x90000; + cmdline_addr = 0x9a000 - cmdline_size; + prot_addr = 0x10000; + } else if (protocol < 0x202) { + /* High but ancient kernel */ + real_addr = 0x90000; + cmdline_addr = 0x9a000 - cmdline_size; + prot_addr = 0x100000; + } else { + /* High and recent kernel */ + real_addr = 0x10000; + cmdline_addr = 0x20000; + prot_addr = 0x100000; + } + + /* highest address for loading the initrd */ + if (protocol >= 0x20c && + lduw_p(header + 0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) { + /* + * Linux has supported initrd up to 4 GB for a very long time (2007, + * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 2013), + * though it only sets initrd_max to 2 GB to "work around bootloader + * bugs". Luckily, QEMU firmware(which does something like bootloader) + * has supported this. + * + * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, initrd can + * be loaded into any address. + * + * In addition, initrd_max is uint32_t simply because QEMU doesn't + * support the 64-bit boot protocol (specifically the ext_ramdisk_image + * field). + * + * Therefore here just limit initrd_max to UINT32_MAX simply as well. 
+ */ + initrd_max = UINT32_MAX; + } else if (protocol >= 0x203) { + initrd_max = ldl_p(header + 0x22c); + } else { + initrd_max = 0x37ffffff; + } + + if (initrd_max >= x86ms->below_4g_mem_size - acpi_data_size) { + initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; + } + + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(kernel_cmdline) + 1); + fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline); + sev_load_ctx.cmdline_data = (char *)kernel_cmdline; + sev_load_ctx.cmdline_size = strlen(kernel_cmdline) + 1; + + if (protocol >= 0x202) { + stl_p(header + 0x228, cmdline_addr); + } else { + stw_p(header + 0x20, 0xA33F); + stw_p(header + 0x22, cmdline_addr - real_addr); + } + + /* handle vga= parameter */ + vmode = strstr(kernel_cmdline, "vga="); + if (vmode) { + unsigned int video_mode; + const char *end; + int ret; + /* skip "vga=" */ + vmode += 4; + if (!strncmp(vmode, "normal", 6)) { + video_mode = 0xffff; + } else if (!strncmp(vmode, "ext", 3)) { + video_mode = 0xfffe; + } else if (!strncmp(vmode, "ask", 3)) { + video_mode = 0xfffd; + } else { + ret = qemu_strtoui(vmode, &end, 0, &video_mode); + if (ret != 0 || (*end && *end != ' ')) { + fprintf(stderr, "qemu: invalid 'vga=' kernel parameter.\n"); + exit(1); + } + } + stw_p(header + 0x1fa, video_mode); + } + + /* loader type */ + /* + * High nybble = B reserved for QEMU; low nybble is revision number. + * If this code is substantially changed, you may want to consider + * incrementing the revision. + */ + if (protocol >= 0x200) { + header[0x210] = 0xB0; + } + /* heap */ + if (protocol >= 0x201) { + header[0x211] |= 0x80; /* CAN_USE_HEAP */ + stw_p(header + 0x224, cmdline_addr - real_addr - 0x200); + } + + /* load initrd */ + if (initrd_filename) { + GMappedFile *mapped_file; + gsize initrd_size; + gchar *initrd_data; + GError *gerr = NULL; + + if (protocol < 0x200) { + fprintf(stderr, "qemu: linux kernel too old to load a ram disk\n"); + exit(1); + } + + mapped_file = g_mapped_file_new(initrd_filename, false, &gerr); + if (!mapped_file) { + fprintf(stderr, "qemu: error reading initrd %s: %s\n", + initrd_filename, gerr->message); + exit(1); + } + x86ms->initrd_mapped_file = mapped_file; + + initrd_data = g_mapped_file_get_contents(mapped_file); + initrd_size = g_mapped_file_get_length(mapped_file); + if (initrd_size >= initrd_max) { + fprintf(stderr, "qemu: initrd is too large, cannot support." 
+ "(max: %"PRIu32", need %"PRId64")\n", + initrd_max, (uint64_t)initrd_size); + exit(1); + } + + initrd_addr = (initrd_max - initrd_size) & ~4095; + + fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size); + fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, initrd_size); + sev_load_ctx.initrd_data = initrd_data; + sev_load_ctx.initrd_size = initrd_size; + + stl_p(header + 0x218, initrd_addr); + stl_p(header + 0x21c, initrd_size); + } + + /* load kernel and setup */ + setup_size = header[0x1f1]; + if (setup_size == 0) { + setup_size = 4; + } + setup_size = (setup_size + 1) * 512; + if (setup_size > kernel_size) { + fprintf(stderr, "qemu: invalid kernel header\n"); + exit(1); + } + kernel_size -= setup_size; + + setup = g_malloc(setup_size); + kernel = g_malloc(kernel_size); + fseek(f, 0, SEEK_SET); + if (fread(setup, 1, setup_size, f) != setup_size) { + fprintf(stderr, "fread() failed\n"); + exit(1); + } + if (fread(kernel, 1, kernel_size, f) != kernel_size) { + fprintf(stderr, "fread() failed\n"); + exit(1); + } + fclose(f); + + /* append dtb to kernel */ + if (dtb_filename) { + if (protocol < 0x209) { + fprintf(stderr, "qemu: Linux kernel too old to load a dtb\n"); + exit(1); + } + + dtb_size = get_image_size(dtb_filename); + if (dtb_size <= 0) { + fprintf(stderr, "qemu: error reading dtb %s: %s\n", + dtb_filename, strerror(errno)); + exit(1); + } + + setup_data_offset = QEMU_ALIGN_UP(kernel_size, 16); + kernel_size = setup_data_offset + sizeof(struct setup_data) + dtb_size; + kernel = g_realloc(kernel, kernel_size); + + stq_p(header + 0x250, prot_addr + setup_data_offset); + + setup_data = (struct setup_data *)(kernel + setup_data_offset); + setup_data->next = 0; + setup_data->type = cpu_to_le32(SETUP_DTB); + setup_data->len = cpu_to_le32(dtb_size); + + load_image_size(dtb_filename, setup_data->data, dtb_size); + } + + /* + * If we're starting an encrypted VM, it will be OVMF based, which uses the + * efi stub for booting and doesn't require any values to be placed in the + * kernel header. We therefore don't update the header so the hash of the + * kernel on the other side of the fw_cfg interface matches the hash of the + * file the user passed in. 
+ */ + if (!sev_enabled()) { + memcpy(setup, header, MIN(sizeof(header), setup_size)); + } + + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, prot_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, kernel_size); + fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, kernel, kernel_size); + sev_load_ctx.kernel_data = (char *)kernel; + sev_load_ctx.kernel_size = kernel_size; + + fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_ADDR, real_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, setup_size); + fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, setup, setup_size); + sev_load_ctx.setup_data = (char *)setup; + sev_load_ctx.setup_size = setup_size; + + if (sev_enabled()) { + sev_add_kernel_loader_hashes(&sev_load_ctx, &error_fatal); + } + + option_rom[nb_option_roms].bootindex = 0; + option_rom[nb_option_roms].name = "linuxboot.bin"; + if (linuxboot_dma_enabled && fw_cfg_dma_enabled(fw_cfg)) { + option_rom[nb_option_roms].name = "linuxboot_dma.bin"; + } + nb_option_roms++; +} + +void x86_bios_rom_init(MachineState *ms, const char *default_firmware, + MemoryRegion *rom_memory, bool isapc_ram_fw) +{ + const char *bios_name; + char *filename; + MemoryRegion *bios, *isa_bios; + int bios_size, isa_bios_size; + int ret; + + /* BIOS load */ + bios_name = ms->firmware ?: default_firmware; + filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name); + if (filename) { + bios_size = get_image_size(filename); + } else { + bios_size = -1; + } + if (bios_size <= 0 || + (bios_size % 65536) != 0) { + goto bios_error; + } + bios = g_malloc(sizeof(*bios)); + memory_region_init_ram(bios, NULL, "pc.bios", bios_size, &error_fatal); + if (!isapc_ram_fw) { + memory_region_set_readonly(bios, true); + } + ret = rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1); + if (ret != 0) { + bios_error: + fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name); + exit(1); + } + g_free(filename); + + /* map the last 128KB of the BIOS in ISA space */ + isa_bios_size = MIN(bios_size, 128 * KiB); + isa_bios = g_malloc(sizeof(*isa_bios)); + memory_region_init_alias(isa_bios, NULL, "isa-bios", bios, + bios_size - isa_bios_size, isa_bios_size); + memory_region_add_subregion_overlap(rom_memory, + 0x100000 - isa_bios_size, + isa_bios, + 1); + if (!isapc_ram_fw) { + memory_region_set_readonly(isa_bios, true); + } + + /* map all the bios at the top of memory */ + memory_region_add_subregion(rom_memory, + (uint32_t)(-bios_size), + bios); +} + +bool x86_machine_is_smm_enabled(const X86MachineState *x86ms) +{ + bool smm_available = false; + + if (x86ms->smm == ON_OFF_AUTO_OFF) { + return false; + } + + if (tcg_enabled() || qtest_enabled()) { + smm_available = true; + } else if (kvm_enabled()) { + smm_available = kvm_has_smm(); + } + + if (smm_available) { + return true; + } + + if (x86ms->smm == ON_OFF_AUTO_ON) { + error_report("System Management Mode not supported by this hypervisor."); + exit(1); + } + return false; +} + +static void x86_machine_get_smm(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + OnOffAuto smm = x86ms->smm; + + visit_type_OnOffAuto(v, name, &smm, errp); +} + +static void x86_machine_set_smm(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &x86ms->smm, errp); +} + +bool x86_machine_is_acpi_enabled(const X86MachineState *x86ms) +{ + if (x86ms->acpi == ON_OFF_AUTO_OFF) { + return false; + } + return true; +} + +static void x86_machine_get_acpi(Object 
*obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + OnOffAuto acpi = x86ms->acpi; + + visit_type_OnOffAuto(v, name, &acpi, errp); +} + +static void x86_machine_set_acpi(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &x86ms->acpi, errp); +} + +static char *x86_machine_get_oem_id(Object *obj, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + return g_strdup(x86ms->oem_id); +} + +static void x86_machine_set_oem_id(Object *obj, const char *value, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + size_t len = strlen(value); + + if (len > 6) { + error_setg(errp, + "User specified "X86_MACHINE_OEM_ID" value is bigger than " + "6 bytes in size"); + return; + } + + strncpy(x86ms->oem_id, value, 6); +} + +static char *x86_machine_get_oem_table_id(Object *obj, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + return g_strdup(x86ms->oem_table_id); +} + +static void x86_machine_set_oem_table_id(Object *obj, const char *value, + Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + size_t len = strlen(value); + + if (len > 8) { + error_setg(errp, + "User specified "X86_MACHINE_OEM_TABLE_ID + " value is bigger than " + "8 bytes in size"); + return; + } + strncpy(x86ms->oem_table_id, value, 8); +} + +static void x86_machine_get_bus_lock_ratelimit(Object *obj, Visitor *v, + const char *name, void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + uint64_t bus_lock_ratelimit = x86ms->bus_lock_ratelimit; + + visit_type_uint64(v, name, &bus_lock_ratelimit, errp); +} + +static void x86_machine_set_bus_lock_ratelimit(Object *obj, Visitor *v, + const char *name, void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + visit_type_uint64(v, name, &x86ms->bus_lock_ratelimit, errp); +} + +static void machine_get_sgx_epc(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + SgxEPCList *list = x86ms->sgx_epc_list; + + visit_type_SgxEPCList(v, name, &list, errp); +} + +static void machine_set_sgx_epc(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + SgxEPCList *list; + + list = x86ms->sgx_epc_list; + visit_type_SgxEPCList(v, name, &x86ms->sgx_epc_list, errp); + + qapi_free_SgxEPCList(list); +} + +static void x86_machine_initfn(Object *obj) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + x86ms->smm = ON_OFF_AUTO_AUTO; + x86ms->acpi = ON_OFF_AUTO_AUTO; + x86ms->pci_irq_mask = ACPI_BUILD_PCI_IRQS; + x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); + x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); + x86ms->bus_lock_ratelimit = 0; +} + +static void x86_machine_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + X86MachineClass *x86mc = X86_MACHINE_CLASS(oc); + NMIClass *nc = NMI_CLASS(oc); + + mc->cpu_index_to_instance_props = x86_cpu_index_to_props; + mc->get_default_cpu_node_id = x86_get_default_cpu_node_id; + mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids; + x86mc->compat_apic_id_mode = false; + x86mc->save_tsc_khz = true; + x86mc->fwcfg_dma_enabled = true; + nc->nmi_monitor_handler = x86_nmi; + + object_class_property_add(oc, X86_MACHINE_SMM, "OnOffAuto", + x86_machine_get_smm, x86_machine_set_smm, + NULL, NULL); + object_class_property_set_description(oc, 
X86_MACHINE_SMM, + "Enable SMM"); + + object_class_property_add(oc, X86_MACHINE_ACPI, "OnOffAuto", + x86_machine_get_acpi, x86_machine_set_acpi, + NULL, NULL); + object_class_property_set_description(oc, X86_MACHINE_ACPI, + "Enable ACPI"); + + object_class_property_add_str(oc, X86_MACHINE_OEM_ID, + x86_machine_get_oem_id, + x86_machine_set_oem_id); + object_class_property_set_description(oc, X86_MACHINE_OEM_ID, + "Override the default value of field OEMID " + "in ACPI table header." + "The string may be up to 6 bytes in size"); + + + object_class_property_add_str(oc, X86_MACHINE_OEM_TABLE_ID, + x86_machine_get_oem_table_id, + x86_machine_set_oem_table_id); + object_class_property_set_description(oc, X86_MACHINE_OEM_TABLE_ID, + "Override the default value of field OEM Table ID " + "in ACPI table header." + "The string may be up to 8 bytes in size"); + + object_class_property_add(oc, X86_MACHINE_BUS_LOCK_RATELIMIT, "uint64_t", + x86_machine_get_bus_lock_ratelimit, + x86_machine_set_bus_lock_ratelimit, NULL, NULL); + object_class_property_set_description(oc, X86_MACHINE_BUS_LOCK_RATELIMIT, + "Set the ratelimit for the bus locks acquired in VMs"); + + object_class_property_add(oc, "sgx-epc", "SgxEPC", + machine_get_sgx_epc, machine_set_sgx_epc, + NULL, NULL); + object_class_property_set_description(oc, "sgx-epc", + "SGX EPC device"); +} + +static const TypeInfo x86_machine_info = { + .name = TYPE_X86_MACHINE, + .parent = TYPE_MACHINE, + .abstract = true, + .instance_size = sizeof(X86MachineState), + .instance_init = x86_machine_initfn, + .class_size = sizeof(X86MachineClass), + .class_init = x86_machine_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_NMI }, + { } + }, +}; + +static void x86_machine_register_types(void) +{ + type_register_static(&x86_machine_info); +} + +type_init(x86_machine_register_types) diff --git a/hw/i386/xen/meson.build b/hw/i386/xen/meson.build new file mode 100644 index 000000000..be8413030 --- /dev/null +++ b/hw/i386/xen/meson.build @@ -0,0 +1,7 @@ +i386_ss.add(when: 'CONFIG_XEN', if_true: files( + 'xen-hvm.c', + 'xen-mapcache.c', + 'xen_apic.c', + 'xen_platform.c', + 'xen_pvdevice.c', +)) diff --git a/hw/i386/xen/trace-events b/hw/i386/xen/trace-events new file mode 100644 index 000000000..5d6be6109 --- /dev/null +++ b/hw/i386/xen/trace-events @@ -0,0 +1,28 @@ +# See docs/devel/tracing.rst for syntax documentation. 
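+#
+# Each line below declares one trace event: its name and C argument
+# prototype, followed by the printf-style format string used when the
+# event fires.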
+ +# xen_platform.c +xen_platform_log(char *s) "xen platform: %s" + +# xen_pvdevice.c +xen_pv_mmio_read(uint64_t addr) "WARNING: read from Xen PV Device MMIO space (address 0x%"PRIx64")" +xen_pv_mmio_write(uint64_t addr) "WARNING: write to Xen PV Device MMIO space (address 0x%"PRIx64")" + +# xen-hvm.c +xen_ram_alloc(unsigned long ram_addr, unsigned long size) "requested: 0x%lx, size 0x%lx" +xen_client_set_memory(uint64_t start_addr, unsigned long size, bool log_dirty) "0x%"PRIx64" size 0x%lx, log_dirty %i" +handle_ioreq(void *req, uint32_t type, uint32_t dir, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p type=%d dir=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d" +handle_ioreq_read(void *req, uint32_t type, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p read type=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d" +handle_ioreq_write(void *req, uint32_t type, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p write type=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d" +cpu_ioreq_pio(void *req, uint32_t dir, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p pio dir=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d" +cpu_ioreq_pio_read_reg(void *req, uint64_t data, uint64_t addr, uint32_t size) "I/O=%p pio read reg data=0x%"PRIx64" port=0x%"PRIx64" size=%d" +cpu_ioreq_pio_write_reg(void *req, uint64_t data, uint64_t addr, uint32_t size) "I/O=%p pio write reg data=0x%"PRIx64" port=0x%"PRIx64" size=%d" +cpu_ioreq_move(void *req, uint32_t dir, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p copy dir=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d" +xen_map_resource_ioreq(uint32_t id, void *addr) "id: %u addr: %p" +cpu_ioreq_config_read(void *req, uint32_t sbdf, uint32_t reg, uint32_t size, uint32_t data) "I/O=%p sbdf=0x%x reg=%u size=%u data=0x%x" +cpu_ioreq_config_write(void *req, uint32_t sbdf, uint32_t reg, uint32_t size, uint32_t data) "I/O=%p sbdf=0x%x reg=%u size=%u data=0x%x" + +# xen-mapcache.c +xen_map_cache(uint64_t phys_addr) "want 0x%"PRIx64 +xen_remap_bucket(uint64_t index) "index 0x%"PRIx64 +xen_map_cache_return(void* ptr) "%p" + diff --git a/hw/i386/xen/trace.h b/hw/i386/xen/trace.h new file mode 100644 index 000000000..a02bf755d --- /dev/null +++ b/hw/i386/xen/trace.h @@ -0,0 +1 @@ +#include "trace/trace-hw_i386_xen.h" diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c new file mode 100644 index 000000000..482be9541 --- /dev/null +++ b/hw/i386/xen/xen-hvm.c @@ -0,0 +1,1620 @@ +/* + * Copyright (C) 2010 Citrix Ltd. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/units.h" + +#include "cpu.h" +#include "hw/pci/pci.h" +#include "hw/pci/pci_host.h" +#include "hw/i386/pc.h" +#include "hw/southbridge/piix.h" +#include "hw/irq.h" +#include "hw/hw.h" +#include "hw/i386/apic-msidef.h" +#include "hw/xen/xen_common.h" +#include "hw/xen/xen-legacy-backend.h" +#include "hw/xen/xen-bus.h" +#include "hw/xen/xen-x86.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-migration.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "qemu/range.h" +#include "sysemu/runstate.h" +#include "sysemu/sysemu.h" +#include "sysemu/xen.h" +#include "sysemu/xen-mapcache.h" +#include "trace.h" + +#include <xen/hvm/ioreq.h> +#include <xen/hvm/e820.h> + +//#define DEBUG_XEN_HVM + +#ifdef DEBUG_XEN_HVM +#define DPRINTF(fmt, ...) \ + do { fprintf(stderr, "xen: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ + do { } while (0) +#endif + +static MemoryRegion ram_memory, ram_640k, ram_lo, ram_hi; +static MemoryRegion *framebuffer; +static bool xen_in_migration; + +/* Compatibility with older version */ + +/* This allows QEMU to build on a system that has Xen 4.5 or earlier + * installed. This here (not in hw/xen/xen_common.h) because xen/hvm/ioreq.h + * needs to be included before this block and hw/xen/xen_common.h needs to + * be included before xen/hvm/ioreq.h + */ +#ifndef IOREQ_TYPE_VMWARE_PORT +#define IOREQ_TYPE_VMWARE_PORT 3 +struct vmware_regs { + uint32_t esi; + uint32_t edi; + uint32_t ebx; + uint32_t ecx; + uint32_t edx; +}; +typedef struct vmware_regs vmware_regs_t; + +struct shared_vmport_iopage { + struct vmware_regs vcpu_vmport_regs[1]; +}; +typedef struct shared_vmport_iopage shared_vmport_iopage_t; +#endif + +static inline uint32_t xen_vcpu_eport(shared_iopage_t *shared_page, int i) +{ + return shared_page->vcpu_ioreq[i].vp_eport; +} +static inline ioreq_t *xen_vcpu_ioreq(shared_iopage_t *shared_page, int vcpu) +{ + return &shared_page->vcpu_ioreq[vcpu]; +} + +#define BUFFER_IO_MAX_DELAY 100 + +typedef struct XenPhysmap { + hwaddr start_addr; + ram_addr_t size; + const char *name; + hwaddr phys_offset; + + QLIST_ENTRY(XenPhysmap) list; +} XenPhysmap; + +static QLIST_HEAD(, XenPhysmap) xen_physmap; + +typedef struct XenPciDevice { + PCIDevice *pci_dev; + uint32_t sbdf; + QLIST_ENTRY(XenPciDevice) entry; +} XenPciDevice; + +typedef struct XenIOState { + ioservid_t ioservid; + shared_iopage_t *shared_page; + shared_vmport_iopage_t *shared_vmport_page; + buffered_iopage_t *buffered_io_page; + xenforeignmemory_resource_handle *fres; + QEMUTimer *buffered_io_timer; + CPUState **cpu_by_vcpu_id; + /* the evtchn port for polling the notification, */ + evtchn_port_t *ioreq_local_port; + /* evtchn remote and local ports for buffered io */ + evtchn_port_t bufioreq_remote_port; + evtchn_port_t bufioreq_local_port; + /* the evtchn fd for polling */ + xenevtchn_handle *xce_handle; + /* which vcpu we are serving */ + int send_vcpu; + + struct xs_handle *xenstore; + MemoryListener memory_listener; + MemoryListener io_listener; + QLIST_HEAD(, XenPciDevice) dev_list; + DeviceListener device_listener; + hwaddr free_phys_offset; + const XenPhysmap *log_for_dirtybit; + /* Buffer used by xen_sync_dirty_bitmap */ + unsigned long *dirty_bitmap; + + Notifier exit; + Notifier suspend; + Notifier wakeup; +} XenIOState; + +/* Xen specific function for piix pci */ + +int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num) +{ + return irq_num + (PCI_SLOT(pci_dev->devfn) << 2); +} + +void 
xen_piix3_set_irq(void *opaque, int irq_num, int level) +{ + xen_set_pci_intx_level(xen_domid, 0, 0, irq_num >> 2, + irq_num & 3, level); +} + +void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len) +{ + int i; + + /* Scan for updates to PCI link routes (0x60-0x63). */ + for (i = 0; i < len; i++) { + uint8_t v = (val >> (8 * i)) & 0xff; + if (v & 0x80) { + v = 0; + } + v &= 0xf; + if (((address + i) >= PIIX_PIRQCA) && ((address + i) <= PIIX_PIRQCD)) { + xen_set_pci_link_route(xen_domid, address + i - PIIX_PIRQCA, v); + } + } +} + +int xen_is_pirq_msi(uint32_t msi_data) +{ + /* If vector is 0, the msi is remapped into a pirq, passed as + * dest_id. + */ + return ((msi_data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT) == 0; +} + +void xen_hvm_inject_msi(uint64_t addr, uint32_t data) +{ + xen_inject_msi(xen_domid, addr, data); +} + +static void xen_suspend_notifier(Notifier *notifier, void *data) +{ + xc_set_hvm_param(xen_xc, xen_domid, HVM_PARAM_ACPI_S_STATE, 3); +} + +/* Xen Interrupt Controller */ + +static void xen_set_irq(void *opaque, int irq, int level) +{ + xen_set_isa_irq_level(xen_domid, irq, level); +} + +qemu_irq *xen_interrupt_controller_init(void) +{ + return qemu_allocate_irqs(xen_set_irq, NULL, 16); +} + +/* Memory Ops */ + +static void xen_ram_init(PCMachineState *pcms, + ram_addr_t ram_size, MemoryRegion **ram_memory_p) +{ + X86MachineState *x86ms = X86_MACHINE(pcms); + MemoryRegion *sysmem = get_system_memory(); + ram_addr_t block_len; + uint64_t user_lowmem = + object_property_get_uint(qdev_get_machine(), + PC_MACHINE_MAX_RAM_BELOW_4G, + &error_abort); + + /* Handle the machine opt max-ram-below-4g. It is basically doing + * min(xen limit, user limit). + */ + if (!user_lowmem) { + user_lowmem = HVM_BELOW_4G_RAM_END; /* default */ + } + if (HVM_BELOW_4G_RAM_END <= user_lowmem) { + user_lowmem = HVM_BELOW_4G_RAM_END; + } + + if (ram_size >= user_lowmem) { + x86ms->above_4g_mem_size = ram_size - user_lowmem; + x86ms->below_4g_mem_size = user_lowmem; + } else { + x86ms->above_4g_mem_size = 0; + x86ms->below_4g_mem_size = ram_size; + } + if (!x86ms->above_4g_mem_size) { + block_len = ram_size; + } else { + /* + * Xen does not allocate the memory continuously, it keeps a + * hole of the size computed above or passed in. + */ + block_len = (4 * GiB) + x86ms->above_4g_mem_size; + } + memory_region_init_ram(&ram_memory, NULL, "xen.ram", block_len, + &error_fatal); + *ram_memory_p = &ram_memory; + + memory_region_init_alias(&ram_640k, NULL, "xen.ram.640k", + &ram_memory, 0, 0xa0000); + memory_region_add_subregion(sysmem, 0, &ram_640k); + /* Skip of the VGA IO memory space, it will be registered later by the VGA + * emulated device. + * + * The area between 0xc0000 and 0x100000 will be used by SeaBIOS to load + * the Options ROM, so it is registered here as RAM. 
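+     * (As a result, the VGA window at 0xa0000-0xbffff is left unmapped by
+     * this function.)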
+ */ + memory_region_init_alias(&ram_lo, NULL, "xen.ram.lo", + &ram_memory, 0xc0000, + x86ms->below_4g_mem_size - 0xc0000); + memory_region_add_subregion(sysmem, 0xc0000, &ram_lo); + if (x86ms->above_4g_mem_size > 0) { + memory_region_init_alias(&ram_hi, NULL, "xen.ram.hi", + &ram_memory, 0x100000000ULL, + x86ms->above_4g_mem_size); + memory_region_add_subregion(sysmem, 0x100000000ULL, &ram_hi); + } +} + +void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size, MemoryRegion *mr, + Error **errp) +{ + unsigned long nr_pfn; + xen_pfn_t *pfn_list; + int i; + + if (runstate_check(RUN_STATE_INMIGRATE)) { + /* RAM already populated in Xen */ + fprintf(stderr, "%s: do not alloc "RAM_ADDR_FMT + " bytes of ram at "RAM_ADDR_FMT" when runstate is INMIGRATE\n", + __func__, size, ram_addr); + return; + } + + if (mr == &ram_memory) { + return; + } + + trace_xen_ram_alloc(ram_addr, size); + + nr_pfn = size >> TARGET_PAGE_BITS; + pfn_list = g_malloc(sizeof (*pfn_list) * nr_pfn); + + for (i = 0; i < nr_pfn; i++) { + pfn_list[i] = (ram_addr >> TARGET_PAGE_BITS) + i; + } + + if (xc_domain_populate_physmap_exact(xen_xc, xen_domid, nr_pfn, 0, 0, pfn_list)) { + error_setg(errp, "xen: failed to populate ram at " RAM_ADDR_FMT, + ram_addr); + } + + g_free(pfn_list); +} + +static XenPhysmap *get_physmapping(hwaddr start_addr, ram_addr_t size) +{ + XenPhysmap *physmap = NULL; + + start_addr &= TARGET_PAGE_MASK; + + QLIST_FOREACH(physmap, &xen_physmap, list) { + if (range_covers_byte(physmap->start_addr, physmap->size, start_addr)) { + return physmap; + } + } + return NULL; +} + +static hwaddr xen_phys_offset_to_gaddr(hwaddr phys_offset, ram_addr_t size) +{ + hwaddr addr = phys_offset & TARGET_PAGE_MASK; + XenPhysmap *physmap = NULL; + + QLIST_FOREACH(physmap, &xen_physmap, list) { + if (range_covers_byte(physmap->phys_offset, physmap->size, addr)) { + return physmap->start_addr + (phys_offset - physmap->phys_offset); + } + } + + return phys_offset; +} + +#ifdef XEN_COMPAT_PHYSMAP +static int xen_save_physmap(XenIOState *state, XenPhysmap *physmap) +{ + char path[80], value[17]; + + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap/%"PRIx64"/start_addr", + xen_domid, (uint64_t)physmap->phys_offset); + snprintf(value, sizeof(value), "%"PRIx64, (uint64_t)physmap->start_addr); + if (!xs_write(state->xenstore, 0, path, value, strlen(value))) { + return -1; + } + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap/%"PRIx64"/size", + xen_domid, (uint64_t)physmap->phys_offset); + snprintf(value, sizeof(value), "%"PRIx64, (uint64_t)physmap->size); + if (!xs_write(state->xenstore, 0, path, value, strlen(value))) { + return -1; + } + if (physmap->name) { + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap/%"PRIx64"/name", + xen_domid, (uint64_t)physmap->phys_offset); + if (!xs_write(state->xenstore, 0, path, + physmap->name, strlen(physmap->name))) { + return -1; + } + } + return 0; +} +#else +static int xen_save_physmap(XenIOState *state, XenPhysmap *physmap) +{ + return 0; +} +#endif + +static int xen_add_to_physmap(XenIOState *state, + hwaddr start_addr, + ram_addr_t size, + MemoryRegion *mr, + hwaddr offset_within_region) +{ + unsigned long nr_pages; + int rc = 0; + XenPhysmap *physmap = NULL; + hwaddr pfn, start_gpfn; + hwaddr phys_offset = memory_region_get_ram_addr(mr); + const char *mr_name; + + if (get_physmapping(start_addr, size)) { + return 0; + } + if (size <= 0) { + return -1; + } + + /* Xen can only handle a single dirty log region for now and we 
want + * the linear framebuffer to be that region. + * Avoid tracking any regions that is not videoram and avoid tracking + * the legacy vga region. */ + if (mr == framebuffer && start_addr > 0xbffff) { + goto go_physmap; + } + return -1; + +go_physmap: + DPRINTF("mapping vram to %"HWADDR_PRIx" - %"HWADDR_PRIx"\n", + start_addr, start_addr + size); + + mr_name = memory_region_name(mr); + + physmap = g_malloc(sizeof(XenPhysmap)); + + physmap->start_addr = start_addr; + physmap->size = size; + physmap->name = mr_name; + physmap->phys_offset = phys_offset; + + QLIST_INSERT_HEAD(&xen_physmap, physmap, list); + + if (runstate_check(RUN_STATE_INMIGRATE)) { + /* Now when we have a physmap entry we can replace a dummy mapping with + * a real one of guest foreign memory. */ + uint8_t *p = xen_replace_cache_entry(phys_offset, start_addr, size); + assert(p && p == memory_region_get_ram_ptr(mr)); + + return 0; + } + + pfn = phys_offset >> TARGET_PAGE_BITS; + start_gpfn = start_addr >> TARGET_PAGE_BITS; + nr_pages = size >> TARGET_PAGE_BITS; + rc = xendevicemodel_relocate_memory(xen_dmod, xen_domid, nr_pages, pfn, + start_gpfn); + if (rc) { + int saved_errno = errno; + + error_report("relocate_memory %lu pages from GFN %"HWADDR_PRIx + " to GFN %"HWADDR_PRIx" failed: %s", + nr_pages, pfn, start_gpfn, strerror(saved_errno)); + errno = saved_errno; + return -1; + } + + rc = xendevicemodel_pin_memory_cacheattr(xen_dmod, xen_domid, + start_addr >> TARGET_PAGE_BITS, + (start_addr + size - 1) >> TARGET_PAGE_BITS, + XEN_DOMCTL_MEM_CACHEATTR_WB); + if (rc) { + error_report("pin_memory_cacheattr failed: %s", strerror(errno)); + } + return xen_save_physmap(state, physmap); +} + +static int xen_remove_from_physmap(XenIOState *state, + hwaddr start_addr, + ram_addr_t size) +{ + int rc = 0; + XenPhysmap *physmap = NULL; + hwaddr phys_offset = 0; + + physmap = get_physmapping(start_addr, size); + if (physmap == NULL) { + return -1; + } + + phys_offset = physmap->phys_offset; + size = physmap->size; + + DPRINTF("unmapping vram to %"HWADDR_PRIx" - %"HWADDR_PRIx", at " + "%"HWADDR_PRIx"\n", start_addr, start_addr + size, phys_offset); + + size >>= TARGET_PAGE_BITS; + start_addr >>= TARGET_PAGE_BITS; + phys_offset >>= TARGET_PAGE_BITS; + rc = xendevicemodel_relocate_memory(xen_dmod, xen_domid, size, start_addr, + phys_offset); + if (rc) { + int saved_errno = errno; + + error_report("relocate_memory "RAM_ADDR_FMT" pages" + " from GFN %"HWADDR_PRIx + " to GFN %"HWADDR_PRIx" failed: %s", + size, start_addr, phys_offset, strerror(saved_errno)); + errno = saved_errno; + return -1; + } + + QLIST_REMOVE(physmap, list); + if (state->log_for_dirtybit == physmap) { + state->log_for_dirtybit = NULL; + g_free(state->dirty_bitmap); + state->dirty_bitmap = NULL; + } + g_free(physmap); + + return 0; +} + +static void xen_set_memory(struct MemoryListener *listener, + MemoryRegionSection *section, + bool add) +{ + XenIOState *state = container_of(listener, XenIOState, memory_listener); + hwaddr start_addr = section->offset_within_address_space; + ram_addr_t size = int128_get64(section->size); + bool log_dirty = memory_region_is_logging(section->mr, DIRTY_MEMORY_VGA); + hvmmem_type_t mem_type; + + if (section->mr == &ram_memory) { + return; + } else { + if (add) { + xen_map_memory_section(xen_domid, state->ioservid, + section); + } else { + xen_unmap_memory_section(xen_domid, state->ioservid, + section); + } + } + + if (!memory_region_is_ram(section->mr)) { + return; + } + + if (log_dirty != add) { + return; + } + + 
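+    /*
+     * Only RAM sections reach this point, and only when log_dirty == add:
+     * regions with VGA dirty logging enabled are added, regions without it
+     * are removed.  The guest physmap (or the read-only memory type for
+     * ROMs) is updated below.
+     */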
trace_xen_client_set_memory(start_addr, size, log_dirty); + + start_addr &= TARGET_PAGE_MASK; + size = TARGET_PAGE_ALIGN(size); + + if (add) { + if (!memory_region_is_rom(section->mr)) { + xen_add_to_physmap(state, start_addr, size, + section->mr, section->offset_within_region); + } else { + mem_type = HVMMEM_ram_ro; + if (xen_set_mem_type(xen_domid, mem_type, + start_addr >> TARGET_PAGE_BITS, + size >> TARGET_PAGE_BITS)) { + DPRINTF("xen_set_mem_type error, addr: "TARGET_FMT_plx"\n", + start_addr); + } + } + } else { + if (xen_remove_from_physmap(state, start_addr, size) < 0) { + DPRINTF("physmapping does not exist at "TARGET_FMT_plx"\n", start_addr); + } + } +} + +static void xen_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + memory_region_ref(section->mr); + xen_set_memory(listener, section, true); +} + +static void xen_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + xen_set_memory(listener, section, false); + memory_region_unref(section->mr); +} + +static void xen_io_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + XenIOState *state = container_of(listener, XenIOState, io_listener); + MemoryRegion *mr = section->mr; + + if (mr->ops == &unassigned_io_ops) { + return; + } + + memory_region_ref(mr); + + xen_map_io_section(xen_domid, state->ioservid, section); +} + +static void xen_io_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + XenIOState *state = container_of(listener, XenIOState, io_listener); + MemoryRegion *mr = section->mr; + + if (mr->ops == &unassigned_io_ops) { + return; + } + + xen_unmap_io_section(xen_domid, state->ioservid, section); + + memory_region_unref(mr); +} + +static void xen_device_realize(DeviceListener *listener, + DeviceState *dev) +{ + XenIOState *state = container_of(listener, XenIOState, device_listener); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + PCIDevice *pci_dev = PCI_DEVICE(dev); + XenPciDevice *xendev = g_new(XenPciDevice, 1); + + xendev->pci_dev = pci_dev; + xendev->sbdf = PCI_BUILD_BDF(pci_dev_bus_num(pci_dev), + pci_dev->devfn); + QLIST_INSERT_HEAD(&state->dev_list, xendev, entry); + + xen_map_pcidev(xen_domid, state->ioservid, pci_dev); + } +} + +static void xen_device_unrealize(DeviceListener *listener, + DeviceState *dev) +{ + XenIOState *state = container_of(listener, XenIOState, device_listener); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + PCIDevice *pci_dev = PCI_DEVICE(dev); + XenPciDevice *xendev, *next; + + xen_unmap_pcidev(xen_domid, state->ioservid, pci_dev); + + QLIST_FOREACH_SAFE(xendev, &state->dev_list, entry, next) { + if (xendev->pci_dev == pci_dev) { + QLIST_REMOVE(xendev, entry); + g_free(xendev); + break; + } + } + } +} + +static void xen_sync_dirty_bitmap(XenIOState *state, + hwaddr start_addr, + ram_addr_t size) +{ + hwaddr npages = size >> TARGET_PAGE_BITS; + const int width = sizeof(unsigned long) * 8; + size_t bitmap_size = DIV_ROUND_UP(npages, width); + int rc, i, j; + const XenPhysmap *physmap = NULL; + + physmap = get_physmapping(start_addr, size); + if (physmap == NULL) { + /* not handled */ + return; + } + + if (state->log_for_dirtybit == NULL) { + state->log_for_dirtybit = physmap; + state->dirty_bitmap = g_new(unsigned long, bitmap_size); + } else if (state->log_for_dirtybit != physmap) { + /* Only one range for dirty bitmap can be tracked. 
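+            Any other physmap range is simply left untracked here.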
*/ + return; + } + + rc = xen_track_dirty_vram(xen_domid, start_addr >> TARGET_PAGE_BITS, + npages, state->dirty_bitmap); + if (rc < 0) { +#ifndef ENODATA +#define ENODATA ENOENT +#endif + if (errno == ENODATA) { + memory_region_set_dirty(framebuffer, 0, size); + DPRINTF("xen: track_dirty_vram failed (0x" TARGET_FMT_plx + ", 0x" TARGET_FMT_plx "): %s\n", + start_addr, start_addr + size, strerror(errno)); + } + return; + } + + for (i = 0; i < bitmap_size; i++) { + unsigned long map = state->dirty_bitmap[i]; + while (map != 0) { + j = ctzl(map); + map &= ~(1ul << j); + memory_region_set_dirty(framebuffer, + (i * width + j) * TARGET_PAGE_SIZE, + TARGET_PAGE_SIZE); + }; + } +} + +static void xen_log_start(MemoryListener *listener, + MemoryRegionSection *section, + int old, int new) +{ + XenIOState *state = container_of(listener, XenIOState, memory_listener); + + if (new & ~old & (1 << DIRTY_MEMORY_VGA)) { + xen_sync_dirty_bitmap(state, section->offset_within_address_space, + int128_get64(section->size)); + } +} + +static void xen_log_stop(MemoryListener *listener, MemoryRegionSection *section, + int old, int new) +{ + XenIOState *state = container_of(listener, XenIOState, memory_listener); + + if (old & ~new & (1 << DIRTY_MEMORY_VGA)) { + state->log_for_dirtybit = NULL; + g_free(state->dirty_bitmap); + state->dirty_bitmap = NULL; + /* Disable dirty bit tracking */ + xen_track_dirty_vram(xen_domid, 0, 0, NULL); + } +} + +static void xen_log_sync(MemoryListener *listener, MemoryRegionSection *section) +{ + XenIOState *state = container_of(listener, XenIOState, memory_listener); + + xen_sync_dirty_bitmap(state, section->offset_within_address_space, + int128_get64(section->size)); +} + +static void xen_log_global_start(MemoryListener *listener) +{ + if (xen_enabled()) { + xen_in_migration = true; + } +} + +static void xen_log_global_stop(MemoryListener *listener) +{ + xen_in_migration = false; +} + +static MemoryListener xen_memory_listener = { + .name = "xen-memory", + .region_add = xen_region_add, + .region_del = xen_region_del, + .log_start = xen_log_start, + .log_stop = xen_log_stop, + .log_sync = xen_log_sync, + .log_global_start = xen_log_global_start, + .log_global_stop = xen_log_global_stop, + .priority = 10, +}; + +static MemoryListener xen_io_listener = { + .name = "xen-io", + .region_add = xen_io_add, + .region_del = xen_io_del, + .priority = 10, +}; + +static DeviceListener xen_device_listener = { + .realize = xen_device_realize, + .unrealize = xen_device_unrealize, +}; + +/* get the ioreq packets from share mem */ +static ioreq_t *cpu_get_ioreq_from_shared_memory(XenIOState *state, int vcpu) +{ + ioreq_t *req = xen_vcpu_ioreq(state->shared_page, vcpu); + + if (req->state != STATE_IOREQ_READY) { + DPRINTF("I/O request not ready: " + "%x, ptr: %x, port: %"PRIx64", " + "data: %"PRIx64", count: %u, size: %u\n", + req->state, req->data_is_ptr, req->addr, + req->data, req->count, req->size); + return NULL; + } + + xen_rmb(); /* see IOREQ_READY /then/ read contents of ioreq */ + + req->state = STATE_IOREQ_INPROCESS; + return req; +} + +/* use poll to get the port notification */ +/* ioreq_vec--out,the */ +/* retval--the number of ioreq packet */ +static ioreq_t *cpu_get_ioreq(XenIOState *state) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + unsigned int max_cpus = ms->smp.max_cpus; + int i; + evtchn_port_t port; + + port = xenevtchn_pending(state->xce_handle); + if (port == state->bufioreq_local_port) { + timer_mod(state->buffered_io_timer, + BUFFER_IO_MAX_DELAY + 
qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); + return NULL; + } + + if (port != -1) { + for (i = 0; i < max_cpus; i++) { + if (state->ioreq_local_port[i] == port) { + break; + } + } + + if (i == max_cpus) { + hw_error("Fatal error while trying to get io event!\n"); + } + + /* unmask the wanted port again */ + xenevtchn_unmask(state->xce_handle, port); + + /* get the io packet from shared memory */ + state->send_vcpu = i; + return cpu_get_ioreq_from_shared_memory(state, i); + } + + /* read error or read nothing */ + return NULL; +} + +static uint32_t do_inp(uint32_t addr, unsigned long size) +{ + switch (size) { + case 1: + return cpu_inb(addr); + case 2: + return cpu_inw(addr); + case 4: + return cpu_inl(addr); + default: + hw_error("inp: bad size: %04x %lx", addr, size); + } +} + +static void do_outp(uint32_t addr, + unsigned long size, uint32_t val) +{ + switch (size) { + case 1: + return cpu_outb(addr, val); + case 2: + return cpu_outw(addr, val); + case 4: + return cpu_outl(addr, val); + default: + hw_error("outp: bad size: %04x %lx", addr, size); + } +} + +/* + * Helper functions which read/write an object from/to physical guest + * memory, as part of the implementation of an ioreq. + * + * Equivalent to + * cpu_physical_memory_rw(addr + (req->df ? -1 : +1) * req->size * i, + * val, req->size, 0/1) + * except without the integer overflow problems. + */ +static void rw_phys_req_item(hwaddr addr, + ioreq_t *req, uint32_t i, void *val, int rw) +{ + /* Do everything unsigned so overflow just results in a truncated result + * and accesses to undesired parts of guest memory, which is up + * to the guest */ + hwaddr offset = (hwaddr)req->size * i; + if (req->df) { + addr -= offset; + } else { + addr += offset; + } + cpu_physical_memory_rw(addr, val, req->size, rw); +} + +static inline void read_phys_req_item(hwaddr addr, + ioreq_t *req, uint32_t i, void *val) +{ + rw_phys_req_item(addr, req, i, val, 0); +} +static inline void write_phys_req_item(hwaddr addr, + ioreq_t *req, uint32_t i, void *val) +{ + rw_phys_req_item(addr, req, i, val, 1); +} + + +static void cpu_ioreq_pio(ioreq_t *req) +{ + uint32_t i; + + trace_cpu_ioreq_pio(req, req->dir, req->df, req->data_is_ptr, req->addr, + req->data, req->count, req->size); + + if (req->size > sizeof(uint32_t)) { + hw_error("PIO: bad size (%u)", req->size); + } + + if (req->dir == IOREQ_READ) { + if (!req->data_is_ptr) { + req->data = do_inp(req->addr, req->size); + trace_cpu_ioreq_pio_read_reg(req, req->data, req->addr, + req->size); + } else { + uint32_t tmp; + + for (i = 0; i < req->count; i++) { + tmp = do_inp(req->addr, req->size); + write_phys_req_item(req->data, req, i, &tmp); + } + } + } else if (req->dir == IOREQ_WRITE) { + if (!req->data_is_ptr) { + trace_cpu_ioreq_pio_write_reg(req, req->data, req->addr, + req->size); + do_outp(req->addr, req->size, req->data); + } else { + for (i = 0; i < req->count; i++) { + uint32_t tmp = 0; + + read_phys_req_item(req->data, req, i, &tmp); + do_outp(req->addr, req->size, tmp); + } + } + } +} + +static void cpu_ioreq_move(ioreq_t *req) +{ + uint32_t i; + + trace_cpu_ioreq_move(req, req->dir, req->df, req->data_is_ptr, req->addr, + req->data, req->count, req->size); + + if (req->size > sizeof(req->data)) { + hw_error("MMIO: bad size (%u)", req->size); + } + + if (!req->data_is_ptr) { + if (req->dir == IOREQ_READ) { + for (i = 0; i < req->count; i++) { + read_phys_req_item(req->addr, req, i, &req->data); + } + } else if (req->dir == IOREQ_WRITE) { + for (i = 0; i < req->count; i++) { + 
write_phys_req_item(req->addr, req, i, &req->data); + } + } + } else { + uint64_t tmp; + + if (req->dir == IOREQ_READ) { + for (i = 0; i < req->count; i++) { + read_phys_req_item(req->addr, req, i, &tmp); + write_phys_req_item(req->data, req, i, &tmp); + } + } else if (req->dir == IOREQ_WRITE) { + for (i = 0; i < req->count; i++) { + read_phys_req_item(req->data, req, i, &tmp); + write_phys_req_item(req->addr, req, i, &tmp); + } + } + } +} + +static void cpu_ioreq_config(XenIOState *state, ioreq_t *req) +{ + uint32_t sbdf = req->addr >> 32; + uint32_t reg = req->addr; + XenPciDevice *xendev; + + if (req->size != sizeof(uint8_t) && req->size != sizeof(uint16_t) && + req->size != sizeof(uint32_t)) { + hw_error("PCI config access: bad size (%u)", req->size); + } + + if (req->count != 1) { + hw_error("PCI config access: bad count (%u)", req->count); + } + + QLIST_FOREACH(xendev, &state->dev_list, entry) { + if (xendev->sbdf != sbdf) { + continue; + } + + if (!req->data_is_ptr) { + if (req->dir == IOREQ_READ) { + req->data = pci_host_config_read_common( + xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE, + req->size); + trace_cpu_ioreq_config_read(req, xendev->sbdf, reg, + req->size, req->data); + } else if (req->dir == IOREQ_WRITE) { + trace_cpu_ioreq_config_write(req, xendev->sbdf, reg, + req->size, req->data); + pci_host_config_write_common( + xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE, + req->data, req->size); + } + } else { + uint32_t tmp; + + if (req->dir == IOREQ_READ) { + tmp = pci_host_config_read_common( + xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE, + req->size); + trace_cpu_ioreq_config_read(req, xendev->sbdf, reg, + req->size, tmp); + write_phys_req_item(req->data, req, 0, &tmp); + } else if (req->dir == IOREQ_WRITE) { + read_phys_req_item(req->data, req, 0, &tmp); + trace_cpu_ioreq_config_write(req, xendev->sbdf, reg, + req->size, tmp); + pci_host_config_write_common( + xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE, + tmp, req->size); + } + } + } +} + +static void regs_to_cpu(vmware_regs_t *vmport_regs, ioreq_t *req) +{ + X86CPU *cpu; + CPUX86State *env; + + cpu = X86_CPU(current_cpu); + env = &cpu->env; + env->regs[R_EAX] = req->data; + env->regs[R_EBX] = vmport_regs->ebx; + env->regs[R_ECX] = vmport_regs->ecx; + env->regs[R_EDX] = vmport_regs->edx; + env->regs[R_ESI] = vmport_regs->esi; + env->regs[R_EDI] = vmport_regs->edi; +} + +static void regs_from_cpu(vmware_regs_t *vmport_regs) +{ + X86CPU *cpu = X86_CPU(current_cpu); + CPUX86State *env = &cpu->env; + + vmport_regs->ebx = env->regs[R_EBX]; + vmport_regs->ecx = env->regs[R_ECX]; + vmport_regs->edx = env->regs[R_EDX]; + vmport_regs->esi = env->regs[R_ESI]; + vmport_regs->edi = env->regs[R_EDI]; +} + +static void handle_vmport_ioreq(XenIOState *state, ioreq_t *req) +{ + vmware_regs_t *vmport_regs; + + assert(state->shared_vmport_page); + vmport_regs = + &state->shared_vmport_page->vcpu_vmport_regs[state->send_vcpu]; + QEMU_BUILD_BUG_ON(sizeof(*req) < sizeof(*vmport_regs)); + + current_cpu = state->cpu_by_vcpu_id[state->send_vcpu]; + regs_to_cpu(vmport_regs, req); + cpu_ioreq_pio(req); + regs_from_cpu(vmport_regs); + current_cpu = NULL; +} + +static void handle_ioreq(XenIOState *state, ioreq_t *req) +{ + trace_handle_ioreq(req, req->type, req->dir, req->df, req->data_is_ptr, + req->addr, req->data, req->count, req->size); + + if (!req->data_is_ptr && (req->dir == IOREQ_WRITE) && + (req->size < sizeof (target_ulong))) { + req->data &= ((target_ulong) 1 << (8 * req->size)) - 1; + } + + if (req->dir == IOREQ_WRITE) + 
trace_handle_ioreq_write(req, req->type, req->df, req->data_is_ptr, + req->addr, req->data, req->count, req->size); + + switch (req->type) { + case IOREQ_TYPE_PIO: + cpu_ioreq_pio(req); + break; + case IOREQ_TYPE_COPY: + cpu_ioreq_move(req); + break; + case IOREQ_TYPE_VMWARE_PORT: + handle_vmport_ioreq(state, req); + break; + case IOREQ_TYPE_TIMEOFFSET: + break; + case IOREQ_TYPE_INVALIDATE: + xen_invalidate_map_cache(); + break; + case IOREQ_TYPE_PCI_CONFIG: + cpu_ioreq_config(state, req); + break; + default: + hw_error("Invalid ioreq type 0x%x\n", req->type); + } + if (req->dir == IOREQ_READ) { + trace_handle_ioreq_read(req, req->type, req->df, req->data_is_ptr, + req->addr, req->data, req->count, req->size); + } +} + +static int handle_buffered_iopage(XenIOState *state) +{ + buffered_iopage_t *buf_page = state->buffered_io_page; + buf_ioreq_t *buf_req = NULL; + ioreq_t req; + int qw; + + if (!buf_page) { + return 0; + } + + memset(&req, 0x00, sizeof(req)); + req.state = STATE_IOREQ_READY; + req.count = 1; + req.dir = IOREQ_WRITE; + + for (;;) { + uint32_t rdptr = buf_page->read_pointer, wrptr; + + xen_rmb(); + wrptr = buf_page->write_pointer; + xen_rmb(); + if (rdptr != buf_page->read_pointer) { + continue; + } + if (rdptr == wrptr) { + break; + } + buf_req = &buf_page->buf_ioreq[rdptr % IOREQ_BUFFER_SLOT_NUM]; + req.size = 1U << buf_req->size; + req.addr = buf_req->addr; + req.data = buf_req->data; + req.type = buf_req->type; + xen_rmb(); + qw = (req.size == 8); + if (qw) { + if (rdptr + 1 == wrptr) { + hw_error("Incomplete quad word buffered ioreq"); + } + buf_req = &buf_page->buf_ioreq[(rdptr + 1) % + IOREQ_BUFFER_SLOT_NUM]; + req.data |= ((uint64_t)buf_req->data) << 32; + xen_rmb(); + } + + handle_ioreq(state, &req); + + /* Only req.data may get updated by handle_ioreq(), albeit even that + * should not happen as such data would never make it to the guest (we + * can only usefully see writes here after all). + */ + assert(req.state == STATE_IOREQ_READY); + assert(req.count == 1); + assert(req.dir == IOREQ_WRITE); + assert(!req.data_is_ptr); + + qatomic_add(&buf_page->read_pointer, qw + 1); + } + + return req.count; +} + +static void handle_buffered_io(void *opaque) +{ + XenIOState *state = opaque; + + if (handle_buffered_iopage(state)) { + timer_mod(state->buffered_io_timer, + BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); + } else { + timer_del(state->buffered_io_timer); + xenevtchn_unmask(state->xce_handle, state->bufioreq_local_port); + } +} + +static void cpu_handle_ioreq(void *opaque) +{ + XenIOState *state = opaque; + ioreq_t *req = cpu_get_ioreq(state); + + handle_buffered_iopage(state); + if (req) { + ioreq_t copy = *req; + + xen_rmb(); + handle_ioreq(state, &copy); + req->data = copy.data; + + if (req->state != STATE_IOREQ_INPROCESS) { + fprintf(stderr, "Badness in I/O request ... not in service?!: " + "%x, ptr: %x, port: %"PRIx64", " + "data: %"PRIx64", count: %u, size: %u, type: %u\n", + req->state, req->data_is_ptr, req->addr, + req->data, req->count, req->size, req->type); + destroy_hvm_domain(false); + return; + } + + xen_wmb(); /* Update ioreq contents /then/ update state. */ + + /* + * We do this before we send the response so that the tools + * have the opportunity to pick up on the reset before the + * guest resumes and does a hlt with interrupts disabled which + * causes Xen to powerdown the domain. 
+ */ + if (runstate_is_running()) { + ShutdownCause request; + + if (qemu_shutdown_requested_get()) { + destroy_hvm_domain(false); + } + request = qemu_reset_requested_get(); + if (request) { + qemu_system_reset(request); + destroy_hvm_domain(true); + } + } + + req->state = STATE_IORESP_READY; + xenevtchn_notify(state->xce_handle, + state->ioreq_local_port[state->send_vcpu]); + } +} + +static void xen_main_loop_prepare(XenIOState *state) +{ + int evtchn_fd = -1; + + if (state->xce_handle != NULL) { + evtchn_fd = xenevtchn_fd(state->xce_handle); + } + + state->buffered_io_timer = timer_new_ms(QEMU_CLOCK_REALTIME, handle_buffered_io, + state); + + if (evtchn_fd != -1) { + CPUState *cpu_state; + + DPRINTF("%s: Init cpu_by_vcpu_id\n", __func__); + CPU_FOREACH(cpu_state) { + DPRINTF("%s: cpu_by_vcpu_id[%d]=%p\n", + __func__, cpu_state->cpu_index, cpu_state); + state->cpu_by_vcpu_id[cpu_state->cpu_index] = cpu_state; + } + qemu_set_fd_handler(evtchn_fd, cpu_handle_ioreq, NULL, state); + } +} + + +static void xen_hvm_change_state_handler(void *opaque, bool running, + RunState rstate) +{ + XenIOState *state = opaque; + + if (running) { + xen_main_loop_prepare(state); + } + + xen_set_ioreq_server_state(xen_domid, + state->ioservid, + (rstate == RUN_STATE_RUNNING)); +} + +static void xen_exit_notifier(Notifier *n, void *data) +{ + XenIOState *state = container_of(n, XenIOState, exit); + + xen_destroy_ioreq_server(xen_domid, state->ioservid); + if (state->fres != NULL) { + xenforeignmemory_unmap_resource(xen_fmem, state->fres); + } + + xenevtchn_close(state->xce_handle); + xs_daemon_close(state->xenstore); +} + +#ifdef XEN_COMPAT_PHYSMAP +static void xen_read_physmap(XenIOState *state) +{ + XenPhysmap *physmap = NULL; + unsigned int len, num, i; + char path[80], *value = NULL; + char **entries = NULL; + + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap", xen_domid); + entries = xs_directory(state->xenstore, 0, path, &num); + if (entries == NULL) + return; + + for (i = 0; i < num; i++) { + physmap = g_malloc(sizeof (XenPhysmap)); + physmap->phys_offset = strtoull(entries[i], NULL, 16); + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap/%s/start_addr", + xen_domid, entries[i]); + value = xs_read(state->xenstore, 0, path, &len); + if (value == NULL) { + g_free(physmap); + continue; + } + physmap->start_addr = strtoull(value, NULL, 16); + free(value); + + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap/%s/size", + xen_domid, entries[i]); + value = xs_read(state->xenstore, 0, path, &len); + if (value == NULL) { + g_free(physmap); + continue; + } + physmap->size = strtoull(value, NULL, 16); + free(value); + + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap/%s/name", + xen_domid, entries[i]); + physmap->name = xs_read(state->xenstore, 0, path, &len); + + QLIST_INSERT_HEAD(&xen_physmap, physmap, list); + } + free(entries); +} +#else +static void xen_read_physmap(XenIOState *state) +{ +} +#endif + +static void xen_wakeup_notifier(Notifier *notifier, void *data) +{ + xc_set_hvm_param(xen_xc, xen_domid, HVM_PARAM_ACPI_S_STATE, 0); +} + +static int xen_map_ioreq_server(XenIOState *state) +{ + void *addr = NULL; + xen_pfn_t ioreq_pfn; + xen_pfn_t bufioreq_pfn; + evtchn_port_t bufioreq_evtchn; + int rc; + + /* + * Attempt to map using the resource API and fall back to normal + * foreign mapping if this is not supported. 
+ */ + QEMU_BUILD_BUG_ON(XENMEM_resource_ioreq_server_frame_bufioreq != 0); + QEMU_BUILD_BUG_ON(XENMEM_resource_ioreq_server_frame_ioreq(0) != 1); + state->fres = xenforeignmemory_map_resource(xen_fmem, xen_domid, + XENMEM_resource_ioreq_server, + state->ioservid, 0, 2, + &addr, + PROT_READ | PROT_WRITE, 0); + if (state->fres != NULL) { + trace_xen_map_resource_ioreq(state->ioservid, addr); + state->buffered_io_page = addr; + state->shared_page = addr + TARGET_PAGE_SIZE; + } else if (errno != EOPNOTSUPP) { + error_report("failed to map ioreq server resources: error %d handle=%p", + errno, xen_xc); + return -1; + } + + rc = xen_get_ioreq_server_info(xen_domid, state->ioservid, + (state->shared_page == NULL) ? + &ioreq_pfn : NULL, + (state->buffered_io_page == NULL) ? + &bufioreq_pfn : NULL, + &bufioreq_evtchn); + if (rc < 0) { + error_report("failed to get ioreq server info: error %d handle=%p", + errno, xen_xc); + return rc; + } + + if (state->shared_page == NULL) { + DPRINTF("shared page at pfn %lx\n", ioreq_pfn); + + state->shared_page = xenforeignmemory_map(xen_fmem, xen_domid, + PROT_READ | PROT_WRITE, + 1, &ioreq_pfn, NULL); + if (state->shared_page == NULL) { + error_report("map shared IO page returned error %d handle=%p", + errno, xen_xc); + } + } + + if (state->buffered_io_page == NULL) { + DPRINTF("buffered io page at pfn %lx\n", bufioreq_pfn); + + state->buffered_io_page = xenforeignmemory_map(xen_fmem, xen_domid, + PROT_READ | PROT_WRITE, + 1, &bufioreq_pfn, + NULL); + if (state->buffered_io_page == NULL) { + error_report("map buffered IO page returned error %d", errno); + return -1; + } + } + + if (state->shared_page == NULL || state->buffered_io_page == NULL) { + return -1; + } + + DPRINTF("buffered io evtchn is %x\n", bufioreq_evtchn); + + state->bufioreq_remote_port = bufioreq_evtchn; + + return 0; +} + +void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory) +{ + MachineState *ms = MACHINE(pcms); + unsigned int max_cpus = ms->smp.max_cpus; + int i, rc; + xen_pfn_t ioreq_pfn; + XenIOState *state; + + state = g_malloc0(sizeof (XenIOState)); + + state->xce_handle = xenevtchn_open(NULL, 0); + if (state->xce_handle == NULL) { + perror("xen: event channel open"); + goto err; + } + + state->xenstore = xs_daemon_open(); + if (state->xenstore == NULL) { + perror("xen: xenstore open"); + goto err; + } + + xen_create_ioreq_server(xen_domid, &state->ioservid); + + state->exit.notify = xen_exit_notifier; + qemu_add_exit_notifier(&state->exit); + + state->suspend.notify = xen_suspend_notifier; + qemu_register_suspend_notifier(&state->suspend); + + state->wakeup.notify = xen_wakeup_notifier; + qemu_register_wakeup_notifier(&state->wakeup); + + /* + * Register wake-up support in QMP query-current-machine API + */ + qemu_register_wakeup_support(); + + rc = xen_map_ioreq_server(state); + if (rc < 0) { + goto err; + } + + rc = xen_get_vmport_regs_pfn(xen_xc, xen_domid, &ioreq_pfn); + if (!rc) { + DPRINTF("shared vmport page at pfn %lx\n", ioreq_pfn); + state->shared_vmport_page = + xenforeignmemory_map(xen_fmem, xen_domid, PROT_READ|PROT_WRITE, + 1, &ioreq_pfn, NULL); + if (state->shared_vmport_page == NULL) { + error_report("map shared vmport IO page returned error %d handle=%p", + errno, xen_xc); + goto err; + } + } else if (rc != -ENOSYS) { + error_report("get vmport regs pfn returned error %d, rc=%d", + errno, rc); + goto err; + } + + /* Note: cpus is empty at this point in init */ + state->cpu_by_vcpu_id = g_malloc0(max_cpus * sizeof(CPUState *)); + + rc = 
xen_set_ioreq_server_state(xen_domid, state->ioservid, true); + if (rc < 0) { + error_report("failed to enable ioreq server info: error %d handle=%p", + errno, xen_xc); + goto err; + } + + state->ioreq_local_port = g_malloc0(max_cpus * sizeof (evtchn_port_t)); + + /* FIXME: how about if we overflow the page here? */ + for (i = 0; i < max_cpus; i++) { + rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid, + xen_vcpu_eport(state->shared_page, i)); + if (rc == -1) { + error_report("shared evtchn %d bind error %d", i, errno); + goto err; + } + state->ioreq_local_port[i] = rc; + } + + rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid, + state->bufioreq_remote_port); + if (rc == -1) { + error_report("buffered evtchn bind error %d", errno); + goto err; + } + state->bufioreq_local_port = rc; + + /* Init RAM management */ +#ifdef XEN_COMPAT_PHYSMAP + xen_map_cache_init(xen_phys_offset_to_gaddr, state); +#else + xen_map_cache_init(NULL, state); +#endif + xen_ram_init(pcms, ms->ram_size, ram_memory); + + qemu_add_vm_change_state_handler(xen_hvm_change_state_handler, state); + + state->memory_listener = xen_memory_listener; + memory_listener_register(&state->memory_listener, &address_space_memory); + state->log_for_dirtybit = NULL; + + state->io_listener = xen_io_listener; + memory_listener_register(&state->io_listener, &address_space_io); + + state->device_listener = xen_device_listener; + QLIST_INIT(&state->dev_list); + device_listener_register(&state->device_listener); + + xen_bus_init(); + + /* Initialize backend core & drivers */ + if (xen_be_init() != 0) { + error_report("xen backend core setup failed"); + goto err; + } + xen_be_register_common(); + + QLIST_INIT(&xen_physmap); + xen_read_physmap(state); + + /* Disable ACPI build because Xen handles it */ + pcms->acpi_build_enabled = false; + + return; + +err: + error_report("xen hardware virtual machine initialisation failed"); + exit(1); +} + +void destroy_hvm_domain(bool reboot) +{ + xc_interface *xc_handle; + int sts; + int rc; + + unsigned int reason = reboot ? SHUTDOWN_reboot : SHUTDOWN_poweroff; + + if (xen_dmod) { + rc = xendevicemodel_shutdown(xen_dmod, xen_domid, reason); + if (!rc) { + return; + } + if (errno != ENOTTY /* old Xen */) { + perror("xendevicemodel_shutdown failed"); + } + /* well, try the old thing then */ + } + + xc_handle = xc_interface_open(0, 0, 0); + if (xc_handle == NULL) { + fprintf(stderr, "Cannot acquire xenctrl handle\n"); + } else { + sts = xc_domain_shutdown(xc_handle, xen_domid, reason); + if (sts != 0) { + fprintf(stderr, "xc_domain_shutdown failed to issue %s, " + "sts %d, %s\n", reboot ? "reboot" : "poweroff", + sts, strerror(errno)); + } else { + fprintf(stderr, "Issued domain %d %s\n", xen_domid, + reboot ? "reboot" : "poweroff"); + } + xc_interface_close(xc_handle); + } +} + +void xen_register_framebuffer(MemoryRegion *mr) +{ + framebuffer = mr; +} + +void xen_shutdown_fatal_error(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + fprintf(stderr, "Will destroy the domain.\n"); + /* destroy the domain */ + qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR); +} + +void xen_hvm_modified_memory(ram_addr_t start, ram_addr_t length) +{ + if (unlikely(xen_in_migration)) { + int rc; + ram_addr_t start_pfn, nb_pages; + + start = xen_phys_offset_to_gaddr(start, length); + + if (length == 0) { + length = TARGET_PAGE_SIZE; + } + start_pfn = start >> TARGET_PAGE_BITS; + nb_pages = ((start + length + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS) + - start_pfn; + rc = xen_modified_memory(xen_domid, start_pfn, nb_pages); + if (rc) { + fprintf(stderr, + "%s failed for "RAM_ADDR_FMT" ("RAM_ADDR_FMT"): %i, %s\n", + __func__, start, nb_pages, errno, strerror(errno)); + } + } +} + +void qmp_xen_set_global_dirty_log(bool enable, Error **errp) +{ + if (enable) { + memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); + } else { + memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); + } +} diff --git a/hw/i386/xen/xen-mapcache.c b/hw/i386/xen/xen-mapcache.c new file mode 100644 index 000000000..bd47c3d67 --- /dev/null +++ b/hw/i386/xen/xen-mapcache.c @@ -0,0 +1,593 @@ +/* + * Copyright (C) 2011 Citrix Ltd. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "qemu/error-report.h" + +#include <sys/resource.h> + +#include "hw/xen/xen-legacy-backend.h" +#include "qemu/bitmap.h" + +#include "sysemu/runstate.h" +#include "sysemu/xen-mapcache.h" +#include "trace.h" + + +//#define MAPCACHE_DEBUG + +#ifdef MAPCACHE_DEBUG +# define DPRINTF(fmt, ...) do { \ + fprintf(stderr, "xen_mapcache: " fmt, ## __VA_ARGS__); \ +} while (0) +#else +# define DPRINTF(fmt, ...) do { } while (0) +#endif + +#if HOST_LONG_BITS == 32 +# define MCACHE_BUCKET_SHIFT 16 +# define MCACHE_MAX_SIZE (1UL<<31) /* 2GB Cap */ +#else +# define MCACHE_BUCKET_SHIFT 20 +# define MCACHE_MAX_SIZE (1UL<<35) /* 32GB Cap */ +#endif +#define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT) + +/* This is the size of the virtual address space reserve to QEMU that will not + * be use by MapCache. + * From empirical tests I observed that qemu use 75MB more than the + * max_mcache_size. + */ +#define NON_MCACHE_MEMORY_SIZE (80 * MiB) + +typedef struct MapCacheEntry { + hwaddr paddr_index; + uint8_t *vaddr_base; + unsigned long *valid_mapping; + uint8_t lock; +#define XEN_MAPCACHE_ENTRY_DUMMY (1 << 0) + uint8_t flags; + hwaddr size; + struct MapCacheEntry *next; +} MapCacheEntry; + +typedef struct MapCacheRev { + uint8_t *vaddr_req; + hwaddr paddr_index; + hwaddr size; + QTAILQ_ENTRY(MapCacheRev) next; + bool dma; +} MapCacheRev; + +typedef struct MapCache { + MapCacheEntry *entry; + unsigned long nr_buckets; + QTAILQ_HEAD(, MapCacheRev) locked_entries; + + /* For most cases (>99.9%), the page address is the same. 
*/ + MapCacheEntry *last_entry; + unsigned long max_mcache_size; + unsigned int mcache_bucket_shift; + + phys_offset_to_gaddr_t phys_offset_to_gaddr; + QemuMutex lock; + void *opaque; +} MapCache; + +static MapCache *mapcache; + +static inline void mapcache_lock(void) +{ + qemu_mutex_lock(&mapcache->lock); +} + +static inline void mapcache_unlock(void) +{ + qemu_mutex_unlock(&mapcache->lock); +} + +static inline int test_bits(int nr, int size, const unsigned long *addr) +{ + unsigned long res = find_next_zero_bit(addr, size + nr, nr); + if (res >= nr + size) + return 1; + else + return 0; +} + +void xen_map_cache_init(phys_offset_to_gaddr_t f, void *opaque) +{ + unsigned long size; + struct rlimit rlimit_as; + + mapcache = g_malloc0(sizeof (MapCache)); + + mapcache->phys_offset_to_gaddr = f; + mapcache->opaque = opaque; + qemu_mutex_init(&mapcache->lock); + + QTAILQ_INIT(&mapcache->locked_entries); + + if (geteuid() == 0) { + rlimit_as.rlim_cur = RLIM_INFINITY; + rlimit_as.rlim_max = RLIM_INFINITY; + mapcache->max_mcache_size = MCACHE_MAX_SIZE; + } else { + getrlimit(RLIMIT_AS, &rlimit_as); + rlimit_as.rlim_cur = rlimit_as.rlim_max; + + if (rlimit_as.rlim_max != RLIM_INFINITY) { + warn_report("QEMU's maximum size of virtual" + " memory is not infinity"); + } + if (rlimit_as.rlim_max < MCACHE_MAX_SIZE + NON_MCACHE_MEMORY_SIZE) { + mapcache->max_mcache_size = rlimit_as.rlim_max - + NON_MCACHE_MEMORY_SIZE; + } else { + mapcache->max_mcache_size = MCACHE_MAX_SIZE; + } + } + + setrlimit(RLIMIT_AS, &rlimit_as); + + mapcache->nr_buckets = + (((mapcache->max_mcache_size >> XC_PAGE_SHIFT) + + (1UL << (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)) - 1) >> + (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)); + + size = mapcache->nr_buckets * sizeof (MapCacheEntry); + size = (size + XC_PAGE_SIZE - 1) & ~(XC_PAGE_SIZE - 1); + DPRINTF("%s, nr_buckets = %lx size %lu\n", __func__, + mapcache->nr_buckets, size); + mapcache->entry = g_malloc0(size); +} + +static void xen_remap_bucket(MapCacheEntry *entry, + void *vaddr, + hwaddr size, + hwaddr address_index, + bool dummy) +{ + uint8_t *vaddr_base; + xen_pfn_t *pfns; + int *err; + unsigned int i; + hwaddr nb_pfn = size >> XC_PAGE_SHIFT; + + trace_xen_remap_bucket(address_index); + + pfns = g_malloc0(nb_pfn * sizeof (xen_pfn_t)); + err = g_malloc0(nb_pfn * sizeof (int)); + + if (entry->vaddr_base != NULL) { + if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) { + ram_block_notify_remove(entry->vaddr_base, entry->size, + entry->size); + } + + /* + * If an entry is being replaced by another mapping and we're using + * MAP_FIXED flag for it - there is possibility of a race for vaddr + * address with another thread doing an mmap call itself + * (see man 2 mmap). To avoid that we skip explicit unmapping here + * and allow the kernel to destroy the previous mappings by replacing + * them in mmap call later. + * + * Non-identical replacements are not allowed therefore. + */ + assert(!vaddr || (entry->vaddr_base == vaddr && entry->size == size)); + + if (!vaddr && munmap(entry->vaddr_base, entry->size) != 0) { + perror("unmap fails"); + exit(-1); + } + } + g_free(entry->valid_mapping); + entry->valid_mapping = NULL; + + for (i = 0; i < nb_pfn; i++) { + pfns[i] = (address_index << (MCACHE_BUCKET_SHIFT-XC_PAGE_SHIFT)) + i; + } + + /* + * If the caller has requested the mapping at a specific address use + * MAP_FIXED to make sure it's honored. + */ + if (!dummy) { + vaddr_base = xenforeignmemory_map2(xen_fmem, xen_domid, vaddr, + PROT_READ | PROT_WRITE, + vaddr ? 
MAP_FIXED : 0, + nb_pfn, pfns, err); + if (vaddr_base == NULL) { + perror("xenforeignmemory_map2"); + exit(-1); + } + } else { + /* + * We create dummy mappings where we are unable to create a foreign + * mapping immediately due to certain circumstances (i.e. on resume now) + */ + vaddr_base = mmap(vaddr, size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_SHARED | (vaddr ? MAP_FIXED : 0), + -1, 0); + if (vaddr_base == MAP_FAILED) { + perror("mmap"); + exit(-1); + } + } + + if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) { + ram_block_notify_add(vaddr_base, size, size); + } + + entry->vaddr_base = vaddr_base; + entry->paddr_index = address_index; + entry->size = size; + entry->valid_mapping = (unsigned long *) g_malloc0(sizeof(unsigned long) * + BITS_TO_LONGS(size >> XC_PAGE_SHIFT)); + + if (dummy) { + entry->flags |= XEN_MAPCACHE_ENTRY_DUMMY; + } else { + entry->flags &= ~(XEN_MAPCACHE_ENTRY_DUMMY); + } + + bitmap_zero(entry->valid_mapping, nb_pfn); + for (i = 0; i < nb_pfn; i++) { + if (!err[i]) { + bitmap_set(entry->valid_mapping, i, 1); + } + } + + g_free(pfns); + g_free(err); +} + +static uint8_t *xen_map_cache_unlocked(hwaddr phys_addr, hwaddr size, + uint8_t lock, bool dma) +{ + MapCacheEntry *entry, *pentry = NULL, + *free_entry = NULL, *free_pentry = NULL; + hwaddr address_index; + hwaddr address_offset; + hwaddr cache_size = size; + hwaddr test_bit_size; + bool translated G_GNUC_UNUSED = false; + bool dummy = false; + +tryagain: + address_index = phys_addr >> MCACHE_BUCKET_SHIFT; + address_offset = phys_addr & (MCACHE_BUCKET_SIZE - 1); + + trace_xen_map_cache(phys_addr); + + /* test_bit_size is always a multiple of XC_PAGE_SIZE */ + if (size) { + test_bit_size = size + (phys_addr & (XC_PAGE_SIZE - 1)); + + if (test_bit_size % XC_PAGE_SIZE) { + test_bit_size += XC_PAGE_SIZE - (test_bit_size % XC_PAGE_SIZE); + } + } else { + test_bit_size = XC_PAGE_SIZE; + } + + if (mapcache->last_entry != NULL && + mapcache->last_entry->paddr_index == address_index && + !lock && !size && + test_bits(address_offset >> XC_PAGE_SHIFT, + test_bit_size >> XC_PAGE_SHIFT, + mapcache->last_entry->valid_mapping)) { + trace_xen_map_cache_return(mapcache->last_entry->vaddr_base + address_offset); + return mapcache->last_entry->vaddr_base + address_offset; + } + + /* size is always a multiple of MCACHE_BUCKET_SIZE */ + if (size) { + cache_size = size + address_offset; + if (cache_size % MCACHE_BUCKET_SIZE) { + cache_size += MCACHE_BUCKET_SIZE - (cache_size % MCACHE_BUCKET_SIZE); + } + } else { + cache_size = MCACHE_BUCKET_SIZE; + } + + entry = &mapcache->entry[address_index % mapcache->nr_buckets]; + + while (entry && (lock || entry->lock) && entry->vaddr_base && + (entry->paddr_index != address_index || entry->size != cache_size || + !test_bits(address_offset >> XC_PAGE_SHIFT, + test_bit_size >> XC_PAGE_SHIFT, + entry->valid_mapping))) { + if (!free_entry && !entry->lock) { + free_entry = entry; + free_pentry = pentry; + } + pentry = entry; + entry = entry->next; + } + if (!entry && free_entry) { + entry = free_entry; + pentry = free_pentry; + } + if (!entry) { + entry = g_malloc0(sizeof (MapCacheEntry)); + pentry->next = entry; + xen_remap_bucket(entry, NULL, cache_size, address_index, dummy); + } else if (!entry->lock) { + if (!entry->vaddr_base || entry->paddr_index != address_index || + entry->size != cache_size || + !test_bits(address_offset >> XC_PAGE_SHIFT, + test_bit_size >> XC_PAGE_SHIFT, + entry->valid_mapping)) { + xen_remap_bucket(entry, NULL, cache_size, address_index, dummy); + } + } + + 
if(!test_bits(address_offset >> XC_PAGE_SHIFT, + test_bit_size >> XC_PAGE_SHIFT, + entry->valid_mapping)) { + mapcache->last_entry = NULL; +#ifdef XEN_COMPAT_PHYSMAP + if (!translated && mapcache->phys_offset_to_gaddr) { + phys_addr = mapcache->phys_offset_to_gaddr(phys_addr, size); + translated = true; + goto tryagain; + } +#endif + if (!dummy && runstate_check(RUN_STATE_INMIGRATE)) { + dummy = true; + goto tryagain; + } + trace_xen_map_cache_return(NULL); + return NULL; + } + + mapcache->last_entry = entry; + if (lock) { + MapCacheRev *reventry = g_malloc0(sizeof(MapCacheRev)); + entry->lock++; + reventry->dma = dma; + reventry->vaddr_req = mapcache->last_entry->vaddr_base + address_offset; + reventry->paddr_index = mapcache->last_entry->paddr_index; + reventry->size = entry->size; + QTAILQ_INSERT_HEAD(&mapcache->locked_entries, reventry, next); + } + + trace_xen_map_cache_return(mapcache->last_entry->vaddr_base + address_offset); + return mapcache->last_entry->vaddr_base + address_offset; +} + +uint8_t *xen_map_cache(hwaddr phys_addr, hwaddr size, + uint8_t lock, bool dma) +{ + uint8_t *p; + + mapcache_lock(); + p = xen_map_cache_unlocked(phys_addr, size, lock, dma); + mapcache_unlock(); + return p; +} + +ram_addr_t xen_ram_addr_from_mapcache(void *ptr) +{ + MapCacheEntry *entry = NULL; + MapCacheRev *reventry; + hwaddr paddr_index; + hwaddr size; + ram_addr_t raddr; + int found = 0; + + mapcache_lock(); + QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) { + if (reventry->vaddr_req == ptr) { + paddr_index = reventry->paddr_index; + size = reventry->size; + found = 1; + break; + } + } + if (!found) { + fprintf(stderr, "%s, could not find %p\n", __func__, ptr); + QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) { + DPRINTF(" "TARGET_FMT_plx" -> %p is present\n", reventry->paddr_index, + reventry->vaddr_req); + } + abort(); + return 0; + } + + entry = &mapcache->entry[paddr_index % mapcache->nr_buckets]; + while (entry && (entry->paddr_index != paddr_index || entry->size != size)) { + entry = entry->next; + } + if (!entry) { + DPRINTF("Trying to find address %p that is not in the mapcache!\n", ptr); + raddr = 0; + } else { + raddr = (reventry->paddr_index << MCACHE_BUCKET_SHIFT) + + ((unsigned long) ptr - (unsigned long) entry->vaddr_base); + } + mapcache_unlock(); + return raddr; +} + +static void xen_invalidate_map_cache_entry_unlocked(uint8_t *buffer) +{ + MapCacheEntry *entry = NULL, *pentry = NULL; + MapCacheRev *reventry; + hwaddr paddr_index; + hwaddr size; + int found = 0; + + QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) { + if (reventry->vaddr_req == buffer) { + paddr_index = reventry->paddr_index; + size = reventry->size; + found = 1; + break; + } + } + if (!found) { + DPRINTF("%s, could not find %p\n", __func__, buffer); + QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) { + DPRINTF(" "TARGET_FMT_plx" -> %p is present\n", reventry->paddr_index, reventry->vaddr_req); + } + return; + } + QTAILQ_REMOVE(&mapcache->locked_entries, reventry, next); + g_free(reventry); + + if (mapcache->last_entry != NULL && + mapcache->last_entry->paddr_index == paddr_index) { + mapcache->last_entry = NULL; + } + + entry = &mapcache->entry[paddr_index % mapcache->nr_buckets]; + while (entry && (entry->paddr_index != paddr_index || entry->size != size)) { + pentry = entry; + entry = entry->next; + } + if (!entry) { + DPRINTF("Trying to unmap address %p that is not in the mapcache!\n", buffer); + return; + } + entry->lock--; + if (entry->lock > 0 || pentry == 
NULL) { + return; + } + + pentry->next = entry->next; + ram_block_notify_remove(entry->vaddr_base, entry->size, entry->size); + if (munmap(entry->vaddr_base, entry->size) != 0) { + perror("unmap fails"); + exit(-1); + } + g_free(entry->valid_mapping); + g_free(entry); +} + +void xen_invalidate_map_cache_entry(uint8_t *buffer) +{ + mapcache_lock(); + xen_invalidate_map_cache_entry_unlocked(buffer); + mapcache_unlock(); +} + +void xen_invalidate_map_cache(void) +{ + unsigned long i; + MapCacheRev *reventry; + + /* Flush pending AIO before destroying the mapcache */ + bdrv_drain_all(); + + mapcache_lock(); + + QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) { + if (!reventry->dma) { + continue; + } + fprintf(stderr, "Locked DMA mapping while invalidating mapcache!" + " "TARGET_FMT_plx" -> %p is present\n", + reventry->paddr_index, reventry->vaddr_req); + } + + for (i = 0; i < mapcache->nr_buckets; i++) { + MapCacheEntry *entry = &mapcache->entry[i]; + + if (entry->vaddr_base == NULL) { + continue; + } + if (entry->lock > 0) { + continue; + } + + if (munmap(entry->vaddr_base, entry->size) != 0) { + perror("unmap fails"); + exit(-1); + } + + entry->paddr_index = 0; + entry->vaddr_base = NULL; + entry->size = 0; + g_free(entry->valid_mapping); + entry->valid_mapping = NULL; + } + + mapcache->last_entry = NULL; + + mapcache_unlock(); +} + +static uint8_t *xen_replace_cache_entry_unlocked(hwaddr old_phys_addr, + hwaddr new_phys_addr, + hwaddr size) +{ + MapCacheEntry *entry; + hwaddr address_index, address_offset; + hwaddr test_bit_size, cache_size = size; + + address_index = old_phys_addr >> MCACHE_BUCKET_SHIFT; + address_offset = old_phys_addr & (MCACHE_BUCKET_SIZE - 1); + + assert(size); + /* test_bit_size is always a multiple of XC_PAGE_SIZE */ + test_bit_size = size + (old_phys_addr & (XC_PAGE_SIZE - 1)); + if (test_bit_size % XC_PAGE_SIZE) { + test_bit_size += XC_PAGE_SIZE - (test_bit_size % XC_PAGE_SIZE); + } + cache_size = size + address_offset; + if (cache_size % MCACHE_BUCKET_SIZE) { + cache_size += MCACHE_BUCKET_SIZE - (cache_size % MCACHE_BUCKET_SIZE); + } + + entry = &mapcache->entry[address_index % mapcache->nr_buckets]; + while (entry && !(entry->paddr_index == address_index && + entry->size == cache_size)) { + entry = entry->next; + } + if (!entry) { + DPRINTF("Trying to update an entry for "TARGET_FMT_plx \ + "that is not in the mapcache!\n", old_phys_addr); + return NULL; + } + + address_index = new_phys_addr >> MCACHE_BUCKET_SHIFT; + address_offset = new_phys_addr & (MCACHE_BUCKET_SIZE - 1); + + fprintf(stderr, "Replacing a dummy mapcache entry for "TARGET_FMT_plx \ + " with "TARGET_FMT_plx"\n", old_phys_addr, new_phys_addr); + + xen_remap_bucket(entry, entry->vaddr_base, + cache_size, address_index, false); + if (!test_bits(address_offset >> XC_PAGE_SHIFT, + test_bit_size >> XC_PAGE_SHIFT, + entry->valid_mapping)) { + DPRINTF("Unable to update a mapcache entry for "TARGET_FMT_plx"!\n", + old_phys_addr); + return NULL; + } + + return entry->vaddr_base + address_offset; +} + +uint8_t *xen_replace_cache_entry(hwaddr old_phys_addr, + hwaddr new_phys_addr, + hwaddr size) +{ + uint8_t *p; + + mapcache_lock(); + p = xen_replace_cache_entry_unlocked(old_phys_addr, new_phys_addr, size); + mapcache_unlock(); + return p; +} diff --git a/hw/i386/xen/xen_apic.c b/hw/i386/xen/xen_apic.c new file mode 100644 index 000000000..7c7a60b16 --- /dev/null +++ b/hw/i386/xen/xen_apic.c @@ -0,0 +1,103 @@ +/* + * Xen basic APIC support + * + * Copyright (c) 2012 Citrix + * + * Authors: + * Wei 
Liu <wei.liu2@citrix.com> + * + * This work is licensed under the terms of the GNU GPL version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "hw/i386/apic_internal.h" +#include "hw/pci/msi.h" +#include "hw/xen/xen.h" +#include "qemu/module.h" + +static uint64_t xen_apic_mem_read(void *opaque, hwaddr addr, + unsigned size) +{ + return ~(uint64_t)0; +} + +static void xen_apic_mem_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + if (size != sizeof(uint32_t)) { + fprintf(stderr, "Xen: APIC write data size = %d, invalid\n", size); + return; + } + + xen_hvm_inject_msi(addr, data); +} + +static const MemoryRegionOps xen_apic_io_ops = { + .read = xen_apic_mem_read, + .write = xen_apic_mem_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void xen_apic_realize(DeviceState *dev, Error **errp) +{ + APICCommonState *s = APIC_COMMON(dev); + + s->vapic_control = 0; + memory_region_init_io(&s->io_memory, OBJECT(s), &xen_apic_io_ops, s, + "xen-apic-msi", APIC_SPACE_SIZE); + msi_nonbroken = true; +} + +static void xen_apic_set_base(APICCommonState *s, uint64_t val) +{ +} + +static void xen_apic_set_tpr(APICCommonState *s, uint8_t val) +{ +} + +static uint8_t xen_apic_get_tpr(APICCommonState *s) +{ + return 0; +} + +static void xen_apic_vapic_base_update(APICCommonState *s) +{ +} + +static void xen_apic_external_nmi(APICCommonState *s) +{ +} + +static void xen_send_msi(MSIMessage *msi) +{ + xen_hvm_inject_msi(msi->address, msi->data); +} + +static void xen_apic_class_init(ObjectClass *klass, void *data) +{ + APICCommonClass *k = APIC_COMMON_CLASS(klass); + + k->realize = xen_apic_realize; + k->set_base = xen_apic_set_base; + k->set_tpr = xen_apic_set_tpr; + k->get_tpr = xen_apic_get_tpr; + k->vapic_base_update = xen_apic_vapic_base_update; + k->external_nmi = xen_apic_external_nmi; + k->send_msi = xen_send_msi; +} + +static const TypeInfo xen_apic_info = { + .name = "xen-apic", + .parent = TYPE_APIC_COMMON, + .instance_size = sizeof(APICCommonState), + .class_init = xen_apic_class_init, +}; + +static void xen_apic_register_types(void) +{ + type_register_static(&xen_apic_info); +} + +type_init(xen_apic_register_types) diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c new file mode 100644 index 000000000..72028449b --- /dev/null +++ b/hw/i386/xen/xen_platform.c @@ -0,0 +1,519 @@ +/* + * XEN platform pci device, formerly known as the event channel device + * + * Copyright (c) 2003-2004 Intel Corp. + * Copyright (c) 2006 XenSource + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/ide.h" +#include "hw/pci/pci.h" +#include "hw/xen/xen_common.h" +#include "migration/vmstate.h" +#include "hw/xen/xen-legacy-backend.h" +#include "trace.h" +#include "sysemu/xen.h" +#include "sysemu/block-backend.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "qom/object.h" + +//#define DEBUG_PLATFORM + +#ifdef DEBUG_PLATFORM +#define DPRINTF(fmt, ...) do { \ + fprintf(stderr, "xen_platform: " fmt, ## __VA_ARGS__); \ +} while (0) +#else +#define DPRINTF(fmt, ...) do { } while (0) +#endif + +#define PFFLAG_ROM_LOCK 1 /* Sets whether ROM memory area is RW or RO */ + +struct PCIXenPlatformState { + /*< private >*/ + PCIDevice parent_obj; + /*< public >*/ + + MemoryRegion fixed_io; + MemoryRegion bar; + MemoryRegion mmio_bar; + uint8_t flags; /* used only for version_id == 2 */ + uint16_t driver_product_version; + + /* Log from guest drivers */ + char log_buffer[4096]; + int log_buffer_off; +}; + +#define TYPE_XEN_PLATFORM "xen-platform" +OBJECT_DECLARE_SIMPLE_TYPE(PCIXenPlatformState, XEN_PLATFORM) + +#define XEN_PLATFORM_IOPORT 0x10 + +/* Send bytes to syslog */ +static void log_writeb(PCIXenPlatformState *s, char val) +{ + if (val == '\n' || s->log_buffer_off == sizeof(s->log_buffer) - 1) { + /* Flush buffer */ + s->log_buffer[s->log_buffer_off] = 0; + trace_xen_platform_log(s->log_buffer); + s->log_buffer_off = 0; + } else { + s->log_buffer[s->log_buffer_off++] = val; + } +} + +/* + * Unplug device flags. + * + * The logic got a little confused at some point in the past but this is + * what they do now. + * + * bit 0: Unplug all IDE and SCSI disks. + * bit 1: Unplug all NICs. + * bit 2: Unplug IDE disks except primary master. This is overridden if + * bit 0 is also present in the mask. + * bit 3: Unplug all NVMe disks. + * + */ +#define _UNPLUG_IDE_SCSI_DISKS 0 +#define UNPLUG_IDE_SCSI_DISKS (1u << _UNPLUG_IDE_SCSI_DISKS) + +#define _UNPLUG_ALL_NICS 1 +#define UNPLUG_ALL_NICS (1u << _UNPLUG_ALL_NICS) + +#define _UNPLUG_AUX_IDE_DISKS 2 +#define UNPLUG_AUX_IDE_DISKS (1u << _UNPLUG_AUX_IDE_DISKS) + +#define _UNPLUG_NVME_DISKS 3 +#define UNPLUG_NVME_DISKS (1u << _UNPLUG_NVME_DISKS) + +static void unplug_nic(PCIBus *b, PCIDevice *d, void *o) +{ + /* We have to ignore passthrough devices */ + if (pci_get_word(d->config + PCI_CLASS_DEVICE) == + PCI_CLASS_NETWORK_ETHERNET + && strcmp(d->name, "xen-pci-passthrough") != 0) { + object_unparent(OBJECT(d)); + } +} + +/* Remove the peer of the NIC device. Normally, this would be a tap device. 
*/ +static void del_nic_peer(NICState *nic, void *opaque) +{ + NetClientState *nc; + + nc = qemu_get_queue(nic); + if (nc->peer) + qemu_del_net_client(nc->peer); +} + +static void pci_unplug_nics(PCIBus *bus) +{ + qemu_foreach_nic(del_nic_peer, NULL); + pci_for_each_device(bus, 0, unplug_nic, NULL); +} + +static void unplug_disks(PCIBus *b, PCIDevice *d, void *opaque) +{ + uint32_t flags = *(uint32_t *)opaque; + bool aux = (flags & UNPLUG_AUX_IDE_DISKS) && + !(flags & UNPLUG_IDE_SCSI_DISKS); + + /* We have to ignore passthrough devices */ + if (!strcmp(d->name, "xen-pci-passthrough")) { + return; + } + + switch (pci_get_word(d->config + PCI_CLASS_DEVICE)) { + case PCI_CLASS_STORAGE_IDE: + pci_piix3_xen_ide_unplug(DEVICE(d), aux); + break; + + case PCI_CLASS_STORAGE_SCSI: + if (!aux) { + object_unparent(OBJECT(d)); + } + break; + + case PCI_CLASS_STORAGE_EXPRESS: + if (flags & UNPLUG_NVME_DISKS) { + object_unparent(OBJECT(d)); + } + + default: + break; + } +} + +static void pci_unplug_disks(PCIBus *bus, uint32_t flags) +{ + pci_for_each_device(bus, 0, unplug_disks, &flags); +} + +static void platform_fixed_ioport_writew(void *opaque, uint32_t addr, uint32_t val) +{ + PCIXenPlatformState *s = opaque; + + switch (addr) { + case 0: { + PCIDevice *pci_dev = PCI_DEVICE(s); + /* Unplug devices. See comment above flag definitions */ + if (val & (UNPLUG_IDE_SCSI_DISKS | UNPLUG_AUX_IDE_DISKS | + UNPLUG_NVME_DISKS)) { + DPRINTF("unplug disks\n"); + pci_unplug_disks(pci_get_bus(pci_dev), val); + } + if (val & UNPLUG_ALL_NICS) { + DPRINTF("unplug nics\n"); + pci_unplug_nics(pci_get_bus(pci_dev)); + } + break; + } + case 2: + switch (val) { + case 1: + DPRINTF("Citrix Windows PV drivers loaded in guest\n"); + break; + case 0: + DPRINTF("Guest claimed to be running PV product 0?\n"); + break; + default: + DPRINTF("Unknown PV product %d loaded in guest\n", val); + break; + } + s->driver_product_version = val; + break; + } +} + +static void platform_fixed_ioport_writel(void *opaque, uint32_t addr, + uint32_t val) +{ + switch (addr) { + case 0: + /* PV driver version */ + break; + } +} + +static void platform_fixed_ioport_writeb(void *opaque, uint32_t addr, uint32_t val) +{ + PCIXenPlatformState *s = opaque; + + switch (addr) { + case 0: /* Platform flags */ { + hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ? + HVMMEM_ram_ro : HVMMEM_ram_rw; + if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) { + DPRINTF("unable to change ro/rw state of ROM memory area!\n"); + } else { + s->flags = val & PFFLAG_ROM_LOCK; + DPRINTF("changed ro/rw state of ROM memory area. now is %s state.\n", + (mem_type == HVMMEM_ram_ro ? "ro":"rw")); + } + break; + } + case 2: + log_writeb(s, val); + break; + } +} + +static uint32_t platform_fixed_ioport_readw(void *opaque, uint32_t addr) +{ + switch (addr) { + case 0: + /* Magic value so that you can identify the interface. 
*/ + return 0x49d2; + default: + return 0xffff; + } +} + +static uint32_t platform_fixed_ioport_readb(void *opaque, uint32_t addr) +{ + PCIXenPlatformState *s = opaque; + + switch (addr) { + case 0: + /* Platform flags */ + return s->flags; + case 2: + /* Version number */ + return 1; + default: + return 0xff; + } +} + +static void platform_fixed_ioport_reset(void *opaque) +{ + PCIXenPlatformState *s = opaque; + + platform_fixed_ioport_writeb(s, 0, 0); +} + +static uint64_t platform_fixed_ioport_read(void *opaque, + hwaddr addr, + unsigned size) +{ + switch (size) { + case 1: + return platform_fixed_ioport_readb(opaque, addr); + case 2: + return platform_fixed_ioport_readw(opaque, addr); + default: + return -1; + } +} + +static void platform_fixed_ioport_write(void *opaque, hwaddr addr, + + uint64_t val, unsigned size) +{ + switch (size) { + case 1: + platform_fixed_ioport_writeb(opaque, addr, val); + break; + case 2: + platform_fixed_ioport_writew(opaque, addr, val); + break; + case 4: + platform_fixed_ioport_writel(opaque, addr, val); + break; + } +} + + +static const MemoryRegionOps platform_fixed_io_ops = { + .read = platform_fixed_ioport_read, + .write = platform_fixed_ioport_write, + .valid = { + .unaligned = true, + }, + .impl = { + .min_access_size = 1, + .max_access_size = 4, + .unaligned = true, + }, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void platform_fixed_ioport_init(PCIXenPlatformState* s) +{ + memory_region_init_io(&s->fixed_io, OBJECT(s), &platform_fixed_io_ops, s, + "xen-fixed", 16); + memory_region_add_subregion(get_system_io(), XEN_PLATFORM_IOPORT, + &s->fixed_io); +} + +/* Xen Platform PCI Device */ + +static uint64_t xen_platform_ioport_readb(void *opaque, hwaddr addr, + unsigned int size) +{ + if (addr == 0) { + return platform_fixed_ioport_readb(opaque, 0); + } else { + return ~0u; + } +} + +static void xen_platform_ioport_writeb(void *opaque, hwaddr addr, + uint64_t val, unsigned int size) +{ + PCIXenPlatformState *s = opaque; + PCIDevice *pci_dev = PCI_DEVICE(s); + + switch (addr) { + case 0: /* Platform flags */ + platform_fixed_ioport_writeb(opaque, 0, (uint32_t)val); + break; + case 4: + if (val == 1) { + /* + * SUSE unplug for Xenlinux + * xen-kmp used this since xen-3.0.4, instead the official protocol + * from xen-3.3+ It did an unconditional "outl(1, (ioaddr + 4));" + * Pre VMDP 1.7 used 4 and 8 depending on how VMDP was configured. + * If VMDP was to control both disk and LAN it would use 4. + * If it controlled just disk or just LAN, it would use 8 below. 
+ */ + pci_unplug_disks(pci_get_bus(pci_dev), UNPLUG_IDE_SCSI_DISKS); + pci_unplug_nics(pci_get_bus(pci_dev)); + } + break; + case 8: + switch (val) { + case 1: + pci_unplug_disks(pci_get_bus(pci_dev), UNPLUG_IDE_SCSI_DISKS); + break; + case 2: + pci_unplug_nics(pci_get_bus(pci_dev)); + break; + default: + log_writeb(s, (uint32_t)val); + break; + } + break; + default: + break; + } +} + +static const MemoryRegionOps xen_pci_io_ops = { + .read = xen_platform_ioport_readb, + .write = xen_platform_ioport_writeb, + .impl.min_access_size = 1, + .impl.max_access_size = 1, +}; + +static void platform_ioport_bar_setup(PCIXenPlatformState *d) +{ + memory_region_init_io(&d->bar, OBJECT(d), &xen_pci_io_ops, d, + "xen-pci", 0x100); +} + +static uint64_t platform_mmio_read(void *opaque, hwaddr addr, + unsigned size) +{ + DPRINTF("Warning: attempted read from physical address " + "0x" TARGET_FMT_plx " in xen platform mmio space\n", addr); + + return 0; +} + +static void platform_mmio_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + DPRINTF("Warning: attempted write of 0x%"PRIx64" to physical " + "address 0x" TARGET_FMT_plx " in xen platform mmio space\n", + val, addr); +} + +static const MemoryRegionOps platform_mmio_handler = { + .read = &platform_mmio_read, + .write = &platform_mmio_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void platform_mmio_setup(PCIXenPlatformState *d) +{ + memory_region_init_io(&d->mmio_bar, OBJECT(d), &platform_mmio_handler, d, + "xen-mmio", 0x1000000); +} + +static int xen_platform_post_load(void *opaque, int version_id) +{ + PCIXenPlatformState *s = opaque; + + platform_fixed_ioport_writeb(s, 0, s->flags); + + return 0; +} + +static const VMStateDescription vmstate_xen_platform = { + .name = "platform", + .version_id = 4, + .minimum_version_id = 4, + .post_load = xen_platform_post_load, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(parent_obj, PCIXenPlatformState), + VMSTATE_UINT8(flags, PCIXenPlatformState), + VMSTATE_END_OF_LIST() + } +}; + +static void xen_platform_realize(PCIDevice *dev, Error **errp) +{ + PCIXenPlatformState *d = XEN_PLATFORM(dev); + uint8_t *pci_conf; + + /* Device will crash on reset if xen is not initialized */ + if (!xen_enabled()) { + error_setg(errp, "xen-platform device requires the Xen accelerator"); + return; + } + + pci_conf = dev->config; + + pci_set_word(pci_conf + PCI_COMMAND, PCI_COMMAND_IO | PCI_COMMAND_MEMORY); + + pci_config_set_prog_interface(pci_conf, 0); + + pci_conf[PCI_INTERRUPT_PIN] = 1; + + platform_ioport_bar_setup(d); + pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_IO, &d->bar); + + /* reserve 16MB mmio address for share memory*/ + platform_mmio_setup(d); + pci_register_bar(dev, 1, PCI_BASE_ADDRESS_MEM_PREFETCH, + &d->mmio_bar); + + platform_fixed_ioport_init(d); +} + +static void platform_reset(DeviceState *dev) +{ + PCIXenPlatformState *s = XEN_PLATFORM(dev); + + platform_fixed_ioport_reset(s); +} + +static void xen_platform_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + k->realize = xen_platform_realize; + k->vendor_id = PCI_VENDOR_ID_XEN; + k->device_id = PCI_DEVICE_ID_XEN_PLATFORM; + k->class_id = PCI_CLASS_OTHERS << 8 | 0x80; + k->subsystem_vendor_id = PCI_VENDOR_ID_XEN; + k->subsystem_id = PCI_DEVICE_ID_XEN_PLATFORM; + k->revision = 1; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + dc->desc = "XEN platform pci device"; + dc->reset = platform_reset; + dc->vmsd = &vmstate_xen_platform; +} 
+ +static const TypeInfo xen_platform_info = { + .name = TYPE_XEN_PLATFORM, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(PCIXenPlatformState), + .class_init = xen_platform_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, + }, +}; + +static void xen_platform_register_types(void) +{ + type_register_static(&xen_platform_info); +} + +type_init(xen_platform_register_types) diff --git a/hw/i386/xen/xen_pvdevice.c b/hw/i386/xen/xen_pvdevice.c new file mode 100644 index 000000000..1ea95fa60 --- /dev/null +++ b/hw/i386/xen/xen_pvdevice.c @@ -0,0 +1,154 @@ +/* Copyright (c) Citrix Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, + * with or without modification, are permitted provided + * that the following conditions are met: + * + * * Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the + * following disclaimer in the documentation and/or other + * materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "hw/pci/pci.h" +#include "hw/qdev-properties.h" +#include "migration/vmstate.h" +#include "trace.h" +#include "qom/object.h" + +#define TYPE_XEN_PV_DEVICE "xen-pvdevice" + +OBJECT_DECLARE_SIMPLE_TYPE(XenPVDevice, XEN_PV_DEVICE) + +struct XenPVDevice { + /*< private >*/ + PCIDevice parent_obj; + /*< public >*/ + uint16_t vendor_id; + uint16_t device_id; + uint8_t revision; + uint32_t size; + MemoryRegion mmio; +}; + +static uint64_t xen_pv_mmio_read(void *opaque, hwaddr addr, + unsigned size) +{ + trace_xen_pv_mmio_read(addr); + + return ~(uint64_t)0; +} + +static void xen_pv_mmio_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + trace_xen_pv_mmio_write(addr); +} + +static const MemoryRegionOps xen_pv_mmio_ops = { + .read = &xen_pv_mmio_read, + .write = &xen_pv_mmio_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static const VMStateDescription vmstate_xen_pvdevice = { + .name = "xen-pvdevice", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(parent_obj, XenPVDevice), + VMSTATE_END_OF_LIST() + } +}; + +static void xen_pv_realize(PCIDevice *pci_dev, Error **errp) +{ + XenPVDevice *d = XEN_PV_DEVICE(pci_dev); + uint8_t *pci_conf; + + /* device-id property must always be supplied */ + if (d->device_id == 0xffff) { + error_setg(errp, "Device ID invalid, it must always be supplied"); + return; + } + + pci_conf = pci_dev->config; + + pci_set_word(pci_conf + PCI_VENDOR_ID, d->vendor_id); + pci_set_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID, d->vendor_id); + pci_set_word(pci_conf + PCI_DEVICE_ID, d->device_id); + pci_set_word(pci_conf + PCI_SUBSYSTEM_ID, d->device_id); + pci_set_byte(pci_conf + PCI_REVISION_ID, d->revision); + + pci_set_word(pci_conf + PCI_COMMAND, PCI_COMMAND_MEMORY); + + pci_config_set_prog_interface(pci_conf, 0); + + pci_conf[PCI_INTERRUPT_PIN] = 1; + + memory_region_init_io(&d->mmio, NULL, &xen_pv_mmio_ops, d, + "mmio", d->size); + + pci_register_bar(pci_dev, 1, PCI_BASE_ADDRESS_MEM_PREFETCH, + &d->mmio); +} + +static Property xen_pv_props[] = { + DEFINE_PROP_UINT16("vendor-id", XenPVDevice, vendor_id, PCI_VENDOR_ID_XEN), + DEFINE_PROP_UINT16("device-id", XenPVDevice, device_id, 0xffff), + DEFINE_PROP_UINT8("revision", XenPVDevice, revision, 0x01), + DEFINE_PROP_UINT32("size", XenPVDevice, size, 0x400000), + DEFINE_PROP_END_OF_LIST() +}; + +static void xen_pv_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + k->realize = xen_pv_realize; + k->class_id = PCI_CLASS_SYSTEM_OTHER; + dc->desc = "Xen PV Device"; + device_class_set_props(dc, xen_pv_props); + dc->vmsd = &vmstate_xen_pvdevice; +} + +static const TypeInfo xen_pv_type_info = { + .name = TYPE_XEN_PV_DEVICE, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(XenPVDevice), + .class_init = xen_pv_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, + }, +}; + +static void xen_pv_register_types(void) +{ + type_register_static(&xen_pv_type_info); +} + +type_init(xen_pv_register_types) |