Diffstat (limited to 'hw/i386')
54 files changed, 23847 insertions, 0 deletions
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
new file mode 100644
index 000000000..d22ac4a4b
--- /dev/null
+++ b/hw/i386/Kconfig
@@ -0,0 +1,139 @@
+config X86_FW_OVMF
+    bool
+
+config SEV
+    bool
+    select X86_FW_OVMF
+    depends on KVM
+
+config SGX
+    bool
+    depends on KVM
+
+config PC
+    bool
+    imply APPLESMC
+    imply HYPERV
+    imply ISA_IPMI_KCS
+    imply ISA_IPMI_BT
+    imply PCI_IPMI_KCS
+    imply PCI_IPMI_BT
+    imply IPMI_SSIF
+    imply ISA_DEBUG
+    imply PARALLEL
+    imply PCI_DEVICES
+    imply PVPANIC_ISA
+    imply QXL
+    imply SEV
+    imply SGX
+    imply SGA
+    imply TEST_DEVICES
+    imply TPM_CRB
+    imply TPM_TIS_ISA
+    imply VGA_PCI
+    imply VIRTIO_VGA
+    imply NVDIMM
+    select FDC_ISA
+    select I8259
+    select I8254
+    select PCKBD
+    select PCSPK
+    select I8257
+    select MC146818RTC
+    # For ACPI builder:
+    select SERIAL_ISA
+    select ACPI_PCI
+    select ACPI_VMGENID
+    select VIRTIO_PMEM_SUPPORTED
+    select VIRTIO_MEM_SUPPORTED
+
+config PC_PCI
+    bool
+    select APIC
+    select IOAPIC
+    select APM
+    select PC
+
+config PC_ACPI
+    bool
+    select ACPI_X86
+    select ACPI_CPU_HOTPLUG
+    select ACPI_MEMORY_HOTPLUG
+    select ACPI_VIOT
+    select SMBUS_EEPROM
+    select PFLASH_CFI01
+    depends on ACPI_SMBUS
+
+config I440FX
+    bool
+    imply E1000_PCI
+    imply VMPORT
+    imply VMMOUSE
+    select PC_PCI
+    select PC_ACPI
+    select ACPI_SMBUS
+    select PCI_I440FX
+    select PIIX3
+    select IDE_PIIX
+    select DIMM
+    select SMBIOS
+    select FW_CFG_DMA
+
+config ISAPC
+    bool
+    select ISA_BUS
+    select PC
+    select IDE_ISA
+    select VGA_ISA
+    # FIXME: it is in the same file as i440fx, and does not compile
+    # if separated
+    depends on I440FX
+
+config Q35
+    bool
+    imply VTD
+    imply AMD_IOMMU
+    imply E1000E_PCI_EXPRESS
+    imply VMPORT
+    imply VMMOUSE
+    select PC_PCI
+    select PC_ACPI
+    select PCI_EXPRESS_Q35
+    select LPC_ICH9
+    select AHCI_ICH9
+    select DIMM
+    select SMBIOS
+    select FW_CFG_DMA
+
+config MICROVM
+    bool
+    select SERIAL_ISA # for serial_hds_isa_init()
+    select ISA_BUS
+    select APIC
+    select IOAPIC
+    select I8259
+    select MC146818RTC
+    select VIRTIO_MMIO
+    select ACPI_HW_REDUCED
+    select PCI_EXPRESS_GENERIC_BRIDGE
+    select USB_XHCI_SYSBUS
+    select I8254
+
+config X86_IOMMU
+    bool
+    depends on PC
+
+config VTD
+    bool
+    select X86_IOMMU
+
+config AMD_IOMMU
+    bool
+    select X86_IOMMU
+
+config VMPORT
+    bool
+
+config VMMOUSE
+    bool
+    depends on VMPORT
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
new file mode 100644
index 000000000..a99c6e4fe
--- /dev/null
+++ b/hw/i386/acpi-build.c
@@ -0,0 +1,2873 @@
+/* Support for generating ACPI tables and passing them to Guests
+ *
+ * Copyright (C) 2008-2010 Kevin O'Connor <kevin@koconnor.net>
+ * Copyright (C) 2006 Fabrice Bellard
+ * Copyright (C) 2013 Red Hat Inc
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qmp/qnum.h" +#include "acpi-build.h" +#include "acpi-common.h" +#include "qemu/bitmap.h" +#include "qemu/error-report.h" +#include "hw/pci/pci.h" +#include "hw/core/cpu.h" +#include "target/i386/cpu.h" +#include "hw/misc/pvpanic.h" +#include "hw/timer/hpet.h" +#include "hw/acpi/acpi-defs.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/cpu.h" +#include "hw/nvram/fw_cfg.h" +#include "hw/acpi/bios-linker-loader.h" +#include "hw/isa/isa.h" +#include "hw/block/fdc.h" +#include "hw/acpi/memory_hotplug.h" +#include "sysemu/tpm.h" +#include "hw/acpi/tpm.h" +#include "hw/acpi/vmgenid.h" +#include "sysemu/tpm_backend.h" +#include "hw/rtc/mc146818rtc_regs.h" +#include "migration/vmstate.h" +#include "hw/mem/memory-device.h" +#include "hw/mem/nvdimm.h" +#include "sysemu/numa.h" +#include "sysemu/reset.h" +#include "hw/hyperv/vmbus-bridge.h" + +/* Supported chipsets: */ +#include "hw/southbridge/piix.h" +#include "hw/acpi/pcihp.h" +#include "hw/i386/fw_cfg.h" +#include "hw/i386/ich9.h" +#include "hw/pci/pci_bus.h" +#include "hw/pci-host/q35.h" +#include "hw/i386/x86-iommu.h" + +#include "hw/acpi/aml-build.h" +#include "hw/acpi/utils.h" +#include "hw/acpi/pci.h" + +#include "qom/qom-qobject.h" +#include "hw/i386/amd_iommu.h" +#include "hw/i386/intel_iommu.h" +#include "hw/virtio/virtio-iommu.h" + +#include "hw/acpi/ipmi.h" +#include "hw/acpi/hmat.h" +#include "hw/acpi/viot.h" + +/* These are used to size the ACPI tables for -M pc-i440fx-1.7 and + * -M pc-i440fx-2.0. Even if the actual amount of AML generated grows + * a little bit, there should be plenty of free space since the DSDT + * shrunk by ~1.5k between QEMU 2.0 and QEMU 2.1. + */ +#define ACPI_BUILD_LEGACY_CPU_AML_SIZE 97 +#define ACPI_BUILD_ALIGN_SIZE 0x1000 + +#define ACPI_BUILD_TABLE_SIZE 0x20000 + +/* #define DEBUG_ACPI_BUILD */ +#ifdef DEBUG_ACPI_BUILD +#define ACPI_BUILD_DPRINTF(fmt, ...) \ + do {printf("ACPI_BUILD: " fmt, ## __VA_ARGS__); } while (0) +#else +#define ACPI_BUILD_DPRINTF(fmt, ...) +#endif + +typedef struct AcpiPmInfo { + bool s3_disabled; + bool s4_disabled; + bool pcihp_bridge_en; + bool smi_on_cpuhp; + bool smi_on_cpu_unplug; + bool pcihp_root_en; + uint8_t s4_val; + AcpiFadtData fadt; + uint16_t cpu_hp_io_base; + uint16_t pcihp_io_base; + uint16_t pcihp_io_len; +} AcpiPmInfo; + +typedef struct AcpiMiscInfo { + bool is_piix4; + bool has_hpet; +#ifdef CONFIG_TPM + TPMVersion tpm_version; +#endif + const unsigned char *dsdt_code; + unsigned dsdt_size; + uint16_t pvpanic_port; + uint16_t applesmc_io_base; +} AcpiMiscInfo; + +typedef struct AcpiBuildPciBusHotplugState { + GArray *device_table; + GArray *notify_table; + struct AcpiBuildPciBusHotplugState *parent; + bool pcihp_bridge_en; +} AcpiBuildPciBusHotplugState; + +typedef struct FwCfgTPMConfig { + uint32_t tpmppi_address; + uint8_t tpm_version; + uint8_t tpmppi_version; +} QEMU_PACKED FwCfgTPMConfig; + +static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg); + +const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio = { + .space_id = AML_AS_SYSTEM_IO, + .address = NVDIMM_ACPI_IO_BASE, + .bit_width = NVDIMM_ACPI_IO_LEN << 3 +}; + +static void init_common_fadt_data(MachineState *ms, Object *o, + AcpiFadtData *data) +{ + X86MachineState *x86ms = X86_MACHINE(ms); + /* + * "ICH9-LPC" or "PIIX4_PM" has "smm-compat" property to keep the old + * behavior for compatibility irrelevant to smm_enabled, which doesn't + * comforms to ACPI spec. + */ + bool smm_enabled = object_property_get_bool(o, "smm-compat", NULL) ? 
+ true : x86_machine_is_smm_enabled(x86ms); + uint32_t io = object_property_get_uint(o, ACPI_PM_PROP_PM_IO_BASE, NULL); + AmlAddressSpace as = AML_AS_SYSTEM_IO; + AcpiFadtData fadt = { + .rev = 3, + .flags = + (1 << ACPI_FADT_F_WBINVD) | + (1 << ACPI_FADT_F_PROC_C1) | + (1 << ACPI_FADT_F_SLP_BUTTON) | + (1 << ACPI_FADT_F_RTC_S4) | + (1 << ACPI_FADT_F_USE_PLATFORM_CLOCK) | + /* APIC destination mode ("Flat Logical") has an upper limit of 8 + * CPUs for more than 8 CPUs, "Clustered Logical" mode has to be + * used + */ + ((ms->smp.max_cpus > 8) ? + (1 << ACPI_FADT_F_FORCE_APIC_CLUSTER_MODEL) : 0), + .int_model = 1 /* Multiple APIC */, + .rtc_century = RTC_CENTURY, + .plvl2_lat = 0xfff /* C2 state not supported */, + .plvl3_lat = 0xfff /* C3 state not supported */, + .smi_cmd = smm_enabled ? ACPI_PORT_SMI_CMD : 0, + .sci_int = object_property_get_uint(o, ACPI_PM_PROP_SCI_INT, NULL), + .acpi_enable_cmd = + smm_enabled ? + object_property_get_uint(o, ACPI_PM_PROP_ACPI_ENABLE_CMD, NULL) : + 0, + .acpi_disable_cmd = + smm_enabled ? + object_property_get_uint(o, ACPI_PM_PROP_ACPI_DISABLE_CMD, NULL) : + 0, + .pm1a_evt = { .space_id = as, .bit_width = 4 * 8, .address = io }, + .pm1a_cnt = { .space_id = as, .bit_width = 2 * 8, + .address = io + 0x04 }, + .pm_tmr = { .space_id = as, .bit_width = 4 * 8, .address = io + 0x08 }, + .gpe0_blk = { .space_id = as, .bit_width = + object_property_get_uint(o, ACPI_PM_PROP_GPE0_BLK_LEN, NULL) * 8, + .address = object_property_get_uint(o, ACPI_PM_PROP_GPE0_BLK, NULL) + }, + }; + *data = fadt; +} + +static Object *object_resolve_type_unambiguous(const char *typename) +{ + bool ambig; + Object *o = object_resolve_path_type("", typename, &ambig); + + if (ambig || !o) { + return NULL; + } + return o; +} + +static void acpi_get_pm_info(MachineState *machine, AcpiPmInfo *pm) +{ + Object *piix = object_resolve_type_unambiguous(TYPE_PIIX4_PM); + Object *lpc = object_resolve_type_unambiguous(TYPE_ICH9_LPC_DEVICE); + Object *obj = piix ? piix : lpc; + QObject *o; + pm->cpu_hp_io_base = 0; + pm->pcihp_io_base = 0; + pm->pcihp_io_len = 0; + pm->smi_on_cpuhp = false; + pm->smi_on_cpu_unplug = false; + + assert(obj); + init_common_fadt_data(machine, obj, &pm->fadt); + if (piix) { + /* w2k requires FADT(rev1) or it won't boot, keep PC compatible */ + pm->fadt.rev = 1; + pm->cpu_hp_io_base = PIIX4_CPU_HOTPLUG_IO_BASE; + } + if (lpc) { + uint64_t smi_features = object_property_get_uint(lpc, + ICH9_LPC_SMI_NEGOTIATED_FEAT_PROP, NULL); + struct AcpiGenericAddress r = { .space_id = AML_AS_SYSTEM_IO, + .bit_width = 8, .address = ICH9_RST_CNT_IOPORT }; + pm->fadt.reset_reg = r; + pm->fadt.reset_val = 0xf; + pm->fadt.flags |= 1 << ACPI_FADT_F_RESET_REG_SUP; + pm->cpu_hp_io_base = ICH9_CPU_HOTPLUG_IO_BASE; + pm->smi_on_cpuhp = + !!(smi_features & BIT_ULL(ICH9_LPC_SMI_F_CPU_HOTPLUG_BIT)); + pm->smi_on_cpu_unplug = + !!(smi_features & BIT_ULL(ICH9_LPC_SMI_F_CPU_HOT_UNPLUG_BIT)); + } + pm->pcihp_io_base = + object_property_get_uint(obj, ACPI_PCIHP_IO_BASE_PROP, NULL); + pm->pcihp_io_len = + object_property_get_uint(obj, ACPI_PCIHP_IO_LEN_PROP, NULL); + + /* The above need not be conditional on machine type because the reset port + * happens to be the same on PIIX (pc) and ICH9 (q35). 
*/ + QEMU_BUILD_BUG_ON(ICH9_RST_CNT_IOPORT != PIIX_RCR_IOPORT); + + /* Fill in optional s3/s4 related properties */ + o = object_property_get_qobject(obj, ACPI_PM_PROP_S3_DISABLED, NULL); + if (o) { + pm->s3_disabled = qnum_get_uint(qobject_to(QNum, o)); + } else { + pm->s3_disabled = false; + } + qobject_unref(o); + o = object_property_get_qobject(obj, ACPI_PM_PROP_S4_DISABLED, NULL); + if (o) { + pm->s4_disabled = qnum_get_uint(qobject_to(QNum, o)); + } else { + pm->s4_disabled = false; + } + qobject_unref(o); + o = object_property_get_qobject(obj, ACPI_PM_PROP_S4_VAL, NULL); + if (o) { + pm->s4_val = qnum_get_uint(qobject_to(QNum, o)); + } else { + pm->s4_val = false; + } + qobject_unref(o); + + pm->pcihp_bridge_en = + object_property_get_bool(obj, ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, + NULL); + pm->pcihp_root_en = + object_property_get_bool(obj, ACPI_PM_PROP_ACPI_PCI_ROOTHP, + NULL); +} + +static void acpi_get_misc_info(AcpiMiscInfo *info) +{ + Object *piix = object_resolve_type_unambiguous(TYPE_PIIX4_PM); + Object *lpc = object_resolve_type_unambiguous(TYPE_ICH9_LPC_DEVICE); + assert(!!piix != !!lpc); + + if (piix) { + info->is_piix4 = true; + } + if (lpc) { + info->is_piix4 = false; + } + + info->has_hpet = hpet_find(); +#ifdef CONFIG_TPM + info->tpm_version = tpm_get_version(tpm_find()); +#endif + info->pvpanic_port = pvpanic_port(); + info->applesmc_io_base = applesmc_port(); +} + +/* + * Because of the PXB hosts we cannot simply query TYPE_PCI_HOST_BRIDGE. + * On i386 arch we only have two pci hosts, so we can look only for them. + */ +Object *acpi_get_i386_pci_host(void) +{ + PCIHostState *host; + + host = PCI_HOST_BRIDGE(object_resolve_path("/machine/i440fx", NULL)); + if (!host) { + host = PCI_HOST_BRIDGE(object_resolve_path("/machine/q35", NULL)); + } + + return OBJECT(host); +} + +static void acpi_get_pci_holes(Range *hole, Range *hole64) +{ + Object *pci_host; + + pci_host = acpi_get_i386_pci_host(); + + if (!pci_host) { + return; + } + + range_set_bounds1(hole, + object_property_get_uint(pci_host, + PCI_HOST_PROP_PCI_HOLE_START, + NULL), + object_property_get_uint(pci_host, + PCI_HOST_PROP_PCI_HOLE_END, + NULL)); + range_set_bounds1(hole64, + object_property_get_uint(pci_host, + PCI_HOST_PROP_PCI_HOLE64_START, + NULL), + object_property_get_uint(pci_host, + PCI_HOST_PROP_PCI_HOLE64_END, + NULL)); +} + +static void acpi_align_size(GArray *blob, unsigned align) +{ + /* Align size to multiple of given size. This reduces the chance + * we need to change size in the future (breaking cross version migration). 
+ */ + g_array_set_size(blob, ROUND_UP(acpi_data_len(blob), align)); +} + +/* + * ACPI spec 1.0b, + * 5.2.6 Firmware ACPI Control Structure + */ +static void +build_facs(GArray *table_data) +{ + const char *sig = "FACS"; + const uint8_t reserved[40] = {}; + + g_array_append_vals(table_data, sig, 4); /* Signature */ + build_append_int_noprefix(table_data, 64, 4); /* Length */ + build_append_int_noprefix(table_data, 0, 4); /* Hardware Signature */ + build_append_int_noprefix(table_data, 0, 4); /* Firmware Waking Vector */ + build_append_int_noprefix(table_data, 0, 4); /* Global Lock */ + build_append_int_noprefix(table_data, 0, 4); /* Flags */ + g_array_append_vals(table_data, reserved, 40); /* Reserved */ +} + +static void build_append_pcihp_notify_entry(Aml *method, int slot) +{ + Aml *if_ctx; + int32_t devfn = PCI_DEVFN(slot, 0); + + if_ctx = aml_if(aml_and(aml_arg(0), aml_int(0x1U << slot), NULL)); + aml_append(if_ctx, aml_notify(aml_name("S%.02X", devfn), aml_arg(1))); + aml_append(method, if_ctx); +} + +static void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus, + bool pcihp_bridge_en) +{ + Aml *dev, *notify_method = NULL, *method; + QObject *bsel; + PCIBus *sec; + int devfn; + + bsel = object_property_get_qobject(OBJECT(bus), ACPI_PCIHP_PROP_BSEL, NULL); + if (bsel) { + uint64_t bsel_val = qnum_get_uint(qobject_to(QNum, bsel)); + + aml_append(parent_scope, aml_name_decl("BSEL", aml_int(bsel_val))); + notify_method = aml_method("DVNT", 2, AML_NOTSERIALIZED); + } + + for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) { + DeviceClass *dc; + PCIDeviceClass *pc; + PCIDevice *pdev = bus->devices[devfn]; + int slot = PCI_SLOT(devfn); + int func = PCI_FUNC(devfn); + /* ACPI spec: 1.0b: Table 6-2 _ADR Object Bus Types, PCI type */ + int adr = slot << 16 | func; + bool hotplug_enabled_dev; + bool bridge_in_acpi; + bool cold_plugged_bridge; + + if (!pdev) { + /* + * add hotplug slots for non present devices. + * hotplug is supported only for non-multifunction device + * so generate device description only for function 0 + */ + if (bsel && !func) { + if (pci_bus_is_express(bus) && slot > 0) { + break; + } + dev = aml_device("S%.02X", devfn); + aml_append(dev, aml_name_decl("_SUN", aml_int(slot))); + aml_append(dev, aml_name_decl("_ADR", aml_int(adr))); + method = aml_method("_EJ0", 1, AML_NOTSERIALIZED); + aml_append(method, + aml_call2("PCEJ", aml_name("BSEL"), aml_name("_SUN")) + ); + aml_append(dev, method); + method = aml_method("_DSM", 4, AML_SERIALIZED); + aml_append(method, + aml_return(aml_call6("PDSM", aml_arg(0), aml_arg(1), + aml_arg(2), aml_arg(3), + aml_name("BSEL"), aml_name("_SUN"))) + ); + aml_append(dev, method); + aml_append(parent_scope, dev); + + build_append_pcihp_notify_entry(notify_method, slot); + } + continue; + } + + pc = PCI_DEVICE_GET_CLASS(pdev); + dc = DEVICE_GET_CLASS(pdev); + + /* + * Cold plugged bridges aren't themselves hot-pluggable. + * Hotplugged bridges *are* hot-pluggable. 
+ */ + cold_plugged_bridge = pc->is_bridge && !DEVICE(pdev)->hotplugged; + bridge_in_acpi = cold_plugged_bridge && pcihp_bridge_en; + + hotplug_enabled_dev = bsel && dc->hotpluggable && !cold_plugged_bridge; + + if (pc->class_id == PCI_CLASS_BRIDGE_ISA) { + continue; + } + + /* + * allow describing coldplugged bridges in ACPI even if they are not + * on function 0, as they are not unpluggable, for all other devices + * generate description only for function 0 per slot + */ + if (func && !bridge_in_acpi) { + continue; + } + + /* start to compose PCI device descriptor */ + dev = aml_device("S%.02X", devfn); + aml_append(dev, aml_name_decl("_ADR", aml_int(adr))); + + if (bsel) { + /* + * Can't declare _SUN here for every device as it changes 'slot' + * enumeration order in linux kernel, so use another variable for it + */ + aml_append(dev, aml_name_decl("ASUN", aml_int(slot))); + method = aml_method("_DSM", 4, AML_SERIALIZED); + aml_append(method, aml_return( + aml_call6("PDSM", aml_arg(0), aml_arg(1), aml_arg(2), + aml_arg(3), aml_name("BSEL"), aml_name("ASUN")) + )); + aml_append(dev, method); + } + + if (pc->class_id == PCI_CLASS_DISPLAY_VGA) { + /* add VGA specific AML methods */ + int s3d; + + if (object_dynamic_cast(OBJECT(pdev), "qxl-vga")) { + s3d = 3; + } else { + s3d = 0; + } + + method = aml_method("_S1D", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_int(0))); + aml_append(dev, method); + + method = aml_method("_S2D", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_int(0))); + aml_append(dev, method); + + method = aml_method("_S3D", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_int(s3d))); + aml_append(dev, method); + } else if (hotplug_enabled_dev) { + aml_append(dev, aml_name_decl("_SUN", aml_int(slot))); + /* add _EJ0 to make slot hotpluggable */ + method = aml_method("_EJ0", 1, AML_NOTSERIALIZED); + aml_append(method, + aml_call2("PCEJ", aml_name("BSEL"), aml_name("_SUN")) + ); + aml_append(dev, method); + + if (bsel) { + build_append_pcihp_notify_entry(notify_method, slot); + } + } else if (bridge_in_acpi) { + /* + * device is coldplugged bridge, + * add child device descriptions into its scope + */ + PCIBus *sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev)); + + build_append_pci_bus_devices(dev, sec_bus, pcihp_bridge_en); + } + /* device descriptor has been composed, add it into parent context */ + aml_append(parent_scope, dev); + } + + if (bsel) { + aml_append(parent_scope, notify_method); + } + + /* Append PCNT method to notify about events on local and child buses. + * Add this method for root bus only when hotplug is enabled since DSDT + * expects it. 
+ */ + if (bsel || pcihp_bridge_en) { + method = aml_method("PCNT", 0, AML_NOTSERIALIZED); + + /* If bus supports hotplug select it and notify about local events */ + if (bsel) { + uint64_t bsel_val = qnum_get_uint(qobject_to(QNum, bsel)); + + aml_append(method, aml_store(aml_int(bsel_val), aml_name("BNUM"))); + aml_append(method, aml_call2("DVNT", aml_name("PCIU"), + aml_int(1))); /* Device Check */ + aml_append(method, aml_call2("DVNT", aml_name("PCID"), + aml_int(3))); /* Eject Request */ + } + + /* Notify about child bus events in any case */ + if (pcihp_bridge_en) { + QLIST_FOREACH(sec, &bus->child, sibling) { + if (pci_bus_is_root(sec)) { + continue; + } + + aml_append(method, aml_name("^S%.02X.PCNT", + sec->parent_dev->devfn)); + } + } + + aml_append(parent_scope, method); + } + qobject_unref(bsel); +} + +Aml *aml_pci_device_dsm(void) +{ + Aml *method, *UUID, *ifctx, *ifctx1, *ifctx2, *ifctx3, *elsectx; + Aml *acpi_index = aml_local(0); + Aml *zero = aml_int(0); + Aml *bnum = aml_arg(4); + Aml *func = aml_arg(2); + Aml *rev = aml_arg(1); + Aml *sun = aml_arg(5); + + method = aml_method("PDSM", 6, AML_SERIALIZED); + + /* + * PCI Firmware Specification 3.1 + * 4.6. _DSM Definitions for PCI + */ + UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); + ifctx = aml_if(aml_equal(aml_arg(0), UUID)); + { + aml_append(ifctx, aml_store(aml_call2("AIDX", bnum, sun), acpi_index)); + ifctx1 = aml_if(aml_equal(func, zero)); + { + uint8_t byte_list[1]; + + ifctx2 = aml_if(aml_equal(rev, aml_int(2))); + { + /* + * advertise function 7 if device has acpi-index + * acpi_index values: + * 0: not present (default value) + * FFFFFFFF: not supported (old QEMU without PIDX reg) + * other: device's acpi-index + */ + ifctx3 = aml_if(aml_lnot( + aml_or(aml_equal(acpi_index, zero), + aml_equal(acpi_index, aml_int(0xFFFFFFFF)), NULL) + )); + { + byte_list[0] = + 1 /* have supported functions */ | + 1 << 7 /* support for function 7 */ + ; + aml_append(ifctx3, aml_return(aml_buffer(1, byte_list))); + } + aml_append(ifctx2, ifctx3); + } + aml_append(ifctx1, ifctx2); + + byte_list[0] = 0; /* nothing supported */ + aml_append(ifctx1, aml_return(aml_buffer(1, byte_list))); + } + aml_append(ifctx, ifctx1); + elsectx = aml_else(); + /* + * PCI Firmware Specification 3.1 + * 4.6.7. _DSM for Naming a PCI or PCI Express Device Under + * Operating Systems + */ + ifctx1 = aml_if(aml_equal(func, aml_int(7))); + { + Aml *pkg = aml_package(2); + Aml *ret = aml_local(1); + + aml_append(pkg, zero); + /* + * optional, if not impl. 
should return null string + */ + aml_append(pkg, aml_string("%s", "")); + aml_append(ifctx1, aml_store(pkg, ret)); + /* + * update acpi-index to actual value + */ + aml_append(ifctx1, aml_store(acpi_index, aml_index(ret, zero))); + aml_append(ifctx1, aml_return(ret)); + } + aml_append(elsectx, ifctx1); + aml_append(ifctx, elsectx); + } + aml_append(method, ifctx); + return method; +} + +/** + * build_prt_entry: + * @link_name: link name for PCI route entry + * + * build AML package containing a PCI route entry for @link_name + */ +static Aml *build_prt_entry(const char *link_name) +{ + Aml *a_zero = aml_int(0); + Aml *pkg = aml_package(4); + aml_append(pkg, a_zero); + aml_append(pkg, a_zero); + aml_append(pkg, aml_name("%s", link_name)); + aml_append(pkg, a_zero); + return pkg; +} + +/* + * initialize_route - Initialize the interrupt routing rule + * through a specific LINK: + * if (lnk_idx == idx) + * route using link 'link_name' + */ +static Aml *initialize_route(Aml *route, const char *link_name, + Aml *lnk_idx, int idx) +{ + Aml *if_ctx = aml_if(aml_equal(lnk_idx, aml_int(idx))); + Aml *pkg = build_prt_entry(link_name); + + aml_append(if_ctx, aml_store(pkg, route)); + + return if_ctx; +} + +/* + * build_prt - Define interrupt rounting rules + * + * Returns an array of 128 routes, one for each device, + * based on device location. + * The main goal is to equaly distribute the interrupts + * over the 4 existing ACPI links (works only for i440fx). + * The hash function is (slot + pin) & 3 -> "LNK[D|A|B|C]". + * + */ +static Aml *build_prt(bool is_pci0_prt) +{ + Aml *method, *while_ctx, *pin, *res; + + method = aml_method("_PRT", 0, AML_NOTSERIALIZED); + res = aml_local(0); + pin = aml_local(1); + aml_append(method, aml_store(aml_package(128), res)); + aml_append(method, aml_store(aml_int(0), pin)); + + /* while (pin < 128) */ + while_ctx = aml_while(aml_lless(pin, aml_int(128))); + { + Aml *slot = aml_local(2); + Aml *lnk_idx = aml_local(3); + Aml *route = aml_local(4); + + /* slot = pin >> 2 */ + aml_append(while_ctx, + aml_store(aml_shiftright(pin, aml_int(2), NULL), slot)); + /* lnk_idx = (slot + pin) & 3 */ + aml_append(while_ctx, + aml_store(aml_and(aml_add(pin, slot, NULL), aml_int(3), NULL), + lnk_idx)); + + /* route[2] = "LNK[D|A|B|C]", selection based on pin % 3 */ + aml_append(while_ctx, initialize_route(route, "LNKD", lnk_idx, 0)); + if (is_pci0_prt) { + Aml *if_device_1, *if_pin_4, *else_pin_4; + + /* device 1 is the power-management device, needs SCI */ + if_device_1 = aml_if(aml_equal(lnk_idx, aml_int(1))); + { + if_pin_4 = aml_if(aml_equal(pin, aml_int(4))); + { + aml_append(if_pin_4, + aml_store(build_prt_entry("LNKS"), route)); + } + aml_append(if_device_1, if_pin_4); + else_pin_4 = aml_else(); + { + aml_append(else_pin_4, + aml_store(build_prt_entry("LNKA"), route)); + } + aml_append(if_device_1, else_pin_4); + } + aml_append(while_ctx, if_device_1); + } else { + aml_append(while_ctx, initialize_route(route, "LNKA", lnk_idx, 1)); + } + aml_append(while_ctx, initialize_route(route, "LNKB", lnk_idx, 2)); + aml_append(while_ctx, initialize_route(route, "LNKC", lnk_idx, 3)); + + /* route[0] = 0x[slot]FFFF */ + aml_append(while_ctx, + aml_store(aml_or(aml_shiftleft(slot, aml_int(16)), aml_int(0xFFFF), + NULL), + aml_index(route, aml_int(0)))); + /* route[1] = pin & 3 */ + aml_append(while_ctx, + aml_store(aml_and(pin, aml_int(3), NULL), + aml_index(route, aml_int(1)))); + /* res[pin] = route */ + aml_append(while_ctx, aml_store(route, aml_index(res, pin))); + /* pin++ */ + 
aml_append(while_ctx, aml_increment(pin)); + } + aml_append(method, while_ctx); + /* return res*/ + aml_append(method, aml_return(res)); + + return method; +} + +static void build_hpet_aml(Aml *table) +{ + Aml *crs; + Aml *field; + Aml *method; + Aml *if_ctx; + Aml *scope = aml_scope("_SB"); + Aml *dev = aml_device("HPET"); + Aml *zero = aml_int(0); + Aml *id = aml_local(0); + Aml *period = aml_local(1); + + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0103"))); + aml_append(dev, aml_name_decl("_UID", zero)); + + aml_append(dev, + aml_operation_region("HPTM", AML_SYSTEM_MEMORY, aml_int(HPET_BASE), + HPET_LEN)); + field = aml_field("HPTM", AML_DWORD_ACC, AML_LOCK, AML_PRESERVE); + aml_append(field, aml_named_field("VEND", 32)); + aml_append(field, aml_named_field("PRD", 32)); + aml_append(dev, field); + + method = aml_method("_STA", 0, AML_NOTSERIALIZED); + aml_append(method, aml_store(aml_name("VEND"), id)); + aml_append(method, aml_store(aml_name("PRD"), period)); + aml_append(method, aml_shiftright(id, aml_int(16), id)); + if_ctx = aml_if(aml_lor(aml_equal(id, zero), + aml_equal(id, aml_int(0xffff)))); + { + aml_append(if_ctx, aml_return(zero)); + } + aml_append(method, if_ctx); + + if_ctx = aml_if(aml_lor(aml_equal(period, zero), + aml_lgreater(period, aml_int(100000000)))); + { + aml_append(if_ctx, aml_return(zero)); + } + aml_append(method, if_ctx); + + aml_append(method, aml_return(aml_int(0x0F))); + aml_append(dev, method); + + crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(HPET_BASE, HPET_LEN, AML_READ_ONLY)); + aml_append(dev, aml_name_decl("_CRS", crs)); + + aml_append(scope, dev); + aml_append(table, scope); +} + +static Aml *build_vmbus_device_aml(VMBusBridge *vmbus_bridge) +{ + Aml *dev; + Aml *method; + Aml *crs; + + dev = aml_device("VMBS"); + aml_append(dev, aml_name_decl("STA", aml_int(0xF))); + aml_append(dev, aml_name_decl("_HID", aml_string("VMBus"))); + aml_append(dev, aml_name_decl("_UID", aml_int(0x0))); + aml_append(dev, aml_name_decl("_DDN", aml_string("VMBUS"))); + + method = aml_method("_DIS", 0, AML_NOTSERIALIZED); + aml_append(method, aml_store(aml_and(aml_name("STA"), aml_int(0xD), NULL), + aml_name("STA"))); + aml_append(dev, method); + + method = aml_method("_PS0", 0, AML_NOTSERIALIZED); + aml_append(method, aml_store(aml_or(aml_name("STA"), aml_int(0xF), NULL), + aml_name("STA"))); + aml_append(dev, method); + + method = aml_method("_STA", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_name("STA"))); + aml_append(dev, method); + + aml_append(dev, aml_name_decl("_PS3", aml_int(0x0))); + + crs = aml_resource_template(); + aml_append(crs, aml_irq_no_flags(vmbus_bridge->irq)); + aml_append(dev, aml_name_decl("_CRS", crs)); + + return dev; +} + +static void build_isa_devices_aml(Aml *table) +{ + bool ambiguous; + Object *obj = object_resolve_path_type("", TYPE_ISA_BUS, &ambiguous); + Aml *scope; + + assert(obj && !ambiguous); + + scope = aml_scope("_SB.PCI0.ISA"); + build_acpi_ipmi_devices(scope, BUS(obj), "\\_SB.PCI0.ISA"); + isa_build_aml(ISA_BUS(obj), scope); + + aml_append(table, scope); +} + +static void build_dbg_aml(Aml *table) +{ + Aml *field; + Aml *method; + Aml *while_ctx; + Aml *scope = aml_scope("\\"); + Aml *buf = aml_local(0); + Aml *len = aml_local(1); + Aml *idx = aml_local(2); + + aml_append(scope, + aml_operation_region("DBG", AML_SYSTEM_IO, aml_int(0x0402), 0x01)); + field = aml_field("DBG", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE); + aml_append(field, aml_named_field("DBGB", 8)); + aml_append(scope, 
field); + + method = aml_method("DBUG", 1, AML_NOTSERIALIZED); + + aml_append(method, aml_to_hexstring(aml_arg(0), buf)); + aml_append(method, aml_to_buffer(buf, buf)); + aml_append(method, aml_subtract(aml_sizeof(buf), aml_int(1), len)); + aml_append(method, aml_store(aml_int(0), idx)); + + while_ctx = aml_while(aml_lless(idx, len)); + aml_append(while_ctx, + aml_store(aml_derefof(aml_index(buf, idx)), aml_name("DBGB"))); + aml_append(while_ctx, aml_increment(idx)); + aml_append(method, while_ctx); + + aml_append(method, aml_store(aml_int(0x0A), aml_name("DBGB"))); + aml_append(scope, method); + + aml_append(table, scope); +} + +static Aml *build_link_dev(const char *name, uint8_t uid, Aml *reg) +{ + Aml *dev; + Aml *crs; + Aml *method; + uint32_t irqs[] = {5, 10, 11}; + + dev = aml_device("%s", name); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0C0F"))); + aml_append(dev, aml_name_decl("_UID", aml_int(uid))); + + crs = aml_resource_template(); + aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH, + AML_SHARED, irqs, ARRAY_SIZE(irqs))); + aml_append(dev, aml_name_decl("_PRS", crs)); + + method = aml_method("_STA", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_call1("IQST", reg))); + aml_append(dev, method); + + method = aml_method("_DIS", 0, AML_NOTSERIALIZED); + aml_append(method, aml_or(reg, aml_int(0x80), reg)); + aml_append(dev, method); + + method = aml_method("_CRS", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_call1("IQCR", reg))); + aml_append(dev, method); + + method = aml_method("_SRS", 1, AML_NOTSERIALIZED); + aml_append(method, aml_create_dword_field(aml_arg(0), aml_int(5), "PRRI")); + aml_append(method, aml_store(aml_name("PRRI"), reg)); + aml_append(dev, method); + + return dev; + } + +static Aml *build_gsi_link_dev(const char *name, uint8_t uid, uint8_t gsi) +{ + Aml *dev; + Aml *crs; + Aml *method; + uint32_t irqs; + + dev = aml_device("%s", name); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0C0F"))); + aml_append(dev, aml_name_decl("_UID", aml_int(uid))); + + crs = aml_resource_template(); + irqs = gsi; + aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH, + AML_SHARED, &irqs, 1)); + aml_append(dev, aml_name_decl("_PRS", crs)); + + aml_append(dev, aml_name_decl("_CRS", crs)); + + /* + * _DIS can be no-op because the interrupt cannot be disabled. 
+ */ + method = aml_method("_DIS", 0, AML_NOTSERIALIZED); + aml_append(dev, method); + + method = aml_method("_SRS", 1, AML_NOTSERIALIZED); + aml_append(dev, method); + + return dev; +} + +/* _CRS method - get current settings */ +static Aml *build_iqcr_method(bool is_piix4) +{ + Aml *if_ctx; + uint32_t irqs; + Aml *method = aml_method("IQCR", 1, AML_SERIALIZED); + Aml *crs = aml_resource_template(); + + irqs = 0; + aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, + AML_ACTIVE_HIGH, AML_SHARED, &irqs, 1)); + aml_append(method, aml_name_decl("PRR0", crs)); + + aml_append(method, + aml_create_dword_field(aml_name("PRR0"), aml_int(5), "PRRI")); + + if (is_piix4) { + if_ctx = aml_if(aml_lless(aml_arg(0), aml_int(0x80))); + aml_append(if_ctx, aml_store(aml_arg(0), aml_name("PRRI"))); + aml_append(method, if_ctx); + } else { + aml_append(method, + aml_store(aml_and(aml_arg(0), aml_int(0xF), NULL), + aml_name("PRRI"))); + } + + aml_append(method, aml_return(aml_name("PRR0"))); + return method; +} + +/* _STA method - get status */ +static Aml *build_irq_status_method(void) +{ + Aml *if_ctx; + Aml *method = aml_method("IQST", 1, AML_NOTSERIALIZED); + + if_ctx = aml_if(aml_and(aml_int(0x80), aml_arg(0), NULL)); + aml_append(if_ctx, aml_return(aml_int(0x09))); + aml_append(method, if_ctx); + aml_append(method, aml_return(aml_int(0x0B))); + return method; +} + +static void build_piix4_pci0_int(Aml *table) +{ + Aml *dev; + Aml *crs; + Aml *field; + Aml *method; + uint32_t irqs; + Aml *sb_scope = aml_scope("_SB"); + Aml *pci0_scope = aml_scope("PCI0"); + + aml_append(pci0_scope, build_prt(true)); + aml_append(sb_scope, pci0_scope); + + field = aml_field("PCI0.ISA.P40C", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE); + aml_append(field, aml_named_field("PRQ0", 8)); + aml_append(field, aml_named_field("PRQ1", 8)); + aml_append(field, aml_named_field("PRQ2", 8)); + aml_append(field, aml_named_field("PRQ3", 8)); + aml_append(sb_scope, field); + + aml_append(sb_scope, build_irq_status_method()); + aml_append(sb_scope, build_iqcr_method(true)); + + aml_append(sb_scope, build_link_dev("LNKA", 0, aml_name("PRQ0"))); + aml_append(sb_scope, build_link_dev("LNKB", 1, aml_name("PRQ1"))); + aml_append(sb_scope, build_link_dev("LNKC", 2, aml_name("PRQ2"))); + aml_append(sb_scope, build_link_dev("LNKD", 3, aml_name("PRQ3"))); + + dev = aml_device("LNKS"); + { + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0C0F"))); + aml_append(dev, aml_name_decl("_UID", aml_int(4))); + + crs = aml_resource_template(); + irqs = 9; + aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, + AML_ACTIVE_HIGH, AML_SHARED, + &irqs, 1)); + aml_append(dev, aml_name_decl("_PRS", crs)); + + /* The SCI cannot be disabled and is always attached to GSI 9, + * so these are no-ops. We only need this link to override the + * polarity to active high and match the content of the MADT. + */ + method = aml_method("_STA", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_int(0x0b))); + aml_append(dev, method); + + method = aml_method("_DIS", 0, AML_NOTSERIALIZED); + aml_append(dev, method); + + method = aml_method("_CRS", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_name("_PRS"))); + aml_append(dev, method); + + method = aml_method("_SRS", 1, AML_NOTSERIALIZED); + aml_append(dev, method); + } + aml_append(sb_scope, dev); + + aml_append(table, sb_scope); +} + +static void append_q35_prt_entry(Aml *ctx, uint32_t nr, const char *name) +{ + int i; + int head; + Aml *pkg; + char base = name[3] < 'E' ? 
'A' : 'E'; + char *s = g_strdup(name); + Aml *a_nr = aml_int((nr << 16) | 0xffff); + + assert(strlen(s) == 4); + + head = name[3] - base; + for (i = 0; i < 4; i++) { + if (head + i > 3) { + head = i * -1; + } + s[3] = base + head + i; + pkg = aml_package(4); + aml_append(pkg, a_nr); + aml_append(pkg, aml_int(i)); + aml_append(pkg, aml_name("%s", s)); + aml_append(pkg, aml_int(0)); + aml_append(ctx, pkg); + } + g_free(s); +} + +static Aml *build_q35_routing_table(const char *str) +{ + int i; + Aml *pkg; + char *name = g_strdup_printf("%s ", str); + + pkg = aml_package(128); + for (i = 0; i < 0x18; i++) { + name[3] = 'E' + (i & 0x3); + append_q35_prt_entry(pkg, i, name); + } + + name[3] = 'E'; + append_q35_prt_entry(pkg, 0x18, name); + + /* INTA -> PIRQA for slot 25 - 31, see the default value of D<N>IR */ + for (i = 0x0019; i < 0x1e; i++) { + name[3] = 'A'; + append_q35_prt_entry(pkg, i, name); + } + + /* PCIe->PCI bridge. use PIRQ[E-H] */ + name[3] = 'E'; + append_q35_prt_entry(pkg, 0x1e, name); + name[3] = 'A'; + append_q35_prt_entry(pkg, 0x1f, name); + + g_free(name); + return pkg; +} + +static void build_q35_pci0_int(Aml *table) +{ + Aml *field; + Aml *method; + Aml *sb_scope = aml_scope("_SB"); + Aml *pci0_scope = aml_scope("PCI0"); + + /* Zero => PIC mode, One => APIC Mode */ + aml_append(table, aml_name_decl("PICF", aml_int(0))); + method = aml_method("_PIC", 1, AML_NOTSERIALIZED); + { + aml_append(method, aml_store(aml_arg(0), aml_name("PICF"))); + } + aml_append(table, method); + + aml_append(pci0_scope, + aml_name_decl("PRTP", build_q35_routing_table("LNK"))); + aml_append(pci0_scope, + aml_name_decl("PRTA", build_q35_routing_table("GSI"))); + + method = aml_method("_PRT", 0, AML_NOTSERIALIZED); + { + Aml *if_ctx; + Aml *else_ctx; + + /* PCI IRQ routing table, example from ACPI 2.0a specification, + section 6.2.8.1 */ + /* Note: we provide the same info as the PCI routing + table of the Bochs BIOS */ + if_ctx = aml_if(aml_equal(aml_name("PICF"), aml_int(0))); + aml_append(if_ctx, aml_return(aml_name("PRTP"))); + aml_append(method, if_ctx); + else_ctx = aml_else(); + aml_append(else_ctx, aml_return(aml_name("PRTA"))); + aml_append(method, else_ctx); + } + aml_append(pci0_scope, method); + aml_append(sb_scope, pci0_scope); + + field = aml_field("PCI0.ISA.PIRQ", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE); + aml_append(field, aml_named_field("PRQA", 8)); + aml_append(field, aml_named_field("PRQB", 8)); + aml_append(field, aml_named_field("PRQC", 8)); + aml_append(field, aml_named_field("PRQD", 8)); + aml_append(field, aml_reserved_field(0x20)); + aml_append(field, aml_named_field("PRQE", 8)); + aml_append(field, aml_named_field("PRQF", 8)); + aml_append(field, aml_named_field("PRQG", 8)); + aml_append(field, aml_named_field("PRQH", 8)); + aml_append(sb_scope, field); + + aml_append(sb_scope, build_irq_status_method()); + aml_append(sb_scope, build_iqcr_method(false)); + + aml_append(sb_scope, build_link_dev("LNKA", 0, aml_name("PRQA"))); + aml_append(sb_scope, build_link_dev("LNKB", 1, aml_name("PRQB"))); + aml_append(sb_scope, build_link_dev("LNKC", 2, aml_name("PRQC"))); + aml_append(sb_scope, build_link_dev("LNKD", 3, aml_name("PRQD"))); + aml_append(sb_scope, build_link_dev("LNKE", 4, aml_name("PRQE"))); + aml_append(sb_scope, build_link_dev("LNKF", 5, aml_name("PRQF"))); + aml_append(sb_scope, build_link_dev("LNKG", 6, aml_name("PRQG"))); + aml_append(sb_scope, build_link_dev("LNKH", 7, aml_name("PRQH"))); + + aml_append(sb_scope, build_gsi_link_dev("GSIA", 0x10, 0x10)); + 
aml_append(sb_scope, build_gsi_link_dev("GSIB", 0x11, 0x11)); + aml_append(sb_scope, build_gsi_link_dev("GSIC", 0x12, 0x12)); + aml_append(sb_scope, build_gsi_link_dev("GSID", 0x13, 0x13)); + aml_append(sb_scope, build_gsi_link_dev("GSIE", 0x14, 0x14)); + aml_append(sb_scope, build_gsi_link_dev("GSIF", 0x15, 0x15)); + aml_append(sb_scope, build_gsi_link_dev("GSIG", 0x16, 0x16)); + aml_append(sb_scope, build_gsi_link_dev("GSIH", 0x17, 0x17)); + + aml_append(table, sb_scope); +} + +static Aml *build_q35_dram_controller(const AcpiMcfgInfo *mcfg) +{ + Aml *dev; + Aml *resource_template; + + /* DRAM controller */ + dev = aml_device("DRAC"); + aml_append(dev, aml_name_decl("_HID", aml_string("PNP0C01"))); + + resource_template = aml_resource_template(); + if (mcfg->base + mcfg->size - 1 >= (1ULL << 32)) { + aml_append(resource_template, + aml_qword_memory(AML_POS_DECODE, + AML_MIN_FIXED, + AML_MAX_FIXED, + AML_NON_CACHEABLE, + AML_READ_WRITE, + 0x0000000000000000, + mcfg->base, + mcfg->base + mcfg->size - 1, + 0x0000000000000000, + mcfg->size)); + } else { + aml_append(resource_template, + aml_dword_memory(AML_POS_DECODE, + AML_MIN_FIXED, + AML_MAX_FIXED, + AML_NON_CACHEABLE, + AML_READ_WRITE, + 0x0000000000000000, + mcfg->base, + mcfg->base + mcfg->size - 1, + 0x0000000000000000, + mcfg->size)); + } + aml_append(dev, aml_name_decl("_CRS", resource_template)); + + return dev; +} + +static void build_q35_isa_bridge(Aml *table) +{ + Aml *dev; + Aml *scope; + + scope = aml_scope("_SB.PCI0"); + dev = aml_device("ISA"); + aml_append(dev, aml_name_decl("_ADR", aml_int(0x001F0000))); + + /* ICH9 PCI to ISA irq remapping */ + aml_append(dev, aml_operation_region("PIRQ", AML_PCI_CONFIG, + aml_int(0x60), 0x0C)); + + aml_append(scope, dev); + aml_append(table, scope); +} + +static void build_piix4_isa_bridge(Aml *table) +{ + Aml *dev; + Aml *scope; + + scope = aml_scope("_SB.PCI0"); + dev = aml_device("ISA"); + aml_append(dev, aml_name_decl("_ADR", aml_int(0x00010000))); + + /* PIIX PCI to ISA irq remapping */ + aml_append(dev, aml_operation_region("P40C", AML_PCI_CONFIG, + aml_int(0x60), 0x04)); + + aml_append(scope, dev); + aml_append(table, scope); +} + +static void build_x86_acpi_pci_hotplug(Aml *table, uint64_t pcihp_addr) +{ + Aml *scope; + Aml *field; + Aml *method; + + scope = aml_scope("_SB.PCI0"); + + aml_append(scope, + aml_operation_region("PCST", AML_SYSTEM_IO, aml_int(pcihp_addr), 0x08)); + field = aml_field("PCST", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS); + aml_append(field, aml_named_field("PCIU", 32)); + aml_append(field, aml_named_field("PCID", 32)); + aml_append(scope, field); + + aml_append(scope, + aml_operation_region("SEJ", AML_SYSTEM_IO, + aml_int(pcihp_addr + ACPI_PCIHP_SEJ_BASE), 0x04)); + field = aml_field("SEJ", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS); + aml_append(field, aml_named_field("B0EJ", 32)); + aml_append(scope, field); + + aml_append(scope, + aml_operation_region("BNMR", AML_SYSTEM_IO, + aml_int(pcihp_addr + ACPI_PCIHP_BNMR_BASE), 0x08)); + field = aml_field("BNMR", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS); + aml_append(field, aml_named_field("BNUM", 32)); + aml_append(field, aml_named_field("PIDX", 32)); + aml_append(scope, field); + + aml_append(scope, aml_mutex("BLCK", 0)); + + method = aml_method("PCEJ", 2, AML_NOTSERIALIZED); + aml_append(method, aml_acquire(aml_name("BLCK"), 0xFFFF)); + aml_append(method, aml_store(aml_arg(0), aml_name("BNUM"))); + aml_append(method, + aml_store(aml_shiftleft(aml_int(1), aml_arg(1)), aml_name("B0EJ"))); + 
aml_append(method, aml_release(aml_name("BLCK"))); + aml_append(method, aml_return(aml_int(0))); + aml_append(scope, method); + + method = aml_method("AIDX", 2, AML_NOTSERIALIZED); + aml_append(method, aml_acquire(aml_name("BLCK"), 0xFFFF)); + aml_append(method, aml_store(aml_arg(0), aml_name("BNUM"))); + aml_append(method, + aml_store(aml_shiftleft(aml_int(1), aml_arg(1)), aml_name("PIDX"))); + aml_append(method, aml_store(aml_name("PIDX"), aml_local(0))); + aml_append(method, aml_release(aml_name("BLCK"))); + aml_append(method, aml_return(aml_local(0))); + aml_append(scope, method); + + aml_append(scope, aml_pci_device_dsm()); + + aml_append(table, scope); +} + +static Aml *build_q35_osc_method(bool enable_native_pcie_hotplug) +{ + Aml *if_ctx; + Aml *if_ctx2; + Aml *else_ctx; + Aml *method; + Aml *a_cwd1 = aml_name("CDW1"); + Aml *a_ctrl = aml_local(0); + + method = aml_method("_OSC", 4, AML_NOTSERIALIZED); + aml_append(method, aml_create_dword_field(aml_arg(3), aml_int(0), "CDW1")); + + if_ctx = aml_if(aml_equal( + aml_arg(0), aml_touuid("33DB4D5B-1FF7-401C-9657-7441C03DD766"))); + aml_append(if_ctx, aml_create_dword_field(aml_arg(3), aml_int(4), "CDW2")); + aml_append(if_ctx, aml_create_dword_field(aml_arg(3), aml_int(8), "CDW3")); + + aml_append(if_ctx, aml_store(aml_name("CDW3"), a_ctrl)); + + /* + * Always allow native PME, AER (no dependencies) + * Allow SHPC (PCI bridges can have SHPC controller) + * Disable PCIe Native Hot-plug if ACPI PCI Hot-plug is enabled. + */ + aml_append(if_ctx, aml_and(a_ctrl, + aml_int(0x1E | (enable_native_pcie_hotplug ? 0x1 : 0x0)), a_ctrl)); + + if_ctx2 = aml_if(aml_lnot(aml_equal(aml_arg(1), aml_int(1)))); + /* Unknown revision */ + aml_append(if_ctx2, aml_or(a_cwd1, aml_int(0x08), a_cwd1)); + aml_append(if_ctx, if_ctx2); + + if_ctx2 = aml_if(aml_lnot(aml_equal(aml_name("CDW3"), a_ctrl))); + /* Capabilities bits were masked */ + aml_append(if_ctx2, aml_or(a_cwd1, aml_int(0x10), a_cwd1)); + aml_append(if_ctx, if_ctx2); + + /* Update DWORD3 in the buffer */ + aml_append(if_ctx, aml_store(a_ctrl, aml_name("CDW3"))); + aml_append(method, if_ctx); + + else_ctx = aml_else(); + /* Unrecognized UUID */ + aml_append(else_ctx, aml_or(a_cwd1, aml_int(4), a_cwd1)); + aml_append(method, else_ctx); + + aml_append(method, aml_return(aml_arg(3))); + return method; +} + +static void build_smb0(Aml *table, I2CBus *smbus, int devnr, int func) +{ + Aml *scope = aml_scope("_SB.PCI0"); + Aml *dev = aml_device("SMB0"); + + aml_append(dev, aml_name_decl("_ADR", aml_int(devnr << 16 | func))); + build_acpi_ipmi_devices(dev, BUS(smbus), "\\_SB.PCI0.SMB0"); + aml_append(scope, dev); + aml_append(table, scope); +} + +static void +build_dsdt(GArray *table_data, BIOSLinker *linker, + AcpiPmInfo *pm, AcpiMiscInfo *misc, + Range *pci_hole, Range *pci_hole64, MachineState *machine) +{ + CrsRangeEntry *entry; + Aml *dsdt, *sb_scope, *scope, *dev, *method, *field, *pkg, *crs; + CrsRangeSet crs_range_set; + PCMachineState *pcms = PC_MACHINE(machine); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(machine); + X86MachineState *x86ms = X86_MACHINE(machine); + AcpiMcfgInfo mcfg; + bool mcfg_valid = !!acpi_get_mcfg(&mcfg); + uint32_t nr_mem = machine->ram_slots; + int root_bus_limit = 0xFF; + PCIBus *bus = NULL; +#ifdef CONFIG_TPM + TPMIf *tpm = tpm_find(); +#endif + int i; + VMBusBridge *vmbus_bridge = vmbus_bridge_find(); + AcpiTable table = { .sig = "DSDT", .rev = 1, .oem_id = x86ms->oem_id, + .oem_table_id = x86ms->oem_table_id }; + + acpi_table_begin(&table, table_data); + dsdt = 
init_aml_allocator(); + + build_dbg_aml(dsdt); + if (misc->is_piix4) { + sb_scope = aml_scope("_SB"); + dev = aml_device("PCI0"); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A03"))); + aml_append(dev, aml_name_decl("_ADR", aml_int(0))); + aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid))); + aml_append(sb_scope, dev); + aml_append(dsdt, sb_scope); + + if (misc->has_hpet) { + build_hpet_aml(dsdt); + } + build_piix4_isa_bridge(dsdt); + build_isa_devices_aml(dsdt); + if (pm->pcihp_bridge_en || pm->pcihp_root_en) { + build_x86_acpi_pci_hotplug(dsdt, pm->pcihp_io_base); + } + build_piix4_pci0_int(dsdt); + } else { + sb_scope = aml_scope("_SB"); + dev = aml_device("PCI0"); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A08"))); + aml_append(dev, aml_name_decl("_CID", aml_eisaid("PNP0A03"))); + aml_append(dev, aml_name_decl("_ADR", aml_int(0))); + aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid))); + aml_append(dev, build_q35_osc_method(!pm->pcihp_bridge_en)); + aml_append(sb_scope, dev); + if (mcfg_valid) { + aml_append(sb_scope, build_q35_dram_controller(&mcfg)); + } + + if (pm->smi_on_cpuhp) { + /* reserve SMI block resources, IO ports 0xB2, 0xB3 */ + dev = aml_device("PCI0.SMI0"); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A06"))); + aml_append(dev, aml_name_decl("_UID", aml_string("SMI resources"))); + crs = aml_resource_template(); + aml_append(crs, + aml_io( + AML_DECODE16, + ACPI_PORT_SMI_CMD, + ACPI_PORT_SMI_CMD, + 1, + 2) + ); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(dev, aml_operation_region("SMIR", AML_SYSTEM_IO, + aml_int(ACPI_PORT_SMI_CMD), 2)); + field = aml_field("SMIR", AML_BYTE_ACC, AML_NOLOCK, + AML_WRITE_AS_ZEROS); + aml_append(field, aml_named_field("SMIC", 8)); + aml_append(field, aml_reserved_field(8)); + aml_append(dev, field); + aml_append(sb_scope, dev); + } + + aml_append(dsdt, sb_scope); + + if (misc->has_hpet) { + build_hpet_aml(dsdt); + } + build_q35_isa_bridge(dsdt); + build_isa_devices_aml(dsdt); + if (pm->pcihp_bridge_en) { + build_x86_acpi_pci_hotplug(dsdt, pm->pcihp_io_base); + } + build_q35_pci0_int(dsdt); + if (pcms->smbus && !pcmc->do_not_add_smb_acpi) { + build_smb0(dsdt, pcms->smbus, ICH9_SMB_DEV, ICH9_SMB_FUNC); + } + } + + if (vmbus_bridge) { + sb_scope = aml_scope("_SB"); + aml_append(sb_scope, build_vmbus_device_aml(vmbus_bridge)); + aml_append(dsdt, sb_scope); + } + + if (pcmc->legacy_cpu_hotplug) { + build_legacy_cpu_hotplug_aml(dsdt, machine, pm->cpu_hp_io_base); + } else { + CPUHotplugFeatures opts = { + .acpi_1_compatible = true, .has_legacy_cphp = true, + .smi_path = pm->smi_on_cpuhp ? 
"\\_SB.PCI0.SMI0.SMIC" : NULL, + .fw_unplugs_cpu = pm->smi_on_cpu_unplug, + }; + build_cpus_aml(dsdt, machine, opts, pm->cpu_hp_io_base, + "\\_SB.PCI0", "\\_GPE._E02"); + } + + if (pcms->memhp_io_base && nr_mem) { + build_memory_hotplug_aml(dsdt, nr_mem, "\\_SB.PCI0", + "\\_GPE._E03", AML_SYSTEM_IO, + pcms->memhp_io_base); + } + + scope = aml_scope("_GPE"); + { + aml_append(scope, aml_name_decl("_HID", aml_string("ACPI0006"))); + + if (pm->pcihp_bridge_en || pm->pcihp_root_en) { + method = aml_method("_E01", 0, AML_NOTSERIALIZED); + aml_append(method, + aml_acquire(aml_name("\\_SB.PCI0.BLCK"), 0xFFFF)); + aml_append(method, aml_call0("\\_SB.PCI0.PCNT")); + aml_append(method, aml_release(aml_name("\\_SB.PCI0.BLCK"))); + aml_append(scope, method); + } + + if (machine->nvdimms_state->is_enabled) { + method = aml_method("_E04", 0, AML_NOTSERIALIZED); + aml_append(method, aml_notify(aml_name("\\_SB.NVDR"), + aml_int(0x80))); + aml_append(scope, method); + } + } + aml_append(dsdt, scope); + + crs_range_set_init(&crs_range_set); + bus = PC_MACHINE(machine)->bus; + if (bus) { + QLIST_FOREACH(bus, &bus->child, sibling) { + uint8_t bus_num = pci_bus_num(bus); + uint8_t numa_node = pci_bus_numa_node(bus); + + /* look only for expander root buses */ + if (!pci_bus_is_root(bus)) { + continue; + } + + if (bus_num < root_bus_limit) { + root_bus_limit = bus_num - 1; + } + + scope = aml_scope("\\_SB"); + dev = aml_device("PC%.02X", bus_num); + aml_append(dev, aml_name_decl("_UID", aml_int(bus_num))); + aml_append(dev, aml_name_decl("_BBN", aml_int(bus_num))); + if (pci_bus_is_express(bus)) { + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A08"))); + aml_append(dev, aml_name_decl("_CID", aml_eisaid("PNP0A03"))); + + /* Expander bridges do not have ACPI PCI Hot-plug enabled */ + aml_append(dev, build_q35_osc_method(true)); + } else { + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A03"))); + } + + if (numa_node != NUMA_NODE_UNASSIGNED) { + aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node))); + } + + aml_append(dev, build_prt(false)); + crs = build_crs(PCI_HOST_BRIDGE(BUS(bus)->parent), &crs_range_set, + 0, 0, 0, 0); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + aml_append(dsdt, scope); + } + } + + /* + * At this point crs_range_set has all the ranges used by pci + * busses *other* than PCI0. These ranges will be excluded from + * the PCI0._CRS. Add mmconfig to the set so it will be excluded + * too. 
+ */ + if (mcfg_valid) { + crs_range_insert(crs_range_set.mem_ranges, + mcfg.base, mcfg.base + mcfg.size - 1); + } + + scope = aml_scope("\\_SB.PCI0"); + /* build PCI0._CRS */ + crs = aml_resource_template(); + aml_append(crs, + aml_word_bus_number(AML_MIN_FIXED, AML_MAX_FIXED, AML_POS_DECODE, + 0x0000, 0x0, root_bus_limit, + 0x0000, root_bus_limit + 1)); + aml_append(crs, aml_io(AML_DECODE16, 0x0CF8, 0x0CF8, 0x01, 0x08)); + + aml_append(crs, + aml_word_io(AML_MIN_FIXED, AML_MAX_FIXED, + AML_POS_DECODE, AML_ENTIRE_RANGE, + 0x0000, 0x0000, 0x0CF7, 0x0000, 0x0CF8)); + + crs_replace_with_free_ranges(crs_range_set.io_ranges, 0x0D00, 0xFFFF); + for (i = 0; i < crs_range_set.io_ranges->len; i++) { + entry = g_ptr_array_index(crs_range_set.io_ranges, i); + aml_append(crs, + aml_word_io(AML_MIN_FIXED, AML_MAX_FIXED, + AML_POS_DECODE, AML_ENTIRE_RANGE, + 0x0000, entry->base, entry->limit, + 0x0000, entry->limit - entry->base + 1)); + } + + aml_append(crs, + aml_dword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED, + AML_CACHEABLE, AML_READ_WRITE, + 0, 0x000A0000, 0x000BFFFF, 0, 0x00020000)); + + crs_replace_with_free_ranges(crs_range_set.mem_ranges, + range_lob(pci_hole), + range_upb(pci_hole)); + for (i = 0; i < crs_range_set.mem_ranges->len; i++) { + entry = g_ptr_array_index(crs_range_set.mem_ranges, i); + aml_append(crs, + aml_dword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED, + AML_NON_CACHEABLE, AML_READ_WRITE, + 0, entry->base, entry->limit, + 0, entry->limit - entry->base + 1)); + } + + if (!range_is_empty(pci_hole64)) { + crs_replace_with_free_ranges(crs_range_set.mem_64bit_ranges, + range_lob(pci_hole64), + range_upb(pci_hole64)); + for (i = 0; i < crs_range_set.mem_64bit_ranges->len; i++) { + entry = g_ptr_array_index(crs_range_set.mem_64bit_ranges, i); + aml_append(crs, + aml_qword_memory(AML_POS_DECODE, AML_MIN_FIXED, + AML_MAX_FIXED, + AML_CACHEABLE, AML_READ_WRITE, + 0, entry->base, entry->limit, + 0, entry->limit - entry->base + 1)); + } + } + +#ifdef CONFIG_TPM + if (TPM_IS_TIS_ISA(tpm_find())) { + aml_append(crs, aml_memory32_fixed(TPM_TIS_ADDR_BASE, + TPM_TIS_ADDR_SIZE, AML_READ_WRITE)); + } +#endif + aml_append(scope, aml_name_decl("_CRS", crs)); + + /* reserve GPE0 block resources */ + dev = aml_device("GPE0"); + aml_append(dev, aml_name_decl("_HID", aml_string("PNP0A06"))); + aml_append(dev, aml_name_decl("_UID", aml_string("GPE0 resources"))); + /* device present, functioning, decoding, not shown in UI */ + aml_append(dev, aml_name_decl("_STA", aml_int(0xB))); + crs = aml_resource_template(); + aml_append(crs, + aml_io( + AML_DECODE16, + pm->fadt.gpe0_blk.address, + pm->fadt.gpe0_blk.address, + 1, + pm->fadt.gpe0_blk.bit_width / 8) + ); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + + crs_range_set_free(&crs_range_set); + + /* reserve PCIHP resources */ + if (pm->pcihp_io_len && (pm->pcihp_bridge_en || pm->pcihp_root_en)) { + dev = aml_device("PHPR"); + aml_append(dev, aml_name_decl("_HID", aml_string("PNP0A06"))); + aml_append(dev, + aml_name_decl("_UID", aml_string("PCI Hotplug resources"))); + /* device present, functioning, decoding, not shown in UI */ + aml_append(dev, aml_name_decl("_STA", aml_int(0xB))); + crs = aml_resource_template(); + aml_append(crs, + aml_io(AML_DECODE16, pm->pcihp_io_base, pm->pcihp_io_base, 1, + pm->pcihp_io_len) + ); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + } + aml_append(dsdt, scope); + + /* create S3_ / S4_ / S5_ packages if necessary */ + scope = aml_scope("\\"); + if 
(!pm->s3_disabled) { + pkg = aml_package(4); + aml_append(pkg, aml_int(1)); /* PM1a_CNT.SLP_TYP */ + aml_append(pkg, aml_int(1)); /* PM1b_CNT.SLP_TYP, FIXME: not impl. */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(scope, aml_name_decl("_S3", pkg)); + } + + if (!pm->s4_disabled) { + pkg = aml_package(4); + aml_append(pkg, aml_int(pm->s4_val)); /* PM1a_CNT.SLP_TYP */ + /* PM1b_CNT.SLP_TYP, FIXME: not impl. */ + aml_append(pkg, aml_int(pm->s4_val)); + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(scope, aml_name_decl("_S4", pkg)); + } + + pkg = aml_package(4); + aml_append(pkg, aml_int(0)); /* PM1a_CNT.SLP_TYP */ + aml_append(pkg, aml_int(0)); /* PM1b_CNT.SLP_TYP not impl. */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(scope, aml_name_decl("_S5", pkg)); + aml_append(dsdt, scope); + + /* create fw_cfg node, unconditionally */ + { + scope = aml_scope("\\_SB.PCI0"); + fw_cfg_add_acpi_dsdt(scope, x86ms->fw_cfg); + aml_append(dsdt, scope); + } + + if (misc->applesmc_io_base) { + scope = aml_scope("\\_SB.PCI0.ISA"); + dev = aml_device("SMC"); + + aml_append(dev, aml_name_decl("_HID", aml_eisaid("APP0001"))); + /* device present, functioning, decoding, not shown in UI */ + aml_append(dev, aml_name_decl("_STA", aml_int(0xB))); + + crs = aml_resource_template(); + aml_append(crs, + aml_io(AML_DECODE16, misc->applesmc_io_base, misc->applesmc_io_base, + 0x01, APPLESMC_MAX_DATA_LENGTH) + ); + aml_append(crs, aml_irq_no_flags(6)); + aml_append(dev, aml_name_decl("_CRS", crs)); + + aml_append(scope, dev); + aml_append(dsdt, scope); + } + + if (misc->pvpanic_port) { + scope = aml_scope("\\_SB.PCI0.ISA"); + + dev = aml_device("PEVT"); + aml_append(dev, aml_name_decl("_HID", aml_string("QEMU0001"))); + + crs = aml_resource_template(); + aml_append(crs, + aml_io(AML_DECODE16, misc->pvpanic_port, misc->pvpanic_port, 1, 1) + ); + aml_append(dev, aml_name_decl("_CRS", crs)); + + aml_append(dev, aml_operation_region("PEOR", AML_SYSTEM_IO, + aml_int(misc->pvpanic_port), 1)); + field = aml_field("PEOR", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE); + aml_append(field, aml_named_field("PEPT", 8)); + aml_append(dev, field); + + /* device present, functioning, decoding, shown in UI */ + aml_append(dev, aml_name_decl("_STA", aml_int(0xF))); + + method = aml_method("RDPT", 0, AML_NOTSERIALIZED); + aml_append(method, aml_store(aml_name("PEPT"), aml_local(0))); + aml_append(method, aml_return(aml_local(0))); + aml_append(dev, method); + + method = aml_method("WRPT", 1, AML_NOTSERIALIZED); + aml_append(method, aml_store(aml_arg(0), aml_name("PEPT"))); + aml_append(dev, method); + + aml_append(scope, dev); + aml_append(dsdt, scope); + } + + sb_scope = aml_scope("\\_SB"); + { + Object *pci_host; + PCIBus *bus = NULL; + + pci_host = acpi_get_i386_pci_host(); + + if (pci_host) { + bus = PCI_HOST_BRIDGE(pci_host)->bus; + } + + if (bus) { + Aml *scope = aml_scope("PCI0"); + /* Scan all PCI buses. Generate tables to support hotplug. 
*/ + build_append_pci_bus_devices(scope, bus, pm->pcihp_bridge_en); + +#ifdef CONFIG_TPM + if (TPM_IS_TIS_ISA(tpm)) { + if (misc->tpm_version == TPM_VERSION_2_0) { + dev = aml_device("TPM"); + aml_append(dev, aml_name_decl("_HID", + aml_string("MSFT0101"))); + } else { + dev = aml_device("ISA.TPM"); + aml_append(dev, aml_name_decl("_HID", + aml_eisaid("PNP0C31"))); + } + + aml_append(dev, aml_name_decl("_STA", aml_int(0xF))); + crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(TPM_TIS_ADDR_BASE, + TPM_TIS_ADDR_SIZE, AML_READ_WRITE)); + /* + FIXME: TPM_TIS_IRQ=5 conflicts with PNP0C0F irqs, + Rewrite to take IRQ from TPM device model and + fix default IRQ value there to use some unused IRQ + */ + /* aml_append(crs, aml_irq_no_flags(TPM_TIS_IRQ)); */ + aml_append(dev, aml_name_decl("_CRS", crs)); + + tpm_build_ppi_acpi(tpm, dev); + + aml_append(scope, dev); + } +#endif + + aml_append(sb_scope, scope); + } + } + +#ifdef CONFIG_TPM + if (TPM_IS_CRB(tpm)) { + dev = aml_device("TPM"); + aml_append(dev, aml_name_decl("_HID", aml_string("MSFT0101"))); + crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(TPM_CRB_ADDR_BASE, + TPM_CRB_ADDR_SIZE, AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + + aml_append(dev, aml_name_decl("_STA", aml_int(0xf))); + + tpm_build_ppi_acpi(tpm, dev); + + aml_append(sb_scope, dev); + } +#endif + + if (pcms->sgx_epc.size != 0) { + uint64_t epc_base = pcms->sgx_epc.base; + uint64_t epc_size = pcms->sgx_epc.size; + + dev = aml_device("EPC"); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("INT0E0C"))); + aml_append(dev, aml_name_decl("_STR", + aml_unicode("Enclave Page Cache 1.0"))); + crs = aml_resource_template(); + aml_append(crs, + aml_qword_memory(AML_POS_DECODE, AML_MIN_FIXED, + AML_MAX_FIXED, AML_NON_CACHEABLE, + AML_READ_WRITE, 0, epc_base, + epc_base + epc_size - 1, 0, epc_size)); + aml_append(dev, aml_name_decl("_CRS", crs)); + + method = aml_method("_STA", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_int(0x0f))); + aml_append(dev, method); + + aml_append(sb_scope, dev); + } + aml_append(dsdt, sb_scope); + + /* copy AML table into ACPI tables blob and patch header there */ + g_array_append_vals(table_data, dsdt->buf->data, dsdt->buf->len); + acpi_table_end(linker, &table); + free_aml_allocator(); +} + +/* + * IA-PC HPET (High Precision Event Timers) Specification (Revision: 1.0a) + * 3.2.4The ACPI 2.0 HPET Description Table (HPET) + */ +static void +build_hpet(GArray *table_data, BIOSLinker *linker, const char *oem_id, + const char *oem_table_id) +{ + AcpiTable table = { .sig = "HPET", .rev = 1, + .oem_id = oem_id, .oem_table_id = oem_table_id }; + + acpi_table_begin(&table, table_data); + /* Note timer_block_id value must be kept in sync with value advertised by + * emulated hpet + */ + /* Event Timer Block ID */ + build_append_int_noprefix(table_data, 0x8086a201, 4); + /* BASE_ADDRESS */ + build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 0, 0, 0, HPET_BASE); + /* HPET Number */ + build_append_int_noprefix(table_data, 0, 1); + /* Main Counter Minimum Clock_tick in Periodic Mode */ + build_append_int_noprefix(table_data, 0, 2); + /* Page Protection And OEM Attribute */ + build_append_int_noprefix(table_data, 0, 1); + acpi_table_end(linker, &table); +} + +#ifdef CONFIG_TPM +/* + * TCPA Description Table + * + * Following Level 00, Rev 00.37 of specs: + * http://www.trustedcomputinggroup.org/resources/tcg_acpi_specification + * 7.1.2 ACPI Table Layout + */ +static void +build_tpm_tcpa(GArray 
*table_data, BIOSLinker *linker, GArray *tcpalog, + const char *oem_id, const char *oem_table_id) +{ + unsigned log_addr_offset; + AcpiTable table = { .sig = "TCPA", .rev = 2, + .oem_id = oem_id, .oem_table_id = oem_table_id }; + + acpi_table_begin(&table, table_data); + /* Platform Class */ + build_append_int_noprefix(table_data, TPM_TCPA_ACPI_CLASS_CLIENT, 2); + /* Log Area Minimum Length (LAML) */ + build_append_int_noprefix(table_data, TPM_LOG_AREA_MINIMUM_SIZE, 4); + /* Log Area Start Address (LASA) */ + log_addr_offset = table_data->len; + build_append_int_noprefix(table_data, 0, 8); + + /* allocate/reserve space for TPM log area */ + acpi_data_push(tcpalog, TPM_LOG_AREA_MINIMUM_SIZE); + bios_linker_loader_alloc(linker, ACPI_BUILD_TPMLOG_FILE, tcpalog, 1, + false /* high memory */); + /* log area start address to be filled by Guest linker */ + bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE, + log_addr_offset, 8, ACPI_BUILD_TPMLOG_FILE, 0); + + acpi_table_end(linker, &table); +} +#endif + +#define HOLE_640K_START (640 * KiB) +#define HOLE_640K_END (1 * MiB) + +/* + * ACPI spec, Revision 3.0 + * 5.2.15 System Resource Affinity Table (SRAT) + */ +static void +build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) +{ + int i; + int numa_mem_start, slots; + uint64_t mem_len, mem_base, next_base; + MachineClass *mc = MACHINE_GET_CLASS(machine); + X86MachineState *x86ms = X86_MACHINE(machine); + const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(machine); + PCMachineState *pcms = PC_MACHINE(machine); + int nb_numa_nodes = machine->numa_state->num_nodes; + NodeInfo *numa_info = machine->numa_state->nodes; + ram_addr_t hotpluggable_address_space_size = + object_property_get_int(OBJECT(pcms), PC_MACHINE_DEVMEM_REGION_SIZE, + NULL); + AcpiTable table = { .sig = "SRAT", .rev = 1, .oem_id = x86ms->oem_id, + .oem_table_id = x86ms->oem_table_id }; + + acpi_table_begin(&table, table_data); + build_append_int_noprefix(table_data, 1, 4); /* Reserved */ + build_append_int_noprefix(table_data, 0, 8); /* Reserved */ + + for (i = 0; i < apic_ids->len; i++) { + int node_id = apic_ids->cpus[i].props.node_id; + uint32_t apic_id = apic_ids->cpus[i].arch_id; + + if (apic_id < 255) { + /* 5.2.15.1 Processor Local APIC/SAPIC Affinity Structure */ + build_append_int_noprefix(table_data, 0, 1); /* Type */ + build_append_int_noprefix(table_data, 16, 1); /* Length */ + /* Proximity Domain [7:0] */ + build_append_int_noprefix(table_data, node_id, 1); + build_append_int_noprefix(table_data, apic_id, 1); /* APIC ID */ + /* Flags, Table 5-36 */ + build_append_int_noprefix(table_data, 1, 4); + build_append_int_noprefix(table_data, 0, 1); /* Local SAPIC EID */ + /* Proximity Domain [31:8] */ + build_append_int_noprefix(table_data, 0, 3); + build_append_int_noprefix(table_data, 0, 4); /* Reserved */ + } else { + /* + * ACPI spec, Revision 4.0 + * 5.2.16.3 Processor Local x2APIC Affinity Structure + */ + build_append_int_noprefix(table_data, 2, 1); /* Type */ + build_append_int_noprefix(table_data, 24, 1); /* Length */ + build_append_int_noprefix(table_data, 0, 2); /* Reserved */ + /* Proximity Domain */ + build_append_int_noprefix(table_data, node_id, 4); + build_append_int_noprefix(table_data, apic_id, 4); /* X2APIC ID */ + /* Flags, Table 5-39 */ + build_append_int_noprefix(table_data, 1 /* Enabled */, 4); + build_append_int_noprefix(table_data, 0, 4); /* Clock Domain */ + build_append_int_noprefix(table_data, 0, 4); /* Reserved */ + } + } + + /* the memory map is a bit tricky, it 
contains at least one hole + * from 640k-1M and possibly another one from 3.5G-4G. + */ + next_base = 0; + numa_mem_start = table_data->len; + + for (i = 1; i < nb_numa_nodes + 1; ++i) { + mem_base = next_base; + mem_len = numa_info[i - 1].node_mem; + next_base = mem_base + mem_len; + + /* Cut out the 640K hole */ + if (mem_base <= HOLE_640K_START && + next_base > HOLE_640K_START) { + mem_len -= next_base - HOLE_640K_START; + if (mem_len > 0) { + build_srat_memory(table_data, mem_base, mem_len, i - 1, + MEM_AFFINITY_ENABLED); + } + + /* Check for the rare case: 640K < RAM < 1M */ + if (next_base <= HOLE_640K_END) { + next_base = HOLE_640K_END; + continue; + } + mem_base = HOLE_640K_END; + mem_len = next_base - HOLE_640K_END; + } + + /* Cut out the ACPI_PCI hole */ + if (mem_base <= x86ms->below_4g_mem_size && + next_base > x86ms->below_4g_mem_size) { + mem_len -= next_base - x86ms->below_4g_mem_size; + if (mem_len > 0) { + build_srat_memory(table_data, mem_base, mem_len, i - 1, + MEM_AFFINITY_ENABLED); + } + mem_base = 1ULL << 32; + mem_len = next_base - x86ms->below_4g_mem_size; + next_base = mem_base + mem_len; + } + + if (mem_len > 0) { + build_srat_memory(table_data, mem_base, mem_len, i - 1, + MEM_AFFINITY_ENABLED); + } + } + + if (machine->nvdimms_state->is_enabled) { + nvdimm_build_srat(table_data); + } + + /* + * TODO: this part is not in ACPI spec and current linux kernel boots fine + * without these entries. But I recall there were issues the last time I + * tried to remove it with some ancient guest OS, however I can't remember + * what that was so keep this around for now + */ + slots = (table_data->len - numa_mem_start) / 40 /* mem affinity len */; + for (; slots < nb_numa_nodes + 2; slots++) { + build_srat_memory(table_data, 0, 0, 0, MEM_AFFINITY_NOFLAGS); + } + + /* + * Entry is required for Windows to enable memory hotplug in OS + * and for Linux to enable SWIOTLB when booted with less than + * 4G of RAM. Windows works better if the entry sets proximity + * to the highest NUMA node in the machine. + * Memory devices may override proximity set by this entry, + * providing _PXM method if necessary. 
+ */ + if (hotpluggable_address_space_size) { + build_srat_memory(table_data, machine->device_memory->base, + hotpluggable_address_space_size, nb_numa_nodes - 1, + MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); + } + + acpi_table_end(linker, &table); +} + +/* + * Insert DMAR scope for PCI bridges and endpoint devices + */ +static void +insert_scope(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + const size_t device_scope_size = 6 /* device scope structure */ + + 2 /* 1 path entry */; + GArray *scope_blob = opaque; + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { + /* Dmar Scope Type: 0x02 for PCI Bridge */ + build_append_int_noprefix(scope_blob, 0x02, 1); + } else { + /* Dmar Scope Type: 0x01 for PCI Endpoint Device */ + build_append_int_noprefix(scope_blob, 0x01, 1); + } + + /* length */ + build_append_int_noprefix(scope_blob, device_scope_size, 1); + /* reserved */ + build_append_int_noprefix(scope_blob, 0, 2); + /* enumeration_id */ + build_append_int_noprefix(scope_blob, 0, 1); + /* bus */ + build_append_int_noprefix(scope_blob, pci_bus_num(bus), 1); + /* device */ + build_append_int_noprefix(scope_blob, PCI_SLOT(dev->devfn), 1); + /* function */ + build_append_int_noprefix(scope_blob, PCI_FUNC(dev->devfn), 1); +} + +/* For a given PCI host bridge, walk and insert DMAR scope */ +static int +dmar_host_bridges(Object *obj, void *opaque) +{ + GArray *scope_blob = opaque; + + if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { + PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; + + if (bus && !pci_bus_bypass_iommu(bus)) { + pci_for_each_device_under_bus(bus, insert_scope, scope_blob); + } + } + + return 0; +} + +/* + * Intel ® Virtualization Technology for Directed I/O + * Architecture Specification. Revision 3.3 + * 8.1 DMA Remapping Reporting Structure + */ +static void +build_dmar_q35(GArray *table_data, BIOSLinker *linker, const char *oem_id, + const char *oem_table_id) +{ + uint8_t dmar_flags = 0; + uint8_t rsvd10[10] = {}; + /* Root complex IOAPIC uses one path only */ + const size_t ioapic_scope_size = 6 /* device scope structure */ + + 2 /* 1 path entry */; + X86IOMMUState *iommu = x86_iommu_get_default(); + IntelIOMMUState *intel_iommu = INTEL_IOMMU_DEVICE(iommu); + GArray *scope_blob = g_array_new(false, true, 1); + + AcpiTable table = { .sig = "DMAR", .rev = 1, .oem_id = oem_id, + .oem_table_id = oem_table_id }; + + /* + * A PCI bus walk, for each PCI host bridge. + * Insert scope for each PCI bridge and endpoint device which + * is attached to a bus with iommu enabled.
+ */ + object_child_foreach_recursive(object_get_root(), + dmar_host_bridges, scope_blob); + + assert(iommu); + if (x86_iommu_ir_supported(iommu)) { + dmar_flags |= 0x1; /* Flags: 0x1: INT_REMAP */ + } + + acpi_table_begin(&table, table_data); + /* Host Address Width */ + build_append_int_noprefix(table_data, intel_iommu->aw_bits - 1, 1); + build_append_int_noprefix(table_data, dmar_flags, 1); /* Flags */ + g_array_append_vals(table_data, rsvd10, sizeof(rsvd10)); /* Reserved */ + + /* 8.3 DMAR Remapping Hardware Unit Definition structure */ + build_append_int_noprefix(table_data, 0, 2); /* Type */ + /* Length */ + build_append_int_noprefix(table_data, + 16 + ioapic_scope_size + scope_blob->len, 2); + /* Flags */ + build_append_int_noprefix(table_data, 0 /* Don't include all pci device */ , + 1); + build_append_int_noprefix(table_data, 0 , 1); /* Reserved */ + build_append_int_noprefix(table_data, 0 , 2); /* Segment Number */ + /* Register Base Address */ + build_append_int_noprefix(table_data, Q35_HOST_BRIDGE_IOMMU_ADDR , 8); + + /* Scope definition for the root-complex IOAPIC. See VT-d spec + * 8.3.1 (version Oct. 2014 or later). */ + build_append_int_noprefix(table_data, 0x03 /* IOAPIC */, 1); /* Type */ + build_append_int_noprefix(table_data, ioapic_scope_size, 1); /* Length */ + build_append_int_noprefix(table_data, 0, 2); /* Reserved */ + /* Enumeration ID */ + build_append_int_noprefix(table_data, ACPI_BUILD_IOAPIC_ID, 1); + /* Start Bus Number */ + build_append_int_noprefix(table_data, Q35_PSEUDO_BUS_PLATFORM, 1); + /* Path, {Device, Function} pair */ + build_append_int_noprefix(table_data, PCI_SLOT(Q35_PSEUDO_DEVFN_IOAPIC), 1); + build_append_int_noprefix(table_data, PCI_FUNC(Q35_PSEUDO_DEVFN_IOAPIC), 1); + + /* Add scope found above */ + g_array_append_vals(table_data, scope_blob->data, scope_blob->len); + g_array_free(scope_blob, true); + + if (iommu->dt_supported) { + /* 8.5 Root Port ATS Capability Reporting Structure */ + build_append_int_noprefix(table_data, 2, 2); /* Type */ + build_append_int_noprefix(table_data, 8, 2); /* Length */ + build_append_int_noprefix(table_data, 1 /* ALL_PORTS */, 1); /* Flags */ + build_append_int_noprefix(table_data, 0, 1); /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); /* Segment Number */ + } + + acpi_table_end(linker, &table); +} + +/* + * Windows ACPI Emulated Devices Table + * (Version 1.0 - April 6, 2009) + * Spec: http://download.microsoft.com/download/7/E/7/7E7662CF-CBEA-470B-A97E-CE7CE0D98DC2/WAET.docx + * + * Helpful to speedup Windows guests and ignored by others. + */ +static void +build_waet(GArray *table_data, BIOSLinker *linker, const char *oem_id, + const char *oem_table_id) +{ + AcpiTable table = { .sig = "WAET", .rev = 1, .oem_id = oem_id, + .oem_table_id = oem_table_id }; + + acpi_table_begin(&table, table_data); + /* + * Set "ACPI PM timer good" flag. + * + * Tells Windows guests that our ACPI PM timer is reliable in the + * sense that guest can read it only once to obtain a reliable value. + * Which avoids costly VMExits caused by guest re-reading it unnecessarily. 
+ */ + build_append_int_noprefix(table_data, 1 << 1 /* ACPI PM timer good */, 4); + acpi_table_end(linker, &table); +} + +/* + * IVRS table as specified in AMD IOMMU Specification v2.62, Section 5.2 + * accessible here http://support.amd.com/TechDocs/48882_IOMMU.pdf + */ +#define IOAPIC_SB_DEVID (uint64_t)PCI_BUILD_BDF(0, PCI_DEVFN(0x14, 0)) + +/* + * Insert IVHD entry for device and recurse, insert alias, or insert range as + * necessary for the PCI topology. + */ +static void +insert_ivhd(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + GArray *table_data = opaque; + uint32_t entry; + + /* "Select" IVHD entry, type 0x2 */ + entry = PCI_BUILD_BDF(pci_bus_num(bus), dev->devfn) << 8 | 0x2; + build_append_int_noprefix(table_data, entry, 4); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { + PCIBus *sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(dev)); + uint8_t sec = pci_bus_num(sec_bus); + uint8_t sub = dev->config[PCI_SUBORDINATE_BUS]; + + if (pci_bus_is_express(sec_bus)) { + /* + * Walk the bus if there are subordinates, otherwise use a range + * to cover an entire leaf bus. We could potentially also use a + * range for traversed buses, but we'd need to take care not to + * create both Select and Range entries covering the same device. + * This is easier and potentially more compact. + * + * An example bare metal system seems to use Select entries for + * root ports without a slot (ie. built-ins) and Range entries + * when there is a slot. The same system also only hard-codes + * the alias range for an onboard PCIe-to-PCI bridge, apparently + * making no effort to support nested bridges. We attempt to + * be more thorough here. + */ + if (sec == sub) { /* leaf bus */ + /* "Start of Range" IVHD entry, type 0x3 */ + entry = PCI_BUILD_BDF(sec, PCI_DEVFN(0, 0)) << 8 | 0x3; + build_append_int_noprefix(table_data, entry, 4); + /* "End of Range" IVHD entry, type 0x4 */ + entry = PCI_BUILD_BDF(sub, PCI_DEVFN(31, 7)) << 8 | 0x4; + build_append_int_noprefix(table_data, entry, 4); + } else { + pci_for_each_device(sec_bus, sec, insert_ivhd, table_data); + } + } else { + /* + * If the secondary bus is conventional, then we need to create an + * Alias range for everything downstream. The range covers the + * first devfn on the secondary bus to the last devfn on the + * subordinate bus. The alias target depends on legacy versus + * express bridges, just as in pci_device_iommu_address_space(). + * DeviceIDa vs DeviceIDb as per the AMD IOMMU spec. 
+ */ + uint16_t dev_id_a, dev_id_b; + + dev_id_a = PCI_BUILD_BDF(sec, PCI_DEVFN(0, 0)); + + if (pci_is_express(dev) && + pcie_cap_get_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE) { + dev_id_b = dev_id_a; + } else { + dev_id_b = PCI_BUILD_BDF(pci_bus_num(bus), dev->devfn); + } + + /* "Alias Start of Range" IVHD entry, type 0x43, 8 bytes */ + build_append_int_noprefix(table_data, dev_id_a << 8 | 0x43, 4); + build_append_int_noprefix(table_data, dev_id_b << 8 | 0x0, 4); + + /* "End of Range" IVHD entry, type 0x4 */ + entry = PCI_BUILD_BDF(sub, PCI_DEVFN(31, 7)) << 8 | 0x4; + build_append_int_noprefix(table_data, entry, 4); + } + } +} + +/* For all PCI host bridges, walk and insert IVHD entries */ +static int +ivrs_host_bridges(Object *obj, void *opaque) +{ + GArray *ivhd_blob = opaque; + + if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { + PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; + + if (bus && !pci_bus_bypass_iommu(bus)) { + pci_for_each_device_under_bus(bus, insert_ivhd, ivhd_blob); + } + } + + return 0; +} + +static void +build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, + const char *oem_table_id) +{ + int ivhd_table_len = 24; + AMDVIState *s = AMD_IOMMU_DEVICE(x86_iommu_get_default()); + GArray *ivhd_blob = g_array_new(false, true, 1); + AcpiTable table = { .sig = "IVRS", .rev = 1, .oem_id = oem_id, + .oem_table_id = oem_table_id }; + + acpi_table_begin(&table, table_data); + /* IVinfo - IO virtualization information common to all + * IOMMU units in a system + */ + build_append_int_noprefix(table_data, 40UL << 8/* PASize */, 4); + /* reserved */ + build_append_int_noprefix(table_data, 0, 8); + + /* IVHD definition - type 10h */ + build_append_int_noprefix(table_data, 0x10, 1); + /* virtualization flags */ + build_append_int_noprefix(table_data, + (1UL << 0) | /* HtTunEn */ + (1UL << 4) | /* iotblSup */ + (1UL << 6) | /* PrefSup */ + (1UL << 7), /* PPRSup */ + 1); + + /* + * A PCI bus walk, for each PCI host bridge, is necessary to create a + * complete set of IVHD entries. Do this into a separate blob so that we + * can calculate the total IVRS table length here and then append the new + * blob further below. Fall back to an entry covering all devices, which + * is sufficient when no aliases are present. + */ + object_child_foreach_recursive(object_get_root(), + ivrs_host_bridges, ivhd_blob); + + if (!ivhd_blob->len) { + /* + * Type 1 device entry reporting all devices + * These are 4-byte device entries currently reporting the range of + * Refer to Spec - Table 95:IVHD Device Entry Type Codes(4-byte) + */ + build_append_int_noprefix(ivhd_blob, 0x0000001, 4); + } + + ivhd_table_len += ivhd_blob->len; + + /* + * When interrupt remapping is supported, we add a special IVHD device + * for type IO-APIC. 
+ */ + if (x86_iommu_ir_supported(x86_iommu_get_default())) { + ivhd_table_len += 8; + } + + /* IVHD length */ + build_append_int_noprefix(table_data, ivhd_table_len, 2); + /* DeviceID */ + build_append_int_noprefix(table_data, s->devid, 2); + /* Capability offset */ + build_append_int_noprefix(table_data, s->capab_offset, 2); + /* IOMMU base address */ + build_append_int_noprefix(table_data, s->mmio.addr, 8); + /* PCI Segment Group */ + build_append_int_noprefix(table_data, 0, 2); + /* IOMMU info */ + build_append_int_noprefix(table_data, 0, 2); + /* IOMMU Feature Reporting */ + build_append_int_noprefix(table_data, + (48UL << 30) | /* HATS */ + (48UL << 28) | /* GATS */ + (1UL << 2) | /* GTSup */ + (1UL << 6), /* GASup */ + 4); + + /* IVHD entries as found above */ + g_array_append_vals(table_data, ivhd_blob->data, ivhd_blob->len); + g_array_free(ivhd_blob, TRUE); + + /* + * Add a special IVHD device type. + * Refer to spec - Table 95: IVHD device entry type codes + * + * Linux IOMMU driver checks for the special IVHD device (type IO-APIC). + * See Linux kernel commit 'c2ff5cf5294bcbd7fa50f7d860e90a66db7e5059' + */ + if (x86_iommu_ir_supported(x86_iommu_get_default())) { + build_append_int_noprefix(table_data, + (0x1ull << 56) | /* type IOAPIC */ + (IOAPIC_SB_DEVID << 40) | /* IOAPIC devid */ + 0x48, /* special device */ + 8); + } + acpi_table_end(linker, &table); +} + +typedef +struct AcpiBuildState { + /* Copy of table in RAM (for patching). */ + MemoryRegion *table_mr; + /* Is table patched? */ + uint8_t patched; + void *rsdp; + MemoryRegion *rsdp_mr; + MemoryRegion *linker_mr; +} AcpiBuildState; + +static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) +{ + Object *pci_host; + QObject *o; + + pci_host = acpi_get_i386_pci_host(); + if (!pci_host) { + return false; + } + + o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL); + if (!o) { + return false; + } + mcfg->base = qnum_get_uint(qobject_to(QNum, o)); + qobject_unref(o); + if (mcfg->base == PCIE_BASE_ADDR_UNMAPPED) { + return false; + } + + o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL); + assert(o); + mcfg->size = qnum_get_uint(qobject_to(QNum, o)); + qobject_unref(o); + return true; +} + +static +void acpi_build(AcpiBuildTables *tables, MachineState *machine) +{ + PCMachineState *pcms = PC_MACHINE(machine); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(machine); + DeviceState *iommu = pcms->iommu; + GArray *table_offsets; + unsigned facs, dsdt, rsdt, fadt; + AcpiPmInfo pm; + AcpiMiscInfo misc; + AcpiMcfgInfo mcfg; + Range pci_hole = {}, pci_hole64 = {}; + uint8_t *u; + size_t aml_len = 0; + GArray *tables_blob = tables->table_data; + AcpiSlicOem slic_oem = { .id = NULL, .table_id = NULL }; + Object *vmgenid_dev; + char *oem_id; + char *oem_table_id; + + acpi_get_pm_info(machine, &pm); + acpi_get_misc_info(&misc); + acpi_get_pci_holes(&pci_hole, &pci_hole64); + acpi_get_slic_oem(&slic_oem); + + if (slic_oem.id) { + oem_id = slic_oem.id; + } else { + oem_id = x86ms->oem_id; + } + + if (slic_oem.table_id) { + oem_table_id = slic_oem.table_id; + } else { + oem_table_id = x86ms->oem_table_id; + } + + table_offsets = g_array_new(false, true /* clear */, + sizeof(uint32_t)); + ACPI_BUILD_DPRINTF("init ACPI tables\n"); + + bios_linker_loader_alloc(tables->linker, + ACPI_BUILD_TABLE_FILE, tables_blob, + 64 /* Ensure FACS is aligned */, + false /* high memory */); + + /* + * FACS is pointed to by FADT. 
+ * We place it first since it's the only table that has alignment + * requirements. + */ + facs = tables_blob->len; + build_facs(tables_blob); + + /* DSDT is pointed to by FADT */ + dsdt = tables_blob->len; + build_dsdt(tables_blob, tables->linker, &pm, &misc, + &pci_hole, &pci_hole64, machine); + + /* Count the size of the DSDT and SSDT, we will need it for legacy + * sizing of ACPI tables. + */ + aml_len += tables_blob->len - dsdt; + + /* ACPI tables pointed to by RSDT */ + fadt = tables_blob->len; + acpi_add_table(table_offsets, tables_blob); + pm.fadt.facs_tbl_offset = &facs; + pm.fadt.dsdt_tbl_offset = &dsdt; + pm.fadt.xdsdt_tbl_offset = &dsdt; + build_fadt(tables_blob, tables->linker, &pm.fadt, oem_id, oem_table_id); + aml_len += tables_blob->len - fadt; + + acpi_add_table(table_offsets, tables_blob); + acpi_build_madt(tables_blob, tables->linker, x86ms, + ACPI_DEVICE_IF(x86ms->acpi_dev), x86ms->oem_id, + x86ms->oem_table_id); + + vmgenid_dev = find_vmgenid_dev(); + if (vmgenid_dev) { + acpi_add_table(table_offsets, tables_blob); + vmgenid_build_acpi(VMGENID(vmgenid_dev), tables_blob, + tables->vmgenid, tables->linker, x86ms->oem_id); + } + + if (misc.has_hpet) { + acpi_add_table(table_offsets, tables_blob); + build_hpet(tables_blob, tables->linker, x86ms->oem_id, + x86ms->oem_table_id); + } +#ifdef CONFIG_TPM + if (misc.tpm_version != TPM_VERSION_UNSPEC) { + if (misc.tpm_version == TPM_VERSION_1_2) { + acpi_add_table(table_offsets, tables_blob); + build_tpm_tcpa(tables_blob, tables->linker, tables->tcpalog, + x86ms->oem_id, x86ms->oem_table_id); + } else { /* TPM_VERSION_2_0 */ + acpi_add_table(table_offsets, tables_blob); + build_tpm2(tables_blob, tables->linker, tables->tcpalog, + x86ms->oem_id, x86ms->oem_table_id); + } + } +#endif + if (machine->numa_state->num_nodes) { + acpi_add_table(table_offsets, tables_blob); + build_srat(tables_blob, tables->linker, machine); + if (machine->numa_state->have_numa_distance) { + acpi_add_table(table_offsets, tables_blob); + build_slit(tables_blob, tables->linker, machine, x86ms->oem_id, + x86ms->oem_table_id); + } + if (machine->numa_state->hmat_enabled) { + acpi_add_table(table_offsets, tables_blob); + build_hmat(tables_blob, tables->linker, machine->numa_state, + x86ms->oem_id, x86ms->oem_table_id); + } + } + if (acpi_get_mcfg(&mcfg)) { + acpi_add_table(table_offsets, tables_blob); + build_mcfg(tables_blob, tables->linker, &mcfg, x86ms->oem_id, + x86ms->oem_table_id); + } + if (object_dynamic_cast(OBJECT(iommu), TYPE_AMD_IOMMU_DEVICE)) { + acpi_add_table(table_offsets, tables_blob); + build_amd_iommu(tables_blob, tables->linker, x86ms->oem_id, + x86ms->oem_table_id); + } else if (object_dynamic_cast(OBJECT(iommu), TYPE_INTEL_IOMMU_DEVICE)) { + acpi_add_table(table_offsets, tables_blob); + build_dmar_q35(tables_blob, tables->linker, x86ms->oem_id, + x86ms->oem_table_id); + } else if (object_dynamic_cast(OBJECT(iommu), TYPE_VIRTIO_IOMMU_PCI)) { + PCIDevice *pdev = PCI_DEVICE(iommu); + + acpi_add_table(table_offsets, tables_blob); + build_viot(machine, tables_blob, tables->linker, pci_get_bdf(pdev), + x86ms->oem_id, x86ms->oem_table_id); + } + if (machine->nvdimms_state->is_enabled) { + nvdimm_build_acpi(table_offsets, tables_blob, tables->linker, + machine->nvdimms_state, machine->ram_slots, + x86ms->oem_id, x86ms->oem_table_id); + } + + acpi_add_table(table_offsets, tables_blob); + build_waet(tables_blob, tables->linker, x86ms->oem_id, x86ms->oem_table_id); + + /* Add tables supplied by user (if any) */ + for (u = acpi_table_first(); u; u 
= acpi_table_next(u)) { + unsigned len = acpi_table_len(u); + + acpi_add_table(table_offsets, tables_blob); + g_array_append_vals(tables_blob, u, len); + } + + /* RSDT is pointed to by RSDP */ + rsdt = tables_blob->len; + build_rsdt(tables_blob, tables->linker, table_offsets, + oem_id, oem_table_id); + + /* RSDP is in FSEG memory, so allocate it separately */ + { + AcpiRsdpData rsdp_data = { + .revision = 0, + .oem_id = x86ms->oem_id, + .xsdt_tbl_offset = NULL, + .rsdt_tbl_offset = &rsdt, + }; + build_rsdp(tables->rsdp, tables->linker, &rsdp_data); + if (!pcmc->rsdp_in_ram) { + /* We used to allocate some extra space for RSDP revision 2 but + * only used the RSDP revision 0 space. The extra bytes were + * zeroed out and not used. + * Here we continue wasting those extra 16 bytes to make sure we + * don't break migration for machine types 2.2 and older due to + * RSDP blob size mismatch. + */ + build_append_int_noprefix(tables->rsdp, 0, 16); + } + } + + /* We'll expose it all to Guest so we want to reduce + * chance of size changes. + * + * We used to align the tables to 4k, but of course this would + * too simple to be enough. 4k turned out to be too small an + * alignment very soon, and in fact it is almost impossible to + * keep the table size stable for all (max_cpus, max_memory_slots) + * combinations. So the table size is always 64k for pc-i440fx-2.1 + * and we give an error if the table grows beyond that limit. + * + * We still have the problem of migrating from "-M pc-i440fx-2.0". For + * that, we exploit the fact that QEMU 2.1 generates _smaller_ tables + * than 2.0 and we can always pad the smaller tables with zeros. We can + * then use the exact size of the 2.0 tables. + * + * All this is for PIIX4, since QEMU 2.0 didn't support Q35 migration. + */ + if (pcmc->legacy_acpi_table_size) { + /* Subtracting aml_len gives the size of fixed tables. Then add the + * size of the PIIX4 DSDT/SSDT in QEMU 2.0. + */ + int legacy_aml_len = + pcmc->legacy_acpi_table_size + + ACPI_BUILD_LEGACY_CPU_AML_SIZE * x86ms->apic_id_limit; + int legacy_table_size = + ROUND_UP(tables_blob->len - aml_len + legacy_aml_len, + ACPI_BUILD_ALIGN_SIZE); + if (tables_blob->len > legacy_table_size) { + /* Should happen only with PCI bridges and -M pc-i440fx-2.0. */ + warn_report("ACPI table size %u exceeds %d bytes," + " migration may not work", + tables_blob->len, legacy_table_size); + error_printf("Try removing CPUs, NUMA nodes, memory slots" + " or PCI bridges."); + } + g_array_set_size(tables_blob, legacy_table_size); + } else { + /* Make sure we have a buffer in case we need to resize the tables. */ + if (tables_blob->len > ACPI_BUILD_TABLE_SIZE / 2) { + /* As of QEMU 2.1, this fires with 160 VCPUs and 255 memory slots. */ + warn_report("ACPI table size %u exceeds %d bytes," + " migration may not work", + tables_blob->len, ACPI_BUILD_TABLE_SIZE / 2); + error_printf("Try removing CPUs, NUMA nodes, memory slots" + " or PCI bridges."); + } + acpi_align_size(tables_blob, ACPI_BUILD_TABLE_SIZE); + } + + acpi_align_size(tables->linker->cmd_blob, ACPI_BUILD_ALIGN_SIZE); + + /* Cleanup memory that's no longer used. */ + g_array_free(table_offsets, true); +} + +static void acpi_ram_update(MemoryRegion *mr, GArray *data) +{ + uint32_t size = acpi_data_len(data); + + /* Make sure RAM size is correct - in case it got changed e.g. 
by migration */ + memory_region_ram_resize(mr, size, &error_abort); + + memcpy(memory_region_get_ram_ptr(mr), data->data, size); + memory_region_set_dirty(mr, 0, size); +} + +static void acpi_build_update(void *build_opaque) +{ + AcpiBuildState *build_state = build_opaque; + AcpiBuildTables tables; + + /* No state to update or already patched? Nothing to do. */ + if (!build_state || build_state->patched) { + return; + } + build_state->patched = 1; + + acpi_build_tables_init(&tables); + + acpi_build(&tables, MACHINE(qdev_get_machine())); + + acpi_ram_update(build_state->table_mr, tables.table_data); + + if (build_state->rsdp) { + memcpy(build_state->rsdp, tables.rsdp->data, acpi_data_len(tables.rsdp)); + } else { + acpi_ram_update(build_state->rsdp_mr, tables.rsdp); + } + + acpi_ram_update(build_state->linker_mr, tables.linker->cmd_blob); + acpi_build_tables_cleanup(&tables, true); +} + +static void acpi_build_reset(void *build_opaque) +{ + AcpiBuildState *build_state = build_opaque; + build_state->patched = 0; +} + +static const VMStateDescription vmstate_acpi_build = { + .name = "acpi_build", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT8(patched, AcpiBuildState), + VMSTATE_END_OF_LIST() + }, +}; + +void acpi_setup(void) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(pcms); + AcpiBuildTables tables; + AcpiBuildState *build_state; + Object *vmgenid_dev; +#ifdef CONFIG_TPM + TPMIf *tpm; + static FwCfgTPMConfig tpm_config; +#endif + + if (!x86ms->fw_cfg) { + ACPI_BUILD_DPRINTF("No fw cfg. Bailing out.\n"); + return; + } + + if (!pcms->acpi_build_enabled) { + ACPI_BUILD_DPRINTF("ACPI build disabled. Bailing out.\n"); + return; + } + + if (!x86_machine_is_acpi_enabled(X86_MACHINE(pcms))) { + ACPI_BUILD_DPRINTF("ACPI disabled. Bailing out.\n"); + return; + } + + build_state = g_malloc0(sizeof *build_state); + + acpi_build_tables_init(&tables); + acpi_build(&tables, MACHINE(pcms)); + + /* Now expose it all to Guest */ + build_state->table_mr = acpi_add_rom_blob(acpi_build_update, + build_state, tables.table_data, + ACPI_BUILD_TABLE_FILE); + assert(build_state->table_mr != NULL); + + build_state->linker_mr = + acpi_add_rom_blob(acpi_build_update, build_state, + tables.linker->cmd_blob, ACPI_BUILD_LOADER_FILE); + +#ifdef CONFIG_TPM + fw_cfg_add_file(x86ms->fw_cfg, ACPI_BUILD_TPMLOG_FILE, + tables.tcpalog->data, acpi_data_len(tables.tcpalog)); + + tpm = tpm_find(); + if (tpm && object_property_get_bool(OBJECT(tpm), "ppi", &error_abort)) { + tpm_config = (FwCfgTPMConfig) { + .tpmppi_address = cpu_to_le32(TPM_PPI_ADDR_BASE), + .tpm_version = tpm_get_version(tpm), + .tpmppi_version = TPM_PPI_VERSION_1_30 + }; + fw_cfg_add_file(x86ms->fw_cfg, "etc/tpm/config", + &tpm_config, sizeof tpm_config); + } +#endif + + vmgenid_dev = find_vmgenid_dev(); + if (vmgenid_dev) { + vmgenid_add_fw_cfg(VMGENID(vmgenid_dev), x86ms->fw_cfg, + tables.vmgenid); + } + + if (!pcmc->rsdp_in_ram) { + /* + * Keep for compatibility with old machine types. + * Though RSDP is small, its contents isn't immutable, so + * we'll update it along with the rest of tables on guest access. 
+ */ + uint32_t rsdp_size = acpi_data_len(tables.rsdp); + + build_state->rsdp = g_memdup(tables.rsdp->data, rsdp_size); + fw_cfg_add_file_callback(x86ms->fw_cfg, ACPI_BUILD_RSDP_FILE, + acpi_build_update, NULL, build_state, + build_state->rsdp, rsdp_size, true); + build_state->rsdp_mr = NULL; + } else { + build_state->rsdp = NULL; + build_state->rsdp_mr = acpi_add_rom_blob(acpi_build_update, + build_state, tables.rsdp, + ACPI_BUILD_RSDP_FILE); + } + + qemu_register_reset(acpi_build_reset, build_state); + acpi_build_reset(build_state); + vmstate_register(NULL, 0, &vmstate_acpi_build, build_state); + + /* Cleanup tables but don't free the memory: we track it + * in build_state. + */ + acpi_build_tables_cleanup(&tables, false); +} diff --git a/hw/i386/acpi-build.h b/hw/i386/acpi-build.h new file mode 100644 index 000000000..0dce155c8 --- /dev/null +++ b/hw/i386/acpi-build.h @@ -0,0 +1,15 @@ + +#ifndef HW_I386_ACPI_BUILD_H +#define HW_I386_ACPI_BUILD_H +#include "hw/acpi/acpi-defs.h" + +extern const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio; + +/* PCI Hot-plug registers bases. See docs/spec/acpi_pci_hotplug.txt */ +#define ACPI_PCIHP_SEJ_BASE 0x8 +#define ACPI_PCIHP_BNMR_BASE 0x10 + +void acpi_setup(void); +Object *acpi_get_i386_pci_host(void); + +#endif diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c new file mode 100644 index 000000000..4aaafbdd7 --- /dev/null +++ b/hw/i386/acpi-common.c @@ -0,0 +1,165 @@ +/* Support for generating ACPI tables and passing them to Guests + * + * Copyright (C) 2008-2010 Kevin O'Connor <kevin@koconnor.net> + * Copyright (C) 2006 Fabrice Bellard + * Copyright (C) 2013 Red Hat Inc + * + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" + +#include "exec/memory.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/aml-build.h" +#include "hw/acpi/utils.h" +#include "hw/i386/pc.h" +#include "target/i386/cpu.h" + +#include "acpi-build.h" +#include "acpi-common.h" + +void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid, + const CPUArchIdList *apic_ids, GArray *entry, + bool force_enabled) +{ + uint32_t apic_id = apic_ids->cpus[uid].arch_id; + /* Flags – Local APIC Flags */ + uint32_t flags = apic_ids->cpus[uid].cpu != NULL || force_enabled ? + 1 /* Enabled */ : 0; + + /* ACPI spec says that LAPIC entry for non present + * CPU may be omitted from MADT or it must be marked + * as disabled. However omitting non present CPU from + * MADT breaks hotplug on linux. So possible CPUs + * should be put in MADT but kept disabled. 
+ */ + if (apic_id < 255) { + /* Rev 1.0b, Table 5-13 Processor Local APIC Structure */ + build_append_int_noprefix(entry, 0, 1); /* Type */ + build_append_int_noprefix(entry, 8, 1); /* Length */ + build_append_int_noprefix(entry, uid, 1); /* ACPI Processor ID */ + build_append_int_noprefix(entry, apic_id, 1); /* APIC ID */ + build_append_int_noprefix(entry, flags, 4); /* Flags */ + } else { + /* Rev 4.0, 5.2.12.12 Processor Local x2APIC Structure */ + build_append_int_noprefix(entry, 9, 1); /* Type */ + build_append_int_noprefix(entry, 16, 1); /* Length */ + build_append_int_noprefix(entry, 0, 2); /* Reserved */ + build_append_int_noprefix(entry, apic_id, 4); /* X2APIC ID */ + build_append_int_noprefix(entry, flags, 4); /* Flags */ + build_append_int_noprefix(entry, uid, 4); /* ACPI Processor UID */ + } +} + +static void build_ioapic(GArray *entry, uint8_t id, uint32_t addr, uint32_t irq) +{ + /* Rev 1.0b, 5.2.8.2 IO APIC */ + build_append_int_noprefix(entry, 1, 1); /* Type */ + build_append_int_noprefix(entry, 12, 1); /* Length */ + build_append_int_noprefix(entry, id, 1); /* IO APIC ID */ + build_append_int_noprefix(entry, 0, 1); /* Reserved */ + build_append_int_noprefix(entry, addr, 4); /* IO APIC Address */ + build_append_int_noprefix(entry, irq, 4); /* System Vector Base */ +} + +static void +build_xrupt_override(GArray *entry, uint8_t src, uint32_t gsi, uint16_t flags) +{ + /* Rev 1.0b, 5.2.8.3.1 Interrupt Source Overrides */ + build_append_int_noprefix(entry, 2, 1); /* Type */ + build_append_int_noprefix(entry, 10, 1); /* Length */ + build_append_int_noprefix(entry, 0, 1); /* Bus */ + build_append_int_noprefix(entry, src, 1); /* Source */ + /* Global System Interrupt Vector */ + build_append_int_noprefix(entry, gsi, 4); + build_append_int_noprefix(entry, flags, 2); /* Flags */ +} + +/* + * ACPI spec, Revision 1.0b + * 5.2.8 Multiple APIC Description Table + */ +void acpi_build_madt(GArray *table_data, BIOSLinker *linker, + X86MachineState *x86ms, AcpiDeviceIf *adev, + const char *oem_id, const char *oem_table_id) +{ + int i; + bool x2apic_mode = false; + MachineClass *mc = MACHINE_GET_CLASS(x86ms); + const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(x86ms)); + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(adev); + AcpiTable table = { .sig = "APIC", .rev = 1, .oem_id = oem_id, + .oem_table_id = oem_table_id }; + + acpi_table_begin(&table, table_data); + /* Local APIC Address */ + build_append_int_noprefix(table_data, APIC_DEFAULT_ADDRESS, 4); + build_append_int_noprefix(table_data, 1 /* PCAT_COMPAT */, 4); /* Flags */ + + for (i = 0; i < apic_ids->len; i++) { + adevc->madt_cpu(adev, i, apic_ids, table_data, false); + if (apic_ids->cpus[i].arch_id > 254) { + x2apic_mode = true; + } + } + + build_ioapic(table_data, ACPI_BUILD_IOAPIC_ID, IO_APIC_DEFAULT_ADDRESS, 0); + if (x86ms->ioapic2) { + build_ioapic(table_data, ACPI_BUILD_IOAPIC_ID + 1, + IO_APIC_SECONDARY_ADDRESS, IO_APIC_SECONDARY_IRQBASE); + } + + if (x86ms->apic_xrupt_override) { + build_xrupt_override(table_data, 0, 2, + 0 /* Flags: Conforms to the specifications of the bus */); + } + + for (i = 1; i < 16; i++) { + if (!(x86ms->pci_irq_mask & (1 << i))) { + /* No need for a INT source override structure. 
*/ + continue; + } + build_xrupt_override(table_data, i, i, + 0xd /* Flags: Active high, Level Triggered */); + } + + if (x2apic_mode) { + /* Rev 4.0, 5.2.12.13 Local x2APIC NMI Structure*/ + build_append_int_noprefix(table_data, 0xA, 1); /* Type */ + build_append_int_noprefix(table_data, 12, 1); /* Length */ + build_append_int_noprefix(table_data, 0, 2); /* Flags */ + /* ACPI Processor UID */ + build_append_int_noprefix(table_data, 0xFFFFFFFF /* all processors */, + 4); + /* Local x2APIC LINT# */ + build_append_int_noprefix(table_data, 1 /* ACPI_LINT1 */, 1); + build_append_int_noprefix(table_data, 0, 3); /* Reserved */ + } else { + /* Rev 1.0b, 5.2.8.3.3 Local APIC NMI */ + build_append_int_noprefix(table_data, 4, 1); /* Type */ + build_append_int_noprefix(table_data, 6, 1); /* Length */ + /* ACPI Processor ID */ + build_append_int_noprefix(table_data, 0xFF /* all processors */, 1); + build_append_int_noprefix(table_data, 0, 2); /* Flags */ + /* Local APIC INTI# */ + build_append_int_noprefix(table_data, 1 /* ACPI_LINT1 */, 1); + } + + acpi_table_end(linker, &table); +} + diff --git a/hw/i386/acpi-common.h b/hw/i386/acpi-common.h new file mode 100644 index 000000000..a68825acf --- /dev/null +++ b/hw/i386/acpi-common.h @@ -0,0 +1,15 @@ +#ifndef HW_I386_ACPI_COMMON_H +#define HW_I386_ACPI_COMMON_H + +#include "hw/acpi/acpi_dev_interface.h" +#include "hw/acpi/bios-linker-loader.h" +#include "hw/i386/x86.h" + +/* Default IOAPIC ID */ +#define ACPI_BUILD_IOAPIC_ID 0x0 + +void acpi_build_madt(GArray *table_data, BIOSLinker *linker, + X86MachineState *x86ms, AcpiDeviceIf *adev, + const char *oem_id, const char *oem_table_id); + +#endif diff --git a/hw/i386/acpi-microvm.c b/hw/i386/acpi-microvm.c new file mode 100644 index 000000000..196d31849 --- /dev/null +++ b/hw/i386/acpi-microvm.c @@ -0,0 +1,258 @@ +/* Support for generating ACPI tables and passing them to Guests + * + * Copyright (C) 2008-2010 Kevin O'Connor <kevin@koconnor.net> + * Copyright (C) 2006 Fabrice Bellard + * Copyright (C) 2013 Red Hat Inc + * + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "qapi/error.h" + +#include "exec/memory.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/aml-build.h" +#include "hw/acpi/bios-linker-loader.h" +#include "hw/acpi/generic_event_device.h" +#include "hw/acpi/utils.h" +#include "hw/i386/fw_cfg.h" +#include "hw/i386/microvm.h" +#include "hw/pci/pci.h" +#include "hw/pci/pcie_host.h" +#include "hw/usb/xhci.h" +#include "hw/virtio/virtio-mmio.h" + +#include "acpi-common.h" +#include "acpi-microvm.h" + +static void acpi_dsdt_add_virtio(Aml *scope, + MicrovmMachineState *mms) +{ + gchar *separator; + long int index; + BusState *bus; + BusChild *kid; + + bus = sysbus_get_default(); + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + Object *obj = object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MMIO); + + if (obj) { + VirtIOMMIOProxy *mmio = VIRTIO_MMIO(obj); + VirtioBusState *mmio_virtio_bus = &mmio->bus; + BusState *mmio_bus = &mmio_virtio_bus->parent_obj; + + if (QTAILQ_EMPTY(&mmio_bus->children)) { + continue; + } + separator = g_strrstr(mmio_bus->name, "."); + if (!separator) { + continue; + } + if (qemu_strtol(separator + 1, NULL, 10, &index) != 0) { + continue; + } + + uint32_t irq = mms->virtio_irq_base + index; + hwaddr base = VIRTIO_MMIO_BASE + index * 512; + hwaddr size = 512; + + Aml *dev = aml_device("VR%02u", (unsigned)index); + aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0005"))); + aml_append(dev, aml_name_decl("_UID", aml_int(index))); + aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + + Aml *crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(base, size, AML_READ_WRITE)); + aml_append(crs, + aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH, + AML_EXCLUSIVE, &irq, 1)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + } + } +} + +static void acpi_dsdt_add_xhci(Aml *scope, MicrovmMachineState *mms) +{ + if (machine_usb(MACHINE(mms))) { + xhci_sysbus_build_aml(scope, MICROVM_XHCI_BASE, MICROVM_XHCI_IRQ); + } +} + +static void acpi_dsdt_add_pci(Aml *scope, MicrovmMachineState *mms) +{ + if (mms->pcie != ON_OFF_AUTO_ON) { + return; + } + + acpi_dsdt_add_gpex(scope, &mms->gpex); +} + +static void +build_dsdt_microvm(GArray *table_data, BIOSLinker *linker, + MicrovmMachineState *mms) +{ + X86MachineState *x86ms = X86_MACHINE(mms); + Aml *dsdt, *sb_scope, *scope, *pkg; + bool ambiguous; + Object *isabus; + AcpiTable table = { .sig = "DSDT", .rev = 2, .oem_id = x86ms->oem_id, + .oem_table_id = x86ms->oem_table_id }; + + isabus = object_resolve_path_type("", TYPE_ISA_BUS, &ambiguous); + assert(isabus); + assert(!ambiguous); + + acpi_table_begin(&table, table_data); + dsdt = init_aml_allocator(); + + sb_scope = aml_scope("_SB"); + fw_cfg_add_acpi_dsdt(sb_scope, x86ms->fw_cfg); + isa_build_aml(ISA_BUS(isabus), sb_scope); + build_ged_aml(sb_scope, GED_DEVICE, x86ms->acpi_dev, + GED_MMIO_IRQ, AML_SYSTEM_MEMORY, GED_MMIO_BASE); + acpi_dsdt_add_power_button(sb_scope); + acpi_dsdt_add_virtio(sb_scope, mms); + acpi_dsdt_add_xhci(sb_scope, mms); + acpi_dsdt_add_pci(sb_scope, mms); + aml_append(dsdt, sb_scope); + + /* ACPI 5.0: Table 7-209 System State Package */ + scope = aml_scope("\\"); + pkg = aml_package(4); + aml_append(pkg, aml_int(ACPI_GED_SLP_TYP_S5)); + aml_append(pkg, aml_int(0)); /* ignored */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(pkg, aml_int(0)); /* reserved */ + aml_append(scope, aml_name_decl("_S5", pkg)); + aml_append(dsdt, scope); + + /* copy AML bytecode 
into ACPI tables blob */ + g_array_append_vals(table_data, dsdt->buf->data, dsdt->buf->len); + + acpi_table_end(linker, &table); + free_aml_allocator(); +} + +static void acpi_build_microvm(AcpiBuildTables *tables, + MicrovmMachineState *mms) +{ + MachineState *machine = MACHINE(mms); + X86MachineState *x86ms = X86_MACHINE(mms); + GArray *table_offsets; + GArray *tables_blob = tables->table_data; + unsigned dsdt, xsdt; + AcpiFadtData pmfadt = { + /* ACPI 5.0: 4.1 Hardware-Reduced ACPI */ + .rev = 5, + .flags = ((1 << ACPI_FADT_F_HW_REDUCED_ACPI) | + (1 << ACPI_FADT_F_RESET_REG_SUP)), + + /* ACPI 5.0: 4.8.3.7 Sleep Control and Status Registers */ + .sleep_ctl = { + .space_id = AML_AS_SYSTEM_MEMORY, + .bit_width = 8, + .address = GED_MMIO_BASE_REGS + ACPI_GED_REG_SLEEP_CTL, + }, + .sleep_sts = { + .space_id = AML_AS_SYSTEM_MEMORY, + .bit_width = 8, + .address = GED_MMIO_BASE_REGS + ACPI_GED_REG_SLEEP_STS, + }, + + /* ACPI 5.0: 4.8.3.6 Reset Register */ + .reset_reg = { + .space_id = AML_AS_SYSTEM_MEMORY, + .bit_width = 8, + .address = GED_MMIO_BASE_REGS + ACPI_GED_REG_RESET, + }, + .reset_val = ACPI_GED_RESET_VALUE, + }; + + table_offsets = g_array_new(false, true /* clear */, + sizeof(uint32_t)); + bios_linker_loader_alloc(tables->linker, + ACPI_BUILD_TABLE_FILE, tables_blob, + 64 /* Ensure FACS is aligned */, + false /* high memory */); + + dsdt = tables_blob->len; + build_dsdt_microvm(tables_blob, tables->linker, mms); + + pmfadt.dsdt_tbl_offset = &dsdt; + pmfadt.xdsdt_tbl_offset = &dsdt; + acpi_add_table(table_offsets, tables_blob); + build_fadt(tables_blob, tables->linker, &pmfadt, x86ms->oem_id, + x86ms->oem_table_id); + + acpi_add_table(table_offsets, tables_blob); + acpi_build_madt(tables_blob, tables->linker, X86_MACHINE(machine), + ACPI_DEVICE_IF(x86ms->acpi_dev), x86ms->oem_id, + x86ms->oem_table_id); + + xsdt = tables_blob->len; + build_xsdt(tables_blob, tables->linker, table_offsets, x86ms->oem_id, + x86ms->oem_table_id); + + /* RSDP is in FSEG memory, so allocate it separately */ + { + AcpiRsdpData rsdp_data = { + /* ACPI 2.0: 5.2.4.3 RSDP Structure */ + .revision = 2, /* xsdt needs v2 */ + .oem_id = x86ms->oem_id, + .xsdt_tbl_offset = &xsdt, + .rsdt_tbl_offset = NULL, + }; + build_rsdp(tables->rsdp, tables->linker, &rsdp_data); + } + + /* Cleanup memory that's no longer used. 
*/ + g_array_free(table_offsets, true); +} + +static void acpi_build_no_update(void *build_opaque) +{ + /* nothing, microvm tables don't change at runtime */ +} + +void acpi_setup_microvm(MicrovmMachineState *mms) +{ + X86MachineState *x86ms = X86_MACHINE(mms); + AcpiBuildTables tables; + + assert(x86ms->fw_cfg); + + if (!x86_machine_is_acpi_enabled(x86ms)) { + return; + } + + acpi_build_tables_init(&tables); + acpi_build_microvm(&tables, mms); + + /* Now expose it all to Guest */ + acpi_add_rom_blob(acpi_build_no_update, NULL, tables.table_data, + ACPI_BUILD_TABLE_FILE); + acpi_add_rom_blob(acpi_build_no_update, NULL, tables.linker->cmd_blob, + ACPI_BUILD_LOADER_FILE); + acpi_add_rom_blob(acpi_build_no_update, NULL, tables.rsdp, + ACPI_BUILD_RSDP_FILE); + + acpi_build_tables_cleanup(&tables, false); +} diff --git a/hw/i386/acpi-microvm.h b/hw/i386/acpi-microvm.h new file mode 100644 index 000000000..dfe853690 --- /dev/null +++ b/hw/i386/acpi-microvm.h @@ -0,0 +1,8 @@ +#ifndef HW_I386_ACPI_MICROVM_H +#define HW_I386_ACPI_MICROVM_H + +#include "hw/i386/microvm.h" + +void acpi_setup_microvm(MicrovmMachineState *mms); + +#endif diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c new file mode 100644 index 000000000..91fe34ae5 --- /dev/null +++ b/hw/i386/amd_iommu.c @@ -0,0 +1,1662 @@ +/* + * QEMU emulation of AMD IOMMU (AMD-Vi) + * + * Copyright (C) 2011 Eduard - Gabriel Munteanu + * Copyright (C) 2015, 2016 David Kiarie Kahurani + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. 
+ * + * Cache implementation inspired by hw/i386/intel_iommu.c + */ + +#include "qemu/osdep.h" +#include "hw/i386/pc.h" +#include "hw/pci/msi.h" +#include "hw/pci/pci_bus.h" +#include "migration/vmstate.h" +#include "amd_iommu.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "hw/i386/apic_internal.h" +#include "trace.h" +#include "hw/i386/apic-msidef.h" + +/* used AMD-Vi MMIO registers */ +const char *amdvi_mmio_low[] = { + "AMDVI_MMIO_DEVTAB_BASE", + "AMDVI_MMIO_CMDBUF_BASE", + "AMDVI_MMIO_EVTLOG_BASE", + "AMDVI_MMIO_CONTROL", + "AMDVI_MMIO_EXCL_BASE", + "AMDVI_MMIO_EXCL_LIMIT", + "AMDVI_MMIO_EXT_FEATURES", + "AMDVI_MMIO_PPR_BASE", + "UNHANDLED" +}; +const char *amdvi_mmio_high[] = { + "AMDVI_MMIO_COMMAND_HEAD", + "AMDVI_MMIO_COMMAND_TAIL", + "AMDVI_MMIO_EVTLOG_HEAD", + "AMDVI_MMIO_EVTLOG_TAIL", + "AMDVI_MMIO_STATUS", + "AMDVI_MMIO_PPR_HEAD", + "AMDVI_MMIO_PPR_TAIL", + "UNHANDLED" +}; + +struct AMDVIAddressSpace { + uint8_t bus_num; /* bus number */ + uint8_t devfn; /* device function */ + AMDVIState *iommu_state; /* AMDVI - one per machine */ + MemoryRegion root; /* AMDVI Root memory map region */ + IOMMUMemoryRegion iommu; /* Device's address translation region */ + MemoryRegion iommu_ir; /* Device's interrupt remapping region */ + AddressSpace as; /* device's corresponding address space */ +}; + +/* AMDVI cache entry */ +typedef struct AMDVIIOTLBEntry { + uint16_t domid; /* assigned domain id */ + uint16_t devid; /* device owning entry */ + uint64_t perms; /* access permissions */ + uint64_t translated_addr; /* translated address */ + uint64_t page_mask; /* physical page size */ +} AMDVIIOTLBEntry; + +/* configure MMIO registers at startup/reset */ +static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val, + uint64_t romask, uint64_t w1cmask) +{ + stq_le_p(&s->mmior[addr], val); + stq_le_p(&s->romask[addr], romask); + stq_le_p(&s->w1cmask[addr], w1cmask); +} + +static uint16_t amdvi_readw(AMDVIState *s, hwaddr addr) +{ + return lduw_le_p(&s->mmior[addr]); +} + +static uint32_t amdvi_readl(AMDVIState *s, hwaddr addr) +{ + return ldl_le_p(&s->mmior[addr]); +} + +static uint64_t amdvi_readq(AMDVIState *s, hwaddr addr) +{ + return ldq_le_p(&s->mmior[addr]); +} + +/* internal write */ +static void amdvi_writeq_raw(AMDVIState *s, hwaddr addr, uint64_t val) +{ + stq_le_p(&s->mmior[addr], val); +} + +/* external write */ +static void amdvi_writew(AMDVIState *s, hwaddr addr, uint16_t val) +{ + uint16_t romask = lduw_le_p(&s->romask[addr]); + uint16_t w1cmask = lduw_le_p(&s->w1cmask[addr]); + uint16_t oldval = lduw_le_p(&s->mmior[addr]); + stw_le_p(&s->mmior[addr], + ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask)); +} + +static void amdvi_writel(AMDVIState *s, hwaddr addr, uint32_t val) +{ + uint32_t romask = ldl_le_p(&s->romask[addr]); + uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); + uint32_t oldval = ldl_le_p(&s->mmior[addr]); + stl_le_p(&s->mmior[addr], + ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask)); +} + +static void amdvi_writeq(AMDVIState *s, hwaddr addr, uint64_t val) +{ + uint64_t romask = ldq_le_p(&s->romask[addr]); + uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); + uint32_t oldval = ldq_le_p(&s->mmior[addr]); + stq_le_p(&s->mmior[addr], + ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask)); +} + +/* OR a 64-bit register with a 64-bit value */ +static bool amdvi_test_mask(AMDVIState *s, hwaddr addr, uint64_t val) +{ + return amdvi_readq(s, addr) | val; +} + +/* OR a 64-bit register with a 64-bit value storing result in the 
register */ +static void amdvi_assign_orq(AMDVIState *s, hwaddr addr, uint64_t val) +{ + amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) | val); +} + +/* AND a 64-bit register with a 64-bit value storing result in the register */ +static void amdvi_assign_andq(AMDVIState *s, hwaddr addr, uint64_t val) +{ + amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) & val); +} + +static void amdvi_generate_msi_interrupt(AMDVIState *s) +{ + MSIMessage msg = {}; + MemTxAttrs attrs = { + .requester_id = pci_requester_id(&s->pci.dev) + }; + + if (msi_enabled(&s->pci.dev)) { + msg = msi_get_message(&s->pci.dev, 0); + address_space_stl_le(&address_space_memory, msg.address, msg.data, + attrs, NULL); + } +} + +static void amdvi_log_event(AMDVIState *s, uint64_t *evt) +{ + /* event logging not enabled */ + if (!s->evtlog_enabled || amdvi_test_mask(s, AMDVI_MMIO_STATUS, + AMDVI_MMIO_STATUS_EVT_OVF)) { + return; + } + + /* event log buffer full */ + if (s->evtlog_tail >= s->evtlog_len) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF); + /* generate interrupt */ + amdvi_generate_msi_interrupt(s); + return; + } + + if (dma_memory_write(&address_space_memory, s->evtlog + s->evtlog_tail, + evt, AMDVI_EVENT_LEN)) { + trace_amdvi_evntlog_fail(s->evtlog, s->evtlog_tail); + } + + s->evtlog_tail += AMDVI_EVENT_LEN; + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT); + amdvi_generate_msi_interrupt(s); +} + +static void amdvi_setevent_bits(uint64_t *buffer, uint64_t value, int start, + int length) +{ + int index = start / 64, bitpos = start % 64; + uint64_t mask = MAKE_64BIT_MASK(start, length); + buffer[index] &= ~mask; + buffer[index] |= (value << bitpos) & mask; +} +/* + * AMDVi event structure + * 0:15 -> DeviceID + * 55:63 -> event type + miscellaneous info + * 63:127 -> related address + */ +static void amdvi_encode_event(uint64_t *evt, uint16_t devid, uint64_t addr, + uint16_t info) +{ + amdvi_setevent_bits(evt, devid, 0, 16); + amdvi_setevent_bits(evt, info, 55, 8); + amdvi_setevent_bits(evt, addr, 63, 64); +} +/* log an error encountered during a page walk + * + * @addr: virtual address in translation request + */ +static void amdvi_page_fault(AMDVIState *s, uint16_t devid, + hwaddr addr, uint16_t info) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF; + amdvi_encode_event(evt, devid, addr, info); + amdvi_log_event(s, evt); + pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); +} +/* + * log a master abort accessing device table + * @devtab : address of device table entry + * @info : error flags + */ +static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid, + hwaddr devtab, uint16_t info) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_DEV_TAB_HW_ERROR; + + amdvi_encode_event(evt, devid, devtab, info); + amdvi_log_event(s, evt); + pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); +} +/* log an event trying to access command buffer + * @addr : address that couldn't be accessed + */ +static void amdvi_log_command_error(AMDVIState *s, hwaddr addr) +{ + uint64_t evt[4], info = AMDVI_EVENT_COMMAND_HW_ERROR; + + amdvi_encode_event(evt, 0, addr, info); + amdvi_log_event(s, evt); + pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); +} +/* log an illegal command event + * @addr : address of illegal command + */ +static void amdvi_log_illegalcom_error(AMDVIState *s, uint16_t info, + hwaddr addr) +{ + uint64_t evt[4]; + + info |=
AMDVI_EVENT_ILLEGAL_COMMAND_ERROR; + amdvi_encode_event(evt, 0, addr, info); + amdvi_log_event(s, evt); +} +/* log an error accessing device table + * + * @devid : device owning the table entry + * @devtab : address of device table entry + * @info : error flags + */ +static void amdvi_log_illegaldevtab_error(AMDVIState *s, uint16_t devid, + hwaddr addr, uint16_t info) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY; + amdvi_encode_event(evt, devid, addr, info); + amdvi_log_event(s, evt); +} +/* log an error accessing a PTE entry + * @addr : address that couldn't be accessed + */ +static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid, + hwaddr addr, uint16_t info) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR; + amdvi_encode_event(evt, devid, addr, info); + amdvi_log_event(s, evt); + pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); +} + +static gboolean amdvi_uint64_equal(gconstpointer v1, gconstpointer v2) +{ + return *((const uint64_t *)v1) == *((const uint64_t *)v2); +} + +static guint amdvi_uint64_hash(gconstpointer v) +{ + return (guint)*(const uint64_t *)v; +} + +static AMDVIIOTLBEntry *amdvi_iotlb_lookup(AMDVIState *s, hwaddr addr, + uint64_t devid) +{ + uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) | + ((uint64_t)(devid) << AMDVI_DEVID_SHIFT); + return g_hash_table_lookup(s->iotlb, &key); +} + +static void amdvi_iotlb_reset(AMDVIState *s) +{ + assert(s->iotlb); + trace_amdvi_iotlb_reset(); + g_hash_table_remove_all(s->iotlb); +} + +static gboolean amdvi_iotlb_remove_by_devid(gpointer key, gpointer value, + gpointer user_data) +{ + AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value; + uint16_t devid = *(uint16_t *)user_data; + return entry->devid == devid; +} + +static void amdvi_iotlb_remove_page(AMDVIState *s, hwaddr addr, + uint64_t devid) +{ + uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) | + ((uint64_t)(devid) << AMDVI_DEVID_SHIFT); + g_hash_table_remove(s->iotlb, &key); +} + +static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid, + uint64_t gpa, IOMMUTLBEntry to_cache, + uint16_t domid) +{ + AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1); + uint64_t *key = g_new(uint64_t, 1); + uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K; + + /* don't cache erroneous translations */ + if (to_cache.perm != IOMMU_NONE) { + trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid), + PCI_FUNC(devid), gpa, to_cache.translated_addr); + + if (g_hash_table_size(s->iotlb) >= AMDVI_IOTLB_MAX_SIZE) { + amdvi_iotlb_reset(s); + } + + entry->domid = domid; + entry->perms = to_cache.perm; + entry->translated_addr = to_cache.translated_addr; + entry->page_mask = to_cache.addr_mask; + *key = gfn | ((uint64_t)(devid) << AMDVI_DEVID_SHIFT); + g_hash_table_replace(s->iotlb, key, entry); + } +} + +static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd) +{ + /* pad the last 3 bits */ + hwaddr addr = cpu_to_le64(extract64(cmd[0], 3, 49)) << 3; + uint64_t data = cpu_to_le64(cmd[1]); + + if (extract64(cmd[0], 52, 8)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + if (extract64(cmd[0], 0, 1)) { + if (dma_memory_write(&address_space_memory, addr, &data, + AMDVI_COMPLETION_DATA_SIZE)) { + trace_amdvi_completion_wait_fail(addr); + } + } + /* set completion interrupt */ + if (extract64(cmd[0], 1, 1)) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT); + /* generate interrupt */ + amdvi_generate_msi_interrupt(s); + } + 
trace_amdvi_completion_wait(addr, data); +} + +/* log error without aborting since linux seems to be using reserved bits */ +static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd) +{ + uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16)); + + /* This command should invalidate internal caches of which there isn't */ + if (extract64(cmd[0], 16, 44) || cmd[1]) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid), + PCI_FUNC(devid)); +} + +static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd) +{ + if (extract64(cmd[0], 16, 16) || extract64(cmd[0], 52, 8) || + extract64(cmd[1], 0, 2) || extract64(cmd[1], 3, 29) + || extract64(cmd[1], 48, 16)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + trace_amdvi_ppr_exec(); +} + +static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd) +{ + if (extract64(cmd[0], 0, 60) || cmd[1]) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + + amdvi_iotlb_reset(s); + trace_amdvi_all_inval(); +} + +static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value, + gpointer user_data) +{ + AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value; + uint16_t domid = *(uint16_t *)user_data; + return entry->domid == domid; +} + +/* we don't have devid - we can't remove pages by address */ +static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd) +{ + uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16)); + + if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 48, 12) || + extract64(cmd[1], 3, 9)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + + g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid, + &domid); + trace_amdvi_pages_inval(domid); +} + +static void amdvi_prefetch_pages(AMDVIState *s, uint64_t *cmd) +{ + if (extract64(cmd[0], 16, 8) || extract64(cmd[0], 52, 8) || + extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) || + extract64(cmd[1], 5, 7)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + + trace_amdvi_prefetch_pages(); +} + +static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd) +{ + if (extract64(cmd[0], 16, 44) || cmd[1]) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + return; + } + + trace_amdvi_intr_inval(); +} + +/* FIXME: Try to work with the specified size instead of all the pages + * when the S bit is on + */ +static void iommu_inval_iotlb(AMDVIState *s, uint64_t *cmd) +{ + + uint16_t devid = extract64(cmd[0], 0, 16); + if (extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) || + extract64(cmd[1], 6, 6)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + return; + } + + if (extract64(cmd[1], 0, 1)) { + g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_devid, + &devid); + } else { + amdvi_iotlb_remove_page(s, cpu_to_le64(extract64(cmd[1], 12, 52)) << 12, + cpu_to_le16(extract64(cmd[1], 0, 16))); + } + trace_amdvi_iotlb_inval(); +} + +/* not honouring reserved bits is regarded as an illegal command */ +static void amdvi_cmdbuf_exec(AMDVIState *s) +{ + uint64_t cmd[2]; + + if (dma_memory_read(&address_space_memory, s->cmdbuf + s->cmdbuf_head, + cmd, AMDVI_COMMAND_SIZE)) { + trace_amdvi_command_read_fail(s->cmdbuf, s->cmdbuf_head); + amdvi_log_command_error(s, s->cmdbuf + s->cmdbuf_head); + return; 
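Illustrative sketch (not part of the patch above): the command buffer is a ring of 16-byte commands in guest memory; amdvi_cmdbuf_run() below consumes entries from the MMIO head toward the guest-written tail and wraps the head after cmdbuf_len entries. A minimal model of the head advance, with the constant taken from AMDVI_COMMAND_SIZE:

#include <stdint.h>

#define CMD_SIZE 16u   /* AMDVI_COMMAND_SIZE: two uint64_t words per command */

/* Advance the ring head past one command, wrapping at n_entries commands. */
static uint32_t cmd_ring_next_head(uint32_t head, uint32_t n_entries)
{
    head += CMD_SIZE;
    if (head >= n_entries * CMD_SIZE) {
        head = 0;
    }
    return head;
}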
+ } + + switch (extract64(cmd[0], 60, 4)) { + case AMDVI_CMD_COMPLETION_WAIT: + amdvi_completion_wait(s, cmd); + break; + case AMDVI_CMD_INVAL_DEVTAB_ENTRY: + amdvi_inval_devtab_entry(s, cmd); + break; + case AMDVI_CMD_INVAL_AMDVI_PAGES: + amdvi_inval_pages(s, cmd); + break; + case AMDVI_CMD_INVAL_IOTLB_PAGES: + iommu_inval_iotlb(s, cmd); + break; + case AMDVI_CMD_INVAL_INTR_TABLE: + amdvi_inval_inttable(s, cmd); + break; + case AMDVI_CMD_PREFETCH_AMDVI_PAGES: + amdvi_prefetch_pages(s, cmd); + break; + case AMDVI_CMD_COMPLETE_PPR_REQUEST: + amdvi_complete_ppr(s, cmd); + break; + case AMDVI_CMD_INVAL_AMDVI_ALL: + amdvi_inval_all(s, cmd); + break; + default: + trace_amdvi_unhandled_command(extract64(cmd[1], 60, 4)); + /* log illegal command */ + amdvi_log_illegalcom_error(s, extract64(cmd[1], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } +} + +static void amdvi_cmdbuf_run(AMDVIState *s) +{ + if (!s->cmdbuf_enabled) { + trace_amdvi_command_error(amdvi_readq(s, AMDVI_MMIO_CONTROL)); + return; + } + + /* check if there is work to do. */ + while (s->cmdbuf_head != s->cmdbuf_tail) { + trace_amdvi_command_exec(s->cmdbuf_head, s->cmdbuf_tail, s->cmdbuf); + amdvi_cmdbuf_exec(s); + s->cmdbuf_head += AMDVI_COMMAND_SIZE; + amdvi_writeq_raw(s, AMDVI_MMIO_COMMAND_HEAD, s->cmdbuf_head); + + /* wrap head pointer */ + if (s->cmdbuf_head >= s->cmdbuf_len * AMDVI_COMMAND_SIZE) { + s->cmdbuf_head = 0; + } + } +} + +static void amdvi_mmio_trace(hwaddr addr, unsigned size) +{ + uint8_t index = (addr & ~0x2000) / 8; + + if ((addr & 0x2000)) { + /* high table */ + index = index >= AMDVI_MMIO_REGS_HIGH ? AMDVI_MMIO_REGS_HIGH : index; + trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07); + } else { + index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index; + trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07); + } +} + +static uint64_t amdvi_mmio_read(void *opaque, hwaddr addr, unsigned size) +{ + AMDVIState *s = opaque; + + uint64_t val = -1; + if (addr + size > AMDVI_MMIO_SIZE) { + trace_amdvi_mmio_read_invalid(AMDVI_MMIO_SIZE, addr, size); + return (uint64_t)-1; + } + + if (size == 2) { + val = amdvi_readw(s, addr); + } else if (size == 4) { + val = amdvi_readl(s, addr); + } else if (size == 8) { + val = amdvi_readq(s, addr); + } + amdvi_mmio_trace(addr, size); + + return val; +} + +static void amdvi_handle_control_write(AMDVIState *s) +{ + unsigned long control = amdvi_readq(s, AMDVI_MMIO_CONTROL); + s->enabled = !!(control & AMDVI_MMIO_CONTROL_AMDVIEN); + + s->ats_enabled = !!(control & AMDVI_MMIO_CONTROL_HTTUNEN); + s->evtlog_enabled = s->enabled && !!(control & + AMDVI_MMIO_CONTROL_EVENTLOGEN); + + s->evtlog_intr = !!(control & AMDVI_MMIO_CONTROL_EVENTINTEN); + s->completion_wait_intr = !!(control & AMDVI_MMIO_CONTROL_COMWAITINTEN); + s->cmdbuf_enabled = s->enabled && !!(control & + AMDVI_MMIO_CONTROL_CMDBUFLEN); + s->ga_enabled = !!(control & AMDVI_MMIO_CONTROL_GAEN); + + /* update the flags depending on the control register */ + if (s->cmdbuf_enabled) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_CMDBUF_RUN); + } else { + amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_CMDBUF_RUN); + } + if (s->evtlog_enabled) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_RUN); + } else { + amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_EVT_RUN); + } + + trace_amdvi_control_status(control); + amdvi_cmdbuf_run(s); +} + +static inline void amdvi_handle_devtab_write(AMDVIState *s) + +{ + uint64_t val = amdvi_readq(s, 
AMDVI_MMIO_DEVICE_TABLE); + s->devtab = (val & AMDVI_MMIO_DEVTAB_BASE_MASK); + + /* set device table length */ + s->devtab_len = ((val & AMDVI_MMIO_DEVTAB_SIZE_MASK) + 1 * + (AMDVI_MMIO_DEVTAB_SIZE_UNIT / + AMDVI_MMIO_DEVTAB_ENTRY_SIZE)); +} + +static inline void amdvi_handle_cmdhead_write(AMDVIState *s) +{ + s->cmdbuf_head = amdvi_readq(s, AMDVI_MMIO_COMMAND_HEAD) + & AMDVI_MMIO_CMDBUF_HEAD_MASK; + amdvi_cmdbuf_run(s); +} + +static inline void amdvi_handle_cmdbase_write(AMDVIState *s) +{ + s->cmdbuf = amdvi_readq(s, AMDVI_MMIO_COMMAND_BASE) + & AMDVI_MMIO_CMDBUF_BASE_MASK; + s->cmdbuf_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_CMDBUF_SIZE_BYTE) + & AMDVI_MMIO_CMDBUF_SIZE_MASK); + s->cmdbuf_head = s->cmdbuf_tail = 0; +} + +static inline void amdvi_handle_cmdtail_write(AMDVIState *s) +{ + s->cmdbuf_tail = amdvi_readq(s, AMDVI_MMIO_COMMAND_TAIL) + & AMDVI_MMIO_CMDBUF_TAIL_MASK; + amdvi_cmdbuf_run(s); +} + +static inline void amdvi_handle_excllim_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_EXCL_LIMIT); + s->excl_limit = (val & AMDVI_MMIO_EXCL_LIMIT_MASK) | + AMDVI_MMIO_EXCL_LIMIT_LOW; +} + +static inline void amdvi_handle_evtbase_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_BASE); + s->evtlog = val & AMDVI_MMIO_EVTLOG_BASE_MASK; + s->evtlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_EVTLOG_SIZE_BYTE) + & AMDVI_MMIO_EVTLOG_SIZE_MASK); +} + +static inline void amdvi_handle_evttail_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_TAIL); + s->evtlog_tail = val & AMDVI_MMIO_EVTLOG_TAIL_MASK; +} + +static inline void amdvi_handle_evthead_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_HEAD); + s->evtlog_head = val & AMDVI_MMIO_EVTLOG_HEAD_MASK; +} + +static inline void amdvi_handle_pprbase_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_BASE); + s->ppr_log = val & AMDVI_MMIO_PPRLOG_BASE_MASK; + s->pprlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_PPRLOG_SIZE_BYTE) + & AMDVI_MMIO_PPRLOG_SIZE_MASK); +} + +static inline void amdvi_handle_pprhead_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_HEAD); + s->pprlog_head = val & AMDVI_MMIO_PPRLOG_HEAD_MASK; +} + +static inline void amdvi_handle_pprtail_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_TAIL); + s->pprlog_tail = val & AMDVI_MMIO_PPRLOG_TAIL_MASK; +} + +/* FIXME: something might go wrong if System Software writes in chunks + * of one byte but linux writes in chunks of 4 bytes so currently it + * works correctly with linux but will definitely be busted if software + * reads/writes 8 bytes + */ +static void amdvi_mmio_reg_write(AMDVIState *s, unsigned size, uint64_t val, + hwaddr addr) +{ + if (size == 2) { + amdvi_writew(s, addr, val); + } else if (size == 4) { + amdvi_writel(s, addr, val); + } else if (size == 8) { + amdvi_writeq(s, addr, val); + } +} + +static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ + AMDVIState *s = opaque; + unsigned long offset = addr & 0x07; + + if (addr + size > AMDVI_MMIO_SIZE) { + trace_amdvi_mmio_write("error: addr outside region: max ", + (uint64_t)AMDVI_MMIO_SIZE, size, val, offset); + return; + } + + amdvi_mmio_trace(addr, size); + switch (addr & ~0x07) { + case AMDVI_MMIO_CONTROL: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_control_write(s); + break; + case AMDVI_MMIO_DEVICE_TABLE: + amdvi_mmio_reg_write(s, size, val, addr); + /* set device table address + * This also suffers from inability to tell 
whether software + * is done writing + */ + if (offset || (size == 8)) { + amdvi_handle_devtab_write(s); + } + break; + case AMDVI_MMIO_COMMAND_HEAD: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_cmdhead_write(s); + break; + case AMDVI_MMIO_COMMAND_BASE: + amdvi_mmio_reg_write(s, size, val, addr); + /* FIXME - make sure System Software has finished writing incase + * it writes in chucks less than 8 bytes in a robust way.As for + * now, this hacks works for the linux driver + */ + if (offset || (size == 8)) { + amdvi_handle_cmdbase_write(s); + } + break; + case AMDVI_MMIO_COMMAND_TAIL: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_cmdtail_write(s); + break; + case AMDVI_MMIO_EVENT_BASE: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_evtbase_write(s); + break; + case AMDVI_MMIO_EVENT_HEAD: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_evthead_write(s); + break; + case AMDVI_MMIO_EVENT_TAIL: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_evttail_write(s); + break; + case AMDVI_MMIO_EXCL_LIMIT: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_excllim_write(s); + break; + /* PPR log base - unused for now */ + case AMDVI_MMIO_PPR_BASE: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_pprbase_write(s); + break; + /* PPR log head - also unused for now */ + case AMDVI_MMIO_PPR_HEAD: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_pprhead_write(s); + break; + /* PPR log tail - unused for now */ + case AMDVI_MMIO_PPR_TAIL: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_pprtail_write(s); + break; + } +} + +static inline uint64_t amdvi_get_perms(uint64_t entry) +{ + return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >> + AMDVI_DEV_PERM_SHIFT; +} + +/* validate that reserved bits are honoured */ +static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid, + uint64_t *dte) +{ + if ((dte[0] & AMDVI_DTE_LOWER_QUAD_RESERVED) + || (dte[1] & AMDVI_DTE_MIDDLE_QUAD_RESERVED) + || (dte[2] & AMDVI_DTE_UPPER_QUAD_RESERVED) || dte[3]) { + amdvi_log_illegaldevtab_error(s, devid, + s->devtab + + devid * AMDVI_DEVTAB_ENTRY_SIZE, 0); + return false; + } + + return true; +} + +/* get a device table entry given the devid */ +static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry) +{ + uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE; + + if (dma_memory_read(&address_space_memory, s->devtab + offset, entry, + AMDVI_DEVTAB_ENTRY_SIZE)) { + trace_amdvi_dte_get_fail(s->devtab, offset); + /* log error accessing dte */ + amdvi_log_devtab_error(s, devid, s->devtab + offset, 0); + return false; + } + + *entry = le64_to_cpu(*entry); + if (!amdvi_validate_dte(s, devid, entry)) { + trace_amdvi_invalid_dte(entry[0]); + return false; + } + + return true; +} + +/* get pte translation mode */ +static inline uint8_t get_pte_translation_mode(uint64_t pte) +{ + return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK; +} + +static inline uint64_t pte_override_page_mask(uint64_t pte) +{ + uint8_t page_mask = 13; + uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) >> 12; + /* find the first zero bit */ + while (addr & 1) { + page_mask++; + addr = addr >> 1; + } + + return ~((1ULL << page_mask) - 1); +} + +static inline uint64_t pte_get_page_mask(uint64_t oldlevel) +{ + return ~((1UL << ((oldlevel * 9) + 3)) - 1); +} + +static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr, + uint16_t devid) +{ + uint64_t pte; + + if (dma_memory_read(&address_space_memory, pte_addr, &pte, sizeof(pte))) { + 
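A quick worked example of the mask arithmetic above (editor's illustration, not part of the patch): pte_get_page_mask() maps the level at which the page walk stopped to an address mask of level*9 + 3 low bits, i.e. 4KiB at level 1, 2MiB at level 2, 1GiB at level 3.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Same computation as pte_get_page_mask() above. */
static uint64_t page_mask_for_level(unsigned level)
{
    return ~((1ULL << (level * 9 + 3)) - 1);
}

int main(void)
{
    for (unsigned l = 1; l <= 3; l++) {
        /* prints ...fff000 (4KiB), ...e00000 (2MiB), ...c0000000 (1GiB) */
        printf("level %u: mask 0x%016" PRIx64 "\n", l, page_mask_for_level(l));
    }
    return 0;
}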
trace_amdvi_get_pte_hwerror(pte_addr); + amdvi_log_pagetab_error(s, devid, pte_addr, 0); + pte = 0; + return pte; + } + + pte = le64_to_cpu(pte); + return pte; +} + +static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte, + IOMMUTLBEntry *ret, unsigned perms, + hwaddr addr) +{ + unsigned level, present, pte_perms, oldlevel; + uint64_t pte = dte[0], pte_addr, page_mask; + + /* make sure the DTE has TV = 1 */ + if (pte & AMDVI_DEV_TRANSLATION_VALID) { + level = get_pte_translation_mode(pte); + if (level >= 7) { + trace_amdvi_mode_invalid(level, addr); + return; + } + if (level == 0) { + goto no_remap; + } + + /* we are at the leaf page table or page table encodes a huge page */ + while (level > 0) { + pte_perms = amdvi_get_perms(pte); + present = pte & 1; + if (!present || perms != (perms & pte_perms)) { + amdvi_page_fault(as->iommu_state, as->devfn, addr, perms); + trace_amdvi_page_fault(addr); + return; + } + + /* go to the next lower level */ + pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK; + /* add offset and load pte */ + pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3; + pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn); + if (!pte) { + return; + } + oldlevel = level; + level = get_pte_translation_mode(pte); + if (level == 0x7) { + break; + } + } + + if (level == 0x7) { + page_mask = pte_override_page_mask(pte); + } else { + page_mask = pte_get_page_mask(oldlevel); + } + + /* get access permissions from pte */ + ret->iova = addr & page_mask; + ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask; + ret->addr_mask = ~page_mask; + ret->perm = amdvi_get_perms(pte); + return; + } +no_remap: + ret->iova = addr & AMDVI_PAGE_MASK_4K; + ret->translated_addr = addr & AMDVI_PAGE_MASK_4K; + ret->addr_mask = ~AMDVI_PAGE_MASK_4K; + ret->perm = amdvi_get_perms(pte); +} + +static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, + bool is_write, IOMMUTLBEntry *ret) +{ + AMDVIState *s = as->iommu_state; + uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn); + AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid); + uint64_t entry[4]; + + if (iotlb_entry) { + trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid), + PCI_FUNC(devid), addr, iotlb_entry->translated_addr); + ret->iova = addr & ~iotlb_entry->page_mask; + ret->translated_addr = iotlb_entry->translated_addr; + ret->addr_mask = iotlb_entry->page_mask; + ret->perm = iotlb_entry->perms; + return; + } + + if (!amdvi_get_dte(s, devid, entry)) { + return; + } + + /* devices with V = 0 are not translated */ + if (!(entry[0] & AMDVI_DEV_VALID)) { + goto out; + } + + amdvi_page_walk(as, entry, ret, + is_write ? 
AMDVI_PERM_WRITE : AMDVI_PERM_READ, addr); + + amdvi_update_iotlb(s, devid, addr, *ret, + entry[1] & AMDVI_DEV_DOMID_ID_MASK); + return; + +out: + ret->iova = addr & AMDVI_PAGE_MASK_4K; + ret->translated_addr = addr & AMDVI_PAGE_MASK_4K; + ret->addr_mask = ~AMDVI_PAGE_MASK_4K; + ret->perm = IOMMU_RW; +} + +static inline bool amdvi_is_interrupt_addr(hwaddr addr) +{ + return addr >= AMDVI_INT_ADDR_FIRST && addr <= AMDVI_INT_ADDR_LAST; +} + +static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr, + IOMMUAccessFlags flag, int iommu_idx) +{ + AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); + AMDVIState *s = as->iommu_state; + IOMMUTLBEntry ret = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = 0, + .addr_mask = ~(hwaddr)0, + .perm = IOMMU_NONE + }; + + if (!s->enabled) { + /* AMDVI disabled - corresponds to iommu=off not + * failure to provide any parameter + */ + ret.iova = addr & AMDVI_PAGE_MASK_4K; + ret.translated_addr = addr & AMDVI_PAGE_MASK_4K; + ret.addr_mask = ~AMDVI_PAGE_MASK_4K; + ret.perm = IOMMU_RW; + return ret; + } else if (amdvi_is_interrupt_addr(addr)) { + ret.iova = addr & AMDVI_PAGE_MASK_4K; + ret.translated_addr = addr & AMDVI_PAGE_MASK_4K; + ret.addr_mask = ~AMDVI_PAGE_MASK_4K; + ret.perm = IOMMU_WO; + return ret; + } + + amdvi_do_translate(as, addr, flag & IOMMU_WO, &ret); + trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn), + PCI_FUNC(as->devfn), addr, ret.translated_addr); + return ret; +} + +static int amdvi_get_irte(AMDVIState *s, MSIMessage *origin, uint64_t *dte, + union irte *irte, uint16_t devid) +{ + uint64_t irte_root, offset; + + irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK; + offset = (origin->data & AMDVI_IRTE_OFFSET) << 2; + + trace_amdvi_ir_irte(irte_root, offset); + + if (dma_memory_read(&address_space_memory, irte_root + offset, + irte, sizeof(*irte))) { + trace_amdvi_ir_err("failed to get irte"); + return -AMDVI_IR_GET_IRTE; + } + + trace_amdvi_ir_irte_val(irte->val); + + return 0; +} + +static int amdvi_int_remap_legacy(AMDVIState *iommu, + MSIMessage *origin, + MSIMessage *translated, + uint64_t *dte, + X86IOMMUIrq *irq, + uint16_t sid) +{ + int ret; + union irte irte; + + /* get interrupt remapping table */ + ret = amdvi_get_irte(iommu, origin, dte, &irte, sid); + if (ret < 0) { + return ret; + } + + if (!irte.fields.valid) { + trace_amdvi_ir_target_abort("RemapEn is disabled"); + return -AMDVI_IR_TARGET_ABORT; + } + + if (irte.fields.guest_mode) { + error_report_once("guest mode is not zero"); + return -AMDVI_IR_ERR; + } + + if (irte.fields.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) { + error_report_once("reserved int_type"); + return -AMDVI_IR_ERR; + } + + irq->delivery_mode = irte.fields.int_type; + irq->vector = irte.fields.vector; + irq->dest_mode = irte.fields.dm; + irq->redir_hint = irte.fields.rq_eoi; + irq->dest = irte.fields.destination; + + return 0; +} + +static int amdvi_get_irte_ga(AMDVIState *s, MSIMessage *origin, uint64_t *dte, + struct irte_ga *irte, uint16_t devid) +{ + uint64_t irte_root, offset; + + irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK; + offset = (origin->data & AMDVI_IRTE_OFFSET) << 4; + trace_amdvi_ir_irte(irte_root, offset); + + if (dma_memory_read(&address_space_memory, irte_root + offset, + irte, sizeof(*irte))) { + trace_amdvi_ir_err("failed to get irte_ga"); + return -AMDVI_IR_GET_IRTE; + } + + trace_amdvi_ir_irte_ga_val(irte->hi.val, irte->lo.val); + return 0; +} + +static int amdvi_int_remap_ga(AMDVIState *iommu, + MSIMessage 
*origin, + MSIMessage *translated, + uint64_t *dte, + X86IOMMUIrq *irq, + uint16_t sid) +{ + int ret; + struct irte_ga irte; + + /* get interrupt remapping table */ + ret = amdvi_get_irte_ga(iommu, origin, dte, &irte, sid); + if (ret < 0) { + return ret; + } + + if (!irte.lo.fields_remap.valid) { + trace_amdvi_ir_target_abort("RemapEn is disabled"); + return -AMDVI_IR_TARGET_ABORT; + } + + if (irte.lo.fields_remap.guest_mode) { + error_report_once("guest mode is not zero"); + return -AMDVI_IR_ERR; + } + + if (irte.lo.fields_remap.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) { + error_report_once("reserved int_type is set"); + return -AMDVI_IR_ERR; + } + + irq->delivery_mode = irte.lo.fields_remap.int_type; + irq->vector = irte.hi.fields.vector; + irq->dest_mode = irte.lo.fields_remap.dm; + irq->redir_hint = irte.lo.fields_remap.rq_eoi; + irq->dest = irte.lo.fields_remap.destination; + + return 0; +} + +static int __amdvi_int_remap_msi(AMDVIState *iommu, + MSIMessage *origin, + MSIMessage *translated, + uint64_t *dte, + X86IOMMUIrq *irq, + uint16_t sid) +{ + int ret; + uint8_t int_ctl; + + int_ctl = (dte[2] >> AMDVI_IR_INTCTL_SHIFT) & 3; + trace_amdvi_ir_intctl(int_ctl); + + switch (int_ctl) { + case AMDVI_IR_INTCTL_PASS: + memcpy(translated, origin, sizeof(*origin)); + return 0; + case AMDVI_IR_INTCTL_REMAP: + break; + case AMDVI_IR_INTCTL_ABORT: + trace_amdvi_ir_target_abort("int_ctl abort"); + return -AMDVI_IR_TARGET_ABORT; + default: + trace_amdvi_ir_err("int_ctl reserved"); + return -AMDVI_IR_ERR; + } + + if (iommu->ga_enabled) { + ret = amdvi_int_remap_ga(iommu, origin, translated, dte, irq, sid); + } else { + ret = amdvi_int_remap_legacy(iommu, origin, translated, dte, irq, sid); + } + + return ret; +} + +/* Interrupt remapping for MSI/MSI-X entry */ +static int amdvi_int_remap_msi(AMDVIState *iommu, + MSIMessage *origin, + MSIMessage *translated, + uint16_t sid) +{ + int ret = 0; + uint64_t pass = 0; + uint64_t dte[4] = { 0 }; + X86IOMMUIrq irq = { 0 }; + uint8_t dest_mode, delivery_mode; + + assert(origin && translated); + + /* + * When IOMMU is enabled, interrupt remap request will come either from + * IO-APIC or PCI device. If interrupt is from PCI device then it will + * have a valid requester id but if the interrupt is from IO-APIC + * then requester id will be invalid. + */ + if (sid == X86_IOMMU_SID_INVALID) { + sid = AMDVI_IOAPIC_SB_DEVID; + } + + trace_amdvi_ir_remap_msi_req(origin->address, origin->data, sid); + + /* check if device table entry is set before we go further. */ + if (!iommu || !iommu->devtab_len) { + memcpy(translated, origin, sizeof(*origin)); + goto out; + } + + if (!amdvi_get_dte(iommu, sid, dte)) { + return -AMDVI_IR_ERR; + } + + /* Check if IR is enabled in DTE */ + if (!(dte[2] & AMDVI_IR_REMAP_ENABLE)) { + memcpy(translated, origin, sizeof(*origin)); + goto out; + } + + /* validate that we are configure with intremap=on */ + if (!x86_iommu_ir_supported(X86_IOMMU_DEVICE(iommu))) { + trace_amdvi_err("Interrupt remapping is enabled in the guest but " + "not in the host. 
Use intremap=on to enable interrupt " + "remapping in amd-iommu."); + return -AMDVI_IR_ERR; + } + + if (origin->address & AMDVI_MSI_ADDR_HI_MASK) { + trace_amdvi_err("MSI address high 32 bits non-zero when " + "Interrupt Remapping enabled."); + return -AMDVI_IR_ERR; + } + + if ((origin->address & AMDVI_MSI_ADDR_LO_MASK) != APIC_DEFAULT_ADDRESS) { + trace_amdvi_err("MSI is not from IOAPIC."); + return -AMDVI_IR_ERR; + } + + /* + * The MSI data register [10:8] are used to get the upstream interrupt type. + * + * See MSI/MSI-X format: + * https://pdfs.semanticscholar.org/presentation/9420/c279e942eca568157711ef5c92b800c40a79.pdf + * (page 5) + */ + delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7; + + switch (delivery_mode) { + case AMDVI_IOAPIC_INT_TYPE_FIXED: + case AMDVI_IOAPIC_INT_TYPE_ARBITRATED: + trace_amdvi_ir_delivery_mode("fixed/arbitrated"); + ret = __amdvi_int_remap_msi(iommu, origin, translated, dte, &irq, sid); + if (ret < 0) { + goto remap_fail; + } else { + /* Translate IRQ to MSI messages */ + x86_iommu_irq_to_msi_message(&irq, translated); + goto out; + } + break; + case AMDVI_IOAPIC_INT_TYPE_SMI: + error_report("SMI is not supported!"); + ret = -AMDVI_IR_ERR; + break; + case AMDVI_IOAPIC_INT_TYPE_NMI: + pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK; + trace_amdvi_ir_delivery_mode("nmi"); + break; + case AMDVI_IOAPIC_INT_TYPE_INIT: + pass = dte[3] & AMDVI_DEV_INT_PASS_MASK; + trace_amdvi_ir_delivery_mode("init"); + break; + case AMDVI_IOAPIC_INT_TYPE_EINT: + pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK; + trace_amdvi_ir_delivery_mode("eint"); + break; + default: + trace_amdvi_ir_delivery_mode("unsupported delivery_mode"); + ret = -AMDVI_IR_ERR; + break; + } + + if (ret < 0) { + goto remap_fail; + } + + /* + * The MSI address register bit[2] is used to get the destination + * mode. The dest_mode 1 is valid for fixed and arbitrated interrupts + * only. 
+ */ + dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1; + if (dest_mode) { + trace_amdvi_ir_err("invalid dest_mode"); + ret = -AMDVI_IR_ERR; + goto remap_fail; + } + + if (pass) { + memcpy(translated, origin, sizeof(*origin)); + } else { + trace_amdvi_ir_err("passthrough is not enabled"); + ret = -AMDVI_IR_ERR; + goto remap_fail; + } + +out: + trace_amdvi_ir_remap_msi(origin->address, origin->data, + translated->address, translated->data); + return 0; + +remap_fail: + return ret; +} + +static int amdvi_int_remap(X86IOMMUState *iommu, + MSIMessage *origin, + MSIMessage *translated, + uint16_t sid) +{ + return amdvi_int_remap_msi(AMD_IOMMU_DEVICE(iommu), origin, + translated, sid); +} + +static MemTxResult amdvi_mem_ir_write(void *opaque, hwaddr addr, + uint64_t value, unsigned size, + MemTxAttrs attrs) +{ + int ret; + MSIMessage from = { 0, 0 }, to = { 0, 0 }; + uint16_t sid = AMDVI_IOAPIC_SB_DEVID; + + from.address = (uint64_t) addr + AMDVI_INT_ADDR_FIRST; + from.data = (uint32_t) value; + + trace_amdvi_mem_ir_write_req(addr, value, size); + + if (!attrs.unspecified) { + /* We have explicit Source ID */ + sid = attrs.requester_id; + } + + ret = amdvi_int_remap_msi(opaque, &from, &to, sid); + if (ret < 0) { + /* TODO: log the event using IOMMU log event interface */ + error_report_once("failed to remap interrupt from devid 0x%x", sid); + return MEMTX_ERROR; + } + + apic_get_class()->send_msi(&to); + + trace_amdvi_mem_ir_write(to.address, to.data); + return MEMTX_OK; +} + +static MemTxResult amdvi_mem_ir_read(void *opaque, hwaddr addr, + uint64_t *data, unsigned size, + MemTxAttrs attrs) +{ + return MEMTX_OK; +} + +static const MemoryRegionOps amdvi_ir_ops = { + .read_with_attrs = amdvi_mem_ir_read, + .write_with_attrs = amdvi_mem_ir_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, + .valid = { + .min_access_size = 4, + .max_access_size = 4, + } +}; + +static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) +{ + char name[128]; + AMDVIState *s = opaque; + AMDVIAddressSpace **iommu_as, *amdvi_dev_as; + int bus_num = pci_bus_num(bus); + + iommu_as = s->address_spaces[bus_num]; + + /* allocate memory during the first run */ + if (!iommu_as) { + iommu_as = g_malloc0(sizeof(AMDVIAddressSpace *) * PCI_DEVFN_MAX); + s->address_spaces[bus_num] = iommu_as; + } + + /* set up AMD-Vi region */ + if (!iommu_as[devfn]) { + snprintf(name, sizeof(name), "amd_iommu_devfn_%d", devfn); + + iommu_as[devfn] = g_malloc0(sizeof(AMDVIAddressSpace)); + iommu_as[devfn]->bus_num = (uint8_t)bus_num; + iommu_as[devfn]->devfn = (uint8_t)devfn; + iommu_as[devfn]->iommu_state = s; + + amdvi_dev_as = iommu_as[devfn]; + + /* + * Memory region relationships looks like (Address range shows + * only lower 32 bits to make it short in length...): + * + * |-----------------+-------------------+----------| + * | Name | Address range | Priority | + * |-----------------+-------------------+----------+ + * | amdvi_root | 00000000-ffffffff | 0 | + * | amdvi_iommu | 00000000-ffffffff | 1 | + * | amdvi_iommu_ir | fee00000-feefffff | 64 | + * |-----------------+-------------------+----------| + */ + memory_region_init_iommu(&amdvi_dev_as->iommu, + sizeof(amdvi_dev_as->iommu), + TYPE_AMD_IOMMU_MEMORY_REGION, + OBJECT(s), + "amd_iommu", UINT64_MAX); + memory_region_init(&amdvi_dev_as->root, OBJECT(s), + "amdvi_root", UINT64_MAX); + address_space_init(&amdvi_dev_as->as, &amdvi_dev_as->root, name); + 
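Illustrative sketch (not part of the patch above): the priority column in the comment table follows the usual overlap rule, i.e. for a given DMA address the highest-priority subregion covering it wins, which is how accesses to the 0xfee00000 window reach amdvi_ir_ops while everything else goes through the IOMMU translate region. A self-contained model of that dispatch:

#include <stddef.h>
#include <stdint.h>

struct region { const char *name; uint64_t base, size; int prio; };

/* Pick the highest-priority region containing addr, NULL if none covers it. */
static const struct region *dispatch(const struct region *r, int n, uint64_t addr)
{
    const struct region *best = NULL;
    for (int i = 0; i < n; i++) {
        if (addr >= r[i].base && addr - r[i].base < r[i].size &&
            (!best || r[i].prio > best->prio)) {
            best = &r[i];
        }
    }
    return best;
}

/*
 * With { "amdvi_iommu", 0, UINT64_MAX, 1 } and
 *      { "amdvi_iommu_ir", 0xfee00000, 0x100000, 64 },
 * dispatch() on 0xfee00020 picks "amdvi_iommu_ir".
 */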
memory_region_init_io(&amdvi_dev_as->iommu_ir, OBJECT(s), + &amdvi_ir_ops, s, "amd_iommu_ir", + AMDVI_INT_ADDR_SIZE); + memory_region_add_subregion_overlap(&amdvi_dev_as->root, + AMDVI_INT_ADDR_FIRST, + &amdvi_dev_as->iommu_ir, + 64); + memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0, + MEMORY_REGION(&amdvi_dev_as->iommu), + 1); + } + return &iommu_as[devfn]->as; +} + +static const MemoryRegionOps mmio_mem_ops = { + .read = amdvi_mmio_read, + .write = amdvi_mmio_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = false, + }, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + } +}; + +static int amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new, + Error **errp) +{ + AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); + + if (new & IOMMU_NOTIFIER_MAP) { + error_setg(errp, + "device %02x.%02x.%x requires iommu notifier which is not " + "currently supported", as->bus_num, PCI_SLOT(as->devfn), + PCI_FUNC(as->devfn)); + return -EINVAL; + } + return 0; +} + +static void amdvi_init(AMDVIState *s) +{ + amdvi_iotlb_reset(s); + + s->devtab_len = 0; + s->cmdbuf_len = 0; + s->cmdbuf_head = 0; + s->cmdbuf_tail = 0; + s->evtlog_head = 0; + s->evtlog_tail = 0; + s->excl_enabled = false; + s->excl_allow = false; + s->mmio_enabled = false; + s->enabled = false; + s->ats_enabled = false; + s->cmdbuf_enabled = false; + + /* reset MMIO */ + memset(s->mmior, 0, AMDVI_MMIO_SIZE); + amdvi_set_quad(s, AMDVI_MMIO_EXT_FEATURES, AMDVI_EXT_FEATURES, + 0xffffffffffffffef, 0); + amdvi_set_quad(s, AMDVI_MMIO_STATUS, 0, 0x98, 0x67); + + /* reset device ident */ + pci_config_set_vendor_id(s->pci.dev.config, PCI_VENDOR_ID_AMD); + pci_config_set_prog_interface(s->pci.dev.config, 00); + pci_config_set_device_id(s->pci.dev.config, s->devid); + pci_config_set_class(s->pci.dev.config, 0x0806); + + /* reset AMDVI specific capabilities, all r/o */ + pci_set_long(s->pci.dev.config + s->capab_offset, AMDVI_CAPAB_FEATURES); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_LOW, + s->mmio.addr & ~(0xffff0000)); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH, + (s->mmio.addr & ~(0xffff)) >> 16); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_RANGE, + 0xff000000); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC, 0); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC, + AMDVI_MAX_PH_ADDR | AMDVI_MAX_GVA_ADDR | AMDVI_MAX_VA_ADDR); +} + +static void amdvi_sysbus_reset(DeviceState *dev) +{ + AMDVIState *s = AMD_IOMMU_DEVICE(dev); + + msi_reset(&s->pci.dev); + amdvi_init(s); +} + +static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) +{ + int ret = 0; + AMDVIState *s = AMD_IOMMU_DEVICE(dev); + MachineState *ms = MACHINE(qdev_get_machine()); + PCMachineState *pcms = PC_MACHINE(ms); + X86MachineState *x86ms = X86_MACHINE(ms); + PCIBus *bus = pcms->bus; + + s->iotlb = g_hash_table_new_full(amdvi_uint64_hash, + amdvi_uint64_equal, g_free, g_free); + + /* This device should take care of IOMMU PCI properties */ + if (!qdev_realize(DEVICE(&s->pci), &bus->qbus, errp)) { + return; + } + ret = pci_add_capability(&s->pci.dev, AMDVI_CAPAB_ID_SEC, 0, + AMDVI_CAPAB_SIZE, errp); + if (ret < 0) { + return; + } + s->capab_offset = ret; + + ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_MSI, 0, + AMDVI_CAPAB_REG_SIZE, errp); + if (ret < 0) { + return; + } + ret = 
pci_add_capability(&s->pci.dev, PCI_CAP_ID_HT, 0, + AMDVI_CAPAB_REG_SIZE, errp); + if (ret < 0) { + return; + } + + /* Pseudo address space under root PCI bus. */ + x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID); + + /* set up MMIO */ + memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio", + AMDVI_MMIO_SIZE); + + sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio); + sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR); + pci_setup_iommu(bus, amdvi_host_dma_iommu, s); + s->devid = object_property_get_int(OBJECT(&s->pci), "addr", &error_abort); + msi_init(&s->pci.dev, 0, 1, true, false, errp); + amdvi_init(s); +} + +static const VMStateDescription vmstate_amdvi_sysbus = { + .name = "amd-iommu", + .unmigratable = 1 +}; + +static void amdvi_sysbus_instance_init(Object *klass) +{ + AMDVIState *s = AMD_IOMMU_DEVICE(klass); + + object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI); +} + +static void amdvi_sysbus_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + X86IOMMUClass *dc_class = X86_IOMMU_DEVICE_CLASS(klass); + + dc->reset = amdvi_sysbus_reset; + dc->vmsd = &vmstate_amdvi_sysbus; + dc->hotpluggable = false; + dc_class->realize = amdvi_sysbus_realize; + dc_class->int_remap = amdvi_int_remap; + /* Supported by the pc-q35-* machine types */ + dc->user_creatable = true; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device"; +} + +static const TypeInfo amdvi_sysbus = { + .name = TYPE_AMD_IOMMU_DEVICE, + .parent = TYPE_X86_IOMMU_DEVICE, + .instance_size = sizeof(AMDVIState), + .instance_init = amdvi_sysbus_instance_init, + .class_init = amdvi_sysbus_class_init +}; + +static void amdvi_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device"; +} + +static const TypeInfo amdvi_pci = { + .name = TYPE_AMD_IOMMU_PCI, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(AMDVIPCIState), + .class_init = amdvi_pci_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, + }, +}; + +static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, void *data) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); + + imrc->translate = amdvi_translate; + imrc->notify_flag_changed = amdvi_iommu_notify_flag_changed; +} + +static const TypeInfo amdvi_iommu_memory_region_info = { + .parent = TYPE_IOMMU_MEMORY_REGION, + .name = TYPE_AMD_IOMMU_MEMORY_REGION, + .class_init = amdvi_iommu_memory_region_class_init, +}; + +static void amdvi_register_types(void) +{ + type_register_static(&amdvi_pci); + type_register_static(&amdvi_sysbus); + type_register_static(&amdvi_iommu_memory_region_info); +} + +type_init(amdvi_register_types); diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h new file mode 100644 index 000000000..79d38a3e4 --- /dev/null +++ b/hw/i386/amd_iommu.h @@ -0,0 +1,372 @@ +/* + * QEMU emulation of an AMD IOMMU (AMD-Vi) + * + * Copyright (C) 2011 Eduard - Gabriel Munteanu + * Copyright (C) 2015, 2016 David Kiarie Kahurani + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef AMD_IOMMU_H +#define AMD_IOMMU_H + +#include "hw/pci/pci.h" +#include "hw/i386/x86-iommu.h" +#include "qom/object.h" + +/* Capability registers */ +#define AMDVI_CAPAB_BAR_LOW 0x04 +#define AMDVI_CAPAB_BAR_HIGH 0x08 +#define AMDVI_CAPAB_RANGE 0x0C +#define AMDVI_CAPAB_MISC 0x10 + +#define AMDVI_CAPAB_SIZE 0x18 +#define AMDVI_CAPAB_REG_SIZE 0x04 + +/* Capability header data */ +#define AMDVI_CAPAB_ID_SEC 0xf +#define AMDVI_CAPAB_FLAT_EXT (1 << 28) +#define AMDVI_CAPAB_EFR_SUP (1 << 27) +#define AMDVI_CAPAB_FLAG_NPCACHE (1 << 26) +#define AMDVI_CAPAB_FLAG_HTTUNNEL (1 << 25) +#define AMDVI_CAPAB_FLAG_IOTLBSUP (1 << 24) +#define AMDVI_CAPAB_INIT_TYPE (3 << 16) + +/* No. of used MMIO registers */ +#define AMDVI_MMIO_REGS_HIGH 7 +#define AMDVI_MMIO_REGS_LOW 8 + +/* MMIO registers */ +#define AMDVI_MMIO_DEVICE_TABLE 0x0000 +#define AMDVI_MMIO_COMMAND_BASE 0x0008 +#define AMDVI_MMIO_EVENT_BASE 0x0010 +#define AMDVI_MMIO_CONTROL 0x0018 +#define AMDVI_MMIO_EXCL_BASE 0x0020 +#define AMDVI_MMIO_EXCL_LIMIT 0x0028 +#define AMDVI_MMIO_EXT_FEATURES 0x0030 +#define AMDVI_MMIO_COMMAND_HEAD 0x2000 +#define AMDVI_MMIO_COMMAND_TAIL 0x2008 +#define AMDVI_MMIO_EVENT_HEAD 0x2010 +#define AMDVI_MMIO_EVENT_TAIL 0x2018 +#define AMDVI_MMIO_STATUS 0x2020 +#define AMDVI_MMIO_PPR_BASE 0x0038 +#define AMDVI_MMIO_PPR_HEAD 0x2030 +#define AMDVI_MMIO_PPR_TAIL 0x2038 + +#define AMDVI_MMIO_SIZE 0x4000 + +#define AMDVI_MMIO_DEVTAB_SIZE_MASK ((1ULL << 12) - 1) +#define AMDVI_MMIO_DEVTAB_BASE_MASK (((1ULL << 52) - 1) & ~ \ + AMDVI_MMIO_DEVTAB_SIZE_MASK) +#define AMDVI_MMIO_DEVTAB_ENTRY_SIZE 32 +#define AMDVI_MMIO_DEVTAB_SIZE_UNIT 4096 + +/* some of this are similar but just for readability */ +#define AMDVI_MMIO_CMDBUF_SIZE_BYTE (AMDVI_MMIO_COMMAND_BASE + 7) +#define AMDVI_MMIO_CMDBUF_SIZE_MASK 0x0f +#define AMDVI_MMIO_CMDBUF_BASE_MASK AMDVI_MMIO_DEVTAB_BASE_MASK +#define AMDVI_MMIO_CMDBUF_HEAD_MASK (((1ULL << 19) - 1) & ~0x0f) +#define AMDVI_MMIO_CMDBUF_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK + +#define AMDVI_MMIO_EVTLOG_SIZE_BYTE (AMDVI_MMIO_EVENT_BASE + 7) +#define AMDVI_MMIO_EVTLOG_SIZE_MASK AMDVI_MMIO_CMDBUF_SIZE_MASK +#define AMDVI_MMIO_EVTLOG_BASE_MASK AMDVI_MMIO_CMDBUF_BASE_MASK +#define AMDVI_MMIO_EVTLOG_HEAD_MASK (((1ULL << 19) - 1) & ~0x0f) +#define AMDVI_MMIO_EVTLOG_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK + +#define AMDVI_MMIO_PPRLOG_SIZE_BYTE (AMDVI_MMIO_EVENT_BASE + 7) +#define AMDVI_MMIO_PPRLOG_HEAD_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK +#define AMDVI_MMIO_PPRLOG_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK +#define AMDVI_MMIO_PPRLOG_BASE_MASK AMDVI_MMIO_EVTLOG_BASE_MASK +#define AMDVI_MMIO_PPRLOG_SIZE_MASK AMDVI_MMIO_EVTLOG_SIZE_MASK + +#define AMDVI_MMIO_EXCL_ENABLED_MASK (1ULL << 0) +#define AMDVI_MMIO_EXCL_ALLOW_MASK (1ULL << 1) +#define AMDVI_MMIO_EXCL_LIMIT_MASK AMDVI_MMIO_DEVTAB_BASE_MASK +#define AMDVI_MMIO_EXCL_LIMIT_LOW 0xfff + +/* mmio control register flags */ +#define AMDVI_MMIO_CONTROL_AMDVIEN (1ULL << 0) +#define AMDVI_MMIO_CONTROL_HTTUNEN (1ULL << 1) +#define AMDVI_MMIO_CONTROL_EVENTLOGEN (1ULL << 2) +#define AMDVI_MMIO_CONTROL_EVENTINTEN (1ULL << 3) +#define AMDVI_MMIO_CONTROL_COMWAITINTEN (1ULL << 4) 
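Editor's illustration (not part of the patch above): the *_SIZE_BYTE offsets point at byte 7 of the base registers, and the low four bits of that byte give a power of two; amdvi_handle_cmdbase_write() and amdvi_handle_evtbase_write() earlier in this diff expand that value n into a ring of 2^n entries. For the command ring:

#include <stdint.h>

#define CMDBUF_SIZE_MASK 0x0f   /* AMDVI_MMIO_CMDBUF_SIZE_MASK */
#define COMMAND_SIZE     16     /* AMDVI_COMMAND_SIZE, bytes per entry */

/* Bytes of guest memory backing the command ring for a given size byte. */
static uint64_t cmdbuf_bytes(uint8_t size_byte)
{
    uint64_t entries = 1ULL << (size_byte & CMDBUF_SIZE_MASK);
    return entries * COMMAND_SIZE;   /* e.g. size byte 8 -> 256 * 16 = 4096 */
}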
+#define AMDVI_MMIO_CONTROL_CMDBUFLEN (1ULL << 12) +#define AMDVI_MMIO_CONTROL_GAEN (1ULL << 17) + +/* MMIO status register bits */ +#define AMDVI_MMIO_STATUS_CMDBUF_RUN (1 << 4) +#define AMDVI_MMIO_STATUS_EVT_RUN (1 << 3) +#define AMDVI_MMIO_STATUS_COMP_INT (1 << 2) +#define AMDVI_MMIO_STATUS_EVT_OVF (1 << 0) + +#define AMDVI_CMDBUF_ID_BYTE 0x07 +#define AMDVI_CMDBUF_ID_RSHIFT 4 + +#define AMDVI_CMD_COMPLETION_WAIT 0x01 +#define AMDVI_CMD_INVAL_DEVTAB_ENTRY 0x02 +#define AMDVI_CMD_INVAL_AMDVI_PAGES 0x03 +#define AMDVI_CMD_INVAL_IOTLB_PAGES 0x04 +#define AMDVI_CMD_INVAL_INTR_TABLE 0x05 +#define AMDVI_CMD_PREFETCH_AMDVI_PAGES 0x06 +#define AMDVI_CMD_COMPLETE_PPR_REQUEST 0x07 +#define AMDVI_CMD_INVAL_AMDVI_ALL 0x08 + +#define AMDVI_DEVTAB_ENTRY_SIZE 32 + +/* Device table entry bits 0:63 */ +#define AMDVI_DEV_VALID (1ULL << 0) +#define AMDVI_DEV_TRANSLATION_VALID (1ULL << 1) +#define AMDVI_DEV_MODE_MASK 0x7 +#define AMDVI_DEV_MODE_RSHIFT 9 +#define AMDVI_DEV_PT_ROOT_MASK 0xffffffffff000 +#define AMDVI_DEV_PT_ROOT_RSHIFT 12 +#define AMDVI_DEV_PERM_SHIFT 61 +#define AMDVI_DEV_PERM_READ (1ULL << 61) +#define AMDVI_DEV_PERM_WRITE (1ULL << 62) + +/* Device table entry bits 64:127 */ +#define AMDVI_DEV_DOMID_ID_MASK ((1ULL << 16) - 1) + +/* Event codes and flags, as stored in the info field */ +#define AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY (0x1U << 12) +#define AMDVI_EVENT_IOPF (0x2U << 12) +#define AMDVI_EVENT_IOPF_I (1U << 3) +#define AMDVI_EVENT_DEV_TAB_HW_ERROR (0x3U << 12) +#define AMDVI_EVENT_PAGE_TAB_HW_ERROR (0x4U << 12) +#define AMDVI_EVENT_ILLEGAL_COMMAND_ERROR (0x5U << 12) +#define AMDVI_EVENT_COMMAND_HW_ERROR (0x6U << 12) + +#define AMDVI_EVENT_LEN 16 +#define AMDVI_PERM_READ (1 << 0) +#define AMDVI_PERM_WRITE (1 << 1) + +#define AMDVI_FEATURE_PREFETCH (1ULL << 0) /* page prefetch */ +#define AMDVI_FEATURE_PPR (1ULL << 1) /* PPR Support */ +#define AMDVI_FEATURE_GT (1ULL << 4) /* Guest Translation */ +#define AMDVI_FEATURE_IA (1ULL << 6) /* inval all support */ +#define AMDVI_FEATURE_GA (1ULL << 7) /* guest VAPIC support */ +#define AMDVI_FEATURE_HE (1ULL << 8) /* hardware error regs */ +#define AMDVI_FEATURE_PC (1ULL << 9) /* Perf counters */ + +/* reserved DTE bits */ +#define AMDVI_DTE_LOWER_QUAD_RESERVED 0x80300000000000fc +#define AMDVI_DTE_MIDDLE_QUAD_RESERVED 0x0000000000000100 +#define AMDVI_DTE_UPPER_QUAD_RESERVED 0x08f0000000000000 + +/* AMDVI paging mode */ +#define AMDVI_GATS_MODE (2ULL << 12) +#define AMDVI_HATS_MODE (2ULL << 10) + +/* IOTLB */ +#define AMDVI_IOTLB_MAX_SIZE 1024 +#define AMDVI_DEVID_SHIFT 36 + +/* extended feature support */ +#define AMDVI_EXT_FEATURES (AMDVI_FEATURE_PREFETCH | AMDVI_FEATURE_PPR | \ + AMDVI_FEATURE_IA | AMDVI_FEATURE_GT | AMDVI_FEATURE_HE | \ + AMDVI_GATS_MODE | AMDVI_HATS_MODE | AMDVI_FEATURE_GA) + +/* capabilities header */ +#define AMDVI_CAPAB_FEATURES (AMDVI_CAPAB_FLAT_EXT | \ + AMDVI_CAPAB_FLAG_NPCACHE | AMDVI_CAPAB_FLAG_IOTLBSUP \ + | AMDVI_CAPAB_ID_SEC | AMDVI_CAPAB_INIT_TYPE | \ + AMDVI_CAPAB_FLAG_HTTUNNEL | AMDVI_CAPAB_EFR_SUP) + +/* AMDVI default address */ +#define AMDVI_BASE_ADDR 0xfed80000 + +/* page management constants */ +#define AMDVI_PAGE_SHIFT 12 +#define AMDVI_PAGE_SIZE (1ULL << AMDVI_PAGE_SHIFT) + +#define AMDVI_PAGE_SHIFT_4K 12 +#define AMDVI_PAGE_MASK_4K (~((1ULL << AMDVI_PAGE_SHIFT_4K) - 1)) + +#define AMDVI_MAX_VA_ADDR (48UL << 5) +#define AMDVI_MAX_PH_ADDR (40UL << 8) +#define AMDVI_MAX_GVA_ADDR (48UL << 15) + +/* Completion Wait data size */ +#define AMDVI_COMPLETION_DATA_SIZE 8 + +#define AMDVI_COMMAND_SIZE 16 +/* 
Completion Wait data size */ +#define AMDVI_COMPLETION_DATA_SIZE 8 + +#define AMDVI_COMMAND_SIZE 16 + +#define AMDVI_INT_ADDR_FIRST 0xfee00000 +#define AMDVI_INT_ADDR_LAST 0xfeefffff +#define AMDVI_INT_ADDR_SIZE (AMDVI_INT_ADDR_LAST - AMDVI_INT_ADDR_FIRST + 1) +#define AMDVI_MSI_ADDR_HI_MASK (0xffffffff00000000ULL) +#define AMDVI_MSI_ADDR_LO_MASK (0x00000000ffffffffULL) + +/* SB IOAPIC is always on this device in AMD systems */ +#define AMDVI_IOAPIC_SB_DEVID PCI_BUILD_BDF(0, PCI_DEVFN(0x14, 0)) + +/* Interrupt remapping errors */ +#define AMDVI_IR_ERR 0x1 +#define AMDVI_IR_GET_IRTE 0x2 +#define AMDVI_IR_TARGET_ABORT 0x3 + +/* Interrupt remapping */ +#define AMDVI_IR_REMAP_ENABLE 1ULL +#define AMDVI_IR_INTCTL_SHIFT 60 +#define AMDVI_IR_INTCTL_ABORT 0 +#define AMDVI_IR_INTCTL_PASS 1 +#define AMDVI_IR_INTCTL_REMAP 2 + +#define AMDVI_IR_PHYS_ADDR_MASK (((1ULL << 45) - 1) << 6) + +/* MSI data 10:0 bits (section 2.2.5.1 Fig 14) */ +#define AMDVI_IRTE_OFFSET 0x7ff + +/* Delivery mode of MSI data (same as IOAPIC deilver mode encoding) */ +#define AMDVI_IOAPIC_INT_TYPE_FIXED 0x0 +#define AMDVI_IOAPIC_INT_TYPE_ARBITRATED 0x1 +#define AMDVI_IOAPIC_INT_TYPE_SMI 0x2 +#define AMDVI_IOAPIC_INT_TYPE_NMI 0x4 +#define AMDVI_IOAPIC_INT_TYPE_INIT 0x5 +#define AMDVI_IOAPIC_INT_TYPE_EINT 0x7 + +/* Pass through interrupt */ +#define AMDVI_DEV_INT_PASS_MASK (1ULL << 56) +#define AMDVI_DEV_EINT_PASS_MASK (1ULL << 57) +#define AMDVI_DEV_NMI_PASS_MASK (1ULL << 58) +#define AMDVI_DEV_LINT0_PASS_MASK (1ULL << 62) +#define AMDVI_DEV_LINT1_PASS_MASK (1ULL << 63) + +/* Interrupt remapping table fields (Guest VAPIC not enabled) */ +union irte { + uint32_t val; + struct { + uint32_t valid:1, + no_fault:1, + int_type:3, + rq_eoi:1, + dm:1, + guest_mode:1, + destination:8, + vector:8, + rsvd:8; + } fields; +}; + +/* Interrupt remapping table fields (Guest VAPIC is enabled) */ +union irte_ga_lo { + uint64_t val; + + /* For int remapping */ + struct { + uint64_t valid:1, + no_fault:1, + /* ------ */ + int_type:3, + rq_eoi:1, + dm:1, + /* ------ */ + guest_mode:1, + destination:8, + rsvd_1:48; + } fields_remap; +}; + +union irte_ga_hi { + uint64_t val; + struct { + uint64_t vector:8, + rsvd_2:56; + } fields; +}; + +struct irte_ga { + union irte_ga_lo lo; + union irte_ga_hi hi; +}; + +#define TYPE_AMD_IOMMU_DEVICE "amd-iommu" +OBJECT_DECLARE_SIMPLE_TYPE(AMDVIState, AMD_IOMMU_DEVICE) + +#define TYPE_AMD_IOMMU_PCI "AMDVI-PCI" + +#define TYPE_AMD_IOMMU_MEMORY_REGION "amd-iommu-iommu-memory-region" + +typedef struct AMDVIAddressSpace AMDVIAddressSpace; + +/* functions to steal PCI config space */ +typedef struct AMDVIPCIState { + PCIDevice dev; /* The PCI device itself */ +} AMDVIPCIState; + +struct AMDVIState { + X86IOMMUState iommu; /* IOMMU bus device */ + AMDVIPCIState pci; /* IOMMU PCI device */ + + uint32_t version; + uint32_t capab_offset; /* capability offset pointer */ + + uint64_t mmio_addr; + + uint32_t devid; /* auto-assigned devid */ + + bool enabled; /* IOMMU enabled */ + bool ats_enabled; /* address translation enabled */ + bool cmdbuf_enabled; /* command buffer enabled */ + bool evtlog_enabled; /* event log enabled */ + bool excl_enabled; + + hwaddr devtab; /* base address device table */ + size_t devtab_len; /* device table length */ + + hwaddr cmdbuf; /* command buffer base address */ + uint64_t cmdbuf_len; /* command buffer length */ + uint32_t cmdbuf_head; /* current IOMMU read position */ + uint32_t cmdbuf_tail; /* next Software write position */ + bool completion_wait_intr; + + hwaddr evtlog; /* base address 
event log */ + bool evtlog_intr; + uint32_t evtlog_len; /* event log length */ + uint32_t evtlog_head; /* current IOMMU write position */ + uint32_t evtlog_tail; /* current Software read position */ + + /* unused for now */ + hwaddr excl_base; /* base DVA - IOMMU exclusion range */ + hwaddr excl_limit; /* limit of IOMMU exclusion range */ + bool excl_allow; /* translate accesses to the exclusion range */ + bool excl_enable; /* exclusion range enabled */ + + hwaddr ppr_log; /* base address ppr log */ + uint32_t pprlog_len; /* ppr log len */ + uint32_t pprlog_head; /* ppr log head */ + uint32_t pprlog_tail; /* ppr log tail */ + + MemoryRegion mmio; /* MMIO region */ + uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */ + uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */ + uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */ + bool mmio_enabled; + + /* for each served device */ + AMDVIAddressSpace **address_spaces[PCI_BUS_MAX]; + + /* IOTLB */ + GHashTable *iotlb; + + /* Interrupt remapping */ + bool ga_enabled; +}; + +#endif diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c new file mode 100644 index 000000000..bcf9eaf83 --- /dev/null +++ b/hw/i386/e820_memory_layout.c @@ -0,0 +1,59 @@ +/* + * QEMU BIOS e820 routines + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * SPDX-License-Identifier: MIT + */ + +#include "qemu/osdep.h" +#include "qemu/bswap.h" +#include "e820_memory_layout.h" + +static size_t e820_entries; +struct e820_table e820_reserve; +struct e820_entry *e820_table; + +int e820_add_entry(uint64_t address, uint64_t length, uint32_t type) +{ + int index = le32_to_cpu(e820_reserve.count); + struct e820_entry *entry; + + if (type != E820_RAM) { + /* old FW_CFG_E820_TABLE entry -- reservations only */ + if (index >= E820_NR_ENTRIES) { + return -EBUSY; + } + entry = &e820_reserve.entry[index++]; + + entry->address = cpu_to_le64(address); + entry->length = cpu_to_le64(length); + entry->type = cpu_to_le32(type); + + e820_reserve.count = cpu_to_le32(index); + } + + /* new "etc/e820" file -- include ram too */ + e820_table = g_renew(struct e820_entry, e820_table, e820_entries + 1); + e820_table[e820_entries].address = cpu_to_le64(address); + e820_table[e820_entries].length = cpu_to_le64(length); + e820_table[e820_entries].type = cpu_to_le32(type); + e820_entries++; + + return e820_entries; +} + +int e820_get_num_entries(void) +{ + return e820_entries; +} + +bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length) +{ + if (idx < e820_entries && e820_table[idx].type == cpu_to_le32(type)) { + *address = le64_to_cpu(e820_table[idx].address); + *length = le64_to_cpu(e820_table[idx].length); + return true; + } + return false; +} diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h new file mode 100644 index 000000000..2a0ceb8b9 --- /dev/null +++ b/hw/i386/e820_memory_layout.h @@ -0,0 +1,42 @@ +/* + * QEMU BIOS e820 routines + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * SPDX-License-Identifier: MIT + */ + +#ifndef HW_I386_E820_H +#define HW_I386_E820_H + +/* e820 types */ +#define E820_RAM 1 +#define E820_RESERVED 2 +#define E820_ACPI 3 +#define E820_NVS 4 +#define E820_UNUSABLE 5 + +#define E820_NR_ENTRIES 16 + +struct e820_entry { + uint64_t address; + uint64_t length; + uint32_t type; +} QEMU_PACKED __attribute((__aligned__(4))); + +struct e820_table { + uint32_t count; + struct e820_entry entry[E820_NR_ENTRIES]; +} QEMU_PACKED __attribute((__aligned__(4))); + +extern struct e820_table 
e820_reserve; +extern struct e820_entry *e820_table; + +int e820_add_entry(uint64_t address, uint64_t length, uint32_t type); +int e820_get_num_entries(void); +bool e820_get_entry(int index, uint32_t type, + uint64_t *address, uint64_t *length); + + + +#endif diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c new file mode 100644 index 000000000..a283785a8 --- /dev/null +++ b/hw/i386/fw_cfg.c @@ -0,0 +1,221 @@ +/* + * QEMU fw_cfg helpers (X86 specific) + * + * Copyright (c) 2019 Red Hat, Inc. + * + * Author: + * Philippe Mathieu-Daudé <philmd@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "sysemu/numa.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/aml-build.h" +#include "hw/firmware/smbios.h" +#include "hw/i386/fw_cfg.h" +#include "hw/timer/hpet.h" +#include "hw/nvram/fw_cfg.h" +#include "e820_memory_layout.h" +#include "kvm/kvm_i386.h" +#include "qapi/error.h" +#include CONFIG_DEVICES + +struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX}; + +const char *fw_cfg_arch_key_name(uint16_t key) +{ + static const struct { + uint16_t key; + const char *name; + } fw_cfg_arch_wellknown_keys[] = { + {FW_CFG_ACPI_TABLES, "acpi_tables"}, + {FW_CFG_SMBIOS_ENTRIES, "smbios_entries"}, + {FW_CFG_IRQ0_OVERRIDE, "irq0_override"}, + {FW_CFG_E820_TABLE, "e820_table"}, + {FW_CFG_HPET, "hpet"}, + }; + + for (size_t i = 0; i < ARRAY_SIZE(fw_cfg_arch_wellknown_keys); i++) { + if (fw_cfg_arch_wellknown_keys[i].key == key) { + return fw_cfg_arch_wellknown_keys[i].name; + } + } + return NULL; +} + +void fw_cfg_build_smbios(MachineState *ms, FWCfgState *fw_cfg) +{ +#ifdef CONFIG_SMBIOS + uint8_t *smbios_tables, *smbios_anchor; + size_t smbios_tables_len, smbios_anchor_len; + struct smbios_phys_mem_area *mem_array; + unsigned i, array_count; + X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu); + + /* tell smbios about cpuid version and features */ + smbios_set_cpuid(cpu->env.cpuid_version, cpu->env.features[FEAT_1_EDX]); + + smbios_tables = smbios_get_table_legacy(ms, &smbios_tables_len); + if (smbios_tables) { + fw_cfg_add_bytes(fw_cfg, FW_CFG_SMBIOS_ENTRIES, + smbios_tables, smbios_tables_len); + } + + /* build the array of physical mem area from e820 table */ + mem_array = g_malloc0(sizeof(*mem_array) * e820_get_num_entries()); + for (i = 0, array_count = 0; i < e820_get_num_entries(); i++) { + uint64_t addr, len; + + if (e820_get_entry(i, E820_RAM, &addr, &len)) { + mem_array[array_count].address = addr; + mem_array[array_count].length = len; + array_count++; + } + } + smbios_get_tables(ms, mem_array, array_count, + &smbios_tables, &smbios_tables_len, + &smbios_anchor, &smbios_anchor_len, + &error_fatal); + g_free(mem_array); + + if (smbios_anchor) { + fw_cfg_add_file(fw_cfg, "etc/smbios/smbios-tables", + smbios_tables, smbios_tables_len); + fw_cfg_add_file(fw_cfg, "etc/smbios/smbios-anchor", + smbios_anchor, smbios_anchor_len); + } +#endif +} + +FWCfgState *fw_cfg_arch_create(MachineState *ms, + uint16_t boot_cpus, + uint16_t apic_id_limit) +{ + FWCfgState *fw_cfg; + uint64_t *numa_fw_cfg; + int i; + MachineClass *mc = MACHINE_GET_CLASS(ms); + const CPUArchIdList *cpus = mc->possible_cpu_arch_ids(ms); + int nb_numa_nodes = ms->numa_state->num_nodes; + + fw_cfg = fw_cfg_init_io_dma(FW_CFG_IO_BASE, FW_CFG_IO_BASE + 4, + &address_space_memory); + fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, boot_cpus); + + /* FW_CFG_MAX_CPUS is 
a bit confusing/problematic on x86: + * + * For machine types prior to 1.8, SeaBIOS needs FW_CFG_MAX_CPUS for + * building MPTable, ACPI MADT, ACPI CPU hotplug and ACPI SRAT table, + * that tables are based on xAPIC ID and QEMU<->SeaBIOS interface + * for CPU hotplug also uses APIC ID and not "CPU index". + * This means that FW_CFG_MAX_CPUS is not the "maximum number of CPUs", + * but the "limit to the APIC ID values SeaBIOS may see". + * + * So for compatibility reasons with old BIOSes we are stuck with + * "etc/max-cpus" actually being apic_id_limit + */ + fw_cfg_add_i16(fw_cfg, FW_CFG_MAX_CPUS, apic_id_limit); + fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, ms->ram_size); +#ifdef CONFIG_ACPI + fw_cfg_add_bytes(fw_cfg, FW_CFG_ACPI_TABLES, + acpi_tables, acpi_tables_len); +#endif + fw_cfg_add_i32(fw_cfg, FW_CFG_IRQ0_OVERRIDE, 1); + + fw_cfg_add_bytes(fw_cfg, FW_CFG_E820_TABLE, + &e820_reserve, sizeof(e820_reserve)); + fw_cfg_add_file(fw_cfg, "etc/e820", e820_table, + sizeof(struct e820_entry) * e820_get_num_entries()); + + fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, &hpet_cfg, sizeof(hpet_cfg)); + /* allocate memory for the NUMA channel: one (64bit) word for the number + * of nodes, one word for each VCPU->node and one word for each node to + * hold the amount of memory. + */ + numa_fw_cfg = g_new0(uint64_t, 1 + apic_id_limit + nb_numa_nodes); + numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes); + for (i = 0; i < cpus->len; i++) { + unsigned int apic_id = cpus->cpus[i].arch_id; + assert(apic_id < apic_id_limit); + numa_fw_cfg[apic_id + 1] = cpu_to_le64(cpus->cpus[i].props.node_id); + } + for (i = 0; i < nb_numa_nodes; i++) { + numa_fw_cfg[apic_id_limit + 1 + i] = + cpu_to_le64(ms->numa_state->nodes[i].node_mem); + } + fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg, + (1 + apic_id_limit + nb_numa_nodes) * + sizeof(*numa_fw_cfg)); + + return fw_cfg; +} + +void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg) +{ + X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu); + CPUX86State *env = &cpu->env; + uint32_t unused, ebx, ecx, edx; + uint64_t feature_control_bits = 0; + uint64_t *val; + + cpu_x86_cpuid(env, 1, 0, &unused, &unused, &ecx, &edx); + if (ecx & CPUID_EXT_VMX) { + feature_control_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + } + + if ((edx & (CPUID_EXT2_MCE | CPUID_EXT2_MCA)) == + (CPUID_EXT2_MCE | CPUID_EXT2_MCA) && + (env->mcg_cap & MCG_LMCE_P)) { + feature_control_bits |= FEATURE_CONTROL_LMCE; + } + + if (env->cpuid_level >= 7) { + cpu_x86_cpuid(env, 0x7, 0, &unused, &ebx, &ecx, &unused); + if (ebx & CPUID_7_0_EBX_SGX) { + feature_control_bits |= FEATURE_CONTROL_SGX; + } + if (ecx & CPUID_7_0_ECX_SGX_LC) { + feature_control_bits |= FEATURE_CONTROL_SGX_LC; + } + } + + if (!feature_control_bits) { + return; + } + + val = g_malloc(sizeof(*val)); + *val = cpu_to_le64(feature_control_bits | FEATURE_CONTROL_LOCKED); + fw_cfg_add_file(fw_cfg, "etc/msr_feature_control", val, sizeof(*val)); +} + +void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg) +{ + /* + * when using port i/o, the 8-bit data register *always* overlaps + * with half of the 16-bit control register. Hence, the total size + * of the i/o region used is FW_CFG_CTL_SIZE; when using DMA, the + * DMA control register is located at FW_CFG_DMA_IO_BASE + 4 + */ + Object *obj = OBJECT(fw_cfg); + uint8_t io_size = object_property_get_bool(obj, "dma_enabled", NULL) ? 
+ ROUND_UP(FW_CFG_CTL_SIZE, 4) + sizeof(dma_addr_t) : + FW_CFG_CTL_SIZE; + Aml *dev = aml_device("FWCF"); + Aml *crs = aml_resource_template(); + + aml_append(dev, aml_name_decl("_HID", aml_string("QEMU0002"))); + + /* device present, functioning, decoding, not shown in UI */ + aml_append(dev, aml_name_decl("_STA", aml_int(0xB))); + + aml_append(crs, + aml_io(AML_DECODE16, FW_CFG_IO_BASE, FW_CFG_IO_BASE, 0x01, io_size)); + + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); +} diff --git a/hw/i386/fw_cfg.h b/hw/i386/fw_cfg.h new file mode 100644 index 000000000..275f15c1c --- /dev/null +++ b/hw/i386/fw_cfg.h @@ -0,0 +1,30 @@ +/* + * QEMU fw_cfg helpers (X86 specific) + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * SPDX-License-Identifier: MIT + */ + +#ifndef HW_I386_FW_CFG_H +#define HW_I386_FW_CFG_H + +#include "hw/boards.h" +#include "hw/nvram/fw_cfg.h" + +#define FW_CFG_IO_BASE 0x510 + +#define FW_CFG_ACPI_TABLES (FW_CFG_ARCH_LOCAL + 0) +#define FW_CFG_SMBIOS_ENTRIES (FW_CFG_ARCH_LOCAL + 1) +#define FW_CFG_IRQ0_OVERRIDE (FW_CFG_ARCH_LOCAL + 2) +#define FW_CFG_E820_TABLE (FW_CFG_ARCH_LOCAL + 3) +#define FW_CFG_HPET (FW_CFG_ARCH_LOCAL + 4) + +FWCfgState *fw_cfg_arch_create(MachineState *ms, + uint16_t boot_cpus, + uint16_t apic_id_limit); +void fw_cfg_build_smbios(MachineState *ms, FWCfgState *fw_cfg); +void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg); +void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg); + +#endif diff --git a/hw/i386/generic_event_device_x86.c b/hw/i386/generic_event_device_x86.c new file mode 100644 index 000000000..e26fb02a2 --- /dev/null +++ b/hw/i386/generic_event_device_x86.c @@ -0,0 +1,36 @@ +/* + * x86 variant of the generic event device for hw reduced acpi + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2 or later, as published by the Free Software Foundation. + */ + +#include "qemu/osdep.h" +#include "hw/acpi/generic_event_device.h" +#include "hw/i386/pc.h" + +static void acpi_ged_x86_class_init(ObjectClass *class, void *data) +{ + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_CLASS(class); + + adevc->madt_cpu = pc_madt_cpu_entry; +} + +static const TypeInfo acpi_ged_x86_info = { + .name = TYPE_ACPI_GED_X86, + .parent = TYPE_ACPI_GED, + .class_init = acpi_ged_x86_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_HOTPLUG_HANDLER }, + { TYPE_ACPI_DEVICE_IF }, + { } + } +}; + +static void acpi_ged_x86_register_types(void) +{ + type_register_static(&acpi_ged_x86_info); +} + +type_init(acpi_ged_x86_register_types) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c new file mode 100644 index 000000000..f584449d8 --- /dev/null +++ b/hw/i386/intel_iommu.c @@ -0,0 +1,3900 @@ +/* + * QEMU emulation of an Intel IOMMU (VT-d) + * (DMA Remapping device) + * + * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com> + * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "qapi/error.h" +#include "hw/sysbus.h" +#include "intel_iommu_internal.h" +#include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" +#include "hw/qdev-properties.h" +#include "hw/i386/pc.h" +#include "hw/i386/apic-msidef.h" +#include "hw/i386/x86-iommu.h" +#include "hw/pci-host/q35.h" +#include "sysemu/kvm.h" +#include "sysemu/dma.h" +#include "sysemu/sysemu.h" +#include "hw/i386/apic_internal.h" +#include "kvm/kvm_i386.h" +#include "migration/vmstate.h" +#include "trace.h" + +/* context entry operations */ +#define VTD_CE_GET_RID2PASID(ce) \ + ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK) +#define VTD_CE_GET_PASID_DIR_TABLE(ce) \ + ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK) + +/* pe operations */ +#define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) +#define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW)) +#define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\ + if (ret_fr) { \ + ret_fr = -ret_fr; \ + if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { \ + trace_vtd_fault_disabled(); \ + } else { \ + vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); \ + } \ + goto error; \ + } \ +} + +static void vtd_address_space_refresh_all(IntelIOMMUState *s); +static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); + +static void vtd_panic_require_caching_mode(void) +{ + error_report("We need to set caching-mode=on for intel-iommu to enable " + "device assignment with IOMMU protection."); + exit(1); +} + +static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, + uint64_t wmask, uint64_t w1cmask) +{ + stq_le_p(&s->csr[addr], val); + stq_le_p(&s->wmask[addr], wmask); + stq_le_p(&s->w1cmask[addr], w1cmask); +} + +static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask) +{ + stq_le_p(&s->womask[addr], mask); +} + +static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val, + uint32_t wmask, uint32_t w1cmask) +{ + stl_le_p(&s->csr[addr], val); + stl_le_p(&s->wmask[addr], wmask); + stl_le_p(&s->w1cmask[addr], w1cmask); +} + +static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask) +{ + stl_le_p(&s->womask[addr], mask); +} + +/* "External" get/set operations */ +static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val) +{ + uint64_t oldval = ldq_le_p(&s->csr[addr]); + uint64_t wmask = ldq_le_p(&s->wmask[addr]); + uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); + stq_le_p(&s->csr[addr], + ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); +} + +static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val) +{ + uint32_t oldval = ldl_le_p(&s->csr[addr]); + uint32_t wmask = ldl_le_p(&s->wmask[addr]); + uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); + stl_le_p(&s->csr[addr], + ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); +} + +static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr) +{ + uint64_t val = ldq_le_p(&s->csr[addr]); + uint64_t womask = ldq_le_p(&s->womask[addr]); + return val & ~womask; +} + +static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr) +{ + uint32_t val = ldl_le_p(&s->csr[addr]); + uint32_t womask = ldl_le_p(&s->womask[addr]); + return val & ~womask; +} + +/* "Internal" get/set operations */ +static uint64_t 
vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr) +{ + return ldq_le_p(&s->csr[addr]); +} + +static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr) +{ + return ldl_le_p(&s->csr[addr]); +} + +static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val) +{ + stq_le_p(&s->csr[addr], val); +} + +static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, + uint32_t clear, uint32_t mask) +{ + uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask; + stl_le_p(&s->csr[addr], new_val); + return new_val; +} + +static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, + uint64_t clear, uint64_t mask) +{ + uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask; + stq_le_p(&s->csr[addr], new_val); + return new_val; +} + +static inline void vtd_iommu_lock(IntelIOMMUState *s) +{ + qemu_mutex_lock(&s->iommu_lock); +} + +static inline void vtd_iommu_unlock(IntelIOMMUState *s) +{ + qemu_mutex_unlock(&s->iommu_lock); +} + +static void vtd_update_scalable_state(IntelIOMMUState *s) +{ + uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); + + if (s->scalable_mode) { + s->root_scalable = val & VTD_RTADDR_SMT; + } +} + +/* Whether the address space needs to notify new mappings */ +static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) +{ + return as->notifier_flags & IOMMU_NOTIFIER_MAP; +} + +/* GHashTable functions */ +static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) +{ + return *((const uint64_t *)v1) == *((const uint64_t *)v2); +} + +static guint vtd_uint64_hash(gconstpointer v) +{ + return (guint)*(const uint64_t *)v; +} + +static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, + gpointer user_data) +{ + VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; + uint16_t domain_id = *(uint16_t *)user_data; + return entry->domain_id == domain_id; +} + +/* The shift of an addr for a certain level of paging structure */ +static inline uint32_t vtd_slpt_level_shift(uint32_t level) +{ + assert(level != 0); + return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS; +} + +static inline uint64_t vtd_slpt_level_page_mask(uint32_t level) +{ + return ~((1ULL << vtd_slpt_level_shift(level)) - 1); +} + +static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, + gpointer user_data) +{ + VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; + VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data; + uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask; + uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K; + return (entry->domain_id == info->domain_id) && + (((entry->gfn & info->mask) == gfn) || + (entry->gfn == gfn_tlb)); +} + +/* Reset all the gen of VTDAddressSpace to zero and set the gen of + * IntelIOMMUState to 1. Must be called with IOMMU lock held. + */ +static void vtd_reset_context_cache_locked(IntelIOMMUState *s) +{ + VTDAddressSpace *vtd_as; + VTDBus *vtd_bus; + GHashTableIter bus_it; + uint32_t devfn_it; + + trace_vtd_context_cache_reset(); + + g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr); + + while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) { + for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { + vtd_as = vtd_bus->dev_as[devfn_it]; + if (!vtd_as) { + continue; + } + vtd_as->context_cache_entry.context_cache_gen = 0; + } + } + s->context_cache_gen = 1; +} + +/* Must be called with IOMMU lock held. 
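+ * (The lock protects the IOTLB and context-cache state that the translation path reads concurrently.)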
*/ +static void vtd_reset_iotlb_locked(IntelIOMMUState *s) +{ + assert(s->iotlb); + g_hash_table_remove_all(s->iotlb); +} + +static void vtd_reset_iotlb(IntelIOMMUState *s) +{ + vtd_iommu_lock(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); +} + +static void vtd_reset_caches(IntelIOMMUState *s) +{ + vtd_iommu_lock(s); + vtd_reset_iotlb_locked(s); + vtd_reset_context_cache_locked(s); + vtd_iommu_unlock(s); +} + +static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, + uint32_t level) +{ + return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) | + ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT); +} + +static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) +{ + return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; +} + +/* Must be called with IOMMU lock held */ +static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, + hwaddr addr) +{ + VTDIOTLBEntry *entry; + uint64_t key; + int level; + + for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { + key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level), + source_id, level); + entry = g_hash_table_lookup(s->iotlb, &key); + if (entry) { + goto out; + } + } + +out: + return entry; +} + +/* Must be with IOMMU lock held */ +static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, + uint16_t domain_id, hwaddr addr, uint64_t slpte, + uint8_t access_flags, uint32_t level) +{ + VTDIOTLBEntry *entry = g_malloc(sizeof(*entry)); + uint64_t *key = g_malloc(sizeof(*key)); + uint64_t gfn = vtd_get_iotlb_gfn(addr, level); + + trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); + if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { + trace_vtd_iotlb_reset("iotlb exceeds size limit"); + vtd_reset_iotlb_locked(s); + } + + entry->gfn = gfn; + entry->domain_id = domain_id; + entry->slpte = slpte; + entry->access_flags = access_flags; + entry->mask = vtd_slpt_level_page_mask(level); + *key = vtd_get_iotlb_key(gfn, source_id, level); + g_hash_table_replace(s->iotlb, key, entry); +} + +/* Given the reg addr of both the message data and address, generate an + * interrupt via MSI. + */ +static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, + hwaddr mesg_data_reg) +{ + MSIMessage msi; + + assert(mesg_data_reg < DMAR_REG_SIZE); + assert(mesg_addr_reg < DMAR_REG_SIZE); + + msi.address = vtd_get_long_raw(s, mesg_addr_reg); + msi.data = vtd_get_long_raw(s, mesg_data_reg); + + trace_vtd_irq_generate(msi.address, msi.data); + + apic_get_class()->send_msi(&msi); +} + +/* Generate a fault event to software via MSI if conditions are met. + * Notice that the value of FSTS_REG being passed to it should be the one + * before any update. + */ +static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts) +{ + if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO || + pre_fsts & VTD_FSTS_IQE) { + error_report_once("There are previous interrupt conditions " + "to be serviced by software, fault event " + "is not generated"); + return; + } + vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP); + if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) { + error_report_once("Interrupt Mask set, irq is not generated"); + } else { + vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); + vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); + } +} + +/* Check if the Fault (F) field of the Fault Recording Register referenced by + * @index is Set. 
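+ * Each FRCD register is 128 bits (16 bytes), so the register for @index starts at DMAR_FRCD_REG_OFFSET + (index << 4); the F bit lives in the upper 64-bit half, hence the extra offset of 8 below (e.g. index 1 reads at DMAR_FRCD_REG_OFFSET + 0x18).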
+ */ +static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index) +{ + /* Each reg is 128-bit */ + hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); + addr += 8; /* Access the high 64-bit half */ + + assert(index < DMAR_FRCD_REG_NR); + + return vtd_get_quad_raw(s, addr) & VTD_FRCD_F; +} + +/* Update the PPF field of the Fault Status Register. + * Should be called whenever the F field of any fault recording + * register is changed. + */ +static void vtd_update_fsts_ppf(IntelIOMMUState *s) +{ + uint32_t i; + uint32_t ppf_mask = 0; + + for (i = 0; i < DMAR_FRCD_REG_NR; i++) { + if (vtd_is_frcd_set(s, i)) { + ppf_mask = VTD_FSTS_PPF; + break; + } + } + vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask); + trace_vtd_fsts_ppf(!!ppf_mask); +} + +static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index) +{ + /* Each reg is 128-bit */ + hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); + addr += 8; /* Access the high 64-bit half */ + + assert(index < DMAR_FRCD_REG_NR); + + vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F); + vtd_update_fsts_ppf(s); +} + +/* Must not update the F field now; that should be done later */ +static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index, + uint16_t source_id, hwaddr addr, + VTDFaultReason fault, bool is_write) +{ + uint64_t hi = 0, lo; + hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); + + assert(index < DMAR_FRCD_REG_NR); + + lo = VTD_FRCD_FI(addr); + hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault); + if (!is_write) { + hi |= VTD_FRCD_T; + } + vtd_set_quad_raw(s, frcd_reg_addr, lo); + vtd_set_quad_raw(s, frcd_reg_addr + 8, hi); + + trace_vtd_frr_new(index, hi, lo); +} + +/* Try to collapse multiple pending faults from the same requester */ +static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id) +{ + uint32_t i; + uint64_t frcd_reg; + hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */ + + for (i = 0; i < DMAR_FRCD_REG_NR; i++) { + frcd_reg = vtd_get_quad_raw(s, addr); + if ((frcd_reg & VTD_FRCD_F) && + ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) { + return true; + } + addr += 16; /* 128-bit for each */ + } + return false; +} + +/* Log and report a DMAR (address translation) fault to software */ +static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, + hwaddr addr, VTDFaultReason fault, + bool is_write) +{ + uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); + + assert(fault < VTD_FR_MAX); + + if (fault == VTD_FR_RESERVED_ERR) { + /* This is not a normal fault reason case. Drop it.
*/ + return; + } + + trace_vtd_dmar_fault(source_id, fault, addr, is_write); + + if (fsts_reg & VTD_FSTS_PFO) { + error_report_once("New fault is not recorded due to " + "Primary Fault Overflow"); + return; + } + + if (vtd_try_collapse_fault(s, source_id)) { + error_report_once("New fault is not recorded due to " + "compression of faults"); + return; + } + + if (vtd_is_frcd_set(s, s->next_frcd_reg)) { + error_report_once("Next Fault Recording Reg is used, " + "new fault is not recorded, set PFO field"); + vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO); + return; + } + + vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write); + + if (fsts_reg & VTD_FSTS_PPF) { + error_report_once("There are pending faults already, " + "fault event is not generated"); + vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); + s->next_frcd_reg++; + if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { + s->next_frcd_reg = 0; + } + } else { + vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK, + VTD_FSTS_FRI(s->next_frcd_reg)); + vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */ + s->next_frcd_reg++; + if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { + s->next_frcd_reg = 0; + } + /* This case actually cause the PPF to be Set. + * So generate fault event (interrupt). + */ + vtd_generate_fault_event(s, fsts_reg); + } +} + +/* Handle Invalidation Queue Errors of queued invalidation interface error + * conditions. + */ +static void vtd_handle_inv_queue_error(IntelIOMMUState *s) +{ + uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); + + vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE); + vtd_generate_fault_event(s, fsts_reg); +} + +/* Set the IWC field and try to generate an invalidation completion interrupt */ +static void vtd_generate_completion_event(IntelIOMMUState *s) +{ + if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) { + trace_vtd_inv_desc_wait_irq("One pending, skip current"); + return; + } + vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC); + vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP); + if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) { + trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, " + "new event not generated"); + return; + } else { + /* Generate the interrupt event */ + trace_vtd_inv_desc_wait_irq("Generating complete event"); + vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); + vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); + } +} + +static inline bool vtd_root_entry_present(IntelIOMMUState *s, + VTDRootEntry *re, + uint8_t devfn) +{ + if (s->root_scalable && devfn > UINT8_MAX / 2) { + return re->hi & VTD_ROOT_ENTRY_P; + } + + return re->lo & VTD_ROOT_ENTRY_P; +} + +static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, + VTDRootEntry *re) +{ + dma_addr_t addr; + + addr = s->root + index * sizeof(*re); + if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) { + re->lo = 0; + return -VTD_FR_ROOT_TABLE_INV; + } + re->lo = le64_to_cpu(re->lo); + re->hi = le64_to_cpu(re->hi); + return 0; +} + +static inline bool vtd_ce_present(VTDContextEntry *context) +{ + return context->lo & VTD_CONTEXT_ENTRY_P; +} + +static int vtd_get_context_entry_from_root(IntelIOMMUState *s, + VTDRootEntry *re, + uint8_t index, + VTDContextEntry *ce) +{ + dma_addr_t addr, ce_size; + + /* we have checked that root entry is present */ + ce_size = s->root_scalable ? 
VTD_CTX_ENTRY_SCALABLE_SIZE : + VTD_CTX_ENTRY_LEGACY_SIZE; + + if (s->root_scalable && index > UINT8_MAX / 2) { + index = index & (~VTD_DEVFN_CHECK_MASK); + addr = re->hi & VTD_ROOT_ENTRY_CTP; + } else { + addr = re->lo & VTD_ROOT_ENTRY_CTP; + } + + addr = addr + index * ce_size; + if (dma_memory_read(&address_space_memory, addr, ce, ce_size)) { + return -VTD_FR_CONTEXT_TABLE_INV; + } + + ce->lo = le64_to_cpu(ce->lo); + ce->hi = le64_to_cpu(ce->hi); + if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) { + ce->val[2] = le64_to_cpu(ce->val[2]); + ce->val[3] = le64_to_cpu(ce->val[3]); + } + return 0; +} + +static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce) +{ + return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR; +} + +static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw) +{ + return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw); +} + +/* Whether the pte indicates the address of the page frame */ +static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level) +{ + return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK); +} + +/* Get the content of a spte located in @base_addr[@index] */ +static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index) +{ + uint64_t slpte; + + assert(index < VTD_SL_PT_ENTRY_NR); + + if (dma_memory_read(&address_space_memory, + base_addr + index * sizeof(slpte), &slpte, + sizeof(slpte))) { + slpte = (uint64_t)-1; + return slpte; + } + slpte = le64_to_cpu(slpte); + return slpte; +} + +/* Given an iova and the level of paging structure, return the offset + * of current level. + */ +static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level) +{ + return (iova >> vtd_slpt_level_shift(level)) & + ((1ULL << VTD_SL_LEVEL_BITS) - 1); +} + +/* Check Capability Register to see if the @level of page-table is supported */ +static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level) +{ + return VTD_CAP_SAGAW_MASK & s->cap & + (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT)); +} + +/* Return true if check passed, otherwise false */ +static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu, + VTDPASIDEntry *pe) +{ + switch (VTD_PE_GET_TYPE(pe)) { + case VTD_SM_PASID_ENTRY_FLT: + case VTD_SM_PASID_ENTRY_SLT: + case VTD_SM_PASID_ENTRY_NESTED: + break; + case VTD_SM_PASID_ENTRY_PT: + if (!x86_iommu->pt_supported) { + return false; + } + break; + default: + /* Unknown type */ + return false; + } + return true; +} + +static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire) +{ + return pdire->val & 1; +} + +/** + * Caller of this function should check present bit if wants + * to use pdir entry for further usage except for fpd bit check. 
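+ * In other words, only the FPD bit may be trusted in a non-present entry; any other use must first be gated on vtd_pdire_present().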
+ */ +static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base, + uint32_t pasid, + VTDPASIDDirEntry *pdire) +{ + uint32_t index; + dma_addr_t addr, entry_size; + + index = VTD_PASID_DIR_INDEX(pasid); + entry_size = VTD_PASID_DIR_ENTRY_SIZE; + addr = pasid_dir_base + index * entry_size; + if (dma_memory_read(&address_space_memory, addr, pdire, entry_size)) { + return -VTD_FR_PASID_TABLE_INV; + } + + return 0; +} + +static inline bool vtd_pe_present(VTDPASIDEntry *pe) +{ + return pe->val[0] & VTD_PASID_ENTRY_P; +} + +static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s, + uint32_t pasid, + dma_addr_t addr, + VTDPASIDEntry *pe) +{ + uint32_t index; + dma_addr_t entry_size; + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + index = VTD_PASID_TABLE_INDEX(pasid); + entry_size = VTD_PASID_ENTRY_SIZE; + addr = addr + index * entry_size; + if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) { + return -VTD_FR_PASID_TABLE_INV; + } + + /* Do translation type check */ + if (!vtd_pe_type_check(x86_iommu, pe)) { + return -VTD_FR_PASID_TABLE_INV; + } + + if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) { + return -VTD_FR_PASID_TABLE_INV; + } + + return 0; +} + +/** + * Caller of this function should check present bit if wants + * to use pasid entry for further usage except for fpd bit check. + */ +static int vtd_get_pe_from_pdire(IntelIOMMUState *s, + uint32_t pasid, + VTDPASIDDirEntry *pdire, + VTDPASIDEntry *pe) +{ + dma_addr_t addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK; + + return vtd_get_pe_in_pasid_leaf_table(s, pasid, addr, pe); +} + +/** + * This function gets a pasid entry from a specified pasid + * table (includes dir and leaf table) with a specified pasid. + * Sanity check should be done to ensure return a present + * pasid entry to caller. + */ +static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s, + dma_addr_t pasid_dir_base, + uint32_t pasid, + VTDPASIDEntry *pe) +{ + int ret; + VTDPASIDDirEntry pdire; + + ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, + pasid, &pdire); + if (ret) { + return ret; + } + + if (!vtd_pdire_present(&pdire)) { + return -VTD_FR_PASID_TABLE_INV; + } + + ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe); + if (ret) { + return ret; + } + + if (!vtd_pe_present(pe)) { + return -VTD_FR_PASID_TABLE_INV; + } + + return 0; +} + +static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, + VTDContextEntry *ce, + VTDPASIDEntry *pe) +{ + uint32_t pasid; + dma_addr_t pasid_dir_base; + int ret = 0; + + pasid = VTD_CE_GET_RID2PASID(ce); + pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); + ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe); + + return ret; +} + +static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, + VTDContextEntry *ce, + bool *pe_fpd_set) +{ + int ret; + uint32_t pasid; + dma_addr_t pasid_dir_base; + VTDPASIDDirEntry pdire; + VTDPASIDEntry pe; + + pasid = VTD_CE_GET_RID2PASID(ce); + pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); + + /* + * No present bit check since fpd is meaningful even + * if the present bit is clear. + */ + ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, pasid, &pdire); + if (ret) { + return ret; + } + + if (pdire.val & VTD_PASID_DIR_FPD) { + *pe_fpd_set = true; + return 0; + } + + if (!vtd_pdire_present(&pdire)) { + return -VTD_FR_PASID_TABLE_INV; + } + + /* + * No present bit check since fpd is meaningful even + * if the present bit is clear. 
+ */ + ret = vtd_get_pe_from_pdire(s, pasid, &pdire, &pe); + if (ret) { + return ret; + } + + if (pe.val[0] & VTD_PASID_ENTRY_FPD) { + *pe_fpd_set = true; + } + + return 0; +} + +/* Get the page-table level that hardware should use for the second-level + * page-table walk from the Address Width field of context-entry. + */ +static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce) +{ + return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW); +} + +static uint32_t vtd_get_iova_level(IntelIOMMUState *s, + VTDContextEntry *ce) +{ + VTDPASIDEntry pe; + + if (s->root_scalable) { + vtd_ce_get_rid2pasid_entry(s, ce, &pe); + return VTD_PE_GET_LEVEL(&pe); + } + + return vtd_ce_get_level(ce); +} + +static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce) +{ + return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9; +} + +static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s, + VTDContextEntry *ce) +{ + VTDPASIDEntry pe; + + if (s->root_scalable) { + vtd_ce_get_rid2pasid_entry(s, ce, &pe); + return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9; + } + + return vtd_ce_get_agaw(ce); +} + +static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce) +{ + return ce->lo & VTD_CONTEXT_ENTRY_TT; +} + +/* Only for Legacy Mode. Return true if check passed, otherwise false */ +static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu, + VTDContextEntry *ce) +{ + switch (vtd_ce_get_type(ce)) { + case VTD_CONTEXT_TT_MULTI_LEVEL: + /* Always supported */ + break; + case VTD_CONTEXT_TT_DEV_IOTLB: + if (!x86_iommu->dt_supported) { + error_report_once("%s: DT specified but not supported", __func__); + return false; + } + break; + case VTD_CONTEXT_TT_PASS_THROUGH: + if (!x86_iommu->pt_supported) { + error_report_once("%s: PT specified but not supported", __func__); + return false; + } + break; + default: + /* Unknown type */ + error_report_once("%s: unknown ce type: %"PRIu32, __func__, + vtd_ce_get_type(ce)); + return false; + } + return true; +} + +static inline uint64_t vtd_iova_limit(IntelIOMMUState *s, + VTDContextEntry *ce, uint8_t aw) +{ + uint32_t ce_agaw = vtd_get_iova_agaw(s, ce); + return 1ULL << MIN(ce_agaw, aw); +} + +/* Return true if IOVA passes range check, otherwise false. */ +static inline bool vtd_iova_range_check(IntelIOMMUState *s, + uint64_t iova, VTDContextEntry *ce, + uint8_t aw) +{ + /* + * Check if @iova is above 2^X-1, where X is the minimum of MGAW + * in CAP_REG and AW in context-entry. 
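+ * E.g. with an effective width of 48 bits, vtd_iova_limit() below is 1ULL << 48, so any IOVA with bit 48 or above set fails the check.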
+ */ + return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1)); +} + +static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s, + VTDContextEntry *ce) +{ + VTDPASIDEntry pe; + + if (s->root_scalable) { + vtd_ce_get_rid2pasid_entry(s, ce, &pe); + return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR; + } + + return vtd_ce_get_slpt_base(ce); +} + +/* + * Rsvd field masks for spte: + * vtd_spte_rsvd 4k pages + * vtd_spte_rsvd_large large pages + */ +static uint64_t vtd_spte_rsvd[5]; +static uint64_t vtd_spte_rsvd_large[5]; + +static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) +{ + uint64_t rsvd_mask = vtd_spte_rsvd[level]; + + if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) && + (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) { + /* large page */ + rsvd_mask = vtd_spte_rsvd_large[level]; + } + + return slpte & rsvd_mask; +} + +/* Find the VTD address space associated with a given bus number */ +static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) +{ + VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; + GHashTableIter iter; + + if (vtd_bus) { + return vtd_bus; + } + + /* + * Iterate over the registered buses to find the one which + * currently holds this bus number and update the bus_num + * lookup table. + */ + g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { + if (pci_bus_num(vtd_bus->bus) == bus_num) { + s->vtd_as_by_bus_num[bus_num] = vtd_bus; + return vtd_bus; + } + } + + return NULL; +} + +/* Given the @iova, get relevant @slptep. @slpte_level will be the last level + * of the translation, can be used for deciding the size of large page. + */ +static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, + uint64_t iova, bool is_write, + uint64_t *slptep, uint32_t *slpte_level, + bool *reads, bool *writes, uint8_t aw_bits) +{ + dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); + uint32_t level = vtd_get_iova_level(s, ce); + uint32_t offset; + uint64_t slpte; + uint64_t access_right_check; + + if (!vtd_iova_range_check(s, iova, ce, aw_bits)) { + error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")", + __func__, iova); + return -VTD_FR_ADDR_BEYOND_MGAW; + } + + /* FIXME: what is the Atomics request here? */ + access_right_check = is_write ? VTD_SL_W : VTD_SL_R; + + while (true) { + offset = vtd_iova_level_offset(iova, level); + slpte = vtd_get_slpte(addr, offset); + + if (slpte == (uint64_t)-1) { + error_report_once("%s: detected read error on DMAR slpte " + "(iova=0x%" PRIx64 ")", __func__, iova); + if (level == vtd_get_iova_level(s, ce)) { + /* Invalid programming of context-entry */ + return -VTD_FR_CONTEXT_ENTRY_INV; + } else { + return -VTD_FR_PAGING_ENTRY_INV; + } + } + *reads = (*reads) && (slpte & VTD_SL_R); + *writes = (*writes) && (slpte & VTD_SL_W); + if (!(slpte & access_right_check)) { + error_report_once("%s: detected slpte permission error " + "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " + "slpte=0x%" PRIx64 ", write=%d)", __func__, + iova, level, slpte, is_write); + return is_write ? 
-VTD_FR_WRITE : -VTD_FR_READ; + } + if (vtd_slpte_nonzero_rsvd(slpte, level)) { + error_report_once("%s: detected splte reserve non-zero " + "iova=0x%" PRIx64 ", level=0x%" PRIx32 + "slpte=0x%" PRIx64 ")", __func__, iova, + level, slpte); + return -VTD_FR_PAGING_ENTRY_RSVD; + } + + if (vtd_is_last_slpte(slpte, level)) { + *slptep = slpte; + *slpte_level = level; + return 0; + } + addr = vtd_get_slpte_addr(slpte, aw_bits); + level--; + } +} + +typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private); + +/** + * Constant information used during page walking + * + * @hook_fn: hook func to be called when detected page + * @private: private data to be passed into hook func + * @notify_unmap: whether we should notify invalid entries + * @as: VT-d address space of the device + * @aw: maximum address width + * @domain: domain ID of the page walk + */ +typedef struct { + VTDAddressSpace *as; + vtd_page_walk_hook hook_fn; + void *private; + bool notify_unmap; + uint8_t aw; + uint16_t domain_id; +} vtd_page_walk_info; + +static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info) +{ + VTDAddressSpace *as = info->as; + vtd_page_walk_hook hook_fn = info->hook_fn; + void *private = info->private; + IOMMUTLBEntry *entry = &event->entry; + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, + .translated_addr = entry->translated_addr, + .perm = entry->perm, + }; + const DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + + if (event->type == IOMMU_NOTIFIER_UNMAP && !info->notify_unmap) { + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + + assert(hook_fn); + + /* Update local IOVA mapped ranges */ + if (event->type == IOMMU_NOTIFIER_MAP) { + if (mapped) { + /* If it's exactly the same translation, skip */ + if (!memcmp(mapped, &target, sizeof(target))) { + trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, + entry->translated_addr); + return 0; + } else { + /* + * Translation changed. Normally this should not + * happen, but it can happen when with buggy guest + * OSes. Note that there will be a small window that + * we don't have map at all. But that's the best + * effort we can do. The ideal way to emulate this is + * atomically modify the PTE to follow what has + * changed, but we can't. One example is that vfio + * driver only has VFIO_IOMMU_[UN]MAP_DMA but no + * interface to modify a mapping (meanwhile it seems + * meaningless to even provide one). Anyway, let's + * mark this as a TODO in case one day we'll have + * a better solution. 
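+ * What the code below does instead is replay the change as an explicit UNMAP of the old range followed by a MAP carrying the new translation.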
+ */ + IOMMUAccessFlags cache_perm = entry->perm; + int ret; + + /* Emulate an UNMAP */ + event->type = IOMMU_NOTIFIER_UNMAP; + entry->perm = IOMMU_NONE; + trace_vtd_page_walk_one(info->domain_id, + entry->iova, + entry->translated_addr, + entry->addr_mask, + entry->perm); + ret = hook_fn(event, private); + if (ret) { + return ret; + } + /* Drop any existing mapping */ + iova_tree_remove(as->iova_tree, &target); + /* Recover the correct type */ + event->type = IOMMU_NOTIFIER_MAP; + entry->perm = cache_perm; + } + } + iova_tree_insert(as->iova_tree, &target); + } else { + if (!mapped) { + /* Skip since we didn't map this range at all */ + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + iova_tree_remove(as->iova_tree, &target); + } + + trace_vtd_page_walk_one(info->domain_id, entry->iova, + entry->translated_addr, entry->addr_mask, + entry->perm); + return hook_fn(event, private); +} + +/** + * vtd_page_walk_level - walk over specific level for IOVA range + * + * @addr: base GPA addr to start the walk + * @start: IOVA range start address + * @end: IOVA range end address (start <= addr < end) + * @read: whether parent level has read permission + * @write: whether parent level has write permission + * @info: constant information for the page walk + */ +static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, + uint64_t end, uint32_t level, bool read, + bool write, vtd_page_walk_info *info) +{ + bool read_cur, write_cur, entry_valid; + uint32_t offset; + uint64_t slpte; + uint64_t subpage_size, subpage_mask; + IOMMUTLBEvent event; + uint64_t iova = start; + uint64_t iova_next; + int ret = 0; + + trace_vtd_page_walk_level(addr, level, start, end); + + subpage_size = 1ULL << vtd_slpt_level_shift(level); + subpage_mask = vtd_slpt_level_page_mask(level); + + while (iova < end) { + iova_next = (iova & subpage_mask) + subpage_size; + + offset = vtd_iova_level_offset(iova, level); + slpte = vtd_get_slpte(addr, offset); + + if (slpte == (uint64_t)-1) { + trace_vtd_page_walk_skip_read(iova, iova_next); + goto next; + } + + if (vtd_slpte_nonzero_rsvd(slpte, level)) { + trace_vtd_page_walk_skip_reserve(iova, iova_next); + goto next; + } + + /* Permissions are stacked with parents' */ + read_cur = read && (slpte & VTD_SL_R); + write_cur = write && (slpte & VTD_SL_W); + + /* + * As long as we have either read/write permission, this is a + * valid entry. The rule works for both page entries and page + * table entries. + */ + entry_valid = read_cur | write_cur; + + if (!vtd_is_last_slpte(slpte, level) && entry_valid) { + /* + * This is a valid PDE (or even bigger than PDE). We need + * to walk one further level. + */ + ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), + iova, MIN(iova_next, end), level - 1, + read_cur, write_cur, info); + } else { + /* + * This means we are either: + * + * (1) the real page entry (either 4K page, or huge page) + * (2) the whole range is invalid + * + * In either case, we send an IOTLB notification down. + */ + event.entry.target_as = &address_space_memory; + event.entry.iova = iova & subpage_mask; + event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); + event.entry.addr_mask = ~subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ + event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + event.type = event.entry.perm ? 
IOMMU_NOTIFIER_MAP : + IOMMU_NOTIFIER_UNMAP; + ret = vtd_page_walk_one(&event, info); + } + + if (ret < 0) { + return ret; + } + +next: + iova = iova_next; + } + + return 0; +} + +/** + * vtd_page_walk - walk specific IOVA range, and call the hook + * + * @s: intel iommu state + * @ce: context entry to walk upon + * @start: IOVA address to start the walk + * @end: IOVA range end address (start <= addr < end) + * @info: page walking information struct + */ +static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce, + uint64_t start, uint64_t end, + vtd_page_walk_info *info) +{ + dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); + uint32_t level = vtd_get_iova_level(s, ce); + + if (!vtd_iova_range_check(s, start, ce, info->aw)) { + return -VTD_FR_ADDR_BEYOND_MGAW; + } + + if (!vtd_iova_range_check(s, end, ce, info->aw)) { + /* Fix end so that it reaches the maximum */ + end = vtd_iova_limit(s, ce, info->aw); + } + + return vtd_page_walk_level(addr, start, end, level, true, true, info); +} + +static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s, + VTDRootEntry *re) +{ + /* Legacy Mode reserved bits check */ + if (!s->root_scalable && + (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) + goto rsvd_err; + + /* Scalable Mode reserved bits check */ + if (s->root_scalable && + ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) || + (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) + goto rsvd_err; + + return 0; + +rsvd_err: + error_report_once("%s: invalid root entry: hi=0x%"PRIx64 + ", lo=0x%"PRIx64, + __func__, re->hi, re->lo); + return -VTD_FR_ROOT_ENTRY_RSVD; +} + +static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s, + VTDContextEntry *ce) +{ + if (!s->root_scalable && + (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI || + ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) { + error_report_once("%s: invalid context entry: hi=%"PRIx64 + ", lo=%"PRIx64" (reserved nonzero)", + __func__, ce->hi, ce->lo); + return -VTD_FR_CONTEXT_ENTRY_RSVD; + } + + if (s->root_scalable && + (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) || + ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 || + ce->val[2] || + ce->val[3])) { + error_report_once("%s: invalid context entry: val[3]=%"PRIx64 + ", val[2]=%"PRIx64 + ", val[1]=%"PRIx64 + ", val[0]=%"PRIx64" (reserved nonzero)", + __func__, ce->val[3], ce->val[2], + ce->val[1], ce->val[0]); + return -VTD_FR_CONTEXT_ENTRY_RSVD; + } + + return 0; +} + +static int vtd_ce_rid2pasid_check(IntelIOMMUState *s, + VTDContextEntry *ce) +{ + VTDPASIDEntry pe; + + /* + * Make sure in Scalable Mode, a present context entry + * has valid rid2pasid setting, which includes valid + * rid2pasid field and corresponding pasid entry setting + */ + return vtd_ce_get_rid2pasid_entry(s, ce, &pe); +} + +/* Map a device to its corresponding domain (context-entry) */ +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, + uint8_t devfn, VTDContextEntry *ce) +{ + VTDRootEntry re; + int ret_fr; + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + ret_fr = vtd_get_root_entry(s, bus_num, &re); + if (ret_fr) { + return ret_fr; + } + + if (!vtd_root_entry_present(s, &re, devfn)) { + /* Not error - it's okay we don't have root entry. 
*/ + trace_vtd_re_not_present(bus_num); + return -VTD_FR_ROOT_ENTRY_P; + } + + ret_fr = vtd_root_entry_rsvd_bits_check(s, &re); + if (ret_fr) { + return ret_fr; + } + + ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce); + if (ret_fr) { + return ret_fr; + } + + if (!vtd_ce_present(ce)) { + /* Not error - it's okay we don't have context entry. */ + trace_vtd_ce_not_present(bus_num, devfn); + return -VTD_FR_CONTEXT_ENTRY_P; + } + + ret_fr = vtd_context_entry_rsvd_bits_check(s, ce); + if (ret_fr) { + return ret_fr; + } + + /* Check if the programming of context-entry is valid */ + if (!s->root_scalable && + !vtd_is_level_supported(s, vtd_ce_get_level(ce))) { + error_report_once("%s: invalid context entry: hi=%"PRIx64 + ", lo=%"PRIx64" (level %d not supported)", + __func__, ce->hi, ce->lo, + vtd_ce_get_level(ce)); + return -VTD_FR_CONTEXT_ENTRY_INV; + } + + if (!s->root_scalable) { + /* Do translation type check */ + if (!vtd_ce_type_check(x86_iommu, ce)) { + /* Errors dumped in vtd_ce_type_check() */ + return -VTD_FR_CONTEXT_ENTRY_INV; + } + } else { + /* + * Check if the programming of context-entry.rid2pasid + * and corresponding pasid setting is valid, and thus + * avoids to check pasid entry fetching result in future + * helper function calling. + */ + ret_fr = vtd_ce_rid2pasid_check(s, ce); + if (ret_fr) { + return ret_fr; + } + } + + return 0; +} + +static int vtd_sync_shadow_page_hook(IOMMUTLBEvent *event, + void *private) +{ + memory_region_notify_iommu(private, 0, *event); + return 0; +} + +static uint16_t vtd_get_domain_id(IntelIOMMUState *s, + VTDContextEntry *ce) +{ + VTDPASIDEntry pe; + + if (s->root_scalable) { + vtd_ce_get_rid2pasid_entry(s, ce, &pe); + return VTD_SM_PASID_ENTRY_DID(pe.val[1]); + } + + return VTD_CONTEXT_ENTRY_DID(ce->hi); +} + +static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, + VTDContextEntry *ce, + hwaddr addr, hwaddr size) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + vtd_page_walk_info info = { + .hook_fn = vtd_sync_shadow_page_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + .as = vtd_as, + .domain_id = vtd_get_domain_id(s, ce), + }; + + return vtd_page_walk(s, ce, addr, addr + size, &info); +} + +static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) +{ + int ret; + VTDContextEntry ce; + IOMMUNotifier *n; + + if (!(vtd_as->iommu.iommu_notify_flags & IOMMU_NOTIFIER_IOTLB_EVENTS)) { + return 0; + } + + ret = vtd_dev_to_context_entry(vtd_as->iommu_state, + pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce); + if (ret) { + if (ret == -VTD_FR_CONTEXT_ENTRY_P) { + /* + * It's a valid scenario to have a context entry that is + * not present. For example, when a device is removed + * from an existing domain then the context entry will be + * zeroed by the guest before it was put into another + * domain. When this happens, instead of synchronizing + * the shadow pages we should invalidate all existing + * mappings and notify the backends. + */ + IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { + vtd_address_space_unmap(vtd_as, n); + } + ret = 0; + } + return ret; + } + + return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX); +} + +/* + * Check if specific device is configured to bypass address + * translation for DMA requests. In Scalable Mode, bypass + * 1st-level translation or 2nd-level translation, it depends + * on PGTT setting. 
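+ * Concretely, the helper below reports pass-through when the scalable-mode PASID entry type is VTD_SM_PASID_ENTRY_PT or, in legacy mode, when the context entry translation type is VTD_CONTEXT_TT_PASS_THROUGH.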
+ */ +static bool vtd_dev_pt_enabled(VTDAddressSpace *as) +{ + IntelIOMMUState *s; + VTDContextEntry ce; + VTDPASIDEntry pe; + int ret; + + assert(as); + + s = as->iommu_state; + ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), + as->devfn, &ce); + if (ret) { + /* + * Possibly failed to parse the context entry for some reason + * (e.g., during init, or any guest configuration errors on + * context entries). We should assume PT not enabled for + * safety. + */ + return false; + } + + if (s->root_scalable) { + ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe); + if (ret) { + error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32, + __func__, ret); + return false; + } + return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT); + } + + return (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH); +} + +/* Return whether the device is using IOMMU translation. */ +static bool vtd_switch_address_space(VTDAddressSpace *as) +{ + bool use_iommu; + /* Whether we need to take the BQL on our own */ + bool take_bql = !qemu_mutex_iothread_locked(); + + assert(as); + + use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as); + + trace_vtd_switch_address_space(pci_bus_num(as->bus), + VTD_PCI_SLOT(as->devfn), + VTD_PCI_FUNC(as->devfn), + use_iommu); + + /* + * It's possible that we reach here without BQL, e.g., when called + * from vtd_pt_enable_fast_path(). However the memory APIs need + * it. We'd better make sure we have had it already, or, take it. + */ + if (take_bql) { + qemu_mutex_lock_iothread(); + } + + /* Turn off first then on the other */ + if (use_iommu) { + memory_region_set_enabled(&as->nodmar, false); + memory_region_set_enabled(MEMORY_REGION(&as->iommu), true); + } else { + memory_region_set_enabled(MEMORY_REGION(&as->iommu), false); + memory_region_set_enabled(&as->nodmar, true); + } + + if (take_bql) { + qemu_mutex_unlock_iothread(); + } + + return use_iommu; +} + +static void vtd_switch_address_space_all(IntelIOMMUState *s) +{ + GHashTableIter iter; + VTDBus *vtd_bus; + int i; + + g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { + for (i = 0; i < PCI_DEVFN_MAX; i++) { + if (!vtd_bus->dev_as[i]) { + continue; + } + vtd_switch_address_space(vtd_bus->dev_as[i]); + } + } +} + +static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) +{ + return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); +} + +static const bool vtd_qualified_faults[] = { + [VTD_FR_RESERVED] = false, + [VTD_FR_ROOT_ENTRY_P] = false, + [VTD_FR_CONTEXT_ENTRY_P] = true, + [VTD_FR_CONTEXT_ENTRY_INV] = true, + [VTD_FR_ADDR_BEYOND_MGAW] = true, + [VTD_FR_WRITE] = true, + [VTD_FR_READ] = true, + [VTD_FR_PAGING_ENTRY_INV] = true, + [VTD_FR_ROOT_TABLE_INV] = false, + [VTD_FR_CONTEXT_TABLE_INV] = false, + [VTD_FR_ROOT_ENTRY_RSVD] = false, + [VTD_FR_PAGING_ENTRY_RSVD] = true, + [VTD_FR_CONTEXT_ENTRY_TT] = true, + [VTD_FR_PASID_TABLE_INV] = false, + [VTD_FR_RESERVED_ERR] = false, + [VTD_FR_MAX] = false, +}; + +/* To see if a fault condition is "qualified", which is reported to software + * only if the FPD field in the context-entry used to process the faulting + * request is 0. 
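+ * Faults that are not qualified (see vtd_qualified_faults[] above) are reported to software regardless of the FPD setting.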
+ */ +static inline bool vtd_is_qualified_fault(VTDFaultReason fault) +{ + return vtd_qualified_faults[fault]; +} + +static inline bool vtd_is_interrupt_addr(hwaddr addr) +{ + return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; +} + +static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) +{ + VTDBus *vtd_bus; + VTDAddressSpace *vtd_as; + bool success = false; + + vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); + if (!vtd_bus) { + goto out; + } + + vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; + if (!vtd_as) { + goto out; + } + + if (vtd_switch_address_space(vtd_as) == false) { + /* We switched off IOMMU region successfully. */ + success = true; + } + +out: + trace_vtd_pt_enable_fast_path(source_id, success); +} + +/* Map dev to context-entry then do a paging-structures walk to do a iommu + * translation. + * + * Called from RCU critical section. + * + * @bus_num: The bus number + * @devfn: The devfn, which is the combined of device and function number + * @is_write: The access is a write operation + * @entry: IOMMUTLBEntry that contain the addr to be translated and result + * + * Returns true if translation is successful, otherwise false. + */ +static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, + uint8_t devfn, hwaddr addr, bool is_write, + IOMMUTLBEntry *entry) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + VTDContextEntry ce; + uint8_t bus_num = pci_bus_num(bus); + VTDContextCacheEntry *cc_entry; + uint64_t slpte, page_mask; + uint32_t level; + uint16_t source_id = vtd_make_source_id(bus_num, devfn); + int ret_fr; + bool is_fpd_set = false; + bool reads = true; + bool writes = true; + uint8_t access_flags; + VTDIOTLBEntry *iotlb_entry; + + /* + * We have standalone memory region for interrupt addresses, we + * should never receive translation requests in this region. + */ + assert(!vtd_is_interrupt_addr(addr)); + + vtd_iommu_lock(s); + + cc_entry = &vtd_as->context_cache_entry; + + /* Try to fetch slpte form IOTLB */ + iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); + if (iotlb_entry) { + trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, + iotlb_entry->domain_id); + slpte = iotlb_entry->slpte; + access_flags = iotlb_entry->access_flags; + page_mask = iotlb_entry->mask; + goto out; + } + + /* Try to fetch context-entry from cache first */ + if (cc_entry->context_cache_gen == s->context_cache_gen) { + trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi, + cc_entry->context_entry.lo, + cc_entry->context_cache_gen); + ce = cc_entry->context_entry; + is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; + if (!is_fpd_set && s->root_scalable) { + ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); + VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); + } + } else { + ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); + is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; + if (!ret_fr && !is_fpd_set && s->root_scalable) { + ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); + } + VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); + /* Update context-cache */ + trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo, + cc_entry->context_cache_gen, + s->context_cache_gen); + cc_entry->context_entry = ce; + cc_entry->context_cache_gen = s->context_cache_gen; + } + + /* + * We don't need to translate for pass-through context entries. + * Also, let's ignore IOTLB caching as well for PT devices. 
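+ * Instead, the branch below returns an identity mapping over the 4K page containing the request and enables the pass-through fast path so later accesses bypass the IOMMU memory region entirely.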
+ */ + if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { + entry->iova = addr & VTD_PAGE_MASK_4K; + entry->translated_addr = entry->iova; + entry->addr_mask = ~VTD_PAGE_MASK_4K; + entry->perm = IOMMU_RW; + trace_vtd_translate_pt(source_id, entry->iova); + + /* + * When this happens, it means that caching-mode is not + * enabled, and this is the first passthrough translation for + * the device. Let's enable the fast path for passthrough. + * + * When passthrough is disabled again for the device, we can + * capture it via the context entry invalidation, then the + * IOMMU region can be swapped back. + */ + vtd_pt_enable_fast_path(s, source_id); + vtd_iommu_unlock(s); + return true; + } + + ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level, + &reads, &writes, s->aw_bits); + VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); + + page_mask = vtd_slpt_level_page_mask(level); + access_flags = IOMMU_ACCESS_FLAG(reads, writes); + vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte, + access_flags, level); +out: + vtd_iommu_unlock(s); + entry->iova = addr & page_mask; + entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; + entry->addr_mask = ~page_mask; + entry->perm = access_flags; + return true; + +error: + vtd_iommu_unlock(s); + entry->iova = 0; + entry->translated_addr = 0; + entry->addr_mask = 0; + entry->perm = IOMMU_NONE; + return false; +} + +static void vtd_root_table_setup(IntelIOMMUState *s) +{ + s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG); + s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits); + + vtd_update_scalable_state(s); + + trace_vtd_reg_dmar_root(s->root, s->root_scalable); +} + +static void vtd_iec_notify_all(IntelIOMMUState *s, bool global, + uint32_t index, uint32_t mask) +{ + x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask); +} + +static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) +{ + uint64_t value = 0; + value = vtd_get_quad_raw(s, DMAR_IRTA_REG); + s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1); + s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits); + s->intr_eime = value & VTD_IRTA_EIME; + + /* Notify global invalidation */ + vtd_iec_notify_all(s, true, 0, 0); + + trace_vtd_reg_ir_root(s->intr_root, s->intr_size); +} + +static void vtd_iommu_replay_all(IntelIOMMUState *s) +{ + VTDAddressSpace *vtd_as; + + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + vtd_sync_shadow_page_table(vtd_as); + } +} + +static void vtd_context_global_invalidate(IntelIOMMUState *s) +{ + trace_vtd_inv_desc_cc_global(); + /* Protects context cache */ + vtd_iommu_lock(s); + s->context_cache_gen++; + if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { + vtd_reset_context_cache_locked(s); + } + vtd_iommu_unlock(s); + vtd_address_space_refresh_all(s); + /* + * From VT-d spec 6.5.2.1, a global context entry invalidation + * should be followed by an IOTLB global invalidation, so we should + * be safe even without this. However, let's replay the region as + * well to be safer, and come back to this when the VT-d emulation + * code needs finer tuning. + */ + vtd_iommu_replay_all(s); +} + +/* Do a context-cache device-selective invalidation.
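+ * Only entries whose source-id matches @source_id under the function mask are dropped (e.g. an FM of 2 masks function bits 2:1, covering four functions of the same device).
+ * @source_id: requester ID (bus/devfn) taken from the SID field of CCMD_REG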
+ * @func_mask: FM field after shifting + */ +static void vtd_context_device_invalidate(IntelIOMMUState *s, + uint16_t source_id, + uint16_t func_mask) +{ + uint16_t mask; + VTDBus *vtd_bus; + VTDAddressSpace *vtd_as; + uint8_t bus_n, devfn; + uint16_t devfn_it; + + trace_vtd_inv_desc_cc_devices(source_id, func_mask); + + switch (func_mask & 3) { + case 0: + mask = 0; /* No bits in the SID field masked */ + break; + case 1: + mask = 4; /* Mask bit 2 in the SID field */ + break; + case 2: + mask = 6; /* Mask bit 2:1 in the SID field */ + break; + case 3: + mask = 7; /* Mask bit 2:0 in the SID field */ + break; + default: + g_assert_not_reached(); + } + mask = ~mask; + + bus_n = VTD_SID_TO_BUS(source_id); + vtd_bus = vtd_find_as_from_bus_num(s, bus_n); + if (vtd_bus) { + devfn = VTD_SID_TO_DEVFN(source_id); + for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { + vtd_as = vtd_bus->dev_as[devfn_it]; + if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { + trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), + VTD_PCI_FUNC(devfn_it)); + vtd_iommu_lock(s); + vtd_as->context_cache_entry.context_cache_gen = 0; + vtd_iommu_unlock(s); + /* + * Switch the address space when needed, in case the + * device passthrough bit has been switched. + */ + vtd_switch_address_space(vtd_as); + /* + * A device is moving out of (or into) a domain, so + * resync the shadow page table. + * This does no harm even if no such notifier is + * registered - the IOMMU notification framework will + * skip MAP notifications in that case. + */ + vtd_sync_shadow_page_table(vtd_as); + } + } + } +} + +/* Context-cache invalidation + * Returns the Context Actual Invalidation Granularity. + * @val: the content of the CCMD_REG + */ +static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) +{ + uint64_t caig; + uint64_t type = val & VTD_CCMD_CIRG_MASK; + + switch (type) { + case VTD_CCMD_DOMAIN_INVL: + /* Fall through */ + case VTD_CCMD_GLOBAL_INVL: + caig = VTD_CCMD_GLOBAL_INVL_A; + vtd_context_global_invalidate(s); + break; + + case VTD_CCMD_DEVICE_INVL: + caig = VTD_CCMD_DEVICE_INVL_A; + vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val)); + break; + + default: + error_report_once("%s: invalid context: 0x%" PRIx64, + __func__, val); + caig = 0; + } + return caig; +} + +static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) +{ + trace_vtd_inv_desc_iotlb_global(); + vtd_reset_iotlb(s); + vtd_iommu_replay_all(s); +} + +static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) +{ + VTDContextEntry ce; + VTDAddressSpace *vtd_as; + + trace_vtd_inv_desc_iotlb_domain(domain_id); + + vtd_iommu_lock(s); + g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, + &domain_id); + vtd_iommu_unlock(s); + + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce) && + domain_id == vtd_get_domain_id(s, &ce)) { + vtd_sync_shadow_page_table(vtd_as); + } + } +} + +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, + uint16_t domain_id, hwaddr addr, + uint8_t am) +{ + VTDAddressSpace *vtd_as; + VTDContextEntry ce; + int ret; + hwaddr size = (1 << am) * VTD_PAGE_SIZE; + + QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce); + if (!ret && domain_id == vtd_get_domain_id(s, &ce)) { + if (vtd_as_has_map_notifier(vtd_as)) { + /* + * As long as we have MAP
notifications registered in + * any of our IOMMU notifiers, we need to sync the + * shadow page table. + */ + vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); + } else { + /* + * For UNMAP-only notifiers, we don't need to walk the + * page tables. We just deliver the PSI down to + * invalidate caches. + */ + IOMMUTLBEvent event = { + .type = IOMMU_NOTIFIER_UNMAP, + .entry = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = 0, + .addr_mask = size - 1, + .perm = IOMMU_NONE, + }, + }; + memory_region_notify_iommu(&vtd_as->iommu, 0, event); + } + } + } +} + +static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, + hwaddr addr, uint8_t am) +{ + VTDIOTLBPageInvInfo info; + + trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); + + assert(am <= VTD_MAMV); + info.domain_id = domain_id; + info.addr = addr; + info.mask = ~((1 << am) - 1); + vtd_iommu_lock(s); + g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); + vtd_iommu_unlock(s); + vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); +} + +/* Flush IOTLB + * Returns the IOTLB Actual Invalidation Granularity. + * @val: the content of the IOTLB_REG + */ +static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val) +{ + uint64_t iaig; + uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK; + uint16_t domain_id; + hwaddr addr; + uint8_t am; + + switch (type) { + case VTD_TLB_GLOBAL_FLUSH: + iaig = VTD_TLB_GLOBAL_FLUSH_A; + vtd_iotlb_global_invalidate(s); + break; + + case VTD_TLB_DSI_FLUSH: + domain_id = VTD_TLB_DID(val); + iaig = VTD_TLB_DSI_FLUSH_A; + vtd_iotlb_domain_invalidate(s, domain_id); + break; + + case VTD_TLB_PSI_FLUSH: + domain_id = VTD_TLB_DID(val); + addr = vtd_get_quad_raw(s, DMAR_IVA_REG); + am = VTD_IVA_AM(addr); + addr = VTD_IVA_ADDR(addr); + if (am > VTD_MAMV) { + error_report_once("%s: address mask overflow: 0x%" PRIx64, + __func__, vtd_get_quad_raw(s, DMAR_IVA_REG)); + iaig = 0; + break; + } + iaig = VTD_TLB_PSI_FLUSH_A; + vtd_iotlb_page_invalidate(s, domain_id, addr, am); + break; + + default: + error_report_once("%s: invalid granularity: 0x%" PRIx64, + __func__, val); + iaig = 0; + } + return iaig; +} + +static void vtd_fetch_inv_desc(IntelIOMMUState *s); + +static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s) +{ + return s->qi_enabled && (s->iq_tail == s->iq_head) && + (s->iq_last_desc_type == VTD_INV_DESC_WAIT); +} + +static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) +{ + uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG); + + trace_vtd_inv_qi_enable(en); + + if (en) { + s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits); + /* 2^(x+8) entries */ + s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0)); + s->qi_enabled = true; + trace_vtd_inv_qi_setup(s->iq, s->iq_size); + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES); + + if (s->iq_tail != 0) { + /* + * This is a spec violation but Windows guests are known to set up + * Queued Invalidation this way so we allow the write and process + * Invalidation Descriptors right away. 
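+ * For example, a guest that programs DMAR_IQA_REG and a non-zero + * DMAR_IQT_REG before setting GCMD.QIE ends up here; the fetch below + * then consumes the descriptors that are already queued.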
+ */ + trace_vtd_warn_invalid_qi_tail(s->iq_tail); + if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { + vtd_fetch_inv_desc(s); + } + } + } else { + if (vtd_queued_inv_disable_check(s)) { + /* disable Queued Invalidation */ + vtd_set_quad_raw(s, DMAR_IQH_REG, 0); + s->iq_head = 0; + s->qi_enabled = false; + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0); + } else { + error_report_once("%s: detected improper state when disable QI " + "(head=0x%x, tail=0x%x, last_type=%d)", + __func__, + s->iq_head, s->iq_tail, s->iq_last_desc_type); + } + } +} + +/* Set Root Table Pointer */ +static void vtd_handle_gcmd_srtp(IntelIOMMUState *s) +{ + vtd_root_table_setup(s); + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS); + vtd_reset_caches(s); + vtd_address_space_refresh_all(s); +} + +/* Set Interrupt Remap Table Pointer */ +static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) +{ + vtd_interrupt_remap_table_setup(s); + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); +} + +/* Handle Translation Enable/Disable */ +static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) +{ + if (s->dmar_enabled == en) { + return; + } + + trace_vtd_dmar_enable(en); + + if (en) { + s->dmar_enabled = true; + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES); + } else { + s->dmar_enabled = false; + + /* Clear the index of Fault Recording Register */ + s->next_frcd_reg = 0; + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0); + } + + vtd_reset_caches(s); + vtd_address_space_refresh_all(s); +} + +/* Handle Interrupt Remap Enable/Disable */ +static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en) +{ + trace_vtd_ir_enable(en); + + if (en) { + s->intr_enabled = true; + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES); + } else { + s->intr_enabled = false; + /* Ok - report back to driver */ + vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0); + } +} + +/* Handle write to Global Command Register */ +static void vtd_handle_gcmd_write(IntelIOMMUState *s) +{ + uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG); + uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG); + uint32_t changed = status ^ val; + + trace_vtd_reg_write_gcmd(status, val); + if (changed & VTD_GCMD_TE) { + /* Translation enable/disable */ + vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); + } + if (val & VTD_GCMD_SRTP) { + /* Set/update the root-table pointer */ + vtd_handle_gcmd_srtp(s); + } + if (changed & VTD_GCMD_QIE) { + /* Queued Invalidation Enable */ + vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE); + } + if (val & VTD_GCMD_SIRTP) { + /* Set/update the interrupt remapping root-table pointer */ + vtd_handle_gcmd_sirtp(s); + } + if (changed & VTD_GCMD_IRE) { + /* Interrupt remap enable/disable */ + vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE); + } +} + +/* Handle write to Context Command Register */ +static void vtd_handle_ccmd_write(IntelIOMMUState *s) +{ + uint64_t ret; + uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG); + + /* Context-cache invalidation request */ + if (val & VTD_CCMD_ICC) { + if (s->qi_enabled) { + error_report_once("Queued Invalidation enabled, " + "should not use register-based invalidation"); + return; + } + ret = vtd_context_cache_invalidate(s, val); + /* Invalidation completed. 
Change something to show */ + vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL); + ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK, + ret); + } +} + +/* Handle write to IOTLB Invalidation Register */ +static void vtd_handle_iotlb_write(IntelIOMMUState *s) +{ + uint64_t ret; + uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG); + + /* IOTLB invalidation request */ + if (val & VTD_TLB_IVT) { + if (s->qi_enabled) { + error_report_once("Queued Invalidation enabled, " + "should not use register-based invalidation"); + return; + } + ret = vtd_iotlb_flush(s, val); + /* Invalidation completed. Change something to show */ + vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL); + ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, + VTD_TLB_FLUSH_GRANU_MASK_A, ret); + } +} + +/* Fetch an Invalidation Descriptor from the Invalidation Queue */ +static bool vtd_get_inv_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + dma_addr_t base_addr = s->iq; + uint32_t offset = s->iq_head; + uint32_t dw = s->iq_dw ? 32 : 16; + dma_addr_t addr = base_addr + offset * dw; + + if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) { + error_report_once("Read INV DESC failed."); + return false; + } + inv_desc->lo = le64_to_cpu(inv_desc->lo); + inv_desc->hi = le64_to_cpu(inv_desc->hi); + if (dw == 32) { + inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]); + inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]); + } + return true; +} + +static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) +{ + if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) || + (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) { + error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 + " (reserved nonzero)", __func__, inv_desc->hi, + inv_desc->lo); + return false; + } + if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) { + /* Status Write */ + uint32_t status_data = (uint32_t)(inv_desc->lo >> + VTD_INV_DESC_WAIT_DATA_SHIFT); + + assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF)); + + /* FIXME: need to be masked with HAW? 
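+ * If so, masking with VTD_HAW_MASK(s->aw_bits) would be the obvious + * candidate; for now status_addr is taken from the descriptor as-is.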
*/ + dma_addr_t status_addr = inv_desc->hi; + trace_vtd_inv_desc_wait_sw(status_addr, status_data); + status_data = cpu_to_le32(status_data); + if (dma_memory_write(&address_space_memory, status_addr, &status_data, + sizeof(status_data))) { + trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); + return false; + } + } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { + /* Interrupt flag */ + vtd_generate_completion_event(s); + } else { + error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 + " (unknown type)", __func__, inv_desc->hi, + inv_desc->lo); + return false; + } + return true; +} + +static bool vtd_process_context_cache_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + uint16_t sid, fmask; + + if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) { + error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 + " (reserved nonzero)", __func__, inv_desc->hi, + inv_desc->lo); + return false; + } + switch (inv_desc->lo & VTD_INV_DESC_CC_G) { + case VTD_INV_DESC_CC_DOMAIN: + trace_vtd_inv_desc_cc_domain( + (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); + /* Fall through */ + case VTD_INV_DESC_CC_GLOBAL: + vtd_context_global_invalidate(s); + break; + + case VTD_INV_DESC_CC_DEVICE: + sid = VTD_INV_DESC_CC_SID(inv_desc->lo); + fmask = VTD_INV_DESC_CC_FM(inv_desc->lo); + vtd_context_device_invalidate(s, sid, fmask); + break; + + default: + error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 + " (invalid type)", __func__, inv_desc->hi, + inv_desc->lo); + return false; + } + return true; +} + +static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) +{ + uint16_t domain_id; + uint8_t am; + hwaddr addr; + + if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) || + (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) { + error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 + ", lo=0x%"PRIx64" (reserved bits unzero)", + __func__, inv_desc->hi, inv_desc->lo); + return false; + } + + switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) { + case VTD_INV_DESC_IOTLB_GLOBAL: + vtd_iotlb_global_invalidate(s); + break; + + case VTD_INV_DESC_IOTLB_DOMAIN: + domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); + vtd_iotlb_domain_invalidate(s, domain_id); + break; + + case VTD_INV_DESC_IOTLB_PAGE: + domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); + addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi); + am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi); + if (am > VTD_MAMV) { + error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 + ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)", + __func__, inv_desc->hi, inv_desc->lo, + am, (unsigned)VTD_MAMV); + return false; + } + vtd_iotlb_page_invalidate(s, domain_id, addr, am); + break; + + default: + error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 + ", lo=0x%"PRIx64" (type mismatch: 0x%llx)", + __func__, inv_desc->hi, inv_desc->lo, + inv_desc->lo & VTD_INV_DESC_IOTLB_G); + return false; + } + return true; +} + +static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + trace_vtd_inv_desc_iec(inv_desc->iec.granularity, + inv_desc->iec.index, + inv_desc->iec.index_mask); + + vtd_iec_notify_all(s, !inv_desc->iec.granularity, + inv_desc->iec.index, + inv_desc->iec.index_mask); + return true; +} + +static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + VTDAddressSpace *vtd_dev_as; + IOMMUTLBEvent event; + struct VTDBus *vtd_bus; + hwaddr addr; + uint64_t sz; + uint16_t sid; + uint8_t devfn; + bool size; + uint8_t bus_num; + + addr = 
VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi); + sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo); + devfn = sid & 0xff; + bus_num = sid >> 8; + size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi); + + if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) || + (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) { + error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64 + ", lo=%"PRIx64" (reserved nonzero)", __func__, + inv_desc->hi, inv_desc->lo); + return false; + } + + vtd_bus = vtd_find_as_from_bus_num(s, bus_num); + if (!vtd_bus) { + goto done; + } + + vtd_dev_as = vtd_bus->dev_as[devfn]; + if (!vtd_dev_as) { + goto done; + } + + /* According to ATS spec table 2.4: + * S = 0, bits 15:12 = xxxx range size: 4K + * S = 1, bits 15:12 = xxx0 range size: 8K + * S = 1, bits 15:12 = xx01 range size: 16K + * S = 1, bits 15:12 = x011 range size: 32K + * S = 1, bits 15:12 = 0111 range size: 64K + * ... + */ + if (size) { + sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT); + addr &= ~(sz - 1); + } else { + sz = VTD_PAGE_SIZE; + } + + event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP; + event.entry.target_as = &vtd_dev_as->as; + event.entry.addr_mask = sz - 1; + event.entry.iova = addr; + event.entry.perm = IOMMU_NONE; + event.entry.translated_addr = 0; + memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event); + +done: + return true; +} + +static bool vtd_process_inv_desc(IntelIOMMUState *s) +{ + VTDInvDesc inv_desc; + uint8_t desc_type; + + trace_vtd_inv_qi_head(s->iq_head); + if (!vtd_get_inv_desc(s, &inv_desc)) { + s->iq_last_desc_type = VTD_INV_DESC_NONE; + return false; + } + + desc_type = inv_desc.lo & VTD_INV_DESC_TYPE; + /* FIXME: should update at first or at last? */ + s->iq_last_desc_type = desc_type; + + switch (desc_type) { + case VTD_INV_DESC_CC: + trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo); + if (!vtd_process_context_cache_desc(s, &inv_desc)) { + return false; + } + break; + + case VTD_INV_DESC_IOTLB: + trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo); + if (!vtd_process_iotlb_desc(s, &inv_desc)) { + return false; + } + break; + + /* + * TODO: the entity of below two cases will be implemented in future series. + * To make guest (which integrates scalable mode support patch set in + * iommu driver) work, just return true is enough so far. + */ + case VTD_INV_DESC_PC: + break; + + case VTD_INV_DESC_PIOTLB: + break; + + case VTD_INV_DESC_WAIT: + trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo); + if (!vtd_process_wait_desc(s, &inv_desc)) { + return false; + } + break; + + case VTD_INV_DESC_IEC: + trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo); + if (!vtd_process_inv_iec_desc(s, &inv_desc)) { + return false; + } + break; + + case VTD_INV_DESC_DEVICE: + trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo); + if (!vtd_process_device_iotlb_desc(s, &inv_desc)) { + return false; + } + break; + + default: + error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64 + " (unknown type)", __func__, inv_desc.hi, + inv_desc.lo); + return false; + } + s->iq_head++; + if (s->iq_head == s->iq_size) { + s->iq_head = 0; + } + return true; +} + +/* Try to fetch and process more Invalidation Descriptors */ +static void vtd_fetch_inv_desc(IntelIOMMUState *s) +{ + int qi_shift; + + /* Refer to 10.4.23 of VT-d spec 3.0 */ + qi_shift = s->iq_dw ? 
VTD_IQH_QH_SHIFT_5 : VTD_IQH_QH_SHIFT_4; + + trace_vtd_inv_qi_fetch(); + + if (s->iq_tail >= s->iq_size) { + /* Detects an invalid Tail pointer */ + error_report_once("%s: detected invalid QI tail " + "(tail=0x%x, size=0x%x)", + __func__, s->iq_tail, s->iq_size); + vtd_handle_inv_queue_error(s); + return; + } + while (s->iq_head != s->iq_tail) { + if (!vtd_process_inv_desc(s)) { + /* Invalidation Queue Errors */ + vtd_handle_inv_queue_error(s); + break; + } + /* Must update the IQH_REG in time */ + vtd_set_quad_raw(s, DMAR_IQH_REG, + (((uint64_t)(s->iq_head)) << qi_shift) & + VTD_IQH_QH_MASK); + } +} + +/* Handle write to Invalidation Queue Tail Register */ +static void vtd_handle_iqt_write(IntelIOMMUState *s) +{ + uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG); + + if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) { + error_report_once("%s: RSV bit is set: val=0x%"PRIx64, + __func__, val); + return; + } + s->iq_tail = VTD_IQT_QT(s->iq_dw, val); + trace_vtd_inv_qi_tail(s->iq_tail); + + if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { + /* Process Invalidation Queue here */ + vtd_fetch_inv_desc(s); + } +} + +static void vtd_handle_fsts_write(IntelIOMMUState *s) +{ + uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); + uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); + uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE; + + if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) { + vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); + trace_vtd_fsts_clear_ip(); + } + /* FIXME: when IQE is Clear, should we try to fetch some Invalidation + * Descriptors if there are any when Queued Invalidation is enabled? + */ +} + +static void vtd_handle_fectl_write(IntelIOMMUState *s) +{ + uint32_t fectl_reg; + /* FIXME: when software clears the IM field, check the IP field. But do we + * need to compare the old value and the new value to conclude that + * software clears the IM field? Or just check if the IM field is zero? + */ + fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); + + trace_vtd_reg_write_fectl(fectl_reg); + + if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) { + vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); + vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); + } +} + +static void vtd_handle_ics_write(IntelIOMMUState *s) +{ + uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG); + uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); + + if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) { + trace_vtd_reg_ics_clear_ip(); + vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); + } +} + +static void vtd_handle_iectl_write(IntelIOMMUState *s) +{ + uint32_t iectl_reg; + /* FIXME: when software clears the IM field, check the IP field. But do we + * need to compare the old value and the new value to conclude that + * software clears the IM field? Or just check if the IM field is zero? 
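+ * The check below takes the simpler reading: the interrupt is generated + * whenever IP is set while IM reads as zero, without comparing the old + * and new register values.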
+ */ + iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); + + trace_vtd_reg_write_iectl(iectl_reg); + + if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) { + vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); + vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); + } +} + +static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) +{ + IntelIOMMUState *s = opaque; + uint64_t val; + + trace_vtd_reg_read(addr, size); + + if (addr + size > DMAR_REG_SIZE) { + error_report_once("%s: MMIO over range: addr=0x%" PRIx64 + " size=0x%x", __func__, addr, size); + return (uint64_t)-1; + } + + switch (addr) { + /* Root Table Address Register, 64-bit */ + case DMAR_RTADDR_REG: + val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); + if (size == 4) { + val = val & ((1ULL << 32) - 1); + } + break; + + case DMAR_RTADDR_REG_HI: + assert(size == 4); + val = vtd_get_quad_raw(s, DMAR_RTADDR_REG) >> 32; + break; + + /* Invalidation Queue Address Register, 64-bit */ + case DMAR_IQA_REG: + val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS); + if (size == 4) { + val = val & ((1ULL << 32) - 1); + } + break; + + case DMAR_IQA_REG_HI: + assert(size == 4); + val = s->iq >> 32; + break; + + default: + if (size == 4) { + val = vtd_get_long(s, addr); + } else { + val = vtd_get_quad(s, addr); + } + } + + return val; +} + +static void vtd_mem_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + IntelIOMMUState *s = opaque; + + trace_vtd_reg_write(addr, size, val); + + if (addr + size > DMAR_REG_SIZE) { + error_report_once("%s: MMIO over range: addr=0x%" PRIx64 + " size=0x%x", __func__, addr, size); + return; + } + + switch (addr) { + /* Global Command Register, 32-bit */ + case DMAR_GCMD_REG: + vtd_set_long(s, addr, val); + vtd_handle_gcmd_write(s); + break; + + /* Context Command Register, 64-bit */ + case DMAR_CCMD_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + vtd_handle_ccmd_write(s); + } + break; + + case DMAR_CCMD_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_ccmd_write(s); + break; + + /* IOTLB Invalidation Register, 64-bit */ + case DMAR_IOTLB_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + vtd_handle_iotlb_write(s); + } + break; + + case DMAR_IOTLB_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_iotlb_write(s); + break; + + /* Invalidate Address Register, 64-bit */ + case DMAR_IVA_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + break; + + case DMAR_IVA_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Fault Status Register, 32-bit */ + case DMAR_FSTS_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_fsts_write(s); + break; + + /* Fault Event Control Register, 32-bit */ + case DMAR_FECTL_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_fectl_write(s); + break; + + /* Fault Event Data Register, 32-bit */ + case DMAR_FEDATA_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Fault Event Address Register, 32-bit */ + case DMAR_FEADDR_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + /* + * While the register is 32-bit only, some guests (Xen...) write to + * it with 64-bit. 
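+ * Accepting the oversized access here keeps such guests running instead + * of tripping the size assertion used for the other 32-bit registers.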
+ */ + vtd_set_quad(s, addr, val); + } + break; + + /* Fault Event Upper Address Register, 32-bit */ + case DMAR_FEUADDR_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Protected Memory Enable Register, 32-bit */ + case DMAR_PMEN_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Root Table Address Register, 64-bit */ + case DMAR_RTADDR_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + break; + + case DMAR_RTADDR_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Invalidation Queue Tail Register, 64-bit */ + case DMAR_IQT_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + vtd_handle_iqt_write(s); + break; + + case DMAR_IQT_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + /* 19:63 of IQT_REG is RsvdZ, do nothing here */ + break; + + /* Invalidation Queue Address Register, 64-bit */ + case DMAR_IQA_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + if (s->ecap & VTD_ECAP_SMTS && + val & VTD_IQA_DW_MASK) { + s->iq_dw = true; + } else { + s->iq_dw = false; + } + break; + + case DMAR_IQA_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Invalidation Completion Status Register, 32-bit */ + case DMAR_ICS_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_ics_write(s); + break; + + /* Invalidation Event Control Register, 32-bit */ + case DMAR_IECTL_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_iectl_write(s); + break; + + /* Invalidation Event Data Register, 32-bit */ + case DMAR_IEDATA_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Invalidation Event Address Register, 32-bit */ + case DMAR_IEADDR_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Invalidation Event Upper Address Register, 32-bit */ + case DMAR_IEUADDR_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + /* Fault Recording Registers, 128-bit */ + case DMAR_FRCD_REG_0_0: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + break; + + case DMAR_FRCD_REG_0_1: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + case DMAR_FRCD_REG_0_2: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + /* May clear bit 127 (Fault), update PPF */ + vtd_update_fsts_ppf(s); + } + break; + + case DMAR_FRCD_REG_0_3: + assert(size == 4); + vtd_set_long(s, addr, val); + /* May clear bit 127 (Fault), update PPF */ + vtd_update_fsts_ppf(s); + break; + + case DMAR_IRTA_REG: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + break; + + case DMAR_IRTA_REG_HI: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + + default: + if (size == 4) { + vtd_set_long(s, addr, val); + } else { + vtd_set_quad(s, addr, val); + } + } +} + +static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr, + IOMMUAccessFlags flag, int iommu_idx) +{ + VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); + IntelIOMMUState *s = vtd_as->iommu_state; + IOMMUTLBEntry iotlb = { + /* We'll fill in the rest later. 
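+ * (iova, translated_addr, addr_mask and perm are filled in either by the + * translation below or by the DMAR-disabled fallback)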
*/ + .target_as = &address_space_memory, + }; + bool success; + + if (likely(s->dmar_enabled)) { + success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, + addr, flag & IOMMU_WO, &iotlb); + } else { + /* DMAR disabled, passthrough, use 4k-page*/ + iotlb.iova = addr & VTD_PAGE_MASK_4K; + iotlb.translated_addr = addr & VTD_PAGE_MASK_4K; + iotlb.addr_mask = ~VTD_PAGE_MASK_4K; + iotlb.perm = IOMMU_RW; + success = true; + } + + if (likely(success)) { + trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus), + VTD_PCI_SLOT(vtd_as->devfn), + VTD_PCI_FUNC(vtd_as->devfn), + iotlb.iova, iotlb.translated_addr, + iotlb.addr_mask); + } else { + error_report_once("%s: detected translation failure " + "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")", + __func__, pci_bus_num(vtd_as->bus), + VTD_PCI_SLOT(vtd_as->devfn), + VTD_PCI_FUNC(vtd_as->devfn), + addr); + } + + return iotlb; +} + +static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new, + Error **errp) +{ + VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); + IntelIOMMUState *s = vtd_as->iommu_state; + + /* Update per-address-space notifier flags */ + vtd_as->notifier_flags = new; + + if (old == IOMMU_NOTIFIER_NONE) { + QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(vtd_as, next); + } + return 0; +} + +static int vtd_post_load(void *opaque, int version_id) +{ + IntelIOMMUState *iommu = opaque; + + /* + * Memory regions are dynamically turned on/off depending on + * context entry configurations from the guest. After migration, + * we need to make sure the memory regions are still correct. + */ + vtd_switch_address_space_all(iommu); + + /* + * We don't need to migrate the root_scalable because we can + * simply do the calculation after the loading is complete. We + * can actually do similar things with root, dmar_enabled, etc. + * however since we've had them already so we'd better keep them + * for compatibility of migration. 
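+ * root_scalable is therefore recomputed below from the migrated root + * table pointer rather than being carried in the migration stream.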
+ */ + vtd_update_scalable_state(iommu); + + return 0; +} + +static const VMStateDescription vtd_vmstate = { + .name = "iommu-intel", + .version_id = 1, + .minimum_version_id = 1, + .priority = MIG_PRI_IOMMU, + .post_load = vtd_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT64(root, IntelIOMMUState), + VMSTATE_UINT64(intr_root, IntelIOMMUState), + VMSTATE_UINT64(iq, IntelIOMMUState), + VMSTATE_UINT32(intr_size, IntelIOMMUState), + VMSTATE_UINT16(iq_head, IntelIOMMUState), + VMSTATE_UINT16(iq_tail, IntelIOMMUState), + VMSTATE_UINT16(iq_size, IntelIOMMUState), + VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState), + VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE), + VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState), + VMSTATE_UNUSED(1), /* bool root_extended is obsolete by VT-d */ + VMSTATE_BOOL(dmar_enabled, IntelIOMMUState), + VMSTATE_BOOL(qi_enabled, IntelIOMMUState), + VMSTATE_BOOL(intr_enabled, IntelIOMMUState), + VMSTATE_BOOL(intr_eime, IntelIOMMUState), + VMSTATE_END_OF_LIST() + } +}; + +static const MemoryRegionOps vtd_mem_ops = { + .read = vtd_mem_read, + .write = vtd_mem_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 8, + }, + .valid = { + .min_access_size = 4, + .max_access_size = 8, + }, +}; + +static Property vtd_properties[] = { + DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0), + DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, + ON_OFF_AUTO_AUTO), + DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), + DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits, + VTD_HOST_ADDRESS_WIDTH), + DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), + DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE), + DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), + DEFINE_PROP_END_OF_LIST(), +}; + +/* Read IRTE entry with specific index */ +static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, + VTD_IR_TableEntry *entry, uint16_t sid) +{ + static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \ + {0xffff, 0xfffb, 0xfff9, 0xfff8}; + dma_addr_t addr = 0x00; + uint16_t mask, source_id; + uint8_t bus, bus_max, bus_min; + + if (index >= iommu->intr_size) { + error_report_once("%s: index too large: ind=0x%x", + __func__, index); + return -VTD_FR_IR_INDEX_OVER; + } + + addr = iommu->intr_root + index * sizeof(*entry); + if (dma_memory_read(&address_space_memory, addr, entry, + sizeof(*entry))) { + error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64, + __func__, index, addr); + return -VTD_FR_IR_ROOT_INVAL; + } + + trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]), + le64_to_cpu(entry->data[0])); + + if (!entry->irte.present) { + error_report_once("%s: detected non-present IRTE " + "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", + __func__, index, le64_to_cpu(entry->data[1]), + le64_to_cpu(entry->data[0])); + return -VTD_FR_IR_ENTRY_P; + } + + if (entry->irte.__reserved_0 || entry->irte.__reserved_1 || + entry->irte.__reserved_2) { + error_report_once("%s: detected non-zero reserved IRTE " + "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", + __func__, index, le64_to_cpu(entry->data[1]), + le64_to_cpu(entry->data[0])); + return -VTD_FR_IR_IRTE_RSVD; + } + + if (sid != X86_IOMMU_SID_INVALID) { + /* Validate IRTE SID */ + source_id = le32_to_cpu(entry->irte.source_id); + switch (entry->irte.sid_vtype) { + case VTD_SVT_NONE: + break; + + case VTD_SVT_ALL: + mask = vtd_svt_mask[entry->irte.sid_q]; + if ((source_id & mask) 
!= (sid & mask)) { + error_report_once("%s: invalid IRTE SID " + "(index=%u, sid=%u, source_id=%u)", + __func__, index, sid, source_id); + return -VTD_FR_IR_SID_ERR; + } + break; + + case VTD_SVT_BUS: + bus_max = source_id >> 8; + bus_min = source_id & 0xff; + bus = sid >> 8; + if (bus > bus_max || bus < bus_min) { + error_report_once("%s: invalid SVT_BUS " + "(index=%u, bus=%u, min=%u, max=%u)", + __func__, index, bus, bus_min, bus_max); + return -VTD_FR_IR_SID_ERR; + } + break; + + default: + error_report_once("%s: detected invalid IRTE SVT " + "(index=%u, type=%d)", __func__, + index, entry->irte.sid_vtype); + /* Take this as verification failure. */ + return -VTD_FR_IR_SID_ERR; + } + } + + return 0; +} + +/* Fetch IRQ information of specific IR index */ +static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, + X86IOMMUIrq *irq, uint16_t sid) +{ + VTD_IR_TableEntry irte = {}; + int ret = 0; + + ret = vtd_irte_get(iommu, index, &irte, sid); + if (ret) { + return ret; + } + + irq->trigger_mode = irte.irte.trigger_mode; + irq->vector = irte.irte.vector; + irq->delivery_mode = irte.irte.delivery_mode; + irq->dest = le32_to_cpu(irte.irte.dest_id); + if (!iommu->intr_eime) { +#define VTD_IR_APIC_DEST_MASK (0xff00ULL) +#define VTD_IR_APIC_DEST_SHIFT (8) + irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >> + VTD_IR_APIC_DEST_SHIFT; + } + irq->dest_mode = irte.irte.dest_mode; + irq->redir_hint = irte.irte.redir_hint; + + trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector, + irq->delivery_mode, irq->dest, irq->dest_mode); + + return 0; +} + +/* Interrupt remapping for MSI/MSI-X entry */ +static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, + MSIMessage *origin, + MSIMessage *translated, + uint16_t sid) +{ + int ret = 0; + VTD_IR_MSIAddress addr; + uint16_t index; + X86IOMMUIrq irq = {}; + + assert(origin && translated); + + trace_vtd_ir_remap_msi_req(origin->address, origin->data); + + if (!iommu || !iommu->intr_enabled) { + memcpy(translated, origin, sizeof(*origin)); + goto out; + } + + if (origin->address & VTD_MSI_ADDR_HI_MASK) { + error_report_once("%s: MSI address high 32 bits non-zero detected: " + "address=0x%" PRIx64, __func__, origin->address); + return -VTD_FR_IR_REQ_RSVD; + } + + addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; + if (addr.addr.__head != 0xfee) { + error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32, + __func__, addr.data); + return -VTD_FR_IR_REQ_RSVD; + } + + /* This is compatible mode. 
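+ * Requests in compatibility format are not remapped; they are passed + * through to the destination unchanged.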
*/ + if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) { + memcpy(translated, origin, sizeof(*origin)); + goto out; + } + + index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l); + +#define VTD_IR_MSI_DATA_SUBHANDLE (0x0000ffff) +#define VTD_IR_MSI_DATA_RESERVED (0xffff0000) + + if (addr.addr.sub_valid) { + /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */ + index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE; + } + + ret = vtd_remap_irq_get(iommu, index, &irq, sid); + if (ret) { + return ret; + } + + if (addr.addr.sub_valid) { + trace_vtd_ir_remap_type("MSI"); + if (origin->data & VTD_IR_MSI_DATA_RESERVED) { + error_report_once("%s: invalid IR MSI " + "(sid=%u, address=0x%" PRIx64 + ", data=0x%" PRIx32 ")", + __func__, sid, origin->address, origin->data); + return -VTD_FR_IR_REQ_RSVD; + } + } else { + uint8_t vector = origin->data & 0xff; + uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; + + trace_vtd_ir_remap_type("IOAPIC"); + /* IOAPIC entry vector should be aligned with IRTE vector + * (see vt-d spec 5.1.5.1). */ + if (vector != irq.vector) { + trace_vtd_warn_ir_vector(sid, index, vector, irq.vector); + } + + /* The Trigger Mode field must match the Trigger Mode in the IRTE. + * (see vt-d spec 5.1.5.1). */ + if (trigger_mode != irq.trigger_mode) { + trace_vtd_warn_ir_trigger(sid, index, trigger_mode, + irq.trigger_mode); + } + } + + /* + * We'd better keep the last two bits, assuming that guest OS + * might modify it. Keep it does not hurt after all. + */ + irq.msi_addr_last_bits = addr.addr.__not_care; + + /* Translate X86IOMMUIrq to MSI message */ + x86_iommu_irq_to_msi_message(&irq, translated); + +out: + trace_vtd_ir_remap_msi(origin->address, origin->data, + translated->address, translated->data); + return 0; +} + +static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src, + MSIMessage *dst, uint16_t sid) +{ + return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), + src, dst, sid); +} + +static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr, + uint64_t *data, unsigned size, + MemTxAttrs attrs) +{ + return MEMTX_OK; +} + +static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, + uint64_t value, unsigned size, + MemTxAttrs attrs) +{ + int ret = 0; + MSIMessage from = {}, to = {}; + uint16_t sid = X86_IOMMU_SID_INVALID; + + from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST; + from.data = (uint32_t) value; + + if (!attrs.unspecified) { + /* We have explicit Source ID */ + sid = attrs.requester_id; + } + + ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid); + if (ret) { + /* TODO: report error */ + /* Drop this interrupt */ + return MEMTX_ERROR; + } + + apic_get_class()->send_msi(&to); + + return MEMTX_OK; +} + +static const MemoryRegionOps vtd_mem_ir_ops = { + .read_with_attrs = vtd_mem_ir_read, + .write_with_attrs = vtd_mem_ir_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, + .valid = { + .min_access_size = 4, + .max_access_size = 4, + }, +}; + +VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) +{ + uintptr_t key = (uintptr_t)bus; + VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key); + VTDAddressSpace *vtd_dev_as; + char name[128]; + + if (!vtd_bus) { + uintptr_t *new_key = g_malloc(sizeof(*new_key)); + *new_key = (uintptr_t)bus; + /* No corresponding free() */ + vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \ + PCI_DEVFN_MAX); + vtd_bus->bus = bus; + g_hash_table_insert(s->vtd_as_by_busptr, 
new_key, vtd_bus); + } + + vtd_dev_as = vtd_bus->dev_as[devfn]; + + if (!vtd_dev_as) { + snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn), + PCI_FUNC(devfn)); + vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace)); + + vtd_dev_as->bus = bus; + vtd_dev_as->devfn = (uint8_t)devfn; + vtd_dev_as->iommu_state = s; + vtd_dev_as->context_cache_entry.context_cache_gen = 0; + vtd_dev_as->iova_tree = iova_tree_new(); + + memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX); + address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root"); + + /* + * Build the DMAR-disabled container with aliases to the + * shared MRs. Note that aliasing to a shared memory region + * could help the memory API to detect same FlatViews so we + * can have devices to share the same FlatView when DMAR is + * disabled (either by not providing "intel_iommu=on" or with + * "iommu=pt"). It will greatly reduce the total number of + * FlatViews of the system hence VM runs faster. + */ + memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s), + "vtd-nodmar", &s->mr_nodmar, 0, + memory_region_size(&s->mr_nodmar)); + + /* + * Build the per-device DMAR-enabled container. + * + * TODO: currently we have per-device IOMMU memory region only + * because we have per-device IOMMU notifiers for devices. If + * one day we can abstract the IOMMU notifiers out of the + * memory regions then we can also share the same memory + * region here just like what we've done above with the nodmar + * region. + */ + strcat(name, "-dmar"); + memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu), + TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s), + name, UINT64_MAX); + memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir", + &s->mr_ir, 0, memory_region_size(&s->mr_ir)); + memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu), + VTD_INTERRUPT_ADDR_FIRST, + &vtd_dev_as->iommu_ir, 1); + + /* + * Hook both the containers under the root container, we + * switch between DMAR & noDMAR by enable/disable + * corresponding sub-containers + */ + memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, + MEMORY_REGION(&vtd_dev_as->iommu), + 0); + memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, + &vtd_dev_as->nodmar, 0); + + vtd_switch_address_space(vtd_dev_as); + } + return vtd_dev_as; +} + +/* Unmap the whole range in the notifier's scope. */ +static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) +{ + hwaddr size, remain; + hwaddr start = n->start; + hwaddr end = n->end; + IntelIOMMUState *s = as->iommu_state; + DMAMap map; + + /* + * Note: all the codes in this function has a assumption that IOVA + * bits are no more than VTD_MGAW bits (which is restricted by + * VT-d spec), otherwise we need to consider overflow of 64 bits. 
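+ * As a concrete case, with aw_bits == 48 the clamp below caps 'end' at + * the top of the supported address space (2^48 - 1), so 'end - start + 1' + * cannot wrap around 64 bits.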
+ */ + + if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) { + /* + * Don't need to unmap regions that is bigger than the whole + * VT-d supported address space size + */ + end = VTD_ADDRESS_SIZE(s->aw_bits) - 1; + } + + assert(start <= end); + size = remain = end - start + 1; + + while (remain >= VTD_PAGE_SIZE) { + IOMMUTLBEvent event; + uint64_t mask = dma_aligned_pow2_mask(start, end, s->aw_bits); + uint64_t size = mask + 1; + + assert(size); + + event.type = IOMMU_NOTIFIER_UNMAP; + event.entry.iova = start; + event.entry.addr_mask = mask; + event.entry.target_as = &address_space_memory; + event.entry.perm = IOMMU_NONE; + /* This field is meaningless for unmap */ + event.entry.translated_addr = 0; + + memory_region_notify_iommu_one(n, &event); + + start += size; + remain -= size; + } + + assert(!remain); + + trace_vtd_as_unmap_whole(pci_bus_num(as->bus), + VTD_PCI_SLOT(as->devfn), + VTD_PCI_FUNC(as->devfn), + n->start, size); + + map.iova = n->start; + map.size = size; + iova_tree_remove(as->iova_tree, &map); +} + +static void vtd_address_space_unmap_all(IntelIOMMUState *s) +{ + VTDAddressSpace *vtd_as; + IOMMUNotifier *n; + + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { + vtd_address_space_unmap(vtd_as, n); + } + } +} + +static void vtd_address_space_refresh_all(IntelIOMMUState *s) +{ + vtd_address_space_unmap_all(s); + vtd_switch_address_space_all(s); +} + +static int vtd_replay_hook(IOMMUTLBEvent *event, void *private) +{ + memory_region_notify_iommu_one(private, event); + return 0; +} + +static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) +{ + VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu); + IntelIOMMUState *s = vtd_as->iommu_state; + uint8_t bus_n = pci_bus_num(vtd_as->bus); + VTDContextEntry ce; + + /* + * The replay can be triggered by either a invalidation or a newly + * created entry. No matter what, we release existing mappings + * (it means flushing caches for UNMAP-only registers). + */ + vtd_address_space_unmap(vtd_as, n); + + if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) { + trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" : + "legacy mode", + bus_n, PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn), + vtd_get_domain_id(s, &ce), + ce.hi, ce.lo); + if (vtd_as_has_map_notifier(vtd_as)) { + /* This is required only for MAP typed notifiers */ + vtd_page_walk_info info = { + .hook_fn = vtd_replay_hook, + .private = (void *)n, + .notify_unmap = false, + .aw = s->aw_bits, + .as = vtd_as, + .domain_id = vtd_get_domain_id(s, &ce), + }; + + vtd_page_walk(s, &ce, 0, ~0ULL, &info); + } + } else { + trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn)); + } + + return; +} + +/* Do the initialization. It will also be called when reset, so pay + * attention when adding new initialization stuff. 
+ */ +static void vtd_init(IntelIOMMUState *s) +{ + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + memset(s->csr, 0, DMAR_REG_SIZE); + memset(s->wmask, 0, DMAR_REG_SIZE); + memset(s->w1cmask, 0, DMAR_REG_SIZE); + memset(s->womask, 0, DMAR_REG_SIZE); + + s->root = 0; + s->root_scalable = false; + s->dmar_enabled = false; + s->intr_enabled = false; + s->iq_head = 0; + s->iq_tail = 0; + s->iq = 0; + s->iq_size = 0; + s->qi_enabled = false; + s->iq_last_desc_type = VTD_INV_DESC_NONE; + s->iq_dw = false; + s->next_frcd_reg = 0; + s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | + VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | + VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits); + if (s->dma_drain) { + s->cap |= VTD_CAP_DRAIN; + } + if (s->aw_bits == VTD_HOST_AW_48BIT) { + s->cap |= VTD_CAP_SAGAW_48bit; + } + s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; + + /* + * Rsvd field masks for spte + */ + vtd_spte_rsvd[0] = ~0ULL; + vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); + vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); + vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); + + vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + + if (s->scalable_mode) { + vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; + } + + if (x86_iommu_ir_supported(x86_iommu)) { + s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; + if (s->intr_eim == ON_OFF_AUTO_ON) { + s->ecap |= VTD_ECAP_EIM; + } + assert(s->intr_eim != ON_OFF_AUTO_AUTO); + } + + if (x86_iommu->dt_supported) { + s->ecap |= VTD_ECAP_DT; + } + + if (x86_iommu->pt_supported) { + s->ecap |= VTD_ECAP_PT; + } + + if (s->caching_mode) { + s->cap |= VTD_CAP_CM; + } + + /* TODO: read cap/ecap from host to decide which cap to be exposed. */ + if (s->scalable_mode) { + s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS; + } + + vtd_reset_caches(s); + + /* Define registers with default values and bit semantics */ + vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); + vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); + vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); + vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); + vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); + vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); + vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0); + vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0); + vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL); + + /* Advanced Fault Logging not supported */ + vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL); + vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0); + vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0); + vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0); + + /* Treated as RsvdZ when EIM in ECAP_REG is not supported + * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0); + */ + vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0); + + /* Treated as RO for implementations that PLMR and PHMR fields reported + * as Clear in the CAP_REG. 
+ * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0); + */ + vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0); + + vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0); + vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0); + vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0); + vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL); + vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0); + vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0); + vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0); + /* Treadted as RsvdZ when EIM in ECAP_REG is not supported */ + vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0); + + /* IOTLB registers */ + vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0Xb003ffff00000000ULL, 0); + vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0); + vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL); + + /* Fault Recording Registers, 128-bit */ + vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0); + vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL); + + /* + * Interrupt remapping registers. + */ + vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0); +} + +/* Should not reset address_spaces when reset because devices will still use + * the address space they got at first (won't ask the bus again). + */ +static void vtd_reset(DeviceState *dev) +{ + IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); + + vtd_init(s); + vtd_address_space_refresh_all(s); +} + +static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + assert(0 <= devfn && devfn < PCI_DEVFN_MAX); + + vtd_as = vtd_find_add_as(s, bus, devfn); + return &vtd_as->as; +} + +static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) +{ + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) { + error_setg(errp, "eim=on cannot be selected without intremap=on"); + return false; + } + + if (s->intr_eim == ON_OFF_AUTO_AUTO) { + s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim) + && x86_iommu_ir_supported(x86_iommu) ? + ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; + } + if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { + if (!kvm_irqchip_in_kernel()) { + error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"); + return false; + } + if (!kvm_enable_x2apic()) { + error_setg(errp, "eim=on requires support on the KVM side" + "(X2APIC_API, first shipped in v4.7)"); + return false; + } + } + + /* Currently only address widths supported are 39 and 48 bits */ + if ((s->aw_bits != VTD_HOST_AW_39BIT) && + (s->aw_bits != VTD_HOST_AW_48BIT)) { + error_setg(errp, "Supported values for aw-bits are: %d, %d", + VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT); + return false; + } + + if (s->scalable_mode && !s->dma_drain) { + error_setg(errp, "Need to set dma_drain for scalable mode"); + return false; + } + + return true; +} + +static int vtd_machine_done_notify_one(Object *child, void *unused) +{ + IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); + + /* + * We hard-coded here because vfio-pci is the only special case + * here. Let's be more elegant in the future when we can, but so + * far there seems to be no better way. 
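+ * The reason for the check: assigned vfio-pci devices rely on + * caching-mode=on so that guest mapping updates are propagated to the + * host IOMMU; without it the hook below refuses to continue.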
+ */ + if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) { + vtd_panic_require_caching_mode(); + } + + return 0; +} + +static void vtd_machine_done_hook(Notifier *notifier, void *unused) +{ + object_child_foreach_recursive(object_get_root(), + vtd_machine_done_notify_one, NULL); +} + +static Notifier vtd_machine_done_notify = { + .notify = vtd_machine_done_hook, +}; + +static void vtd_realize(DeviceState *dev, Error **errp) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + PCMachineState *pcms = PC_MACHINE(ms); + X86MachineState *x86ms = X86_MACHINE(ms); + PCIBus *bus = pcms->bus; + IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); + + if (!vtd_decide_config(s, errp)) { + return; + } + + QLIST_INIT(&s->vtd_as_with_notifiers); + qemu_mutex_init(&s->iommu_lock); + memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); + memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, + "intel_iommu", DMAR_REG_SIZE); + + /* Create the shared memory regions by all devices */ + memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar", + UINT64_MAX); + memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops, + s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE); + memory_region_init_alias(&s->mr_sys_alias, OBJECT(s), + "vtd-sys-alias", get_system_memory(), 0, + memory_region_size(get_system_memory())); + memory_region_add_subregion_overlap(&s->mr_nodmar, 0, + &s->mr_sys_alias, 0); + memory_region_add_subregion_overlap(&s->mr_nodmar, + VTD_INTERRUPT_ADDR_FIRST, + &s->mr_ir, 1); + + sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem); + /* No corresponding destroy */ + s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, + g_free, g_free); + s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, + g_free, g_free); + vtd_init(s); + sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); + pci_setup_iommu(bus, vtd_host_dma_iommu, dev); + /* Pseudo address space under root PCI bus. 
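+ * This gives the IOAPIC its own VT-d address space so that the interrupt + * messages it generates can be remapped with a proper source-id.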
*/ + x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); + qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); +} + +static void vtd_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass); + + dc->reset = vtd_reset; + dc->vmsd = &vtd_vmstate; + device_class_set_props(dc, vtd_properties); + dc->hotpluggable = false; + x86_class->realize = vtd_realize; + x86_class->int_remap = vtd_int_remap; + /* Supported by the pc-q35-* machine types */ + dc->user_creatable = true; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + dc->desc = "Intel IOMMU (VT-d) DMA Remapping device"; +} + +static const TypeInfo vtd_info = { + .name = TYPE_INTEL_IOMMU_DEVICE, + .parent = TYPE_X86_IOMMU_DEVICE, + .instance_size = sizeof(IntelIOMMUState), + .class_init = vtd_class_init, +}; + +static void vtd_iommu_memory_region_class_init(ObjectClass *klass, + void *data) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); + + imrc->translate = vtd_iommu_translate; + imrc->notify_flag_changed = vtd_iommu_notify_flag_changed; + imrc->replay = vtd_iommu_replay; +} + +static const TypeInfo vtd_iommu_memory_region_info = { + .parent = TYPE_IOMMU_MEMORY_REGION, + .name = TYPE_INTEL_IOMMU_MEMORY_REGION, + .class_init = vtd_iommu_memory_region_class_init, +}; + +static void vtd_register_types(void) +{ + type_register_static(&vtd_info); + type_register_static(&vtd_iommu_memory_region_info); +} + +type_init(vtd_register_types) diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h new file mode 100644 index 000000000..a6c788049 --- /dev/null +++ b/hw/i386/intel_iommu_internal.h @@ -0,0 +1,518 @@ +/* + * QEMU emulation of an Intel IOMMU (VT-d) + * (DMA Remapping device) + * + * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com> + * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. 
+ * + * Lots of defines copied from kernel/include/linux/intel-iommu.h: + * Copyright (C) 2006-2008 Intel Corporation + * Author: Ashok Raj <ashok.raj@intel.com> + * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> + * + */ + +#ifndef HW_I386_INTEL_IOMMU_INTERNAL_H +#define HW_I386_INTEL_IOMMU_INTERNAL_H +#include "hw/i386/intel_iommu.h" + +/* + * Intel IOMMU register specification + */ +#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */ +#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */ +#define DMAR_CAP_REG_HI 0xc /* High 32-bit of DMAR_CAP_REG */ +#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */ +#define DMAR_ECAP_REG_HI 0X14 +#define DMAR_GCMD_REG 0x18 /* Global command */ +#define DMAR_GSTS_REG 0x1c /* Global status */ +#define DMAR_RTADDR_REG 0x20 /* Root entry table */ +#define DMAR_RTADDR_REG_HI 0X24 +#define DMAR_CCMD_REG 0x28 /* Context command */ +#define DMAR_CCMD_REG_HI 0x2c +#define DMAR_FSTS_REG 0x34 /* Fault status */ +#define DMAR_FECTL_REG 0x38 /* Fault control */ +#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data */ +#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr */ +#define DMAR_FEUADDR_REG 0x44 /* Upper address */ +#define DMAR_AFLOG_REG 0x58 /* Advanced fault control */ +#define DMAR_AFLOG_REG_HI 0X5c +#define DMAR_PMEN_REG 0x64 /* Enable protected memory region */ +#define DMAR_PLMBASE_REG 0x68 /* PMRR low addr */ +#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ +#define DMAR_PHMBASE_REG 0x70 /* PMRR high base addr */ +#define DMAR_PHMBASE_REG_HI 0X74 +#define DMAR_PHMLIMIT_REG 0x78 /* PMRR high limit */ +#define DMAR_PHMLIMIT_REG_HI 0x7c +#define DMAR_IQH_REG 0x80 /* Invalidation queue head */ +#define DMAR_IQH_REG_HI 0X84 +#define DMAR_IQT_REG 0x88 /* Invalidation queue tail */ +#define DMAR_IQT_REG_HI 0X8c +#define DMAR_IQA_REG 0x90 /* Invalidation queue addr */ +#define DMAR_IQA_REG_HI 0x94 +#define DMAR_ICS_REG 0x9c /* Invalidation complete status */ +#define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr */ +#define DMAR_IRTA_REG_HI 0xbc +#define DMAR_IECTL_REG 0xa0 /* Invalidation event control */ +#define DMAR_IEDATA_REG 0xa4 /* Invalidation event data */ +#define DMAR_IEADDR_REG 0xa8 /* Invalidation event address */ +#define DMAR_IEUADDR_REG 0xac /* Invalidation event address */ +#define DMAR_PQH_REG 0xc0 /* Page request queue head */ +#define DMAR_PQH_REG_HI 0xc4 +#define DMAR_PQT_REG 0xc8 /* Page request queue tail*/ +#define DMAR_PQT_REG_HI 0xcc +#define DMAR_PQA_REG 0xd0 /* Page request queue address */ +#define DMAR_PQA_REG_HI 0xd4 +#define DMAR_PRS_REG 0xdc /* Page request status */ +#define DMAR_PECTL_REG 0xe0 /* Page request event control */ +#define DMAR_PEDATA_REG 0xe4 /* Page request event data */ +#define DMAR_PEADDR_REG 0xe8 /* Page request event address */ +#define DMAR_PEUADDR_REG 0xec /* Page event upper address */ +#define DMAR_MTRRCAP_REG 0x100 /* MTRR capability */ +#define DMAR_MTRRCAP_REG_HI 0x104 +#define DMAR_MTRRDEF_REG 0x108 /* MTRR default type */ +#define DMAR_MTRRDEF_REG_HI 0x10c + +/* IOTLB registers */ +#define DMAR_IOTLB_REG_OFFSET 0xf0 /* Offset to the IOTLB registers */ +#define DMAR_IVA_REG DMAR_IOTLB_REG_OFFSET /* Invalidate address */ +#define DMAR_IVA_REG_HI (DMAR_IVA_REG + 4) +/* IOTLB invalidate register */ +#define DMAR_IOTLB_REG (DMAR_IOTLB_REG_OFFSET + 0x8) +#define DMAR_IOTLB_REG_HI (DMAR_IOTLB_REG + 4) + +/* FRCD */ +#define DMAR_FRCD_REG_OFFSET 0x220 /* Offset to the fault recording regs */ +/* NOTICE: If you change the 
DMAR_FRCD_REG_NR, please remember to change the + * DMAR_REG_SIZE in include/hw/i386/intel_iommu.h. + * #define DMAR_REG_SIZE (DMAR_FRCD_REG_OFFSET + 16 * DMAR_FRCD_REG_NR) + */ +#define DMAR_FRCD_REG_NR 1ULL /* Num of fault recording regs */ + +#define DMAR_FRCD_REG_0_0 0x220 /* The 0th fault recording regs */ +#define DMAR_FRCD_REG_0_1 0x224 +#define DMAR_FRCD_REG_0_2 0x228 +#define DMAR_FRCD_REG_0_3 0x22c + +/* Interrupt Address Range */ +#define VTD_INTERRUPT_ADDR_FIRST 0xfee00000ULL +#define VTD_INTERRUPT_ADDR_LAST 0xfeefffffULL +#define VTD_INTERRUPT_ADDR_SIZE (VTD_INTERRUPT_ADDR_LAST - \ + VTD_INTERRUPT_ADDR_FIRST + 1) + +/* The shift of source_id in the key of IOTLB hash table */ +#define VTD_IOTLB_SID_SHIFT 36 +#define VTD_IOTLB_LVL_SHIFT 52 +#define VTD_IOTLB_MAX_SIZE 1024 /* Max size of the hash table */ + +/* IOTLB_REG */ +#define VTD_TLB_GLOBAL_FLUSH (1ULL << 60) /* Global invalidation */ +#define VTD_TLB_DSI_FLUSH (2ULL << 60) /* Domain-selective */ +#define VTD_TLB_PSI_FLUSH (3ULL << 60) /* Page-selective */ +#define VTD_TLB_FLUSH_GRANU_MASK (3ULL << 60) +#define VTD_TLB_GLOBAL_FLUSH_A (1ULL << 57) +#define VTD_TLB_DSI_FLUSH_A (2ULL << 57) +#define VTD_TLB_PSI_FLUSH_A (3ULL << 57) +#define VTD_TLB_FLUSH_GRANU_MASK_A (3ULL << 57) +#define VTD_TLB_IVT (1ULL << 63) +#define VTD_TLB_DID(val) (((val) >> 32) & VTD_DOMAIN_ID_MASK) + +/* IVA_REG */ +#define VTD_IVA_ADDR(val) ((val) & ~0xfffULL) +#define VTD_IVA_AM(val) ((val) & 0x3fULL) + +/* GCMD_REG */ +#define VTD_GCMD_TE (1UL << 31) +#define VTD_GCMD_SRTP (1UL << 30) +#define VTD_GCMD_SFL (1UL << 29) +#define VTD_GCMD_EAFL (1UL << 28) +#define VTD_GCMD_WBF (1UL << 27) +#define VTD_GCMD_QIE (1UL << 26) +#define VTD_GCMD_IRE (1UL << 25) +#define VTD_GCMD_SIRTP (1UL << 24) +#define VTD_GCMD_CFI (1UL << 23) + +/* GSTS_REG */ +#define VTD_GSTS_TES (1UL << 31) +#define VTD_GSTS_RTPS (1UL << 30) +#define VTD_GSTS_FLS (1UL << 29) +#define VTD_GSTS_AFLS (1UL << 28) +#define VTD_GSTS_WBFS (1UL << 27) +#define VTD_GSTS_QIES (1UL << 26) +#define VTD_GSTS_IRES (1UL << 25) +#define VTD_GSTS_IRTPS (1UL << 24) +#define VTD_GSTS_CFIS (1UL << 23) + +/* CCMD_REG */ +#define VTD_CCMD_ICC (1ULL << 63) +#define VTD_CCMD_GLOBAL_INVL (1ULL << 61) +#define VTD_CCMD_DOMAIN_INVL (2ULL << 61) +#define VTD_CCMD_DEVICE_INVL (3ULL << 61) +#define VTD_CCMD_CIRG_MASK (3ULL << 61) +#define VTD_CCMD_GLOBAL_INVL_A (1ULL << 59) +#define VTD_CCMD_DOMAIN_INVL_A (2ULL << 59) +#define VTD_CCMD_DEVICE_INVL_A (3ULL << 59) +#define VTD_CCMD_CAIG_MASK (3ULL << 59) +#define VTD_CCMD_DID(val) ((val) & VTD_DOMAIN_ID_MASK) +#define VTD_CCMD_SID(val) (((val) >> 16) & 0xffffULL) +#define VTD_CCMD_FM(val) (((val) >> 32) & 3ULL) + +/* RTADDR_REG */ +#define VTD_RTADDR_SMT (1ULL << 10) +#define VTD_RTADDR_ADDR_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL) + +/* IRTA_REG */ +#define VTD_IRTA_ADDR_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL) +#define VTD_IRTA_EIME (1ULL << 11) +#define VTD_IRTA_SIZE_MASK (0xfULL) + +/* ECAP_REG */ +/* (offset >> 4) << 8 */ +#define VTD_ECAP_IRO (DMAR_IOTLB_REG_OFFSET << 4) +#define VTD_ECAP_QI (1ULL << 1) +#define VTD_ECAP_DT (1ULL << 2) +/* Interrupt Remapping support */ +#define VTD_ECAP_IR (1ULL << 3) +#define VTD_ECAP_EIM (1ULL << 4) +#define VTD_ECAP_PT (1ULL << 6) +#define VTD_ECAP_MHMV (15ULL << 20) +#define VTD_ECAP_SRS (1ULL << 31) +#define VTD_ECAP_SMTS (1ULL << 43) +#define VTD_ECAP_SLTS (1ULL << 46) + +/* CAP_REG */ +/* (offset >> 4) << 24 */ +#define VTD_CAP_FRO (DMAR_FRCD_REG_OFFSET << 20) +#define VTD_CAP_NFR ((DMAR_FRCD_REG_NR - 1) << 40) +#define 
VTD_DOMAIN_ID_SHIFT 16 /* 16-bit domain id for 64K domains */ +#define VTD_DOMAIN_ID_MASK ((1UL << VTD_DOMAIN_ID_SHIFT) - 1) +#define VTD_CAP_ND (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL) +#define VTD_ADDRESS_SIZE(aw) (1ULL << (aw)) +#define VTD_CAP_MGAW(aw) ((((aw) - 1) & 0x3fULL) << 16) +#define VTD_MAMV 18ULL +#define VTD_CAP_MAMV (VTD_MAMV << 48) +#define VTD_CAP_PSI (1ULL << 39) +#define VTD_CAP_SLLPS ((1ULL << 34) | (1ULL << 35)) +#define VTD_CAP_DRAIN_WRITE (1ULL << 54) +#define VTD_CAP_DRAIN_READ (1ULL << 55) +#define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE) +#define VTD_CAP_CM (1ULL << 7) + +/* Supported Adjusted Guest Address Widths */ +#define VTD_CAP_SAGAW_SHIFT 8 +#define VTD_CAP_SAGAW_MASK (0x1fULL << VTD_CAP_SAGAW_SHIFT) + /* 39-bit AGAW, 3-level page-table */ +#define VTD_CAP_SAGAW_39bit (0x2ULL << VTD_CAP_SAGAW_SHIFT) + /* 48-bit AGAW, 4-level page-table */ +#define VTD_CAP_SAGAW_48bit (0x4ULL << VTD_CAP_SAGAW_SHIFT) + +/* IQT_REG */ +#define VTD_IQT_QT(dw_bit, val) (dw_bit ? (((val) >> 5) & 0x3fffULL) : \ + (((val) >> 4) & 0x7fffULL)) +#define VTD_IQT_QT_256_RSV_BIT 0x10 + +/* IQA_REG */ +#define VTD_IQA_IQA_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL) +#define VTD_IQA_QS 0x7ULL +#define VTD_IQA_DW_MASK 0x800 + +/* IQH_REG */ +#define VTD_IQH_QH_SHIFT_4 4 +#define VTD_IQH_QH_SHIFT_5 5 +#define VTD_IQH_QH_MASK 0x7fff0ULL + +/* ICS_REG */ +#define VTD_ICS_IWC 1UL + +/* IECTL_REG */ +#define VTD_IECTL_IM (1UL << 31) +#define VTD_IECTL_IP (1UL << 30) + +/* FSTS_REG */ +#define VTD_FSTS_FRI_MASK 0xff00UL +#define VTD_FSTS_FRI(val) ((((uint32_t)(val)) << 8) & VTD_FSTS_FRI_MASK) +#define VTD_FSTS_IQE (1UL << 4) +#define VTD_FSTS_PPF (1UL << 1) +#define VTD_FSTS_PFO 1UL + +/* FECTL_REG */ +#define VTD_FECTL_IM (1UL << 31) +#define VTD_FECTL_IP (1UL << 30) + +/* Fault Recording Register */ +/* For the high 64-bit of 128-bit */ +#define VTD_FRCD_F (1ULL << 63) +#define VTD_FRCD_T (1ULL << 62) +#define VTD_FRCD_FR(val) (((val) & 0xffULL) << 32) +#define VTD_FRCD_SID_MASK 0xffffULL +#define VTD_FRCD_SID(val) ((val) & VTD_FRCD_SID_MASK) +/* For the low 64-bit of 128-bit */ +#define VTD_FRCD_FI(val) ((val) & ~0xfffULL) + +/* DMA Remapping Fault Conditions */ +typedef enum VTDFaultReason { + VTD_FR_RESERVED = 0, /* Reserved for Advanced Fault logging */ + VTD_FR_ROOT_ENTRY_P = 1, /* The Present(P) field of root-entry is 0 */ + VTD_FR_CONTEXT_ENTRY_P, /* The Present(P) field of context-entry is 0 */ + VTD_FR_CONTEXT_ENTRY_INV, /* Invalid programming of a context-entry */ + VTD_FR_ADDR_BEYOND_MGAW, /* Input-address above (2^x-1) */ + VTD_FR_WRITE, /* No write permission */ + VTD_FR_READ, /* No read permission */ + /* Fail to access a second-level paging entry (not SL_PML4E) */ + VTD_FR_PAGING_ENTRY_INV, + VTD_FR_ROOT_TABLE_INV, /* Fail to access a root-entry */ + VTD_FR_CONTEXT_TABLE_INV, /* Fail to access a context-entry */ + /* Non-zero reserved field in a present root-entry */ + VTD_FR_ROOT_ENTRY_RSVD, + /* Non-zero reserved field in a present context-entry */ + VTD_FR_CONTEXT_ENTRY_RSVD, + /* Non-zero reserved field in a second-level paging entry with at lease one + * Read(R) and Write(W) or Execute(E) field is Set. + */ + VTD_FR_PAGING_ENTRY_RSVD, + /* Translation request or translated request explicitly blocked dut to the + * programming of the Translation Type (T) field in the present + * context-entry. 
+ */ + VTD_FR_CONTEXT_ENTRY_TT, + + /* Interrupt remapping transition faults */ + VTD_FR_IR_REQ_RSVD = 0x20, /* One or more IR request reserved + * fields set */ + VTD_FR_IR_INDEX_OVER = 0x21, /* Index value greater than max */ + VTD_FR_IR_ENTRY_P = 0x22, /* Present (P) not set in IRTE */ + VTD_FR_IR_ROOT_INVAL = 0x23, /* IR Root table invalid */ + VTD_FR_IR_IRTE_RSVD = 0x24, /* IRTE Rsvd field non-zero with + * Present flag set */ + VTD_FR_IR_REQ_COMPAT = 0x25, /* Encountered compatible IR + * request while disabled */ + VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */ + + VTD_FR_PASID_TABLE_INV = 0x58, /*Invalid PASID table entry */ + + /* This is not a normal fault reason. We use this to indicate some faults + * that are not referenced by the VT-d specification. + * Fault event with such reason should not be recorded. + */ + VTD_FR_RESERVED_ERR, + VTD_FR_MAX, /* Guard */ +} VTDFaultReason; + +#define VTD_CONTEXT_CACHE_GEN_MAX 0xffffffffUL + +/* Interrupt Entry Cache Invalidation Descriptor: VT-d 6.5.2.7. */ +struct VTDInvDescIEC { + uint32_t type:4; /* Should always be 0x4 */ + uint32_t granularity:1; /* If set, it's global IR invalidation */ + uint32_t resved_1:22; + uint32_t index_mask:5; /* 2^N for continuous int invalidation */ + uint32_t index:16; /* Start index to invalidate */ + uint32_t reserved_2:16; +}; +typedef struct VTDInvDescIEC VTDInvDescIEC; + +/* Queued Invalidation Descriptor */ +union VTDInvDesc { + struct { + uint64_t lo; + uint64_t hi; + }; + struct { + uint64_t val[4]; + }; + union { + VTDInvDescIEC iec; + }; +}; +typedef union VTDInvDesc VTDInvDesc; + +/* Masks for struct VTDInvDesc */ +#define VTD_INV_DESC_TYPE 0xf +#define VTD_INV_DESC_CC 0x1 /* Context-cache Invalidate Desc */ +#define VTD_INV_DESC_IOTLB 0x2 +#define VTD_INV_DESC_DEVICE 0x3 +#define VTD_INV_DESC_IEC 0x4 /* Interrupt Entry Cache + Invalidate Descriptor */ +#define VTD_INV_DESC_WAIT 0x5 /* Invalidation Wait Descriptor */ +#define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */ +#define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */ +#define VTD_INV_DESC_NONE 0 /* Not an Invalidate Descriptor */ + +/* Masks for Invalidation Wait Descriptor*/ +#define VTD_INV_DESC_WAIT_SW (1ULL << 5) +#define VTD_INV_DESC_WAIT_IF (1ULL << 4) +#define VTD_INV_DESC_WAIT_FN (1ULL << 6) +#define VTD_INV_DESC_WAIT_DATA_SHIFT 32 +#define VTD_INV_DESC_WAIT_RSVD_LO 0Xffffff80ULL +#define VTD_INV_DESC_WAIT_RSVD_HI 3ULL + +/* Masks for Context-cache Invalidation Descriptor */ +#define VTD_INV_DESC_CC_G (3ULL << 4) +#define VTD_INV_DESC_CC_GLOBAL (1ULL << 4) +#define VTD_INV_DESC_CC_DOMAIN (2ULL << 4) +#define VTD_INV_DESC_CC_DEVICE (3ULL << 4) +#define VTD_INV_DESC_CC_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK) +#define VTD_INV_DESC_CC_SID(val) (((val) >> 32) & 0xffffUL) +#define VTD_INV_DESC_CC_FM(val) (((val) >> 48) & 3UL) +#define VTD_INV_DESC_CC_RSVD 0xfffc00000000ffc0ULL + +/* Masks for IOTLB Invalidate Descriptor */ +#define VTD_INV_DESC_IOTLB_G (3ULL << 4) +#define VTD_INV_DESC_IOTLB_GLOBAL (1ULL << 4) +#define VTD_INV_DESC_IOTLB_DOMAIN (2ULL << 4) +#define VTD_INV_DESC_IOTLB_PAGE (3ULL << 4) +#define VTD_INV_DESC_IOTLB_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK) +#define VTD_INV_DESC_IOTLB_ADDR(val) ((val) & ~0xfffULL) +#define VTD_INV_DESC_IOTLB_AM(val) ((val) & 0x3fULL) +#define VTD_INV_DESC_IOTLB_RSVD_LO 0xffffffff0000ff00ULL +#define VTD_INV_DESC_IOTLB_RSVD_HI 0xf80ULL + +/* Mask for Device IOTLB Invalidate Descriptor */ +#define VTD_INV_DESC_DEVICE_IOTLB_ADDR(val) ((val) & 
0xfffffffffffff000ULL) +#define VTD_INV_DESC_DEVICE_IOTLB_SIZE(val) ((val) & 0x1) +#define VTD_INV_DESC_DEVICE_IOTLB_SID(val) (((val) >> 32) & 0xFFFFULL) +#define VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI 0xffeULL +#define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffff0000ffe0fff8 + +/* Rsvd field masks for spte */ +#define VTD_SPTE_SNP 0x800ULL + +#define VTD_SPTE_PAGE_L1_RSVD_MASK(aw, dt_supported) \ + dt_supported ? \ + (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \ + (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_PAGE_L2_RSVD_MASK(aw) \ + (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_PAGE_L3_RSVD_MASK(aw) \ + (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_PAGE_L4_RSVD_MASK(aw) \ + (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) + +#define VTD_SPTE_LPAGE_L2_RSVD_MASK(aw, dt_supported) \ + dt_supported ? \ + (0x1ff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \ + (0x1ff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_LPAGE_L3_RSVD_MASK(aw, dt_supported) \ + dt_supported ? \ + (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \ + (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) + +/* Information about page-selective IOTLB invalidate */ +struct VTDIOTLBPageInvInfo { + uint16_t domain_id; + uint64_t addr; + uint8_t mask; +}; +typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo; + +/* Pagesize of VTD paging structures, including root and context tables */ +#define VTD_PAGE_SHIFT 12 +#define VTD_PAGE_SIZE (1ULL << VTD_PAGE_SHIFT) + +#define VTD_PAGE_SHIFT_4K 12 +#define VTD_PAGE_MASK_4K (~((1ULL << VTD_PAGE_SHIFT_4K) - 1)) +#define VTD_PAGE_SHIFT_2M 21 +#define VTD_PAGE_MASK_2M (~((1ULL << VTD_PAGE_SHIFT_2M) - 1)) +#define VTD_PAGE_SHIFT_1G 30 +#define VTD_PAGE_MASK_1G (~((1ULL << VTD_PAGE_SHIFT_1G) - 1)) + +struct VTDRootEntry { + uint64_t lo; + uint64_t hi; +}; +typedef struct VTDRootEntry VTDRootEntry; + +/* Masks for struct VTDRootEntry */ +#define VTD_ROOT_ENTRY_P 1ULL +#define VTD_ROOT_ENTRY_CTP (~0xfffULL) + +#define VTD_ROOT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(VTDRootEntry)) +#define VTD_ROOT_ENTRY_RSVD(aw) (0xffeULL | ~VTD_HAW_MASK(aw)) + +#define VTD_DEVFN_CHECK_MASK 0x80 + +/* Masks for struct VTDContextEntry */ +/* lo */ +#define VTD_CONTEXT_ENTRY_P (1ULL << 0) +#define VTD_CONTEXT_ENTRY_FPD (1ULL << 1) /* Fault Processing Disable */ +#define VTD_CONTEXT_ENTRY_TT (3ULL << 2) /* Translation Type */ +#define VTD_CONTEXT_TT_MULTI_LEVEL 0 +#define VTD_CONTEXT_TT_DEV_IOTLB (1ULL << 2) +#define VTD_CONTEXT_TT_PASS_THROUGH (2ULL << 2) +/* Second Level Page Translation Pointer*/ +#define VTD_CONTEXT_ENTRY_SLPTPTR (~0xfffULL) +#define VTD_CONTEXT_ENTRY_RSVD_LO(aw) (0xff0ULL | ~VTD_HAW_MASK(aw)) +/* hi */ +#define VTD_CONTEXT_ENTRY_AW 7ULL /* Adjusted guest-address-width */ +#define VTD_CONTEXT_ENTRY_DID(val) (((val) >> 8) & VTD_DOMAIN_ID_MASK) +#define VTD_CONTEXT_ENTRY_RSVD_HI 0xffffffffff000080ULL + +#define VTD_CONTEXT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(VTDContextEntry)) + +#define VTD_CTX_ENTRY_LEGACY_SIZE 16 +#define VTD_CTX_ENTRY_SCALABLE_SIZE 32 + +#define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff +#define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw) (0x1e0ULL | ~VTD_HAW_MASK(aw)) +#define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 0xffffffffffe00000ULL + +/* PASID Table Related Definitions */ +#define VTD_PASID_DIR_BASE_ADDR_MASK (~0xfffULL) +#define VTD_PASID_TABLE_BASE_ADDR_MASK (~0xfffULL) +#define VTD_PASID_DIR_ENTRY_SIZE 8 +#define VTD_PASID_ENTRY_SIZE 64 +#define 
VTD_PASID_DIR_BITS_MASK (0x3fffULL) +#define VTD_PASID_DIR_INDEX(pasid) (((pasid) >> 6) & VTD_PASID_DIR_BITS_MASK) +#define VTD_PASID_DIR_FPD (1ULL << 1) /* Fault Processing Disable */ +#define VTD_PASID_TABLE_BITS_MASK (0x3fULL) +#define VTD_PASID_TABLE_INDEX(pasid) ((pasid) & VTD_PASID_TABLE_BITS_MASK) +#define VTD_PASID_ENTRY_FPD (1ULL << 1) /* Fault Processing Disable */ + +/* PASID Granular Translation Type Mask */ +#define VTD_PASID_ENTRY_P 1ULL +#define VTD_SM_PASID_ENTRY_PGTT (7ULL << 6) +#define VTD_SM_PASID_ENTRY_FLT (1ULL << 6) +#define VTD_SM_PASID_ENTRY_SLT (2ULL << 6) +#define VTD_SM_PASID_ENTRY_NESTED (3ULL << 6) +#define VTD_SM_PASID_ENTRY_PT (4ULL << 6) + +#define VTD_SM_PASID_ENTRY_AW 7ULL /* Adjusted guest-address-width */ +#define VTD_SM_PASID_ENTRY_DID(val) ((val) & VTD_DOMAIN_ID_MASK) + +/* Second Level Page Translation Pointer*/ +#define VTD_SM_PASID_ENTRY_SLPTPTR (~0xfffULL) + +/* Paging Structure common */ +#define VTD_SL_PT_PAGE_SIZE_MASK (1ULL << 7) +/* Bits to decide the offset for each level */ +#define VTD_SL_LEVEL_BITS 9 + +/* Second Level Paging Structure */ +#define VTD_SL_PML4_LEVEL 4 +#define VTD_SL_PDP_LEVEL 3 +#define VTD_SL_PD_LEVEL 2 +#define VTD_SL_PT_LEVEL 1 +#define VTD_SL_PT_ENTRY_NR 512 + +/* Masks for Second Level Paging Entry */ +#define VTD_SL_RW_MASK 3ULL +#define VTD_SL_R 1ULL +#define VTD_SL_W (1ULL << 1) +#define VTD_SL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw)) +#define VTD_SL_IGN_COM 0xbff0000000000000ULL +#define VTD_SL_TM (1ULL << 62) + +#endif diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c new file mode 100644 index 000000000..1e89ca089 --- /dev/null +++ b/hw/i386/kvm/apic.c @@ -0,0 +1,271 @@ +/* + * KVM in-kernel APIC support + * + * Copyright (c) 2011 Siemens AG + * + * Authors: + * Jan Kiszka <jan.kiszka@siemens.com> + * + * This work is licensed under the terms of the GNU GPL version 2. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/module.h" +#include "hw/i386/apic_internal.h" +#include "hw/pci/msi.h" +#include "sysemu/hw_accel.h" +#include "sysemu/kvm.h" +#include "kvm/kvm_i386.h" + +static inline void kvm_apic_set_reg(struct kvm_lapic_state *kapic, + int reg_id, uint32_t val) +{ + *((uint32_t *)(kapic->regs + (reg_id << 4))) = val; +} + +static inline uint32_t kvm_apic_get_reg(struct kvm_lapic_state *kapic, + int reg_id) +{ + return *((uint32_t *)(kapic->regs + (reg_id << 4))); +} + +static void kvm_put_apic_state(APICCommonState *s, struct kvm_lapic_state *kapic) +{ + int i; + + memset(kapic, 0, sizeof(*kapic)); + if (kvm_has_x2apic_api() && s->apicbase & MSR_IA32_APICBASE_EXTD) { + kvm_apic_set_reg(kapic, 0x2, s->initial_apic_id); + } else { + kvm_apic_set_reg(kapic, 0x2, s->id << 24); + } + kvm_apic_set_reg(kapic, 0x8, s->tpr); + kvm_apic_set_reg(kapic, 0xd, s->log_dest << 24); + kvm_apic_set_reg(kapic, 0xe, s->dest_mode << 28 | 0x0fffffff); + kvm_apic_set_reg(kapic, 0xf, s->spurious_vec); + for (i = 0; i < 8; i++) { + kvm_apic_set_reg(kapic, 0x10 + i, s->isr[i]); + kvm_apic_set_reg(kapic, 0x18 + i, s->tmr[i]); + kvm_apic_set_reg(kapic, 0x20 + i, s->irr[i]); + } + kvm_apic_set_reg(kapic, 0x28, s->esr); + kvm_apic_set_reg(kapic, 0x30, s->icr[0]); + kvm_apic_set_reg(kapic, 0x31, s->icr[1]); + for (i = 0; i < APIC_LVT_NB; i++) { + kvm_apic_set_reg(kapic, 0x32 + i, s->lvt[i]); + } + kvm_apic_set_reg(kapic, 0x38, s->initial_count); + kvm_apic_set_reg(kapic, 0x3e, s->divide_conf); +} + +void kvm_get_apic_state(DeviceState *dev, struct kvm_lapic_state *kapic) +{ + APICCommonState *s = APIC_COMMON(dev); + int i, v; + + if (kvm_has_x2apic_api() && s->apicbase & MSR_IA32_APICBASE_EXTD) { + assert(kvm_apic_get_reg(kapic, 0x2) == s->initial_apic_id); + } else { + s->id = kvm_apic_get_reg(kapic, 0x2) >> 24; + } + s->tpr = kvm_apic_get_reg(kapic, 0x8); + s->arb_id = kvm_apic_get_reg(kapic, 0x9); + s->log_dest = kvm_apic_get_reg(kapic, 0xd) >> 24; + s->dest_mode = kvm_apic_get_reg(kapic, 0xe) >> 28; + s->spurious_vec = kvm_apic_get_reg(kapic, 0xf); + for (i = 0; i < 8; i++) { + s->isr[i] = kvm_apic_get_reg(kapic, 0x10 + i); + s->tmr[i] = kvm_apic_get_reg(kapic, 0x18 + i); + s->irr[i] = kvm_apic_get_reg(kapic, 0x20 + i); + } + s->esr = kvm_apic_get_reg(kapic, 0x28); + s->icr[0] = kvm_apic_get_reg(kapic, 0x30); + s->icr[1] = kvm_apic_get_reg(kapic, 0x31); + for (i = 0; i < APIC_LVT_NB; i++) { + s->lvt[i] = kvm_apic_get_reg(kapic, 0x32 + i); + } + s->initial_count = kvm_apic_get_reg(kapic, 0x38); + s->divide_conf = kvm_apic_get_reg(kapic, 0x3e); + + v = (s->divide_conf & 3) | ((s->divide_conf >> 1) & 4); + s->count_shift = (v + 1) & 7; + + s->initial_count_load_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + apic_next_timer(s, s->initial_count_load_time); +} + +static void kvm_apic_set_base(APICCommonState *s, uint64_t val) +{ + s->apicbase = val; +} + +static void kvm_apic_set_tpr(APICCommonState *s, uint8_t val) +{ + s->tpr = (val & 0x0f) << 4; +} + +static uint8_t kvm_apic_get_tpr(APICCommonState *s) +{ + return s->tpr >> 4; +} + +static void kvm_apic_enable_tpr_reporting(APICCommonState *s, bool enable) +{ + struct kvm_tpr_access_ctl ctl = { + .enabled = enable + }; + + kvm_vcpu_ioctl(CPU(s->cpu), KVM_TPR_ACCESS_REPORTING, &ctl); +} + +static void kvm_apic_vapic_base_update(APICCommonState *s) +{ + struct kvm_vapic_addr vapid_addr = { + .vapic_addr = s->vapic_paddr, + }; + int ret; + + ret = kvm_vcpu_ioctl(CPU(s->cpu), KVM_SET_VAPIC_ADDR, &vapid_addr); + if (ret < 0) { + 
fprintf(stderr, "KVM: setting VAPIC address failed (%s)\n", + strerror(-ret)); + abort(); + } +} + +static void kvm_apic_put(CPUState *cs, run_on_cpu_data data) +{ + APICCommonState *s = data.host_ptr; + struct kvm_lapic_state kapic; + int ret; + + kvm_put_apicbase(s->cpu, s->apicbase); + kvm_put_apic_state(s, &kapic); + + ret = kvm_vcpu_ioctl(CPU(s->cpu), KVM_SET_LAPIC, &kapic); + if (ret < 0) { + fprintf(stderr, "KVM_SET_LAPIC failed: %s\n", strerror(-ret)); + abort(); + } +} + +static void kvm_apic_post_load(APICCommonState *s) +{ + run_on_cpu(CPU(s->cpu), kvm_apic_put, RUN_ON_CPU_HOST_PTR(s)); +} + +static void do_inject_external_nmi(CPUState *cpu, run_on_cpu_data data) +{ + APICCommonState *s = data.host_ptr; + uint32_t lvt; + int ret; + + cpu_synchronize_state(cpu); + + lvt = s->lvt[APIC_LVT_LINT1]; + if (!(lvt & APIC_LVT_MASKED) && ((lvt >> 8) & 7) == APIC_DM_NMI) { + ret = kvm_vcpu_ioctl(cpu, KVM_NMI); + if (ret < 0) { + fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n", + strerror(-ret)); + } + } +} + +static void kvm_apic_external_nmi(APICCommonState *s) +{ + run_on_cpu(CPU(s->cpu), do_inject_external_nmi, RUN_ON_CPU_HOST_PTR(s)); +} + +static void kvm_send_msi(MSIMessage *msg) +{ + int ret; + + /* + * The message has already passed through interrupt remapping if enabled, + * but the legacy extended destination ID in low bits still needs to be + * handled. + */ + msg->address = kvm_swizzle_msi_ext_dest_id(msg->address); + + ret = kvm_irqchip_send_msi(kvm_state, *msg); + if (ret < 0) { + fprintf(stderr, "KVM: injection failed, MSI lost (%s)\n", + strerror(-ret)); + } +} + +static uint64_t kvm_apic_mem_read(void *opaque, hwaddr addr, + unsigned size) +{ + return ~(uint64_t)0; +} + +static void kvm_apic_mem_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + MSIMessage msg = { .address = addr, .data = data }; + + kvm_send_msi(&msg); +} + +static const MemoryRegionOps kvm_apic_io_ops = { + .read = kvm_apic_mem_read, + .write = kvm_apic_mem_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void kvm_apic_reset(APICCommonState *s) +{ + /* Not used by KVM, which uses the CPU mp_state instead. 
*/ + s->wait_for_sipi = 0; + + run_on_cpu(CPU(s->cpu), kvm_apic_put, RUN_ON_CPU_HOST_PTR(s)); +} + +static void kvm_apic_realize(DeviceState *dev, Error **errp) +{ + APICCommonState *s = APIC_COMMON(dev); + + memory_region_init_io(&s->io_memory, OBJECT(s), &kvm_apic_io_ops, s, + "kvm-apic-msi", APIC_SPACE_SIZE); + + assert(kvm_has_gsi_routing()); + msi_nonbroken = true; +} + +static void kvm_apic_unrealize(DeviceState *dev) +{ +} + +static void kvm_apic_class_init(ObjectClass *klass, void *data) +{ + APICCommonClass *k = APIC_COMMON_CLASS(klass); + + k->realize = kvm_apic_realize; + k->unrealize = kvm_apic_unrealize; + k->reset = kvm_apic_reset; + k->set_base = kvm_apic_set_base; + k->set_tpr = kvm_apic_set_tpr; + k->get_tpr = kvm_apic_get_tpr; + k->post_load = kvm_apic_post_load; + k->enable_tpr_reporting = kvm_apic_enable_tpr_reporting; + k->vapic_base_update = kvm_apic_vapic_base_update; + k->external_nmi = kvm_apic_external_nmi; + k->send_msi = kvm_send_msi; +} + +static const TypeInfo kvm_apic_info = { + .name = "kvm-apic", + .parent = TYPE_APIC_COMMON, + .instance_size = sizeof(APICCommonState), + .class_init = kvm_apic_class_init, +}; + +static void kvm_apic_register_types(void) +{ + type_register_static(&kvm_apic_info); +} + +type_init(kvm_apic_register_types) diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c new file mode 100644 index 000000000..df70b4a03 --- /dev/null +++ b/hw/i386/kvm/clock.c @@ -0,0 +1,350 @@ +/* + * QEMU KVM support, paravirtual clock device + * + * Copyright (C) 2011 Siemens AG + * + * Authors: + * Jan Kiszka <jan.kiszka@siemens.com> + * + * This work is licensed under the terms of the GNU GPL version 2. + * See the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/host-utils.h" +#include "qemu/module.h" +#include "sysemu/kvm.h" +#include "sysemu/runstate.h" +#include "sysemu/hw_accel.h" +#include "kvm/kvm_i386.h" +#include "migration/vmstate.h" +#include "hw/sysbus.h" +#include "hw/kvm/clock.h" +#include "hw/qdev-properties.h" +#include "qapi/error.h" + +#include <linux/kvm.h> +#include "standard-headers/asm-x86/kvm_para.h" +#include "qom/object.h" + +#define TYPE_KVM_CLOCK "kvmclock" +OBJECT_DECLARE_SIMPLE_TYPE(KVMClockState, KVM_CLOCK) + +struct KVMClockState { + /*< private >*/ + SysBusDevice busdev; + /*< public >*/ + + uint64_t clock; + bool clock_valid; + + /* whether the 'clock' value was obtained in the 'paused' state */ + bool runstate_paused; + + /* whether machine type supports reliable KVM_GET_CLOCK */ + bool mach_use_reliable_get_clock; + + /* whether the 'clock' value was obtained in a host with + * reliable KVM_GET_CLOCK */ + bool clock_is_reliable; +}; + +struct pvclock_vcpu_time_info { + uint32_t version; + uint32_t pad0; + uint64_t tsc_timestamp; + uint64_t system_time; + uint32_t tsc_to_system_mul; + int8_t tsc_shift; + uint8_t flags; + uint8_t pad[2]; +} __attribute__((__packed__)); /* 32 bytes */ + +static uint64_t kvmclock_current_nsec(KVMClockState *s) +{ + CPUState *cpu = first_cpu; + CPUX86State *env = cpu->env_ptr; + hwaddr kvmclock_struct_pa; + uint64_t migration_tsc = env->tsc; + struct pvclock_vcpu_time_info time; + uint64_t delta; + uint64_t nsec_lo; + uint64_t nsec_hi; + uint64_t nsec; + + cpu_synchronize_state(cpu); + + if (!(env->system_time_msr & 1ULL)) { + /* KVM clock not active */ + return 0; + } + + kvmclock_struct_pa = env->system_time_msr & ~1ULL; + cpu_physical_memory_read(kvmclock_struct_pa, &time, sizeof(time)); + + assert(time.tsc_timestamp <= migration_tsc); + delta = migration_tsc - time.tsc_timestamp; + if (time.tsc_shift < 0) { + delta >>= -time.tsc_shift; + } else { + delta <<= time.tsc_shift; + } + + mulu64(&nsec_lo, &nsec_hi, delta, time.tsc_to_system_mul); + nsec = (nsec_lo >> 32) | (nsec_hi << 32); + return nsec + time.system_time; +} + +static void kvm_update_clock(KVMClockState *s) +{ + struct kvm_clock_data data; + int ret; + + ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data); + if (ret < 0) { + fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(-ret)); + abort(); + } + s->clock = data.clock; + + /* If kvm_has_adjust_clock_stable() is false, KVM_GET_CLOCK returns + * essentially CLOCK_MONOTONIC plus a guest-specific adjustment. This + * can drift from the TSC-based value that is computed by the guest, + * so we need to go through kvmclock_current_nsec(). If + * kvm_has_adjust_clock_stable() is true, and the flags contain + * KVM_CLOCK_TSC_STABLE, then KVM_GET_CLOCK returns a TSC-based value + * and kvmclock_current_nsec() is not necessary. + * + * Here, however, we need not check KVM_CLOCK_TSC_STABLE. This is because: + * + * - if the host has disabled the kvmclock master clock, the guest already + * has protection against time going backwards. This "safety net" is only + * absent when kvmclock is stable; + * + * - therefore, we can replace a check like + * + * if last KVM_GET_CLOCK was not reliable then + * read from memory + * + * with + * + * if last KVM_GET_CLOCK was not reliable && masterclock is enabled + * read from memory + * + * However: + * + * - if kvm_has_adjust_clock_stable() returns false, the left side is + * always true (KVM_GET_CLOCK is never reliable), and the right side is + * unknown (because we don't have data.flags). 
We must assume it's true + * and read from memory. + * + * - if kvm_has_adjust_clock_stable() returns true, the result of the && + * is always false (masterclock is enabled iff KVM_GET_CLOCK is reliable) + * + * So we can just use this instead: + * + * if !kvm_has_adjust_clock_stable() then + * read from memory + */ + s->clock_is_reliable = kvm_has_adjust_clock_stable(); +} + +static void do_kvmclock_ctrl(CPUState *cpu, run_on_cpu_data data) +{ + int ret = kvm_vcpu_ioctl(cpu, KVM_KVMCLOCK_CTRL, 0); + + if (ret && ret != -EINVAL) { + fprintf(stderr, "%s: %s\n", __func__, strerror(-ret)); + } +} + +static void kvmclock_vm_state_change(void *opaque, bool running, + RunState state) +{ + KVMClockState *s = opaque; + CPUState *cpu; + int cap_clock_ctrl = kvm_check_extension(kvm_state, KVM_CAP_KVMCLOCK_CTRL); + int ret; + + if (running) { + struct kvm_clock_data data = {}; + + /* + * If the host where s->clock was read did not support reliable + * KVM_GET_CLOCK, read kvmclock value from memory. + */ + if (!s->clock_is_reliable) { + uint64_t pvclock_via_mem = kvmclock_current_nsec(s); + /* We can't rely on the saved clock value, just discard it */ + if (pvclock_via_mem) { + s->clock = pvclock_via_mem; + } + } + + s->clock_valid = false; + + data.clock = s->clock; + ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data); + if (ret < 0) { + fprintf(stderr, "KVM_SET_CLOCK failed: %s\n", strerror(-ret)); + abort(); + } + + if (!cap_clock_ctrl) { + return; + } + CPU_FOREACH(cpu) { + run_on_cpu(cpu, do_kvmclock_ctrl, RUN_ON_CPU_NULL); + } + } else { + + if (s->clock_valid) { + return; + } + + s->runstate_paused = runstate_check(RUN_STATE_PAUSED); + + kvm_synchronize_all_tsc(); + + kvm_update_clock(s); + /* + * If the VM is stopped, declare the clock state valid to + * avoid re-reading it on next vmsave (which would return + * a different value). Will be reset when the VM is continued. + */ + s->clock_valid = true; + } +} + +static void kvmclock_realize(DeviceState *dev, Error **errp) +{ + KVMClockState *s = KVM_CLOCK(dev); + + if (!kvm_enabled()) { + error_setg(errp, "kvmclock device requires KVM"); + return; + } + + kvm_update_clock(s); + + qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s); +} + +static bool kvmclock_clock_is_reliable_needed(void *opaque) +{ + KVMClockState *s = opaque; + + return s->mach_use_reliable_get_clock; +} + +static const VMStateDescription kvmclock_reliable_get_clock = { + .name = "kvmclock/clock_is_reliable", + .version_id = 1, + .minimum_version_id = 1, + .needed = kvmclock_clock_is_reliable_needed, + .fields = (VMStateField[]) { + VMSTATE_BOOL(clock_is_reliable, KVMClockState), + VMSTATE_END_OF_LIST() + } +}; + +/* + * When migrating, assume the source has an unreliable + * KVM_GET_CLOCK unless told otherwise. + */ +static int kvmclock_pre_load(void *opaque) +{ + KVMClockState *s = opaque; + + s->clock_is_reliable = false; + + return 0; +} + +/* + * When migrating a running guest, read the clock just + * before migration, so that the guest clock counts + * during the events between: + * + * * vm_stop() + * * + * * pre_save() + * + * This reduces kvmclock difference on migration from 5s + * to 0.1s (when max_downtime == 5s), because sending the + * final pages of memory (which happens between vm_stop() + * and pre_save()) takes max_downtime. 
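+ *
+ * If the guest was explicitly paused, the clock value captured at pause
+ * time is kept (see runstate_paused below) instead of being re-read here.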
+ */ +static int kvmclock_pre_save(void *opaque) +{ + KVMClockState *s = opaque; + + if (!s->runstate_paused) { + kvm_update_clock(s); + } + + return 0; +} + +static const VMStateDescription kvmclock_vmsd = { + .name = "kvmclock", + .version_id = 1, + .minimum_version_id = 1, + .pre_load = kvmclock_pre_load, + .pre_save = kvmclock_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT64(clock, KVMClockState), + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * []) { + &kvmclock_reliable_get_clock, + NULL + } +}; + +static Property kvmclock_properties[] = { + DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState, + mach_use_reliable_get_clock, true), + DEFINE_PROP_END_OF_LIST(), +}; + +static void kvmclock_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = kvmclock_realize; + dc->vmsd = &kvmclock_vmsd; + device_class_set_props(dc, kvmclock_properties); +} + +static const TypeInfo kvmclock_info = { + .name = TYPE_KVM_CLOCK, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(KVMClockState), + .class_init = kvmclock_class_init, +}; + +/* Note: Must be called after VCPU initialization. */ +void kvmclock_create(bool create_always) +{ + X86CPU *cpu = X86_CPU(first_cpu); + + if (!kvm_enabled() || !kvm_has_adjust_clock()) + return; + + if (create_always || + cpu->env.features[FEAT_KVM] & ((1ULL << KVM_FEATURE_CLOCKSOURCE) | + (1ULL << KVM_FEATURE_CLOCKSOURCE2))) { + sysbus_create_simple(TYPE_KVM_CLOCK, -1, NULL); + } +} + +static void kvmclock_register_types(void) +{ + type_register_static(&kvmclock_info); +} + +type_init(kvmclock_register_types) diff --git a/hw/i386/kvm/i8254.c b/hw/i386/kvm/i8254.c new file mode 100644 index 000000000..191a26fa5 --- /dev/null +++ b/hw/i386/kvm/i8254.c @@ -0,0 +1,337 @@ +/* + * KVM in-kernel PIT (i8254) support + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2012 Jan Kiszka, Siemens AG + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include <linux/kvm.h> +#include "qapi/qapi-types-machine.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "qemu/timer.h" +#include "sysemu/runstate.h" +#include "hw/timer/i8254.h" +#include "hw/timer/i8254_internal.h" +#include "hw/qdev-properties-system.h" +#include "sysemu/kvm.h" +#include "qom/object.h" + +#define KVM_PIT_REINJECT_BIT 0 + +#define CALIBRATION_ROUNDS 3 + +typedef struct KVMPITClass KVMPITClass; +typedef struct KVMPITState KVMPITState; +DECLARE_OBJ_CHECKERS(KVMPITState, KVMPITClass, + KVM_PIT, TYPE_KVM_I8254) + +struct KVMPITState { + PITCommonState parent_obj; + + LostTickPolicy lost_tick_policy; + bool vm_stopped; + int64_t kernel_clock_offset; +}; + +struct KVMPITClass { + PITCommonClass parent_class; + + DeviceRealize parent_realize; +}; + +static void kvm_pit_update_clock_offset(KVMPITState *s) +{ + int64_t offset, clock_offset; + struct timespec ts; + int i; + + /* + * Measure the delta between CLOCK_MONOTONIC, the base used for + * kvm_pit_channel_state::count_load_time, and QEMU_CLOCK_VIRTUAL. Take the + * minimum of several samples to filter out scheduling noise. + */ + clock_offset = INT64_MAX; + for (i = 0; i < CALIBRATION_ROUNDS; i++) { + offset = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + clock_gettime(CLOCK_MONOTONIC, &ts); + offset -= ts.tv_nsec; + offset -= (int64_t)ts.tv_sec * 1000000000; + if (uabs64(offset) < uabs64(clock_offset)) { + clock_offset = offset; + } + } + s->kernel_clock_offset = clock_offset; +} + +static void kvm_pit_get(PITCommonState *pit) +{ + KVMPITState *s = KVM_PIT(pit); + struct kvm_pit_state2 kpit; + struct kvm_pit_channel_state *kchan; + struct PITChannelState *sc; + int i, ret; + + /* No need to re-read the state if VM is stopped. */ + if (s->vm_stopped) { + return; + } + + if (kvm_has_pit_state2()) { + ret = kvm_vm_ioctl(kvm_state, KVM_GET_PIT2, &kpit); + if (ret < 0) { + fprintf(stderr, "KVM_GET_PIT2 failed: %s\n", strerror(-ret)); + abort(); + } + pit->channels[0].irq_disabled = kpit.flags & KVM_PIT_FLAGS_HPET_LEGACY; + } else { + /* + * kvm_pit_state2 is superset of kvm_pit_state struct, + * so we can use it for KVM_GET_PIT as well. + */ + ret = kvm_vm_ioctl(kvm_state, KVM_GET_PIT, &kpit); + if (ret < 0) { + fprintf(stderr, "KVM_GET_PIT failed: %s\n", strerror(-ret)); + abort(); + } + } + for (i = 0; i < 3; i++) { + kchan = &kpit.channels[i]; + sc = &pit->channels[i]; + sc->count = kchan->count; + sc->latched_count = kchan->latched_count; + sc->count_latched = kchan->count_latched; + sc->status_latched = kchan->status_latched; + sc->status = kchan->status; + sc->read_state = kchan->read_state; + sc->write_state = kchan->write_state; + sc->write_latch = kchan->write_latch; + sc->rw_mode = kchan->rw_mode; + sc->mode = kchan->mode; + sc->bcd = kchan->bcd; + sc->gate = kchan->gate; + sc->count_load_time = kchan->count_load_time + s->kernel_clock_offset; + } + + sc = &pit->channels[0]; + sc->next_transition_time = + pit_get_next_transition_time(sc, sc->count_load_time); +} + +static void kvm_pit_put(PITCommonState *pit) +{ + KVMPITState *s = KVM_PIT(pit); + struct kvm_pit_state2 kpit = {}; + struct kvm_pit_channel_state *kchan; + struct PITChannelState *sc; + int i, ret; + + /* The offset keeps changing as long as the VM is stopped. */ + if (s->vm_stopped) { + kvm_pit_update_clock_offset(s); + } + + kpit.flags = pit->channels[0].irq_disabled ? 
KVM_PIT_FLAGS_HPET_LEGACY : 0; + for (i = 0; i < 3; i++) { + kchan = &kpit.channels[i]; + sc = &pit->channels[i]; + kchan->count = sc->count; + kchan->latched_count = sc->latched_count; + kchan->count_latched = sc->count_latched; + kchan->status_latched = sc->status_latched; + kchan->status = sc->status; + kchan->read_state = sc->read_state; + kchan->write_state = sc->write_state; + kchan->write_latch = sc->write_latch; + kchan->rw_mode = sc->rw_mode; + kchan->mode = sc->mode; + kchan->bcd = sc->bcd; + kchan->gate = sc->gate; + kchan->count_load_time = sc->count_load_time - s->kernel_clock_offset; + } + + ret = kvm_vm_ioctl(kvm_state, + kvm_has_pit_state2() ? KVM_SET_PIT2 : KVM_SET_PIT, + &kpit); + if (ret < 0) { + fprintf(stderr, "%s failed: %s\n", + kvm_has_pit_state2() ? "KVM_SET_PIT2" : "KVM_SET_PIT", + strerror(-ret)); + abort(); + } +} + +static void kvm_pit_set_gate(PITCommonState *s, PITChannelState *sc, int val) +{ + kvm_pit_get(s); + + switch (sc->mode) { + default: + case 0: + case 4: + /* XXX: just disable/enable counting */ + break; + case 1: + case 2: + case 3: + case 5: + if (sc->gate < val) { + /* restart counting on rising edge */ + sc->count_load_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + } + break; + } + sc->gate = val; + + kvm_pit_put(s); +} + +static void kvm_pit_get_channel_info(PITCommonState *s, PITChannelState *sc, + PITChannelInfo *info) +{ + kvm_pit_get(s); + + pit_get_channel_info_common(s, sc, info); +} + +static void kvm_pit_reset(DeviceState *dev) +{ + PITCommonState *s = PIT_COMMON(dev); + + pit_reset_common(s); + + kvm_pit_put(s); +} + +static void kvm_pit_irq_control(void *opaque, int n, int enable) +{ + PITCommonState *pit = opaque; + PITChannelState *s = &pit->channels[0]; + + kvm_pit_get(pit); + + s->irq_disabled = !enable; + + kvm_pit_put(pit); +} + +static void kvm_pit_vm_state_change(void *opaque, bool running, + RunState state) +{ + KVMPITState *s = opaque; + + if (running) { + kvm_pit_update_clock_offset(s); + kvm_pit_put(PIT_COMMON(s)); + s->vm_stopped = false; + } else { + kvm_pit_update_clock_offset(s); + kvm_pit_get(PIT_COMMON(s)); + s->vm_stopped = true; + } +} + +static void kvm_pit_realizefn(DeviceState *dev, Error **errp) +{ + PITCommonState *pit = PIT_COMMON(dev); + KVMPITClass *kpc = KVM_PIT_GET_CLASS(dev); + KVMPITState *s = KVM_PIT(pit); + struct kvm_pit_config config = { + .flags = 0, + }; + int ret; + + if (kvm_check_extension(kvm_state, KVM_CAP_PIT2)) { + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_PIT2, &config); + } else { + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_PIT); + } + if (ret < 0) { + error_setg(errp, "Create kernel PIC irqchip failed: %s", + strerror(-ret)); + return; + } + switch (s->lost_tick_policy) { + case LOST_TICK_POLICY_DELAY: + break; /* enabled by default */ + case LOST_TICK_POLICY_DISCARD: + if (kvm_check_extension(kvm_state, KVM_CAP_REINJECT_CONTROL)) { + struct kvm_reinject_control control = { .pit_reinject = 0 }; + + ret = kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control); + if (ret < 0) { + error_setg(errp, + "Can't disable in-kernel PIT reinjection: %s", + strerror(-ret)); + return; + } + } + break; + default: + error_setg(errp, "Lost tick policy not supported."); + return; + } + + memory_region_init_io(&pit->ioports, OBJECT(dev), NULL, NULL, "kvm-pit", 4); + + qdev_init_gpio_in(dev, kvm_pit_irq_control, 1); + + qemu_add_vm_change_state_handler(kvm_pit_vm_state_change, s); + + kpc->parent_realize(dev, errp); +} + +static Property kvm_pit_properties[] = { + DEFINE_PROP_UINT32("iobase", 
PITCommonState, iobase, -1), + DEFINE_PROP_LOSTTICKPOLICY("lost_tick_policy", KVMPITState, + lost_tick_policy, LOST_TICK_POLICY_DELAY), + DEFINE_PROP_END_OF_LIST(), +}; + +static void kvm_pit_class_init(ObjectClass *klass, void *data) +{ + KVMPITClass *kpc = KVM_PIT_CLASS(klass); + PITCommonClass *k = PIT_COMMON_CLASS(klass); + DeviceClass *dc = DEVICE_CLASS(klass); + + device_class_set_parent_realize(dc, kvm_pit_realizefn, + &kpc->parent_realize); + k->set_channel_gate = kvm_pit_set_gate; + k->get_channel_info = kvm_pit_get_channel_info; + dc->reset = kvm_pit_reset; + device_class_set_props(dc, kvm_pit_properties); +} + +static const TypeInfo kvm_pit_info = { + .name = TYPE_KVM_I8254, + .parent = TYPE_PIT_COMMON, + .instance_size = sizeof(KVMPITState), + .class_init = kvm_pit_class_init, + .class_size = sizeof(KVMPITClass), +}; + +static void kvm_pit_register(void) +{ + type_register_static(&kvm_pit_info); +} + +type_init(kvm_pit_register) diff --git a/hw/i386/kvm/i8259.c b/hw/i386/kvm/i8259.c new file mode 100644 index 000000000..d61bae4dc --- /dev/null +++ b/hw/i386/kvm/i8259.c @@ -0,0 +1,167 @@ +/* + * KVM in-kernel PIC (i8259) support + * + * Copyright (c) 2011 Siemens AG + * + * Authors: + * Jan Kiszka <jan.kiszka@siemens.com> + * + * This work is licensed under the terms of the GNU GPL version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "hw/isa/i8259_internal.h" +#include "hw/intc/i8259.h" +#include "qemu/module.h" +#include "hw/i386/apic_internal.h" +#include "hw/irq.h" +#include "sysemu/kvm.h" +#include "qom/object.h" + +#define TYPE_KVM_I8259 "kvm-i8259" +typedef struct KVMPICClass KVMPICClass; +DECLARE_CLASS_CHECKERS(KVMPICClass, KVM_PIC, + TYPE_KVM_I8259) + +/** + * KVMPICClass: + * @parent_realize: The parent's realizefn. + */ +struct KVMPICClass { + PICCommonClass parent_class; + + DeviceRealize parent_realize; +}; + +static void kvm_pic_get(PICCommonState *s) +{ + struct kvm_irqchip chip; + struct kvm_pic_state *kpic; + int ret; + + chip.chip_id = s->master ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE; + ret = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, &chip); + if (ret < 0) { + fprintf(stderr, "KVM_GET_IRQCHIP failed: %s\n", strerror(-ret)); + abort(); + } + + kpic = &chip.chip.pic; + + s->last_irr = kpic->last_irr; + s->irr = kpic->irr; + s->imr = kpic->imr; + s->isr = kpic->isr; + s->priority_add = kpic->priority_add; + s->irq_base = kpic->irq_base; + s->read_reg_select = kpic->read_reg_select; + s->poll = kpic->poll; + s->special_mask = kpic->special_mask; + s->init_state = kpic->init_state; + s->auto_eoi = kpic->auto_eoi; + s->rotate_on_auto_eoi = kpic->rotate_on_auto_eoi; + s->special_fully_nested_mode = kpic->special_fully_nested_mode; + s->init4 = kpic->init4; + s->elcr = kpic->elcr; + s->elcr_mask = kpic->elcr_mask; +} + +static void kvm_pic_put(PICCommonState *s) +{ + struct kvm_irqchip chip; + struct kvm_pic_state *kpic; + int ret; + + chip.chip_id = s->master ? 
KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE; + + kpic = &chip.chip.pic; + + kpic->last_irr = s->last_irr; + kpic->irr = s->irr; + kpic->imr = s->imr; + kpic->isr = s->isr; + kpic->priority_add = s->priority_add; + kpic->irq_base = s->irq_base; + kpic->read_reg_select = s->read_reg_select; + kpic->poll = s->poll; + kpic->special_mask = s->special_mask; + kpic->init_state = s->init_state; + kpic->auto_eoi = s->auto_eoi; + kpic->rotate_on_auto_eoi = s->rotate_on_auto_eoi; + kpic->special_fully_nested_mode = s->special_fully_nested_mode; + kpic->init4 = s->init4; + kpic->elcr = s->elcr; + kpic->elcr_mask = s->elcr_mask; + + ret = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, &chip); + if (ret < 0) { + fprintf(stderr, "KVM_SET_IRQCHIP failed: %s\n", strerror(-ret)); + abort(); + } +} + +static void kvm_pic_reset(DeviceState *dev) +{ + PICCommonState *s = PIC_COMMON(dev); + + s->elcr = 0; + pic_reset_common(s); + + kvm_pic_put(s); +} + +static void kvm_pic_set_irq(void *opaque, int irq, int level) +{ + int delivered; + + pic_stat_update_irq(irq, level); + delivered = kvm_set_irq(kvm_state, irq, level); + apic_report_irq_delivered(delivered); +} + +static void kvm_pic_realize(DeviceState *dev, Error **errp) +{ + PICCommonState *s = PIC_COMMON(dev); + KVMPICClass *kpc = KVM_PIC_GET_CLASS(dev); + + memory_region_init_io(&s->base_io, OBJECT(dev), NULL, NULL, "kvm-pic", 2); + memory_region_init_io(&s->elcr_io, OBJECT(dev), NULL, NULL, "kvm-elcr", 1); + + kpc->parent_realize(dev, errp); +} + +qemu_irq *kvm_i8259_init(ISABus *bus) +{ + i8259_init_chip(TYPE_KVM_I8259, bus, true); + i8259_init_chip(TYPE_KVM_I8259, bus, false); + + return qemu_allocate_irqs(kvm_pic_set_irq, NULL, ISA_NUM_IRQS); +} + +static void kvm_i8259_class_init(ObjectClass *klass, void *data) +{ + KVMPICClass *kpc = KVM_PIC_CLASS(klass); + PICCommonClass *k = PIC_COMMON_CLASS(klass); + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->reset = kvm_pic_reset; + device_class_set_parent_realize(dc, kvm_pic_realize, &kpc->parent_realize); + k->pre_save = kvm_pic_get; + k->post_load = kvm_pic_put; +} + +static const TypeInfo kvm_i8259_info = { + .name = TYPE_KVM_I8259, + .parent = TYPE_PIC_COMMON, + .instance_size = sizeof(PICCommonState), + .class_init = kvm_i8259_class_init, + .class_size = sizeof(KVMPICClass), +}; + +static void kvm_pic_register_types(void) +{ + type_register_static(&kvm_i8259_info); +} + +type_init(kvm_pic_register_types) diff --git a/hw/i386/kvm/ioapic.c b/hw/i386/kvm/ioapic.c new file mode 100644 index 000000000..ee7c8ef68 --- /dev/null +++ b/hw/i386/kvm/ioapic.c @@ -0,0 +1,165 @@ +/* + * KVM in-kernel IOPIC support + * + * Copyright (c) 2011 Siemens AG + * + * Authors: + * Jan Kiszka <jan.kiszka@siemens.com> + * + * This work is licensed under the terms of the GNU GPL version 2. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "monitor/monitor.h" +#include "hw/i386/x86.h" +#include "hw/qdev-properties.h" +#include "hw/i386/ioapic_internal.h" +#include "hw/i386/apic_internal.h" +#include "sysemu/kvm.h" + +/* PC Utility function */ +void kvm_pc_setup_irq_routing(bool pci_enabled) +{ + KVMState *s = kvm_state; + int i; + + assert(kvm_has_gsi_routing()); + for (i = 0; i < 8; ++i) { + if (i == 2) { + continue; + } + kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_PIC_MASTER, i); + } + for (i = 8; i < 16; ++i) { + kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_PIC_SLAVE, i - 8); + } + if (pci_enabled) { + for (i = 0; i < 24; ++i) { + if (i == 0) { + kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_IOAPIC, 2); + } else if (i != 2) { + kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_IOAPIC, i); + } + } + } + kvm_irqchip_commit_routes(s); +} + +typedef struct KVMIOAPICState KVMIOAPICState; + +struct KVMIOAPICState { + IOAPICCommonState ioapic; + uint32_t kvm_gsi_base; +}; + +static void kvm_ioapic_get(IOAPICCommonState *s) +{ + struct kvm_irqchip chip; + struct kvm_ioapic_state *kioapic; + int ret, i; + + chip.chip_id = KVM_IRQCHIP_IOAPIC; + ret = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, &chip); + if (ret < 0) { + fprintf(stderr, "KVM_GET_IRQCHIP failed: %s\n", strerror(-ret)); + abort(); + } + + kioapic = &chip.chip.ioapic; + + s->id = kioapic->id; + s->ioregsel = kioapic->ioregsel; + s->irr = kioapic->irr; + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + s->ioredtbl[i] = kioapic->redirtbl[i].bits; + } +} + +static void kvm_ioapic_put(IOAPICCommonState *s) +{ + struct kvm_irqchip chip; + struct kvm_ioapic_state *kioapic; + int ret, i; + + chip.chip_id = KVM_IRQCHIP_IOAPIC; + kioapic = &chip.chip.ioapic; + + kioapic->id = s->id; + kioapic->ioregsel = s->ioregsel; + kioapic->base_address = s->busdev.mmio[0].addr; + kioapic->irr = s->irr; + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + kioapic->redirtbl[i].bits = s->ioredtbl[i]; + } + + ret = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, &chip); + if (ret < 0) { + fprintf(stderr, "KVM_SET_IRQCHIP failed: %s\n", strerror(-ret)); + abort(); + } +} + +static void kvm_ioapic_reset(DeviceState *dev) +{ + IOAPICCommonState *s = IOAPIC_COMMON(dev); + + ioapic_reset_common(dev); + kvm_ioapic_put(s); +} + +static void kvm_ioapic_set_irq(void *opaque, int irq, int level) +{ + KVMIOAPICState *s = opaque; + IOAPICCommonState *common = IOAPIC_COMMON(s); + int delivered; + + ioapic_stat_update_irq(common, irq, level); + delivered = kvm_set_irq(kvm_state, s->kvm_gsi_base + irq, level); + apic_report_irq_delivered(delivered); +} + +static void kvm_ioapic_realize(DeviceState *dev, Error **errp) +{ + IOAPICCommonState *s = IOAPIC_COMMON(dev); + + memory_region_init_io(&s->io_memory, OBJECT(dev), NULL, NULL, "kvm-ioapic", 0x1000); + /* + * KVM ioapic only supports 0x11 now. This will only be used when + * we want to dump ioapic version. 
+ */ + s->version = 0x11; + + qdev_init_gpio_in(dev, kvm_ioapic_set_irq, IOAPIC_NUM_PINS); +} + +static Property kvm_ioapic_properties[] = { + DEFINE_PROP_UINT32("gsi_base", KVMIOAPICState, kvm_gsi_base, 0), + DEFINE_PROP_END_OF_LIST() +}; + +static void kvm_ioapic_class_init(ObjectClass *klass, void *data) +{ + IOAPICCommonClass *k = IOAPIC_COMMON_CLASS(klass); + DeviceClass *dc = DEVICE_CLASS(klass); + + k->realize = kvm_ioapic_realize; + k->pre_save = kvm_ioapic_get; + k->post_load = kvm_ioapic_put; + dc->reset = kvm_ioapic_reset; + device_class_set_props(dc, kvm_ioapic_properties); +} + +static const TypeInfo kvm_ioapic_info = { + .name = TYPE_KVM_IOAPIC, + .parent = TYPE_IOAPIC_COMMON, + .instance_size = sizeof(KVMIOAPICState), + .class_init = kvm_ioapic_class_init, +}; + +static void kvm_ioapic_register_types(void) +{ + type_register_static(&kvm_ioapic_info); +} + +type_init(kvm_ioapic_register_types) diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build new file mode 100644 index 000000000..95467f1de --- /dev/null +++ b/hw/i386/kvm/meson.build @@ -0,0 +1,8 @@ +i386_kvm_ss = ss.source_set() +i386_kvm_ss.add(files('clock.c')) +i386_kvm_ss.add(when: 'CONFIG_APIC', if_true: files('apic.c')) +i386_kvm_ss.add(when: 'CONFIG_I8254', if_true: files('i8254.c')) +i386_kvm_ss.add(when: 'CONFIG_I8259', if_true: files('i8259.c')) +i386_kvm_ss.add(when: 'CONFIG_IOAPIC', if_true: files('ioapic.c')) + +i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss) diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c new file mode 100644 index 000000000..43f8a8f67 --- /dev/null +++ b/hw/i386/kvmvapic.c @@ -0,0 +1,871 @@ +/* + * TPR optimization for 32-bit Windows guests (XP and Server 2003) + * + * Copyright (C) 2007-2008 Qumranet Technologies + * Copyright (C) 2012 Jan Kiszka, Siemens AG + * + * This work is licensed under the terms of the GNU GPL version 2, or + * (at your option) any later version. See the COPYING file in the + * top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/module.h" +#include "sysemu/sysemu.h" +#include "sysemu/cpus.h" +#include "sysemu/hw_accel.h" +#include "sysemu/kvm.h" +#include "sysemu/runstate.h" +#include "hw/i386/apic_internal.h" +#include "hw/sysbus.h" +#include "hw/boards.h" +#include "migration/vmstate.h" +#include "qom/object.h" + +#define VAPIC_IO_PORT 0x7e + +#define VAPIC_CPU_SHIFT 7 + +#define ROM_BLOCK_SIZE 512 +#define ROM_BLOCK_MASK (~(ROM_BLOCK_SIZE - 1)) + +typedef enum VAPICMode { + VAPIC_INACTIVE = 0, + VAPIC_ACTIVE = 1, + VAPIC_STANDBY = 2, +} VAPICMode; + +typedef struct VAPICHandlers { + uint32_t set_tpr; + uint32_t set_tpr_eax; + uint32_t get_tpr[8]; + uint32_t get_tpr_stack; +} QEMU_PACKED VAPICHandlers; + +typedef struct GuestROMState { + char signature[8]; + uint32_t vaddr; + uint32_t fixup_start; + uint32_t fixup_end; + uint32_t vapic_vaddr; + uint32_t vapic_size; + uint32_t vcpu_shift; + uint32_t real_tpr_addr; + VAPICHandlers up; + VAPICHandlers mp; +} QEMU_PACKED GuestROMState; + +struct VAPICROMState { + SysBusDevice busdev; + MemoryRegion io; + MemoryRegion rom; + uint32_t state; + uint32_t rom_state_paddr; + uint32_t rom_state_vaddr; + uint32_t vapic_paddr; + uint32_t real_tpr_addr; + GuestROMState rom_state; + size_t rom_size; + bool rom_mapped_writable; + VMChangeStateEntry *vmsentry; +}; + +#define TYPE_VAPIC "kvmvapic" +OBJECT_DECLARE_SIMPLE_TYPE(VAPICROMState, VAPIC) + +#define TPR_INSTR_ABS_MODRM 0x1 +#define TPR_INSTR_MATCH_MODRM_REG 0x2 + +typedef struct TPRInstruction { + uint8_t opcode; + uint8_t modrm_reg; + unsigned int flags; + TPRAccess access; + size_t length; + off_t addr_offset; +} TPRInstruction; + +/* must be sorted by length, shortest first */ +static const TPRInstruction tpr_instr[] = { + { /* mov abs to eax */ + .opcode = 0xa1, + .access = TPR_ACCESS_READ, + .length = 5, + .addr_offset = 1, + }, + { /* mov eax to abs */ + .opcode = 0xa3, + .access = TPR_ACCESS_WRITE, + .length = 5, + .addr_offset = 1, + }, + { /* mov r32 to r/m32 */ + .opcode = 0x89, + .flags = TPR_INSTR_ABS_MODRM, + .access = TPR_ACCESS_WRITE, + .length = 6, + .addr_offset = 2, + }, + { /* mov r/m32 to r32 */ + .opcode = 0x8b, + .flags = TPR_INSTR_ABS_MODRM, + .access = TPR_ACCESS_READ, + .length = 6, + .addr_offset = 2, + }, + { /* push r/m32 */ + .opcode = 0xff, + .modrm_reg = 6, + .flags = TPR_INSTR_ABS_MODRM | TPR_INSTR_MATCH_MODRM_REG, + .access = TPR_ACCESS_READ, + .length = 6, + .addr_offset = 2, + }, + { /* mov imm32, r/m32 (c7/0) */ + .opcode = 0xc7, + .modrm_reg = 0, + .flags = TPR_INSTR_ABS_MODRM | TPR_INSTR_MATCH_MODRM_REG, + .access = TPR_ACCESS_WRITE, + .length = 10, + .addr_offset = 2, + }, +}; + +static void read_guest_rom_state(VAPICROMState *s) +{ + cpu_physical_memory_read(s->rom_state_paddr, &s->rom_state, + sizeof(GuestROMState)); +} + +static void write_guest_rom_state(VAPICROMState *s) +{ + cpu_physical_memory_write(s->rom_state_paddr, &s->rom_state, + sizeof(GuestROMState)); +} + +static void update_guest_rom_state(VAPICROMState *s) +{ + read_guest_rom_state(s); + + s->rom_state.real_tpr_addr = cpu_to_le32(s->real_tpr_addr); + s->rom_state.vcpu_shift = cpu_to_le32(VAPIC_CPU_SHIFT); + + write_guest_rom_state(s); +} + +static int find_real_tpr_addr(VAPICROMState *s, CPUX86State *env) +{ + CPUState *cs = env_cpu(env); + hwaddr paddr; + target_ulong addr; + + if (s->state == VAPIC_ACTIVE) { + return 0; + } + /* + * If there is no prior TPR access instruction we could analyze (which is + * the case after resume from hibernation), we need to scan the 
possible + * virtual address space for the APIC mapping. + */ + for (addr = 0xfffff000; addr >= 0x80000000; addr -= TARGET_PAGE_SIZE) { + paddr = cpu_get_phys_page_debug(cs, addr); + if (paddr != APIC_DEFAULT_ADDRESS) { + continue; + } + s->real_tpr_addr = addr + 0x80; + update_guest_rom_state(s); + return 0; + } + return -1; +} + +static uint8_t modrm_reg(uint8_t modrm) +{ + return (modrm >> 3) & 7; +} + +static bool is_abs_modrm(uint8_t modrm) +{ + return (modrm & 0xc7) == 0x05; +} + +static bool opcode_matches(uint8_t *opcode, const TPRInstruction *instr) +{ + return opcode[0] == instr->opcode && + (!(instr->flags & TPR_INSTR_ABS_MODRM) || is_abs_modrm(opcode[1])) && + (!(instr->flags & TPR_INSTR_MATCH_MODRM_REG) || + modrm_reg(opcode[1]) == instr->modrm_reg); +} + +static int evaluate_tpr_instruction(VAPICROMState *s, X86CPU *cpu, + target_ulong *pip, TPRAccess access) +{ + CPUState *cs = CPU(cpu); + const TPRInstruction *instr; + target_ulong ip = *pip; + uint8_t opcode[2]; + uint32_t real_tpr_addr; + int i; + + if ((ip & 0xf0000000ULL) != 0x80000000ULL && + (ip & 0xf0000000ULL) != 0xe0000000ULL) { + return -1; + } + + /* + * Early Windows 2003 SMP initialization contains a + * + * mov imm32, r/m32 + * + * instruction that is patched by TPR optimization. The problem is that + * RSP, used by the patched instruction, is zero, so the guest gets a + * double fault and dies. + */ + if (cpu->env.regs[R_ESP] == 0) { + return -1; + } + + if (kvm_enabled() && !kvm_irqchip_in_kernel()) { + /* + * KVM without kernel-based TPR access reporting will pass an IP that + * points after the accessing instruction. So we need to look backward + * to find the reason. + */ + for (i = 0; i < ARRAY_SIZE(tpr_instr); i++) { + instr = &tpr_instr[i]; + if (instr->access != access) { + continue; + } + if (cpu_memory_rw_debug(cs, ip - instr->length, opcode, + sizeof(opcode), 0) < 0) { + return -1; + } + if (opcode_matches(opcode, instr)) { + ip -= instr->length; + goto instruction_ok; + } + } + return -1; + } else { + if (cpu_memory_rw_debug(cs, ip, opcode, sizeof(opcode), 0) < 0) { + return -1; + } + for (i = 0; i < ARRAY_SIZE(tpr_instr); i++) { + instr = &tpr_instr[i]; + if (opcode_matches(opcode, instr)) { + goto instruction_ok; + } + } + return -1; + } + +instruction_ok: + /* + * Grab the virtual TPR address from the instruction + * and update the cached values. + */ + if (cpu_memory_rw_debug(cs, ip + instr->addr_offset, + (void *)&real_tpr_addr, + sizeof(real_tpr_addr), 0) < 0) { + return -1; + } + real_tpr_addr = le32_to_cpu(real_tpr_addr); + if ((real_tpr_addr & 0xfff) != 0x80) { + return -1; + } + s->real_tpr_addr = real_tpr_addr; + update_guest_rom_state(s); + + *pip = ip; + return 0; +} + +static int update_rom_mapping(VAPICROMState *s, CPUX86State *env, target_ulong ip) +{ + CPUState *cs = env_cpu(env); + hwaddr paddr; + uint32_t rom_state_vaddr; + uint32_t pos, patch, offset; + + /* nothing to do if already activated */ + if (s->state == VAPIC_ACTIVE) { + return 0; + } + + /* bail out if ROM init code was not executed (missing ROM?) 
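+     * (vapic_reset() leaves the device in VAPIC_INACTIVE with
+     * rom_state_paddr cleared, so there is no guest state area to map yet.)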
*/ + if (s->state == VAPIC_INACTIVE) { + return -1; + } + + /* find out virtual address of the ROM */ + rom_state_vaddr = s->rom_state_paddr + (ip & 0xf0000000); + paddr = cpu_get_phys_page_debug(cs, rom_state_vaddr); + if (paddr == -1) { + return -1; + } + paddr += rom_state_vaddr & ~TARGET_PAGE_MASK; + if (paddr != s->rom_state_paddr) { + return -1; + } + read_guest_rom_state(s); + if (memcmp(s->rom_state.signature, "kvm aPiC", 8) != 0) { + return -1; + } + s->rom_state_vaddr = rom_state_vaddr; + + /* fixup addresses in ROM if needed */ + if (rom_state_vaddr == le32_to_cpu(s->rom_state.vaddr)) { + return 0; + } + for (pos = le32_to_cpu(s->rom_state.fixup_start); + pos < le32_to_cpu(s->rom_state.fixup_end); + pos += 4) { + cpu_physical_memory_read(paddr + pos - s->rom_state.vaddr, + &offset, sizeof(offset)); + offset = le32_to_cpu(offset); + cpu_physical_memory_read(paddr + offset, &patch, sizeof(patch)); + patch = le32_to_cpu(patch); + patch += rom_state_vaddr - le32_to_cpu(s->rom_state.vaddr); + patch = cpu_to_le32(patch); + cpu_physical_memory_write(paddr + offset, &patch, sizeof(patch)); + } + read_guest_rom_state(s); + s->vapic_paddr = paddr + le32_to_cpu(s->rom_state.vapic_vaddr) - + le32_to_cpu(s->rom_state.vaddr); + + return 0; +} + +/* + * Tries to read the unique processor number from the Kernel Processor Control + * Region (KPCR) of 32-bit Windows XP and Server 2003. Returns -1 if the KPCR + * cannot be accessed or is considered invalid. This also ensures that we are + * not patching the wrong guest. + */ +static int get_kpcr_number(X86CPU *cpu) +{ + CPUX86State *env = &cpu->env; + struct kpcr { + uint8_t fill1[0x1c]; + uint32_t self; + uint8_t fill2[0x31]; + uint8_t number; + } QEMU_PACKED kpcr; + + if (cpu_memory_rw_debug(CPU(cpu), env->segs[R_FS].base, + (void *)&kpcr, sizeof(kpcr), 0) < 0 || + kpcr.self != env->segs[R_FS].base) { + return -1; + } + return kpcr.number; +} + +static int vapic_enable(VAPICROMState *s, X86CPU *cpu) +{ + int cpu_number = get_kpcr_number(cpu); + hwaddr vapic_paddr; + static const uint8_t enabled = 1; + + if (cpu_number < 0) { + return -1; + } + vapic_paddr = s->vapic_paddr + + (((hwaddr)cpu_number) << VAPIC_CPU_SHIFT); + cpu_physical_memory_write(vapic_paddr + offsetof(VAPICState, enabled), + &enabled, sizeof(enabled)); + apic_enable_vapic(cpu->apic_state, vapic_paddr); + + s->state = VAPIC_ACTIVE; + + return 0; +} + +static void patch_byte(X86CPU *cpu, target_ulong addr, uint8_t byte) +{ + cpu_memory_rw_debug(CPU(cpu), addr, &byte, 1, 1); +} + +static void patch_call(X86CPU *cpu, target_ulong ip, uint32_t target) +{ + uint32_t offset; + + offset = cpu_to_le32(target - ip - 5); + patch_byte(cpu, ip, 0xe8); /* call near */ + cpu_memory_rw_debug(CPU(cpu), ip + 1, (void *)&offset, sizeof(offset), 1); +} + +typedef struct PatchInfo { + VAPICHandlers *handler; + target_ulong ip; +} PatchInfo; + +static void do_patch_instruction(CPUState *cs, run_on_cpu_data data) +{ + X86CPU *x86_cpu = X86_CPU(cs); + PatchInfo *info = (PatchInfo *) data.host_ptr; + VAPICHandlers *handlers = info->handler; + target_ulong ip = info->ip; + uint8_t opcode[2]; + uint32_t imm32 = 0; + + cpu_memory_rw_debug(cs, ip, opcode, sizeof(opcode), 0); + + switch (opcode[0]) { + case 0x89: /* mov r32 to r/m32 */ + patch_byte(x86_cpu, ip, 0x50 + modrm_reg(opcode[1])); /* push reg */ + patch_call(x86_cpu, ip + 1, handlers->set_tpr); + break; + case 0x8b: /* mov r/m32 to r32 */ + patch_byte(x86_cpu, ip, 0x90); + patch_call(x86_cpu, ip + 1, handlers->get_tpr[modrm_reg(opcode[1])]); + 
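/* the 6-byte mov is rewritten as a 1-byte nop plus a 5-byte near call; modrm_reg() picks the get_tpr handler matching the original destination register */ + 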
break; + case 0xa1: /* mov abs to eax */ + patch_call(x86_cpu, ip, handlers->get_tpr[0]); + break; + case 0xa3: /* mov eax to abs */ + patch_call(x86_cpu, ip, handlers->set_tpr_eax); + break; + case 0xc7: /* mov imm32, r/m32 (c7/0) */ + patch_byte(x86_cpu, ip, 0x68); /* push imm32 */ + cpu_memory_rw_debug(cs, ip + 6, (void *)&imm32, sizeof(imm32), 0); + cpu_memory_rw_debug(cs, ip + 1, (void *)&imm32, sizeof(imm32), 1); + patch_call(x86_cpu, ip + 5, handlers->set_tpr); + break; + case 0xff: /* push r/m32 */ + patch_byte(x86_cpu, ip, 0x50); /* push eax */ + patch_call(x86_cpu, ip + 1, handlers->get_tpr_stack); + break; + default: + abort(); + } + + g_free(info); +} + +static void patch_instruction(VAPICROMState *s, X86CPU *cpu, target_ulong ip) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + CPUState *cs = CPU(cpu); + VAPICHandlers *handlers; + PatchInfo *info; + + if (ms->smp.cpus == 1) { + handlers = &s->rom_state.up; + } else { + handlers = &s->rom_state.mp; + } + + info = g_new(PatchInfo, 1); + info->handler = handlers; + info->ip = ip; + + async_safe_run_on_cpu(cs, do_patch_instruction, RUN_ON_CPU_HOST_PTR(info)); +} + +void vapic_report_tpr_access(DeviceState *dev, CPUState *cs, target_ulong ip, + TPRAccess access) +{ + VAPICROMState *s = VAPIC(dev); + X86CPU *cpu = X86_CPU(cs); + CPUX86State *env = &cpu->env; + + cpu_synchronize_state(cs); + + if (evaluate_tpr_instruction(s, cpu, &ip, access) < 0) { + if (s->state == VAPIC_ACTIVE) { + vapic_enable(s, cpu); + } + return; + } + if (update_rom_mapping(s, env, ip) < 0) { + return; + } + if (vapic_enable(s, cpu) < 0) { + return; + } + patch_instruction(s, cpu, ip); +} + +typedef struct VAPICEnableTPRReporting { + DeviceState *apic; + bool enable; +} VAPICEnableTPRReporting; + +static void vapic_do_enable_tpr_reporting(CPUState *cpu, run_on_cpu_data data) +{ + VAPICEnableTPRReporting *info = data.host_ptr; + apic_enable_tpr_access_reporting(info->apic, info->enable); +} + +static void vapic_enable_tpr_reporting(bool enable) +{ + VAPICEnableTPRReporting info = { + .enable = enable, + }; + CPUState *cs; + X86CPU *cpu; + + CPU_FOREACH(cs) { + cpu = X86_CPU(cs); + info.apic = cpu->apic_state; + run_on_cpu(cs, vapic_do_enable_tpr_reporting, RUN_ON_CPU_HOST_PTR(&info)); + } +} + +static void vapic_reset(DeviceState *dev) +{ + VAPICROMState *s = VAPIC(dev); + + s->state = VAPIC_INACTIVE; + s->rom_state_paddr = 0; + vapic_enable_tpr_reporting(false); +} + +/* + * Set the IRQ polling hypercalls to the supported variant: + * - vmcall if using KVM in-kernel irqchip + * - 32-bit VAPIC port write otherwise + */ +static int patch_hypercalls(VAPICROMState *s) +{ + hwaddr rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK; + static const uint8_t vmcall_pattern[] = { /* vmcall */ + 0xb8, 0x1, 0, 0, 0, 0xf, 0x1, 0xc1 + }; + static const uint8_t outl_pattern[] = { /* nop; outl %eax,0x7e */ + 0xb8, 0x1, 0, 0, 0, 0x90, 0xe7, 0x7e + }; + uint8_t alternates[2]; + const uint8_t *pattern; + const uint8_t *patch; + off_t pos; + uint8_t *rom; + + rom = g_malloc(s->rom_size); + cpu_physical_memory_read(rom_paddr, rom, s->rom_size); + + for (pos = 0; pos < s->rom_size - sizeof(vmcall_pattern); pos++) { + if (kvm_irqchip_in_kernel()) { + pattern = outl_pattern; + alternates[0] = outl_pattern[7]; + alternates[1] = outl_pattern[7]; + patch = &vmcall_pattern[5]; + } else { + pattern = vmcall_pattern; + alternates[0] = vmcall_pattern[7]; + alternates[1] = 0xd9; /* AMD's VMMCALL */ + patch = &outl_pattern[5]; + } + if (memcmp(rom + pos, pattern, 7) == 0 && + (rom[pos + 
7] == alternates[0] || rom[pos + 7] == alternates[1])) { + cpu_physical_memory_write(rom_paddr + pos + 5, patch, 3); + /* + * Don't flush the tb here. Under ordinary conditions, the patched + * calls are miles away from the current IP. Under malicious + * conditions, the guest could trick us into crashing. + */ + } + } + + g_free(rom); + return 0; +} + +/* + * For TCG mode, or for when KVM honors read-only memory regions, we need to + * enable write access to the option ROM so that variables can be updated by + * the guest. + */ +static int vapic_map_rom_writable(VAPICROMState *s) +{ + hwaddr rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK; + MemoryRegionSection section; + MemoryRegion *as; + size_t rom_size; + uint8_t *ram; + + as = sysbus_address_space(&s->busdev); + + if (s->rom_mapped_writable) { + memory_region_del_subregion(as, &s->rom); + object_unparent(OBJECT(&s->rom)); + } + + /* grab RAM memory region (region @rom_paddr may still be pc.rom) */ + section = memory_region_find(as, 0, 1); + + /* read ROM size from RAM region */ + if (rom_paddr + 2 >= memory_region_size(section.mr)) { + return -1; + } + ram = memory_region_get_ram_ptr(section.mr); + rom_size = ram[rom_paddr + 2] * ROM_BLOCK_SIZE; + if (rom_size == 0) { + return -1; + } + s->rom_size = rom_size; + + /* We need to round to avoid creating subpages + * from which we cannot run code. */ + rom_size += rom_paddr & ~TARGET_PAGE_MASK; + rom_paddr &= TARGET_PAGE_MASK; + rom_size = TARGET_PAGE_ALIGN(rom_size); + + memory_region_init_alias(&s->rom, OBJECT(s), "kvmvapic-rom", section.mr, + rom_paddr, rom_size); + memory_region_add_subregion_overlap(as, rom_paddr, &s->rom, 1000); + s->rom_mapped_writable = true; + memory_region_unref(section.mr); + + return 0; +} + +static int vapic_prepare(VAPICROMState *s) +{ + if (vapic_map_rom_writable(s) < 0) { + return -1; + } + + if (patch_hypercalls(s) < 0) { + return -1; + } + + vapic_enable_tpr_reporting(true); + + return 0; +} + +static void vapic_write(void *opaque, hwaddr addr, uint64_t data, + unsigned int size) +{ + VAPICROMState *s = opaque; + X86CPU *cpu; + CPUX86State *env; + hwaddr rom_paddr; + + if (!current_cpu) { + return; + } + + cpu_synchronize_state(current_cpu); + cpu = X86_CPU(current_cpu); + env = &cpu->env; + + /* + * The VAPIC supports three PIO-based hypercalls, all via port 0x7E. + * o 16-bit write access: + * Reports the option ROM initialization to the hypervisor. Written + * value is the offset of the state structure in the ROM. + * o 8-bit write access: + * Reactivates the VAPIC after a guest hibernation, i.e. after the + * option ROM content has been re-initialized by a guest power cycle. + * o 32-bit write access: + * Poll for pending IRQs, considering the current VAPIC state. + */ + switch (size) { + case 2: + if (s->state == VAPIC_INACTIVE) { + rom_paddr = (env->segs[R_CS].base + env->eip) & ROM_BLOCK_MASK; + s->rom_state_paddr = rom_paddr + data; + + s->state = VAPIC_STANDBY; + } + if (vapic_prepare(s) < 0) { + s->state = VAPIC_INACTIVE; + s->rom_state_paddr = 0; + break; + } + break; + case 1: + if (kvm_enabled()) { + /* + * Disable triggering instruction in ROM by writing a NOP. + * + * We cannot do this in TCG mode as the reported IP is not + * accurate.
+ */ + pause_all_vcpus(); + patch_byte(cpu, env->eip - 2, 0x66); + patch_byte(cpu, env->eip - 1, 0x90); + resume_all_vcpus(); + } + + if (s->state == VAPIC_ACTIVE) { + break; + } + if (update_rom_mapping(s, env, env->eip) < 0) { + break; + } + if (find_real_tpr_addr(s, env) < 0) { + break; + } + vapic_enable(s, cpu); + break; + default: + case 4: + if (!kvm_irqchip_in_kernel()) { + apic_poll_irq(cpu->apic_state); + } + break; + } +} + +static uint64_t vapic_read(void *opaque, hwaddr addr, unsigned size) +{ + return 0xffffffff; +} + +static const MemoryRegionOps vapic_ops = { + .write = vapic_write, + .read = vapic_read, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void vapic_realize(DeviceState *dev, Error **errp) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + VAPICROMState *s = VAPIC(dev); + + memory_region_init_io(&s->io, OBJECT(s), &vapic_ops, s, "kvmvapic", 2); + sysbus_add_io(sbd, VAPIC_IO_PORT, &s->io); + sysbus_init_ioports(sbd, VAPIC_IO_PORT, 2); + + option_rom[nb_option_roms].name = "kvmvapic.bin"; + option_rom[nb_option_roms].bootindex = -1; + nb_option_roms++; +} + +static void do_vapic_enable(CPUState *cs, run_on_cpu_data data) +{ + VAPICROMState *s = data.host_ptr; + X86CPU *cpu = X86_CPU(cs); + + static const uint8_t enabled = 1; + cpu_physical_memory_write(s->vapic_paddr + offsetof(VAPICState, enabled), + &enabled, sizeof(enabled)); + apic_enable_vapic(cpu->apic_state, s->vapic_paddr); + s->state = VAPIC_ACTIVE; +} + +static void kvmvapic_vm_state_change(void *opaque, bool running, + RunState state) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + VAPICROMState *s = opaque; + uint8_t *zero; + + if (!running) { + return; + } + + if (s->state == VAPIC_ACTIVE) { + if (ms->smp.cpus == 1) { + run_on_cpu(first_cpu, do_vapic_enable, RUN_ON_CPU_HOST_PTR(s)); + } else { + zero = g_malloc0(s->rom_state.vapic_size); + cpu_physical_memory_write(s->vapic_paddr, zero, + s->rom_state.vapic_size); + g_free(zero); + } + } + + qemu_del_vm_change_state_handler(s->vmsentry); + s->vmsentry = NULL; +} + +static int vapic_post_load(void *opaque, int version_id) +{ + VAPICROMState *s = opaque; + + /* + * The old implementation of qemu-kvm did not provide the state + * VAPIC_STANDBY. Reconstruct it. 
+ */ + if (s->state == VAPIC_INACTIVE && s->rom_state_paddr != 0) { + s->state = VAPIC_STANDBY; + } + + if (s->state != VAPIC_INACTIVE) { + if (vapic_prepare(s) < 0) { + return -1; + } + } + + if (!s->vmsentry) { + s->vmsentry = + qemu_add_vm_change_state_handler(kvmvapic_vm_state_change, s); + } + return 0; +} + +static const VMStateDescription vmstate_handlers = { + .name = "kvmvapic-handlers", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(set_tpr, VAPICHandlers), + VMSTATE_UINT32(set_tpr_eax, VAPICHandlers), + VMSTATE_UINT32_ARRAY(get_tpr, VAPICHandlers, 8), + VMSTATE_UINT32(get_tpr_stack, VAPICHandlers), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_guest_rom = { + .name = "kvmvapic-guest-rom", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UNUSED(8), /* signature */ + VMSTATE_UINT32(vaddr, GuestROMState), + VMSTATE_UINT32(fixup_start, GuestROMState), + VMSTATE_UINT32(fixup_end, GuestROMState), + VMSTATE_UINT32(vapic_vaddr, GuestROMState), + VMSTATE_UINT32(vapic_size, GuestROMState), + VMSTATE_UINT32(vcpu_shift, GuestROMState), + VMSTATE_UINT32(real_tpr_addr, GuestROMState), + VMSTATE_STRUCT(up, GuestROMState, 0, vmstate_handlers, VAPICHandlers), + VMSTATE_STRUCT(mp, GuestROMState, 0, vmstate_handlers, VAPICHandlers), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_vapic = { + .name = "kvm-tpr-opt", /* compatible with qemu-kvm VAPIC */ + .version_id = 1, + .minimum_version_id = 1, + .post_load = vapic_post_load, + .fields = (VMStateField[]) { + VMSTATE_STRUCT(rom_state, VAPICROMState, 0, vmstate_guest_rom, + GuestROMState), + VMSTATE_UINT32(state, VAPICROMState), + VMSTATE_UINT32(real_tpr_addr, VAPICROMState), + VMSTATE_UINT32(rom_state_vaddr, VAPICROMState), + VMSTATE_UINT32(vapic_paddr, VAPICROMState), + VMSTATE_UINT32(rom_state_paddr, VAPICROMState), + VMSTATE_END_OF_LIST() + } +}; + +static void vapic_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->reset = vapic_reset; + dc->vmsd = &vmstate_vapic; + dc->realize = vapic_realize; +} + +static const TypeInfo vapic_type = { + .name = TYPE_VAPIC, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(VAPICROMState), + .class_init = vapic_class_init, +}; + +static void vapic_register(void) +{ + type_register_static(&vapic_type); +} + +type_init(vapic_register); diff --git a/hw/i386/meson.build b/hw/i386/meson.build new file mode 100644 index 000000000..213e2e82b --- /dev/null +++ b/hw/i386/meson.build @@ -0,0 +1,37 @@ +i386_ss = ss.source_set() +i386_ss.add(files( + 'fw_cfg.c', + 'kvmvapic.c', + 'e820_memory_layout.c', + 'multiboot.c', + 'x86.c', +)) + +i386_ss.add(when: 'CONFIG_X86_IOMMU', if_true: files('x86-iommu.c'), + if_false: files('x86-iommu-stub.c')) +i386_ss.add(when: 'CONFIG_AMD_IOMMU', if_true: files('amd_iommu.c')) +i386_ss.add(when: 'CONFIG_I440FX', if_true: files('pc_piix.c')) +i386_ss.add(when: 'CONFIG_MICROVM', if_true: files('microvm.c', 'acpi-microvm.c', 'microvm-dt.c')) +i386_ss.add(when: 'CONFIG_Q35', if_true: files('pc_q35.c')) +i386_ss.add(when: 'CONFIG_VMMOUSE', if_true: files('vmmouse.c')) +i386_ss.add(when: 'CONFIG_VMPORT', if_true: files('vmport.c')) +i386_ss.add(when: 'CONFIG_VTD', if_true: files('intel_iommu.c')) +i386_ss.add(when: 'CONFIG_SGX', if_true: files('sgx-epc.c','sgx.c'), + if_false: files('sgx-stub.c')) + +i386_ss.add(when: 'CONFIG_ACPI', if_true: files('acpi-common.c')) +i386_ss.add(when: 
'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device_x86.c')) +i386_ss.add(when: 'CONFIG_PC', if_true: files( + 'pc.c', + 'pc_sysfw.c', + 'acpi-build.c', + 'port92.c')) +i386_ss.add(when: 'CONFIG_X86_FW_OVMF', if_true: files('pc_sysfw_ovmf.c'), + if_false: files('pc_sysfw_ovmf-stubs.c')) + +subdir('kvm') +subdir('xen') + +i386_ss.add_all(xenpv_ss) + +hw_arch += {'i386': i386_ss} diff --git a/hw/i386/microvm-dt.c b/hw/i386/microvm-dt.c new file mode 100644 index 000000000..9c3c4995b --- /dev/null +++ b/hw/i386/microvm-dt.c @@ -0,0 +1,348 @@ +/* + * microvm device tree support + * + * This generates a device tree for microvm and exports it via fw_cfg + * as "etc/fdt" to the firmware (edk2 specifically). + * + * The use case is to allow edk2 to find the pcie ecam and the virtio + * devices, without adding an ACPI parser, reusing the fdt parser + * which is needed anyway for the arm platform. + * + * Note 1: The device tree is incomplete. CPUs and memory are missing, + * for example; those can be detected using other fw_cfg files. + * Also pci ecam irq routing is not there; edk2 doesn't use + * interrupts. + * + * Note 2: This is for firmware only. OSes should use the more + * complete ACPI tables for hardware discovery. + * + * ---------------------------------------------------------------------- + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2 or later, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>.
+ */ +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "sysemu/device_tree.h" +#include "hw/char/serial.h" +#include "hw/i386/fw_cfg.h" +#include "hw/rtc/mc146818rtc.h" +#include "hw/sysbus.h" +#include "hw/virtio/virtio-mmio.h" +#include "hw/usb/xhci.h" + +#include "microvm-dt.h" + +static bool debug; + +static void dt_add_microvm_irq(MicrovmMachineState *mms, + const char *nodename, uint32_t irq) +{ + int index = 0; + + if (irq >= IO_APIC_SECONDARY_IRQBASE) { + irq -= IO_APIC_SECONDARY_IRQBASE; + index++; + } + + qemu_fdt_setprop_cell(mms->fdt, nodename, "interrupt-parent", + mms->ioapic_phandle[index]); + qemu_fdt_setprop_cells(mms->fdt, nodename, "interrupts", irq, 0); +} + +static void dt_add_virtio(MicrovmMachineState *mms, VirtIOMMIOProxy *mmio) +{ + SysBusDevice *dev = SYS_BUS_DEVICE(mmio); + VirtioBusState *mmio_virtio_bus = &mmio->bus; + BusState *mmio_bus = &mmio_virtio_bus->parent_obj; + char *nodename; + + if (QTAILQ_EMPTY(&mmio_bus->children)) { + return; + } + + hwaddr base = dev->mmio[0].addr; + hwaddr size = 512; + unsigned index = (base - VIRTIO_MMIO_BASE) / size; + uint32_t irq = mms->virtio_irq_base + index; + + nodename = g_strdup_printf("/virtio_mmio@%" PRIx64, base); + qemu_fdt_add_subnode(mms->fdt, nodename); + qemu_fdt_setprop_string(mms->fdt, nodename, "compatible", "virtio,mmio"); + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", 2, base, 2, size); + qemu_fdt_setprop(mms->fdt, nodename, "dma-coherent", NULL, 0); + dt_add_microvm_irq(mms, nodename, irq); + g_free(nodename); +} + +static void dt_add_xhci(MicrovmMachineState *mms) +{ + const char compat[] = "generic-xhci"; + uint32_t irq = MICROVM_XHCI_IRQ; + hwaddr base = MICROVM_XHCI_BASE; + hwaddr size = XHCI_LEN_REGS; + char *nodename; + + nodename = g_strdup_printf("/usb@%" PRIx64, base); + qemu_fdt_add_subnode(mms->fdt, nodename); + qemu_fdt_setprop(mms->fdt, nodename, "compatible", compat, sizeof(compat)); + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", 2, base, 2, size); + qemu_fdt_setprop(mms->fdt, nodename, "dma-coherent", NULL, 0); + dt_add_microvm_irq(mms, nodename, irq); + g_free(nodename); +} + +static void dt_add_pcie(MicrovmMachineState *mms) +{ + hwaddr base = PCIE_MMIO_BASE; + int nr_pcie_buses; + char *nodename; + + nodename = g_strdup_printf("/pcie@%" PRIx64, base); + qemu_fdt_add_subnode(mms->fdt, nodename); + qemu_fdt_setprop_string(mms->fdt, nodename, + "compatible", "pci-host-ecam-generic"); + qemu_fdt_setprop_string(mms->fdt, nodename, "device_type", "pci"); + qemu_fdt_setprop_cell(mms->fdt, nodename, "#address-cells", 3); + qemu_fdt_setprop_cell(mms->fdt, nodename, "#size-cells", 2); + qemu_fdt_setprop_cell(mms->fdt, nodename, "linux,pci-domain", 0); + qemu_fdt_setprop(mms->fdt, nodename, "dma-coherent", NULL, 0); + + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", + 2, PCIE_ECAM_BASE, 2, PCIE_ECAM_SIZE); + if (mms->gpex.mmio64.size) { + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "ranges", + + 1, FDT_PCI_RANGE_MMIO, + 2, mms->gpex.mmio32.base, + 2, mms->gpex.mmio32.base, + 2, mms->gpex.mmio32.size, + + 1, FDT_PCI_RANGE_MMIO_64BIT, + 2, mms->gpex.mmio64.base, + 2, mms->gpex.mmio64.base, + 2, mms->gpex.mmio64.size); + } else { + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "ranges", + + 1, FDT_PCI_RANGE_MMIO, + 2, mms->gpex.mmio32.base, + 2, mms->gpex.mmio32.base, + 2, mms->gpex.mmio32.size); + } + + nr_pcie_buses = PCIE_ECAM_SIZE / PCIE_MMCFG_SIZE_MIN; + qemu_fdt_setprop_cells(mms->fdt, nodename, "bus-range", 0, + nr_pcie_buses - 1); + + 
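/* bus-range is inclusive, hence nr_pcie_buses - 1; each bus needs one PCIE_MMCFG_SIZE_MIN slice of the ECAM window */ + 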
g_free(nodename); +} + +static void dt_add_ioapic(MicrovmMachineState *mms, SysBusDevice *dev) +{ + hwaddr base = dev->mmio[0].addr; + char *nodename; + uint32_t ph; + int index; + + switch (base) { + case IO_APIC_DEFAULT_ADDRESS: + index = 0; + break; + case IO_APIC_SECONDARY_ADDRESS: + index = 1; + break; + default: + fprintf(stderr, "unknown ioapic @ %" PRIx64 "\n", base); + return; + } + + nodename = g_strdup_printf("/ioapic%d@%" PRIx64, index + 1, base); + qemu_fdt_add_subnode(mms->fdt, nodename); + qemu_fdt_setprop_string(mms->fdt, nodename, + "compatible", "intel,ce4100-ioapic"); + qemu_fdt_setprop(mms->fdt, nodename, "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(mms->fdt, nodename, "#interrupt-cells", 0x2); + qemu_fdt_setprop_cell(mms->fdt, nodename, "#address-cells", 0x2); + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", + 2, base, 2, 0x1000); + + ph = qemu_fdt_alloc_phandle(mms->fdt); + qemu_fdt_setprop_cell(mms->fdt, nodename, "phandle", ph); + qemu_fdt_setprop_cell(mms->fdt, nodename, "linux,phandle", ph); + mms->ioapic_phandle[index] = ph; + + g_free(nodename); +} + +static void dt_add_isa_serial(MicrovmMachineState *mms, ISADevice *dev) +{ + const char compat[] = "ns16550"; + uint32_t irq = object_property_get_int(OBJECT(dev), "irq", NULL); + hwaddr base = object_property_get_int(OBJECT(dev), "iobase", NULL); + hwaddr size = 8; + char *nodename; + + nodename = g_strdup_printf("/serial@%" PRIx64, base); + qemu_fdt_add_subnode(mms->fdt, nodename); + qemu_fdt_setprop(mms->fdt, nodename, "compatible", compat, sizeof(compat)); + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", 2, base, 2, size); + dt_add_microvm_irq(mms, nodename, irq); + + if (base == 0x3f8 /* com1 */) { + qemu_fdt_setprop_string(mms->fdt, "/chosen", "stdout-path", nodename); + } + + g_free(nodename); +} + +static void dt_add_isa_rtc(MicrovmMachineState *mms, ISADevice *dev) +{ + const char compat[] = "motorola,mc146818"; + uint32_t irq = RTC_ISA_IRQ; + hwaddr base = RTC_ISA_BASE; + hwaddr size = 8; + char *nodename; + + nodename = g_strdup_printf("/rtc@%" PRIx64, base); + qemu_fdt_add_subnode(mms->fdt, nodename); + qemu_fdt_setprop(mms->fdt, nodename, "compatible", compat, sizeof(compat)); + qemu_fdt_setprop_sized_cells(mms->fdt, nodename, "reg", 2, base, 2, size); + dt_add_microvm_irq(mms, nodename, irq); + g_free(nodename); +} + +static void dt_setup_isa_bus(MicrovmMachineState *mms, DeviceState *bridge) +{ + BusState *bus = qdev_get_child_bus(bridge, "isa.0"); + BusChild *kid; + Object *obj; + + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + + /* serial */ + obj = object_dynamic_cast(OBJECT(dev), TYPE_ISA_SERIAL); + if (obj) { + dt_add_isa_serial(mms, ISA_DEVICE(obj)); + continue; + } + + /* rtc */ + obj = object_dynamic_cast(OBJECT(dev), TYPE_MC146818_RTC); + if (obj) { + dt_add_isa_rtc(mms, ISA_DEVICE(obj)); + continue; + } + + if (debug) { + fprintf(stderr, "%s: unhandled: %s\n", __func__, + object_get_typename(OBJECT(dev))); + } + } +} + +static void dt_setup_sys_bus(MicrovmMachineState *mms) +{ + BusState *bus; + BusChild *kid; + Object *obj; + + /* sysbus devices */ + bus = sysbus_get_default(); + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + + /* ioapic */ + obj = object_dynamic_cast(OBJECT(dev), TYPE_IOAPIC); + if (obj) { + dt_add_ioapic(mms, SYS_BUS_DEVICE(obj)); + continue; + } + } + + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + + /* virtio */ + obj = 
object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MMIO); + if (obj) { + dt_add_virtio(mms, VIRTIO_MMIO(obj)); + continue; + } + + /* xhci */ + obj = object_dynamic_cast(OBJECT(dev), TYPE_XHCI_SYSBUS); + if (obj) { + dt_add_xhci(mms); + continue; + } + + /* pcie */ + obj = object_dynamic_cast(OBJECT(dev), TYPE_GPEX_HOST); + if (obj) { + dt_add_pcie(mms); + continue; + } + + /* isa */ + obj = object_dynamic_cast(OBJECT(dev), "isabus-bridge"); + if (obj) { + dt_setup_isa_bus(mms, DEVICE(obj)); + continue; + } + + if (debug) { + obj = object_dynamic_cast(OBJECT(dev), TYPE_IOAPIC); + if (obj) { + /* ioapic already added in first pass */ + continue; + } + fprintf(stderr, "%s: unhandled: %s\n", __func__, + object_get_typename(OBJECT(dev))); + } + } +} + +void dt_setup_microvm(MicrovmMachineState *mms) +{ + X86MachineState *x86ms = X86_MACHINE(mms); + int size = 0; + + mms->fdt = create_device_tree(&size); + + /* root node */ + qemu_fdt_setprop_string(mms->fdt, "/", "compatible", "linux,microvm"); + qemu_fdt_setprop_cell(mms->fdt, "/", "#address-cells", 0x2); + qemu_fdt_setprop_cell(mms->fdt, "/", "#size-cells", 0x2); + + qemu_fdt_add_subnode(mms->fdt, "/chosen"); + dt_setup_sys_bus(mms); + + /* add to fw_cfg */ + if (debug) { + fprintf(stderr, "%s: add etc/fdt to fw_cfg\n", __func__); + } + fw_cfg_add_file(x86ms->fw_cfg, "etc/fdt", mms->fdt, size); + + if (debug) { + fprintf(stderr, "%s: writing microvm.fdt\n", __func__); + if (!g_file_set_contents("microvm.fdt", mms->fdt, size, NULL)) { + fprintf(stderr, "%s: writing microvm.fdt failed\n", __func__); + return; + } + int ret = system("dtc -I dtb -O dts microvm.fdt"); + if (ret != 0) { + fprintf(stderr, "%s: oops, dtc not installed?\n", __func__); + } + } +} diff --git a/hw/i386/microvm-dt.h b/hw/i386/microvm-dt.h new file mode 100644 index 000000000..77c79cbdd --- /dev/null +++ b/hw/i386/microvm-dt.h @@ -0,0 +1,8 @@ +#ifndef HW_I386_MICROVM_DT_H +#define HW_I386_MICROVM_DT_H + +#include "hw/i386/microvm.h" + +void dt_setup_microvm(MicrovmMachineState *mms); + +#endif diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c new file mode 100644 index 000000000..4b3b1dd26 --- /dev/null +++ b/hw/i386/microvm.c @@ -0,0 +1,779 @@ +/* + * Copyright (c) 2018 Intel Corporation + * Copyright (c) 2019 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2 or later, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qemu/cutils.h" +#include "qemu/units.h" +#include "qapi/error.h" +#include "qapi/visitor.h" +#include "qapi/qapi-visit-common.h" +#include "sysemu/sysemu.h" +#include "sysemu/cpus.h" +#include "sysemu/numa.h" +#include "sysemu/reset.h" +#include "sysemu/runstate.h" +#include "acpi-microvm.h" +#include "microvm-dt.h" + +#include "hw/loader.h" +#include "hw/irq.h" +#include "hw/kvm/clock.h" +#include "hw/i386/microvm.h" +#include "hw/i386/x86.h" +#include "target/i386/cpu.h" +#include "hw/intc/i8259.h" +#include "hw/timer/i8254.h" +#include "hw/rtc/mc146818rtc.h" +#include "hw/char/serial.h" +#include "hw/display/ramfb.h" +#include "hw/i386/topology.h" +#include "hw/i386/e820_memory_layout.h" +#include "hw/i386/fw_cfg.h" +#include "hw/virtio/virtio-mmio.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/generic_event_device.h" +#include "hw/pci-host/gpex.h" +#include "hw/usb/xhci.h" + +#include "elf.h" +#include "kvm/kvm_i386.h" +#include "hw/xen/start_info.h" + +#define MICROVM_QBOOT_FILENAME "qboot.rom" +#define MICROVM_BIOS_FILENAME "bios-microvm.bin" + +static void microvm_set_rtc(MicrovmMachineState *mms, ISADevice *s) +{ + X86MachineState *x86ms = X86_MACHINE(mms); + int val; + + val = MIN(x86ms->below_4g_mem_size / KiB, 640); + rtc_set_memory(s, 0x15, val); + rtc_set_memory(s, 0x16, val >> 8); + /* extended memory (next 64MiB) */ + if (x86ms->below_4g_mem_size > 1 * MiB) { + val = (x86ms->below_4g_mem_size - 1 * MiB) / KiB; + } else { + val = 0; + } + if (val > 65535) { + val = 65535; + } + rtc_set_memory(s, 0x17, val); + rtc_set_memory(s, 0x18, val >> 8); + rtc_set_memory(s, 0x30, val); + rtc_set_memory(s, 0x31, val >> 8); + /* memory between 16MiB and 4GiB */ + if (x86ms->below_4g_mem_size > 16 * MiB) { + val = (x86ms->below_4g_mem_size - 16 * MiB) / (64 * KiB); + } else { + val = 0; + } + if (val > 65535) { + val = 65535; + } + rtc_set_memory(s, 0x34, val); + rtc_set_memory(s, 0x35, val >> 8); + /* memory above 4GiB */ + val = x86ms->above_4g_mem_size / 65536; + rtc_set_memory(s, 0x5b, val); + rtc_set_memory(s, 0x5c, val >> 8); + rtc_set_memory(s, 0x5d, val >> 16); +} + +static void create_gpex(MicrovmMachineState *mms) +{ + X86MachineState *x86ms = X86_MACHINE(mms); + MemoryRegion *mmio32_alias; + MemoryRegion *mmio64_alias; + MemoryRegion *mmio_reg; + MemoryRegion *ecam_alias; + MemoryRegion *ecam_reg; + DeviceState *dev; + int i; + + dev = qdev_new(TYPE_GPEX_HOST); + sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); + + /* Map only the first size_ecam bytes of ECAM space */ + ecam_alias = g_new0(MemoryRegion, 1); + ecam_reg = sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 0); + memory_region_init_alias(ecam_alias, OBJECT(dev), "pcie-ecam", + ecam_reg, 0, mms->gpex.ecam.size); + memory_region_add_subregion(get_system_memory(), + mms->gpex.ecam.base, ecam_alias); + + /* Map the MMIO window into system address space so as to expose + * the section of PCI MMIO space which starts at the same base address + * (ie 1:1 mapping for that part of PCI MMIO space visible through + * the window). 
+ */ + mmio_reg = sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 1); + if (mms->gpex.mmio32.size) { + mmio32_alias = g_new0(MemoryRegion, 1); + memory_region_init_alias(mmio32_alias, OBJECT(dev), "pcie-mmio32", mmio_reg, + mms->gpex.mmio32.base, mms->gpex.mmio32.size); + memory_region_add_subregion(get_system_memory(), + mms->gpex.mmio32.base, mmio32_alias); + } + if (mms->gpex.mmio64.size) { + mmio64_alias = g_new0(MemoryRegion, 1); + memory_region_init_alias(mmio64_alias, OBJECT(dev), "pcie-mmio64", mmio_reg, + mms->gpex.mmio64.base, mms->gpex.mmio64.size); + memory_region_add_subregion(get_system_memory(), + mms->gpex.mmio64.base, mmio64_alias); + } + + for (i = 0; i < GPEX_NUM_IRQS; i++) { + sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, + x86ms->gsi[mms->gpex.irq + i]); + } +} + +static int microvm_ioapics(MicrovmMachineState *mms) +{ + if (!x86_machine_is_acpi_enabled(X86_MACHINE(mms))) { + return 1; + } + if (mms->ioapic2 == ON_OFF_AUTO_OFF) { + return 1; + } + return 2; +} + +static void microvm_devices_init(MicrovmMachineState *mms) +{ + const char *default_firmware; + X86MachineState *x86ms = X86_MACHINE(mms); + ISABus *isa_bus; + ISADevice *rtc_state; + GSIState *gsi_state; + int ioapics; + int i; + + /* Core components */ + ioapics = microvm_ioapics(mms); + gsi_state = g_malloc0(sizeof(*gsi_state)); + x86ms->gsi = qemu_allocate_irqs(gsi_handler, gsi_state, + IOAPIC_NUM_PINS * ioapics); + + isa_bus = isa_bus_new(NULL, get_system_memory(), get_system_io(), + &error_abort); + isa_bus_irqs(isa_bus, x86ms->gsi); + + ioapic_init_gsi(gsi_state, "machine"); + if (ioapics > 1) { + x86ms->ioapic2 = ioapic_init_secondary(gsi_state); + } + + kvmclock_create(true); + + mms->virtio_irq_base = 5; + mms->virtio_num_transports = 8; + if (x86ms->ioapic2) { + mms->pcie_irq_base = 16; /* 16 -> 19 */ + /* use second ioapic (24 -> 47) for virtio-mmio irq lines */ + mms->virtio_irq_base = IO_APIC_SECONDARY_IRQBASE; + mms->virtio_num_transports = IOAPIC_NUM_PINS; + } else if (x86_machine_is_acpi_enabled(x86ms)) { + mms->pcie_irq_base = 12; /* 12 -> 15 */ + mms->virtio_irq_base = 16; /* 16 -> 23 */ + } + + for (i = 0; i < mms->virtio_num_transports; i++) { + sysbus_create_simple("virtio-mmio", + VIRTIO_MMIO_BASE + i * 512, + x86ms->gsi[mms->virtio_irq_base + i]); + } + + /* Optional and legacy devices */ + if (x86_machine_is_acpi_enabled(x86ms)) { + DeviceState *dev = qdev_new(TYPE_ACPI_GED_X86); + qdev_prop_set_uint32(dev, "ged-event", ACPI_GED_PWR_DOWN_EVT); + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, GED_MMIO_BASE); + /* sysbus_mmio_map(SYS_BUS_DEVICE(dev), 1, GED_MMIO_BASE_MEMHP); */ + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 2, GED_MMIO_BASE_REGS); + sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, + x86ms->gsi[GED_MMIO_IRQ]); + sysbus_realize(SYS_BUS_DEVICE(dev), &error_fatal); + x86ms->acpi_dev = HOTPLUG_HANDLER(dev); + } + + if (x86_machine_is_acpi_enabled(x86ms) && machine_usb(MACHINE(mms))) { + DeviceState *dev = qdev_new(TYPE_XHCI_SYSBUS); + qdev_prop_set_uint32(dev, "intrs", 1); + qdev_prop_set_uint32(dev, "slots", XHCI_MAXSLOTS); + qdev_prop_set_uint32(dev, "p2", 8); + qdev_prop_set_uint32(dev, "p3", 8); + sysbus_realize(SYS_BUS_DEVICE(dev), &error_fatal); + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, MICROVM_XHCI_BASE); + sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, + x86ms->gsi[MICROVM_XHCI_IRQ]); + } + + if (x86_machine_is_acpi_enabled(x86ms) && mms->pcie == ON_OFF_AUTO_ON) { + /* use topmost 25% of the address space available */ + hwaddr phys_size = (hwaddr)1 << X86_CPU(first_cpu)->phys_bits; + if (phys_size > 
0x1000000ll) { + mms->gpex.mmio64.size = phys_size / 4; + mms->gpex.mmio64.base = phys_size - mms->gpex.mmio64.size; + } + mms->gpex.mmio32.base = PCIE_MMIO_BASE; + mms->gpex.mmio32.size = PCIE_MMIO_SIZE; + mms->gpex.ecam.base = PCIE_ECAM_BASE; + mms->gpex.ecam.size = PCIE_ECAM_SIZE; + mms->gpex.irq = mms->pcie_irq_base; + create_gpex(mms); + x86ms->pci_irq_mask = ((1 << (mms->pcie_irq_base + 0)) | + (1 << (mms->pcie_irq_base + 1)) | + (1 << (mms->pcie_irq_base + 2)) | + (1 << (mms->pcie_irq_base + 3))); + } else { + x86ms->pci_irq_mask = 0; + } + + if (mms->pic == ON_OFF_AUTO_ON || mms->pic == ON_OFF_AUTO_AUTO) { + qemu_irq *i8259; + + i8259 = i8259_init(isa_bus, x86_allocate_cpu_irq()); + for (i = 0; i < ISA_NUM_IRQS; i++) { + gsi_state->i8259_irq[i] = i8259[i]; + } + g_free(i8259); + } + + if (mms->pit == ON_OFF_AUTO_ON || mms->pit == ON_OFF_AUTO_AUTO) { + if (kvm_pit_in_kernel()) { + kvm_pit_init(isa_bus, 0x40); + } else { + i8254_pit_init(isa_bus, 0x40, 0, NULL); + } + } + + if (mms->rtc == ON_OFF_AUTO_ON || + (mms->rtc == ON_OFF_AUTO_AUTO && !kvm_enabled())) { + rtc_state = mc146818_rtc_init(isa_bus, 2000, NULL); + microvm_set_rtc(mms, rtc_state); + } + + if (mms->isa_serial) { + serial_hds_isa_init(isa_bus, 0, 1); + } + + default_firmware = x86_machine_is_acpi_enabled(x86ms) + ? MICROVM_BIOS_FILENAME + : MICROVM_QBOOT_FILENAME; + x86_bios_rom_init(MACHINE(mms), default_firmware, get_system_memory(), true); +} + +static void microvm_memory_init(MicrovmMachineState *mms) +{ + MachineState *machine = MACHINE(mms); + X86MachineState *x86ms = X86_MACHINE(mms); + MemoryRegion *ram_below_4g, *ram_above_4g; + MemoryRegion *system_memory = get_system_memory(); + FWCfgState *fw_cfg; + ram_addr_t lowmem = 0xc0000000; /* 3G */ + int i; + + if (machine->ram_size > lowmem) { + x86ms->above_4g_mem_size = machine->ram_size - lowmem; + x86ms->below_4g_mem_size = lowmem; + } else { + x86ms->above_4g_mem_size = 0; + x86ms->below_4g_mem_size = machine->ram_size; + } + + ram_below_4g = g_malloc(sizeof(*ram_below_4g)); + memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram, + 0, x86ms->below_4g_mem_size); + memory_region_add_subregion(system_memory, 0, ram_below_4g); + + e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM); + + if (x86ms->above_4g_mem_size > 0) { + ram_above_4g = g_malloc(sizeof(*ram_above_4g)); + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", + machine->ram, + x86ms->below_4g_mem_size, + x86ms->above_4g_mem_size); + memory_region_add_subregion(system_memory, 0x100000000ULL, + ram_above_4g); + e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM); + } + + fw_cfg = fw_cfg_init_io_dma(FW_CFG_IO_BASE, FW_CFG_IO_BASE + 4, + &address_space_memory); + + fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, machine->smp.cpus); + fw_cfg_add_i16(fw_cfg, FW_CFG_MAX_CPUS, machine->smp.max_cpus); + fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, (uint64_t)machine->ram_size); + fw_cfg_add_i32(fw_cfg, FW_CFG_IRQ0_OVERRIDE, 1); + fw_cfg_add_bytes(fw_cfg, FW_CFG_E820_TABLE, + &e820_reserve, sizeof(e820_reserve)); + fw_cfg_add_file(fw_cfg, "etc/e820", e820_table, + sizeof(struct e820_entry) * e820_get_num_entries()); + + rom_set_fw(fw_cfg); + + if (machine->kernel_filename != NULL) { + x86_load_linux(x86ms, fw_cfg, 0, true); + } + + if (mms->option_roms) { + for (i = 0; i < nb_option_roms; i++) { + rom_add_option(option_rom[i].name, option_rom[i].bootindex); + } + } + + x86ms->fw_cfg = fw_cfg; + x86ms->ioapic_as = &address_space_memory; +} + +static gchar 
*microvm_get_mmio_cmdline(gchar *name, uint32_t virtio_irq_base) +{ + gchar *cmdline; + gchar *separator; + long int index; + int ret; + + separator = g_strrstr(name, "."); + if (!separator) { + return NULL; + } + + if (qemu_strtol(separator + 1, NULL, 10, &index) != 0) { + return NULL; + } + + cmdline = g_malloc0(VIRTIO_CMDLINE_MAXLEN); + ret = g_snprintf(cmdline, VIRTIO_CMDLINE_MAXLEN, + " virtio_mmio.device=512@0x%lx:%ld", + VIRTIO_MMIO_BASE + index * 512, + virtio_irq_base + index); + if (ret < 0 || ret >= VIRTIO_CMDLINE_MAXLEN) { + g_free(cmdline); + return NULL; + } + + return cmdline; +} + +static void microvm_fix_kernel_cmdline(MachineState *machine) +{ + X86MachineState *x86ms = X86_MACHINE(machine); + MicrovmMachineState *mms = MICROVM_MACHINE(machine); + BusState *bus; + BusChild *kid; + char *cmdline; + + /* + * Find MMIO transports with attached devices, and add them to the kernel + * command line. + * + * Yes, this is a hack, but one that heavily improves the UX without + * introducing any significant issues. + */ + cmdline = g_strdup(machine->kernel_cmdline); + bus = sysbus_get_default(); + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + ObjectClass *class = object_get_class(OBJECT(dev)); + + if (class == object_class_by_name(TYPE_VIRTIO_MMIO)) { + VirtIOMMIOProxy *mmio = VIRTIO_MMIO(OBJECT(dev)); + VirtioBusState *mmio_virtio_bus = &mmio->bus; + BusState *mmio_bus = &mmio_virtio_bus->parent_obj; + + if (!QTAILQ_EMPTY(&mmio_bus->children)) { + gchar *mmio_cmdline = microvm_get_mmio_cmdline + (mmio_bus->name, mms->virtio_irq_base); + if (mmio_cmdline) { + char *newcmd = g_strjoin(NULL, cmdline, mmio_cmdline, NULL); + g_free(mmio_cmdline); + g_free(cmdline); + cmdline = newcmd; + } + } + } + } + + fw_cfg_modify_i32(x86ms->fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(cmdline) + 1); + fw_cfg_modify_string(x86ms->fw_cfg, FW_CFG_CMDLINE_DATA, cmdline); + + g_free(cmdline); +} + +static void microvm_device_pre_plug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + X86CPU *cpu = X86_CPU(dev); + + cpu->host_phys_bits = true; /* need reliable phys-bits */ + x86_cpu_pre_plug(hotplug_dev, dev, errp); +} + +static void microvm_device_plug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + x86_cpu_plug(hotplug_dev, dev, errp); +} + +static void microvm_device_unplug_request_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + error_setg(errp, "unplug not supported by microvm"); +} + +static void microvm_device_unplug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + error_setg(errp, "unplug not supported by microvm"); +} + +static HotplugHandler *microvm_get_hotplug_handler(MachineState *machine, + DeviceState *dev) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + return HOTPLUG_HANDLER(machine); + } + return NULL; +} + +static void microvm_machine_state_init(MachineState *machine) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(machine); + X86MachineState *x86ms = X86_MACHINE(machine); + + microvm_memory_init(mms); + + x86_cpus_init(x86ms, CPU_VERSION_LATEST); + + microvm_devices_init(mms); +} + +static void microvm_machine_reset(MachineState *machine) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(machine); + CPUState *cs; + X86CPU *cpu; + + if (!x86_machine_is_acpi_enabled(X86_MACHINE(machine)) && + machine->kernel_filename != NULL && + mms->auto_kernel_cmdline && !mms->kernel_cmdline_fixed) { + microvm_fix_kernel_cmdline(machine); + mms->kernel_cmdline_fixed = true; + } + 
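+ /* reset all devices, then put each vCPU's local APIC back into its power-on state */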
+ qemu_devices_reset(); + + CPU_FOREACH(cs) { + cpu = X86_CPU(cs); + + if (cpu->apic_state) { + device_legacy_reset(cpu->apic_state); + } + } +} + +static void microvm_machine_get_pic(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + OnOffAuto pic = mms->pic; + + visit_type_OnOffAuto(v, name, &pic, errp); +} + +static void microvm_machine_set_pic(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &mms->pic, errp); +} + +static void microvm_machine_get_pit(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + OnOffAuto pit = mms->pit; + + visit_type_OnOffAuto(v, name, &pit, errp); +} + +static void microvm_machine_set_pit(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &mms->pit, errp); +} + +static void microvm_machine_get_rtc(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + OnOffAuto rtc = mms->rtc; + + visit_type_OnOffAuto(v, name, &rtc, errp); +} + +static void microvm_machine_set_rtc(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &mms->rtc, errp); +} + +static void microvm_machine_get_pcie(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + OnOffAuto pcie = mms->pcie; + + visit_type_OnOffAuto(v, name, &pcie, errp); +} + +static void microvm_machine_set_pcie(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &mms->pcie, errp); +} + +static void microvm_machine_get_ioapic2(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + OnOffAuto ioapic2 = mms->ioapic2; + + visit_type_OnOffAuto(v, name, &ioapic2, errp); +} + +static void microvm_machine_set_ioapic2(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &mms->ioapic2, errp); +} + +static bool microvm_machine_get_isa_serial(Object *obj, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + return mms->isa_serial; +} + +static void microvm_machine_set_isa_serial(Object *obj, bool value, + Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + mms->isa_serial = value; +} + +static bool microvm_machine_get_option_roms(Object *obj, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + return mms->option_roms; +} + +static void microvm_machine_set_option_roms(Object *obj, bool value, + Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + mms->option_roms = value; +} + +static bool microvm_machine_get_auto_kernel_cmdline(Object *obj, Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + return mms->auto_kernel_cmdline; +} + +static void microvm_machine_set_auto_kernel_cmdline(Object *obj, bool value, + Error **errp) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + mms->auto_kernel_cmdline = value; +} + +static void 
microvm_machine_done(Notifier *notifier, void *data) +{ + MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState, + machine_done); + + acpi_setup_microvm(mms); + dt_setup_microvm(mms); +} + +static void microvm_powerdown_req(Notifier *notifier, void *data) +{ + MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState, + powerdown_req); + X86MachineState *x86ms = X86_MACHINE(mms); + + if (x86ms->acpi_dev) { + Object *obj = OBJECT(x86ms->acpi_dev); + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj); + adevc->send_event(ACPI_DEVICE_IF(x86ms->acpi_dev), + ACPI_POWER_DOWN_STATUS); + } +} + +static void microvm_machine_initfn(Object *obj) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + + /* Configuration */ + mms->pic = ON_OFF_AUTO_AUTO; + mms->pit = ON_OFF_AUTO_AUTO; + mms->rtc = ON_OFF_AUTO_AUTO; + mms->pcie = ON_OFF_AUTO_AUTO; + mms->ioapic2 = ON_OFF_AUTO_AUTO; + mms->isa_serial = true; + mms->option_roms = true; + mms->auto_kernel_cmdline = true; + + /* State */ + mms->kernel_cmdline_fixed = false; + + mms->machine_done.notify = microvm_machine_done; + qemu_add_machine_init_done_notifier(&mms->machine_done); + mms->powerdown_req.notify = microvm_powerdown_req; + qemu_register_powerdown_notifier(&mms->powerdown_req); +} + +static void microvm_class_init(ObjectClass *oc, void *data) +{ + X86MachineClass *x86mc = X86_MACHINE_CLASS(oc); + MachineClass *mc = MACHINE_CLASS(oc); + HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); + + mc->init = microvm_machine_state_init; + + mc->family = "microvm_i386"; + mc->desc = "microvm (i386)"; + mc->units_per_default_bus = 1; + mc->no_floppy = 1; + mc->max_cpus = 288; + mc->has_hotpluggable_cpus = false; + mc->auto_enable_numa_with_memhp = false; + mc->auto_enable_numa_with_memdev = false; + mc->default_cpu_type = TARGET_DEFAULT_CPU_TYPE; + mc->nvdimm_supported = false; + mc->default_ram_id = "microvm.ram"; + + /* Avoid relying too much on kernel components */ + mc->default_kernel_irqchip_split = true; + + /* Machine class handlers */ + mc->reset = microvm_machine_reset; + + /* hotplug (for cpu coldplug) */ + mc->get_hotplug_handler = microvm_get_hotplug_handler; + hc->pre_plug = microvm_device_pre_plug_cb; + hc->plug = microvm_device_plug_cb; + hc->unplug_request = microvm_device_unplug_request_cb; + hc->unplug = microvm_device_unplug_cb; + + x86mc->fwcfg_dma_enabled = true; + + object_class_property_add(oc, MICROVM_MACHINE_PIC, "OnOffAuto", + microvm_machine_get_pic, + microvm_machine_set_pic, + NULL, NULL); + object_class_property_set_description(oc, MICROVM_MACHINE_PIC, + "Enable i8259 PIC"); + + object_class_property_add(oc, MICROVM_MACHINE_PIT, "OnOffAuto", + microvm_machine_get_pit, + microvm_machine_set_pit, + NULL, NULL); + object_class_property_set_description(oc, MICROVM_MACHINE_PIT, + "Enable i8254 PIT"); + + object_class_property_add(oc, MICROVM_MACHINE_RTC, "OnOffAuto", + microvm_machine_get_rtc, + microvm_machine_set_rtc, + NULL, NULL); + object_class_property_set_description(oc, MICROVM_MACHINE_RTC, + "Enable MC146818 RTC"); + + object_class_property_add(oc, MICROVM_MACHINE_PCIE, "OnOffAuto", + microvm_machine_get_pcie, + microvm_machine_set_pcie, + NULL, NULL); + object_class_property_set_description(oc, MICROVM_MACHINE_PCIE, + "Enable PCIe"); + + object_class_property_add(oc, MICROVM_MACHINE_IOAPIC2, "OnOffAuto", + microvm_machine_get_ioapic2, + microvm_machine_set_ioapic2, + NULL, NULL); + object_class_property_set_description(oc, MICROVM_MACHINE_IOAPIC2, + "Enable second IO-APIC"); + + 
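/* plain bool properties; all three default to on in microvm_machine_initfn() */ + 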
object_class_property_add_bool(oc, MICROVM_MACHINE_ISA_SERIAL, + microvm_machine_get_isa_serial, + microvm_machine_set_isa_serial); + object_class_property_set_description(oc, MICROVM_MACHINE_ISA_SERIAL, + "Set off to disable the instantiation of an ISA serial port"); + + object_class_property_add_bool(oc, MICROVM_MACHINE_OPTION_ROMS, + microvm_machine_get_option_roms, + microvm_machine_set_option_roms); + object_class_property_set_description(oc, MICROVM_MACHINE_OPTION_ROMS, + "Set off to disable loading option ROMs"); + + object_class_property_add_bool(oc, MICROVM_MACHINE_AUTO_KERNEL_CMDLINE, + microvm_machine_get_auto_kernel_cmdline, + microvm_machine_set_auto_kernel_cmdline); + object_class_property_set_description(oc, + MICROVM_MACHINE_AUTO_KERNEL_CMDLINE, + "Set off to disable adding virtio-mmio devices to the kernel cmdline"); + + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); +} + +static const TypeInfo microvm_machine_info = { + .name = TYPE_MICROVM_MACHINE, + .parent = TYPE_X86_MACHINE, + .instance_size = sizeof(MicrovmMachineState), + .instance_init = microvm_machine_initfn, + .class_size = sizeof(MicrovmMachineClass), + .class_init = microvm_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_HOTPLUG_HANDLER }, + { } + }, +}; + +static void microvm_machine_init(void) +{ + type_register_static(&microvm_machine_info); +} +type_init(microvm_machine_init); diff --git a/hw/i386/multiboot.c b/hw/i386/multiboot.c new file mode 100644 index 000000000..0a10089f1 --- /dev/null +++ b/hw/i386/multiboot.c @@ -0,0 +1,415 @@ +/* + * QEMU PC System Emulator + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/option.h" +#include "cpu.h" +#include "hw/nvram/fw_cfg.h" +#include "multiboot.h" +#include "hw/loader.h" +#include "elf.h" +#include "sysemu/sysemu.h" +#include "qemu/error-report.h" + +/* Show multiboot debug output */ +//#define DEBUG_MULTIBOOT + +#ifdef DEBUG_MULTIBOOT +#define mb_debug(a...) error_report(a) +#else +#define mb_debug(a...)
+#endif + +#define MULTIBOOT_STRUCT_ADDR 0x9000 + +#if MULTIBOOT_STRUCT_ADDR > 0xf0000 +#error multiboot struct needs to fit in 16 bit real mode +#endif + +enum { + /* Multiboot info */ + MBI_FLAGS = 0, + MBI_MEM_LOWER = 4, + MBI_MEM_UPPER = 8, + MBI_BOOT_DEVICE = 12, + MBI_CMDLINE = 16, + MBI_MODS_COUNT = 20, + MBI_MODS_ADDR = 24, + MBI_MMAP_ADDR = 48, + MBI_BOOTLOADER = 64, + + MBI_SIZE = 88, + + /* Multiboot modules */ + MB_MOD_START = 0, + MB_MOD_END = 4, + MB_MOD_CMDLINE = 8, + + MB_MOD_SIZE = 16, + + /* Region offsets */ + ADDR_E820_MAP = MULTIBOOT_STRUCT_ADDR + 0, + ADDR_MBI = ADDR_E820_MAP + 0x500, + + /* Multiboot flags */ + MULTIBOOT_FLAGS_MEMORY = 1 << 0, + MULTIBOOT_FLAGS_BOOT_DEVICE = 1 << 1, + MULTIBOOT_FLAGS_CMDLINE = 1 << 2, + MULTIBOOT_FLAGS_MODULES = 1 << 3, + MULTIBOOT_FLAGS_MMAP = 1 << 6, + MULTIBOOT_FLAGS_BOOTLOADER = 1 << 9, +}; + +typedef struct { + /* buffer holding kernel, cmdlines and mb_infos */ + void *mb_buf; + /* address in target */ + hwaddr mb_buf_phys; + /* size of mb_buf in bytes */ + unsigned mb_buf_size; + /* offset of mb-info's in bytes */ + hwaddr offset_mbinfo; + /* offset in buffer for cmdlines in bytes */ + hwaddr offset_cmdlines; + /* offset in buffer for bootloader name in bytes */ + hwaddr offset_bootloader; + /* offset of modules in bytes */ + hwaddr offset_mods; + /* available slots for mb modules infos */ + int mb_mods_avail; + /* currently used slots of mb modules */ + int mb_mods_count; +} MultibootState; + +const char *bootloader_name = "qemu"; + +static uint32_t mb_add_cmdline(MultibootState *s, const char *cmdline) +{ + hwaddr p = s->offset_cmdlines; + char *b = (char *)s->mb_buf + p; + + memcpy(b, cmdline, strlen(cmdline) + 1); + s->offset_cmdlines += strlen(b) + 1; + return s->mb_buf_phys + p; +} + +static uint32_t mb_add_bootloader(MultibootState *s, const char *bootloader) +{ + hwaddr p = s->offset_bootloader; + char *b = (char *)s->mb_buf + p; + + memcpy(b, bootloader, strlen(bootloader) + 1); + s->offset_bootloader += strlen(b) + 1; + return s->mb_buf_phys + p; +} + +static void mb_add_mod(MultibootState *s, + hwaddr start, hwaddr end, + hwaddr cmdline_phys) +{ + char *p; + assert(s->mb_mods_count < s->mb_mods_avail); + + p = (char *)s->mb_buf + s->offset_mbinfo + MB_MOD_SIZE * s->mb_mods_count; + + stl_p(p + MB_MOD_START, start); + stl_p(p + MB_MOD_END, end); + stl_p(p + MB_MOD_CMDLINE, cmdline_phys); + + mb_debug("mod%02d: "TARGET_FMT_plx" - "TARGET_FMT_plx, + s->mb_mods_count, start, end); + + s->mb_mods_count++; +} + +int load_multiboot(X86MachineState *x86ms, + FWCfgState *fw_cfg, + FILE *f, + const char *kernel_filename, + const char *initrd_filename, + const char *kernel_cmdline, + int kernel_file_size, + uint8_t *header) +{ + bool multiboot_dma_enabled = X86_MACHINE_GET_CLASS(x86ms)->fwcfg_dma_enabled; + int i, is_multiboot = 0; + uint32_t flags = 0; + uint32_t mh_entry_addr; + uint32_t mh_load_addr; + uint32_t mb_kernel_size; + MultibootState mbs; + uint8_t bootinfo[MBI_SIZE]; + uint8_t *mb_bootinfo_data; + uint32_t cmdline_len; + GList *mods = NULL; + + /* Ok, let's see if it is a multiboot image. + The header is 12x32bit long, so the latest entry may be 8192 - 48. 
*/ + for (i = 0; i < (8192 - 48); i += 4) { + if (ldl_p(header+i) == 0x1BADB002) { + uint32_t checksum = ldl_p(header+i+8); + flags = ldl_p(header+i+4); + checksum += flags; + checksum += (uint32_t)0x1BADB002; + if (!checksum) { + is_multiboot = 1; + break; + } + } + } + + if (!is_multiboot) + return 0; /* no multiboot */ + + mb_debug("I believe we found a multiboot image!"); + memset(bootinfo, 0, sizeof(bootinfo)); + memset(&mbs, 0, sizeof(mbs)); + + if (flags & 0x00000004) { /* MULTIBOOT_HEADER_HAS_VBE */ + error_report("multiboot knows VBE. we don't"); + } + if (!(flags & 0x00010000)) { /* MULTIBOOT_HEADER_HAS_ADDR */ + uint64_t elf_entry; + uint64_t elf_low, elf_high; + int kernel_size; + fclose(f); + + if (((struct elf64_hdr*)header)->e_machine == EM_X86_64) { + error_report("Cannot load x86-64 image, give a 32bit one."); + exit(1); + } + + kernel_size = load_elf(kernel_filename, NULL, NULL, NULL, &elf_entry, + &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE, + 0, 0); + if (kernel_size < 0) { + error_report("Error while loading elf kernel"); + exit(1); + } + mh_load_addr = elf_low; + mb_kernel_size = elf_high - elf_low; + mh_entry_addr = elf_entry; + + mbs.mb_buf = g_malloc(mb_kernel_size); + if (rom_copy(mbs.mb_buf, mh_load_addr, mb_kernel_size) != mb_kernel_size) { + error_report("Error while fetching elf kernel from rom"); + exit(1); + } + + mb_debug("loading multiboot-elf kernel " + "(%#x bytes) with entry %#zx", + mb_kernel_size, (size_t)mh_entry_addr); + } else { + /* Valid if mh_flags sets MULTIBOOT_HEADER_HAS_ADDR. */ + uint32_t mh_header_addr = ldl_p(header+i+12); + uint32_t mh_load_end_addr = ldl_p(header+i+20); + uint32_t mh_bss_end_addr = ldl_p(header+i+24); + + mh_load_addr = ldl_p(header+i+16); + if (mh_header_addr < mh_load_addr) { + error_report("invalid load_addr address"); + exit(1); + } + if (mh_header_addr - mh_load_addr > i) { + error_report("invalid header_addr address"); + exit(1); + } + + uint32_t mb_kernel_text_offset = i - (mh_header_addr - mh_load_addr); + uint32_t mb_load_size = 0; + mh_entry_addr = ldl_p(header+i+28); + + if (mh_load_end_addr) { + if (mh_load_end_addr < mh_load_addr) { + error_report("invalid load_end_addr address"); + exit(1); + } + mb_load_size = mh_load_end_addr - mh_load_addr; + } else { + if (kernel_file_size < mb_kernel_text_offset) { + error_report("invalid kernel_file_size"); + exit(1); + } + mb_load_size = kernel_file_size - mb_kernel_text_offset; + } + if (mb_load_size > UINT32_MAX - mh_load_addr) { + error_report("kernel does not fit in address space"); + exit(1); + } + if (mh_bss_end_addr) { + if (mh_bss_end_addr < (mh_load_addr + mb_load_size)) { + error_report("invalid bss_end_addr address"); + exit(1); + } + mb_kernel_size = mh_bss_end_addr - mh_load_addr; + } else { + mb_kernel_size = mb_load_size; + } + + mb_debug("multiboot: header_addr = %#x", mh_header_addr); + mb_debug("multiboot: load_addr = %#x", mh_load_addr); + mb_debug("multiboot: load_end_addr = %#x", mh_load_end_addr); + mb_debug("multiboot: bss_end_addr = %#x", mh_bss_end_addr); + mb_debug("loading multiboot kernel (%#x bytes) at %#x", + mb_load_size, mh_load_addr); + + mbs.mb_buf = g_malloc(mb_kernel_size); + fseek(f, mb_kernel_text_offset, SEEK_SET); + if (fread(mbs.mb_buf, 1, mb_load_size, f) != mb_load_size) { + error_report("fread() failed"); + exit(1); + } + memset(mbs.mb_buf + mb_load_size, 0, mb_kernel_size - mb_load_size); + fclose(f); + } + + mbs.mb_buf_phys = mh_load_addr; + + mbs.mb_buf_size = TARGET_PAGE_ALIGN(mb_kernel_size); + mbs.offset_mbinfo = 
mbs.mb_buf_size; + + /* Calculate space for cmdlines, bootloader name, and mb_mods */ + cmdline_len = strlen(kernel_filename) + 1; + cmdline_len += strlen(kernel_cmdline) + 1; + if (initrd_filename) { + const char *r = initrd_filename; + cmdline_len += strlen(initrd_filename) + 1; + while (*r) { + char *value; + r = get_opt_value(r, &value); + mbs.mb_mods_avail++; + mods = g_list_append(mods, value); + if (*r) { + r++; + } + } + } + + mbs.mb_buf_size += cmdline_len; + mbs.mb_buf_size += MB_MOD_SIZE * mbs.mb_mods_avail; + mbs.mb_buf_size += strlen(bootloader_name) + 1; + + mbs.mb_buf_size = TARGET_PAGE_ALIGN(mbs.mb_buf_size); + + /* enlarge mb_buf to hold cmdlines, bootloader, mb-info structs */ + mbs.mb_buf = g_realloc(mbs.mb_buf, mbs.mb_buf_size); + mbs.offset_cmdlines = mbs.offset_mbinfo + mbs.mb_mods_avail * MB_MOD_SIZE; + mbs.offset_bootloader = mbs.offset_cmdlines + cmdline_len; + + if (mods) { + GList *tmpl = mods; + mbs.offset_mods = mbs.mb_buf_size; + + while (tmpl) { + char *next_space; + int mb_mod_length; + uint32_t offs = mbs.mb_buf_size; + char *one_file = tmpl->data; + + /* if a space comes after the module filename, treat everything + after that as parameters */ + hwaddr c = mb_add_cmdline(&mbs, one_file); + next_space = strchr(one_file, ' '); + if (next_space) { + *next_space = '\0'; + } + mb_debug("multiboot loading module: %s", one_file); + mb_mod_length = get_image_size(one_file); + if (mb_mod_length < 0) { + error_report("Failed to open file '%s'", one_file); + exit(1); + } + + mbs.mb_buf_size = TARGET_PAGE_ALIGN(mb_mod_length + mbs.mb_buf_size); + mbs.mb_buf = g_realloc(mbs.mb_buf, mbs.mb_buf_size); + + if (load_image_size(one_file, (unsigned char *)mbs.mb_buf + offs, + mbs.mb_buf_size - offs) < 0) { + error_report("Error loading file '%s'", one_file); + exit(1); + } + mb_add_mod(&mbs, mbs.mb_buf_phys + offs, + mbs.mb_buf_phys + offs + mb_mod_length, c); + + mb_debug("mod_start: %p\nmod_end: %p\n cmdline: "TARGET_FMT_plx, + (char *)mbs.mb_buf + offs, + (char *)mbs.mb_buf + offs + mb_mod_length, c); + g_free(one_file); + tmpl = tmpl->next; + } + g_list_free(mods); + } + + /* Commandline support */ + char kcmdline[strlen(kernel_filename) + strlen(kernel_cmdline) + 2]; + snprintf(kcmdline, sizeof(kcmdline), "%s %s", + kernel_filename, kernel_cmdline); + stl_p(bootinfo + MBI_CMDLINE, mb_add_cmdline(&mbs, kcmdline)); + + stl_p(bootinfo + MBI_BOOTLOADER, mb_add_bootloader(&mbs, bootloader_name)); + + stl_p(bootinfo + MBI_MODS_ADDR, mbs.mb_buf_phys + mbs.offset_mbinfo); + stl_p(bootinfo + MBI_MODS_COUNT, mbs.mb_mods_count); /* mods_count */ + + /* the kernel is where we want it to be now */ + stl_p(bootinfo + MBI_FLAGS, MULTIBOOT_FLAGS_MEMORY + | MULTIBOOT_FLAGS_BOOT_DEVICE + | MULTIBOOT_FLAGS_CMDLINE + | MULTIBOOT_FLAGS_MODULES + | MULTIBOOT_FLAGS_MMAP + | MULTIBOOT_FLAGS_BOOTLOADER); + stl_p(bootinfo + MBI_BOOT_DEVICE, 0x8000ffff); /* XXX: use the -boot switch? 
*/ + stl_p(bootinfo + MBI_MMAP_ADDR, ADDR_E820_MAP); + + mb_debug("multiboot: entry_addr = %#x", mh_entry_addr); + mb_debug(" mb_buf_phys = "TARGET_FMT_plx, mbs.mb_buf_phys); + mb_debug(" mod_start = "TARGET_FMT_plx, + mbs.mb_buf_phys + mbs.offset_mods); + mb_debug(" mb_mods_count = %d", mbs.mb_mods_count); + + /* save bootinfo off the stack */ + mb_bootinfo_data = g_memdup(bootinfo, sizeof(bootinfo)); + + /* Pass variables to option rom */ + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, mh_entry_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, mbs.mb_buf_size); + fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, + mbs.mb_buf, mbs.mb_buf_size); + + fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, ADDR_MBI); + fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, sizeof(bootinfo)); + fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, mb_bootinfo_data, + sizeof(bootinfo)); + + if (multiboot_dma_enabled) { + option_rom[nb_option_roms].name = "multiboot_dma.bin"; + } else { + option_rom[nb_option_roms].name = "multiboot.bin"; + } + option_rom[nb_option_roms].bootindex = 0; + nb_option_roms++; + + return 1; /* yes, we are multiboot */ +} diff --git a/hw/i386/multiboot.h b/hw/i386/multiboot.h new file mode 100644 index 000000000..2b9182a8e --- /dev/null +++ b/hw/i386/multiboot.h @@ -0,0 +1,16 @@ +#ifndef QEMU_MULTIBOOT_H +#define QEMU_MULTIBOOT_H + +#include "hw/nvram/fw_cfg.h" +#include "hw/i386/x86.h" + +int load_multiboot(X86MachineState *x86ms, + FWCfgState *fw_cfg, + FILE *f, + const char *kernel_filename, + const char *initrd_filename, + const char *kernel_cmdline, + int kernel_file_size, + uint8_t *header); + +#endif diff --git a/hw/i386/pc.c b/hw/i386/pc.c new file mode 100644 index 000000000..a2ef40ecb --- /dev/null +++ b/hw/i386/pc.c @@ -0,0 +1,1777 @@ +/* + * QEMU PC System Emulator + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "hw/i386/x86.h" +#include "hw/i386/pc.h" +#include "hw/char/serial.h" +#include "hw/char/parallel.h" +#include "hw/i386/apic.h" +#include "hw/i386/topology.h" +#include "hw/i386/fw_cfg.h" +#include "hw/i386/vmport.h" +#include "sysemu/cpus.h" +#include "hw/block/fdc.h" +#include "hw/ide.h" +#include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" +#include "hw/nvram/fw_cfg.h" +#include "hw/timer/hpet.h" +#include "hw/firmware/smbios.h" +#include "hw/loader.h" +#include "elf.h" +#include "migration/vmstate.h" +#include "multiboot.h" +#include "hw/rtc/mc146818rtc.h" +#include "hw/intc/i8259.h" +#include "hw/dma/i8257.h" +#include "hw/timer/i8254.h" +#include "hw/input/i8042.h" +#include "hw/irq.h" +#include "hw/audio/pcspk.h" +#include "hw/pci/msi.h" +#include "hw/sysbus.h" +#include "sysemu/sysemu.h" +#include "sysemu/tcg.h" +#include "sysemu/numa.h" +#include "sysemu/kvm.h" +#include "sysemu/xen.h" +#include "sysemu/reset.h" +#include "sysemu/runstate.h" +#include "kvm/kvm_i386.h" +#include "hw/xen/xen.h" +#include "hw/xen/start_info.h" +#include "ui/qemu-spice.h" +#include "exec/memory.h" +#include "qemu/bitmap.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "qemu/option.h" +#include "qemu/cutils.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/cpu_hotplug.h" +#include "acpi-build.h" +#include "hw/mem/pc-dimm.h" +#include "hw/mem/nvdimm.h" +#include "qapi/error.h" +#include "qapi/qapi-visit-common.h" +#include "qapi/visitor.h" +#include "hw/core/cpu.h" +#include "hw/usb.h" +#include "hw/i386/intel_iommu.h" +#include "hw/net/ne2000-isa.h" +#include "standard-headers/asm-x86/bootparam.h" +#include "hw/virtio/virtio-iommu.h" +#include "hw/virtio/virtio-pmem-pci.h" +#include "hw/virtio/virtio-mem-pci.h" +#include "hw/mem/memory-device.h" +#include "sysemu/replay.h" +#include "qapi/qmp/qerror.h" +#include "e820_memory_layout.h" +#include "fw_cfg.h" +#include "trace.h" +#include CONFIG_DEVICES + +GlobalProperty pc_compat_6_1[] = { + { TYPE_X86_CPU, "hv-version-id-build", "0x1bbc" }, + { TYPE_X86_CPU, "hv-version-id-major", "0x0006" }, + { TYPE_X86_CPU, "hv-version-id-minor", "0x0001" }, + { "ICH9-LPC", "x-keep-pci-slot-hpc", "false" }, +}; +const size_t pc_compat_6_1_len = G_N_ELEMENTS(pc_compat_6_1); + +GlobalProperty pc_compat_6_0[] = { + { "qemu64" "-" TYPE_X86_CPU, "family", "6" }, + { "qemu64" "-" TYPE_X86_CPU, "model", "6" }, + { "qemu64" "-" TYPE_X86_CPU, "stepping", "3" }, + { TYPE_X86_CPU, "x-vendor-cpuid-only", "off" }, + { "ICH9-LPC", ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, "off" }, + { "ICH9-LPC", "x-keep-pci-slot-hpc", "true" }, +}; +const size_t pc_compat_6_0_len = G_N_ELEMENTS(pc_compat_6_0); + +GlobalProperty pc_compat_5_2[] = { + { "ICH9-LPC", "x-smi-cpu-hotunplug", "off" }, +}; +const size_t pc_compat_5_2_len = G_N_ELEMENTS(pc_compat_5_2); + +GlobalProperty pc_compat_5_1[] = { + { "ICH9-LPC", "x-smi-cpu-hotplug", "off" }, + { TYPE_X86_CPU, "kvm-msi-ext-dest-id", "off" }, +}; +const size_t pc_compat_5_1_len = G_N_ELEMENTS(pc_compat_5_1); + +GlobalProperty pc_compat_5_0[] = { +}; +const size_t pc_compat_5_0_len = G_N_ELEMENTS(pc_compat_5_0); + +GlobalProperty pc_compat_4_2[] = { + { "mch", "smbase-smram", "off" }, +}; +const size_t pc_compat_4_2_len = G_N_ELEMENTS(pc_compat_4_2); + +GlobalProperty pc_compat_4_1[] = {}; +const size_t pc_compat_4_1_len = G_N_ELEMENTS(pc_compat_4_1); + +GlobalProperty pc_compat_4_0[] = {}; +const size_t pc_compat_4_0_len = G_N_ELEMENTS(pc_compat_4_0); + 
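The pc_compat_* tables above and below are plain data: each GlobalProperty entry pins one QOM property of the named device type to the value an older machine version used to expose. Nothing in this file applies them; that is left to the versioned machine types. As a rough sketch of the expected consumer (the callback name here is made up for illustration, and the real registrations live in the *_machine_options() functions of pc_piix.c/pc_q35.c, which also layer the generic hw_compat_* lists from hw/core/machine.c), a 6.0-era machine would register the 6.0 table roughly like this:

    /* Illustrative only -- assumes hw/boards.h for compat_props_add() and the
     * hw_compat_6_0/pc_compat_6_0 tables declared elsewhere. */
    static void pc_example_6_0_machine_options(MachineClass *m)
    {
        /* inherit the newer machine's options first (call not shown), then
         * pin the 6.0 compatibility behaviour on top of it */
        compat_props_add(m->compat_props, hw_compat_6_0, hw_compat_6_0_len);
        compat_props_add(m->compat_props, pc_compat_6_0, pc_compat_6_0_len);
    }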
+GlobalProperty pc_compat_3_1[] = { + { "intel-iommu", "dma-drain", "off" }, + { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "off" }, + { "Opteron_G4" "-" TYPE_X86_CPU, "rdtscp", "off" }, + { "Opteron_G4" "-" TYPE_X86_CPU, "npt", "off" }, + { "Opteron_G4" "-" TYPE_X86_CPU, "nrip-save", "off" }, + { "Opteron_G5" "-" TYPE_X86_CPU, "rdtscp", "off" }, + { "Opteron_G5" "-" TYPE_X86_CPU, "npt", "off" }, + { "Opteron_G5" "-" TYPE_X86_CPU, "nrip-save", "off" }, + { "EPYC" "-" TYPE_X86_CPU, "npt", "off" }, + { "EPYC" "-" TYPE_X86_CPU, "nrip-save", "off" }, + { "EPYC-IBPB" "-" TYPE_X86_CPU, "npt", "off" }, + { "EPYC-IBPB" "-" TYPE_X86_CPU, "nrip-save", "off" }, + { "Skylake-Client" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Skylake-Client-IBRS" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Skylake-Server" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Skylake-Server-IBRS" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Cascadelake-Server" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Icelake-Client" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Icelake-Server" "-" TYPE_X86_CPU, "mpx", "on" }, + { "Cascadelake-Server" "-" TYPE_X86_CPU, "stepping", "5" }, + { TYPE_X86_CPU, "x-intel-pt-auto-level", "off" }, +}; +const size_t pc_compat_3_1_len = G_N_ELEMENTS(pc_compat_3_1); + +GlobalProperty pc_compat_3_0[] = { + { TYPE_X86_CPU, "x-hv-synic-kvm-only", "on" }, + { "Skylake-Server" "-" TYPE_X86_CPU, "pku", "off" }, + { "Skylake-Server-IBRS" "-" TYPE_X86_CPU, "pku", "off" }, +}; +const size_t pc_compat_3_0_len = G_N_ELEMENTS(pc_compat_3_0); + +GlobalProperty pc_compat_2_12[] = { + { TYPE_X86_CPU, "legacy-cache", "on" }, + { TYPE_X86_CPU, "topoext", "off" }, + { "EPYC-" TYPE_X86_CPU, "xlevel", "0x8000000a" }, + { "EPYC-IBPB-" TYPE_X86_CPU, "xlevel", "0x8000000a" }, +}; +const size_t pc_compat_2_12_len = G_N_ELEMENTS(pc_compat_2_12); + +GlobalProperty pc_compat_2_11[] = { + { TYPE_X86_CPU, "x-migrate-smi-count", "off" }, + { "Skylake-Server" "-" TYPE_X86_CPU, "clflushopt", "off" }, +}; +const size_t pc_compat_2_11_len = G_N_ELEMENTS(pc_compat_2_11); + +GlobalProperty pc_compat_2_10[] = { + { TYPE_X86_CPU, "x-hv-max-vps", "0x40" }, + { "i440FX-pcihost", "x-pci-hole64-fix", "off" }, + { "q35-pcihost", "x-pci-hole64-fix", "off" }, +}; +const size_t pc_compat_2_10_len = G_N_ELEMENTS(pc_compat_2_10); + +GlobalProperty pc_compat_2_9[] = { + { "mch", "extended-tseg-mbytes", "0" }, +}; +const size_t pc_compat_2_9_len = G_N_ELEMENTS(pc_compat_2_9); + +GlobalProperty pc_compat_2_8[] = { + { TYPE_X86_CPU, "tcg-cpuid", "off" }, + { "kvmclock", "x-mach-use-reliable-get-clock", "off" }, + { "ICH9-LPC", "x-smi-broadcast", "off" }, + { TYPE_X86_CPU, "vmware-cpuid-freq", "off" }, + { "Haswell-" TYPE_X86_CPU, "stepping", "1" }, +}; +const size_t pc_compat_2_8_len = G_N_ELEMENTS(pc_compat_2_8); + +GlobalProperty pc_compat_2_7[] = { + { TYPE_X86_CPU, "l3-cache", "off" }, + { TYPE_X86_CPU, "full-cpuid-auto-level", "off" }, + { "Opteron_G3" "-" TYPE_X86_CPU, "family", "15" }, + { "Opteron_G3" "-" TYPE_X86_CPU, "model", "6" }, + { "Opteron_G3" "-" TYPE_X86_CPU, "stepping", "1" }, + { "isa-pcspk", "migrate", "off" }, +}; +const size_t pc_compat_2_7_len = G_N_ELEMENTS(pc_compat_2_7); + +GlobalProperty pc_compat_2_6[] = { + { TYPE_X86_CPU, "cpuid-0xb", "off" }, + { "vmxnet3", "romfile", "" }, + { TYPE_X86_CPU, "fill-mtrr-mask", "off" }, + { "apic-common", "legacy-instance-id", "on", } +}; +const size_t pc_compat_2_6_len = G_N_ELEMENTS(pc_compat_2_6); + +GlobalProperty pc_compat_2_5[] = {}; +const size_t pc_compat_2_5_len = G_N_ELEMENTS(pc_compat_2_5); + +GlobalProperty 
pc_compat_2_4[] = { + PC_CPU_MODEL_IDS("2.4.0") + { "Haswell-" TYPE_X86_CPU, "abm", "off" }, + { "Haswell-noTSX-" TYPE_X86_CPU, "abm", "off" }, + { "Broadwell-" TYPE_X86_CPU, "abm", "off" }, + { "Broadwell-noTSX-" TYPE_X86_CPU, "abm", "off" }, + { "host" "-" TYPE_X86_CPU, "host-cache-info", "on" }, + { TYPE_X86_CPU, "check", "off" }, + { "qemu64" "-" TYPE_X86_CPU, "sse4a", "on" }, + { "qemu64" "-" TYPE_X86_CPU, "abm", "on" }, + { "qemu64" "-" TYPE_X86_CPU, "popcnt", "on" }, + { "qemu32" "-" TYPE_X86_CPU, "popcnt", "on" }, + { "Opteron_G2" "-" TYPE_X86_CPU, "rdtscp", "on" }, + { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "on" }, + { "Opteron_G4" "-" TYPE_X86_CPU, "rdtscp", "on" }, + { "Opteron_G5" "-" TYPE_X86_CPU, "rdtscp", "on", } +}; +const size_t pc_compat_2_4_len = G_N_ELEMENTS(pc_compat_2_4); + +GlobalProperty pc_compat_2_3[] = { + PC_CPU_MODEL_IDS("2.3.0") + { TYPE_X86_CPU, "arat", "off" }, + { "qemu64" "-" TYPE_X86_CPU, "min-level", "4" }, + { "kvm64" "-" TYPE_X86_CPU, "min-level", "5" }, + { "pentium3" "-" TYPE_X86_CPU, "min-level", "2" }, + { "n270" "-" TYPE_X86_CPU, "min-level", "5" }, + { "Conroe" "-" TYPE_X86_CPU, "min-level", "4" }, + { "Penryn" "-" TYPE_X86_CPU, "min-level", "4" }, + { "Nehalem" "-" TYPE_X86_CPU, "min-level", "4" }, + { "n270" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Penryn" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Conroe" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Nehalem" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Westmere" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "SandyBridge" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "IvyBridge" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Haswell" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Haswell-noTSX" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Broadwell" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { "Broadwell-noTSX" "-" TYPE_X86_CPU, "min-xlevel", "0x8000000a" }, + { TYPE_X86_CPU, "kvm-no-smi-migration", "on" }, +}; +const size_t pc_compat_2_3_len = G_N_ELEMENTS(pc_compat_2_3); + +GlobalProperty pc_compat_2_2[] = { + PC_CPU_MODEL_IDS("2.2.0") + { "kvm64" "-" TYPE_X86_CPU, "vme", "off" }, + { "kvm32" "-" TYPE_X86_CPU, "vme", "off" }, + { "Conroe" "-" TYPE_X86_CPU, "vme", "off" }, + { "Penryn" "-" TYPE_X86_CPU, "vme", "off" }, + { "Nehalem" "-" TYPE_X86_CPU, "vme", "off" }, + { "Westmere" "-" TYPE_X86_CPU, "vme", "off" }, + { "SandyBridge" "-" TYPE_X86_CPU, "vme", "off" }, + { "Haswell" "-" TYPE_X86_CPU, "vme", "off" }, + { "Broadwell" "-" TYPE_X86_CPU, "vme", "off" }, + { "Opteron_G1" "-" TYPE_X86_CPU, "vme", "off" }, + { "Opteron_G2" "-" TYPE_X86_CPU, "vme", "off" }, + { "Opteron_G3" "-" TYPE_X86_CPU, "vme", "off" }, + { "Opteron_G4" "-" TYPE_X86_CPU, "vme", "off" }, + { "Opteron_G5" "-" TYPE_X86_CPU, "vme", "off" }, + { "Haswell" "-" TYPE_X86_CPU, "f16c", "off" }, + { "Haswell" "-" TYPE_X86_CPU, "rdrand", "off" }, + { "Broadwell" "-" TYPE_X86_CPU, "f16c", "off" }, + { "Broadwell" "-" TYPE_X86_CPU, "rdrand", "off" }, +}; +const size_t pc_compat_2_2_len = G_N_ELEMENTS(pc_compat_2_2); + +GlobalProperty pc_compat_2_1[] = { + PC_CPU_MODEL_IDS("2.1.0") + { "coreduo" "-" TYPE_X86_CPU, "vmx", "on" }, + { "core2duo" "-" TYPE_X86_CPU, "vmx", "on" }, +}; +const size_t pc_compat_2_1_len = G_N_ELEMENTS(pc_compat_2_1); + +GlobalProperty pc_compat_2_0[] = { + PC_CPU_MODEL_IDS("2.0.0") + { "virtio-scsi-pci", "any_layout", "off" }, + { "PIIX4_PM", "memory-hotplug-support", "off" }, + { "apic", "version", "0x11" }, + { "nec-usb-xhci", 
"superspeed-ports-first", "off" }, + { "nec-usb-xhci", "force-pcie-endcap", "on" }, + { "pci-serial", "prog_if", "0" }, + { "pci-serial-2x", "prog_if", "0" }, + { "pci-serial-4x", "prog_if", "0" }, + { "virtio-net-pci", "guest_announce", "off" }, + { "ICH9-LPC", "memory-hotplug-support", "off" }, + { "xio3130-downstream", COMPAT_PROP_PCP, "off" }, + { "ioh3420", COMPAT_PROP_PCP, "off" }, +}; +const size_t pc_compat_2_0_len = G_N_ELEMENTS(pc_compat_2_0); + +GlobalProperty pc_compat_1_7[] = { + PC_CPU_MODEL_IDS("1.7.0") + { TYPE_USB_DEVICE, "msos-desc", "no" }, + { "PIIX4_PM", ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, "off" }, + { "hpet", HPET_INTCAP, "4" }, +}; +const size_t pc_compat_1_7_len = G_N_ELEMENTS(pc_compat_1_7); + +GlobalProperty pc_compat_1_6[] = { + PC_CPU_MODEL_IDS("1.6.0") + { "e1000", "mitigation", "off" }, + { "qemu64-" TYPE_X86_CPU, "model", "2" }, + { "qemu32-" TYPE_X86_CPU, "model", "3" }, + { "i440FX-pcihost", "short_root_bus", "1" }, + { "q35-pcihost", "short_root_bus", "1" }, +}; +const size_t pc_compat_1_6_len = G_N_ELEMENTS(pc_compat_1_6); + +GlobalProperty pc_compat_1_5[] = { + PC_CPU_MODEL_IDS("1.5.0") + { "Conroe-" TYPE_X86_CPU, "model", "2" }, + { "Conroe-" TYPE_X86_CPU, "min-level", "2" }, + { "Penryn-" TYPE_X86_CPU, "model", "2" }, + { "Penryn-" TYPE_X86_CPU, "min-level", "2" }, + { "Nehalem-" TYPE_X86_CPU, "model", "2" }, + { "Nehalem-" TYPE_X86_CPU, "min-level", "2" }, + { "virtio-net-pci", "any_layout", "off" }, + { TYPE_X86_CPU, "pmu", "on" }, + { "i440FX-pcihost", "short_root_bus", "0" }, + { "q35-pcihost", "short_root_bus", "0" }, +}; +const size_t pc_compat_1_5_len = G_N_ELEMENTS(pc_compat_1_5); + +GlobalProperty pc_compat_1_4[] = { + PC_CPU_MODEL_IDS("1.4.0") + { "scsi-hd", "discard_granularity", "0" }, + { "scsi-cd", "discard_granularity", "0" }, + { "ide-hd", "discard_granularity", "0" }, + { "ide-cd", "discard_granularity", "0" }, + { "virtio-blk-pci", "discard_granularity", "0" }, + /* DEV_NVECTORS_UNSPECIFIED as a uint32_t string: */ + { "virtio-serial-pci", "vectors", "0xFFFFFFFF" }, + { "virtio-net-pci", "ctrl_guest_offloads", "off" }, + { "e1000", "romfile", "pxe-e1000.rom" }, + { "ne2k_pci", "romfile", "pxe-ne2k_pci.rom" }, + { "pcnet", "romfile", "pxe-pcnet.rom" }, + { "rtl8139", "romfile", "pxe-rtl8139.rom" }, + { "virtio-net-pci", "romfile", "pxe-virtio.rom" }, + { "486-" TYPE_X86_CPU, "model", "0" }, + { "n270" "-" TYPE_X86_CPU, "movbe", "off" }, + { "Westmere" "-" TYPE_X86_CPU, "pclmulqdq", "off" }, +}; +const size_t pc_compat_1_4_len = G_N_ELEMENTS(pc_compat_1_4); + +GSIState *pc_gsi_create(qemu_irq **irqs, bool pci_enabled) +{ + GSIState *s; + + s = g_new0(GSIState, 1); + if (kvm_ioapic_in_kernel()) { + kvm_pc_setup_irq_routing(pci_enabled); + } + *irqs = qemu_allocate_irqs(gsi_handler, s, GSI_NUM_PINS); + + return s; +} + +static void ioport80_write(void *opaque, hwaddr addr, uint64_t data, + unsigned size) +{ +} + +static uint64_t ioport80_read(void *opaque, hwaddr addr, unsigned size) +{ + return 0xffffffffffffffffULL; +} + +/* MSDOS compatibility mode FPU exception support */ +static void ioportF0_write(void *opaque, hwaddr addr, uint64_t data, + unsigned size) +{ + if (tcg_enabled()) { + cpu_set_ignne(); + } +} + +static uint64_t ioportF0_read(void *opaque, hwaddr addr, unsigned size) +{ + return 0xffffffffffffffffULL; +} + +/* PC cmos mappings */ + +#define REG_EQUIPMENT_BYTE 0x14 + +static void cmos_init_hd(ISADevice *s, int type_ofs, int info_ofs, + int16_t cylinders, int8_t heads, int8_t sectors) +{ + rtc_set_memory(s, type_ofs, 47); + 
rtc_set_memory(s, info_ofs, cylinders); + rtc_set_memory(s, info_ofs + 1, cylinders >> 8); + rtc_set_memory(s, info_ofs + 2, heads); + rtc_set_memory(s, info_ofs + 3, 0xff); + rtc_set_memory(s, info_ofs + 4, 0xff); + rtc_set_memory(s, info_ofs + 5, 0xc0 | ((heads > 8) << 3)); + rtc_set_memory(s, info_ofs + 6, cylinders); + rtc_set_memory(s, info_ofs + 7, cylinders >> 8); + rtc_set_memory(s, info_ofs + 8, sectors); +} + +/* convert boot_device letter to something recognizable by the bios */ +static int boot_device2nibble(char boot_device) +{ + switch(boot_device) { + case 'a': + case 'b': + return 0x01; /* floppy boot */ + case 'c': + return 0x02; /* hard drive boot */ + case 'd': + return 0x03; /* CD-ROM boot */ + case 'n': + return 0x04; /* Network boot */ + } + return 0; +} + +static void set_boot_dev(ISADevice *s, const char *boot_device, Error **errp) +{ +#define PC_MAX_BOOT_DEVICES 3 + int nbds, bds[3] = { 0, }; + int i; + + nbds = strlen(boot_device); + if (nbds > PC_MAX_BOOT_DEVICES) { + error_setg(errp, "Too many boot devices for PC"); + return; + } + for (i = 0; i < nbds; i++) { + bds[i] = boot_device2nibble(boot_device[i]); + if (bds[i] == 0) { + error_setg(errp, "Invalid boot device for PC: '%c'", + boot_device[i]); + return; + } + } + rtc_set_memory(s, 0x3d, (bds[1] << 4) | bds[0]); + rtc_set_memory(s, 0x38, (bds[2] << 4) | (fd_bootchk ? 0x0 : 0x1)); +} + +static void pc_boot_set(void *opaque, const char *boot_device, Error **errp) +{ + set_boot_dev(opaque, boot_device, errp); +} + +static void pc_cmos_init_floppy(ISADevice *rtc_state, ISADevice *floppy) +{ + int val, nb, i; + FloppyDriveType fd_type[2] = { FLOPPY_DRIVE_TYPE_NONE, + FLOPPY_DRIVE_TYPE_NONE }; + + /* floppy type */ + if (floppy) { + for (i = 0; i < 2; i++) { + fd_type[i] = isa_fdc_get_drive_type(floppy, i); + } + } + val = (cmos_get_fd_drive_type(fd_type[0]) << 4) | + cmos_get_fd_drive_type(fd_type[1]); + rtc_set_memory(rtc_state, 0x10, val); + + val = rtc_get_memory(rtc_state, REG_EQUIPMENT_BYTE); + nb = 0; + if (fd_type[0] != FLOPPY_DRIVE_TYPE_NONE) { + nb++; + } + if (fd_type[1] != FLOPPY_DRIVE_TYPE_NONE) { + nb++; + } + switch (nb) { + case 0: + break; + case 1: + val |= 0x01; /* 1 drive, ready for boot */ + break; + case 2: + val |= 0x41; /* 2 drives, ready for boot */ + break; + } + rtc_set_memory(rtc_state, REG_EQUIPMENT_BYTE, val); +} + +typedef struct pc_cmos_init_late_arg { + ISADevice *rtc_state; + BusState *idebus[2]; +} pc_cmos_init_late_arg; + +typedef struct check_fdc_state { + ISADevice *floppy; + bool multiple; +} CheckFdcState; + +static int check_fdc(Object *obj, void *opaque) +{ + CheckFdcState *state = opaque; + Object *fdc; + uint32_t iobase; + Error *local_err = NULL; + + fdc = object_dynamic_cast(obj, TYPE_ISA_FDC); + if (!fdc) { + return 0; + } + + iobase = object_property_get_uint(obj, "iobase", &local_err); + if (local_err || iobase != 0x3f0) { + error_free(local_err); + return 0; + } + + if (state->floppy) { + state->multiple = true; + } else { + state->floppy = ISA_DEVICE(obj); + } + return 0; +} + +static const char * const fdc_container_path[] = { + "/unattached", "/peripheral", "/peripheral-anon" +}; + +/* + * Locate the FDC at IO address 0x3f0, in order to configure the CMOS registers + * and ACPI objects. 
+ */ +ISADevice *pc_find_fdc0(void) +{ + int i; + Object *container; + CheckFdcState state = { 0 }; + + for (i = 0; i < ARRAY_SIZE(fdc_container_path); i++) { + container = container_get(qdev_get_machine(), fdc_container_path[i]); + object_child_foreach(container, check_fdc, &state); + } + + if (state.multiple) { + warn_report("multiple floppy disk controllers with " + "iobase=0x3f0 have been found"); + error_printf("the one being picked for CMOS setup might not reflect " + "your intent"); + } + + return state.floppy; +} + +static void pc_cmos_init_late(void *opaque) +{ + pc_cmos_init_late_arg *arg = opaque; + ISADevice *s = arg->rtc_state; + int16_t cylinders; + int8_t heads, sectors; + int val; + int i, trans; + + val = 0; + if (arg->idebus[0] && ide_get_geometry(arg->idebus[0], 0, + &cylinders, &heads, &sectors) >= 0) { + cmos_init_hd(s, 0x19, 0x1b, cylinders, heads, sectors); + val |= 0xf0; + } + if (arg->idebus[0] && ide_get_geometry(arg->idebus[0], 1, + &cylinders, &heads, &sectors) >= 0) { + cmos_init_hd(s, 0x1a, 0x24, cylinders, heads, sectors); + val |= 0x0f; + } + rtc_set_memory(s, 0x12, val); + + val = 0; + for (i = 0; i < 4; i++) { + /* NOTE: ide_get_geometry() returns the physical + geometry. It is always such that: 1 <= sects <= 63, 1 + <= heads <= 16, 1 <= cylinders <= 16383. The BIOS + geometry can be different if a translation is done. */ + if (arg->idebus[i / 2] && + ide_get_geometry(arg->idebus[i / 2], i % 2, + &cylinders, &heads, &sectors) >= 0) { + trans = ide_get_bios_chs_trans(arg->idebus[i / 2], i % 2) - 1; + assert((trans & ~3) == 0); + val |= trans << (i * 2); + } + } + rtc_set_memory(s, 0x39, val); + + pc_cmos_init_floppy(s, pc_find_fdc0()); + + qemu_unregister_reset(pc_cmos_init_late, opaque); +} + +void pc_cmos_init(PCMachineState *pcms, + BusState *idebus0, BusState *idebus1, + ISADevice *s) +{ + int val; + static pc_cmos_init_late_arg arg; + X86MachineState *x86ms = X86_MACHINE(pcms); + + /* various important CMOS locations needed by PC/Bochs bios */ + + /* memory size */ + /* base memory (first MiB) */ + val = MIN(x86ms->below_4g_mem_size / KiB, 640); + rtc_set_memory(s, 0x15, val); + rtc_set_memory(s, 0x16, val >> 8); + /* extended memory (next 64MiB) */ + if (x86ms->below_4g_mem_size > 1 * MiB) { + val = (x86ms->below_4g_mem_size - 1 * MiB) / KiB; + } else { + val = 0; + } + if (val > 65535) + val = 65535; + rtc_set_memory(s, 0x17, val); + rtc_set_memory(s, 0x18, val >> 8); + rtc_set_memory(s, 0x30, val); + rtc_set_memory(s, 0x31, val >> 8); + /* memory between 16MiB and 4GiB */ + if (x86ms->below_4g_mem_size > 16 * MiB) { + val = (x86ms->below_4g_mem_size - 16 * MiB) / (64 * KiB); + } else { + val = 0; + } + if (val > 65535) + val = 65535; + rtc_set_memory(s, 0x34, val); + rtc_set_memory(s, 0x35, val >> 8); + /* memory above 4GiB */ + val = x86ms->above_4g_mem_size / 65536; + rtc_set_memory(s, 0x5b, val); + rtc_set_memory(s, 0x5c, val >> 8); + rtc_set_memory(s, 0x5d, val >> 16); + + object_property_add_link(OBJECT(pcms), "rtc_state", + TYPE_ISA_DEVICE, + (Object **)&x86ms->rtc, + object_property_allow_set_link, + OBJ_PROP_LINK_STRONG); + object_property_set_link(OBJECT(pcms), "rtc_state", OBJECT(s), + &error_abort); + + set_boot_dev(s, MACHINE(pcms)->boot_order, &error_fatal); + + val = 0; + val |= 0x02; /* FPU is there */ + val |= 0x04; /* PS/2 mouse installed */ + rtc_set_memory(s, REG_EQUIPMENT_BYTE, val); + + /* hard drives and FDC */ + arg.rtc_state = s; + arg.idebus[0] = idebus0; + arg.idebus[1] = idebus1; + qemu_register_reset(pc_cmos_init_late, &arg); +} + 
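As a worked example of the CMOS layout pc_cmos_init() writes: with 3 GiB of RAM below 4G and 2 GiB above it, offsets 0x15/0x16 read back 640 (base memory in KiB), 0x17/0x18 and 0x30/0x31 saturate at 0xffff (extended memory in KiB, capped at 65535), 0x34/0x35 hold (3 GiB - 16 MiB) / 64 KiB = 0xbf00, and 0x5b..0x5d hold 2 GiB / 64 KiB = 0x008000. The helper below is only an illustrative read-back sketch, not part of this patch; it reuses rtc_get_memory() (already used in this file) to reassemble the above-4G size the way firmware would:

    /* Illustrative only: decode the above-4G RAM size from CMOS 0x5b..0x5d,
     * which pc_cmos_init() stores in 64 KiB chunks, least significant first. */
    static uint64_t cmos_read_mem_above_4g(ISADevice *rtc)
    {
        uint64_t chunks = rtc_get_memory(rtc, 0x5b)
                          | (rtc_get_memory(rtc, 0x5c) << 8)
                          | ((uint64_t)rtc_get_memory(rtc, 0x5d) << 16);
        return chunks * 64 * KiB;   /* e.g. 0x008000 chunks -> 2 GiB */
    }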
+static void handle_a20_line_change(void *opaque, int irq, int level) +{ + X86CPU *cpu = opaque; + + /* XXX: send to all CPUs ? */ + /* XXX: add logic to handle multiple A20 line sources */ + x86_cpu_set_a20(cpu, level); +} + +#define NE2000_NB_MAX 6 + +static const int ne2000_io[NE2000_NB_MAX] = { 0x300, 0x320, 0x340, 0x360, + 0x280, 0x380 }; +static const int ne2000_irq[NE2000_NB_MAX] = { 9, 10, 11, 3, 4, 5 }; + +void pc_init_ne2k_isa(ISABus *bus, NICInfo *nd) +{ + static int nb_ne2k = 0; + + if (nb_ne2k == NE2000_NB_MAX) + return; + isa_ne2000_init(bus, ne2000_io[nb_ne2k], + ne2000_irq[nb_ne2k], nd); + nb_ne2k++; +} + +void pc_acpi_smi_interrupt(void *opaque, int irq, int level) +{ + X86CPU *cpu = opaque; + + if (level) { + cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI); + } +} + +static +void pc_machine_done(Notifier *notifier, void *data) +{ + PCMachineState *pcms = container_of(notifier, + PCMachineState, machine_done); + X86MachineState *x86ms = X86_MACHINE(pcms); + + /* set the number of CPUs */ + x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus); + + fw_cfg_add_extra_pci_roots(pcms->bus, x86ms->fw_cfg); + + acpi_setup(); + if (x86ms->fw_cfg) { + fw_cfg_build_smbios(MACHINE(pcms), x86ms->fw_cfg); + fw_cfg_build_feature_control(MACHINE(pcms), x86ms->fw_cfg); + /* update FW_CFG_NB_CPUS to account for -device added CPUs */ + fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus); + } + + + if (x86ms->apic_id_limit > 255 && !xen_enabled() && + !kvm_irqchip_in_kernel()) { + error_report("current -smp configuration requires kernel " + "irqchip support."); + exit(EXIT_FAILURE); + } +} + +void pc_guest_info_init(PCMachineState *pcms) +{ + X86MachineState *x86ms = X86_MACHINE(pcms); + + x86ms->apic_xrupt_override = true; + pcms->machine_done.notify = pc_machine_done; + qemu_add_machine_init_done_notifier(&pcms->machine_done); +} + +/* setup pci memory address space mapping into system address space */ +void pc_pci_as_mapping_init(Object *owner, MemoryRegion *system_memory, + MemoryRegion *pci_address_space) +{ + /* Set to lower priority than RAM */ + memory_region_add_subregion_overlap(system_memory, 0x0, + pci_address_space, -1); +} + +void xen_load_linux(PCMachineState *pcms) +{ + int i; + FWCfgState *fw_cfg; + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(pcms); + + assert(MACHINE(pcms)->kernel_filename != NULL); + + fw_cfg = fw_cfg_init_io(FW_CFG_IO_BASE); + fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus); + rom_set_fw(fw_cfg); + + x86_load_linux(x86ms, fw_cfg, pcmc->acpi_data_size, + pcmc->pvh_enabled); + for (i = 0; i < nb_option_roms; i++) { + assert(!strcmp(option_rom[i].name, "linuxboot.bin") || + !strcmp(option_rom[i].name, "linuxboot_dma.bin") || + !strcmp(option_rom[i].name, "pvh.bin") || + !strcmp(option_rom[i].name, "multiboot.bin") || + !strcmp(option_rom[i].name, "multiboot_dma.bin")); + rom_add_option(option_rom[i].name, option_rom[i].bootindex); + } + x86ms->fw_cfg = fw_cfg; +} + +#define PC_ROM_MIN_VGA 0xc0000 +#define PC_ROM_MIN_OPTION 0xc8000 +#define PC_ROM_MAX 0xe0000 +#define PC_ROM_ALIGN 0x800 +#define PC_ROM_SIZE (PC_ROM_MAX - PC_ROM_MIN_VGA) + +void pc_memory_init(PCMachineState *pcms, + MemoryRegion *system_memory, + MemoryRegion *rom_memory, + MemoryRegion **ram_memory) +{ + int linux_boot, i; + MemoryRegion *option_rom_mr; + MemoryRegion *ram_below_4g, *ram_above_4g; + FWCfgState *fw_cfg; + MachineState *machine = MACHINE(pcms); + MachineClass *mc = MACHINE_GET_CLASS(machine); + PCMachineClass 
*pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(pcms); + + assert(machine->ram_size == x86ms->below_4g_mem_size + + x86ms->above_4g_mem_size); + + linux_boot = (machine->kernel_filename != NULL); + + /* + * Split single memory region and use aliases to address portions of it, + * done for backwards compatibility with older qemus. + */ + *ram_memory = machine->ram; + ram_below_4g = g_malloc(sizeof(*ram_below_4g)); + memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram, + 0, x86ms->below_4g_mem_size); + memory_region_add_subregion(system_memory, 0, ram_below_4g); + e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM); + if (x86ms->above_4g_mem_size > 0) { + ram_above_4g = g_malloc(sizeof(*ram_above_4g)); + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", + machine->ram, + x86ms->below_4g_mem_size, + x86ms->above_4g_mem_size); + memory_region_add_subregion(system_memory, 0x100000000ULL, + ram_above_4g); + e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM); + } + + if (pcms->sgx_epc.size != 0) { + e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED); + } + + if (!pcmc->has_reserved_memory && + (machine->ram_slots || + (machine->maxram_size > machine->ram_size))) { + + error_report("\"-memory 'slots|maxmem'\" is not supported by: %s", + mc->name); + exit(EXIT_FAILURE); + } + + /* always allocate the device memory information */ + machine->device_memory = g_malloc0(sizeof(*machine->device_memory)); + + /* initialize device memory address space */ + if (pcmc->has_reserved_memory && + (machine->ram_size < machine->maxram_size)) { + ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size; + + if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) { + error_report("unsupported amount of memory slots: %"PRIu64, + machine->ram_slots); + exit(EXIT_FAILURE); + } + + if (QEMU_ALIGN_UP(machine->maxram_size, + TARGET_PAGE_SIZE) != machine->maxram_size) { + error_report("maximum memory size must be aligned to a multiple of " + "%d bytes", TARGET_PAGE_SIZE); + exit(EXIT_FAILURE); + } + + if (pcms->sgx_epc.size != 0) { + machine->device_memory->base = sgx_epc_above_4g_end(&pcms->sgx_epc); + } else { + machine->device_memory->base = + 0x100000000ULL + x86ms->above_4g_mem_size; + } + + machine->device_memory->base = + ROUND_UP(machine->device_memory->base, 1 * GiB); + + if (pcmc->enforce_aligned_dimm) { + /* size device region assuming 1G page max alignment per slot */ + device_mem_size += (1 * GiB) * machine->ram_slots; + } + + if ((machine->device_memory->base + device_mem_size) < + device_mem_size) { + error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT, + machine->maxram_size); + exit(EXIT_FAILURE); + } + + memory_region_init(&machine->device_memory->mr, OBJECT(pcms), + "device-memory", device_mem_size); + memory_region_add_subregion(system_memory, machine->device_memory->base, + &machine->device_memory->mr); + } + + /* Initialize PC system firmware */ + pc_system_firmware_init(pcms, rom_memory); + + option_rom_mr = g_malloc(sizeof(*option_rom_mr)); + memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE, + &error_fatal); + if (pcmc->pci_enabled) { + memory_region_set_readonly(option_rom_mr, true); + } + memory_region_add_subregion_overlap(rom_memory, + PC_ROM_MIN_VGA, + option_rom_mr, + 1); + + fw_cfg = fw_cfg_arch_create(machine, + x86ms->boot_cpus, x86ms->apic_id_limit); + + rom_set_fw(fw_cfg); + + if (pcmc->has_reserved_memory && machine->device_memory->base) { + uint64_t *val = 
g_malloc(sizeof(*val)); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + uint64_t res_mem_end = machine->device_memory->base; + + if (!pcmc->broken_reserved_end) { + res_mem_end += memory_region_size(&machine->device_memory->mr); + } + *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); + fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val)); + } + + if (linux_boot) { + x86_load_linux(x86ms, fw_cfg, pcmc->acpi_data_size, + pcmc->pvh_enabled); + } + + for (i = 0; i < nb_option_roms; i++) { + rom_add_option(option_rom[i].name, option_rom[i].bootindex); + } + x86ms->fw_cfg = fw_cfg; + + /* Init default IOAPIC address space */ + x86ms->ioapic_as = &address_space_memory; + + /* Init ACPI memory hotplug IO base address */ + pcms->memhp_io_base = ACPI_MEMORY_HOTPLUG_BASE; +} + +/* + * The 64bit pci hole starts after "above 4G RAM" and + * potentially the space reserved for memory hotplug. + */ +uint64_t pc_pci_hole64_start(void) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + MachineState *ms = MACHINE(pcms); + X86MachineState *x86ms = X86_MACHINE(pcms); + uint64_t hole64_start = 0; + + if (pcmc->has_reserved_memory && ms->device_memory->base) { + hole64_start = ms->device_memory->base; + if (!pcmc->broken_reserved_end) { + hole64_start += memory_region_size(&ms->device_memory->mr); + } + } else if (pcms->sgx_epc.size != 0) { + hole64_start = sgx_epc_above_4g_end(&pcms->sgx_epc); + } else { + hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size; + } + + return ROUND_UP(hole64_start, 1 * GiB); +} + +DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus) +{ + DeviceState *dev = NULL; + + rom_set_order_override(FW_CFG_ORDER_OVERRIDE_VGA); + if (pci_bus) { + PCIDevice *pcidev = pci_vga_init(pci_bus); + dev = pcidev ? &pcidev->qdev : NULL; + } else if (isa_bus) { + ISADevice *isadev = isa_vga_init(isa_bus); + dev = isadev ? 
DEVICE(isadev) : NULL; + } + rom_reset_order_override(); + return dev; +} + +static const MemoryRegionOps ioport80_io_ops = { + .write = ioport80_write, + .read = ioport80_read, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + .min_access_size = 1, + .max_access_size = 1, + }, +}; + +static const MemoryRegionOps ioportF0_io_ops = { + .write = ioportF0_write, + .read = ioportF0_read, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + .min_access_size = 1, + .max_access_size = 1, + }, +}; + +static void pc_superio_init(ISABus *isa_bus, bool create_fdctrl, bool no_vmport) +{ + int i; + DriveInfo *fd[MAX_FD]; + qemu_irq *a20_line; + ISADevice *fdc, *i8042, *port92, *vmmouse; + + serial_hds_isa_init(isa_bus, 0, MAX_ISA_SERIAL_PORTS); + parallel_hds_isa_init(isa_bus, MAX_PARALLEL_PORTS); + + for (i = 0; i < MAX_FD; i++) { + fd[i] = drive_get(IF_FLOPPY, 0, i); + create_fdctrl |= !!fd[i]; + } + if (create_fdctrl) { + fdc = isa_new(TYPE_ISA_FDC); + if (fdc) { + isa_realize_and_unref(fdc, isa_bus, &error_fatal); + isa_fdc_init_drives(fdc, fd); + } + } + + i8042 = isa_create_simple(isa_bus, "i8042"); + if (!no_vmport) { + isa_create_simple(isa_bus, TYPE_VMPORT); + vmmouse = isa_try_new("vmmouse"); + } else { + vmmouse = NULL; + } + if (vmmouse) { + object_property_set_link(OBJECT(vmmouse), "i8042", OBJECT(i8042), + &error_abort); + isa_realize_and_unref(vmmouse, isa_bus, &error_fatal); + } + port92 = isa_create_simple(isa_bus, TYPE_PORT92); + + a20_line = qemu_allocate_irqs(handle_a20_line_change, first_cpu, 2); + i8042_setup_a20_line(i8042, a20_line[0]); + qdev_connect_gpio_out_named(DEVICE(port92), + PORT92_A20_LINE, 0, a20_line[1]); + g_free(a20_line); +} + +void pc_basic_device_init(struct PCMachineState *pcms, + ISABus *isa_bus, qemu_irq *gsi, + ISADevice **rtc_state, + bool create_fdctrl, + uint32_t hpet_irqs) +{ + int i; + DeviceState *hpet = NULL; + int pit_isa_irq = 0; + qemu_irq pit_alt_irq = NULL; + qemu_irq rtc_irq = NULL; + ISADevice *pit = NULL; + MemoryRegion *ioport80_io = g_new(MemoryRegion, 1); + MemoryRegion *ioportF0_io = g_new(MemoryRegion, 1); + + memory_region_init_io(ioport80_io, NULL, &ioport80_io_ops, NULL, "ioport80", 1); + memory_region_add_subregion(isa_bus->address_space_io, 0x80, ioport80_io); + + memory_region_init_io(ioportF0_io, NULL, &ioportF0_io_ops, NULL, "ioportF0", 1); + memory_region_add_subregion(isa_bus->address_space_io, 0xf0, ioportF0_io); + + /* + * Check if an HPET shall be created. + * + * Without KVM_CAP_PIT_STATE2, we cannot switch off the in-kernel PIT + * when the HPET wants to take over. Thus we have to disable the latter. + */ + if (pcms->hpet_enabled && (!kvm_irqchip_in_kernel() || + kvm_has_pit_state2())) { + hpet = qdev_try_new(TYPE_HPET); + if (!hpet) { + error_report("couldn't create HPET device"); + exit(1); + } + /* + * For pc-piix-*, hpet's intcap is always IRQ2. For pc-q35-1.7 and + * earlier, use IRQ2 for compat. Otherwise, use IRQ16~23, IRQ8 and + * IRQ2. 
+ */ + uint8_t compat = object_property_get_uint(OBJECT(hpet), + HPET_INTCAP, NULL); + if (!compat) { + qdev_prop_set_uint32(hpet, HPET_INTCAP, hpet_irqs); + } + sysbus_realize_and_unref(SYS_BUS_DEVICE(hpet), &error_fatal); + sysbus_mmio_map(SYS_BUS_DEVICE(hpet), 0, HPET_BASE); + + for (i = 0; i < GSI_NUM_PINS; i++) { + sysbus_connect_irq(SYS_BUS_DEVICE(hpet), i, gsi[i]); + } + pit_isa_irq = -1; + pit_alt_irq = qdev_get_gpio_in(hpet, HPET_LEGACY_PIT_INT); + rtc_irq = qdev_get_gpio_in(hpet, HPET_LEGACY_RTC_INT); + } + *rtc_state = mc146818_rtc_init(isa_bus, 2000, rtc_irq); + + qemu_register_boot_set(pc_boot_set, *rtc_state); + + if (!xen_enabled() && pcms->pit_enabled) { + if (kvm_pit_in_kernel()) { + pit = kvm_pit_init(isa_bus, 0x40); + } else { + pit = i8254_pit_init(isa_bus, 0x40, pit_isa_irq, pit_alt_irq); + } + if (hpet) { + /* connect PIT to output control line of the HPET */ + qdev_connect_gpio_out(hpet, 0, qdev_get_gpio_in(DEVICE(pit), 0)); + } + pcspk_init(pcms->pcspk, isa_bus, pit); + } + + i8257_dma_init(isa_bus, 0); + + /* Super I/O */ + pc_superio_init(isa_bus, create_fdctrl, pcms->vmport != ON_OFF_AUTO_ON); +} + +void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus) +{ + int i; + + rom_set_order_override(FW_CFG_ORDER_OVERRIDE_NIC); + for (i = 0; i < nb_nics; i++) { + NICInfo *nd = &nd_table[i]; + const char *model = nd->model ? nd->model : pcmc->default_nic_model; + + if (g_str_equal(model, "ne2k_isa")) { + pc_init_ne2k_isa(isa_bus, nd); + } else { + pci_nic_init_nofail(nd, pci_bus, model, NULL); + } + } + rom_reset_order_override(); +} + +void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs) +{ + qemu_irq *i8259; + + if (kvm_pic_in_kernel()) { + i8259 = kvm_i8259_init(isa_bus); + } else if (xen_enabled()) { + i8259 = xen_interrupt_controller_init(); + } else { + i8259 = i8259_init(isa_bus, x86_allocate_cpu_irq()); + } + + for (size_t i = 0; i < ISA_NUM_IRQS; i++) { + i8259_irqs[i] = i8259[i]; + } + + g_free(i8259); +} + +static void pc_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) +{ + const PCMachineState *pcms = PC_MACHINE(hotplug_dev); + const X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + const PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + const MachineState *ms = MACHINE(hotplug_dev); + const bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM); + const uint64_t legacy_align = TARGET_PAGE_SIZE; + Error *local_err = NULL; + + /* + * When -no-acpi is used with Q35 machine type, no ACPI is built, + * but pcms->acpi_dev is still created. Check !acpi_enabled in + * addition to cover this case. + */ + if (!x86ms->acpi_dev || !x86_machine_is_acpi_enabled(x86ms)) { + error_setg(errp, + "memory hotplug is not enabled: missing acpi device or acpi disabled"); + return; + } + + if (is_nvdimm && !ms->nvdimms_state->is_enabled) { + error_setg(errp, "nvdimm is not enabled: missing 'nvdimm' in '-M'"); + return; + } + + hotplug_handler_pre_plug(x86ms->acpi_dev, dev, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + pc_dimm_pre_plug(PC_DIMM(dev), MACHINE(hotplug_dev), + pcmc->enforce_aligned_dimm ? 
NULL : &legacy_align, errp); +} + +static void pc_memory_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(hotplug_dev); + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + MachineState *ms = MACHINE(hotplug_dev); + bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM); + + pc_dimm_plug(PC_DIMM(dev), MACHINE(pcms)); + + if (is_nvdimm) { + nvdimm_plug(ms->nvdimms_state); + } + + hotplug_handler_plug(x86ms->acpi_dev, dev, &error_abort); +} + +static void pc_memory_unplug_request(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + + /* + * When -no-acpi is used with Q35 machine type, no ACPI is built, + * but pcms->acpi_dev is still created. Check !acpi_enabled in + * addition to cover this case. + */ + if (!x86ms->acpi_dev || !x86_machine_is_acpi_enabled(x86ms)) { + error_setg(errp, + "memory hotplug is not enabled: missing acpi device or acpi disabled"); + return; + } + + if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) { + error_setg(errp, "nvdimm device hot unplug is not supported yet."); + return; + } + + hotplug_handler_unplug_request(x86ms->acpi_dev, dev, + errp); +} + +static void pc_memory_unplug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(hotplug_dev); + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + Error *local_err = NULL; + + hotplug_handler_unplug(x86ms->acpi_dev, dev, &local_err); + if (local_err) { + goto out; + } + + pc_dimm_unplug(PC_DIMM(dev), MACHINE(pcms)); + qdev_unrealize(dev); + out: + error_propagate(errp, local_err); +} + +static void pc_virtio_md_pci_pre_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev); + Error *local_err = NULL; + + if (!hotplug_dev2 && dev->hotplugged) { + /* + * Without a bus hotplug handler, we cannot control the plug/unplug + * order. We should never reach this point when hotplugging on x86, + * however, better add a safety net. + */ + error_setg(errp, "hotplug of virtio based memory devices not supported" + " on this bus."); + return; + } + /* + * First, see if we can plug this memory device at all. If that + * succeeds, branch off to the actual hotplug handler. + */ + memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL, + &local_err); + if (!local_err && hotplug_dev2) { + hotplug_handler_pre_plug(hotplug_dev2, dev, &local_err); + } + error_propagate(errp, local_err); +} + +static void pc_virtio_md_pci_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev); + Error *local_err = NULL; + + /* + * Plug the memory device first and then branch off to the actual + * hotplug handler. If that one fails, we can easily undo the memory + * device bits. 
+ */ + memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev)); + if (hotplug_dev2) { + hotplug_handler_plug(hotplug_dev2, dev, &local_err); + if (local_err) { + memory_device_unplug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev)); + } + } + error_propagate(errp, local_err); +} + +static void pc_virtio_md_pci_unplug_request(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + /* We don't support hot unplug of virtio based memory devices */ + error_setg(errp, "virtio based memory devices cannot be unplugged."); +} + +static void pc_virtio_md_pci_unplug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + /* We don't support hot unplug of virtio based memory devices */ +} + +static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { + pc_memory_pre_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + x86_cpu_pre_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) { + pc_virtio_md_pci_pre_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) { + /* Declare the APIC range as the reserved MSI region */ + char *resv_prop_str = g_strdup_printf("0xfee00000:0xfeefffff:%d", + VIRTIO_IOMMU_RESV_MEM_T_MSI); + + object_property_set_uint(OBJECT(dev), "len-reserved-regions", 1, errp); + object_property_set_str(OBJECT(dev), "reserved-regions[0]", + resv_prop_str, errp); + g_free(resv_prop_str); + } + + if (object_dynamic_cast(OBJECT(dev), TYPE_X86_IOMMU_DEVICE) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) { + PCMachineState *pcms = PC_MACHINE(hotplug_dev); + + if (pcms->iommu) { + error_setg(errp, "QEMU does not support multiple vIOMMUs " + "for x86 yet."); + return; + } + pcms->iommu = dev; + } +} + +static void pc_machine_device_plug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { + pc_memory_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + x86_cpu_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) { + pc_virtio_md_pci_plug(hotplug_dev, dev, errp); + } +} + +static void pc_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { + pc_memory_unplug_request(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + x86_cpu_unplug_request_cb(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) { + pc_virtio_md_pci_unplug_request(hotplug_dev, dev, errp); + } else { + error_setg(errp, "acpi: device unplug request for not supported device" + " type: %s", object_get_typename(OBJECT(dev))); + } +} + +static void pc_machine_device_unplug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { + pc_memory_unplug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + x86_cpu_unplug_cb(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) { + 
pc_virtio_md_pci_unplug(hotplug_dev, dev, errp); + } else { + error_setg(errp, "acpi: device unplug for not supported device" + " type: %s", object_get_typename(OBJECT(dev))); + } +} + +static HotplugHandler *pc_get_hotplug_handler(MachineState *machine, + DeviceState *dev) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) || + object_dynamic_cast(OBJECT(dev), TYPE_CPU) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_X86_IOMMU_DEVICE)) { + return HOTPLUG_HANDLER(machine); + } + + return NULL; +} + +static void +pc_machine_get_device_memory_region_size(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + MachineState *ms = MACHINE(obj); + int64_t value = 0; + + if (ms->device_memory) { + value = memory_region_size(&ms->device_memory->mr); + } + + visit_type_int(v, name, &value, errp); +} + +static void pc_machine_get_vmport(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + OnOffAuto vmport = pcms->vmport; + + visit_type_OnOffAuto(v, name, &vmport, errp); +} + +static void pc_machine_set_vmport(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &pcms->vmport, errp); +} + +static bool pc_machine_get_smbus(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->smbus_enabled; +} + +static void pc_machine_set_smbus(Object *obj, bool value, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->smbus_enabled = value; +} + +static bool pc_machine_get_sata(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->sata_enabled; +} + +static void pc_machine_set_sata(Object *obj, bool value, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->sata_enabled = value; +} + +static bool pc_machine_get_pit(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->pit_enabled; +} + +static void pc_machine_set_pit(Object *obj, bool value, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->pit_enabled = value; +} + +static bool pc_machine_get_hpet(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->hpet_enabled; +} + +static void pc_machine_set_hpet(Object *obj, bool value, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->hpet_enabled = value; +} + +static bool pc_machine_get_default_bus_bypass_iommu(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->default_bus_bypass_iommu; +} + +static void pc_machine_set_default_bus_bypass_iommu(Object *obj, bool value, + Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->default_bus_bypass_iommu = value; +} + +static void pc_machine_get_max_ram_below_4g(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + uint64_t value = pcms->max_ram_below_4g; + + visit_type_size(v, name, &value, errp); +} + +static void pc_machine_set_max_ram_below_4g(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + uint64_t value; + + if (!visit_type_size(v, name, &value, errp)) { + return; + } + if (value > 4 * GiB) { + error_setg(errp, 
+ "Machine option 'max-ram-below-4g=%"PRIu64 + "' expects size less than or equal to 4G", value); + return; + } + + if (value < 1 * MiB) { + warn_report("Only %" PRIu64 " bytes of RAM below the 4GiB boundary," + "BIOS may not work with less than 1MiB", value); + } + + pcms->max_ram_below_4g = value; +} + +static void pc_machine_get_max_fw_size(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + uint64_t value = pcms->max_fw_size; + + visit_type_size(v, name, &value, errp); +} + +static void pc_machine_set_max_fw_size(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + Error *error = NULL; + uint64_t value; + + visit_type_size(v, name, &value, &error); + if (error) { + error_propagate(errp, error); + return; + } + + /* + * We don't have a theoretically justifiable exact lower bound on the base + * address of any flash mapping. In practice, the IO-APIC MMIO range is + * [0xFEE00000..0xFEE01000] -- see IO_APIC_DEFAULT_ADDRESS --, leaving free + * only 18MB-4KB below 4G. For now, restrict the cumulative mapping to 8MB in + * size. + */ + if (value > 16 * MiB) { + error_setg(errp, + "User specified max allowed firmware size %" PRIu64 " is " + "greater than 16MiB. If combined firwmare size exceeds " + "16MiB the system may not boot, or experience intermittent" + "stability issues.", + value); + return; + } + + pcms->max_fw_size = value; +} + + +static void pc_machine_initfn(Object *obj) +{ + PCMachineState *pcms = PC_MACHINE(obj); + +#ifdef CONFIG_VMPORT + pcms->vmport = ON_OFF_AUTO_AUTO; +#else + pcms->vmport = ON_OFF_AUTO_OFF; +#endif /* CONFIG_VMPORT */ + pcms->max_ram_below_4g = 0; /* use default */ + /* acpi build is enabled by default if machine supports it */ + pcms->acpi_build_enabled = PC_MACHINE_GET_CLASS(pcms)->has_acpi_build; + pcms->smbus_enabled = true; + pcms->sata_enabled = true; + pcms->pit_enabled = true; + pcms->max_fw_size = 8 * MiB; +#ifdef CONFIG_HPET + pcms->hpet_enabled = true; +#endif + pcms->default_bus_bypass_iommu = false; + + pc_system_flash_create(pcms); + pcms->pcspk = isa_new(TYPE_PC_SPEAKER); + object_property_add_alias(OBJECT(pcms), "pcspk-audiodev", + OBJECT(pcms->pcspk), "audiodev"); +} + +static void pc_machine_reset(MachineState *machine) +{ + CPUState *cs; + X86CPU *cpu; + + qemu_devices_reset(); + + /* Reset APIC after devices have been reset to cancel + * any changes that qemu_devices_reset() might have done. 
+ */ + CPU_FOREACH(cs) { + cpu = X86_CPU(cs); + + if (cpu->apic_state) { + device_legacy_reset(cpu->apic_state); + } + } +} + +static void pc_machine_wakeup(MachineState *machine) +{ + cpu_synchronize_all_states(); + pc_machine_reset(machine); + cpu_synchronize_all_post_reset(); +} + +static bool pc_hotplug_allowed(MachineState *ms, DeviceState *dev, Error **errp) +{ + X86IOMMUState *iommu = x86_iommu_get_default(); + IntelIOMMUState *intel_iommu; + + if (iommu && + object_dynamic_cast((Object *)iommu, TYPE_INTEL_IOMMU_DEVICE) && + object_dynamic_cast((Object *)dev, "vfio-pci")) { + intel_iommu = INTEL_IOMMU_DEVICE(iommu); + if (!intel_iommu->caching_mode) { + error_setg(errp, "Device assignment is not allowed without " + "enabling caching-mode=on for Intel IOMMU."); + return false; + } + } + + return true; +} + +static void pc_machine_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + PCMachineClass *pcmc = PC_MACHINE_CLASS(oc); + HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); + + pcmc->pci_enabled = true; + pcmc->has_acpi_build = true; + pcmc->rsdp_in_ram = true; + pcmc->smbios_defaults = true; + pcmc->smbios_uuid_encoded = true; + pcmc->gigabyte_align = true; + pcmc->has_reserved_memory = true; + pcmc->kvmclock_enabled = true; + pcmc->enforce_aligned_dimm = true; + /* BIOS ACPI tables: 128K. Other BIOS datastructures: less than 4K reported + * to be used at the moment, 32K should be enough for a while. */ + pcmc->acpi_data_size = 0x20000 + 0x8000; + pcmc->pvh_enabled = true; + pcmc->kvmclock_create_always = true; + assert(!mc->get_hotplug_handler); + mc->get_hotplug_handler = pc_get_hotplug_handler; + mc->hotplug_allowed = pc_hotplug_allowed; + mc->cpu_index_to_instance_props = x86_cpu_index_to_props; + mc->get_default_cpu_node_id = x86_get_default_cpu_node_id; + mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids; + mc->auto_enable_numa_with_memhp = true; + mc->auto_enable_numa_with_memdev = true; + mc->has_hotpluggable_cpus = true; + mc->default_boot_order = "cad"; + mc->block_default_type = IF_IDE; + mc->max_cpus = 255; + mc->reset = pc_machine_reset; + mc->wakeup = pc_machine_wakeup; + hc->pre_plug = pc_machine_device_pre_plug_cb; + hc->plug = pc_machine_device_plug_cb; + hc->unplug_request = pc_machine_device_unplug_request_cb; + hc->unplug = pc_machine_device_unplug_cb; + mc->default_cpu_type = TARGET_DEFAULT_CPU_TYPE; + mc->nvdimm_supported = true; + mc->smp_props.dies_supported = true; + mc->default_ram_id = "pc.ram"; + + object_class_property_add(oc, PC_MACHINE_MAX_RAM_BELOW_4G, "size", + pc_machine_get_max_ram_below_4g, pc_machine_set_max_ram_below_4g, + NULL, NULL); + object_class_property_set_description(oc, PC_MACHINE_MAX_RAM_BELOW_4G, + "Maximum ram below the 4G boundary (32bit boundary)"); + + object_class_property_add(oc, PC_MACHINE_DEVMEM_REGION_SIZE, "int", + pc_machine_get_device_memory_region_size, NULL, + NULL, NULL); + + object_class_property_add(oc, PC_MACHINE_VMPORT, "OnOffAuto", + pc_machine_get_vmport, pc_machine_set_vmport, + NULL, NULL); + object_class_property_set_description(oc, PC_MACHINE_VMPORT, + "Enable vmport (pc & q35)"); + + object_class_property_add_bool(oc, PC_MACHINE_SMBUS, + pc_machine_get_smbus, pc_machine_set_smbus); + + object_class_property_add_bool(oc, PC_MACHINE_SATA, + pc_machine_get_sata, pc_machine_set_sata); + + object_class_property_add_bool(oc, PC_MACHINE_PIT, + pc_machine_get_pit, pc_machine_set_pit); + + object_class_property_add_bool(oc, "hpet", + pc_machine_get_hpet, pc_machine_set_hpet); 
+ + object_class_property_add_bool(oc, "default-bus-bypass-iommu", + pc_machine_get_default_bus_bypass_iommu, + pc_machine_set_default_bus_bypass_iommu); + + object_class_property_add(oc, PC_MACHINE_MAX_FW_SIZE, "size", + pc_machine_get_max_fw_size, pc_machine_set_max_fw_size, + NULL, NULL); + object_class_property_set_description(oc, PC_MACHINE_MAX_FW_SIZE, + "Maximum combined firmware size"); +} + +static const TypeInfo pc_machine_info = { + .name = TYPE_PC_MACHINE, + .parent = TYPE_X86_MACHINE, + .abstract = true, + .instance_size = sizeof(PCMachineState), + .instance_init = pc_machine_initfn, + .class_size = sizeof(PCMachineClass), + .class_init = pc_machine_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_HOTPLUG_HANDLER }, + { } + }, +}; + +static void pc_machine_register_types(void) +{ + type_register_static(&pc_machine_info); +} + +type_init(pc_machine_register_types) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c new file mode 100644 index 000000000..223dd3e05 --- /dev/null +++ b/hw/i386/pc_piix.c @@ -0,0 +1,951 @@ +/* + * QEMU PC System Emulator + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include CONFIG_DEVICES + +#include "qemu/units.h" +#include "hw/loader.h" +#include "hw/i386/x86.h" +#include "hw/i386/pc.h" +#include "hw/i386/apic.h" +#include "hw/pci-host/i440fx.h" +#include "hw/southbridge/piix.h" +#include "hw/display/ramfb.h" +#include "hw/firmware/smbios.h" +#include "hw/pci/pci.h" +#include "hw/pci/pci_ids.h" +#include "hw/usb.h" +#include "net/net.h" +#include "hw/ide/pci.h" +#include "hw/irq.h" +#include "sysemu/kvm.h" +#include "hw/kvm/clock.h" +#include "hw/sysbus.h" +#include "hw/i2c/smbus_eeprom.h" +#include "hw/xen/xen-x86.h" +#include "exec/memory.h" +#include "hw/acpi/acpi.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "sysemu/xen.h" +#ifdef CONFIG_XEN +#include <xen/hvm/hvm_info_table.h> +#include "hw/xen/xen_pt.h" +#endif +#include "migration/global_state.h" +#include "migration/misc.h" +#include "sysemu/numa.h" +#include "hw/hyperv/vmbus-bridge.h" +#include "hw/mem/nvdimm.h" +#include "hw/i386/acpi-build.h" +#include "kvm/kvm-cpu.h" + +#define MAX_IDE_BUS 2 + +#ifdef CONFIG_IDE_ISA +static const int ide_iobase[MAX_IDE_BUS] = { 0x1f0, 0x170 }; +static const int ide_iobase2[MAX_IDE_BUS] = { 0x3f6, 0x376 }; +static const int ide_irq[MAX_IDE_BUS] = { 14, 15 }; +#endif + +/* PC hardware initialisation */ +static void pc_init1(MachineState *machine, + const char *host_type, const char *pci_type) +{ + PCMachineState *pcms = PC_MACHINE(machine); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(machine); + MemoryRegion *system_memory = get_system_memory(); + MemoryRegion *system_io = get_system_io(); + PCIBus *pci_bus; + ISABus *isa_bus; + PCII440FXState *i440fx_state; + int piix3_devfn = -1; + qemu_irq smi_irq; + GSIState *gsi_state; + BusState *idebus[MAX_IDE_BUS]; + ISADevice *rtc_state; + MemoryRegion *ram_memory; + MemoryRegion *pci_memory; + MemoryRegion *rom_memory; + ram_addr_t lowmem; + + /* + * Calculate ram split, for memory below and above 4G. It's a bit + * complicated for backward compatibility reasons ... + * + * - Traditional split is 3.5G (lowmem = 0xe0000000). This is the + * default value for max_ram_below_4g now. + * + * - Then, to gigabyte align the memory, we move the split to 3G + * (lowmem = 0xc0000000). But only in case we have to split in + * the first place, i.e. ram_size is larger than (traditional) + * lowmem. And for new machine types (gigabyte_align = true) + * only, for live migration compatibility reasons. + * + * - Next the max-ram-below-4g option was added, which allowed to + * reduce lowmem to a smaller value, to allow a larger PCI I/O + * window below 4G. qemu doesn't enforce gigabyte alignment here, + * but prints a warning. + * + * - Finally max-ram-below-4g got updated to also allow raising lowmem, + * so legacy non-PAE guests can get as much memory as possible in + * the 32bit address space below 4G. + * + * - Note that Xen has its own ram setup code in xen_ram_init(), + * called via xen_hvm_init_pc(). 
+ * + * Examples: + * qemu -M pc-1.7 -m 4G (old default) -> 3584M low, 512M high + * qemu -M pc -m 4G (new default) -> 3072M low, 1024M high + * qemu -M pc,max-ram-below-4g=2G -m 4G -> 2048M low, 2048M high + * qemu -M pc,max-ram-below-4g=4G -m 3968M -> 3968M low (=4G-128M) + */ + if (xen_enabled()) { + xen_hvm_init_pc(pcms, &ram_memory); + } else { + if (!pcms->max_ram_below_4g) { + pcms->max_ram_below_4g = 0xe0000000; /* default: 3.5G */ + } + lowmem = pcms->max_ram_below_4g; + if (machine->ram_size >= pcms->max_ram_below_4g) { + if (pcmc->gigabyte_align) { + if (lowmem > 0xc0000000) { + lowmem = 0xc0000000; + } + if (lowmem & (1 * GiB - 1)) { + warn_report("Large machine and max_ram_below_4g " + "(%" PRIu64 ") not a multiple of 1G; " + "possible bad performance.", + pcms->max_ram_below_4g); + } + } + } + + if (machine->ram_size >= lowmem) { + x86ms->above_4g_mem_size = machine->ram_size - lowmem; + x86ms->below_4g_mem_size = lowmem; + } else { + x86ms->above_4g_mem_size = 0; + x86ms->below_4g_mem_size = machine->ram_size; + } + } + + pc_machine_init_sgx_epc(pcms); + x86_cpus_init(x86ms, pcmc->default_cpu_version); + + if (pcmc->kvmclock_enabled) { + kvmclock_create(pcmc->kvmclock_create_always); + } + + if (pcmc->pci_enabled) { + pci_memory = g_new(MemoryRegion, 1); + memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); + rom_memory = pci_memory; + } else { + pci_memory = NULL; + rom_memory = system_memory; + } + + pc_guest_info_init(pcms); + + if (pcmc->smbios_defaults) { + MachineClass *mc = MACHINE_GET_CLASS(machine); + /* These values are guest ABI, do not change */ + smbios_set_defaults("QEMU", "Standard PC (i440FX + PIIX, 1996)", + mc->name, pcmc->smbios_legacy_mode, + pcmc->smbios_uuid_encoded, + SMBIOS_ENTRY_POINT_21); + } + + /* allocate ram and load rom/bios */ + if (!xen_enabled()) { + pc_memory_init(pcms, system_memory, + rom_memory, &ram_memory); + } else { + pc_system_flash_cleanup_unused(pcms); + if (machine->kernel_filename != NULL) { + /* For xen HVM direct kernel boot, load linux here */ + xen_load_linux(pcms); + } + } + + gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled); + + if (pcmc->pci_enabled) { + PIIX3State *piix3; + + pci_bus = i440fx_init(host_type, + pci_type, + &i440fx_state, + system_memory, system_io, machine->ram_size, + x86ms->below_4g_mem_size, + x86ms->above_4g_mem_size, + pci_memory, ram_memory); + pcms->bus = pci_bus; + + piix3 = piix3_create(pci_bus, &isa_bus); + piix3->pic = x86ms->gsi; + piix3_devfn = piix3->dev.devfn; + } else { + pci_bus = NULL; + i440fx_state = NULL; + isa_bus = isa_bus_new(NULL, get_system_memory(), system_io, + &error_abort); + pcms->hpet_enabled = false; + } + isa_bus_irqs(isa_bus, x86ms->gsi); + + pc_i8259_create(isa_bus, gsi_state->i8259_irq); + + if (pcmc->pci_enabled) { + ioapic_init_gsi(gsi_state, "i440fx"); + } + + if (tcg_enabled()) { + x86_register_ferr_irq(x86ms->gsi[13]); + } + + pc_vga_init(isa_bus, pcmc->pci_enabled ? pci_bus : NULL); + + assert(pcms->vmport != ON_OFF_AUTO__MAX); + if (pcms->vmport == ON_OFF_AUTO_AUTO) { + pcms->vmport = xen_enabled() ? ON_OFF_AUTO_OFF : ON_OFF_AUTO_ON; + } + + /* init basic PC hardware */ + pc_basic_device_init(pcms, isa_bus, x86ms->gsi, &rtc_state, true, + 0x4); + + pc_nic_init(pcmc, isa_bus, pci_bus); + + if (pcmc->pci_enabled) { + PCIDevice *dev; + + dev = pci_create_simple(pci_bus, piix3_devfn + 1, + xen_enabled() ? 
"piix3-ide-xen" : "piix3-ide"); + pci_ide_create_devs(dev); + idebus[0] = qdev_get_child_bus(&dev->qdev, "ide.0"); + idebus[1] = qdev_get_child_bus(&dev->qdev, "ide.1"); + pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state); + } +#ifdef CONFIG_IDE_ISA + else { + DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS]; + int i; + + ide_drive_get(hd, ARRAY_SIZE(hd)); + for (i = 0; i < MAX_IDE_BUS; i++) { + ISADevice *dev; + char busname[] = "ide.0"; + dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i], + ide_irq[i], + hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]); + /* + * The ide bus name is ide.0 for the first bus and ide.1 for the + * second one. + */ + busname[4] = '0' + i; + idebus[i] = qdev_get_child_bus(DEVICE(dev), busname); + } + pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state); + } +#endif + + if (pcmc->pci_enabled && machine_usb(machine)) { + pci_create_simple(pci_bus, piix3_devfn + 2, "piix3-usb-uhci"); + } + + if (pcmc->pci_enabled && x86_machine_is_acpi_enabled(X86_MACHINE(pcms))) { + DeviceState *piix4_pm; + + smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0); + /* TODO: Populate SPD eeprom data. */ + pcms->smbus = piix4_pm_init(pci_bus, piix3_devfn + 3, 0xb100, + x86ms->gsi[9], smi_irq, + x86_machine_is_smm_enabled(x86ms), + &piix4_pm); + smbus_eeprom_init(pcms->smbus, 8, NULL, 0); + + object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, + TYPE_HOTPLUG_HANDLER, + (Object **)&x86ms->acpi_dev, + object_property_allow_set_link, + OBJ_PROP_LINK_STRONG); + object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, + OBJECT(piix4_pm), &error_abort); + } + + if (machine->nvdimms_state->is_enabled) { + nvdimm_init_acpi_state(machine->nvdimms_state, system_io, + x86_nvdimm_acpi_dsmio, + x86ms->fw_cfg, OBJECT(pcms)); + } +} + +/* Looking for a pc_compat_2_4() function? It doesn't exist. + * pc_compat_*() functions that run on machine-init time and + * change global QEMU state are deprecated. Please don't create + * one, and implement any pc-*-2.4 (and newer) compat code in + * hw_compat_*, pc_compat_*, or * pc_*_machine_options(). + */ + +static void pc_compat_2_3_fn(MachineState *machine) +{ + X86MachineState *x86ms = X86_MACHINE(machine); + if (kvm_enabled()) { + x86ms->smm = ON_OFF_AUTO_OFF; + } +} + +static void pc_compat_2_2_fn(MachineState *machine) +{ + pc_compat_2_3_fn(machine); +} + +static void pc_compat_2_1_fn(MachineState *machine) +{ + pc_compat_2_2_fn(machine); + x86_cpu_change_kvm_default("svm", NULL); +} + +static void pc_compat_2_0_fn(MachineState *machine) +{ + pc_compat_2_1_fn(machine); +} + +static void pc_compat_1_7_fn(MachineState *machine) +{ + pc_compat_2_0_fn(machine); + x86_cpu_change_kvm_default("x2apic", NULL); +} + +static void pc_compat_1_6_fn(MachineState *machine) +{ + pc_compat_1_7_fn(machine); +} + +static void pc_compat_1_5_fn(MachineState *machine) +{ + pc_compat_1_6_fn(machine); +} + +static void pc_compat_1_4_fn(MachineState *machine) +{ + pc_compat_1_5_fn(machine); +} + +static void pc_init_isa(MachineState *machine) +{ + pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, TYPE_I440FX_PCI_DEVICE); +} + +#ifdef CONFIG_XEN +static void pc_xen_hvm_init_pci(MachineState *machine) +{ + const char *pci_type = xen_igd_gfx_pt_enabled() ? 
+ TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE : TYPE_I440FX_PCI_DEVICE; + + pc_init1(machine, + TYPE_I440FX_PCI_HOST_BRIDGE, + pci_type); +} + +static void pc_xen_hvm_init(MachineState *machine) +{ + PCMachineState *pcms = PC_MACHINE(machine); + + if (!xen_enabled()) { + error_report("xenfv machine requires the xen accelerator"); + exit(1); + } + + pc_xen_hvm_init_pci(machine); + pci_create_simple(pcms->bus, -1, "xen-platform"); +} +#endif + +#define DEFINE_I440FX_MACHINE(suffix, name, compatfn, optionfn) \ + static void pc_init_##suffix(MachineState *machine) \ + { \ + void (*compat)(MachineState *m) = (compatfn); \ + if (compat) { \ + compat(machine); \ + } \ + pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ + TYPE_I440FX_PCI_DEVICE); \ + } \ + DEFINE_PC_MACHINE(suffix, name, pc_init_##suffix, optionfn) + +static void pc_i440fx_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pcmc->default_nic_model = "e1000"; + pcmc->pci_root_uid = 0; + + m->family = "pc_piix"; + m->desc = "Standard PC (i440FX + PIIX, 1996)"; + m->default_machine_opts = "firmware=bios-256k.bin"; + m->default_display = "std"; + machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); +} + +static void pc_i440fx_6_2_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_i440fx_machine_options(m); + m->alias = "pc"; + m->is_default = true; + pcmc->default_cpu_version = 1; +} + +DEFINE_I440FX_MACHINE(v6_2, "pc-i440fx-6.2", NULL, + pc_i440fx_6_2_machine_options); + +static void pc_i440fx_6_1_machine_options(MachineClass *m) +{ + pc_i440fx_6_2_machine_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_6_1, hw_compat_6_1_len); + compat_props_add(m->compat_props, pc_compat_6_1, pc_compat_6_1_len); + m->smp_props.prefer_sockets = true; +} + +DEFINE_I440FX_MACHINE(v6_1, "pc-i440fx-6.1", NULL, + pc_i440fx_6_1_machine_options); + +static void pc_i440fx_6_0_machine_options(MachineClass *m) +{ + pc_i440fx_6_1_machine_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_6_0, hw_compat_6_0_len); + compat_props_add(m->compat_props, pc_compat_6_0, pc_compat_6_0_len); +} + +DEFINE_I440FX_MACHINE(v6_0, "pc-i440fx-6.0", NULL, + pc_i440fx_6_0_machine_options); + +static void pc_i440fx_5_2_machine_options(MachineClass *m) +{ + pc_i440fx_6_0_machine_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_5_2, hw_compat_5_2_len); + compat_props_add(m->compat_props, pc_compat_5_2, pc_compat_5_2_len); +} + +DEFINE_I440FX_MACHINE(v5_2, "pc-i440fx-5.2", NULL, + pc_i440fx_5_2_machine_options); + +static void pc_i440fx_5_1_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_5_2_machine_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_5_1, hw_compat_5_1_len); + compat_props_add(m->compat_props, pc_compat_5_1, pc_compat_5_1_len); + pcmc->kvmclock_create_always = false; + pcmc->pci_root_uid = 1; +} + +DEFINE_I440FX_MACHINE(v5_1, "pc-i440fx-5.1", NULL, + pc_i440fx_5_1_machine_options); + +static void pc_i440fx_5_0_machine_options(MachineClass *m) +{ + pc_i440fx_5_1_machine_options(m); + m->alias = NULL; + m->is_default = false; + m->numa_mem_supported = true; + compat_props_add(m->compat_props, hw_compat_5_0, hw_compat_5_0_len); + compat_props_add(m->compat_props, pc_compat_5_0, 
pc_compat_5_0_len); + m->auto_enable_numa_with_memdev = false; +} + +DEFINE_I440FX_MACHINE(v5_0, "pc-i440fx-5.0", NULL, + pc_i440fx_5_0_machine_options); + +static void pc_i440fx_4_2_machine_options(MachineClass *m) +{ + pc_i440fx_5_0_machine_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_4_2, hw_compat_4_2_len); + compat_props_add(m->compat_props, pc_compat_4_2, pc_compat_4_2_len); +} + +DEFINE_I440FX_MACHINE(v4_2, "pc-i440fx-4.2", NULL, + pc_i440fx_4_2_machine_options); + +static void pc_i440fx_4_1_machine_options(MachineClass *m) +{ + pc_i440fx_4_2_machine_options(m); + m->alias = NULL; + m->is_default = false; + compat_props_add(m->compat_props, hw_compat_4_1, hw_compat_4_1_len); + compat_props_add(m->compat_props, pc_compat_4_1, pc_compat_4_1_len); +} + +DEFINE_I440FX_MACHINE(v4_1, "pc-i440fx-4.1", NULL, + pc_i440fx_4_1_machine_options); + +static void pc_i440fx_4_0_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_i440fx_4_1_machine_options(m); + m->alias = NULL; + m->is_default = false; + pcmc->default_cpu_version = CPU_VERSION_LEGACY; + compat_props_add(m->compat_props, hw_compat_4_0, hw_compat_4_0_len); + compat_props_add(m->compat_props, pc_compat_4_0, pc_compat_4_0_len); +} + +DEFINE_I440FX_MACHINE(v4_0, "pc-i440fx-4.0", NULL, + pc_i440fx_4_0_machine_options); + +static void pc_i440fx_3_1_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_4_0_machine_options(m); + m->is_default = false; + pcmc->do_not_add_smb_acpi = true; + m->smbus_no_migration_support = true; + m->alias = NULL; + pcmc->pvh_enabled = false; + compat_props_add(m->compat_props, hw_compat_3_1, hw_compat_3_1_len); + compat_props_add(m->compat_props, pc_compat_3_1, pc_compat_3_1_len); +} + +DEFINE_I440FX_MACHINE(v3_1, "pc-i440fx-3.1", NULL, + pc_i440fx_3_1_machine_options); + +static void pc_i440fx_3_0_machine_options(MachineClass *m) +{ + pc_i440fx_3_1_machine_options(m); + compat_props_add(m->compat_props, hw_compat_3_0, hw_compat_3_0_len); + compat_props_add(m->compat_props, pc_compat_3_0, pc_compat_3_0_len); +} + +DEFINE_I440FX_MACHINE(v3_0, "pc-i440fx-3.0", NULL, + pc_i440fx_3_0_machine_options); + +static void pc_i440fx_2_12_machine_options(MachineClass *m) +{ + pc_i440fx_3_0_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_12, hw_compat_2_12_len); + compat_props_add(m->compat_props, pc_compat_2_12, pc_compat_2_12_len); +} + +DEFINE_I440FX_MACHINE(v2_12, "pc-i440fx-2.12", NULL, + pc_i440fx_2_12_machine_options); + +static void pc_i440fx_2_11_machine_options(MachineClass *m) +{ + pc_i440fx_2_12_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_11, hw_compat_2_11_len); + compat_props_add(m->compat_props, pc_compat_2_11, pc_compat_2_11_len); +} + +DEFINE_I440FX_MACHINE(v2_11, "pc-i440fx-2.11", NULL, + pc_i440fx_2_11_machine_options); + +static void pc_i440fx_2_10_machine_options(MachineClass *m) +{ + pc_i440fx_2_11_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_10, hw_compat_2_10_len); + compat_props_add(m->compat_props, pc_compat_2_10, pc_compat_2_10_len); + m->auto_enable_numa_with_memhp = false; +} + +DEFINE_I440FX_MACHINE(v2_10, "pc-i440fx-2.10", NULL, + pc_i440fx_2_10_machine_options); + +static void pc_i440fx_2_9_machine_options(MachineClass *m) +{ + pc_i440fx_2_10_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_9, hw_compat_2_9_len); + compat_props_add(m->compat_props, pc_compat_2_9, 
pc_compat_2_9_len); +} + +DEFINE_I440FX_MACHINE(v2_9, "pc-i440fx-2.9", NULL, + pc_i440fx_2_9_machine_options); + +static void pc_i440fx_2_8_machine_options(MachineClass *m) +{ + pc_i440fx_2_9_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_8, hw_compat_2_8_len); + compat_props_add(m->compat_props, pc_compat_2_8, pc_compat_2_8_len); +} + +DEFINE_I440FX_MACHINE(v2_8, "pc-i440fx-2.8", NULL, + pc_i440fx_2_8_machine_options); + +static void pc_i440fx_2_7_machine_options(MachineClass *m) +{ + pc_i440fx_2_8_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_7, hw_compat_2_7_len); + compat_props_add(m->compat_props, pc_compat_2_7, pc_compat_2_7_len); +} + +DEFINE_I440FX_MACHINE(v2_7, "pc-i440fx-2.7", NULL, + pc_i440fx_2_7_machine_options); + +static void pc_i440fx_2_6_machine_options(MachineClass *m) +{ + X86MachineClass *x86mc = X86_MACHINE_CLASS(m); + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_2_7_machine_options(m); + pcmc->legacy_cpu_hotplug = true; + x86mc->fwcfg_dma_enabled = false; + compat_props_add(m->compat_props, hw_compat_2_6, hw_compat_2_6_len); + compat_props_add(m->compat_props, pc_compat_2_6, pc_compat_2_6_len); +} + +DEFINE_I440FX_MACHINE(v2_6, "pc-i440fx-2.6", NULL, + pc_i440fx_2_6_machine_options); + +static void pc_i440fx_2_5_machine_options(MachineClass *m) +{ + X86MachineClass *x86mc = X86_MACHINE_CLASS(m); + + pc_i440fx_2_6_machine_options(m); + x86mc->save_tsc_khz = false; + m->legacy_fw_cfg_order = 1; + compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len); + compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len); +} + +DEFINE_I440FX_MACHINE(v2_5, "pc-i440fx-2.5", NULL, + pc_i440fx_2_5_machine_options); + +static void pc_i440fx_2_4_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_2_5_machine_options(m); + m->hw_version = "2.4.0"; + pcmc->broken_reserved_end = true; + compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len); + compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len); +} + +DEFINE_I440FX_MACHINE(v2_4, "pc-i440fx-2.4", NULL, + pc_i440fx_2_4_machine_options) + +static void pc_i440fx_2_3_machine_options(MachineClass *m) +{ + pc_i440fx_2_4_machine_options(m); + m->hw_version = "2.3.0"; + compat_props_add(m->compat_props, hw_compat_2_3, hw_compat_2_3_len); + compat_props_add(m->compat_props, pc_compat_2_3, pc_compat_2_3_len); +} + +DEFINE_I440FX_MACHINE(v2_3, "pc-i440fx-2.3", pc_compat_2_3_fn, + pc_i440fx_2_3_machine_options); + +static void pc_i440fx_2_2_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_2_3_machine_options(m); + m->hw_version = "2.2.0"; + m->default_machine_opts = "firmware=bios-256k.bin,suppress-vmdesc=on"; + compat_props_add(m->compat_props, hw_compat_2_2, hw_compat_2_2_len); + compat_props_add(m->compat_props, pc_compat_2_2, pc_compat_2_2_len); + pcmc->rsdp_in_ram = false; +} + +DEFINE_I440FX_MACHINE(v2_2, "pc-i440fx-2.2", pc_compat_2_2_fn, + pc_i440fx_2_2_machine_options); + +static void pc_i440fx_2_1_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_2_2_machine_options(m); + m->hw_version = "2.1.0"; + m->default_display = NULL; + compat_props_add(m->compat_props, hw_compat_2_1, hw_compat_2_1_len); + compat_props_add(m->compat_props, pc_compat_2_1, pc_compat_2_1_len); + pcmc->smbios_uuid_encoded = false; + pcmc->enforce_aligned_dimm = false; +} + +DEFINE_I440FX_MACHINE(v2_1, "pc-i440fx-2.1", 
pc_compat_2_1_fn, + pc_i440fx_2_1_machine_options); + +static void pc_i440fx_2_0_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_2_1_machine_options(m); + m->hw_version = "2.0.0"; + compat_props_add(m->compat_props, pc_compat_2_0, pc_compat_2_0_len); + pcmc->smbios_legacy_mode = true; + pcmc->has_reserved_memory = false; + /* This value depends on the actual DSDT and SSDT compiled into + * the source QEMU; unfortunately it depends on the binary and + * not on the machine type, so we cannot make pc-i440fx-1.7 work on + * both QEMU 1.7 and QEMU 2.0. + * + * Large variations cause migration to fail for more than one + * consecutive value of the "-smp" maxcpus option. + * + * For small variations of the kind caused by different iasl versions, + * the 4k rounding usually leaves slack. However, there could be still + * one or two values that break. For QEMU 1.7 and QEMU 2.0 the + * slack is only ~10 bytes before one "-smp maxcpus" value breaks! + * + * 6652 is valid for QEMU 2.0, the right value for pc-i440fx-1.7 on + * QEMU 1.7 it is 6414. For RHEL/CentOS 7.0 it is 6418. + */ + pcmc->legacy_acpi_table_size = 6652; + pcmc->acpi_data_size = 0x10000; +} + +DEFINE_I440FX_MACHINE(v2_0, "pc-i440fx-2.0", pc_compat_2_0_fn, + pc_i440fx_2_0_machine_options); + +static void pc_i440fx_1_7_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_2_0_machine_options(m); + m->hw_version = "1.7.0"; + m->default_machine_opts = NULL; + m->option_rom_has_mr = true; + compat_props_add(m->compat_props, pc_compat_1_7, pc_compat_1_7_len); + pcmc->smbios_defaults = false; + pcmc->gigabyte_align = false; + pcmc->legacy_acpi_table_size = 6414; +} + +DEFINE_I440FX_MACHINE(v1_7, "pc-i440fx-1.7", pc_compat_1_7_fn, + pc_i440fx_1_7_machine_options); + +static void pc_i440fx_1_6_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_i440fx_1_7_machine_options(m); + m->hw_version = "1.6.0"; + m->rom_file_has_mr = false; + compat_props_add(m->compat_props, pc_compat_1_6, pc_compat_1_6_len); + pcmc->has_acpi_build = false; +} + +DEFINE_I440FX_MACHINE(v1_6, "pc-i440fx-1.6", pc_compat_1_6_fn, + pc_i440fx_1_6_machine_options); + +static void pc_i440fx_1_5_machine_options(MachineClass *m) +{ + pc_i440fx_1_6_machine_options(m); + m->hw_version = "1.5.0"; + compat_props_add(m->compat_props, pc_compat_1_5, pc_compat_1_5_len); +} + +DEFINE_I440FX_MACHINE(v1_5, "pc-i440fx-1.5", pc_compat_1_5_fn, + pc_i440fx_1_5_machine_options); + +static void pc_i440fx_1_4_machine_options(MachineClass *m) +{ + pc_i440fx_1_5_machine_options(m); + m->hw_version = "1.4.0"; + compat_props_add(m->compat_props, pc_compat_1_4, pc_compat_1_4_len); +} + +DEFINE_I440FX_MACHINE(v1_4, "pc-i440fx-1.4", pc_compat_1_4_fn, + pc_i440fx_1_4_machine_options); + +typedef struct { + uint16_t gpu_device_id; + uint16_t pch_device_id; + uint8_t pch_revision_id; +} IGDDeviceIDInfo; + +/* In real world different GPU should have different PCH. But actually + * the different PCH DIDs likely map to different PCH SKUs. We do the + * same thing for the GPU. For PCH, the different SKUs are going to be + * all the same silicon design and implementation, just different + * features turn on and off with fuses. The SW interfaces should be + * consistent across all SKUs in a given family (eg LPT). But just same + * features may not be supported. 
+ * + * Most of these different PCH features probably don't matter to the + * Gfx driver, but obviously any difference in display port connections + * will so it should be fine with any PCH in case of passthrough. + * + * So currently use one PCH version, 0x8c4e, to cover all HSW(Haswell) + * scenarios, 0x9cc3 for BDW(Broadwell). + */ +static const IGDDeviceIDInfo igd_combo_id_infos[] = { + /* HSW Classic */ + {0x0402, 0x8c4e, 0x04}, /* HSWGT1D, HSWD_w7 */ + {0x0406, 0x8c4e, 0x04}, /* HSWGT1M, HSWM_w7 */ + {0x0412, 0x8c4e, 0x04}, /* HSWGT2D, HSWD_w7 */ + {0x0416, 0x8c4e, 0x04}, /* HSWGT2M, HSWM_w7 */ + {0x041E, 0x8c4e, 0x04}, /* HSWGT15D, HSWD_w7 */ + /* HSW ULT */ + {0x0A06, 0x8c4e, 0x04}, /* HSWGT1UT, HSWM_w7 */ + {0x0A16, 0x8c4e, 0x04}, /* HSWGT2UT, HSWM_w7 */ + {0x0A26, 0x8c4e, 0x06}, /* HSWGT3UT, HSWM_w7 */ + {0x0A2E, 0x8c4e, 0x04}, /* HSWGT3UT28W, HSWM_w7 */ + {0x0A1E, 0x8c4e, 0x04}, /* HSWGT2UX, HSWM_w7 */ + {0x0A0E, 0x8c4e, 0x04}, /* HSWGT1ULX, HSWM_w7 */ + /* HSW CRW */ + {0x0D26, 0x8c4e, 0x04}, /* HSWGT3CW, HSWM_w7 */ + {0x0D22, 0x8c4e, 0x04}, /* HSWGT3CWDT, HSWD_w7 */ + /* HSW Server */ + {0x041A, 0x8c4e, 0x04}, /* HSWSVGT2, HSWD_w7 */ + /* HSW SRVR */ + {0x040A, 0x8c4e, 0x04}, /* HSWSVGT1, HSWD_w7 */ + /* BSW */ + {0x1606, 0x9cc3, 0x03}, /* BDWULTGT1, BDWM_w7 */ + {0x1616, 0x9cc3, 0x03}, /* BDWULTGT2, BDWM_w7 */ + {0x1626, 0x9cc3, 0x03}, /* BDWULTGT3, BDWM_w7 */ + {0x160E, 0x9cc3, 0x03}, /* BDWULXGT1, BDWM_w7 */ + {0x161E, 0x9cc3, 0x03}, /* BDWULXGT2, BDWM_w7 */ + {0x1602, 0x9cc3, 0x03}, /* BDWHALOGT1, BDWM_w7 */ + {0x1612, 0x9cc3, 0x03}, /* BDWHALOGT2, BDWM_w7 */ + {0x1622, 0x9cc3, 0x03}, /* BDWHALOGT3, BDWM_w7 */ + {0x162B, 0x9cc3, 0x03}, /* BDWHALO28W, BDWM_w7 */ + {0x162A, 0x9cc3, 0x03}, /* BDWGT3WRKS, BDWM_w7 */ + {0x162D, 0x9cc3, 0x03}, /* BDWGT3SRVR, BDWM_w7 */ +}; + +static void isa_bridge_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + dc->desc = "ISA bridge faked to support IGD PT"; + set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); + k->vendor_id = PCI_VENDOR_ID_INTEL; + k->class_id = PCI_CLASS_BRIDGE_ISA; +}; + +static TypeInfo isa_bridge_info = { + .name = "igd-passthrough-isa-bridge", + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(PCIDevice), + .class_init = isa_bridge_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, + }, +}; + +static void pt_graphics_register_types(void) +{ + type_register_static(&isa_bridge_info); +} +type_init(pt_graphics_register_types) + +void igd_passthrough_isa_bridge_create(PCIBus *bus, uint16_t gpu_dev_id) +{ + struct PCIDevice *bridge_dev; + int i, num; + uint16_t pch_dev_id = 0xffff; + uint8_t pch_rev_id = 0; + + num = ARRAY_SIZE(igd_combo_id_infos); + for (i = 0; i < num; i++) { + if (gpu_dev_id == igd_combo_id_infos[i].gpu_device_id) { + pch_dev_id = igd_combo_id_infos[i].pch_device_id; + pch_rev_id = igd_combo_id_infos[i].pch_revision_id; + } + } + + if (pch_dev_id == 0xffff) { + return; + } + + /* Currently IGD drivers always need to access PCH by 1f.0. */ + bridge_dev = pci_create_simple(bus, PCI_DEVFN(0x1f, 0), + "igd-passthrough-isa-bridge"); + + /* + * Note that vendor id is always PCI_VENDOR_ID_INTEL. 
+ */ + if (!bridge_dev) { + fprintf(stderr, "set igd-passthrough-isa-bridge failed!\n"); + return; + } + pci_config_set_device_id(bridge_dev->config, pch_dev_id); + pci_config_set_revision(bridge_dev->config, pch_rev_id); +} + +static void isapc_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + m->desc = "ISA-only PC"; + m->max_cpus = 1; + m->option_rom_has_mr = true; + m->rom_file_has_mr = false; + pcmc->pci_enabled = false; + pcmc->has_acpi_build = false; + pcmc->smbios_defaults = false; + pcmc->gigabyte_align = false; + pcmc->smbios_legacy_mode = true; + pcmc->has_reserved_memory = false; + pcmc->default_nic_model = "ne2k_isa"; + m->default_cpu_type = X86_CPU_TYPE_NAME("486"); +} + +DEFINE_PC_MACHINE(isapc, "isapc", pc_init_isa, + isapc_machine_options); + + +#ifdef CONFIG_XEN +static void xenfv_4_2_machine_options(MachineClass *m) +{ + pc_i440fx_4_2_machine_options(m); + m->desc = "Xen Fully-virtualized PC"; + m->max_cpus = HVM_MAX_VCPUS; + m->default_machine_opts = "accel=xen,suppress-vmdesc=on"; +} + +DEFINE_PC_MACHINE(xenfv_4_2, "xenfv-4.2", pc_xen_hvm_init, + xenfv_4_2_machine_options); + +static void xenfv_3_1_machine_options(MachineClass *m) +{ + pc_i440fx_3_1_machine_options(m); + m->desc = "Xen Fully-virtualized PC"; + m->alias = "xenfv"; + m->max_cpus = HVM_MAX_VCPUS; + m->default_machine_opts = "accel=xen,suppress-vmdesc=on"; +} + +DEFINE_PC_MACHINE(xenfv, "xenfv-3.1", pc_xen_hvm_init, + xenfv_3_1_machine_options); +#endif diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c new file mode 100644 index 000000000..e1e100316 --- /dev/null +++ b/hw/i386/pc_q35.c @@ -0,0 +1,620 @@ +/* + * Q35 chipset based pc system emulator + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2009, 2010 + * Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * Copyright (C) 2012 Jason Baron <jbaron@redhat.com> + * + * This is based on pc.c, but heavily modified. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "hw/loader.h" +#include "hw/i2c/smbus_eeprom.h" +#include "hw/rtc/mc146818rtc.h" +#include "sysemu/kvm.h" +#include "hw/kvm/clock.h" +#include "hw/pci-host/q35.h" +#include "hw/pci/pcie_port.h" +#include "hw/qdev-properties.h" +#include "hw/i386/x86.h" +#include "hw/i386/pc.h" +#include "hw/i386/ich9.h" +#include "hw/i386/amd_iommu.h" +#include "hw/i386/intel_iommu.h" +#include "hw/display/ramfb.h" +#include "hw/firmware/smbios.h" +#include "hw/ide/pci.h" +#include "hw/ide/ahci.h" +#include "hw/usb.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "sysemu/numa.h" +#include "hw/hyperv/vmbus-bridge.h" +#include "hw/mem/nvdimm.h" +#include "hw/i386/acpi-build.h" + +/* ICH9 AHCI has 6 ports */ +#define MAX_SATA_PORTS 6 + +struct ehci_companions { + const char *name; + int func; + int port; +}; + +static const struct ehci_companions ich9_1d[] = { + { .name = "ich9-usb-uhci1", .func = 0, .port = 0 }, + { .name = "ich9-usb-uhci2", .func = 1, .port = 2 }, + { .name = "ich9-usb-uhci3", .func = 2, .port = 4 }, +}; + +static const struct ehci_companions ich9_1a[] = { + { .name = "ich9-usb-uhci4", .func = 0, .port = 0 }, + { .name = "ich9-usb-uhci5", .func = 1, .port = 2 }, + { .name = "ich9-usb-uhci6", .func = 2, .port = 4 }, +}; + +static int ehci_create_ich9_with_companions(PCIBus *bus, int slot) +{ + const struct ehci_companions *comp; + PCIDevice *ehci, *uhci; + BusState *usbbus; + const char *name; + int i; + + switch (slot) { + case 0x1d: + name = "ich9-usb-ehci1"; + comp = ich9_1d; + break; + case 0x1a: + name = "ich9-usb-ehci2"; + comp = ich9_1a; + break; + default: + return -1; + } + + ehci = pci_new_multifunction(PCI_DEVFN(slot, 7), true, name); + pci_realize_and_unref(ehci, bus, &error_fatal); + usbbus = QLIST_FIRST(&ehci->qdev.child_bus); + + for (i = 0; i < 3; i++) { + uhci = pci_new_multifunction(PCI_DEVFN(slot, comp[i].func), true, + comp[i].name); + qdev_prop_set_string(&uhci->qdev, "masterbus", usbbus->name); + qdev_prop_set_uint32(&uhci->qdev, "firstport", comp[i].port); + pci_realize_and_unref(uhci, bus, &error_fatal); + } + return 0; +} + +/* PC hardware initialisation */ +static void pc_q35_init(MachineState *machine) +{ + PCMachineState *pcms = PC_MACHINE(machine); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(machine); + Q35PCIHost *q35_host; + PCIHostState *phb; + PCIBus *host_bus; + PCIDevice *lpc; + DeviceState *lpc_dev; + BusState *idebus[MAX_SATA_PORTS]; + ISADevice *rtc_state; + MemoryRegion *system_io = get_system_io(); + MemoryRegion *pci_memory; + MemoryRegion *rom_memory; + MemoryRegion *ram_memory; + GSIState *gsi_state; + ISABus *isa_bus; + int i; + ICH9LPCState *ich9_lpc; + PCIDevice *ahci; + ram_addr_t lowmem; + DriveInfo *hd[MAX_SATA_PORTS]; + MachineClass *mc = MACHINE_GET_CLASS(machine); + bool acpi_pcihp; + bool keep_pci_slot_hpc; + + /* Check whether RAM fits below 4G (leaving 1/2 GByte for IO memory + * and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping + * also known as MMCFG). + * If it doesn't, we need to split it in chunks below and above 4G. + * In any case, try to make sure that guest addresses aligned at + * 1G boundaries get mapped to host addresses aligned at 1G boundaries. + */ + if (machine->ram_size >= 0xb0000000) { + lowmem = 0x80000000; + } else { + lowmem = 0xb0000000; + } + + /* Handle the machine opt max-ram-below-4g. It is basically doing + * min(qemu limit, user limit). 
+ */ + if (!pcms->max_ram_below_4g) { + pcms->max_ram_below_4g = 4 * GiB; + } + if (lowmem > pcms->max_ram_below_4g) { + lowmem = pcms->max_ram_below_4g; + if (machine->ram_size - lowmem > lowmem && + lowmem & (1 * GiB - 1)) { + warn_report("There is possibly poor performance as the ram size " + " (0x%" PRIx64 ") is more then twice the size of" + " max-ram-below-4g (%"PRIu64") and" + " max-ram-below-4g is not a multiple of 1G.", + (uint64_t)machine->ram_size, pcms->max_ram_below_4g); + } + } + + if (machine->ram_size >= lowmem) { + x86ms->above_4g_mem_size = machine->ram_size - lowmem; + x86ms->below_4g_mem_size = lowmem; + } else { + x86ms->above_4g_mem_size = 0; + x86ms->below_4g_mem_size = machine->ram_size; + } + + pc_machine_init_sgx_epc(pcms); + x86_cpus_init(x86ms, pcmc->default_cpu_version); + + kvmclock_create(pcmc->kvmclock_create_always); + + /* pci enabled */ + if (pcmc->pci_enabled) { + pci_memory = g_new(MemoryRegion, 1); + memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); + rom_memory = pci_memory; + } else { + pci_memory = NULL; + rom_memory = get_system_memory(); + } + + pc_guest_info_init(pcms); + + if (pcmc->smbios_defaults) { + /* These values are guest ABI, do not change */ + smbios_set_defaults("QEMU", "Standard PC (Q35 + ICH9, 2009)", + mc->name, pcmc->smbios_legacy_mode, + pcmc->smbios_uuid_encoded, + SMBIOS_ENTRY_POINT_21); + } + + /* allocate ram and load rom/bios */ + pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory); + + /* create pci host bus */ + q35_host = Q35_HOST_DEVICE(qdev_new(TYPE_Q35_HOST_DEVICE)); + + object_property_add_child(qdev_get_machine(), "q35", OBJECT(q35_host)); + object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_RAM_MEM, + OBJECT(ram_memory), NULL); + object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_PCI_MEM, + OBJECT(pci_memory), NULL); + object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_SYSTEM_MEM, + OBJECT(get_system_memory()), NULL); + object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_IO_MEM, + OBJECT(system_io), NULL); + object_property_set_int(OBJECT(q35_host), PCI_HOST_BELOW_4G_MEM_SIZE, + x86ms->below_4g_mem_size, NULL); + object_property_set_int(OBJECT(q35_host), PCI_HOST_ABOVE_4G_MEM_SIZE, + x86ms->above_4g_mem_size, NULL); + /* pci */ + sysbus_realize_and_unref(SYS_BUS_DEVICE(q35_host), &error_fatal); + phb = PCI_HOST_BRIDGE(q35_host); + host_bus = phb->bus; + /* create ISA bus */ + lpc = pci_create_simple_multifunction(host_bus, PCI_DEVFN(ICH9_LPC_DEV, + ICH9_LPC_FUNC), true, + TYPE_ICH9_LPC_DEVICE); + + object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, + TYPE_HOTPLUG_HANDLER, + (Object **)&x86ms->acpi_dev, + object_property_allow_set_link, + OBJ_PROP_LINK_STRONG); + object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, + OBJECT(lpc), &error_abort); + + acpi_pcihp = object_property_get_bool(OBJECT(lpc), + ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, + NULL); + + keep_pci_slot_hpc = object_property_get_bool(OBJECT(lpc), + "x-keep-pci-slot-hpc", + NULL); + + if (!keep_pci_slot_hpc && acpi_pcihp) { + object_register_sugar_prop(TYPE_PCIE_SLOT, "x-native-hotplug", + "false", true); + } + + /* irq lines */ + gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled); + + ich9_lpc = ICH9_LPC_DEVICE(lpc); + lpc_dev = DEVICE(lpc); + for (i = 0; i < GSI_NUM_PINS; i++) { + qdev_connect_gpio_out_named(lpc_dev, ICH9_GPIO_GSI, i, x86ms->gsi[i]); + } + pci_bus_irqs(host_bus, ich9_lpc_set_irq, ich9_lpc_map_irq, ich9_lpc, + ICH9_LPC_NB_PIRQS); + 
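+    /*
+     * Sketch of the IRQ plumbing set up here (callback shapes assumed from
+     * the generic pci_map_irq_fn / pci_set_irq_fn typedefs):
+     *
+     *   int  ich9_lpc_map_irq(PCIDevice *pci_dev, int pin);  maps a device
+     *        INTx pin to one of the ICH9_LPC_NB_PIRQS PIRQ lines
+     *   void ich9_lpc_set_irq(void *opaque, int pirq, int level);  drives
+     *        the selected PIRQ line
+     *
+     * The route function installed below lets callers (e.g. KVM INTx
+     * routing) ask which GSI a given INTx pin ultimately reaches.
+     */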
pci_bus_set_route_irq_fn(host_bus, ich9_route_intx_pin_to_irq); + isa_bus = ich9_lpc->isa_bus; + + pc_i8259_create(isa_bus, gsi_state->i8259_irq); + + if (pcmc->pci_enabled) { + ioapic_init_gsi(gsi_state, "q35"); + } + + if (tcg_enabled()) { + x86_register_ferr_irq(x86ms->gsi[13]); + } + + assert(pcms->vmport != ON_OFF_AUTO__MAX); + if (pcms->vmport == ON_OFF_AUTO_AUTO) { + pcms->vmport = ON_OFF_AUTO_ON; + } + + /* init basic PC hardware */ + pc_basic_device_init(pcms, isa_bus, x86ms->gsi, &rtc_state, !mc->no_floppy, + 0xff0104); + + /* connect pm stuff to lpc */ + ich9_lpc_pm_init(lpc, x86_machine_is_smm_enabled(x86ms)); + + if (pcms->sata_enabled) { + /* ahci and SATA device, for q35 1 ahci controller is built-in */ + ahci = pci_create_simple_multifunction(host_bus, + PCI_DEVFN(ICH9_SATA1_DEV, + ICH9_SATA1_FUNC), + true, "ich9-ahci"); + idebus[0] = qdev_get_child_bus(&ahci->qdev, "ide.0"); + idebus[1] = qdev_get_child_bus(&ahci->qdev, "ide.1"); + g_assert(MAX_SATA_PORTS == ahci_get_num_ports(ahci)); + ide_drive_get(hd, ahci_get_num_ports(ahci)); + ahci_ide_create_devs(ahci, hd); + } else { + idebus[0] = idebus[1] = NULL; + } + + if (machine_usb(machine)) { + /* Should we create 6 UHCI according to ich9 spec? */ + ehci_create_ich9_with_companions(host_bus, 0x1d); + } + + if (pcms->smbus_enabled) { + /* TODO: Populate SPD eeprom data. */ + pcms->smbus = ich9_smb_init(host_bus, + PCI_DEVFN(ICH9_SMB_DEV, ICH9_SMB_FUNC), + 0xb100); + smbus_eeprom_init(pcms->smbus, 8, NULL, 0); + } + + pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state); + + /* the rest devices to which pci devfn is automatically assigned */ + pc_vga_init(isa_bus, host_bus); + pc_nic_init(pcmc, isa_bus, host_bus); + + if (machine->nvdimms_state->is_enabled) { + nvdimm_init_acpi_state(machine->nvdimms_state, system_io, + x86_nvdimm_acpi_dsmio, + x86ms->fw_cfg, OBJECT(pcms)); + } +} + +#define DEFINE_Q35_MACHINE(suffix, name, compatfn, optionfn) \ + static void pc_init_##suffix(MachineState *machine) \ + { \ + void (*compat)(MachineState *m) = (compatfn); \ + if (compat) { \ + compat(machine); \ + } \ + pc_q35_init(machine); \ + } \ + DEFINE_PC_MACHINE(suffix, name, pc_init_##suffix, optionfn) + + +static void pc_q35_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pcmc->default_nic_model = "e1000e"; + pcmc->pci_root_uid = 0; + + m->family = "pc_q35"; + m->desc = "Standard PC (Q35 + ICH9, 2009)"; + m->units_per_default_bus = 1; + m->default_machine_opts = "firmware=bios-256k.bin"; + m->default_display = "std"; + m->default_kernel_irqchip_split = false; + m->no_floppy = 1; + machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); + m->max_cpus = 288; +} + +static void pc_q35_6_2_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_q35_machine_options(m); + m->alias = "q35"; + pcmc->default_cpu_version = 1; +} + +DEFINE_Q35_MACHINE(v6_2, "pc-q35-6.2", NULL, + pc_q35_6_2_machine_options); + +static void pc_q35_6_1_machine_options(MachineClass *m) +{ + pc_q35_6_2_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_6_1, hw_compat_6_1_len); + compat_props_add(m->compat_props, pc_compat_6_1, pc_compat_6_1_len); + m->smp_props.prefer_sockets = true; +} + +DEFINE_Q35_MACHINE(v6_1, "pc-q35-6.1", NULL, + pc_q35_6_1_machine_options); + 
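+/*
+ * The versioned pc-q35 machine types follow one pattern: each
+ * pc_q35_X_Y_machine_options() first calls the next newer options function,
+ * then stacks extra hw_compat_X_Y and pc_compat_X_Y properties on top, so an
+ * older machine type accumulates every compatibility tweak added since its
+ * release.  A compat table is just an array of GlobalProperty overrides; a
+ * hypothetical entry (names invented purely for illustration) would look
+ * like:
+ *
+ *   static GlobalProperty pc_compat_example[] = {
+ *       { "example-device", "example-property", "off" },
+ *   };
+ *   compat_props_add(m->compat_props, pc_compat_example,
+ *                    ARRAY_SIZE(pc_compat_example));
+ */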
+static void pc_q35_6_0_machine_options(MachineClass *m) +{ + pc_q35_6_1_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_6_0, hw_compat_6_0_len); + compat_props_add(m->compat_props, pc_compat_6_0, pc_compat_6_0_len); +} + +DEFINE_Q35_MACHINE(v6_0, "pc-q35-6.0", NULL, + pc_q35_6_0_machine_options); + +static void pc_q35_5_2_machine_options(MachineClass *m) +{ + pc_q35_6_0_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_5_2, hw_compat_5_2_len); + compat_props_add(m->compat_props, pc_compat_5_2, pc_compat_5_2_len); +} + +DEFINE_Q35_MACHINE(v5_2, "pc-q35-5.2", NULL, + pc_q35_5_2_machine_options); + +static void pc_q35_5_1_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_q35_5_2_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_5_1, hw_compat_5_1_len); + compat_props_add(m->compat_props, pc_compat_5_1, pc_compat_5_1_len); + pcmc->kvmclock_create_always = false; + pcmc->pci_root_uid = 1; +} + +DEFINE_Q35_MACHINE(v5_1, "pc-q35-5.1", NULL, + pc_q35_5_1_machine_options); + +static void pc_q35_5_0_machine_options(MachineClass *m) +{ + pc_q35_5_1_machine_options(m); + m->alias = NULL; + m->numa_mem_supported = true; + compat_props_add(m->compat_props, hw_compat_5_0, hw_compat_5_0_len); + compat_props_add(m->compat_props, pc_compat_5_0, pc_compat_5_0_len); + m->auto_enable_numa_with_memdev = false; +} + +DEFINE_Q35_MACHINE(v5_0, "pc-q35-5.0", NULL, + pc_q35_5_0_machine_options); + +static void pc_q35_4_2_machine_options(MachineClass *m) +{ + pc_q35_5_0_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_4_2, hw_compat_4_2_len); + compat_props_add(m->compat_props, pc_compat_4_2, pc_compat_4_2_len); +} + +DEFINE_Q35_MACHINE(v4_2, "pc-q35-4.2", NULL, + pc_q35_4_2_machine_options); + +static void pc_q35_4_1_machine_options(MachineClass *m) +{ + pc_q35_4_2_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_4_1, hw_compat_4_1_len); + compat_props_add(m->compat_props, pc_compat_4_1, pc_compat_4_1_len); +} + +DEFINE_Q35_MACHINE(v4_1, "pc-q35-4.1", NULL, + pc_q35_4_1_machine_options); + +static void pc_q35_4_0_1_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_q35_4_1_machine_options(m); + m->alias = NULL; + pcmc->default_cpu_version = CPU_VERSION_LEGACY; + /* + * This is the default machine for the 4.0-stable branch. It is basically + * a 4.0 that doesn't use split irqchip by default. It MUST hence apply the + * 4.0 compat props. 
+ */ + compat_props_add(m->compat_props, hw_compat_4_0, hw_compat_4_0_len); + compat_props_add(m->compat_props, pc_compat_4_0, pc_compat_4_0_len); +} + +DEFINE_Q35_MACHINE(v4_0_1, "pc-q35-4.0.1", NULL, + pc_q35_4_0_1_machine_options); + +static void pc_q35_4_0_machine_options(MachineClass *m) +{ + pc_q35_4_0_1_machine_options(m); + m->default_kernel_irqchip_split = true; + m->alias = NULL; + /* Compat props are applied by the 4.0.1 machine */ +} + +DEFINE_Q35_MACHINE(v4_0, "pc-q35-4.0", NULL, + pc_q35_4_0_machine_options); + +static void pc_q35_3_1_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_q35_4_0_machine_options(m); + m->default_kernel_irqchip_split = false; + pcmc->do_not_add_smb_acpi = true; + m->smbus_no_migration_support = true; + m->alias = NULL; + pcmc->pvh_enabled = false; + compat_props_add(m->compat_props, hw_compat_3_1, hw_compat_3_1_len); + compat_props_add(m->compat_props, pc_compat_3_1, pc_compat_3_1_len); +} + +DEFINE_Q35_MACHINE(v3_1, "pc-q35-3.1", NULL, + pc_q35_3_1_machine_options); + +static void pc_q35_3_0_machine_options(MachineClass *m) +{ + pc_q35_3_1_machine_options(m); + compat_props_add(m->compat_props, hw_compat_3_0, hw_compat_3_0_len); + compat_props_add(m->compat_props, pc_compat_3_0, pc_compat_3_0_len); +} + +DEFINE_Q35_MACHINE(v3_0, "pc-q35-3.0", NULL, + pc_q35_3_0_machine_options); + +static void pc_q35_2_12_machine_options(MachineClass *m) +{ + pc_q35_3_0_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_12, hw_compat_2_12_len); + compat_props_add(m->compat_props, pc_compat_2_12, pc_compat_2_12_len); +} + +DEFINE_Q35_MACHINE(v2_12, "pc-q35-2.12", NULL, + pc_q35_2_12_machine_options); + +static void pc_q35_2_11_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_q35_2_12_machine_options(m); + pcmc->default_nic_model = "e1000"; + compat_props_add(m->compat_props, hw_compat_2_11, hw_compat_2_11_len); + compat_props_add(m->compat_props, pc_compat_2_11, pc_compat_2_11_len); +} + +DEFINE_Q35_MACHINE(v2_11, "pc-q35-2.11", NULL, + pc_q35_2_11_machine_options); + +static void pc_q35_2_10_machine_options(MachineClass *m) +{ + pc_q35_2_11_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_10, hw_compat_2_10_len); + compat_props_add(m->compat_props, pc_compat_2_10, pc_compat_2_10_len); + m->auto_enable_numa_with_memhp = false; +} + +DEFINE_Q35_MACHINE(v2_10, "pc-q35-2.10", NULL, + pc_q35_2_10_machine_options); + +static void pc_q35_2_9_machine_options(MachineClass *m) +{ + pc_q35_2_10_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_9, hw_compat_2_9_len); + compat_props_add(m->compat_props, pc_compat_2_9, pc_compat_2_9_len); +} + +DEFINE_Q35_MACHINE(v2_9, "pc-q35-2.9", NULL, + pc_q35_2_9_machine_options); + +static void pc_q35_2_8_machine_options(MachineClass *m) +{ + pc_q35_2_9_machine_options(m); + compat_props_add(m->compat_props, hw_compat_2_8, hw_compat_2_8_len); + compat_props_add(m->compat_props, pc_compat_2_8, pc_compat_2_8_len); +} + +DEFINE_Q35_MACHINE(v2_8, "pc-q35-2.8", NULL, + pc_q35_2_8_machine_options); + +static void pc_q35_2_7_machine_options(MachineClass *m) +{ + pc_q35_2_8_machine_options(m); + m->max_cpus = 255; + compat_props_add(m->compat_props, hw_compat_2_7, hw_compat_2_7_len); + compat_props_add(m->compat_props, pc_compat_2_7, pc_compat_2_7_len); +} + +DEFINE_Q35_MACHINE(v2_7, "pc-q35-2.7", NULL, + pc_q35_2_7_machine_options); + +static void pc_q35_2_6_machine_options(MachineClass *m) +{ + X86MachineClass 
*x86mc = X86_MACHINE_CLASS(m); + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_q35_2_7_machine_options(m); + pcmc->legacy_cpu_hotplug = true; + x86mc->fwcfg_dma_enabled = false; + compat_props_add(m->compat_props, hw_compat_2_6, hw_compat_2_6_len); + compat_props_add(m->compat_props, pc_compat_2_6, pc_compat_2_6_len); +} + +DEFINE_Q35_MACHINE(v2_6, "pc-q35-2.6", NULL, + pc_q35_2_6_machine_options); + +static void pc_q35_2_5_machine_options(MachineClass *m) +{ + X86MachineClass *x86mc = X86_MACHINE_CLASS(m); + + pc_q35_2_6_machine_options(m); + x86mc->save_tsc_khz = false; + m->legacy_fw_cfg_order = 1; + compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len); + compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len); +} + +DEFINE_Q35_MACHINE(v2_5, "pc-q35-2.5", NULL, + pc_q35_2_5_machine_options); + +static void pc_q35_2_4_machine_options(MachineClass *m) +{ + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + pc_q35_2_5_machine_options(m); + m->hw_version = "2.4.0"; + pcmc->broken_reserved_end = true; + compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len); + compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len); +} + +DEFINE_Q35_MACHINE(v2_4, "pc-q35-2.4", NULL, + pc_q35_2_4_machine_options); diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c new file mode 100644 index 000000000..c8b17af95 --- /dev/null +++ b/hw/i386/pc_sysfw.c @@ -0,0 +1,262 @@ +/* + * QEMU PC System Firmware + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2011-2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qapi/error.h" +#include "sysemu/block-backend.h" +#include "qemu/error-report.h" +#include "qemu/option.h" +#include "qemu/units.h" +#include "hw/sysbus.h" +#include "hw/i386/x86.h" +#include "hw/i386/pc.h" +#include "hw/loader.h" +#include "hw/qdev-properties.h" +#include "hw/block/flash.h" +#include "sysemu/kvm.h" +#include "sev.h" + +#define FLASH_SECTOR_SIZE 4096 + +static void pc_isa_bios_init(MemoryRegion *rom_memory, + MemoryRegion *flash_mem, + int ram_size) +{ + int isa_bios_size; + MemoryRegion *isa_bios; + uint64_t flash_size; + void *flash_ptr, *isa_bios_ptr; + + flash_size = memory_region_size(flash_mem); + + /* map the last 128KB of the BIOS in ISA space */ + isa_bios_size = MIN(flash_size, 128 * KiB); + isa_bios = g_malloc(sizeof(*isa_bios)); + memory_region_init_ram(isa_bios, NULL, "isa-bios", isa_bios_size, + &error_fatal); + memory_region_add_subregion_overlap(rom_memory, + 0x100000 - isa_bios_size, + isa_bios, + 1); + + /* copy ISA rom image from top of flash memory */ + flash_ptr = memory_region_get_ram_ptr(flash_mem); + isa_bios_ptr = memory_region_get_ram_ptr(isa_bios); + memcpy(isa_bios_ptr, + ((uint8_t*)flash_ptr) + (flash_size - isa_bios_size), + isa_bios_size); + + memory_region_set_readonly(isa_bios, true); +} + +static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms, + const char *name, + const char *alias_prop_name) +{ + DeviceState *dev = qdev_new(TYPE_PFLASH_CFI01); + + qdev_prop_set_uint64(dev, "sector-length", FLASH_SECTOR_SIZE); + qdev_prop_set_uint8(dev, "width", 1); + qdev_prop_set_string(dev, "name", name); + object_property_add_child(OBJECT(pcms), name, OBJECT(dev)); + object_property_add_alias(OBJECT(pcms), alias_prop_name, + OBJECT(dev), "drive"); + /* + * The returned reference is tied to the child property and + * will be removed with object_unparent. + */ + object_unref(OBJECT(dev)); + return PFLASH_CFI01(dev); +} + +void pc_system_flash_create(PCMachineState *pcms) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + + if (pcmc->pci_enabled) { + pcms->flash[0] = pc_pflash_create(pcms, "system.flash0", + "pflash0"); + pcms->flash[1] = pc_pflash_create(pcms, "system.flash1", + "pflash1"); + } +} + +void pc_system_flash_cleanup_unused(PCMachineState *pcms) +{ + char *prop_name; + int i; + Object *dev_obj; + + assert(PC_MACHINE_GET_CLASS(pcms)->pci_enabled); + + for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { + dev_obj = OBJECT(pcms->flash[i]); + if (!object_property_get_bool(dev_obj, "realized", &error_abort)) { + prop_name = g_strdup_printf("pflash%d", i); + object_property_del(OBJECT(pcms), prop_name); + g_free(prop_name); + object_unparent(dev_obj); + pcms->flash[i] = NULL; + } + } +} + +/* + * Map the pcms->flash[] from 4GiB downward, and realize. + * Map them in descending order, i.e. pcms->flash[0] at the top, + * without gaps. + * Stop at the first pcms->flash[0] lacking a block backend. + * Set each flash's size from its block backend. Fatal error if the + * size isn't a non-zero multiple of 4KiB, or the total size exceeds + * pcms->max_fw_size. + * + * If pcms->flash[0] has a block backend, its memory is passed to + * pc_isa_bios_init(). Merging several flash devices for isa-bios is + * not supported. 
+ */ +static void pc_system_flash_map(PCMachineState *pcms, + MemoryRegion *rom_memory) +{ + hwaddr total_size = 0; + int i; + BlockBackend *blk; + int64_t size; + PFlashCFI01 *system_flash; + MemoryRegion *flash_mem; + void *flash_ptr; + int flash_size; + int ret; + + assert(PC_MACHINE_GET_CLASS(pcms)->pci_enabled); + + for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { + system_flash = pcms->flash[i]; + blk = pflash_cfi01_get_blk(system_flash); + if (!blk) { + break; + } + size = blk_getlength(blk); + if (size < 0) { + error_report("can't get size of block device %s: %s", + blk_name(blk), strerror(-size)); + exit(1); + } + if (size == 0 || !QEMU_IS_ALIGNED(size, FLASH_SECTOR_SIZE)) { + error_report("system firmware block device %s has invalid size " + "%" PRId64, + blk_name(blk), size); + info_report("its size must be a non-zero multiple of 0x%x", + FLASH_SECTOR_SIZE); + exit(1); + } + if ((hwaddr)size != size + || total_size > HWADDR_MAX - size + || total_size + size > pcms->max_fw_size) { + error_report("combined size of system firmware exceeds " + "%" PRIu64 " bytes", + pcms->max_fw_size); + exit(1); + } + + total_size += size; + qdev_prop_set_uint32(DEVICE(system_flash), "num-blocks", + size / FLASH_SECTOR_SIZE); + sysbus_realize_and_unref(SYS_BUS_DEVICE(system_flash), &error_fatal); + sysbus_mmio_map(SYS_BUS_DEVICE(system_flash), 0, + 0x100000000ULL - total_size); + + if (i == 0) { + flash_mem = pflash_cfi01_get_memory(system_flash); + pc_isa_bios_init(rom_memory, flash_mem, size); + + /* Encrypt the pflash boot ROM */ + if (sev_enabled()) { + flash_ptr = memory_region_get_ram_ptr(flash_mem); + flash_size = memory_region_size(flash_mem); + /* + * OVMF places a GUIDed structures in the flash, so + * search for them + */ + pc_system_parse_ovmf_flash(flash_ptr, flash_size); + + ret = sev_es_save_reset_vector(flash_ptr, flash_size); + if (ret) { + error_report("failed to locate and/or save reset vector"); + exit(1); + } + + sev_encrypt_flash(flash_ptr, flash_size, &error_fatal); + } + } + } +} + +void pc_system_firmware_init(PCMachineState *pcms, + MemoryRegion *rom_memory) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + int i; + BlockBackend *pflash_blk[ARRAY_SIZE(pcms->flash)]; + + if (!pcmc->pci_enabled) { + x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, true); + return; + } + + /* Map legacy -drive if=pflash to machine properties */ + for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { + pflash_cfi01_legacy_drive(pcms->flash[i], + drive_get(IF_PFLASH, 0, i)); + pflash_blk[i] = pflash_cfi01_get_blk(pcms->flash[i]); + } + + /* Reject gaps */ + for (i = 1; i < ARRAY_SIZE(pcms->flash); i++) { + if (pflash_blk[i] && !pflash_blk[i - 1]) { + error_report("pflash%d requires pflash%d", i, i - 1); + exit(1); + } + } + + if (!pflash_blk[0]) { + /* Machine property pflash0 not set, use ROM mode */ + x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, false); + } else { + if (kvm_enabled() && !kvm_readonly_mem_enabled()) { + /* + * Older KVM cannot execute from device memory. So, flash + * memory cannot be used unless the readonly memory kvm + * capability is present. 
+ */ + error_report("pflash with kvm requires KVM readonly memory support"); + exit(1); + } + + pc_system_flash_map(pcms, rom_memory); + } + + pc_system_flash_cleanup_unused(pcms); +} diff --git a/hw/i386/pc_sysfw_ovmf-stubs.c b/hw/i386/pc_sysfw_ovmf-stubs.c new file mode 100644 index 000000000..aabe78b27 --- /dev/null +++ b/hw/i386/pc_sysfw_ovmf-stubs.c @@ -0,0 +1,26 @@ +/* + * QEMU PC System Firmware (OVMF stubs) + * + * Copyright (c) 2021 Red Hat, Inc. + * + * Author: + * Philippe Mathieu-Daudé <philmd@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "hw/i386/pc.h" + +bool pc_system_ovmf_table_find(const char *entry, uint8_t **data, int *data_len) +{ + g_assert_not_reached(); +} + +void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size) +{ + g_assert_not_reached(); +} diff --git a/hw/i386/pc_sysfw_ovmf.c b/hw/i386/pc_sysfw_ovmf.c new file mode 100644 index 000000000..f4dd92c58 --- /dev/null +++ b/hw/i386/pc_sysfw_ovmf.c @@ -0,0 +1,151 @@ +/* + * QEMU PC System Firmware (OVMF specific) + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2011-2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "hw/i386/pc.h" +#include "cpu.h" + +#define OVMF_TABLE_FOOTER_GUID "96b582de-1fb2-45f7-baea-a366c55a082d" + +static bool ovmf_flash_parsed; +static uint8_t *ovmf_table; +static int ovmf_table_len; + +void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size) +{ + uint8_t *ptr; + QemuUUID guid; + int tot_len; + + /* should only be called once */ + if (ovmf_flash_parsed) { + return; + } + + ovmf_flash_parsed = true; + + if (flash_size < TARGET_PAGE_SIZE) { + return; + } + + /* + * if this is OVMF there will be a table footer + * guid 48 bytes before the end of the flash file. If it's + * not found, silently abort the flash parsing. 
+ */ + qemu_uuid_parse(OVMF_TABLE_FOOTER_GUID, &guid); + guid = qemu_uuid_bswap(guid); /* guids are LE */ + ptr = flash_ptr + flash_size - 48; + if (!qemu_uuid_is_equal((QemuUUID *)ptr, &guid)) { + return; + } + + /* if found, just before is two byte table length */ + ptr -= sizeof(uint16_t); + tot_len = le16_to_cpu(*(uint16_t *)ptr) - sizeof(guid) - sizeof(uint16_t); + + if (tot_len <= 0) { + return; + } + + ovmf_table = g_malloc(tot_len); + ovmf_table_len = tot_len; + + /* + * ptr is the foot of the table, so copy it all to the newly + * allocated ovmf_table and then set the ovmf_table pointer + * to the table foot + */ + memcpy(ovmf_table, ptr - tot_len, tot_len); + ovmf_table += tot_len; +} + +/** + * pc_system_ovmf_table_find - Find the data associated with an entry in OVMF's + * reset vector GUIDed table. + * + * @entry: GUID string of the entry to lookup + * @data: Filled with a pointer to the entry's value (if not NULL) + * @data_len: Filled with the length of the entry's value (if not NULL). Pass + * NULL here if the length of data is known. + * + * Return: true if the entry was found in the OVMF table; false otherwise. + */ +bool pc_system_ovmf_table_find(const char *entry, uint8_t **data, + int *data_len) +{ + uint8_t *ptr = ovmf_table; + int tot_len = ovmf_table_len; + QemuUUID entry_guid; + + assert(ovmf_flash_parsed); + + if (qemu_uuid_parse(entry, &entry_guid) < 0) { + return false; + } + + if (!ptr) { + return false; + } + + entry_guid = qemu_uuid_bswap(entry_guid); /* guids are LE */ + while (tot_len >= sizeof(QemuUUID) + sizeof(uint16_t)) { + int len; + QemuUUID *guid; + + /* + * The data structure is + * arbitrary length data + * 2 byte length of entire entry + * 16 byte guid + */ + guid = (QemuUUID *)(ptr - sizeof(QemuUUID)); + len = le16_to_cpu(*(uint16_t *)(ptr - sizeof(QemuUUID) - + sizeof(uint16_t))); + + /* + * just in case the table is corrupt, wouldn't want to spin in + * the zero case + */ + if (len < sizeof(QemuUUID) + sizeof(uint16_t)) { + return false; + } else if (len > tot_len) { + return false; + } + + ptr -= len; + tot_len -= len; + if (qemu_uuid_is_equal(guid, &entry_guid)) { + if (data) { + *data = ptr; + } + if (data_len) { + *data_len = len - sizeof(QemuUUID) - sizeof(uint16_t); + } + return true; + } + } + return false; +} diff --git a/hw/i386/port92.c b/hw/i386/port92.c new file mode 100644 index 000000000..e1379a4f9 --- /dev/null +++ b/hw/i386/port92.c @@ -0,0 +1,127 @@ +/* + * QEMU I/O port 0x92 (System Control Port A, to handle Fast Gate A20) + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * SPDX-License-Identifier: MIT + */ + +#include "qemu/osdep.h" +#include "sysemu/runstate.h" +#include "migration/vmstate.h" +#include "hw/irq.h" +#include "hw/i386/pc.h" +#include "trace.h" +#include "qom/object.h" + +OBJECT_DECLARE_SIMPLE_TYPE(Port92State, PORT92) + +struct Port92State { + ISADevice parent_obj; + + MemoryRegion io; + uint8_t outport; + qemu_irq a20_out; +}; + +static void port92_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ + Port92State *s = opaque; + int oldval = s->outport; + + trace_port92_write(val); + s->outport = val; + qemu_set_irq(s->a20_out, (val >> 1) & 1); + if ((val & 1) && !(oldval & 1)) { + qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); + } +} + +static uint64_t port92_read(void *opaque, hwaddr addr, + unsigned size) +{ + Port92State *s = opaque; + uint32_t ret; + + ret = s->outport; + trace_port92_read(ret); + + return ret; +} + +static const VMStateDescription vmstate_port92_isa = { + .name 
= "port92", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT8(outport, Port92State), + VMSTATE_END_OF_LIST() + } +}; + +static void port92_reset(DeviceState *d) +{ + Port92State *s = PORT92(d); + + s->outport &= ~1; +} + +static const MemoryRegionOps port92_ops = { + .read = port92_read, + .write = port92_write, + .impl = { + .min_access_size = 1, + .max_access_size = 1, + }, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void port92_initfn(Object *obj) +{ + Port92State *s = PORT92(obj); + + memory_region_init_io(&s->io, OBJECT(s), &port92_ops, s, "port92", 1); + + s->outport = 0; + + qdev_init_gpio_out_named(DEVICE(obj), &s->a20_out, PORT92_A20_LINE, 1); +} + +static void port92_realizefn(DeviceState *dev, Error **errp) +{ + ISADevice *isadev = ISA_DEVICE(dev); + Port92State *s = PORT92(dev); + + isa_register_ioport(isadev, &s->io, 0x92); +} + +static void port92_class_initfn(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = port92_realizefn; + dc->reset = port92_reset; + dc->vmsd = &vmstate_port92_isa; + /* + * Reason: unlike ordinary ISA devices, this one needs additional + * wiring: its A20 output line needs to be wired up with + * qdev_connect_gpio_out_named(). + */ + dc->user_creatable = false; +} + +static const TypeInfo port92_info = { + .name = TYPE_PORT92, + .parent = TYPE_ISA_DEVICE, + .instance_size = sizeof(Port92State), + .instance_init = port92_initfn, + .class_init = port92_class_initfn, +}; + +static void port92_register_types(void) +{ + type_register_static(&port92_info); +} + +type_init(port92_register_types) diff --git a/hw/i386/sgx-epc.c b/hw/i386/sgx-epc.c new file mode 100644 index 000000000..e508827e7 --- /dev/null +++ b/hw/i386/sgx-epc.c @@ -0,0 +1,185 @@ +/* + * SGX EPC device + * + * Copyright (C) 2019 Intel Corporation + * + * Authors: + * Sean Christopherson <sean.j.christopherson@intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "hw/i386/pc.h" +#include "hw/i386/sgx-epc.h" +#include "hw/mem/memory-device.h" +#include "hw/qdev-properties.h" +#include "qapi/error.h" +#include "qapi/visitor.h" +#include "target/i386/cpu.h" +#include "exec/address-spaces.h" + +static Property sgx_epc_properties[] = { + DEFINE_PROP_UINT64(SGX_EPC_ADDR_PROP, SGXEPCDevice, addr, 0), + DEFINE_PROP_LINK(SGX_EPC_MEMDEV_PROP, SGXEPCDevice, hostmem, + TYPE_MEMORY_BACKEND_EPC, HostMemoryBackendEpc *), + DEFINE_PROP_END_OF_LIST(), +}; + +static void sgx_epc_get_size(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + Error *local_err = NULL; + uint64_t value; + + value = memory_device_get_region_size(MEMORY_DEVICE(obj), &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + visit_type_uint64(v, name, &value, errp); +} + +static void sgx_epc_init(Object *obj) +{ + object_property_add(obj, SGX_EPC_SIZE_PROP, "uint64", sgx_epc_get_size, + NULL, NULL, NULL); +} + +static void sgx_epc_realize(DeviceState *dev, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + X86MachineState *x86ms = X86_MACHINE(pcms); + MemoryDeviceState *md = MEMORY_DEVICE(dev); + SGXEPCState *sgx_epc = &pcms->sgx_epc; + SGXEPCDevice *epc = SGX_EPC(dev); + HostMemoryBackend *hostmem; + const char *path; + + if (x86ms->boot_cpus != 0) { + error_setg(errp, "'" TYPE_SGX_EPC "' can't be created after vCPUs," + "e.g. 
via -device"); + return; + } + + if (!epc->hostmem) { + error_setg(errp, "'" SGX_EPC_MEMDEV_PROP "' property is not set"); + return; + } + hostmem = MEMORY_BACKEND(epc->hostmem); + if (host_memory_backend_is_mapped(hostmem)) { + path = object_get_canonical_path_component(OBJECT(hostmem)); + error_setg(errp, "can't use already busy memdev: %s", path); + return; + } + + epc->addr = sgx_epc->base + sgx_epc->size; + + memory_region_add_subregion(&sgx_epc->mr, epc->addr - sgx_epc->base, + host_memory_backend_get_memory(hostmem)); + + host_memory_backend_set_mapped(hostmem, true); + + sgx_epc->sections = g_renew(SGXEPCDevice *, sgx_epc->sections, + sgx_epc->nr_sections + 1); + sgx_epc->sections[sgx_epc->nr_sections++] = epc; + + sgx_epc->size += memory_device_get_region_size(md, errp); +} + +static void sgx_epc_unrealize(DeviceState *dev) +{ + SGXEPCDevice *epc = SGX_EPC(dev); + HostMemoryBackend *hostmem = MEMORY_BACKEND(epc->hostmem); + + host_memory_backend_set_mapped(hostmem, false); +} + +static uint64_t sgx_epc_md_get_addr(const MemoryDeviceState *md) +{ + const SGXEPCDevice *epc = SGX_EPC(md); + + return epc->addr; +} + +static void sgx_epc_md_set_addr(MemoryDeviceState *md, uint64_t addr, + Error **errp) +{ + object_property_set_uint(OBJECT(md), SGX_EPC_ADDR_PROP, addr, errp); +} + +static uint64_t sgx_epc_md_get_plugged_size(const MemoryDeviceState *md, + Error **errp) +{ + return 0; +} + +static MemoryRegion *sgx_epc_md_get_memory_region(MemoryDeviceState *md, + Error **errp) +{ + SGXEPCDevice *epc = SGX_EPC(md); + HostMemoryBackend *hostmem; + + if (!epc->hostmem) { + error_setg(errp, "'" SGX_EPC_MEMDEV_PROP "' property must be set"); + return NULL; + } + + hostmem = MEMORY_BACKEND(epc->hostmem); + return host_memory_backend_get_memory(hostmem); +} + +static void sgx_epc_md_fill_device_info(const MemoryDeviceState *md, + MemoryDeviceInfo *info) +{ + SgxEPCDeviceInfo *se = g_new0(SgxEPCDeviceInfo, 1); + SGXEPCDevice *epc = SGX_EPC(md); + + se->memaddr = epc->addr; + se->size = object_property_get_uint(OBJECT(epc), SGX_EPC_SIZE_PROP, + NULL); + se->memdev = object_get_canonical_path(OBJECT(epc->hostmem)); + + info->u.sgx_epc.data = se; + info->type = MEMORY_DEVICE_INFO_KIND_SGX_EPC; +} + +static void sgx_epc_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(oc); + + dc->hotpluggable = false; + dc->realize = sgx_epc_realize; + dc->unrealize = sgx_epc_unrealize; + dc->desc = "SGX EPC section"; + dc->user_creatable = false; + device_class_set_props(dc, sgx_epc_properties); + + mdc->get_addr = sgx_epc_md_get_addr; + mdc->set_addr = sgx_epc_md_set_addr; + mdc->get_plugged_size = sgx_epc_md_get_plugged_size; + mdc->get_memory_region = sgx_epc_md_get_memory_region; + mdc->fill_device_info = sgx_epc_md_fill_device_info; +} + +static TypeInfo sgx_epc_info = { + .name = TYPE_SGX_EPC, + .parent = TYPE_DEVICE, + .instance_size = sizeof(SGXEPCDevice), + .instance_init = sgx_epc_init, + .class_init = sgx_epc_class_init, + .class_size = sizeof(DeviceClass), + .interfaces = (InterfaceInfo[]) { + { TYPE_MEMORY_DEVICE }, + { } + }, +}; + +static void sgx_epc_register_types(void) +{ + type_register_static(&sgx_epc_info); +} + +type_init(sgx_epc_register_types) diff --git a/hw/i386/sgx-stub.c b/hw/i386/sgx-stub.c new file mode 100644 index 000000000..c9b379e66 --- /dev/null +++ b/hw/i386/sgx-stub.c @@ -0,0 +1,34 @@ +#include "qemu/osdep.h" +#include "monitor/monitor.h" +#include "monitor/hmp-target.h" +#include "hw/i386/pc.h" 
+#include "hw/i386/sgx-epc.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-misc-target.h" + +SGXInfo *qmp_query_sgx(Error **errp) +{ + error_setg(errp, "SGX support is not compiled in"); + return NULL; +} + +SGXInfo *qmp_query_sgx_capabilities(Error **errp) +{ + error_setg(errp, "SGX support is not compiled in"); + return NULL; +} + +void hmp_info_sgx(Monitor *mon, const QDict *qdict) +{ + monitor_printf(mon, "SGX is not available in this QEMU\n"); +} + +void pc_machine_init_sgx_epc(PCMachineState *pcms) +{ + memset(&pcms->sgx_epc, 0, sizeof(SGXEPCState)); +} + +bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size) +{ + g_assert_not_reached(); +} diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c new file mode 100644 index 000000000..8fef3dd8f --- /dev/null +++ b/hw/i386/sgx.c @@ -0,0 +1,243 @@ +/* + * SGX common code + * + * Copyright (C) 2021 Intel Corporation + * + * Authors: + * Yang Zhong<yang.zhong@intel.com> + * Sean Christopherson <sean.j.christopherson@intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "hw/i386/pc.h" +#include "hw/i386/sgx-epc.h" +#include "hw/mem/memory-device.h" +#include "monitor/qdev.h" +#include "monitor/monitor.h" +#include "monitor/hmp-target.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-misc-target.h" +#include "exec/address-spaces.h" +#include "sysemu/hw_accel.h" +#include "sysemu/reset.h" +#include <sys/ioctl.h> + +#define SGX_MAX_EPC_SECTIONS 8 +#define SGX_CPUID_EPC_INVALID 0x0 + +/* A valid EPC section. */ +#define SGX_CPUID_EPC_SECTION 0x1 +#define SGX_CPUID_EPC_MASK 0xF + +#define SGX_MAGIC 0xA4 +#define SGX_IOC_VEPC_REMOVE_ALL _IO(SGX_MAGIC, 0x04) + +#define RETRY_NUM 2 + +static uint64_t sgx_calc_section_metric(uint64_t low, uint64_t high) +{ + return (low & MAKE_64BIT_MASK(12, 20)) + + ((high & MAKE_64BIT_MASK(0, 20)) << 32); +} + +static uint64_t sgx_calc_host_epc_section_size(void) +{ + uint32_t i, type; + uint32_t eax, ebx, ecx, edx; + uint64_t size = 0; + + for (i = 0; i < SGX_MAX_EPC_SECTIONS; i++) { + host_cpuid(0x12, i + 2, &eax, &ebx, &ecx, &edx); + + type = eax & SGX_CPUID_EPC_MASK; + if (type == SGX_CPUID_EPC_INVALID) { + break; + } + + if (type != SGX_CPUID_EPC_SECTION) { + break; + } + + size += sgx_calc_section_metric(ecx, edx); + } + + return size; +} + +static void sgx_epc_reset(void *opaque) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + HostMemoryBackend *hostmem; + SGXEPCDevice *epc; + int failures; + int fd, i, j, r; + static bool warned = false; + + /* + * The second pass is needed to remove SECS pages that could not + * be removed during the first. 
+ */ + for (i = 0; i < RETRY_NUM; i++) { + failures = 0; + for (j = 0; j < pcms->sgx_epc.nr_sections; j++) { + epc = pcms->sgx_epc.sections[j]; + hostmem = MEMORY_BACKEND(epc->hostmem); + fd = memory_region_get_fd(host_memory_backend_get_memory(hostmem)); + + r = ioctl(fd, SGX_IOC_VEPC_REMOVE_ALL); + if (r == -ENOTTY && !warned) { + warned = true; + warn_report("kernel does not support SGX_IOC_VEPC_REMOVE_ALL"); + warn_report("SGX might operate incorrectly in the guest after reset"); + break; + } else if (r > 0) { + /* SECS pages remain */ + failures++; + if (i == 1) { + error_report("cannot reset vEPC section %d", j); + } + } + } + if (!failures) { + break; + } + } +} + +SGXInfo *qmp_query_sgx_capabilities(Error **errp) +{ + SGXInfo *info = NULL; + uint32_t eax, ebx, ecx, edx; + + int fd = qemu_open_old("/dev/sgx_vepc", O_RDWR); + if (fd < 0) { + error_setg(errp, "SGX is not enabled in KVM"); + return NULL; + } + + info = g_new0(SGXInfo, 1); + host_cpuid(0x7, 0, &eax, &ebx, &ecx, &edx); + + info->sgx = ebx & (1U << 2) ? true : false; + info->flc = ecx & (1U << 30) ? true : false; + + host_cpuid(0x12, 0, &eax, &ebx, &ecx, &edx); + info->sgx1 = eax & (1U << 0) ? true : false; + info->sgx2 = eax & (1U << 1) ? true : false; + + info->section_size = sgx_calc_host_epc_section_size(); + + close(fd); + + return info; +} + +SGXInfo *qmp_query_sgx(Error **errp) +{ + SGXInfo *info = NULL; + X86MachineState *x86ms; + PCMachineState *pcms = + (PCMachineState *)object_dynamic_cast(qdev_get_machine(), + TYPE_PC_MACHINE); + if (!pcms) { + error_setg(errp, "SGX is only supported on PC machines"); + return NULL; + } + + x86ms = X86_MACHINE(pcms); + if (!x86ms->sgx_epc_list) { + error_setg(errp, "No EPC regions defined, SGX not available"); + return NULL; + } + + SGXEPCState *sgx_epc = &pcms->sgx_epc; + info = g_new0(SGXInfo, 1); + + info->sgx = true; + info->sgx1 = true; + info->sgx2 = true; + info->flc = true; + info->section_size = sgx_epc->size; + + return info; +} + +void hmp_info_sgx(Monitor *mon, const QDict *qdict) +{ + Error *err = NULL; + g_autoptr(SGXInfo) info = qmp_query_sgx(&err); + + if (err) { + error_report_err(err); + return; + } + monitor_printf(mon, "SGX support: %s\n", + info->sgx ? "enabled" : "disabled"); + monitor_printf(mon, "SGX1 support: %s\n", + info->sgx1 ? "enabled" : "disabled"); + monitor_printf(mon, "SGX2 support: %s\n", + info->sgx2 ? "enabled" : "disabled"); + monitor_printf(mon, "FLC support: %s\n", + info->flc ? 
"enabled" : "disabled"); + monitor_printf(mon, "size: %" PRIu64 "\n", + info->section_size); +} + +bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + SGXEPCDevice *epc; + + if (pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) { + return true; + } + + epc = pcms->sgx_epc.sections[section_nr]; + + *addr = epc->addr; + *size = memory_device_get_region_size(MEMORY_DEVICE(epc), &error_fatal); + + return false; +} + +void pc_machine_init_sgx_epc(PCMachineState *pcms) +{ + SGXEPCState *sgx_epc = &pcms->sgx_epc; + X86MachineState *x86ms = X86_MACHINE(pcms); + SgxEPCList *list = NULL; + Object *obj; + + memset(sgx_epc, 0, sizeof(SGXEPCState)); + if (!x86ms->sgx_epc_list) { + return; + } + + sgx_epc->base = 0x100000000ULL + x86ms->above_4g_mem_size; + + memory_region_init(&sgx_epc->mr, OBJECT(pcms), "sgx-epc", UINT64_MAX); + memory_region_add_subregion(get_system_memory(), sgx_epc->base, + &sgx_epc->mr); + + for (list = x86ms->sgx_epc_list; list; list = list->next) { + obj = object_new("sgx-epc"); + + /* set the memdev link with memory backend */ + object_property_parse(obj, SGX_EPC_MEMDEV_PROP, list->value->memdev, + &error_fatal); + object_property_set_bool(obj, "realized", true, &error_fatal); + object_unref(obj); + } + + if ((sgx_epc->base + sgx_epc->size) < sgx_epc->base) { + error_report("Size of all 'sgx-epc' =0x%"PRIu64" causes EPC to wrap", + sgx_epc->size); + exit(EXIT_FAILURE); + } + + memory_region_set_size(&sgx_epc->mr, sgx_epc->size); + + /* register the reset callback for sgx epc */ + qemu_register_reset(sgx_epc_reset, NULL); +} diff --git a/hw/i386/trace-events b/hw/i386/trace-events new file mode 100644 index 000000000..5bf7e52bf --- /dev/null +++ b/hw/i386/trace-events @@ -0,0 +1,121 @@ +# See docs/devel/tracing.rst for syntax documentation. 
+ +# x86-iommu.c +x86_iommu_iec_notify(bool global, uint32_t index, uint32_t mask) "Notify IEC invalidation: global=%d index=%" PRIu32 " mask=%" PRIu32 + +# intel_iommu.c +vtd_inv_desc(const char *type, uint64_t hi, uint64_t lo) "invalidate desc type %s high 0x%"PRIx64" low 0x%"PRIx64 +vtd_inv_desc_cc_domain(uint16_t domain) "context invalidate domain 0x%"PRIx16 +vtd_inv_desc_cc_global(void) "context invalidate globally" +vtd_inv_desc_cc_device(uint8_t bus, uint8_t dev, uint8_t fn) "context invalidate device %02"PRIx8":%02"PRIx8".%02"PRIx8 +vtd_inv_desc_cc_devices(uint16_t sid, uint16_t fmask) "context invalidate devices sid 0x%"PRIx16" fmask 0x%"PRIx16 +vtd_inv_desc_iotlb_global(void) "iotlb invalidate global" +vtd_inv_desc_iotlb_domain(uint16_t domain) "iotlb invalidate whole domain 0x%"PRIx16 +vtd_inv_desc_iotlb_pages(uint16_t domain, uint64_t addr, uint8_t mask) "iotlb invalidate domain 0x%"PRIx16" addr 0x%"PRIx64" mask 0x%"PRIx8 +vtd_inv_desc_wait_sw(uint64_t addr, uint32_t data) "wait invalidate status write addr 0x%"PRIx64" data 0x%"PRIx32 +vtd_inv_desc_wait_irq(const char *msg) "%s" +vtd_inv_desc_wait_write_fail(uint64_t hi, uint64_t lo) "write fail for wait desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_inv_desc_iec(uint32_t granularity, uint32_t index, uint32_t mask) "granularity 0x%"PRIx32" index 0x%"PRIx32" mask 0x%"PRIx32 +vtd_inv_qi_enable(bool enable) "enabled %d" +vtd_inv_qi_setup(uint64_t addr, int size) "addr 0x%"PRIx64" size %d" +vtd_inv_qi_head(uint16_t head) "read head %d" +vtd_inv_qi_tail(uint16_t head) "write tail %d" +vtd_inv_qi_fetch(void) "" +vtd_context_cache_reset(void) "" +vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present" +vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present" +vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16 +vtd_iotlb_page_update(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page update sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16 +vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen) "IOTLB context hit bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32 +vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32 +vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)" +vtd_fault_disabled(void) "Fault processing disabled for context entry" +vtd_replay_ce_valid(const char *mode, uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "%s: replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 +vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 +vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64 +vtd_page_walk_one_skip_unmap(uint64_t iova, 
uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64 +vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" +vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" +vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" +vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 +vtd_translate_pt(uint16_t sid, uint64_t addr) "source id 0x%"PRIu16", iova 0x%"PRIx64 +vtd_pt_enable_fast_path(uint16_t sid, bool success) "sid 0x%"PRIu16" %d" +vtd_irq_generate(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64 +vtd_reg_read(uint64_t addr, uint64_t size) "addr 0x%"PRIx64" size 0x%"PRIx64 +vtd_reg_write(uint64_t addr, uint64_t size, uint64_t val) "addr 0x%"PRIx64" size 0x%"PRIx64" value 0x%"PRIx64 +vtd_reg_dmar_root(uint64_t addr, bool scalable) "addr 0x%"PRIx64" scalable %d" +vtd_reg_ir_root(uint64_t addr, uint32_t size) "addr 0x%"PRIx64" size 0x%"PRIx32 +vtd_reg_write_gcmd(uint32_t status, uint32_t val) "status 0x%"PRIx32" value 0x%"PRIx32 +vtd_reg_write_fectl(uint32_t value) "value 0x%"PRIx32 +vtd_reg_write_iectl(uint32_t value) "value 0x%"PRIx32 +vtd_reg_ics_clear_ip(void) "" +vtd_dmar_translate(uint8_t bus, uint8_t slot, uint8_t func, uint64_t iova, uint64_t gpa, uint64_t mask) "dev %02x:%02x.%02x iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64 +vtd_dmar_enable(bool en) "enable %d" +vtd_dmar_fault(uint16_t sid, int fault, uint64_t addr, bool is_write) "sid 0x%"PRIx16" fault %d addr 0x%"PRIx64" write %d" +vtd_ir_enable(bool en) "enable %d" +vtd_ir_irte_get(int index, uint64_t lo, uint64_t hi) "index %d low 0x%"PRIx64" high 0x%"PRIx64 +vtd_ir_remap(int index, int tri, int vec, int deliver, uint32_t dest, int dest_mode) "index %d trigger %d vector %d deliver %d dest 0x%"PRIx32" mode %d" +vtd_ir_remap_type(const char *type) "%s" +vtd_ir_remap_msi(uint64_t addr, uint64_t data, uint64_t addr2, uint64_t data2) "(addr 0x%"PRIx64", data 0x%"PRIx64") -> (addr 0x%"PRIx64", data 0x%"PRIx64")" +vtd_ir_remap_msi_req(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64 +vtd_fsts_ppf(bool set) "FSTS PPF bit set to %d" +vtd_fsts_clear_ip(void) "" +vtd_frr_new(int index, uint64_t hi, uint64_t lo) "index %d high 0x%"PRIx64" low 0x%"PRIx64 +vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16 +vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)" +vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)" + +# amd_iommu.c +amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 +amdvi_cache_update(uint16_t domid, uint8_t bus, uint8_t slot, uint8_t func, uint64_t gpa, uint64_t txaddr) " update iotlb domid 0x%"PRIx16" devid: %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64 +amdvi_completion_wait_fail(uint64_t addr) "error: fail to write at address 0x%"PRIx64 +amdvi_mmio_write(const char *reg, uint64_t addr, unsigned size, uint64_t val, uint64_t offset) "%s write addr 0x%"PRIx64", size %u, val 0x%"PRIx64", offset 0x%"PRIx64 +amdvi_mmio_read(const char *reg, uint64_t addr, unsigned size, uint64_t offset) "%s read addr 0x%"PRIx64", size %u offset 0x%"PRIx64 +amdvi_mmio_read_invalid(int max, uint64_t addr, unsigned size) 
"error: addr outside region (max 0x%x): read addr 0x%" PRIx64 ", size %u" +amdvi_command_error(uint64_t status) "error: Executing commands with command buffer disabled 0x%"PRIx64 +amdvi_command_read_fail(uint64_t addr, uint32_t head) "error: fail to access memory at 0x%"PRIx64" + 0x%"PRIx32 +amdvi_command_exec(uint32_t head, uint32_t tail, uint64_t buf) "command buffer head at 0x%"PRIx32" command buffer tail at 0x%"PRIx32" command buffer base at 0x%"PRIx64 +amdvi_unhandled_command(uint8_t type) "unhandled command 0x%"PRIx8 +amdvi_intr_inval(void) "Interrupt table invalidated" +amdvi_iotlb_inval(void) "IOTLB pages invalidated" +amdvi_prefetch_pages(void) "Pre-fetch of AMD-Vi pages requested" +amdvi_pages_inval(uint16_t domid) "AMD-Vi pages for domain 0x%"PRIx16 " invalidated" +amdvi_all_inval(void) "Invalidation of all AMD-Vi cache requested " +amdvi_ppr_exec(void) "Execution of PPR queue requested " +amdvi_devtab_inval(uint8_t bus, uint8_t slot, uint8_t func) "device table entry for devid: %02x:%02x.%x invalidated" +amdvi_completion_wait(uint64_t addr, uint64_t data) "completion wait requested with store address 0x%"PRIx64" and store data 0x%"PRIx64 +amdvi_control_status(uint64_t val) "MMIO_STATUS state 0x%"PRIx64 +amdvi_iotlb_reset(void) "IOTLB exceed size limit - reset " +amdvi_dte_get_fail(uint64_t addr, uint32_t offset) "error: failed to access Device Entry devtab 0x%"PRIx64" offset 0x%"PRIx32 +amdvi_invalid_dte(uint64_t addr) "PTE entry at 0x%"PRIx64" is invalid " +amdvi_get_pte_hwerror(uint64_t addr) "hardware error eccessing PTE at addr 0x%"PRIx64 +amdvi_mode_invalid(uint8_t level, uint64_t addr)"error: translation level 0x%"PRIx8" translating addr 0x%"PRIx64 +amdvi_page_fault(uint64_t addr) "error: page fault accessing guest physical address 0x%"PRIx64 +amdvi_iotlb_hit(uint8_t bus, uint8_t slot, uint8_t func, uint64_t addr, uint64_t txaddr) "hit iotlb devid %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64 +amdvi_translation_result(uint8_t bus, uint8_t slot, uint8_t func, uint64_t addr, uint64_t txaddr) "devid: %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64 +amdvi_mem_ir_write_req(uint64_t addr, uint64_t val, uint32_t size) "addr 0x%"PRIx64" data 0x%"PRIx64" size 0x%"PRIx32 +amdvi_mem_ir_write(uint64_t addr, uint64_t val) "addr 0x%"PRIx64" data 0x%"PRIx64 +amdvi_ir_remap_msi_req(uint64_t addr, uint64_t data, uint8_t devid) "addr 0x%"PRIx64" data 0x%"PRIx64" devid 0x%"PRIx8 +amdvi_ir_remap_msi(uint64_t addr, uint64_t data, uint64_t addr2, uint64_t data2) "(addr 0x%"PRIx64", data 0x%"PRIx64") -> (addr 0x%"PRIx64", data 0x%"PRIx64")" +amdvi_err(const char *str) "%s" +amdvi_ir_irte(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" offset 0x%"PRIx64 +amdvi_ir_irte_val(uint32_t data) "data 0x%"PRIx32 +amdvi_ir_err(const char *str) "%s" +amdvi_ir_intctl(uint8_t val) "int_ctl 0x%"PRIx8 +amdvi_ir_target_abort(const char *str) "%s" +amdvi_ir_delivery_mode(const char *str) "%s" +amdvi_ir_irte_ga_val(uint64_t hi, uint64_t lo) "hi 0x%"PRIx64" lo 0x%"PRIx64 + +# vmport.c +vmport_register(unsigned char command, void *func, void *opaque) "command: 0x%02x func: %p opaque: %p" +vmport_command(unsigned char command) "command: 0x%02x" + +# x86.c +x86_gsi_interrupt(int irqn, int level) "GSI interrupt #%d level:%d" +x86_pic_interrupt(int irqn, int level) "PIC interrupt #%d level:%d" + +# port92.c +port92_read(uint8_t val) "port92: read 0x%02x" +port92_write(uint8_t val) "port92: write 0x%02x" diff --git a/hw/i386/trace.h b/hw/i386/trace.h new file mode 100644 index 000000000..37a9f67e5 --- /dev/null +++ 
b/hw/i386/trace.h @@ -0,0 +1 @@ +#include "trace/trace-hw_i386.h" diff --git a/hw/i386/vmmouse.c b/hw/i386/vmmouse.c new file mode 100644 index 000000000..3d6636828 --- /dev/null +++ b/hw/i386/vmmouse.c @@ -0,0 +1,327 @@ +/* + * QEMU VMMouse emulation + * + * Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "ui/console.h" +#include "hw/i386/vmport.h" +#include "hw/input/i8042.h" +#include "hw/qdev-properties.h" +#include "migration/vmstate.h" +#include "cpu.h" +#include "qom/object.h" + +/* debug only vmmouse */ +//#define DEBUG_VMMOUSE + +#define VMMOUSE_READ_ID 0x45414552 +#define VMMOUSE_DISABLE 0x000000f5 +#define VMMOUSE_REQUEST_RELATIVE 0x4c455252 +#define VMMOUSE_REQUEST_ABSOLUTE 0x53424152 + +#define VMMOUSE_QUEUE_SIZE 1024 + +#define VMMOUSE_VERSION 0x3442554a + +#ifdef DEBUG_VMMOUSE +#define DPRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__) +#else +#define DPRINTF(fmt, ...) 
do { } while (0) +#endif + +#define TYPE_VMMOUSE "vmmouse" +OBJECT_DECLARE_SIMPLE_TYPE(VMMouseState, VMMOUSE) + +struct VMMouseState { + ISADevice parent_obj; + + uint32_t queue[VMMOUSE_QUEUE_SIZE]; + int32_t queue_size; + uint16_t nb_queue; + uint16_t status; + uint8_t absolute; + QEMUPutMouseEntry *entry; + ISAKBDState *i8042; +}; + +static void vmmouse_get_data(uint32_t *data) +{ + X86CPU *cpu = X86_CPU(current_cpu); + CPUX86State *env = &cpu->env; + + data[0] = env->regs[R_EAX]; data[1] = env->regs[R_EBX]; + data[2] = env->regs[R_ECX]; data[3] = env->regs[R_EDX]; + data[4] = env->regs[R_ESI]; data[5] = env->regs[R_EDI]; +} + +static void vmmouse_set_data(const uint32_t *data) +{ + X86CPU *cpu = X86_CPU(current_cpu); + CPUX86State *env = &cpu->env; + + env->regs[R_EAX] = data[0]; env->regs[R_EBX] = data[1]; + env->regs[R_ECX] = data[2]; env->regs[R_EDX] = data[3]; + env->regs[R_ESI] = data[4]; env->regs[R_EDI] = data[5]; +} + +static uint32_t vmmouse_get_status(VMMouseState *s) +{ + DPRINTF("vmmouse_get_status()\n"); + return (s->status << 16) | s->nb_queue; +} + +static void vmmouse_mouse_event(void *opaque, int x, int y, int dz, int buttons_state) +{ + VMMouseState *s = opaque; + int buttons = 0; + + if (s->nb_queue > (VMMOUSE_QUEUE_SIZE - 4)) + return; + + DPRINTF("vmmouse_mouse_event(%d, %d, %d, %d)\n", + x, y, dz, buttons_state); + + if ((buttons_state & MOUSE_EVENT_LBUTTON)) + buttons |= 0x20; + if ((buttons_state & MOUSE_EVENT_RBUTTON)) + buttons |= 0x10; + if ((buttons_state & MOUSE_EVENT_MBUTTON)) + buttons |= 0x08; + + if (s->absolute) { + x <<= 1; + y <<= 1; + } + + s->queue[s->nb_queue++] = buttons; + s->queue[s->nb_queue++] = x; + s->queue[s->nb_queue++] = y; + s->queue[s->nb_queue++] = dz; + + /* need to still generate PS2 events to notify driver to + read from queue */ + i8042_isa_mouse_fake_event(s->i8042); +} + +static void vmmouse_remove_handler(VMMouseState *s) +{ + if (s->entry) { + qemu_remove_mouse_event_handler(s->entry); + s->entry = NULL; + } +} + +static void vmmouse_update_handler(VMMouseState *s, int absolute) +{ + if (s->status != 0) { + return; + } + if (s->absolute != absolute) { + s->absolute = absolute; + vmmouse_remove_handler(s); + } + if (s->entry == NULL) { + s->entry = qemu_add_mouse_event_handler(vmmouse_mouse_event, + s, s->absolute, + "vmmouse"); + qemu_activate_mouse_event_handler(s->entry); + } +} + +static void vmmouse_read_id(VMMouseState *s) +{ + DPRINTF("vmmouse_read_id()\n"); + + if (s->nb_queue == VMMOUSE_QUEUE_SIZE) + return; + + s->queue[s->nb_queue++] = VMMOUSE_VERSION; + s->status = 0; + vmmouse_update_handler(s, s->absolute); +} + +static void vmmouse_request_relative(VMMouseState *s) +{ + DPRINTF("vmmouse_request_relative()\n"); + vmmouse_update_handler(s, 0); +} + +static void vmmouse_request_absolute(VMMouseState *s) +{ + DPRINTF("vmmouse_request_absolute()\n"); + vmmouse_update_handler(s, 1); +} + +static void vmmouse_disable(VMMouseState *s) +{ + DPRINTF("vmmouse_disable()\n"); + s->status = 0xffff; + vmmouse_remove_handler(s); +} + +static void vmmouse_data(VMMouseState *s, uint32_t *data, uint32_t size) +{ + int i; + + DPRINTF("vmmouse_data(%d)\n", size); + + if (size == 0 || size > 6 || size > s->nb_queue) { + printf("vmmouse: driver requested too much data %d\n", size); + s->status = 0xffff; + vmmouse_remove_handler(s); + return; + } + + for (i = 0; i < size; i++) + data[i] = s->queue[i]; + + s->nb_queue -= size; + if (s->nb_queue) + memmove(s->queue, &s->queue[size], sizeof(s->queue[0]) * s->nb_queue); +} + +static uint32_t 
vmmouse_ioport_read(void *opaque, uint32_t addr) +{ + VMMouseState *s = opaque; + uint32_t data[6]; + uint16_t command; + + vmmouse_get_data(data); + + command = data[2] & 0xFFFF; + + switch (command) { + case VMPORT_CMD_VMMOUSE_STATUS: + data[0] = vmmouse_get_status(s); + break; + case VMPORT_CMD_VMMOUSE_COMMAND: + switch (data[1]) { + case VMMOUSE_DISABLE: + vmmouse_disable(s); + break; + case VMMOUSE_READ_ID: + vmmouse_read_id(s); + break; + case VMMOUSE_REQUEST_RELATIVE: + vmmouse_request_relative(s); + break; + case VMMOUSE_REQUEST_ABSOLUTE: + vmmouse_request_absolute(s); + break; + default: + printf("vmmouse: unknown command %x\n", data[1]); + break; + } + break; + case VMPORT_CMD_VMMOUSE_DATA: + vmmouse_data(s, data, data[1]); + break; + default: + printf("vmmouse: unknown command %x\n", command); + break; + } + + vmmouse_set_data(data); + return data[0]; +} + +static int vmmouse_post_load(void *opaque, int version_id) +{ + VMMouseState *s = opaque; + + vmmouse_remove_handler(s); + vmmouse_update_handler(s, s->absolute); + return 0; +} + +static const VMStateDescription vmstate_vmmouse = { + .name = "vmmouse", + .version_id = 0, + .minimum_version_id = 0, + .post_load = vmmouse_post_load, + .fields = (VMStateField[]) { + VMSTATE_INT32_EQUAL(queue_size, VMMouseState, NULL), + VMSTATE_UINT32_ARRAY(queue, VMMouseState, VMMOUSE_QUEUE_SIZE), + VMSTATE_UINT16(nb_queue, VMMouseState), + VMSTATE_UINT16(status, VMMouseState), + VMSTATE_UINT8(absolute, VMMouseState), + VMSTATE_END_OF_LIST() + } +}; + +static void vmmouse_reset(DeviceState *d) +{ + VMMouseState *s = VMMOUSE(d); + + s->queue_size = VMMOUSE_QUEUE_SIZE; + s->nb_queue = 0; + + vmmouse_disable(s); +} + +static void vmmouse_realizefn(DeviceState *dev, Error **errp) +{ + VMMouseState *s = VMMOUSE(dev); + + DPRINTF("vmmouse_init\n"); + + if (!object_resolve_path_type("", TYPE_VMPORT, NULL)) { + error_setg(errp, "vmmouse needs a machine with vmport"); + return; + } + + vmport_register(VMPORT_CMD_VMMOUSE_STATUS, vmmouse_ioport_read, s); + vmport_register(VMPORT_CMD_VMMOUSE_COMMAND, vmmouse_ioport_read, s); + vmport_register(VMPORT_CMD_VMMOUSE_DATA, vmmouse_ioport_read, s); +} + +static Property vmmouse_properties[] = { + DEFINE_PROP_LINK("i8042", VMMouseState, i8042, TYPE_I8042, ISAKBDState *), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vmmouse_class_initfn(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = vmmouse_realizefn; + dc->reset = vmmouse_reset; + dc->vmsd = &vmstate_vmmouse; + device_class_set_props(dc, vmmouse_properties); + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); +} + +static const TypeInfo vmmouse_info = { + .name = TYPE_VMMOUSE, + .parent = TYPE_ISA_DEVICE, + .instance_size = sizeof(VMMouseState), + .class_init = vmmouse_class_initfn, +}; + +static void vmmouse_register_types(void) +{ + type_register_static(&vmmouse_info); +} + +type_init(vmmouse_register_types) diff --git a/hw/i386/vmport.c b/hw/i386/vmport.c new file mode 100644 index 000000000..7cc75dbc6 --- /dev/null +++ b/hw/i386/vmport.c @@ -0,0 +1,313 @@ +/* + * QEMU VMPort emulation + * + * Copyright (C) 2007 Hervé Poussineau + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the 
Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/* + * Guest code that interacts with this virtual device can be found + * in VMware open-vm-tools open-source project: + * https://github.com/vmware/open-vm-tools + */ + +#include "qemu/osdep.h" +#include "hw/isa/isa.h" +#include "hw/i386/vmport.h" +#include "hw/qdev-properties.h" +#include "hw/boards.h" +#include "sysemu/sysemu.h" +#include "sysemu/hw_accel.h" +#include "sysemu/qtest.h" +#include "qemu/log.h" +#include "trace.h" +#include "qom/object.h" + +#define VMPORT_MAGIC 0x564D5868 + +/* Compatibility flags for migration */ +#define VMPORT_COMPAT_READ_SET_EAX_BIT 0 +#define VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD_BIT 1 +#define VMPORT_COMPAT_REPORT_VMX_TYPE_BIT 2 +#define VMPORT_COMPAT_CMDS_V2_BIT 3 +#define VMPORT_COMPAT_READ_SET_EAX \ + (1 << VMPORT_COMPAT_READ_SET_EAX_BIT) +#define VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD \ + (1 << VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD_BIT) +#define VMPORT_COMPAT_REPORT_VMX_TYPE \ + (1 << VMPORT_COMPAT_REPORT_VMX_TYPE_BIT) +#define VMPORT_COMPAT_CMDS_V2 \ + (1 << VMPORT_COMPAT_CMDS_V2_BIT) + +/* vCPU features reported by CMD_GET_VCPU_INFO */ +#define VCPU_INFO_SLC64_BIT 0 +#define VCPU_INFO_SYNC_VTSCS_BIT 1 +#define VCPU_INFO_HV_REPLAY_OK_BIT 2 +#define VCPU_INFO_LEGACY_X2APIC_BIT 3 +#define VCPU_INFO_RESERVED_BIT 31 + +OBJECT_DECLARE_SIMPLE_TYPE(VMPortState, VMPORT) + +struct VMPortState { + ISADevice parent_obj; + + MemoryRegion io; + VMPortReadFunc *func[VMPORT_ENTRIES]; + void *opaque[VMPORT_ENTRIES]; + + uint32_t vmware_vmx_version; + uint8_t vmware_vmx_type; + + uint32_t compat_flags; +}; + +static VMPortState *port_state; + +void vmport_register(VMPortCommand command, VMPortReadFunc *func, void *opaque) +{ + assert(command < VMPORT_ENTRIES); + assert(port_state); + + trace_vmport_register(command, func, opaque); + port_state->func[command] = func; + port_state->opaque[command] = opaque; +} + +static uint64_t vmport_ioport_read(void *opaque, hwaddr addr, + unsigned size) +{ + VMPortState *s = opaque; + CPUState *cs = current_cpu; + X86CPU *cpu = X86_CPU(cs); + CPUX86State *env; + unsigned char command; + uint32_t eax; + + if (qtest_enabled()) { + return -1; + } + env = &cpu->env; + cpu_synchronize_state(cs); + + eax = env->regs[R_EAX]; + if (eax != VMPORT_MAGIC) { + goto err; + } + + command = env->regs[R_ECX]; + trace_vmport_command(command); + if (command >= VMPORT_ENTRIES || !s->func[command]) { + qemu_log_mask(LOG_UNIMP, "vmport: unknown command %x\n", command); + goto err; + } + + eax = s->func[command](s->opaque[command], addr); + goto out; + +err: + if (s->compat_flags & VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD) { + eax = UINT32_MAX; + } + +out: + /* + * The call above to cpu_synchronize_state() gets vCPU registers values + * to QEMU but also cause QEMU to write QEMU vCPU registers values to + * vCPU implementation (e.g. 
Accelerator such as KVM) just before + * resuming guest. + * + * Therefore, in order to make IOPort return value propagate to + * guest EAX, we need to explicitly update QEMU EAX register value. + */ + if (s->compat_flags & VMPORT_COMPAT_READ_SET_EAX) { + cpu->env.regs[R_EAX] = eax; + } + + return eax; +} + +static void vmport_ioport_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + X86CPU *cpu = X86_CPU(current_cpu); + + if (qtest_enabled()) { + return; + } + cpu->env.regs[R_EAX] = vmport_ioport_read(opaque, addr, 4); +} + +static uint32_t vmport_cmd_get_version(void *opaque, uint32_t addr) +{ + X86CPU *cpu = X86_CPU(current_cpu); + + if (qtest_enabled()) { + return -1; + } + cpu->env.regs[R_EBX] = VMPORT_MAGIC; + if (port_state->compat_flags & VMPORT_COMPAT_REPORT_VMX_TYPE) { + cpu->env.regs[R_ECX] = port_state->vmware_vmx_type; + } + return port_state->vmware_vmx_version; +} + +static uint32_t vmport_cmd_get_bios_uuid(void *opaque, uint32_t addr) +{ + X86CPU *cpu = X86_CPU(current_cpu); + uint32_t *uuid_parts = (uint32_t *)(qemu_uuid.data); + + cpu->env.regs[R_EAX] = le32_to_cpu(uuid_parts[0]); + cpu->env.regs[R_EBX] = le32_to_cpu(uuid_parts[1]); + cpu->env.regs[R_ECX] = le32_to_cpu(uuid_parts[2]); + cpu->env.regs[R_EDX] = le32_to_cpu(uuid_parts[3]); + return cpu->env.regs[R_EAX]; +} + +static uint32_t vmport_cmd_ram_size(void *opaque, uint32_t addr) +{ + X86CPU *cpu = X86_CPU(current_cpu); + + if (qtest_enabled()) { + return -1; + } + cpu->env.regs[R_EBX] = 0x1177; + return current_machine->ram_size; +} + +static uint32_t vmport_cmd_get_hz(void *opaque, uint32_t addr) +{ + X86CPU *cpu = X86_CPU(current_cpu); + + if (cpu->env.tsc_khz && cpu->env.apic_bus_freq) { + uint64_t tsc_freq = (uint64_t)cpu->env.tsc_khz * 1000; + + cpu->env.regs[R_ECX] = cpu->env.apic_bus_freq; + cpu->env.regs[R_EBX] = (uint32_t)(tsc_freq >> 32); + cpu->env.regs[R_EAX] = (uint32_t)tsc_freq; + } else { + /* Signal cmd as not supported */ + cpu->env.regs[R_EBX] = UINT32_MAX; + } + + return cpu->env.regs[R_EAX]; +} + +static uint32_t vmport_cmd_get_vcpu_info(void *opaque, uint32_t addr) +{ + X86CPU *cpu = X86_CPU(current_cpu); + uint32_t ret = 0; + + if (cpu->env.features[FEAT_1_ECX] & CPUID_EXT_X2APIC) { + ret |= 1 << VCPU_INFO_LEGACY_X2APIC_BIT; + } + + return ret; +} + +static const MemoryRegionOps vmport_ops = { + .read = vmport_ioport_read, + .write = vmport_ioport_write, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vmport_realizefn(DeviceState *dev, Error **errp) +{ + ISADevice *isadev = ISA_DEVICE(dev); + VMPortState *s = VMPORT(dev); + + memory_region_init_io(&s->io, OBJECT(s), &vmport_ops, s, "vmport", 1); + isa_register_ioport(isadev, &s->io, 0x5658); + + port_state = s; + + /* Register some generic port commands */ + vmport_register(VMPORT_CMD_GETVERSION, vmport_cmd_get_version, NULL); + vmport_register(VMPORT_CMD_GETRAMSIZE, vmport_cmd_ram_size, NULL); + if (s->compat_flags & VMPORT_COMPAT_CMDS_V2) { + vmport_register(VMPORT_CMD_GETBIOSUUID, vmport_cmd_get_bios_uuid, NULL); + vmport_register(VMPORT_CMD_GETHZ, vmport_cmd_get_hz, NULL); + vmport_register(VMPORT_CMD_GET_VCPU_INFO, vmport_cmd_get_vcpu_info, + NULL); + } +} + +static Property vmport_properties[] = { + /* Used to enforce compatibility for migration */ + DEFINE_PROP_BIT("x-read-set-eax", VMPortState, compat_flags, + VMPORT_COMPAT_READ_SET_EAX_BIT, true), + DEFINE_PROP_BIT("x-signal-unsupported-cmd", VMPortState, compat_flags, + 
VMPORT_COMPAT_SIGNAL_UNSUPPORTED_CMD_BIT, true), + DEFINE_PROP_BIT("x-report-vmx-type", VMPortState, compat_flags, + VMPORT_COMPAT_REPORT_VMX_TYPE_BIT, true), + DEFINE_PROP_BIT("x-cmds-v2", VMPortState, compat_flags, + VMPORT_COMPAT_CMDS_V2_BIT, true), + + /* Default value taken from open-vm-tools code VERSION_MAGIC definition */ + DEFINE_PROP_UINT32("vmware-vmx-version", VMPortState, + vmware_vmx_version, 6), + /* + * Value determines which VMware product type host report itself to guest. + * + * Most guests are fine with exposing host as VMware ESX server. + * Some legacy/proprietary guests hard-code a given type. + * + * For a complete list of values, refer to enum VMXType at open-vm-tools + * project (Defined at lib/include/vm_vmx_type.h). + * + * Reasonable options: + * 0 - Unset + * 1 - VMware Express (deprecated) + * 2 - VMware ESX Server + * 3 - VMware Server (Deprecated) + * 4 - VMware Workstation + * 5 - ACE 1.x (Deprecated) + */ + DEFINE_PROP_UINT8("vmware-vmx-type", VMPortState, vmware_vmx_type, 2), + + DEFINE_PROP_END_OF_LIST(), +}; + +static void vmport_class_initfn(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = vmport_realizefn; + /* Reason: realize sets global port_state */ + dc->user_creatable = false; + device_class_set_props(dc, vmport_properties); +} + +static const TypeInfo vmport_info = { + .name = TYPE_VMPORT, + .parent = TYPE_ISA_DEVICE, + .instance_size = sizeof(VMPortState), + .class_init = vmport_class_initfn, +}; + +static void vmport_register_types(void) +{ + type_register_static(&vmport_info); +} + +type_init(vmport_register_types) diff --git a/hw/i386/x86-iommu-stub.c b/hw/i386/x86-iommu-stub.c new file mode 100644 index 000000000..781b5ff92 --- /dev/null +++ b/hw/i386/x86-iommu-stub.c @@ -0,0 +1,38 @@ +/* + * Stubs for X86 IOMMU emulation + * + * Copyright (C) 2019 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "hw/i386/x86-iommu.h" + +void x86_iommu_iec_register_notifier(X86IOMMUState *iommu, + iec_notify_fn fn, void *data) +{ +} + +X86IOMMUState *x86_iommu_get_default(void) +{ + return NULL; +} + +bool x86_iommu_ir_supported(X86IOMMUState *s) +{ + return false; +} diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c new file mode 100644 index 000000000..01d11325a --- /dev/null +++ b/hw/i386/x86-iommu.c @@ -0,0 +1,162 @@ +/* + * QEMU emulation of common X86 IOMMU + * + * Copyright (C) 2016 Peter Xu, Red Hat <peterx@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "hw/sysbus.h" +#include "hw/i386/x86-iommu.h" +#include "hw/qdev-properties.h" +#include "hw/i386/pc.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "sysemu/kvm.h" + +void x86_iommu_iec_register_notifier(X86IOMMUState *iommu, + iec_notify_fn fn, void *data) +{ + IEC_Notifier *notifier = g_new0(IEC_Notifier, 1); + + notifier->iec_notify = fn; + notifier->private = data; + + QLIST_INSERT_HEAD(&iommu->iec_notifiers, notifier, list); +} + +void x86_iommu_iec_notify_all(X86IOMMUState *iommu, bool global, + uint32_t index, uint32_t mask) +{ + IEC_Notifier *notifier; + + trace_x86_iommu_iec_notify(global, index, mask); + + QLIST_FOREACH(notifier, &iommu->iec_notifiers, list) { + if (notifier->iec_notify) { + notifier->iec_notify(notifier->private, global, + index, mask); + } + } +} + +/* Generate one MSI message from VTDIrq info */ +void x86_iommu_irq_to_msi_message(X86IOMMUIrq *irq, MSIMessage *msg_out) +{ + X86IOMMU_MSIMessage msg = {}; + + /* Generate address bits */ + msg.dest_mode = irq->dest_mode; + msg.redir_hint = irq->redir_hint; + msg.dest = irq->dest; + msg.__addr_hi = irq->dest & 0xffffff00; + msg.__addr_head = cpu_to_le32(0xfee); + /* Keep this from original MSI address bits */ + msg.__not_used = irq->msi_addr_last_bits; + + /* Generate data bits */ + msg.vector = irq->vector; + msg.delivery_mode = irq->delivery_mode; + msg.level = 1; + msg.trigger_mode = irq->trigger_mode; + + msg_out->address = msg.msi_addr; + msg_out->data = msg.msi_data; +} + +X86IOMMUState *x86_iommu_get_default(void) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + PCMachineState *pcms = + PC_MACHINE(object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE)); + + if (pcms && + object_dynamic_cast(OBJECT(pcms->iommu), TYPE_X86_IOMMU_DEVICE)) { + return X86_IOMMU_DEVICE(pcms->iommu); + } + return NULL; +} + +static void x86_iommu_realize(DeviceState *dev, Error **errp) +{ + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev); + X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_GET_CLASS(dev); + MachineState *ms = MACHINE(qdev_get_machine()); + MachineClass *mc = MACHINE_GET_CLASS(ms); + PCMachineState *pcms = + PC_MACHINE(object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE)); + QLIST_INIT(&x86_iommu->iec_notifiers); + bool irq_all_kernel = kvm_irqchip_in_kernel() && !kvm_irqchip_is_split(); + + if (!pcms || !pcms->bus) { + error_setg(errp, "Machine-type '%s' not supported by IOMMU", + mc->name); + return; + } + + /* If the user didn't specify IR, choose a default value for it */ + if (x86_iommu->intr_supported == ON_OFF_AUTO_AUTO) { + x86_iommu->intr_supported = irq_all_kernel ? 
+ ON_OFF_AUTO_OFF : ON_OFF_AUTO_ON; + } + + /* Both Intel and AMD IOMMU IR only support "kernel-irqchip={off|split}" */ + if (x86_iommu_ir_supported(x86_iommu) && irq_all_kernel) { + error_setg(errp, "Interrupt Remapping cannot work with " + "kernel-irqchip=on, please use 'split|off'."); + return; + } + + if (x86_class->realize) { + x86_class->realize(dev, errp); + } +} + +static Property x86_iommu_properties[] = { + DEFINE_PROP_ON_OFF_AUTO("intremap", X86IOMMUState, + intr_supported, ON_OFF_AUTO_AUTO), + DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false), + DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true), + DEFINE_PROP_END_OF_LIST(), +}; + +static void x86_iommu_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + dc->realize = x86_iommu_realize; + device_class_set_props(dc, x86_iommu_properties); +} + +bool x86_iommu_ir_supported(X86IOMMUState *s) +{ + return s->intr_supported == ON_OFF_AUTO_ON; +} + +static const TypeInfo x86_iommu_info = { + .name = TYPE_X86_IOMMU_DEVICE, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(X86IOMMUState), + .class_init = x86_iommu_class_init, + .class_size = sizeof(X86IOMMUClass), + .abstract = true, +}; + +static void x86_iommu_register_types(void) +{ + type_register_static(&x86_iommu_info); +} + +type_init(x86_iommu_register_types) diff --git a/hw/i386/x86.c b/hw/i386/x86.c new file mode 100644 index 000000000..b84840a1b --- /dev/null +++ b/hw/i386/x86.c @@ -0,0 +1,1399 @@ +/* + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2019 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qemu/option.h" +#include "qemu/cutils.h" +#include "qemu/units.h" +#include "qemu-common.h" +#include "qemu/datadir.h" +#include "qapi/error.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qapi-visit-common.h" +#include "qapi/clone-visitor.h" +#include "qapi/qapi-visit-machine.h" +#include "qapi/visitor.h" +#include "sysemu/qtest.h" +#include "sysemu/whpx.h" +#include "sysemu/numa.h" +#include "sysemu/replay.h" +#include "sysemu/sysemu.h" +#include "sysemu/cpu-timers.h" +#include "trace.h" + +#include "hw/i386/x86.h" +#include "target/i386/cpu.h" +#include "hw/i386/topology.h" +#include "hw/i386/fw_cfg.h" +#include "hw/intc/i8259.h" +#include "hw/rtc/mc146818rtc.h" +#include "target/i386/sev.h" + +#include "hw/acpi/cpu_hotplug.h" +#include "hw/irq.h" +#include "hw/nmi.h" +#include "hw/loader.h" +#include "multiboot.h" +#include "elf.h" +#include "standard-headers/asm-x86/bootparam.h" +#include CONFIG_DEVICES +#include "kvm/kvm_i386.h" + +/* Physical Address of PVH entry point read from kernel ELF NOTE */ +static size_t pvh_start_addr; + +inline void init_topo_info(X86CPUTopoInfo *topo_info, + const X86MachineState *x86ms) +{ + MachineState *ms = MACHINE(x86ms); + + topo_info->dies_per_pkg = ms->smp.dies; + topo_info->cores_per_die = ms->smp.cores; + topo_info->threads_per_core = ms->smp.threads; +} + +/* + * Calculates initial APIC ID for a specific CPU index + * + * Currently we need to be able to calculate the APIC ID from the CPU index + * alone (without requiring a CPU object), as the QEMU<->Seabios interfaces have + * no concept of "CPU index", and the NUMA tables on fw_cfg need the APIC ID of + * all CPUs up to max_cpus. + */ +uint32_t x86_cpu_apic_id_from_index(X86MachineState *x86ms, + unsigned int cpu_index) +{ + X86MachineClass *x86mc = X86_MACHINE_GET_CLASS(x86ms); + X86CPUTopoInfo topo_info; + uint32_t correct_id; + static bool warned; + + init_topo_info(&topo_info, x86ms); + + correct_id = x86_apicid_from_cpu_idx(&topo_info, cpu_index); + if (x86mc->compat_apic_id_mode) { + if (cpu_index != correct_id && !warned && !qtest_enabled()) { + error_report("APIC IDs set in compatibility mode, " + "CPU topology won't match the configuration"); + warned = true; + } + return cpu_index; + } else { + return correct_id; + } +} + + +void x86_cpu_new(X86MachineState *x86ms, int64_t apic_id, Error **errp) +{ + Object *cpu = object_new(MACHINE(x86ms)->cpu_type); + + if (!object_property_set_uint(cpu, "apic-id", apic_id, errp)) { + goto out; + } + qdev_realize(DEVICE(cpu), NULL, errp); + +out: + object_unref(cpu); +} + +void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version) +{ + int i; + const CPUArchIdList *possible_cpus; + MachineState *ms = MACHINE(x86ms); + MachineClass *mc = MACHINE_GET_CLASS(x86ms); + + x86_cpu_set_default_version(default_cpu_version); + + /* + * Calculates the limit to CPU APIC ID values + * + * Limit for the APIC ID value, so that all + * CPU APIC IDs are < x86ms->apic_id_limit. + * + * This is used for FW_CFG_MAX_CPUS. See comments on fw_cfg_arch_create(). 
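+     *
+     * For instance, with "-smp 6,sockets=2,cores=3,threads=1" the core-id
+     * field of the APIC ID is rounded up to 2 bits, so the last CPU gets
+     * APIC ID 6 and apic_id_limit becomes 7 even though max_cpus is only 6.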
+ */ + x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms, + ms->smp.max_cpus - 1) + 1; + possible_cpus = mc->possible_cpu_arch_ids(ms); + for (i = 0; i < ms->smp.cpus; i++) { + x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal); + } +} + +void x86_rtc_set_cpus_count(ISADevice *rtc, uint16_t cpus_count) +{ + if (cpus_count > 0xff) { + /* + * If the number of CPUs can't be represented in 8 bits, the + * BIOS must use "FW_CFG_NB_CPUS". Set RTC field to 0 just + * to make old BIOSes fail more predictably. + */ + rtc_set_memory(rtc, 0x5f, 0); + } else { + rtc_set_memory(rtc, 0x5f, cpus_count - 1); + } +} + +static int x86_apic_cmp(const void *a, const void *b) +{ + CPUArchId *apic_a = (CPUArchId *)a; + CPUArchId *apic_b = (CPUArchId *)b; + + return apic_a->arch_id - apic_b->arch_id; +} + +/* + * returns pointer to CPUArchId descriptor that matches CPU's apic_id + * in ms->possible_cpus->cpus, if ms->possible_cpus->cpus has no + * entry corresponding to CPU's apic_id returns NULL. + */ +CPUArchId *x86_find_cpu_slot(MachineState *ms, uint32_t id, int *idx) +{ + CPUArchId apic_id, *found_cpu; + + apic_id.arch_id = id; + found_cpu = bsearch(&apic_id, ms->possible_cpus->cpus, + ms->possible_cpus->len, sizeof(*ms->possible_cpus->cpus), + x86_apic_cmp); + if (found_cpu && idx) { + *idx = found_cpu - ms->possible_cpus->cpus; + } + return found_cpu; +} + +void x86_cpu_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + CPUArchId *found_cpu; + Error *local_err = NULL; + X86CPU *cpu = X86_CPU(dev); + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + + if (x86ms->acpi_dev) { + hotplug_handler_plug(x86ms->acpi_dev, dev, &local_err); + if (local_err) { + goto out; + } + } + + /* increment the number of CPUs */ + x86ms->boot_cpus++; + if (x86ms->rtc) { + x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus); + } + if (x86ms->fw_cfg) { + fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus); + } + + found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL); + found_cpu->cpu = OBJECT(dev); +out: + error_propagate(errp, local_err); +} + +void x86_cpu_unplug_request_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + int idx = -1; + X86CPU *cpu = X86_CPU(dev); + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + + if (!x86ms->acpi_dev) { + error_setg(errp, "CPU hot unplug not supported without ACPI"); + return; + } + + x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx); + assert(idx != -1); + if (idx == 0) { + error_setg(errp, "Boot CPU is unpluggable"); + return; + } + + hotplug_handler_unplug_request(x86ms->acpi_dev, dev, + errp); +} + +void x86_cpu_unplug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + CPUArchId *found_cpu; + Error *local_err = NULL; + X86CPU *cpu = X86_CPU(dev); + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + + hotplug_handler_unplug(x86ms->acpi_dev, dev, &local_err); + if (local_err) { + goto out; + } + + found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL); + found_cpu->cpu = NULL; + qdev_unrealize(dev); + + /* decrement the number of CPUs */ + x86ms->boot_cpus--; + /* Update the number of CPUs in CMOS */ + x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus); + fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus); + out: + error_propagate(errp, local_err); +} + +void x86_cpu_pre_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + int idx; + CPUState *cs; + CPUArchId *cpu_slot; + X86CPUTopoIDs topo_ids; + X86CPU *cpu = 
X86_CPU(dev); + CPUX86State *env = &cpu->env; + MachineState *ms = MACHINE(hotplug_dev); + X86MachineState *x86ms = X86_MACHINE(hotplug_dev); + unsigned int smp_cores = ms->smp.cores; + unsigned int smp_threads = ms->smp.threads; + X86CPUTopoInfo topo_info; + + if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { + error_setg(errp, "Invalid CPU type, expected cpu type: '%s'", + ms->cpu_type); + return; + } + + if (x86ms->acpi_dev) { + Error *local_err = NULL; + + hotplug_handler_pre_plug(HOTPLUG_HANDLER(x86ms->acpi_dev), dev, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } + + init_topo_info(&topo_info, x86ms); + + env->nr_dies = ms->smp.dies; + + /* + * If APIC ID is not set, + * set it based on socket/die/core/thread properties. + */ + if (cpu->apic_id == UNASSIGNED_APIC_ID) { + int max_socket = (ms->smp.max_cpus - 1) / + smp_threads / smp_cores / ms->smp.dies; + + /* + * die-id was optional in QEMU 4.0 and older, so keep it optional + * if there's only one die per socket. + */ + if (cpu->die_id < 0 && ms->smp.dies == 1) { + cpu->die_id = 0; + } + + if (cpu->socket_id < 0) { + error_setg(errp, "CPU socket-id is not set"); + return; + } else if (cpu->socket_id > max_socket) { + error_setg(errp, "Invalid CPU socket-id: %u must be in range 0:%u", + cpu->socket_id, max_socket); + return; + } + if (cpu->die_id < 0) { + error_setg(errp, "CPU die-id is not set"); + return; + } else if (cpu->die_id > ms->smp.dies - 1) { + error_setg(errp, "Invalid CPU die-id: %u must be in range 0:%u", + cpu->die_id, ms->smp.dies - 1); + return; + } + if (cpu->core_id < 0) { + error_setg(errp, "CPU core-id is not set"); + return; + } else if (cpu->core_id > (smp_cores - 1)) { + error_setg(errp, "Invalid CPU core-id: %u must be in range 0:%u", + cpu->core_id, smp_cores - 1); + return; + } + if (cpu->thread_id < 0) { + error_setg(errp, "CPU thread-id is not set"); + return; + } else if (cpu->thread_id > (smp_threads - 1)) { + error_setg(errp, "Invalid CPU thread-id: %u must be in range 0:%u", + cpu->thread_id, smp_threads - 1); + return; + } + + topo_ids.pkg_id = cpu->socket_id; + topo_ids.die_id = cpu->die_id; + topo_ids.core_id = cpu->core_id; + topo_ids.smt_id = cpu->thread_id; + cpu->apic_id = x86_apicid_from_topo_ids(&topo_info, &topo_ids); + } + + cpu_slot = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx); + if (!cpu_slot) { + MachineState *ms = MACHINE(x86ms); + + x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids); + error_setg(errp, + "Invalid CPU [socket: %u, die: %u, core: %u, thread: %u] with" + " APIC ID %" PRIu32 ", valid index range 0:%d", + topo_ids.pkg_id, topo_ids.die_id, topo_ids.core_id, topo_ids.smt_id, + cpu->apic_id, ms->possible_cpus->len - 1); + return; + } + + if (cpu_slot->cpu) { + error_setg(errp, "CPU[%d] with APIC ID %" PRIu32 " exists", + idx, cpu->apic_id); + return; + } + + /* if 'address' properties socket-id/core-id/thread-id are not set, set them + * so that machine_query_hotpluggable_cpus would show correct values + */ + /* TODO: move socket_id/core_id/thread_id checks into x86_cpu_realizefn() + * once -smp refactoring is complete and there will be CPU private + * CPUState::nr_cores and CPUState::nr_threads fields instead of globals */ + x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids); + if (cpu->socket_id != -1 && cpu->socket_id != topo_ids.pkg_id) { + error_setg(errp, "property socket-id: %u doesn't match set apic-id:" + " 0x%x (socket-id: %u)", cpu->socket_id, cpu->apic_id, + topo_ids.pkg_id); + return; 
+ } + cpu->socket_id = topo_ids.pkg_id; + + if (cpu->die_id != -1 && cpu->die_id != topo_ids.die_id) { + error_setg(errp, "property die-id: %u doesn't match set apic-id:" + " 0x%x (die-id: %u)", cpu->die_id, cpu->apic_id, topo_ids.die_id); + return; + } + cpu->die_id = topo_ids.die_id; + + if (cpu->core_id != -1 && cpu->core_id != topo_ids.core_id) { + error_setg(errp, "property core-id: %u doesn't match set apic-id:" + " 0x%x (core-id: %u)", cpu->core_id, cpu->apic_id, + topo_ids.core_id); + return; + } + cpu->core_id = topo_ids.core_id; + + if (cpu->thread_id != -1 && cpu->thread_id != topo_ids.smt_id) { + error_setg(errp, "property thread-id: %u doesn't match set apic-id:" + " 0x%x (thread-id: %u)", cpu->thread_id, cpu->apic_id, + topo_ids.smt_id); + return; + } + cpu->thread_id = topo_ids.smt_id; + + if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && + !kvm_hv_vpindex_settable()) { + error_setg(errp, "kernel doesn't allow setting HyperV VP_INDEX"); + return; + } + + cs = CPU(cpu); + cs->cpu_index = idx; + + numa_cpu_pre_plug(cpu_slot, dev, errp); +} + +CpuInstanceProperties +x86_cpu_index_to_props(MachineState *ms, unsigned cpu_index) +{ + MachineClass *mc = MACHINE_GET_CLASS(ms); + const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms); + + assert(cpu_index < possible_cpus->len); + return possible_cpus->cpus[cpu_index].props; +} + +int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx) +{ + X86CPUTopoIDs topo_ids; + X86MachineState *x86ms = X86_MACHINE(ms); + X86CPUTopoInfo topo_info; + + init_topo_info(&topo_info, x86ms); + + assert(idx < ms->possible_cpus->len); + x86_topo_ids_from_apicid(ms->possible_cpus->cpus[idx].arch_id, + &topo_info, &topo_ids); + return topo_ids.pkg_id % ms->numa_state->num_nodes; +} + +const CPUArchIdList *x86_possible_cpu_arch_ids(MachineState *ms) +{ + X86MachineState *x86ms = X86_MACHINE(ms); + unsigned int max_cpus = ms->smp.max_cpus; + X86CPUTopoInfo topo_info; + int i; + + if (ms->possible_cpus) { + /* + * make sure that max_cpus hasn't changed since the first use, i.e. 
+ * -smp hasn't been parsed after it + */ + assert(ms->possible_cpus->len == max_cpus); + return ms->possible_cpus; + } + + ms->possible_cpus = g_malloc0(sizeof(CPUArchIdList) + + sizeof(CPUArchId) * max_cpus); + ms->possible_cpus->len = max_cpus; + + init_topo_info(&topo_info, x86ms); + + for (i = 0; i < ms->possible_cpus->len; i++) { + X86CPUTopoIDs topo_ids; + + ms->possible_cpus->cpus[i].type = ms->cpu_type; + ms->possible_cpus->cpus[i].vcpus_count = 1; + ms->possible_cpus->cpus[i].arch_id = + x86_cpu_apic_id_from_index(x86ms, i); + x86_topo_ids_from_apicid(ms->possible_cpus->cpus[i].arch_id, + &topo_info, &topo_ids); + ms->possible_cpus->cpus[i].props.has_socket_id = true; + ms->possible_cpus->cpus[i].props.socket_id = topo_ids.pkg_id; + if (ms->smp.dies > 1) { + ms->possible_cpus->cpus[i].props.has_die_id = true; + ms->possible_cpus->cpus[i].props.die_id = topo_ids.die_id; + } + ms->possible_cpus->cpus[i].props.has_core_id = true; + ms->possible_cpus->cpus[i].props.core_id = topo_ids.core_id; + ms->possible_cpus->cpus[i].props.has_thread_id = true; + ms->possible_cpus->cpus[i].props.thread_id = topo_ids.smt_id; + } + return ms->possible_cpus; +} + +static void x86_nmi(NMIState *n, int cpu_index, Error **errp) +{ + /* cpu index isn't used */ + CPUState *cs; + + CPU_FOREACH(cs) { + X86CPU *cpu = X86_CPU(cs); + + if (!cpu->apic_state) { + cpu_interrupt(cs, CPU_INTERRUPT_NMI); + } else { + apic_deliver_nmi(cpu->apic_state); + } + } +} + +static long get_file_size(FILE *f) +{ + long where, size; + + /* XXX: on Unix systems, using fstat() probably makes more sense */ + + where = ftell(f); + fseek(f, 0, SEEK_END); + size = ftell(f); + fseek(f, where, SEEK_SET); + + return size; +} + +/* TSC handling */ +uint64_t cpu_get_tsc(CPUX86State *env) +{ + return cpus_get_elapsed_ticks(); +} + +/* IRQ handling */ +static void pic_irq_request(void *opaque, int irq, int level) +{ + CPUState *cs = first_cpu; + X86CPU *cpu = X86_CPU(cs); + + trace_x86_pic_interrupt(irq, level); + if (cpu->apic_state && !kvm_irqchip_in_kernel() && + !whpx_apic_in_platform()) { + CPU_FOREACH(cs) { + cpu = X86_CPU(cs); + if (apic_accept_pic_intr(cpu->apic_state)) { + apic_deliver_pic_intr(cpu->apic_state, level); + } + } + } else { + if (level) { + cpu_interrupt(cs, CPU_INTERRUPT_HARD); + } else { + cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD); + } + } +} + +qemu_irq x86_allocate_cpu_irq(void) +{ + return qemu_allocate_irq(pic_irq_request, NULL, 0); +} + +int cpu_get_pic_interrupt(CPUX86State *env) +{ + X86CPU *cpu = env_archcpu(env); + int intno; + + if (!kvm_irqchip_in_kernel() && !whpx_apic_in_platform()) { + intno = apic_get_interrupt(cpu->apic_state); + if (intno >= 0) { + return intno; + } + /* read the irq from the PIC */ + if (!apic_accept_pic_intr(cpu->apic_state)) { + return -1; + } + } + + intno = pic_read_irq(isa_pic); + return intno; +} + +DeviceState *cpu_get_current_apic(void) +{ + if (current_cpu) { + X86CPU *cpu = X86_CPU(current_cpu); + return cpu->apic_state; + } else { + return NULL; + } +} + +void gsi_handler(void *opaque, int n, int level) +{ + GSIState *s = opaque; + + trace_x86_gsi_interrupt(n, level); + switch (n) { + case 0 ... ISA_NUM_IRQS - 1: + if (s->i8259_irq[n]) { + /* Under KVM, Kernel will forward to both PIC and IOAPIC */ + qemu_set_irq(s->i8259_irq[n], level); + } + /* fall through */ + case ISA_NUM_IRQS ... IOAPIC_NUM_PINS - 1: + qemu_set_irq(s->ioapic_irq[n], level); + break; + case IO_APIC_SECONDARY_IRQBASE + ... 
IO_APIC_SECONDARY_IRQBASE + IOAPIC_NUM_PINS - 1: + qemu_set_irq(s->ioapic2_irq[n - IO_APIC_SECONDARY_IRQBASE], level); + break; + } +} + +void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name) +{ + DeviceState *dev; + SysBusDevice *d; + unsigned int i; + + assert(parent_name); + if (kvm_ioapic_in_kernel()) { + dev = qdev_new(TYPE_KVM_IOAPIC); + } else { + dev = qdev_new(TYPE_IOAPIC); + } + object_property_add_child(object_resolve_path(parent_name, NULL), + "ioapic", OBJECT(dev)); + d = SYS_BUS_DEVICE(dev); + sysbus_realize_and_unref(d, &error_fatal); + sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS); + + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + gsi_state->ioapic_irq[i] = qdev_get_gpio_in(dev, i); + } +} + +DeviceState *ioapic_init_secondary(GSIState *gsi_state) +{ + DeviceState *dev; + SysBusDevice *d; + unsigned int i; + + dev = qdev_new(TYPE_IOAPIC); + d = SYS_BUS_DEVICE(dev); + sysbus_realize_and_unref(d, &error_fatal); + sysbus_mmio_map(d, 0, IO_APIC_SECONDARY_ADDRESS); + + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + gsi_state->ioapic2_irq[i] = qdev_get_gpio_in(dev, i); + } + return dev; +} + +struct setup_data { + uint64_t next; + uint32_t type; + uint32_t len; + uint8_t data[]; +} __attribute__((packed)); + + +/* + * The entry point into the kernel for PVH boot is different from + * the native entry point. The PVH entry is defined by the x86/HVM + * direct boot ABI and is available in an ELFNOTE in the kernel binary. + * + * This function is passed to load_elf() when it is called from + * load_elfboot() which then additionally checks for an ELF Note of + * type XEN_ELFNOTE_PHYS32_ENTRY and passes it to this function to + * parse the PVH entry address from the ELF Note. + * + * Due to trickery in elf_opts.h, load_elf() is actually available as + * load_elf32() or load_elf64() and this routine needs to be able + * to deal with being called as 32 or 64 bit. + * + * The address of the PVH entry point is saved to the 'pvh_start_addr' + * global variable. (although the entry point is 32-bit, the kernel + * binary can be either 32-bit or 64-bit). + */ +static uint64_t read_pvh_start_addr(void *arg1, void *arg2, bool is64) +{ + size_t *elf_note_data_addr; + + /* Check if ELF Note header passed in is valid */ + if (arg1 == NULL) { + return 0; + } + + if (is64) { + struct elf64_note *nhdr64 = (struct elf64_note *)arg1; + uint64_t nhdr_size64 = sizeof(struct elf64_note); + uint64_t phdr_align = *(uint64_t *)arg2; + uint64_t nhdr_namesz = nhdr64->n_namesz; + + elf_note_data_addr = + ((void *)nhdr64) + nhdr_size64 + + QEMU_ALIGN_UP(nhdr_namesz, phdr_align); + + pvh_start_addr = *elf_note_data_addr; + } else { + struct elf32_note *nhdr32 = (struct elf32_note *)arg1; + uint32_t nhdr_size32 = sizeof(struct elf32_note); + uint32_t phdr_align = *(uint32_t *)arg2; + uint32_t nhdr_namesz = nhdr32->n_namesz; + + elf_note_data_addr = + ((void *)nhdr32) + nhdr_size32 + + QEMU_ALIGN_UP(nhdr_namesz, phdr_align); + + pvh_start_addr = *(uint32_t *)elf_note_data_addr; + } + + return pvh_start_addr; +} + +static bool load_elfboot(const char *kernel_filename, + int kernel_file_size, + uint8_t *header, + size_t pvh_xen_start_addr, + FWCfgState *fw_cfg) +{ + uint32_t flags = 0; + uint32_t mh_load_addr = 0; + uint32_t elf_kernel_size = 0; + uint64_t elf_entry; + uint64_t elf_low, elf_high; + int kernel_size; + + if (ldl_p(header) != 0x464c457f) { + return false; /* no elfboot */ + } + + bool elf_is64 = header[EI_CLASS] == ELFCLASS64; + flags = elf_is64 ? 
+ ((Elf64_Ehdr *)header)->e_flags : ((Elf32_Ehdr *)header)->e_flags; + + if (flags & 0x00010004) { /* LOAD_ELF_HEADER_HAS_ADDR */ + error_report("elfboot unsupported flags = %x", flags); + exit(1); + } + + uint64_t elf_note_type = XEN_ELFNOTE_PHYS32_ENTRY; + kernel_size = load_elf(kernel_filename, read_pvh_start_addr, + NULL, &elf_note_type, &elf_entry, + &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE, + 0, 0); + + if (kernel_size < 0) { + error_report("Error while loading elf kernel"); + exit(1); + } + mh_load_addr = elf_low; + elf_kernel_size = elf_high - elf_low; + + if (pvh_start_addr == 0) { + error_report("Error loading uncompressed kernel without PVH ELF Note"); + exit(1); + } + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, pvh_start_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, elf_kernel_size); + + return true; +} + +void x86_load_linux(X86MachineState *x86ms, + FWCfgState *fw_cfg, + int acpi_data_size, + bool pvh_enabled) +{ + bool linuxboot_dma_enabled = X86_MACHINE_GET_CLASS(x86ms)->fwcfg_dma_enabled; + uint16_t protocol; + int setup_size, kernel_size, cmdline_size; + int dtb_size, setup_data_offset; + uint32_t initrd_max; + uint8_t header[8192], *setup, *kernel; + hwaddr real_addr, prot_addr, cmdline_addr, initrd_addr = 0; + FILE *f; + char *vmode; + MachineState *machine = MACHINE(x86ms); + struct setup_data *setup_data; + const char *kernel_filename = machine->kernel_filename; + const char *initrd_filename = machine->initrd_filename; + const char *dtb_filename = machine->dtb; + const char *kernel_cmdline = machine->kernel_cmdline; + SevKernelLoaderContext sev_load_ctx = {}; + + /* Align to 16 bytes as a paranoia measure */ + cmdline_size = (strlen(kernel_cmdline) + 16) & ~15; + + /* load the kernel header */ + f = fopen(kernel_filename, "rb"); + if (!f) { + fprintf(stderr, "qemu: could not open kernel file '%s': %s\n", + kernel_filename, strerror(errno)); + exit(1); + } + + kernel_size = get_file_size(f); + if (!kernel_size || + fread(header, 1, MIN(ARRAY_SIZE(header), kernel_size), f) != + MIN(ARRAY_SIZE(header), kernel_size)) { + fprintf(stderr, "qemu: could not load kernel '%s': %s\n", + kernel_filename, strerror(errno)); + exit(1); + } + + /* kernel protocol version */ + if (ldl_p(header + 0x202) == 0x53726448) { + protocol = lduw_p(header + 0x206); + } else { + /* + * This could be a multiboot kernel. If it is, let's stop treating it + * like a Linux kernel. + * Note: some multiboot images could be in the ELF format (the same of + * PVH), so we try multiboot first since we check the multiboot magic + * header before to load it. + */ + if (load_multiboot(x86ms, fw_cfg, f, kernel_filename, initrd_filename, + kernel_cmdline, kernel_size, header)) { + return; + } + /* + * Check if the file is an uncompressed kernel file (ELF) and load it, + * saving the PVH entry point used by the x86/HVM direct boot ABI. + * If load_elfboot() is successful, populate the fw_cfg info. 
+ */ + if (pvh_enabled && + load_elfboot(kernel_filename, kernel_size, + header, pvh_start_addr, fw_cfg)) { + fclose(f); + + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, + strlen(kernel_cmdline) + 1); + fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline); + + fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, sizeof(header)); + fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, + header, sizeof(header)); + + /* load initrd */ + if (initrd_filename) { + GMappedFile *mapped_file; + gsize initrd_size; + gchar *initrd_data; + GError *gerr = NULL; + + mapped_file = g_mapped_file_new(initrd_filename, false, &gerr); + if (!mapped_file) { + fprintf(stderr, "qemu: error reading initrd %s: %s\n", + initrd_filename, gerr->message); + exit(1); + } + x86ms->initrd_mapped_file = mapped_file; + + initrd_data = g_mapped_file_get_contents(mapped_file); + initrd_size = g_mapped_file_get_length(mapped_file); + initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; + if (initrd_size >= initrd_max) { + fprintf(stderr, "qemu: initrd is too large, cannot support." + "(max: %"PRIu32", need %"PRId64")\n", + initrd_max, (uint64_t)initrd_size); + exit(1); + } + + initrd_addr = (initrd_max - initrd_size) & ~4095; + + fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size); + fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, + initrd_size); + } + + option_rom[nb_option_roms].bootindex = 0; + option_rom[nb_option_roms].name = "pvh.bin"; + nb_option_roms++; + + return; + } + protocol = 0; + } + + if (protocol < 0x200 || !(header[0x211] & 0x01)) { + /* Low kernel */ + real_addr = 0x90000; + cmdline_addr = 0x9a000 - cmdline_size; + prot_addr = 0x10000; + } else if (protocol < 0x202) { + /* High but ancient kernel */ + real_addr = 0x90000; + cmdline_addr = 0x9a000 - cmdline_size; + prot_addr = 0x100000; + } else { + /* High and recent kernel */ + real_addr = 0x10000; + cmdline_addr = 0x20000; + prot_addr = 0x100000; + } + + /* highest address for loading the initrd */ + if (protocol >= 0x20c && + lduw_p(header + 0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) { + /* + * Linux has supported initrd up to 4 GB for a very long time (2007, + * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 2013), + * though it only sets initrd_max to 2 GB to "work around bootloader + * bugs". Luckily, QEMU firmware(which does something like bootloader) + * has supported this. + * + * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, initrd can + * be loaded into any address. + * + * In addition, initrd_max is uint32_t simply because QEMU doesn't + * support the 64-bit boot protocol (specifically the ext_ramdisk_image + * field). + * + * Therefore here just limit initrd_max to UINT32_MAX simply as well. 
+ */ + initrd_max = UINT32_MAX; + } else if (protocol >= 0x203) { + initrd_max = ldl_p(header + 0x22c); + } else { + initrd_max = 0x37ffffff; + } + + if (initrd_max >= x86ms->below_4g_mem_size - acpi_data_size) { + initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; + } + + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(kernel_cmdline) + 1); + fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline); + sev_load_ctx.cmdline_data = (char *)kernel_cmdline; + sev_load_ctx.cmdline_size = strlen(kernel_cmdline) + 1; + + if (protocol >= 0x202) { + stl_p(header + 0x228, cmdline_addr); + } else { + stw_p(header + 0x20, 0xA33F); + stw_p(header + 0x22, cmdline_addr - real_addr); + } + + /* handle vga= parameter */ + vmode = strstr(kernel_cmdline, "vga="); + if (vmode) { + unsigned int video_mode; + const char *end; + int ret; + /* skip "vga=" */ + vmode += 4; + if (!strncmp(vmode, "normal", 6)) { + video_mode = 0xffff; + } else if (!strncmp(vmode, "ext", 3)) { + video_mode = 0xfffe; + } else if (!strncmp(vmode, "ask", 3)) { + video_mode = 0xfffd; + } else { + ret = qemu_strtoui(vmode, &end, 0, &video_mode); + if (ret != 0 || (*end && *end != ' ')) { + fprintf(stderr, "qemu: invalid 'vga=' kernel parameter.\n"); + exit(1); + } + } + stw_p(header + 0x1fa, video_mode); + } + + /* loader type */ + /* + * High nybble = B reserved for QEMU; low nybble is revision number. + * If this code is substantially changed, you may want to consider + * incrementing the revision. + */ + if (protocol >= 0x200) { + header[0x210] = 0xB0; + } + /* heap */ + if (protocol >= 0x201) { + header[0x211] |= 0x80; /* CAN_USE_HEAP */ + stw_p(header + 0x224, cmdline_addr - real_addr - 0x200); + } + + /* load initrd */ + if (initrd_filename) { + GMappedFile *mapped_file; + gsize initrd_size; + gchar *initrd_data; + GError *gerr = NULL; + + if (protocol < 0x200) { + fprintf(stderr, "qemu: linux kernel too old to load a ram disk\n"); + exit(1); + } + + mapped_file = g_mapped_file_new(initrd_filename, false, &gerr); + if (!mapped_file) { + fprintf(stderr, "qemu: error reading initrd %s: %s\n", + initrd_filename, gerr->message); + exit(1); + } + x86ms->initrd_mapped_file = mapped_file; + + initrd_data = g_mapped_file_get_contents(mapped_file); + initrd_size = g_mapped_file_get_length(mapped_file); + if (initrd_size >= initrd_max) { + fprintf(stderr, "qemu: initrd is too large, cannot support." 
+ "(max: %"PRIu32", need %"PRId64")\n", + initrd_max, (uint64_t)initrd_size); + exit(1); + } + + initrd_addr = (initrd_max - initrd_size) & ~4095; + + fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size); + fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, initrd_size); + sev_load_ctx.initrd_data = initrd_data; + sev_load_ctx.initrd_size = initrd_size; + + stl_p(header + 0x218, initrd_addr); + stl_p(header + 0x21c, initrd_size); + } + + /* load kernel and setup */ + setup_size = header[0x1f1]; + if (setup_size == 0) { + setup_size = 4; + } + setup_size = (setup_size + 1) * 512; + if (setup_size > kernel_size) { + fprintf(stderr, "qemu: invalid kernel header\n"); + exit(1); + } + kernel_size -= setup_size; + + setup = g_malloc(setup_size); + kernel = g_malloc(kernel_size); + fseek(f, 0, SEEK_SET); + if (fread(setup, 1, setup_size, f) != setup_size) { + fprintf(stderr, "fread() failed\n"); + exit(1); + } + if (fread(kernel, 1, kernel_size, f) != kernel_size) { + fprintf(stderr, "fread() failed\n"); + exit(1); + } + fclose(f); + + /* append dtb to kernel */ + if (dtb_filename) { + if (protocol < 0x209) { + fprintf(stderr, "qemu: Linux kernel too old to load a dtb\n"); + exit(1); + } + + dtb_size = get_image_size(dtb_filename); + if (dtb_size <= 0) { + fprintf(stderr, "qemu: error reading dtb %s: %s\n", + dtb_filename, strerror(errno)); + exit(1); + } + + setup_data_offset = QEMU_ALIGN_UP(kernel_size, 16); + kernel_size = setup_data_offset + sizeof(struct setup_data) + dtb_size; + kernel = g_realloc(kernel, kernel_size); + + stq_p(header + 0x250, prot_addr + setup_data_offset); + + setup_data = (struct setup_data *)(kernel + setup_data_offset); + setup_data->next = 0; + setup_data->type = cpu_to_le32(SETUP_DTB); + setup_data->len = cpu_to_le32(dtb_size); + + load_image_size(dtb_filename, setup_data->data, dtb_size); + } + + /* + * If we're starting an encrypted VM, it will be OVMF based, which uses the + * efi stub for booting and doesn't require any values to be placed in the + * kernel header. We therefore don't update the header so the hash of the + * kernel on the other side of the fw_cfg interface matches the hash of the + * file the user passed in. 
+ */ + if (!sev_enabled()) { + memcpy(setup, header, MIN(sizeof(header), setup_size)); + } + + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, prot_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, kernel_size); + fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, kernel, kernel_size); + sev_load_ctx.kernel_data = (char *)kernel; + sev_load_ctx.kernel_size = kernel_size; + + fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_ADDR, real_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, setup_size); + fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, setup, setup_size); + sev_load_ctx.setup_data = (char *)setup; + sev_load_ctx.setup_size = setup_size; + + if (sev_enabled()) { + sev_add_kernel_loader_hashes(&sev_load_ctx, &error_fatal); + } + + option_rom[nb_option_roms].bootindex = 0; + option_rom[nb_option_roms].name = "linuxboot.bin"; + if (linuxboot_dma_enabled && fw_cfg_dma_enabled(fw_cfg)) { + option_rom[nb_option_roms].name = "linuxboot_dma.bin"; + } + nb_option_roms++; +} + +void x86_bios_rom_init(MachineState *ms, const char *default_firmware, + MemoryRegion *rom_memory, bool isapc_ram_fw) +{ + const char *bios_name; + char *filename; + MemoryRegion *bios, *isa_bios; + int bios_size, isa_bios_size; + int ret; + + /* BIOS load */ + bios_name = ms->firmware ?: default_firmware; + filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name); + if (filename) { + bios_size = get_image_size(filename); + } else { + bios_size = -1; + } + if (bios_size <= 0 || + (bios_size % 65536) != 0) { + goto bios_error; + } + bios = g_malloc(sizeof(*bios)); + memory_region_init_ram(bios, NULL, "pc.bios", bios_size, &error_fatal); + if (!isapc_ram_fw) { + memory_region_set_readonly(bios, true); + } + ret = rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1); + if (ret != 0) { + bios_error: + fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name); + exit(1); + } + g_free(filename); + + /* map the last 128KB of the BIOS in ISA space */ + isa_bios_size = MIN(bios_size, 128 * KiB); + isa_bios = g_malloc(sizeof(*isa_bios)); + memory_region_init_alias(isa_bios, NULL, "isa-bios", bios, + bios_size - isa_bios_size, isa_bios_size); + memory_region_add_subregion_overlap(rom_memory, + 0x100000 - isa_bios_size, + isa_bios, + 1); + if (!isapc_ram_fw) { + memory_region_set_readonly(isa_bios, true); + } + + /* map all the bios at the top of memory */ + memory_region_add_subregion(rom_memory, + (uint32_t)(-bios_size), + bios); +} + +bool x86_machine_is_smm_enabled(const X86MachineState *x86ms) +{ + bool smm_available = false; + + if (x86ms->smm == ON_OFF_AUTO_OFF) { + return false; + } + + if (tcg_enabled() || qtest_enabled()) { + smm_available = true; + } else if (kvm_enabled()) { + smm_available = kvm_has_smm(); + } + + if (smm_available) { + return true; + } + + if (x86ms->smm == ON_OFF_AUTO_ON) { + error_report("System Management Mode not supported by this hypervisor."); + exit(1); + } + return false; +} + +static void x86_machine_get_smm(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + OnOffAuto smm = x86ms->smm; + + visit_type_OnOffAuto(v, name, &smm, errp); +} + +static void x86_machine_set_smm(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &x86ms->smm, errp); +} + +bool x86_machine_is_acpi_enabled(const X86MachineState *x86ms) +{ + if (x86ms->acpi == ON_OFF_AUTO_OFF) { + return false; + } + return true; +} + +static void x86_machine_get_acpi(Object 
*obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + OnOffAuto acpi = x86ms->acpi; + + visit_type_OnOffAuto(v, name, &acpi, errp); +} + +static void x86_machine_set_acpi(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &x86ms->acpi, errp); +} + +static char *x86_machine_get_oem_id(Object *obj, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + return g_strdup(x86ms->oem_id); +} + +static void x86_machine_set_oem_id(Object *obj, const char *value, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + size_t len = strlen(value); + + if (len > 6) { + error_setg(errp, + "User specified "X86_MACHINE_OEM_ID" value is bigger than " + "6 bytes in size"); + return; + } + + strncpy(x86ms->oem_id, value, 6); +} + +static char *x86_machine_get_oem_table_id(Object *obj, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + return g_strdup(x86ms->oem_table_id); +} + +static void x86_machine_set_oem_table_id(Object *obj, const char *value, + Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + size_t len = strlen(value); + + if (len > 8) { + error_setg(errp, + "User specified "X86_MACHINE_OEM_TABLE_ID + " value is bigger than " + "8 bytes in size"); + return; + } + strncpy(x86ms->oem_table_id, value, 8); +} + +static void x86_machine_get_bus_lock_ratelimit(Object *obj, Visitor *v, + const char *name, void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + uint64_t bus_lock_ratelimit = x86ms->bus_lock_ratelimit; + + visit_type_uint64(v, name, &bus_lock_ratelimit, errp); +} + +static void x86_machine_set_bus_lock_ratelimit(Object *obj, Visitor *v, + const char *name, void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + visit_type_uint64(v, name, &x86ms->bus_lock_ratelimit, errp); +} + +static void machine_get_sgx_epc(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + SgxEPCList *list = x86ms->sgx_epc_list; + + visit_type_SgxEPCList(v, name, &list, errp); +} + +static void machine_set_sgx_epc(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + SgxEPCList *list; + + list = x86ms->sgx_epc_list; + visit_type_SgxEPCList(v, name, &x86ms->sgx_epc_list, errp); + + qapi_free_SgxEPCList(list); +} + +static void x86_machine_initfn(Object *obj) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + x86ms->smm = ON_OFF_AUTO_AUTO; + x86ms->acpi = ON_OFF_AUTO_AUTO; + x86ms->pci_irq_mask = ACPI_BUILD_PCI_IRQS; + x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); + x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); + x86ms->bus_lock_ratelimit = 0; +} + +static void x86_machine_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + X86MachineClass *x86mc = X86_MACHINE_CLASS(oc); + NMIClass *nc = NMI_CLASS(oc); + + mc->cpu_index_to_instance_props = x86_cpu_index_to_props; + mc->get_default_cpu_node_id = x86_get_default_cpu_node_id; + mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids; + x86mc->compat_apic_id_mode = false; + x86mc->save_tsc_khz = true; + x86mc->fwcfg_dma_enabled = true; + nc->nmi_monitor_handler = x86_nmi; + + object_class_property_add(oc, X86_MACHINE_SMM, "OnOffAuto", + x86_machine_get_smm, x86_machine_set_smm, + NULL, NULL); + object_class_property_set_description(oc, 
X86_MACHINE_SMM, + "Enable SMM"); + + object_class_property_add(oc, X86_MACHINE_ACPI, "OnOffAuto", + x86_machine_get_acpi, x86_machine_set_acpi, + NULL, NULL); + object_class_property_set_description(oc, X86_MACHINE_ACPI, + "Enable ACPI"); + + object_class_property_add_str(oc, X86_MACHINE_OEM_ID, + x86_machine_get_oem_id, + x86_machine_set_oem_id); + object_class_property_set_description(oc, X86_MACHINE_OEM_ID, + "Override the default value of field OEMID " + "in ACPI table header." + "The string may be up to 6 bytes in size"); + + + object_class_property_add_str(oc, X86_MACHINE_OEM_TABLE_ID, + x86_machine_get_oem_table_id, + x86_machine_set_oem_table_id); + object_class_property_set_description(oc, X86_MACHINE_OEM_TABLE_ID, + "Override the default value of field OEM Table ID " + "in ACPI table header." + "The string may be up to 8 bytes in size"); + + object_class_property_add(oc, X86_MACHINE_BUS_LOCK_RATELIMIT, "uint64_t", + x86_machine_get_bus_lock_ratelimit, + x86_machine_set_bus_lock_ratelimit, NULL, NULL); + object_class_property_set_description(oc, X86_MACHINE_BUS_LOCK_RATELIMIT, + "Set the ratelimit for the bus locks acquired in VMs"); + + object_class_property_add(oc, "sgx-epc", "SgxEPC", + machine_get_sgx_epc, machine_set_sgx_epc, + NULL, NULL); + object_class_property_set_description(oc, "sgx-epc", + "SGX EPC device"); +} + +static const TypeInfo x86_machine_info = { + .name = TYPE_X86_MACHINE, + .parent = TYPE_MACHINE, + .abstract = true, + .instance_size = sizeof(X86MachineState), + .instance_init = x86_machine_initfn, + .class_size = sizeof(X86MachineClass), + .class_init = x86_machine_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_NMI }, + { } + }, +}; + +static void x86_machine_register_types(void) +{ + type_register_static(&x86_machine_info); +} + +type_init(x86_machine_register_types) diff --git a/hw/i386/xen/meson.build b/hw/i386/xen/meson.build new file mode 100644 index 000000000..be8413030 --- /dev/null +++ b/hw/i386/xen/meson.build @@ -0,0 +1,7 @@ +i386_ss.add(when: 'CONFIG_XEN', if_true: files( + 'xen-hvm.c', + 'xen-mapcache.c', + 'xen_apic.c', + 'xen_platform.c', + 'xen_pvdevice.c', +)) diff --git a/hw/i386/xen/trace-events b/hw/i386/xen/trace-events new file mode 100644 index 000000000..5d6be6109 --- /dev/null +++ b/hw/i386/xen/trace-events @@ -0,0 +1,28 @@ +# See docs/devel/tracing.rst for syntax documentation. 
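+#
+# Each line below declares one trace event: its name and C argument
+# prototype, followed by the printf-style format string used when the
+# event fires.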
+ +# xen_platform.c +xen_platform_log(char *s) "xen platform: %s" + +# xen_pvdevice.c +xen_pv_mmio_read(uint64_t addr) "WARNING: read from Xen PV Device MMIO space (address 0x%"PRIx64")" +xen_pv_mmio_write(uint64_t addr) "WARNING: write to Xen PV Device MMIO space (address 0x%"PRIx64")" + +# xen-hvm.c +xen_ram_alloc(unsigned long ram_addr, unsigned long size) "requested: 0x%lx, size 0x%lx" +xen_client_set_memory(uint64_t start_addr, unsigned long size, bool log_dirty) "0x%"PRIx64" size 0x%lx, log_dirty %i" +handle_ioreq(void *req, uint32_t type, uint32_t dir, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p type=%d dir=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d" +handle_ioreq_read(void *req, uint32_t type, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p read type=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d" +handle_ioreq_write(void *req, uint32_t type, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p write type=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d" +cpu_ioreq_pio(void *req, uint32_t dir, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p pio dir=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d" +cpu_ioreq_pio_read_reg(void *req, uint64_t data, uint64_t addr, uint32_t size) "I/O=%p pio read reg data=0x%"PRIx64" port=0x%"PRIx64" size=%d" +cpu_ioreq_pio_write_reg(void *req, uint64_t data, uint64_t addr, uint32_t size) "I/O=%p pio write reg data=0x%"PRIx64" port=0x%"PRIx64" size=%d" +cpu_ioreq_move(void *req, uint32_t dir, uint32_t df, uint32_t data_is_ptr, uint64_t addr, uint64_t data, uint32_t count, uint32_t size) "I/O=%p copy dir=%d df=%d ptr=%d port=0x%"PRIx64" data=0x%"PRIx64" count=%d size=%d" +xen_map_resource_ioreq(uint32_t id, void *addr) "id: %u addr: %p" +cpu_ioreq_config_read(void *req, uint32_t sbdf, uint32_t reg, uint32_t size, uint32_t data) "I/O=%p sbdf=0x%x reg=%u size=%u data=0x%x" +cpu_ioreq_config_write(void *req, uint32_t sbdf, uint32_t reg, uint32_t size, uint32_t data) "I/O=%p sbdf=0x%x reg=%u size=%u data=0x%x" + +# xen-mapcache.c +xen_map_cache(uint64_t phys_addr) "want 0x%"PRIx64 +xen_remap_bucket(uint64_t index) "index 0x%"PRIx64 +xen_map_cache_return(void* ptr) "%p" + diff --git a/hw/i386/xen/trace.h b/hw/i386/xen/trace.h new file mode 100644 index 000000000..a02bf755d --- /dev/null +++ b/hw/i386/xen/trace.h @@ -0,0 +1 @@ +#include "trace/trace-hw_i386_xen.h" diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c new file mode 100644 index 000000000..482be9541 --- /dev/null +++ b/hw/i386/xen/xen-hvm.c @@ -0,0 +1,1620 @@ +/* + * Copyright (C) 2010 Citrix Ltd. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/units.h" + +#include "cpu.h" +#include "hw/pci/pci.h" +#include "hw/pci/pci_host.h" +#include "hw/i386/pc.h" +#include "hw/southbridge/piix.h" +#include "hw/irq.h" +#include "hw/hw.h" +#include "hw/i386/apic-msidef.h" +#include "hw/xen/xen_common.h" +#include "hw/xen/xen-legacy-backend.h" +#include "hw/xen/xen-bus.h" +#include "hw/xen/xen-x86.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-migration.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "qemu/range.h" +#include "sysemu/runstate.h" +#include "sysemu/sysemu.h" +#include "sysemu/xen.h" +#include "sysemu/xen-mapcache.h" +#include "trace.h" + +#include <xen/hvm/ioreq.h> +#include <xen/hvm/e820.h> + +//#define DEBUG_XEN_HVM + +#ifdef DEBUG_XEN_HVM +#define DPRINTF(fmt, ...) \ + do { fprintf(stderr, "xen: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ + do { } while (0) +#endif + +static MemoryRegion ram_memory, ram_640k, ram_lo, ram_hi; +static MemoryRegion *framebuffer; +static bool xen_in_migration; + +/* Compatibility with older version */ + +/* This allows QEMU to build on a system that has Xen 4.5 or earlier + * installed. This here (not in hw/xen/xen_common.h) because xen/hvm/ioreq.h + * needs to be included before this block and hw/xen/xen_common.h needs to + * be included before xen/hvm/ioreq.h + */ +#ifndef IOREQ_TYPE_VMWARE_PORT +#define IOREQ_TYPE_VMWARE_PORT 3 +struct vmware_regs { + uint32_t esi; + uint32_t edi; + uint32_t ebx; + uint32_t ecx; + uint32_t edx; +}; +typedef struct vmware_regs vmware_regs_t; + +struct shared_vmport_iopage { + struct vmware_regs vcpu_vmport_regs[1]; +}; +typedef struct shared_vmport_iopage shared_vmport_iopage_t; +#endif + +static inline uint32_t xen_vcpu_eport(shared_iopage_t *shared_page, int i) +{ + return shared_page->vcpu_ioreq[i].vp_eport; +} +static inline ioreq_t *xen_vcpu_ioreq(shared_iopage_t *shared_page, int vcpu) +{ + return &shared_page->vcpu_ioreq[vcpu]; +} + +#define BUFFER_IO_MAX_DELAY 100 + +typedef struct XenPhysmap { + hwaddr start_addr; + ram_addr_t size; + const char *name; + hwaddr phys_offset; + + QLIST_ENTRY(XenPhysmap) list; +} XenPhysmap; + +static QLIST_HEAD(, XenPhysmap) xen_physmap; + +typedef struct XenPciDevice { + PCIDevice *pci_dev; + uint32_t sbdf; + QLIST_ENTRY(XenPciDevice) entry; +} XenPciDevice; + +typedef struct XenIOState { + ioservid_t ioservid; + shared_iopage_t *shared_page; + shared_vmport_iopage_t *shared_vmport_page; + buffered_iopage_t *buffered_io_page; + xenforeignmemory_resource_handle *fres; + QEMUTimer *buffered_io_timer; + CPUState **cpu_by_vcpu_id; + /* the evtchn port for polling the notification, */ + evtchn_port_t *ioreq_local_port; + /* evtchn remote and local ports for buffered io */ + evtchn_port_t bufioreq_remote_port; + evtchn_port_t bufioreq_local_port; + /* the evtchn fd for polling */ + xenevtchn_handle *xce_handle; + /* which vcpu we are serving */ + int send_vcpu; + + struct xs_handle *xenstore; + MemoryListener memory_listener; + MemoryListener io_listener; + QLIST_HEAD(, XenPciDevice) dev_list; + DeviceListener device_listener; + hwaddr free_phys_offset; + const XenPhysmap *log_for_dirtybit; + /* Buffer used by xen_sync_dirty_bitmap */ + unsigned long *dirty_bitmap; + + Notifier exit; + Notifier suspend; + Notifier wakeup; +} XenIOState; + +/* Xen specific function for piix pci */ + +int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num) +{ + return irq_num + (PCI_SLOT(pci_dev->devfn) << 2); +} + +void 
xen_piix3_set_irq(void *opaque, int irq_num, int level) +{ + xen_set_pci_intx_level(xen_domid, 0, 0, irq_num >> 2, + irq_num & 3, level); +} + +void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len) +{ + int i; + + /* Scan for updates to PCI link routes (0x60-0x63). */ + for (i = 0; i < len; i++) { + uint8_t v = (val >> (8 * i)) & 0xff; + if (v & 0x80) { + v = 0; + } + v &= 0xf; + if (((address + i) >= PIIX_PIRQCA) && ((address + i) <= PIIX_PIRQCD)) { + xen_set_pci_link_route(xen_domid, address + i - PIIX_PIRQCA, v); + } + } +} + +int xen_is_pirq_msi(uint32_t msi_data) +{ + /* If vector is 0, the msi is remapped into a pirq, passed as + * dest_id. + */ + return ((msi_data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT) == 0; +} + +void xen_hvm_inject_msi(uint64_t addr, uint32_t data) +{ + xen_inject_msi(xen_domid, addr, data); +} + +static void xen_suspend_notifier(Notifier *notifier, void *data) +{ + xc_set_hvm_param(xen_xc, xen_domid, HVM_PARAM_ACPI_S_STATE, 3); +} + +/* Xen Interrupt Controller */ + +static void xen_set_irq(void *opaque, int irq, int level) +{ + xen_set_isa_irq_level(xen_domid, irq, level); +} + +qemu_irq *xen_interrupt_controller_init(void) +{ + return qemu_allocate_irqs(xen_set_irq, NULL, 16); +} + +/* Memory Ops */ + +static void xen_ram_init(PCMachineState *pcms, + ram_addr_t ram_size, MemoryRegion **ram_memory_p) +{ + X86MachineState *x86ms = X86_MACHINE(pcms); + MemoryRegion *sysmem = get_system_memory(); + ram_addr_t block_len; + uint64_t user_lowmem = + object_property_get_uint(qdev_get_machine(), + PC_MACHINE_MAX_RAM_BELOW_4G, + &error_abort); + + /* Handle the machine opt max-ram-below-4g. It is basically doing + * min(xen limit, user limit). + */ + if (!user_lowmem) { + user_lowmem = HVM_BELOW_4G_RAM_END; /* default */ + } + if (HVM_BELOW_4G_RAM_END <= user_lowmem) { + user_lowmem = HVM_BELOW_4G_RAM_END; + } + + if (ram_size >= user_lowmem) { + x86ms->above_4g_mem_size = ram_size - user_lowmem; + x86ms->below_4g_mem_size = user_lowmem; + } else { + x86ms->above_4g_mem_size = 0; + x86ms->below_4g_mem_size = ram_size; + } + if (!x86ms->above_4g_mem_size) { + block_len = ram_size; + } else { + /* + * Xen does not allocate the memory continuously, it keeps a + * hole of the size computed above or passed in. + */ + block_len = (4 * GiB) + x86ms->above_4g_mem_size; + } + memory_region_init_ram(&ram_memory, NULL, "xen.ram", block_len, + &error_fatal); + *ram_memory_p = &ram_memory; + + memory_region_init_alias(&ram_640k, NULL, "xen.ram.640k", + &ram_memory, 0, 0xa0000); + memory_region_add_subregion(sysmem, 0, &ram_640k); + /* Skip of the VGA IO memory space, it will be registered later by the VGA + * emulated device. + * + * The area between 0xc0000 and 0x100000 will be used by SeaBIOS to load + * the Options ROM, so it is registered here as RAM. 
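+     * (As a result, the VGA window at 0xa0000-0xbffff is left unmapped by
+     * this function.)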
+ */ + memory_region_init_alias(&ram_lo, NULL, "xen.ram.lo", + &ram_memory, 0xc0000, + x86ms->below_4g_mem_size - 0xc0000); + memory_region_add_subregion(sysmem, 0xc0000, &ram_lo); + if (x86ms->above_4g_mem_size > 0) { + memory_region_init_alias(&ram_hi, NULL, "xen.ram.hi", + &ram_memory, 0x100000000ULL, + x86ms->above_4g_mem_size); + memory_region_add_subregion(sysmem, 0x100000000ULL, &ram_hi); + } +} + +void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size, MemoryRegion *mr, + Error **errp) +{ + unsigned long nr_pfn; + xen_pfn_t *pfn_list; + int i; + + if (runstate_check(RUN_STATE_INMIGRATE)) { + /* RAM already populated in Xen */ + fprintf(stderr, "%s: do not alloc "RAM_ADDR_FMT + " bytes of ram at "RAM_ADDR_FMT" when runstate is INMIGRATE\n", + __func__, size, ram_addr); + return; + } + + if (mr == &ram_memory) { + return; + } + + trace_xen_ram_alloc(ram_addr, size); + + nr_pfn = size >> TARGET_PAGE_BITS; + pfn_list = g_malloc(sizeof (*pfn_list) * nr_pfn); + + for (i = 0; i < nr_pfn; i++) { + pfn_list[i] = (ram_addr >> TARGET_PAGE_BITS) + i; + } + + if (xc_domain_populate_physmap_exact(xen_xc, xen_domid, nr_pfn, 0, 0, pfn_list)) { + error_setg(errp, "xen: failed to populate ram at " RAM_ADDR_FMT, + ram_addr); + } + + g_free(pfn_list); +} + +static XenPhysmap *get_physmapping(hwaddr start_addr, ram_addr_t size) +{ + XenPhysmap *physmap = NULL; + + start_addr &= TARGET_PAGE_MASK; + + QLIST_FOREACH(physmap, &xen_physmap, list) { + if (range_covers_byte(physmap->start_addr, physmap->size, start_addr)) { + return physmap; + } + } + return NULL; +} + +static hwaddr xen_phys_offset_to_gaddr(hwaddr phys_offset, ram_addr_t size) +{ + hwaddr addr = phys_offset & TARGET_PAGE_MASK; + XenPhysmap *physmap = NULL; + + QLIST_FOREACH(physmap, &xen_physmap, list) { + if (range_covers_byte(physmap->phys_offset, physmap->size, addr)) { + return physmap->start_addr + (phys_offset - physmap->phys_offset); + } + } + + return phys_offset; +} + +#ifdef XEN_COMPAT_PHYSMAP +static int xen_save_physmap(XenIOState *state, XenPhysmap *physmap) +{ + char path[80], value[17]; + + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap/%"PRIx64"/start_addr", + xen_domid, (uint64_t)physmap->phys_offset); + snprintf(value, sizeof(value), "%"PRIx64, (uint64_t)physmap->start_addr); + if (!xs_write(state->xenstore, 0, path, value, strlen(value))) { + return -1; + } + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap/%"PRIx64"/size", + xen_domid, (uint64_t)physmap->phys_offset); + snprintf(value, sizeof(value), "%"PRIx64, (uint64_t)physmap->size); + if (!xs_write(state->xenstore, 0, path, value, strlen(value))) { + return -1; + } + if (physmap->name) { + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap/%"PRIx64"/name", + xen_domid, (uint64_t)physmap->phys_offset); + if (!xs_write(state->xenstore, 0, path, + physmap->name, strlen(physmap->name))) { + return -1; + } + } + return 0; +} +#else +static int xen_save_physmap(XenIOState *state, XenPhysmap *physmap) +{ + return 0; +} +#endif + +static int xen_add_to_physmap(XenIOState *state, + hwaddr start_addr, + ram_addr_t size, + MemoryRegion *mr, + hwaddr offset_within_region) +{ + unsigned long nr_pages; + int rc = 0; + XenPhysmap *physmap = NULL; + hwaddr pfn, start_gpfn; + hwaddr phys_offset = memory_region_get_ram_addr(mr); + const char *mr_name; + + if (get_physmapping(start_addr, size)) { + return 0; + } + if (size <= 0) { + return -1; + } + + /* Xen can only handle a single dirty log region for now and we 
want + * the linear framebuffer to be that region. + * Avoid tracking any regions that is not videoram and avoid tracking + * the legacy vga region. */ + if (mr == framebuffer && start_addr > 0xbffff) { + goto go_physmap; + } + return -1; + +go_physmap: + DPRINTF("mapping vram to %"HWADDR_PRIx" - %"HWADDR_PRIx"\n", + start_addr, start_addr + size); + + mr_name = memory_region_name(mr); + + physmap = g_malloc(sizeof(XenPhysmap)); + + physmap->start_addr = start_addr; + physmap->size = size; + physmap->name = mr_name; + physmap->phys_offset = phys_offset; + + QLIST_INSERT_HEAD(&xen_physmap, physmap, list); + + if (runstate_check(RUN_STATE_INMIGRATE)) { + /* Now when we have a physmap entry we can replace a dummy mapping with + * a real one of guest foreign memory. */ + uint8_t *p = xen_replace_cache_entry(phys_offset, start_addr, size); + assert(p && p == memory_region_get_ram_ptr(mr)); + + return 0; + } + + pfn = phys_offset >> TARGET_PAGE_BITS; + start_gpfn = start_addr >> TARGET_PAGE_BITS; + nr_pages = size >> TARGET_PAGE_BITS; + rc = xendevicemodel_relocate_memory(xen_dmod, xen_domid, nr_pages, pfn, + start_gpfn); + if (rc) { + int saved_errno = errno; + + error_report("relocate_memory %lu pages from GFN %"HWADDR_PRIx + " to GFN %"HWADDR_PRIx" failed: %s", + nr_pages, pfn, start_gpfn, strerror(saved_errno)); + errno = saved_errno; + return -1; + } + + rc = xendevicemodel_pin_memory_cacheattr(xen_dmod, xen_domid, + start_addr >> TARGET_PAGE_BITS, + (start_addr + size - 1) >> TARGET_PAGE_BITS, + XEN_DOMCTL_MEM_CACHEATTR_WB); + if (rc) { + error_report("pin_memory_cacheattr failed: %s", strerror(errno)); + } + return xen_save_physmap(state, physmap); +} + +static int xen_remove_from_physmap(XenIOState *state, + hwaddr start_addr, + ram_addr_t size) +{ + int rc = 0; + XenPhysmap *physmap = NULL; + hwaddr phys_offset = 0; + + physmap = get_physmapping(start_addr, size); + if (physmap == NULL) { + return -1; + } + + phys_offset = physmap->phys_offset; + size = physmap->size; + + DPRINTF("unmapping vram to %"HWADDR_PRIx" - %"HWADDR_PRIx", at " + "%"HWADDR_PRIx"\n", start_addr, start_addr + size, phys_offset); + + size >>= TARGET_PAGE_BITS; + start_addr >>= TARGET_PAGE_BITS; + phys_offset >>= TARGET_PAGE_BITS; + rc = xendevicemodel_relocate_memory(xen_dmod, xen_domid, size, start_addr, + phys_offset); + if (rc) { + int saved_errno = errno; + + error_report("relocate_memory "RAM_ADDR_FMT" pages" + " from GFN %"HWADDR_PRIx + " to GFN %"HWADDR_PRIx" failed: %s", + size, start_addr, phys_offset, strerror(saved_errno)); + errno = saved_errno; + return -1; + } + + QLIST_REMOVE(physmap, list); + if (state->log_for_dirtybit == physmap) { + state->log_for_dirtybit = NULL; + g_free(state->dirty_bitmap); + state->dirty_bitmap = NULL; + } + g_free(physmap); + + return 0; +} + +static void xen_set_memory(struct MemoryListener *listener, + MemoryRegionSection *section, + bool add) +{ + XenIOState *state = container_of(listener, XenIOState, memory_listener); + hwaddr start_addr = section->offset_within_address_space; + ram_addr_t size = int128_get64(section->size); + bool log_dirty = memory_region_is_logging(section->mr, DIRTY_MEMORY_VGA); + hvmmem_type_t mem_type; + + if (section->mr == &ram_memory) { + return; + } else { + if (add) { + xen_map_memory_section(xen_domid, state->ioservid, + section); + } else { + xen_unmap_memory_section(xen_domid, state->ioservid, + section); + } + } + + if (!memory_region_is_ram(section->mr)) { + return; + } + + if (log_dirty != add) { + return; + } + + 
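+    /*
+     * Only RAM sections reach this point, and only when log_dirty == add:
+     * regions with VGA dirty logging enabled are added, regions without it
+     * are removed.  The guest physmap (or the read-only memory type for
+     * ROMs) is updated below.
+     */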
trace_xen_client_set_memory(start_addr, size, log_dirty); + + start_addr &= TARGET_PAGE_MASK; + size = TARGET_PAGE_ALIGN(size); + + if (add) { + if (!memory_region_is_rom(section->mr)) { + xen_add_to_physmap(state, start_addr, size, + section->mr, section->offset_within_region); + } else { + mem_type = HVMMEM_ram_ro; + if (xen_set_mem_type(xen_domid, mem_type, + start_addr >> TARGET_PAGE_BITS, + size >> TARGET_PAGE_BITS)) { + DPRINTF("xen_set_mem_type error, addr: "TARGET_FMT_plx"\n", + start_addr); + } + } + } else { + if (xen_remove_from_physmap(state, start_addr, size) < 0) { + DPRINTF("physmapping does not exist at "TARGET_FMT_plx"\n", start_addr); + } + } +} + +static void xen_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + memory_region_ref(section->mr); + xen_set_memory(listener, section, true); +} + +static void xen_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + xen_set_memory(listener, section, false); + memory_region_unref(section->mr); +} + +static void xen_io_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + XenIOState *state = container_of(listener, XenIOState, io_listener); + MemoryRegion *mr = section->mr; + + if (mr->ops == &unassigned_io_ops) { + return; + } + + memory_region_ref(mr); + + xen_map_io_section(xen_domid, state->ioservid, section); +} + +static void xen_io_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + XenIOState *state = container_of(listener, XenIOState, io_listener); + MemoryRegion *mr = section->mr; + + if (mr->ops == &unassigned_io_ops) { + return; + } + + xen_unmap_io_section(xen_domid, state->ioservid, section); + + memory_region_unref(mr); +} + +static void xen_device_realize(DeviceListener *listener, + DeviceState *dev) +{ + XenIOState *state = container_of(listener, XenIOState, device_listener); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + PCIDevice *pci_dev = PCI_DEVICE(dev); + XenPciDevice *xendev = g_new(XenPciDevice, 1); + + xendev->pci_dev = pci_dev; + xendev->sbdf = PCI_BUILD_BDF(pci_dev_bus_num(pci_dev), + pci_dev->devfn); + QLIST_INSERT_HEAD(&state->dev_list, xendev, entry); + + xen_map_pcidev(xen_domid, state->ioservid, pci_dev); + } +} + +static void xen_device_unrealize(DeviceListener *listener, + DeviceState *dev) +{ + XenIOState *state = container_of(listener, XenIOState, device_listener); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + PCIDevice *pci_dev = PCI_DEVICE(dev); + XenPciDevice *xendev, *next; + + xen_unmap_pcidev(xen_domid, state->ioservid, pci_dev); + + QLIST_FOREACH_SAFE(xendev, &state->dev_list, entry, next) { + if (xendev->pci_dev == pci_dev) { + QLIST_REMOVE(xendev, entry); + g_free(xendev); + break; + } + } + } +} + +static void xen_sync_dirty_bitmap(XenIOState *state, + hwaddr start_addr, + ram_addr_t size) +{ + hwaddr npages = size >> TARGET_PAGE_BITS; + const int width = sizeof(unsigned long) * 8; + size_t bitmap_size = DIV_ROUND_UP(npages, width); + int rc, i, j; + const XenPhysmap *physmap = NULL; + + physmap = get_physmapping(start_addr, size); + if (physmap == NULL) { + /* not handled */ + return; + } + + if (state->log_for_dirtybit == NULL) { + state->log_for_dirtybit = physmap; + state->dirty_bitmap = g_new(unsigned long, bitmap_size); + } else if (state->log_for_dirtybit != physmap) { + /* Only one range for dirty bitmap can be tracked. 
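+            Any other physmap range is simply left untracked here.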
*/ + return; + } + + rc = xen_track_dirty_vram(xen_domid, start_addr >> TARGET_PAGE_BITS, + npages, state->dirty_bitmap); + if (rc < 0) { +#ifndef ENODATA +#define ENODATA ENOENT +#endif + if (errno == ENODATA) { + memory_region_set_dirty(framebuffer, 0, size); + DPRINTF("xen: track_dirty_vram failed (0x" TARGET_FMT_plx + ", 0x" TARGET_FMT_plx "): %s\n", + start_addr, start_addr + size, strerror(errno)); + } + return; + } + + for (i = 0; i < bitmap_size; i++) { + unsigned long map = state->dirty_bitmap[i]; + while (map != 0) { + j = ctzl(map); + map &= ~(1ul << j); + memory_region_set_dirty(framebuffer, + (i * width + j) * TARGET_PAGE_SIZE, + TARGET_PAGE_SIZE); + }; + } +} + +static void xen_log_start(MemoryListener *listener, + MemoryRegionSection *section, + int old, int new) +{ + XenIOState *state = container_of(listener, XenIOState, memory_listener); + + if (new & ~old & (1 << DIRTY_MEMORY_VGA)) { + xen_sync_dirty_bitmap(state, section->offset_within_address_space, + int128_get64(section->size)); + } +} + +static void xen_log_stop(MemoryListener *listener, MemoryRegionSection *section, + int old, int new) +{ + XenIOState *state = container_of(listener, XenIOState, memory_listener); + + if (old & ~new & (1 << DIRTY_MEMORY_VGA)) { + state->log_for_dirtybit = NULL; + g_free(state->dirty_bitmap); + state->dirty_bitmap = NULL; + /* Disable dirty bit tracking */ + xen_track_dirty_vram(xen_domid, 0, 0, NULL); + } +} + +static void xen_log_sync(MemoryListener *listener, MemoryRegionSection *section) +{ + XenIOState *state = container_of(listener, XenIOState, memory_listener); + + xen_sync_dirty_bitmap(state, section->offset_within_address_space, + int128_get64(section->size)); +} + +static void xen_log_global_start(MemoryListener *listener) +{ + if (xen_enabled()) { + xen_in_migration = true; + } +} + +static void xen_log_global_stop(MemoryListener *listener) +{ + xen_in_migration = false; +} + +static MemoryListener xen_memory_listener = { + .name = "xen-memory", + .region_add = xen_region_add, + .region_del = xen_region_del, + .log_start = xen_log_start, + .log_stop = xen_log_stop, + .log_sync = xen_log_sync, + .log_global_start = xen_log_global_start, + .log_global_stop = xen_log_global_stop, + .priority = 10, +}; + +static MemoryListener xen_io_listener = { + .name = "xen-io", + .region_add = xen_io_add, + .region_del = xen_io_del, + .priority = 10, +}; + +static DeviceListener xen_device_listener = { + .realize = xen_device_realize, + .unrealize = xen_device_unrealize, +}; + +/* get the ioreq packets from share mem */ +static ioreq_t *cpu_get_ioreq_from_shared_memory(XenIOState *state, int vcpu) +{ + ioreq_t *req = xen_vcpu_ioreq(state->shared_page, vcpu); + + if (req->state != STATE_IOREQ_READY) { + DPRINTF("I/O request not ready: " + "%x, ptr: %x, port: %"PRIx64", " + "data: %"PRIx64", count: %u, size: %u\n", + req->state, req->data_is_ptr, req->addr, + req->data, req->count, req->size); + return NULL; + } + + xen_rmb(); /* see IOREQ_READY /then/ read contents of ioreq */ + + req->state = STATE_IOREQ_INPROCESS; + return req; +} + +/* use poll to get the port notification */ +/* ioreq_vec--out,the */ +/* retval--the number of ioreq packet */ +static ioreq_t *cpu_get_ioreq(XenIOState *state) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + unsigned int max_cpus = ms->smp.max_cpus; + int i; + evtchn_port_t port; + + port = xenevtchn_pending(state->xce_handle); + if (port == state->bufioreq_local_port) { + timer_mod(state->buffered_io_timer, + BUFFER_IO_MAX_DELAY + 
qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); + return NULL; + } + + if (port != -1) { + for (i = 0; i < max_cpus; i++) { + if (state->ioreq_local_port[i] == port) { + break; + } + } + + if (i == max_cpus) { + hw_error("Fatal error while trying to get io event!\n"); + } + + /* unmask the wanted port again */ + xenevtchn_unmask(state->xce_handle, port); + + /* get the io packet from shared memory */ + state->send_vcpu = i; + return cpu_get_ioreq_from_shared_memory(state, i); + } + + /* read error or read nothing */ + return NULL; +} + +static uint32_t do_inp(uint32_t addr, unsigned long size) +{ + switch (size) { + case 1: + return cpu_inb(addr); + case 2: + return cpu_inw(addr); + case 4: + return cpu_inl(addr); + default: + hw_error("inp: bad size: %04x %lx", addr, size); + } +} + +static void do_outp(uint32_t addr, + unsigned long size, uint32_t val) +{ + switch (size) { + case 1: + return cpu_outb(addr, val); + case 2: + return cpu_outw(addr, val); + case 4: + return cpu_outl(addr, val); + default: + hw_error("outp: bad size: %04x %lx", addr, size); + } +} + +/* + * Helper functions which read/write an object from/to physical guest + * memory, as part of the implementation of an ioreq. + * + * Equivalent to + * cpu_physical_memory_rw(addr + (req->df ? -1 : +1) * req->size * i, + * val, req->size, 0/1) + * except without the integer overflow problems. + */ +static void rw_phys_req_item(hwaddr addr, + ioreq_t *req, uint32_t i, void *val, int rw) +{ + /* Do everything unsigned so overflow just results in a truncated result + * and accesses to undesired parts of guest memory, which is up + * to the guest */ + hwaddr offset = (hwaddr)req->size * i; + if (req->df) { + addr -= offset; + } else { + addr += offset; + } + cpu_physical_memory_rw(addr, val, req->size, rw); +} + +static inline void read_phys_req_item(hwaddr addr, + ioreq_t *req, uint32_t i, void *val) +{ + rw_phys_req_item(addr, req, i, val, 0); +} +static inline void write_phys_req_item(hwaddr addr, + ioreq_t *req, uint32_t i, void *val) +{ + rw_phys_req_item(addr, req, i, val, 1); +} + + +static void cpu_ioreq_pio(ioreq_t *req) +{ + uint32_t i; + + trace_cpu_ioreq_pio(req, req->dir, req->df, req->data_is_ptr, req->addr, + req->data, req->count, req->size); + + if (req->size > sizeof(uint32_t)) { + hw_error("PIO: bad size (%u)", req->size); + } + + if (req->dir == IOREQ_READ) { + if (!req->data_is_ptr) { + req->data = do_inp(req->addr, req->size); + trace_cpu_ioreq_pio_read_reg(req, req->data, req->addr, + req->size); + } else { + uint32_t tmp; + + for (i = 0; i < req->count; i++) { + tmp = do_inp(req->addr, req->size); + write_phys_req_item(req->data, req, i, &tmp); + } + } + } else if (req->dir == IOREQ_WRITE) { + if (!req->data_is_ptr) { + trace_cpu_ioreq_pio_write_reg(req, req->data, req->addr, + req->size); + do_outp(req->addr, req->size, req->data); + } else { + for (i = 0; i < req->count; i++) { + uint32_t tmp = 0; + + read_phys_req_item(req->data, req, i, &tmp); + do_outp(req->addr, req->size, tmp); + } + } + } +} + +static void cpu_ioreq_move(ioreq_t *req) +{ + uint32_t i; + + trace_cpu_ioreq_move(req, req->dir, req->df, req->data_is_ptr, req->addr, + req->data, req->count, req->size); + + if (req->size > sizeof(req->data)) { + hw_error("MMIO: bad size (%u)", req->size); + } + + if (!req->data_is_ptr) { + if (req->dir == IOREQ_READ) { + for (i = 0; i < req->count; i++) { + read_phys_req_item(req->addr, req, i, &req->data); + } + } else if (req->dir == IOREQ_WRITE) { + for (i = 0; i < req->count; i++) { + 
write_phys_req_item(req->addr, req, i, &req->data); + } + } + } else { + uint64_t tmp; + + if (req->dir == IOREQ_READ) { + for (i = 0; i < req->count; i++) { + read_phys_req_item(req->addr, req, i, &tmp); + write_phys_req_item(req->data, req, i, &tmp); + } + } else if (req->dir == IOREQ_WRITE) { + for (i = 0; i < req->count; i++) { + read_phys_req_item(req->data, req, i, &tmp); + write_phys_req_item(req->addr, req, i, &tmp); + } + } + } +} + +static void cpu_ioreq_config(XenIOState *state, ioreq_t *req) +{ + uint32_t sbdf = req->addr >> 32; + uint32_t reg = req->addr; + XenPciDevice *xendev; + + if (req->size != sizeof(uint8_t) && req->size != sizeof(uint16_t) && + req->size != sizeof(uint32_t)) { + hw_error("PCI config access: bad size (%u)", req->size); + } + + if (req->count != 1) { + hw_error("PCI config access: bad count (%u)", req->count); + } + + QLIST_FOREACH(xendev, &state->dev_list, entry) { + if (xendev->sbdf != sbdf) { + continue; + } + + if (!req->data_is_ptr) { + if (req->dir == IOREQ_READ) { + req->data = pci_host_config_read_common( + xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE, + req->size); + trace_cpu_ioreq_config_read(req, xendev->sbdf, reg, + req->size, req->data); + } else if (req->dir == IOREQ_WRITE) { + trace_cpu_ioreq_config_write(req, xendev->sbdf, reg, + req->size, req->data); + pci_host_config_write_common( + xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE, + req->data, req->size); + } + } else { + uint32_t tmp; + + if (req->dir == IOREQ_READ) { + tmp = pci_host_config_read_common( + xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE, + req->size); + trace_cpu_ioreq_config_read(req, xendev->sbdf, reg, + req->size, tmp); + write_phys_req_item(req->data, req, 0, &tmp); + } else if (req->dir == IOREQ_WRITE) { + read_phys_req_item(req->data, req, 0, &tmp); + trace_cpu_ioreq_config_write(req, xendev->sbdf, reg, + req->size, tmp); + pci_host_config_write_common( + xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE, + tmp, req->size); + } + } + } +} + +static void regs_to_cpu(vmware_regs_t *vmport_regs, ioreq_t *req) +{ + X86CPU *cpu; + CPUX86State *env; + + cpu = X86_CPU(current_cpu); + env = &cpu->env; + env->regs[R_EAX] = req->data; + env->regs[R_EBX] = vmport_regs->ebx; + env->regs[R_ECX] = vmport_regs->ecx; + env->regs[R_EDX] = vmport_regs->edx; + env->regs[R_ESI] = vmport_regs->esi; + env->regs[R_EDI] = vmport_regs->edi; +} + +static void regs_from_cpu(vmware_regs_t *vmport_regs) +{ + X86CPU *cpu = X86_CPU(current_cpu); + CPUX86State *env = &cpu->env; + + vmport_regs->ebx = env->regs[R_EBX]; + vmport_regs->ecx = env->regs[R_ECX]; + vmport_regs->edx = env->regs[R_EDX]; + vmport_regs->esi = env->regs[R_ESI]; + vmport_regs->edi = env->regs[R_EDI]; +} + +static void handle_vmport_ioreq(XenIOState *state, ioreq_t *req) +{ + vmware_regs_t *vmport_regs; + + assert(state->shared_vmport_page); + vmport_regs = + &state->shared_vmport_page->vcpu_vmport_regs[state->send_vcpu]; + QEMU_BUILD_BUG_ON(sizeof(*req) < sizeof(*vmport_regs)); + + current_cpu = state->cpu_by_vcpu_id[state->send_vcpu]; + regs_to_cpu(vmport_regs, req); + cpu_ioreq_pio(req); + regs_from_cpu(vmport_regs); + current_cpu = NULL; +} + +static void handle_ioreq(XenIOState *state, ioreq_t *req) +{ + trace_handle_ioreq(req, req->type, req->dir, req->df, req->data_is_ptr, + req->addr, req->data, req->count, req->size); + + if (!req->data_is_ptr && (req->dir == IOREQ_WRITE) && + (req->size < sizeof (target_ulong))) { + req->data &= ((target_ulong) 1 << (8 * req->size)) - 1; + } + + if (req->dir == IOREQ_WRITE) + 
trace_handle_ioreq_write(req, req->type, req->df, req->data_is_ptr, + req->addr, req->data, req->count, req->size); + + switch (req->type) { + case IOREQ_TYPE_PIO: + cpu_ioreq_pio(req); + break; + case IOREQ_TYPE_COPY: + cpu_ioreq_move(req); + break; + case IOREQ_TYPE_VMWARE_PORT: + handle_vmport_ioreq(state, req); + break; + case IOREQ_TYPE_TIMEOFFSET: + break; + case IOREQ_TYPE_INVALIDATE: + xen_invalidate_map_cache(); + break; + case IOREQ_TYPE_PCI_CONFIG: + cpu_ioreq_config(state, req); + break; + default: + hw_error("Invalid ioreq type 0x%x\n", req->type); + } + if (req->dir == IOREQ_READ) { + trace_handle_ioreq_read(req, req->type, req->df, req->data_is_ptr, + req->addr, req->data, req->count, req->size); + } +} + +static int handle_buffered_iopage(XenIOState *state) +{ + buffered_iopage_t *buf_page = state->buffered_io_page; + buf_ioreq_t *buf_req = NULL; + ioreq_t req; + int qw; + + if (!buf_page) { + return 0; + } + + memset(&req, 0x00, sizeof(req)); + req.state = STATE_IOREQ_READY; + req.count = 1; + req.dir = IOREQ_WRITE; + + for (;;) { + uint32_t rdptr = buf_page->read_pointer, wrptr; + + xen_rmb(); + wrptr = buf_page->write_pointer; + xen_rmb(); + if (rdptr != buf_page->read_pointer) { + continue; + } + if (rdptr == wrptr) { + break; + } + buf_req = &buf_page->buf_ioreq[rdptr % IOREQ_BUFFER_SLOT_NUM]; + req.size = 1U << buf_req->size; + req.addr = buf_req->addr; + req.data = buf_req->data; + req.type = buf_req->type; + xen_rmb(); + qw = (req.size == 8); + if (qw) { + if (rdptr + 1 == wrptr) { + hw_error("Incomplete quad word buffered ioreq"); + } + buf_req = &buf_page->buf_ioreq[(rdptr + 1) % + IOREQ_BUFFER_SLOT_NUM]; + req.data |= ((uint64_t)buf_req->data) << 32; + xen_rmb(); + } + + handle_ioreq(state, &req); + + /* Only req.data may get updated by handle_ioreq(), albeit even that + * should not happen as such data would never make it to the guest (we + * can only usefully see writes here after all). + */ + assert(req.state == STATE_IOREQ_READY); + assert(req.count == 1); + assert(req.dir == IOREQ_WRITE); + assert(!req.data_is_ptr); + + qatomic_add(&buf_page->read_pointer, qw + 1); + } + + return req.count; +} + +static void handle_buffered_io(void *opaque) +{ + XenIOState *state = opaque; + + if (handle_buffered_iopage(state)) { + timer_mod(state->buffered_io_timer, + BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); + } else { + timer_del(state->buffered_io_timer); + xenevtchn_unmask(state->xce_handle, state->bufioreq_local_port); + } +} + +static void cpu_handle_ioreq(void *opaque) +{ + XenIOState *state = opaque; + ioreq_t *req = cpu_get_ioreq(state); + + handle_buffered_iopage(state); + if (req) { + ioreq_t copy = *req; + + xen_rmb(); + handle_ioreq(state, &copy); + req->data = copy.data; + + if (req->state != STATE_IOREQ_INPROCESS) { + fprintf(stderr, "Badness in I/O request ... not in service?!: " + "%x, ptr: %x, port: %"PRIx64", " + "data: %"PRIx64", count: %u, size: %u, type: %u\n", + req->state, req->data_is_ptr, req->addr, + req->data, req->count, req->size, req->type); + destroy_hvm_domain(false); + return; + } + + xen_wmb(); /* Update ioreq contents /then/ update state. */ + + /* + * We do this before we send the response so that the tools + * have the opportunity to pick up on the reset before the + * guest resumes and does a hlt with interrupts disabled which + * causes Xen to powerdown the domain. 
+ */ + if (runstate_is_running()) { + ShutdownCause request; + + if (qemu_shutdown_requested_get()) { + destroy_hvm_domain(false); + } + request = qemu_reset_requested_get(); + if (request) { + qemu_system_reset(request); + destroy_hvm_domain(true); + } + } + + req->state = STATE_IORESP_READY; + xenevtchn_notify(state->xce_handle, + state->ioreq_local_port[state->send_vcpu]); + } +} + +static void xen_main_loop_prepare(XenIOState *state) +{ + int evtchn_fd = -1; + + if (state->xce_handle != NULL) { + evtchn_fd = xenevtchn_fd(state->xce_handle); + } + + state->buffered_io_timer = timer_new_ms(QEMU_CLOCK_REALTIME, handle_buffered_io, + state); + + if (evtchn_fd != -1) { + CPUState *cpu_state; + + DPRINTF("%s: Init cpu_by_vcpu_id\n", __func__); + CPU_FOREACH(cpu_state) { + DPRINTF("%s: cpu_by_vcpu_id[%d]=%p\n", + __func__, cpu_state->cpu_index, cpu_state); + state->cpu_by_vcpu_id[cpu_state->cpu_index] = cpu_state; + } + qemu_set_fd_handler(evtchn_fd, cpu_handle_ioreq, NULL, state); + } +} + + +static void xen_hvm_change_state_handler(void *opaque, bool running, + RunState rstate) +{ + XenIOState *state = opaque; + + if (running) { + xen_main_loop_prepare(state); + } + + xen_set_ioreq_server_state(xen_domid, + state->ioservid, + (rstate == RUN_STATE_RUNNING)); +} + +static void xen_exit_notifier(Notifier *n, void *data) +{ + XenIOState *state = container_of(n, XenIOState, exit); + + xen_destroy_ioreq_server(xen_domid, state->ioservid); + if (state->fres != NULL) { + xenforeignmemory_unmap_resource(xen_fmem, state->fres); + } + + xenevtchn_close(state->xce_handle); + xs_daemon_close(state->xenstore); +} + +#ifdef XEN_COMPAT_PHYSMAP +static void xen_read_physmap(XenIOState *state) +{ + XenPhysmap *physmap = NULL; + unsigned int len, num, i; + char path[80], *value = NULL; + char **entries = NULL; + + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap", xen_domid); + entries = xs_directory(state->xenstore, 0, path, &num); + if (entries == NULL) + return; + + for (i = 0; i < num; i++) { + physmap = g_malloc(sizeof (XenPhysmap)); + physmap->phys_offset = strtoull(entries[i], NULL, 16); + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap/%s/start_addr", + xen_domid, entries[i]); + value = xs_read(state->xenstore, 0, path, &len); + if (value == NULL) { + g_free(physmap); + continue; + } + physmap->start_addr = strtoull(value, NULL, 16); + free(value); + + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap/%s/size", + xen_domid, entries[i]); + value = xs_read(state->xenstore, 0, path, &len); + if (value == NULL) { + g_free(physmap); + continue; + } + physmap->size = strtoull(value, NULL, 16); + free(value); + + snprintf(path, sizeof(path), + "/local/domain/0/device-model/%d/physmap/%s/name", + xen_domid, entries[i]); + physmap->name = xs_read(state->xenstore, 0, path, &len); + + QLIST_INSERT_HEAD(&xen_physmap, physmap, list); + } + free(entries); +} +#else +static void xen_read_physmap(XenIOState *state) +{ +} +#endif + +static void xen_wakeup_notifier(Notifier *notifier, void *data) +{ + xc_set_hvm_param(xen_xc, xen_domid, HVM_PARAM_ACPI_S_STATE, 0); +} + +static int xen_map_ioreq_server(XenIOState *state) +{ + void *addr = NULL; + xen_pfn_t ioreq_pfn; + xen_pfn_t bufioreq_pfn; + evtchn_port_t bufioreq_evtchn; + int rc; + + /* + * Attempt to map using the resource API and fall back to normal + * foreign mapping if this is not supported. 
+ */ + QEMU_BUILD_BUG_ON(XENMEM_resource_ioreq_server_frame_bufioreq != 0); + QEMU_BUILD_BUG_ON(XENMEM_resource_ioreq_server_frame_ioreq(0) != 1); + state->fres = xenforeignmemory_map_resource(xen_fmem, xen_domid, + XENMEM_resource_ioreq_server, + state->ioservid, 0, 2, + &addr, + PROT_READ | PROT_WRITE, 0); + if (state->fres != NULL) { + trace_xen_map_resource_ioreq(state->ioservid, addr); + state->buffered_io_page = addr; + state->shared_page = addr + TARGET_PAGE_SIZE; + } else if (errno != EOPNOTSUPP) { + error_report("failed to map ioreq server resources: error %d handle=%p", + errno, xen_xc); + return -1; + } + + rc = xen_get_ioreq_server_info(xen_domid, state->ioservid, + (state->shared_page == NULL) ? + &ioreq_pfn : NULL, + (state->buffered_io_page == NULL) ? + &bufioreq_pfn : NULL, + &bufioreq_evtchn); + if (rc < 0) { + error_report("failed to get ioreq server info: error %d handle=%p", + errno, xen_xc); + return rc; + } + + if (state->shared_page == NULL) { + DPRINTF("shared page at pfn %lx\n", ioreq_pfn); + + state->shared_page = xenforeignmemory_map(xen_fmem, xen_domid, + PROT_READ | PROT_WRITE, + 1, &ioreq_pfn, NULL); + if (state->shared_page == NULL) { + error_report("map shared IO page returned error %d handle=%p", + errno, xen_xc); + } + } + + if (state->buffered_io_page == NULL) { + DPRINTF("buffered io page at pfn %lx\n", bufioreq_pfn); + + state->buffered_io_page = xenforeignmemory_map(xen_fmem, xen_domid, + PROT_READ | PROT_WRITE, + 1, &bufioreq_pfn, + NULL); + if (state->buffered_io_page == NULL) { + error_report("map buffered IO page returned error %d", errno); + return -1; + } + } + + if (state->shared_page == NULL || state->buffered_io_page == NULL) { + return -1; + } + + DPRINTF("buffered io evtchn is %x\n", bufioreq_evtchn); + + state->bufioreq_remote_port = bufioreq_evtchn; + + return 0; +} + +void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory) +{ + MachineState *ms = MACHINE(pcms); + unsigned int max_cpus = ms->smp.max_cpus; + int i, rc; + xen_pfn_t ioreq_pfn; + XenIOState *state; + + state = g_malloc0(sizeof (XenIOState)); + + state->xce_handle = xenevtchn_open(NULL, 0); + if (state->xce_handle == NULL) { + perror("xen: event channel open"); + goto err; + } + + state->xenstore = xs_daemon_open(); + if (state->xenstore == NULL) { + perror("xen: xenstore open"); + goto err; + } + + xen_create_ioreq_server(xen_domid, &state->ioservid); + + state->exit.notify = xen_exit_notifier; + qemu_add_exit_notifier(&state->exit); + + state->suspend.notify = xen_suspend_notifier; + qemu_register_suspend_notifier(&state->suspend); + + state->wakeup.notify = xen_wakeup_notifier; + qemu_register_wakeup_notifier(&state->wakeup); + + /* + * Register wake-up support in QMP query-current-machine API + */ + qemu_register_wakeup_support(); + + rc = xen_map_ioreq_server(state); + if (rc < 0) { + goto err; + } + + rc = xen_get_vmport_regs_pfn(xen_xc, xen_domid, &ioreq_pfn); + if (!rc) { + DPRINTF("shared vmport page at pfn %lx\n", ioreq_pfn); + state->shared_vmport_page = + xenforeignmemory_map(xen_fmem, xen_domid, PROT_READ|PROT_WRITE, + 1, &ioreq_pfn, NULL); + if (state->shared_vmport_page == NULL) { + error_report("map shared vmport IO page returned error %d handle=%p", + errno, xen_xc); + goto err; + } + } else if (rc != -ENOSYS) { + error_report("get vmport regs pfn returned error %d, rc=%d", + errno, rc); + goto err; + } + + /* Note: cpus is empty at this point in init */ + state->cpu_by_vcpu_id = g_malloc0(max_cpus * sizeof(CPUState *)); + + rc = 
xen_set_ioreq_server_state(xen_domid, state->ioservid, true); + if (rc < 0) { + error_report("failed to enable ioreq server info: error %d handle=%p", + errno, xen_xc); + goto err; + } + + state->ioreq_local_port = g_malloc0(max_cpus * sizeof (evtchn_port_t)); + + /* FIXME: how about if we overflow the page here? */ + for (i = 0; i < max_cpus; i++) { + rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid, + xen_vcpu_eport(state->shared_page, i)); + if (rc == -1) { + error_report("shared evtchn %d bind error %d", i, errno); + goto err; + } + state->ioreq_local_port[i] = rc; + } + + rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid, + state->bufioreq_remote_port); + if (rc == -1) { + error_report("buffered evtchn bind error %d", errno); + goto err; + } + state->bufioreq_local_port = rc; + + /* Init RAM management */ +#ifdef XEN_COMPAT_PHYSMAP + xen_map_cache_init(xen_phys_offset_to_gaddr, state); +#else + xen_map_cache_init(NULL, state); +#endif + xen_ram_init(pcms, ms->ram_size, ram_memory); + + qemu_add_vm_change_state_handler(xen_hvm_change_state_handler, state); + + state->memory_listener = xen_memory_listener; + memory_listener_register(&state->memory_listener, &address_space_memory); + state->log_for_dirtybit = NULL; + + state->io_listener = xen_io_listener; + memory_listener_register(&state->io_listener, &address_space_io); + + state->device_listener = xen_device_listener; + QLIST_INIT(&state->dev_list); + device_listener_register(&state->device_listener); + + xen_bus_init(); + + /* Initialize backend core & drivers */ + if (xen_be_init() != 0) { + error_report("xen backend core setup failed"); + goto err; + } + xen_be_register_common(); + + QLIST_INIT(&xen_physmap); + xen_read_physmap(state); + + /* Disable ACPI build because Xen handles it */ + pcms->acpi_build_enabled = false; + + return; + +err: + error_report("xen hardware virtual machine initialisation failed"); + exit(1); +} + +void destroy_hvm_domain(bool reboot) +{ + xc_interface *xc_handle; + int sts; + int rc; + + unsigned int reason = reboot ? SHUTDOWN_reboot : SHUTDOWN_poweroff; + + if (xen_dmod) { + rc = xendevicemodel_shutdown(xen_dmod, xen_domid, reason); + if (!rc) { + return; + } + if (errno != ENOTTY /* old Xen */) { + perror("xendevicemodel_shutdown failed"); + } + /* well, try the old thing then */ + } + + xc_handle = xc_interface_open(0, 0, 0); + if (xc_handle == NULL) { + fprintf(stderr, "Cannot acquire xenctrl handle\n"); + } else { + sts = xc_domain_shutdown(xc_handle, xen_domid, reason); + if (sts != 0) { + fprintf(stderr, "xc_domain_shutdown failed to issue %s, " + "sts %d, %s\n", reboot ? "reboot" : "poweroff", + sts, strerror(errno)); + } else { + fprintf(stderr, "Issued domain %d %s\n", xen_domid, + reboot ? "reboot" : "poweroff"); + } + xc_interface_close(xc_handle); + } +} + +void xen_register_framebuffer(MemoryRegion *mr) +{ + framebuffer = mr; +} + +void xen_shutdown_fatal_error(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + fprintf(stderr, "Will destroy the domain.\n"); + /* destroy the domain */ + qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR); +} + +void xen_hvm_modified_memory(ram_addr_t start, ram_addr_t length) +{ + if (unlikely(xen_in_migration)) { + int rc; + ram_addr_t start_pfn, nb_pages; + + start = xen_phys_offset_to_gaddr(start, length); + + if (length == 0) { + length = TARGET_PAGE_SIZE; + } + start_pfn = start >> TARGET_PAGE_BITS; + nb_pages = ((start + length + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS) + - start_pfn; + rc = xen_modified_memory(xen_domid, start_pfn, nb_pages); + if (rc) { + fprintf(stderr, + "%s failed for "RAM_ADDR_FMT" ("RAM_ADDR_FMT"): %i, %s\n", + __func__, start, nb_pages, errno, strerror(errno)); + } + } +} + +void qmp_xen_set_global_dirty_log(bool enable, Error **errp) +{ + if (enable) { + memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); + } else { + memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); + } +} diff --git a/hw/i386/xen/xen-mapcache.c b/hw/i386/xen/xen-mapcache.c new file mode 100644 index 000000000..bd47c3d67 --- /dev/null +++ b/hw/i386/xen/xen-mapcache.c @@ -0,0 +1,593 @@ +/* + * Copyright (C) 2011 Citrix Ltd. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "qemu/error-report.h" + +#include <sys/resource.h> + +#include "hw/xen/xen-legacy-backend.h" +#include "qemu/bitmap.h" + +#include "sysemu/runstate.h" +#include "sysemu/xen-mapcache.h" +#include "trace.h" + + +//#define MAPCACHE_DEBUG + +#ifdef MAPCACHE_DEBUG +# define DPRINTF(fmt, ...) do { \ + fprintf(stderr, "xen_mapcache: " fmt, ## __VA_ARGS__); \ +} while (0) +#else +# define DPRINTF(fmt, ...) do { } while (0) +#endif + +#if HOST_LONG_BITS == 32 +# define MCACHE_BUCKET_SHIFT 16 +# define MCACHE_MAX_SIZE (1UL<<31) /* 2GB Cap */ +#else +# define MCACHE_BUCKET_SHIFT 20 +# define MCACHE_MAX_SIZE (1UL<<35) /* 32GB Cap */ +#endif +#define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT) + +/* This is the size of the virtual address space reserve to QEMU that will not + * be use by MapCache. + * From empirical tests I observed that qemu use 75MB more than the + * max_mcache_size. + */ +#define NON_MCACHE_MEMORY_SIZE (80 * MiB) + +typedef struct MapCacheEntry { + hwaddr paddr_index; + uint8_t *vaddr_base; + unsigned long *valid_mapping; + uint8_t lock; +#define XEN_MAPCACHE_ENTRY_DUMMY (1 << 0) + uint8_t flags; + hwaddr size; + struct MapCacheEntry *next; +} MapCacheEntry; + +typedef struct MapCacheRev { + uint8_t *vaddr_req; + hwaddr paddr_index; + hwaddr size; + QTAILQ_ENTRY(MapCacheRev) next; + bool dma; +} MapCacheRev; + +typedef struct MapCache { + MapCacheEntry *entry; + unsigned long nr_buckets; + QTAILQ_HEAD(, MapCacheRev) locked_entries; + + /* For most cases (>99.9%), the page address is the same. 
*/ + MapCacheEntry *last_entry; + unsigned long max_mcache_size; + unsigned int mcache_bucket_shift; + + phys_offset_to_gaddr_t phys_offset_to_gaddr; + QemuMutex lock; + void *opaque; +} MapCache; + +static MapCache *mapcache; + +static inline void mapcache_lock(void) +{ + qemu_mutex_lock(&mapcache->lock); +} + +static inline void mapcache_unlock(void) +{ + qemu_mutex_unlock(&mapcache->lock); +} + +static inline int test_bits(int nr, int size, const unsigned long *addr) +{ + unsigned long res = find_next_zero_bit(addr, size + nr, nr); + if (res >= nr + size) + return 1; + else + return 0; +} + +void xen_map_cache_init(phys_offset_to_gaddr_t f, void *opaque) +{ + unsigned long size; + struct rlimit rlimit_as; + + mapcache = g_malloc0(sizeof (MapCache)); + + mapcache->phys_offset_to_gaddr = f; + mapcache->opaque = opaque; + qemu_mutex_init(&mapcache->lock); + + QTAILQ_INIT(&mapcache->locked_entries); + + if (geteuid() == 0) { + rlimit_as.rlim_cur = RLIM_INFINITY; + rlimit_as.rlim_max = RLIM_INFINITY; + mapcache->max_mcache_size = MCACHE_MAX_SIZE; + } else { + getrlimit(RLIMIT_AS, &rlimit_as); + rlimit_as.rlim_cur = rlimit_as.rlim_max; + + if (rlimit_as.rlim_max != RLIM_INFINITY) { + warn_report("QEMU's maximum size of virtual" + " memory is not infinity"); + } + if (rlimit_as.rlim_max < MCACHE_MAX_SIZE + NON_MCACHE_MEMORY_SIZE) { + mapcache->max_mcache_size = rlimit_as.rlim_max - + NON_MCACHE_MEMORY_SIZE; + } else { + mapcache->max_mcache_size = MCACHE_MAX_SIZE; + } + } + + setrlimit(RLIMIT_AS, &rlimit_as); + + mapcache->nr_buckets = + (((mapcache->max_mcache_size >> XC_PAGE_SHIFT) + + (1UL << (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)) - 1) >> + (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)); + + size = mapcache->nr_buckets * sizeof (MapCacheEntry); + size = (size + XC_PAGE_SIZE - 1) & ~(XC_PAGE_SIZE - 1); + DPRINTF("%s, nr_buckets = %lx size %lu\n", __func__, + mapcache->nr_buckets, size); + mapcache->entry = g_malloc0(size); +} + +static void xen_remap_bucket(MapCacheEntry *entry, + void *vaddr, + hwaddr size, + hwaddr address_index, + bool dummy) +{ + uint8_t *vaddr_base; + xen_pfn_t *pfns; + int *err; + unsigned int i; + hwaddr nb_pfn = size >> XC_PAGE_SHIFT; + + trace_xen_remap_bucket(address_index); + + pfns = g_malloc0(nb_pfn * sizeof (xen_pfn_t)); + err = g_malloc0(nb_pfn * sizeof (int)); + + if (entry->vaddr_base != NULL) { + if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) { + ram_block_notify_remove(entry->vaddr_base, entry->size, + entry->size); + } + + /* + * If an entry is being replaced by another mapping and we're using + * MAP_FIXED flag for it - there is possibility of a race for vaddr + * address with another thread doing an mmap call itself + * (see man 2 mmap). To avoid that we skip explicit unmapping here + * and allow the kernel to destroy the previous mappings by replacing + * them in mmap call later. + * + * Non-identical replacements are not allowed therefore. + */ + assert(!vaddr || (entry->vaddr_base == vaddr && entry->size == size)); + + if (!vaddr && munmap(entry->vaddr_base, entry->size) != 0) { + perror("unmap fails"); + exit(-1); + } + } + g_free(entry->valid_mapping); + entry->valid_mapping = NULL; + + for (i = 0; i < nb_pfn; i++) { + pfns[i] = (address_index << (MCACHE_BUCKET_SHIFT-XC_PAGE_SHIFT)) + i; + } + + /* + * If the caller has requested the mapping at a specific address use + * MAP_FIXED to make sure it's honored. + */ + if (!dummy) { + vaddr_base = xenforeignmemory_map2(xen_fmem, xen_domid, vaddr, + PROT_READ | PROT_WRITE, + vaddr ? 
MAP_FIXED : 0, + nb_pfn, pfns, err); + if (vaddr_base == NULL) { + perror("xenforeignmemory_map2"); + exit(-1); + } + } else { + /* + * We create dummy mappings where we are unable to create a foreign + * mapping immediately due to certain circumstances (i.e. on resume now) + */ + vaddr_base = mmap(vaddr, size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_SHARED | (vaddr ? MAP_FIXED : 0), + -1, 0); + if (vaddr_base == MAP_FAILED) { + perror("mmap"); + exit(-1); + } + } + + if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) { + ram_block_notify_add(vaddr_base, size, size); + } + + entry->vaddr_base = vaddr_base; + entry->paddr_index = address_index; + entry->size = size; + entry->valid_mapping = (unsigned long *) g_malloc0(sizeof(unsigned long) * + BITS_TO_LONGS(size >> XC_PAGE_SHIFT)); + + if (dummy) { + entry->flags |= XEN_MAPCACHE_ENTRY_DUMMY; + } else { + entry->flags &= ~(XEN_MAPCACHE_ENTRY_DUMMY); + } + + bitmap_zero(entry->valid_mapping, nb_pfn); + for (i = 0; i < nb_pfn; i++) { + if (!err[i]) { + bitmap_set(entry->valid_mapping, i, 1); + } + } + + g_free(pfns); + g_free(err); +} + +static uint8_t *xen_map_cache_unlocked(hwaddr phys_addr, hwaddr size, + uint8_t lock, bool dma) +{ + MapCacheEntry *entry, *pentry = NULL, + *free_entry = NULL, *free_pentry = NULL; + hwaddr address_index; + hwaddr address_offset; + hwaddr cache_size = size; + hwaddr test_bit_size; + bool translated G_GNUC_UNUSED = false; + bool dummy = false; + +tryagain: + address_index = phys_addr >> MCACHE_BUCKET_SHIFT; + address_offset = phys_addr & (MCACHE_BUCKET_SIZE - 1); + + trace_xen_map_cache(phys_addr); + + /* test_bit_size is always a multiple of XC_PAGE_SIZE */ + if (size) { + test_bit_size = size + (phys_addr & (XC_PAGE_SIZE - 1)); + + if (test_bit_size % XC_PAGE_SIZE) { + test_bit_size += XC_PAGE_SIZE - (test_bit_size % XC_PAGE_SIZE); + } + } else { + test_bit_size = XC_PAGE_SIZE; + } + + if (mapcache->last_entry != NULL && + mapcache->last_entry->paddr_index == address_index && + !lock && !size && + test_bits(address_offset >> XC_PAGE_SHIFT, + test_bit_size >> XC_PAGE_SHIFT, + mapcache->last_entry->valid_mapping)) { + trace_xen_map_cache_return(mapcache->last_entry->vaddr_base + address_offset); + return mapcache->last_entry->vaddr_base + address_offset; + } + + /* size is always a multiple of MCACHE_BUCKET_SIZE */ + if (size) { + cache_size = size + address_offset; + if (cache_size % MCACHE_BUCKET_SIZE) { + cache_size += MCACHE_BUCKET_SIZE - (cache_size % MCACHE_BUCKET_SIZE); + } + } else { + cache_size = MCACHE_BUCKET_SIZE; + } + + entry = &mapcache->entry[address_index % mapcache->nr_buckets]; + + while (entry && (lock || entry->lock) && entry->vaddr_base && + (entry->paddr_index != address_index || entry->size != cache_size || + !test_bits(address_offset >> XC_PAGE_SHIFT, + test_bit_size >> XC_PAGE_SHIFT, + entry->valid_mapping))) { + if (!free_entry && !entry->lock) { + free_entry = entry; + free_pentry = pentry; + } + pentry = entry; + entry = entry->next; + } + if (!entry && free_entry) { + entry = free_entry; + pentry = free_pentry; + } + if (!entry) { + entry = g_malloc0(sizeof (MapCacheEntry)); + pentry->next = entry; + xen_remap_bucket(entry, NULL, cache_size, address_index, dummy); + } else if (!entry->lock) { + if (!entry->vaddr_base || entry->paddr_index != address_index || + entry->size != cache_size || + !test_bits(address_offset >> XC_PAGE_SHIFT, + test_bit_size >> XC_PAGE_SHIFT, + entry->valid_mapping)) { + xen_remap_bucket(entry, NULL, cache_size, address_index, dummy); + } + } + + 
if(!test_bits(address_offset >> XC_PAGE_SHIFT, + test_bit_size >> XC_PAGE_SHIFT, + entry->valid_mapping)) { + mapcache->last_entry = NULL; +#ifdef XEN_COMPAT_PHYSMAP + if (!translated && mapcache->phys_offset_to_gaddr) { + phys_addr = mapcache->phys_offset_to_gaddr(phys_addr, size); + translated = true; + goto tryagain; + } +#endif + if (!dummy && runstate_check(RUN_STATE_INMIGRATE)) { + dummy = true; + goto tryagain; + } + trace_xen_map_cache_return(NULL); + return NULL; + } + + mapcache->last_entry = entry; + if (lock) { + MapCacheRev *reventry = g_malloc0(sizeof(MapCacheRev)); + entry->lock++; + reventry->dma = dma; + reventry->vaddr_req = mapcache->last_entry->vaddr_base + address_offset; + reventry->paddr_index = mapcache->last_entry->paddr_index; + reventry->size = entry->size; + QTAILQ_INSERT_HEAD(&mapcache->locked_entries, reventry, next); + } + + trace_xen_map_cache_return(mapcache->last_entry->vaddr_base + address_offset); + return mapcache->last_entry->vaddr_base + address_offset; +} + +uint8_t *xen_map_cache(hwaddr phys_addr, hwaddr size, + uint8_t lock, bool dma) +{ + uint8_t *p; + + mapcache_lock(); + p = xen_map_cache_unlocked(phys_addr, size, lock, dma); + mapcache_unlock(); + return p; +} + +ram_addr_t xen_ram_addr_from_mapcache(void *ptr) +{ + MapCacheEntry *entry = NULL; + MapCacheRev *reventry; + hwaddr paddr_index; + hwaddr size; + ram_addr_t raddr; + int found = 0; + + mapcache_lock(); + QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) { + if (reventry->vaddr_req == ptr) { + paddr_index = reventry->paddr_index; + size = reventry->size; + found = 1; + break; + } + } + if (!found) { + fprintf(stderr, "%s, could not find %p\n", __func__, ptr); + QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) { + DPRINTF(" "TARGET_FMT_plx" -> %p is present\n", reventry->paddr_index, + reventry->vaddr_req); + } + abort(); + return 0; + } + + entry = &mapcache->entry[paddr_index % mapcache->nr_buckets]; + while (entry && (entry->paddr_index != paddr_index || entry->size != size)) { + entry = entry->next; + } + if (!entry) { + DPRINTF("Trying to find address %p that is not in the mapcache!\n", ptr); + raddr = 0; + } else { + raddr = (reventry->paddr_index << MCACHE_BUCKET_SHIFT) + + ((unsigned long) ptr - (unsigned long) entry->vaddr_base); + } + mapcache_unlock(); + return raddr; +} + +static void xen_invalidate_map_cache_entry_unlocked(uint8_t *buffer) +{ + MapCacheEntry *entry = NULL, *pentry = NULL; + MapCacheRev *reventry; + hwaddr paddr_index; + hwaddr size; + int found = 0; + + QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) { + if (reventry->vaddr_req == buffer) { + paddr_index = reventry->paddr_index; + size = reventry->size; + found = 1; + break; + } + } + if (!found) { + DPRINTF("%s, could not find %p\n", __func__, buffer); + QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) { + DPRINTF(" "TARGET_FMT_plx" -> %p is present\n", reventry->paddr_index, reventry->vaddr_req); + } + return; + } + QTAILQ_REMOVE(&mapcache->locked_entries, reventry, next); + g_free(reventry); + + if (mapcache->last_entry != NULL && + mapcache->last_entry->paddr_index == paddr_index) { + mapcache->last_entry = NULL; + } + + entry = &mapcache->entry[paddr_index % mapcache->nr_buckets]; + while (entry && (entry->paddr_index != paddr_index || entry->size != size)) { + pentry = entry; + entry = entry->next; + } + if (!entry) { + DPRINTF("Trying to unmap address %p that is not in the mapcache!\n", buffer); + return; + } + entry->lock--; + if (entry->lock > 0 || pentry == 
NULL) { + return; + } + + pentry->next = entry->next; + ram_block_notify_remove(entry->vaddr_base, entry->size, entry->size); + if (munmap(entry->vaddr_base, entry->size) != 0) { + perror("unmap fails"); + exit(-1); + } + g_free(entry->valid_mapping); + g_free(entry); +} + +void xen_invalidate_map_cache_entry(uint8_t *buffer) +{ + mapcache_lock(); + xen_invalidate_map_cache_entry_unlocked(buffer); + mapcache_unlock(); +} + +void xen_invalidate_map_cache(void) +{ + unsigned long i; + MapCacheRev *reventry; + + /* Flush pending AIO before destroying the mapcache */ + bdrv_drain_all(); + + mapcache_lock(); + + QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) { + if (!reventry->dma) { + continue; + } + fprintf(stderr, "Locked DMA mapping while invalidating mapcache!" + " "TARGET_FMT_plx" -> %p is present\n", + reventry->paddr_index, reventry->vaddr_req); + } + + for (i = 0; i < mapcache->nr_buckets; i++) { + MapCacheEntry *entry = &mapcache->entry[i]; + + if (entry->vaddr_base == NULL) { + continue; + } + if (entry->lock > 0) { + continue; + } + + if (munmap(entry->vaddr_base, entry->size) != 0) { + perror("unmap fails"); + exit(-1); + } + + entry->paddr_index = 0; + entry->vaddr_base = NULL; + entry->size = 0; + g_free(entry->valid_mapping); + entry->valid_mapping = NULL; + } + + mapcache->last_entry = NULL; + + mapcache_unlock(); +} + +static uint8_t *xen_replace_cache_entry_unlocked(hwaddr old_phys_addr, + hwaddr new_phys_addr, + hwaddr size) +{ + MapCacheEntry *entry; + hwaddr address_index, address_offset; + hwaddr test_bit_size, cache_size = size; + + address_index = old_phys_addr >> MCACHE_BUCKET_SHIFT; + address_offset = old_phys_addr & (MCACHE_BUCKET_SIZE - 1); + + assert(size); + /* test_bit_size is always a multiple of XC_PAGE_SIZE */ + test_bit_size = size + (old_phys_addr & (XC_PAGE_SIZE - 1)); + if (test_bit_size % XC_PAGE_SIZE) { + test_bit_size += XC_PAGE_SIZE - (test_bit_size % XC_PAGE_SIZE); + } + cache_size = size + address_offset; + if (cache_size % MCACHE_BUCKET_SIZE) { + cache_size += MCACHE_BUCKET_SIZE - (cache_size % MCACHE_BUCKET_SIZE); + } + + entry = &mapcache->entry[address_index % mapcache->nr_buckets]; + while (entry && !(entry->paddr_index == address_index && + entry->size == cache_size)) { + entry = entry->next; + } + if (!entry) { + DPRINTF("Trying to update an entry for "TARGET_FMT_plx \ + "that is not in the mapcache!\n", old_phys_addr); + return NULL; + } + + address_index = new_phys_addr >> MCACHE_BUCKET_SHIFT; + address_offset = new_phys_addr & (MCACHE_BUCKET_SIZE - 1); + + fprintf(stderr, "Replacing a dummy mapcache entry for "TARGET_FMT_plx \ + " with "TARGET_FMT_plx"\n", old_phys_addr, new_phys_addr); + + xen_remap_bucket(entry, entry->vaddr_base, + cache_size, address_index, false); + if (!test_bits(address_offset >> XC_PAGE_SHIFT, + test_bit_size >> XC_PAGE_SHIFT, + entry->valid_mapping)) { + DPRINTF("Unable to update a mapcache entry for "TARGET_FMT_plx"!\n", + old_phys_addr); + return NULL; + } + + return entry->vaddr_base + address_offset; +} + +uint8_t *xen_replace_cache_entry(hwaddr old_phys_addr, + hwaddr new_phys_addr, + hwaddr size) +{ + uint8_t *p; + + mapcache_lock(); + p = xen_replace_cache_entry_unlocked(old_phys_addr, new_phys_addr, size); + mapcache_unlock(); + return p; +} diff --git a/hw/i386/xen/xen_apic.c b/hw/i386/xen/xen_apic.c new file mode 100644 index 000000000..7c7a60b16 --- /dev/null +++ b/hw/i386/xen/xen_apic.c @@ -0,0 +1,103 @@ +/* + * Xen basic APIC support + * + * Copyright (c) 2012 Citrix + * + * Authors: + * Wei 
Liu <wei.liu2@citrix.com> + * + * This work is licensed under the terms of the GNU GPL version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "hw/i386/apic_internal.h" +#include "hw/pci/msi.h" +#include "hw/xen/xen.h" +#include "qemu/module.h" + +static uint64_t xen_apic_mem_read(void *opaque, hwaddr addr, + unsigned size) +{ + return ~(uint64_t)0; +} + +static void xen_apic_mem_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + if (size != sizeof(uint32_t)) { + fprintf(stderr, "Xen: APIC write data size = %d, invalid\n", size); + return; + } + + xen_hvm_inject_msi(addr, data); +} + +static const MemoryRegionOps xen_apic_io_ops = { + .read = xen_apic_mem_read, + .write = xen_apic_mem_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void xen_apic_realize(DeviceState *dev, Error **errp) +{ + APICCommonState *s = APIC_COMMON(dev); + + s->vapic_control = 0; + memory_region_init_io(&s->io_memory, OBJECT(s), &xen_apic_io_ops, s, + "xen-apic-msi", APIC_SPACE_SIZE); + msi_nonbroken = true; +} + +static void xen_apic_set_base(APICCommonState *s, uint64_t val) +{ +} + +static void xen_apic_set_tpr(APICCommonState *s, uint8_t val) +{ +} + +static uint8_t xen_apic_get_tpr(APICCommonState *s) +{ + return 0; +} + +static void xen_apic_vapic_base_update(APICCommonState *s) +{ +} + +static void xen_apic_external_nmi(APICCommonState *s) +{ +} + +static void xen_send_msi(MSIMessage *msi) +{ + xen_hvm_inject_msi(msi->address, msi->data); +} + +static void xen_apic_class_init(ObjectClass *klass, void *data) +{ + APICCommonClass *k = APIC_COMMON_CLASS(klass); + + k->realize = xen_apic_realize; + k->set_base = xen_apic_set_base; + k->set_tpr = xen_apic_set_tpr; + k->get_tpr = xen_apic_get_tpr; + k->vapic_base_update = xen_apic_vapic_base_update; + k->external_nmi = xen_apic_external_nmi; + k->send_msi = xen_send_msi; +} + +static const TypeInfo xen_apic_info = { + .name = "xen-apic", + .parent = TYPE_APIC_COMMON, + .instance_size = sizeof(APICCommonState), + .class_init = xen_apic_class_init, +}; + +static void xen_apic_register_types(void) +{ + type_register_static(&xen_apic_info); +} + +type_init(xen_apic_register_types) diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c new file mode 100644 index 000000000..72028449b --- /dev/null +++ b/hw/i386/xen/xen_platform.c @@ -0,0 +1,519 @@ +/* + * XEN platform pci device, formerly known as the event channel device + * + * Copyright (c) 2003-2004 Intel Corp. + * Copyright (c) 2006 XenSource + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/ide.h" +#include "hw/pci/pci.h" +#include "hw/xen/xen_common.h" +#include "migration/vmstate.h" +#include "hw/xen/xen-legacy-backend.h" +#include "trace.h" +#include "sysemu/xen.h" +#include "sysemu/block-backend.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "qom/object.h" + +//#define DEBUG_PLATFORM + +#ifdef DEBUG_PLATFORM +#define DPRINTF(fmt, ...) do { \ + fprintf(stderr, "xen_platform: " fmt, ## __VA_ARGS__); \ +} while (0) +#else +#define DPRINTF(fmt, ...) do { } while (0) +#endif + +#define PFFLAG_ROM_LOCK 1 /* Sets whether ROM memory area is RW or RO */ + +struct PCIXenPlatformState { + /*< private >*/ + PCIDevice parent_obj; + /*< public >*/ + + MemoryRegion fixed_io; + MemoryRegion bar; + MemoryRegion mmio_bar; + uint8_t flags; /* used only for version_id == 2 */ + uint16_t driver_product_version; + + /* Log from guest drivers */ + char log_buffer[4096]; + int log_buffer_off; +}; + +#define TYPE_XEN_PLATFORM "xen-platform" +OBJECT_DECLARE_SIMPLE_TYPE(PCIXenPlatformState, XEN_PLATFORM) + +#define XEN_PLATFORM_IOPORT 0x10 + +/* Send bytes to syslog */ +static void log_writeb(PCIXenPlatformState *s, char val) +{ + if (val == '\n' || s->log_buffer_off == sizeof(s->log_buffer) - 1) { + /* Flush buffer */ + s->log_buffer[s->log_buffer_off] = 0; + trace_xen_platform_log(s->log_buffer); + s->log_buffer_off = 0; + } else { + s->log_buffer[s->log_buffer_off++] = val; + } +} + +/* + * Unplug device flags. + * + * The logic got a little confused at some point in the past but this is + * what they do now. + * + * bit 0: Unplug all IDE and SCSI disks. + * bit 1: Unplug all NICs. + * bit 2: Unplug IDE disks except primary master. This is overridden if + * bit 0 is also present in the mask. + * bit 3: Unplug all NVMe disks. + * + */ +#define _UNPLUG_IDE_SCSI_DISKS 0 +#define UNPLUG_IDE_SCSI_DISKS (1u << _UNPLUG_IDE_SCSI_DISKS) + +#define _UNPLUG_ALL_NICS 1 +#define UNPLUG_ALL_NICS (1u << _UNPLUG_ALL_NICS) + +#define _UNPLUG_AUX_IDE_DISKS 2 +#define UNPLUG_AUX_IDE_DISKS (1u << _UNPLUG_AUX_IDE_DISKS) + +#define _UNPLUG_NVME_DISKS 3 +#define UNPLUG_NVME_DISKS (1u << _UNPLUG_NVME_DISKS) + +static void unplug_nic(PCIBus *b, PCIDevice *d, void *o) +{ + /* We have to ignore passthrough devices */ + if (pci_get_word(d->config + PCI_CLASS_DEVICE) == + PCI_CLASS_NETWORK_ETHERNET + && strcmp(d->name, "xen-pci-passthrough") != 0) { + object_unparent(OBJECT(d)); + } +} + +/* Remove the peer of the NIC device. Normally, this would be a tap device. 
*/ +static void del_nic_peer(NICState *nic, void *opaque) +{ + NetClientState *nc; + + nc = qemu_get_queue(nic); + if (nc->peer) + qemu_del_net_client(nc->peer); +} + +static void pci_unplug_nics(PCIBus *bus) +{ + qemu_foreach_nic(del_nic_peer, NULL); + pci_for_each_device(bus, 0, unplug_nic, NULL); +} + +static void unplug_disks(PCIBus *b, PCIDevice *d, void *opaque) +{ + uint32_t flags = *(uint32_t *)opaque; + bool aux = (flags & UNPLUG_AUX_IDE_DISKS) && + !(flags & UNPLUG_IDE_SCSI_DISKS); + + /* We have to ignore passthrough devices */ + if (!strcmp(d->name, "xen-pci-passthrough")) { + return; + } + + switch (pci_get_word(d->config + PCI_CLASS_DEVICE)) { + case PCI_CLASS_STORAGE_IDE: + pci_piix3_xen_ide_unplug(DEVICE(d), aux); + break; + + case PCI_CLASS_STORAGE_SCSI: + if (!aux) { + object_unparent(OBJECT(d)); + } + break; + + case PCI_CLASS_STORAGE_EXPRESS: + if (flags & UNPLUG_NVME_DISKS) { + object_unparent(OBJECT(d)); + } + + default: + break; + } +} + +static void pci_unplug_disks(PCIBus *bus, uint32_t flags) +{ + pci_for_each_device(bus, 0, unplug_disks, &flags); +} + +static void platform_fixed_ioport_writew(void *opaque, uint32_t addr, uint32_t val) +{ + PCIXenPlatformState *s = opaque; + + switch (addr) { + case 0: { + PCIDevice *pci_dev = PCI_DEVICE(s); + /* Unplug devices. See comment above flag definitions */ + if (val & (UNPLUG_IDE_SCSI_DISKS | UNPLUG_AUX_IDE_DISKS | + UNPLUG_NVME_DISKS)) { + DPRINTF("unplug disks\n"); + pci_unplug_disks(pci_get_bus(pci_dev), val); + } + if (val & UNPLUG_ALL_NICS) { + DPRINTF("unplug nics\n"); + pci_unplug_nics(pci_get_bus(pci_dev)); + } + break; + } + case 2: + switch (val) { + case 1: + DPRINTF("Citrix Windows PV drivers loaded in guest\n"); + break; + case 0: + DPRINTF("Guest claimed to be running PV product 0?\n"); + break; + default: + DPRINTF("Unknown PV product %d loaded in guest\n", val); + break; + } + s->driver_product_version = val; + break; + } +} + +static void platform_fixed_ioport_writel(void *opaque, uint32_t addr, + uint32_t val) +{ + switch (addr) { + case 0: + /* PV driver version */ + break; + } +} + +static void platform_fixed_ioport_writeb(void *opaque, uint32_t addr, uint32_t val) +{ + PCIXenPlatformState *s = opaque; + + switch (addr) { + case 0: /* Platform flags */ { + hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ? + HVMMEM_ram_ro : HVMMEM_ram_rw; + if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) { + DPRINTF("unable to change ro/rw state of ROM memory area!\n"); + } else { + s->flags = val & PFFLAG_ROM_LOCK; + DPRINTF("changed ro/rw state of ROM memory area. now is %s state.\n", + (mem_type == HVMMEM_ram_ro ? "ro":"rw")); + } + break; + } + case 2: + log_writeb(s, val); + break; + } +} + +static uint32_t platform_fixed_ioport_readw(void *opaque, uint32_t addr) +{ + switch (addr) { + case 0: + /* Magic value so that you can identify the interface. 
*/ + return 0x49d2; + default: + return 0xffff; + } +} + +static uint32_t platform_fixed_ioport_readb(void *opaque, uint32_t addr) +{ + PCIXenPlatformState *s = opaque; + + switch (addr) { + case 0: + /* Platform flags */ + return s->flags; + case 2: + /* Version number */ + return 1; + default: + return 0xff; + } +} + +static void platform_fixed_ioport_reset(void *opaque) +{ + PCIXenPlatformState *s = opaque; + + platform_fixed_ioport_writeb(s, 0, 0); +} + +static uint64_t platform_fixed_ioport_read(void *opaque, + hwaddr addr, + unsigned size) +{ + switch (size) { + case 1: + return platform_fixed_ioport_readb(opaque, addr); + case 2: + return platform_fixed_ioport_readw(opaque, addr); + default: + return -1; + } +} + +static void platform_fixed_ioport_write(void *opaque, hwaddr addr, + + uint64_t val, unsigned size) +{ + switch (size) { + case 1: + platform_fixed_ioport_writeb(opaque, addr, val); + break; + case 2: + platform_fixed_ioport_writew(opaque, addr, val); + break; + case 4: + platform_fixed_ioport_writel(opaque, addr, val); + break; + } +} + + +static const MemoryRegionOps platform_fixed_io_ops = { + .read = platform_fixed_ioport_read, + .write = platform_fixed_ioport_write, + .valid = { + .unaligned = true, + }, + .impl = { + .min_access_size = 1, + .max_access_size = 4, + .unaligned = true, + }, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void platform_fixed_ioport_init(PCIXenPlatformState* s) +{ + memory_region_init_io(&s->fixed_io, OBJECT(s), &platform_fixed_io_ops, s, + "xen-fixed", 16); + memory_region_add_subregion(get_system_io(), XEN_PLATFORM_IOPORT, + &s->fixed_io); +} + +/* Xen Platform PCI Device */ + +static uint64_t xen_platform_ioport_readb(void *opaque, hwaddr addr, + unsigned int size) +{ + if (addr == 0) { + return platform_fixed_ioport_readb(opaque, 0); + } else { + return ~0u; + } +} + +static void xen_platform_ioport_writeb(void *opaque, hwaddr addr, + uint64_t val, unsigned int size) +{ + PCIXenPlatformState *s = opaque; + PCIDevice *pci_dev = PCI_DEVICE(s); + + switch (addr) { + case 0: /* Platform flags */ + platform_fixed_ioport_writeb(opaque, 0, (uint32_t)val); + break; + case 4: + if (val == 1) { + /* + * SUSE unplug for Xenlinux + * xen-kmp used this since xen-3.0.4, instead the official protocol + * from xen-3.3+ It did an unconditional "outl(1, (ioaddr + 4));" + * Pre VMDP 1.7 used 4 and 8 depending on how VMDP was configured. + * If VMDP was to control both disk and LAN it would use 4. + * If it controlled just disk or just LAN, it would use 8 below. 
+ */ + pci_unplug_disks(pci_get_bus(pci_dev), UNPLUG_IDE_SCSI_DISKS); + pci_unplug_nics(pci_get_bus(pci_dev)); + } + break; + case 8: + switch (val) { + case 1: + pci_unplug_disks(pci_get_bus(pci_dev), UNPLUG_IDE_SCSI_DISKS); + break; + case 2: + pci_unplug_nics(pci_get_bus(pci_dev)); + break; + default: + log_writeb(s, (uint32_t)val); + break; + } + break; + default: + break; + } +} + +static const MemoryRegionOps xen_pci_io_ops = { + .read = xen_platform_ioport_readb, + .write = xen_platform_ioport_writeb, + .impl.min_access_size = 1, + .impl.max_access_size = 1, +}; + +static void platform_ioport_bar_setup(PCIXenPlatformState *d) +{ + memory_region_init_io(&d->bar, OBJECT(d), &xen_pci_io_ops, d, + "xen-pci", 0x100); +} + +static uint64_t platform_mmio_read(void *opaque, hwaddr addr, + unsigned size) +{ + DPRINTF("Warning: attempted read from physical address " + "0x" TARGET_FMT_plx " in xen platform mmio space\n", addr); + + return 0; +} + +static void platform_mmio_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + DPRINTF("Warning: attempted write of 0x%"PRIx64" to physical " + "address 0x" TARGET_FMT_plx " in xen platform mmio space\n", + val, addr); +} + +static const MemoryRegionOps platform_mmio_handler = { + .read = &platform_mmio_read, + .write = &platform_mmio_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void platform_mmio_setup(PCIXenPlatformState *d) +{ + memory_region_init_io(&d->mmio_bar, OBJECT(d), &platform_mmio_handler, d, + "xen-mmio", 0x1000000); +} + +static int xen_platform_post_load(void *opaque, int version_id) +{ + PCIXenPlatformState *s = opaque; + + platform_fixed_ioport_writeb(s, 0, s->flags); + + return 0; +} + +static const VMStateDescription vmstate_xen_platform = { + .name = "platform", + .version_id = 4, + .minimum_version_id = 4, + .post_load = xen_platform_post_load, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(parent_obj, PCIXenPlatformState), + VMSTATE_UINT8(flags, PCIXenPlatformState), + VMSTATE_END_OF_LIST() + } +}; + +static void xen_platform_realize(PCIDevice *dev, Error **errp) +{ + PCIXenPlatformState *d = XEN_PLATFORM(dev); + uint8_t *pci_conf; + + /* Device will crash on reset if xen is not initialized */ + if (!xen_enabled()) { + error_setg(errp, "xen-platform device requires the Xen accelerator"); + return; + } + + pci_conf = dev->config; + + pci_set_word(pci_conf + PCI_COMMAND, PCI_COMMAND_IO | PCI_COMMAND_MEMORY); + + pci_config_set_prog_interface(pci_conf, 0); + + pci_conf[PCI_INTERRUPT_PIN] = 1; + + platform_ioport_bar_setup(d); + pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_IO, &d->bar); + + /* reserve 16MB mmio address for share memory*/ + platform_mmio_setup(d); + pci_register_bar(dev, 1, PCI_BASE_ADDRESS_MEM_PREFETCH, + &d->mmio_bar); + + platform_fixed_ioport_init(d); +} + +static void platform_reset(DeviceState *dev) +{ + PCIXenPlatformState *s = XEN_PLATFORM(dev); + + platform_fixed_ioport_reset(s); +} + +static void xen_platform_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + k->realize = xen_platform_realize; + k->vendor_id = PCI_VENDOR_ID_XEN; + k->device_id = PCI_DEVICE_ID_XEN_PLATFORM; + k->class_id = PCI_CLASS_OTHERS << 8 | 0x80; + k->subsystem_vendor_id = PCI_VENDOR_ID_XEN; + k->subsystem_id = PCI_DEVICE_ID_XEN_PLATFORM; + k->revision = 1; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + dc->desc = "XEN platform pci device"; + dc->reset = platform_reset; + dc->vmsd = &vmstate_xen_platform; +} 
+ +static const TypeInfo xen_platform_info = { + .name = TYPE_XEN_PLATFORM, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(PCIXenPlatformState), + .class_init = xen_platform_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, + }, +}; + +static void xen_platform_register_types(void) +{ + type_register_static(&xen_platform_info); +} + +type_init(xen_platform_register_types) diff --git a/hw/i386/xen/xen_pvdevice.c b/hw/i386/xen/xen_pvdevice.c new file mode 100644 index 000000000..1ea95fa60 --- /dev/null +++ b/hw/i386/xen/xen_pvdevice.c @@ -0,0 +1,154 @@ +/* Copyright (c) Citrix Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, + * with or without modification, are permitted provided + * that the following conditions are met: + * + * * Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the + * following disclaimer in the documentation and/or other + * materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "hw/pci/pci.h" +#include "hw/qdev-properties.h" +#include "migration/vmstate.h" +#include "trace.h" +#include "qom/object.h" + +#define TYPE_XEN_PV_DEVICE "xen-pvdevice" + +OBJECT_DECLARE_SIMPLE_TYPE(XenPVDevice, XEN_PV_DEVICE) + +struct XenPVDevice { + /*< private >*/ + PCIDevice parent_obj; + /*< public >*/ + uint16_t vendor_id; + uint16_t device_id; + uint8_t revision; + uint32_t size; + MemoryRegion mmio; +}; + +static uint64_t xen_pv_mmio_read(void *opaque, hwaddr addr, + unsigned size) +{ + trace_xen_pv_mmio_read(addr); + + return ~(uint64_t)0; +} + +static void xen_pv_mmio_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + trace_xen_pv_mmio_write(addr); +} + +static const MemoryRegionOps xen_pv_mmio_ops = { + .read = &xen_pv_mmio_read, + .write = &xen_pv_mmio_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static const VMStateDescription vmstate_xen_pvdevice = { + .name = "xen-pvdevice", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(parent_obj, XenPVDevice), + VMSTATE_END_OF_LIST() + } +}; + +static void xen_pv_realize(PCIDevice *pci_dev, Error **errp) +{ + XenPVDevice *d = XEN_PV_DEVICE(pci_dev); + uint8_t *pci_conf; + + /* device-id property must always be supplied */ + if (d->device_id == 0xffff) { + error_setg(errp, "Device ID invalid, it must always be supplied"); + return; + } + + pci_conf = pci_dev->config; + + pci_set_word(pci_conf + PCI_VENDOR_ID, d->vendor_id); + pci_set_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID, d->vendor_id); + pci_set_word(pci_conf + PCI_DEVICE_ID, d->device_id); + pci_set_word(pci_conf + PCI_SUBSYSTEM_ID, d->device_id); + pci_set_byte(pci_conf + PCI_REVISION_ID, d->revision); + + pci_set_word(pci_conf + PCI_COMMAND, PCI_COMMAND_MEMORY); + + pci_config_set_prog_interface(pci_conf, 0); + + pci_conf[PCI_INTERRUPT_PIN] = 1; + + memory_region_init_io(&d->mmio, NULL, &xen_pv_mmio_ops, d, + "mmio", d->size); + + pci_register_bar(pci_dev, 1, PCI_BASE_ADDRESS_MEM_PREFETCH, + &d->mmio); +} + +static Property xen_pv_props[] = { + DEFINE_PROP_UINT16("vendor-id", XenPVDevice, vendor_id, PCI_VENDOR_ID_XEN), + DEFINE_PROP_UINT16("device-id", XenPVDevice, device_id, 0xffff), + DEFINE_PROP_UINT8("revision", XenPVDevice, revision, 0x01), + DEFINE_PROP_UINT32("size", XenPVDevice, size, 0x400000), + DEFINE_PROP_END_OF_LIST() +}; + +static void xen_pv_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + k->realize = xen_pv_realize; + k->class_id = PCI_CLASS_SYSTEM_OTHER; + dc->desc = "Xen PV Device"; + device_class_set_props(dc, xen_pv_props); + dc->vmsd = &vmstate_xen_pvdevice; +} + +static const TypeInfo xen_pv_type_info = { + .name = TYPE_XEN_PV_DEVICE, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(XenPVDevice), + .class_init = xen_pv_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, + }, +}; + +static void xen_pv_register_types(void) +{ + type_register_static(&xen_pv_type_info); +} + +type_init(xen_pv_register_types) |