path: root/roms/skiboot/hw
author    Angelos Mouzakitis <a.mouzakitis@virtualopensystems.com>    2023-10-10 14:33:42 +0000
committer Angelos Mouzakitis <a.mouzakitis@virtualopensystems.com>    2023-10-10 14:33:42 +0000
commit    af1a266670d040d2f4083ff309d732d648afba2a (patch)
tree      2fc46203448ddcc6f81546d379abfaeb323575e9 /roms/skiboot/hw
parent    e02cda008591317b1625707ff8e115a4841aa889 (diff)
Add submodule dependency files (HEAD, master)
Change-Id: Iaf8d18082d3991dec7c0ebbea540f092188eb4ec
Diffstat (limited to 'roms/skiboot/hw')
-rw-r--r--  roms/skiboot/hw/Makefile.inc | 19
-rw-r--r--  roms/skiboot/hw/ast-bmc/Makefile.inc | 6
-rw-r--r--  roms/skiboot/hw/ast-bmc/ast-io.c | 498
-rw-r--r--  roms/skiboot/hw/ast-bmc/ast-sf-ctrl.c | 1020
-rw-r--r--  roms/skiboot/hw/bt.c | 720
-rw-r--r--  roms/skiboot/hw/cache-p9.c | 162
-rw-r--r--  roms/skiboot/hw/capp.c | 243
-rw-r--r--  roms/skiboot/hw/centaur.c | 555
-rw-r--r--  roms/skiboot/hw/chiptod.c | 2067
-rw-r--r--  roms/skiboot/hw/dio-p9.c | 132
-rw-r--r--  roms/skiboot/hw/dts.c | 416
-rw-r--r--  roms/skiboot/hw/fake-nvram.c | 49
-rw-r--r--  roms/skiboot/hw/fake-rtc.c | 100
-rw-r--r--  roms/skiboot/hw/fsi-master.c | 675
-rw-r--r--  roms/skiboot/hw/fsp/Makefile.inc | 13
-rw-r--r--  roms/skiboot/hw/fsp/fsp-attn.c | 143
-rw-r--r--  roms/skiboot/hw/fsp/fsp-chiptod.c | 69
-rw-r--r--  roms/skiboot/hw/fsp/fsp-codeupdate.c | 1315
-rw-r--r--  roms/skiboot/hw/fsp/fsp-codeupdate.h | 222
-rw-r--r--  roms/skiboot/hw/fsp/fsp-console.c | 1062
-rw-r--r--  roms/skiboot/hw/fsp/fsp-diag.c | 46
-rw-r--r--  roms/skiboot/hw/fsp/fsp-dpo.c | 154
-rw-r--r--  roms/skiboot/hw/fsp/fsp-dump.c | 916
-rw-r--r--  roms/skiboot/hw/fsp/fsp-elog-read.c | 608
-rw-r--r--  roms/skiboot/hw/fsp/fsp-elog-write.c | 441
-rw-r--r--  roms/skiboot/hw/fsp/fsp-epow.c | 192
-rw-r--r--  roms/skiboot/hw/fsp/fsp-epow.h | 21
-rw-r--r--  roms/skiboot/hw/fsp/fsp-ipmi.c | 400
-rw-r--r--  roms/skiboot/hw/fsp/fsp-leds.c | 1939
-rw-r--r--  roms/skiboot/hw/fsp/fsp-mem-err.c | 401
-rw-r--r--  roms/skiboot/hw/fsp/fsp-nvram.c | 424
-rw-r--r--  roms/skiboot/hw/fsp/fsp-occ.c | 417
-rw-r--r--  roms/skiboot/hw/fsp/fsp-op-panel.c | 266
-rw-r--r--  roms/skiboot/hw/fsp/fsp-psi.c | 75
-rw-r--r--  roms/skiboot/hw/fsp/fsp-rtc.c | 567
-rw-r--r--  roms/skiboot/hw/fsp/fsp-sensor.c | 860
-rw-r--r--  roms/skiboot/hw/fsp/fsp-surveillance.c | 226
-rw-r--r--  roms/skiboot/hw/fsp/fsp-sysdump.c | 407
-rw-r--r--  roms/skiboot/hw/fsp/fsp-sysparam.c | 508
-rw-r--r--  roms/skiboot/hw/fsp/fsp.c | 2709
-rw-r--r--  roms/skiboot/hw/homer.c | 252
-rw-r--r--  roms/skiboot/hw/imc.c | 1075
-rw-r--r--  roms/skiboot/hw/ipmi/Makefile.inc | 9
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-attn.c | 100
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-fru.c | 231
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-info.c | 206
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-power.c | 85
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-rtc.c | 127
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-sel.c | 701
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-sensor.c | 160
-rw-r--r--  roms/skiboot/hw/ipmi/ipmi-watchdog.c | 218
-rw-r--r--  roms/skiboot/hw/ipmi/test/Makefile.check | 34
-rw-r--r--  roms/skiboot/hw/ipmi/test/run-fru.c | 116
-rw-r--r--  roms/skiboot/hw/lpc-mbox.c | 346
-rw-r--r--  roms/skiboot/hw/lpc-port80h.c | 173
-rw-r--r--  roms/skiboot/hw/lpc-rtc.c | 235
-rw-r--r--  roms/skiboot/hw/lpc-uart.c | 738
-rw-r--r--  roms/skiboot/hw/lpc.c | 1407
-rw-r--r--  roms/skiboot/hw/npu-hw-procedures.c | 608
-rw-r--r--  roms/skiboot/hw/npu-opal.c | 176
-rw-r--r--  roms/skiboot/hw/npu.c | 1693
-rw-r--r--  roms/skiboot/hw/npu2-common.c | 681
-rw-r--r--  roms/skiboot/hw/npu2-hw-procedures.c | 1079
-rw-r--r--  roms/skiboot/hw/npu2-opencapi.c | 2370
-rw-r--r--  roms/skiboot/hw/npu2.c | 2323
-rw-r--r--  roms/skiboot/hw/npu3-hw-procedures.c | 792
-rw-r--r--  roms/skiboot/hw/npu3-nvlink.c | 1828
-rw-r--r--  roms/skiboot/hw/npu3.c | 549
-rw-r--r--  roms/skiboot/hw/nx-842.c | 231
-rw-r--r--  roms/skiboot/hw/nx-compress.c | 340
-rw-r--r--  roms/skiboot/hw/nx-crypto.c | 298
-rw-r--r--  roms/skiboot/hw/nx-gzip.c | 118
-rw-r--r--  roms/skiboot/hw/nx-rng.c | 121
-rw-r--r--  roms/skiboot/hw/nx.c | 138
-rw-r--r--  roms/skiboot/hw/occ-sensor.c | 640
-rw-r--r--  roms/skiboot/hw/occ.c | 2339
-rw-r--r--  roms/skiboot/hw/ocmb.c | 167
-rw-r--r--  roms/skiboot/hw/p8-i2c.c | 1688
-rw-r--r--  roms/skiboot/hw/phb3.c | 5052
-rw-r--r--  roms/skiboot/hw/phb4.c | 6400
-rw-r--r--  roms/skiboot/hw/phys-map.c | 445
-rw-r--r--  roms/skiboot/hw/prd.c | 789
-rw-r--r--  roms/skiboot/hw/psi.c | 1079
-rw-r--r--  roms/skiboot/hw/sbe-p8.c | 195
-rw-r--r--  roms/skiboot/hw/sbe-p9.c | 1040
-rw-r--r--  roms/skiboot/hw/sfc-ctrl.c | 510
-rw-r--r--  roms/skiboot/hw/slw.c | 1731
-rw-r--r--  roms/skiboot/hw/test/Makefile.check | 29
-rw-r--r--  roms/skiboot/hw/test/phys-map-test.c | 203
-rw-r--r--  roms/skiboot/hw/test/run-port80h.c | 99
-rw-r--r--  roms/skiboot/hw/vas.c | 639
-rw-r--r--  roms/skiboot/hw/xive.c | 5234
-rw-r--r--  roms/skiboot/hw/xive2.c | 4666
-rw-r--r--  roms/skiboot/hw/xscom.c | 1019
94 files changed, 74585 insertions, 0 deletions
diff --git a/roms/skiboot/hw/Makefile.inc b/roms/skiboot/hw/Makefile.inc
new file mode 100644
index 000000000..37256d3cc
--- /dev/null
+++ b/roms/skiboot/hw/Makefile.inc
@@ -0,0 +1,19 @@
+# -*-Makefile-*-
+SUBDIRS += hw
+HW_OBJS = xscom.o chiptod.o lpc.o lpc-uart.o psi.o
+HW_OBJS += homer.o slw.o occ.o fsi-master.o centaur.o imc.o
+HW_OBJS += nx.o nx-rng.o nx-crypto.o nx-compress.o nx-842.o nx-gzip.o
+HW_OBJS += phb3.o sfc-ctrl.o fake-rtc.o bt.o p8-i2c.o prd.o
+HW_OBJS += dts.o lpc-rtc.o npu.o npu-hw-procedures.o xive.o phb4.o
+HW_OBJS += fake-nvram.o lpc-mbox.o npu2.o npu2-hw-procedures.o
+HW_OBJS += npu2-common.o npu2-opencapi.o phys-map.o sbe-p9.o capp.o
+HW_OBJS += occ-sensor.o vas.o sbe-p8.o dio-p9.o lpc-port80h.o cache-p9.o
+HW_OBJS += npu-opal.o npu3.o npu3-nvlink.o npu3-hw-procedures.o
+HW_OBJS += ocmb.o xive2.o
+HW=hw/built-in.a
+
+include $(SRC)/hw/fsp/Makefile.inc
+include $(SRC)/hw/ast-bmc/Makefile.inc
+include $(SRC)/hw/ipmi/Makefile.inc
+
+$(HW): $(HW_OBJS:%=hw/%) $(FSP) $(EC) $(AST_BMC) $(IPMI)
diff --git a/roms/skiboot/hw/ast-bmc/Makefile.inc b/roms/skiboot/hw/ast-bmc/Makefile.inc
new file mode 100644
index 000000000..e7ded0e88
--- /dev/null
+++ b/roms/skiboot/hw/ast-bmc/Makefile.inc
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+SUBDIRS += hw/ast-bmc
+
+AST_BMC_OBJS = ast-io.o ast-sf-ctrl.o
+AST_BMC = hw/ast-bmc/built-in.a
+$(AST_BMC): $(AST_BMC_OBJS:%=hw/ast-bmc/%)
diff --git a/roms/skiboot/hw/ast-bmc/ast-io.c b/roms/skiboot/hw/ast-bmc/ast-io.c
new file mode 100644
index 000000000..f0f8c4c4d
--- /dev/null
+++ b/roms/skiboot/hw/ast-bmc/ast-io.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Note about accesses to the AST2400 internal memory map:
+ *
+ * There are two ways to generate accesses to the AHB bus of the AST2400
+ * from the host. The LPC->AHB bridge and the iLPC->AHB bridge.
+ *
+ * LPC->AHB bridge
+ * ---------------
+ *
+ * This bridge directly converts memory or firmware accesses using
+ * a set of registers for establishing a remapping window. We prefer
+ * using FW space as normal memory space is limited to byte accesses
+ * to a fixed 256M window, while FW space allows us to use different
+ * access sizes and to control the IDSEL bits which essentially enable
+ * a full 4G address space.
+ *
+ * The way FW accesses map onto AHB is controlled via two registers
+ * in the BMC's LPC host controller:
+ *
+ * HICR7 at 0x1e789088 [31:16] : ADRBASE
+ * [15:00] : HWMBASE
+ *
+ * HICR8 at 0x1e78908c [31:16] : ADRMASK
+ * [15:00] : HWNCARE
+ *
+ * All decoding/remapping happens on the top 16 bits of the LPC address
+ * named LPC_ADDR, as follows:
+ *
+ * - For decoding, LPC_ADDR bits are compared with HWMBASE if the
+ * corresponding bit in HWNCARE is 0.
+ *
+ * - For remapping, the AHB address is constructed by taking bits
+ * from LPC_ADDR if the corresponding bit in ADRMASK is 0 or in
+ * ADRBASE if the corresponding bit in ADRMASK is 1
+ *
+ * Example of 2MB SPI flash, LPC 0xFCE00000~0xFCFFFFFF onto
+ * AHB 0x30000000~0x301FFFFF (SPI flash)
+ *
+ * ADRBASE=0x3000 HWMBASE=0xFCE0
+ * ADRMASK=0xFFE0 HWNCARE=0x001F
+ *
+ * This comes pre-configured by the BMC or Hostboot to access the PNOR
+ * flash from IDSEL 0 as follows:
+ *
+ * ADRBASE=0x3000 HWMBASE=0x0e00 for 32MB
+ * ADRMASK=0xfe00 HWNCARE=0x01ff
+ *
+ * Which means mapping of LPC 0x0e000000..0x0fffffff onto
+ * AHB 0x30000000..0x31ffffff
+ *
+ * iLPC->AHB bridge
+ * ---------------
+ *
+ * This bridge is hosted in the SuperIO part of the BMC and is
+ * controlled by a series of byte-sized registers accessed indirectly
+ * via IO ports 0x2e and 0x2f.
+ *
+ * Via these, byte by byte, we can construct an AHB address and
+ * fill a data buffer to trigger a write cycle, or we can do a
+ * read cycle and read back the data, byte after byte.
+ *
+ * This is fairly convoluted and slow but works regardless of what
+ * mapping was established in the LPC->AHB bridge.
+ *
+ * For the time being, we use the iLPC->AHB for everything except
+ * pnor accesses. In the long run, we will reconfigure the LPC->AHB
+ * to provide more direct access to all of the BMC address space but
+ * we'll only do that after the boot script/program on the BMC is
+ * updated to restore the bridge to a state compatible with the SBE
+ * expectations on boot.
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <lpc.h>
+#include <lock.h>
+#include <device.h>
+
+#include "ast.h"
+
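+/*
+ * Worked example of the HICR7/HICR8 decode/remap rules described in the
+ * comment at the top of this file. The helper below is purely an
+ * illustration: its name and parameters are made up for the example and
+ * nothing in the driver calls it.
+ */
+static inline bool ast_fw_remap_example(uint32_t lpc_addr,
+ uint16_t adrbase, uint16_t hwmbase,
+ uint16_t adrmask, uint16_t hwncare,
+ uint32_t *ahb_addr)
+{
+ uint16_t lpc_top = lpc_addr >> 16;
+
+ /* Decode: compare the LPC top bits with HWMBASE wherever HWNCARE is 0 */
+ if ((lpc_top & ~hwncare) != (hwmbase & ~hwncare))
+ return false;
+
+ /* Remap: take bits from ADRBASE where ADRMASK is 1, else from LPC_ADDR */
+ *ahb_addr = ((uint32_t)((adrbase & adrmask) | (lpc_top & ~adrmask)) << 16) |
+ (lpc_addr & 0xffff);
+ return true;
+}
+
+/*
+ * With the pre-configured PNOR window mentioned above (ADRBASE=0x3000,
+ * HWMBASE=0x0e00, ADRMASK=0xfe00, HWNCARE=0x01ff), an LPC FW access at
+ * 0x0e012345 decodes and remaps to AHB 0x30012345.
+ */
+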
+#define BMC_SIO_SCR28 0x28
+#define BOOT_FLAGS_VERSION 0x42
+
+/*
+ * SIO Register 0x29: Boot Flags (normal bit ordering)
+ *
+ * [7:6] Hostboot Boot mode:
+ * 00 : Normal
+ * 01 : Terminate on first error
+ * 10 : istep mode
+ * 11 : reserved
+ * [5:4] Boot options
+ * 00 : reserved
+ * 01 : Memboot
+ * 10 : Clear gard
+ * 11 : reserved
+ * [ 3 ] BMC mbox PNOR driver
+ * [2:0] Hostboot Log level:
+ * 000 : Normal
+ * 001 : Enable Scan trace
+ * xxx : reserved
+ */
+
+#define BMC_SIO_SCR29 0x29
+#define BMC_SIO_SCR29_MBOX 0x08
+#define BMC_SIO_SCR29_MEMBOOT 0x10
+
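+/*
+ * Example decode using the layout above: a SCR29 value of 0x10 means
+ * normal boot mode ([7:6] = 00), the Memboot option ([5:4] = 01), no
+ * BMC mbox PNOR driver (bit 3 clear) and normal log level ([2:0] =
+ * 000), which is exactly what BMC_SIO_SCR29_MEMBOOT tests for.
+ */
+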
+/*
+ * SIO Register 0x2d: Platform Flags (normal bit ordering)
+ *
+ * [ 7 ] Hostboot configures SUART
+ * [ 6 ] Hostboot configures VUART
+ * [5:1] Reserved
+ * [ 0 ] Isolate Service Processor
+ */
+#define BMC_SIO_PLAT_FLAGS 0x2d
+#define BMC_SIO_PLAT_ISOLATE_SP 0x01
+
+enum {
+ BMC_SIO_DEV_NONE = -1,
+ BMC_SIO_DEV_UART1 = 2,
+ BMC_SIO_DEV_UART2 = 3,
+ BMC_SIO_DEV_SWC = 4,
+ BMC_SIO_DEV_KBC = 5,
+ BMC_SIO_DEV_P80 = 7,
+ BMC_SIO_DEV_UART3 = 0xb,
+ BMC_SIO_DEV_UART4 = 0xc,
+ BMC_SIO_DEV_LPC2AHB = 0xd,
+ BMC_SIO_DEV_MBOX = 0xe,
+};
+
+static struct lock bmc_sio_lock = LOCK_UNLOCKED;
+static int bmc_sio_cur_dev = BMC_SIO_DEV_NONE;
+
+/*
+ * SuperIO indirect accesses
+ */
+static void bmc_sio_outb(uint8_t val, uint8_t reg)
+{
+ lpc_outb(reg, 0x2e);
+ lpc_outb(val, 0x2f);
+}
+
+static uint8_t bmc_sio_inb(uint8_t reg)
+{
+ lpc_outb(reg, 0x2e);
+ return lpc_inb(0x2f);
+}
+
+static void bmc_sio_get(int dev)
+{
+ lock(&bmc_sio_lock);
+
+ if (bmc_sio_cur_dev == dev || dev < 0)
+ return;
+
+ if (bmc_sio_cur_dev == BMC_SIO_DEV_NONE) {
+ /* Send SuperIO password */
+ lpc_outb(0xa5, 0x2e);
+ lpc_outb(0xa5, 0x2e);
+ }
+
+ /* Select logical dev */
+ bmc_sio_outb(dev, 0x07);
+
+ bmc_sio_cur_dev = dev;
+}
+
+static void bmc_sio_put(bool lock_sio)
+{
+ if (lock_sio) {
+ /* Re-lock SuperIO */
+ lpc_outb(0xaa, 0x2e);
+
+ bmc_sio_cur_dev = BMC_SIO_DEV_NONE;
+ }
+ unlock(&bmc_sio_lock);
+}
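+
+/*
+ * Typical use of the accessors above (illustrative only, this exact
+ * sequence does not appear in the driver): select a logical device,
+ * access its registers through the 0x2e/0x2f index/data pair, then
+ * release, optionally re-locking the SuperIO:
+ *
+ * bmc_sio_get(BMC_SIO_DEV_LPC2AHB);
+ * enabled = bmc_sio_inb(0x30); // 0x30 is the device enable register
+ * bmc_sio_put(false);
+ */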
+
+/*
+ * AHB accesses via iLPC->AHB in SuperIO. Works on byteswapped
+ * values (ie. Little Endian registers)
+ */
+static void bmc_sio_ahb_prep(uint32_t reg, uint8_t type)
+{
+ /* Enable iLPC->AHB */
+ bmc_sio_outb(0x01, 0x30);
+
+ /* Address */
+ bmc_sio_outb((reg >> 24) & 0xff, 0xf0);
+ bmc_sio_outb((reg >> 16) & 0xff, 0xf1);
+ bmc_sio_outb((reg >> 8) & 0xff, 0xf2);
+ bmc_sio_outb((reg ) & 0xff, 0xf3);
+
+ /* bytes cycle type */
+ bmc_sio_outb(type, 0xf8);
+}
+
+static void bmc_sio_ahb_writel(uint32_t val, uint32_t reg)
+{
+ bmc_sio_get(BMC_SIO_DEV_LPC2AHB);
+
+ bmc_sio_ahb_prep(reg, 2);
+
+ /* Write data */
+ bmc_sio_outb(val >> 24, 0xf4);
+ bmc_sio_outb(val >> 16, 0xf5);
+ bmc_sio_outb(val >> 8, 0xf6);
+ bmc_sio_outb(val , 0xf7);
+
+ /* Trigger */
+ bmc_sio_outb(0xcf, 0xfe);
+
+ bmc_sio_put(false);
+}
+
+static uint32_t bmc_sio_ahb_readl(uint32_t reg)
+{
+ uint32_t val = 0;
+
+ bmc_sio_get(BMC_SIO_DEV_LPC2AHB);
+
+ bmc_sio_ahb_prep(reg, 2);
+
+ /* Trigger */
+ bmc_sio_inb(0xfe);
+
+ /* Read results */
+ val = (val << 8) | bmc_sio_inb(0xf4);
+ val = (val << 8) | bmc_sio_inb(0xf5);
+ val = (val << 8) | bmc_sio_inb(0xf6);
+ val = (val << 8) | bmc_sio_inb(0xf7);
+
+ bmc_sio_put(false);
+
+ return val;
+}
+
+/*
+ * External API
+ *
+ * We only support 4-byte accesses to all of AHB. We additionally
+ * support 1-byte accesses to the flash area only.
+ *
+ * We could support all access sizes via iLPC but we don't need
+ * that for now.
+ */
+
+void ast_ahb_writel(uint32_t val, uint32_t reg)
+{
+ /* For now, always use iLPC->AHB, it will byteswap */
+ bmc_sio_ahb_writel(val, reg);
+}
+
+uint32_t ast_ahb_readl(uint32_t reg)
+{
+ /* For now, always use iLPC->AHB, it will byteswap */
+ return bmc_sio_ahb_readl(reg);
+}
+
+static void ast_setup_sio_irq_polarity(void)
+{
+ /* Select logical dev 2 */
+ bmc_sio_get(BMC_SIO_DEV_UART1);
+ bmc_sio_outb(0x01, 0x71); /* level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev 3 */
+ bmc_sio_get(BMC_SIO_DEV_UART2);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev 4 */
+ bmc_sio_get(BMC_SIO_DEV_SWC);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev 5 */
+ bmc_sio_get(BMC_SIO_DEV_KBC);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_outb(0x01, 0x73); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev 7 */
+ bmc_sio_get(BMC_SIO_DEV_P80);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev b */
+ bmc_sio_get(BMC_SIO_DEV_UART3);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev c */
+ bmc_sio_get(BMC_SIO_DEV_UART4);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev d */
+ bmc_sio_get(BMC_SIO_DEV_LPC2AHB);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(false);
+
+ /* Select logical dev e */
+ bmc_sio_get(BMC_SIO_DEV_MBOX);
+ bmc_sio_outb(0x01, 0x71); /* irq level low */
+ bmc_sio_put(true);
+}
+
+bool ast_sio_is_enabled(void)
+{
+ bool enabled;
+ int64_t rc;
+
+ lock(&bmc_sio_lock);
+ /*
+ * Probe by attempting to lock the SIO device, this way the
+ * post-condition is that the SIO device is locked or not able to be
+ * unlocked. This turns out neater than trying to use the unlock code.
+ */
+ rc = lpc_probe_write(OPAL_LPC_IO, 0x2e, 0xaa, 1);
+ if (rc) {
+ enabled = false;
+ /* If we can't lock it, then we can't unlock it either */
+ goto out;
+ }
+
+ /*
+ * Now that we know it is locked and can be unlocked, unlock it
+ * if skiboot's recorded device state indicates it was previously
+ * unlocked.
+ */
+ if (bmc_sio_cur_dev != BMC_SIO_DEV_NONE) {
+ /* Send SuperIO password */
+ lpc_outb(0xa5, 0x2e);
+ lpc_outb(0xa5, 0x2e);
+
+ /* Ensure the previously selected logical dev is selected */
+ bmc_sio_outb(bmc_sio_cur_dev, 0x07);
+ }
+
+ enabled = true;
+out:
+ unlock(&bmc_sio_lock);
+
+ return enabled;
+}
+
+bool ast_sio_init(void)
+{
+ bool enabled = ast_sio_is_enabled();
+
+ /* Configure all AIO interrupts to level low */
+ if (enabled)
+ ast_setup_sio_irq_polarity();
+
+ return enabled;
+}
+
+bool ast_io_is_rw(void)
+{
+ return !(ast_ahb_readl(LPC_HICRB) & LPC_HICRB_ILPC_DISABLE);
+}
+
+bool ast_io_init(void)
+{
+ return ast_io_is_rw();
+}
+
+bool ast_lpc_fw_ipmi_hiomap(void)
+{
+ return platform.bmc->sw->ipmi_oem_hiomap_cmd != 0;
+}
+
+bool ast_lpc_fw_mbox_hiomap(void)
+{
+ struct dt_node *n;
+
+ n = dt_find_compatible_node(dt_root, NULL, "mbox");
+
+ return n != NULL;
+}
+
+bool ast_lpc_fw_maps_flash(void)
+{
+ uint8_t boot_version;
+ uint8_t boot_flags;
+
+ boot_version = bmc_sio_inb(BMC_SIO_SCR28);
+ if (boot_version != BOOT_FLAGS_VERSION)
+ return true;
+
+ boot_flags = bmc_sio_inb(BMC_SIO_SCR29);
+ return !(boot_flags & BMC_SIO_SCR29_MEMBOOT);
+}
+
+bool ast_scratch_reg_is_mbox(void)
+{
+ uint8_t boot_version;
+ uint8_t boot_flags;
+
+ boot_version = bmc_sio_inb(BMC_SIO_SCR28);
+ if (boot_version != BOOT_FLAGS_VERSION)
+ return false;
+
+ boot_flags = bmc_sio_inb(BMC_SIO_SCR29);
+ return boot_flags & BMC_SIO_SCR29_MBOX;
+}
+
+void ast_setup_ibt(uint16_t io_base, uint8_t irq)
+{
+ uint32_t v;
+
+ v = bmc_sio_ahb_readl(LPC_iBTCR0);
+ v = v & ~(0xfffffc00u);
+ v = v | (((uint32_t)io_base) << 16);
+ v = v | (((uint32_t)irq) << 12);
+ bmc_sio_ahb_writel(v, LPC_iBTCR0);
+}
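+
+/*
+ * For example, io_base 0xe4 and irq 10 yield field bits 0x00e4a000:
+ * the I/O address goes into iBTCR0[31:16] and the IRQ number into
+ * [15:12], while the low ten bits (not covered by the 0xfffffc00
+ * mask) keep their previous value.
+ */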
+
+bool ast_is_vuart1_enabled(void)
+{
+ uint32_t v;
+
+ v = bmc_sio_ahb_readl(VUART1_GCTRLA);
+ return !!(v & 1);
+}
+
+void ast_setup_vuart1(uint16_t io_base, uint8_t irq)
+{
+ uint32_t v;
+
+ /* IRQ level low */
+ v = bmc_sio_ahb_readl(VUART1_GCTRLA);
+ v = v & ~2u;
+ bmc_sio_ahb_writel(v, VUART1_GCTRLA);
+ v = bmc_sio_ahb_readl(VUART1_GCTRLA);
+
+ /* IRQ number */
+ v = bmc_sio_ahb_readl(VUART1_GCTRLB);
+ v = (v & ~0xf0u) | (irq << 4);
+ bmc_sio_ahb_writel(v, VUART1_GCTRLB);
+
+ /* Address */
+ bmc_sio_ahb_writel(io_base & 0xff, VUART1_ADDRL);
+ bmc_sio_ahb_writel(io_base >> 8, VUART1_ADDRH);
+}
+
+/* Setup SuperIO UART 1 */
+void ast_setup_sio_uart1(uint16_t io_base, uint8_t irq)
+{
+ bmc_sio_get(BMC_SIO_DEV_UART1);
+
+ /* Disable UART1 for configuration */
+ bmc_sio_outb(0x00, 0x30);
+
+ /* Configure base and interrupt */
+ bmc_sio_outb(io_base >> 8, 0x60);
+ bmc_sio_outb(io_base & 0xff, 0x61);
+ bmc_sio_outb(irq, 0x70);
+ bmc_sio_outb(0x01, 0x71); /* level low */
+
+ /* Enable UART1 */
+ bmc_sio_outb(0x01, 0x30);
+
+ bmc_sio_put(true);
+}
+
+void ast_disable_sio_uart1(void)
+{
+ bmc_sio_get(BMC_SIO_DEV_UART1);
+
+ /* Disable UART1 */
+ bmc_sio_outb(0x00, 0x30);
+
+ bmc_sio_put(true);
+}
+
+void ast_setup_sio_mbox(uint16_t io_base, uint8_t irq)
+{
+ bmc_sio_get(BMC_SIO_DEV_MBOX);
+
+ /* Disable for configuration */
+ bmc_sio_outb(0x00, 0x30);
+
+ bmc_sio_outb(io_base >> 8, 0x60);
+ bmc_sio_outb(io_base & 0xff, 0x61);
+ bmc_sio_outb(irq, 0x70);
+ bmc_sio_outb(0x01, 0x71); /* level low */
+
+ /* Enable MailBox */
+ bmc_sio_outb(0x01, 0x30);
+
+ bmc_sio_put(true);
+}
+
diff --git a/roms/skiboot/hw/ast-bmc/ast-sf-ctrl.c b/roms/skiboot/hw/ast-bmc/ast-sf-ctrl.c
new file mode 100644
index 000000000..03cc44318
--- /dev/null
+++ b/roms/skiboot/hw/ast-bmc/ast-sf-ctrl.c
@@ -0,0 +1,1020 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2018 IBM Corp. */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <libflash/libflash.h>
+#include <libflash/libflash-priv.h>
+#ifdef __SKIBOOT__
+#include "lpc.h"
+#endif
+
+#include "ast.h"
+
+#ifndef __unused
+#define __unused __attribute__((unused))
+#endif
+
+#define CALIBRATE_BUF_SIZE 16384
+
+struct ast_sf_ctrl {
+ /* We have 2 controllers, one for the BMC flash, one for the PNOR */
+ uint8_t type;
+
+ /* Address and previous value of the ctrl register */
+ uint32_t ctl_reg;
+
+ /* Control register value for normal commands */
+ uint32_t ctl_val;
+
+ /* Control register value for (fast) reads */
+ uint32_t ctl_read_val;
+
+ /* Flash read timing register */
+ uint32_t fread_timing_reg;
+ uint32_t fread_timing_val;
+
+ /* Address of the flash mapping */
+ uint32_t flash;
+
+ /* Current 4b mode */
+ bool mode_4b;
+
+ /* Callbacks */
+ struct spi_flash_ctrl ops;
+};
+
+static uint32_t ast_ahb_freq;
+
+static const uint32_t ast_ct_hclk_divs[] = {
+ 0xf, /* HCLK */
+ 0x7, /* HCLK/2 */
+ 0xe, /* HCLK/3 */
+ 0x6, /* HCLK/4 */
+ 0xd, /* HCLK/5 */
+};
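+
+/*
+ * Rough layout of the SPI flash control register as assembled
+ * throughout this file, reconstructed from the driver's own inline
+ * comments (not transcribed from a datasheet):
+ *
+ *  [31:28] IO mode (0 = single bit, 2 = dual data, 3 = dual IO)
+ *  [27:24] CE# inactive width (apparently encoded as (16 - value) T)
+ *  [23:16] command byte used for fast reads
+ *  [   13] 4-byte address mode, toggled by ast_sf_set_4b()
+ *  [11: 8] HCLK divider select, see ast_ct_hclk_divs[] above
+ *  [ 7: 6] number of dummy-cycle bytes
+ *  [ 1: 0] cycle type: 0 = normal read, 1 = fast read, 3 = user mode
+ *          (in user mode, bit 2 appears to deassert CE#, see
+ *          ast_sf_start_cmd() / ast_sf_end_cmd())
+ */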
+
+#ifdef __SKIBOOT__
+#define PNOR_AHB_ADDR 0x30000000
+static uint32_t pnor_lpc_offset;
+
+static int ast_copy_to_ahb(uint32_t reg, const void *src, uint32_t len)
+{
+ /* Check we don't cross IDSEL segments */
+ if ((reg ^ (reg + len - 1)) >> 28)
+ return -EINVAL;
+
+ /* SPI flash, use LPC->AHB bridge */
+ if ((reg >> 28) == (PNOR_AHB_ADDR >> 28)) {
+ uint32_t chunk, off = reg - PNOR_AHB_ADDR + pnor_lpc_offset;
+ int64_t rc;
+
+ while(len) {
+ /* Choose access size */
+ if (len > 3 && !(off & 3)) {
+ rc = lpc_write(OPAL_LPC_FW, off,
+ *(uint32_t *)src, 4);
+ chunk = 4;
+ } else {
+ rc = lpc_write(OPAL_LPC_FW, off,
+ *(uint8_t *)src, 1);
+ chunk = 1;
+ }
+ if (rc) {
+ prerror("AST_IO: lpc_write.sb failure %lld"
+ " to FW 0x%08x\n", rc, off);
+ return rc;
+ }
+ len -= chunk;
+ off += chunk;
+ src += chunk;
+ }
+ return 0;
+ }
+
+ /* Otherwise we don't do byte access (... yet) */
+ prerror("AST_IO: Attempted write bytes access to %08x\n", reg);
+ return -EINVAL;
+}
+
+static int ast_copy_from_ahb(void *dst, uint32_t reg, uint32_t len)
+{
+ /* Check we don't cross IDSEL segments */
+ if ((reg ^ (reg + len - 1)) >> 28)
+ return -EINVAL;
+
+ /* SPI flash, use LPC->AHB bridge */
+ if ((reg >> 28) == (PNOR_AHB_ADDR >> 28)) {
+ uint32_t chunk, off = reg - PNOR_AHB_ADDR + pnor_lpc_offset;
+ int64_t rc;
+
+ while(len) {
+ uint32_t dat;
+
+ /* Choose access size */
+ if (len > 3 && !(off & 3)) {
+ rc = lpc_read(OPAL_LPC_FW, off, &dat, 4);
+ if (!rc)
+ *(uint32_t *)dst = dat;
+ chunk = 4;
+ } else {
+ rc = lpc_read(OPAL_LPC_FW, off, &dat, 1);
+ if (!rc)
+ *(uint8_t *)dst = dat;
+ chunk = 1;
+ }
+ if (rc) {
+ prerror("AST_IO: lpc_read.sb failure %lld"
+ " to FW 0x%08x\n", rc, off);
+ return rc;
+ }
+ len -= chunk;
+ off += chunk;
+ dst += chunk;
+ }
+ return 0;
+ }
+ /* Otherwise we don't do byte access (... yet) */
+ prerror("AST_IO: Attempted read bytes access to %08x\n", reg);
+ return -EINVAL;
+}
+#endif /* __SKIBOOT__ */
+
+static int ast_sf_start_cmd(struct ast_sf_ctrl *ct, uint8_t cmd)
+{
+ /* Switch to user mode, CE# dropped */
+ ast_ahb_writel(ct->ctl_val | 7, ct->ctl_reg);
+
+ /* user mode, CE# active */
+ ast_ahb_writel(ct->ctl_val | 3, ct->ctl_reg);
+
+ /* write cmd */
+ return ast_copy_to_ahb(ct->flash, &cmd, 1);
+}
+
+static void ast_sf_end_cmd(struct ast_sf_ctrl *ct)
+{
+ /* clear CE# */
+ ast_ahb_writel(ct->ctl_val | 7, ct->ctl_reg);
+
+ /* Switch back to read mode */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+}
+
+static int ast_sf_send_addr(struct ast_sf_ctrl *ct, uint32_t addr)
+{
+ const void *ap;
+ beint32_t tmp;
+
+ /* Layout address MSB first in memory */
+ tmp = cpu_to_be32(addr);
+
+ /* Send the right amount of bytes */
+ ap = (char *)&tmp;
+
+ if (ct->mode_4b)
+ return ast_copy_to_ahb(ct->flash, ap, 4);
+ else
+ return ast_copy_to_ahb(ct->flash, ap + 1, 3);
+}
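+
+/*
+ * For example, for address 0x00123456 the bytes 0x12 0x34 0x56 are
+ * sent (MSB first) in 3-byte mode, and 0x00 0x12 0x34 0x56 in 4-byte
+ * mode.
+ */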
+
+static int ast_sf_cmd_rd(struct spi_flash_ctrl *ctrl, uint8_t cmd,
+ bool has_addr, uint32_t addr, void *buffer,
+ uint32_t size)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+ int rc;
+
+ rc = ast_sf_start_cmd(ct, cmd);
+ if (rc)
+ goto bail;
+ if (has_addr) {
+ rc = ast_sf_send_addr(ct, addr);
+ if (rc)
+ goto bail;
+ }
+ if (buffer && size)
+ rc = ast_copy_from_ahb(buffer, ct->flash, size);
+ bail:
+ ast_sf_end_cmd(ct);
+ return rc;
+}
+
+static int ast_sf_cmd_wr(struct spi_flash_ctrl *ctrl, uint8_t cmd,
+ bool has_addr, uint32_t addr, const void *buffer,
+ uint32_t size)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+ int rc;
+
+ rc = ast_sf_start_cmd(ct, cmd);
+ if (rc)
+ goto bail;
+ if (has_addr) {
+ rc = ast_sf_send_addr(ct, addr);
+ if (rc)
+ goto bail;
+ }
+ if (buffer && size)
+ rc = ast_copy_to_ahb(ct->flash, buffer, size);
+ bail:
+ ast_sf_end_cmd(ct);
+ return rc;
+}
+
+static int ast_sf_set_4b(struct spi_flash_ctrl *ctrl, bool enable)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+ uint32_t ce_ctrl = 0;
+
+ if (ct->type == AST_SF_TYPE_BMC && ct->ops.finfo->size > 0x1000000)
+ ce_ctrl = ast_ahb_readl(BMC_SPI_FCTL_CE_CTRL);
+ else if (ct->type != AST_SF_TYPE_PNOR)
+ return enable ? FLASH_ERR_4B_NOT_SUPPORTED : 0;
+
+ /*
+ * We update the "old" value as well since when quitting
+ * we don't restore the mode of the flash itself so we need
+ * to leave the controller in a compatible setup
+ */
+ if (enable) {
+ ct->ctl_val |= 0x2000;
+ ct->ctl_read_val |= 0x2000;
+ ce_ctrl |= 0x1;
+ } else {
+ ct->ctl_val &= ~0x2000;
+ ct->ctl_read_val &= ~0x2000;
+ ce_ctrl &= ~0x1;
+ }
+ ct->mode_4b = enable;
+
+ /* Update read mode */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+
+ if (ce_ctrl && ct->type == AST_SF_TYPE_BMC)
+ ast_ahb_writel(ce_ctrl, BMC_SPI_FCTL_CE_CTRL);
+
+ return 0;
+}
+
+static int ast_sf_read(struct spi_flash_ctrl *ctrl, uint32_t pos,
+ void *buf, uint32_t len)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+
+ /*
+ * We are in read mode by default. We don't yet support fancy
+ * things like fast read or X2 mode
+ */
+ return ast_copy_from_ahb(buf, ct->flash + pos, len);
+}
+
+static void ast_get_ahb_freq(void)
+{
+ static const uint32_t cpu_freqs_24_48[] = {
+ 384000000,
+ 360000000,
+ 336000000,
+ 408000000
+ };
+ static const uint32_t cpu_freqs_25[] = {
+ 400000000,
+ 375000000,
+ 350000000,
+ 425000000
+ };
+ static const uint32_t ahb_div[] = { 1, 2, 4, 3 };
+ uint32_t strap, cpu_clk, div;
+
+ if (ast_ahb_freq)
+ return;
+
+ /* HW strapping gives us the CPU freq and AHB divisor */
+ strap = ast_ahb_readl(SCU_HW_STRAPPING);
+ if (strap & 0x00800000) {
+ FL_DBG("AST: CLKIN 25Mhz\n");
+ cpu_clk = cpu_freqs_25[(strap >> 8) & 3];
+ } else {
+ FL_DBG("AST: CLKIN 24/48Mhz\n");
+ cpu_clk = cpu_freqs_24_48[(strap >> 8) & 3];
+ }
+ FL_DBG("AST: CPU frequency: %d Mhz\n", cpu_clk / 1000000);
+ div = ahb_div[(strap >> 10) & 3];
+ ast_ahb_freq = cpu_clk / div;
+ FL_DBG("AST: AHB frequency: %d Mhz\n", ast_ahb_freq / 1000000);
+}
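+
+/*
+ * Example: a strap indicating a 24/48Mhz CLKIN with CPU frequency
+ * field 0 (384Mhz) and AHB divisor field 1 (divide by 2) gives
+ * ast_ahb_freq = 192Mhz; the HCLK/n dividers used below are applied
+ * to that value.
+ */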
+
+static int ast_sf_check_reads(struct ast_sf_ctrl *ct,
+ const uint8_t *golden_buf, uint8_t *test_buf)
+{
+ int i, rc;
+
+ for (i = 0; i < 10; i++) {
+ rc = ast_copy_from_ahb(test_buf, ct->flash, CALIBRATE_BUF_SIZE);
+ if (rc)
+ return rc;
+ if (memcmp(test_buf, golden_buf, CALIBRATE_BUF_SIZE) != 0)
+ return FLASH_ERR_VERIFY_FAILURE;
+ }
+ return 0;
+}
+
+static int ast_sf_calibrate_reads(struct ast_sf_ctrl *ct, uint32_t hdiv,
+ const uint8_t *golden_buf, uint8_t *test_buf)
+{
+ int i, rc;
+ int good_pass = -1, pass_count = 0;
+ uint32_t shift = (hdiv - 1) << 2;
+ uint32_t mask = ~(0xfu << shift);
+
+#define FREAD_TPASS(i) (((i) / 2) | (((i) & 1) ? 0 : 8))
+
+ /* Try HCLK delay 0..5, each one with/without delay and look for a
+ * good pair.
+ */
+ for (i = 0; i < 12; i++) {
+ bool pass;
+
+ ct->fread_timing_val &= mask;
+ ct->fread_timing_val |= FREAD_TPASS(i) << shift;
+ ast_ahb_writel(ct->fread_timing_val, ct->fread_timing_reg);
+ rc = ast_sf_check_reads(ct, golden_buf, test_buf);
+ if (rc && rc != FLASH_ERR_VERIFY_FAILURE)
+ return rc;
+ pass = (rc == 0);
+ FL_DBG(" * [%08x] %d HCLK delay, %dns DI delay : %s\n",
+ ct->fread_timing_val, i/2, (i & 1) ? 0 : 4, pass ? "PASS" : "FAIL");
+ if (pass) {
+ pass_count++;
+ if (pass_count == 3) {
+ good_pass = i - 1;
+ break;
+ }
+ } else
+ pass_count = 0;
+ }
+
+ /* No good setting for this frequency */
+ if (good_pass < 0)
+ return FLASH_ERR_VERIFY_FAILURE;
+
+ /* We have at least one pass of margin, let's use first pass */
+ ct->fread_timing_val &= mask;
+ ct->fread_timing_val |= FREAD_TPASS(good_pass) << shift;
+ ast_ahb_writel(ct->fread_timing_val, ct->fread_timing_reg);
+ FL_DBG("AST: * -> good is pass %d [0x%08x]\n",
+ good_pass, ct->fread_timing_val);
+ return 0;
+}
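+
+/*
+ * The FREAD_TPASS() encoding above packs both knobs into one nibble
+ * per HCLK divider: bits [2:0] hold the HCLK delay count (i / 2) and
+ * bit 3 appears to select the extra ~4ns input-data delay (set for
+ * even i, matching the debug output). So i = 0, 1, 2, 3 encode as
+ * 0x8, 0x0, 0x9, 0x1 and so on up to i = 11.
+ */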
+
+static bool ast_calib_data_usable(const uint8_t *test_buf, uint32_t size)
+{
+ const uint32_t *tb32 = (const uint32_t *)test_buf;
+ uint32_t i, cnt = 0;
+
+ /* We check if we have enough words that are neither all 0
+ * nor all 1's so the calibration can be considered valid.
+ *
+ * I use an arbitrary threshold of 64 for now
+ */
+ size >>= 2;
+ for (i = 0; i < size; i++) {
+ if (tb32[i] != 0 && tb32[i] != 0xffffffff)
+ cnt++;
+ }
+ return cnt >= 64;
+}
+
+static int ast_sf_optimize_reads(struct ast_sf_ctrl *ct,
+ struct flash_info *info __unused,
+ uint32_t max_freq)
+{
+ uint8_t *golden_buf, *test_buf;
+ int i, rc, best_div = -1;
+ uint32_t save_read_val = ct->ctl_read_val;
+
+ test_buf = malloc(CALIBRATE_BUF_SIZE * 2);
+ golden_buf = test_buf + CALIBRATE_BUF_SIZE;
+
+ /* We start with the dumbest setting and read some data */
+ ct->ctl_read_val = (ct->ctl_read_val & 0x2000) |
+ (0x00 << 28) | /* Single bit */
+ (0x00 << 24) | /* CE# max */
+ (0x03 << 16) | /* use normal reads */
+ (0x00 << 8) | /* HCLK/16 */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x00); /* normal read */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+
+ rc = ast_copy_from_ahb(golden_buf, ct->flash, CALIBRATE_BUF_SIZE);
+ if (rc) {
+ free(test_buf);
+ return rc;
+ }
+
+ /* Establish our read mode with freq field set to 0 */
+ ct->ctl_read_val = save_read_val & 0xfffff0ff;
+
+ /* Check if calibration data is suitable */
+ if (!ast_calib_data_usable(golden_buf, CALIBRATE_BUF_SIZE)) {
+ FL_INF("AST: Calibration area too uniform, "
+ "using low speed\n");
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+ free(test_buf);
+ return 0;
+ }
+
+ /* Now we iterate the HCLK dividers until we find our breaking point */
+ for (i = 5; i > 0; i--) {
+ uint32_t tv, freq;
+
+ /* Compare timing to max */
+ freq = ast_ahb_freq / i;
+ if (freq >= max_freq)
+ continue;
+
+ /* Set the timing */
+ tv = ct->ctl_read_val | (ast_ct_hclk_divs[i - 1] << 8);
+ ast_ahb_writel(tv, ct->ctl_reg);
+ FL_DBG("AST: Trying HCLK/%d...\n", i);
+ rc = ast_sf_calibrate_reads(ct, i, golden_buf, test_buf);
+
+ /* Some other error occurred, bail out */
+ if (rc && rc != FLASH_ERR_VERIFY_FAILURE) {
+ free(test_buf);
+ return rc;
+ }
+ if (rc == 0)
+ best_div = i;
+ }
+ free(test_buf);
+
+ /* Nothing found ? */
+ if (best_div < 0)
+ FL_ERR("AST: No good frequency, using dumb slow\n");
+ else {
+ FL_DBG("AST: Found good read timings at HCLK/%d\n", best_div);
+ ct->ctl_read_val |= (ast_ct_hclk_divs[best_div - 1] << 8);
+ }
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+
+ return 0;
+}
+
+static int ast_sf_get_hclk(uint32_t *ctl_val, uint32_t max_freq)
+{
+ int i;
+
+ /* It appears that running commands at HCLK/2 on some Micron
+ * chips results in occasional reads of bogus status (that,
+ * or unrelated chip hangs).
+ *
+ * Since we cannot calibrate properly the reads for commands,
+ * instead, let's limit our SPI frequency to HCLK/4 to stay
+ * on the safe side of things
+ */
+#define MIN_CMD_FREQ 4
+ for (i = MIN_CMD_FREQ; i <= 5; i++) {
+ uint32_t freq = ast_ahb_freq / i;
+ if (freq >= max_freq)
+ continue;
+ *ctl_val |= (ast_ct_hclk_divs[i - 1] << 8);
+ return i;
+ }
+ return 0;
+}
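+
+/*
+ * For example, with a 192Mhz AHB clock and a max_freq of 106000000
+ * the loop starts at HCLK/4 (48Mhz) because of MIN_CMD_FREQ, finds it
+ * below the limit, writes the matching ast_ct_hclk_divs[] value into
+ * bits [11:8] of *ctl_val and returns 4.
+ */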
+
+static int ast_sf_setup_macronix(struct ast_sf_ctrl *ct, struct flash_info *info)
+{
+ int rc, div __unused;
+ uint8_t srcr[2];
+
+ /*
+ * Those Macronix chips support dual reads at 104Mhz
+ * and dual IO at 84Mhz with 4 dummies.
+ *
+ * Our calibration algo should give us something along
+ * the lines of HCLK/3 (HCLK/2 seems to work sometimes
+ * but appears to be fairly unreliable) which is 64Mhz
+ *
+ * So we choose dual IO mode.
+ *
+ * The CE# inactive width for reads must be 7ns; we set it
+ * to 3T, which is about 15ns at the fastest speed we support
+ * (HCLK/2), as I've had issues with smaller values.
+ *
+ * For write and program it's 30ns so let's set the value
+ * for normal ops to 6T.
+ *
+ * Preserve the current 4b mode.
+ */
+ FL_DBG("AST: Setting up Macronix...\n");
+
+ /*
+ * Read the status and config registers
+ */
+ rc = ast_sf_cmd_rd(&ct->ops, CMD_RDSR, false, 0, &srcr[0], 1);
+ if (rc != 0) {
+ FL_ERR("AST: Failed to read status\n");
+ return rc;
+ }
+ rc = ast_sf_cmd_rd(&ct->ops, CMD_RDCR, false, 0, &srcr[1], 1);
+ if (rc != 0) {
+ FL_ERR("AST: Failed to read configuration\n");
+ return rc;
+ }
+
+ FL_DBG("AST: Macronix SR:CR: 0x%02x:%02x\n", srcr[0], srcr[1]);
+
+ /* Switch to 8 dummy cycles to enable 104Mhz operations */
+ srcr[1] = (srcr[1] & 0x3f) | 0x80;
+
+ rc = fl_wren(&ct->ops);
+ if (rc) {
+ FL_ERR("AST: Failed to WREN for Macronix config\n");
+ return rc;
+ }
+
+ rc = ast_sf_cmd_wr(&ct->ops, CMD_WRSR, false, 0, srcr, 2);
+ if (rc != 0) {
+ FL_ERR("AST: Failed to write Macronix config\n");
+ return rc;
+ }
+ rc = fl_sync_wait_idle(&ct->ops);
+ if (rc != 0) {
+ FL_ERR("AST: Failed waiting for config write\n");
+ return rc;
+ }
+
+ FL_DBG("AST: Macronix SR:CR: 0x%02x:%02x\n", srcr[0], srcr[1]);
+
+ /* Use 2READ */
+ ct->ctl_read_val = (ct->ctl_read_val & 0x2000) |
+ (0x03 << 28) | /* Dual IO */
+ (0x0d << 24) | /* CE# width 3T */
+ (0xbb << 16) | /* 2READ command */
+ (0x00 << 8) | /* HCLK/16 (optimize later) */
+ (0x02 << 6) | /* 2 bytes dummy cycle (8 clocks) */
+ (0x01); /* fast read */
+
+ /* Configure SPI flash read timing */
+ rc = ast_sf_optimize_reads(ct, info, 104000000);
+ if (rc) {
+ FL_ERR("AST: Failed to setup proper read timings, rc=%d\n", rc);
+ return rc;
+ }
+
+ /*
+ * For other commands and writes also increase the SPI clock
+ * to HCLK/2 since the chip supports up to 133Mhz and set
+ * CE# inactive to 6T. We request a timing that is 20% below
+ * the limit of the chip, so about 106Mhz which should fit.
+ */
+ ct->ctl_val = (ct->ctl_val & 0x2000) |
+ (0x00 << 28) | /* Single bit */
+ (0x0a << 24) | /* CE# width 6T (b1010) */
+ (0x00 << 16) | /* no command */
+ (0x00 << 8) | /* HCLK/16 (done later) */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x00); /* normal read */
+
+ div = ast_sf_get_hclk(&ct->ctl_val, 106000000);
+ FL_DBG("AST: Command timing set to HCLK/%d\n", div);
+
+ /* Update chip with current read config */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+ return 0;
+}
+
+static int ast_sf_setup_winbond(struct ast_sf_ctrl *ct, struct flash_info *info)
+{
+ int rc, div __unused;
+
+ FL_DBG("AST: Setting up Winbond...\n");
+
+ /*
+ * This Winbond chip supports dual reads at 104Mhz
+ * with 8 dummy cycles.
+ *
+ * The CE# inactive width for reads must be 10ns, we set it
+ * to 3T which is about 15.6ns.
+ */
+ ct->ctl_read_val = (ct->ctl_read_val & 0x2000) |
+ (0x02 << 28) | /* Dual bit data only */
+ (0x0e << 24) | /* CE# width 2T (b1110) */
+ (0x3b << 16) | /* DREAD command */
+ (0x00 << 8) | /* HCLK/16 */
+ (0x01 << 6) | /* 1-byte dummy cycle */
+ (0x01); /* fast read */
+
+ /* Configure SPI flash read timing */
+ rc = ast_sf_optimize_reads(ct, info, 104000000);
+ if (rc) {
+ FL_ERR("AST: Failed to setup proper read timings, rc=%d\n", rc);
+ return rc;
+ }
+
+ /*
+ * For other commands and writes also increase the SPI clock
+ * to HCLK/2 since the chip supports up to 133Mhz. CE# inactive
+ * for write and erase is 50ns so let's set it to 10T.
+ */
+ ct->ctl_val = (ct->ctl_read_val & 0x2000) |
+ (0x00 << 28) | /* Single bit */
+ (0x06 << 24) | /* CE# width 10T (b0110) */
+ (0x00 << 16) | /* no command */
+ (0x00 << 8) | /* HCLK/16 */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x01); /* fast read */
+
+ div = ast_sf_get_hclk(&ct->ctl_val, 106000000);
+ FL_DBG("AST: Command timing set to HCLK/%d\n", div);
+
+ /* Update chip with current read config */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+ return 0;
+}
+
+static int ast_sf_setup_micron(struct ast_sf_ctrl *ct, struct flash_info *info)
+{
+ uint8_t vconf, ext_id[6];
+ int rc, div __unused;
+
+ FL_DBG("AST: Setting up Micron...\n");
+
+ /*
+ * Read the extended chip ID to try to detect old vs. new
+ * flashes since old Micron flashes have a lot of issues
+ */
+ rc = ast_sf_cmd_rd(&ct->ops, CMD_RDID, false, 0, ext_id, 6);
+ if (rc != 0) {
+ FL_ERR("AST: Failed to read Micron ext ID, sticking to dumb speed\n");
+ return 0;
+ }
+ /* Check ID matches expectations */
+ if (ext_id[0] != ((info->id >> 16) & 0xff) ||
+ ext_id[1] != ((info->id >> 8) & 0xff) ||
+ ext_id[2] != ((info->id ) & 0xff)) {
+ FL_ERR("AST: Micron ext ID mismatch, sticking to dumb speed\n");
+ return 0;
+ }
+ FL_DBG("AST: Micron ext ID byte: 0x%02x\n", ext_id[4]);
+
+ /* Check for old (<45nm) chips, don't try to be fancy on those */
+ if (!(ext_id[4] & 0x40)) {
+ FL_DBG("AST: Old chip, using dumb timings\n");
+ goto dumb;
+ }
+
+ /*
+ * Read the micron specific volatile configuration reg
+ */
+ rc = ast_sf_cmd_rd(&ct->ops, CMD_MIC_RDVCONF, false, 0, &vconf, 1);
+ if (rc != 0) {
+ FL_ERR("AST: Failed to read Micron vconf, sticking to dumb speed\n");
+ goto dumb;
+ }
+ FL_DBG("AST: Micron VCONF: 0x%02x\n", vconf);
+
+ /* Switch to 8 dummy cycles (we might be able to operate with 4
+ * but let's keep some margin)
+ */
+ vconf = (vconf & 0x0f) | 0x80;
+
+ rc = ast_sf_cmd_wr(&ct->ops, CMD_MIC_WRVCONF, false, 0, &vconf, 1);
+ if (rc != 0) {
+ FL_ERR("AST: Failed to write Micron vconf, "
+ " sticking to dumb speed\n");
+ goto dumb;
+ }
+ rc = fl_sync_wait_idle(&ct->ops);
+ if (rc != 0) {
+ FL_ERR("AST: Failed waiting for config write\n");
+ return rc;
+ }
+ FL_DBG("AST: Updated to : 0x%02x\n", vconf);
+
+ /*
+ * Try to do full dual IO, with 8 dummy cycles it supports 133Mhz
+ *
+ * The CE# inactive width for reads must be 20ns, we set it
+ * to 4T which is about 20.8ns.
+ */
+ ct->ctl_read_val = (ct->ctl_read_val & 0x2000) |
+ (0x03 << 28) | /* Dual IO */
+ (0x0c << 24) | /* CE# 4T */
+ (0xbb << 16) | /* 2READ command */
+ (0x00 << 8) | /* HCLK/16 (optimize later) */
+ (0x02 << 6) | /* 8 dummy cycles (2 bytes) */
+ (0x01); /* fast read */
+
+ /* Configure SPI flash read timing */
+ rc = ast_sf_optimize_reads(ct, info, 133000000);
+ if (rc) {
+ FL_ERR("AST: Failed to setup proper read timings, rc=%d\n", rc);
+ return rc;
+ }
+
+ /*
+ * For other commands and writes also increase the SPI clock
+ * to HCLK/2 since the chip supports up to 133Mhz. CE# inactive
+ * for write and erase is 50ns so let's set it to 10T.
+ */
+ ct->ctl_val = (ct->ctl_read_val & 0x2000) |
+ (0x00 << 28) | /* Single bit */
+ (0x06 << 24) | /* CE# width 10T (b0110) */
+ (0x00 << 16) | /* no command */
+ (0x00 << 8) | /* HCLK/16 */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x00); /* norm read */
+
+ div = ast_sf_get_hclk(&ct->ctl_val, 133000000);
+ FL_DBG("AST: Command timing set to HCLK/%d\n", div);
+
+ /* Update chip with current read config */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+
+ return 0;
+
+ dumb:
+ ct->ctl_val = ct->ctl_read_val = (ct->ctl_read_val & 0x2000) |
+ (0x00 << 28) | /* Single bit */
+ (0x00 << 24) | /* CE# max */
+ (0x03 << 16) | /* use normal reads */
+ (0x06 << 8) | /* HCLK/4 */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x00); /* normal read */
+
+ /* Update chip with current read config */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+
+ return 0;
+}
+
+static int ast_sf_setup(struct spi_flash_ctrl *ctrl, uint32_t *tsize)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+ struct flash_info *info = ctrl->finfo;
+
+ (void)tsize;
+
+ /*
+ * Configure better timings and read mode for known
+ * flash chips
+ */
+ switch(info->id) {
+ case 0xc22018: /* MX25L12835F */
+ case 0xc22019: /* MX25L25635F */
+ case 0xc2201a: /* MX66L51235F */
+ case 0xc2201b: /* MX66L1G45G */
+ return ast_sf_setup_macronix(ct, info);
+ case 0xef4018: /* W25Q128BV */
+ return ast_sf_setup_winbond(ct, info);
+ case 0x20ba20: /* MT25Qx512xx */
+ return ast_sf_setup_micron(ct, info);
+ }
+ /* No special tuning */
+ return 0;
+}
+
+static bool ast_sf_init_pnor(struct ast_sf_ctrl *ct)
+{
+ uint32_t reg;
+
+ ct->ctl_reg = PNOR_SPI_FCTL_CTRL;
+ ct->fread_timing_reg = PNOR_SPI_FREAD_TIMING;
+ ct->flash = PNOR_FLASH_BASE;
+
+ /* Enable writing to the controller */
+ reg = ast_ahb_readl(PNOR_SPI_FCTL_CONF);
+ if (reg == 0xffffffff) {
+ FL_ERR("AST_SF: Failed read from controller config\n");
+ return false;
+ }
+ ast_ahb_writel(reg | 1, PNOR_SPI_FCTL_CONF);
+
+ /*
+ * Snapshot control reg and sanitize it for our
+ * use, switching to 1-bit mode, clearing user
+ * mode if set, etc...
+ *
+ * Also configure SPI clock to something safe
+ * like HCLK/8 (24Mhz)
+ */
+ ct->ctl_val = ast_ahb_readl(ct->ctl_reg);
+ if (ct->ctl_val == 0xffffffff) {
+ FL_ERR("AST_SF: Failed read from controller control\n");
+ return false;
+ }
+
+ ct->ctl_val = (ct->ctl_val & 0x2000) |
+ (0x00 << 28) | /* Single bit */
+ (0x00 << 24) | /* CE# width 16T */
+ (0x00 << 16) | /* no command */
+ (0x04 << 8) | /* HCLK/8 */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x00); /* normal read */
+
+ /* Initial read mode is default */
+ ct->ctl_read_val = ct->ctl_val;
+
+ /* Initial read timings all 0 */
+ ct->fread_timing_val = 0;
+
+ /* Configure for read */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+ ast_ahb_writel(ct->fread_timing_val, ct->fread_timing_reg);
+
+ if (ct->ctl_val & 0x2000)
+ ct->mode_4b = true;
+ else
+ ct->mode_4b = false;
+
+ return true;
+}
+
+static bool ast_sf_init_bmc(struct ast_sf_ctrl *ct)
+{
+ ct->ctl_reg = BMC_SPI_FCTL_CTRL;
+ ct->fread_timing_reg = BMC_SPI_FREAD_TIMING;
+ ct->flash = BMC_FLASH_BASE;
+
+ /*
+ * Snapshot control reg and sanitize it for our
+ * use, switching to 1-bit mode, clearing user
+ * mode if set, etc...
+ *
+ * Also configure SPI clock to something safe
+ * like HCLK/8 (24Mhz)
+ */
+ ct->ctl_val =
+ (0x00 << 28) | /* Single bit */
+ (0x00 << 24) | /* CE# width 16T */
+ (0x00 << 16) | /* no command */
+ (0x04 << 8) | /* HCLK/8 */
+ (0x00 << 6) | /* no dummy cycle */
+ (0x00); /* normal read */
+
+ /* Initial read mode is default */
+ ct->ctl_read_val = ct->ctl_val;
+
+ /* Initial read timings all 0 */
+ ct->fread_timing_val = 0;
+
+ /* Configure for read */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+ ast_ahb_writel(ct->fread_timing_val, ct->fread_timing_reg);
+
+ ct->mode_4b = false;
+
+ return true;
+}
+
+static int ast_mem_set4b(struct spi_flash_ctrl *ctrl __unused,
+ bool enable __unused)
+{
+ return 0;
+}
+
+static int ast_mem_setup(struct spi_flash_ctrl *ctrl __unused,
+ uint32_t *tsize __unused)
+{
+ return 0;
+}
+
+static int ast_mem_chipid(struct spi_flash_ctrl *ctrl __unused, uint8_t *id_buf,
+ uint32_t *id_size)
+{
+ if (*id_size < 3)
+ return -1;
+
+ id_buf[0] = 0xaa;
+ id_buf[1] = 0x55;
+ id_buf[2] = 0xaa;
+ *id_size = 3;
+ return 0;
+}
+
+static int ast_mem_write(struct spi_flash_ctrl *ctrl, uint32_t pos,
+ const void *buf, uint32_t len)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+
+ /*
+ * This only works when the ahb is pointed at system memory.
+ */
+ return ast_copy_to_ahb(ct->flash + pos, buf, len);
+}
+
+static int ast_mem_erase(struct spi_flash_ctrl *ctrl, uint32_t addr, uint32_t size)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+ uint32_t pos, len, end = addr + size;
+ uint64_t zero = 0;
+ int ret;
+
+ for (pos = addr; pos < end; pos += sizeof(zero)) {
+ if (pos + sizeof(zero) > end)
+ len = end - pos;
+ else
+ len = sizeof(zero);
+
+ ret = ast_copy_to_ahb(ct->flash + pos, &zero, len);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int ast_sf_open(uint8_t type, struct spi_flash_ctrl **ctrl)
+{
+ struct ast_sf_ctrl *ct;
+#ifdef __SKIBOOT__
+ uint32_t hicr7;
+
+ if (!ast_sio_is_enabled())
+ return -ENODEV;
+#endif /* __SKIBOOT__ */
+
+ if (type != AST_SF_TYPE_PNOR && type != AST_SF_TYPE_BMC
+ && type != AST_SF_TYPE_MEM)
+ return -EINVAL;
+
+ *ctrl = NULL;
+ ct = malloc(sizeof(*ct));
+ if (!ct) {
+ FL_ERR("AST_SF: Failed to allocate\n");
+ return -ENOMEM;
+ }
+ memset(ct, 0, sizeof(*ct));
+ ct->type = type;
+
+ if (type == AST_SF_TYPE_MEM) {
+ ct->ops.cmd_wr = NULL;
+ ct->ops.cmd_rd = NULL;
+ ct->ops.read = ast_sf_read;
+ ct->ops.set_4b = ast_mem_set4b;
+ ct->ops.write = ast_mem_write;
+ ct->ops.erase = ast_mem_erase;
+ ct->ops.setup = ast_mem_setup;
+ ct->ops.chip_id = ast_mem_chipid;
+ ct->flash = PNOR_FLASH_BASE;
+ } else {
+ ct->ops.cmd_wr = ast_sf_cmd_wr;
+ ct->ops.cmd_rd = ast_sf_cmd_rd;
+ ct->ops.set_4b = ast_sf_set_4b;
+ ct->ops.read = ast_sf_read;
+ ct->ops.setup = ast_sf_setup;
+ }
+
+ ast_get_ahb_freq();
+
+ if (type == AST_SF_TYPE_PNOR) {
+ if (!ast_sf_init_pnor(ct))
+ goto fail;
+ } else if (type == AST_SF_TYPE_BMC) {
+ if (!ast_sf_init_bmc(ct))
+ goto fail;
+ }
+
+#ifdef __SKIBOOT__
+ /* Read the configuration of the LPC->AHB bridge for PNOR
+ * to extract the PNOR LPC offset which can be different
+ * depending on flash size
+ */
+ hicr7 = ast_ahb_readl(LPC_HICR7);
+ pnor_lpc_offset = (hicr7 & 0xffffu) << 16;
+ prlog(PR_DEBUG, "AST: PNOR LPC offset: 0x%08x\n", pnor_lpc_offset);
+#endif /* __SKIBOOT__ */
+
+ *ctrl = &ct->ops;
+
+ return 0;
+ fail:
+ free(ct);
+ return -EIO;
+}
+
+void ast_sf_close(struct spi_flash_ctrl *ctrl)
+{
+ struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops);
+
+ /* Restore control reg to read */
+ ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg);
+
+ /* Additional cleanup */
+ if (ct->type == AST_SF_TYPE_PNOR) {
+ uint32_t reg = ast_ahb_readl(PNOR_SPI_FCTL_CONF);
+ if (reg != 0xffffffff)
+ ast_ahb_writel(reg & ~1, PNOR_SPI_FCTL_CONF);
+ }
+
+ /* Free the whole lot */
+ free(ct);
+}
diff --git a/roms/skiboot/hw/bt.c b/roms/skiboot/hw/bt.c
new file mode 100644
index 000000000..5016feab6
--- /dev/null
+++ b/roms/skiboot/hw/bt.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Block Transfer, typically what IPMI goes over
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "BT: " fmt
+
+#include <skiboot.h>
+#include <lpc.h>
+#include <lock.h>
+#include <device.h>
+#include <timebase.h>
+#include <ipmi.h>
+#include <bt.h>
+#include <timer.h>
+#include <ipmi.h>
+#include <timebase.h>
+#include <chip.h>
+#include <interrupts.h>
+
+/* BT registers */
+#define BT_CTRL 0
+#define BT_CTRL_B_BUSY 0x80
+#define BT_CTRL_H_BUSY 0x40
+#define BT_CTRL_OEM0 0x20
+#define BT_CTRL_SMS_ATN 0x10
+#define BT_CTRL_B2H_ATN 0x08
+#define BT_CTRL_H2B_ATN 0x04
+#define BT_CTRL_CLR_RD_PTR 0x02
+#define BT_CTRL_CLR_WR_PTR 0x01
+#define BT_HOST2BMC 1
+#define BT_INTMASK 2
+#define BT_INTMASK_B2H_IRQEN 0x01
+#define BT_INTMASK_B2H_IRQ 0x02
+#define BT_INTMASK_BMC_HWRST 0x80
+
+/* Maximum size of the HW FIFO */
+#define BT_FIFO_LEN 64
+
+/* Default poll interval before interrupts are working */
+#define BT_DEFAULT_POLL_MS 200
+
+/*
+ * Minimum size of an IPMI request/response including
+ * mandatory headers.
+ */
+#define BT_MIN_REQ_LEN 3
+#define BT_MIN_RESP_LEN 4
+
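+/*
+ * On-the-wire framing used by bt_send_msg() and bt_get_resp() below:
+ * requests are [length] [netfn/lun] [seq] [cmd] [data...] and
+ * responses are [length] [netfn/lun] [seq] [cmd] [cc] [data...],
+ * where the length byte counts everything that follows it, hence the
+ * BT_MIN_REQ_LEN/BT_MIN_RESP_LEN adjustments above.
+ */
+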
+/* How long (in uS) to poll for new ipmi data. */
+#define POLL_TIMEOUT 10000
+
+/* Maximum number of outstanding messages to allow in the queue. */
+#define BT_MAX_QUEUE_LEN 10
+
+/* How long (in seconds) before a message is timed out. */
+#define BT_MSG_TIMEOUT 3
+
+/* Maximum number of times to attempt sending a message before giving up. */
+#define BT_MAX_RETRIES 1
+
+/* Macro to enable printing BT message queue for debug */
+#define BT_QUEUE_DEBUG 0
+
+/* BT message logging macros */
+#define _BT_Q_LOG(level, msg, fmt, args...) \
+ do { if (msg) \
+ prlog(level, "seq 0x%02x netfn 0x%02x cmd 0x%02x: " fmt "\n", \
+ (msg)->seq, ((msg)->ipmi_msg.netfn >> 2), (msg)->ipmi_msg.cmd, ##args); \
+ else \
+ prlog(level, "seq 0x?? netfn 0x?? cmd 0x??: " fmt "\n", ##args); \
+ } while (0)
+
+#define BT_Q_ERR(msg, fmt, args...) \
+ _BT_Q_LOG(PR_ERR, msg, fmt, ##args)
+
+#define BT_Q_DBG(msg, fmt, args...) \
+ _BT_Q_LOG(PR_DEBUG, msg, fmt, ##args)
+
+#define BT_Q_TRACE(msg, fmt, args...) \
+ _BT_Q_LOG(PR_TRACE, msg, fmt, ##args)
+
+struct bt_msg {
+ struct list_node link;
+ unsigned long tb;
+ uint8_t seq;
+ uint8_t send_count;
+ bool disable_retry;
+ struct ipmi_msg ipmi_msg;
+};
+
+struct bt_caps {
+ uint8_t num_requests;
+ uint16_t input_buf_len;
+ uint16_t output_buf_len;
+ uint8_t msg_timeout;
+ uint8_t max_retries;
+};
+
+struct bt {
+ uint32_t base_addr;
+ struct lock lock;
+ struct list_head msgq;
+ struct list_head msgq_sync; /* separate list for synchronous messages */
+ struct timer poller;
+ bool irq_ok;
+ int queue_len;
+ struct bt_caps caps;
+};
+
+static struct bt bt;
+static struct bt_msg *inflight_bt_msg; /* Holds in flight message */
+
+static int ipmi_seq;
+
+static inline uint8_t bt_inb(uint32_t reg)
+{
+ return lpc_inb(bt.base_addr + reg);
+}
+
+static inline void bt_outb(uint8_t data, uint32_t reg)
+{
+ lpc_outb(data, bt.base_addr + reg);
+}
+
+static inline void bt_set_h_busy(bool value)
+{
+ uint8_t rval;
+
+ rval = bt_inb(BT_CTRL);
+ if (value != !!(rval & BT_CTRL_H_BUSY))
+ bt_outb(BT_CTRL_H_BUSY, BT_CTRL);
+}
+
+static inline void bt_assert_h_busy(void)
+{
+ uint8_t rval;
+ rval = bt_inb(BT_CTRL);
+ assert(rval & BT_CTRL_H_BUSY);
+}
+
+static void get_bt_caps_complete(struct ipmi_msg *msg)
+{
+ /* Ignore errors, we'll fall back to using the defaults, no big deal */
+ if (msg->data[0] == 0) {
+ prlog(PR_DEBUG, "Got illegal BMC BT capability\n");
+ goto out;
+ }
+
+ if (msg->data[1] != BT_FIFO_LEN) {
+ prlog(PR_DEBUG, "Got an input buffer len (%u) cap which differs from the default\n",
+ msg->data[1]);
+ }
+
+ if (msg->data[2] != BT_FIFO_LEN) {
+ prlog(PR_DEBUG, "Got an output buffer len (%u) cap which differs from the default\n",
+ msg->data[2]);
+ }
+
+ /*
+ * The IPMI spec says that the values for buffer sizes are:
+ * "the largest value allowed in first byte"
+ * Therefore we want to add one to what we get
+ */
+ bt.caps.num_requests = msg->data[0];
+ bt.caps.input_buf_len = msg->data[1] + 1;
+ bt.caps.output_buf_len = msg->data[2] + 1;
+ bt.caps.msg_timeout = msg->data[3];
+ bt.caps.max_retries = msg->data[4];
+ prlog(PR_DEBUG, "BMC BT capabilities received:\n");
+ prlog(PR_DEBUG, "buffer sizes: %d input %d output\n",
+ bt.caps.input_buf_len, bt.caps.output_buf_len);
+ prlog(PR_DEBUG, "number of requests: %d\n", bt.caps.num_requests);
+ prlog(PR_DEBUG, "msg timeout: %d max retries: %d\n",
+ bt.caps.msg_timeout, bt.caps.max_retries);
+
+out:
+ ipmi_free_msg(msg);
+}
+
+static void get_bt_caps(void)
+{
+
+ struct ipmi_msg *bmc_caps;
+ /*
+ * We haven't sent a message yet, so now is a good time to ask the BMC for its
+ * capabilities.
+ */
+ bmc_caps = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_GET_BT_CAPS,
+ get_bt_caps_complete, NULL, NULL, 0, sizeof(struct bt_caps));
+ if (!bmc_caps)
+ prerror("Couldn't create BMC BT capabilities msg\n");
+
+ if (bmc_caps && ipmi_queue_msg(bmc_caps))
+ prerror("Couldn't enqueue request for BMC BT capabilities\n");
+
+ /* Ignore errors, we'll fall back to using the defaults, no big deal */
+}
+
+static inline bool bt_idle(void)
+{
+ uint8_t bt_ctrl = bt_inb(BT_CTRL);
+
+ return !(bt_ctrl & BT_CTRL_B_BUSY) && !(bt_ctrl & BT_CTRL_H2B_ATN);
+}
+
+/* Must be called with bt.lock held */
+static void bt_msg_del(struct bt_msg *bt_msg)
+{
+ list_del(&bt_msg->link);
+ bt.queue_len--;
+
+ /* Once inflight_bt_msg is removed from the list, clear it */
+ if (bt_msg == inflight_bt_msg)
+ inflight_bt_msg = NULL;
+
+ unlock(&bt.lock);
+ ipmi_cmd_done(bt_msg->ipmi_msg.cmd,
+ IPMI_NETFN_RETURN_CODE(bt_msg->ipmi_msg.netfn),
+ IPMI_TIMEOUT_ERR, &bt_msg->ipmi_msg);
+ lock(&bt.lock);
+}
+
+static void bt_init_interface(void)
+{
+ /* Clear interrupt condition & enable irq */
+ bt_outb(BT_INTMASK_B2H_IRQ | BT_INTMASK_B2H_IRQEN, BT_INTMASK);
+
+ /* Take care of a stale H_BUSY if any */
+ bt_set_h_busy(false);
+}
+
+static void bt_reset_interface(void)
+{
+ bt_outb(BT_INTMASK_BMC_HWRST, BT_INTMASK);
+ bt_init_interface();
+}
+
+/*
+ * Try to send a message from the message queue. The caller must
+ * hold bt.lock and ensure the message queue is not empty.
+ */
+static void bt_send_msg(struct bt_msg *bt_msg)
+{
+ int i;
+ struct ipmi_msg *ipmi_msg;
+
+ ipmi_msg = &bt_msg->ipmi_msg;
+
+ /* Send the message */
+ bt_outb(BT_CTRL_CLR_WR_PTR, BT_CTRL);
+
+ /* Byte 1 - Length */
+ bt_outb(ipmi_msg->req_size + BT_MIN_REQ_LEN, BT_HOST2BMC);
+
+ /* Byte 2 - NetFn/LUN */
+ bt_outb(ipmi_msg->netfn, BT_HOST2BMC);
+
+ /* Byte 3 - Seq */
+ bt_outb(bt_msg->seq, BT_HOST2BMC);
+
+ /* Byte 4 - Cmd */
+ bt_outb(ipmi_msg->cmd, BT_HOST2BMC);
+
+ /* Byte 5:N - Data */
+ for (i = 0; i < ipmi_msg->req_size; i++)
+ bt_outb(ipmi_msg->data[i], BT_HOST2BMC);
+
+ BT_Q_TRACE(bt_msg, "Message sent to host");
+ bt_msg->send_count++;
+
+ bt_outb(BT_CTRL_H2B_ATN, BT_CTRL);
+
+ return;
+}
+
+static void bt_clear_fifo(void)
+{
+ int i;
+
+ for (i = 0; i < bt.caps.input_buf_len; i++)
+ bt_outb(0xff, BT_HOST2BMC);
+}
+
+static void bt_flush_msg(void)
+{
+ bt_assert_h_busy();
+ bt_outb(BT_CTRL_B2H_ATN | BT_CTRL_CLR_RD_PTR | BT_CTRL_CLR_WR_PTR, BT_CTRL);
+ bt_clear_fifo();
+ /* Can't hurt to clear the write pointer again, just to be sure */
+ bt_outb(BT_CTRL_CLR_WR_PTR, BT_CTRL);
+ bt_set_h_busy(false);
+}
+
+static void bt_get_resp(void)
+{
+ int i;
+ struct ipmi_msg *ipmi_msg;
+ uint8_t resp_len, netfn, seq, cmd;
+ uint8_t cc = IPMI_CC_NO_ERROR;
+
+ /* Indicate to the BMC that we are busy */
+ bt_set_h_busy(true);
+
+ /* Clear B2H_ATN and read pointer */
+ bt_outb(BT_CTRL_B2H_ATN, BT_CTRL);
+ bt_outb(BT_CTRL_CLR_RD_PTR, BT_CTRL);
+
+ /* Read the response */
+ /* Byte 1 - Length (includes header size) */
+ resp_len = bt_inb(BT_HOST2BMC) - BT_MIN_RESP_LEN;
+
+ /* Byte 2 - NetFn/LUN */
+ netfn = bt_inb(BT_HOST2BMC);
+
+ /* Byte 3 - Seq */
+ seq = bt_inb(BT_HOST2BMC);
+
+ /* Byte 4 - Cmd */
+ cmd = bt_inb(BT_HOST2BMC);
+
+ /* Byte 5 - Completion Code */
+ cc = bt_inb(BT_HOST2BMC);
+
+ /* Find the corresponding message */
+ if (inflight_bt_msg == NULL || inflight_bt_msg->seq != seq) {
+ /* A response to a message we no longer care about. */
+ prlog(PR_INFO, "Nobody cared about a response to a BT/IPMI message "
+ "(seq 0x%02x netfn 0x%02x cmd 0x%02x)\n", seq, (netfn >> 2), cmd);
+ bt_flush_msg();
+ return;
+ }
+
+ ipmi_msg = &inflight_bt_msg->ipmi_msg;
+
+ /*
+ * Make sure we have enough room to store the response. As all values
+ * are unsigned we will also trigger this error if
+ * bt_inb(BT_HOST2BMC) < BT_MIN_RESP_LEN (which should never occur).
+ */
+ if (resp_len > ipmi_msg->resp_size) {
+ BT_Q_ERR(inflight_bt_msg, "Invalid resp_len %d", resp_len);
+ resp_len = ipmi_msg->resp_size;
+ cc = IPMI_ERR_MSG_TRUNCATED;
+ }
+ ipmi_msg->resp_size = resp_len;
+
+ /* Byte 6:N - Data */
+ for (i = 0; i < resp_len; i++)
+ ipmi_msg->data[i] = bt_inb(BT_HOST2BMC);
+ bt_set_h_busy(false);
+
+ BT_Q_TRACE(inflight_bt_msg, "IPMI MSG done");
+
+ list_del(&inflight_bt_msg->link);
+ /* Ready to send next message */
+ inflight_bt_msg = NULL;
+ bt.queue_len--;
+ unlock(&bt.lock);
+
+ /* Call IPMI layer to finish processing the message. */
+ ipmi_cmd_done(cmd, netfn, cc, ipmi_msg);
+ lock(&bt.lock);
+
+ return;
+}
+
+static void bt_expire_old_msg(uint64_t tb)
+{
+ struct bt_msg *bt_msg = inflight_bt_msg;
+
+ if (bt_msg && bt_msg->tb > 0 && !chip_quirk(QUIRK_SIMICS) &&
+ (tb_compare(tb, bt_msg->tb +
+ secs_to_tb(bt.caps.msg_timeout)) == TB_AAFTERB)) {
+ if (bt_msg->send_count <= bt.caps.max_retries &&
+ !bt_msg->disable_retry) {
+ /* A message timeout is usually due to the BMC
+ * clearing the H2B_ATN flag without actually
+ * doing anything. The data will still be in the
+ * FIFO so just reset the flag. */
+ BT_Q_ERR(bt_msg, "Retry sending message");
+
+ /* This means we have started message timeout, but not
+ * yet sent message to BMC as driver was not free to
+ * send message. Lets resend message.
+ */
+ if (bt_msg->send_count == 0)
+ bt_send_msg(bt_msg);
+ else
+ bt_outb(BT_CTRL_H2B_ATN, BT_CTRL);
+
+ bt_msg->send_count++;
+ bt_msg->tb = tb;
+ } else {
+ BT_Q_ERR(bt_msg, "Timeout sending message");
+ bt_msg_del(bt_msg);
+
+ /*
+ * Timing out a message is inherently racy as the BMC
+ * may start writing just as we decide to kill the
+ * message. Hopefully resetting the interface is
+ * sufficient to guard against such things.
+ */
+ bt_reset_interface();
+ }
+ }
+}
+
+#if BT_QUEUE_DEBUG
+static void print_debug_queue_info(void)
+{
+ struct bt_msg *msg;
+ static bool printed;
+
+ if (!list_empty(&bt.msgq_sync) || !list_empty(&bt.msgq)) {
+ printed = false;
+ prlog(PR_DEBUG, "-------- BT Sync Msg Queue -------\n");
+ list_for_each(&bt.msgq_sync, msg, link) {
+ BT_Q_DBG(msg, "[ sent %d ]", msg->send_count);
+ }
+ prlog(PR_DEBUG, "---------- BT Msg Queue ----------\n");
+ list_for_each(&bt.msgq, msg, link) {
+ BT_Q_DBG(msg, "[ sent %d ]", msg->send_count);
+ }
+ prlog(PR_DEBUG, "----------------------------------\n");
+ } else if (!printed) {
+ printed = true;
+ prlog(PR_DEBUG, "------- BT Msg Queue Empty -------\n");
+ }
+}
+#endif
+
+static void bt_send_and_unlock(void)
+{
+ /* Busy? */
+ if (inflight_bt_msg)
+ goto out_unlock;
+
+ if (!lpc_ok())
+ goto out_unlock;
+
+ /* Synchronous messages get priority over normal messages */
+ if (!list_empty(&bt.msgq_sync))
+ inflight_bt_msg = list_top(&bt.msgq_sync, struct bt_msg, link);
+ else if (!list_empty(&bt.msgq))
+ inflight_bt_msg = list_top(&bt.msgq, struct bt_msg, link);
+ else
+ goto out_unlock;
+
+ assert(inflight_bt_msg);
+ /*
+ * Start the message timeout once it gets to the top
+ * of the queue. This will ensure we timeout messages
+ * in the case of a broken bt interface as occurs when
+ * the BMC is not responding to any IPMI messages.
+ */
+ if (inflight_bt_msg->tb == 0)
+ inflight_bt_msg->tb = mftb();
+
+ /*
+ * Only send it if we haven't already.
+ * Timeouts and retries happen in bt_expire_old_msg()
+ * called from bt_poll()
+ */
+ if (bt_idle() && inflight_bt_msg->send_count == 0)
+ bt_send_msg(inflight_bt_msg);
+
+out_unlock:
+ unlock(&bt.lock);
+}
+
+static void bt_poll(struct timer *t __unused, void *data __unused,
+ uint64_t now)
+{
+ uint8_t bt_ctrl;
+
+ /* Don't do anything if the LPC bus is offline */
+ if (!lpc_ok())
+ return;
+
+ /*
+	 * Take the lock; it is dropped again by bt_send_and_unlock()
+	 * at the end of this function.
+ */
+ lock(&bt.lock);
+
+#if BT_QUEUE_DEBUG
+ print_debug_queue_info();
+#endif
+
+ bt_ctrl = bt_inb(BT_CTRL);
+
+ /* Is there a response waiting for us? */
+ if (bt_ctrl & BT_CTRL_B2H_ATN)
+ bt_get_resp();
+
+ bt_expire_old_msg(now);
+
+ /* Check for sms_atn */
+ if (bt_inb(BT_CTRL) & BT_CTRL_SMS_ATN) {
+ bt_outb(BT_CTRL_SMS_ATN, BT_CTRL);
+ unlock(&bt.lock);
+ ipmi_sms_attention();
+ lock(&bt.lock);
+ }
+
+ /*
+ * Send messages if we can. If the BMC was really quick we
+ * could loop back to the start and check for a response
+ * instead of unlocking, but testing shows the BMC isn't that
+ * fast so we will wait for the IRQ or a call to the pollers instead.
+ */
+ bt_send_and_unlock();
+
+ schedule_timer(&bt.poller,
+ bt.irq_ok ? TIMER_POLL : msecs_to_tb(BT_DEFAULT_POLL_MS));
+}
+
+static void bt_ipmi_poll(void)
+{
+ bt_poll(NULL, NULL, mftb());
+}
+
+static void bt_add_msg(struct bt_msg *bt_msg)
+{
+ bt_msg->tb = 0;
+ bt_msg->seq = ipmi_seq++;
+ bt_msg->send_count = 0;
+ bt.queue_len++;
+ if (bt.queue_len > BT_MAX_QUEUE_LEN) {
+		/* Maximum queue length exceeded, drop the oldest message. */
+		BT_Q_ERR(bt_msg, "Maximum queue length exceeded");
+		/* Prefer dropping from the normal queue over the sync queue */
+ if (!list_empty(&bt.msgq))
+ bt_msg = list_tail(&bt.msgq, struct bt_msg, link);
+ else if (!list_empty(&bt.msgq_sync))
+ bt_msg = list_tail(&bt.msgq_sync, struct bt_msg, link);
+ assert(bt_msg);
+ BT_Q_ERR(bt_msg, "Removed from queue");
+ bt_msg_del(bt_msg);
+ }
+}
+
+/* Add message to synchronous message list */
+static int bt_add_ipmi_msg_head(struct ipmi_msg *ipmi_msg)
+{
+ struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg);
+
+ lock(&bt.lock);
+ bt_add_msg(bt_msg);
+ list_add_tail(&bt.msgq_sync, &bt_msg->link);
+ bt_send_and_unlock();
+
+ return 0;
+}
+
+static int bt_add_ipmi_msg(struct ipmi_msg *ipmi_msg)
+{
+ struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg);
+
+ lock(&bt.lock);
+ bt_add_msg(bt_msg);
+ list_add_tail(&bt.msgq, &bt_msg->link);
+ bt_send_and_unlock();
+
+ return 0;
+}
+
+static void bt_irq(uint32_t chip_id __unused, uint32_t irq_mask __unused)
+{
+ uint8_t ireg;
+
+ ireg = bt_inb(BT_INTMASK);
+
+ bt.irq_ok = true;
+ if (ireg & BT_INTMASK_B2H_IRQ) {
+ bt_outb(BT_INTMASK_B2H_IRQ | BT_INTMASK_B2H_IRQEN, BT_INTMASK);
+ bt_poll(NULL, NULL, mftb());
+ }
+}
+
+/*
+ * Allocate an ipmi message and bt container and return the ipmi
+ * message struct. Allocates enough space for the request and response
+ * data.
+ */
+static struct ipmi_msg *bt_alloc_ipmi_msg(size_t request_size, size_t response_size)
+{
+ struct bt_msg *bt_msg;
+
+ bt_msg = zalloc(sizeof(struct bt_msg) + MAX(request_size, response_size));
+ if (!bt_msg)
+ return NULL;
+
+ bt_msg->ipmi_msg.req_size = request_size;
+ bt_msg->ipmi_msg.resp_size = response_size;
+ bt_msg->ipmi_msg.data = (uint8_t *) (bt_msg + 1);
+
+ return &bt_msg->ipmi_msg;
+}
+
+/*
+ * Free a previously allocated ipmi message.
+ */
+static void bt_free_ipmi_msg(struct ipmi_msg *ipmi_msg)
+{
+ struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg);
+
+ free(bt_msg);
+}
+
+/*
+ * Do not resend IPMI messages to BMC.
+ */
+static void bt_disable_ipmi_msg_retry(struct ipmi_msg *ipmi_msg)
+{
+ struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg);
+
+ bt_msg->disable_retry = true;
+}
+
+/*
+ * Remove a message from the queue. The memory allocated for the ipmi message
+ * will need to be freed by the caller with bt_free_ipmi_msg() as it will no
+ * longer be in the queue of messages.
+ */
+static int bt_del_ipmi_msg(struct ipmi_msg *ipmi_msg)
+{
+ struct bt_msg *bt_msg = container_of(ipmi_msg, struct bt_msg, ipmi_msg);
+
+ lock(&bt.lock);
+ list_del(&bt_msg->link);
+ bt.queue_len--;
+ bt_send_and_unlock();
+ return 0;
+}
+
+static struct ipmi_backend bt_backend = {
+ .alloc_msg = bt_alloc_ipmi_msg,
+ .free_msg = bt_free_ipmi_msg,
+ .queue_msg = bt_add_ipmi_msg,
+ .queue_msg_head = bt_add_ipmi_msg_head,
+ .dequeue_msg = bt_del_ipmi_msg,
+ .disable_retry = bt_disable_ipmi_msg_retry,
+ .poll = bt_ipmi_poll,
+};
+
+static struct lpc_client bt_lpc_client = {
+ .interrupt = bt_irq,
+};
+
+void bt_init(void)
+{
+ struct dt_node *n;
+ const struct dt_property *prop;
+ uint32_t irq;
+
+ /* Set sane capability defaults */
+ bt.caps.num_requests = 1;
+ bt.caps.input_buf_len = BT_FIFO_LEN;
+ bt.caps.output_buf_len = BT_FIFO_LEN;
+ bt.caps.msg_timeout = BT_MSG_TIMEOUT;
+ bt.caps.max_retries = BT_MAX_RETRIES;
+
+ /* We support only one */
+ n = dt_find_compatible_node(dt_root, NULL, "ipmi-bt");
+ if (!n) {
+ prerror("No BT device\n");
+ return;
+ }
+
+ /* Get IO base */
+ prop = dt_find_property(n, "reg");
+ if (!prop) {
+ prerror("Can't find reg property\n");
+ return;
+ }
+ if (dt_property_get_cell(prop, 0) != OPAL_LPC_IO) {
+ prerror("Only supports IO addresses\n");
+ return;
+ }
+ bt.base_addr = dt_property_get_cell(prop, 1);
+ init_timer(&bt.poller, bt_poll, NULL);
+
+ bt_init_interface();
+ init_lock(&bt.lock);
+
+ /*
+ * The iBT interface comes up in the busy state until the daemon has
+ * initialised it.
+ */
+ list_head_init(&bt.msgq);
+ list_head_init(&bt.msgq_sync);
+ inflight_bt_msg = NULL;
+ bt.queue_len = 0;
+
+ prlog(PR_INFO, "Interface initialized, IO 0x%04x\n", bt.base_addr);
+
+ ipmi_register_backend(&bt_backend);
+
+ /*
+	 * We initially schedule the poller as a relatively fast timer until
+	 * we have seen at least one interrupt, at which point we turn it
+	 * into a background poller.
+ */
+ schedule_timer(&bt.poller, msecs_to_tb(BT_DEFAULT_POLL_MS));
+
+ irq = dt_prop_get_u32(n, "interrupts");
+ bt_lpc_client.interrupts = LPC_IRQ(irq);
+ lpc_register_client(dt_get_chip_id(n), &bt_lpc_client,
+ IRQ_ATTR_TARGET_OPAL);
+
+ /* Enqueue an IPMI message to ask the BMC about its BT capabilities */
+ get_bt_caps();
+
+ prlog(PR_DEBUG, "Using LPC IRQ %d\n", irq);
+}
diff --git a/roms/skiboot/hw/cache-p9.c b/roms/skiboot/hw/cache-p9.c
new file mode 100644
index 000000000..fb5ce3087
--- /dev/null
+++ b/roms/skiboot/hw/cache-p9.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <xscom.h>
+#include <timebase.h>
+#include <xscom-p9-regs.h>
+#include <cache-p9.h>
+
+/* Registers and bits used to clear the L2 and L3 cache */
+#define L2_PRD_PURGE_CMD_REG 0x1080e
+#define L2_PRD_PURGE_CMD_TRIGGER PPC_BIT(0)
+#define L2_PRD_PURGE_CMD_TYPE_MASK PPC_BITMASK(1, 4)
+#define L2CAC_FLUSH 0x0
+#define L2_PRD_PURGE_CMD_REG_BUSY PPC_BIT(9)
+#define L3_PRD_PURGE_REG 0x1180e
+#define L3_PRD_PURGE_REQ PPC_BIT(0)
+#define L3_PRD_PURGE_TTYPE_MASK PPC_BITMASK(1, 4)
+#define L3_FULL_PURGE 0x0
+
+#define L2_L3_PRD_PURGE_TIMEOUT_MS 20
+
+static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
+{
+ uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
+ int rc;
+
+ rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
+ L2_PRD_PURGE_CMD_TYPE_MASK);
+ if (!rc)
+ rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
+ L2_PRD_PURGE_CMD_TRIGGER);
+ if (rc)
+ prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
+ "failed %i\n", core_id, rc);
+ return rc;
+}
+
+static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
+{
+ uint64_t val;
+ uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
+ unsigned long now = mftb();
+ unsigned long end = now + msecs_to_tb(L2_L3_PRD_PURGE_TIMEOUT_MS);
+ int rc;
+
+ while (1) {
+ rc = xscom_read(chip_id, addr, &val);
+ if (rc) {
+ prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
+ "failed %i\n", core_id, rc);
+ break;
+ }
+ if (!(val & L2_PRD_PURGE_CMD_REG_BUSY))
+ break;
+ now = mftb();
+ if (tb_compare(now, end) == TB_AAFTERB) {
+ prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
+ core_id, rc);
+ return OPAL_BUSY;
+ }
+ }
+
+ /* We have to clear the trigger bit ourselves */
+ val &= ~L2_PRD_PURGE_CMD_TRIGGER;
+ rc = xscom_write(chip_id, addr, val);
+ if (rc)
+ prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
+ core_id, rc);
+ return rc;
+}
+
+static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
+{
+ uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
+ int rc;
+
+ rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
+ L3_PRD_PURGE_TTYPE_MASK);
+ if (!rc)
+ rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
+ L3_PRD_PURGE_REQ);
+ if (rc)
+ prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
+ "failed %i\n", core_id, rc);
+ return rc;
+}
+
+static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
+{
+ uint64_t val;
+ uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
+ unsigned long now = mftb();
+ unsigned long end = now + msecs_to_tb(L2_L3_PRD_PURGE_TIMEOUT_MS);
+ int rc;
+
+ /* Trigger bit is automatically set to zero when flushing is done */
+ while (1) {
+ rc = xscom_read(chip_id, addr, &val);
+ if (rc) {
+ prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
+ "failed %i\n", core_id, rc);
+ break;
+ }
+ if (!(val & L3_PRD_PURGE_REQ))
+ break;
+ now = mftb();
+ if (tb_compare(now, end) == TB_AAFTERB) {
+ prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
+ core_id, rc);
+ return OPAL_BUSY;
+ }
+ }
+ return rc;
+}
+
+int64_t purge_l2_l3_caches(void)
+{
+ struct cpu_thread *t;
+ uint64_t core_id, prev_core_id = (uint64_t)-1;
+ int rc;
+ unsigned long now = mftb();
+
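+	/*
+	 * Kick off the L2 and L3 purges on every core first, then loop
+	 * again below to wait for each of them, so the purges can proceed
+	 * in parallel across cores rather than being waited on serially.
+	 */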
+ for_each_ungarded_cpu(t) {
+ /* Only need to do it once per core chiplet */
+ core_id = pir_to_core_id(t->pir);
+ if (prev_core_id == core_id)
+ continue;
+ prev_core_id = core_id;
+ rc = start_l2_purge(t->chip_id, core_id);
+ if (rc)
+ goto trace_exit;
+ rc = start_l3_purge(t->chip_id, core_id);
+ if (rc)
+ goto trace_exit;
+ }
+
+ prev_core_id = (uint64_t)-1;
+ for_each_ungarded_cpu(t) {
+ /* Only need to do it once per core chiplet */
+ core_id = pir_to_core_id(t->pir);
+ if (prev_core_id == core_id)
+ continue;
+ prev_core_id = core_id;
+
+ rc = wait_l2_purge(t->chip_id, core_id);
+ if (rc)
+ goto trace_exit;
+ rc = wait_l3_purge(t->chip_id, core_id);
+ if (rc)
+ goto trace_exit;
+ }
+
+trace_exit:
+ prlog(PR_TRACE, "L2/L3 purging took %ldus\n",
+ tb_to_usecs(mftb() - now));
+
+ return rc;
+}
diff --git a/roms/skiboot/hw/capp.c b/roms/skiboot/hw/capp.c
new file mode 100644
index 000000000..a1aa1caa9
--- /dev/null
+++ b/roms/skiboot/hw/capp.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * CAPP unit (i.e. CAPI)
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <io.h>
+#include <opal.h>
+#include <chip.h>
+#include <xscom.h>
+#include <capp.h>
+
+#define PHBERR(opal_id, chip_id, index, fmt, a...) \
+ prlog(PR_ERR, "PHB#%04x[%d:%d]: " fmt, \
+ opal_id, chip_id, \
+ index, ## a)
+
+static struct {
+ uint32_t ec_level;
+ struct capp_lid_hdr *lid;
+ size_t size;
+ int load_result;
+} capp_ucode_info = { 0, NULL, 0, false };
+
+#define CAPP_UCODE_MAX_SIZE 0x20000
+
+struct lock capi_lock = LOCK_UNLOCKED;
+struct capp_ops capi_ops = { NULL };
+
+bool capp_ucode_loaded(struct proc_chip *chip, unsigned int index)
+{
+ return (chip->capp_ucode_loaded & (1 << index));
+}
+
+int preload_capp_ucode(void)
+{
+ struct dt_node *p;
+ struct proc_chip *chip;
+ uint32_t index;
+ uint64_t rc;
+ int ret;
+
+ /* CAPI is supported on P8 and P9 only */
+ p = dt_find_compatible_node(dt_root, NULL, "ibm,power8-pbcq");
+ if (!p)
+ p = dt_find_compatible_node(dt_root, NULL, "ibm,power9-pbcq");
+ if (!p)
+ return OPAL_SUCCESS;
+
+ chip = get_chip(dt_get_chip_id(p));
+
+ rc = xscom_read_cfam_chipid(chip->id, &index);
+ if (rc) {
+ prerror("CAPP: Error reading cfam chip-id\n");
+ ret = OPAL_HARDWARE;
+ return ret;
+ }
+ /* Keep ChipID and Major/Minor EC. Mask out the Location Code. */
+ index = index & 0xf0fff;
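+	/* e.g. a hypothetical raw CFAM ID of 0x120d1 becomes 0x100d1 here */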
+
+ /* Assert that we're preloading */
+ assert(capp_ucode_info.lid == NULL);
+ capp_ucode_info.load_result = OPAL_EMPTY;
+
+ capp_ucode_info.ec_level = index;
+
+ /* Is the ucode preloaded like for BML? */
+ if (dt_has_node_property(p, "ibm,capp-ucode", NULL)) {
+ capp_ucode_info.lid = (struct capp_lid_hdr *)(u64)
+ dt_prop_get_u32(p, "ibm,capp-ucode");
+ capp_ucode_info.load_result = OPAL_SUCCESS;
+ ret = OPAL_SUCCESS;
+ goto end;
+ }
+ /* If we successfully download the ucode, we leave it around forever */
+ capp_ucode_info.size = CAPP_UCODE_MAX_SIZE;
+ capp_ucode_info.lid = malloc(CAPP_UCODE_MAX_SIZE);
+ if (!capp_ucode_info.lid) {
+ prerror("CAPP: Can't allocate space for ucode lid\n");
+ ret = OPAL_NO_MEM;
+ goto end;
+ }
+
+ prlog(PR_INFO, "CAPI: Preloading ucode %x\n", capp_ucode_info.ec_level);
+
+ ret = start_preload_resource(RESOURCE_ID_CAPP, index,
+ capp_ucode_info.lid,
+ &capp_ucode_info.size);
+
+ if (ret != OPAL_SUCCESS) {
+ prerror("CAPI: Failed to preload resource %d\n", ret);
+ capp_ucode_info.load_result = ret;
+ }
+
+end:
+ return ret;
+}
+
+static int64_t capp_lid_download(void)
+{
+ int64_t ret;
+
+ if (capp_ucode_info.load_result != OPAL_EMPTY)
+ return capp_ucode_info.load_result;
+
+ capp_ucode_info.load_result = wait_for_resource_loaded(
+ RESOURCE_ID_CAPP,
+ capp_ucode_info.ec_level);
+
+ if (capp_ucode_info.load_result != OPAL_SUCCESS) {
+ prerror("CAPP: Error loading ucode lid. index=%x\n",
+ capp_ucode_info.ec_level);
+ ret = OPAL_RESOURCE;
+ free(capp_ucode_info.lid);
+ capp_ucode_info.lid = NULL;
+ goto end;
+ }
+
+ ret = OPAL_SUCCESS;
+end:
+ return ret;
+}
+
+int64_t capp_load_ucode(unsigned int chip_id, uint32_t opal_id,
+ unsigned int index, u64 lid_eyecatcher,
+ uint32_t reg_offset,
+ uint64_t apc_master_addr, uint64_t apc_master_write,
+ uint64_t snp_array_addr, uint64_t snp_array_write)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct capp_ucode_lid *ucode;
+ struct capp_ucode_data *data;
+ struct capp_lid_hdr *lid;
+ uint64_t rc, val, addr;
+ uint32_t chunk_count, offset;
+ int i;
+
+ if (capp_ucode_loaded(chip, index))
+ return OPAL_SUCCESS;
+
+ rc = capp_lid_download();
+ if (rc)
+ return rc;
+
+ prlog(PR_INFO, "CHIP%i: CAPP ucode lid loaded at %p\n",
+ chip_id, capp_ucode_info.lid);
+
+ lid = capp_ucode_info.lid;
+ /*
+ * If lid header is present (on FSP machines), it'll tell us where to
+ * find the ucode. Otherwise this is the ucode.
+ */
+ ucode = (struct capp_ucode_lid *)lid;
+ if (be64_to_cpu(lid->eyecatcher) == lid_eyecatcher) {
+ if (be64_to_cpu(lid->version) != 0x1) {
+ PHBERR(opal_id, chip_id, index,
+ "capi ucode lid header invalid\n");
+ return OPAL_HARDWARE;
+ }
+ ucode = (struct capp_ucode_lid *)
+ ((char *)ucode + be64_to_cpu(lid->ucode_offset));
+ }
+
+ /* 'CAPPULID' in ASCII */
+ if ((be64_to_cpu(ucode->eyecatcher) != 0x43415050554C4944UL) ||
+ (be64_to_cpu(ucode->version) != 1)) {
+ PHBERR(opal_id, chip_id, index,
+ "CAPP: ucode header invalid\n");
+ return OPAL_HARDWARE;
+ }
+
+ offset = 0;
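+	/*
+	 * Walk the ucode data blocks: each block is a capp_ucode_data_hdr
+	 * followed by chunk_count 64-bit doublewords, hence the
+	 * "chunk_count * 8" when advancing the offset below.
+	 */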
+ while (offset < be64_to_cpu(ucode->data_size)) {
+ data = (struct capp_ucode_data *)
+ ((char *)&ucode->data + offset);
+ chunk_count = be32_to_cpu(data->hdr.chunk_count);
+ offset += sizeof(struct capp_ucode_data_hdr) + chunk_count * 8;
+
+ /* 'CAPPUCOD' in ASCII */
+ if (be64_to_cpu(data->hdr.eyecatcher) != 0x4341505055434F44UL) {
+ PHBERR(opal_id, chip_id, index,
+ "CAPP: ucode data header invalid:%i\n",
+ offset);
+ return OPAL_HARDWARE;
+ }
+
+ switch (data->hdr.reg) {
+ case apc_master_cresp:
+ xscom_write(chip_id, apc_master_addr + reg_offset,
+ 0);
+ addr = apc_master_write;
+ break;
+ case apc_master_uop_table:
+ xscom_write(chip_id, apc_master_addr + reg_offset,
+ 0x180ULL << 52);
+ addr = apc_master_write;
+ break;
+ case snp_ttype:
+ xscom_write(chip_id, snp_array_addr + reg_offset,
+ 0x5000ULL << 48);
+ addr = snp_array_write;
+ break;
+ case snp_uop_table:
+ xscom_write(chip_id, snp_array_addr + reg_offset,
+ 0x4000ULL << 48);
+ addr = snp_array_write;
+ break;
+ default:
+ continue;
+ }
+
+ for (i = 0; i < chunk_count; i++) {
+ val = be64_to_cpu(data->data[i]);
+ xscom_write(chip_id, addr + reg_offset, val);
+ }
+ }
+
+ chip->capp_ucode_loaded |= (1 << index);
+
+ return OPAL_SUCCESS;
+}
+
+int64_t capp_get_info(int chip_id, struct phb *phb, struct capp_info *info)
+{
+ if (capi_ops.get_capp_info)
+ return capi_ops.get_capp_info(chip_id, phb, info);
+
+ return OPAL_PARAMETER;
+}
+
+int64_t capp_xscom_read(struct capp *capp, int64_t off, uint64_t *val)
+{
+ return capp == NULL ? OPAL_PARAMETER :
+ xscom_read(capp->chip_id, off + capp->capp_xscom_offset, val);
+}
+
+int64_t capp_xscom_write(struct capp *capp, int64_t off, uint64_t val)
+{
+ return capp == NULL ? OPAL_PARAMETER :
+ xscom_write(capp->chip_id, off + capp->capp_xscom_offset, val);
+}
diff --git a/roms/skiboot/hw/centaur.c b/roms/skiboot/hw/centaur.c
new file mode 100644
index 000000000..e9ff4197f
--- /dev/null
+++ b/roms/skiboot/hw/centaur.c
@@ -0,0 +1,555 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Centaur memory buffer chip
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <processor.h>
+#include <device.h>
+#include <chip.h>
+#include <centaur.h>
+#include <lock.h>
+#include <fsi-master.h>
+#include <timebase.h>
+
+/*
+ * Centaur chip IDs are using the XSCOM "partID" encoding
+ * described in xscom.h. recap:
+ *
+ * 0b1000.0000.0000.0000.0000.00NN.NCCC.MMMM
+ * N=Node, C=Chip, M=Memory Channel
+ *
+ * We currently use FSI exclusively for centaur access. We can
+ * start using MMIO on Centaur DD2.x when we have a way to handle
+ * machine checks happening inside Sapphire which we don't at the
+ * moment.
+ */
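+/*
+ * As a worked example, a hypothetical part ID of 0x80000023 decodes, with
+ * the masks used below, to host chip ID (0x80000023 & 0x0fffffff) >> 4 = 2
+ * and memory channel 0x80000023 & 0xf = 3.
+ */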
+
+/* Is that correct ? */
+#define MAX_CENTAURS_PER_CHIP 8
+
+/* Mark the centaur offline after this many consecutive errors */
+#define CENTAUR_ERR_OFFLINE_THRESHOLD 10
+
+/*
+ * FSI2PIB register definitions (this could be moved out if we were to
+ * support FSI master to other chips.
+ */
+#define FSI_DATA0_REG 0x1000
+#define FSI_DATA1_REG 0x1004
+#define FSI_CMD_REG 0x1008
+#define FSI_CMD_WR 0x80000000
+#define FSI_CMD_RD 0x00000000
+#define FSI_ENG_RESET_REG 0x1018
+#define FSI_STATUS_REG 0x101c
+#define FSI_STATUS_ABORT 0x00100000
+#define FSI_STATUS_ERRORS 0x00007000
+
+/* Some Centaur XSCOMs we care about */
+#define SCAC_CONFIG_REG 0x020115ce
+#define SCAC_CONFIG_SET 0x020115cf
+#define SCAC_CONFIG_CLR 0x020115d0
+#define SCAC_ENABLE_MSK PPC_BIT(0)
+
+#define cent_log(__lev, __c, __fmt, ...) \
+ prlog(__lev, "CENTAUR %x: " __fmt, __c->part_id, ##__VA_ARGS__)
+
+static int64_t centaur_fsiscom_complete(struct centaur_chip *centaur)
+{
+ int64_t rc;
+ uint32_t stat;
+
+ rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_STATUS_REG, &stat);
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI read error %lld reading STAT\n", rc);
+ return rc;
+ }
+ if ((stat & (FSI_STATUS_ABORT | FSI_STATUS_ERRORS)) == 0)
+ return OPAL_SUCCESS;
+
+ cent_log(PR_ERR, centaur, "Remote FSI SCOM error, status=0x%08x\n", stat);
+
+ /* All 1's ? Assume it's gone */
+ if (stat == 0xffffffffu) {
+ cent_log(PR_ERR, centaur, "Chip appears to be dead !\n");
+ centaur->valid = false;
+
+ /* Here, hostboot grabs a pile of FFDC from the FSI layer,
+ * we could do that too ...
+ */
+ return OPAL_HARDWARE;
+ }
+
+ /* Here HB prints the GPx registers which I believe are only
+	 * in the host (FSI master). We skip that for now as we don't have
+	 * a good API for them.
+ */
+
+ /* Recovery sequence from HostBoot fsiscom.C
+ * if SCOM fails and FSI Master displays "MasterTimeOut"
+ * then 7,6 <covered by FSI driver>
+ * else if SCOM fails and FSI2PIB Status shows PIB abort
+ * then just perform unit reset (6) and wait 1 ms
+	 *      else (PIB_abort = '0' but PIB error is non-zero)
+ * then just perform unit reset (6) (wait not needed).
+ *
+ * Note: Waiting 1ms inside OPAL is a BIG NO NO !!! We have
+ * no choice but doing it at the moment but that will have
+ * to be fixed one way or another, possibly by returning some
+ * kind of busy status until the delay is expired.
+ */
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_ENG_RESET_REG, 0);
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld resetting SCOM engine\n",
+ rc);
+ }
+ return OPAL_HARDWARE;
+}
+
+static int64_t centaur_fsiscom_read(struct centaur_chip *centaur, uint32_t pcb_addr,
+ uint64_t *val)
+{
+ int64_t rc;
+ uint32_t data0, data1;
+
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_RD);
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld writing CMD\n", rc);
+ return rc;
+ }
+
+ rc = centaur_fsiscom_complete(centaur);
+ if (rc)
+ return rc;
+
+ rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_DATA0_REG, &data0);
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI read error %lld reading DATA0\n", rc);
+ return rc;
+ }
+ rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_DATA1_REG, &data1);
+ if (rc) {
+		cent_log(PR_ERR, centaur, "MFSI read error %lld reading DATA1\n", rc);
+ return rc;
+ }
+
+ *val = (((uint64_t)data0) << 32) | data1;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t centaur_fsiscom_write(struct centaur_chip *centaur, uint32_t pcb_addr,
+ uint64_t val)
+{
+ int64_t rc;
+
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_DATA0_REG, hi32(val));
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld writing DATA0\n", rc);
+ return rc;
+ }
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_DATA1_REG, lo32(val));
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld writing DATA1\n", rc);
+ return rc;
+ }
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_WR);
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld writing CMD\n", rc);
+ return rc;
+ }
+
+ return centaur_fsiscom_complete(centaur);
+}
+
+struct centaur_chip *get_centaur(uint32_t part_id)
+{
+ uint32_t hchip_id, mchan;
+ struct proc_chip *hchip;
+ struct centaur_chip *centaur;
+
+ if ((part_id >> 28) != 8) {
+ prerror("CENTAUR: Invalid part ID 0x%x\n", part_id);
+ return NULL;
+ }
+ hchip_id = (part_id & 0x0fffffff) >> 4;
+ mchan = part_id & 0xf;
+
+ hchip = get_chip(hchip_id);
+ if (!hchip) {
+		prerror("CENTAUR: Centaur 0x%x not found on non-existent chip 0x%x\n",
+ part_id, hchip_id);
+ return NULL;
+ }
+ if (mchan >= MAX_CENTAURS_PER_CHIP) {
+ prerror("CENTAUR: Centaur 0x%x channel out of bounds !\n", part_id);
+ return NULL;
+ }
+ if (!hchip->centaurs) {
+		prerror("CENTAUR: Centaur 0x%x not found on chip 0x%x (no centaurs)\n",
+ part_id, hchip_id);
+ return NULL;
+ }
+ centaur = &hchip->centaurs[mchan];
+ if (!centaur->valid) {
+		prerror("CENTAUR: Centaur 0x%x not valid on chip 0x%x\n",
+ part_id, hchip_id);
+ return NULL;
+ }
+ return centaur;
+}
+
+/*
+ * Indirect XSCOM access functions. Copied from xscom.c; at a
+ * later date we should merge these properly.
+ */
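+/*
+ * As implemented below, an indirect access writes a request word holding
+ * the indirect address (and, for writes, the data) to the base SCOM
+ * register, then polls that same register until XSCOM_DATA_IND_COMPLETE
+ * is set with no XSCOM_DATA_IND_ERR bits, giving up after
+ * XSCOM_IND_MAX_RETRIES reads. For reads, the result is then taken from
+ * the XSCOM_DATA_IND_DATA field.
+ */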
+static void centaur_xscom_handle_ind_error(struct centaur_chip *centaur,
+ uint64_t data, uint64_t pcb_addr,
+ bool is_write)
+{
+ unsigned int stat = GETFIELD(XSCOM_DATA_IND_ERR, data);
+ bool timeout = !(data & XSCOM_DATA_IND_COMPLETE);
+
+ /* XXX: Create error log entry ? */
+ if (timeout)
+ cent_log(PR_ERR, centaur,
+			 "indirect %s timeout, pcb_addr=0x%llx stat=0x%x\n",
+ is_write ? "write" : "read", pcb_addr, stat);
+ else
+ cent_log(PR_ERR, centaur,
+ "indirect %s error, pcb_addr=0x%llx stat=0x%x\n",
+ is_write ? "write" : "read", pcb_addr, stat);
+}
+
+static int centaur_xscom_ind_read(struct centaur_chip *centaur,
+ uint64_t pcb_addr, uint64_t *val)
+{
+ uint32_t addr;
+ uint64_t data;
+ int rc, retries;
+
+ /* Write indirect address */
+ addr = pcb_addr & 0x7fffffff;
+ data = XSCOM_DATA_IND_READ |
+ (pcb_addr & XSCOM_ADDR_IND_ADDR);
+ rc = centaur_fsiscom_write(centaur, addr, data);
+ if (rc)
+ goto bail;
+
+ /* Wait for completion */
+ for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) {
+ rc = centaur_fsiscom_read(centaur, addr, &data);
+ if (rc)
+ goto bail;
+ if ((data & XSCOM_DATA_IND_COMPLETE) &&
+ ((data & XSCOM_DATA_IND_ERR) == 0)) {
+ *val = data & XSCOM_DATA_IND_DATA;
+ break;
+ }
+ if ((data & XSCOM_DATA_IND_COMPLETE) ||
+ (retries >= XSCOM_IND_MAX_RETRIES)) {
+ centaur_xscom_handle_ind_error(centaur, data, pcb_addr,
+ false);
+ rc = OPAL_HARDWARE;
+ goto bail;
+ }
+ }
+ bail:
+ if (rc)
+ *val = (uint64_t)-1;
+ return rc;
+}
+
+static int centaur_xscom_ind_write(struct centaur_chip *centaur,
+ uint64_t pcb_addr, uint64_t val)
+{
+ uint32_t addr;
+ uint64_t data;
+ int rc, retries;
+
+ /* Write indirect address & data */
+ addr = pcb_addr & 0x7fffffff;
+ data = pcb_addr & XSCOM_ADDR_IND_ADDR;
+ data |= val & XSCOM_ADDR_IND_DATA;
+
+ rc = centaur_fsiscom_write(centaur, addr, data);
+ if (rc)
+ goto bail;
+
+ /* Wait for completion */
+ for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) {
+ rc = centaur_fsiscom_read(centaur, addr, &data);
+ if (rc)
+ goto bail;
+ if ((data & XSCOM_DATA_IND_COMPLETE) &&
+ ((data & XSCOM_DATA_IND_ERR) == 0))
+ break;
+ if ((data & XSCOM_DATA_IND_COMPLETE) ||
+ (retries >= XSCOM_IND_MAX_RETRIES)) {
+ centaur_xscom_handle_ind_error(centaur, data, pcb_addr,
+ true);
+ rc = OPAL_HARDWARE;
+ goto bail;
+ }
+ }
+ bail:
+ return rc;
+}
+
+static int64_t centaur_xscom_read(struct scom_controller *scom,
+ uint32_t id __unused, uint64_t pcb_addr,
+ uint64_t *val)
+{
+ struct centaur_chip *centaur = scom->private;
+ int64_t rc;
+
+ if (!centaur)
+ return OPAL_PARAMETER;
+ if (!centaur->online)
+ return OPAL_XSCOM_CTR_OFFLINED;
+
+ lock(&centaur->lock);
+ if (pcb_addr & XSCOM_ADDR_IND_FLAG)
+ rc = centaur_xscom_ind_read(centaur, pcb_addr, val);
+ else
+ rc = centaur_fsiscom_read(centaur, pcb_addr, val);
+
+ /* We mark the centaur offline if we get too many errors on
+ * consecutive accesses
+ */
+ if (rc) {
+ centaur->error_count++;
+ if (centaur->error_count > CENTAUR_ERR_OFFLINE_THRESHOLD) {
+ centaur->online = false;
+ /**
+ * @fwts-label CentaurOfflinedTooManyErrors
+ * @fwts-advice OPAL marked a Centaur (memory buffer)
+ * as offline due to CENTAUR_ERR_OFFLINE_THRESHOLD (10)
+ * consecutive errors on XSCOMs to this centaur.
+ * OPAL will now return OPAL_XSCOM_CTR_OFFLINED and not
+ * try any further XSCOMs. This is likely caused by
+ * some hardware issue or PRD recovery issue.
+ */
+ prlog(PR_ERR, "CENTAUR: Offlined %x due to > %d consecutive XSCOM errors. No more XSCOMs to this centaur.\n",
+ id, CENTAUR_ERR_OFFLINE_THRESHOLD);
+ }
+ } else
+ centaur->error_count = 0;
+ unlock(&centaur->lock);
+
+ return rc;
+}
+
+static int64_t centaur_xscom_write(struct scom_controller *scom,
+ uint32_t id __unused, uint64_t pcb_addr,
+ uint64_t val)
+{
+ struct centaur_chip *centaur = scom->private;
+ int64_t rc;
+
+ if (!centaur)
+ return OPAL_PARAMETER;
+ if (!centaur->online)
+ return OPAL_XSCOM_CTR_OFFLINED;
+
+ lock(&centaur->lock);
+ if (pcb_addr & XSCOM_ADDR_IND_FLAG)
+ rc = centaur_xscom_ind_write(centaur, pcb_addr, val);
+ else
+ rc = centaur_fsiscom_write(centaur, pcb_addr, val);
+
+ /* We mark the centaur offline if we get too many errors on
+ * consecutive accesses
+ */
+ if (rc) {
+ centaur->error_count++;
+ if (centaur->error_count > CENTAUR_ERR_OFFLINE_THRESHOLD)
+ centaur->online = false;
+ } else
+ centaur->error_count = 0;
+ unlock(&centaur->lock);
+
+ return rc;
+}
+
+static bool centaur_check_id(struct centaur_chip *centaur)
+{
+ int64_t rc;
+ uint64_t val;
+
+ rc = centaur_fsiscom_read(centaur, 0xf000f, &val);
+ if (rc) {
+ cent_log(PR_ERR, centaur,
+ " FSISCOM error %lld reading ID register\n",
+ rc);
+ return false;
+ }
+
+ /* Extract CFAM id */
+ val >>= 44;
+
+ /* Identify chip */
+ if ((val & 0xff) != 0xe9) {
+ cent_log(PR_ERR, centaur,
+ " CFAM ID 0x%02x is not a Centaur !\n",
+ (unsigned int)(val & 0xff));
+ return false;
+ }
+
+ /* Get EC level from CFAM ID */
+ centaur->ec_level = ((val >> 16) & 0xf) << 4;
+ centaur->ec_level |= (val >> 8) & 0xf;
+
+ return true;
+}
+
+static bool centaur_add(uint32_t part_id, uint32_t mchip, uint32_t meng,
+ uint32_t mport)
+{
+ uint32_t hchip_id, mchan;
+ struct proc_chip *hchip;
+ struct centaur_chip *centaur;
+
+ if ((part_id >> 28) != 8) {
+ prerror("CENTAUR: Invalid part ID 0x%x\n", part_id);
+ return false;
+ }
+ hchip_id = (part_id & 0x0fffffff) >> 4;
+ mchan = part_id & 0xf;
+
+ printf("CENTAUR: Found centaur for chip 0x%x channel %d\n",
+ hchip_id, mchan);
+ printf("CENTAUR: FSI host: 0x%x cMFSI%d port %d\n",
+ mchip, meng, mport);
+
+ hchip = get_chip(hchip_id);
+ if (!hchip) {
+ prerror("CENTAUR: No such chip !!!\n");
+ return false;
+ }
+
+ if (mchan >= MAX_CENTAURS_PER_CHIP) {
+ prerror("CENTAUR: Channel out of bounds !\n");
+ return false;
+ }
+
+ if (!hchip->centaurs) {
+ hchip->centaurs =
+ zalloc(sizeof(struct centaur_chip) *
+ MAX_CENTAURS_PER_CHIP);
+ assert(hchip->centaurs);
+ }
+
+ centaur = &hchip->centaurs[mchan];
+ if (centaur->valid) {
+ prerror("CENTAUR: Duplicate centaur !\n");
+ return false;
+ }
+ centaur->part_id = part_id;
+ centaur->fsi_master_chip_id = mchip;
+ centaur->fsi_master_port = mport;
+ centaur->fsi_master_engine = meng ? MFSI_cMFSI1 : MFSI_cMFSI0;
+ centaur->online = true;
+ init_lock(&centaur->lock);
+ list_head_init(&centaur->i2cms);
+
+ if (!centaur_check_id(centaur))
+ return false;
+
+ centaur->scom.part_id = part_id;
+ centaur->scom.private = centaur;
+ centaur->scom.read = centaur_xscom_read;
+ centaur->scom.write = centaur_xscom_write;
+ scom_register(&centaur->scom);
+
+ cent_log(PR_INFO, centaur, "Found DD%x.%x chip\n",
+ centaur->ec_level >> 4,
+ centaur->ec_level & 0xf);
+
+ centaur->valid = true;
+ return true;
+}
+
+/* Returns how long to wait for logic to stop in TB ticks or a negative
+ * value on error
+ */
+int64_t centaur_disable_sensor_cache(uint32_t part_id)
+{
+ struct centaur_chip *centaur = get_centaur(part_id);
+ int64_t rc = 0;
+ uint64_t ctrl;
+
+ if (!centaur)
+ return false;
+
+ lock(&centaur->lock);
+ centaur->scache_disable_count++;
+ if (centaur->scache_disable_count == 1) {
+ centaur->scache_was_enabled = false;
+ rc = centaur_fsiscom_read(centaur, SCAC_CONFIG_REG, &ctrl);
+ if (rc)
+ goto bail;
+ centaur->scache_was_enabled = !!(ctrl & SCAC_ENABLE_MSK);
+ rc = centaur_fsiscom_write(centaur, SCAC_CONFIG_CLR, SCAC_ENABLE_MSK);
+ if (rc)
+ goto bail;
+ rc = msecs_to_tb(30);
+ }
+ bail:
+ unlock(&centaur->lock);
+ return rc;
+}
+
+int64_t centaur_enable_sensor_cache(uint32_t part_id)
+{
+ struct centaur_chip *centaur = get_centaur(part_id);
+ int64_t rc = 0;
+
+ if (!centaur)
+ return false;
+
+ lock(&centaur->lock);
+ if (centaur->scache_disable_count == 0) {
+ cent_log(PR_ERR, centaur, "Cache count going negative !\n");
+ backtrace();
+ goto bail;
+ }
+ centaur->scache_disable_count--;
+ if (centaur->scache_disable_count == 0 && centaur->scache_was_enabled)
+ rc = centaur_fsiscom_write(centaur, SCAC_CONFIG_SET, SCAC_ENABLE_MSK);
+ bail:
+ unlock(&centaur->lock);
+ return rc;
+}
+
+void centaur_init(void)
+{
+ struct dt_node *cn;
+
+ dt_for_each_compatible(dt_root, cn, "ibm,centaur") {
+ uint32_t chip_id, mchip, meng, mport;
+
+ chip_id = dt_prop_get_u32(cn, "ibm,chip-id");
+ mchip = dt_prop_get_u32(cn, "ibm,fsi-master-chip-id");
+ meng = dt_prop_get_cell(cn, "ibm,fsi-master-port", 0);
+ mport = dt_prop_get_cell(cn, "ibm,fsi-master-port", 1);
+
+ /*
+ * If adding the centaur succeeds, we expose it to
+ * Linux as a scom-controller
+ */
+ if (centaur_add(chip_id, mchip, meng, mport))
+ dt_add_property(cn, "scom-controller", NULL, 0);
+ }
+}
diff --git a/roms/skiboot/hw/chiptod.c b/roms/skiboot/hw/chiptod.c
new file mode 100644
index 000000000..7c0a1ffc7
--- /dev/null
+++ b/roms/skiboot/hw/chiptod.c
@@ -0,0 +1,2067 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Handle ChipTOD chip & configure core and CAPP timebases
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "CHIPTOD: " fmt
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <pci.h>
+#include <chiptod.h>
+#include <chip.h>
+#include <io.h>
+#include <cpu.h>
+#include <timebase.h>
+#include <opal-api.h>
+
+/* TOD chip XSCOM addresses */
+#define TOD_MASTER_PATH_CTRL 0x00040000 /* Master Path ctrl reg */
+#define TOD_PRI_PORT0_CTRL 0x00040001 /* Primary port0 ctrl reg */
+#define TOD_PRI_PORT1_CTRL 0x00040002 /* Primary port1 ctrl reg */
+#define TOD_SEC_PORT0_CTRL 0x00040003 /* Secondary p0 ctrl reg */
+#define TOD_SEC_PORT1_CTRL 0x00040004 /* Secondary p1 ctrl reg */
+#define TOD_SLAVE_PATH_CTRL 0x00040005 /* Slave Path ctrl reg */
+#define TOD_INTERNAL_PATH_CTRL 0x00040006 /* Internal Path ctrl reg */
+
+/* -- TOD primary/secondary master/slave control register -- */
+#define TOD_PSMS_CTRL 0x00040007
+#define TOD_PSMSC_PM_TOD_SELECT PPC_BIT(1) /* Primary Master TOD */
+#define TOD_PSMSC_PM_DRAW_SELECT PPC_BIT(2) /* Primary Master Drawer */
+#define TOD_PSMSC_SM_TOD_SELECT PPC_BIT(9) /* Secondary Master TOD */
+#define TOD_PSMSC_SM_DRAW_SELECT PPC_BIT(10) /* Secondary Master Draw */
+
+/* -- TOD primary/secondary master/slave status register -- */
+#define TOD_STATUS 0x00040008
+#define TOD_ST_TOPOLOGY_SELECT PPC_BITMASK(0, 2)
+#define TOD_ST_MPATH0_STEP_VALID PPC_BIT(6) /* MasterPath0 step valid */
+#define TOD_ST_MPATH1_STEP_VALID PPC_BIT(7) /* MasterPath1 step valid */
+#define TOD_ST_SPATH0_STEP_VALID PPC_BIT(8) /* SlavePath0 step valid */
+#define TOD_ST_SPATH1_STEP_VALID PPC_BIT(10) /* SlavePath1 step valid */
+/* Primary master/slave path select (0 = PATH_0, 1 = PATH_1) */
+#define TOD_ST_PRI_MPATH_SELECT PPC_BIT(12) /* Primary MPath Select */
+#define TOD_ST_PRI_SPATH_SELECT PPC_BIT(15) /* Primary SPath Select */
+/* Secondary master/slave path select (0 = PATH_0, 1 = PATH_1) */
+#define TOD_ST_SEC_MPATH_SELECT PPC_BIT(16) /* Secondary MPath Select */
+#define TOD_ST_SEC_SPATH_SELECT PPC_BIT(19) /* Secondary SPath Select */
+#define TOD_ST_ACTIVE_MASTER PPC_BIT(23)
+#define TOD_ST_BACKUP_MASTER PPC_BIT(24)
+
+/* TOD chip XSCOM addresses */
+#define TOD_CHIP_CTRL 0x00040010 /* Chip control register */
+#define TOD_TTYPE_0 0x00040011
+#define TOD_TTYPE_1 0x00040012 /* PSS switch */
+#define TOD_TTYPE_2 0x00040013 /* Enable step checkers */
+#define TOD_TTYPE_3 0x00040014 /* Request TOD */
+#define TOD_TTYPE_4 0x00040015 /* Send TOD */
+#define TOD_TTYPE_5 0x00040016 /* Invalidate TOD */
+#define TOD_CHIPTOD_TO_TB 0x00040017
+#define TOD_LOAD_TOD_MOD 0x00040018
+#define TOD_CHIPTOD_VALUE 0x00040020
+#define TOD_CHIPTOD_LOAD_TB 0x00040021
+#define TOD_CHIPTOD_FSM 0x00040024
+
+/* -- TOD PIB Master reg -- */
+#define TOD_PIB_MASTER 0x00040027
+#define TOD_PIBM_ADDR_CFG_MCAST PPC_BIT(25)
+#define TOD_PIBM_ADDR_CFG_SLADDR PPC_BITMASK(26, 31)
+#define TOD_PIBM_TTYPE4_SEND_MODE PPC_BIT(32)
+#define TOD_PIBM_TTYPE4_SEND_ENBL PPC_BIT(33)
+
+/* -- TOD Error interrupt register -- */
+#define TOD_ERROR 0x00040030
+/* SYNC errors */
+#define TOD_ERR_CRMO_PARITY PPC_BIT(0)
+#define TOD_ERR_OSC0_PARITY PPC_BIT(1)
+#define TOD_ERR_OSC1_PARITY PPC_BIT(2)
+#define TOD_ERR_PPORT0_CREG_PARITY PPC_BIT(3)
+#define TOD_ERR_PPORT1_CREG_PARITY PPC_BIT(4)
+#define TOD_ERR_SPORT0_CREG_PARITY PPC_BIT(5)
+#define TOD_ERR_SPORT1_CREG_PARITY PPC_BIT(6)
+#define TOD_ERR_SPATH_CREG_PARITY PPC_BIT(7)
+#define TOD_ERR_IPATH_CREG_PARITY PPC_BIT(8)
+#define TOD_ERR_PSMS_CREG_PARITY PPC_BIT(9)
+#define TOD_ERR_CRITC_PARITY PPC_BIT(13)
+#define TOD_ERR_MP0_STEP_CHECK PPC_BIT(14)
+#define TOD_ERR_MP1_STEP_CHECK PPC_BIT(15)
+#define TOD_ERR_PSS_HAMMING_DISTANCE PPC_BIT(18)
+#define TOD_ERR_DELAY_COMPL_PARITY PPC_BIT(22)
+/* CNTR errors */
+#define TOD_ERR_CTCR_PARITY PPC_BIT(32)
+#define TOD_ERR_TOD_SYNC_CHECK PPC_BIT(33)
+#define TOD_ERR_TOD_FSM_PARITY PPC_BIT(34)
+#define TOD_ERR_TOD_REGISTER_PARITY PPC_BIT(35)
+#define TOD_ERR_OVERFLOW_YR2042 PPC_BIT(36)
+#define TOD_ERR_TOD_WOF_LSTEP_PARITY PPC_BIT(37)
+#define TOD_ERR_TTYPE0_RECVD PPC_BIT(38)
+#define TOD_ERR_TTYPE1_RECVD PPC_BIT(39)
+#define TOD_ERR_TTYPE2_RECVD PPC_BIT(40)
+#define TOD_ERR_TTYPE3_RECVD PPC_BIT(41)
+#define TOD_ERR_TTYPE4_RECVD PPC_BIT(42)
+#define TOD_ERR_TTYPE5_RECVD PPC_BIT(43)
+
+/* -- TOD Error interrupt register -- */
+#define TOD_ERROR_INJECT 0x00040031
+
+/* PC unit PIB address which receives the timebase transfer from TOD */
+#define PC_TOD 0x4A3
+
+/* Local FIR EH.TPCHIP.TPC.LOCAL_FIR */
+#define LOCAL_CORE_FIR 0x0104000C
+#define LFIR_SWITCH_COMPLETE PPC_BIT(18)
+
+/* Number of iterations for the various timeouts */
+#define TIMEOUT_LOOPS 20000000
+
+/* TOD active Primary/secondary configuration */
+#define TOD_PRI_CONF_IN_USE	0	/* TOD using primary topology */
+#define TOD_SEC_CONF_IN_USE	7	/* TOD using secondary topology */
+
+/* Timebase State Machine error state */
+#define TBST_STATE_ERROR 9
+
+static enum chiptod_type {
+ chiptod_unknown,
+ chiptod_p8,
+ chiptod_p9,
+ chiptod_p10,
+} chiptod_type;
+
+enum chiptod_chip_role {
+ chiptod_chip_role_UNKNOWN = -1,
+ chiptod_chip_role_MDMT = 0, /* Master Drawer Master TOD */
+ chiptod_chip_role_MDST, /* Master Drawer Slave TOD */
+ chiptod_chip_role_SDMT, /* Slave Drawer Master TOD */
+ chiptod_chip_role_SDST, /* Slave Drawer Slave TOD */
+};
+
+enum chiptod_chip_status {
+ chiptod_active_master = 0, /* Chip TOD is Active master */
+ chiptod_backup_master = 1, /* Chip TOD is backup master */
+ chiptod_backup_disabled, /* Chip TOD is backup but disabled */
+};
+
+struct chiptod_chip_config_info {
+ int32_t id; /* chip id */
+ enum chiptod_chip_role role; /* Chip role */
+ enum chiptod_chip_status status; /* active/backup/disabled */
+};
+
+static int32_t chiptod_primary = -1;
+static int32_t chiptod_secondary = -1;
+static enum chiptod_topology current_topology = chiptod_topo_unknown;
+
+/*
+ * chiptod_topology_info holds primary/secondary chip configuration info.
+ * This info is initialized during chiptod_init(). This is an array of two:
+ * [0] = [chiptod_topo_primary] = Primary topology config info
+ * [1] = [chiptod_topo_secondary] = Secondary topology config info
+ */
+static struct chiptod_chip_config_info chiptod_topology_info[2];
+
+/*
+ * Array of TOD control registers that holds last known valid values.
+ *
+ * Cache chiptod control register values at following instances:
+ * 1. Chiptod initialization
+ * 2. After topology switch is complete.
+ * 3. Upon receiving enable/disable topology request from FSP.
+ *
+ * Cache following chip TOD control registers:
+ * - Master Path control register (0x00040000)
+ * - Primary Port-0 control register (0x00040001)
+ * - Primary Port-1 control register (0x00040002)
+ * - Secondary Port-0 control register (0x00040003)
+ * - Secondary Port-1 control register (0x00040004)
+ * - Slave Path control register (0x00040005)
+ * - Internal Path control register (0x00040006)
+ * - Primary/secondary master/slave control register (0x00040007)
+ * - Chip control register (0x00040010)
+ *
+ * This data is used for restoring respective TOD registers to sane values
+ * whenever parity errors are reported on these registers (through HMI).
+ * The error_bit maps to corresponding bit from TOD error register that
+ * reports parity error on respective TOD registers.
+ */
+static struct chiptod_tod_regs {
+ /* error bit from TOD Error reg */
+ const uint64_t error_bit;
+
+ /* xscom address of TOD register to be restored. */
+ const uint64_t xscom_addr;
+ /* per chip cached value of TOD control registers to be restored. */
+ struct {
+ uint64_t data;
+ bool valid;
+ } val[MAX_CHIPS];
+} chiptod_tod_regs[] = {
+ { TOD_ERR_CRMO_PARITY, TOD_MASTER_PATH_CTRL, { } },
+ { TOD_ERR_PPORT0_CREG_PARITY, TOD_PRI_PORT0_CTRL, { } },
+ { TOD_ERR_PPORT1_CREG_PARITY, TOD_PRI_PORT1_CTRL, { } },
+ { TOD_ERR_SPORT0_CREG_PARITY, TOD_SEC_PORT0_CTRL, { } },
+ { TOD_ERR_SPORT1_CREG_PARITY, TOD_SEC_PORT1_CTRL, { } },
+ { TOD_ERR_SPATH_CREG_PARITY, TOD_SLAVE_PATH_CTRL, { } },
+ { TOD_ERR_IPATH_CREG_PARITY, TOD_INTERNAL_PATH_CTRL, { } },
+ { TOD_ERR_PSMS_CREG_PARITY, TOD_PSMS_CTRL, { } },
+ { TOD_ERR_CTCR_PARITY, TOD_CHIP_CTRL, { } },
+};
+
+/* The base TFMR value is the same for the whole machine
+ * for now as far as I can tell
+ */
+static uint64_t base_tfmr;
+
+/*
+ * For now, we use a global lock for runtime chiptod operations,
+ * eventually make this a per-core lock for wakeup rsync and
+ * take all of them for RAS cases.
+ */
+static struct lock chiptod_lock = LOCK_UNLOCKED;
+static bool chiptod_unrecoverable;
+
+#define NUM_SYNC_RETRIES 10
+
+static void _chiptod_cache_tod_regs(int32_t chip_id)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(chiptod_tod_regs); i++) {
+ if (xscom_read(chip_id, chiptod_tod_regs[i].xscom_addr,
+ &(chiptod_tod_regs[i].val[chip_id].data))) {
+ prerror("XSCOM error reading 0x%08llx reg.\n",
+ chiptod_tod_regs[i].xscom_addr);
+ /* Invalidate this record and continue */
+ chiptod_tod_regs[i].val[chip_id].valid = 0;
+ continue;
+ }
+ chiptod_tod_regs[i].val[chip_id].valid = 1;
+ }
+}
+
+static void chiptod_cache_tod_registers(void)
+{
+ struct proc_chip *chip;
+
+ for_each_chip(chip)
+ _chiptod_cache_tod_regs(chip->id);
+}
+
+static void print_topo_info(enum chiptod_topology topo)
+{
+ const char *role[] = { "Unknown", "MDMT", "MDST", "SDMT", "SDST" };
+ const char *status[] = { "Unknown",
+ "Active Master", "Backup Master", "Backup Master Disabled" };
+
+ prlog(PR_DEBUG, " Chip id: %d, Role: %s, Status: %s\n",
+ chiptod_topology_info[topo].id,
+ role[chiptod_topology_info[topo].role + 1],
+ status[chiptod_topology_info[topo].status + 1]);
+}
+
+static void print_topology_info(void)
+{
+ const char *topo[] = { "Unknown", "Primary", "Secondary" };
+
+ if (current_topology < 0)
+ return;
+
+ prlog(PR_DEBUG, "TOD Topology in Use: %s\n",
+ topo[current_topology+1]);
+ prlog(PR_DEBUG, " Primary configuration:\n");
+ print_topo_info(chiptod_topo_primary);
+ prlog(PR_DEBUG, " Secondary configuration:\n");
+ print_topo_info(chiptod_topo_secondary);
+}
+
+static enum chiptod_topology query_current_topology(void)
+{
+ uint64_t tod_status;
+
+ if (xscom_readme(TOD_STATUS, &tod_status)) {
+ prerror("XSCOM error reading TOD_STATUS reg\n");
+ return chiptod_topo_unknown;
+ }
+
+ /*
+	 * TOD status register bits [0-2] indicate the configuration in use:
+	 *   000 = primary configuration
+	 *   111 = secondary configuration
+ */
+ if ((tod_status & TOD_ST_TOPOLOGY_SELECT) == TOD_PRI_CONF_IN_USE)
+ return chiptod_topo_primary;
+ else
+ return chiptod_topo_secondary;
+}
+
+static enum chiptod_chip_role
+chiptod_get_chip_role(enum chiptod_topology topology, int32_t chip_id)
+{
+ uint64_t tod_ctrl;
+ enum chiptod_chip_role role = chiptod_chip_role_UNKNOWN;
+
+ if (chip_id < 0)
+ return role;
+
+ if (xscom_read(chip_id, TOD_PSMS_CTRL, &tod_ctrl)) {
+ prerror("XSCOM error reading TOD_PSMS_CTRL\n");
+ return chiptod_chip_role_UNKNOWN;
+ }
+
+ switch (topology) {
+ case chiptod_topo_primary:
+ if (tod_ctrl & TOD_PSMSC_PM_DRAW_SELECT) {
+ if (tod_ctrl & TOD_PSMSC_PM_TOD_SELECT)
+ role = chiptod_chip_role_MDMT;
+ else
+ role = chiptod_chip_role_MDST;
+ } else {
+ if (tod_ctrl & TOD_PSMSC_PM_TOD_SELECT)
+ role = chiptod_chip_role_SDMT;
+ else
+ role = chiptod_chip_role_SDST;
+ }
+ break;
+ case chiptod_topo_secondary:
+ if (tod_ctrl & TOD_PSMSC_SM_DRAW_SELECT) {
+ if (tod_ctrl & TOD_PSMSC_SM_TOD_SELECT)
+ role = chiptod_chip_role_MDMT;
+ else
+ role = chiptod_chip_role_MDST;
+ } else {
+ if (tod_ctrl & TOD_PSMSC_SM_TOD_SELECT)
+ role = chiptod_chip_role_SDMT;
+ else
+ role = chiptod_chip_role_SDST;
+ }
+ break;
+ case chiptod_topo_unknown:
+ default:
+ break;
+ }
+ return role;
+}
+
+/*
+ * Check and return the status of sync step network for a given
+ * topology configuration.
+ * Return values:
+ * true: Sync Step network is running
+ * false: Sync Step network is not running
+ */
+static bool chiptod_sync_step_check_running(enum chiptod_topology topology)
+{
+ uint64_t tod_status;
+ enum chiptod_chip_role role;
+ bool running = false;
+ int32_t chip_id = chiptod_topology_info[topology].id;
+
+ /* Sanity check */
+ if (chip_id < 0)
+ return false;
+
+ if (xscom_read(chip_id, TOD_STATUS, &tod_status)) {
+ prerror("XSCOM error reading TOD_STATUS reg\n");
+ return false;
+ }
+
+ switch (topology) {
+ case chiptod_topo_primary:
+ /* Primary configuration */
+ role = chiptod_topology_info[topology].role;
+ if (role == chiptod_chip_role_MDMT) {
+ /*
+ * Chip is using Master path.
+ * Check if it is using path_0/path_1 and then
+ * validity of that path.
+ *
+ * TOD_STATUS[12]: 0 = PATH_0, 1 = PATH_1
+ */
+ if (tod_status & TOD_ST_PRI_MPATH_SELECT) {
+ if (tod_status & TOD_ST_MPATH1_STEP_VALID)
+ running = true;
+ } else {
+ if (tod_status & TOD_ST_MPATH0_STEP_VALID)
+ running = true;
+ }
+ } else {
+ /*
+ * Chip is using Slave path.
+ *
+ * TOD_STATUS[15]: 0 = PATH_0, 1 = PATH_1
+ */
+ if (tod_status & TOD_ST_PRI_SPATH_SELECT) {
+ if (tod_status & TOD_ST_SPATH1_STEP_VALID)
+ running = true;
+ } else {
+ if (tod_status & TOD_ST_SPATH0_STEP_VALID)
+ running = true;
+ }
+ }
+ break;
+ case chiptod_topo_secondary:
+ /* Secondary configuration */
+ role = chiptod_topology_info[topology].role;
+ if (role == chiptod_chip_role_MDMT) {
+ /*
+ * Chip is using Master path.
+ * Check if it is using path_0/path_1 and then
+ * validity of that path.
+ *
+ * TOD_STATUS[12]: 0 = PATH_0, 1 = PATH_1
+ */
+ if (tod_status & TOD_ST_SEC_MPATH_SELECT) {
+ if (tod_status & TOD_ST_MPATH1_STEP_VALID)
+ running = true;
+ } else {
+ if (tod_status & TOD_ST_MPATH0_STEP_VALID)
+ running = true;
+ }
+ } else {
+ /*
+ * Chip is using Slave path.
+ *
+ * TOD_STATUS[15]: 0 = PATH_0, 1 = PATH_1
+ */
+ if (tod_status & TOD_ST_SEC_SPATH_SELECT) {
+ if (tod_status & TOD_ST_SPATH1_STEP_VALID)
+ running = true;
+ } else {
+ if (tod_status & TOD_ST_SPATH0_STEP_VALID)
+ running = true;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ return running;
+}
+
+static enum chiptod_chip_status _chiptod_get_chip_status(int32_t chip_id)
+{
+ uint64_t tod_status;
+ enum chiptod_chip_status status = -1;
+
+ if (chip_id < 0)
+ return chiptod_backup_disabled;
+
+ if (xscom_read(chip_id, TOD_STATUS, &tod_status)) {
+ prerror("XSCOM error reading TOD_STATUS reg\n");
+ return status;
+ }
+
+ if (tod_status & TOD_ST_ACTIVE_MASTER)
+ status = chiptod_active_master;
+ else if (tod_status & TOD_ST_BACKUP_MASTER)
+ status = chiptod_backup_master;
+
+ return status;
+}
+
+static enum chiptod_chip_status
+chiptod_get_chip_status(enum chiptod_topology topology)
+{
+ return _chiptod_get_chip_status(chiptod_topology_info[topology].id);
+}
+
+static void chiptod_update_topology(enum chiptod_topology topo)
+{
+ int32_t chip_id = chiptod_topology_info[topo].id;
+
+ if (chip_id < 0)
+ return;
+
+ chiptod_topology_info[topo].role = chiptod_get_chip_role(topo, chip_id);
+ chiptod_topology_info[topo].status = chiptod_get_chip_status(topo);
+
+ /*
+ * If chip TOD on this topology is a backup master then check if
+ * sync/step network is running on this topology. If not,
+ * then mark status as backup not valid.
+ */
+ if ((chiptod_topology_info[topo].status == chiptod_backup_master) &&
+ !chiptod_sync_step_check_running(topo))
+ chiptod_topology_info[topo].status = chiptod_backup_disabled;
+}
+
+static void chiptod_setup_base_tfmr(void)
+{
+ struct dt_node *cpu = this_cpu()->node;
+ uint64_t core_freq, tod_freq;
+ uint64_t mcbs;
+
+ base_tfmr = SPR_TFMR_TB_ECLIPZ;
+
+ /* Get CPU and TOD freqs in Hz */
+ if (dt_has_node_property(cpu, "ibm,extended-clock-frequency", NULL))
+ core_freq = dt_prop_get_u64(cpu, "ibm,extended-clock-frequency");
+ else
+ core_freq = dt_prop_get_u32(cpu, "clock-frequency");
+
+ if (!core_freq) {
+ prlog(PR_ERR, "CPU clock frequency is not set\n");
+ abort();
+ }
+
+ tod_freq = 32000000;
+
+ /* Calculate the "Max Cycles Between Steps" value according
+ * to the magic formula:
+ *
+ * mcbs = (core_freq * max_jitter_factor) / (4 * tod_freq) / 100;
+ *
+ * The max jitter factor is set to 240 based on what pHyp uses.
+ */
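+	/*
+	 * For instance, assuming a hypothetical 4 GHz core clock and the
+	 * fixed 32 MHz TOD clock, this works out to
+	 * (4000000000 * 240) / (4 * 32000000) / 100 = 75 = 0x4b.
+	 */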
+ mcbs = (core_freq * 240) / (4 * tod_freq) / 100;
+ prlog(PR_INFO, "Calculated MCBS is 0x%llx"
+ " (Cfreq=%lld Tfreq=%lld)\n",
+ mcbs, core_freq, tod_freq);
+
+ /* Bake that all into TFMR */
+ base_tfmr = SETFIELD(SPR_TFMR_MAX_CYC_BET_STEPS, base_tfmr, mcbs);
+ base_tfmr = SETFIELD(SPR_TFMR_N_CLKS_PER_STEP, base_tfmr, 0);
+ base_tfmr = SETFIELD(SPR_TFMR_SYNC_BIT_SEL, base_tfmr, 4);
+}
+
+static bool chiptod_mod_tb(void)
+{
+ uint64_t tfmr = base_tfmr;
+ uint64_t timeout = 0;
+
+ /* Switch timebase to "Not Set" state */
+ mtspr(SPR_TFMR, tfmr | SPR_TFMR_LOAD_TOD_MOD);
+ do {
+ if (++timeout >= (TIMEOUT_LOOPS*2)) {
+ prerror("TB \"Not Set\" timeout\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("TB \"Not Set\" TFMR corrupt\n");
+ return false;
+ }
+ if (GETFIELD(SPR_TFMR_TBST_ENCODED, tfmr) == 9) {
+ prerror("TB \"Not Set\" TOD in error state\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_LOAD_TOD_MOD);
+
+ return true;
+}
+
+static bool chiptod_interrupt_check(void)
+{
+ uint64_t tfmr;
+ uint64_t timeout = 0;
+
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("Interrupt check fail\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("Interrupt check TFMR corrupt !\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_CHIP_TOD_INTERRUPT);
+
+ return true;
+}
+
+static bool chiptod_running_check(uint32_t chip_id)
+{
+ uint64_t tval;
+
+ if (xscom_read(chip_id, TOD_CHIPTOD_FSM, &tval)) {
+ prerror("XSCOM error polling run\n");
+ return false;
+ }
+ if (tval & 0x0800000000000000UL)
+ return true;
+ else
+ return false;
+}
+
+static bool chiptod_poll_running(void)
+{
+ uint64_t timeout = 0;
+ uint64_t tval;
+
+ /* Chip TOD running check */
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("Running check fail timeout\n");
+ return false;
+ }
+ if (xscom_readme(TOD_CHIPTOD_FSM, &tval)) {
+ prerror("XSCOM error polling run\n");
+ return false;
+ }
+ } while (!(tval & 0x0800000000000000UL));
+
+ return true;
+}
+
+static bool chiptod_to_tb(void)
+{
+ uint32_t pir = this_cpu()->pir;
+ uint64_t tval, tfmr;
+ uint64_t timeout = 0;
+
+ /* Tell the ChipTOD about our fabric address
+ *
+ * The pib_master value is calculated from the CPU core ID, given in
+ * the PIR. Because we have different core/thread arrangements in the
+ * PIR between p7 and p8, we need to do the calculation differently.
+ *
+ * p7: 0b00001 || 3-bit core id
+ * p8: 0b0001 || 4-bit core id
+ * p9: 0b001 || 5-bit core id
+ * p10: 0b001 || 5-bit core id
+ *
+ * However in P10 we don't use the core ID addressing, but rather core
+ * scom addressing mode, which appears to work better.
+ */
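+	/*
+	 * For example, a hypothetical P9 thread with PIR 0x0010 yields
+	 * ((0x0010 >> 2) & 0x1f) | 0x20 = 0x24, i.e. 0b001 followed by
+	 * the 5-bit core id 0b00100.
+	 */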
+
+ if (xscom_readme(TOD_PIB_MASTER, &tval)) {
+ prerror("XSCOM error reading PIB_MASTER\n");
+ return false;
+ }
+
+ if (chiptod_type == chiptod_p10) {
+ uint32_t core_id = pir_to_core_id(pir);
+
+ if (this_cpu()->is_fused_core &&
+ PVR_VERS_MAJ(mfspr(SPR_PVR)) == 2) {
+ /* Workaround: must address the even small core. */
+ core_id &= ~1;
+ }
+
+ tval = XSCOM_ADDR_P10_EC(core_id, PC_TOD);
+
+ tval <<= 32; /* PIB slave address goes in PPC bits [0:31] */
+
+ tval |= PPC_BIT(35); /* Enable SCOM addressing. */
+
+ } else {
+ uint64_t tvbits;
+
+ if (chiptod_type == chiptod_p9) {
+ tvbits = (pir >> 2) & 0x1f;
+ tvbits |= 0x20;
+ } else if (chiptod_type == chiptod_p8) {
+ tvbits = (pir >> 3) & 0xf;
+ tvbits |= 0x10;
+ } else {
+ tvbits = (pir >> 2) & 0x7;
+ tvbits |= 0x08;
+ }
+ tval &= ~TOD_PIBM_ADDR_CFG_MCAST;
+ tval = SETFIELD(TOD_PIBM_ADDR_CFG_SLADDR, tval, tvbits);
+ }
+
+ if (xscom_writeme(TOD_PIB_MASTER, tval)) {
+ prerror("XSCOM error writing PIB_MASTER\n");
+ return false;
+ }
+
+ /* Make us ready to get the TB from the chipTOD */
+ mtspr(SPR_TFMR, base_tfmr | SPR_TFMR_MOVE_CHIP_TOD_TO_TB);
+
+ /* Tell the ChipTOD to send it */
+ if (xscom_writeme(TOD_CHIPTOD_TO_TB, PPC_BIT(0))) {
+ prerror("XSCOM error writing CHIPTOD_TO_TB\n");
+ return false;
+ }
+
+ /* Wait for it to complete */
+ timeout = 0;
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("Chip to TB timeout\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("MoveToTB: corrupt TFMR !\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_MOVE_CHIP_TOD_TO_TB);
+
+ return true;
+}
+
+static bool chiptod_check_tb_running(void)
+{
+ /* We used to wait for two SYNC pulses in TFMR but that
+ * doesn't seem to occur in sim, so instead we use a
+ * method similar to what pHyp does which is to check for
+ * TFMR SPR_TFMR_TB_VALID and not SPR_TFMR_TFMR_CORRUPT
+ */
+#if 0
+ uint64_t tfmr, timeout;
+ unsigned int i;
+
+ for (i = 0; i < 2; i++) {
+ tfmr = mfspr(SPR_TFMR);
+ tfmr &= ~SPR_TFMR_TB_SYNC_OCCURED;
+ mtspr(SPR_TFMR, tfmr);
+ timeout = 0;
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("CHIPTOD: No sync pulses\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ } while (!(tfmr & SPR_TFMR_TB_SYNC_OCCURED));
+ }
+#else
+ uint64_t tfmr = mfspr(SPR_TFMR);
+
+ return (tfmr & SPR_TFMR_TB_VALID) &&
+ !(tfmr & SPR_TFMR_TFMR_CORRUPT);
+#endif
+ return true;
+}
+
+static bool chiptod_reset_tb_errors(void)
+{
+ uint64_t tfmr;
+ unsigned long timeout = 0;
+
+ /* Ask for automatic clear of errors */
+ tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
+
+ /* Additionally pHyp sets these (write-1-to-clear ?) */
+ tfmr |= SPR_TFMR_TB_MISSING_SYNC;
+ tfmr |= SPR_TFMR_TB_MISSING_STEP;
+ tfmr |= SPR_TFMR_TB_RESIDUE_ERR;
+ mtspr(SPR_TFMR, tfmr);
+
+ /* We have to write "Clear TB Errors" again */
+ tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
+ mtspr(SPR_TFMR, tfmr);
+
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ /* Don't actually do anything on error for
+ * now ... not much we can do, panic maybe ?
+ */
+ prerror("TB error reset timeout !\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("TB error reset: corrupt TFMR !\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_CLEAR_TB_ERRORS);
+ return true;
+}
+
+static void chiptod_cleanup_thread_tfmr(void)
+{
+ uint64_t tfmr = base_tfmr;
+
+ tfmr |= SPR_TFMR_PURR_PARITY_ERR;
+ tfmr |= SPR_TFMR_SPURR_PARITY_ERR;
+ tfmr |= SPR_TFMR_DEC_PARITY_ERR;
+ tfmr |= SPR_TFMR_TFMR_CORRUPT;
+ tfmr |= SPR_TFMR_PURR_OVERFLOW;
+ tfmr |= SPR_TFMR_SPURR_OVERFLOW;
+ mtspr(SPR_TFMR, tfmr);
+}
+
+static void chiptod_reset_tod_errors(void)
+{
+ uint64_t terr;
+
+ /*
+ * At boot, we clear the errors that the firmware is
+ * supposed to handle. List provided by the pHyp folks.
+ */
+
+ terr = TOD_ERR_CRITC_PARITY;
+ terr |= TOD_ERR_PSS_HAMMING_DISTANCE;
+ terr |= TOD_ERR_DELAY_COMPL_PARITY;
+ terr |= TOD_ERR_CTCR_PARITY;
+ terr |= TOD_ERR_TOD_SYNC_CHECK;
+ terr |= TOD_ERR_TOD_FSM_PARITY;
+ terr |= TOD_ERR_TOD_REGISTER_PARITY;
+
+ if (xscom_writeme(TOD_ERROR, terr)) {
+ prerror("XSCOM error writing TOD_ERROR !\n");
+ /* Not much we can do here ... abort ? */
+ }
+}
+
+static void chiptod_sync_master(void *data)
+{
+ uint64_t initial_tb_value;
+ bool *result = data;
+
+ prlog(PR_DEBUG, "Master sync on CPU PIR 0x%04x...\n",
+ this_cpu()->pir);
+
+ /* Apply base tfmr */
+ mtspr(SPR_TFMR, base_tfmr);
+
+ /* From recipe provided by pHyp folks, reset various errors
+ * before attempting the sync
+ */
+ chiptod_reset_tb_errors();
+
+ /* Cleanup thread tfmr bits */
+ chiptod_cleanup_thread_tfmr();
+
+ /* Reset errors in the chiptod itself */
+ chiptod_reset_tod_errors();
+
+ /* Switch timebase to "Not Set" state */
+ if (!chiptod_mod_tb())
+ goto error;
+ prlog(PR_INSANE, "SYNC MASTER Step 2 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Chip TOD step checkers enable */
+ if (xscom_writeme(TOD_TTYPE_2, PPC_BIT(0))) {
+ prerror("XSCOM error enabling steppers\n");
+ goto error;
+ }
+
+ prlog(PR_INSANE, "SYNC MASTER Step 3 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Chip TOD interrupt check */
+ if (!chiptod_interrupt_check())
+ goto error;
+ prlog(PR_INSANE, "SYNC MASTER Step 4 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Switch local chiptod to "Not Set" state */
+ if (xscom_writeme(TOD_LOAD_TOD_MOD, PPC_BIT(0))) {
+ prerror("XSCOM error sending LOAD_TOD_MOD\n");
+ goto error;
+ }
+
+ /* Switch all remote chiptod to "Not Set" state */
+ if (xscom_writeme(TOD_TTYPE_5, PPC_BIT(0))) {
+ prerror("XSCOM error sending TTYPE_5\n");
+ goto error;
+ }
+
+ /*
+ * Load the master's current timebase value into the Chip TOD
+ * network. This is so we have sane timestamps across the whole
+ * IPL process. The Chip TOD documentation says that the loaded
+ * value needs to be one STEP before a SYNC. In other words,
+ * set the low bits to 0x1ff0.
+ */
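+	/*
+	 * E.g. a hypothetical current timebase of 0x12345678 would be
+	 * loaded as (0x12345678 & ~0x1fff) | 0x1ff0 = 0x12345ff0.
+	 */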
+ initial_tb_value = (mftb() & ~0x1fff) | 0x1ff0;
+
+ /* Chip TOD load initial value */
+ if (xscom_writeme(TOD_CHIPTOD_LOAD_TB, initial_tb_value)) {
+ prerror("XSCOM error setting init TB\n");
+ goto error;
+ }
+
+ prlog(PR_INSANE, "SYNC MASTER Step 5 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ if (!chiptod_poll_running())
+ goto error;
+ prlog(PR_INSANE, "SYNC MASTER Step 6 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Move chiptod value to core TB */
+ if (!chiptod_to_tb())
+ goto error;
+ prlog(PR_INSANE, "SYNC MASTER Step 7 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Send local chip TOD to all chips TOD */
+ if (xscom_writeme(TOD_TTYPE_4, PPC_BIT(0))) {
+ prerror("XSCOM error sending TTYPE_4\n");
+ goto error;
+ }
+
+ /* Check if TB is running */
+ if (!chiptod_check_tb_running())
+ goto error;
+
+ prlog(PR_INSANE, "Master sync completed, TB=%lx\n", mfspr(SPR_TBRL));
+
+ /*
+ * A little delay to make sure the remote chips get up to
+ * speed before we start syncing them.
+ *
+ * We have to do it here because we know our TB is running
+ * while the boot thread TB might not yet.
+ */
+ time_wait_ms(1);
+
+ *result = true;
+ return;
+ error:
+ prerror("Master sync failed! TFMR=0x%016lx, retrying...\n", mfspr(SPR_TFMR));
+ *result = false;
+}
+
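+/*
+ * Slave sync follows the same recipe as the master but skips the TOD
+ * network setup: it resets errors, puts the core TB into "Not Set",
+ * waits for the (already running) chip TOD, checks for TOD interrupts,
+ * then moves the TOD value into the core TB.
+ */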
+static void chiptod_sync_slave(void *data)
+{
+ bool *result = data;
+ bool do_sync = false;
+
+ /* Only get primaries, not threads */
+ if (!this_cpu()->is_secondary)
+ do_sync = true;
+
+ if (chiptod_type == chiptod_p10 && this_cpu()->is_fused_core &&
+ PVR_VERS_MAJ(mfspr(SPR_PVR)) == 2) {
+ /* P10 DD2 fused core workaround, must sync on small cores */
+ if (this_cpu() == this_cpu()->ec_primary)
+ do_sync = true;
+ }
+
+ if (!do_sync) {
+ /* Just cleanup the TFMR */
+ chiptod_cleanup_thread_tfmr();
+ *result = true;
+ return;
+ }
+
+ prlog(PR_DEBUG, "Slave sync on CPU PIR 0x%04x...\n",
+ this_cpu()->pir);
+
+ /* Apply base tfmr */
+ mtspr(SPR_TFMR, base_tfmr);
+
+ /* From recipe provided by pHyp folks, reset various errors
+ * before attempting the sync
+ */
+ chiptod_reset_tb_errors();
+
+ /* Cleanup thread tfmr bits */
+ chiptod_cleanup_thread_tfmr();
+
+ /* Switch timebase to "Not Set" state */
+ if (!chiptod_mod_tb())
+ goto error;
+ prlog(PR_INSANE, "SYNC SLAVE Step 2 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Chip TOD running check */
+ if (!chiptod_poll_running())
+ goto error;
+ prlog(PR_INSANE, "SYNC SLAVE Step 3 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Chip TOD interrupt check */
+ if (!chiptod_interrupt_check())
+ goto error;
+ prlog(PR_INSANE, "SYNC SLAVE Step 4 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Move chiptod value to core TB */
+ if (!chiptod_to_tb())
+ goto error;
+ prlog(PR_INSANE, "SYNC SLAVE Step 5 TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+
+ /* Check if TB is running */
+ if (!chiptod_check_tb_running())
+ goto error;
+
+ prlog(PR_INSANE, "Slave sync completed, TB=%lx\n", mfspr(SPR_TBRL));
+
+ *result = true;
+ return;
+ error:
+ prerror("Slave sync failed ! TFMR=0x%016lx, retrying...\n", mfspr(SPR_TFMR));
+ *result = false;
+}
+
+bool chiptod_wakeup_resync(void)
+{
+ if (chiptod_primary < 0)
+ return false;
+
+ lock(&chiptod_lock);
+
+ /* Apply base tfmr */
+ mtspr(SPR_TFMR, base_tfmr);
+
+ /* From recipe provided by pHyp folks, reset various errors
+ * before attempting the sync
+ */
+ chiptod_reset_tb_errors();
+
+ /* Cleanup thread tfmr bits */
+ chiptod_cleanup_thread_tfmr();
+
+ /* Switch timebase to "Not Set" state */
+ if (!chiptod_mod_tb())
+ goto error;
+
+ /* Move chiptod value to core TB */
+ if (!chiptod_to_tb())
+ goto error;
+
+ unlock(&chiptod_lock);
+
+ return true;
+ error:
+ prerror("Resync failed ! TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+ unlock(&chiptod_lock);
+ return false;
+}
+
+/*
+ * Fixup for p10 TOD bug workaround.
+ *
+ * The TOD may fail to start if all clocks in the system are derived from
+ * the same reference oscillator.
+ *
+ * Avoiding this is pretty easy: Whenever we clear/reset the TOD registers,
+ * make sure to init bits 26:31 of TOD_SLAVE_PATH_CTRL (0x40005) to 0b111111
+ * instead of 0b000000. The value 0 in TOD_S_PATH_CTRL_REG(26:31) must be
+ * avoided, and if it does get written it must be followed up by writing a
+ * value of all ones to clean up the resulting bad state before the (nonzero)
+ * final value can be written.
+ */
+static void fixup_tod_reg_value(struct chiptod_tod_regs *treg_entry)
+{
+ int32_t chip_id = this_cpu()->chip_id;
+
+ if (proc_gen != proc_gen_p10)
+ return;
+
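+ /* Note (illustrative): with IBM bit numbering, PPC_BITMASK(26, 31)
+ * below is 0x0000003F00000000ull. */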
+ if (treg_entry->xscom_addr == TOD_SLAVE_PATH_CTRL)
+ treg_entry->val[chip_id].data |= PPC_BITMASK(26,31);
+}
+
+static int __chiptod_recover_tod_errors(void)
+{
+ uint64_t terr;
+ uint64_t treset = 0;
+ int i, rc = -1;
+ int32_t chip_id = this_cpu()->chip_id;
+
+ /* Read TOD error register */
+ if (xscom_readme(TOD_ERROR, &terr)) {
+ prerror("XSCOM error reading TOD_ERROR reg\n");
+ return 0;
+ }
+ /* Check for sync check error and recover */
+ if ((terr & TOD_ERR_TOD_SYNC_CHECK) ||
+ (terr & TOD_ERR_TOD_FSM_PARITY) ||
+ (terr & TOD_ERR_CTCR_PARITY) ||
+ (terr & TOD_ERR_PSS_HAMMING_DISTANCE) ||
+ (terr & TOD_ERR_DELAY_COMPL_PARITY) ||
+ (terr & TOD_ERR_TOD_REGISTER_PARITY)) {
+ chiptod_reset_tod_errors();
+ rc = 1;
+ }
+
+ /*
+ * Check for TOD control register parity errors and restore those
+ * registers with last saved valid values.
+ */
+ for (i = 0; i < ARRAY_SIZE(chiptod_tod_regs); i++) {
+ if (!(terr & chiptod_tod_regs[i].error_bit))
+ continue;
+
+ /* Check if we have valid last saved register value. */
+ if (!chiptod_tod_regs[i].val[chip_id].valid) {
+ prerror("Failed to restore TOD register: %08llx\n",
+ chiptod_tod_regs[i].xscom_addr);
+ return 0;
+ }
+
+ fixup_tod_reg_value(&chiptod_tod_regs[i]);
+
+ prlog(PR_DEBUG, "Parity error, Restoring TOD register: "
+ "%08llx = %016llx\n",
+ chiptod_tod_regs[i].xscom_addr,
+ chiptod_tod_regs[i].val[chip_id].data);
+ if (xscom_writeme(chiptod_tod_regs[i].xscom_addr,
+ chiptod_tod_regs[i].val[chip_id].data)) {
+ prerror("XSCOM error writing 0x%08llx reg.\n",
+ chiptod_tod_regs[i].xscom_addr);
+ return 0;
+ }
+ treset |= chiptod_tod_regs[i].error_bit;
+ }
+
+ if (treset && (xscom_writeme(TOD_ERROR, treset))) {
+ prerror("XSCOM error writing TOD_ERROR !\n");
+ return 0;
+ }
+ /* We have handled all the TOD errors routed to hypervisor */
+ if (treset)
+ rc = 1;
+ return rc;
+}
+
+int chiptod_recover_tod_errors(void)
+{
+ int rc;
+
+ lock(&chiptod_lock);
+ rc = __chiptod_recover_tod_errors();
+ unlock(&chiptod_lock);
+ return rc;
+}
+
+static int32_t chiptod_get_active_master(void)
+{
+ if (current_topology < 0)
+ return -1;
+
+ if (chiptod_topology_info[current_topology].status ==
+ chiptod_active_master)
+ return chiptod_topology_info[current_topology].id;
+ return -1;
+}
+
+/* Return true if Active master TOD is running. */
+static bool chiptod_master_running(void)
+{
+ int32_t active_master_chip;
+
+ active_master_chip = chiptod_get_active_master();
+ if (active_master_chip != -1) {
+ if (chiptod_running_check(active_master_chip))
+ return true;
+ }
+ return false;
+}
+
+static bool chiptod_set_ttype4_mode(struct proc_chip *chip, bool enable)
+{
+ uint64_t tval;
+
+ /* Sanity check */
+ if (!chip)
+ return false;
+
+ if (xscom_read(chip->id, TOD_PIB_MASTER, &tval)) {
+ prerror("XSCOM error reading PIB_MASTER\n");
+ return false;
+ }
+
+ if (enable) {
+ /*
+ * Enable TTYPE4 send mode. This allows TOD to respond to
+ * TTYPE3 request.
+ */
+ tval |= TOD_PIBM_TTYPE4_SEND_MODE;
+ tval |= TOD_PIBM_TTYPE4_SEND_ENBL;
+ } else {
+ /* Disable TTYPE4 send mode. */
+ tval &= ~TOD_PIBM_TTYPE4_SEND_MODE;
+ tval &= ~TOD_PIBM_TTYPE4_SEND_ENBL;
+ }
+
+ if (xscom_write(chip->id, TOD_PIB_MASTER, tval)) {
+ prerror("XSCOM error writing PIB_MASTER\n");
+ return false;
+ }
+ return true;
+}
+
+/* Stop TODs on slave chips in backup topology. */
+static void chiptod_stop_slave_tods(void)
+{
+ struct proc_chip *chip = NULL;
+ enum chiptod_topology backup_topo;
+ uint64_t terr = 0;
+
+ /* Inject a TOD sync check error on slave TODs to stop them. */
+ terr |= TOD_ERR_TOD_SYNC_CHECK;
+
+ if (current_topology == chiptod_topo_primary)
+ backup_topo = chiptod_topo_secondary;
+ else
+ backup_topo = chiptod_topo_primary;
+
+ for_each_chip(chip) {
+ enum chiptod_chip_role role;
+
+ /* The current chip's TOD is already in the stopped state */
+ if (chip->id == this_cpu()->chip_id)
+ continue;
+
+ role = chiptod_get_chip_role(backup_topo, chip->id);
+
+ /* Skip backup master chip TOD. */
+ if (role == chiptod_chip_role_MDMT)
+ continue;
+
+ if (xscom_write(chip->id, TOD_ERROR_INJECT, terr))
+ prerror("XSCOM error writing TOD_ERROR_INJ\n");
+
+ if (chiptod_running_check(chip->id)) {
+ prlog(PR_DEBUG,
+ "Failed to stop TOD on slave CHIP [%d]\n",
+ chip->id);
+ }
+ }
+}
+
+static bool is_topology_switch_required(void)
+{
+ int32_t active_master_chip;
+ uint64_t tod_error;
+
+ active_master_chip = chiptod_get_active_master();
+
+ /* Check if TOD is running on Active master. */
+ if (chiptod_master_running())
+ return false;
+
+ /*
+ * Check if sync/step network is running.
+ *
+ * If sync/step network is not running on current active topology
+ * then we need switch topology to recover from TOD error.
+ */
+ if (!chiptod_sync_step_check_running(current_topology)) {
+ prlog(PR_DEBUG, "Sync/Step network not running\n");
+ return true;
+ }
+
+ /*
+ * Check if there is a step check error reported on
+ * Active master.
+ */
+ if (xscom_read(active_master_chip, TOD_ERROR, &tod_error)) {
+ prerror("XSCOM error reading TOD_ERROR reg\n");
+ /*
+ * Can't do anything here. But we already found that
+ * sync/step network is running. Hence return false.
+ */
+ return false;
+ }
+
+ if (tod_error & TOD_ERR_MP0_STEP_CHECK) {
+ prlog(PR_DEBUG, "TOD step check error\n");
+ return true;
+ }
+
+ return false;
+}
+
+static bool chiptod_backup_valid(void)
+{
+ enum chiptod_topology backup_topo;
+
+ if (current_topology < 0)
+ return false;
+
+ if (current_topology == chiptod_topo_primary)
+ backup_topo = chiptod_topo_secondary;
+ else
+ backup_topo = chiptod_topo_primary;
+
+ if (chiptod_topology_info[backup_topo].status == chiptod_backup_master)
+ return chiptod_sync_step_check_running(backup_topo);
+
+ return false;
+}
+
+static void chiptod_topology_switch_complete(void)
+{
+ /*
+ * After the topology switch, we may have a non-functional backup
+ * topology, and we won't be able to recover from future TOD errors
+ * that require a topology switch. Someone needs to either fix it or
+ * configure a new functional backup topology.
+ *
+ * Bit 18 of the Pervasive FIR is used to signal that TOD error
+ * analysis needs to be performed. This allows FSP/PRD to
+ * investigate and re-configure new backup topology if required.
+ * Once new backup topology is configured and ready, FSP sends a
+ * mailbox command xE6, s/c 0x06, mod 0, to enable the backup
+ * topology.
+ *
+ * This isn't documented anywhere. This info is provided by FSP
+ * folks.
+ */
+ if (xscom_writeme(LOCAL_CORE_FIR, LFIR_SWITCH_COMPLETE)) {
+ prerror("XSCOM error writing LOCAL_CORE_FIR\n");
+ return;
+ }
+
+ /* Save TOD control registers values. */
+ chiptod_cache_tod_registers();
+
+ prlog(PR_DEBUG, "Topology switch complete\n");
+ print_topology_info();
+}
+
+/*
+ * Sync up TOD with other chips and get TOD in running state.
+ * Check if current topology is active and running. If not, then
+ * trigger a topology switch.
+ */
+static int chiptod_start_tod(void)
+{
+ struct proc_chip *chip = NULL;
+
+ /* Do a topology switch if required. */
+ if (is_topology_switch_required()) {
+ int32_t mchip = chiptod_get_active_master();
+
+ prlog(PR_DEBUG, "Need topology switch to recover\n");
+ /*
+ * There is a failure in StepSync network in current
+ * active topology. TOD is not running on active master chip.
+ * We need to sync with backup master chip TOD.
+ * But before we do that we need to switch topology to make
+ * backup master as the new active master. Once we switch the
+ * topology we can then request TOD value from new active
+ * master. But make sure we move local chiptod to Not Set
+ * before requesting TOD value.
+ *
+ * Before triggering a topology switch, check if backup
+ * is valid and stop all slave TODs in backup topology.
+ */
+ if (!chiptod_backup_valid()) {
+ prerror("Backup master is not enabled. "
+ "Cannot do a topology switch.\n");
+ goto error_out;
+ }
+
+ chiptod_stop_slave_tods();
+
+ if (xscom_write(mchip, TOD_TTYPE_1, PPC_BIT(0))) {
+ prerror("XSCOM error switching primary/secondary\n");
+ goto error_out;
+ }
+
+ /* Update topology info. */
+ current_topology = query_current_topology();
+ chiptod_update_topology(chiptod_topo_primary);
+ chiptod_update_topology(chiptod_topo_secondary);
+
+ /*
+ * We just switched topologies to recover.
+ * Check if new master TOD is running.
+ */
+ if (!chiptod_master_running()) {
+ prerror("TOD is not running on new master.\n");
+ goto error_out;
+ }
+
+ /*
+ * Enable step checkers on all Chip TODs
+ *
+ * During topology switch, step checkers are disabled
+ * on all Chip TODs by default. Enable them.
+ */
+ if (xscom_writeme(TOD_TTYPE_2, PPC_BIT(0))) {
+ prerror("XSCOM error enabling steppers\n");
+ goto error_out;
+ }
+
+ chiptod_topology_switch_complete();
+ }
+
+ if (!chiptod_master_running()) {
+ /*
+ * Active Master TOD is not running, which means it won't
+ * respond to TTYPE_3 request.
+ *
+ * Find a chip that has TOD in running state and configure
+ * it to respond to TTYPE_3 request.
+ */
+ for_each_chip(chip) {
+ if (chiptod_running_check(chip->id)) {
+ if (chiptod_set_ttype4_mode(chip, true))
+ break;
+ }
+ }
+ }
+
+ /* Switch local chiptod to "Not Set" state */
+ if (xscom_writeme(TOD_LOAD_TOD_MOD, PPC_BIT(0))) {
+ prerror("XSCOM error sending LOAD_TOD_MOD\n");
+ goto error_out;
+ }
+
+ /*
+ * Request the current TOD value from another chip.
+ * This will move TOD in running state
+ */
+ if (xscom_writeme(TOD_TTYPE_3, PPC_BIT(0))) {
+ prerror("XSCOM error sending TTYPE_3\n");
+ goto error_out;
+ }
+
+ /* Check if chip TOD is running. */
+ if (!chiptod_poll_running())
+ goto error_out;
+
+ /* Restore the ttype4_mode. */
+ chiptod_set_ttype4_mode(chip, false);
+ return 1;
+
+error_out:
+ chiptod_unrecoverable = true;
+ return 0;
+}
+
+static bool tfmr_recover_tb_errors(uint64_t tfmr)
+{
+ uint64_t tfmr_reset_error;
+ unsigned long timeout = 0;
+
+ /* Ask for automatic clear of errors */
+ tfmr_reset_error = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
+
+ /* Additionally pHyp sets these (write-1-to-clear ?) */
+ if (tfmr & SPR_TFMR_TB_MISSING_SYNC)
+ tfmr_reset_error |= SPR_TFMR_TB_MISSING_SYNC;
+
+ if (tfmr & SPR_TFMR_TB_MISSING_STEP)
+ tfmr_reset_error |= SPR_TFMR_TB_MISSING_STEP;
+
+ /*
+ * Write 1 to bit 45 to clear the TB residue error. The TB register
+ * has already been reset to zero as part of pre-recovery.
+ */
+ if (tfmr & SPR_TFMR_TB_RESIDUE_ERR)
+ tfmr_reset_error |= SPR_TFMR_TB_RESIDUE_ERR;
+
+ if (tfmr & SPR_TFMR_FW_CONTROL_ERR)
+ tfmr_reset_error |= SPR_TFMR_FW_CONTROL_ERR;
+
+ if (tfmr & SPR_TFMR_TBST_CORRUPT)
+ tfmr_reset_error |= SPR_TFMR_TBST_CORRUPT;
+
+ mtspr(SPR_TFMR, tfmr_reset_error);
+
+ /* We have to write "Clear TB Errors" again */
+ tfmr_reset_error = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
+ mtspr(SPR_TFMR, tfmr_reset_error);
+
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("TB error reset timeout !\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("TB error reset: corrupt TFMR !\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_CLEAR_TB_ERRORS);
+ return true;
+}
+
+bool tfmr_recover_local_errors(uint64_t tfmr)
+{
+ uint64_t tfmr_reset_errors = 0;
+
+ if (tfmr & SPR_TFMR_DEC_PARITY_ERR) {
+ /* Set DEC with all ones */
+ mtspr(SPR_DEC, ~0);
+
+ /* set bit 59 to clear TFMR DEC parity error. */
+ tfmr_reset_errors |= SPR_TFMR_DEC_PARITY_ERR;
+ }
+
+ /*
+ * Reset PURR/SPURR to recover. We also need help from KVM
+ * layer to handle this change in PURR/SPURR. That needs
+ * to be handled in kernel KVM layer. For now, to recover just
+ * reset it.
+ */
+ if (tfmr & SPR_TFMR_PURR_PARITY_ERR) {
+ /* set PURR register with sane value or reset it. */
+ mtspr(SPR_PURR, 0);
+
+ /* set bit 57 to clear TFMR PURR parity error. */
+ tfmr_reset_errors |= SPR_TFMR_PURR_PARITY_ERR;
+ }
+
+ if (tfmr & SPR_TFMR_SPURR_PARITY_ERR) {
+ /* set SPURR register with sane value or reset it. */
+ mtspr(SPR_SPURR, 0);
+
+ /* set bit 58 to clear TFMR SPURR parity error. */
+ tfmr_reset_errors |= SPR_TFMR_SPURR_PARITY_ERR;
+ }
+
+ /* Write TFMR twice to clear the error */
+ mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors);
+ mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors);
+
+ /* Get fresh copy of TFMR */
+ tfmr = mfspr(SPR_TFMR);
+
+ /* Check if TFMR non-TB errors still present. */
+ if (tfmr & tfmr_reset_errors) {
+ prerror("TFMR non-TB error recovery failed! "
+ "TFMR=0x%016lx\n", mfspr(SPR_TFMR));
+ return false;
+ }
+ return true;
+}
+
+/*
+ * TFMR parity error recovery as per pc_workbook:
+ * MT(TFMR) bits 11 and 60 are b'1'
+ * MT(HMER) all bits 1 except for bits 4,5
+ */
+bool recover_corrupt_tfmr(void)
+{
+ uint64_t tfmr;
+
+ /* Get the base TFMR */
+ tfmr = base_tfmr;
+
+ /* Set bit 60 to clear TFMR parity error. */
+ tfmr |= SPR_TFMR_TFMR_CORRUPT;
+ mtspr(SPR_TFMR, tfmr);
+
+ /* Write twice to clear the error */
+ mtspr(SPR_TFMR, tfmr);
+
+ /* Get fresh copy of TFMR */
+ tfmr = mfspr(SPR_TFMR);
+
+ /* Check if TFMR parity error still present. */
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("TFMR error recovery: corrupt TFMR !\n");
+ return false;
+ }
+
+ /*
+ * Now that we have sane value in TFMR, check if Timebase machine
+ * state is in ERROR state. If yes, clear TB errors so that
+ * Timebase machine state changes to RESET state. Once in RESET state
+ * then we can then load TB with TOD value.
+ */
+ if (GETFIELD(SPR_TFMR_TBST_ENCODED, tfmr) == TBST_STATE_ERROR) {
+ if (!chiptod_reset_tb_errors())
+ return false;
+ }
+ return true;
+}
+
+void tfmr_cleanup_core_errors(uint64_t tfmr)
+{
+ /* If HDEC is bad, clean it on all threads before we clear the
+ * error condition.
+ */
+ if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
+ mtspr(SPR_HDEC, 0);
+
+ /* If TB is invalid, clean it on all threads as well, it will be
+ * restored after the next rendez-vous
+ */
+ if (!(tfmr & SPR_TFMR_TB_VALID)) {
+ mtspr(SPR_TBWU, 0);
+ mtspr(SPR_TBWL, 0);
+ }
+}
+
+int tfmr_clear_core_errors(uint64_t tfmr)
+{
+ uint64_t tfmr_reset_errors = 0;
+
+ /* return -1 if there is nothing to be fixed. */
+ if (!(tfmr & SPR_TFMR_HDEC_PARITY_ERROR))
+ return -1;
+
+ tfmr_reset_errors |= SPR_TFMR_HDEC_PARITY_ERROR;
+
+ /* Write TFMR twice to clear the error */
+ mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors);
+ mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors);
+
+ return 1;
+}
+
+/*
+ * Recover from TB and TOD errors.
+ * The timebase register is per core, so the first thread that gets a
+ * chance to handle the interrupt fixes the actual TFAC errors and the
+ * remaining threads of the same core see no errors. Return -1 if no
+ * errors are found. The caller (handle_hmi_exception) will not send
+ * an HMI event to the host if the return value is -1.
+ *
+ * Return values:
+ * 0 <= Failed to recover from errors
+ * 1 <= Successfully recovered from errors
+ * -1 <= No errors found. Errors have already been fixed.
+ */
+int chiptod_recover_tb_errors(bool *out_resynced)
+{
+ uint64_t tfmr;
+ int rc = -1;
+
+ *out_resynced = false;
+
+ if (chiptod_primary < 0)
+ return 0;
+
+ lock(&chiptod_lock);
+
+ /*
+ * Return if TOD is unrecoverable.
+ * The previous attempt to recover the TOD failed.
+ */
+ if (chiptod_unrecoverable) {
+ rc = 0;
+ goto error_out;
+ }
+
+ /* Get fresh copy of TFMR */
+ tfmr = mfspr(SPR_TFMR);
+
+ /*
+ * Check for TB errors.
+ * On Sync check error, bit 44 of TFMR is set. Check for it and
+ * clear it.
+ *
+ * In some rare situations all TB errors may already be cleared, but
+ * the TB is stuck waiting for a new value from the TOD, with TFMR
+ * bit 18 set to '1'. In this uncertain state the TB cannot be brought
+ * back into the running state. Get the TB into a clean initial state
+ * by clearing TB errors if TFMR[18] is set.
+ */
+ if ((tfmr & SPR_TFMR_TB_MISSING_STEP) ||
+ (tfmr & SPR_TFMR_TB_RESIDUE_ERR) ||
+ (tfmr & SPR_TFMR_FW_CONTROL_ERR) ||
+ (tfmr & SPR_TFMR_TBST_CORRUPT) ||
+ (tfmr & SPR_TFMR_MOVE_CHIP_TOD_TO_TB) ||
+ (tfmr & SPR_TFMR_TB_MISSING_SYNC)) {
+ if (!tfmr_recover_tb_errors(tfmr)) {
+ rc = 0;
+ goto error_out;
+ }
+ }
+
+ /*
+ * Check for TOD sync check error.
+ * On TOD errors, bit 51 of TFMR is set. If this bit is on then we
+ * need to fetch TOD error register and recover from TOD errors.
+ * Bit 33 of TOD error register indicates sync check error.
+ */
+ if (tfmr & SPR_TFMR_CHIP_TOD_INTERRUPT)
+ rc = __chiptod_recover_tod_errors();
+
+ /* Check if TB is running. If not then we need to get it running. */
+ if (!(tfmr & SPR_TFMR_TB_VALID)) {
+ rc = 0;
+
+ /* Place TB in Notset state. */
+ if (!chiptod_mod_tb())
+ goto error_out;
+
+ /*
+ * Before we move TOD to core TB check if TOD is running.
+ * If not, then get TOD in running state.
+ */
+ if (!chiptod_running_check(this_cpu()->chip_id))
+ if (!chiptod_start_tod())
+ goto error_out;
+
+ /* Move chiptod value to core TB */
+ if (!chiptod_to_tb())
+ goto error_out;
+
+ *out_resynced = true;
+
+ /* We successfully got the TB running. */
+ rc = 1;
+ }
+
+error_out:
+ unlock(&chiptod_lock);
+ return rc;
+}
+
+static int64_t opal_resync_timebase(void)
+{
+ /* Mambo and qemu don't simulate the chiptod */
+ if (chip_quirk(QUIRK_NO_CHIPTOD))
+ return OPAL_SUCCESS;
+
+ if (!chiptod_wakeup_resync()) {
+ prerror("OPAL: Resync timebase failed on CPU 0x%04x\n",
+ this_cpu()->pir);
+ return OPAL_HARDWARE;
+ }
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_RESYNC_TIMEBASE, opal_resync_timebase, 0);
+
+static void chiptod_print_tb(void *data __unused)
+{
+ prlog(PR_DEBUG, "PIR 0x%04x TB=%lx\n", this_cpu()->pir,
+ mfspr(SPR_TBRL));
+}
+
+static bool chiptod_probe(void)
+{
+ struct dt_node *np;
+
+ dt_for_each_compatible(dt_root, np, "ibm,power-chiptod") {
+ uint32_t chip;
+
+ /* Old DT has chip-id in chiptod node, newer only in the
+ * parent xscom bridge
+ */
+ chip = dt_get_chip_id(np);
+
+ if (dt_has_node_property(np, "primary", NULL)) {
+ chiptod_primary = chip;
+ if (dt_node_is_compatible(np, "ibm,power8-chiptod"))
+ chiptod_type = chiptod_p8;
+ if (dt_node_is_compatible(np, "ibm,power9-chiptod"))
+ chiptod_type = chiptod_p9;
+ if (dt_node_is_compatible(np, "ibm,power10-chiptod"))
+ chiptod_type = chiptod_p10;
+ }
+
+ if (dt_has_node_property(np, "secondary", NULL))
+ chiptod_secondary = chip;
+
+ }
+
+ if (chiptod_type == chiptod_unknown) {
+ prerror("Unknown TOD type !\n");
+ return false;
+ }
+
+ return true;
+}
+
+static void chiptod_discover_new_backup(enum chiptod_topology topo)
+{
+ struct proc_chip *chip = NULL;
+
+ /* Scan through available chips to find new backup master chip */
+ for_each_chip(chip) {
+ if (_chiptod_get_chip_status(chip->id) == chiptod_backup_master)
+ break;
+ }
+
+ /* Found new backup master chip. Update the topology info */
+ if (chip) {
+ prlog(PR_DEBUG, "New backup master: CHIP [%d]\n",
+ chip->id);
+
+ if (topo == chiptod_topo_primary)
+ chiptod_primary = chip->id;
+ else
+ chiptod_secondary = chip->id;
+ chiptod_topology_info[topo].id = chip->id;
+ chiptod_update_topology(topo);
+
+ prlog(PR_DEBUG,
+ "Backup topology configuration changed.\n");
+ print_topology_info();
+ }
+
+ /*
+ * Topology configuration has changed. Save TOD control registers
+ * values.
+ */
+ chiptod_cache_tod_registers();
+}
+
+/*
+ * Enable/disable backup topology.
+ * If request is to enable topology, then discover new backup master
+ * chip and update the topology configuration info. If the request is
+ * to disable topology, then mark the current backup topology as disabled.
+ * Return error (-1) if the action is requested on the currently active
+ * topology.
+ *
+ * Return values:
+ * true <= Success
+ * false <= Topology is active and in use.
+ */
+bool chiptod_adjust_topology(enum chiptod_topology topo, bool enable)
+{
+ uint8_t rc = true;
+ /*
+ * The FSP can only request that the currently inactive topology
+ * be disabled or enabled. If the requested topology is currently
+ * the active topology, then fail this request with a -1 (TOD
+ * topology in use) status as return code.
+ */
+ lock(&chiptod_lock);
+ if (topo == current_topology) {
+ rc = false;
+ goto out;
+ }
+
+ if (enable)
+ chiptod_discover_new_backup(topo);
+ else
+ chiptod_topology_info[topo].status = chiptod_backup_disabled;
+out:
+ unlock(&chiptod_lock);
+ return rc;
+}
+
+static void chiptod_init_topology_info(void)
+{
+ /* Find and update current topology in use. */
+ current_topology = query_current_topology();
+
+ /* Initialized primary topology chip config info */
+ chiptod_topology_info[chiptod_topo_primary].id = chiptod_primary;
+ chiptod_update_topology(chiptod_topo_primary);
+
+ /* Initialized secondary topology chip config info */
+ chiptod_topology_info[chiptod_topo_secondary].id = chiptod_secondary;
+ chiptod_update_topology(chiptod_topo_secondary);
+
+ /* Cache TOD control registers values. */
+ chiptod_cache_tod_registers();
+ print_topology_info();
+}
+
+void chiptod_init(void)
+{
+ struct cpu_thread *cpu0, *cpu;
+ bool sres;
+ int i;
+
+ /* Mambo and qemu don't simulate the chiptod */
+ if (chip_quirk(QUIRK_NO_CHIPTOD))
+ return;
+
+ op_display(OP_LOG, OP_MOD_CHIPTOD, 0);
+
+ if (!chiptod_probe()) {
+ prerror("Failed ChipTOD detection !\n");
+ op_display(OP_FATAL, OP_MOD_CHIPTOD, 0);
+ abort();
+ }
+
+ op_display(OP_LOG, OP_MOD_CHIPTOD, 1);
+
+ /* Pick somebody on the primary */
+ cpu0 = find_cpu_by_chip_id(chiptod_primary);
+
+ /* Calculate the base TFMR value used for everybody */
+ chiptod_setup_base_tfmr();
+
+ prlog(PR_DEBUG, "Base TFMR=0x%016llx\n", base_tfmr);
+
+ i = NUM_SYNC_RETRIES;
+ do {
+ /* Schedule master sync */
+ sres = false;
+ cpu_wait_job(cpu_queue_job(cpu0, "chiptod_sync_master",
+ chiptod_sync_master, &sres), true);
+ } while (!sres && i--);
+
+ if (!sres) {
+ op_display(OP_FATAL, OP_MOD_CHIPTOD, 2);
+ abort();
+ }
+
+ op_display(OP_LOG, OP_MOD_CHIPTOD, 2);
+
+ /* Schedule slave sync */
+ for_each_available_cpu(cpu) {
+ /* Skip master */
+ if (cpu == cpu0)
+ continue;
+
+ i = NUM_SYNC_RETRIES;
+ do {
+ /* Queue job */
+ sres = false;
+ cpu_wait_job(cpu_queue_job(cpu, "chiptod_sync_slave",
+ chiptod_sync_slave, &sres),
+ true);
+ } while (!sres && i--);
+
+ if (!sres) {
+ op_display(OP_WARN, OP_MOD_CHIPTOD, 3|(cpu->pir << 8));
+ prerror("CHIPTOD: Failed to sync PIR 0x%04x\n",
+ cpu->pir);
+
+ /* Disable threads */
+ cpu_disable_all_threads(cpu);
+ }
+ op_display(OP_LOG, OP_MOD_CHIPTOD, 3|(cpu->pir << 8));
+ }
+
+ /* Display TBs */
+ for_each_available_cpu(cpu) {
+ /* Only do primaries, not threads */
+ if (cpu->is_secondary)
+ continue;
+ cpu_wait_job(cpu_queue_job(cpu, "chiptod_print_tb",
+ chiptod_print_tb, NULL), true);
+ }
+
+ chiptod_init_topology_info();
+ op_display(OP_LOG, OP_MOD_CHIPTOD, 4);
+}
+
+/* CAPP timebase sync */
+
+static bool chiptod_capp_reset_tb_errors(uint32_t chip_id,
+ uint32_t tfmr_addr,
+ uint32_t offset)
+{
+ uint64_t tfmr;
+ unsigned long timeout = 0;
+
+ /* Ask for automatic clear of errors */
+ tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
+
+ /* Additionally pHyp sets these (write-1-to-clear ?) */
+ tfmr |= SPR_TFMR_TB_MISSING_SYNC;
+ tfmr |= SPR_TFMR_TB_MISSING_STEP;
+ tfmr |= SPR_TFMR_TB_RESIDUE_ERR;
+ tfmr |= SPR_TFMR_TBST_CORRUPT;
+ tfmr |= SPR_TFMR_TFMR_CORRUPT;
+
+ /* Write CAPP TFMR */
+ xscom_write(chip_id, tfmr_addr + offset, tfmr);
+
+ /* We have to write "Clear TB Errors" again */
+ tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
+ /* Write CAPP TFMR */
+ xscom_write(chip_id, tfmr_addr + offset, tfmr);
+
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("CAPP: TB error reset timeout !\n");
+ return false;
+ }
+ /* Read CAPP TFMR */
+ xscom_read(chip_id, tfmr_addr + offset, &tfmr);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("CAPP: TB error reset: corrupt TFMR!\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_CLEAR_TB_ERRORS);
+ return true;
+}
+
+static bool chiptod_capp_mod_tb(uint32_t chip_id, uint32_t tfmr_addr,
+ uint32_t offset)
+{
+ uint64_t timeout = 0;
+ uint64_t tfmr;
+
+ /* Switch CAPP timebase to "Not Set" state */
+ tfmr = base_tfmr | SPR_TFMR_LOAD_TOD_MOD;
+ xscom_write(chip_id, tfmr_addr + offset, tfmr);
+ do {
+ if (++timeout >= (TIMEOUT_LOOPS*2)) {
+ prerror("CAPP: TB \"Not Set\" timeout\n");
+ return false;
+ }
+ xscom_read(chip_id, tfmr_addr + offset, &tfmr);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("CAPP: TB \"Not Set\" TFMR corrupt\n");
+ return false;
+ }
+ if (GETFIELD(SPR_TFMR_TBST_ENCODED, tfmr) == 9) {
+ prerror("CAPP: TB \"Not Set\" TOD in error state\n");
+ return false;
+ }
+ } while (tfmr & SPR_TFMR_LOAD_TOD_MOD);
+
+ return true;
+}
+
+static bool chiptod_wait_for_chip_sync(void)
+{
+ uint64_t tfmr;
+ uint64_t timeout = 0;
+
+ /* Read core TFMR, mask bit 42, write core TFMR back */
+ tfmr = mfspr(SPR_TFMR);
+ tfmr &= ~SPR_TFMR_TB_SYNC_OCCURED;
+ mtspr(SPR_TFMR, tfmr);
+
+ /* Read core TFMR until the TB sync occurred */
+ do {
+ if (++timeout >= TIMEOUT_LOOPS) {
+ prerror("No sync pulses\n");
+ return false;
+ }
+ tfmr = mfspr(SPR_TFMR);
+ } while (!(tfmr & SPR_TFMR_TB_SYNC_OCCURED));
+ return true;
+}
+
+static bool chiptod_capp_check_tb_running(uint32_t chip_id,
+ uint32_t tfmr_addr,
+ uint32_t offset)
+{
+ uint64_t tfmr;
+ uint64_t timeout = 0;
+
+ /* Read CAPP TFMR until TB becomes valid */
+ do {
+ if (++timeout >= (TIMEOUT_LOOPS*2)) {
+ prerror("CAPP: TB Invalid!\n");
+ return false;
+ }
+ xscom_read(chip_id, tfmr_addr + offset, &tfmr);
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ prerror("CAPP: TFMR corrupt!\n");
+ return false;
+ }
+ } while (!(tfmr & SPR_TFMR_TB_VALID));
+ return true;
+}
+
+bool chiptod_capp_timebase_sync(unsigned int chip_id, uint32_t tfmr_addr,
+ uint32_t tb_addr, uint32_t offset)
+{
+ uint64_t tfmr;
+ uint64_t capp_tb;
+ int64_t delta;
+ unsigned int retry = 0;
+
+ /* Set CAPP TFMR to base tfmr value */
+ xscom_write(chip_id, tfmr_addr + offset, base_tfmr);
+
+ /* Reset CAPP TB errors before attempting the sync */
+ if (!chiptod_capp_reset_tb_errors(chip_id, tfmr_addr, offset))
+ return false;
+
+ /* Switch CAPP TB to "Not Set" state */
+ if (!chiptod_capp_mod_tb(chip_id, tfmr_addr, offset))
+ return false;
+
+ /* Sync CAPP TB with core TB, retry while difference > 16usecs */
+ do {
+ if (retry++ > 5) {
+ prerror("CAPP: TB sync: giving up!\n");
+ return false;
+ }
+
+ /* Make CAPP ready to get the TB, wait for chip sync */
+ tfmr = base_tfmr | SPR_TFMR_MOVE_CHIP_TOD_TO_TB;
+ xscom_write(chip_id, tfmr_addr + offset, tfmr);
+ if (!chiptod_wait_for_chip_sync())
+ return false;
+
+ /* Set CAPP TB from core TB */
+ xscom_write(chip_id, tb_addr + offset, mftb());
+
+ /* Wait for CAPP TFMR tb_valid bit */
+ if (!chiptod_capp_check_tb_running(chip_id, tfmr_addr, offset))
+ return false;
+
+ /* Read CAPP TB, read core TB, compare */
+ xscom_read(chip_id, tb_addr + offset, &capp_tb);
+ delta = mftb() - capp_tb;
+ if (delta < 0)
+ delta = -delta;
+ } while (tb_to_usecs(delta) > 16);
+
+ return true;
+}
diff --git a/roms/skiboot/hw/dio-p9.c b/roms/skiboot/hw/dio-p9.c
new file mode 100644
index 000000000..5153f6eeb
--- /dev/null
+++ b/roms/skiboot/hw/dio-p9.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2019 IBM Corp. */
+
+#define pr_fmt(fmt) "DIO: " fmt
+
+#include <chip.h>
+#include <dio-p9.h>
+#include <opal.h>
+#include <xscom.h>
+#include <xscom-p9-regs.h>
+
+void p9_dio_init(void)
+{
+ struct dt_node *xn;
+ struct proc_chip *chip;
+ struct p9_dio *dio;
+
+ if (proc_gen < proc_gen_p9)
+ return;
+
+ dt_for_each_compatible(dt_root, xn, "ibm,xscom") {
+ dio = zalloc(sizeof(struct p9_dio));
+ assert(dio);
+ chip = get_chip(dt_get_chip_id(xn));
+ assert(chip);
+ chip->dio = dio;
+ }
+}
+
+int dio_interrupt_register(struct proc_chip *chip,
+ int port, dio_interrupt_callback callback)
+{
+ u64 val;
+ int rc;
+
+ assert(chip);
+ assert(chip->dio);
+
+ if (port < 0 || port >= NUM_OF_P9_DIO_PORTS)
+ return OPAL_PARAMETER;
+
+ if (chip->dio->callbacks[port]) /* This port already has a callback */
+ return OPAL_PARAMETER;
+
+ rc = xscom_read(chip->id, P9_GPIO_INTERRUPT_ENABLE, &val);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "XSCOM error %d reading reg 0x%llx\n",
+ rc, P9_GPIO_INTERRUPT_ENABLE);
+ return OPAL_HARDWARE;
+ }
+
+ val |= PPC_BIT(port);
+ rc = xscom_write(chip->id, P9_GPIO_INTERRUPT_ENABLE, val);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "XSCOM error %d writing reg 0x%llx\n",
+ rc, P9_GPIO_INTERRUPT_ENABLE);
+ return OPAL_HARDWARE;
+ }
+
+ chip->dio->callbacks[port] = callback;
+
+ return OPAL_SUCCESS;
+}
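+/*
+ * Usage sketch (illustrative, handler name hypothetical): a driver
+ * calls dio_interrupt_register(chip, port, my_handler) to arm a GPIO
+ * port and dio_interrupt_deregister(chip, port, my_handler) to
+ * disarm it again.
+ */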
+
+int dio_interrupt_deregister(struct proc_chip* chip,
+ int port, dio_interrupt_callback callback)
+{
+ u64 val;
+ int rc;
+
+ assert(chip);
+ assert(chip->dio);
+
+ if (port < 0 || port >= NUM_OF_P9_DIO_PORTS)
+ return OPAL_PARAMETER;
+
+ if (chip->dio->callbacks[port] != callback)
+ return OPAL_PARAMETER;
+
+ rc = xscom_read(chip->id, P9_GPIO_INTERRUPT_ENABLE, &val);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "XSCOM error %d reading reg 0x%llx\n",
+ rc, P9_GPIO_INTERRUPT_ENABLE);
+ return OPAL_HARDWARE;
+ }
+
+ val &= ~PPC_BIT(port);
+ rc = xscom_write(chip->id, P9_GPIO_INTERRUPT_ENABLE, val);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "XSCOM error %d writing reg 0x%llx\n",
+ rc, P9_GPIO_INTERRUPT_ENABLE);
+ return OPAL_HARDWARE;
+ }
+
+ chip->dio->callbacks[port] = NULL;
+
+ return OPAL_SUCCESS;
+}
+
+void dio_interrupt_handler(uint32_t chip_id)
+{
+ struct proc_chip *chip;
+ u64 val;
+ int rc;
+ int i;
+
+ chip = get_chip(chip_id);
+ if (chip == NULL || chip->dio == NULL)
+ return;
+
+ rc = xscom_read(chip->id, P9_GPIO_INTERRUPT_STATUS, &val);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "XSCOM error %d reading reg 0x%llx\n",
+ rc, P9_GPIO_INTERRUPT_STATUS);
+ return;
+ }
+
+ for (i = 0; i < NUM_OF_P9_DIO_PORTS; ++i) {
+ if (val & PPC_BIT(i)) {
+ if (chip->dio->callbacks[i])
+ chip->dio->callbacks[i](chip);
+ else
+ prlog(PR_ERR,
+ "DIO interrupt triggered on chip 0x%x"
+ " port %d but no handler\n",
+ chip->id, i);
+ /* Write 1 to clear the interrupt status */
+ xscom_write(chip->id, P9_GPIO_INTERRUPT_CONDITION,
+ val & PPC_BIT(i));
+ }
+ }
+}
diff --git a/roms/skiboot/hw/dts.c b/roms/skiboot/hw/dts.c
new file mode 100644
index 000000000..d8831e4d3
--- /dev/null
+++ b/roms/skiboot/hw/dts.c
@@ -0,0 +1,416 @@
+// SPDX-License-Identifier: Apache-2.0
+/* Copyright 2013-2019 IBM Corp. */
+
+#include <xscom.h>
+#include <chip.h>
+#include <sensor.h>
+#include <dts.h>
+#include <skiboot.h>
+#include <opal-api.h>
+#include <opal-msg.h>
+#include <timer.h>
+#include <timebase.h>
+
+struct dts {
+ uint8_t valid;
+ uint8_t trip;
+ int16_t temp;
+};
+
+/*
+ * Attributes for the core temperature sensor
+ */
+enum {
+ SENSOR_DTS_ATTR_TEMP_MAX,
+ SENSOR_DTS_ATTR_TEMP_TRIP
+};
+
+
+/* Therm mac result masking for DTS (result(0:15)
+ * 0:3 - 0x0
+ * 4:11 - Temperature in degrees C
+ * 12:13 - trip bits: 00 - no trip; 01 - warning; 10 - critical; 11 - fatal
+ * 14 - spare
+ * 15 - valid
+ */
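+/* Example (illustrative): raw 0x0291 decodes to valid = 1, trip = 0,
+ * temp = 0x29 (41 degrees C). */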
+static void dts_decode_one_dts(uint16_t raw, struct dts *dts)
+{
+ /*
+ * The value is both signed and unsigned :-) 0xff could be
+ * either 255C or -1C, so for now we treat this as unsigned
+ * which is sufficient for our purpose. We could try to be
+ * a bit smarter and treat it as signed for values between
+ * -10 and 0 and unsigned to 239 or something like that...
+ */
+ dts->valid = raw & 1;
+ if (dts->valid) {
+ dts->temp = (raw >> 4) & 0xff;
+ dts->trip = (raw >> 2) & 0x3;
+ } else {
+ dts->temp = 0;
+ dts->trip = 0;
+ }
+}
+
+static void dts_keep_max(struct dts *temps, int n, struct dts *dts)
+{
+ int i;
+
+ for (i = 0; i < n; i++) {
+ int16_t t = temps[i].temp;
+
+ if (!temps[i].valid)
+ continue;
+
+ if (t > dts->temp)
+ dts->temp = t;
+
+ dts->valid++;
+ dts->trip |= temps[i].trip;
+ }
+}
+
+/* Per core Digital Thermal Sensors */
+#define EX_THERM_DTS_RESULT0 0x10050000
+#define EX_THERM_DTS_RESULT1 0x10050001
+
+/* Different sensor locations */
+#define P8_CT_ZONE_LSU 0
+#define P8_CT_ZONE_ISU 1
+#define P8_CT_ZONE_FXU 2
+#define P8_CT_ZONE_L3C 3
+#define P8_CT_ZONES 4
+
+/*
+ * Returns the temperature as the max of all 4 zones and a global trip
+ * attribute.
+ */
+static int dts_read_core_temp_p8(uint32_t pir, struct dts *dts)
+{
+ int32_t chip_id = pir_to_chip_id(pir);
+ int32_t core = pir_to_core_id(pir);
+ uint64_t dts0, dts1;
+ struct dts temps[P8_CT_ZONES];
+ int rc;
+
+ rc = xscom_read(chip_id, XSCOM_ADDR_P8_EX(core, EX_THERM_DTS_RESULT0),
+ &dts0);
+ if (rc)
+ return rc;
+
+ rc = xscom_read(chip_id, XSCOM_ADDR_P8_EX(core, EX_THERM_DTS_RESULT1),
+ &dts1);
+ if (rc)
+ return rc;
+
+ dts_decode_one_dts(dts0 >> 48, &temps[P8_CT_ZONE_LSU]);
+ dts_decode_one_dts(dts0 >> 32, &temps[P8_CT_ZONE_ISU]);
+ dts_decode_one_dts(dts0 >> 16, &temps[P8_CT_ZONE_FXU]);
+ dts_decode_one_dts(dts1 >> 48, &temps[P8_CT_ZONE_L3C]);
+
+ dts_keep_max(temps, P8_CT_ZONES, dts);
+
+ prlog(PR_TRACE, "DTS: Chip %x Core %x temp:%dC trip:%x\n",
+ chip_id, core, dts->temp, dts->trip);
+
+ /*
+ * FIXME: The trip bits are always set ?! Just discard
+ * them for the moment until we understand why.
+ */
+ dts->trip = 0;
+ return 0;
+}
+
+/* Per core Digital Thermal Sensors */
+#define EC_THERM_P9_DTS_RESULT0 0x050000
+
+/* Different sensor locations */
+#define P9_CORE_DTS0 0
+#define P9_CORE_DTS1 1
+#define P9_CORE_ZONES 2
+
+/*
+ * Returns the temperature as the max of all zones and a global trip
+ * attribute.
+ */
+static int dts_read_core_temp_p9(uint32_t pir, struct dts *dts)
+{
+ int32_t chip_id = pir_to_chip_id(pir);
+ int32_t core = pir_to_core_id(pir);
+ uint64_t dts0;
+ struct dts temps[P9_CORE_ZONES];
+ int rc;
+
+ rc = xscom_read(chip_id, XSCOM_ADDR_P9_EC(core, EC_THERM_P9_DTS_RESULT0),
+ &dts0);
+ if (rc)
+ return rc;
+
+ dts_decode_one_dts(dts0 >> 48, &temps[P9_CORE_DTS0]);
+ dts_decode_one_dts(dts0 >> 32, &temps[P9_CORE_DTS1]);
+
+ dts_keep_max(temps, P9_CORE_ZONES, dts);
+
+ prlog(PR_TRACE, "DTS: Chip %x Core %x temp:%dC trip:%x\n",
+ chip_id, core, dts->temp, dts->trip);
+
+ /*
+ * FIXME: The trip bits are always set ?! Just discard
+ * them for the moment until we understand why.
+ */
+ dts->trip = 0;
+ return 0;
+}
+
+static void dts_async_read_temp(struct timer *t __unused, void *data,
+ u64 now __unused)
+{
+ struct dts dts = {0};
+ int rc, swkup_rc;
+ struct cpu_thread *cpu = data;
+
+ swkup_rc = dctl_set_special_wakeup(cpu);
+
+ if (proc_gen == proc_gen_p9)
+ rc = dts_read_core_temp_p9(cpu->pir, &dts);
+ else /* (proc_gen == proc_gen_p10) */
+ rc = OPAL_UNSUPPORTED; /* XXX P10 */
+
+ if (!rc) {
+ if (cpu->sensor_attr == SENSOR_DTS_ATTR_TEMP_MAX)
+ *cpu->sensor_data = cpu_to_be64(dts.temp);
+ else if (cpu->sensor_attr == SENSOR_DTS_ATTR_TEMP_TRIP)
+ *cpu->sensor_data = cpu_to_be64(dts.trip);
+ }
+
+ if (!swkup_rc)
+ dctl_clear_special_wakeup(cpu);
+
+ check_sensor_read(cpu->token);
+ rc = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(cpu->token),
+ cpu_to_be64(rc));
+ if (rc)
+ prerror("Failed to queue async message\n");
+
+ cpu->dts_read_in_progress = false;
+}
+
+static int dts_read_core_temp(u32 pir, struct dts *dts, u8 attr,
+ int token, __be64 *sensor_data)
+{
+ struct cpu_thread *cpu;
+ int rc;
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ rc = dts_read_core_temp_p8(pir, dts);
+ break;
+ case proc_gen_p9: /* Asynchronous read */
+ cpu = find_cpu_by_pir(pir);
+ if (!cpu)
+ return OPAL_PARAMETER;
+ lock(&cpu->dts_lock);
+ if (cpu->dts_read_in_progress) {
+ unlock(&cpu->dts_lock);
+ return OPAL_BUSY;
+ }
+ cpu->dts_read_in_progress = true;
+ cpu->sensor_attr = attr;
+ cpu->sensor_data = sensor_data;
+ cpu->token = token;
+ schedule_timer(&cpu->dts_timer, 0);
+ rc = OPAL_ASYNC_COMPLETION;
+ unlock(&cpu->dts_lock);
+ break;
+ case proc_gen_p10: /* XXX P10 */
+ default:
+ rc = OPAL_UNSUPPORTED;
+ }
+ return rc;
+}
+
+/* Per memory controller Digital Thermal Sensors */
+#define THERM_MEM_DTS_RESULT0 0x2050000
+
+/* Different sensor locations */
+#define P8_MEM_DTS0 0
+#define P8_MEM_DTS1 1
+#define P8_MEM_ZONES 2
+
+static int dts_read_mem_temp(uint32_t chip_id, struct dts *dts)
+{
+ uint64_t dts0;
+ struct dts temps[P8_MEM_ZONES];
+ int i;
+ int rc;
+
+ rc = xscom_read(chip_id, THERM_MEM_DTS_RESULT0, &dts0);
+ if (rc)
+ return rc;
+
+ dts_decode_one_dts(dts0 >> 48, &temps[P8_MEM_DTS0]);
+ dts_decode_one_dts(dts0 >> 32, &temps[P8_MEM_DTS1]);
+
+ for (i = 0; i < P8_MEM_ZONES; i++) {
+ int16_t t = temps[i].temp;
+
+ if (!temps[i].valid)
+ continue;
+
+ /* keep the max temperature across the sensors */
+ if (t > dts->temp)
+ dts->temp = t;
+
+ dts->valid++;
+ dts->trip |= temps[i].trip;
+ }
+
+ prlog(PR_TRACE, "DTS: Chip %x temp:%dC trip:%x\n",
+ chip_id, dts->temp, dts->trip);
+
+ /*
+ * FIXME: The trip bits are always set ?! Just discard
+ * them for the moment until we understand why.
+ */
+ dts->trip = 0;
+ return 0;
+}
+
+/*
+ * DTS sensor class ids. Only one for the moment: the core
+ * temperature.
+ */
+enum sensor_dts_class {
+ SENSOR_DTS_CORE_TEMP,
+ SENSOR_DTS_MEM_TEMP,
+ /* To be continued */
+};
+
+/*
+ * Extract the centaur chip id which was truncated to fit in the
+ * resource identifier field of the sensor handler
+ */
+#define centaur_get_id(rid) (0x80000000 | ((rid) & 0x3ff))
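+/* e.g. (illustrative) rid 0x005 maps back to centaur chip id 0x80000005. */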
+
+int64_t dts_sensor_read(u32 sensor_hndl, int token, __be64 *sensor_data)
+{
+ uint8_t attr = sensor_get_attr(sensor_hndl);
+ uint32_t rid = sensor_get_rid(sensor_hndl);
+ struct dts dts = {0};
+ int64_t rc;
+
+ if (attr > SENSOR_DTS_ATTR_TEMP_TRIP)
+ return OPAL_PARAMETER;
+
+ memset(&dts, 0, sizeof(struct dts));
+
+ switch (sensor_get_frc(sensor_hndl)) {
+ case SENSOR_DTS_CORE_TEMP:
+ rc = dts_read_core_temp(rid, &dts, attr, token, sensor_data);
+ break;
+ case SENSOR_DTS_MEM_TEMP:
+ rc = dts_read_mem_temp(centaur_get_id(rid), &dts);
+ break;
+ default:
+ rc = OPAL_PARAMETER;
+ break;
+ }
+ if (rc)
+ return rc;
+
+ if (attr == SENSOR_DTS_ATTR_TEMP_MAX)
+ *sensor_data = cpu_to_be64(dts.temp);
+ else if (attr == SENSOR_DTS_ATTR_TEMP_TRIP)
+ *sensor_data = cpu_to_be64(dts.trip);
+
+ return 0;
+}
+
+/*
+ * We only have two bytes for the resource identifier in the sensor
+ * handler. Let's truncate the centaur chip id to squeeze it in.
+ *
+ * Centaur chip IDs are using the XSCOM "partID" encoding described in
+ * xscom.h. recap:
+ *
+ * 0b1000.0000.0000.0000.0000.00NN.NCCC.MMMM
+ * N=Node, C=Chip, M=Memory Channel
+ */
+#define centaur_make_id(cen_id, dimm_id) \
+ (((chip_id) & 0x3ff) | ((dimm_id) << 10))
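+/* e.g. (illustrative) centaur chip id 0x80000005 with dimm_id 0
+ * truncates to resource id 0x005. */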
+
+#define core_handler(core_id, attr_id) \
+ sensor_make_handler(SENSOR_DTS, SENSOR_DTS_CORE_TEMP, \
+ core_id, attr_id)
+
+#define cen_handler(cen_id, attr_id) \
+ sensor_make_handler(SENSOR_DTS, SENSOR_DTS_MEM_TEMP, \
+ centaur_make_id(chip_id, 0), attr_id)
+
+bool dts_sensor_create_nodes(struct dt_node *sensors)
+{
+ struct proc_chip *chip;
+ struct dt_node *cn;
+ char name[64];
+
+ /* build the device tree nodes :
+ *
+ * sensors/core-temp@pir
+ *
+ * The core is identified by its PIR, which is stored in the resource
+ * number of the sensor handler.
+ */
+ for_each_chip(chip) {
+ struct cpu_thread *c;
+
+ for_each_available_core_in_chip(c, chip->id) {
+ struct dt_node *node;
+ uint32_t handler;
+
+ snprintf(name, sizeof(name), "core-temp@%x", c->pir);
+
+ handler = core_handler(c->pir, SENSOR_DTS_ATTR_TEMP_MAX);
+ node = dt_new(sensors, name);
+ dt_add_property_string(node, "compatible",
+ "ibm,opal-sensor");
+ dt_add_property_cells(node, "sensor-data", handler);
+ handler = core_handler(c->pir, SENSOR_DTS_ATTR_TEMP_TRIP);
+ dt_add_property_cells(node, "sensor-status", handler);
+ dt_add_property_string(node, "sensor-type", "temp");
+ dt_add_property_cells(node, "ibm,pir", c->pir);
+ dt_add_property_cells(node, "reg", handler);
+ dt_add_property_string(node, "label", "Core");
+ init_timer(&c->dts_timer, dts_async_read_temp, c);
+ c->dts_read_in_progress = false;
+ }
+ }
+
+ /*
+ * sensors/mem-temp@chip for Centaurs
+ */
+ dt_for_each_compatible(dt_root, cn, "ibm,centaur") {
+ uint32_t chip_id;
+ struct dt_node *node;
+ uint32_t handler;
+
+ chip_id = dt_prop_get_u32(cn, "ibm,chip-id");
+
+ snprintf(name, sizeof(name), "mem-temp@%x", chip_id);
+
+ handler = cen_handler(chip_id, SENSOR_DTS_ATTR_TEMP_MAX);
+ node = dt_new(sensors, name);
+ dt_add_property_string(node, "compatible",
+ "ibm,opal-sensor");
+ dt_add_property_cells(node, "sensor-data", handler);
+
+ handler = cen_handler(chip_id, SENSOR_DTS_ATTR_TEMP_TRIP);
+ dt_add_property_cells(node, "sensor-status", handler);
+ dt_add_property_string(node, "sensor-type", "temp");
+ dt_add_property_cells(node, "ibm,chip-id", chip_id);
+ dt_add_property_cells(node, "reg", handler);
+ dt_add_property_string(node, "label", "Centaur");
+ }
+
+ return true;
+}
diff --git a/roms/skiboot/hw/fake-nvram.c b/roms/skiboot/hw/fake-nvram.c
new file mode 100644
index 000000000..44adde4a3
--- /dev/null
+++ b/roms/skiboot/hw/fake-nvram.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2017 IBM Corp. */
+
+#include <skiboot.h>
+#include <opal.h>
+#include <mem_region.h>
+#include <lock.h>
+
+static struct mem_region *nvram_region;
+static struct lock fake_nvram_lock = LOCK_UNLOCKED;
+
+int fake_nvram_info(uint32_t *total_size)
+{
+ nvram_region = find_mem_region("ibm,fake-nvram");
+
+ if (!nvram_region)
+ return OPAL_HARDWARE;
+
+ *total_size = nvram_region->len;
+
+ return OPAL_SUCCESS;
+}
+
+int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
+{
+ if (!nvram_region)
+ return -ENODEV;
+
+ lock(&fake_nvram_lock);
+ memcpy(dst, (void *) (nvram_region->start + src), len);
+ unlock(&fake_nvram_lock);
+
+ nvram_read_complete(true);
+
+ return 0;
+}
+
+int fake_nvram_write(uint32_t offset, void *src, uint32_t size)
+{
+ if (!nvram_region)
+ return OPAL_HARDWARE;
+
+ lock(&fake_nvram_lock);
+ memcpy((void *) (nvram_region->start + offset), src, size);
+ unlock(&fake_nvram_lock);
+
+ return 0;
+}
+
diff --git a/roms/skiboot/hw/fake-rtc.c b/roms/skiboot/hw/fake-rtc.c
new file mode 100644
index 000000000..3f083050c
--- /dev/null
+++ b/roms/skiboot/hw/fake-rtc.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2017 IBM Corp. */
+
+#include <skiboot.h>
+#include <opal.h>
+#include <mem_region.h>
+#include <device.h>
+#include <timebase.h>
+#include <time-utils.h>
+#include <lock.h>
+
+/* timebase when tm_offset was assigned */
+static unsigned long tb_synctime;
+
+/*
+ * Absolute time that was last assigned.
+ * Current rtc value is calculated from this.
+ */
+static struct tm tm_offset;
+
+/* protects tm_offset & tb_synctime */
+static struct lock emulation_lock;
+
+static int64_t fake_rtc_write(uint32_t ymd, uint64_t hmsm)
+{
+
+ lock(&emulation_lock);
+
+ datetime_to_tm(ymd, hmsm, &tm_offset);
+ tb_synctime = mftb();
+
+ unlock(&emulation_lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t fake_rtc_read(__be32 *__ymd, __be64 *__hmsm)
+{
+
+ time_t sec;
+ struct tm tm_calculated;
+ uint32_t ymd;
+ uint64_t hmsm;
+
+ if (!__ymd || !__hmsm)
+ return OPAL_PARAMETER;
+
+ /* Compute the emulated clock value */
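+ /* Emulated time = last written offset + timebase ticks elapsed since
+ * that write, converted to seconds. */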
+ lock(&emulation_lock);
+
+ sec = tb_to_secs(mftb() - tb_synctime) + mktime(&tm_offset);
+ gmtime_r(&sec, &tm_calculated);
+ tm_to_datetime(&tm_calculated, &ymd, &hmsm);
+
+ unlock(&emulation_lock);
+
+ *__ymd = cpu_to_be32(ymd);
+ *__hmsm = cpu_to_be64(hmsm);
+
+ return OPAL_SUCCESS;
+}
+
+void fake_rtc_init(void)
+{
+ struct mem_region *rtc_region = NULL;
+ uint32_t *rtc = NULL, *fake_ymd;
+ uint64_t *fake_hmsm;
+ struct dt_node *np;
+
+ /* Read initial values from reserved memory */
+ rtc_region = find_mem_region("ibm,fake-rtc");
+
+ /* Should we register anyway? */
+ if (!rtc_region) {
+ prlog(PR_TRACE, "No initial RTC value found\n");
+ return;
+ }
+
+ init_lock(&emulation_lock);
+
+ /* Fetch the initial rtc values */
+ rtc = (uint32_t *) rtc_region->start;
+
+ fake_ymd = rtc;
+ fake_hmsm = ((uint64_t *) &rtc[1]);
+
+ fake_rtc_write(*fake_ymd, *fake_hmsm);
+
+ /* Register opal calls */
+ opal_register(OPAL_RTC_READ, fake_rtc_read, 2);
+ opal_register(OPAL_RTC_WRITE, fake_rtc_write, 2);
+
+ /* add the fake rtc dt node */
+ np = dt_new(opal_node, "rtc");
+ dt_add_property_strings(np, "compatible", "ibm,opal-rtc");
+
+ prlog(PR_TRACE, "Init fake RTC to Date:%d-%d-%d Time:%d-%d-%d\n",
+ tm_offset.tm_mon, tm_offset.tm_mday, tm_offset.tm_year,
+ tm_offset.tm_hour, tm_offset.tm_min, tm_offset.tm_sec);
+}
diff --git a/roms/skiboot/hw/fsi-master.c b/roms/skiboot/hw/fsi-master.c
new file mode 100644
index 000000000..410542a19
--- /dev/null
+++ b/roms/skiboot/hw/fsi-master.c
@@ -0,0 +1,675 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2017 IBM Corp. */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <lock.h>
+#include <timebase.h>
+#include <chip.h>
+#include <fsi-master.h>
+
+/*
+ * FSI Masters sit on OPB busses behind PIB2OPB bridges
+ *
+ * There are two cMFSI behind two different bridges at
+ * different XSCOM addresses. For now we don't have them in
+ * the device-tree so we hard code the address
+ */
+#define PIB2OPB_MFSI0_ADDR 0x20000
+#define PIB2OPB_MFSI1_ADDR 0x30000
+
+/*
+ * Bridge registers on XSCOM that allow generation
+ * of OPB cycles
+ */
+#define PIB2OPB_REG_CMD 0x0
+#define OPB_CMD_WRITE 0x80000000
+#define OPB_CMD_READ 0x00000000
+#define OPB_CMD_8BIT 0x00000000
+#define OPB_CMD_16BIT 0x20000000
+#define OPB_CMD_32BIT 0x60000000
+#define PIB2OPB_REG_STAT 0x1
+#define OPB_STAT_ANY_ERR 0x80000000
+#define OPB_STAT_ERR_OPB 0x7FEC0000
+#define OPB_STAT_ERRACK 0x00100000
+#define OPB_STAT_BUSY 0x00010000
+#define OPB_STAT_READ_VALID 0x00020000
+#define OPB_STAT_ERR_CMFSI 0x0000FC00
+#define OPB_STAT_ERR_HMFSI 0x000000FC
+#define OPB_STAT_ERR_BASE (OPB_STAT_ANY_ERR | \
+ OPB_STAT_ERR_OPB | \
+ OPB_STAT_ERRACK)
+#define PIB2OPB_REG_LSTAT 0x2
+#define PIB2OPB_REG_RESET 0x4
+#define PIB2OPB_REG_cRSIC 0x5
+#define PIB2OPB_REG_cRSIM 0x6
+#define PIB2OPB_REG_cRSIS 0x7
+#define PIB2OPB_REG_hRSIC 0x8
+#define PIB2OPB_REG_hRSIM 0x9
+#define PIB2OPB_REG_hRSIS 0xA
+
+/* Low level errors from OPB contain the status in the bottom 32-bit
+ * and one of these in the top 32-bit
+ */
+#define OPB_ERR_XSCOM_ERR 0x100000000ull
+#define OPB_ERR_TIMEOUT_ERR 0x200000000ull
+#define OPB_ERR_BAD_OPB_ADDR 0x400000000ull
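+/*
+ * e.g. (illustrative) a poll timeout is reported as OPB_ERR_TIMEOUT_ERR
+ * ORed with whatever error bits were set in the low 32-bit OPB status.
+ */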
+
+/*
+ * PIB2OPB 0 has 2 MFSIs, cMFSI and hMFSI, PIB2OPB 1 only
+ * has cMFSI
+ */
+#define cMFSI_OPB_PORTS_BASE 0x40000
+#define cMFSI_OPB_REG_BASE 0x03000
+#define hMFSI_OPB_PORTS_BASE 0x80000
+#define hMFSI_OPB_REG_BASE 0x03400
+#define MFSI_OPB_PORT_STRIDE 0x08000
+
+/* MFSI control registers */
+#define MFSI_REG_MSTAP(__n) (0x0D0 + (__n) * 4)
+#define MFSI_REG_MATRB0 0x1D8
+#define MFSI_REG_MDTRB0 0x1DC
+#define MFSI_REG_MESRB0 0x1D0
+#define MFSI_REG_MAESP0 0x050
+#define MFSI_REG_MAEB 0x070
+#define MFSI_REG_MSCSB0 0x1D4
+
+/* FSI Slave registers */
+#define FSI_SLAVE_REGS 0x000800 /**< FSI Slave Register */
+#define FSI_SMODE (FSI_SLAVE_REGS | 0x00)
+#define FSI_SLBUS (FSI_SLAVE_REGS | 0x30)
+#define FSI_SLRES (FSI_SLAVE_REGS | 0x34)
+
+#define FSI2PIB_ENGINE 0x001000 /**< FSI2PIB Engine (SCOM) */
+#define FSI2PIB_RESET (FSI2PIB_ENGINE | 0x18)
+#define FSI2PIB_STATUS (FSI2PIB_ENGINE | 0x1C)
+#define FSI2PIB_COMPMASK (FSI2PIB_ENGINE | 0x30)
+#define FSI2PIB_TRUEMASK (FSI2PIB_ENGINE | 0x34)
+
+struct mfsi {
+ uint32_t chip_id;
+ uint32_t unit;
+ uint32_t xscom_base;
+ uint32_t ports_base;
+ uint32_t reg_base;
+ uint32_t err_bits;
+};
+
+#define mfsi_log(__lev, __m, __fmt, ...) \
+ prlog(__lev, "MFSI %x:%x: " __fmt, __m->chip_id, __m->unit, ##__VA_ARGS__)
+/*
+ * Use a global FSI lock for now. Beware of re-entrancy
+ * if we ever add support for normal chip XSCOM via FSI, in
+ * which case we'll probably have to consider either per chip
+ * lock (which can have AB->BA deadlock issues) or a re-entrant
+ * global lock or something else. ...
+ */
+static struct lock fsi_lock = LOCK_UNLOCKED;
+
+/*
+ * OPB accessors
+ */
+
+/* We try up to 1.2ms for an OPB access */
+#define MFSI_OPB_MAX_TRIES 1200
+
+static uint64_t mfsi_opb_poll(struct mfsi *mfsi, uint32_t *read_data)
+{
+ unsigned long retries = MFSI_OPB_MAX_TRIES;
+ uint64_t sval;
+ uint32_t stat;
+ int64_t rc;
+
+ /* We retry every 1us, for a bit more than 1ms in total */
+ for (;;) {
+ /* Read OPB status register */
+ rc = xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_STAT, &sval);
+ if (rc) {
+ /* Do something here ? */
+ mfsi_log(PR_ERR, mfsi, "XSCOM error %lld read OPB STAT\n", rc);
+ return OPB_ERR_XSCOM_ERR;
+ }
+ mfsi_log(PR_INSANE, mfsi, " STAT=0x%16llx...\n", sval);
+
+ stat = sval >> 32;
+
+ /* Complete */
+ if (!(stat & OPB_STAT_BUSY))
+ break;
+ if (retries-- == 0) {
+ /* This isn't supposed to happen (HW timeout) */
+ mfsi_log(PR_ERR, mfsi, "OPB POLL timeout !\n");
+ return OPB_ERR_TIMEOUT_ERR | (stat & mfsi->err_bits);
+ }
+ time_wait_us(1);
+ }
+
+ /* Did we have an error ? */
+ if (stat & mfsi->err_bits)
+ return stat & mfsi->err_bits;
+
+ if (read_data) {
+ if (!(stat & OPB_STAT_READ_VALID)) {
+ mfsi_log(PR_ERR, mfsi, "Read successful but no data !\n");
+
+ /* What to do here? Can it actually happen? */
+ sval = 0xffffffff;
+ }
+ *read_data = sval & 0xffffffff;
+ }
+
+ return 0;
+}
+
+static uint64_t mfsi_opb_read(struct mfsi *mfsi, uint32_t opb_addr, uint32_t *data)
+{
+ uint64_t opb_cmd = OPB_CMD_READ | OPB_CMD_32BIT;
+ int64_t rc;
+
+ if (opb_addr > 0x00ffffff)
+ return OPB_ERR_BAD_OPB_ADDR;
+
+ opb_cmd |= opb_addr;
+ opb_cmd <<= 32;
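+ /* e.g. (illustrative) a 32-bit read of OPB address 0x40000 yields the
+ * command word 0x6004000000000000. */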
+
+ mfsi_log(PR_INSANE, mfsi, "MFSI_OPB_READ: Writing 0x%16llx to XSCOM %x\n",
+ opb_cmd, mfsi->xscom_base);
+
+ rc = xscom_write(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_CMD, opb_cmd);
+ if (rc) {
+ mfsi_log(PR_ERR, mfsi, "XSCOM error %lld writing OPB CMD\n", rc);
+ return OPB_ERR_XSCOM_ERR;
+ }
+ return mfsi_opb_poll(mfsi, data);
+}
+
+static uint64_t mfsi_opb_write(struct mfsi *mfsi, uint32_t opb_addr, uint32_t data)
+{
+ uint64_t opb_cmd = OPB_CMD_WRITE | OPB_CMD_32BIT;
+ int64_t rc;
+
+ if (opb_addr > 0x00ffffff)
+ return OPB_ERR_BAD_OPB_ADDR;
+
+ opb_cmd |= opb_addr;
+ opb_cmd <<= 32;
+ opb_cmd |= data;
+
+ mfsi_log(PR_INSANE, mfsi, "MFSI_OPB_WRITE: Writing 0x%16llx to XSCOM %x\n",
+ opb_cmd, mfsi->xscom_base);
+
+ rc = xscom_write(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_CMD, opb_cmd);
+ if (rc) {
+ mfsi_log(PR_ERR, mfsi, "XSCOM error %lld writing OPB CMD\n", rc);
+ return OPB_ERR_XSCOM_ERR;
+ }
+ return mfsi_opb_poll(mfsi, NULL);
+}
+
+static struct mfsi *mfsi_get(uint32_t chip_id, uint32_t unit)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct mfsi *mfsi;
+
+ if (!chip || unit > MFSI_hMFSI0)
+ return NULL;
+ mfsi = &chip->fsi_masters[unit];
+ if (mfsi->xscom_base == 0)
+ return NULL;
+ return mfsi;
+}
+
+static int64_t mfsi_reset_pib2opb(struct mfsi *mfsi)
+{
+ uint64_t stat;
+ int64_t rc;
+
+ rc = xscom_write(mfsi->chip_id,
+ mfsi->xscom_base + PIB2OPB_REG_RESET, (1ul << 63));
+ if (rc) {
+ mfsi_log(PR_ERR, mfsi, "XSCOM error %lld resetting PIB2OPB\n", rc);
+ return rc;
+ }
+ rc = xscom_write(mfsi->chip_id,
+ mfsi->xscom_base + PIB2OPB_REG_STAT, (1ul << 63));
+ if (rc) {
+ mfsi_log(PR_ERR, mfsi, "XSCOM error %lld resetting status\n", rc);
+ return rc;
+ }
+ rc = xscom_read(mfsi->chip_id,
+ mfsi->xscom_base + PIB2OPB_REG_STAT, &stat);
+ if (rc) {
+ mfsi_log(PR_ERR, mfsi, "XSCOM error %lld reading status\n", rc);
+ return rc;
+ }
+ return 0;
+}
+
+
+static void mfsi_dump_pib2opb_state(struct mfsi *mfsi)
+{
+ uint64_t val;
+
+ /* Dump a bunch of registers */
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_CMD, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB CMD = %016llx\n", val);
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_STAT, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB STAT = %016llx\n", val);
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_LSTAT, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB LSTAT = %016llx\n", val);
+
+ if (mfsi->unit == MFSI_cMFSI0 || mfsi->unit == MFSI_cMFSI1) {
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_cRSIC, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB cRSIC = %016llx\n", val);
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_cRSIM, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB cRSIM = %016llx\n", val);
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_cRSIS, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB cRSIS = %016llx\n", val);
+ } else if (mfsi->unit == MFSI_hMFSI0) {
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_hRSIC, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB hRSIC = %016llx\n", val);
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_hRSIM, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB hRSIM = %016llx\n", val);
+ if (xscom_read(mfsi->chip_id, mfsi->xscom_base + PIB2OPB_REG_hRSIS, &val))
+ goto xscom_error;
+ mfsi_log(PR_ERR, mfsi, " PIB2OPB hRSIS = %016llx\n", val);
+ }
+ return;
+ xscom_error:
+ mfsi_log(PR_ERR, mfsi, "XSCOM error reading PIB2OPB registers\n");
+}
+
+static int64_t mfsi_dump_ctrl_regs(struct mfsi *mfsi)
+{
+ uint64_t opb_stat;
+ uint32_t i;
+
+ /* List of registers to dump (from HB) */
+ static uint32_t dump_regs[] = {
+ MFSI_REG_MATRB0,
+ MFSI_REG_MDTRB0,
+ MFSI_REG_MESRB0,
+ MFSI_REG_MAESP0,
+ MFSI_REG_MAEB,
+ MFSI_REG_MSCSB0,
+ };
+ static const char *dump_regs_names[] = {
+ "MFSI_REG_MATRB0",
+ "MFSI_REG_MDTRB0",
+ "MFSI_REG_MESRB0",
+ "MFSI_REG_MAESP0",
+ "MFSI_REG_MAEB ",
+ "MFSI_REG_MSCSB0",
+ };
+ for (i = 0; i < ARRAY_SIZE(dump_regs); i++) {
+ uint32_t val;
+
+ opb_stat = mfsi_opb_read(mfsi, mfsi->reg_base + dump_regs[i], &val);
+ if (opb_stat) {
+ /* Error on dump, give up */
+ mfsi_log(PR_ERR, mfsi, " OPB stat 0x%016llx dumping reg %x\n",
+ opb_stat, dump_regs[i]);
+ return OPAL_HARDWARE;
+ }
+ mfsi_log(PR_ERR, mfsi, " %s = %08x\n", dump_regs_names[i], val);
+ }
+ for (i = 0; i < 8; i++) {
+ uint32_t val;
+
+ opb_stat = mfsi_opb_read(mfsi, mfsi->reg_base + MFSI_REG_MSTAP(i), &val);
+ if (opb_stat) {
+ /* Error on dump, give up */
+ mfsi_log(PR_ERR, mfsi, " OPB stat 0x%016llx dumping reg %x\n",
+ opb_stat, MFSI_REG_MSTAP(i));
+ return OPAL_HARDWARE;
+ }
+ mfsi_log(PR_ERR, mfsi, " MFSI_REG_MSTAP%d = %08x\n", i, val);
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t mfsi_master_cleanup(struct mfsi *mfsi, uint32_t port)
+{
+ uint64_t opb_stat;
+ uint32_t port_base, compmask, truemask;
+
+ /* Reset the bridge to clear up the residual errors */
+
+ /* bit0 = Bridge: General reset */
+ opb_stat = mfsi_opb_write(mfsi, mfsi->reg_base + MFSI_REG_MESRB0, 0x80000000u);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi, " OPB stat 0x%016llx writing reset to MESRB0\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+
+ /* Calculate base address of port */
+ port_base = mfsi->ports_base + port * MFSI_OPB_PORT_STRIDE;
+
+ /* Perform error reset on Centaur fsi slave: */
+ /* write 0x4000000 to addr=834 */
+ opb_stat = mfsi_opb_write(mfsi, port_base + FSI_SLRES, 0x04000000);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx writing reset to FSI slave\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+
+ /* A further step is to issue a PIB reset to the FSI2PIB engine
+ * that is in the busy state, i.e. write arbitrary data to the 101c
+ * (putcfam 1007) register of the previously failed FSI2PIB
+ * engine on Centaur.
+ *
+ * XXX BenH: Should that be done by the upper FSI XSCOM layer ?
+ */
+ opb_stat = mfsi_opb_write(mfsi, port_base + FSI2PIB_STATUS, 0xFFFFFFFF);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx clearing FSI2PIB_STATUS\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+
+ /* Need to save/restore the true/comp masks or the FSP (PRD ?) will
+ * get annoyed
+ */
+ opb_stat = mfsi_opb_read(mfsi, port_base + FSI2PIB_COMPMASK, &compmask);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx reading FSI2PIB_COMPMASK\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+ opb_stat = mfsi_opb_read(mfsi, port_base + FSI2PIB_TRUEMASK, &truemask);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx reading FSI2PIB_TRUEMASK\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+
+ /* Then, write arbitrary data to 1018 (putcfam 1006) to
+ * reset any pending FSI2PIB errors.
+ */
+ opb_stat = mfsi_opb_write(mfsi, port_base + FSI2PIB_RESET, 0xFFFFFFFF);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx writing FSI2PIB_RESET\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+
+ /* Restore the true/comp masks */
+ opb_stat = mfsi_opb_write(mfsi, port_base + FSI2PIB_COMPMASK, compmask);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx writing FSI2PIB_COMPMASK\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+ opb_stat = mfsi_opb_write(mfsi, port_base + FSI2PIB_TRUEMASK, truemask);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi,
+ " OPB stat 0x%016llx writing FSI2PIB_TRUEMASK\n",
+ opb_stat);
+ return OPAL_HARDWARE;
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t mfsi_analyse_fsi_error(struct mfsi *mfsi)
+{
+ uint64_t opb_stat;
+ uint32_t mesrb0;
+
+ /* Most of the code below is adapted from HB. The main difference is
+ * that we don't gard
+ */
+
+ /* Read MESRB0 */
+ opb_stat = mfsi_opb_read(mfsi, mfsi->reg_base + MFSI_REG_MESRB0, &mesrb0);
+ if (opb_stat) {
+ mfsi_log(PR_ERR, mfsi, " OPB stat 0x%016llx reading MESRB0\n", opb_stat);
+ return OPAL_HARDWARE;
+ }
+ mfsi_log(PR_ERR, mfsi, " MESRB0=%08x\n", mesrb0);
+
+ /* bits 8:15 are internal parity errors in the master */
+ if (mesrb0 & 0x00FF0000) {
+ mfsi_log(PR_ERR, mfsi, " Master parity error !\n");
+ } else {
+ /* bits 0:3 are a specific error code */
+ switch ((mesrb0 & 0xF0000000) >> 28) {
+ case 0x1: /* OPB error */
+ case 0x2: /* Invalid state of OPB state machine */
+ /* error is inside the OPB logic */
+ mfsi_log(PR_ERR, mfsi, " OPB logic error !\n");
+ break;
+ case 0x3: /* Port access error */
+ /* probably some kind of code collision */
+ /* could also be something weird in the chip */
+ mfsi_log(PR_ERR, mfsi, " Port access error !\n");
+ break;
+ case 0x4: /* ID mismatch */
+ mfsi_log(PR_ERR, mfsi, " Port ID mismatch !\n");
+ break;
+ case 0x6: /* port timeout error */
+ mfsi_log(PR_ERR, mfsi, " Port timeout !\n");
+ break;
+ case 0x7: /* master timeout error */
+ mfsi_log(PR_ERR, mfsi, " Master timeout !\n");
+ break;
+ case 0x9: /* Any error response from Slave */
+ mfsi_log(PR_ERR, mfsi, " Slave error response !\n");
+ break;
+ case 0xC: /* bridge parity error */
+ mfsi_log(PR_ERR, mfsi, " Bridge parity error !\n");
+ break;
+ case 0xB: /* protocol error */
+ mfsi_log(PR_ERR, mfsi, " Protocol error !\n");
+ break;
+ case 0x8: /* master CRC error */
+ mfsi_log(PR_ERR, mfsi, " Master CRC error !\n");
+ break;
+ case 0xA: /* Slave CRC error */
+ mfsi_log(PR_ERR, mfsi, " Slave CRC error !\n");
+ break;
+ default:
+ mfsi_log(PR_ERR, mfsi, " Unknown error !\n");
+ break;
+ }
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t mfsi_handle_error(struct mfsi *mfsi, uint32_t port,
+ uint64_t opb_stat, uint32_t fsi_addr)
+{
+ int rc;
+ bool found_root_cause = false;
+
+ mfsi_log(PR_ERR, mfsi, "Access error on port %d, stat=%012llx\n",
+ port, opb_stat);
+
+ /* First handle stat codes we synthesized */
+ if (opb_stat & OPB_ERR_XSCOM_ERR)
+ return OPAL_HARDWARE;
+ if (opb_stat & OPB_ERR_BAD_OPB_ADDR)
+ return OPAL_PARAMETER;
+
+ /* Dump a bunch of registers from PIB2OPB and reset it */
+ mfsi_dump_pib2opb_state(mfsi);
+
+ /* Reset PIB2OPB */
+ mfsi_reset_pib2opb(mfsi);
+
+ /* This one is not supposed to happen but ... */
+ if (opb_stat & OPB_ERR_TIMEOUT_ERR)
+ return OPAL_HARDWARE;
+
+ /* Dump some FSI control registers */
+ rc = mfsi_dump_ctrl_regs(mfsi);
+
+ /* If that failed, reset PIB2OPB again and return */
+ if (rc) {
+ mfsi_dump_pib2opb_state(mfsi);
+ mfsi_reset_pib2opb(mfsi);
+ return OPAL_HARDWARE;
+ }
+
+ /* Now check for known root causes (from HB) */
+
+ /* First check if it's a ctrl register access error and we got an OPB NACK,
+ * which means an out of bounds control reg
+ */
+ if ((opb_stat & OPB_STAT_ERRACK) &&
+ ((fsi_addr & ~0x2ffu) == mfsi->reg_base)) {
+ mfsi_log(PR_ERR, mfsi, " Error appears to be out of bounds reg %08x\n",
+ fsi_addr);
+ found_root_cause = true;
+ }
+ /* Else check for other OPB errors */
+ else if (opb_stat & OPB_STAT_ERR_OPB) {
+ mfsi_log(PR_ERR, mfsi, " Error appears to be an OPB error\n");
+ found_root_cause = true;
+ }
+
+ /* Root cause not found, dig into FSI logic */
+ if (!found_root_cause) {
+ rc = mfsi_analyse_fsi_error(mfsi);
+ if (rc) {
+ /* If that failed too, reset the PIB2OPB again */
+ mfsi_reset_pib2opb(mfsi);
+ }
+ }
+
+ /* Cleanup MFSI master */
+ mfsi_master_cleanup(mfsi, port);
+
+ return OPAL_HARDWARE;
+}
+
+int64_t mfsi_read(uint32_t chip, uint32_t unit, uint32_t port,
+ uint32_t fsi_addr, uint32_t *data)
+{
+ struct mfsi *mfsi = mfsi_get(chip, unit);
+ uint32_t port_addr;
+ uint64_t opb_stat;
+ int64_t rc = OPAL_SUCCESS;
+
+ if (!mfsi || port > 7)
+ return OPAL_PARAMETER;
+
+ lock(&fsi_lock);
+
+ /* Calculate port address */
+ port_addr = mfsi->ports_base + port * MFSI_OPB_PORT_STRIDE;
+ port_addr += fsi_addr;
+
+ /* Perform OPB access */
+ opb_stat = mfsi_opb_read(mfsi, port_addr, data);
+ if (opb_stat)
+ rc = mfsi_handle_error(mfsi, port, opb_stat, port_addr);
+
+ unlock(&fsi_lock);
+
+ return rc;
+}
+
+int64_t mfsi_write(uint32_t chip, uint32_t unit, uint32_t port,
+ uint32_t fsi_addr, uint32_t data)
+{
+ struct mfsi *mfsi = mfsi_get(chip, unit);
+ uint32_t port_addr;
+ uint64_t opb_stat;
+ int64_t rc = OPAL_SUCCESS;
+
+ if (!mfsi || port > 7)
+ return OPAL_PARAMETER;
+
+ lock(&fsi_lock);
+
+ /* Calculate port address */
+ port_addr = mfsi->ports_base + port * MFSI_OPB_PORT_STRIDE;
+ port_addr += fsi_addr;
+
+ /* Perform OPB access */
+ opb_stat = mfsi_opb_write(mfsi, port_addr, data);
+ if (opb_stat)
+ rc = mfsi_handle_error(mfsi, port, opb_stat, port_addr);
+
+ unlock(&fsi_lock);
+
+ return rc;
+}
+
+static void mfsi_add(struct proc_chip *chip, struct mfsi *mfsi, uint32_t unit)
+{
+ mfsi->chip_id = chip->id;
+ mfsi->unit = unit;
+
+ /* We hard code everything for now */
+ switch (unit) {
+ case MFSI_cMFSI0:
+ mfsi->xscom_base = PIB2OPB_MFSI0_ADDR;
+ mfsi->ports_base = cMFSI_OPB_PORTS_BASE;
+ mfsi->reg_base = cMFSI_OPB_REG_BASE;
+ mfsi->err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_CMFSI;
+ break;
+ case MFSI_cMFSI1:
+ mfsi->xscom_base = PIB2OPB_MFSI1_ADDR;
+ mfsi->ports_base = cMFSI_OPB_PORTS_BASE;
+ mfsi->reg_base = cMFSI_OPB_REG_BASE;
+ mfsi->err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_CMFSI;
+ break;
+ case MFSI_hMFSI0:
+ mfsi->xscom_base = PIB2OPB_MFSI0_ADDR;
+ mfsi->ports_base = hMFSI_OPB_PORTS_BASE;
+ mfsi->reg_base = hMFSI_OPB_REG_BASE;
+ mfsi->err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_HMFSI;
+ break;
+ default:
+ /* ??? */
+ return;
+ }
+
+ /* Hardware Bug HW222712 on Murano DD1.0 causes the
+ * any_error bit to be un-clearable so we just
+ * have to ignore it. Additionally, HostBoot applies
+ * this to Venice too, though the comment there claims
+ * this is a Simics workaround.
+ *
+ * The doc says that bit can be safely ignored, so let's
+ * just not bother and always take it out.
+ */
+
+ /* 16: cMFSI any-master-error */
+ /* 24: hMFSI any-master-error */
+ mfsi->err_bits &= 0xFFFF7F7F;
+
+ mfsi_log(PR_INFO, mfsi, "Initialized\n");
+}
+
+void mfsi_init(void)
+{
+ struct proc_chip *chip;
+
+ for_each_chip(chip) {
+ chip->fsi_masters = zalloc(sizeof(struct mfsi) * 3);
+ assert(chip->fsi_masters);
+ mfsi_add(chip, &chip->fsi_masters[MFSI_cMFSI0], MFSI_cMFSI0);
+ mfsi_add(chip, &chip->fsi_masters[MFSI_hMFSI0], MFSI_hMFSI0);
+ mfsi_add(chip, &chip->fsi_masters[MFSI_cMFSI1], MFSI_cMFSI1);
+
+ }
+}
+
diff --git a/roms/skiboot/hw/fsp/Makefile.inc b/roms/skiboot/hw/fsp/Makefile.inc
new file mode 100644
index 000000000..21dc52a9f
--- /dev/null
+++ b/roms/skiboot/hw/fsp/Makefile.inc
@@ -0,0 +1,13 @@
+SUBDIRS += hw/fsp
+
+FSP_OBJS = fsp.o fsp-console.o fsp-rtc.o fsp-nvram.o fsp-sysparam.o
+FSP_OBJS += fsp-surveillance.o fsp-codeupdate.o fsp-sensor.o
+FSP_OBJS += fsp-diag.o fsp-leds.o fsp-mem-err.o fsp-op-panel.o
+FSP_OBJS += fsp-elog-read.o fsp-elog-write.o fsp-epow.o fsp-dpo.o
+FSP_OBJS += fsp-dump.o fsp-sysdump.o fsp-chiptod.o fsp-ipmi.o
+FSP_OBJS += fsp-attn.o fsp-occ.o fsp-psi.o
+FSP = hw/fsp/built-in.a
+
+ifeq ($(CONFIG_FSP),1)
+$(FSP): $(FSP_OBJS:%=hw/fsp/%)
+endif
diff --git a/roms/skiboot/hw/fsp/fsp-attn.c b/roms/skiboot/hw/fsp/fsp-attn.c
new file mode 100644
index 000000000..6e358e0d4
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-attn.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * FSP ATTentioN support
+ *
+ * FSP can grab a bunch of things on host firmware dying,
+ * let's set that up.
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+#include <fsp.h>
+#include <skiboot.h>
+#include <fsp-elog.h>
+#include <fsp-attn.h>
+#include <hdata/spira.h>
+#include <stack.h>
+#include <processor.h>
+#include <opal-dump.h>
+
+#define TI_CMD_VALID 0x1 /* Command valid */
+#define TI_CMD 0xA1 /* Terminate Immediate command */
+#define TI_DATA_LEN 0x0400 /* Data length */
+/* Controls dump actions
+ * - Non-destructive hardware dump (bit 0)
+ * - memory dump (bit 1)
+ * - Destructive hardware dump (bit 2)
+ */
+#define TI_DMP_CTL 0x6
+/* Dump type
+ * 0 - Abbreviated hardware dump
+ * 1 - Complete hardware dump
+ * 2 - No hardware dump
+ */
+#define TI_DUMP_TYPE 0x1
+#define TI_FORMAT 0x02 /* SRC format */
+#define TI_SRC_FLAGS 0x0 /* SRC flags */
+#define TI_ASCII_WORDS 0x0 /* Number of ASCII words */
+
+/* HEX words: Number of hex words of data added, up to 8 total;
+ * this value is one more than the actual count.
+ */
+#define TI_HEX_WORDS 0x02
+/* SRC length : 8 byte header, 8 hex words of data and
+ * 32 byte ASCII SRC
+ */
+#define TI_SRC_LEN 0x48
+
+static struct ti_attn *ti_attn;
+
+/* Initialises SP attention area with default values */
+static void init_sp_attn_area(void)
+{
+ /* Already done */
+ if (ti_attn)
+ return;
+
+ /* We are just enabling attention area 1 */
+ ti_attn = (struct ti_attn *)&cpu_ctl_sp_attn_area1;
+
+ /* The attention component checks Attn area 2 first; if it is NULL,
+ * it will check Attn area 1.
+ */
+ memset(&cpu_ctl_sp_attn_area1, 0, sizeof(struct sp_attn_area));
+ memset(&cpu_ctl_sp_attn_area2, 0, sizeof(struct sp_attn_area));
+
+ ti_attn->cmd_valid = TI_CMD_VALID;
+ ti_attn->attn_cmd = TI_CMD;
+ ti_attn->data_len = CPU_TO_BE16(TI_DATA_LEN);
+ /* Dump control byte not used as of now */
+ ti_attn->dump_ctrl = TI_DMP_CTL;
+ ti_attn->dump_type = CPU_TO_BE16(TI_DUMP_TYPE);
+
+ /* SRC format */
+ ti_attn->src_fmt = TI_FORMAT;
+ /* SRC flags */
+ ti_attn->src_flags = TI_SRC_FLAGS;
+ /* #ASCII words */
+ ti_attn->ascii_cnt = TI_ASCII_WORDS;
+ /* #HEX words */
+ ti_attn->hex_cnt = TI_HEX_WORDS;
+ ti_attn->src_len = CPU_TO_BE16(TI_SRC_LEN);
+ snprintf(ti_attn->src, SRC_LEN, "%X", generate_src_from_comp(OPAL_RC_ATTN));
+}
+
+/* Update the SRC in the SP attention area */
+static void update_sp_attn_area(const char *msg)
+{
+#define STACK_BUF_ENTRIES 20
+ struct bt_entry bt_buf[STACK_BUF_ENTRIES];
+ struct bt_metadata metadata;
+ unsigned int len;
+
+ if (!fsp_present())
+ return;
+
+ /* This can be called early */
+ if (!ti_attn)
+ init_sp_attn_area();
+
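+ /* First SRC hex word: low 32 bits of the caller's return address */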
+ ti_attn->src_word[0] =
+ cpu_to_be32((uint32_t)((uint64_t)__builtin_return_address(0) & 0xffffffff));
+
+ snprintf(ti_attn->msg.version, VERSION_LEN, "%s", version);
+ backtrace_create(bt_buf, STACK_BUF_ENTRIES, &metadata);
+ metadata.token = OPAL_LAST + 1;
+ len = BT_FRAME_LEN;
+ backtrace_print(bt_buf, &metadata, ti_attn->msg.bt_buf, &len, false);
+ snprintf(ti_attn->msg.file_info, FILE_INFO_LEN, "%s", msg);
+
+ ti_attn->msg_len = cpu_to_be32(VERSION_LEN + BT_FRAME_LEN +
+ strlen(ti_attn->msg.file_info));
+}
+
+void __attribute__((noreturn)) ibm_fsp_terminate(const char *msg)
+{
+ /* Update SP attention area */
+ update_sp_attn_area(msg);
+
+ /* Update op panel op_display */
+ op_display(OP_FATAL, OP_MOD_CORE, 0x6666);
+
+ /* Save crashing CPU details */
+ opal_mpipl_save_crashing_pir();
+
+ /* XXX FIXME: We should fsp_poll for a while to ensure any pending
+ * console writes have made it out, but until we have decent PSI
+ * link handling we must not do it forever. Polling can prevent the
+ * FSP from bringing the PSI link up and it can get stuck in a
+ * reboot loop.
+ */
+
+ trigger_attn();
+ for (;;) ;
+}
+
+/* Initialises SP attention area */
+void fsp_attn_init(void)
+{
+ if (!fsp_present())
+ return;
+
+ init_sp_attn_area();
+}
diff --git a/roms/skiboot/hw/fsp/fsp-chiptod.c b/roms/skiboot/hw/fsp/fsp-chiptod.c
new file mode 100644
index 000000000..e4ede3c1c
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-chiptod.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * On some chiptod errors, ask the FSP for a new topology
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "CHIPTOD: " fmt
+
+#include <skiboot.h>
+#include <chiptod.h>
+#include <fsp.h>
+
+/* Response status for fsp command 0xE6, s/c 0x06 (Enable/Disable Topology) */
+#define FSP_STATUS_TOPO_IN_USE 0xb8 /* topology is in use */
+
+static bool fsp_chiptod_update_topology(uint32_t cmd_sub_mod,
+ struct fsp_msg *msg)
+{
+ struct fsp_msg *resp;
+ enum chiptod_topology topo;
+ bool action;
+ uint8_t status = 0;
+
+ switch (cmd_sub_mod) {
+ case FSP_CMD_TOPO_ENABLE_DISABLE:
+ /*
+ * Action Values: 0x00 = Disable, 0x01 = Enable
+ * Topology Values: 0x00 = Primary, 0x01 = Secondary
+ */
+ action = !!msg->data.bytes[2];
+ topo = msg->data.bytes[3];
+ prlog(PR_DEBUG, "Topology update event:\n");
+ prlog(PR_DEBUG, " Action = %s, Topology = %s\n",
+ action ? "Enable" : "Disable",
+ topo ? "Secondary" : "Primary");
+
+ if (!chiptod_adjust_topology(topo, action))
+ status = FSP_STATUS_TOPO_IN_USE;
+ else
+ status = 0x00;
+
+ resp = fsp_mkmsg(FSP_RSP_TOPO_ENABLE_DISABLE | status, 0);
+ if (!resp) {
+ prerror("Response allocation failed\n");
+ return false;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("Failed to queue response msg\n");
+ return false;
+ }
+ return true;
+ default:
+ prlog(PR_DEBUG, "Unhandled sub cmd: %06x\n", cmd_sub_mod);
+ break;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_chiptod_client = {
+ .message = fsp_chiptod_update_topology,
+};
+
+void fsp_chiptod_init(void)
+{
+ /* Register for Class E6 (HW maintenance) */
+ fsp_register_client(&fsp_chiptod_client, FSP_MCLASS_HW_MAINT);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-codeupdate.c b/roms/skiboot/hw/fsp/fsp-codeupdate.c
new file mode 100644
index 000000000..3cd5b2bc9
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-codeupdate.c
@@ -0,0 +1,1315 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Firmware code update for FSP systems
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <fsp-sysparam.h>
+#include <lock.h>
+#include <device.h>
+#include <ccan/endian/endian.h>
+#include <errorlog.h>
+#include <opal-api.h>
+#include <timebase.h>
+
+#include "fsp-codeupdate.h"
+
+enum flash_state {
+ FLASH_STATE_ABSENT,
+ FLASH_STATE_INVALID, /* IPL side marker lid is invalid */
+ FLASH_STATE_READING,
+ FLASH_STATE_READ,
+ FLASH_STATE_ABORT,
+};
+
+enum lid_fetch_side {
+ FETCH_T_SIDE_ONLY,
+ FETCH_P_SIDE_ONLY,
+ FETCH_BOTH_SIDE,
+};
+
+static enum flash_state flash_state = FLASH_STATE_INVALID;
+static enum lid_fetch_side lid_fetch_side = FETCH_BOTH_SIDE;
+
+/* Image buffers */
+static struct opal_sg_list *image_data;
+static uint32_t tce_start;
+static void *lid_data;
+static char validate_buf[VALIDATE_BUF_SIZE];
+
+/* TCE buffer lock */
+static struct lock flash_lock = LOCK_UNLOCKED;
+
+/* FW VPD data */
+static struct fw_image_vpd fw_vpd[2];
+
+/* Code update related sys parameters */
+static uint32_t ipl_side;
+static uint32_t hmc_managed;
+static uint32_t update_policy;
+static uint32_t in_flight_params;
+
+/* If non-NULL, this gets called just before rebooting */
+int (*fsp_flash_term_hook)(void);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_FLASH, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_SG_LIST, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_COMMIT, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_MSG, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_NOTIFY, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_CU_MARKER_LID, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, OPAL_NA);
+
+static inline void code_update_tce_map(uint32_t tce_offset,
+ void *buffer, uint32_t size)
+{
+ uint32_t tlen = ALIGN_UP(size, TCE_PSIZE);
+
+ fsp_tce_map(PSI_DMA_CODE_UPD + tce_offset, buffer, tlen);
+}
+
+static inline void code_update_tce_unmap(uint32_t size)
+{
+ fsp_tce_unmap(PSI_DMA_CODE_UPD, size);
+}
+
+static inline void set_def_fw_version(uint32_t side)
+{
+ strncpy(fw_vpd[side].mi_keyword, FW_VERSION_UNKNOWN, MI_KEYWORD_SIZE);
+ strncpy(fw_vpd[side].ext_fw_id, FW_VERSION_UNKNOWN, ML_KEYWORD_SIZE);
+}
+
+/*
+ * Get IPL side
+ */
+static void get_ipl_side(void)
+{
+ struct dt_node *iplp;
+ const char *side = NULL;
+
+ iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params");
+ if (iplp)
+ side = dt_prop_get_def(iplp, "cec-ipl-side", NULL);
+ prlog(PR_NOTICE, "CUPD: IPL SIDE = %s\n", side);
+
+ if (!side || !strcmp(side, "temp"))
+ ipl_side = FW_IPL_SIDE_TEMP;
+ else
+ ipl_side = FW_IPL_SIDE_PERM;
+}
+
+
+/*
+ * Helper routines to retrieve code update related
+ * system parameters from FSP.
+ */
+
+static void inc_in_flight_param(void)
+{
+ lock(&flash_lock);
+ in_flight_params++;
+ unlock(&flash_lock);
+}
+
+static void dec_in_flight_param(void)
+{
+ lock(&flash_lock);
+ assert(in_flight_params > 0);
+ in_flight_params--;
+ unlock(&flash_lock);
+}
+
+static void got_code_update_policy(uint32_t param_id __unused, int err_len,
+ void *data __unused)
+{
+ if (err_len != 4) {
+ log_simple_error(&e_info(OPAL_RC_CU_INIT), "CUPD: Error "
+ "retrieving code update policy: %d\n", err_len);
+ } else {
+ update_policy = be32_to_cpu((__be32)update_policy);
+ prlog(PR_NOTICE, "CUPD: Code update policy from FSP: %d\n",
+ update_policy);
+ }
+
+ dec_in_flight_param();
+}
+
+static void get_code_update_policy(void)
+{
+ int rc;
+
+ inc_in_flight_param();
+ rc = fsp_get_sys_param(SYS_PARAM_FLASH_POLICY, &update_policy, 4,
+ got_code_update_policy, NULL);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_CU_INIT),
+ "CUPD: Error %d queueing param request\n", rc);
+ dec_in_flight_param();
+ }
+}
+
+static void got_platform_hmc_managed(uint32_t param_id __unused, int err_len,
+ void *data __unused)
+{
+ if (err_len != 4) {
+ log_simple_error(&e_info(OPAL_RC_CU_INIT), "CUPD: Error "
+ "retrieving hmc managed status: %d\n", err_len);
+ } else {
+ hmc_managed = be32_to_cpu((__be32)hmc_managed);
+ prlog(PR_NOTICE, "CUPD: HMC managed status from FSP: %d\n",
+ hmc_managed);
+ }
+
+ dec_in_flight_param();
+}
+
+static void get_platform_hmc_managed(void)
+{
+ int rc;
+
+ inc_in_flight_param();
+ rc = fsp_get_sys_param(SYS_PARAM_HMC_MANAGED, &hmc_managed, 4,
+ got_platform_hmc_managed, NULL);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_CU_INIT),
+ "CUPD: Error %d queueing param request\n", rc);
+ dec_in_flight_param();
+ }
+}
+
+static bool fw_ipl_side_update_notify(struct fsp_msg *msg)
+{
+ u32 param_id = fsp_msg_get_data_word(msg, 0);
+ int dlen = fsp_msg_get_data_word(msg, 1) & 0xffff;
+ uint32_t state = fsp_msg_get_data_word(msg, 2);
+
+ if (param_id != SYS_PARAM_FW_IPL_SIDE)
+ return false;
+
+ if (dlen != 4) {
+ prlog(PR_DEBUG,
+ "CUPD: Invalid sysparams notify len : 0x%x\n", dlen);
+ return false;
+ }
+
+ prlog(PR_NOTICE, "CUPD: FW IPL side changed. Disable fast reboot\n");
+ prlog(PR_NOTICE, "CUPD: Next IPL side : %s\n",
+ state == FW_IPL_SIDE_TEMP ? "temp" : "perm");
+
+ disable_fast_reboot("FSP IPL Side Change");
+ return true;
+}
+
+static int64_t code_update_check_state(void)
+{
+ switch(flash_state) {
+ case FLASH_STATE_ABSENT:
+ return OPAL_HARDWARE;
+ case FLASH_STATE_INVALID:
+ case FLASH_STATE_ABORT:
+ return OPAL_INTERNAL_ERROR;
+ case FLASH_STATE_READING:
+ return OPAL_BUSY;
+ default:
+ break;
+ }
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Get common marker LID additional data section
+ */
+static void *get_adf_sec_data(struct com_marker_adf_sec *adf_sec,
+ uint32_t name)
+{
+ struct com_marker_adf_header *adf_header;
+ int i;
+
+ adf_header = (void *)adf_sec->adf_data;
+ for (i = 0; i < be32_to_cpu(adf_sec->adf_cnt); i++) {
+ if (be32_to_cpu(adf_header->name) == name)
+ return adf_header;
+
+ adf_header = (void *)adf_header + be32_to_cpu(adf_header->size);
+ }
+ return NULL;
+}
+
+/*
+ * Parse common marker LID to get FW version details
+ *
+ * Note:
+ * At present we parse only the "Service Pack Nomenclature ADF"
+ * section. If we add FW IP support, then we will have to parse
+ * the "Firmware IP Protection ADF" as well.
+ */
+static void parse_marker_lid(uint32_t side)
+{
+ struct com_marker_header *header;
+ struct com_marker_mi_section *mi_sec;
+ struct com_marker_adf_sec *adf_sec;
+ struct com_marker_adf_sp *adf_sp;
+
+ header = (void *)lid_data;
+
+ /* Get MI details */
+ mi_sec = (void *)header + be32_to_cpu(header->MI_offset);
+ /*
+ * If Marker LID is invalid, then FSP will return a Marker
+ * LID with ASCII zeros for the entire MI keyword.
+ */
+ if (mi_sec->mi_keyword[0] == '0')
+ return;
+
+ strncpy(fw_vpd[side].mi_keyword, mi_sec->mi_keyword, MI_KEYWORD_SIZE);
+ fw_vpd[side].mi_keyword[MI_KEYWORD_SIZE - 1] = '\0';
+ prlog(PR_NOTICE, "CUPD: %s side MI Keyword = %s\n",
+ side == 0x00 ? "P" : "T", fw_vpd[side].mi_keyword);
+
+ /* Get ML details */
+ adf_sec = (void *)header + be32_to_cpu(mi_sec->adf_offset);
+ adf_sp = get_adf_sec_data(adf_sec, ADF_NAME_SP);
+ if (!adf_sp)
+ return;
+
+ strncpy(fw_vpd[side].ext_fw_id,
+ (void *)adf_sp + be32_to_cpu(adf_sp->sp_name_offset),
+ ML_KEYWORD_SIZE);
+ fw_vpd[side].ext_fw_id[ML_KEYWORD_SIZE - 1] = '\0';
+ prlog(PR_NOTICE, "CUPD: %s side ML Keyword = %s\n",
+ side == 0x00 ? "P" : "T", fw_vpd[side].ext_fw_id);
+}
+
+static void validate_com_marker_lid(void)
+{
+ if (!strncmp(fw_vpd[ipl_side].mi_keyword, FW_VERSION_UNKNOWN,
+ sizeof(FW_VERSION_UNKNOWN))) {
+ log_simple_error(&e_info(OPAL_RC_CU_MARKER_LID),
+ "CUPD: IPL side Marker LID is not valid\n");
+ flash_state = FLASH_STATE_INVALID;
+ return;
+ }
+
+ flash_state = FLASH_STATE_READ;
+}
+
+static void fetch_lid_data_complete(struct fsp_msg *msg)
+{
+ void *buffer;
+ size_t length, chunk;
+ uint32_t lid_id, offset;
+ uint16_t id;
+ uint8_t flags, status;
+ int rc;
+
+ status = (msg->resp->word1 >> 8) & 0xff;
+ flags = (fsp_msg_get_data_word(msg, 0) >> 16) & 0xff;
+ id = fsp_msg_get_data_word(msg, 0) & 0xffff;
+ lid_id = fsp_msg_get_data_word(msg, 1);
+ offset = fsp_msg_get_data_word(msg->resp, 1);
+ length = fsp_msg_get_data_word(msg->resp, 2);
+
+ prlog(PR_NOTICE, "CUPD: Marker LID id : size : status = "
+ "0x%x : 0x%x : 0x%x\n",
+ fsp_msg_get_data_word(msg, 1), fsp_msg_get_data_word(msg->resp, 2), status);
+
+ fsp_freemsg(msg);
+
+ switch (status) {
+ case FSP_STATUS_SUCCESS: /* Read complete, parse VPD */
+ parse_marker_lid(lid_id == P_COM_MARKER_LID_ID ? 0 : 1);
+ break;
+ case FSP_STATUS_MORE_DATA: /* More data left */
+ offset += length;
+ chunk = MARKER_LID_SIZE - offset;
+ if (chunk > 0) {
+ buffer = (void *)PSI_DMA_CODE_UPD + offset;
+ rc = fsp_fetch_data_queue(flags, id, lid_id,
+ offset, buffer, &chunk,
+ fetch_lid_data_complete);
+
+ /* If queue msg fails, then continue with marker LID
+ * validation hoping that we have at least boot side
+ * information.
+ */
+ if (rc == OPAL_SUCCESS)
+ return;
+ }
+ break;
+ default: /* Fetch LID call failed */
+ break;
+ }
+
+ /* If required, fetch T side marker LID */
+ if (lid_id == P_COM_MARKER_LID_ID &&
+ lid_fetch_side == FETCH_BOTH_SIDE) {
+ length = MARKER_LID_SIZE;
+ rc = fsp_fetch_data_queue(flags, id, T_COM_MARKER_LID_ID,
+ 0, (void *)PSI_DMA_CODE_UPD,
+ &length, fetch_lid_data_complete);
+
+ /* If queue msg fails, then continue with marker LID
+ * validation hoping that we have at least boot side
+ * information.
+ */
+ if (rc == OPAL_SUCCESS)
+ return;
+ }
+
+ lock(&flash_lock);
+
+ /* Validate marker LID data */
+ validate_com_marker_lid();
+ /* TCE unmap */
+ code_update_tce_unmap(MARKER_LID_SIZE);
+
+ unlock(&flash_lock);
+}
+
+static void fetch_com_marker_lid(void)
+{
+ size_t length = MARKER_LID_SIZE;
+ uint32_t lid_id;
+ int rc;
+
+ /* Read in progress? */
+ rc = code_update_check_state();
+ if (rc == OPAL_HARDWARE || rc == OPAL_BUSY)
+ return;
+
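+ /* When both sides are requested, fetch the P side marker LID first;
+ * fetch_lid_data_complete() chains the T side fetch.
+ */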
+ if (lid_fetch_side == FETCH_T_SIDE_ONLY) {
+ lid_id = T_COM_MARKER_LID_ID;
+ set_def_fw_version(FW_IPL_SIDE_TEMP);
+ } else if (lid_fetch_side == FETCH_P_SIDE_ONLY) {
+ lid_id = P_COM_MARKER_LID_ID;
+ set_def_fw_version(FW_IPL_SIDE_PERM);
+ } else {
+ lid_id = P_COM_MARKER_LID_ID;
+ set_def_fw_version(FW_IPL_SIDE_PERM);
+ set_def_fw_version(FW_IPL_SIDE_TEMP);
+ }
+
+ code_update_tce_map(0, lid_data, length);
+ rc = fsp_fetch_data_queue(0x00, 0x05, lid_id, 0,
+ (void *)PSI_DMA_CODE_UPD, &length,
+ fetch_lid_data_complete);
+ if (!rc)
+ flash_state = FLASH_STATE_READING;
+ else
+ flash_state = FLASH_STATE_INVALID;
+}
+
+/*
+ * Add MI and ML keyword details into DT
+ */
+#define FW_VER_SIZE 64
+static void add_opal_firmware_version(void)
+{
+ struct dt_node *dt_fw;
+ char buffer[FW_VER_SIZE];
+ int offset;
+
+ dt_fw = dt_find_by_path(dt_root, "ibm,opal/firmware");
+ if (!dt_fw)
+ return;
+
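+ /* Both properties are formatted "MI|ML <T-side> <P-side> <booted-side>" */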
+ /* MI version */
+ offset = snprintf(buffer, FW_VER_SIZE, "MI %s %s",
+ fw_vpd[FW_IPL_SIDE_TEMP].mi_keyword,
+ fw_vpd[FW_IPL_SIDE_PERM].mi_keyword);
+ if (ipl_side == FW_IPL_SIDE_TEMP)
+ snprintf(buffer + offset, FW_VER_SIZE - offset,
+ " %s", fw_vpd[FW_IPL_SIDE_TEMP].mi_keyword);
+ else
+ snprintf(buffer + offset, FW_VER_SIZE - offset,
+ " %s", fw_vpd[FW_IPL_SIDE_PERM].mi_keyword);
+
+ dt_add_property(dt_fw, "mi-version", buffer, strlen(buffer));
+
+ /* ML version */
+ offset = snprintf(buffer, FW_VER_SIZE, "ML %s %s",
+ fw_vpd[FW_IPL_SIDE_TEMP].ext_fw_id,
+ fw_vpd[FW_IPL_SIDE_PERM].ext_fw_id);
+ if (ipl_side == FW_IPL_SIDE_TEMP)
+ snprintf(buffer + offset, FW_VER_SIZE - offset,
+ " %s", fw_vpd[FW_IPL_SIDE_TEMP].ext_fw_id);
+ else
+ snprintf(buffer + offset, FW_VER_SIZE - offset,
+ " %s", fw_vpd[FW_IPL_SIDE_PERM].ext_fw_id);
+
+ dt_add_property(dt_fw, "ml-version", buffer, strlen(buffer));
+}
+
+/*
+ * This is called right before starting the payload (Linux) to
+ * ensure the common marker LID read and parsing has happened
+ * before we transfer control.
+ */
+void fsp_code_update_wait_vpd(bool is_boot)
+{
+ int waited = 0;
+
+ if (!fsp_present())
+ return;
+
+ prlog(PR_NOTICE, "CUPD: Waiting read marker LID"
+ " and in flight parsm completion...\n");
+
+ lock(&flash_lock);
+ while (true) {
+ if (!(flash_state == FLASH_STATE_READING || in_flight_params))
+ break;
+ unlock(&flash_lock);
+ time_wait_ms(5);
+ waited += 5;
+ lock(&flash_lock);
+ }
+ unlock(&flash_lock);
+
+ if (waited)
+ prlog(PR_DEBUG, "CUPD: fsp_code_update_wait_vpd %d\n", waited);
+
+ if (is_boot)
+ add_opal_firmware_version();
+}
+
+static int code_update_start(void)
+{
+ struct fsp_msg *msg;
+ int rc;
+ uint16_t comp = 0x00; /* All components */
+ uint8_t side = OPAL_COMMIT_TMP_SIDE; /* Temporary side */
+
+ msg = fsp_mkmsg(FSP_CMD_FLASH_START, 1, side << 16 | comp);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: CMD_FLASH_START message allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_sync_msg(msg, false)) {
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ return rc;
+}
+
+static int code_update_write_lid(uint32_t lid_id, uint32_t size)
+{
+ struct fsp_msg *msg;
+ int rc, n_pairs = 1;
+
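+ /* Single (address, size) pair: the LID data was TCE-mapped at
+ * tce_start by get_lid_data().
+ */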
+ msg = fsp_mkmsg(FSP_CMD_FLASH_WRITE, 5, lid_id,
+ n_pairs, 0, tce_start, size);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: CMD_FLASH_WRITE message allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_sync_msg(msg, false)) {
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ return rc;
+}
+
+static int code_update_del_lid(uint32_t lid_id)
+{
+ struct fsp_msg *msg;
+ int rc;
+
+ msg = fsp_mkmsg(FSP_CMD_FLASH_DEL, 1, lid_id);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: CMD_FLASH_DEL message allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_sync_msg(msg, false)) {
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ return rc;
+}
+
+static int code_update_complete(uint32_t cmd)
+{
+ struct fsp_msg *msg;
+ int rc;
+
+ msg = fsp_mkmsg(cmd, 0);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: CUPD COMPLETE message allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_sync_msg(msg, false)) {
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ return rc;
+}
+
+static int code_update_swap_side(void)
+{
+ struct fsp_msg *msg;
+ int rc;
+
+ msg = fsp_mkmsg(FSP_CMD_FLASH_SWAP, 0);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: CMD_FLASH_SWAP message allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ if (fsp_sync_msg(msg, false)) {
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ return rc;
+}
+
+static int code_update_set_ipl_side(void)
+{
+ struct fsp_msg *msg;
+ uint8_t side = FW_IPL_SIDE_TEMP; /* Next IPL side */
+ int rc;
+
+ msg = fsp_mkmsg(FSP_CMD_SET_IPL_SIDE, 1, side << 16);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: CMD_SET_IPL_SIDE message allocation failed!\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_sync_msg(msg, false)) {
+ fsp_freemsg(msg);
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: Setting next IPL side failed!\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ return rc;
+}
+
+static void code_update_commit_complete(struct fsp_msg *msg)
+{
+ int rc;
+ uint8_t type;
+
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ type = (msg->word1 >> 8) & 0xff;
+ fsp_freemsg(msg);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_CU_COMMIT),
+ "CUPD: Code update commit failed, err 0x%x\n", rc);
+ return;
+ }
+
+ /* Reset cached VPD data */
+ lock(&flash_lock);
+
+ /* Find commit type */
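+ /* 0x01: re-fetch only the P side marker LID, 0x02: only the T side,
+ * anything else: both sides.
+ */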
+ if (type == 0x01) {
+ lid_fetch_side = FETCH_P_SIDE_ONLY;
+ } else if (type == 0x02)
+ lid_fetch_side = FETCH_T_SIDE_ONLY;
+ else
+ lid_fetch_side = FETCH_BOTH_SIDE;
+
+ fetch_com_marker_lid();
+
+ unlock(&flash_lock);
+}
+
+static int code_update_commit(uint32_t cmd)
+{
+ struct fsp_msg *msg;
+
+ msg = fsp_mkmsg(cmd, 0);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_CU_MSG),
+ "CUPD: COMMIT message allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_queue_msg(msg, code_update_commit_complete)) {
+ log_simple_error(&e_info(OPAL_RC_CU_COMMIT),
+ "CUPD: Failed to queue code update commit message\n");
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Is inband code update allowed?
+ */
+static int64_t validate_inband_policy(void)
+{
+ /* Quirk:
+ * If the code update policy is out-of-band, but the system
+ * is not HMC-managed, then inband update is allowed.
+ */
+ if (hmc_managed != PLATFORM_HMC_MANAGED)
+ return 0;
+ if (update_policy == INBAND_UPDATE_ALLOWED)
+ return 0;
+
+ return -1;
+}
+
+/*
+ * Validate magic number
+ */
+static int64_t validate_magic_num(uint16_t magic)
+{
+ if (magic != IMAGE_MAGIC_NUMBER)
+ return -1;
+ return 0;
+}
+
+/*
+ * Compare MI keyword to make sure candidate image
+ * is valid for this platform.
+ */
+static int64_t validate_image_version(struct update_image_header *header,
+ uint32_t *result)
+{
+ struct fw_image_vpd vpd;
+ int t_valid = 0, p_valid = 0, cton_ver = -1, ptot_ver = -1;
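+ /*
+ * cton_ver: current (IPL side) to new (candidate) MI level comparison
+ * ptot_ver: P side to T side MI level comparison (needs both sides valid)
+ */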
+
+ /* Valid flash image level? */
+ if (strncmp(fw_vpd[0].mi_keyword, FW_VERSION_UNKNOWN,
+ sizeof(FW_VERSION_UNKNOWN)) != 0)
+ p_valid = 1;
+
+ if (strncmp(fw_vpd[1].mi_keyword, FW_VERSION_UNKNOWN,
+ sizeof(FW_VERSION_UNKNOWN)) != 0)
+ t_valid = 1;
+
+ /* Validate with IPL side image */
+ vpd = fw_vpd[ipl_side];
+
+ /* Validate platform identifier (first two char of MI keyword) */
+ if (strncmp(vpd.mi_keyword, header->mi_keyword_data, 2) != 0) {
+ *result = VALIDATE_INVALID_IMG;
+ return OPAL_SUCCESS;
+ }
+
+ /* Don't flash different FW series (like P7 image on P8) */
+ if (vpd.mi_keyword[2] != header->mi_keyword_data[2]) {
+ *result = VALIDATE_INVALID_IMG;
+ return OPAL_SUCCESS;
+ }
+
+ /* Get current to new version difference */
+ cton_ver = strncmp(vpd.mi_keyword + 3, header->mi_keyword_data + 3, 6);
+
+ /* Get P to T version difference */
+ if (t_valid && p_valid)
+ ptot_ver = strncmp(fw_vpd[0].mi_keyword + 3,
+ fw_vpd[1].mi_keyword + 3, 6);
+
+ /* Update validation result */
+ if (ipl_side == FW_IPL_SIDE_TEMP) {
+ if (!ptot_ver && cton_ver > 0) /* downgrade T side */
+ *result = VALIDATE_TMP_UPDATE_DL;
+ else if (!ptot_ver && cton_ver <= 0) /* upgrade T side */
+ *result = VALIDATE_TMP_UPDATE;
+ else if (cton_ver > 0) /* Implied commit & downgrade T side */
+ *result = VALIDATE_TMP_COMMIT_DL;
+ else /* Implied commit & upgrade T side */
+ *result = VALIDATE_TMP_COMMIT;
+ } else {
+ if (!t_valid) /* Current unknown */
+ *result = VALIDATE_CUR_UNKNOWN;
+ else if (cton_ver > 0) /* downgrade FW version */
+ *result = VALIDATE_TMP_UPDATE_DL;
+ else /* upgrade FW version */
+ *result = VALIDATE_TMP_UPDATE;
+ }
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Validate candidate image
+ */
+static int validate_candidate_image(uint64_t buffer,
+ uint32_t size, uint32_t *result)
+{
+ struct update_image_header *header;
+ int rc = OPAL_PARAMETER;
+
+ if (size < VALIDATE_BUF_SIZE)
+ goto out;
+
+ rc = code_update_check_state();
+ if (rc != OPAL_SUCCESS)
+ goto out;
+
+ if (validate_inband_policy() != 0) {
+ *result = VALIDATE_FLASH_AUTH;
+ rc = OPAL_SUCCESS;
+ goto out;
+ }
+
+ memcpy(validate_buf, (void *)buffer, VALIDATE_BUF_SIZE);
+ header = (struct update_image_header *)validate_buf;
+
+ if (validate_magic_num(be16_to_cpu(header->magic)) != 0) {
+ *result = VALIDATE_INVALID_IMG;
+ rc = OPAL_SUCCESS;
+ goto out;
+ }
+ rc = validate_image_version(header, result);
+out:
+ return rc;
+}
+
+static int validate_out_buf_mi_data(void *buffer, int offset, uint32_t result)
+{
+ struct update_image_header *header = (void *)validate_buf;
+
+ /* Current T & P side MI data */
+ offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset,
+ "MI %s %s\n",
+ fw_vpd[1].mi_keyword, fw_vpd[0].mi_keyword);
+
+ /* New T & P side MI data */
+ offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset,
+ "MI %s", header->mi_keyword_data);
+ if (result == VALIDATE_TMP_COMMIT_DL ||
+ result == VALIDATE_TMP_COMMIT)
+ offset += snprintf(buffer + offset,
+ VALIDATE_BUF_SIZE - offset,
+ " %s\n", fw_vpd[1].mi_keyword);
+ else
+ offset += snprintf(buffer + offset,
+ VALIDATE_BUF_SIZE - offset,
+ " %s\n", fw_vpd[0].mi_keyword);
+ return offset;
+}
+
+static int validate_out_buf_ml_data(void *buffer, int offset, uint32_t result)
+{
+ struct update_image_header *header = (void *)validate_buf;
+ /* Candidate image ML data */
+ char *ext_fw_id = (void *)header->data;
+
+ /* Current T & P side ML data */
+ offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset,
+ "ML %s %s\n",
+ fw_vpd[1].ext_fw_id, fw_vpd[0].ext_fw_id);
+
+ /* New T & P side ML data */
+ offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset,
+ "ML %s", ext_fw_id);
+ if (result == VALIDATE_TMP_COMMIT_DL ||
+ result == VALIDATE_TMP_COMMIT)
+ offset += snprintf(buffer + offset,
+ VALIDATE_BUF_SIZE - offset,
+ " %s\n", fw_vpd[1].ext_fw_id);
+ else
+ offset += snprintf(buffer + offset,
+ VALIDATE_BUF_SIZE - offset,
+ " %s\n", fw_vpd[0].ext_fw_id);
+
+ return offset;
+}
+
+/*
+ * Copy LID data to TCE buffer
+ */
+static int get_lid_data(struct opal_sg_list *list,
+ int lid_size, int lid_offset)
+{
+ struct opal_sg_list *sg;
+ struct opal_sg_entry *entry;
+ int length, num_entries, i, buf_pos = 0;
+ int map_act, map_size;
+ bool last = false;
+
+ /* Reset TCE start address */
+ tce_start = 0;
+
+ for (sg = list; sg; sg = (struct opal_sg_list*)be64_to_cpu(sg->next)) {
+ length = (be64_to_cpu(sg->length) & ~(SG_LIST_VERSION << 56)) - 16;
+ num_entries = length / sizeof(struct opal_sg_entry);
+ if (num_entries <= 0)
+ return -1;
+
+ for (i = 0; i < num_entries; i++) {
+ entry = &sg->entry[i];
+
+ /*
+ * Continue until we get data block which
+ * contains LID data
+ */
+ if (lid_offset > be64_to_cpu(entry->length)) {
+ lid_offset -= be64_to_cpu(entry->length);
+ continue;
+ }
+
+ /*
+ * SG list entry size can be more than 4k.
+ * Map only required pages, instead of
+ * mapping entire entry.
+ */
+ map_act = be64_to_cpu(entry->length);
+ map_size = be64_to_cpu(entry->length);
+
+ /* First TCE mapping */
+ if (!tce_start) {
+ tce_start = PSI_DMA_CODE_UPD +
+ (lid_offset & 0xfff);
+ map_act = be64_to_cpu(entry->length) - lid_offset;
+ lid_offset &= ~0xfff;
+ map_size = be64_to_cpu(entry->length) - lid_offset;
+ }
+
+ /* Check pending LID size to map */
+ if (lid_size <= map_act) {
+ /* (map_size - map_act) gives the difference between
+ * the page start and the TCE offset. This is required
+ * when the LID size is <= 4k.
+ */
+ map_size = (map_size - map_act) + lid_size;
+ last = true;
+ }
+
+ /* Adjust remaining size to map */
+ lid_size -= map_act;
+
+ /* TCE mapping */
+ code_update_tce_map(buf_pos,
+ (void*)(be64_to_cpu(entry->data)
+ + lid_offset),
+ map_size);
+ buf_pos += map_size;
+ /* Reset LID offset count */
+ lid_offset = 0;
+
+ if (last)
+ return OPAL_SUCCESS;
+ }
+ } /* outer loop */
+ return -1;
+}
+
+/*
+ * If IPL side is T, then swap P & T sides to add
+ * new fix to T side.
+ */
+static int validate_ipl_side(void)
+{
+ if (ipl_side == FW_IPL_SIDE_PERM)
+ return 0;
+ return code_update_swap_side();
+}
+
+static int64_t fsp_opal_validate_flash(uint64_t buffer,
+ __be32 *size, __be32 *result)
+{
+ int64_t rc = 0;
+ int offset;
+ uint32_t r;
+
+ lock(&flash_lock);
+
+ rc = validate_candidate_image(buffer, be32_to_cpu(*size), &r);
+ /* Fill output buffer
+ *
+ * Format:
+ * MI<sp>current-T-image<sp>current-P-image<0x0A>
+ * MI<sp>new-T-image<sp>new-P-image<0x0A>
+ * ML<sp>current-T-image<sp>current-P-image<0x0A>
+ * ML<sp>new-T-image<sp>new-P-image<0x0A>
+ */
+ if (!rc && (r != VALIDATE_FLASH_AUTH && r != VALIDATE_INVALID_IMG)) {
+ /* Clear output buffer */
+ memset((void *)buffer, 0, VALIDATE_BUF_SIZE);
+
+ offset = validate_out_buf_mi_data((void *)buffer, 0, r);
+ offset += validate_out_buf_ml_data((void *)buffer, offset, r);
+ *size = cpu_to_be32(offset);
+ }
+ *result = cpu_to_be32(r);
+
+ unlock(&flash_lock);
+ return rc;
+}
+
+/* Commit/Reject T side image */
+static int64_t fsp_opal_manage_flash(uint8_t op)
+{
+ uint32_t cmd;
+ int rc;
+
+ lock(&flash_lock);
+ rc = code_update_check_state();
+ unlock(&flash_lock);
+
+ if (rc != OPAL_SUCCESS)
+ return rc;
+
+ if (op != OPAL_REJECT_TMP_SIDE && op != OPAL_COMMIT_TMP_SIDE)
+ return OPAL_PARAMETER;
+
+ if ((op == OPAL_COMMIT_TMP_SIDE && ipl_side == FW_IPL_SIDE_PERM) ||
+ (op == OPAL_REJECT_TMP_SIDE && ipl_side == FW_IPL_SIDE_TEMP))
+ return OPAL_ACTIVE_SIDE_ERR;
+
+ if (op == OPAL_COMMIT_TMP_SIDE)
+ cmd = FSP_CMD_FLASH_NORMAL;
+ else
+ cmd = FSP_CMD_FLASH_REMOVE;
+
+ return code_update_commit(cmd);
+}
+
+static int fsp_flash_firmware(void)
+{
+ struct update_image_header *header;
+ struct lid_index_entry *idx_entry;
+ struct opal_sg_list *list;
+ struct opal_sg_entry *entry;
+ int rc, i;
+
+ /* Make sure no outstanding LID read is in progress */
+ rc = code_update_check_state();
+ if (rc == OPAL_BUSY)
+ fsp_code_update_wait_vpd(false);
+
+ /* Get LID Index */
+ list = image_data;
+ if (!list)
+ goto out;
+ entry = &list->entry[0];
+ header = (struct update_image_header *)be64_to_cpu(entry->data);
+ idx_entry = (void *)header + be16_to_cpu(header->lid_index_offset);
+
+ /* FIXME:
+ * At present we depend on FSP to validate CRC for
+ * individual LIDs. Calculate and validate individual
+ * LID CRC here.
+ */
+
+ if (validate_ipl_side() != 0) {
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: "
+ "Rename (Swap T and P) failed!\n");
+ goto out;
+ }
+
+ /* Set next IPL side */
+ if (code_update_set_ipl_side() != 0) {
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: "
+ "Setting next IPL side failed!\n");
+ goto out;
+ }
+
+ /* Start code update process */
+ if (code_update_start() != 0) {
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: "
+ "Code update start failed!\n");
+ goto out;
+ }
+
+ /*
+ * Delete T side LIDs before writing.
+ *
+ * Note:
+ * - Applicable for FWv >= 760.
+ * - Current Code Update design is to ignore
+ * any delete lid failure, and continue with
+ * the update.
+ */
+ rc = code_update_del_lid(DEL_UPD_SIDE_LIDS);
+
+ if (rc)
+ prlog(PR_TRACE, "CUPD: Failed to delete LIDs (%d). This is okay, continuing..", rc);
+
+ for (i = 0; i < be16_to_cpu(header->number_lids); i++) {
+ if (be32_to_cpu(idx_entry->size) > LID_MAX_SIZE) {
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: LID"
+ " (0x%x) size 0x%x is > max LID size (0x%x).\n",
+ be32_to_cpu(idx_entry->id),
+ be32_to_cpu(idx_entry->size), LID_MAX_SIZE);
+ goto abort_update;
+ }
+
+ rc = get_lid_data(list, be32_to_cpu(idx_entry->size),
+ be32_to_cpu(idx_entry->offset));
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: "
+ "Failed to parse LID from firmware image."
+ " (rc : %d).\n", rc);
+ goto abort_update;
+ }
+
+ rc = code_update_write_lid(be32_to_cpu(idx_entry->id),
+ be32_to_cpu(idx_entry->size));
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: "
+ "Failed to write LID to FSP. (rc : %d).\n", rc);
+ goto abort_update;
+ }
+
+ /* Unmap TCE */
+ code_update_tce_unmap(PSI_DMA_CODE_UPD_SIZE);
+
+ /* Next LID index */
+ idx_entry = (void *)idx_entry + sizeof(struct lid_index_entry);
+ }
+
+ /* Code update completed */
+ rc = code_update_complete(FSP_CMD_FLASH_COMPLETE);
+
+ return rc;
+
+abort_update:
+ rc = code_update_complete(FSP_CMD_FLASH_ABORT);
+ if (rc)
+ log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: "
+ "Code update abort command failed. (rc : %d).", rc);
+
+out:
+ return -1;
+}
+
+static int64_t validate_sglist(struct opal_sg_list *list)
+{
+ struct opal_sg_list *sg;
+ struct opal_sg_entry *prev_entry, *entry;
+ int length, num_entries, i;
+
+ prev_entry = NULL;
+ for (sg = list; sg; sg = (struct opal_sg_list*)be64_to_cpu(sg->next)) {
+ length = (be64_to_cpu(sg->length) & ~(SG_LIST_VERSION << 56)) - 16;
+ num_entries = length / sizeof(struct opal_sg_entry);
+ if (num_entries <= 0)
+ return -1;
+
+ for (i = 0; i < num_entries; i++) {
+ entry = &sg->entry[i];
+
+ /* All entries must be aligned */
+ if (((uint64_t)be64_to_cpu(entry->data)) & 0xfff)
+ return OPAL_PARAMETER;
+
+ /* All non-terminal entries size must be aligned */
+ if (prev_entry && (be64_to_cpu(prev_entry->length) & 0xfff))
+ return OPAL_PARAMETER;
+
+ prev_entry = entry;
+ }
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t fsp_opal_update_flash(struct opal_sg_list *list)
+{
+ struct opal_sg_entry *entry;
+ int length, num_entries, result = 0, rc = OPAL_PARAMETER;
+
+ /* Ensure that the sg list honors our alignment requirements */
+ rc = validate_sglist(list);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_CU_SG_LIST),
+ "CUPD: sglist fails alignment requirements\n");
+ return rc;
+ }
+
+ lock(&flash_lock);
+ if (!list) { /* Cancel update request */
+ fsp_flash_term_hook = NULL;
+ image_data = NULL;
+ rc = OPAL_SUCCESS;
+ goto out;
+ }
+
+ disable_fast_reboot("FSP Code Update");
+
+ length = (be64_to_cpu(list->length) & ~(SG_LIST_VERSION << 56)) - 16;
+ num_entries = length / sizeof(struct opal_sg_entry);
+ if (num_entries <= 0)
+ goto out;
+
+ /* Validate image header */
+ entry = &list->entry[0];
+ rc = validate_candidate_image((uint64_t)be64_to_cpu(entry->data),
+ VALIDATE_BUF_SIZE, &result);
+ if (!rc && (result != VALIDATE_FLASH_AUTH &&
+ result != VALIDATE_INVALID_IMG)) {
+ image_data = list;
+ fsp_flash_term_hook = fsp_flash_firmware;
+ goto out;
+ }
+
+ /* Adjust return code */
+ if (result == VALIDATE_FLASH_AUTH)
+ rc = OPAL_FLASH_NO_AUTH;
+ else if (result == VALIDATE_INVALID_IMG)
+ rc = OPAL_INVALID_IMAGE;
+
+out:
+ unlock(&flash_lock);
+ return rc;
+}
+
+/*
+ * Code Update notifications
+ *
+ * Note: At present we just ACK these notifications.
+ * Cached VPD data will need to be reset if we add support for
+ * concurrent image maintenance in the future.
+ */
+static bool code_update_notify(uint32_t cmd_sub_mod, struct fsp_msg *msg)
+{
+ int rc;
+ uint32_t cmd;
+
+ switch(cmd_sub_mod) {
+ case FSP_CMD_FLASH_CACHE:
+ cmd = FSP_CMD_FLASH_CACHE_RSP;
+ prlog(PR_NOTICE, "CUPD: Update LID cache event [data = 0x%x]\n",
+ fsp_msg_get_data_word(msg, 0));
+ break;
+ case FSP_CMD_FLASH_OUTC:
+ case FSP_CMD_FLASH_OUTR:
+ case FSP_CMD_FLASH_OUTS:
+ cmd = FSP_CMD_FLASH_OUT_RSP;
+ prlog(PR_NOTICE, "CUPD: Out of band commit notify "
+ "[Type = 0x%x]\n", (msg->word1 >> 8) & 0xff);
+ break;
+ default:
+ log_simple_error(&e_info(OPAL_RC_CU_NOTIFY), "CUPD: Unknown "
+ "notification [cmd = 0x%x]\n", cmd_sub_mod);
+ return false;
+ }
+
+ rc = fsp_queue_msg(fsp_mkmsg(cmd, 0), fsp_freemsg);
+ if (rc)
+ log_simple_error(&e_info(OPAL_RC_CU_NOTIFY), "CUPD: Failed to "
+ "queue code update notification response :%d\n", rc);
+
+ return true;
+}
+
+/*
+ * Handle FSP R/R event.
+ *
+ * Note:
+ * If FSP R/R happens during code update, then entire system reboots
+ * and comes up with P side image (and T side image will be invalid).
+ * Hence we don't need to handle R/R during code update.
+ *
+ * Also if FSP R/R happens in init path (while retrieving in_flight_params)
+ * then system fails to continue booting (because we have not yet loaded
+ * all required data/LID from FSP). Hence we don't need to handle R/R
+ * for system params.
+ */
+static bool fsp_code_update_rr(uint32_t cmd_sub_mod,
+ struct fsp_msg *msg __unused)
+{
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ lock(&flash_lock);
+
+ if (code_update_check_state() == OPAL_BUSY)
+ flash_state = FLASH_STATE_ABORT;
+
+ unlock(&flash_lock);
+ return true;
+ case FSP_RELOAD_COMPLETE:
+ lock(&flash_lock);
+
+ /* Let's try to parse the marker LID again if we failed
+ * to parse it last time.
+ */
+ if (code_update_check_state() == OPAL_INTERNAL_ERROR)
+ fetch_com_marker_lid();
+
+ unlock(&flash_lock);
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_cupd_client_rr = {
+ .message = fsp_code_update_rr,
+};
+
+static struct fsp_client fsp_get_notify = {
+ .message = code_update_notify,
+};
+
+void fsp_code_update_init(void)
+{
+ if (!fsp_present()) {
+ flash_state = FLASH_STATE_ABSENT;
+ return;
+ }
+
+ /* OPAL interface */
+ opal_register(OPAL_FLASH_VALIDATE, fsp_opal_validate_flash, 3);
+ opal_register(OPAL_FLASH_MANAGE, fsp_opal_manage_flash, 1);
+ opal_register(OPAL_FLASH_UPDATE, fsp_opal_update_flash, 1);
+
+ /* register Code Update Class D3 */
+ fsp_register_client(&fsp_get_notify, FSP_MCLASS_CODE_UPDATE);
+ /* Register for Class AA (FSP R/R) */
+ fsp_register_client(&fsp_cupd_client_rr, FSP_MCLASS_RR_EVENT);
+
+ /* Register for firmware IPL side update notification */
+ sysparam_add_update_notifier(fw_ipl_side_update_notify);
+
+ /* Flash hook */
+ fsp_flash_term_hook = NULL;
+
+ /* Fetch various code update related sys parameters */
+ get_ipl_side();
+ get_code_update_policy();
+ get_platform_hmc_managed();
+
+ /* Fetch common marker LID */
+ lid_data = memalign(TCE_PSIZE, MARKER_LID_SIZE);
+ if (!lid_data) {
+ log_simple_error(&e_info(OPAL_RC_CU_INIT),
+ "CUPD: Failed to allocate memory for marker LID\n");
+ flash_state = FLASH_STATE_ABSENT;
+ return;
+ }
+ fetch_com_marker_lid();
+}
diff --git a/roms/skiboot/hw/fsp/fsp-codeupdate.h b/roms/skiboot/hw/fsp/fsp-codeupdate.h
new file mode 100644
index 000000000..2b86619ef
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-codeupdate.h
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2015 IBM Corp. */
+
+#ifndef __CODEUPDATE_H
+#define __CODEUPDATE_H
+
+/* Flash SG list version */
+#define SG_LIST_VERSION (1UL)
+
+/* LID size <= 16M */
+#define LID_MAX_SIZE 0x1000000
+
+/* Delete all LIDs on the update (T) side */
+#define DEL_UPD_SIDE_LIDS 0xFFFFFFFF
+
+/* System parameter values used in code update validation */
+#define INBAND_UPDATE_ALLOWED 0x01
+#define PLATFORM_HMC_MANAGED 0x01
+#define FW_LICENSE_ACCEPT 0x01
+
+/* Running image side */
+#define FW_IPL_SIDE_TEMP 0x01
+#define FW_IPL_SIDE_PERM 0x00
+
+/* Manage operations */
+#define OPAL_REJECT_TMP_SIDE 0
+#define OPAL_COMMIT_TMP_SIDE 1
+
+/* Validate image size */
+#define VALIDATE_BUF_SIZE 4096
+
+/* Code update operation status */
+#define OPAL_INVALID_IMAGE -1003 /* Unacceptable image */
+#define OPAL_ACTIVE_SIDE_ERR -9001
+#define OPAL_FLASH_NO_AUTH -9002
+
+/* Validate image update result tokens */
+#define VALIDATE_TMP_UPDATE 0 /* T side will be updated */
+#define VALIDATE_FLASH_AUTH 1 /* Partition does not have authority */
+#define VALIDATE_INVALID_IMG 2 /* Candidate image is not valid */
+#define VALIDATE_CUR_UNKNOWN 3 /* Current fixpack level is unknown */
+/*
+ * Current T side will be committed to P side before being replaced with the
+ * new image, and the new image is downlevel from the current image
+ */
+#define VALIDATE_TMP_COMMIT_DL 4
+/*
+ * Current T side will be committed to P side before being replaced with new
+ * image
+ */
+#define VALIDATE_TMP_COMMIT 5
+/*
+ * T side will be updated with a downlevel image
+ */
+#define VALIDATE_TMP_UPDATE_DL 6
+/*
+ * The candidate image's release date is later than the system's firmware
+ * service entitlement date - service warranty period has expired
+ */
+#define VALIDATE_OUT_OF_WRNTY 7
+
+/* default version */
+#define FW_VERSION_UNKNOWN "UNKNOWN"
+
+/* Actual size of MI & ML keyword including NULL */
+#define MI_KEYWORD_SIZE 10
+#define ML_KEYWORD_SIZE 9
+
+/* Firmware image VPD data */
+struct fw_image_vpd {
+ char mi_keyword[MI_KEYWORD_SIZE]; /* NNSSS_FFF */
+ char ext_fw_id[ML_KEYWORD_SIZE]; /* FWxxx.yy */
+};
+
+/* Master LID header */
+struct master_lid_header {
+ char key[3]; /* "MLH" */
+ uint8_t version; /* 0x02 */
+ __be16 header_size;
+ __be16 entry_size;
+ uint8_t reserved[56];
+};
+
+/* LID index entry */
+struct lid_index_entry {
+ __be32 id;
+ __be32 size;
+ __be32 offset;
+ __be32 crc;
+};
+
+/* SP flags */
+#define FW_ONE_OFF_SP 0x80000000
+#define FW_EMERGENCY_SP 0x40000000
+
+/*
+ * SP GA date
+ *
+ * sp_flag addr = header->data + header->ext_fw_id_size
+ */
+struct update_image_ga_date {
+ __be32 sp_flag;
+ char sp_ga_date[8]; /* YYYYMMDD */
+};
+
+/* Image magic number */
+#define IMAGE_MAGIC_NUMBER 0x5549
+
+/* Image header structure */
+struct update_image_header {
+ __be16 magic;
+ __be16 version;
+ __be32 package_size;
+ __be32 crc;
+ __be16 lid_index_offset;
+ __be16 number_lids;
+ __be16 package_flags;
+ __be16 mi_keyword_size;
+ char mi_keyword_data[40];
+ __be16 ext_fw_id_size;
+ /* Rest of the image data including ext fw id, sp flags */
+ char data[];
+};
+
+/* FipS header */
+struct fips_header {
+ __be16 magic;
+ __be16 version;
+ __be32 lid_id;
+ __be32 lid_date; /* YYYYMMDD */
+ __be16 lid_time; /* HHMM */
+ __be16 lid_class;
+ __be32 crc;
+ __be32 lid_size; /* Number of bytes below header */
+ __be32 header_size;
+ uint8_t mtd_number;
+ uint8_t valid; /* 1 = valid, 0 = invalid */
+ uint8_t reserved;
+ uint8_t lid_info_size;
+ char lid_info[64]; /* code level */
+ __be32 update_date; /* YYYYMMDD */
+ __be16 update_time; /* HHMM */
+ __be16 phylum_len;
+ uint8_t lid_phylum[];
+};
+
+/* Approximate LID size */
+#define MASTER_LID_SIZE 0x5000
+/*
+ * Note:
+ * The doc indicates non-SP LID sizes are 0-8MB. However,
+ * in reality the marker LID is less than 4k. Allocate
+ * 8k to give some breathing space.
+ */
+#define MARKER_LID_SIZE 0x00002000
+
+/* Common marker LID numbers */
+#define P_COM_MARKER_LID_ID 0x80A00001
+#define T_COM_MARKER_LID_ID (P_COM_MARKER_LID_ID | ADJUST_T_SIDE_LID_NO)
+
+/*
+ * Common marker LID structure
+ *
+ * Note that we are populating only required sections,
+ * not all ADF sections in common marker LID.
+ */
+struct com_marker_header {
+ __be32 version;
+ __be32 MI_offset; /* Offset to MI section */
+ __be32 iseries_offset;
+};
+
+/* MI Keyword section */
+struct com_marker_mi_section {
+ __be32 MI_size;
+ char mi_keyword[40]; /* MI Keyword */
+ char lst_disrupt_fix_lvl[3];
+ char skip[21]; /* Skip fields we are not interested in */
+ __be32 adf_offset; /* Offset to ADF section */
+};
+
+/* Additional Data Fields */
+struct com_marker_adf_sec {
+ __be32 adf_cnt; /* ADF count */
+ char adf_data[]; /* ADF data */
+};
+
+/* ADF common header */
+struct com_marker_adf_header {
+ __be32 size; /* Section size */
+ __be32 name; /* Section name */
+};
+
+/*
+ * Service Pack Nomenclature ADF
+ *
+ * Service pack release name.
+ */
+#define ADF_NAME_SP 0x53504E4D /* SPNM */
+struct com_marker_adf_sp
+{
+ struct com_marker_adf_header header;
+ __be32 sp_name_offset; /* Offset from start of ADF */
+ __be32 sp_name_size;
+ __be32 skip[4]; /* Skip rest of fields */
+};
+
+/*
+ * Firmware IP Protection ADF
+ *
+ * Service Pack flags and GA date.
+ */
+#define ADF_NAME_FW_IP 0x46495050 /* FIPP */
+struct com_marker_fw_ip {
+ struct com_marker_adf_header header;
+ __be32 sp_flag_offset; /* Offset from start of ADF */
+ __be32 sp_flag_size;
+ __be32 sp_ga_offset; /* Offset from start of ADF*/
+ __be32 sp_ga_size;
+};
+
+#endif /* __CODEUPDATE_H */
diff --git a/roms/skiboot/hw/fsp/fsp-console.c b/roms/skiboot/hw/fsp/fsp-console.c
new file mode 100644
index 000000000..dc23ac46f
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-console.c
@@ -0,0 +1,1062 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Flexible Service Processor (FSP) serial console handling code
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <processor.h>
+#include <io.h>
+#include <fsp.h>
+#include <console.h>
+#include <opal.h>
+#include <timebase.h>
+#include <device.h>
+#include <fsp-sysparam.h>
+#include <errorlog.h>
+#include <lock.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_CONSOLE_HANG, OPAL_PLATFORM_ERR_EVT, OPAL_CONSOLE,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_GENERAL, OPAL_NA);
+
+struct fsp_serbuf_hdr {
+ __be16 partition_id;
+ u8 session_id;
+ u8 hmc_id;
+ __be16 data_offset;
+ __be16 last_valid;
+ __be16 ovf_count;
+ __be16 next_in;
+ u8 flags;
+ u8 reserved;
+ __be16 next_out;
+ u8 data[];
+};
+#define SER_BUF_DATA_SIZE (0x10000 - sizeof(struct fsp_serbuf_hdr))
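+
+/*
+ * The in/out buffers are ring buffers over data[]: next_in is the
+ * producer index, next_out the consumer index, and one byte is always
+ * left unused so that next_in == next_out unambiguously means "empty".
+ * A minimal sketch (hypothetical helper, not used below) of the
+ * free-space computation repeated throughout this file:
+ */
+static inline u16 fsp_serbuf_space(const struct fsp_serbuf_hdr *sb)
+{
+ return (be16_to_cpu(sb->next_out) + SER_BUF_DATA_SIZE
+ - be16_to_cpu(sb->next_in) - 1) % SER_BUF_DATA_SIZE;
+}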
+
+struct fsp_serial {
+ bool available;
+ bool open;
+ bool has_part0;
+ bool has_part1;
+ bool log_port;
+ bool out_poke;
+ char loc_code[LOC_CODE_SIZE];
+ u16 rsrc_id;
+ struct fsp_serbuf_hdr *in_buf;
+ struct fsp_serbuf_hdr *out_buf;
+ struct fsp_msg *poke_msg;
+ u8 waiting;
+ u64 irq;
+ u16 out_buf_prev_len;
+ u64 out_buf_timeout;
+};
+
+#define SER_BUFFER_SIZE 0x00040000UL
+#define MAX_SERIAL 4
+
+#define SER_BUFFER_OUT_TIMEOUT 10
+
+static struct fsp_serial fsp_serials[MAX_SERIAL];
+static bool got_intf_query;
+static struct lock fsp_con_lock = LOCK_UNLOCKED;
+static void* ser_buffer = NULL;
+
+static void fsp_console_reinit(void)
+{
+ int i;
+ void *base;
+ struct fsp_msg *msg;
+
+ /* Initialize our data structure pointers & TCE maps */
+ base = ser_buffer;
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *ser = &fsp_serials[i];
+
+ ser->in_buf = base;
+ ser->out_buf = base + SER_BUFFER_SIZE/2;
+ base += SER_BUFFER_SIZE;
+ }
+ fsp_tce_map(PSI_DMA_SER0_BASE, ser_buffer,
+ 4 * PSI_DMA_SER0_SIZE);
+
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+
+ if (!fs->available)
+ continue;
+
+ if (fs->rsrc_id == 0xffff)
+ continue;
+ prlog(PR_DEBUG, "FSP: Reassociating HVSI console %d\n", i);
+ msg = fsp_mkmsg(FSP_CMD_ASSOC_SERIAL, 2,
+ (fs->rsrc_id << 16) | 1, i);
+ if (!msg) {
+ prerror("FSPCON: Failed to allocate associate msg\n");
+ return;
+ }
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("FSPCON: Failed to queue associate msg\n");
+ return;
+ }
+ }
+}
+
+static void fsp_close_consoles(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+
+ if (!fs->available)
+ continue;
+
+ lock(&fsp_con_lock);
+ if (fs->open) {
+ fs->open = false;
+ fs->out_poke = false;
+ if (fs->poke_msg->state != fsp_msg_unused)
+ fsp_cancelmsg(fs->poke_msg);
+ fsp_freemsg(fs->poke_msg);
+ fs->poke_msg = NULL;
+ }
+ unlock(&fsp_con_lock);
+ }
+ prlog(PR_DEBUG, "FSPCON: Closed consoles due to FSP reset/reload\n");
+}
+
+static void fsp_pokemsg_reclaim(struct fsp_msg *msg)
+{
+ struct fsp_serial *fs = msg->user_data;
+
+ /*
+ * The poke_msg might have been "detached" from the console
+ * in vserial_close, so we need to check whether it's current
+ * before touching the state, otherwise, just free it
+ */
+ lock(&fsp_con_lock);
+ if (fs->open && fs->poke_msg == msg) {
+ if (fs->out_poke) {
+ if (fsp_queue_msg(fs->poke_msg, fsp_pokemsg_reclaim)) {
+ prerror("FSPCON: failed to queue poke msg\n");
+ } else {
+ fs->out_poke = false;
+ }
+ } else
+ fs->poke_msg->state = fsp_msg_unused;
+ } else
+ fsp_freemsg(msg);
+ unlock(&fsp_con_lock);
+}
+
+/* Called with the fsp_con_lock held */
+static size_t fsp_write_vserial(struct fsp_serial *fs, const char *buf,
+ size_t len)
+{
+ struct fsp_serbuf_hdr *sb = fs->out_buf;
+ u16 old_nin = be16_to_cpu(sb->next_in);
+ u16 space, chunk;
+
+ if (!fs->open)
+ return 0;
+
+ space = (be16_to_cpu(sb->next_out) + SER_BUF_DATA_SIZE - old_nin - 1)
+ % SER_BUF_DATA_SIZE;
+ if (space < len)
+ len = space;
+ if (!len)
+ return 0;
+
+ chunk = SER_BUF_DATA_SIZE - old_nin;
+ if (chunk > len)
+ chunk = len;
+ memcpy(&sb->data[old_nin], buf, chunk);
+ if (chunk < len)
+ memcpy(&sb->data[0], buf + chunk, len - chunk);
+ lwsync();
+ sb->next_in = cpu_to_be16((old_nin + len) % SER_BUF_DATA_SIZE);
+ sync();
+
+ if (be16_to_cpu(sb->next_out) == old_nin && fs->poke_msg) {
+ if (fs->poke_msg->state == fsp_msg_unused) {
+ if (fsp_queue_msg(fs->poke_msg, fsp_pokemsg_reclaim))
+ prerror("FSPCON: poke msg queuing failed\n");
+ } else
+ fs->out_poke = true;
+ }
+#ifndef DISABLE_CON_PENDING_EVT
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_OUTPUT,
+ OPAL_EVENT_CONSOLE_OUTPUT);
+#endif
+ return len;
+}
+
+#ifdef DVS_CONSOLE
+static int fsp_con_port = -1;
+static bool fsp_con_full;
+
+/*
+ * This is called by the code in console.c without the con_lock
+ * held. However, it can be called as the result of any printf,
+ * so any other lock might be held, including possibly the
+ * FSP lock.
+ */
+static size_t fsp_con_write(const char *buf, size_t len)
+{
+ size_t written;
+
+ if (fsp_con_port < 0)
+ return 0;
+
+ lock(&fsp_con_lock);
+ written = fsp_write_vserial(&fsp_serials[fsp_con_port], buf, len);
+ fsp_con_full = (written < len);
+ unlock(&fsp_con_lock);
+
+ return written;
+}
+
+static struct con_ops fsp_con_ops = {
+ .write = fsp_con_write,
+};
+#endif /* DVS_CONSOLE */
+
+static void fsp_open_vserial(struct fsp_msg *msg)
+{
+ struct fsp_msg *resp;
+
+ u16 part_id = fsp_msg_get_data_word(msg, 0) & 0xffff;
+ u16 sess_id = fsp_msg_get_data_word(msg, 1) & 0xffff;
+ u8 hmc_sess = msg->data.bytes[0];
+ u8 hmc_indx = msg->data.bytes[1];
+ u8 authority = msg->data.bytes[4];
+ u32 tce_in, tce_out;
+ struct fsp_serial *fs;
+
+ prlog(PR_INFO, "FSPCON: Got VSerial Open\n");
+ prlog(PR_DEBUG, " part_id = 0x%04x\n", part_id);
+ prlog(PR_DEBUG, " sess_id = 0x%04x\n", sess_id);
+ prlog(PR_DEBUG, " hmc_sess = 0x%02x\n", hmc_sess);
+ prlog(PR_DEBUG, " hmc_indx = 0x%02x\n", hmc_indx);
+ prlog(PR_DEBUG, " authority = 0x%02x\n", authority);
+
+ if (sess_id >= MAX_SERIAL || !fsp_serials[sess_id].available) {
+ prlog(PR_WARNING, "FSPCON: 0x%04x NOT AVAILABLE!\n", sess_id);
+ resp = fsp_mkmsg(FSP_RSP_OPEN_VSERIAL | 0x2f, 0);
+ if (!resp) {
+ prerror("FSPCON: Response allocation failed\n");
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSPCON: Failed to queue response msg\n");
+ }
+ return;
+ }
+
+ fs = &fsp_serials[sess_id];
+
+ /* Hack ! On blades, the console opened via the mm has partition 1
+ * while the debug DVS generally has partition 0 (though you can
+ * use what you want really).
+ * We don't want a DVS open/close to crap on the blademm console,
+ * so if a raw console gets an open with partID 1, we set a flag
+ * that makes us ignore a later close of partid 0.
+ */
+ if (fs->rsrc_id == 0xffff) {
+ if (part_id == 0)
+ fs->has_part0 = true;
+ if (part_id == 1)
+ fs->has_part1 = true;
+ }
+
+ tce_in = PSI_DMA_SER0_BASE + PSI_DMA_SER0_SIZE * sess_id;
+ tce_out = tce_in + SER_BUFFER_SIZE/2;
+
+ lock(&fsp_con_lock);
+ if (fs->open) {
+ prlog(PR_DEBUG, " already open, skipping init !\n");
+ unlock(&fsp_con_lock);
+ goto already_open;
+ }
+
+ fs->poke_msg = fsp_mkmsg(FSP_CMD_VSERIAL_OUT, 2,
+ fsp_msg_get_data_word(msg, 0),
+ fsp_msg_get_data_word(msg, 1) & 0xffff);
+ if (fs->poke_msg == NULL) {
+ prerror("FSPCON: Failed to allocate poke_msg\n");
+ unlock(&fsp_con_lock);
+ return;
+ }
+
+ fs->open = true;
+ fs->poke_msg->user_data = fs;
+
+ fs->in_buf->partition_id = fs->out_buf->partition_id = cpu_to_be16(part_id);
+ fs->in_buf->session_id = fs->out_buf->session_id = sess_id;
+ fs->in_buf->hmc_id = fs->out_buf->hmc_id = hmc_indx;
+ fs->in_buf->data_offset = fs->out_buf->data_offset =
+ cpu_to_be16(sizeof(struct fsp_serbuf_hdr));
+ fs->in_buf->last_valid = fs->out_buf->last_valid =
+ cpu_to_be16(SER_BUF_DATA_SIZE - 1);
+ fs->in_buf->ovf_count = fs->out_buf->ovf_count = 0;
+ fs->in_buf->next_in = fs->out_buf->next_in = 0;
+ fs->in_buf->flags = fs->out_buf->flags = 0;
+ fs->in_buf->reserved = fs->out_buf->reserved = 0;
+ fs->in_buf->next_out = fs->out_buf->next_out = 0;
+ fs->out_buf_prev_len = 0;
+ fs->out_buf_timeout = 0;
+ unlock(&fsp_con_lock);
+
+ already_open:
+ resp = fsp_mkmsg(FSP_RSP_OPEN_VSERIAL, 6, fsp_msg_get_data_word(msg, 0),
+ fsp_msg_get_data_word(msg, 1) & 0xffff, 0, tce_in, 0, tce_out);
+ if (!resp) {
+ prerror("FSPCON: Failed to allocate open msg response\n");
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSPCON: Failed to queue open msg response\n");
+ return;
+ }
+
+#ifdef DVS_CONSOLE
+ prlog(PR_DEBUG, " log_port = %d\n", fs->log_port);
+ if (fs->log_port) {
+ fsp_con_port = sess_id;
+ sync();
+ /*
+ * We mark the FSP lock as being in the console
+ * path. We do that only once and never unmark it
+ * (there is really not much point).
+ */
+ fsp_used_by_console();
+ fsp_con_lock.in_con_path = true;
+ /* See comment in fsp_used_by_console */
+ lock(&fsp_con_lock);
+ unlock(&fsp_con_lock);
+ set_console(&fsp_con_ops);
+ }
+#endif
+}
+
+static void fsp_close_vserial(struct fsp_msg *msg)
+{
+ u16 part_id = fsp_msg_get_data_word(msg, 0) & 0xffff;
+ u16 sess_id = fsp_msg_get_data_word(msg, 1) & 0xffff;
+ u8 hmc_sess = msg->data.bytes[0];
+ u8 hmc_indx = msg->data.bytes[1];
+ u8 authority = msg->data.bytes[4];
+ struct fsp_serial *fs;
+ struct fsp_msg *resp;
+
+ prlog(PR_INFO, "FSPCON: Got VSerial Close\n");
+ prlog(PR_DEBUG, " part_id = 0x%04x\n", part_id);
+ prlog(PR_DEBUG, " sess_id = 0x%04x\n", sess_id);
+ prlog(PR_DEBUG, " hmc_sess = 0x%02x\n", hmc_sess);
+ prlog(PR_DEBUG, " hmc_indx = 0x%02x\n", hmc_indx);
+ prlog(PR_DEBUG, " authority = 0x%02x\n", authority);
+
+ if (sess_id >= MAX_SERIAL || !fsp_serials[sess_id].available) {
+ prlog(PR_WARNING, "FSPCON: 0x%04x NOT AVAILABLE!\n", sess_id);
+ goto skip_close;
+ }
+
+ fs = &fsp_serials[sess_id];
+
+ /* See "HACK" comment in open */
+ if (fs->rsrc_id == 0xffff) {
+ if (part_id == 0)
+ fs->has_part0 = false;
+ if (part_id == 1)
+ fs->has_part1 = false;
+ if (fs->has_part0 || fs->has_part1) {
+ prlog(PR_DEBUG, " skipping close !\n");
+ goto skip_close;
+ }
+ }
+
+#ifdef DVS_CONSOLE
+ if (fs->log_port) {
+ fsp_con_port = -1;
+ set_console(NULL);
+ }
+#endif
+
+ lock(&fsp_con_lock);
+ if (fs->open) {
+ fs->open = false;
+ fs->out_poke = false;
+ if (fs->poke_msg && fs->poke_msg->state == fsp_msg_unused) {
+ fsp_freemsg(fs->poke_msg);
+ fs->poke_msg = NULL;
+ }
+ }
+ unlock(&fsp_con_lock);
+ skip_close:
+ resp = fsp_mkmsg(FSP_RSP_CLOSE_VSERIAL, 2, fsp_msg_get_data_word(msg, 0),
+ fsp_msg_get_data_word(msg, 1) & 0xffff);
+ if (!resp) {
+ prerror("FSPCON: Failed to allocate close msg response\n");
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSPCON: Failed to queue close msg response\n");
+ }
+}
+
+static bool fsp_con_msg_hmc(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ struct fsp_msg *resp;
+
+ /* Associate response */
+ if ((cmd_sub_mod >> 8) == 0xe08a) {
+ prlog(PR_TRACE, "FSPCON: Got associate response, status"
+ " 0x%02x\n", cmd_sub_mod & 0xff);
+ return true;
+ }
+ if ((cmd_sub_mod >> 8) == 0xe08b) {
+ prlog(PR_TRACE, "Got unassociate response, status 0x%02x\n",
+ cmd_sub_mod & 0xff);
+ return true;
+ }
+ switch(cmd_sub_mod) {
+ case FSP_CMD_OPEN_VSERIAL:
+ fsp_open_vserial(msg);
+ return true;
+ case FSP_CMD_CLOSE_VSERIAL:
+ fsp_close_vserial(msg);
+ return true;
+ case FSP_CMD_HMC_INTF_QUERY:
+ prlog(PR_DEBUG, "FSPCON: Got HMC interface query\n");
+ got_intf_query = true;
+ resp = fsp_mkmsg(FSP_RSP_HMC_INTF_QUERY, 1,
+ fsp_msg_get_data_word(msg, 0) & 0x00ffffff);
+ if (!resp) {
+ prerror("FSPCON: Failed to allocate hmc intf response\n");
+ return true;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSPCON: Failed to queue hmc intf response\n");
+ }
+ return true;
+ }
+ return false;
+}
+
+static bool fsp_con_msg_vt(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ u16 sess_id = fsp_msg_get_data_word(msg, 1) & 0xffff;
+
+ if (cmd_sub_mod == FSP_CMD_VSERIAL_IN && sess_id < MAX_SERIAL) {
+ struct fsp_serial *fs = &fsp_serials[sess_id];
+
+ if (!fs->open)
+ return true;
+
+ /* FSP is signaling some incoming data. We take the console
+ * lock to avoid racing with a simultaneous read, though we
+ * might want to consider simplifying all that locking into
+ * one single lock that covers the console and the pending
+ * events.
+ */
+ lock(&fsp_con_lock);
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT,
+ OPAL_EVENT_CONSOLE_INPUT);
+ opal_update_pending_evt(fs->irq, fs->irq);
+ unlock(&fsp_con_lock);
+ }
+ return true;
+}
+
+static bool fsp_con_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ assert(msg == NULL);
+
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ fsp_close_consoles();
+ return true;
+ case FSP_RELOAD_COMPLETE:
+ fsp_console_reinit();
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_con_client_hmc = {
+ .message = fsp_con_msg_hmc,
+};
+
+static struct fsp_client fsp_con_client_vt = {
+ .message = fsp_con_msg_vt,
+};
+
+static struct fsp_client fsp_con_client_rr = {
+ .message = fsp_con_msg_rr,
+};
+
+static void fsp_serial_add(int index, u16 rsrc_id, const char *loc_code,
+ bool log_port)
+{
+ struct fsp_serial *ser;
+ struct fsp_msg *msg;
+
+ lock(&fsp_con_lock);
+ ser = &fsp_serials[index];
+
+ if (ser->available) {
+ unlock(&fsp_con_lock);
+ return;
+ }
+
+ ser->rsrc_id = rsrc_id;
+ memset(ser->loc_code, 0x00, LOC_CODE_SIZE);
+ strncpy(ser->loc_code, loc_code, LOC_CODE_SIZE - 1);
+ ser->available = true;
+ ser->log_port = log_port;
+ unlock(&fsp_con_lock);
+
+ /* DVS doesn't have that */
+ if (rsrc_id != 0xffff) {
+ msg = fsp_mkmsg(FSP_CMD_ASSOC_SERIAL, 2,
+ (rsrc_id << 16) | 1, index);
+ if (!msg) {
+ prerror("FSPCON: Assoc serial alloc failed\n");
+ return;
+ }
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("FSPCON: Assoc serial queue failed\n");
+ return;
+ }
+ }
+}
+
+void fsp_console_preinit(void)
+{
+ int i;
+ void *base;
+
+ if (!fsp_present())
+ return;
+
+ ser_buffer = memalign(TCE_PSIZE, SER_BUFFER_SIZE * MAX_SERIAL);
+
+ /* Initialize our data structure pointers & TCE maps */
+ base = ser_buffer;
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *ser = &fsp_serials[i];
+
+ ser->in_buf = base;
+ ser->out_buf = base + SER_BUFFER_SIZE/2;
+ base += SER_BUFFER_SIZE;
+ }
+ fsp_tce_map(PSI_DMA_SER0_BASE, ser_buffer,
+ 4 * PSI_DMA_SER0_SIZE);
+
+ /* Register for class E0 and E1 */
+ fsp_register_client(&fsp_con_client_hmc, FSP_MCLASS_HMC_INTFMSG);
+ fsp_register_client(&fsp_con_client_vt, FSP_MCLASS_HMC_VT);
+ fsp_register_client(&fsp_con_client_rr, FSP_MCLASS_RR_EVENT);
+
+ /* Add DVS ports. We currently have sessions 0 and 3: 0 is for
+ * OS use, 3 is our debug port. We need to add those before
+ * we complete the OPL or we'll potentially miss the
+ * console setup on Firebird blades.
+ */
+ fsp_serial_add(0, 0xffff, "DVS_OS", false);
+ op_display(OP_LOG, OP_MOD_FSPCON, 0x0001);
+ fsp_serial_add(3, 0xffff, "DVS_FW", true);
+ op_display(OP_LOG, OP_MOD_FSPCON, 0x0002);
+
+}
+
+static int64_t fsp_console_write(int64_t term_number, __be64 *__length,
+ const uint8_t *buffer)
+{
+ struct fsp_serial *fs;
+ size_t written, requested;
+
+ if (term_number < 0 || term_number >= MAX_SERIAL)
+ return OPAL_PARAMETER;
+ fs = &fsp_serials[term_number];
+ if (!fs->available || fs->log_port)
+ return OPAL_PARAMETER;
+ lock(&fsp_con_lock);
+ if (!fs->open) {
+ unlock(&fsp_con_lock);
+ return OPAL_CLOSED;
+ }
+ /* Clamp to a reasonable size */
+ requested = be64_to_cpu(*__length);
+ if (requested > 0x1000)
+ requested = 0x1000;
+ written = fsp_write_vserial(fs, buffer, requested);
+
+ if (written) {
+ /* If we wrote anything, reset timeout */
+ fs->out_buf_prev_len = 0;
+ fs->out_buf_timeout = 0;
+ }
+
+#ifdef OPAL_DEBUG_CONSOLE_IO
+ prlog(PR_TRACE, "OPAL: console write req=%ld written=%ld"
+ " ni=%d no=%d\n",
+ requested, written, be16_to_cpu(fs->out_buf->next_in),
+ be16_to_cpu(fs->out_buf->next_out));
+ prlog(PR_TRACE, " %02x %02x %02x %02x "
+ "%02x \'%c\' %02x \'%c\' %02x \'%c\'.%02x \'%c\'..\n",
+ buffer[0], buffer[1], buffer[2], buffer[3],
+ buffer[4], buffer[4], buffer[5], buffer[5],
+ buffer[6], buffer[6], buffer[7], buffer[7]);
+#endif /* OPAL_DEBUG_CONSOLE_IO */
+
+ *__length = cpu_to_be64(written);
+ unlock(&fsp_con_lock);
+
+ if (written)
+ return OPAL_SUCCESS;
+
+ return OPAL_HARDWARE;
+}
+
+static int64_t fsp_console_write_buffer_space(int64_t term_number,
+ __be64 *__length)
+{
+ static bool elog_generated = false;
+ struct fsp_serial *fs;
+ struct fsp_serbuf_hdr *sb;
+ int64_t length;
+
+ if (term_number < 0 || term_number >= MAX_SERIAL)
+ return OPAL_PARAMETER;
+ fs = &fsp_serials[term_number];
+ if (!fs->available || fs->log_port)
+ return OPAL_PARAMETER;
+ lock(&fsp_con_lock);
+ if (!fs->open) {
+ unlock(&fsp_con_lock);
+ return OPAL_CLOSED;
+ }
+ sb = fs->out_buf;
+ length = (be16_to_cpu(sb->next_out) + SER_BUF_DATA_SIZE
+ - be16_to_cpu(sb->next_in) - 1)
+ % SER_BUF_DATA_SIZE;
+ unlock(&fsp_con_lock);
+
+ /* Console buffer has enough space to write incoming data */
+ if (length != fs->out_buf_prev_len) {
+ fs->out_buf_prev_len = length;
+ fs->out_buf_timeout = 0;
+
+ *__length = cpu_to_be64(length);
+ return OPAL_SUCCESS;
+ }
+
+ /*
+ * The buffer is full, so start an internal timer. We will continue
+ * returning SUCCESS until the timeout expires, hoping the FSP will
+ * consume the data within that period.
+ */
+ if (fs->out_buf_timeout == 0) {
+ fs->out_buf_timeout = mftb() +
+ secs_to_tb(SER_BUFFER_OUT_TIMEOUT);
+ }
+
+ if (tb_compare(mftb(), fs->out_buf_timeout) != TB_AAFTERB) {
+ *__length = cpu_to_be64(length);
+ return OPAL_SUCCESS;
+ }
+
+ /*
+ * The FSP is still active but not reading console data, hence
+ * our console buffer has filled up. Most likely the IPMI daemon
+ * on the FSP is buggy. Log an error and return OPAL_RESOURCE
+ * to the payload (Linux).
+ */
+ if (!elog_generated) {
+ elog_generated = true;
+ log_simple_error(&e_info(OPAL_RC_CONSOLE_HANG), "FSPCON: Console "
+ "buffer is full, dropping console data\n");
+ }
+
+ /* The timeout expired. Drop the incoming data */
+ return OPAL_RESOURCE;
+}
+
+static int64_t fsp_console_read(int64_t term_number, __be64 *__length,
+ uint8_t *buffer)
+{
+ struct fsp_serial *fs;
+ struct fsp_serbuf_hdr *sb;
+ bool pending = false;
+ uint32_t old_nin, n, i, chunk, req = be64_to_cpu(*__length);
+ int rc = OPAL_SUCCESS;
+
+ if (term_number < 0 || term_number >= MAX_SERIAL)
+ return OPAL_PARAMETER;
+ fs = &fsp_serials[term_number];
+ if (!fs->available || fs->log_port)
+ return OPAL_PARAMETER;
+ lock(&fsp_con_lock);
+ if (!fs->open) {
+ rc = OPAL_CLOSED;
+ goto clr_flag;
+ }
+ if (fs->waiting)
+ fs->waiting = 0;
+ sb = fs->in_buf;
+ old_nin = be16_to_cpu(sb->next_in);
+ lwsync();
+ n = (old_nin + SER_BUF_DATA_SIZE - be16_to_cpu(sb->next_out))
+ % SER_BUF_DATA_SIZE;
+ if (n > req) {
+ pending = true;
+ n = req;
+ }
+ *__length = cpu_to_be64(n);
+
+ chunk = SER_BUF_DATA_SIZE - be16_to_cpu(sb->next_out);
+ if (chunk > n)
+ chunk = n;
+ memcpy(buffer, &sb->data[be16_to_cpu(sb->next_out)], chunk);
+ if (chunk < n)
+ memcpy(buffer + chunk, &sb->data[0], n - chunk);
+ sb->next_out = cpu_to_be16(((be16_to_cpu(sb->next_out)) + n) % SER_BUF_DATA_SIZE);
+
+#ifdef OPAL_DEBUG_CONSOLE_IO
+ prlog(PR_TRACE, "OPAL: console read req=%d read=%d ni=%d no=%d\n",
+ req, n, be16_to_cpu(sb->next_in), be16_to_cpu(sb->next_out));
+ prlog(PR_TRACE, " %02x %02x %02x %02x %02x %02x %02x %02x ...\n",
+ buffer[0], buffer[1], buffer[2], buffer[3],
+ buffer[4], buffer[5], buffer[6], buffer[7]);
+#endif /* OPAL_DEBUG_CONSOLE_IO */
+
+clr_flag:
+ /* Might clear the input pending flag */
+ for (i = 0; i < MAX_SERIAL && !pending; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+ struct fsp_serbuf_hdr *sb = fs->in_buf;
+
+ if (fs->log_port || !fs->open)
+ continue;
+ if (sb->next_out != sb->next_in) {
+ /*
+ * HACK: Some kernels (4.1+) may fail to properly
+ * register hvc1 and will never read it. This can lead
+ * to RCU stalls, so if we notice this console is not
+ * being read, do not set OPAL_EVENT_CONSOLE_INPUT even
+ * if it has data
+ */
+ if (fs->waiting < 5) {
+ pending = true;
+ fs->waiting++;
+ }
+ }
+ }
+ if (!pending) {
+ opal_update_pending_evt(fs->irq, 0);
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0);
+ }
+
+ unlock(&fsp_con_lock);
+
+ return rc;
+}
+
+void fsp_console_poll(void *data __unused)
+{
+#ifdef OPAL_DEBUG_CONSOLE_POLL
+ static int debug;
+#endif
+
+ /*
+ * We don't get messages when the out buffer is consumed, so we
+ * need to poll. We also defer sending poke messages from
+ * the sapphire console to avoid a locking nightmare when
+ * being called from printf() deep inside an existing stack
+ * of nested locks.
+ */
+ if (fsp_con_full ||
+ (opal_pending_events & OPAL_EVENT_CONSOLE_OUTPUT)) {
+ unsigned int i;
+ bool pending = false;
+
+ /* We take the console lock. This is somewhat inefficient
+ * but it guarantees we aren't racing with a write, and
+ * thus clearing an event improperly
+ */
+ lock(&fsp_con_lock);
+ for (i = 0; i < MAX_SERIAL && !pending; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+ struct fsp_serbuf_hdr *sb = fs->out_buf;
+
+ if (!fs->open)
+ continue;
+ if (sb->next_out == sb->next_in) {
+ continue;
+ }
+ if (fs->log_port) {
+ flush_console();
+ } else {
+#ifdef OPAL_DEBUG_CONSOLE_POLL
+ if (debug < 5) {
+ prlog(PR_DEBUG,"OPAL: %d still pending"
+ " ni=%d no=%d\n",
+ i, be16_to_cpu(sb->next_in),
+ be16_to_cpu(sb->next_out));
+ debug++;
+ }
+#endif /* OPAL_DEBUG_CONSOLE_POLL */
+ pending = true;
+ }
+ }
+ if (!pending) {
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_OUTPUT, 0);
+#ifdef OPAL_DEBUG_CONSOLE_POLL
+ debug = 0;
+#endif
+ }
+ unlock(&fsp_con_lock);
+ }
+}
+
+void fsp_console_init(void)
+{
+ struct dt_node *serials, *ser;
+ int i;
+
+ if (!fsp_present())
+ return;
+
+ /* Wait until we have received the interface query before moving on */
+ while (!got_intf_query)
+ opal_run_pollers();
+
+ op_display(OP_LOG, OP_MOD_FSPCON, 0x0000);
+
+ /* Register poller */
+ opal_add_poller(fsp_console_poll, NULL);
+
+ /* Register OPAL console backend */
+ set_opal_console(&fsp_opal_con);
+
+ /* Parse serial port data */
+ serials = dt_find_by_path(dt_root, "ipl-params/fsp-serial");
+ if (!serials) {
+ prerror("FSPCON: No FSP serial ports in device-tree\n");
+ return;
+ }
+
+ i = 1;
+ dt_for_each_child(serials, ser) {
+ u32 rsrc_id = dt_prop_get_u32(ser, "reg");
+ const void *lc = dt_prop_get(ser, "ibm,loc-code");
+
+ prlog(PR_NOTICE, "FSPCON: Serial %d rsrc: %04x loc: %s\n",
+ i, rsrc_id, (const char *)lc);
+ fsp_serial_add(i++, rsrc_id, lc, false);
+ op_display(OP_LOG, OP_MOD_FSPCON, 0x0010 + i);
+ }
+
+ op_display(OP_LOG, OP_MOD_FSPCON, 0x0005);
+}
+
+static int64_t fsp_console_flush(int64_t terminal __unused)
+{
+ /* FIXME: There's probably something we can do here... */
+ return OPAL_PARAMETER;
+}
+
+struct opal_con_ops fsp_opal_con = {
+ .name = "FSP OPAL console",
+ .init = NULL, /* all the required setup is done in fsp_console_init() */
+ .read = fsp_console_read,
+ .write = fsp_console_write,
+ .space = fsp_console_write_buffer_space,
+ .flush = fsp_console_flush,
+};
+
+static void flush_all_input(void)
+{
+ unsigned int i;
+
+ lock(&fsp_con_lock);
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+ struct fsp_serbuf_hdr *sb = fs->in_buf;
+
+ if (fs->log_port)
+ continue;
+
+ sb->next_out = sb->next_in;
+ }
+ unlock(&fsp_con_lock);
+}
+
+static bool send_all_hvsi_close(void)
+{
+ unsigned int i;
+ bool has_hvsi = false;
+ static const uint8_t close_packet[] = { 0xfe, 6, 0, 1, 0, 3 };
+
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+ struct fsp_serbuf_hdr *sb = fs->out_buf;
+ unsigned int space, timeout = 10;
+
+ if (fs->log_port)
+ continue;
+ if (fs->rsrc_id == 0xffff)
+ continue;
+ has_hvsi = true;
+
+ /* Do we have room ? Wait a bit if not */
+ while(timeout--) {
+ space = (be16_to_cpu(sb->next_out) + SER_BUF_DATA_SIZE -
+ be16_to_cpu(sb->next_in) - 1) % SER_BUF_DATA_SIZE;
+ if (space >= 6)
+ break;
+ time_wait_ms(500);
+ }
+ lock(&fsp_con_lock);
+ fsp_write_vserial(fs, close_packet, 6);
+ unlock(&fsp_con_lock);
+ }
+
+ return has_hvsi;
+}
+
+static void reopen_all_hvsi(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+
+ if (!fs->available)
+ continue;
+
+ if (fs->rsrc_id == 0xffff)
+ continue;
+ prlog(PR_NOTICE, "FSP: Deassociating HVSI console %d\n", i);
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_UNASSOC_SERIAL, 1,
+ (i << 16) | 1), true);
+ }
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+
+ if (!fs->available)
+ continue;
+
+ if (fs->rsrc_id == 0xffff)
+ continue;
+ prlog(PR_NOTICE, "FSP: Reassociating HVSI console %d\n", i);
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_ASSOC_SERIAL, 2,
+ (fs->rsrc_id << 16) | 1, i), true);
+ }
+}
+
+void fsp_console_reset(void)
+{
+ if (!fsp_present())
+ return;
+
+ prlog(PR_NOTICE, "FSP: Console reset !\n");
+
+ /* This is called on a fast-reset. To work around issues with HVSI
+ * initial negotiation, before we reboot the kernel, we flush all
+ * input and send an HVSI close packet.
+ */
+ flush_all_input();
+
+ /* Returns false if there is no HVSI console */
+ if (!send_all_hvsi_close())
+ return;
+
+ time_wait_ms(500);
+
+ reopen_all_hvsi();
+
+}
+
+void fsp_console_add_nodes(void)
+{
+ struct dt_node *opal_event;
+ unsigned int i;
+
+ opal_event = dt_find_by_name(opal_node, "event");
+
+ for (i = 0; i < MAX_SERIAL; i++) {
+ struct fsp_serial *fs = &fsp_serials[i];
+ struct dt_node *fs_node;
+ const char *type;
+
+ if (fs->log_port || !fs->available)
+ continue;
+
+ if (fs->rsrc_id == 0xffff)
+ type = "raw";
+ else
+ type = "hvsi";
+
+ fs_node = add_opal_console_node(i, type, SER_BUF_DATA_SIZE);
+
+ fs->irq = opal_dynamic_event_alloc();
+ dt_add_property_cells(fs_node, "interrupts", ilog2(fs->irq));
+
+ if (opal_event)
+ dt_add_property_cells(fs_node, "interrupt-parent",
+ opal_event->phandle);
+ }
+}
+
+void fsp_console_select_stdout(void)
+{
+ bool use_serial = false;
+ int rc;
+ u8 param;
+
+ if (!fsp_present())
+ return;
+
+ rc = fsp_get_sys_param(SYS_PARAM_CONSOLE_SELECT,
+ &param, 1, NULL, NULL);
+ if (rc != 1) {
+ prerror("FSPCON: Failed to get console"
+ " sysparam rc %d\n", rc);
+ } else {
+ switch(param) {
+ case 0:
+ use_serial = false;
+ break;
+ case 1:
+ use_serial = true;
+ break;
+ default:
+ prerror("FSPCON: Unknown console"
+ " sysparam %d\n", param);
+ }
+ }
+
+ dt_check_del_prop(dt_chosen, "linux,stdout-path");
+
+ if (fsp_serials[1].open && use_serial) {
+ dt_add_property_string(dt_chosen, "linux,stdout-path",
+ "/ibm,opal/consoles/serial@1");
+ prlog(PR_NOTICE, "FSPCON: default console set to serial A\n");
+ } else {
+ dt_add_property_string(dt_chosen, "linux,stdout-path",
+ "/ibm,opal/consoles/serial@0");
+ prlog(PR_NOTICE, "FSPCON: default console set to SOL/DVS\n");
+ }
+}
+
diff --git a/roms/skiboot/hw/fsp/fsp-diag.c b/roms/skiboot/hw/fsp/fsp-diag.c
new file mode 100644
index 000000000..d9101f31b
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-diag.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Code for handling FSP_MCLASS_DIAG messages (cmd 0xee)
+ * Receiving a high level ack timeout is likely indicative of a firmware bug
+ *
+ * Copyright 2013-2014 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <lock.h>
+#include <processor.h>
+#include <timebase.h>
+#include <opal.h>
+#include <fsp-sysparam.h>
+
+static bool fsp_diag_msg(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+
+ if (cmd_sub_mod == FSP_RSP_DIAG_LINK_ERROR) {
+ printf("FIXME: Unhandled FSP_MCLASS_DIAG Link Error Report\n");
+ return false;
+ }
+
+ if (cmd_sub_mod != FSP_RSP_DIAG_ACK_TIMEOUT) {
+ printf("BUG: Unhandled subcommand: 0x%x (New FSP spec?)\n",
+ cmd_sub_mod);
+ return false;
+ }
+
+ printf("BUG: High Level ACK timeout (FSP_MCLASS_DIAG) for 0x%x\n",
+ fsp_msg_get_data_word(msg, 0) & 0xffff0000);
+
+ return true;
+}
+
+static struct fsp_client fsp_diag = {
+ .message = fsp_diag_msg,
+};
+
+/* This is called at boot time */
+void fsp_init_diag(void)
+{
+ /* Register for the diag event */
+ fsp_register_client(&fsp_diag, FSP_MCLASS_DIAG);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-dpo.c b/roms/skiboot/hw/fsp/fsp-dpo.c
new file mode 100644
index 000000000..91919f915
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-dpo.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * FSP DPO (Delayed Power Off) event support
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "FSP-DPO: " fmt
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <stdio.h>
+#include <timebase.h>
+#include <opal.h>
+#include <opal-msg.h>
+
+#define DPO_CMD_SGN_BYTE0 0xf4 /* Byte[0] signature */
+#define DPO_CMD_SGN_BYTE1 0x20 /* Byte[1] signature */
+#define DPO_TIMEOUT 2700 /* 45 minutes in seconds */
+
+bool fsp_dpo_pending;
+static unsigned long fsp_dpo_init_tb;
+
+/*
+ * OPAL DPO interface
+ *
+ * Returns OPAL_WRONG_STATE (with a zero timeout) if DPO is not active,
+ * otherwise OPAL_SUCCESS with the number of seconds remaining before a
+ * forced system shutdown. This enables the host to schedule a voluntary
+ * shutdown before the timeout occurs.
+ */
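+/*
+ * Worked example (illustrative): if the host queries the status ten
+ * minutes after the DPO init message arrived, *dpo_timeout is set to
+ * 2700 - 600 = 2100 seconds left before the forced power off.
+ */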
+static int64_t fsp_opal_get_dpo_status(__be64 *dpo_timeout)
+{
+ if (!fsp_dpo_pending) {
+ *dpo_timeout = 0;
+ return OPAL_WRONG_STATE;
+ }
+
+ *dpo_timeout = cpu_to_be64(DPO_TIMEOUT - tb_to_secs(mftb() - fsp_dpo_init_tb));
+ return OPAL_SUCCESS;
+}
+
+/* Process FSP DPO init message */
+static void fsp_process_dpo(struct fsp_msg *msg)
+{
+ struct fsp_msg *resp;
+ u32 cmd = FSP_RSP_INIT_DPO;
+ int rc;
+
+ /* DPO message does not have the correct signatures */
+ if ((msg->data.bytes[0] != DPO_CMD_SGN_BYTE0)
+ || (msg->data.bytes[1] != DPO_CMD_SGN_BYTE1)) {
+ prerror("Message signatures did not match\n");
+ cmd |= FSP_STATUS_INVALID_CMD;
+ resp = fsp_mkmsg(cmd, 0);
+ if (resp == NULL) {
+ prerror("%s : Message allocation failed\n", __func__);
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("%s : Failed to queue response "
+ "message\n", __func__);
+ }
+ return;
+ }
+
+ /* OPAL is already in "DPO pending" state */
+ if (fsp_dpo_pending) {
+ prlog(PR_INFO, "OPAL already in DPO pending state\n");
+ cmd |= FSP_STATUS_INVALID_DPOSTATE;
+ resp = fsp_mkmsg(cmd, 0);
+ if (resp == NULL) {
+ prerror("%s : Message allocation failed\n", __func__);
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("%s : Failed to queue response "
+ "message\n", __func__);
+ }
+ return;
+ }
+
+
+ /* Inform the host about DPO */
+ rc = opal_queue_msg(OPAL_MSG_DPO, NULL, NULL);
+ if (rc) {
+ prerror("OPAL message queuing failed\n");
+ cmd |= FSP_STATUS_GENERIC_ERROR;
+ resp = fsp_mkmsg(cmd, 0);
+ if (resp == NULL) {
+ prerror("%s : Message allocation failed\n", __func__);
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("%s : Failed to queue response "
+ "message\n", __func__);
+ }
+ return;
+ } else
+ prlog(PR_INFO, "Notified host about DPO event\n");
+
+ /* Acknowledge the FSP on DPO */
+ resp = fsp_mkmsg(cmd, 0);
+ if (resp == NULL) {
+ prerror("%s : Message allocation failed\n", __func__);
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("%s : Failed to queue response message\n", __func__);
+ return;
+ }
+
+ /* Record DPO init time and set DPO pending flag */
+ fsp_dpo_init_tb = mftb();
+ fsp_dpo_pending = true;
+
+ /*
+ * OPAL is now in the DPO pending state. After first detecting the DPO
+ * condition from OPAL, the host has 45 minutes to prepare
+ * the system for shutdown. The host must take all necessary actions
+ * in that regard and, at the end, shut itself down. The host
+ * shutdown sequence eventually makes the OPAL_CEC_POWER_DOWN call,
+ * which in turn asks the FSP to shut down the CEC. If the FSP does not
+ * receive the CEC power down command from OPAL within 45 minutes,
+ * it assumes that the host and OPAL have successfully processed the
+ * DPO sequence and hence forcibly powers off the system.
+ */
+}
+
+/* Handle DPO sub-command from FSP */
+static bool fsp_dpo_message(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ if (cmd_sub_mod == FSP_CMD_INIT_DPO) {
+ prlog(PR_INFO, "Delayed Power Off (DPO) notification received\n");
+ fsp_process_dpo(msg);
+ return true;
+ }
+
+ return false;
+}
+
+static struct fsp_client fsp_dpo_client = {
+ .message = fsp_dpo_message,
+};
+
+void fsp_dpo_init(void)
+{
+ fsp_register_client(&fsp_dpo_client, FSP_MCLASS_SERVICE);
+ opal_register(OPAL_GET_DPO_STATUS, fsp_opal_get_dpo_status, 1);
+ prlog(PR_INFO, "FSP DPO support initialized\n");
+}
diff --git a/roms/skiboot/hw/fsp/fsp-dump.c b/roms/skiboot/hw/fsp/fsp-dump.c
new file mode 100644
index 000000000..96cb45e6f
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-dump.c
@@ -0,0 +1,916 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Dump support:
+ * We get dump notifications from different sources:
+ * - During system initialization via HDAT
+ * - During FSP reset/reload (FipS dump)
+ * - Dump available notification MBOX command (0xCE, 0x78, 0x00)
+ *
+ * To avoid complications, we keep the dumps in a list and fetch
+ * them serially.
+ *
+ * Dump retrieval process:
+ * - Once we get a notification from the FSP we enqueue the dump ID and
+ * notify Linux via an OPAL event notification.
+ * - Linux reads the dump info, allocates the memory required to fetch
+ * the dump and makes a dump read call.
+ * - Sapphire fetches dump data from FSP.
+ * - Linux writes dump to disk and sends acknowledgement.
+ * - Sapphire acknowledges FSP.
+ *
+ * Copyright 2013-2015 IBM Corp.
+ */
+
+#include <fsp.h>
+#include <psi.h>
+#include <lock.h>
+#include <device.h>
+#include <skiboot.h>
+#include <errorlog.h>
+#include <opal-api.h>
+
+/*
+ * Max outstanding dumps to retrieve
+ *
+ * Note:
+ * Dumps are serialized. We don't get a notification for a second
+ * dump of a given type until we acknowledge the first one, but we
+ * may get a notification for a different dump type. Our dump
+ * retrieval code is also serialized, hence we use a list to keep
+ * track of outstanding dumps to be retrieved.
+ */
+#define MAX_DUMP_RECORD 0x04
+
+/* Max retry */
+#define FIPS_DUMP_MAX_RETRY 0x03
+
+/* Dump type */
+#define DUMP_TYPE_FSP 0x01
+#define DUMP_TYPE_SYS 0x02
+#define DUMP_TYPE_SMA 0x03
+
+/* Dump fetch size */
+#define DUMP_FETCH_SIZE_FSP 0x500000
+#define DUMP_FETCH_SIZE_SYS 0x400000
+#define DUMP_FETCH_SIZE_RES 0x200000
+
+/* Params for Fips dump */
+#define FSP_DUMP_TOOL_TYPE "SYS "
+#define FSP_DUMP_CLIENT_ID "SAPPHIRE_CLIENT"
+
+enum dump_state {
+ DUMP_STATE_ABSENT, /* No FSP dump */
+ DUMP_STATE_NONE, /* No dump to retrieve */
+ DUMP_STATE_NOTIFY, /* Notified Linux */
+ DUMP_STATE_FETCHING, /* Dump retrieval is in progress */
+ DUMP_STATE_FETCH, /* Dump retrieve complete */
+ DUMP_STATE_PARTIAL, /* Partial read */
+ DUMP_STATE_ABORTING, /* Aborting due to kexec */
+};
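+
+/*
+ * Typical flow through the states above (a sketch of the process
+ * described in the file header):
+ *
+ * NONE -> NOTIFY (Linux told a dump is available)
+ * -> FETCHING (OPAL_DUMP_READ in progress)
+ * -> FETCH or PARTIAL (Linux copies the data and acks)
+ * -> back to NONE
+ *
+ * ABORTING is entered if the host kexecs or the FSP resets while a
+ * fetch is in flight; ABSENT means there is no FSP dump support.
+ */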
+
+/* Pending dump list */
+struct dump_record {
+ uint8_t type;
+ uint32_t id;
+ uint32_t size;
+ struct list_node link;
+};
+
+/* List definitions */
+static LIST_HEAD(dump_pending);
+static LIST_HEAD(dump_free);
+
+/* Dump retrieve state */
+static enum dump_state dump_state = DUMP_STATE_NONE;
+
+/* Dump buffer SG list */
+static struct opal_sg_list *dump_data;
+static struct dump_record *dump_entry;
+static int64_t dump_offset;
+static size_t fetch_remain;
+
+/* FipS dump retry count */
+static int retry_cnt;
+
+/* Protect list and dump retrieve state */
+static struct lock dump_lock = LOCK_UNLOCKED;
+
+/* Forward declaration */
+static int64_t fsp_opal_dump_init(uint8_t dump_type);
+static int64_t fsp_dump_read(void);
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_LIST, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_INFO,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_ACK, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO,
+ OPAL_NA);
+
+/*
+ * Helper functions
+ */
+static inline void update_dump_state(enum dump_state state)
+{
+ dump_state = state;
+}
+
+static int64_t check_dump_state(void)
+{
+ switch (dump_state) {
+ case DUMP_STATE_ABSENT:
+ return OPAL_HARDWARE;
+ case DUMP_STATE_NONE:
+ case DUMP_STATE_NOTIFY:
+ /* During dump fetch, notify is wrong state */
+ return OPAL_WRONG_STATE;
+ case DUMP_STATE_FETCHING:
+ case DUMP_STATE_ABORTING:
+ return OPAL_BUSY_EVENT;
+ case DUMP_STATE_FETCH:
+ return OPAL_SUCCESS;
+ case DUMP_STATE_PARTIAL:
+ return OPAL_PARTIAL;
+ }
+ return OPAL_SUCCESS;
+}
+
+static inline void dump_tce_map(uint32_t tce_offset,
+ void *buffer, uint32_t size)
+{
+ uint32_t tlen = ALIGN_UP(size, TCE_PSIZE);
+ fsp_tce_map(PSI_DMA_DUMP_DATA + tce_offset, buffer, tlen);
+}
+
+static inline void dump_tce_unmap(uint32_t size)
+{
+ fsp_tce_unmap(PSI_DMA_DUMP_DATA, size);
+}
+
+/*
+ * Returns Data set ID for the given dump type
+ */
+static inline uint16_t get_dump_data_set_id(uint8_t type)
+{
+ switch (type) {
+ case DUMP_TYPE_FSP:
+ return FSP_DATASET_SP_DUMP;
+ case DUMP_TYPE_SYS:
+ return FSP_DATASET_HW_DUMP;
+ default:
+ break;
+ }
+ return OPAL_INTERNAL_ERROR;
+}
+
+/*
+ * Returns max data we can fetch from FSP fetch data call
+ */
+static inline int64_t get_dump_fetch_max_size(uint8_t type)
+{
+ switch (type) {
+ case DUMP_TYPE_FSP:
+ return DUMP_FETCH_SIZE_FSP;
+ case DUMP_TYPE_SYS:
+ return DUMP_FETCH_SIZE_SYS;
+ default:
+ break;
+ }
+ return OPAL_INTERNAL_ERROR;
+}
+
+/*
+ * Get dump record from pending list
+ */
+static inline struct dump_record *get_dump_rec_from_list(uint32_t id)
+{
+ struct dump_record *record;
+
+ list_for_each(&dump_pending, record, link) {
+ if (record->id == id)
+ return record;
+ }
+ return NULL;
+}
+
+/*
+ * New dump available notification to Linux
+ */
+static void update_opal_dump_notify(void)
+{
+ /*
+ * Wait until the current dump retrieval completes
+ * before notifying again.
+ */
+ if (dump_state != DUMP_STATE_NONE)
+ return;
+
+ /* More dumps to retrieve */
+ if (!list_empty(&dump_pending)) {
+ update_dump_state(DUMP_STATE_NOTIFY);
+ opal_update_pending_evt(OPAL_EVENT_DUMP_AVAIL,
+ OPAL_EVENT_DUMP_AVAIL);
+ }
+}
+
+static int64_t remove_dump_id_from_list(uint32_t dump_id)
+{
+ struct dump_record *record, *nxt_record;
+ int rc = OPAL_SUCCESS;
+ bool found = false;
+
+ /* Remove record from pending list */
+ list_for_each_safe(&dump_pending, record, nxt_record, link) {
+ if (record->id != dump_id)
+ continue;
+
+ found = true;
+ list_del(&record->link);
+ list_add(&dump_free, &record->link);
+ break;
+ }
+
+ /*
+ * Continue with update_opal_dump_notify() even if we fail
+ * to remove the ID, so that we can resend the notification
+ * for the same dump ID to Linux.
+ */
+ if (!found) { /* List corrupted? */
+ log_simple_error(&e_info(OPAL_RC_DUMP_LIST),
+ "DUMP: ID 0x%x not found in list!\n",
+ dump_id);
+ rc = OPAL_PARAMETER;
+ }
+
+ /* Update state */
+ update_dump_state(DUMP_STATE_NONE);
+ /* Notify next available dump to retrieve */
+ update_opal_dump_notify();
+
+ return rc;
+}
+
+static int64_t add_dump_id_to_list(uint8_t dump_type,
+ uint32_t dump_id, uint32_t dump_size)
+{
+ struct dump_record *record;
+ int rc = OPAL_SUCCESS;
+
+ lock(&dump_lock);
+
+ rc = check_dump_state();
+ if (rc == OPAL_HARDWARE)
+ goto out;
+
+ /* List is full ? */
+ if (list_empty(&dump_free)) {
+ printf("DUMP: Dump ID 0x%x is not queued.\n", dump_id);
+ rc = OPAL_RESOURCE;
+ goto out;
+ }
+
+ /* Already queued? */
+ record = get_dump_rec_from_list(dump_id);
+ if (record) {
+ rc = OPAL_SUCCESS;
+ goto out;
+ }
+
+ /* Add to list */
+ record = list_pop(&dump_free, struct dump_record, link);
+ record->type = dump_type;
+ record->id = dump_id;
+ record->size = dump_size;
+ list_add_tail(&dump_pending, &record->link);
+
+ /* OPAL notification */
+ update_opal_dump_notify();
+ rc = OPAL_SUCCESS;
+
+out:
+ unlock(&dump_lock);
+ return rc;
+}
+
+static void dump_init_complete(struct fsp_msg *msg)
+{
+ uint8_t status = (msg->resp->word1 >> 8) & 0xff;
+
+ printf("DUMP: FipS dump init status = 0x%x\n", status);
+ fsp_freemsg(msg);
+
+ switch (status) {
+ case FSP_STATUS_SUCCESS:
+ printf("DUMP: Initiated FipS dump.\n");
+ break;
+ case FSP_STATUS_BUSY: /* Retry, if FSP is busy */
+ if (retry_cnt++ < FIPS_DUMP_MAX_RETRY)
+ if (fsp_opal_dump_init(DUMP_TYPE_FSP) == OPAL_SUCCESS)
+ return;
+ break;
+ default:
+ break;
+ }
+ /* Reset max retry count */
+ retry_cnt = 0;
+}
+
+/*
+ * Initiate new FipS dump
+ */
+static int64_t fsp_opal_dump_init(uint8_t dump_type)
+{
+ struct fsp_msg *msg;
+ int rc = OPAL_SUCCESS;
+ uint32_t *tool_type = (void *)FSP_DUMP_TOOL_TYPE;
+ uint32_t *client_id = (void *)FSP_DUMP_CLIENT_ID;
+
+ /* Only FipS dump generate request is supported */
+ if (dump_type != DUMP_TYPE_FSP)
+ return OPAL_PARAMETER;
+
+ msg = fsp_mkmsg(FSP_CMD_FSP_DUMP_INIT, 6, *tool_type,
+ sizeof(FSP_DUMP_CLIENT_ID), *client_id,
+ *(client_id + 1), *(client_id + 2), *(client_id + 3));
+
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_INIT),
+ "DUMP: Message allocation failed.\n");
+ rc = OPAL_INTERNAL_ERROR;
+ } else if (fsp_queue_msg(msg, dump_init_complete)) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_INIT),
+ "DUMP: Failed to queue FipS dump init request.\n");
+ fsp_freemsg(msg);
+ rc = OPAL_INTERNAL_ERROR;
+ }
+
+ return rc;
+}
+
+/*
+ * OPAL interface to send dump information to Linux.
+ */
+static int64_t fsp_opal_dump_info2(__be32 *dump_id, __be32 *dump_size,
+ __be32 *dump_type)
+{
+ struct dump_record *record;
+ int rc = OPAL_SUCCESS;
+
+ lock(&dump_lock);
+
+ /* Clear notification */
+ opal_update_pending_evt(OPAL_EVENT_DUMP_AVAIL, 0);
+
+ record = list_top(&dump_pending, struct dump_record, link);
+ if (!record) { /* List corrupted? */
+ update_dump_state(DUMP_STATE_NONE);
+ rc = OPAL_INTERNAL_ERROR;
+ goto out;
+ }
+ *dump_id = cpu_to_be32(record->id);
+ *dump_size = cpu_to_be32(record->size);
+ *dump_type = cpu_to_be32(record->type);
+
+out:
+ unlock(&dump_lock);
+ return rc;
+}
+
+static int64_t fsp_opal_dump_info(__be32 *dump_id, __be32 *dump_size)
+{
+ __be32 dump_type;
+ return fsp_opal_dump_info2(dump_id, dump_size, &dump_type);
+}
+
+static int64_t validate_dump_sglist(struct opal_sg_list *list,
+ int64_t *size)
+{
+ struct opal_sg_list *sg;
+ struct opal_sg_entry *prev_entry, *entry;
+ int length, num_entries, i;
+
+ prev_entry = NULL;
+ *size = 0;
+ for (sg = list; sg; sg = (struct opal_sg_list*)be64_to_cpu(sg->next)) {
+ length = be64_to_cpu(sg->length) - 16;
+ num_entries = length / sizeof(struct opal_sg_entry);
+ if (num_entries <= 0)
+ return OPAL_PARAMETER;
+
+ for (i = 0; i < num_entries; i++) {
+ entry = &sg->entry[i];
+ *size += be64_to_cpu(entry->length);
+
+ /* All entries must be aligned */
+ if (((uint64_t)be64_to_cpu(entry->data)) & 0xfff)
+ return OPAL_PARAMETER;
+
+ /* All non-terminal entries size must be aligned */
+ if (prev_entry && (be64_to_cpu(prev_entry->length) & 0xfff))
+ return OPAL_PARAMETER;
+
+ prev_entry = entry;
+ }
+ }
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Map dump buffer to TCE buffer
+ */
+static int64_t map_dump_buffer(void)
+{
+ struct opal_sg_list *sg;
+ struct opal_sg_entry *entry;
+ int64_t fetch_max;
+ int length, num_entries, i;
+ int buf_off, fetch_off, tce_off, sg_off;
+ bool last = false;
+
+ /* FSP fetch max size */
+ fetch_max = get_dump_fetch_max_size(dump_entry->type);
+ if (fetch_max > (dump_entry->size - dump_offset))
+ fetch_remain = dump_entry->size - dump_offset;
+ else
+ fetch_remain = fetch_max;
+
+ /* offsets */
+ fetch_off = fetch_remain;
+ tce_off = sg_off = 0;
+
+ for (sg = dump_data; sg; sg = (struct opal_sg_list*)be64_to_cpu(sg->next)) {
+ num_entries = (be64_to_cpu(sg->length) - 16) /
+ sizeof(struct opal_sg_entry);
+ if (num_entries <= 0)
+ return OPAL_PARAMETER;
+
+ for (i = 0; i < num_entries; i++) {
+ entry = &sg->entry[i];
+
+ /* Continue until we get offset */
+ if ((sg_off + be64_to_cpu(entry->length)) < dump_offset) {
+ sg_off += be64_to_cpu(entry->length);
+ continue;
+ }
+
+ /*
+ * An SG list entry can be larger than 4k.
+ * Map only the required pages instead of
+ * mapping the entire entry.
+ */
+ if (!tce_off) {
+ buf_off = (dump_offset - sg_off) & ~0xfff;
+ length = be64_to_cpu(entry->length) - buf_off;
+ } else {
+ buf_off = 0;
+ length = be64_to_cpu(entry->length);
+ }
+
+ /* Adjust length for last mapping */
+ if (fetch_off <= length) {
+ length = fetch_off;
+ last = true;
+ }
+
+ /* Adjust offset */
+ sg_off += be64_to_cpu(entry->length);
+ fetch_off -= length;
+
+ /* TCE mapping */
+ dump_tce_map(tce_off, (void*)(be64_to_cpu(entry->data) + buf_off), length);
+ tce_off += length;
+
+ /* TCE mapping complete */
+ if (last)
+ return OPAL_SUCCESS;
+ }
+ } /* outer loop */
+ return OPAL_PARAMETER;
+}
+
+static void dump_read_complete(struct fsp_msg *msg)
+{
+ void *buffer;
+ size_t length, offset;
+ int rc;
+ uint32_t dump_id;
+ uint16_t id;
+ uint8_t flags, status;
+ bool compl = false;
+
+ status = (msg->resp->word1 >> 8) & 0xff;
+ flags = (fsp_msg_get_data_word(msg, 0) >> 16) & 0xff;
+ id = fsp_msg_get_data_word(msg, 0) & 0xffff;
+ dump_id = fsp_msg_get_data_word(msg, 1);
+ offset = fsp_msg_get_data_word(msg->resp, 1);
+ length = fsp_msg_get_data_word(msg->resp, 2);
+
+ fsp_freemsg(msg);
+
+ lock(&dump_lock);
+
+ if (dump_state == DUMP_STATE_ABORTING) {
+ printf("DUMP: Fetch dump aborted, ID = 0x%x\n", dump_id);
+ dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE);
+ update_dump_state(DUMP_STATE_NONE);
+ goto bail;
+ }
+
+ switch (status) {
+ case FSP_STATUS_SUCCESS: /* Fetch next dump block */
+ if (dump_offset < dump_entry->size) {
+ dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE);
+ rc = fsp_dump_read();
+ if (rc == OPAL_SUCCESS)
+ goto bail;
+ } else { /* Dump read complete */
+ compl = true;
+ }
+ break;
+ case FSP_STATUS_MORE_DATA: /* More data to read */
+ offset += length;
+ buffer = (void *)PSI_DMA_DUMP_DATA + offset;
+ fetch_remain -= length;
+
+ rc = fsp_fetch_data_queue(flags, id, dump_id, offset, buffer,
+ &fetch_remain, dump_read_complete);
+ if (rc == OPAL_SUCCESS)
+ goto bail;
+ break;
+ default:
+ break;
+ }
+
+ dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE);
+
+ /* Update state */
+ if (compl) {
+ printf("DUMP: Fetch dump success. ID = 0x%x\n", dump_id);
+ update_dump_state(DUMP_STATE_FETCH);
+ } else {
+ printf("DUMP: Fetch dump partial. ID = 0x%x\n", dump_id);
+ update_dump_state(DUMP_STATE_PARTIAL);
+ }
+ bail:
+ unlock(&dump_lock);
+}
+
+/*
+ * Fetch dump data from FSP
+ */
+static int64_t fsp_dump_read(void)
+{
+ int64_t rc;
+ uint16_t data_set;
+ uint8_t flags = 0x00;
+
+ /* Get data set ID */
+ data_set = get_dump_data_set_id(dump_entry->type);
+
+ /* Map TCE buffer */
+ rc = map_dump_buffer();
+ if (rc != OPAL_SUCCESS) {
+ printf("DUMP: TCE mapping failed\n");
+ return rc;
+ }
+
+ printf("DUMP: Fetch Dump. ID = %02x, sub ID = %08x, len = %ld\n",
+ data_set, dump_entry->id, fetch_remain);
+
+ /* Fetch data */
+ rc = fsp_fetch_data_queue(flags, data_set, dump_entry->id,
+ dump_offset, (void *)PSI_DMA_DUMP_DATA,
+ &fetch_remain, dump_read_complete);
+
+ /* Adjust dump fetch offset */
+ dump_offset += fetch_remain;
+
+ return rc;
+}
+
+static int64_t fsp_opal_dump_read(uint32_t dump_id,
+ struct opal_sg_list *list)
+{
+ struct dump_record *record;
+ int64_t rc, size;
+
+ lock(&dump_lock);
+
+ /* Check state */
+ if (dump_state != DUMP_STATE_NOTIFY) {
+ rc = check_dump_state();
+ goto out;
+ }
+
+ /* Validate dump ID */
+ record = get_dump_rec_from_list(dump_id);
+ if (!record) { /* List corrupted? */
+ rc = OPAL_INTERNAL_ERROR;
+ goto out;
+ }
+
+ /* Validate dump buffer and size */
+ rc = validate_dump_sglist(list, &size);
+ if (rc != OPAL_SUCCESS) {
+ printf("DUMP: SG list validation failed\n");
+ goto out;
+ }
+
+ if (size < record->size) { /* Insufficient buffer */
+ printf("DUMP: Insufficient buffer\n");
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ /* Update state */
+ update_dump_state(DUMP_STATE_FETCHING);
+
+ /* Fetch dump data */
+ dump_entry = record;
+ dump_data = list;
+ dump_offset = 0;
+ rc = fsp_dump_read();
+ if (rc != OPAL_SUCCESS)
+ goto out;
+
+ /* Check status after initiating fetch data */
+ rc = check_dump_state();
+
+out:
+ unlock(&dump_lock);
+ return rc;
+}
+
+static void dump_ack_complete(struct fsp_msg *msg)
+{
+ uint8_t status = (msg->resp->word1 >> 8) & 0xff;
+
+ if (status)
+ log_simple_error(&e_info(OPAL_RC_DUMP_ACK),
+ "DUMP: ACK failed for ID: 0x%x\n",
+ fsp_msg_get_data_word(msg, 0));
+ else
+ printf("DUMP: ACKed dump ID: 0x%x\n", fsp_msg_get_data_word(msg, 0));
+
+ fsp_freemsg(msg);
+}
+
+/*
+ * Acknowledge dump
+ */
+static int64_t fsp_opal_dump_ack(uint32_t dump_id)
+{
+ struct dump_record *record;
+ struct fsp_msg *msg;
+ int rc;
+ uint32_t cmd;
+ uint8_t dump_type = 0;
+
+ /* Get dump type */
+ lock(&dump_lock);
+ record = get_dump_rec_from_list(dump_id);
+ if (record)
+ dump_type = record->type;
+
+ /*
+ * The next available dump in the pending list will be of a different
+ * type, hence we don't need to wait for the ack to complete.
+ *
+ * Note:
+ * This allows us to proceed even if we fail to ACK.
+ * In the worst case we may get notification for the
+ * same dump again, which is probably better than
+ * looping forever.
+ */
+ rc = remove_dump_id_from_list(dump_id);
+ if (rc != OPAL_SUCCESS) /* Invalid dump id */
+ goto out;
+
+ /* Adjust mod value */
+ cmd = FSP_CMD_ACK_DUMP | (dump_type & 0xff);
+ msg = fsp_mkmsg(cmd, 1, dump_id);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_ACK),
+ "DUMP: Message allocation failed.!\n");
+ rc = OPAL_INTERNAL_ERROR;
+ } else if (fsp_queue_msg(msg, dump_ack_complete)) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_ACK),
+ "DUMP: Failed to queue dump ack message.\n");
+ fsp_freemsg(msg);
+ rc = OPAL_INTERNAL_ERROR;
+ }
+out:
+ unlock(&dump_lock);
+ return rc;
+}
+
+/* Resend dump available notification */
+static int64_t fsp_opal_dump_resend_notification(void)
+{
+ lock(&dump_lock);
+
+ if (dump_state != DUMP_STATE_ABSENT)
+ update_dump_state(DUMP_STATE_NONE);
+
+ update_opal_dump_notify();
+
+ unlock(&dump_lock);
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Handle FSP R/R event.
+ */
+static bool fsp_dump_retrieve_rr(uint32_t cmd_sub_mod,
+ struct fsp_msg *msg __unused)
+{
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ lock(&dump_lock);
+ /* Reset dump state */
+ if (dump_state == DUMP_STATE_FETCHING)
+ update_dump_state(DUMP_STATE_ABORTING);
+ unlock(&dump_lock);
+ return true;
+ case FSP_RELOAD_COMPLETE:
+ lock(&dump_lock);
+
+ /* Reset TCE mapping */
+ dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE);
+
+ /* Reset dump state */
+ update_dump_state(DUMP_STATE_NONE);
+
+ /*
+ * For now, keep the R/R handler simple. In the worst case
+ * we may end up resending the dump available notification for
+ * the same dump ID twice to Linux.
+ */
+ update_opal_dump_notify();
+ unlock(&dump_lock);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Handle host kexec'ing scenarios
+ */
+static bool opal_kexec_dump_notify(void *data __unused)
+{
+ bool ready = true;
+
+ lock(&dump_lock);
+
+ /* Dump retrieve is in progress? */
+ if (dump_state == DUMP_STATE_FETCHING)
+ dump_state = DUMP_STATE_ABORTING;
+
+ /* Not yet safe to kexec */
+ if (dump_state == DUMP_STATE_ABORTING)
+ ready = false;
+
+ unlock(&dump_lock);
+
+ return ready;
+}
+
+/*
+ * FipS dump notification
+ */
+void fsp_fips_dump_notify(uint32_t dump_id, uint32_t dump_size)
+{
+ printf("DUMP: FipS dump available. ID = 0x%x [size: %d bytes]\n",
+ dump_id, dump_size);
+ add_dump_id_to_list(DUMP_TYPE_FSP, dump_id, dump_size);
+}
+
+/*
+ * System/Platform dump notification
+ */
+static bool fsp_sys_dump_notify(uint32_t cmd_sub_mod, struct fsp_msg *msg)
+{
+ /*
+ * Though the spec says mod 00 is deprecated, we still
+ * seem to get mod 00 notifications (at least on
+ * P7 machines).
+ */
+ if (cmd_sub_mod != FSP_RSP_SYS_DUMP &&
+ cmd_sub_mod != FSP_RSP_SYS_DUMP_OLD)
+ return false;
+
+ printf("DUMP: Platform dump available. ID = 0x%x [size: %d bytes]\n",
+ fsp_msg_get_data_word(msg, 0), fsp_msg_get_data_word(msg, 1));
+
+ add_dump_id_to_list(DUMP_TYPE_SYS,
+ fsp_msg_get_data_word(msg, 0),
+ fsp_msg_get_data_word(msg, 1));
+ return true;
+}
+
+/*
+ * If a platform dump is available at IPL time, we get the
+ * notification via HDAT. Check the device tree for the
+ * dump's presence.
+ */
+static void check_ipl_sys_dump(void)
+{
+ struct dt_node *dump_node, *opal_node;
+ uint32_t dump_id, dump_size;
+
+ if (proc_gen >= proc_gen_p9) {
+ opal_node = dt_find_by_path(dt_root, "ibm,opal");
+ if (!opal_node)
+ return;
+ dump_node = dt_find_by_path(opal_node, "dump");
+ if (dump_node) {
+ if (dt_find_property(dump_node, "mpipl-boot"))
+ return;
+ }
+ }
+
+ dump_node = dt_find_by_path(dt_root, "ipl-params/platform-dump");
+ if (!dump_node)
+ return;
+
+ if (!dt_find_property(dump_node, "dump-id"))
+ return;
+
+ dump_id = dt_prop_get_u32(dump_node, "dump-id");
+ dump_size = (uint32_t)dt_prop_get_u64(dump_node, "total-size");
+
+ printf("DUMP: Platform dump present during IPL.\n");
+ printf(" ID = 0x%x [size: %d bytes]\n", dump_id, dump_size);
+
+ add_dump_id_to_list(DUMP_TYPE_SYS, dump_id, dump_size);
+}
+
+/*
+ * Allocate and initialize dump list
+ */
+static int init_dump_free_list(void)
+{
+ struct dump_record *entry;
+ int i;
+
+ entry = zalloc(sizeof(struct dump_record) * MAX_DUMP_RECORD);
+ if (!entry) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_INIT),
+ "DUMP: Out of memory\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < MAX_DUMP_RECORD; i++) {
+ list_add_tail(&dump_free, &entry->link);
+ entry++;
+ }
+ return 0;
+}
+
+static struct fsp_client fsp_sys_dump_client = {
+ .message = fsp_sys_dump_notify,
+};
+
+static struct fsp_client fsp_dump_client_rr = {
+ .message = fsp_dump_retrieve_rr,
+};
+
+void fsp_dump_init(void)
+{
+ if (!fsp_present()) {
+ update_dump_state(DUMP_STATE_ABSENT);
+ return;
+ }
+
+ /* Initialize list */
+ if (init_dump_free_list() != 0) {
+ update_dump_state(DUMP_STATE_ABSENT);
+ return;
+ }
+
+ /* Register for Class CE */
+ fsp_register_client(&fsp_sys_dump_client, FSP_MCLASS_SERVICE);
+ /* Register for Class AA (FSP R/R) */
+ fsp_register_client(&fsp_dump_client_rr, FSP_MCLASS_RR_EVENT);
+
+ /* Register for sync on host reboot call */
+ opal_add_host_sync_notifier(opal_kexec_dump_notify, NULL);
+
+ /* OPAL interface */
+ opal_register(OPAL_DUMP_INIT, fsp_opal_dump_init, 1);
+ opal_register(OPAL_DUMP_INFO, fsp_opal_dump_info, 2);
+ opal_register(OPAL_DUMP_INFO2, fsp_opal_dump_info2, 3);
+ opal_register(OPAL_DUMP_READ, fsp_opal_dump_read, 2);
+ opal_register(OPAL_DUMP_ACK, fsp_opal_dump_ack, 1);
+ opal_register(OPAL_DUMP_RESEND, fsp_opal_dump_resend_notification, 0);
+
+ /* Check for platform dump presence during IPL time */
+ check_ipl_sys_dump();
+}
diff --git a/roms/skiboot/hw/fsp/fsp-elog-read.c b/roms/skiboot/hw/fsp/fsp-elog-read.c
new file mode 100644
index 000000000..bd23ffbe8
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-elog-read.c
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * This code enables retrieving error logs from the FSP into Sapphire in
+ * sequence.
+ * The FSP sends the next log only when Sapphire sends a new log notification
+ * response to the FSP. On completion of reading the log from the FSP,
+ * OPAL_EVENT_ERROR_LOG_AVAIL is signaled. This remains raised until a call
+ * to opal_elog_read() is made and OPAL_SUCCESS is returned, at which point
+ * the operation is complete and the event is cleared. This is the READ
+ * action from the FSP.
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+/*
+ * Design of READ error log:
+ * When we receive a new error log entry notification from the FSP, we queue it
+ * into the "pending" list. If the "pending" list is not empty, we start
+ * fetching the log from the FSP.
+ *
+ * When Linux reads a log entry, we dequeue it from the "pending" list and
+ * enqueue it to another "processed" list. At this point, if the "pending"
+ * list is not empty, we continue to fetch the next log.
+ *
+ * When Linux calls opal_resend_pending_logs(), we fetch the log corresponding
+ * to the head of the pending list and move it to the processed list, and
+ * continue this process until the pending list is empty. If the pending list
+ * was empty earlier and is currently non-empty, we initiate an error log fetch.
+ *
+ * When Linux acks an error log, we remove it from processed list.
+ */
+
+#include <errno.h>
+#include <fsp.h>
+#include <fsp-elog.h>
+#include <lock.h>
+#include <opal-api.h>
+#include <psi.h>
+#include <skiboot.h>
+
+/*
+ * Maximum number of entries that are pre-allocated
+ * to keep track of pending elogs to be fetched.
+ */
+#define ELOG_READ_MAX_RECORD 128
+
+/* Structure to maintain log-id, log-size, pending and processed list. */
+struct fsp_log_entry {
+ uint32_t log_id;
+ size_t log_size;
+ struct list_node link;
+};
+
+static LIST_HEAD(elog_read_pending);
+static LIST_HEAD(elog_read_processed);
+static LIST_HEAD(elog_read_free);
+/*
+ * This lock protects concurrent updates of the processed and pending lists
+ * and is also held while updating the state of each log.
+ */
+static struct lock elog_read_lock = LOCK_UNLOCKED;
+
+#define ELOG_READ_BUFFER_SIZE 0x00004000
+/* Log buffer to copy FSP log for read */
+static void *elog_read_buffer;
+static uint32_t elog_head_id; /* FSP entry ID */
+static size_t elog_head_size; /* Actual FSP log size */
+static uint32_t elog_read_retries; /* Bad response status count */
+
+/* Initialize the state of the log */
+static enum elog_head_state elog_read_from_fsp_head_state = ELOG_STATE_NONE;
+
+static bool elog_enabled = false;
+
+/* Need forward declaration because of circular dependency. */
+static void fsp_elog_queue_fetch(void);
+
+/*
+ * Check the response message for the mbox acknowledgement
+ * command sent to the FSP.
+ */
+static void fsp_elog_ack_complete(struct fsp_msg *msg)
+{
+ uint8_t val;
+
+ val = (msg->resp->word1 >> 8) & 0xff;
+ if (val != 0)
+ prerror("ELOG: Acknowledgement error\n");
+
+ fsp_freemsg(msg);
+}
+
+/* Send error log PHYP acknowledgement to FSP with entry ID. */
+static int64_t fsp_send_elog_ack(uint32_t log_id)
+{
+ struct fsp_msg *ack_msg;
+
+ ack_msg = fsp_mkmsg(FSP_CMD_ERRLOG_PHYP_ACK, 1, log_id);
+ if (!ack_msg) {
+ prerror("ELOG: Failed to allocate ack message\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ if (fsp_queue_msg(ack_msg, fsp_elog_ack_complete)) {
+ fsp_freemsg(ack_msg);
+ ack_msg = NULL;
+ prerror("ELOG: Error queueing elog ack complete\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+/* Retrieve error log from FSP with TCE for the data transfer. */
+static void fsp_elog_check_and_fetch_head(void)
+{
+ lock(&elog_read_lock);
+ if (elog_read_from_fsp_head_state != ELOG_STATE_NONE ||
+ list_empty(&elog_read_pending)) {
+ unlock(&elog_read_lock);
+ return;
+ }
+
+ elog_read_retries = 0;
+ /* Start fetching first entry from the pending list */
+ fsp_elog_queue_fetch();
+ unlock(&elog_read_lock);
+}
+
+void elog_set_head_state(bool opal_logs, enum elog_head_state state)
+{
+ static enum elog_head_state opal_logs_state = ELOG_STATE_NONE;
+ static enum elog_head_state fsp_logs_state = ELOG_STATE_NONE;
+
+ /* ELOG disabled */
+ if (!elog_enabled)
+ return;
+
+ if (opal_logs)
+ opal_logs_state = state;
+ else
+ fsp_logs_state = state;
+
+ if (fsp_logs_state == ELOG_STATE_FETCHED_DATA ||
+ opal_logs_state == ELOG_STATE_FETCHED_DATA)
+ opal_update_pending_evt(OPAL_EVENT_ERROR_LOG_AVAIL,
+ OPAL_EVENT_ERROR_LOG_AVAIL);
+ else
+ opal_update_pending_evt(OPAL_EVENT_ERROR_LOG_AVAIL, 0);
+}
+
+/* This function should be called with the lock held. */
+static inline void fsp_elog_set_head_state(enum elog_head_state state)
+{
+ elog_set_head_state(false, state);
+ elog_read_from_fsp_head_state = state;
+}
+
+/*
+ * Once we have retried fetching a log from the FSP the maximum
+ * number of times, this function deletes the log from the
+ * pending list and resets the state so the next log can be fetched.
+ *
+ * This function should be called with the lock held.
+ */
+static void fsp_elog_fetch_failure(uint8_t fsp_status)
+{
+ struct fsp_log_entry *log_data;
+
+ /* Read top list and delete the node */
+ log_data = list_top(&elog_read_pending, struct fsp_log_entry, link);
+ if (!log_data) {
+ /**
+ * @fwts-label ElogFetchFailureInconsistent
+ * @fwts-advice Inconsistent state between OPAL and FSP
+ * in code path for handling failure of fetching error log
+ * from FSP. Likely a bug in interaction between FSP and OPAL.
+ */
+ prlog(PR_ERR, "%s: Inconsistent internal list state !\n",
+ __func__);
+ } else {
+ list_del(&log_data->link);
+ list_add(&elog_read_free, &log_data->link);
+ prerror("ELOG: received invalid data: %x FSP status: 0x%x\n",
+ log_data->log_id, fsp_status);
+ }
+
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+}
+
+/* Read response value from FSP for fetch sp data mbox command */
+static void fsp_elog_read_complete(struct fsp_msg *read_msg)
+{
+ uint8_t val;
+
+ lock(&elog_read_lock);
+ val = (read_msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(read_msg);
+ if (elog_read_from_fsp_head_state == ELOG_STATE_REJECTED) {
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+ goto elog_read_out;
+ }
+
+ switch (val) {
+ case FSP_STATUS_SUCCESS:
+ fsp_elog_set_head_state(ELOG_STATE_FETCHED_DATA);
+ break;
+
+ case FSP_STATUS_DMA_ERROR:
+ if (elog_read_retries++ < MAX_RETRIES) {
+ /*
+ * On an error response from the FSP we retry the
+ * fetch sp data mbox command up to MAX_RETRIES
+ * times. If the response is still not valid after
+ * that, we give up and record the fetch failure.
+ */
+ fsp_elog_queue_fetch();
+ break;
+ }
+
+ fsp_elog_fetch_failure(val);
+ break;
+
+ default:
+ fsp_elog_fetch_failure(val);
+ }
+
+elog_read_out:
+ unlock(&elog_read_lock);
+
+ /* Check if a new log needs fetching */
+ fsp_elog_check_and_fetch_head();
+}
+
+/* Read error log from FSP through mbox commands */
+static void fsp_elog_queue_fetch(void)
+{
+ int rc;
+ uint8_t flags = 0;
+ struct fsp_log_entry *entry;
+
+ entry = list_top(&elog_read_pending, struct fsp_log_entry, link);
+ if (!entry) {
+ /**
+ * @fwts-label ElogQueueInconsistent
+ * @fwts-advice Bug in interaction between FSP and OPAL. We
+ * expected there to be a pending read from FSP but the list
+ * was empty.
+ */
+ prlog(PR_ERR, "%s: Inconsistent internal list state !\n",
+ __func__);
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+ return;
+ }
+
+ fsp_elog_set_head_state(ELOG_STATE_FETCHING);
+ elog_head_id = entry->log_id;
+ elog_head_size = entry->log_size;
+ rc = fsp_fetch_data_queue(flags, FSP_DATASET_ERRLOG, elog_head_id,
+ 0, (void *)PSI_DMA_ERRLOG_READ_BUF,
+ &elog_head_size, fsp_elog_read_complete);
+ if (rc) {
+ prerror("ELOG: failed to queue read message: %d\n", rc);
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+ }
+}
+
+/* OPAL interface for PowerNV to read log size and log ID from Sapphire. */
+static int64_t fsp_opal_elog_info(__be64 *opal_elog_id,
+ __be64 *opal_elog_size, __be64 *elog_type)
+{
+ struct fsp_log_entry *log_data;
+
+ /* Copy type of the error log */
+ *elog_type = cpu_to_be64(ELOG_TYPE_PEL);
+
+ /* Check if any OPAL log needs to be reported to the host */
+ if (opal_elog_info(opal_elog_id, opal_elog_size))
+ return OPAL_SUCCESS;
+
+ lock(&elog_read_lock);
+ if (elog_read_from_fsp_head_state != ELOG_STATE_FETCHED_DATA) {
+ unlock(&elog_read_lock);
+ return OPAL_WRONG_STATE;
+ }
+
+ log_data = list_top(&elog_read_pending, struct fsp_log_entry, link);
+ if (!log_data) {
+ /**
+ * @fwts-label ElogInfoInconsistentState
+ * @fwts-advice We expected there to be an entry in the list
+ * of error logs for the error log we're fetching information
+ * for. There wasn't. This means there's a bug.
+ */
+ prlog(PR_ERR, "%s: Inconsistent internal list state !\n",
+ __func__);
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+ unlock(&elog_read_lock);
+ return OPAL_WRONG_STATE;
+ }
+
+ *opal_elog_id = cpu_to_be64(log_data->log_id);
+ *opal_elog_size = cpu_to_be64(log_data->log_size);
+ fsp_elog_set_head_state(ELOG_STATE_HOST_INFO);
+ unlock(&elog_read_lock);
+ return OPAL_SUCCESS;
+}
+
+/* OPAL interface for PowerNV to read log from Sapphire. */
+static int64_t fsp_opal_elog_read(void *buffer, uint64_t opal_elog_size,
+ uint64_t opal_elog_id)
+{
+ int size = opal_elog_size;
+ struct fsp_log_entry *log_data;
+
+ /* Check if any OPAL log needs to be reported to the PowerNV */
+ if (opal_elog_read(buffer, opal_elog_size, opal_elog_id))
+ return OPAL_SUCCESS;
+
+ /*
+ * Read the top entry from the list; the head of the pending
+ * list is always the record that has been fetched from the FSP.
+ */
+ lock(&elog_read_lock);
+ if (elog_read_from_fsp_head_state != ELOG_STATE_HOST_INFO) {
+ unlock(&elog_read_lock);
+ return OPAL_WRONG_STATE;
+ }
+
+ log_data = list_top(&elog_read_pending, struct fsp_log_entry, link);
+ if (!log_data) {
+ /**
+ * @fwts-label ElogReadInconsistentState
+ * @fwts-advice Inconsistent state while reading error log
+ * from FSP. Bug in OPAL and FSP interaction.
+ */
+ prlog(PR_ERR, "%s: Inconsistent internal list state !\n",
+ __func__);
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+ unlock(&elog_read_lock);
+ return OPAL_WRONG_STATE;
+ }
+
+ /* Check log ID and then read log from buffer */
+ if (opal_elog_id != log_data->log_id) {
+ unlock(&elog_read_lock);
+ return OPAL_PARAMETER;
+ }
+
+ /* Do not copy more than actual log size */
+ if (opal_elog_size > log_data->log_size)
+ size = log_data->log_size;
+
+ memset(buffer, 0, opal_elog_size);
+ memcpy(buffer, elog_read_buffer, size);
+
+ /*
+ * Once the log has been read by Linux, move the record from the
+ * pending list to the processed list and reset the state so that
+ * the next record can be fetched.
+ */
+ list_del(&log_data->link);
+ list_add(&elog_read_processed, &log_data->link);
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+ unlock(&elog_read_lock);
+
+ /* Read error log from FSP */
+ fsp_elog_check_and_fetch_head();
+
+ return OPAL_SUCCESS;
+}
+
+/* Reset the head state before re-fetching; reject any fetch in flight. */
+static void elog_reject_head(void)
+{
+ if (elog_read_from_fsp_head_state == ELOG_STATE_FETCHING)
+ fsp_elog_set_head_state(ELOG_STATE_REJECTED);
+ else
+ fsp_elog_set_head_state(ELOG_STATE_NONE);
+}
+
+/* OPAL interface for PowerNV to send ack to FSP with log ID */
+static int64_t fsp_opal_elog_ack(uint64_t ack_id)
+{
+ int rc = 0;
+ struct fsp_log_entry *record, *next_record;
+
+ if (opal_elog_ack(ack_id))
+ return rc;
+
+ /* Send acknowledgement to FSP */
+ rc = fsp_send_elog_ack(ack_id);
+ if (rc != OPAL_SUCCESS) {
+ prerror("ELOG: failed to send acknowledgement: %d\n", rc);
+ return rc;
+ }
+
+ lock(&elog_read_lock);
+ list_for_each_safe(&elog_read_processed, record, next_record, link) {
+ if (record->log_id != ack_id)
+ continue;
+
+ list_del(&record->link);
+ list_add(&elog_read_free, &record->link);
+ unlock(&elog_read_lock);
+ return rc;
+ }
+
+ list_for_each_safe(&elog_read_pending, record, next_record, link) {
+ if (record->log_id != ack_id)
+ continue;
+ /*
+ * PowerNV has sent an ACK without reading the actual data.
+ * Because of this, elog_read_from_fsp_head_state may be stuck
+ * in the wrong state (ELOG_STATE_HOST_INFO) and unable to send
+ * the remaining elogs to PowerNV. Reset the ELOG state and
+ * resume sending the remaining elogs.
+ */
+ list_del(&record->link);
+ list_add(&elog_read_free, &record->link);
+ elog_reject_head();
+ unlock(&elog_read_lock);
+ fsp_elog_check_and_fetch_head();
+ return rc;
+ }
+
+ unlock(&elog_read_lock);
+ return OPAL_PARAMETER;
+}
+
+/*
+ * After Linux kexec's, it asks us to resend all logs that
+ * it has not yet acknowledged.
+ */
+static void fsp_opal_resend_pending_logs(void)
+{
+ struct fsp_log_entry *entry;
+
+ lock(&elog_read_lock);
+ elog_enabled = true;
+ unlock(&elog_read_lock);
+
+ /* Check if any Sapphire logs are pending. */
+ opal_resend_pending_logs();
+
+ lock(&elog_read_lock);
+ /*
+ * If the processed list is not empty, move every record from
+ * the processed list back to the head of the pending list.
+ */
+ while (!list_empty(&elog_read_processed)) {
+ entry = list_pop(&elog_read_processed,
+ struct fsp_log_entry, link);
+ list_add(&elog_read_pending, &entry->link);
+ }
+
+ unlock(&elog_read_lock);
+
+ /* Read error log from FSP */
+ elog_reject_head();
+ fsp_elog_check_and_fetch_head();
+}
+
+/* Disable ELOG event flag until PowerNV is ready to receive event */
+static bool opal_kexec_elog_notify(void *data __unused)
+{
+ lock(&elog_read_lock);
+ elog_enabled = false;
+ opal_update_pending_evt(OPAL_EVENT_ERROR_LOG_AVAIL, 0);
+ unlock(&elog_read_lock);
+
+ return true;
+}
+
+/* FSP elog notify function */
+static bool fsp_elog_msg(uint32_t cmd_sub_mod, struct fsp_msg *msg)
+{
+ int rc = 0;
+ struct fsp_log_entry *record;
+ uint32_t log_id;
+ uint32_t log_size;
+
+ if (cmd_sub_mod != FSP_CMD_ERRLOG_NOTIFICATION)
+ return false;
+
+ log_id = fsp_msg_get_data_word(msg, 0);
+ log_size = fsp_msg_get_data_word(msg, 1);
+
+ prlog(PR_TRACE, "ELOG: Notified of log 0x%08x (size: %d)\n",
+ log_id, log_size);
+
+ /* Make sure we don't cross read buffer size */
+ if (log_size > ELOG_READ_BUFFER_SIZE) {
+ log_size = ELOG_READ_BUFFER_SIZE;
+ printf("ELOG: Truncated log (0x%08x) to 0x%x\n",
+ log_id, log_size);
+ }
+
+ /* Take a lock until we take out the node from elog_read_free */
+ lock(&elog_read_lock);
+ if (!list_empty(&elog_read_free)) {
+ /* Create a new entry in the pending list. */
+ record = list_pop(&elog_read_free, struct fsp_log_entry, link);
+ record->log_id = log_id;
+ record->log_size = log_size;
+ list_add_tail(&elog_read_pending, &record->link);
+ unlock(&elog_read_lock);
+
+ /* Send response back to FSP for a new elog notify message. */
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_RSP_ERRLOG_NOTIFICATION,
+ 1, log_id), fsp_freemsg);
+ if (rc)
+ prerror("ELOG: Failed to queue errlog notification"
+ " response: %d\n", rc);
+
+ /* Read error log from FSP */
+ fsp_elog_check_and_fetch_head();
+
+ } else {
+ prlog(PR_TRACE, "ELOG: Log entry 0x%08x discarded\n", log_id);
+
+ /* Free list is empty: drop the lock before queueing responses. */
+ unlock(&elog_read_lock);
+
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_RSP_ERRLOG_NOTIFICATION,
+ 1, log_id), fsp_freemsg);
+ if (rc)
+ prerror("ELOG: Failed to queue errlog notification"
+ " response: %d\n", rc);
+
+ /*
+ * The list already holds the maximum number of records, so we
+ * send a "discarded by PHYP (condition full)" ack to the FSP.
+ *
+ * At some point in the future we'll get notified again; it is
+ * largely up to the FSP when it tells us about this log again.
+ */
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_ERRLOG_PHYP_ACK | 0x02,
+ 1, log_id), fsp_freemsg);
+ if (rc)
+ prerror("ELOG: Failed to queue errlog ack"
+ " response: %d\n", rc);
+ }
+
+ return true;
+}
+
+static struct fsp_client fsp_get_elog_notify = {
+ .message = fsp_elog_msg,
+};
+
+/* Pre-allocate memory for reading error log from FSP */
+static int init_elog_read_free_list(uint32_t num_entries)
+{
+ struct fsp_log_entry *entry;
+ int i;
+
+ entry = zalloc(sizeof(struct fsp_log_entry) * num_entries);
+ if (!entry)
+ goto out_err;
+
+ for (i = 0; i < num_entries; ++i) {
+ list_add_tail(&elog_read_free, &entry->link);
+ entry++;
+ }
+
+ return 0;
+
+out_err:
+ return -ENOMEM;
+}
+
+/* FSP elog read init function */
+void fsp_elog_read_init(void)
+{
+ int val = 0;
+
+ if (!fsp_present())
+ return;
+
+ elog_read_buffer = memalign(TCE_PSIZE, ELOG_READ_BUFFER_SIZE);
+ if (!elog_read_buffer) {
+ prerror("FSP: could not allocate FSP ELOG_READ_BUFFER!\n");
+ return;
+ }
+
+ /* Map TCEs */
+ fsp_tce_map(PSI_DMA_ERRLOG_READ_BUF, elog_read_buffer,
+ PSI_DMA_ERRLOG_READ_BUF_SZ);
+
+ /* Pre-allocate memory for 128 records */
+ val = init_elog_read_free_list(ELOG_READ_MAX_RECORD);
+ if (val != 0)
+ return;
+
+ /* Register error log class D2 */
+ fsp_register_client(&fsp_get_elog_notify, FSP_MCLASS_ERR_LOG);
+
+ /* Register for sync on PowerNV reboot call */
+ opal_add_host_sync_notifier(opal_kexec_elog_notify, NULL);
+
+ /* Register OPAL interface */
+ opal_register(OPAL_ELOG_READ, fsp_opal_elog_read, 3);
+ opal_register(OPAL_ELOG_ACK, fsp_opal_elog_ack, 1);
+ opal_register(OPAL_ELOG_RESEND, fsp_opal_resend_pending_logs, 0);
+ opal_register(OPAL_ELOG_SIZE, fsp_opal_elog_info, 3);
+}
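
For illustration only: a minimal, self-contained sketch of the record lifecycle that the design comment at the top of fsp-elog-read.c describes (free -> pending on an FSP notification, pending -> processed when the host reads, processed -> free when the host acks). Every name below is hypothetical and a flat array stands in for skiboot's linked lists; only the state transitions mirror the driver above.

#include <stdio.h>

#define N_RECORDS 4

enum rec_state { REC_FREE, REC_PENDING, REC_PROCESSED };

struct rec {
	unsigned int log_id;
	enum rec_state state;
};

static struct rec records[N_RECORDS];	/* stand-in for the three lists */

/* FSP notifies us of a new log: claim a free record and mark it pending. */
static int notify(unsigned int log_id)
{
	int i;

	for (i = 0; i < N_RECORDS; i++) {
		if (records[i].state == REC_FREE) {
			records[i].log_id = log_id;
			records[i].state = REC_PENDING;
			return 0;
		}
	}
	return -1;	/* full: the real driver acks "condition full" */
}

/* Host reads the log: pending -> processed. */
static void host_read(unsigned int log_id)
{
	int i;

	for (i = 0; i < N_RECORDS; i++)
		if (records[i].state == REC_PENDING &&
		    records[i].log_id == log_id)
			records[i].state = REC_PROCESSED;
}

/* Host acks the log: processed -> free (the FSP is acked as well). */
static void host_ack(unsigned int log_id)
{
	int i;

	for (i = 0; i < N_RECORDS; i++)
		if (records[i].state == REC_PROCESSED &&
		    records[i].log_id == log_id)
			records[i].state = REC_FREE;
}

int main(void)
{
	notify(0x1001);
	host_read(0x1001);
	host_ack(0x1001);
	printf("record 0 is %s\n",
	       records[0].state == REC_FREE ? "free again" : "still in use");
	return 0;
}

Built with any hosted C compiler, one notify/read/ack cycle returns the record to the free pool, which is exactly why the driver pre-allocates a fixed number of records and discards notifications once they are all in use.
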
diff --git a/roms/skiboot/hw/fsp/fsp-elog-write.c b/roms/skiboot/hw/fsp/fsp-elog-write.c
new file mode 100644
index 000000000..7b26a1867
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-elog-write.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * This code enables generating error logs in Sapphire and pushing them to
+ * the FSP. Critical events from Sapphire that need to be reported are
+ * pushed to the FSP after converting the error log to Platform Error Log
+ * (PEL) format. This is the WRITE path to the FSP.
+ * (A minimal routing sketch follows this file's diff.)
+ *
+ * Copyright 2013-2016 IBM Corp.
+ */
+
+#include <cpu.h>
+#include <errno.h>
+#include <fsp.h>
+#include <fsp-elog.h>
+#include <lock.h>
+#include <opal-api.h>
+#include <pel.h>
+#include <pool.h>
+#include <skiboot.h>
+#include <timebase.h>
+
+static LIST_HEAD(elog_write_to_fsp_pending);
+static LIST_HEAD(elog_write_to_host_pending);
+static LIST_HEAD(elog_write_to_host_processed);
+
+static struct lock elog_write_lock = LOCK_UNLOCKED;
+static struct lock elog_panic_write_lock = LOCK_UNLOCKED;
+static struct lock elog_write_to_host_lock = LOCK_UNLOCKED;
+
+#define ELOG_WRITE_TO_FSP_BUFFER_SIZE 0x00004000
+/* Log buffer to copy OPAL log for write to FSP. */
+static void *elog_write_to_fsp_buffer;
+
+#define ELOG_PANIC_WRITE_BUFFER_SIZE 0x00004000
+static void *elog_panic_write_buffer;
+
+#define ELOG_WRITE_TO_HOST_BUFFER_SIZE 0x00004000
+static void *elog_write_to_host_buffer;
+
+static uint32_t elog_write_retries;
+
+/* Manipulate this only with elog_write_lock held */
+static uint32_t elog_plid_fsp_commit = -1;
+static enum elog_head_state elog_write_to_host_head_state = ELOG_STATE_NONE;
+
+/* Need forward declaration because of circular dependency */
+static int opal_send_elog_to_fsp(void);
+
+static void remove_elog_head_entry(void)
+{
+ struct errorlog *head, *entry;
+
+ lock(&elog_write_lock);
+ if (!list_empty(&elog_write_to_fsp_pending)) {
+ head = list_top(&elog_write_to_fsp_pending,
+ struct errorlog, link);
+ if (head->plid == elog_plid_fsp_commit) {
+ entry = list_pop(&elog_write_to_fsp_pending,
+ struct errorlog, link);
+ opal_elog_complete(entry,
+ elog_write_retries < MAX_RETRIES);
+ /* Reset the counter */
+ elog_plid_fsp_commit = -1;
+ }
+ }
+
+ elog_write_retries = 0;
+ unlock(&elog_write_lock);
+}
+
+static void opal_fsp_write_complete(struct fsp_msg *read_msg)
+{
+ uint8_t val;
+
+ val = (read_msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(read_msg);
+
+ switch (val) {
+ case FSP_STATUS_SUCCESS:
+ remove_elog_head_entry();
+ break;
+ default:
+ if (elog_write_retries++ >= MAX_RETRIES) {
+ remove_elog_head_entry();
+ prerror("ELOG: Error in writing to FSP (0x%x)!\n", val);
+ }
+
+ break;
+ }
+
+ if (opal_send_elog_to_fsp() != OPAL_SUCCESS)
+ prerror("ELOG: Error sending elog to FSP !\n");
+}
+
+/* Write PEL format hex dump of the log to FSP */
+static int64_t fsp_opal_elog_write(size_t opal_elog_size)
+{
+ struct fsp_msg *elog_msg;
+
+ elog_msg = fsp_mkmsg(FSP_CMD_CREATE_ERRLOG, 3, opal_elog_size,
+ 0, PSI_DMA_ERRLOG_WRITE_BUF);
+ if (!elog_msg) {
+ prerror("ELOG: Failed to create message for WRITE to FSP\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ if (fsp_queue_msg(elog_msg, opal_fsp_write_complete)) {
+ fsp_freemsg(elog_msg);
+ elog_msg = NULL;
+ prerror("FSP: Error queueing elog update\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+/* This should be called with elog_write_to_host_lock held. */
+static inline void fsp_elog_write_set_head_state(enum elog_head_state state)
+{
+ elog_set_head_state(true, state);
+ elog_write_to_host_head_state = state;
+}
+
+bool opal_elog_info(__be64 *opal_elog_id, __be64 *opal_elog_size)
+{
+ struct errorlog *head;
+ bool rc = false;
+
+ lock(&elog_write_to_host_lock);
+ if (elog_write_to_host_head_state == ELOG_STATE_FETCHED_DATA) {
+ head = list_top(&elog_write_to_host_pending,
+ struct errorlog, link);
+ if (!head) {
+ /**
+ * @fwts-label ElogListInconsistent
+ * @fwts-advice Bug in interaction between FSP and
+ * OPAL. The state maintained by OPAL didn't match
+ * what the FSP sent.
+ */
+ prlog(PR_ERR,
+ "%s: Inconsistent internal list state !\n",
+ __func__);
+ fsp_elog_write_set_head_state(ELOG_STATE_NONE);
+ } else {
+ *opal_elog_id = cpu_to_be64(head->plid);
+ *opal_elog_size = cpu_to_be64(head->log_size);
+ fsp_elog_write_set_head_state(ELOG_STATE_HOST_INFO);
+ rc = true;
+ }
+ }
+
+ unlock(&elog_write_to_host_lock);
+ return rc;
+}
+
+static void opal_commit_elog_in_host(void)
+{
+ struct errorlog *buf;
+
+ lock(&elog_write_to_host_lock);
+ if (!list_empty(&elog_write_to_host_pending) &&
+ (elog_write_to_host_head_state == ELOG_STATE_NONE)) {
+ buf = list_top(&elog_write_to_host_pending,
+ struct errorlog, link);
+ buf->log_size = create_pel_log(buf,
+ (char *)elog_write_to_host_buffer,
+ ELOG_WRITE_TO_HOST_BUFFER_SIZE);
+ fsp_elog_write_set_head_state(ELOG_STATE_FETCHED_DATA);
+ }
+
+ unlock(&elog_write_to_host_lock);
+}
+
+bool opal_elog_read(void *buffer, uint64_t opal_elog_size,
+ uint64_t opal_elog_id)
+{
+ struct errorlog *log_data;
+ bool rc = false;
+
+ lock(&elog_write_to_host_lock);
+ if (elog_write_to_host_head_state == ELOG_STATE_HOST_INFO) {
+ log_data = list_top(&elog_write_to_host_pending,
+ struct errorlog, link);
+ if (!log_data) {
+ fsp_elog_write_set_head_state(ELOG_STATE_NONE);
+ unlock(&elog_write_to_host_lock);
+ return rc;
+ }
+
+ if ((opal_elog_id != log_data->plid) &&
+ (opal_elog_size != log_data->log_size)) {
+ unlock(&elog_write_to_host_lock);
+ return rc;
+ }
+
+ memcpy(buffer, elog_write_to_host_buffer, opal_elog_size);
+ list_del(&log_data->link);
+ list_add(&elog_write_to_host_processed, &log_data->link);
+ fsp_elog_write_set_head_state(ELOG_STATE_NONE);
+ rc = true;
+ }
+
+ unlock(&elog_write_to_host_lock);
+ opal_commit_elog_in_host();
+ return rc;
+}
+
+bool opal_elog_ack(uint64_t ack_id)
+{
+ bool rc = false;
+ struct errorlog *log_data;
+ struct errorlog *record, *next_record;
+
+ lock(&elog_write_to_host_lock);
+ if (!list_empty(&elog_write_to_host_processed)) {
+ list_for_each_safe(&elog_write_to_host_processed, record,
+ next_record, link) {
+ if (record->plid != ack_id)
+ continue;
+
+ list_del(&record->link);
+ opal_elog_complete(record, true);
+ rc = true;
+ }
+ }
+
+ if ((!rc) && (!list_empty(&elog_write_to_host_pending))) {
+ log_data = list_top(&elog_write_to_host_pending,
+ struct errorlog, link);
+ if (ack_id == log_data->plid)
+ fsp_elog_write_set_head_state(ELOG_STATE_NONE);
+
+ list_for_each_safe(&elog_write_to_host_pending, record,
+ next_record, link) {
+ if (record->plid != ack_id)
+ continue;
+
+ list_del(&record->link);
+ opal_elog_complete(record, true);
+ rc = true;
+ unlock(&elog_write_to_host_lock);
+ opal_commit_elog_in_host();
+ return rc;
+ }
+ }
+
+ unlock(&elog_write_to_host_lock);
+ return rc;
+}
+
+void opal_resend_pending_logs(void)
+{
+ struct errorlog *record;
+
+ lock(&elog_write_to_host_lock);
+ while (!list_empty(&elog_write_to_host_processed)) {
+ record = list_pop(&elog_write_to_host_processed,
+ struct errorlog, link);
+ list_add_tail(&elog_write_to_host_pending, &record->link);
+ }
+
+ fsp_elog_write_set_head_state(ELOG_STATE_NONE);
+ unlock(&elog_write_to_host_lock);
+ opal_commit_elog_in_host();
+}
+
+static inline u64 get_elog_timeout(void)
+{
+ return (mftb() + secs_to_tb(ERRORLOG_TIMEOUT_INTERVAL));
+}
+
+static int opal_send_elog_to_fsp(void)
+{
+ struct errorlog *head;
+ int rc = OPAL_SUCCESS;
+
+ /*
+ * Convert entry to PEL and push it down to FSP.
+ * Then we wait for the ack from FSP.
+ */
+ lock(&elog_write_lock);
+ if (!list_empty(&elog_write_to_fsp_pending)) {
+ head = list_top(&elog_write_to_fsp_pending,
+ struct errorlog, link);
+ /* Error needs to be committed; update the timeout value */
+ head->elog_timeout = get_elog_timeout();
+
+ elog_plid_fsp_commit = head->plid;
+ head->log_size = create_pel_log(head,
+ (char *)elog_write_to_fsp_buffer,
+ ELOG_WRITE_TO_FSP_BUFFER_SIZE);
+ rc = fsp_opal_elog_write(head->log_size);
+ unlock(&elog_write_lock);
+ return rc;
+ }
+
+ unlock(&elog_write_lock);
+ return rc;
+}
+
+static int opal_push_logs_sync_to_fsp(struct errorlog *buf)
+{
+ struct fsp_msg *elog_msg;
+ int opal_elog_size = 0;
+ int rc = OPAL_SUCCESS;
+
+ lock(&elog_panic_write_lock);
+
+ /* Error needs to be committed; update the timeout value */
+ buf->elog_timeout = get_elog_timeout();
+
+ opal_elog_size = create_pel_log(buf,
+ (char *)elog_panic_write_buffer,
+ ELOG_PANIC_WRITE_BUFFER_SIZE);
+
+ elog_msg = fsp_mkmsg(FSP_CMD_CREATE_ERRLOG, 3, opal_elog_size,
+ 0, PSI_DMA_ELOG_PANIC_WRITE_BUF);
+ if (!elog_msg) {
+ prerror("ELOG: PLID: 0x%x Failed to create message for WRITE "
+ "to FSP\n", buf->plid);
+ unlock(&elog_panic_write_lock);
+ opal_elog_complete(buf, false);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ if (fsp_sync_msg(elog_msg, false)) {
+ fsp_freemsg(elog_msg);
+ rc = OPAL_INTERNAL_ERROR;
+ } else {
+ rc = (elog_msg->resp->word1 >> 8) & 0xff;
+ fsp_freemsg(elog_msg);
+ }
+
+ unlock(&elog_panic_write_lock);
+ if (rc != OPAL_SUCCESS)
+ opal_elog_complete(buf, false);
+ else
+ opal_elog_complete(buf, true);
+
+ return rc;
+}
+
+int elog_fsp_commit(struct errorlog *buf)
+{
+ int rc = OPAL_SUCCESS;
+
+ if (buf->event_severity == OPAL_ERROR_PANIC) {
+ rc = opal_push_logs_sync_to_fsp(buf);
+ return rc;
+ }
+
+ lock(&elog_write_lock);
+ if (list_empty(&elog_write_to_fsp_pending)) {
+ list_add_tail(&elog_write_to_fsp_pending, &buf->link);
+ unlock(&elog_write_lock);
+ rc = opal_send_elog_to_fsp();
+ return rc;
+ }
+
+ list_add_tail(&elog_write_to_fsp_pending, &buf->link);
+ unlock(&elog_write_lock);
+ return rc;
+}
+
+static void elog_append_write_to_host(struct errorlog *buf)
+{
+ lock(&elog_write_to_host_lock);
+ if (list_empty(&elog_write_to_host_pending)) {
+ list_add(&elog_write_to_host_pending, &buf->link);
+ unlock(&elog_write_to_host_lock);
+ opal_commit_elog_in_host();
+ } else {
+ list_add_tail(&elog_write_to_host_pending, &buf->link);
+ unlock(&elog_write_to_host_lock);
+ }
+}
+
+static void elog_timeout_poll(void *data __unused)
+{
+ uint64_t now;
+ struct errorlog *head, *entry;
+
+ lock(&elog_write_lock);
+ if (list_empty(&elog_write_to_fsp_pending)) {
+ unlock(&elog_write_lock);
+ return;
+ }
+
+ head = list_top(&elog_write_to_fsp_pending, struct errorlog, link);
+ now = mftb();
+ if ((tb_compare(now, head->elog_timeout) == TB_AAFTERB) ||
+ (tb_compare(now, head->elog_timeout) == TB_AEQUALB)) {
+ entry = list_pop(&elog_write_to_fsp_pending,
+ struct errorlog, link);
+ unlock(&elog_write_lock);
+ elog_append_write_to_host(entry);
+ } else {
+ unlock(&elog_write_lock);
+ }
+}
+
+/* FSP elog init function */
+void fsp_elog_write_init(void)
+{
+ if (!fsp_present())
+ return;
+
+ elog_panic_write_buffer = memalign(TCE_PSIZE,
+ ELOG_PANIC_WRITE_BUFFER_SIZE);
+ if (!elog_panic_write_buffer) {
+ prerror("FSP: could not allocate ELOG_PANIC_WRITE_BUFFER!\n");
+ return;
+ }
+
+ elog_write_to_fsp_buffer = memalign(TCE_PSIZE,
+ ELOG_WRITE_TO_FSP_BUFFER_SIZE);
+ if (!elog_write_to_fsp_buffer) {
+ prerror("FSP: could not allocate ELOG_WRITE_BUFFER!\n");
+ return;
+ }
+
+ elog_write_to_host_buffer = memalign(TCE_PSIZE,
+ ELOG_WRITE_TO_HOST_BUFFER_SIZE);
+ if (!elog_write_to_host_buffer) {
+ prerror("FSP: could not allocate ELOG_WRITE_TO_HOST_BUFFER!\n");
+ return;
+ }
+
+ /* Map TCEs */
+ fsp_tce_map(PSI_DMA_ELOG_PANIC_WRITE_BUF, elog_panic_write_buffer,
+ PSI_DMA_ELOG_PANIC_WRITE_BUF_SZ);
+
+ fsp_tce_map(PSI_DMA_ERRLOG_WRITE_BUF, elog_write_to_fsp_buffer,
+ PSI_DMA_ERRLOG_WRITE_BUF_SZ);
+
+ elog_init();
+
+ /* Add a poller */
+ opal_add_poller(elog_timeout_poll, NULL);
+}
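
For illustration only: a minimal sketch of the commit routing described in the header comment of fsp-elog-write.c, where panic-severity logs are pushed to the FSP synchronously and everything else is queued for the asynchronous write path. The types and function names below are invented stand-ins, not skiboot APIs.

#include <stdio.h>

enum severity { SEV_INFO, SEV_ERROR, SEV_PANIC };

struct log_entry {
	enum severity sev;
	unsigned int plid;
};

/* Stand-in for opal_push_logs_sync_to_fsp(): blocks until the FSP answers. */
static void push_sync(const struct log_entry *l)
{
	printf("PLID 0x%x pushed synchronously\n", l->plid);
}

/* Stand-in for queueing on elog_write_to_fsp_pending and kicking the sender. */
static void queue_async(const struct log_entry *l)
{
	printf("PLID 0x%x queued; the poller and timeout drive it from here\n",
	       l->plid);
}

/* Mirrors the severity decision made in elog_fsp_commit(). */
static void commit(const struct log_entry *l)
{
	if (l->sev == SEV_PANIC)
		push_sync(l);
	else
		queue_async(l);
}

int main(void)
{
	struct log_entry panic_log = { SEV_PANIC, 0x10 };
	struct log_entry info_log = { SEV_INFO, 0x11 };

	commit(&panic_log);
	commit(&info_log);
	return 0;
}
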
diff --git a/roms/skiboot/hw/fsp/fsp-epow.c b/roms/skiboot/hw/fsp/fsp-epow.c
new file mode 100644
index 000000000..8869e91e6
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-epow.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * FSP Environmental and Power Warnings (EPOW) support
+ *
+ * Copyright 2013-2016 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "FSP-EPOW: " fmt
+
+#include <fsp.h>
+#include <device.h>
+#include <lock.h>
+#include <opal-msg.h>
+#include <opal-api.h>
+
+#include "fsp-epow.h"
+
+/*
+ * System EPOW status
+ *
+ * This array is exported to the host. Each element in
+ * [0...(OPAL_SYSEPOW_MAX-1)] contains the bitwise EPOW event info for a
+ * particular EPOW sub class. For example,
+ * epow_status[OPAL_SYSEPOW_POWER] reflects power-related EPOW events.
+ */
+static int16_t epow_status[OPAL_SYSEPOW_MAX];
+
+/* EPOW lock */
+static struct lock epow_lock = LOCK_UNLOCKED;
+
+/* Process the EPOW information sent by the FSP */
+static void epow_process_ex1_event(u8 *epow)
+{
+ memset(epow_status, 0, sizeof(epow_status));
+
+ if (epow[4] == EPOW_TMP_INT) {
+ prlog(PR_INFO, "Internal temp above normal\n");
+ epow_status[OPAL_SYSEPOW_TEMP] = OPAL_SYSTEMP_INT;
+
+ } else if (epow[4] == EPOW_TMP_AMB) {
+ prlog(PR_INFO, "Ambient temp above normal\n");
+ epow_status[OPAL_SYSEPOW_TEMP] = OPAL_SYSTEMP_AMB;
+
+ } else if (epow[4] == EPOW_ON_UPS) {
+ prlog(PR_INFO, "System running on UPS power\n");
+ epow_status[OPAL_SYSEPOW_POWER] = OPAL_SYSPOWER_UPS;
+
+ }
+}
+
+/* Process EPOW event */
+static void fsp_process_epow(struct fsp_msg *msg, int epow_type)
+{
+ int rc;
+ u8 epow[8];
+ bool epow_changed = false;
+ int16_t old_epow_status[OPAL_SYSEPOW_MAX];
+
+ /* Basic EPOW signature */
+ if (msg->data.bytes[0] != 0xF2) {
+ /**
+ * @fwts-label EPOWSignatureMismatch
+ * @fwts-advice Bug in skiboot/FSP code for EPOW event handling
+ */
+ prlog(PR_ERR, "Signature mismatch\n");
+ return;
+ }
+
+ lock(&epow_lock);
+
+ /* Copy over and clear system EPOW status */
+ memcpy(old_epow_status, epow_status, sizeof(old_epow_status));
+
+ switch(epow_type) {
+ case EPOW_NORMAL:
+ case EPOW_EX2:
+ break;
+ case EPOW_EX1:
+ epow[0] = msg->data.bytes[0];
+ epow[1] = msg->data.bytes[1];
+ epow[2] = msg->data.bytes[2];
+ epow[3] = msg->data.bytes[3];
+ epow[4] = msg->data.bytes[4];
+
+ epow_process_ex1_event(epow);
+ break;
+ default:
+ prlog(PR_WARNING, "Unknown EPOW event notification\n");
+ break;
+ }
+
+ if (memcmp(epow_status, old_epow_status, sizeof(epow_status)))
+ epow_changed = true;
+
+ unlock(&epow_lock);
+
+ /* Send OPAL message notification */
+ if (epow_changed) {
+ rc = opal_queue_msg(OPAL_MSG_EPOW, NULL, NULL);
+ if (rc) {
+ /**
+ * @fwts-label EPOWMessageQueueFailed
+ * @fwts-advice Queueing a message from OPAL to FSP
+ * failed. This is likely due to either an OPAL bug
+ * or the FSP going away.
+ */
+ prlog(PR_ERR, "OPAL EPOW message queuing failed\n");
+ return;
+ }
+ prlog(PR_INFO, "Notified host about EPOW event\n");
+ }
+}
+
+/*
+ * EPOW OPAL interface
+ *
+ * The host requests the system EPOW status through this
+ * OPAL call, passing a buffer and its length.
+ * Sapphire fills the buffer with the current system EPOW status
+ * and then writes back into the length variable the number of
+ * EPOW sub classes it actually copied into the buffer.
+ * (A caller-side sketch follows this file's diff.)
+ */
+static int64_t fsp_opal_get_epow_status(__be16 *out_epow, __be16 *length)
+{
+ int i;
+ int n_epow_class;
+ int l = be16_to_cpu(*length);
+
+ /*
+ * The host and Sapphire versions may not match each other, and
+ * hence may disagree on the expected system EPOW status details.
+ * Newer hosts might expect status for more EPOW sub classes than
+ * Sapphire knows about, and older hosts might expect status for
+ * only a subset of the sub classes Sapphire knows about. Both
+ * situations are handled here.
+ *
+ * (A) Host version >= Sapphire version
+ *
+ * Sapphire sends out EPOW status for the sub classes it knows
+ * about and updates the length variable for the host.
+ *
+ * (B) Host version < Sapphire version
+ *
+ * Sapphire sends out EPOW status only for the sub classes the
+ * host knows about and can interpret correctly.
+ */
+ if (l >= OPAL_SYSEPOW_MAX) {
+ n_epow_class = OPAL_SYSEPOW_MAX;
+ *length = cpu_to_be16(OPAL_SYSEPOW_MAX);
+ } else {
+ n_epow_class = l;
+ }
+
+ /* Transfer EPOW Status */
+ for (i = 0; i < n_epow_class; i++)
+ out_epow[i] = cpu_to_be16(epow_status[i]);
+
+ return OPAL_SUCCESS;
+}
+
+/* Handle EPOW sub-commands from FSP */
+static bool fsp_epow_message(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ switch(cmd_sub_mod) {
+ case FSP_CMD_PANELSTATUS:
+ fsp_process_epow(msg, EPOW_NORMAL);
+ return true;
+ case FSP_CMD_PANELSTATUS_EX1:
+ fsp_process_epow(msg, EPOW_EX1);
+ return true;
+ case FSP_CMD_PANELSTATUS_EX2:
+ fsp_process_epow(msg, EPOW_EX2);
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_epow_client = {
+ .message = fsp_epow_message,
+};
+
+void fsp_epow_init(void)
+{
+ struct dt_node *np;
+
+ fsp_register_client(&fsp_epow_client, FSP_MCLASS_SERVICE);
+ opal_register(OPAL_GET_EPOW_STATUS, fsp_opal_get_epow_status, 2);
+ np = dt_new(opal_node, "epow");
+ dt_add_property_strings(np, "compatible", "ibm,opal-v3-epow");
+ dt_add_property_strings(np, "epow-classes", "power", "temperature", "cooling");
+ prlog(PR_INFO, "FSP EPOW support initialized\n");
+}
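
For illustration only: a caller-side sketch of the length negotiation that the EPOW OPAL interface comment above describes, with the firmware side reduced to a plain function. FW_SYSEPOW_MAX and the canned status values are hypothetical; only the clamp-and-write-back behaviour mirrors fsp_opal_get_epow_status().

#include <stdint.h>
#include <stdio.h>

#define FW_SYSEPOW_MAX 3	/* classes this firmware build knows about */

static const int16_t fw_status[FW_SYSEPOW_MAX] = { 0x0001, 0x0000, 0x0000 };

/* Mirrors the clamp-and-write-back contract of fsp_opal_get_epow_status(). */
static void get_epow_status(int16_t *out, uint16_t *length)
{
	uint16_t n = *length;
	int i;

	if (n >= FW_SYSEPOW_MAX) {
		n = FW_SYSEPOW_MAX;	/* newer host: clamp ... */
		*length = n;		/* ... and tell it how much it got */
	}
	for (i = 0; i < n; i++)
		out[i] = fw_status[i];
}

int main(void)
{
	int16_t buf[8] = { 0 };
	uint16_t len = 8;	/* host claims to understand 8 classes */

	get_epow_status(buf, &len);
	printf("firmware filled %d classes, power class = 0x%04x\n",
	       (int)len, (unsigned int)(uint16_t)buf[0]);
	return 0;
}
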
diff --git a/roms/skiboot/hw/fsp/fsp-epow.h b/roms/skiboot/hw/fsp/fsp-epow.h
new file mode 100644
index 000000000..bc1df258e
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-epow.h
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Handle FSP EPOW event notifications
+ *
+ * Copyright 2013-2015 IBM Corp.
+ */
+
+#ifndef __FSP_EPOW_H
+#define __FSP_EPOW_H
+
+/* FSP based EPOW event notifications */
+#define EPOW_NORMAL 0x00 /* Panel status normal */
+#define EPOW_EX1 0x01 /* Panel status extended 1 */
+#define EPOW_EX2 0x02 /* Panel status extended 2 */
+
+/* EPOW reason code notifications */
+#define EPOW_ON_UPS 1 /* System on UPS */
+#define EPOW_TMP_AMB 2 /* Over ambient temperature */
+#define EPOW_TMP_INT 3 /* Over internal temperature */
+
+#endif
diff --git a/roms/skiboot/hw/fsp/fsp-ipmi.c b/roms/skiboot/hw/fsp/fsp-ipmi.c
new file mode 100644
index 000000000..e368c2828
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-ipmi.c
@@ -0,0 +1,400 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Conduit for IPMI messages to/from FSP
+ *
+ * Copyright 2014-2019 IBM Corp.
+ */
+
+#include <errorlog.h>
+#include <fsp.h>
+#include <ipmi.h>
+#include <lock.h>
+#include <opal-api.h>
+
+/*
+ * Under the hood, the FSP IPMI component implements the KCS (Keyboard
+ * Controller Style) interface.
+ *
+ * KCS interface request message format
+ *
+ * BYTE 1 BYTE 2 BYTE 3:N
+ * -------------------------------------
+ * | NetFn/LUN | Cmd | Data |
+ * -------------------------------------
+ *
+ * KCS interface response message format
+ *
+ * BYTE 1 BYTE 2 BYTE 3 BYTE 4:N
+ * ------------------------------------------------
+ * | NetFn/LUN | Cmd | CompCode | Data |
+ * ------------------------------------------------
+ *
+ */
+
+#define FSP_IPMI_REQ_MIN_LEN 2 /* NetFn + Cmd */
+#define FSP_IPMI_RESP_MIN_LEN 3 /* NetFn + Cmd + Completion code */
+
+DEFINE_LOG_ENTRY(OPAL_RC_IPMI_REQ, OPAL_PLATFORM_ERR_EVT, OPAL_IPMI,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+DEFINE_LOG_ENTRY(OPAL_RC_IPMI_RESP, OPAL_PLATFORM_ERR_EVT, OPAL_IPMI,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_IPMI_DMA_ERROR_RESP, OPAL_PLATFORM_ERR_EVT, OPAL_IPMI,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO,
+ OPAL_NA);
+
+struct fsp_ipmi_msg {
+ struct list_node link;
+ struct ipmi_msg ipmi_msg;
+};
+
+static struct fsp_ipmi {
+ struct list_head msg_queue;
+ void *ipmi_req_buf;
+ void *ipmi_resp_buf;
+ /* There can only be one outstanding request; its reference is stored
+ * in 'cur_msg'. The 'lock' protects against concurrent updates of it
+ * from the request and response paths, and also protects the list
+ * manipulation.
+ */
+ struct fsp_ipmi_msg *cur_msg;
+ struct lock lock;
+} fsp_ipmi;
+
+static int fsp_ipmi_send_request(void);
+
+static void fsp_ipmi_cmd_done(uint8_t cmd, uint8_t netfn, uint8_t cc)
+{
+ struct fsp_ipmi_msg *fsp_ipmi_msg = fsp_ipmi.cur_msg;
+
+ lock(&fsp_ipmi.lock);
+ if (fsp_ipmi.cur_msg == NULL) {
+ unlock(&fsp_ipmi.lock);
+ return;
+ }
+ list_del(&fsp_ipmi_msg->link);
+ fsp_ipmi.cur_msg = NULL;
+ unlock(&fsp_ipmi.lock);
+
+ ipmi_cmd_done(cmd, netfn, cc, &fsp_ipmi_msg->ipmi_msg);
+}
+
+
+static void fsp_ipmi_req_complete(struct fsp_msg *msg)
+{
+ uint8_t status = (msg->resp->word1 >> 8) & 0xff;
+ uint32_t length = fsp_msg_get_data_word(msg->resp, 0);
+ struct fsp_ipmi_msg *fsp_ipmi_msg = msg->user_data;
+ struct ipmi_msg *ipmi_msg;
+
+ fsp_freemsg(msg);
+
+ if (status != FSP_STATUS_SUCCESS) {
+ assert(fsp_ipmi_msg == fsp_ipmi.cur_msg);
+
+ ipmi_msg = &fsp_ipmi_msg->ipmi_msg;
+
+ if (length != (ipmi_msg->req_size + FSP_IPMI_REQ_MIN_LEN))
+ prlog(PR_DEBUG, "IPMI: Length mismatch in req completion "
+ "(%d, %d)\n", ipmi_msg->req_size, length);
+
+ log_simple_error(&e_info(OPAL_RC_IPMI_REQ), "IPMI: Request "
+ "failed with status:0x%02x\n", status);
+ /* FSP will not send the response now, so clear the current
+ * outstanding request
+ */
+ fsp_ipmi_cmd_done(ipmi_msg->cmd,
+ IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn),
+ IPMI_ERR_UNSPECIFIED);
+
+ /* Send the next request in the queue */
+ fsp_ipmi_send_request();
+ }
+}
+
+static int fsp_ipmi_send_request(void)
+{
+ uint8_t *req_buf = fsp_ipmi.ipmi_req_buf;
+ struct ipmi_msg *ipmi_msg;
+ struct fsp_msg *msg;
+ int rc;
+
+ if (fsp_in_rr())
+ return OPAL_BUSY;
+
+ lock(&fsp_ipmi.lock);
+ /* An outstanding request is still pending */
+ if (fsp_ipmi.cur_msg) {
+ unlock(&fsp_ipmi.lock);
+ return OPAL_SUCCESS;
+ }
+
+ fsp_ipmi.cur_msg = list_top(&fsp_ipmi.msg_queue, struct fsp_ipmi_msg,
+ link);
+ unlock(&fsp_ipmi.lock);
+
+ if (!fsp_ipmi.cur_msg)
+ return OPAL_SUCCESS;
+
+ ipmi_msg = &fsp_ipmi.cur_msg->ipmi_msg;
+ prlog(PR_TRACE, "IPMI: Send request, netfn:0x%02x, cmd:0x%02x, "
+ "req_len:%d\n", ipmi_msg->netfn, ipmi_msg->cmd, ipmi_msg->req_size);
+
+ /* KCS request message format */
+ *req_buf++ = ipmi_msg->netfn; /* BYTE 1 */
+ *req_buf++ = ipmi_msg->cmd; /* BYTE 2 */
+ if (ipmi_msg->req_size)
+ memcpy(req_buf, ipmi_msg->data, ipmi_msg->req_size);
+
+ msg = fsp_mkmsg(FSP_CMD_FETCH_PLAT_DATA, 5, 0, PSI_DMA_PLAT_REQ_BUF,
+ 0, PSI_DMA_PLAT_RESP_BUF,
+ ipmi_msg->req_size + FSP_IPMI_REQ_MIN_LEN);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_IPMI_REQ), "IPMI: Failed to "
+ "allocate request message\n");
+ fsp_ipmi_cmd_done(ipmi_msg->cmd,
+ IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn),
+ IPMI_ERR_UNSPECIFIED);
+ return OPAL_NO_MEM;
+ }
+
+ msg->user_data = fsp_ipmi.cur_msg;
+ rc = fsp_queue_msg(msg, fsp_ipmi_req_complete);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_IPMI_REQ), "IPMI: Failed to "
+ "queue request message (%d)\n", rc);
+ fsp_freemsg(msg);
+ fsp_ipmi_cmd_done(ipmi_msg->cmd,
+ IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn),
+ IPMI_ERR_UNSPECIFIED);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static struct ipmi_msg *fsp_ipmi_alloc_msg(size_t req_size, size_t resp_size)
+{
+ struct fsp_ipmi_msg *fsp_ipmi_msg;
+ struct ipmi_msg *ipmi_msg;
+
+ fsp_ipmi_msg = zalloc(sizeof(*fsp_ipmi_msg) + MAX(req_size, resp_size));
+ if (!fsp_ipmi_msg)
+ return NULL;
+
+ ipmi_msg = &fsp_ipmi_msg->ipmi_msg;
+
+ ipmi_msg->req_size = req_size;
+ ipmi_msg->resp_size = resp_size;
+ ipmi_msg->data = (uint8_t *)(fsp_ipmi_msg + 1);
+
+ return ipmi_msg;
+}
+
+static void fsp_ipmi_free_msg(struct ipmi_msg *ipmi_msg)
+{
+ struct fsp_ipmi_msg *fsp_ipmi_msg = container_of(ipmi_msg,
+ struct fsp_ipmi_msg, ipmi_msg);
+
+ free(fsp_ipmi_msg);
+}
+
+static int fsp_ipmi_queue_msg(struct ipmi_msg *ipmi_msg)
+{
+ struct fsp_ipmi_msg *fsp_ipmi_msg = container_of(ipmi_msg,
+ struct fsp_ipmi_msg, ipmi_msg);
+
+ if (fsp_in_rr())
+ return OPAL_BUSY;
+
+ lock(&fsp_ipmi.lock);
+ list_add_tail(&fsp_ipmi.msg_queue, &fsp_ipmi_msg->link);
+ unlock(&fsp_ipmi.lock);
+
+ return fsp_ipmi_send_request();
+}
+
+static int fsp_ipmi_queue_msg_head(struct ipmi_msg *ipmi_msg)
+{
+ struct fsp_ipmi_msg *fsp_ipmi_msg = container_of(ipmi_msg,
+ struct fsp_ipmi_msg, ipmi_msg);
+
+ if (fsp_in_rr())
+ return OPAL_BUSY;
+
+ lock(&fsp_ipmi.lock);
+ list_add(&fsp_ipmi.msg_queue, &fsp_ipmi_msg->link);
+ unlock(&fsp_ipmi.lock);
+
+ return fsp_ipmi_send_request();
+}
+
+static int fsp_ipmi_dequeue_msg(struct ipmi_msg *ipmi_msg)
+{
+ struct fsp_ipmi_msg *fsp_ipmi_msg = container_of(ipmi_msg,
+ struct fsp_ipmi_msg, ipmi_msg);
+
+ lock(&fsp_ipmi.lock);
+ list_del_from(&fsp_ipmi.msg_queue, &fsp_ipmi_msg->link);
+ unlock(&fsp_ipmi.lock);
+
+ return 0;
+}
+
+static struct ipmi_backend fsp_ipmi_backend = {
+ .alloc_msg = fsp_ipmi_alloc_msg,
+ .free_msg = fsp_ipmi_free_msg,
+ .queue_msg = fsp_ipmi_queue_msg,
+ .queue_msg_head = fsp_ipmi_queue_msg_head,
+ .dequeue_msg = fsp_ipmi_dequeue_msg,
+ /* FIXME: implement if ipmi_queue_msg_sync() is ever used on FSP */
+ .poll = NULL,
+};
+
+static bool fsp_ipmi_rr_notify(uint32_t cmd_sub_mod,
+ struct fsp_msg *msg __unused)
+{
+ struct ipmi_msg *ipmi_msg;
+
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ return true;
+ case FSP_RELOAD_COMPLETE:
+ /*
+ * We will not get a response for the outstanding request. Report
+ * an error to the caller and resume sending new IPMI messages.
+ */
+ if (fsp_ipmi.cur_msg) {
+ ipmi_msg = &fsp_ipmi.cur_msg->ipmi_msg;
+ fsp_ipmi_cmd_done(ipmi_msg->cmd,
+ IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn),
+ IPMI_ERR_UNSPECIFIED);
+ }
+ fsp_ipmi_send_request();
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_ipmi_client_rr = {
+ .message = fsp_ipmi_rr_notify,
+};
+
+static bool fsp_ipmi_send_response(uint32_t cmd)
+{
+ struct fsp_msg *resp;
+ int rc;
+
+ resp = fsp_mkmsg(cmd, 0);
+ if (!resp) {
+ log_simple_error(&e_info(OPAL_RC_IPMI_RESP), "IPMI: Failed to "
+ "allocate response message\n");
+ return false;
+ }
+
+ rc = fsp_queue_msg(resp, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(resp);
+ log_simple_error(&e_info(OPAL_RC_IPMI_RESP), "IPMI: Failed to "
+ "queue response message\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool fsp_ipmi_read_response(struct fsp_msg *msg)
+{
+ uint8_t *resp_buf = fsp_ipmi.ipmi_resp_buf;
+ uint32_t status = fsp_msg_get_data_word(msg, 3);
+ uint32_t length = fsp_msg_get_data_word(msg, 2);
+ struct ipmi_msg *ipmi_msg;
+ uint8_t netfn, cmd, cc;
+
+ assert(fsp_ipmi.cur_msg);
+ ipmi_msg = &fsp_ipmi.cur_msg->ipmi_msg;
+
+ /* Response TCE token */
+ assert(fsp_msg_get_data_word(msg, 1) == PSI_DMA_PLAT_RESP_BUF);
+
+ if (status != FSP_STATUS_SUCCESS) {
+ if (status == FSP_STATUS_DMA_ERROR)
+ log_simple_error(&e_info(OPAL_RC_IPMI_DMA_ERROR_RESP), "IPMI: Received "
+ "DMA ERROR response from FSP; this may be because the FSP "
+ "is in termination state: 0x%02x\n", status);
+ else
+ log_simple_error(&e_info(OPAL_RC_IPMI_RESP), "IPMI: FSP response "
+ "received with bad status:0x%02x\n", status);
+
+ fsp_ipmi_cmd_done(ipmi_msg->cmd,
+ IPMI_NETFN_RETURN_CODE(ipmi_msg->netfn),
+ IPMI_ERR_UNSPECIFIED);
+ return fsp_ipmi_send_response(FSP_RSP_PLAT_DATA |
+ FSP_STATUS_SUCCESS);
+ }
+
+ /* KCS response message format */
+ netfn = *resp_buf++;
+ cmd = *resp_buf++;
+ cc = *resp_buf++;
+ length -= FSP_IPMI_RESP_MIN_LEN;
+
+ prlog(PR_TRACE, "IPMI: fsp response received, netfn:0x%02x, cmd:0x%02x,"
+ " cc:0x%02x, length:%d\n", netfn, cmd, cc, length);
+
+ if (length > ipmi_msg->resp_size) {
+ prlog(PR_DEBUG, "IPMI: Length mismatch in response (%d, %d)\n",
+ length, ipmi_msg->resp_size);
+ length = ipmi_msg->resp_size; /* Truncate */
+ cc = IPMI_ERR_MSG_TRUNCATED;
+ }
+
+ ipmi_msg->resp_size = length;
+ if (length)
+ memcpy(ipmi_msg->data, resp_buf, length);
+
+ fsp_ipmi_cmd_done(cmd, netfn, cc);
+
+ return fsp_ipmi_send_response(FSP_RSP_PLAT_DATA);
+}
+
+static bool fsp_ipmi_response(uint32_t cmd_sub_mod, struct fsp_msg *msg)
+{
+ bool rc;
+
+ switch (cmd_sub_mod) {
+ case FSP_CMD_SEND_PLAT_DATA:
+ prlog(PR_TRACE, "FSP_CMD_SEND_PLAT_DATA command received\n");
+ rc = fsp_ipmi_read_response(msg);
+ break;
+ default:
+ return false;
+ };
+
+ /* If response sent successfully, pick the next request */
+ if (rc == true)
+ fsp_ipmi_send_request();
+
+ return rc;
+}
+
+static struct fsp_client fsp_ipmi_client = {
+ .message = fsp_ipmi_response,
+};
+
+void fsp_ipmi_init(void)
+{
+ fsp_tce_map(PSI_DMA_PLAT_REQ_BUF, fsp_ipmi.ipmi_req_buf,
+ PSI_DMA_PLAT_REQ_BUF_SIZE);
+ fsp_tce_map(PSI_DMA_PLAT_RESP_BUF, fsp_ipmi.ipmi_resp_buf,
+ PSI_DMA_PLAT_RESP_BUF_SIZE);
+
+ list_head_init(&fsp_ipmi.msg_queue);
+ init_lock(&fsp_ipmi.lock);
+
+ fsp_register_client(&fsp_ipmi_client, FSP_MCLASS_FETCH_SPDATA);
+ fsp_register_client(&fsp_ipmi_client_rr, FSP_MCLASS_RR_EVENT);
+ ipmi_register_backend(&fsp_ipmi_backend);
+}
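
For illustration only: a stand-alone sketch of the KCS framing documented at the top of fsp-ipmi.c, packing a request as NetFn/LUN, Cmd, Data and decoding a response as NetFn/LUN, Cmd, CompCode, Data. The helper names and sample bytes are hypothetical; only the byte layout follows the comment in the file above.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Request: BYTE 1 NetFn/LUN, BYTE 2 Cmd, BYTE 3..N Data. */
static size_t kcs_pack_req(uint8_t *buf, uint8_t netfn, uint8_t cmd,
			   const uint8_t *data, size_t len)
{
	buf[0] = netfn;
	buf[1] = cmd;
	if (len)
		memcpy(buf + 2, data, len);
	return len + 2;
}

/* Response: BYTE 1 NetFn/LUN, BYTE 2 Cmd, BYTE 3 CompCode, BYTE 4..N Data. */
static void kcs_print_resp(const uint8_t *buf, size_t len)
{
	printf("netfn 0x%02x cmd 0x%02x cc 0x%02x data bytes %zu\n",
	       buf[0], buf[1], buf[2], len - 3);
}

int main(void)
{
	uint8_t req[32];
	uint8_t data[1] = { 0x01 };
	uint8_t resp[4] = { 0x07, 0x01, 0x00, 0xaa };	/* sample reply */
	size_t n;

	n = kcs_pack_req(req, 0x06, 0x01, data, sizeof(data));
	printf("request is %zu bytes\n", n);
	kcs_print_resp(resp, sizeof(resp));
	return 0;
}
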
diff --git a/roms/skiboot/hw/fsp/fsp-leds.c b/roms/skiboot/hw/fsp/fsp-leds.c
new file mode 100644
index 000000000..5a552ab3e
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-leds.c
@@ -0,0 +1,1939 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * LED location code and indicator handling
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "FSPLED: " fmt
+#include <skiboot.h>
+#include <fsp.h>
+#include <device.h>
+#include <spcn.h>
+#include <lock.h>
+#include <errorlog.h>
+#include <opal.h>
+#include <opal-msg.h>
+#include <fsp-leds.h>
+#include <fsp-sysparam.h>
+
+#define buf_write(p, type, val) do { *(type *)(p) = val;\
+ p += sizeof(type); } while(0)
+#define buf_read(p, type, addr) do { *addr = *(type *)(p);\
+ p += sizeof(type); } while(0)
+
+/* SPCN replay threshold */
+#define SPCN_REPLAY_THRESHOLD 2
+
+/* LED support status */
+enum led_support_state {
+ LED_STATE_ABSENT,
+ LED_STATE_READING,
+ LED_STATE_PRESENT,
+};
+
+static enum led_support_state led_support = LED_STATE_ABSENT;
+
+/*
+ * PSI mapped buffer for LED data
+ *
+ * Mapped once and never unmapped. Used for fetching all
+ * available LED information and creating the list. Also
+ * used for setting individual LED state.
+ */
+static void *led_buffer;
+static u8 *loc_code_list_buffer = NULL;
+
+/* Maintain list of all LEDs
+ *
+ * The contents here are used to serve requests from FSP
+ * async commands and HV-initiated OPAL calls.
+ */
+static struct list_head cec_ledq; /* CEC LED list */
+static struct list_head encl_ledq; /* Enclosure LED list */
+static struct list_head spcn_cmdq; /* SPCN command queue */
+
+/* LED lock */
+static struct lock led_lock = LOCK_UNLOCKED;
+static struct lock spcn_cmd_lock = LOCK_UNLOCKED;
+static struct lock sai_lock = LOCK_UNLOCKED;
+
+static bool spcn_cmd_complete = true; /* SPCN command complete */
+
+/* Last SPCN command */
+static u32 last_spcn_cmd;
+static int replay = 0;
+
+/*
+ * The FSP controls the System Attention Indicator, but it expects the
+ * hypervisor to keep track of its status and to serve get-LED-state
+ * requests (from both Linux and the FSP itself)!
+ */
+static struct sai_data sai_data;
+
+/* Forward declaration */
+static void fsp_read_leds_data_complete(struct fsp_msg *msg);
+static int process_led_state_change(void);
+
+
+DEFINE_LOG_ENTRY(OPAL_RC_LED_SPCN, OPAL_PLATFORM_ERR_EVT, OPAL_LED,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_LED_BUFF, OPAL_PLATFORM_ERR_EVT, OPAL_LED,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_LED_LC, OPAL_PLATFORM_ERR_EVT, OPAL_LED,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_LED_STATE, OPAL_PLATFORM_ERR_EVT, OPAL_LED,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_LED_SUPPORT, OPAL_PLATFORM_ERR_EVT, OPAL_LED,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA);
+
+
+/* Find descendent LED record with CEC location code in CEC list */
+static struct fsp_led_data *fsp_find_cec_led(char *loc_code)
+{
+ struct fsp_led_data *led, *next;
+
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ if (strcmp(led->loc_code, loc_code))
+ continue;
+ return led;
+ }
+ return NULL;
+}
+
+/* Find encl LED record with ENCL location code in ENCL list */
+static struct fsp_led_data *fsp_find_encl_led(char *loc_code)
+{
+ struct fsp_led_data *led, *next;
+
+ list_for_each_safe(&encl_ledq, led, next, link) {
+ if (strcmp(led->loc_code, loc_code))
+ continue;
+ return led;
+ }
+ return NULL;
+}
+
+/* Find encl LED record with CEC location code in CEC list */
+static struct fsp_led_data *fsp_find_encl_cec_led(char *loc_code)
+{
+ struct fsp_led_data *led, *next;
+
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ if (strstr(led->loc_code, "-"))
+ continue;
+ if (!strstr(loc_code, led->loc_code))
+ continue;
+ return led;
+ }
+ return NULL;
+}
+
+/* Find encl LED record with CEC location code in ENCL list */
+static struct fsp_led_data *fsp_find_encl_encl_led(char *loc_code)
+{
+ struct fsp_led_data *led, *next;
+
+ list_for_each_safe(&encl_ledq, led, next, link) {
+ if (!strstr(loc_code, led->loc_code))
+ continue;
+ return led;
+ }
+ return NULL;
+}
+
+/* Compute the ENCL LED status in CEC list */
+static void compute_encl_status_cec(struct fsp_led_data *encl_led)
+{
+ struct fsp_led_data *led, *next;
+
+ encl_led->status &= ~SPCN_LED_IDENTIFY_MASK;
+ encl_led->status &= ~SPCN_LED_FAULT_MASK;
+
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ if (!strstr(led->loc_code, encl_led->loc_code))
+ continue;
+
+ /* Don't count the enclosure LED itself */
+ if (!strcmp(led->loc_code, encl_led->loc_code))
+ continue;
+
+ if (led->status & SPCN_LED_IDENTIFY_MASK)
+ encl_led->status |= SPCN_LED_IDENTIFY_MASK;
+
+ if (led->status & SPCN_LED_FAULT_MASK)
+ encl_led->status |= SPCN_LED_FAULT_MASK;
+ }
+}
+
+/* Is this an enclosure LED? */
+static bool is_enclosure_led(char *loc_code)
+{
+ if (strstr(loc_code, "-"))
+ return false;
+ if (!fsp_find_cec_led(loc_code) || !fsp_find_encl_led(loc_code))
+ return false;
+ return true;
+}
+
+static inline void opal_led_update_complete(u64 async_token, u64 result)
+{
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(async_token),
+ cpu_to_be64(result));
+}
+
+static inline bool is_sai_loc_code(const char *loc_code)
+{
+ if (!loc_code)
+ return false;
+
+ if (!strncmp(sai_data.loc_code, loc_code, strlen(sai_data.loc_code)))
+ return true;
+
+ return false;
+}
+
+/* Set/Reset System attention indicator */
+static void fsp_set_sai_complete(struct fsp_msg *msg)
+{
+ int ret = OPAL_SUCCESS;
+ int rc = msg->resp->word1 & 0xff00;
+ struct led_set_cmd *spcn_cmd = (struct led_set_cmd *)msg->user_data;
+
+ if (rc) {
+ /**
+ * @fwts-label FSPSAIFailed
+ * @fwts-advice Failed to update System Attention Indicator.
+ * Likely means some bug with OPAL interacting with FSP.
+ */
+ prlog(PR_ERR, "Update SAI cmd failed [rc=%d].\n", rc);
+ ret = OPAL_INTERNAL_ERROR;
+
+ /* Roll back */
+ lock(&sai_lock);
+ sai_data.state = spcn_cmd->ckpt_status;
+ unlock(&sai_lock);
+ }
+
+ if (spcn_cmd->cmd_src == SPCN_SRC_OPAL)
+ opal_led_update_complete(spcn_cmd->async_token, ret);
+
+ /* free msg and spcn command */
+ free(spcn_cmd);
+ fsp_freemsg(msg);
+
+ /* Process pending LED update request */
+ process_led_state_change();
+}
+
+static int fsp_set_sai(struct led_set_cmd *spcn_cmd)
+{
+ int rc = -ENOMEM;
+ uint32_t cmd = FSP_CMD_SA_INDICATOR;
+ struct fsp_msg *msg;
+
+ /*
+ * The FSP does not allow the hypervisor to set the real SAI, but we
+ * can reset it. Also, in our case only the host can control LEDs,
+ * not guests. Hence we set the platform virtual SAI and reset the
+ * real SAI.
+ */
+ if (spcn_cmd->state == LED_STATE_ON)
+ cmd |= FSP_LED_SET_PLAT_SAI;
+ else
+ cmd |= FSP_LED_RESET_REAL_SAI;
+
+ prlog(PR_TRACE, "Update SAI Indicator [cur : 0x%x, new : 0x%x].\n",
+ sai_data.state, spcn_cmd->state);
+
+ msg = fsp_mkmsg(cmd, 0);
+ if (!msg) {
+ /**
+ * @fwts-label SAIMallocFail
+ * @fwts-advice OPAL ran out of memory while trying to
+ * allocate an FSP message in SAI code path. This indicates
+ * an OPAL bug that caused OPAL to run out of memory.
+ */
+ prlog(PR_ERR, "%s: Memory allocation failed.\n", __func__);
+ goto sai_fail;
+ }
+
+ spcn_cmd->ckpt_status = sai_data.state;
+ msg->user_data = spcn_cmd;
+ rc = fsp_queue_msg(msg, fsp_set_sai_complete);
+ if (rc) {
+ fsp_freemsg(msg);
+ /**
+ * @fwts-label SAIQueueFail
+ * @fwts-advice Error in queueing message to FSP in SAI code
+ * path. Likely an OPAL bug.
+ */
+ prlog(PR_ERR, "%s: Failed to queue the message\n", __func__);
+ goto sai_fail;
+ }
+
+ lock(&sai_lock);
+ sai_data.state = spcn_cmd->state;
+ unlock(&sai_lock);
+
+ return OPAL_SUCCESS;
+
+sai_fail:
+ if (spcn_cmd->cmd_src == SPCN_SRC_OPAL)
+ opal_led_update_complete(spcn_cmd->async_token,
+ OPAL_INTERNAL_ERROR);
+
+ return OPAL_INTERNAL_ERROR;
+}
+
+static void fsp_get_sai_complete(struct fsp_msg *msg)
+{
+ int rc = msg->resp->word1 & 0xff00;
+
+ if (rc) {
+ /**
+ * @fwts-label FSPSAIGetFailed
+ * @fwts-advice Possibly an error on FSP side, OPAL failed
+ * to read state from FSP.
+ */
+ prlog(PR_ERR, "Read real SAI cmd failed [rc = 0x%x].\n", rc);
+ } else { /* Update SAI state */
+ lock(&sai_lock);
+ sai_data.state = fsp_msg_get_data_word(msg->resp, 0) & 0xff;
+ unlock(&sai_lock);
+
+ prlog(PR_TRACE, "SAI initial state = 0x%x\n", sai_data.state);
+ }
+
+ fsp_freemsg(msg);
+}
+
+/* Read initial SAI state. */
+static void fsp_get_sai(void)
+{
+ int rc;
+ uint32_t cmd = FSP_CMD_SA_INDICATOR | FSP_LED_READ_REAL_SAI;
+ struct fsp_msg *msg;
+
+ msg = fsp_mkmsg(cmd, 0);
+ if (!msg) {
+ /**
+ * @fwts-label FSPGetSAIMallocFail
+ * @fwts-advice OPAL ran out of memory: OPAL bug.
+ */
+ prlog(PR_ERR, "%s: Memory allocation failed.\n", __func__);
+ return;
+ }
+ rc = fsp_queue_msg(msg, fsp_get_sai_complete);
+ if (rc) {
+ fsp_freemsg(msg);
+ /**
+ * @fwts-label FSPGetSAIQueueFail
+ * @fwts-advice Failed to queue message to FSP: OPAL bug
+ */
+ prlog(PR_ERR, "%s: Failed to queue the message\n", __func__);
+ }
+}
+
+static bool sai_update_notification(struct fsp_msg *msg)
+{
+ uint32_t state = fsp_msg_get_data_word(msg, 2);
+ uint32_t param_id = fsp_msg_get_data_word(msg, 0);
+ int len = fsp_msg_get_data_word(msg, 1) & 0xffff;
+
+ if (param_id != SYS_PARAM_REAL_SAI && param_id != SYS_PARAM_PLAT_SAI)
+ return false;
+
+ if (len != 4)
+ return false;
+
+ if (state != LED_STATE_ON && state != LED_STATE_OFF)
+ return false;
+
+ /* Update SAI state */
+ lock(&sai_lock);
+ sai_data.state = state;
+ unlock(&sai_lock);
+
+ prlog(PR_TRACE, "SAI updated. New SAI state = 0x%x\n", state);
+ return true;
+}
+
+
+/*
+ * Update both local LED lists to reflect the LED state changes made
+ * by the most recent SPCN command. Subsequent LED requests will be
+ * served from these updated lists.
+ */
+static void update_led_list(char *loc_code, u32 led_state, u32 excl_bit)
+{
+ struct fsp_led_data *led = NULL, *encl_led = NULL, *encl_cec_led = NULL;
+ bool is_encl_led = is_enclosure_led(loc_code);
+
+ /* Enclosure LED in CEC list */
+ encl_cec_led = fsp_find_encl_cec_led(loc_code);
+ if (!encl_cec_led) {
+ log_simple_error(&e_info(OPAL_RC_LED_LC),
+ "Could not find enclosure LED in CEC LC=%s\n",
+ loc_code);
+ return;
+ }
+
+ /* Update state */
+ if (is_encl_led) {
+ /* Enclosure exclusive bit */
+ encl_cec_led->excl_bit = excl_bit;
+ } else { /* Descendant LED in CEC list */
+ led = fsp_find_cec_led(loc_code);
+ if (!led) {
+ log_simple_error(&e_info(OPAL_RC_LED_LC),
+ "Could not find descendent LED in \
+ CEC LC=%s\n", loc_code);
+ return;
+ }
+ led->status = led_state;
+ }
+
+ /* Enclosure LED in ENCL list */
+ encl_led = fsp_find_encl_encl_led(loc_code);
+ if (!encl_led) {
+ log_simple_error(&e_info(OPAL_RC_LED_LC),
+ "Could not find enclosure LED in ENCL LC=%s\n",
+ loc_code);
+ return;
+ }
+
+ /* Compute descendent rolled up status */
+ compute_encl_status_cec(encl_cec_led);
+
+ /* Check whether exclusive bits are set */
+ if (encl_cec_led->excl_bit & FSP_LED_EXCL_FAULT)
+ encl_cec_led->status |= SPCN_LED_FAULT_MASK;
+
+ if (encl_cec_led->excl_bit & FSP_LED_EXCL_IDENTIFY)
+ encl_cec_led->status |= SPCN_LED_IDENTIFY_MASK;
+
+ /* Copy over */
+ encl_led->status = encl_cec_led->status;
+ encl_led->excl_bit = encl_cec_led->excl_bit;
+}
+
+static int fsp_set_led_response(uint32_t cmd)
+{
+ struct fsp_msg *msg;
+ int rc = -1;
+
+ msg = fsp_mkmsg(cmd, 0);
+ if (!msg) {
+ prerror("Failed to allocate FSP_RSP_SET_LED_STATE [cmd=%x])\n",
+ cmd);
+ } else {
+ rc = fsp_queue_msg(msg, fsp_freemsg);
+ if (rc != OPAL_SUCCESS) {
+ fsp_freemsg(msg);
+ prerror("Failed to queue FSP_RSP_SET_LED_STATE"
+ " [cmd=%x]\n", cmd);
+ }
+ }
+ return rc;
+}
+
+static void fsp_spcn_set_led_completion(struct fsp_msg *msg)
+{
+ struct fsp_msg *resp = msg->resp;
+ u32 cmd = FSP_RSP_SET_LED_STATE;
+ u8 status = resp->word1 & 0xff00;
+ struct led_set_cmd *spcn_cmd = (struct led_set_cmd *)msg->user_data;
+
+ lock(&led_lock);
+
+ /*
+ * LED state update request came as part of FSP async message
+ * FSP_CMD_SET_LED_STATE, we need to send response message.
+ *
+ * Also if SPCN command failed, then roll back changes.
+ */
+ if (status != FSP_STATUS_SUCCESS) {
+ log_simple_error(&e_info(OPAL_RC_LED_SPCN),
+ "Last SPCN command failed, status=%02x\n",
+ status);
+ cmd |= FSP_STATUS_GENERIC_ERROR;
+
+ /* Rollback the changes */
+ update_led_list(spcn_cmd->loc_code,
+ spcn_cmd->ckpt_status, spcn_cmd->ckpt_excl_bit);
+ }
+
+ /* FSP initiated SPCN command */
+ if (spcn_cmd->cmd_src == SPCN_SRC_FSP)
+ fsp_set_led_response(cmd);
+
+ /* OPAL initiated SPCN command */
+ if (spcn_cmd->cmd_src == SPCN_SRC_OPAL) {
+ if (status != FSP_STATUS_SUCCESS)
+ opal_led_update_complete(spcn_cmd->async_token,
+ OPAL_INTERNAL_ERROR);
+ else
+ opal_led_update_complete(spcn_cmd->async_token,
+ OPAL_SUCCESS);
+ }
+
+ unlock(&led_lock);
+
+ /* free msg and spcn command */
+ free(spcn_cmd);
+ fsp_freemsg(msg);
+
+ /* Process pending LED update request */
+ process_led_state_change();
+}
+
+/*
+ * Set the state of the LED pointed by the location code
+ *
+ * LED command: FAULT state or IDENTIFY state
+ * LED state : OFF (reset) or ON (set)
+ *
+ * SPCN TCE mapped buffer entries for setting LED state
+ *
+ * struct spcn_led_data {
+ * u8 lc_len;
+ * u16 state;
+ * char lc_code[LOC_CODE_SIZE];
+ *};
+ */
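+/*
+ * On the wire the entry is packed as one length byte, the raw location
+ * code bytes (no NUL terminator) and then the 16-bit big-endian state.
+ * For example, a fault-on request for a (hypothetical) location code
+ * "U78C9.001.ABC" carries lc_len = 13, those 13 bytes, and a state word
+ * with SPCN_LED_FAULT_MASK set.
+ */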
+static int fsp_msg_set_led_state(struct led_set_cmd *spcn_cmd)
+{
+ struct spcn_led_data sled;
+ struct fsp_msg *msg = NULL;
+ struct fsp_led_data *led = NULL;
+ void *buf = led_buffer;
+ u16 data_len = 0;
+ u32 cmd_hdr = 0;
+ u32 cmd = FSP_RSP_SET_LED_STATE;
+ int rc = -1;
+
+ memset(sled.lc_code, 0, LOC_CODE_SIZE);
+ sled.lc_len = strlen(spcn_cmd->loc_code);
+ if (sled.lc_len >= LOC_CODE_SIZE)
+ sled.lc_len = LOC_CODE_SIZE - 1;
+ strncpy(sled.lc_code, spcn_cmd->loc_code, LOC_CODE_SIZE - 1);
+
+ lock(&led_lock);
+
+ /* Location code length + Location code + LED control */
+ data_len = LOC_CODE_LEN + sled.lc_len + LED_CONTROL_LEN;
+ cmd_hdr = SPCN_MOD_SET_LED_CTL_LOC_CODE << 24 | SPCN_CMD_SET << 16 |
+ data_len;
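+ /*
+ * cmd_hdr layout: byte 0 is the SPCN modifier, byte 1 the SPCN command
+ * and bytes 2-3 the 16-bit length of the data written to the TCE buffer.
+ */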
+
+ /* Fetch the current state of LED */
+ led = fsp_find_cec_led(spcn_cmd->loc_code);
+
+ /* LED not present */
+ if (led == NULL) {
+ if (spcn_cmd->cmd_src == SPCN_SRC_FSP) {
+ cmd |= FSP_STATUS_INVALID_LC;
+ fsp_set_led_response(cmd);
+ }
+
+ if (spcn_cmd->cmd_src == SPCN_SRC_OPAL)
+ opal_led_update_complete(spcn_cmd->async_token,
+ OPAL_INTERNAL_ERROR);
+
+ unlock(&led_lock);
+ return rc;
+ }
+
+ /*
+ * Checkpoint the status here, will use it if the SPCN
+ * command eventually fails.
+ */
+ spcn_cmd->ckpt_status = led->status;
+ spcn_cmd->ckpt_excl_bit = led->excl_bit;
+ sled.state = cpu_to_be16(led->status);
+
+ /* Update the exclusive LED bits */
+ if (is_enclosure_led(spcn_cmd->loc_code)) {
+ if (spcn_cmd->command == LED_COMMAND_FAULT) {
+ if (spcn_cmd->state == LED_STATE_ON)
+ led->excl_bit |= FSP_LED_EXCL_FAULT;
+ if (spcn_cmd->state == LED_STATE_OFF)
+ led->excl_bit &= ~FSP_LED_EXCL_FAULT;
+ }
+
+ if (spcn_cmd->command == LED_COMMAND_IDENTIFY) {
+ if (spcn_cmd->state == LED_STATE_ON)
+ led->excl_bit |= FSP_LED_EXCL_IDENTIFY;
+ if (spcn_cmd->state == LED_STATE_OFF)
+ led->excl_bit &= ~FSP_LED_EXCL_IDENTIFY;
+ }
+ }
+
+ /* LED FAULT command */
+ if (spcn_cmd->command == LED_COMMAND_FAULT) {
+ if (spcn_cmd->state == LED_STATE_ON)
+ sled.state |= cpu_to_be16(SPCN_LED_FAULT_MASK);
+ if (spcn_cmd->state == LED_STATE_OFF)
+ sled.state &= cpu_to_be16(~SPCN_LED_FAULT_MASK);
+ }
+
+ /* LED IDENTIFY command */
+ if (spcn_cmd->command == LED_COMMAND_IDENTIFY) {
+ if (spcn_cmd->state == LED_STATE_ON)
+ sled.state |= cpu_to_be16(SPCN_LED_IDENTIFY_MASK);
+ if (spcn_cmd->state == LED_STATE_OFF)
+ sled.state &= cpu_to_be16(~SPCN_LED_IDENTIFY_MASK);
+ }
+
+ /* Write into SPCN TCE buffer */
+ buf_write(buf, u8, sled.lc_len); /* Location code length */
+ memcpy(buf, sled.lc_code, sled.lc_len); /* Location code */
+ buf += sled.lc_len;
+ buf_write(buf, __be16, sled.state); /* LED state */
+
+ msg = fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE, cmd_hdr, 0, PSI_DMA_LED_BUF);
+ if (!msg) {
+ cmd |= FSP_STATUS_GENERIC_ERROR;
+ rc = -1;
+ goto update_fail;
+ }
+
+ /*
+ * Update the local lists based on the attempted SPCN command to
+ * set/reset an individual led (CEC or ENCL).
+ */
+ update_led_list(spcn_cmd->loc_code, be16_to_cpu(sled.state), led->excl_bit);
+ msg->user_data = spcn_cmd;
+
+ rc = fsp_queue_msg(msg, fsp_spcn_set_led_completion);
+ if (rc != OPAL_SUCCESS) {
+ cmd |= FSP_STATUS_GENERIC_ERROR;
+ fsp_freemsg(msg);
+ /* Revert LED state update */
+ update_led_list(spcn_cmd->loc_code, spcn_cmd->ckpt_status,
+ spcn_cmd->ckpt_excl_bit);
+ }
+
+update_fail:
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_LED_STATE),
+ "Set led state failed at LC=%s\n",
+ spcn_cmd->loc_code);
+
+ if (spcn_cmd->cmd_src == SPCN_SRC_FSP)
+ fsp_set_led_response(cmd);
+
+ if (spcn_cmd->cmd_src == SPCN_SRC_OPAL)
+ opal_led_update_complete(spcn_cmd->async_token,
+ OPAL_INTERNAL_ERROR);
+ }
+
+ unlock(&led_lock);
+ return rc;
+}
+
+/*
+ * process_led_state_change
+ *
+ * If the command queue is empty, set 'spcn_cmd_complete' to true and
+ * return. Otherwise pop one element from the command queue and process
+ * the requested LED state change.
+ */
+static int process_led_state_change(void)
+{
+ struct led_set_cmd *spcn_cmd;
+ int rc = 0;
+
+ /*
+ * If the command queue is empty (this will only
+ * happen during the SPCN command callback path),
+ * set 'spcn_cmd_complete' to true.
+ */
+ lock(&spcn_cmd_lock);
+ if (list_empty(&spcn_cmdq)) {
+ spcn_cmd_complete = true;
+ unlock(&spcn_cmd_lock);
+ return rc;
+ }
+
+ spcn_cmd = list_pop(&spcn_cmdq, struct led_set_cmd, link);
+ unlock(&spcn_cmd_lock);
+
+ if (is_sai_loc_code(spcn_cmd->loc_code))
+ rc = fsp_set_sai(spcn_cmd);
+ else
+ rc = fsp_msg_set_led_state(spcn_cmd);
+
+ if (rc) {
+ free(spcn_cmd);
+ process_led_state_change();
+ }
+
+ return rc;
+}
+
+/*
+ * queue_led_state_change
+ *
+ * An FSP async command or OPAL based request for an LED state change gets
+ * queued up in the command queue. If no previous SPCN command is pending,
+ * one element is immediately popped from the list and processed. If previous
+ * SPCN commands are still pending, the request is simply queued and we return.
+ * When the SPCN command callback gets to execute, it processes one element
+ * from the list and keeps the chain going. Finally, when there are no elements
+ * left in the command queue, 'spcn_cmd_complete' is set to true again.
+ */
+static int queue_led_state_change(char *loc_code, u8 command,
+ u8 state, int cmd_src, uint64_t async_token)
+{
+ struct led_set_cmd *cmd;
+ int rc = 0;
+
+ /* New request node */
+ cmd = zalloc(sizeof(struct led_set_cmd));
+ if (!cmd) {
+ /**
+ * @fwts-label FSPLEDRequestMallocFail
+ * @fwts-advice OPAL failed to allocate memory for FSP LED
+ * command. Likely an OPAL bug led to out of memory.
+ */
+ prlog(PR_ERR, "SPCN set command node allocation failed\n");
+ return -1;
+ }
+
+ /* Save the request */
+ strncpy(cmd->loc_code, loc_code, LOC_CODE_SIZE - 1);
+ cmd->command = command;
+ cmd->state = state;
+ cmd->cmd_src = cmd_src;
+ cmd->async_token = async_token;
+
+ /* Add to the queue */
+ lock(&spcn_cmd_lock);
+ list_add_tail(&spcn_cmdq, &cmd->link);
+
+ /* No previous SPCN command pending */
+ if (spcn_cmd_complete) {
+ spcn_cmd_complete = false;
+ unlock(&spcn_cmd_lock);
+ rc = process_led_state_change();
+ return rc;
+ }
+
+ unlock(&spcn_cmd_lock);
+ return rc;
+}
+
+/*
+ * Write single location code information into the TCE outbound buffer
+ *
+ * Data layout
+ *
+ * 2 bytes - Length of location code structure
+ * 4 bytes - CCIN in ASCII
+ * 1 byte - Resource status flag
+ * 1 byte - Indicator state
+ * 1 byte - Raw loc code length
+ * 1 byte - Loc code field size
+ * `Field size` bytes - NULL terminated ASCII string padded to a 4 byte boundary
+ *
+ */
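+/*
+ * Each record is written as a full struct fsp_loc_code_data; records are
+ * appended back to back and the caller passes the running 'total_size' so
+ * that the copy is skipped once a record would overflow the outbound
+ * buffer (PSI_DMA_LOC_COD_BUF_SZ).
+ */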
+static u32 fsp_push_data_to_tce(struct fsp_led_data *led, u8 *out_data,
+ u32 total_size)
+{
+ struct fsp_loc_code_data lcode;
+
+ /* CCIN value is irrelevant */
+ lcode.ccin = 0x0;
+
+ lcode.status = FSP_IND_NOT_IMPLMNTD;
+
+ if (led->parms & SPCN_LED_IDENTIFY_MASK)
+ lcode.status = FSP_IND_IMPLMNTD;
+
+ /* LED indicator status */
+ lcode.ind_state = FSP_IND_INACTIVE;
+ if (led->status & SPCN_LED_IDENTIFY_MASK)
+ lcode.ind_state |= FSP_IND_IDENTIFY_ACTV;
+ if (led->status & SPCN_LED_FAULT_MASK)
+ lcode.ind_state |= FSP_IND_FAULT_ACTV;
+
+ /* Location code */
+ memset(lcode.loc_code, 0, LOC_CODE_SIZE);
+ lcode.raw_len = strlen(led->loc_code);
+ strncpy(lcode.loc_code, led->loc_code, LOC_CODE_SIZE - 1);
+ lcode.fld_sz = sizeof(lcode.loc_code);
+
+ /* Rest of the structure */
+ lcode.size = cpu_to_be16(sizeof(lcode));
+ lcode.status &= 0x0f;
+
+ /*
+ * Check for outbound buffer overflow. If this record would not
+ * fit, don't send it across to the FSP; just ignore it.
+ */
+ if ((total_size + be16_to_cpu(lcode.size)) > PSI_DMA_LOC_COD_BUF_SZ)
+ return 0;
+
+ /* Copy over to the buffer */
+ memcpy(out_data, &lcode, sizeof(lcode));
+
+ return be16_to_cpu(lcode.size);
+}
+
+/*
+ * Send out the LED information selected by "req_type"/"loc_code"
+ * to the FSP through the PSI DMA mapping. The buffer layout
+ * described above must be followed.
+ */
+static void fsp_ret_loc_code_list(u16 req_type, char *loc_code)
+{
+ struct fsp_led_data *led, *next;
+ struct fsp_msg *msg;
+
+ u8 *data; /* Start of TCE mapped buffer */
+ u8 *out_data; /* Start of location code data */
+ u32 bytes_sent = 0, total_size = 0;
+ u16 header_size = 0, flags = 0;
+
+ if (loc_code_list_buffer == NULL) {
+ prerror("No loc_code_list_buffer\n");
+ return;
+ }
+
+ /* Init the addresses */
+ data = loc_code_list_buffer;
+ out_data = NULL;
+
+ /* Unmapped later via the FSP_CMD_RET_LED_BUFFER command */
+ fsp_tce_map(PSI_DMA_LOC_COD_BUF, (void *)data, PSI_DMA_LOC_COD_BUF_SZ);
+ out_data = data + 8;
+
+ /* CEC LED list */
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ /*
+ * When the request type is the system wide LED list,
+ * i.e. GET_LC_CMPLT_SYS, send the entire contents
+ * of the CEC list, including all descendants
+ * and all of their enclosures.
+ */
+
+ if (req_type == GET_LC_ENCLOSURES)
+ break;
+
+ if (req_type == GET_LC_ENCL_DESCENDANTS) {
+ if (strstr(led->loc_code, loc_code) == NULL)
+ continue;
+ }
+
+ if (req_type == GET_LC_SINGLE_LOC_CODE) {
+ if (strcmp(led->loc_code, loc_code))
+ continue;
+ }
+
+ /* Push the data into TCE buffer */
+ bytes_sent = fsp_push_data_to_tce(led, out_data, total_size);
+
+ /* Advance the TCE pointer */
+ out_data += bytes_sent;
+ total_size += bytes_sent;
+ }
+
+ /* Enclosure LED list */
+ if (req_type == GET_LC_ENCLOSURES) {
+ list_for_each_safe(&encl_ledq, led, next, link) {
+
+ /* Push the data into TCE buffer */
+ bytes_sent = fsp_push_data_to_tce(led,
+ out_data, total_size);
+
+ /* Advance the TCE pointer */
+ out_data += bytes_sent;
+ total_size += bytes_sent;
+ }
+ }
+
+ /* Count from 'data' instead of 'out_data' */
+ total_size += 8;
+ memcpy(data, &total_size, sizeof(total_size));
+
+ header_size = OUTBUF_HEADER_SIZE;
+ memcpy(data + sizeof(total_size), &header_size, sizeof(header_size));
+
+ if (req_type == GET_LC_ENCL_DESCENDANTS)
+ flags = 0x8000;
+
+ memcpy(data + sizeof(total_size) + sizeof(header_size), &flags,
+ sizeof(flags));
+ msg = fsp_mkmsg(FSP_RSP_GET_LED_LIST, 3, 0,
+ PSI_DMA_LOC_COD_BUF, total_size);
+ if (!msg) {
+ prerror("Failed to allocate FSP_RSP_GET_LED_LIST.\n");
+ } else {
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("Failed to queue FSP_RSP_GET_LED_LIST\n");
+ }
+ }
+}
+
+/*
+ * FSP async command: FSP_CMD_GET_LED_LIST
+ *
+ * (1) FSP sends the list of location codes through inbound buffer
+ * (2) HV sends the status of those location codes through outbound buffer
+ *
+ * Inbound buffer data layout (loc code request structure)
+ *
+ * 2 bytes - Length of entire structure
+ * 2 bytes - Request type
+ * 1 byte - Raw length of location code
+ * 1 byte - Location code field size
+ * `Field size` bytes - NULL terminated ASCII location code string
+ */
+static void fsp_get_led_list(struct fsp_msg *msg)
+{
+ struct fsp_loc_code_req req;
+ u32 tce_token = fsp_msg_get_data_word(msg, 1);
+ void *buf;
+
+ /* Parse inbound buffer */
+ buf = fsp_inbound_buf_from_tce(tce_token);
+ if (!buf) {
+ struct fsp_msg *msg;
+ msg = fsp_mkmsg(FSP_RSP_GET_LED_LIST | FSP_STATUS_INVALID_DATA,
+ 0);
+ if (!msg) {
+ prerror("Failed to allocate FSP_RSP_GET_LED_LIST"
+ " | FSP_STATUS_INVALID_DATA\n");
+ } else {
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("Failed to queue "
+ "FSP_RSP_GET_LED_LIST |"
+ " FSP_STATUS_INVALID_DATA\n");
+ }
+ }
+ return;
+ }
+ memcpy(&req, buf, sizeof(req));
+
+ prlog(PR_TRACE, "Request for loc code list type 0x%04x LC=%s\n",
+ be16_to_cpu(req.req_type), req.loc_code);
+
+ fsp_ret_loc_code_list(be16_to_cpu(req.req_type), req.loc_code);
+}
+
+/*
+ * FSP async command: FSP_CMD_RET_LED_BUFFER
+ *
+ * With this command the FSP returns ownership of the outbound buffer
+ * that Sapphire used to pass the indicator list the previous time. That
+ * way the FSP tells Sapphire that it has consumed all the data present
+ * in the outbound buffer and Sapphire can reuse it for the next request.
+ */
+static void fsp_free_led_list_buf(struct fsp_msg *msg)
+{
+ u32 tce_token = fsp_msg_get_data_word(msg, 1);
+ u32 cmd = FSP_RSP_RET_LED_BUFFER;
+ struct fsp_msg *resp;
+
+ /* Token does not point to outbound buffer */
+ if (tce_token != PSI_DMA_LOC_COD_BUF) {
+ log_simple_error(&e_info(OPAL_RC_LED_BUFF),
+ "Invalid tce token from FSP\n");
+ cmd |= FSP_STATUS_GENERIC_ERROR;
+ resp = fsp_mkmsg(cmd, 0);
+ if (!resp) {
+ prerror("Failed to allocate FSP_RSP_RET_LED_BUFFER"
+ "| FSP_STATUS_GENERIC_ERROR\n");
+ return;
+ }
+
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("Failed to queue "
+ "RET_LED_BUFFER|ERROR\n");
+ }
+ return;
+ }
+
+ /* Unmap the location code DMA buffer */
+ fsp_tce_unmap(PSI_DMA_LOC_COD_BUF, PSI_DMA_LOC_COD_BUF_SZ);
+
+ resp = fsp_mkmsg(cmd, 0);
+ if (!resp) {
+ prerror("Failed to allocate FSP_RSP_RET_LED_BUFFER\n");
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("Failed to queue FSP_RSP_RET_LED_BUFFER\n");
+ }
+}
+
+static void fsp_ret_led_state(char *loc_code)
+{
+ bool found = false;
+ u8 ind_state = 0;
+ u32 cmd = FSP_RSP_GET_LED_STATE;
+ struct fsp_led_data *led, *next;
+ struct fsp_msg *msg;
+
+ if (is_sai_loc_code(loc_code)) {
+ if (sai_data.state & OPAL_SLOT_LED_STATE_ON)
+ ind_state = FSP_IND_FAULT_ACTV;
+ found = true;
+ } else {
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ if (strcmp(loc_code, led->loc_code))
+ continue;
+
+ /* Found the location code */
+ if (led->status & SPCN_LED_IDENTIFY_MASK)
+ ind_state |= FSP_IND_IDENTIFY_ACTV;
+ if (led->status & SPCN_LED_FAULT_MASK)
+ ind_state |= FSP_IND_FAULT_ACTV;
+
+ found = true;
+ break;
+ }
+ }
+
+ /* Location code not found */
+ if (!found) {
+ log_simple_error(&e_info(OPAL_RC_LED_LC),
+ "Could not find the location code LC=%s\n",
+ loc_code);
+ cmd |= FSP_STATUS_INVALID_LC;
+ ind_state = 0xff;
+ }
+
+ msg = fsp_mkmsg(cmd, 1, ind_state);
+ if (!msg) {
+ prerror("Couldn't alloc FSP_RSP_GET_LED_STATE\n");
+ return;
+ }
+
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("Couldn't queue FSP_RSP_GET_LED_STATE\n");
+ }
+}
+
+/*
+ * FSP async command: FSP_CMD_GET_LED_STATE
+ *
+ * With this command the FSP queries the state of any given LED
+ */
+static void fsp_get_led_state(struct fsp_msg *msg)
+{
+ struct fsp_get_ind_state_req req;
+ u32 tce_token = fsp_msg_get_data_word(msg, 1);
+ void *buf;
+
+ /* Parse the inbound buffer */
+ buf = fsp_inbound_buf_from_tce(tce_token);
+ if (!buf) {
+ struct fsp_msg *msg;
+ msg = fsp_mkmsg(FSP_RSP_GET_LED_STATE |
+ FSP_STATUS_INVALID_DATA, 0);
+ if (!msg) {
+ prerror("Failed to allocate FSP_RSP_GET_LED_STATE"
+ " | FSP_STATUS_INVALID_DATA\n");
+ return;
+ }
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("Failed to queue FSP_RSP_GET_LED_STATE"
+ " | FSP_STATUS_INVALID_DATA\n");
+ }
+ return;
+ }
+ memcpy(&req, buf, sizeof(req));
+
+ prlog(PR_TRACE, "%s: tce=0x%08x buf=%p rq.sz=%d rq.lc_len=%d"
+ " rq.fld_sz=%d LC: %02x %02x %02x %02x....\n", __func__,
+ tce_token, buf, req.size, req.lc_len, req.fld_sz,
+ req.loc_code[0], req.loc_code[1],
+ req.loc_code[2], req.loc_code[3]);
+
+ /* Bound check */
+ if (req.lc_len >= LOC_CODE_SIZE) {
+ log_simple_error(&e_info(OPAL_RC_LED_LC),
+ "Loc code too large in %s: %d bytes\n",
+ __func__, req.lc_len);
+ req.lc_len = LOC_CODE_SIZE - 1;
+ }
+ /* Ensure NULL termination */
+ req.loc_code[req.lc_len] = 0;
+
+ /* Do the deed */
+ fsp_ret_led_state(req.loc_code);
+}
+
+/*
+ * FSP async command: FSP_CMD_SET_LED_STATE
+ *
+ * With this command the FSP sets/resets the state of any given LED
+ */
+static void fsp_set_led_state(struct fsp_msg *msg)
+{
+ struct fsp_set_ind_state_req req;
+ struct fsp_led_data *led, *next;
+ u32 tce_token = fsp_msg_get_data_word(msg, 1);
+ bool command, state;
+ void *buf;
+ int rc;
+
+ /* Parse the inbound buffer */
+ buf = fsp_inbound_buf_from_tce(tce_token);
+ if (!buf) {
+ fsp_set_led_response(FSP_RSP_SET_LED_STATE |
+ FSP_STATUS_INVALID_DATA);
+ return;
+ }
+ memcpy(&req, buf, sizeof(req));
+
+ prlog(PR_TRACE, "%s: tce=0x%08x buf=%p rq.sz=%d rq.typ=0x%04x"
+ " rq.lc_len=%d rq.fld_sz=%d LC: %02x %02x %02x %02x....\n",
+ __func__, tce_token, buf, be16_to_cpu(req.size),
+ be16_to_cpu(req.req_type), req.lc_len, req.fld_sz,
+ req.loc_code[0], req.loc_code[1],
+ req.loc_code[2], req.loc_code[3]);
+
+ /* Bound check */
+ if (req.lc_len >= LOC_CODE_SIZE) {
+ log_simple_error(&e_info(OPAL_RC_LED_LC),
+ "Loc code too large in %s: %d bytes\n",
+ __func__, req.lc_len);
+ req.lc_len = LOC_CODE_SIZE - 1;
+ }
+ /* Ensure NULL termination */
+ req.loc_code[req.lc_len] = 0;
+
+ /* Decode command */
+ command = (req.ind_state & LOGICAL_IND_STATE_MASK) ?
+ LED_COMMAND_FAULT : LED_COMMAND_IDENTIFY;
+ state = (req.ind_state & ACTIVE_LED_STATE_MASK) ?
+ LED_STATE_ON : LED_STATE_OFF;
+
+ /* Handle requests */
+ switch (be16_to_cpu(req.req_type)) {
+ case SET_IND_ENCLOSURE:
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ /* Only descendants of the same enclosure */
+ if (!strstr(led->loc_code, req.loc_code))
+ continue;
+
+ /* Skip the enclosure */
+ if (!strcmp(led->loc_code, req.loc_code))
+ continue;
+
+ rc = queue_led_state_change(led->loc_code, command,
+ state, SPCN_SRC_FSP, 0);
+ if (rc != 0)
+ fsp_set_led_response(FSP_RSP_SET_LED_STATE |
+ FSP_STATUS_GENERIC_ERROR);
+ }
+ break;
+ case SET_IND_SINGLE_LOC_CODE:
+ /* Set LED state for a single descendant LED */
+ rc = queue_led_state_change(req.loc_code,
+ command, state, SPCN_SRC_FSP, 0);
+ if (rc != 0)
+ fsp_set_led_response(FSP_RSP_SET_LED_STATE |
+ FSP_STATUS_GENERIC_ERROR);
+ break;
+ default:
+ fsp_set_led_response(FSP_RSP_SET_LED_STATE |
+ FSP_STATUS_NOT_SUPPORTED);
+ break;
+ }
+}
+
+/* Handle received indicator message from FSP */
+static bool fsp_indicator_message(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ u32 cmd;
+ struct fsp_msg *resp;
+
+ /* LED support not available yet */
+ if (led_support != LED_STATE_PRESENT) {
+ log_simple_error(&e_info(OPAL_RC_LED_SUPPORT),
+ "Indicator message while LED support not"
+ " available yet\n");
+ return false;
+ }
+
+ switch (cmd_sub_mod) {
+ case FSP_CMD_GET_LED_LIST:
+ prlog(PR_TRACE, "FSP_CMD_GET_LED_LIST command received\n");
+ fsp_get_led_list(msg);
+ return true;
+ case FSP_CMD_RET_LED_BUFFER:
+ prlog(PR_TRACE, "FSP_CMD_RET_LED_BUFFER command received\n");
+ fsp_free_led_list_buf(msg);
+ return true;
+ case FSP_CMD_GET_LED_STATE:
+ prlog(PR_TRACE, "FSP_CMD_GET_LED_STATE command received\n");
+ fsp_get_led_state(msg);
+ return true;
+ case FSP_CMD_SET_LED_STATE:
+ prlog(PR_TRACE, "FSP_CMD_SET_LED_STATE command received\n");
+ fsp_set_led_state(msg);
+ return true;
+ /*
+ * FSP async sub commands which have not been implemented.
+ * For these async sub commands, log a trace message and ack
+ * the FSP with a generic error.
+ */
+ case FSP_CMD_GET_MTMS_LIST:
+ prlog(PR_TRACE, "FSP_CMD_GET_MTMS_LIST command received\n");
+ cmd = FSP_RSP_GET_MTMS_LIST;
+ break;
+ case FSP_CMD_RET_MTMS_BUFFER:
+ prlog(PR_TRACE, "FSP_CMD_RET_MTMS_BUFFER command received\n");
+ cmd = FSP_RSP_RET_MTMS_BUFFER;
+ break;
+ case FSP_CMD_SET_ENCL_MTMS:
+ prlog(PR_TRACE, "FSP_CMD_SET_MTMS command received\n");
+ cmd = FSP_RSP_SET_ENCL_MTMS;
+ break;
+ case FSP_CMD_CLR_INCT_ENCL:
+ prlog(PR_TRACE, "FSP_CMD_CLR_INCT_ENCL command received\n");
+ cmd = FSP_RSP_CLR_INCT_ENCL;
+ break;
+ case FSP_CMD_ENCL_MCODE_INIT:
+ prlog(PR_TRACE, "FSP_CMD_ENCL_MCODE_INIT command received\n");
+ cmd = FSP_RSP_ENCL_MCODE_INIT;
+ break;
+ case FSP_CMD_ENCL_MCODE_INTR:
+ prlog(PR_TRACE, "FSP_CMD_ENCL_MCODE_INTR command received\n");
+ cmd = FSP_RSP_ENCL_MCODE_INTR;
+ break;
+ case FSP_CMD_ENCL_POWR_TRACE:
+ prlog(PR_TRACE, "FSP_CMD_ENCL_POWR_TRACE command received\n");
+ cmd = FSP_RSP_ENCL_POWR_TRACE;
+ break;
+ case FSP_CMD_RET_ENCL_TRACE_BUFFER:
+ prlog(PR_TRACE, "FSP_CMD_RET_ENCL_TRACE_BUFFER command received\n");
+ cmd = FSP_RSP_RET_ENCL_TRACE_BUFFER;
+ break;
+ case FSP_CMD_GET_SPCN_LOOP_STATUS:
+ prlog(PR_TRACE, "FSP_CMD_GET_SPCN_LOOP_STATUS command received\n");
+ cmd = FSP_RSP_GET_SPCN_LOOP_STATUS;
+ break;
+ case FSP_CMD_INITIATE_LAMP_TEST:
+ /* XXX: FSP ACK not required for this sub command */
+ prlog(PR_TRACE, "FSP_CMD_INITIATE_LAMP_TEST command received\n");
+ return true;
+ default:
+ return false;
+ }
+ cmd |= FSP_STATUS_GENERIC_ERROR;
+ resp = fsp_mkmsg(cmd, 0);
+ if (!resp) {
+ prerror("Failed to allocate FSP_STATUS_GENERIC_ERROR\n");
+ return false;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("Failed to queue FSP_STATUS_GENERIC_ERROR\n");
+ return false;
+ }
+ return true;
+}
+
+/* Indicator class client */
+static struct fsp_client fsp_indicator_client = {
+ .message = fsp_indicator_message,
+};
+
+
+static int fsp_opal_get_sai(__be64 *led_mask, __be64 *led_value)
+{
+ *led_mask |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_ATTN);
+ if (sai_data.state & OPAL_SLOT_LED_STATE_ON)
+ *led_value |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_ATTN);
+
+ return OPAL_SUCCESS;
+}
+
+static int fsp_opal_set_sai(uint64_t async_token, char *loc_code,
+ const u64 led_mask, const u64 led_value)
+{
+ int state = LED_STATE_OFF;
+
+ if (!((led_mask >> OPAL_SLOT_LED_TYPE_ATTN) & OPAL_SLOT_LED_STATE_ON))
+ return OPAL_PARAMETER;
+
+ if ((led_value >> OPAL_SLOT_LED_TYPE_ATTN) & OPAL_SLOT_LED_STATE_ON)
+ state = LED_STATE_ON;
+
+ return queue_led_state_change(loc_code, 0,
+ state, SPCN_SRC_OPAL, async_token);
+}
+
+/*
+ * fsp_opal_leds_get_ind (OPAL_LEDS_GET_INDICATOR)
+ *
+ * Argument Description Updated By
+ * -------- ----------- ----------
+ * loc_code Location code of the LEDs (Host)
+ * led_mask LED types whose status is available (OPAL)
+ * led_value Status of the available LED types (OPAL)
+ * max_led_type Maximum number of supported LED types (Host/OPAL)
+ *
+ * The host will pass the location code of the LED types (loc_code) and
+ * maximum number of LED types it understands (max_led_type). OPAL will
+ * update the 'led_mask' with set bits pointing to LED types whose status
+ * is available and updates the 'led_value' with actual status. OPAL checks
+ * the 'max_led_type' to understand whether the host is newer or older
+ * compared to itself. In the case where the OPAL is newer compared
+ * to host (OPAL's max_led_type > host's max_led_type), it will update
+ * led_mask and led_value according to max_led_type requested by the host.
+ * When the host is newer compared to the OPAL (host's max_led_type >
+ * OPAL's max_led_type), OPAL updates 'max_led_type' to the maximum
+ * number of LED type it understands and updates 'led_mask', 'led_value'
+ * based on that maximum value of LED types.
+ */
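+/*
+ * For example, a host that understands both supported LED types passes
+ * *max_led_type = 2 and, for a regular (non-SAI) location code, gets back
+ * led_mask with OPAL_SLOT_LED_STATE_ON set at the OPAL_SLOT_LED_TYPE_ID
+ * and OPAL_SLOT_LED_TYPE_FAULT positions, with led_value mirroring the
+ * LED's current SPCN status.
+ */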
+static int64_t fsp_opal_leds_get_ind(char *loc_code, __be64 *led_mask,
+ __be64 *led_value, __be64 *max_led_type)
+{
+ bool supported = true;
+ int64_t max;
+ int rc;
+ struct fsp_led_data *led;
+
+ /* FSP not present */
+ if (!fsp_present())
+ return OPAL_HARDWARE;
+
+ /* LED support not available */
+ if (led_support != LED_STATE_PRESENT)
+ return OPAL_HARDWARE;
+
+ max = be64_to_cpu(*max_led_type);
+
+ /* Adjust max LED type */
+ if (max > OPAL_SLOT_LED_TYPE_MAX) {
+ supported = false;
+ max = OPAL_SLOT_LED_TYPE_MAX;
+ *max_led_type = cpu_to_be64(max);
+ }
+
+ /* Invalid parameter */
+ if (max <= 0)
+ return OPAL_PARAMETER;
+
+ /* Get System attention indicator state */
+ if (is_sai_loc_code(loc_code)) {
+ rc = fsp_opal_get_sai(led_mask, led_value);
+ return rc;
+ }
+
+ /* LED not found */
+ led = fsp_find_cec_led(loc_code);
+ if (!led)
+ return OPAL_PARAMETER;
+
+ *led_mask = 0;
+ *led_value = 0;
+
+ /* Identify LED */
+ --max;
+ *led_mask |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_ID);
+ if (led->status & SPCN_LED_IDENTIFY_MASK)
+ *led_value |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_ID);
+
+ /* Fault LED */
+ if (!max)
+ return OPAL_SUCCESS;
+
+ --max;
+ *led_mask |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_FAULT);
+ if (led->status & SPCN_LED_FAULT_MASK)
+ *led_value |= cpu_to_be64(OPAL_SLOT_LED_STATE_ON << OPAL_SLOT_LED_TYPE_FAULT);
+
+ /* OPAL doesn't support all the LED type requested by payload */
+ if (!supported)
+ return OPAL_PARTIAL;
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * fsp_opal_leds_set_ind (OPAL_LEDS_SET_INDICATOR)
+ *
+ * Argument Description Updated By
+ * -------- ----------- ----------
+ * loc_code Location code of the LEDs (Host)
+ * led_mask LED types whose status will be updated (Host)
+ * led_value Requested status of various LED types (Host)
+ * max_led_type Maximum number of supported LED types (Host/OPAL)
+ *
+ * The host will pass the location code of the LED types, mask, value
+ * and maximum number of LED types it understands. OPAL will update
+ * LED status for all the LED types mentioned in the mask with their
+ * value mentioned. OPAL checks the 'max_led_type' to understand
+ * whether the host is newer or older compared to itself. In case where
+ * the OPAL is newer compared to the host (OPAL's max_led_type >
+ * host's max_led_type), it updates LED status based on max_led_type
+ * requested from the host. When the host is newer compared to the OPAL
+ * (host's max_led_type > OPAL's max_led_type), OPAL updates
+ * 'max_led_type' to the maximum number of LED type it understands and
+ * then it updates LED status based on that updated maximum value of LED
+ * types. The host needs to check the returned, updated value of max_led_type
+ * to figure out which part of its request got served and which parts got
+ * ignored.
+ */
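+/*
+ * For example, to request the identify LED on, the host passes led_mask
+ * and led_value both with OPAL_SLOT_LED_STATE_ON at the
+ * OPAL_SLOT_LED_TYPE_ID position; the call queues an SPCN command and
+ * returns OPAL_ASYNC_COMPLETION, with the final status delivered through
+ * opal_led_update_complete() against the supplied async_token.
+ */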
+static int64_t fsp_opal_leds_set_ind(uint64_t async_token,
+ char *loc_code, const u64 led_mask,
+ const u64 led_value, __be64 *max_led_type)
+{
+ bool supported = true;
+ int command, state, rc = OPAL_SUCCESS;
+ int64_t max;
+ struct fsp_led_data *led;
+
+ /* FSP not present */
+ if (!fsp_present())
+ return OPAL_HARDWARE;
+
+ /* LED support not available */
+ if (led_support != LED_STATE_PRESENT)
+ return OPAL_HARDWARE;
+
+ max = be64_to_cpu(*max_led_type);
+
+ /* Adjust max LED type */
+ if (max > OPAL_SLOT_LED_TYPE_MAX) {
+ supported = false;
+ max = OPAL_SLOT_LED_TYPE_MAX;
+ *max_led_type = cpu_to_be64(max);
+ }
+
+ /* Invalid parameter */
+ if (max <= 0)
+ return OPAL_PARAMETER;
+
+ /* Set System attention indicator state */
+ if (is_sai_loc_code(loc_code)) {
+ supported = true;
+ rc = fsp_opal_set_sai(async_token,
+ loc_code, led_mask, led_value);
+ goto success;
+ }
+
+ /* LED not found */
+ led = fsp_find_cec_led(loc_code);
+ if (!led)
+ return OPAL_PARAMETER;
+
+ /* Identify LED mask */
+ --max;
+
+ if ((led_mask >> OPAL_SLOT_LED_TYPE_ID) & OPAL_SLOT_LED_STATE_ON) {
+ supported = true;
+
+ command = LED_COMMAND_IDENTIFY;
+ state = LED_STATE_OFF;
+ if ((led_value >> OPAL_SLOT_LED_TYPE_ID)
+ & OPAL_SLOT_LED_STATE_ON)
+ state = LED_STATE_ON;
+
+ rc = queue_led_state_change(loc_code, command,
+ state, SPCN_SRC_OPAL, async_token);
+ }
+
+ if (!max)
+ goto success;
+
+ /* Fault LED mask */
+ --max;
+ if ((led_mask >> OPAL_SLOT_LED_TYPE_FAULT) & OPAL_SLOT_LED_STATE_ON) {
+ supported = true;
+
+ command = LED_COMMAND_FAULT;
+ state = LED_STATE_OFF;
+ if ((led_value >> OPAL_SLOT_LED_TYPE_FAULT)
+ & OPAL_SLOT_LED_STATE_ON)
+ state = LED_STATE_ON;
+
+ rc = queue_led_state_change(loc_code, command,
+ state, SPCN_SRC_OPAL, async_token);
+ }
+
+success:
+ /* Unsupported LED type */
+ if (!supported)
+ return OPAL_UNSUPPORTED;
+
+ if (rc == OPAL_SUCCESS)
+ rc = OPAL_ASYNC_COMPLETION;
+ else
+ rc = OPAL_INTERNAL_ERROR;
+
+ return rc;
+}
+
+/* Get LED node from device tree */
+static struct dt_node *dt_get_led_node(void)
+{
+ struct dt_node *pled;
+
+ if (!opal_node) {
+ prlog(PR_WARNING, "OPAL parent device node not available\n");
+ return NULL;
+ }
+
+ pled = dt_find_by_path(opal_node, DT_PROPERTY_LED_NODE);
+ if (!pled)
+ prlog(PR_WARNING, "Parent device node not available\n");
+
+ return pled;
+}
+
+/* Get System attention indicator location code from device tree */
+static void dt_get_sai_loc_code(void)
+{
+ struct dt_node *pled, *child;
+ const char *led_type = NULL;
+
+ memset(sai_data.loc_code, 0, LOC_CODE_SIZE);
+
+ pled = dt_get_led_node();
+ if (!pled)
+ return;
+
+ list_for_each(&pled->children, child, list) {
+ led_type = dt_prop_get(child, DT_PROPERTY_LED_TYPES);
+ if (!led_type)
+ continue;
+
+ if (strcmp(led_type, LED_TYPE_ATTENTION))
+ continue;
+
+ memcpy(sai_data.loc_code, child->name, LOC_CODE_SIZE - 1);
+
+ prlog(PR_TRACE, "SAI Location code = %s\n", sai_data.loc_code);
+ return;
+ }
+}
+
+/*
+ * create_led_device_nodes
+ *
+ * Creates the system parent LED device node and all individual
+ * child LED device nodes under it. This is called right before
+ * starting the payload (Linux) to ensure that the SPCN command
+ * sequence to fetch the LED location code list has finished, giving us
+ * a better chance of creating the device nodes.
+ */
+void create_led_device_nodes(void)
+{
+ const char *led_mode = NULL;
+ struct fsp_led_data *led, *next;
+ struct dt_node *pled, *cled;
+
+ if (!fsp_present())
+ return;
+
+ /* Make sure LED list read is completed */
+ while (led_support == LED_STATE_READING)
+ opal_run_pollers();
+
+ if (led_support == LED_STATE_ABSENT) {
+ prlog(PR_WARNING, "LED support not available, \
+ hence device tree nodes will not be created\n");
+ return;
+ }
+
+ /* Get LED node */
+ pled = dt_get_led_node();
+ if (!pled)
+ return;
+
+ /* Check if already populated (fast-reboot) */
+ if (dt_has_node_property(pled, "compatible", NULL))
+ return;
+ dt_add_property_strings(pled, "compatible", DT_PROPERTY_LED_COMPATIBLE);
+
+ led_mode = dt_prop_get(pled, DT_PROPERTY_LED_MODE);
+ if (!led_mode) {
+ prlog(PR_WARNING, "Unknown LED operating mode\n");
+ return;
+ }
+
+ /* LED child nodes */
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ /* Duplicate LED location code */
+ if (dt_find_by_path(pled, led->loc_code)) {
+ prlog(PR_WARNING, "duplicate location code %s\n",
+ led->loc_code);
+ continue;
+ }
+
+ cled = dt_new(pled, led->loc_code);
+ if (!cled) {
+ prlog(PR_WARNING, "Child device node creation "
+ "failed\n");
+ continue;
+ }
+
+ if (!strcmp(led_mode, LED_MODE_LIGHT_PATH))
+ dt_add_property_strings(cled, DT_PROPERTY_LED_TYPES,
+ LED_TYPE_IDENTIFY,
+ LED_TYPE_FAULT);
+ else
+ dt_add_property_strings(cled, DT_PROPERTY_LED_TYPES,
+ LED_TYPE_IDENTIFY);
+ }
+}
+
+/*
+ * Process the received LED data from SPCN
+ *
+ * Every LED state record is added to the CEC list. If the location
+ * code is an enclosure type, it is added to the enclosure list as well.
+ *
+ */
+static void fsp_process_leds_data(u16 len)
+{
+ struct fsp_led_data *led_data = NULL;
+ void *buf = NULL;
+
+ /*
+ * Process the entire captured data from the last command
+ *
+ * The TCE mapped 'led_buffer' contains fsp_led_data structures
+ * one after the other, up to the total length 'len'.
+ *
+ */
+ buf = led_buffer;
+ while (len) {
+ size_t lc_len;
+ __be16 tmp;
+
+ /* Prepare */
+ led_data = zalloc(sizeof(struct fsp_led_data));
+ assert(led_data);
+
+ /* Resource ID */
+ buf_read(buf, __be16, &tmp);
+ led_data->rid = be16_to_cpu(tmp);
+ len -= sizeof(led_data->rid);
+
+ /* Location code length */
+ buf_read(buf, u8, &led_data->lc_len);
+ len -= sizeof(led_data->lc_len);
+
+ lc_len = led_data->lc_len;
+ if (lc_len == 0) {
+ free(led_data);
+ break;
+ }
+
+ if (lc_len >= LOC_CODE_SIZE)
+ lc_len = LOC_CODE_SIZE - 1;
+
+ /* Location code */
+ strncpy(led_data->loc_code, buf, lc_len);
+ led_data->loc_code[lc_len] = '\0';
+
+ buf += led_data->lc_len;
+ len -= led_data->lc_len;
+
+ /* Parameters */
+ buf_read(buf, __be16, &tmp);
+ led_data->parms = be16_to_cpu(tmp);
+ len -= sizeof(led_data->parms);
+
+ /* Status */
+ buf_read(buf, __be16, &tmp);
+ led_data->status = be16_to_cpu(tmp);
+ len -= sizeof(led_data->status);
+
+ /*
+ * This is an enclosure LED's location code; it needs to go
+ * into the enclosure LED list as well.
+ */
+ if (!strstr(led_data->loc_code, "-")) {
+ struct fsp_led_data *encl_led_data = NULL;
+ encl_led_data = zalloc(sizeof(struct fsp_led_data));
+ assert(encl_led_data);
+
+ /* copy over the original */
+ memcpy(encl_led_data, led_data, sizeof(struct fsp_led_data));
+
+ /* Add to the list of enclosure LEDs */
+ list_add_tail(&encl_ledq, &encl_led_data->link);
+ }
+
+ /* Push this onto the list */
+ list_add_tail(&cec_ledq, &led_data->link);
+ }
+}
+
+/* Replay the SPCN command */
+static void replay_spcn_cmd(u32 last_spcn_cmd)
+{
+ u32 cmd_hdr = 0;
+ int rc = -1;
+
+ /* Reached threshold */
+ if (replay == SPCN_REPLAY_THRESHOLD) {
+ replay = 0;
+ led_support = LED_STATE_ABSENT;
+ return;
+ }
+
+ replay++;
+ if (last_spcn_cmd == SPCN_MOD_PRS_LED_DATA_FIRST) {
+ cmd_hdr = SPCN_MOD_PRS_LED_DATA_FIRST << 24 |
+ SPCN_CMD_PRS << 16;
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE,
+ cmd_hdr, 0,
+ PSI_DMA_LED_BUF),
+ fsp_read_leds_data_complete);
+ if (rc)
+ prlog(PR_ERR, "Replay SPCN_MOD_PRS_LED_DATA_FIRST"
+ " command could not be queued\n");
+ }
+
+ if (last_spcn_cmd == SPCN_MOD_PRS_LED_DATA_SUB) {
+ cmd_hdr = SPCN_MOD_PRS_LED_DATA_SUB << 24 | SPCN_CMD_PRS << 16;
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE, cmd_hdr,
+ 0, PSI_DMA_LED_BUF),
+ fsp_read_leds_data_complete);
+ if (rc)
+ prlog(PR_ERR, "Replay SPCN_MOD_PRS_LED_DATA_SUB"
+ " command could not be queued\n");
+ }
+
+ /* Failed to queue MBOX message */
+ if (rc)
+ led_support = LED_STATE_ABSENT;
+}
+
+/*
+ * FSP message response handler for following SPCN LED commands
+ * which are used to fetch all of the LED data from SPCN
+ *
+ * 1. SPCN_MOD_PRS_LED_DATA_FIRST --> First 1KB of LED data
+ * 2. SPCN_MOD_PRS_LED_DATA_SUB --> Subsequent 1KB of LED data
+ *
+ * Once the SPCN_RSP_STATUS_SUCCESS response code has been received
+ * indicating the last batch of 1KB LED data is here, the list addition
+ * process is now complete and we enable LED support for FSP async commands
+ * and for OPAL interface.
+ */
+static void fsp_read_leds_data_complete(struct fsp_msg *msg)
+{
+ struct fsp_led_data *led, *next;
+ struct fsp_msg *resp = msg->resp;
+ u32 cmd_hdr = 0;
+ int rc = 0;
+
+ u32 msg_status = resp->word1 & 0xff00;
+ u32 led_status = (fsp_msg_get_data_word(resp, 1) >> 24) & 0xff;
+ u16 data_len = (u16)(fsp_msg_get_data_word(resp, 1) & 0xffff);
+
+ if (msg_status != FSP_STATUS_SUCCESS) {
+ log_simple_error(&e_info(OPAL_RC_LED_SUPPORT),
+ "FSP returned error %x LED not supported\n",
+ msg_status);
+ /* LED support not available */
+ led_support = LED_STATE_ABSENT;
+
+ fsp_freemsg(msg);
+ return;
+ }
+
+ /* SPCN command status */
+ switch (led_status) {
+ /* Last 1KB of LED data */
+ case SPCN_RSP_STATUS_SUCCESS:
+ prlog(PR_DEBUG, "SPCN_RSP_STATUS_SUCCESS: %d bytes received\n",
+ data_len);
+
+ led_support = LED_STATE_PRESENT;
+
+ /* Copy data to the local list */
+ fsp_process_leds_data(data_len);
+
+ /* LEDs captured on the system */
+ prlog(PR_DEBUG, "CEC LEDs captured on the system:\n");
+ list_for_each_safe(&cec_ledq, led, next, link) {
+ prlog(PR_DEBUG,
+ "rid: %x\t"
+ "len: %x "
+ "lcode: %-30s\t"
+ "parms: %04x\t"
+ "status: %04x\n",
+ led->rid,
+ led->lc_len,
+ led->loc_code,
+ led->parms,
+ led->status);
+ }
+
+ prlog(PR_DEBUG, "ENCL LEDs captured on the system:\n");
+ list_for_each_safe(&encl_ledq, led, next, link) {
+ prlog(PR_DEBUG,
+ "rid: %x\t"
+ "len: %x "
+ "lcode: %-30s\t"
+ "parms: %04x\t"
+ "status: %04x\n",
+ led->rid,
+ led->lc_len,
+ led->loc_code,
+ led->parms,
+ led->status);
+ }
+
+ break;
+
+ /* If more 1KB of LED data present */
+ case SPCN_RSP_STATUS_COND_SUCCESS:
+ prlog(PR_DEBUG, "SPCN_RSP_STATUS_COND_SUCCESS: %d bytes "
+ " received\n", data_len);
+
+ /* Copy data to the local list */
+ fsp_process_leds_data(data_len);
+
+ /* Fetch the remaining data from SPCN */
+ last_spcn_cmd = SPCN_MOD_PRS_LED_DATA_SUB;
+ cmd_hdr = SPCN_MOD_PRS_LED_DATA_SUB << 24 | SPCN_CMD_PRS << 16;
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE,
+ cmd_hdr, 0, PSI_DMA_LED_BUF),
+ fsp_read_leds_data_complete);
+ if (rc) {
+ prlog(PR_ERR, "SPCN_MOD_PRS_LED_DATA_SUB command"
+ " could not be queued\n");
+
+ led_support = LED_STATE_ABSENT;
+ }
+ break;
+
+ /* Other expected error codes*/
+ case SPCN_RSP_STATUS_INVALID_RACK:
+ case SPCN_RSP_STATUS_INVALID_SLAVE:
+ case SPCN_RSP_STATUS_INVALID_MOD:
+ case SPCN_RSP_STATUS_STATE_PROHIBIT:
+ case SPCN_RSP_STATUS_UNKNOWN:
+ default:
+ /* Replay the previous SPCN command */
+ replay_spcn_cmd(last_spcn_cmd);
+ }
+ fsp_freemsg(msg);
+}
+
+/*
+ * Init the LED state
+ *
+ * This is called during the host boot process. This is the place where
+ * we figure out all the LEDs present on the system and their state, then
+ * create structures out of that information and populate two master lists:
+ * one for all the LEDs on the CEC and one for all the LEDs on the enclosures.
+ * The LED information contained in the lists caters both to various
+ * FSP initiated async commands and to POWERNV initiated OPAL calls. We need
+ * to make sure this initialization process is complete before allowing any
+ * requests on LEDs. This also needs to be called to re-fetch data from SPCN
+ * after any LED state has been updated.
+ */
+static void fsp_leds_query_spcn(void)
+{
+ struct fsp_led_data *led = NULL;
+ int rc = 0;
+
+ u32 cmd_hdr = SPCN_MOD_PRS_LED_DATA_FIRST << 24 | SPCN_CMD_PRS << 16;
+
+ /* Till the last batch of LED data */
+ last_spcn_cmd = 0;
+
+ /* Empty the lists */
+ while (!list_empty(&cec_ledq)) {
+ led = list_pop(&cec_ledq, struct fsp_led_data, link);
+ free(led);
+ }
+
+ while (!list_empty(&encl_ledq)) {
+ led = list_pop(&encl_ledq, struct fsp_led_data, link);
+ free(led);
+ }
+
+ /* Allocate buffer with alignment requirements */
+ if (led_buffer == NULL) {
+ led_buffer = memalign(TCE_PSIZE, PSI_DMA_LED_BUF_SZ);
+ if (!led_buffer)
+ return;
+ }
+
+ /* TCE mapping - will not unmap */
+ fsp_tce_map(PSI_DMA_LED_BUF, led_buffer, PSI_DMA_LED_BUF_SZ);
+
+ /* Request the first 1KB of LED data */
+ last_spcn_cmd = SPCN_MOD_PRS_LED_DATA_FIRST;
+ rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE, cmd_hdr, 0,
+ PSI_DMA_LED_BUF), fsp_read_leds_data_complete);
+ if (rc)
+ prlog(PR_ERR,
+ "SPCN_MOD_PRS_LED_DATA_FIRST command could"
+ " not be queued\n");
+ else /* Initiated LED list fetch MBOX command */
+ led_support = LED_STATE_READING;
+}
+
+/* Init the LED subsystem at boot time */
+void fsp_led_init(void)
+{
+ led_buffer = NULL;
+
+ if (!fsp_present())
+ return;
+
+ /* Init the master lists */
+ list_head_init(&cec_ledq);
+ list_head_init(&encl_ledq);
+ list_head_init(&spcn_cmdq);
+
+ fsp_leds_query_spcn();
+
+ loc_code_list_buffer = memalign(TCE_PSIZE, PSI_DMA_LOC_COD_BUF_SZ);
+ if (loc_code_list_buffer == NULL)
+ prerror("ERROR: Unable to allocate loc_code_list_buffer!\n");
+
+ prlog(PR_TRACE, "Init completed\n");
+
+ /* Get System attention indicator state */
+ dt_get_sai_loc_code();
+ fsp_get_sai();
+
+ /* Handle FSP initiated async LED commands */
+ fsp_register_client(&fsp_indicator_client, FSP_MCLASS_INDICATOR);
+ prlog(PR_TRACE, "FSP async command client registered\n");
+
+ /* Register for SAI update notification */
+ sysparam_add_update_notifier(sai_update_notification);
+
+ opal_register(OPAL_LEDS_GET_INDICATOR, fsp_opal_leds_get_ind, 4);
+ opal_register(OPAL_LEDS_SET_INDICATOR, fsp_opal_leds_set_ind, 5);
+ prlog(PR_TRACE, "LED OPAL interface registered\n");
+}
diff --git a/roms/skiboot/hw/fsp/fsp-mem-err.c b/roms/skiboot/hw/fsp/fsp-mem-err.c
new file mode 100644
index 000000000..2e3e65401
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-mem-err.c
@@ -0,0 +1,401 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Sometimes some memory needs to go and sit in the naughty corner
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "FSPMEMERR: " fmt
+#include <skiboot.h>
+#include <opal.h>
+#include <opal-msg.h>
+#include <lock.h>
+#include <fsp.h>
+#include <errorlog.h>
+
+/* FSP sends real address of 4K memory page. */
+#define MEM_ERR_PAGE_SIZE_4K (1UL << 12)
+
+/* Maximum number of error events to hold until Linux consumes them. */
+#define MERR_MAX_RECORD 1024
+
+struct fsp_mem_err_node {
+ struct list_node list;
+ struct OpalMemoryErrorData data;
+};
+
+static LIST_HEAD(merr_free_list);
+static LIST_HEAD(mem_error_list);
+/*
+ * This lock protects concurrent updates of the merr_free_list and
+ * mem_error_list lists.
+ */
+static struct lock mem_err_lock = LOCK_UNLOCKED;
+
+DEFINE_LOG_ENTRY(OPAL_RC_MEM_ERR_RES, OPAL_PLATFORM_ERR_EVT, OPAL_MEM_ERR,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_MEM_ERR_DEALLOC, OPAL_PLATFORM_ERR_EVT, OPAL_MEM_ERR,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+static bool send_response_to_fsp(u32 cmd_sub_mod)
+{
+ struct fsp_msg *rsp;
+ int rc = -ENOMEM;
+
+ rsp = fsp_mkmsg(cmd_sub_mod, 0);
+ if (rsp)
+ rc = fsp_queue_msg(rsp, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(rsp);
+ /* XXX Generate error logs */
+ prerror("Error %d queueing FSP memory error reply\n", rc);
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Queue up the memory error message for delivery.
+ *
+ * queue_event_for_delivery gets called from two places:
+ * 1) from queue_mem_err_node when a new FSP memory error is available and
+ * 2) from the completion callback indicating that Linux has consumed a message.
+ *
+ * TODO:
+ * There is a chance that we may not get a free slot to queue our event
+ * for delivery to Linux during either of the above invocations. In that
+ * case we end up holding events until the next FSP memory error comes in.
+ * We need to address this case either here OR fix up the messaging
+ * infrastructure to make sure at least one slot is always available per
+ * message type.
+ *
+ * XXX: BenH: I changed the msg infrastructure to attempt an allocation
+ * in that case, at least until we clarify a bit better how
+ * we want to handle things.
+ */
+static void queue_event_for_delivery(void *data __unused, int status __unused)
+{
+ struct fsp_mem_err_node *entry;
+ uint64_t *merr_data;
+ int rc;
+
+ lock(&mem_err_lock);
+ entry = list_pop(&mem_error_list, struct fsp_mem_err_node, list);
+ unlock(&mem_err_lock);
+
+ if (!entry)
+ return;
+
+ /*
+ * struct OpalMemoryErrorData is of (4 * 64 bits) size and well packed
+ * structure. Hence use uint64_t pointer to pass entire structure
+ * using 4 params in generic message format.
+ */
+ merr_data = (uint64_t *)&entry->data;
+
+ /* queue up for delivery */
+ rc = opal_queue_msg(OPAL_MSG_MEM_ERR, NULL, queue_event_for_delivery,
+ cpu_to_be64(merr_data[0]),
+ cpu_to_be64(merr_data[1]),
+ cpu_to_be64(merr_data[2]),
+ cpu_to_be64(merr_data[3]));
+ lock(&mem_err_lock);
+ if (rc) {
+ /*
+ * Failed to queue up the event for delivery. No free slot
+ * available. There is a chance that we are trying to queue
+ * up multiple event at the same time. We may already have
+ * at least one event queued up, in that case we will be
+ * called again through completion callback and we should
+ * be able to grab empty slot then.
+ *
+ * For now, put this node back on mem_error_list.
+ */
+ list_add(&mem_error_list, &entry->list);
+ } else
+ list_add(&merr_free_list, &entry->list);
+ unlock(&mem_err_lock);
+}
+
+static int queue_mem_err_node(struct OpalMemoryErrorData *merr_evt)
+{
+ struct fsp_mem_err_node *entry;
+
+ lock(&mem_err_lock);
+ entry = list_pop(&merr_free_list, struct fsp_mem_err_node, list);
+ if (!entry) {
+ printf("Failed to queue up memory error event.\n");
+ unlock(&mem_err_lock);
+ return -ENOMEM;
+ }
+
+ entry->data = *merr_evt;
+ list_add(&mem_error_list, &entry->list);
+ unlock(&mem_err_lock);
+
+ /* Queue up the event for delivery to OS. */
+ queue_event_for_delivery(NULL, OPAL_SUCCESS);
+ return 0;
+}
+
+/* Check if memory resilience event for same address already exists. */
+static bool is_resilience_event_exist(u64 paddr)
+{
+ struct fsp_mem_err_node *entry;
+ struct OpalMemoryErrorData *merr_evt;
+ int found = 0;
+
+ lock(&mem_err_lock);
+ list_for_each(&mem_error_list, entry, list) {
+ merr_evt = &entry->data;
+ if ((merr_evt->type == OPAL_MEM_ERR_TYPE_RESILIENCE) &&
+ (be64_to_cpu(merr_evt->u.resilience.physical_address_start)
+ == paddr)) {
+ found = 1;
+ break;
+ }
+ }
+ unlock(&mem_err_lock);
+ return !!found;
+}
+
+/*
+ * Handle a Memory Resilience error message.
+ * Section 28.2 of Hypervisor to FSP Mailbox Interface Specification.
+ *
+ * The flow for a Memory Resilience Event is:
+ * 1. The PRD component in the FSP gets a recoverable attention from hardware
+ * when there is a correctable/uncorrectable memory error and a page needs
+ * to be freed up.
+ * 2. PRD sends a Memory Resilience Command to the hypervisor with the real
+ * address of the 4K memory page in which the error occurred.
+ * 3. The hypervisor acknowledges with a status immediately. Immediate
+ * acknowledgment doesn't require the freeing of the page to be completed.
+ */
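+/*
+ * The immediate acknowledgement is the FSP_RSP_MEM_RES response queued
+ * below; the affected page is dealt with later by the OS once it consumes
+ * the OPAL_MSG_MEM_ERR message carrying this event.
+ */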
+static bool handle_memory_resilience(u32 cmd_sub_mod, u64 paddr)
+{
+ int rc = 0;
+ struct OpalMemoryErrorData mem_err_evt;
+ struct errorlog *buf;
+
+ memset(&mem_err_evt, 0, sizeof(struct OpalMemoryErrorData));
+ /* Check arguments */
+ if (paddr == 0) {
+ prerror("memory resilience: Invalid real address.\n");
+ return send_response_to_fsp(FSP_RSP_MEM_RES |
+ FSP_STATUS_GENERIC_ERROR);
+ }
+
+ /* Check if event already exist for same address. */
+ if (is_resilience_event_exist(paddr))
+ goto send_response;
+
+ /* Populate an event. */
+ mem_err_evt.version = OpalMemErr_V1;
+ mem_err_evt.type = OPAL_MEM_ERR_TYPE_RESILIENCE;
+
+ switch (cmd_sub_mod) {
+ case FSP_CMD_MEM_RES_CE:
+ /*
+ * Should we keep a counter for corrected errors in
+ * sapphire OR let Linux (PowerNV) handle it?
+ *
+ * For now, send corrected errors to Linux and let
+ * Linux handle corrected error thresholding.
+ */
+ mem_err_evt.flags |= cpu_to_be16(OPAL_MEM_CORRECTED_ERROR);
+ mem_err_evt.u.resilience.resil_err_type =
+ OPAL_MEM_RESILIENCE_CE;
+ break;
+ case FSP_CMD_MEM_RES_UE:
+ mem_err_evt.u.resilience.resil_err_type =
+ OPAL_MEM_RESILIENCE_UE;
+ break;
+ case FSP_CMD_MEM_RES_UE_SCRB:
+ mem_err_evt.u.resilience.resil_err_type =
+ OPAL_MEM_RESILIENCE_UE_SCRUB;
+ break;
+ }
+ mem_err_evt.u.resilience.physical_address_start = cpu_to_be64(paddr);
+ mem_err_evt.u.resilience.physical_address_end =
+ cpu_to_be64(paddr + MEM_ERR_PAGE_SIZE_4K);
+
+ /* Queue up the event and inform OS about it. */
+ rc = queue_mem_err_node(&mem_err_evt);
+
+send_response:
+ /* Queue up an OK response to the resilience message itself */
+ if (!rc)
+ return send_response_to_fsp(FSP_RSP_MEM_RES);
+ else {
+ buf = opal_elog_create(&e_info(OPAL_RC_MEM_ERR_RES), 0);
+ log_append_msg(buf,
+ "OPAL_MEM_ERR: Cannot queue up memory "
+ "resilience error event to the OS");
+ log_add_section(buf, OPAL_ELOG_SEC_DESC);
+ log_append_data(buf, (char *) &mem_err_evt,
+ sizeof(struct OpalMemoryErrorData));
+ log_commit(buf);
+ return false;
+ }
+}
+
+/* Update the existing event entry if a match is found. */
+static bool update_memory_deallocation_event(u64 paddr_start, u64 paddr_end)
+{
+ struct fsp_mem_err_node *entry;
+ struct OpalMemoryErrorData *merr_evt;
+ int found = 0;
+
+ lock(&mem_err_lock);
+ list_for_each(&mem_error_list, entry, list) {
+ merr_evt = &entry->data;
+ if ((merr_evt->type == OPAL_MEM_ERR_TYPE_DYN_DALLOC) &&
+ (be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_start)
+ == paddr_start)) {
+ found = 1;
+ if (be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_end)
+ < paddr_end)
+ merr_evt->u.dyn_dealloc.physical_address_end =
+ cpu_to_be64(paddr_end);
+ break;
+ }
+ }
+ unlock(&mem_err_lock);
+ return !!found;
+}
+
+/*
+ * Handle dynamic memory deallocation message.
+ *
+ * When a condition occurs in which we need to do a large scale memory
+ * deallocation, PRD will send a starting and ending address of an area of
+ * memory to the hypervisor. The hypervisor then needs to use this to
+ * deallocate all pages between and including those addresses.
+ *
+ */
+static bool handle_memory_deallocation(u64 paddr_start, u64 paddr_end)
+{
+ int rc = 0;
+ u8 err = 0;
+ struct OpalMemoryErrorData mem_err_evt;
+ struct errorlog *buf;
+
+ memset(&mem_err_evt, 0, sizeof(struct OpalMemoryErrorData));
+ /* Check arguments */
+ if ((paddr_start == 0) || (paddr_end == 0)) {
+ prerror("memory deallocation: Invalid "
+ "starting/ending real address.\n");
+ err = FSP_STATUS_GENERIC_ERROR;
+ }
+
+ /* If we had an error, send response to fsp and return */
+ if (err)
+ return send_response_to_fsp(FSP_RSP_MEM_DYN_DEALLOC | err);
+
+ /*
+ * The FSP can send dynamic memory deallocation messages multiple times
+ * for the same address/address ranges. Hence check and update if we
+ * already have the same event queued.
+ */
+ if (update_memory_deallocation_event(paddr_start, paddr_end))
+ goto send_response;
+
+ /* Populate a new event. */
+ mem_err_evt.version = OpalMemErr_V1;
+ mem_err_evt.type = OPAL_MEM_ERR_TYPE_DYN_DALLOC;
+ mem_err_evt.u.dyn_dealloc.dyn_err_type =
+ OPAL_MEM_DYNAMIC_DEALLOC;
+ mem_err_evt.u.dyn_dealloc.physical_address_start = cpu_to_be64(paddr_start);
+ mem_err_evt.u.dyn_dealloc.physical_address_end = cpu_to_be64(paddr_end);
+
+ /* Queue up the event and inform OS about it. */
+ rc = queue_mem_err_node(&mem_err_evt);
+
+send_response:
+ /* Queue up an OK response to the memory deallocation message itself */
+ if (!rc)
+ return send_response_to_fsp(FSP_RSP_MEM_DYN_DEALLOC);
+ else {
+ buf = opal_elog_create(&e_info(OPAL_RC_MEM_ERR_DEALLOC), 0);
+ log_append_msg(buf,
+ "OPAL_MEM_ERR: Cannot queue up memory "
+ "deallocation error event to the OS");
+ log_add_section(buf, OPAL_ELOG_SEC_DESC);
+ log_append_data(buf, (char *)&mem_err_evt,
+ sizeof(struct OpalMemoryErrorData));
+ log_commit(buf);
+ return false;
+ }
+}
+
+/* Receive memory error messages and handle them. */
+static bool fsp_mem_err_msg(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ u64 paddr_start, paddr_end;
+
+ printf("Received 0x%08ux command\n", cmd_sub_mod);
+ switch (cmd_sub_mod) {
+ case FSP_CMD_MEM_RES_CE:
+ case FSP_CMD_MEM_RES_UE:
+ case FSP_CMD_MEM_RES_UE_SCRB:
+ /*
+ * We get the memory resilience command from the FSP for
+ * correctable/uncorrectable/scrub UE errors with the real
+ * address of the 4K memory page in which the error occurred.
+ */
+ paddr_start = be64_to_cpu(*((__be64 *)&msg->data.bytes[0]));
+ printf("Got memory resilience error message for "
+ "paddr=0x%016llux\n", paddr_start);
+ return handle_memory_resilience(cmd_sub_mod, paddr_start);
+ case FSP_CMD_MEM_DYN_DEALLOC:
+ paddr_start = be64_to_cpu(*((__be64 *)&msg->data.bytes[0]));
+ paddr_end = be64_to_cpu(*((__be64 *)&msg->data.bytes[8]));
+ printf("Got dynamic memory deallocation message: "
+ "paddr_start=0x%016llux, paddr_end=0x%016llux\n",
+ paddr_start, paddr_end);
+ return handle_memory_deallocation(paddr_start, paddr_end);
+ }
+ return false;
+}
+
+/*
+ * Pre-allocate memory to hold a maximum of MERR_MAX_RECORD memory error
+ * events until Linux consumes them.
+ */
+static int init_merr_free_list(uint32_t num_entries)
+{
+ struct fsp_mem_err_node *entry;
+ int i;
+
+ entry = zalloc(sizeof(struct fsp_mem_err_node) * num_entries);
+ if (!entry)
+ return -ENOMEM;
+
+ for (i = 0; i < num_entries; ++i, entry++)
+ list_add_tail(&merr_free_list, &entry->list);
+
+ return 0;
+}
+
+static struct fsp_client fsp_mem_err_client = {
+ .message = fsp_mem_err_msg,
+};
+
+void fsp_memory_err_init(void)
+{
+ int rc;
+
+ printf("Intializing fsp memory handling.\n");
+ /* If we have an FSP, register for notifications */
+ if (!fsp_present())
+ return;
+
+ /* Pre-allocate memory for MERR_MAX_RECORD records */
+ rc = init_merr_free_list(MERR_MAX_RECORD);
+ if (rc < 0)
+ return;
+
+ fsp_register_client(&fsp_mem_err_client, FSP_MCLASS_MEMORY_ERR);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-nvram.c b/roms/skiboot/hw/fsp/fsp-nvram.c
new file mode 100644
index 000000000..aa17cb5e7
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-nvram.c
@@ -0,0 +1,424 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Read/Write NVRAM from/to FSP
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <opal.h>
+#include <lock.h>
+#include <device.h>
+#include <errorlog.h>
+
+/*
+ * The FSP NVRAM API operates in "blocks" of 4K. It is entirely exposed
+ * to the OS via the OPAL APIs.
+ *
+ * In order to avoid dealing with complicated read/modify/write state
+ * machines (and added issues related to FSP failover in the middle)
+ * we keep a memory copy of the entire nvram which we load at boot
+ * time. We save only modified blocks.
+ *
+ * To limit the amount of memory used by the nvram image, we limit
+ * how much nvram we support to NVRAM_SIZE. Additionally, this limit
+ * of 1M is the maximum that the CHRP/PAPR nvram partition format
+ * supports for a partition entry.
+ *
+ * (Q: should we save the whole thing in case of FSP failover ?)
+ *
+ * The nvram is expected to comply with the CHRP/PAPR defined format,
+ * and specifically contain a System partition (ID 0x70) named "common"
+ * with configuration variables for the bootloader and a FW private
+ * partition for future use by skiboot.
+ *
+ * If the partition layout appears broken or lacks one of the above
+ * partitions, we reformat the entire nvram at boot time.
+ *
+ * We do not exploit the ability of the FSP to store a checksum. This
+ * is documented as possibly going away. The CHRP format for nvram
+ * that Linux uses has its own (though weak) checksum mechanism already.
+ *
+ */
+
+#define NVRAM_BLKSIZE 0x1000
+
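+/*
+ * Write descriptor handed to the FSP through PSI_DMA_NVRAM_TRIPL: the DMA
+ * address of the data, the starting 4K block offset within the nvram image
+ * and the number of 4K blocks to write (see fsp_nvram_send_write()).
+ */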
+struct nvram_triplet {
+ __be64 dma_addr;
+ __be32 blk_offset;
+ __be32 blk_count;
+} __packed;
+
+#define NVRAM_FLAG_CLEAR_WPEND 0x80000000
+
+enum nvram_state {
+ NVRAM_STATE_CLOSED,
+ NVRAM_STATE_OPENING,
+ NVRAM_STATE_BROKEN,
+ NVRAM_STATE_OPEN,
+ NVRAM_STATE_ABSENT,
+};
+
+static void *fsp_nvram_image;
+static uint32_t fsp_nvram_size;
+static struct lock fsp_nvram_lock = LOCK_UNLOCKED;
+static struct fsp_msg *fsp_nvram_msg;
+static uint32_t fsp_nvram_dirty_start;
+static uint32_t fsp_nvram_dirty_end;
+static bool fsp_nvram_was_read;
+static struct nvram_triplet fsp_nvram_triplet __align(0x1000);
+static enum nvram_state fsp_nvram_state = NVRAM_STATE_CLOSED;
+
+DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_INIT, OPAL_PLATFORM_ERR_EVT , OPAL_NVRAM,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_OPEN, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_SIZE, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_READ, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_WRITE, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+static void fsp_nvram_send_write(void);
+
+static void fsp_nvram_wr_complete(struct fsp_msg *msg)
+{
+ struct fsp_msg *resp = msg->resp;
+ uint8_t rc;
+
+ lock(&fsp_nvram_lock);
+ fsp_nvram_msg = NULL;
+
+ /* Check for various errors. If an error occurred,
+ * we generally assume the nvram is completely dirty
+ * but we won't trigger a new write until we get
+ * either a new attempt at writing, or an FSP reset
+ * reload (TODO)
+ */
+ if (!resp || resp->state != fsp_msg_response)
+ goto fail_dirty;
+ rc = (msg->word1 >> 8) & 0xff;
+ switch(rc) {
+ case 0:
+ case 0x44:
+ /* Sync to secondary required... XXX */
+ case 0x45:
+ break;
+ case 0xef:
+ /* Sync to secondary failed, let's ignore that for now,
+ * maybe when (if) we handle redundant FSPs ...
+ */
+ prerror("FSP: NVRAM sync to secondary failed\n");
+ break;
+ default:
+ log_simple_error(&e_info(OPAL_RC_NVRAM_WRITE),
+ "FSP: NVRAM write return error 0x%02x\n", rc);
+ goto fail_dirty;
+ }
+ fsp_freemsg(msg);
+ if (fsp_nvram_dirty_start <= fsp_nvram_dirty_end)
+ fsp_nvram_send_write();
+ unlock(&fsp_nvram_lock);
+ return;
+ fail_dirty:
+ fsp_nvram_dirty_start = 0;
+ fsp_nvram_dirty_end = fsp_nvram_size - 1;
+ fsp_freemsg(msg);
+ unlock(&fsp_nvram_lock);
+}
+
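+/*
+ * Send a single write covering the current dirty range and reset the
+ * dirty markers to "nothing dirty" (start > end). Must be called with
+ * fsp_nvram_lock held; if the write later fails, the completion
+ * handler marks the whole nvram dirty again.
+ */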
+static void fsp_nvram_send_write(void)
+{
+ uint32_t start = fsp_nvram_dirty_start;
+ uint32_t end = fsp_nvram_dirty_end;
+ uint32_t count;
+
+ if (start > end || fsp_nvram_state != NVRAM_STATE_OPEN)
+ return;
+ count = (end - start) / NVRAM_BLKSIZE + 1;
+ fsp_nvram_triplet.dma_addr = cpu_to_be64(PSI_DMA_NVRAM_BODY + start);
+ fsp_nvram_triplet.blk_offset = cpu_to_be32(start / NVRAM_BLKSIZE);
+ fsp_nvram_triplet.blk_count = cpu_to_be32(count);
+ fsp_nvram_msg = fsp_mkmsg(FSP_CMD_WRITE_VNVRAM, 6,
+ 0, PSI_DMA_NVRAM_TRIPL, 1,
+ NVRAM_FLAG_CLEAR_WPEND, 0, 0);
+ if (fsp_queue_msg(fsp_nvram_msg, fsp_nvram_wr_complete)) {
+ fsp_freemsg(fsp_nvram_msg);
+ fsp_nvram_msg = NULL;
+ log_simple_error(&e_info(OPAL_RC_NVRAM_WRITE),
+ "FSP: Error queueing nvram update\n");
+ return;
+ }
+ fsp_nvram_dirty_start = fsp_nvram_size;
+ fsp_nvram_dirty_end = 0;
+}
+
+static void fsp_nvram_rd_complete(struct fsp_msg *msg)
+{
+ int64_t rc;
+
+ lock(&fsp_nvram_lock);
+
+ /* Read complete, check status. What to do if the read fails?
+ *
+ * Well, there could be various reasons, such as an FSP reboot
+ * at the wrong time, but there is really not much we can do,
+ * so for now we just mark the nvram as closed and attempt a
+ * re-open and re-read whenever the OS tries to access it.
+ */
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_nvram_msg = NULL;
+ fsp_freemsg(msg);
+ if (rc) {
+ prerror("FSP: NVRAM read failed, will try again later\n");
+ fsp_nvram_state = NVRAM_STATE_CLOSED;
+ } else {
+ /* nvram was read once, no need to do it ever again */
+ fsp_nvram_was_read = true;
+ fsp_nvram_state = NVRAM_STATE_OPEN;
+
+ /* XXX Here we should look for nvram settings that concern
+ * us such as guest kernel arguments etc...
+ */
+ }
+ unlock(&fsp_nvram_lock);
+ nvram_read_complete(fsp_nvram_state == NVRAM_STATE_OPEN);
+ if (fsp_nvram_state != NVRAM_STATE_OPEN)
+ log_simple_error(&e_info(OPAL_RC_NVRAM_INIT),
+ "FSP: NVRAM not read, skipping init\n");
+}
+
+static void fsp_nvram_send_read(void)
+{
+ fsp_nvram_msg = fsp_mkmsg(FSP_CMD_READ_VNVRAM, 4,
+ 0, PSI_DMA_NVRAM_BODY, 0,
+ fsp_nvram_size / NVRAM_BLKSIZE);
+ if (fsp_queue_msg(fsp_nvram_msg, fsp_nvram_rd_complete)) {
+ /* If the nvram read fails to queue, we mark ourselves
+ * closed. Shouldn't have happened anyway. Not much else
+ * we can do.
+ */
+ fsp_nvram_state = NVRAM_STATE_CLOSED;
+ fsp_freemsg(fsp_nvram_msg);
+ fsp_nvram_msg = NULL;
+ log_simple_error(&e_info(OPAL_RC_NVRAM_READ),
+ "FSP: Error queueing nvram read\n");
+ return;
+ }
+}
+
+static void fsp_nvram_open_complete(struct fsp_msg *msg)
+{
+ int8_t rc;
+
+ lock(&fsp_nvram_lock);
+
+ /* Open complete, check status */
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ fsp_nvram_msg = NULL;
+ fsp_freemsg(msg);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_NVRAM_OPEN),
+ "FSP: NVRAM open failed, FSP error 0x%02x\n", rc);
+ goto failed;
+ }
+ if (fsp_nvram_was_read)
+ fsp_nvram_state = NVRAM_STATE_OPEN;
+ else
+ fsp_nvram_send_read();
+ unlock(&fsp_nvram_lock);
+ return;
+ failed:
+ fsp_nvram_state = NVRAM_STATE_CLOSED;
+ unlock(&fsp_nvram_lock);
+}
+
+static void fsp_nvram_send_open(void)
+{
+ printf("FSP NVRAM: Opening nvram...\n");
+ fsp_nvram_msg = fsp_mkmsg(FSP_CMD_OPEN_VNVRAM, 1, fsp_nvram_size);
+ assert(fsp_nvram_msg);
+ fsp_nvram_state = NVRAM_STATE_OPENING;
+ if (!fsp_queue_msg(fsp_nvram_msg, fsp_nvram_open_complete))
+ return;
+
+ prerror("FSP NVRAM: Failed to queue nvram open message\n");
+ fsp_freemsg(fsp_nvram_msg);
+ fsp_nvram_msg = NULL;
+ fsp_nvram_state = NVRAM_STATE_CLOSED;
+}
+
+static bool fsp_nvram_get_size(uint32_t *out_size)
+{
+ struct fsp_msg *msg;
+ int rc, size;
+
+ msg = fsp_mkmsg(FSP_CMD_GET_VNVRAM_SIZE, 0);
+ assert(msg);
+
+ rc = fsp_sync_msg(msg, false);
+ size = msg->resp ? fsp_msg_get_data_word(msg->resp, 0) : 0;
+ fsp_freemsg(msg);
+ if (rc || size == 0) {
+ log_simple_error(&e_info(OPAL_RC_NVRAM_SIZE),
+ "FSP: Error %d nvram size reported is %d\n", rc, size);
+ fsp_nvram_state = NVRAM_STATE_BROKEN;
+ return false;
+ }
+ printf("FSP: NVRAM file size from FSP is %d bytes\n", size);
+ *out_size = size;
+ return true;
+}
+
+static bool fsp_nvram_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ assert(msg == NULL);
+
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ printf("FSP: Closing NVRAM on account of FSP Reset\n");
+ fsp_nvram_state = NVRAM_STATE_CLOSED;
+ return true;
+ case FSP_RELOAD_COMPLETE:
+ printf("FSP: Reopening NVRAM of FSP Reload complete\n");
+ lock(&fsp_nvram_lock);
+ fsp_nvram_send_open();
+ unlock(&fsp_nvram_lock);
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_nvram_client_rr = {
+ .message = fsp_nvram_msg_rr,
+};
+
+static bool fsp_vnvram_msg(u32 cmd_sub_mod, struct fsp_msg *msg __unused)
+{
+ u32 cmd;
+ struct fsp_msg *resp;
+
+ switch (cmd_sub_mod) {
+ case FSP_CMD_GET_VNV_STATS:
+ prlog(PR_DEBUG,
+ "FSP NVRAM: Get vNVRAM statistics not supported\n");
+ cmd = FSP_RSP_GET_VNV_STATS | FSP_STATUS_INVALID_SUBCMD;
+ break;
+ case FSP_CMD_FREE_VNV_STATS:
+ prlog(PR_DEBUG,
+ "FSP NVRAM: Free vNVRAM statistics buffer not supported\n");
+ cmd = FSP_RSP_FREE_VNV_STATS | FSP_STATUS_INVALID_SUBCMD;
+ break;
+ default:
+ return false;
+ }
+
+ resp = fsp_mkmsg(cmd, 0);
+ if (!resp) {
+ prerror("FSP NVRAM: Failed to allocate resp message\n");
+ return false;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ prerror("FSP NVRAM: Failed to queue resp message\n");
+ fsp_freemsg(resp);
+ return false;
+ }
+ return true;
+}
+
+static struct fsp_client fsp_vnvram_client = {
+ .message = fsp_vnvram_msg,
+};
+
+int fsp_nvram_info(uint32_t *total_size)
+{
+ if (!fsp_present()) {
+ fsp_nvram_state = NVRAM_STATE_ABSENT;
+ return OPAL_HARDWARE;
+ }
+
+ if (!fsp_nvram_get_size(total_size))
+ return OPAL_HARDWARE;
+ return OPAL_SUCCESS;
+}
+
+int fsp_nvram_start_read(void *dst, uint32_t src, uint32_t len)
+{
+ /* We are currently limited to fully aligned transfers */
+ assert((((uint64_t)dst) & 0xfff) == 0);
+ assert(dst);
+
+ /* Currently don't support src!=0 */
+ assert(src == 0);
+
+ if (!fsp_present())
+ return -ENODEV;
+
+ op_display(OP_LOG, OP_MOD_INIT, 0x0007);
+
+ lock(&fsp_nvram_lock);
+
+ /* Store image info */
+ fsp_nvram_image = dst;
+ fsp_nvram_size = len;
+
+ /* Mark nvram as not dirty */
+ fsp_nvram_dirty_start = len;
+ fsp_nvram_dirty_end = 0;
+
+ /* Map TCEs */
+ fsp_tce_map(PSI_DMA_NVRAM_TRIPL, &fsp_nvram_triplet,
+ PSI_DMA_NVRAM_TRIPL_SZ);
+ fsp_tce_map(PSI_DMA_NVRAM_BODY, dst, PSI_DMA_NVRAM_BODY_SZ);
+
+ /* Register for the reset/reload event */
+ fsp_register_client(&fsp_nvram_client_rr, FSP_MCLASS_RR_EVENT);
+
+ /* Register for virtual NVRAM interface events */
+ fsp_register_client(&fsp_vnvram_client, FSP_MCLASS_VIRTUAL_NVRAM);
+
+ /* Open and load the nvram from the FSP */
+ fsp_nvram_send_open();
+
+ unlock(&fsp_nvram_lock);
+
+ return 0;
+}
+
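+/*
+ * Called when a range of the in-memory nvram image has been modified.
+ * We track a single dirty extent, rounded to block boundaries, and
+ * kick off a write immediately if the nvram is open and no other
+ * message is in flight; otherwise the dirty range just accumulates
+ * until a write can be sent.
+ */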
+int fsp_nvram_write(uint32_t offset, void *src, uint32_t size)
+{
+ uint64_t end = offset + size - 1;
+
+ /* We only support writing from the original image */
+ if (src != fsp_nvram_image + offset)
+ return OPAL_HARDWARE;
+
+ offset &= ~(NVRAM_BLKSIZE - 1);
+ end &= ~(NVRAM_BLKSIZE - 1);
+
+ lock(&fsp_nvram_lock);
+ /* If the nvram is closed, try re-opening */
+ if (fsp_nvram_state == NVRAM_STATE_CLOSED)
+ fsp_nvram_send_open();
+ if (fsp_nvram_dirty_start > offset)
+ fsp_nvram_dirty_start = offset;
+ if (fsp_nvram_dirty_end < end)
+ fsp_nvram_dirty_end = end;
+ if (!fsp_nvram_msg && fsp_nvram_state == NVRAM_STATE_OPEN)
+ fsp_nvram_send_write();
+ unlock(&fsp_nvram_lock);
+
+ return 0;
+}
diff --git a/roms/skiboot/hw/fsp/fsp-occ.c b/roms/skiboot/hw/fsp/fsp-occ.c
new file mode 100644
index 000000000..58926f408
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-occ.c
@@ -0,0 +1,417 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * FSP/OCC interactions
+ *
+ * Unlike OpenPOWER machines, FSP machines are much more tightly coupled
+ * between FSP, host, and OCC. On P8 we have to do a dance to start the
+ * OCC, but on P9 Hostboot does that, consistent with what we do on
+ * OpenPOWER.
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <xscom-p8-regs.h>
+#include <io.h>
+#include <cpu.h>
+#include <chip.h>
+#include <mem_region.h>
+#include <fsp.h>
+#include <timebase.h>
+#include <hostservices.h>
+#include <errorlog.h>
+#include <opal-api.h>
+#include <opal-msg.h>
+#include <timer.h>
+#include <i2c.h>
+#include <powercap.h>
+#include <psr.h>
+#include <sensor.h>
+#include <occ.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_OCC_LOAD, OPAL_PLATFORM_ERR_EVT, OPAL_OCC,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_OCC_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_OCC,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+struct occ_load_req {
+ u8 scope;
+ u32 dbob_id;
+ u32 seq_id;
+ struct list_node link;
+};
+static LIST_HEAD(occ_load_req_list);
+
+
+static void occ_queue_load(u8 scope, u32 dbob_id, u32 seq_id)
+{
+ struct occ_load_req *occ_req;
+
+ occ_req = zalloc(sizeof(struct occ_load_req));
+ if (!occ_req) {
+ /**
+ * @fwts-label OCCload_reqENOMEM
+ * @fwts-advice ENOMEM while allocating OCC load message.
+ * OCCs not started, consequently no power/frequency scaling
+ * will be functional.
+ */
+ prlog(PR_ERR, "OCC: Could not allocate occ_load_req\n");
+ return;
+ }
+
+ occ_req->scope = scope;
+ occ_req->dbob_id = dbob_id;
+ occ_req->seq_id = seq_id;
+ list_add_tail(&occ_load_req_list, &occ_req->link);
+}
+
+static void __occ_do_load(u8 scope, u32 dbob_id __unused, u32 seq_id)
+{
+ struct fsp_msg *stat;
+ int rc = -ENOMEM;
+ int status_word = 0;
+ struct proc_chip *chip = next_chip(NULL);
+
+ /* Call HBRT... */
+ rc = host_services_occ_load();
+
+ /* Handle fallback to preload */
+ if (rc == -ENOENT && chip->homer_base) {
+ prlog(PR_INFO, "OCC: Load: Fallback to preloaded image\n");
+ rc = 0;
+ } else if (!rc) {
+ struct opal_occ_msg occ_msg = { CPU_TO_BE64(OCC_LOAD), 0, 0 };
+
+ rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, NULL,
+ sizeof(struct opal_occ_msg), &occ_msg);
+ if (rc)
+ prlog(PR_INFO, "OCC: Failed to queue message %d\n",
+ OCC_LOAD);
+
+ /* Success, start OCC */
+ rc = host_services_occ_start();
+ }
+ if (rc) {
+ /* If either hostservices call fails, send a failure status to the FSP */
+ /* Find a chip ID to send failure */
+ for_each_chip(chip) {
+ if (scope == 0x01 && dbob_id != chip->dbob_id)
+ continue;
+ status_word = 0xB500 | (chip->pcid & 0xff);
+ break;
+ }
+ log_simple_error(&e_info(OPAL_RC_OCC_LOAD),
+ "OCC: Error %d in load/start OCC\n", rc);
+ }
+
+ /* Send a single response for all chips */
+ stat = fsp_mkmsg(FSP_CMD_LOAD_OCC_STAT, 2, status_word, seq_id);
+ if (stat)
+ rc = fsp_queue_msg(stat, fsp_freemsg);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_LOAD),
+ "OCC: Error %d queueing FSP OCC LOAD STATUS msg", rc);
+ fsp_freemsg(stat);
+ }
+}
+
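+/*
+ * Drain any OCC load requests that were queued while the hostservices
+ * LID preload was still in progress (see occ_do_load()).
+ */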
+void occ_poke_load_queue(void)
+{
+ struct occ_load_req *occ_req, *next;
+
+ if (list_empty(&occ_load_req_list))
+ return;
+
+ list_for_each_safe(&occ_load_req_list, occ_req, next, link) {
+ __occ_do_load(occ_req->scope, occ_req->dbob_id,
+ occ_req->seq_id);
+ list_del(&occ_req->link);
+ free(occ_req);
+ }
+}
+
+static u32 last_seq_id;
+static bool in_ipl = true;
+static void occ_do_load(u8 scope, u32 dbob_id __unused, u32 seq_id)
+{
+ struct fsp_msg *rsp;
+ int rc = -ENOMEM;
+ u8 err = 0;
+
+ if (scope != 0x01 && scope != 0x02) {
+ /**
+ * @fwts-label OCCLoadInvalidScope
+ * @fwts-advice Invalid request for loading OCCs. Power and
+ * frequency management not functional
+ */
+ prlog(PR_ERR, "OCC: Load message with invalid scope 0x%x\n",
+ scope);
+ err = 0x22;
+ }
+
+ /* First queue up an OK response to the load message itself */
+ rsp = fsp_mkmsg(FSP_RSP_LOAD_OCC | err, 0);
+ if (rsp)
+ rc = fsp_queue_msg(rsp, fsp_freemsg);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_LOAD),
+ "OCC: Error %d queueing FSP OCC LOAD reply\n", rc);
+ fsp_freemsg(rsp);
+ return;
+ }
+
+ if (err)
+ return;
+
+ if (proc_gen >= proc_gen_p9) {
+ if (in_ipl) {
+ /* OCC is pre-loaded in P9, so send SUCCESS to FSP */
+ rsp = fsp_mkmsg(FSP_CMD_LOAD_OCC_STAT, 2, 0, seq_id);
+ if (!rsp)
+ return;
+
+ rc = fsp_queue_msg(rsp, fsp_freemsg);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_LOAD),
+ "OCC: Error %d queueing OCC LOAD STATUS msg",
+ rc);
+ fsp_freemsg(rsp);
+ }
+ in_ipl = false;
+ } else {
+ struct proc_chip *chip = next_chip(NULL);
+
+ last_seq_id = seq_id;
+ prd_fsp_occ_load_start(chip->id);
+ }
+ return;
+ }
+
+ /*
+ * Check if hostservices lid caching is complete. If not, queue
+ * the load request.
+ */
+ if (!hservices_lid_preload_complete()) {
+ occ_queue_load(scope, dbob_id, seq_id);
+ return;
+ }
+
+ __occ_do_load(scope, dbob_id, seq_id);
+}
+
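+/*
+ * Completion callback for an OCC reset initiated through
+ * prd_fsp_occ_reset(); reports the outcome to the FSP using the
+ * sequence id saved from the original reset request.
+ */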
+int fsp_occ_reset_status(u64 chipid, s64 status)
+{
+ struct fsp_msg *stat;
+ int rc = OPAL_NO_MEM;
+ int status_word = 0;
+
+ prlog(PR_INFO, "HBRT: OCC stop() completed with %lld\n", status);
+
+ if (status) {
+ struct proc_chip *chip = get_chip(chipid);
+
+ if (!chip)
+ return OPAL_PARAMETER;
+
+ status_word = 0xfe00 | (chip->pcid & 0xff);
+ log_simple_error(&e_info(OPAL_RC_OCC_RESET),
+ "OCC: Error %lld in OCC reset of chip %lld\n",
+ status, chipid);
+ } else {
+ occ_msg_queue_occ_reset();
+ }
+
+ stat = fsp_mkmsg(FSP_CMD_RESET_OCC_STAT, 2, status_word, last_seq_id);
+ if (!stat)
+ return rc;
+
+ rc = fsp_queue_msg(stat, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(stat);
+ log_simple_error(&e_info(OPAL_RC_OCC_RESET),
+ "OCC: Error %d queueing FSP OCC RESET STATUS message\n",
+ rc);
+ }
+ return rc;
+}
+
+int fsp_occ_load_start_status(u64 chipid, s64 status)
+{
+ struct fsp_msg *stat;
+ int rc = OPAL_NO_MEM;
+ int status_word = 0;
+
+ if (status) {
+ struct proc_chip *chip = get_chip(chipid);
+
+ if (!chip)
+ return OPAL_PARAMETER;
+
+ status_word = 0xB500 | (chip->pcid & 0xff);
+ log_simple_error(&e_info(OPAL_RC_OCC_LOAD),
+ "OCC: Error %d in load/start OCC %lld\n", rc,
+ chipid);
+ }
+
+ stat = fsp_mkmsg(FSP_CMD_LOAD_OCC_STAT, 2, status_word, last_seq_id);
+ if (!stat)
+ return rc;
+
+ rc = fsp_queue_msg(stat, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(stat);
+ log_simple_error(&e_info(OPAL_RC_OCC_LOAD),
+ "OCC: Error %d queueing FSP OCC LOAD STATUS msg", rc);
+ }
+
+ return rc;
+}
+
+static void occ_do_reset(u8 scope, u32 dbob_id, u32 seq_id)
+{
+ struct fsp_msg *rsp, *stat;
+ struct proc_chip *chip = next_chip(NULL);
+ int rc = -ENOMEM;
+ u8 err = 0;
+
+ /* Check arguments */
+ if (scope != 0x01 && scope != 0x02) {
+ /**
+ * @fwts-label OCCResetInvalidScope
+ * @fwts-advice Invalid request for resetting OCCs. Power and
+ * frequency management not functional
+ */
+ prlog(PR_ERR, "OCC: Reset message with invalid scope 0x%x\n",
+ scope);
+ err = 0x22;
+ }
+
+ /* First queue up an OK response to the reset message itself */
+ rsp = fsp_mkmsg(FSP_RSP_RESET_OCC | err, 0);
+ if (rsp)
+ rc = fsp_queue_msg(rsp, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(rsp);
+ log_simple_error(&e_info(OPAL_RC_OCC_RESET),
+ "OCC: Error %d queueing FSP OCC RESET reply\n", rc);
+ return;
+ }
+
+ /* If we had an error, return */
+ if (err)
+ return;
+
+ /*
+ * Call HBRT to stop the OCC and leave it stopped. The FSP will send a
+ * load/start request subsequently. Also, after a few runtime restarts
+ * (currently 3), the FSP will request that the OCC be left in the
+ * stopped state.
+ */
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ rc = host_services_occ_stop();
+ break;
+ case proc_gen_p9:
+ case proc_gen_p10:
+ last_seq_id = seq_id;
+ chip = next_chip(NULL);
+ prd_fsp_occ_reset(chip->id);
+ return;
+ default:
+ return;
+ }
+
+ /* Handle fallback to preload */
+ if (rc == -ENOENT && chip->homer_base) {
+ prlog(PR_INFO, "OCC: Reset: Fallback to preloaded image\n");
+ rc = 0;
+ }
+ if (!rc) {
+ /* Send a single success response for all chips */
+ stat = fsp_mkmsg(FSP_CMD_RESET_OCC_STAT, 2, 0, seq_id);
+ if (stat)
+ rc = fsp_queue_msg(stat, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(stat);
+ log_simple_error(&e_info(OPAL_RC_OCC_RESET),
+ "OCC: Error %d queueing FSP OCC RESET"
+ " STATUS message\n", rc);
+ }
+ occ_msg_queue_occ_reset();
+ } else {
+
+ /*
+ * Then send a matching OCC Reset Status message with an 0xFE
+ * (fail) response code as well to the first matching chip
+ */
+ for_each_chip(chip) {
+ if (scope == 0x01 && dbob_id != chip->dbob_id)
+ continue;
+ rc = -ENOMEM;
+ stat = fsp_mkmsg(FSP_CMD_RESET_OCC_STAT, 2,
+ 0xfe00 | (chip->pcid & 0xff), seq_id);
+ if (stat)
+ rc = fsp_queue_msg(stat, fsp_freemsg);
+ if (rc) {
+ fsp_freemsg(stat);
+ log_simple_error(&e_info(OPAL_RC_OCC_RESET),
+ "OCC: Error %d queueing FSP OCC RESET"
+ " STATUS message\n", rc);
+ }
+ break;
+ }
+ }
+}
+
+static bool fsp_occ_msg(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ u32 dbob_id, seq_id;
+ u8 scope;
+
+ switch (cmd_sub_mod) {
+ case FSP_CMD_LOAD_OCC:
+ /*
+ * We get the "Load OCC" command at boot. We don't currently
+ * support loading it ourselves (we don't have the procedures,
+ * they will come with Host Services). For now HostBoot will
+ * have loaded an OCC firmware for us, but we still need to
+ * be nice and respond to the FSP.
+ */
+ scope = msg->data.bytes[3];
+ dbob_id = fsp_msg_get_data_word(msg, 1);
+ seq_id = fsp_msg_get_data_word(msg, 2);
+ prlog(PR_INFO, "OCC: Got OCC Load message, scope=0x%x"
+ " dbob=0x%x seq=0x%x\n", scope, dbob_id, seq_id);
+ occ_do_load(scope, dbob_id, seq_id);
+ return true;
+
+ case FSP_CMD_RESET_OCC:
+ /*
+ * We shouldn't be getting this one, but if we do, we have
+ * to reply something sensible or the FSP will get upset
+ */
+ scope = msg->data.bytes[3];
+ dbob_id = fsp_msg_get_data_word(msg, 1);
+ seq_id = fsp_msg_get_data_word(msg, 2);
+ prlog(PR_INFO, "OCC: Got OCC Reset message, scope=0x%x"
+ " dbob=0x%x seq=0x%x\n", scope, dbob_id, seq_id);
+ occ_do_reset(scope, dbob_id, seq_id);
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_occ_client = {
+ .message = fsp_occ_msg,
+};
+
+void occ_fsp_init(void)
+{
+ /* If we have an FSP, register for notifications */
+ if (fsp_present())
+ fsp_register_client(&fsp_occ_client, FSP_MCLASS_OCC);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-op-panel.c b/roms/skiboot/hw/fsp/fsp-op-panel.c
new file mode 100644
index 000000000..a8ac00b7a
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-op-panel.c
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Small LCD screen on the front of FSP machines
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <lock.h>
+#include <opal.h>
+#include <device.h>
+#include <processor.h>
+#include <opal-msg.h>
+#include <errorlog.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_PANEL_WRITE, OPAL_PLATFORM_ERR_EVT, OPAL_OP_PANEL,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, OPAL_NA);
+
+/* For OPAL OP_PANEL API we can only have one in flight due to TCEs */
+static struct fsp_msg *op_req;
+static uint64_t op_async_token;
+static struct lock op_lock = LOCK_UNLOCKED;
+
+static void fsp_op_display_fatal(uint32_t w0, uint32_t w1)
+{
+ static struct fsp_msg op_msg_resp;
+ static struct fsp_msg op_msg = {
+ .resp = &op_msg_resp,
+ };
+
+ fsp_fillmsg(&op_msg, FSP_CMD_DISP_SRC_DIRECT, 3, 1, w0, w1);
+
+ /*
+ * A special way to send a message: it doesn't run pollers.
+ * This means we can call it while in a poller, which we may
+ * well be in when we're terminating (and thus displaying a *fatal*
+ * message on the op-panel).
+ */
+ fsp_fatal_msg(&op_msg);
+}
+
+void fsp_op_display(enum op_severity sev, enum op_module mod, uint16_t code)
+{
+ struct fsp_msg *op_msg;
+ uint32_t w0;
+ uint32_t w1;
+
+ if (!fsp_present())
+ return;
+
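+ /*
+ * Word 0 carries the severity in its top half and the module in
+ * its bottom half; word 1 packs the 16-bit code as four ASCII
+ * hex digits, one per byte.
+ */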
+ w0 = sev << 16 | mod;
+
+ w1 = tohex((code >> 12) & 0xf) << 24;
+ w1 |= tohex((code >> 8) & 0xf) << 16;
+ w1 |= tohex((code >> 4) & 0xf) << 8;
+ w1 |= tohex((code ) & 0xf);
+
+ if (sev == OP_FATAL) {
+ fsp_op_display_fatal(w0, w1);
+ } else {
+ op_msg = fsp_allocmsg(true);
+ if (!op_msg) {
+ prerror("Failed to allocate FSP message for PANEL\n");
+ return;
+ }
+
+ fsp_fillmsg(op_msg, FSP_CMD_DISP_SRC_DIRECT, 3, 1, w0, w1);
+
+ if(fsp_queue_msg(op_msg, fsp_freemsg))
+ prerror("Failed to queue FSP message for OP PANEL\n");
+ }
+}
+
+void op_panel_disable_src_echo(void)
+{
+ struct fsp_msg op_msg_resp;
+ struct fsp_msg op_msg = {
+ .resp = &op_msg_resp,
+ };
+
+ if (!fsp_present())
+ return;
+
+ fsp_fillmsg(&op_msg, FSP_CMD_DIS_SRC_ECHO, 0);
+ fsp_sync_msg(&op_msg, false);
+}
+
+void op_panel_clear_src(void)
+{
+ struct fsp_msg op_msg_resp;
+ struct fsp_msg op_msg = {
+ .resp = &op_msg_resp,
+ };
+
+ if (!fsp_present())
+ return;
+
+ fsp_fillmsg(&op_msg, FSP_CMD_CLEAR_SRC, 0);
+ fsp_sync_msg(&op_msg, false);
+}
+
+/* opal_write_oppanel - Write to the physical op panel.
+ *
+ * Pass in an array of oppanel_line_t structs defining the ASCII characters
+ * to display on each line of the oppanel. If there are two lines on the
+ * physical panel, and you only want to write to the first line, you only
+ * need to pass in one line. If you only want to write to the second line,
+ * you need to pass in both lines, and set the line_len of the first line
+ * to zero.
+ *
+ * This command is asynchronous: OPAL_ASYNC_COMPLETION is returned when
+ * the operation has been initiated successfully. Subsequent calls will
+ * return OPAL_BUSY_EVENT until the current operation is complete.
+ */
+struct op_src {
+ uint8_t version;
+#define OP_SRC_VERSION 2
+ uint8_t flags;
+ uint8_t reserved;
+ uint8_t hex_word_cnt;
+ __be16 reserved2;
+ __be16 total_size;
+ __be32 word2; /* SRC format in low byte */
+ __be32 word3;
+ __be32 word4;
+ __be32 word5;
+ __be32 word6;
+ __be32 word7;
+ __be32 word8;
+ __be32 word9;
+ uint8_t ascii[OP_PANEL_NUM_LINES * OP_PANEL_LINE_LEN]; /* Word 11 */
+} __packed __align(4);
+
+/* Page align for the sake of TCE mapping */
+static struct op_src op_src __align(0x1000);
+
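+/*
+ * Tear down the temporary TCE mapping and release the in-flight
+ * request slot; used on both the success and the error paths.
+ */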
+static void __op_panel_write_complete(struct fsp_msg *msg)
+{
+ fsp_tce_unmap(PSI_DMA_OP_PANEL_MISC, 0x1000);
+
+ lock(&op_lock);
+ op_req = NULL;
+ unlock(&op_lock);
+
+ fsp_freemsg(msg);
+}
+
+static void op_panel_write_complete(struct fsp_msg *msg)
+{
+ uint8_t rc = (msg->resp->word1 >> 8) & 0xff;
+
+ if (rc)
+ prerror("OPPANEL: Error 0x%02x in display command\n", rc);
+
+ __op_panel_write_complete(msg);
+
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(1),
+ cpu_to_be64(op_async_token));
+}
+
+static int64_t __opal_write_oppanel(oppanel_line_t *lines, uint64_t num_lines,
+ uint64_t async_token)
+{
+ int64_t rc = OPAL_ASYNC_COMPLETION;
+ int len;
+ int i;
+
+ if (num_lines < 1 || num_lines > OP_PANEL_NUM_LINES)
+ return OPAL_PARAMETER;
+
+ /* Only one in flight */
+ lock(&op_lock);
+ if (op_req) {
+ rc = OPAL_BUSY_EVENT;
+ unlock(&op_lock);
+ goto bail;
+ }
+
+ op_req = fsp_allocmsg(true);
+ if (!op_req) {
+ rc = OPAL_NO_MEM;
+ unlock(&op_lock);
+ goto bail;
+ }
+ unlock(&op_lock);
+
+ op_async_token = async_token;
+
+ memset(&op_src, 0, sizeof(op_src));
+
+ op_src.version = OP_SRC_VERSION;
+ op_src.flags = 0;
+ op_src.reserved = 0;
+ op_src.hex_word_cnt = 1; /* header word only */
+ op_src.reserved2 = 0;
+ op_src.total_size = cpu_to_be16(sizeof(op_src));
+ op_src.word2 = 0; /* should be unneeded */
+
+ for (i = 0; i < num_lines; i++) {
+ uint8_t *current_line = op_src.ascii + (i * OP_PANEL_LINE_LEN);
+
+ len = be64_to_cpu(lines[i].line_len);
+ if (len < OP_PANEL_LINE_LEN)
+ memset(current_line + len, ' ', OP_PANEL_LINE_LEN-len);
+ else
+ len = OP_PANEL_LINE_LEN;
+ memcpy(current_line, (void *) be64_to_cpu(lines[i].line), len);
+ }
+
+ for (i = 0; i < sizeof(op_src.ascii); i++) {
+ /*
+ * So, there's this interesting thing if you send
+ * HTML/Javascript through the Operator Panel.
+ * You get to inject it into the ASM web ui!
+ * So we filter out anything suspect here,
+ * at least for the time being.
+ *
+ * Allowed characters:
+ * . / 0-9 : a-z A-Z SPACE
+ */
+ if (! ((op_src.ascii[i] >= '.' && op_src.ascii[i] <= ':') ||
+ (op_src.ascii[i] >= 'a' && op_src.ascii[i] <= 'z') ||
+ (op_src.ascii[i] >= 'A' && op_src.ascii[i] <= 'Z') ||
+ op_src.ascii[i] == ' ')) {
+ op_src.ascii[i] = '.';
+ }
+ }
+
+ fsp_tce_map(PSI_DMA_OP_PANEL_MISC, &op_src, 0x1000);
+
+ fsp_fillmsg(op_req, FSP_CMD_DISP_SRC_INDIR, 3, 0,
+ PSI_DMA_OP_PANEL_MISC, sizeof(struct op_src));
+ rc = fsp_queue_msg(op_req, op_panel_write_complete);
+ if (rc) {
+ __op_panel_write_complete(op_req);
+ rc = OPAL_INTERNAL_ERROR;
+ }
+ bail:
+ log_simple_error(&e_info(OPAL_RC_PANEL_WRITE),
+ "FSP: Error updating Op Panel: %lld\n", rc);
+ return rc;
+}
+
+static int64_t opal_write_oppanel_async(uint64_t async_token,
+ oppanel_line_t *lines,
+ uint64_t num_lines)
+{
+ return __opal_write_oppanel(lines, num_lines, async_token);
+}
+
+void fsp_oppanel_init(void)
+{
+ struct dt_node *oppanel;
+
+ if (!fsp_present())
+ return;
+
+ opal_register(OPAL_WRITE_OPPANEL_ASYNC, opal_write_oppanel_async, 3);
+
+ oppanel = dt_new(opal_node, "oppanel");
+ dt_add_property_cells(oppanel, "#length", OP_PANEL_LINE_LEN);
+ dt_add_property_cells(oppanel, "#lines", OP_PANEL_NUM_LINES);
+ dt_add_property_string(oppanel, "compatible", "ibm,opal-oppanel");
+}
diff --git a/roms/skiboot/hw/fsp/fsp-psi.c b/roms/skiboot/hw/fsp/fsp-psi.c
new file mode 100644
index 000000000..38f130dd7
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-psi.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2019 IBM Corp. */
+
+#include <io.h>
+#include <psi.h>
+#include <lock.h>
+#include <fsp.h>
+
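+/* Set or clear the TCE enable bit in the PSIHB_PHBSCR register */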
+static void psi_tce_enable(struct psi *psi, bool enable)
+{
+ void *addr = psi->regs + PSIHB_PHBSCR;
+ u64 val;
+
+ val = in_be64(addr);
+ if (enable)
+ val |= PSIHB_PHBSCR_TCE_ENABLE;
+ else
+ val &= ~PSIHB_PHBSCR_TCE_ENABLE;
+ out_be64(addr, val);
+}
+
+/*
+ * Configure the PSI interface for communicating with
+ * an FSP, such as enabling the TCEs, FSP commands,
+ * etc...
+ */
+void psi_init_for_fsp(struct psi *psi)
+{
+ uint64_t reg;
+ bool enable_tce = true;
+
+ lock(&psi_lock);
+
+ /* Disable and setup TCE base address */
+ psi_tce_enable(psi, false);
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ case proc_gen_p9:
+ case proc_gen_p10:
+ out_be64(psi->regs + PSIHB_TAR, PSI_TCE_TABLE_BASE |
+ PSIHB_TAR_256K_ENTRIES);
+ break;
+ default:
+ enable_tce = false;
+ };
+
+ /* Enable various other configuration register bits based
+ * on what pHyp does. We keep interrupts disabled until
+ * after the mailbox has been properly configured. We assume
+ * basic stuff such as PSI link enable is already there.
+ *
+ * - FSP CMD Enable
+ * - FSP MMIO Enable
+ * - TCE Enable
+ * - Error response enable
+ *
+ * Clear all other error bits
+ */
+ if (!psi->active) {
+ prerror("PSI: psi_init_for_fsp() called on inactive link!\n");
+ unlock(&psi_lock);
+ return;
+ }
+
+ reg = in_be64(psi->regs + PSIHB_CR);
+ reg |= PSIHB_CR_FSP_CMD_ENABLE;
+ reg |= PSIHB_CR_FSP_MMIO_ENABLE;
+ reg |= PSIHB_CR_FSP_ERR_RSP_ENABLE;
+ reg &= ~0x00000000ffffffffull;
+ out_be64(psi->regs + PSIHB_CR, reg);
+ psi_tce_enable(psi, enable_tce);
+
+ unlock(&psi_lock);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-rtc.c b/roms/skiboot/hw/fsp/fsp-rtc.c
new file mode 100644
index 000000000..237560a8d
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-rtc.c
@@ -0,0 +1,567 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Real Time Clock (RTC) attached to FSP
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <lock.h>
+#include <timebase.h>
+#include <time.h>
+#include <time-utils.h>
+#include <opal-api.h>
+#include <opal-msg.h>
+#include <errorlog.h>
+#include <device.h>
+
+/*
+ * Note on how those operate:
+ *
+ * Because the RTC calls can be pretty slow, these functions will shoot
+ * an asynchronous request to the FSP (if none is already pending)
+ *
+ * The requests will return OPAL_BUSY_EVENT as long as the event has
+ * not been completed.
+ *
+ * WARNING: An attempt at doing an RTC write while one is already pending
+ * will simply ignore the new arguments and continue returning
+ * OPAL_BUSY_EVENT. This is to be compatible with existing Linux code.
+ *
+ * Completion of the request will result in an event OPAL_EVENT_RTC
+ * being signaled, which will remain raised until a corresponding call
+ * to opal_rtc_read() or opal_rtc_write() finally returns OPAL_SUCCESS,
+ * at which point the operation is complete and the event cleared.
+ *
+ * If we end up taking longer than rtc_read_timeout_ms milliseconds waiting
+ * for the response from a read request, we simply return a cached value (plus
+ * an offset calculated from the timebase). When the read request finally
+ * returns, we update our cached value accordingly.
+ *
+ * There are two separate sets of state for reads and writes. If both are
+ * attempted at the same time, the event bit will remain set as long as either
+ * of the two has a pending event to signal.
+ */
+
+#include <rtc.h>
+
+/* All of the below state is protected by rtc_lock.
+ * It should be held for the shortest amount of time possible.
+ * Certainly not across calls to FSP.
+ */
+static struct lock rtc_lock;
+
+static enum {
+ RTC_TOD_VALID,
+ RTC_TOD_INVALID,
+ RTC_TOD_PERMANENT_ERROR,
+} rtc_tod_state = RTC_TOD_INVALID;
+
+/* State machine for getting an RTC request.
+ * RTC_{READ/WRITE}_NO_REQUEST -> RTC_{READ/WRITE}_PENDING_REQUEST (one in flight)
+ * RTC_{READ/WRITE}_PENDING_REQUEST -> RTC_{READ/WRITE}_REQUEST_AVAILABLE,
+ * when FSP responds
+ * RTC_{READ/WRITE}_REQUEST_AVAILABLE -> RTC_{READ/WRITE}_NO_REQUEST,
+ * when OS retrieves it
+ */
+static enum {
+ RTC_READ_NO_REQUEST,
+ RTC_READ_PENDING_REQUEST,
+ RTC_READ_REQUEST_AVAILABLE,
+} rtc_read_request_state = RTC_READ_NO_REQUEST;
+
+static enum {
+ RTC_WRITE_NO_REQUEST,
+ RTC_WRITE_PENDING_REQUEST,
+ RTC_WRITE_REQUEST_AVAILABLE,
+} rtc_write_request_state = RTC_WRITE_NO_REQUEST;
+
+static bool rtc_tod_cache_dirty = false;
+
+struct opal_tpo_data {
+ uint64_t tpo_async_token;
+ __be32 *year_month_day;
+ __be32 *hour_min;
+};
+
+/* Timebase value when we last initiated a RTC read request */
+static unsigned long read_req_tb;
+
+/* If an RTC read takes longer than this, we return a value generated
+ * from the cache + timebase */
+static const int rtc_read_timeout_ms = 1500;
+
+DEFINE_LOG_ENTRY(OPAL_RC_RTC_TOD, OPAL_PLATFORM_ERR_EVT, OPAL_RTC,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_RTC_READ, OPAL_PLATFORM_ERR_EVT, OPAL_RTC,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA);
+
+static void fsp_tpo_req_complete(struct fsp_msg *read_resp)
+{
+ struct opal_tpo_data *attr = read_resp->user_data;
+ int val;
+ int rc;
+
+ val = (read_resp->resp->word1 >> 8) & 0xff;
+ switch (val) {
+ case FSP_STATUS_TOD_RESET:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "RTC TPO in invalid state\n");
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+
+ case FSP_STATUS_TOD_PERMANENT_ERROR:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "RTC TPO in permanent error state\n");
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+ case FSP_STATUS_INVALID_DATA:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "RTC TPO: Invalid data\n");
+ rc = OPAL_PARAMETER;
+ break;
+ case FSP_STATUS_SUCCESS:
+ /* Save the read TPO value in our cache */
+ if (attr->year_month_day)
+ *attr->year_month_day = cpu_to_be32(fsp_msg_get_data_word(read_resp->resp, 0));
+ if (attr->hour_min)
+ *attr->hour_min = cpu_to_be32(fsp_msg_get_data_word(read_resp->resp, 1));
+ rc = OPAL_SUCCESS;
+ break;
+
+ default:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "TPO read failed: %d\n", val);
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+ }
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(attr->tpo_async_token),
+ cpu_to_be64(rc));
+ free(attr);
+ fsp_freemsg(read_resp);
+}
+
+static void fsp_rtc_process_read(struct fsp_msg *read_resp)
+{
+ int val = (read_resp->word1 >> 8) & 0xff;
+ struct tm tm;
+
+ assert(lock_held_by_me(&rtc_lock));
+
+ assert(rtc_read_request_state == RTC_READ_PENDING_REQUEST);
+
+ switch (val) {
+ case FSP_STATUS_TOD_RESET:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "RTC TOD in invalid state\n");
+ rtc_tod_state = RTC_TOD_INVALID;
+ break;
+
+ case FSP_STATUS_TOD_PERMANENT_ERROR:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "RTC TOD in permanent error state\n");
+ rtc_tod_state = RTC_TOD_PERMANENT_ERROR;
+ break;
+
+ case FSP_STATUS_SUCCESS:
+ /* Save the read RTC value in our cache */
+ rtc_tod_state = RTC_TOD_VALID;
+ datetime_to_tm(fsp_msg_get_data_word(read_resp, 0),
+ (u64)fsp_msg_get_data_word(read_resp, 1) << 32, &tm);
+ rtc_cache_update(&tm);
+ prlog(PR_TRACE, "FSP-RTC Got time: %d-%d-%d %d:%d:%d\n",
+ tm.tm_year, tm.tm_mon, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec);
+ break;
+
+ default:
+ log_simple_error(&e_info(OPAL_RC_RTC_TOD),
+ "RTC TOD read failed: %d\n", val);
+ rtc_tod_state = RTC_TOD_INVALID;
+ }
+ rtc_read_request_state = RTC_READ_REQUEST_AVAILABLE;
+}
+
+static void opal_rtc_eval_events(bool read_write)
+{
+ bool request_available;
+
+ if (read_write)
+ request_available = (rtc_read_request_state ==
+ RTC_READ_REQUEST_AVAILABLE);
+ else
+ request_available = (rtc_write_request_state ==
+ RTC_WRITE_REQUEST_AVAILABLE);
+
+ assert(lock_held_by_me(&rtc_lock));
+ opal_update_pending_evt(OPAL_EVENT_RTC,
+ request_available ? OPAL_EVENT_RTC : 0);
+}
+
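+/*
+ * Common completion handler for TOD reads and writes: update the
+ * relevant request state machine and (re)evaluate the OPAL_EVENT_RTC
+ * event bit so the OS knows a result is available.
+ */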
+static void fsp_rtc_req_complete(struct fsp_msg *msg)
+{
+ lock(&rtc_lock);
+ prlog(PR_TRACE, "RTC completion %p\n", msg);
+
+ if (fsp_msg_cmd(msg) == (FSP_CMD_READ_TOD & 0xffffff)) {
+ fsp_rtc_process_read(msg->resp);
+ opal_rtc_eval_events(true);
+ } else {
+ assert(rtc_write_request_state == RTC_WRITE_PENDING_REQUEST);
+ rtc_write_request_state = RTC_WRITE_REQUEST_AVAILABLE;
+ opal_rtc_eval_events(false);
+ }
+
+ unlock(&rtc_lock);
+ fsp_freemsg(msg);
+}
+
+static int64_t fsp_rtc_send_read_request(void)
+{
+ struct fsp_msg *msg;
+ int rc;
+
+ assert(lock_held_by_me(&rtc_lock));
+ assert(rtc_read_request_state == RTC_READ_NO_REQUEST);
+
+ msg = fsp_mkmsg(FSP_CMD_READ_TOD, 0);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_RTC_READ),
+ "RTC: failed to allocate read message\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ rc = fsp_queue_msg(msg, fsp_rtc_req_complete);
+ if (rc) {
+ fsp_freemsg(msg);
+ log_simple_error(&e_info(OPAL_RC_RTC_READ),
+ "RTC: failed to queue read message: %d\n", rc);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ rtc_read_request_state = RTC_READ_PENDING_REQUEST;
+
+ read_req_tb = mftb();
+
+ return OPAL_BUSY_EVENT;
+}
+
+static int64_t fsp_opal_rtc_read(__be32 *__ymd, __be64 *__hmsm)
+{
+ int64_t rc;
+ uint32_t ymd;
+ uint64_t hmsm;
+
+ if (!__ymd || !__hmsm)
+ return OPAL_PARAMETER;
+
+ lock(&rtc_lock);
+
+ if (rtc_tod_state == RTC_TOD_PERMANENT_ERROR) {
+ rc = OPAL_HARDWARE;
+ goto out;
+ }
+
+ /* During R/R of FSP, read cached TOD */
+ if (fsp_in_rr()) {
+ if (rtc_tod_state == RTC_TOD_VALID) {
+ rtc_cache_get_datetime(&ymd, &hmsm);
+ rc = OPAL_SUCCESS;
+ } else {
+ rc = OPAL_INTERNAL_ERROR;
+ }
+ goto out;
+ }
+
+ /* If we don't have a read pending already, fire off a request and
+ * return */
+ if (rtc_read_request_state == RTC_READ_NO_REQUEST) {
+ prlog(PR_TRACE, "Sending new RTC read request\n");
+ rc = fsp_rtc_send_read_request();
+ /* If our pending read is done, clear events and return the time
+ * from the cache */
+ } else if (rtc_read_request_state == RTC_READ_REQUEST_AVAILABLE) {
+ prlog(PR_TRACE, "RTC read complete, state %d\n", rtc_tod_state);
+ rtc_read_request_state = RTC_READ_NO_REQUEST;
+
+ opal_rtc_eval_events(true);
+
+ if (rtc_tod_state == RTC_TOD_VALID) {
+ rtc_cache_get_datetime(&ymd, &hmsm);
+ prlog(PR_TRACE,"FSP-RTC Cached datetime: %x %llx\n",
+ ymd, hmsm);
+ rc = OPAL_SUCCESS;
+ } else {
+ rc = OPAL_INTERNAL_ERROR;
+ }
+
+ /* Timeout: return our cached value (updated from tb), but leave the
+ * read request pending so it will update the cache later */
+ } else if (mftb() > read_req_tb + msecs_to_tb(rtc_read_timeout_ms)) {
+ prlog(PR_TRACE, "RTC read timed out\n");
+
+ if (rtc_tod_state == RTC_TOD_VALID) {
+ rtc_cache_get_datetime(&ymd, &hmsm);
+ rc = OPAL_SUCCESS;
+ } else {
+ rc = OPAL_INTERNAL_ERROR;
+ }
+ /* Otherwise, we're still waiting on the read to complete */
+ } else {
+ assert(rtc_read_request_state == RTC_READ_PENDING_REQUEST);
+ rc = OPAL_BUSY_EVENT;
+ }
+out:
+ unlock(&rtc_lock);
+
+ if (rc == OPAL_SUCCESS) {
+ *__ymd = cpu_to_be32(ymd);
+ *__hmsm = cpu_to_be64(hmsm);
+ }
+
+ return rc;
+}
+
+static int64_t fsp_rtc_send_write_request(uint32_t year_month_day,
+ uint64_t hour_minute_second_millisecond)
+{
+ struct fsp_msg *msg;
+ uint32_t w0, w1, w2;
+
+ assert(lock_held_by_me(&rtc_lock));
+ assert(rtc_write_request_state == RTC_WRITE_NO_REQUEST);
+
+ /* Create a request and send it. Just like for read, we ignore
+ * the "millisecond" field which is probably supposed to be
+ * microseconds and which Linux ignores as well anyway
+ */
+ w0 = year_month_day;
+ w1 = (hour_minute_second_millisecond >> 32) & 0xffffff00;
+ w2 = 0;
+
+ msg = fsp_mkmsg(FSP_CMD_WRITE_TOD, 3, w0, w1, w2);
+ if (!msg) {
+ prlog(PR_TRACE, " -> allocation failed !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ prlog(PR_TRACE, " -> req at %p\n", msg);
+
+ if (fsp_queue_msg(msg, fsp_rtc_req_complete)) {
+ prlog(PR_TRACE, " -> queueing failed !\n");
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ rtc_write_request_state = RTC_WRITE_PENDING_REQUEST;
+
+ return OPAL_BUSY_EVENT;
+}
+
+static int64_t fsp_opal_rtc_write(uint32_t year_month_day,
+ uint64_t hour_minute_second_millisecond)
+{
+ int rc;
+ struct tm tm;
+
+ lock(&rtc_lock);
+ if (rtc_tod_state == RTC_TOD_PERMANENT_ERROR) {
+ rc = OPAL_HARDWARE;
+ goto out;
+ }
+
+ if (fsp_in_rr()) {
+ datetime_to_tm(year_month_day,
+ hour_minute_second_millisecond, &tm);
+ rtc_cache_update(&tm);
+ rtc_tod_cache_dirty = true;
+ rc = OPAL_SUCCESS;
+ goto out;
+ }
+
+ if (rtc_write_request_state == RTC_WRITE_NO_REQUEST) {
+ prlog(PR_TRACE, "Sending new RTC write request\n");
+ rc = fsp_rtc_send_write_request(year_month_day,
+ hour_minute_second_millisecond);
+ } else if (rtc_write_request_state == RTC_WRITE_PENDING_REQUEST) {
+ rc = OPAL_BUSY_EVENT;
+ } else {
+ assert(rtc_write_request_state == RTC_WRITE_REQUEST_AVAILABLE);
+ rtc_write_request_state = RTC_WRITE_NO_REQUEST;
+
+ opal_rtc_eval_events(false);
+ rc = OPAL_SUCCESS;
+ }
+
+out:
+ unlock(&rtc_lock);
+ return rc;
+}
+
+/* Set timed power on values to fsp */
+static int64_t fsp_opal_tpo_write(uint64_t async_token, uint32_t y_m_d,
+ uint32_t hr_min)
+{
+ static struct opal_tpo_data *attr;
+ struct fsp_msg *msg;
+
+ if (!fsp_present())
+ return OPAL_HARDWARE;
+
+ attr = zalloc(sizeof(struct opal_tpo_data));
+ if (!attr)
+ return OPAL_NO_MEM;
+
+ /* Create a request and send it.*/
+ attr->tpo_async_token = async_token;
+
+ /* check if this is a disable tpo request */
+ if (y_m_d == 0 && hr_min == 0) {
+ prlog(PR_TRACE, "Sending TPO disable request...\n");
+ msg = fsp_mkmsg(FSP_CMD_TPO_DISABLE, 0);
+ } else {
+ prlog(PR_TRACE, "Sending TPO write request...\n");
+ msg = fsp_mkmsg(FSP_CMD_TPO_WRITE, 2, y_m_d, hr_min);
+ }
+
+ if (!msg) {
+ prerror("TPO: Failed to create message for WRITE to FSP\n");
+ free(attr);
+ return OPAL_INTERNAL_ERROR;
+ }
+ msg->user_data = attr;
+ if (fsp_queue_msg(msg, fsp_tpo_req_complete)) {
+ free(attr);
+ fsp_freemsg(msg);
+ return OPAL_INTERNAL_ERROR;
+ }
+ return OPAL_ASYNC_COMPLETION;
+}
+
+/* Read Timed power on (TPO) from FSP */
+static int64_t fsp_opal_tpo_read(uint64_t async_token, __be32 *y_m_d,
+ __be32 *hr_min)
+{
+ static struct opal_tpo_data *attr;
+ struct fsp_msg *msg;
+ int64_t rc;
+
+ if (!fsp_present())
+ return OPAL_HARDWARE;
+
+ if (!y_m_d || !hr_min)
+ return OPAL_PARAMETER;
+
+ attr = zalloc(sizeof(*attr));
+ if (!attr)
+ return OPAL_NO_MEM;
+
+ /* Send read request to FSP */
+ attr->tpo_async_token = async_token;
+ attr->year_month_day = y_m_d;
+ attr->hour_min = hr_min;
+
+ prlog(PR_TRACE, "Sending new TPO read request\n");
+ msg = fsp_mkmsg(FSP_CMD_TPO_READ, 0);
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_RTC_READ),
+ "TPO: failed to allocate read message\n");
+ free(attr);
+ return OPAL_INTERNAL_ERROR;
+ }
+ msg->user_data = attr;
+ rc = fsp_queue_msg(msg, fsp_tpo_req_complete);
+ if (rc) {
+ free(attr);
+ fsp_freemsg(msg);
+ log_simple_error(&e_info(OPAL_RC_RTC_READ),
+ "TPO: failed to queue read message: %lld\n", rc);
+ return OPAL_INTERNAL_ERROR;
+ }
+ return OPAL_ASYNC_COMPLETION;
+}
+
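+/*
+ * Push a TOD value that was cached while the FSP was going through a
+ * reset/reload (see fsp_opal_rtc_write()) back out to the FSP once it
+ * has come back.
+ */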
+static void rtc_flush_cached_tod(void)
+{
+ struct fsp_msg *msg;
+ uint64_t h_m_s_m;
+ uint32_t y_m_d;
+
+ if (rtc_cache_get_datetime(&y_m_d, &h_m_s_m))
+ return;
+ msg = fsp_mkmsg(FSP_CMD_WRITE_TOD, 3, y_m_d,
+ (h_m_s_m >> 32) & 0xffffff00, 0);
+ if (!msg) {
+ prerror("TPO: %s : Failed to allocate write TOD message\n",
+ __func__);
+ return;
+ }
+ if (fsp_queue_msg(msg, fsp_freemsg)) {
+ fsp_freemsg(msg);
+ prerror("TPO: %s : Failed to queue WRITE_TOD command\n",
+ __func__);
+ return;
+ }
+}
+
+static bool fsp_rtc_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+
+ int rc = false;
+ assert(msg == NULL);
+
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ rc = true;
+ break;
+ case FSP_RELOAD_COMPLETE:
+ lock(&rtc_lock);
+ if (rtc_tod_cache_dirty) {
+ rtc_flush_cached_tod();
+ rtc_tod_cache_dirty = false;
+ }
+ unlock(&rtc_lock);
+ rc = true;
+ break;
+ }
+
+ return rc;
+}
+
+static struct fsp_client fsp_rtc_client_rr = {
+ .message = fsp_rtc_msg_rr,
+};
+
+void fsp_rtc_init(void)
+{
+ struct dt_node *np;
+
+ if (!fsp_present()) {
+ rtc_tod_state = RTC_TOD_PERMANENT_ERROR;
+ return;
+ }
+
+ opal_register(OPAL_RTC_READ, fsp_opal_rtc_read, 2);
+ opal_register(OPAL_RTC_WRITE, fsp_opal_rtc_write, 2);
+ opal_register(OPAL_WRITE_TPO, fsp_opal_tpo_write, 3);
+ opal_register(OPAL_READ_TPO, fsp_opal_tpo_read, 3);
+
+ np = dt_new(opal_node, "rtc");
+ dt_add_property_strings(np, "compatible", "ibm,opal-rtc");
+ dt_add_property(np, "has-tpo", NULL, 0);
+
+ /* Register for the reset/reload event */
+ fsp_register_client(&fsp_rtc_client_rr, FSP_MCLASS_RR_EVENT);
+
+ prlog(PR_TRACE, "Getting initial RTC TOD\n");
+
+ /* We don't wait for the RTC response, and this is actually okay:
+ * any OPAL callers will wait correctly, and if we ever have
+ * internal users then they should check the state properly.
+ */
+ lock(&rtc_lock);
+ fsp_rtc_send_read_request();
+ unlock(&rtc_lock);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-sensor.c b/roms/skiboot/hw/fsp/fsp-sensor.c
new file mode 100644
index 000000000..ffcd004f3
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-sensor.c
@@ -0,0 +1,860 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * This code enables the 'powernv' platform to retrieve sensor-related data
+ * from the FSP using SPCN passthru mailbox commands.
+ *
+ * The OPAL read sensor API in Sapphire is implemented as an 'asynchronous'
+ * read call that returns after queuing the read request. A unique sensor-id,
+ * which has already been exported to the device tree during FSP init, is
+ * expected as an argument of the OPAL read call. The Sapphire code decodes
+ * this id to determine the requested attribute and sensor.
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <lock.h>
+#include <device.h>
+#include <spcn.h>
+#include <opal-api.h>
+#include <opal-msg.h>
+#include <errorlog.h>
+#include <sensor.h>
+
+#define INVALID_DATA ((uint32_t)-1)
+
+/* Entry size of PRS command modifiers */
+#define PRS_STATUS_ENTRY_SZ 0x08
+#define SENSOR_PARAM_ENTRY_SZ 0x10
+#define SENSOR_DATA_ENTRY_SZ 0x08
+#define PROC_JUNC_ENTRY_SZ 0x04
+
+DEFINE_LOG_ENTRY(OPAL_RC_SENSOR_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_SENSOR,
+ OPAL_MISC_SUBSYSTEM,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SENSOR_READ, OPAL_PLATFORM_ERR_EVT, OPAL_SENSOR,
+ OPAL_MISC_SUBSYSTEM, OPAL_INFO,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SENSOR_ASYNC_COMPLETE, OPAL_PLATFORM_ERR_EVT,
+ OPAL_SENSOR, OPAL_MISC_SUBSYSTEM, OPAL_INFO,
+ OPAL_NA);
+
+/* FSP response status codes */
+enum {
+ SP_RSP_STATUS_VALID_DATA = 0x00,
+ SP_RSP_STATUS_INVALID_DATA = 0x22,
+ SP_RSP_STATUS_SPCN_ERR = 0xA8,
+ SP_RSP_STATUS_DMA_ERR = 0x24,
+};
+
+enum sensor_state {
+ SENSOR_VALID_DATA,
+ SENSOR_INVALID_DATA,
+ SENSOR_SPCN_ERROR,
+ SENSOR_DMA_ERROR,
+ SENSOR_PERMANENT_ERROR,
+ SENSOR_OPAL_ERROR,
+};
+
+enum spcn_attr {
+ SENSOR_STATUS,
+ SENSOR_THRS,
+ SENSOR_DATA,
+ SENSOR_MAX,
+};
+
+/* Parsed sensor attributes, passed through OPAL */
+struct opal_sensor_data {
+ uint64_t async_token; /* Asynchronous token */
+ __be64 *sensor_data; /* Kernel pointer to copy data */
+ enum spcn_attr spcn_attr; /* Modifier attribute */
+ uint16_t rid; /* Sensor RID */
+ uint8_t frc; /* Sensor resource class */
+ uint32_t mod_index; /* Modifier index*/
+ uint32_t offset; /* Offset in sensor buffer */
+};
+
+struct spcn_mod {
+ uint8_t mod; /* Modifier code */
+ uint8_t entry_size; /* Size of each entry in response buffer */
+ uint16_t entry_count; /* Number of entries */
+};
+
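+/*
+ * Table of PRS command modifiers. The _FIRST/_SUBS pairs exist because
+ * a response can be larger than a single transfer: when the FSP returns
+ * SPCN_RSP_STATUS_COND_SUCCESS, the remaining entries are fetched with
+ * the matching _SUBS modifier (see fsp_sensor_read_complete()).
+ */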
+static struct spcn_mod spcn_mod_data[] = {
+ {SPCN_MOD_PRS_STATUS_FIRST, PRS_STATUS_ENTRY_SZ, 0 },
+ {SPCN_MOD_PRS_STATUS_SUBS, PRS_STATUS_ENTRY_SZ, 0 },
+ {SPCN_MOD_SENSOR_PARAM_FIRST, SENSOR_PARAM_ENTRY_SZ, 0 },
+ {SPCN_MOD_SENSOR_PARAM_SUBS, SENSOR_PARAM_ENTRY_SZ, 0 },
+ {SPCN_MOD_SENSOR_DATA_FIRST, SENSOR_DATA_ENTRY_SZ, 0 },
+ {SPCN_MOD_SENSOR_DATA_SUBS, SENSOR_DATA_ENTRY_SZ, 0 },
+ /* TODO Support this modifier '0x14', if required */
+ /* {SPCN_MOD_PROC_JUNC_TEMP, PROC_JUNC_ENTRY_SZ, 0, NULL}, */
+ {SPCN_MOD_SENSOR_POWER, SENSOR_DATA_ENTRY_SZ, 0 },
+ {SPCN_MOD_LAST, 0xff, 0xffff}
+};
+
+/* Frame resource class (FRC) names */
+static const char *frc_names[] = {
+ /* 0x00 and 0x01 are reserved */
+ NULL,
+ NULL,
+ "power-controller",
+ "power",
+ "regulator",
+ "cooling-fan",
+ "cooling-controller",
+ "battery-charger",
+ "battery-pack",
+ "amb-temp",
+ "temp",
+ "vrm",
+ "riser-card",
+ "io-backplane"
+};
+
+#define SENSOR_MAX_SIZE 0x00100000
+static void *sensor_buffer = NULL;
+static enum sensor_state sensor_state;
+static bool prev_msg_consumed = true;
+static struct lock sensor_lock;
+
+/* Function prototypes */
+static int64_t fsp_sensor_send_read_request(struct opal_sensor_data *attr);
+static void queue_msg_for_delivery(int rc, struct opal_sensor_data *attr);
+
+
+/*
+ * Power Resource Status (PRS)
+ * Command: 0x42
+ *
+ * Modifier: 0x01
+ * --------------------------------------------------------------------------
+ * | 0 1 2 3 4 5 6 7 |
+ * --------------------------------------------------------------------------
+ * |Frame resrc class| PRID | SRC | Status |
+ * --------------------------------------------------------------------------
+ *
+ *
+ * Modifier: 0x10
+ * --------------------------------------------------------------------------
+ * | 0 1 2 3 4 5 6 7 |
+ * --------------------------------------------------------------------------
+ * |Frame resrc class| PRID | Sensor location |
+ * --------------------------------------------------------------------------
+ * --------------------------------------------------------------------------
+ * | 8 9 10 11 12 13 14 15 |
+ * --------------------------------------------------------------------------
+ * | Reserved | Reserved | Threshold | Status |
+ * --------------------------------------------------------------------------
+ *
+ *
+ * Modifier: 0x12
+ * --------------------------------------------------------------------------
+ * | 0 1 2 3 4 5 6 7 |
+ * --------------------------------------------------------------------------
+ * |Frame resrc class| PRID | Sensor data | Status |
+ * --------------------------------------------------------------------------
+ *
+ *
+ * Modifier: 0x14
+ * --------------------------------------------------------------------------
+ * | 0 1 2 3 |
+ * --------------------------------------------------------------------------
+ * |Enclosure Tj Avg | Chip Tj Avg | Reserved | Reserved |
+ * --------------------------------------------------------------------------
+ */
+
+
+/*
+ * When coming from a SENSOR_POWER modifier command, the resource id
+ * of a power supply is only one byte and is missing the "subclass"
+ * byte (0x10). This macro adds it, to be consistent with the
+ * PRS_STATUS modifier command.
+ */
+#define normalize_power_rid(rid) (0x1000|(rid))
+
+static uint32_t sensor_power_process_data(uint16_t rid,
+ struct sensor_power *power)
+{
+ int i;
+
+ if (!sensor_power_is_valid(power)) {
+ prlog(PR_TRACE, "Power Sensor data not valid\n");
+ return INVALID_DATA;
+ }
+
+ for (i = 0; i < sensor_power_count(power); i++) {
+ prlog(PR_TRACE, "Power[%d]: %d mW\n", i,
+ power->supplies[i].milliwatts);
+ if (rid == normalize_power_rid(power->supplies[i].rid))
+ return be32_to_cpu(power->supplies[i].milliwatts) / 1000;
+ }
+
+ return 0;
+}
+
+static inline uint16_t convert_status_to_fault(uint16_t status)
+{
+ return status & 0x06;
+}
+
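+/*
+ * Walk the response buffer for the modifier we used, pick out the entry
+ * matching the requested resource class/id, extract the attribute we
+ * were asked for and hand the result back to the OS.
+ */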
+static void fsp_sensor_process_data(struct opal_sensor_data *attr)
+{
+ uint8_t *sensor_buf_ptr = (uint8_t *)sensor_buffer;
+ uint32_t sensor_data = INVALID_DATA;
+ __be16 sensor_mod_data[8];
+ int count;
+
+ for (count = 0; count < spcn_mod_data[attr->mod_index].entry_count;
+ count++) {
+ memcpy((void *)sensor_mod_data, sensor_buf_ptr,
+ spcn_mod_data[attr->mod_index].entry_size);
+ if (spcn_mod_data[attr->mod_index].mod == SPCN_MOD_PROC_JUNC_TEMP) {
+ /* TODO Support this modifier '0x14', if required */
+
+ } else if (spcn_mod_data[attr->mod_index].mod == SPCN_MOD_SENSOR_POWER) {
+ sensor_data = sensor_power_process_data(attr->rid,
+ (struct sensor_power *) sensor_buf_ptr);
+ break;
+ } else if (be16_to_cpu(sensor_mod_data[0]) == attr->frc &&
+ be16_to_cpu(sensor_mod_data[1]) == attr->rid) {
+ switch (attr->spcn_attr) {
+ case SENSOR_STATUS:
+ sensor_data =
+ convert_status_to_fault(be16_to_cpu(sensor_mod_data[3]));
+ break;
+ case SENSOR_THRS:
+ sensor_data = be16_to_cpu(sensor_mod_data[6]);
+ break;
+ case SENSOR_DATA:
+ sensor_data = be16_to_cpu(sensor_mod_data[2]);
+ break;
+ default:
+ break;
+ }
+
+ break;
+ }
+
+ sensor_buf_ptr += spcn_mod_data[attr->mod_index].entry_size;
+ }
+
+ *attr->sensor_data = cpu_to_be64(sensor_data);
+ if (sensor_data == INVALID_DATA)
+ queue_msg_for_delivery(OPAL_PARTIAL, attr);
+ else
+ queue_msg_for_delivery(OPAL_SUCCESS, attr);
+}
+
+static int fsp_sensor_process_read(struct fsp_msg *resp_msg)
+{
+ uint8_t mbx_rsp_status;
+ uint32_t size = 0;
+
+ mbx_rsp_status = (resp_msg->word1 >> 8) & 0xff;
+ switch (mbx_rsp_status) {
+ case SP_RSP_STATUS_VALID_DATA:
+ sensor_state = SENSOR_VALID_DATA;
+ size = fsp_msg_get_data_word(resp_msg, 1) & 0xffff;
+ break;
+ case SP_RSP_STATUS_INVALID_DATA:
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR: %s: Received invalid data\n", __func__);
+ sensor_state = SENSOR_INVALID_DATA;
+ break;
+ case SP_RSP_STATUS_SPCN_ERR:
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR: %s: Failure due to SPCN error\n", __func__);
+ sensor_state = SENSOR_SPCN_ERROR;
+ break;
+ case SP_RSP_STATUS_DMA_ERR:
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR: %s: Failure due to DMA error\n", __func__);
+ sensor_state = SENSOR_DMA_ERROR;
+ break;
+ default:
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR %s: Read failed, status:0x%02X\n",
+ __func__, mbx_rsp_status);
+ sensor_state = SENSOR_INVALID_DATA;
+ break;
+ }
+
+ return size;
+}
+
+static void queue_msg_for_delivery(int rc, struct opal_sensor_data *attr)
+{
+ prlog(PR_INSANE, "%s: rc:%d, data:%lld\n",
+ __func__, rc, *(attr->sensor_data));
+ check_sensor_read(attr->async_token);
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(attr->async_token),
+ cpu_to_be64(rc));
+ spcn_mod_data[attr->mod_index].entry_count = 0;
+ free(attr);
+ prev_msg_consumed = true;
+}
+
+static void fsp_sensor_read_complete(struct fsp_msg *msg)
+{
+ struct opal_sensor_data *attr = msg->user_data;
+ enum spcn_rsp_status status;
+ int rc, size;
+
+ prlog(PR_INSANE, "%s()\n", __func__);
+
+ status = (fsp_msg_get_data_word(msg->resp, 1) >> 24) & 0xff;
+ size = fsp_sensor_process_read(msg->resp);
+ fsp_freemsg(msg);
+
+ lock(&sensor_lock);
+ if (sensor_state == SENSOR_VALID_DATA) {
+ spcn_mod_data[attr->mod_index].entry_count += (size /
+ spcn_mod_data[attr->mod_index].entry_size);
+ attr->offset += size;
+ /* Fetch the subsequent entries of the same modifier type */
+ if (status == SPCN_RSP_STATUS_COND_SUCCESS) {
+ switch (spcn_mod_data[attr->mod_index].mod) {
+ case SPCN_MOD_PRS_STATUS_FIRST:
+ case SPCN_MOD_SENSOR_PARAM_FIRST:
+ case SPCN_MOD_SENSOR_DATA_FIRST:
+ attr->mod_index++;
+ spcn_mod_data[attr->mod_index].entry_count =
+ spcn_mod_data[attr->mod_index - 1].
+ entry_count;
+ spcn_mod_data[attr->mod_index - 1].entry_count = 0;
+ break;
+ default:
+ break;
+ }
+
+ rc = fsp_sensor_send_read_request(attr);
+ if (rc != OPAL_ASYNC_COMPLETION)
+ goto err;
+ } else { /* Notify 'powernv' of read completion */
+ fsp_sensor_process_data(attr);
+ }
+ } else {
+ rc = OPAL_INTERNAL_ERROR;
+ goto err;
+ }
+ unlock(&sensor_lock);
+ return;
+err:
+ *attr->sensor_data = cpu_to_be64(INVALID_DATA);
+ queue_msg_for_delivery(rc, attr);
+ unlock(&sensor_lock);
+ log_simple_error(&e_info(OPAL_RC_SENSOR_ASYNC_COMPLETE),
+ "SENSOR: %s: Failed to queue the "
+ "read request to fsp\n", __func__);
+}
+
+static int64_t fsp_sensor_send_read_request(struct opal_sensor_data *attr)
+{
+ int rc;
+ struct fsp_msg *msg;
+ uint32_t align;
+ uint32_t cmd_header;
+
+ if (fsp_in_rr())
+ return OPAL_BUSY;
+
+ prlog(PR_INSANE, "Get the data for modifier [%x]\n",
+ spcn_mod_data[attr->mod_index].mod);
+
+ if (spcn_mod_data[attr->mod_index].mod == SPCN_MOD_PROC_JUNC_TEMP) {
+ /* TODO Support this modifier '0x14', if required */
+ align = attr->offset % sizeof(uint32_t);
+ if (align)
+ attr->offset += (sizeof(uint32_t) - align);
+
+ /* TODO Add 8 byte command data required for mod 0x14 */
+
+ attr->offset += 8;
+
+ cmd_header = spcn_mod_data[attr->mod_index].mod << 24 |
+ SPCN_CMD_PRS << 16 | 0x0008;
+ } else {
+ cmd_header = spcn_mod_data[attr->mod_index].mod << 24 |
+ SPCN_CMD_PRS << 16;
+ }
+
+ msg = fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE, cmd_header, 0,
+ PSI_DMA_SENSOR_BUF + attr->offset);
+
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ), "SENSOR: Failed "
+ "to allocate read message\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ msg->user_data = attr;
+ rc = fsp_queue_msg(msg, fsp_sensor_read_complete);
+ if (rc) {
+ fsp_freemsg(msg);
+ msg = NULL;
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ), "SENSOR: Failed "
+ "to queue read message (%d)\n", rc);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ return OPAL_ASYNC_COMPLETION;
+}
+
+/*
+ * These are the resources we know about and for which we provide a
+ * mapping in the device tree so their data can be read by the OS.
+ * Just discard the other ones for the moment.
+ */
+static inline bool sensor_frc_is_valid(uint16_t frc)
+{
+ switch (frc) {
+ case SENSOR_FRC_POWER_SUPPLY:
+ case SENSOR_FRC_COOLING_FAN:
+ case SENSOR_FRC_AMB_TEMP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * Each attribute of a resource needs a request to the FSP to capture
+ * its data. The routine below provides the mapping between the
+ * attribute and the PRS command modifier to use.
+ *
+ * resource | data | thrs | status |
+ * ----------------+--------+--------+-----------+
+ * power_supply | POWER | | |
+ * | | | PRS |
+ * ----------------+--------+--------+-----------+
+ * amb-temp | DATA | | DATA |
+ * | | PARAM | PARAM (*) |
+ * ----------------+--------+--------+-----------+
+ * fan | DATA | | DATA (*) |
+ * | | PARAM | PARAM (*) |
+ * | | | PRS |
+ *
+ * (*) don't use the attribute given by this command modifier
+ */
+static int64_t parse_sensor_id(uint32_t handler, struct opal_sensor_data *attr)
+{
+ uint32_t mod, index;
+
+ attr->frc = sensor_get_frc(handler);
+ attr->rid = sensor_get_rid(handler);
+ attr->spcn_attr = sensor_get_attr(handler);
+
+ if (!sensor_frc_is_valid(attr->frc))
+ return OPAL_PARAMETER;
+
+ /* now compute the PRS command modifier which will be used to
+ * request a resource attribute from the FSP */
+ switch (attr->spcn_attr) {
+ case SENSOR_DATA:
+ if (attr->frc == SENSOR_FRC_POWER_SUPPLY)
+ mod = SPCN_MOD_SENSOR_POWER;
+ else
+ mod = SPCN_MOD_SENSOR_DATA_FIRST;
+ break;
+
+ case SENSOR_THRS:
+ mod = SPCN_MOD_SENSOR_PARAM_FIRST;
+ break;
+
+ case SENSOR_STATUS:
+ switch (attr->frc) {
+ case SENSOR_FRC_AMB_TEMP:
+ mod = SPCN_MOD_SENSOR_DATA_FIRST;
+ break;
+ case SENSOR_FRC_POWER_SUPPLY:
+ case SENSOR_FRC_COOLING_FAN:
+ mod = SPCN_MOD_PRS_STATUS_FIRST;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+ break;
+
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ for (index = 0; spcn_mod_data[index].mod != SPCN_MOD_LAST; index++) {
+ if (spcn_mod_data[index].mod == mod)
+ break;
+ }
+
+ attr->mod_index = index;
+ return 0;
+}
+
+
+int64_t fsp_opal_read_sensor(uint32_t sensor_hndl, int token,
+ __be64 *sensor_data)
+{
+ struct opal_sensor_data *attr;
+ int64_t rc;
+
+ prlog(PR_INSANE, "fsp_opal_read_sensor [%08x]\n", sensor_hndl);
+
+ if (fsp_in_rr())
+ return OPAL_BUSY;
+
+ if (sensor_state == SENSOR_PERMANENT_ERROR) {
+ rc = OPAL_HARDWARE;
+ goto out;
+ }
+
+ if (!sensor_hndl) {
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ lock(&sensor_lock);
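+ /* Only one sensor read is in flight at a time; prev_msg_consumed
+ * gates new requests until the previous one has been consumed. */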
+ if (prev_msg_consumed) {
+ attr = zalloc(sizeof(*attr));
+ if (!attr) {
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR: Failed to allocate memory\n");
+ rc = OPAL_NO_MEM;
+ goto out_lock;
+ }
+
+ /* Parse the sensor id and store them to the local structure */
+ rc = parse_sensor_id(sensor_hndl, attr);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR: %s: Failed to parse the sensor "
+ "handle[0x%08x]\n", __func__, sensor_hndl);
+ goto out_free;
+ }
+ /* Kernel buffer pointer to copy the data later when ready */
+ attr->sensor_data = sensor_data;
+ attr->async_token = token;
+
+ rc = fsp_sensor_send_read_request(attr);
+ if (rc != OPAL_ASYNC_COMPLETION) {
+ log_simple_error(&e_info(OPAL_RC_SENSOR_READ),
+ "SENSOR: %s: Failed to queue the read "
+ "request to fsp\n", __func__);
+ goto out_free;
+ }
+
+ prev_msg_consumed = false;
+ } else {
+ rc = OPAL_BUSY_EVENT;
+ }
+
+ unlock(&sensor_lock);
+ return rc;
+
+out_free:
+ free(attr);
+out_lock:
+ unlock(&sensor_lock);
+out:
+ return rc;
+}
+
+
+#define MAX_NAME 64
+
+static struct dt_node *sensor_get_node(struct dt_node *sensors,
+ struct sensor_header *header, const char* attrname)
+{
+ char name[MAX_NAME];
+ struct dt_node *node;
+
+ /*
+ * Just use the resource class name and resource id. This
+ * should be obvious enough for a node name.
+ */
+ snprintf(name, sizeof(name), "%s#%d-%s", frc_names[be16_to_cpu(header->frc)], be16_to_cpu(header->rid), attrname);
+
+ /*
+ * The same resources are reported by the different PRS
+ * subcommands (PRS_STATUS, SENSOR_PARAM, SENSOR_DATA). So we
+ * need to check that we did not already create the device
+ * node.
+ */
+ node = dt_find_by_path(sensors, name);
+ if (!node) {
+ prlog(PR_INFO, "SENSOR: creating node %s\n", name);
+
+ node = dt_new(sensors, name);
+
+ snprintf(name, sizeof(name), "ibm,opal-sensor-%s",
+ frc_names[be16_to_cpu(header->frc)]);
+ dt_add_property_string(node, "compatible", name);
+ } else {
+ /**
+ * @fwts-label OPALSensorNodeExists
+ * @fwts-advice OPAL had trouble creating the sensor
+ * nodes in the device tree as there was already one there.
+ * This indicates either the device tree from Hostboot
+ * already filled in sensors or an OPAL bug.
+ */
+ prlog(PR_ERR, "SENSOR: node %s exists\n", name);
+ }
+ return node;
+}
+
+#define sensor_handler(header, attr_num) \
+ sensor_make_handler(SENSOR_FSP, be16_to_cpu((header).frc), be16_to_cpu((header).rid), attr_num)
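+/*
+ * Illustration (hypothetical values): a cooling fan entry with
+ * frc = SENSOR_FRC_COOLING_FAN and rid = 3 gets a handle from this
+ * macro that parse_sensor_id() later decomposes again when the OS
+ * passes the "sensor-id" value back through fsp_opal_read_sensor().
+ */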
+
+static int add_sensor_prs(struct dt_node *sensors, struct sensor_prs *prs)
+{
+ struct dt_node *node;
+
+ node = sensor_get_node(sensors, &prs->header, "faulted");
+ if (!node)
+ return -1;
+
+ dt_add_property_cells(node, "sensor-id",
+ sensor_handler(prs->header, SENSOR_STATUS));
+ return 0;
+}
+
+static int add_sensor_param(struct dt_node *sensors, struct sensor_param *param)
+{
+ struct dt_node *node;
+
+ node = sensor_get_node(sensors, &param->header, "thrs");
+ if (!node)
+ return -1;
+
+ dt_add_property_string(node, "ibm,loc-code", param->location);
+ dt_add_property_cells(node, "sensor-id",
+ sensor_handler(param->header, SENSOR_THRS));
+ /* don't use the status coming from the response of the
+ * SENSOR_PARAM subcommand */
+ return 0;
+}
+
+static int add_sensor_data(struct dt_node *sensors,
+ struct sensor_data *data)
+{
+ struct dt_node *node;
+
+ node = sensor_get_node(sensors, &data->header, "data");
+ if (!node)
+ return -1;
+
+ dt_add_property_cells(node, "sensor-id",
+ sensor_handler(data->header, SENSOR_DATA));
+
+ /* Let's make sure we are not adding a duplicate device node.
+ * Some resources, like fans, get their status attribute from
+ * three different commands ...
+ */
+ if (be16_to_cpu(data->header.frc) == SENSOR_FRC_AMB_TEMP) {
+ node = sensor_get_node(sensors, &data->header, "faulted");
+ if (!node)
+ return -1;
+
+ dt_add_property_cells(node, "sensor-id",
+ sensor_handler(data->header, SENSOR_STATUS));
+ }
+
+ return 0;
+}
+
+static int add_sensor_power(struct dt_node *sensors, struct sensor_power *power)
+{
+ int i;
+ struct dt_node *node;
+
+ if (!sensor_power_is_valid(power))
+ return -1;
+
+ for (i = 0; i < sensor_power_count(power); i++) {
+ struct sensor_header header = {
+ cpu_to_be16(SENSOR_FRC_POWER_SUPPLY),
+ cpu_to_be16(normalize_power_rid(power->supplies[i].rid))
+ };
+
+ node = sensor_get_node(sensors, &header, "data");
+
+ prlog(PR_TRACE, "SENSOR: Power[%d] : %d mW\n",
+ power->supplies[i].rid,
+ be32_to_cpu(power->supplies[i].milliwatts));
+
+ dt_add_property_cells(node, "sensor-id",
+ sensor_handler(header, SENSOR_DATA));
+ }
+ return 0;
+}
+
+static void add_sensor_ids(struct dt_node *sensors)
+{
+ uint8_t *sensor_buf_ptr = (uint8_t *)sensor_buffer;
+ struct spcn_mod *smod;
+ int i;
+
+ for (smod = spcn_mod_data; smod->mod != SPCN_MOD_LAST; smod++) {
+ /*
+ * SPCN_MOD_SENSOR_POWER (0x1C) has a different layout.
+ */
+ if (smod->mod == SPCN_MOD_SENSOR_POWER) {
+ add_sensor_power(sensors,
+ (struct sensor_power *) sensor_buf_ptr);
+
+ sensor_buf_ptr += smod->entry_size * smod->entry_count;
+ continue;
+ }
+
+ for (i = 0; i < smod->entry_count; i++) {
+ struct sensor_header *header =
+ (struct sensor_header *) sensor_buf_ptr;
+
+ if (!sensor_frc_is_valid(be16_to_cpu(header->frc)))
+ goto out_sensor;
+
+ switch (smod->mod) {
+ case SPCN_MOD_PROC_JUNC_TEMP:
+ /* TODO Support this modifier '0x14',
+ if required */
+ break;
+
+ case SPCN_MOD_PRS_STATUS_FIRST:
+ case SPCN_MOD_PRS_STATUS_SUBS:
+ add_sensor_prs(sensors,
+ (struct sensor_prs *) header);
+ break;
+
+ case SPCN_MOD_SENSOR_PARAM_FIRST:
+ case SPCN_MOD_SENSOR_PARAM_SUBS:
+ add_sensor_param(sensors,
+ (struct sensor_param *) header);
+ break;
+
+ case SPCN_MOD_SENSOR_DATA_FIRST:
+ case SPCN_MOD_SENSOR_DATA_SUBS:
+ add_sensor_data(sensors,
+ (struct sensor_data *) header);
+
+ break;
+
+ default:
+ prerror("SENSOR: unknown modifier : %x\n",
+ smod->mod);
+ }
+
+out_sensor:
+ sensor_buf_ptr += smod->entry_size;
+ }
+ }
+}
+
+static void add_opal_sensor_node(void)
+{
+ int index;
+
+ if (!fsp_present())
+ return;
+
+ add_sensor_ids(sensor_node);
+
+ /* Reset the entry count of each modifier */
+ for (index = 0; spcn_mod_data[index].mod != SPCN_MOD_LAST;
+ index++)
+ spcn_mod_data[index].entry_count = 0;
+}
+
+void fsp_init_sensor(void)
+{
+ uint32_t cmd_header, align, size, psi_dma_offset = 0;
+ enum spcn_rsp_status status;
+ struct fsp_msg msg, resp;
+ int index, rc;
+
+ if (!fsp_present()) {
+ sensor_state = SENSOR_PERMANENT_ERROR;
+ return;
+ }
+
+ sensor_buffer = memalign(TCE_PSIZE, SENSOR_MAX_SIZE);
+ if (!sensor_buffer) {
+ log_simple_error(&e_info(OPAL_RC_SENSOR_INIT), "SENSOR: could "
+ "not allocate sensor_buffer!\n");
+ return;
+ }
+
+ /* Map TCE */
+ fsp_tce_map(PSI_DMA_SENSOR_BUF, sensor_buffer, PSI_DMA_SENSOR_BUF_SZ);
+
+ msg.resp = &resp;
+
+ /* Traverse using all the modifiers to know all the sensors available
+ * in the system */
+ for (index = 0; spcn_mod_data[index].mod != SPCN_MOD_LAST &&
+ sensor_state == SENSOR_VALID_DATA;) {
+ prlog(PR_TRACE, "Get the data for modifier [%d]\n",
+ spcn_mod_data[index].mod);
+ if (spcn_mod_data[index].mod == SPCN_MOD_PROC_JUNC_TEMP) {
+ /* TODO Support this modifier 0x14, if required */
+ align = psi_dma_offset % sizeof(uint32_t);
+ if (align)
+ psi_dma_offset += (sizeof(uint32_t) - align);
+
+ /* TODO Add 8 byte command data required for mod 0x14 */
+ psi_dma_offset += 8;
+
+ cmd_header = spcn_mod_data[index].mod << 24 |
+ SPCN_CMD_PRS << 16 | 0x0008;
+ } else {
+ cmd_header = spcn_mod_data[index].mod << 24 |
+ SPCN_CMD_PRS << 16;
+ }
+
+ fsp_fillmsg(&msg, FSP_CMD_SPCN_PASSTHRU, 4,
+ SPCN_ADDR_MODE_CEC_NODE, cmd_header, 0,
+ PSI_DMA_SENSOR_BUF + psi_dma_offset);
+
+ rc = fsp_sync_msg(&msg, false);
+ if (rc >= 0) {
+ status = (fsp_msg_get_data_word(&resp, 1) >> 24) & 0xff;
+ size = fsp_sensor_process_read(&resp);
+ psi_dma_offset += size;
+ spcn_mod_data[index].entry_count += (size /
+ spcn_mod_data[index].entry_size);
+ } else {
+ sensor_state = SENSOR_PERMANENT_ERROR;
+ break;
+ }
+
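+ /*
+ * Conditional success on a *_FIRST modifier means the FSP has
+ * more entries to return, so move on to the matching *_SUBS
+ * modifier; otherwise skip it. A *_SUBS modifier is repeated
+ * until the FSP stops returning conditional success.
+ */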
+ switch (spcn_mod_data[index].mod) {
+ case SPCN_MOD_PRS_STATUS_FIRST:
+ case SPCN_MOD_SENSOR_PARAM_FIRST:
+ case SPCN_MOD_SENSOR_DATA_FIRST:
+ if (status == SPCN_RSP_STATUS_COND_SUCCESS)
+ index++;
+ else
+ index += 2;
+
+ break;
+ case SPCN_MOD_PRS_STATUS_SUBS:
+ case SPCN_MOD_SENSOR_PARAM_SUBS:
+ case SPCN_MOD_SENSOR_DATA_SUBS:
+ if (status != SPCN_RSP_STATUS_COND_SUCCESS)
+ index++;
+ break;
+ case SPCN_MOD_SENSOR_POWER:
+ index++;
+ default:
+ break;
+ }
+ }
+
+ if (sensor_state != SENSOR_VALID_DATA)
+ sensor_state = SENSOR_PERMANENT_ERROR;
+ else
+ add_opal_sensor_node();
+}
diff --git a/roms/skiboot/hw/fsp/fsp-surveillance.c b/roms/skiboot/hw/fsp/fsp-surveillance.c
new file mode 100644
index 000000000..84e6878f3
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-surveillance.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * We don't want to go on the cart!
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <lock.h>
+#include <processor.h>
+#include <timebase.h>
+#include <fsp-sysparam.h>
+#include <errorlog.h>
+#include <opal-api.h>
+
+static bool fsp_surv_state = false;
+static bool fsp_surv_ack_pending = false;
+static u64 surv_timer;
+static u64 surv_ack_timer;
+static u32 surv_state_param;
+static struct lock surv_lock = LOCK_UNLOCKED;
+
+#define FSP_SURV_ACK_TIMEOUT 120 /* surv ack timeout in seconds */
+
+DEFINE_LOG_ENTRY(OPAL_RC_SURVE_INIT, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE,
+ OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_MISCELLANEOUS_INFO_ONLY);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SURVE_STATUS, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE,
+ OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_MISCELLANEOUS_INFO_ONLY);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SURVE_ACK, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE,
+ OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_MISCELLANEOUS_INFO_ONLY);
+
+static void fsp_surv_ack(struct fsp_msg *msg)
+{
+ uint8_t val;
+
+ if (!msg->resp)
+ return;
+
+ val = (msg->resp->word1 >> 8) & 0xff;
+ if (val == 0) {
+ /* reset the pending flag */
+ prlog(PR_TRACE,
+ "SURV: Received heartbeat acknowledge from FSP\n");
+ lock(&surv_lock);
+ fsp_surv_ack_pending = false;
+ unlock(&surv_lock);
+ } else {
+ /**
+ * @fwts-label FSPHeartbeatAckError
+ * @fwts-advice Error in acknowledging heartbeat to FSP.
+ * This could mean the FSP has gone away or it may mean
+ * the FSP may kill us for missing too many heartbeats.
+ */
+ prlog(PR_ERR,
+ "SURV: Heartbeat Acknowledgment error from FSP\n");
+ }
+
+ fsp_freemsg(msg);
+}
+
+static void fsp_surv_check_timeout(void)
+{
+ u64 now = mftb();
+
+ /*
+ * We just checked fsp_surv_ack_pending to be true in fsp_surv_hbeat
+ * and we haven't dropped the surv_lock between then and now. So, we
+ * just go ahead and check timeouts.
+ */
+ if (tb_compare(now, surv_ack_timer) == TB_AAFTERB) {
+ uint32_t plid = log_simple_error(&e_info(OPAL_RC_SURVE_ACK),
+ "SURV: Surv ACK timed out; initiating R/R\n");
+
+ /* Reset the pending trigger too */
+ fsp_surv_ack_pending = false;
+ fsp_trigger_reset(plid);
+ }
+
+ return;
+}
+
+/* Send surveillance heartbeat based on a timebase trigger */
+static void fsp_surv_hbeat(void)
+{
+ u64 now = mftb();
+ struct fsp_msg *msg;
+
+ /* Check if an ack is pending... if so, don't send the ping just yet */
+ if (fsp_surv_ack_pending) {
+ fsp_surv_check_timeout();
+ return;
+ }
+
+ /* add timebase callbacks */
+ /*
+ * XXX This packet needs to be pushed to the FSP at an interval
+ * shorter than the 120s advertised to the FSP.
+ *
+ * Verify that the command building format and call are correct.
+ */
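+ /*
+ * Timing sketch from the code below: the heartbeat advertises a
+ * 120s interval to the FSP, is re-sent every 60s, and the ack is
+ * given FSP_SURV_ACK_TIMEOUT seconds before fsp_surv_check_timeout()
+ * triggers a host initiated reset.
+ */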
+ if (surv_timer == 0 ||
+ (tb_compare(now, surv_timer) == TB_AAFTERB) ||
+ (tb_compare(now, surv_timer) == TB_AEQUALB)) {
+ prlog(PR_TRACE,
+ "SURV: Sending the heartbeat command to FSP\n");
+ msg = fsp_mkmsg(FSP_CMD_SURV_HBEAT, 1, 120);
+ if (!msg) {
+ prerror("SURV: Failed to allocate heartbeat msg\n");
+ return;
+ }
+ if (fsp_queue_msg(msg, fsp_surv_ack)) {
+ fsp_freemsg(msg);
+ prerror("SURV: Failed to queue heartbeat msg\n");
+ } else {
+ fsp_surv_ack_pending = true;
+ surv_timer = now + secs_to_tb(60);
+ surv_ack_timer = now + secs_to_tb(FSP_SURV_ACK_TIMEOUT);
+ }
+ }
+}
+
+static void fsp_surv_poll(void *data __unused)
+{
+ if (!fsp_surv_state)
+ return;
+ lock(&surv_lock);
+ fsp_surv_hbeat();
+ unlock(&surv_lock);
+}
+
+static void fsp_surv_got_param(uint32_t param_id __unused, int err_len,
+ void *data __unused)
+{
+ if (err_len != 4) {
+ uint32_t plid = log_simple_error(&e_info(OPAL_RC_SURVE_STATUS),
+ "SURV: Error (%d) retrieving surv status; initiating R/R\n",
+ err_len);
+ fsp_trigger_reset(plid);
+ return;
+ }
+
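+ /* The FSP returns the parameter big-endian; convert it in place
+ * before testing the enable bit (bit 0). */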
+ surv_state_param = be32_to_cpu((__be32)surv_state_param);
+ if (!(surv_state_param & 0x01)) {
+ prlog(PR_NOTICE, "SURV: Status from FSP: disabled\n");
+ return;
+ }
+ prlog(PR_NOTICE, "SURV: Status from FSP: enabled\n");
+
+ lock(&surv_lock);
+ fsp_surv_state = true;
+
+ /* Also send one heartbeat now. The next one will not happen
+ * until we hit the OS.
+ */
+ fsp_surv_hbeat();
+ unlock(&surv_lock);
+}
+
+void fsp_surv_query(void)
+{
+ int rc;
+
+ printf("SURV: Querying FSP's surveillance status\n");
+
+ /* Reset surveillance settings */
+ lock(&surv_lock);
+ fsp_surv_state = false;
+ surv_timer = 0;
+ surv_ack_timer = 0;
+ unlock(&surv_lock);
+
+ /* Query FSP for surveillance state */
+ rc = fsp_get_sys_param(SYS_PARAM_SURV, &surv_state_param, 4,
+ fsp_surv_got_param, NULL);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SURVE_INIT),
+ "SURV: Error %d queueing param request\n", rc);
+ }
+}
+
+static bool fsp_surv_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ assert(msg == NULL);
+
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ printf("SURV: Disabling surveillance\n");
+ lock(&surv_lock);
+ fsp_surv_state = false;
+ fsp_surv_ack_pending = false;
+ unlock(&surv_lock);
+ return true;
+ case FSP_RELOAD_COMPLETE:
+ fsp_surv_query();
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_surv_client_rr = {
+ .message = fsp_surv_msg_rr,
+};
+
+/* This is called at boot time */
+void fsp_init_surveillance(void)
+{
+ /* Always register the poller, so we don't have to add/remove
+ * it on reset-reload or change of surveillance state. Also the
+ * poller list has no locking so we don't want to play with it
+ * at runtime.
+ */
+ opal_add_poller(fsp_surv_poll, NULL);
+
+ /* Register for the reset/reload event */
+ fsp_register_client(&fsp_surv_client_rr, FSP_MCLASS_RR_EVENT);
+
+ /* Send query to FSP */
+ fsp_surv_query();
+}
+
diff --git a/roms/skiboot/hw/fsp/fsp-sysdump.c b/roms/skiboot/hw/fsp/fsp-sysdump.c
new file mode 100644
index 000000000..cd8744062
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-sysdump.c
@@ -0,0 +1,407 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Sapphire dump design:
+ * - During initialization we setup Memory Dump Source Table (MDST) table
+ * which contains address, size pair.
+ * - We send MDST table update notification to FSP via MBOX command.
+ * - During Sapphire checkstop:
+ * - FSP retrieves HWDUMP.
+ * - FSP retrieves CEC memory based on MDST table.
+ * - Once Sapphire reboots, FSP sends a new dump available notification via HDAT
+ *
+ * Copyright 2013-2016 IBM Corp.
+ */
+
+#include <fsp.h>
+#include <psi.h>
+#include <opal.h>
+#include <lock.h>
+#include <skiboot.h>
+#include <errorlog.h>
+#include <opal-dump.h>
+
+/*
+ * Sapphire dump size
+ * This is the maximum memory that FSP can retrieve during checkstop.
+ *
+ * Note:
+ * Presently we are hardcoding this parameter. Eventually we need
+ * new System parameter so that we can get max size dynamically.
+ */
+#define MAX_SAPPHIRE_DUMP_SIZE 0x1000000
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_UPDATE, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_ADD, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_REMOVE, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA);
+
+
+static struct mdst_table *mdst_table;
+static struct mdst_table *dump_mem_region;
+
+static int cur_mdst_entry;
+static int max_mdst_entry;
+static int cur_dump_size;
+/*
+ * Presently both sizes are the same, but if someday the FSP gives us
+ * more space than our TCE mapping we will need this validation.
+ *
+ * Also, once the FSP implements a MAX_SAPPHIRE_DUMP_SIZE system
+ * parameter, we can move this validation to a separate function.
+ */
+static int max_dump_size = MIN(MAX_SAPPHIRE_DUMP_SIZE, PSI_DMA_HYP_DUMP_SIZE);
+
+/* Protect MDST table entries */
+static struct lock mdst_lock = LOCK_UNLOCKED;
+
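+/*
+ * The mapped size covers whole TCE pages. For illustration (assuming a
+ * 4KB TCE_PSIZE): addr = 0x1234, size = 0x100 covers 0x1000..0x2000,
+ * so get_dump_region_map_size() returns 0x1000.
+ */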
+static inline uint32_t get_dump_region_map_size(uint64_t addr, uint32_t size)
+{
+ uint64_t start, end;
+
+ start = addr & ~TCE_MASK;
+ end = addr + size;
+ end = ALIGN_UP(end, TCE_PSIZE);
+
+ return (end - start);
+}
+
+static int dump_region_tce_map(void)
+{
+ int i;
+ uint32_t t_size = 0, size;
+ uint64_t addr;
+
+ for (i = 0; i < cur_mdst_entry; i++) {
+
+ addr = be64_to_cpu(dump_mem_region[i].addr) & ~TCE_MASK;
+ size = get_dump_region_map_size(be64_to_cpu(dump_mem_region[i].addr),
+ be32_to_cpu(dump_mem_region[i].size));
+
+ if (t_size + size > max_dump_size)
+ break;
+
+ /* TCE mapping */
+ fsp_tce_map(PSI_DMA_HYP_DUMP + t_size, (void *)addr, size);
+
+ /* Add entry to MDST table */
+ mdst_table[i].data_region = dump_mem_region[i].data_region;
+ mdst_table[i].size = dump_mem_region[i].size;
+ mdst_table[i].addr = cpu_to_be64(PSI_DMA_HYP_DUMP + t_size);
+
+ /* TCE alignment adjustment */
+ mdst_table[i].addr = cpu_to_be64(be64_to_cpu(mdst_table[i].addr) +
+ (be64_to_cpu(dump_mem_region[i].addr) & 0xfff));
+
+ t_size += size;
+ }
+
+ return i;
+}
+
+static inline void dump_region_tce_unmap(void)
+{
+ fsp_tce_unmap(PSI_DMA_HYP_DUMP, PSI_DMA_HYP_DUMP_SIZE);
+}
+
+static void update_mdst_table_complete(struct fsp_msg *msg)
+{
+ uint8_t status = (msg->resp->word1 >> 8) & 0xff;
+
+ if (status)
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_UPDATE),
+ "MDST: Update table MBOX command failed: "
+ "0x%x\n", status);
+ else
+ printf("MDST: Table updated.\n");
+
+ fsp_freemsg(msg);
+}
+
+/* Send MDST table to FSP */
+static int64_t fsp_update_mdst_table(void)
+{
+ struct fsp_msg *msg;
+ int count;
+ int rc = OPAL_SUCCESS;
+
+ if (cur_mdst_entry <= 0) {
+ printf("MDST: Table is empty\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ lock(&mdst_lock);
+
+ /* Unmap previous mapping */
+ dump_region_tce_unmap();
+ count = dump_region_tce_map();
+
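+ /* Data words sent to the FSP: 0, the TCE address of the MDST
+ * table, the total table size in bytes, and the size of one
+ * entry. */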
+ msg = fsp_mkmsg(FSP_CMD_HYP_MDST_TABLE, 4, 0,
+ PSI_DMA_MDST_TABLE,
+ sizeof(*mdst_table) * count,
+ sizeof(*mdst_table));
+ unlock(&mdst_lock);
+
+ if (!msg) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_UPDATE),
+ "MDST: Message allocation failed.!\n");
+ rc = OPAL_INTERNAL_ERROR;
+ } else if (fsp_queue_msg(msg, update_mdst_table_complete)) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_UPDATE),
+ "MDST: Failed to queue MDST table message.\n");
+ fsp_freemsg(msg);
+ rc = OPAL_INTERNAL_ERROR;
+ }
+ return rc;
+}
+
+static int dump_region_del_entry(uint32_t id)
+{
+ int i;
+ uint32_t size;
+ bool found = false;
+ int rc = OPAL_SUCCESS;
+
+ lock(&mdst_lock);
+
+ for (i = 0; i < cur_mdst_entry; i++) {
+ if (dump_mem_region[i].data_region != id)
+ continue;
+
+ found = true;
+ break;
+ }
+
+ if (!found) {
+ rc = OPAL_PARAMETER;
+ goto del_out;
+ }
+
+ /* Adjust current dump size */
+ size = get_dump_region_map_size(be64_to_cpu(dump_mem_region[i].addr),
+ be32_to_cpu(dump_mem_region[i].size));
+ cur_dump_size -= size;
+
+ for ( ; i < cur_mdst_entry - 1; i++)
+ dump_mem_region[i] = dump_mem_region[i + 1];
+
+ dump_mem_region[i].data_region = 0;
+ cur_mdst_entry--;
+
+del_out:
+ unlock(&mdst_lock);
+ return rc;
+}
+
+/* Add entry to MDST table */
+static int __dump_region_add_entry(uint32_t id, uint64_t addr, uint32_t size)
+{
+ int rc = OPAL_INTERNAL_ERROR;
+ uint32_t act_size;
+
+ /* Delete function takes lock before modifying table */
+ dump_region_del_entry(id);
+
+ lock(&mdst_lock);
+
+ if (cur_mdst_entry >= max_mdst_entry) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_ADD),
+ "MDST: Table is full.\n");
+ goto out;
+ }
+
+ /* TCE alignment adjustment */
+ act_size = get_dump_region_map_size(addr, size);
+
+ /* Make sure we don't cross dump size limit */
+ if (cur_dump_size + act_size > max_dump_size) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_ADD),
+ "MDST: 0x%x is crossing max dump size (0x%x) limit.\n",
+ cur_dump_size + act_size, max_dump_size);
+ goto out;
+ }
+
+ /* Add entry to dump memory region table */
+ dump_mem_region[cur_mdst_entry].data_region = (u8)id;
+ dump_mem_region[cur_mdst_entry].addr = cpu_to_be64(addr);
+ dump_mem_region[cur_mdst_entry].size = cpu_to_be32(size);
+
+ /* Update dump region count and dump size */
+ cur_mdst_entry++;
+ cur_dump_size += act_size;
+
+ printf("MDST: Addr = 0x%llx [size : 0x%x bytes] added to MDST table.\n",
+ (uint64_t)addr, size);
+
+ rc = OPAL_SUCCESS;
+
+out:
+ unlock(&mdst_lock);
+ return rc;
+}
+
+static int dump_region_add_entries(void)
+{
+ int rc;
+
+ /* Add console buffer */
+ rc = __dump_region_add_entry(DUMP_REGION_CONSOLE,
+ INMEM_CON_START, INMEM_CON_LEN);
+ if (rc)
+ return rc;
+
+ /* Add HBRT buffer */
+ rc = __dump_region_add_entry(DUMP_REGION_HBRT_LOG,
+ HBRT_CON_START, HBRT_CON_LEN);
+
+ return rc;
+}
+
+static int64_t fsp_opal_register_dump_region(uint32_t id,
+ uint64_t addr, uint64_t size)
+{
+ int rc = OPAL_SUCCESS;
+
+ if (!fsp_present())
+ return OPAL_UNSUPPORTED;
+
+ /* Validate memory region id */
+ if (id < DUMP_REGION_HOST_START || id > DUMP_REGION_HOST_END) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_ADD),
+ "MDST: Invalid dump region id : 0x%x\n", id);
+ return OPAL_PARAMETER;
+ }
+
+ if (size <= 0) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_ADD),
+ "MDST: Invalid size : 0x%llx\n", size);
+ return OPAL_PARAMETER;
+ }
+
+ rc = __dump_region_add_entry(id, addr, size);
+ if (rc)
+ return rc;
+
+ /* Send updated MDST to FSP */
+ rc = fsp_update_mdst_table();
+
+ return rc;
+}
+
+static int64_t fsp_opal_unregister_dump_region(uint32_t id)
+{
+ int rc = OPAL_SUCCESS;
+
+ if (!fsp_present())
+ return OPAL_UNSUPPORTED;
+
+ /* Validate memory region id */
+ if (id < DUMP_REGION_HOST_START || id > DUMP_REGION_HOST_END) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_REMOVE),
+ "MDST: Invalid dump region id : 0x%x\n", id);
+ return OPAL_PARAMETER;
+ }
+
+ rc = dump_region_del_entry(id);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_REMOVE),
+ "MDST: dump region id : 0x%x not found\n", id);
+ return OPAL_PARAMETER;
+ }
+
+ /* Send updated MDST to FSP */
+ rc = fsp_update_mdst_table();
+
+ return rc;
+}
+
+/* TCE mapping */
+static inline void mdst_table_tce_map(void)
+{
+ fsp_tce_map(PSI_DMA_MDST_TABLE, mdst_table, PSI_DMA_MDST_TABLE_SIZE);
+}
+
+/* Initialize MDST table */
+static int mdst_table_init(void)
+{
+ dump_mem_region = memalign(TCE_PSIZE, PSI_DMA_MDST_TABLE_SIZE);
+ if (!dump_mem_region) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_INIT),
+ "MDST: Failed to allocate memory for dump "
+ "memory region table.\n");
+ return -ENOMEM;
+ }
+
+ memset(dump_mem_region, 0, PSI_DMA_MDST_TABLE_SIZE);
+
+ mdst_table = memalign(TCE_PSIZE, PSI_DMA_MDST_TABLE_SIZE);
+ if (!mdst_table) {
+ log_simple_error(&e_info(OPAL_RC_DUMP_MDST_INIT),
+ "MDST: Failed to allocate memory for MDST table.\n");
+ return -ENOMEM;
+ }
+
+ memset(mdst_table, 0, PSI_DMA_MDST_TABLE_SIZE);
+ mdst_table_tce_map();
+
+ max_mdst_entry = PSI_DMA_MDST_TABLE_SIZE / sizeof(*mdst_table);
+ printf("MDST: Max entries in MDST table : %d\n", max_mdst_entry);
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Handle FSP R/R event.
+ */
+static bool fsp_mdst_update_rr(uint32_t cmd_sub_mod,
+ struct fsp_msg *msg __unused)
+{
+ switch (cmd_sub_mod) {
+ case FSP_RESET_START:
+ return true;
+ case FSP_RELOAD_COMPLETE: /* Send MDST to FSP */
+ fsp_update_mdst_table();
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_mdst_client_rr = {
+ .message = fsp_mdst_update_rr,
+};
+
+/* Initialize MDST table and send notification to FSP */
+void fsp_mdst_table_init(void)
+{
+ if (!fsp_present())
+ return;
+
+ /* OPAL interface */
+ opal_register(OPAL_REGISTER_DUMP_REGION,
+ fsp_opal_register_dump_region, 3);
+ opal_register(OPAL_UNREGISTER_DUMP_REGION,
+ fsp_opal_unregister_dump_region, 1);
+
+ /* Initiate MDST */
+ if (mdst_table_init() != OPAL_SUCCESS)
+ return;
+
+ /*
+ * Ignore the return code from dump_region_add_entries() so that
+ * we can at least capture a partial dump.
+ */
+ dump_region_add_entries();
+ fsp_update_mdst_table();
+
+ /* Register for Class AA (FSP R/R) */
+ fsp_register_client(&fsp_mdst_client_rr, FSP_MCLASS_RR_EVENT);
+}
diff --git a/roms/skiboot/hw/fsp/fsp-sysparam.c b/roms/skiboot/hw/fsp/fsp-sysparam.c
new file mode 100644
index 000000000..adb424e5e
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp-sysparam.c
@@ -0,0 +1,508 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * There are some system-level parameters that aren't available over
+ * IPMI or NVRAM but that the FSP exposes through this interface.
+ *
+ * We expose these through an OPAL API as there really isn't any other/better
+ * way of doing so.
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <opal.h>
+#include <device.h>
+#include <lock.h>
+#include <processor.h>
+#include <psi.h>
+#include <opal-msg.h>
+#include <fsp-sysparam.h>
+
+struct sysparam_comp_data {
+ uint32_t param_len;
+ uint64_t async_token;
+};
+
+struct sysparam_req {
+ sysparam_compl_t completion;
+ void *comp_data;
+ void *ubuf;
+ uint32_t ulen;
+ struct fsp_msg msg;
+ struct fsp_msg resp;
+ bool done;
+};
+
+static struct sysparam_attr {
+ const char *name;
+ uint32_t id;
+ uint32_t length;
+ uint8_t perm;
+} sysparam_attrs[] = {
+#define _R OPAL_SYSPARAM_READ
+#define _W OPAL_SYSPARAM_WRITE
+#define _RW OPAL_SYSPARAM_RW
+ {"surveillance", SYS_PARAM_SURV, 4, _RW},
+ {"hmc-management", SYS_PARAM_HMC_MANAGED, 4, _R},
+ {"cupd-policy", SYS_PARAM_FLASH_POLICY, 4, _RW},
+ {"plat-hmc-managed", SYS_PARAM_NEED_HMC, 4, _RW},
+ {"fw-license-policy", SYS_PARAM_FW_LICENSE, 4, _RW},
+ {"world-wide-port-num", SYS_PARAM_WWPN, 12, _W},
+ {"default-boot-device", SYS_PARAM_DEF_BOOT_DEV, 1, _RW},
+ {"next-boot-device", SYS_PARAM_NEXT_BOOT_DEV,1, _RW},
+ {"console-select", SYS_PARAM_CONSOLE_SELECT,1, _RW},
+ {"boot-device-path", SYS_PARAM_BOOT_DEV_PATH,48, _RW}
+#undef _R
+#undef _W
+#undef _RW
+};
+
+static int fsp_sysparam_process(struct sysparam_req *r)
+{
+ u32 param_id, len;
+ int stlen = 0;
+ u8 fstat;
+ /* Snapshot completion before we set the "done" flag */
+ sysparam_compl_t comp = r->completion;
+ void *cdata = r->comp_data;
+
+ if (r->msg.state != fsp_msg_done) {
+ prerror("FSP: Request for sysparam 0x%x got FSP failure!\n",
+ fsp_msg_get_data_word(&r->msg, 0));
+ stlen = -1; /* XXX Find saner error codes */
+ goto complete;
+ }
+
+ param_id = fsp_msg_get_data_word(&r->resp, 0);
+ len = fsp_msg_get_data_word(&r->resp, 1) & 0xffff;
+
+ /* Check params validity */
+ if (param_id != fsp_msg_get_data_word(&r->msg, 0)) {
+ prerror("FSP: Request for sysparam 0x%x got resp. for 0x%x!\n",
+ fsp_msg_get_data_word(&r->msg, 0), param_id);
+ stlen = -2; /* XXX Sane error codes */
+ goto complete;
+ }
+ if (len > r->ulen) {
+ prerror("FSP: Request for sysparam 0x%x truncated!\n",
+ param_id);
+ len = r->ulen;
+ }
+
+ /* Decode the request status */
+ fstat = (r->msg.resp->word1 >> 8) & 0xff;
+ switch(fstat) {
+ case 0x00: /* XXX Is that even possible ? */
+ case 0x11: /* Data in request */
+ memcpy(r->ubuf, &r->resp.data.bytes[8], len);
+ /* fallthrough */
+ case 0x12: /* Data in TCE */
+ stlen = len;
+ break;
+ default:
+ stlen = -fstat;
+ }
+ complete:
+ /* Call completion if any */
+ if (comp)
+ comp(fsp_msg_get_data_word(&r->msg, 0), stlen, cdata);
+
+ free(r);
+
+ return stlen;
+}
+
+static void fsp_sysparam_get_complete(struct fsp_msg *msg)
+{
+ struct sysparam_req *r = container_of(msg, struct sysparam_req, msg);
+
+ /* If it's an asynchronous request, process it now */
+ if (r->completion) {
+ fsp_sysparam_process(r);
+ return;
+ }
+
+ /* Else just set the done flag */
+
+ /* Another CPU can be polling on the "done" flag without the
+ * lock held, so let's order the updates to the structure
+ */
+ lwsync();
+ r->done = true;
+}
+
+int fsp_get_sys_param(uint32_t param_id, void *buffer, uint32_t length,
+ sysparam_compl_t async_complete, void *comp_data)
+{
+ struct sysparam_req *r;
+ uint64_t baddr, tce_token;
+ int rc;
+
+ if (!fsp_present())
+ return -ENODEV;
+ /*
+ * XXX FIXME: We currently always allocate the sysparam_req here
+ * however, we want to avoid runtime allocations as much as
+ * possible, so if this is going to be used a lot at runtime,
+ * we probably want to pre-allocate a pool of these
+ */
+ if (length > 4096)
+ return -EINVAL;
+ r = zalloc(sizeof(struct sysparam_req));
+ if (!r)
+ return -ENOMEM;
+ r->completion = async_complete;
+ r->comp_data = comp_data;
+ r->done = false;
+ r->ubuf = buffer;
+ r->ulen = length;
+ r->msg.resp = &r->resp;
+
+ /* Always map 1 page ... it's easier that way and none of this
+ * is performance critical
+ */
+ baddr = (uint64_t)buffer;
+ fsp_tce_map(PSI_DMA_GET_SYSPARAM, (void *)(baddr & ~0xffful), 0x1000);
+ tce_token = PSI_DMA_GET_SYSPARAM | (baddr & 0xfff);
+ fsp_fillmsg(&r->msg, FSP_CMD_QUERY_SPARM, 3,
+ param_id, length, tce_token);
+ rc = fsp_queue_msg(&r->msg, fsp_sysparam_get_complete);
+
+ if (rc)
+ free(r);
+
+ /* Asynchronous operation or queueing failure, return */
+ if (rc || async_complete)
+ return rc;
+
+ /* Synchronous operation requested, spin and process */
+ while(!r->done)
+ opal_run_pollers();
+
+ /* Will free the request */
+ return fsp_sysparam_process(r);
+}
+
+static void fsp_opal_getparam_complete(uint32_t param_id __unused, int err_len,
+ void *data)
+{
+ struct sysparam_comp_data *comp_data = data;
+ int rc = OPAL_SUCCESS;
+
+ if (comp_data->param_len != err_len)
+ rc = OPAL_INTERNAL_ERROR;
+
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(comp_data->async_token),
+ cpu_to_be64(rc));
+ free(comp_data);
+}
+
+static void fsp_opal_setparam_complete(struct fsp_msg *msg)
+{
+ struct sysparam_comp_data *comp_data = msg->user_data;
+ u8 fstat;
+ uint32_t param_id;
+ int rc = OPAL_SUCCESS;
+
+ if (msg->state != fsp_msg_done) {
+ prerror("FSP: Request for set sysparam 0x%x got FSP failure!\n",
+ fsp_msg_get_data_word(msg, 0));
+ rc = OPAL_INTERNAL_ERROR;
+ goto out;
+ }
+
+ param_id = fsp_msg_get_data_word(msg->resp, 0);
+ if (param_id != fsp_msg_get_data_word(msg, 0)) {
+ prerror("FSP: Request for set sysparam 0x%x got resp. for 0x%x!"
+ "\n", fsp_msg_get_data_word(msg, 0), param_id);
+ rc = OPAL_INTERNAL_ERROR;
+ goto out;
+ }
+
+ fstat = (msg->resp->word1 >> 8) & 0xff;
+ switch (fstat) {
+ case 0x00:
+ rc = OPAL_SUCCESS;
+ break;
+ case 0x22:
+ prerror("%s: Response status 0x%x, invalid data\n", __func__,
+ fstat);
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+ case 0x24:
+ prerror("%s: Response status 0x%x, DMA error\n", __func__,
+ fstat);
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+ default:
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+ }
+
+out:
+ opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(comp_data->async_token),
+ cpu_to_be64(rc));
+ free(comp_data);
+ fsp_freemsg(msg);
+}
+
+/* OPAL interface for PowerNV to read the system parameter from FSP */
+static int64_t fsp_opal_get_param(uint64_t async_token, uint32_t param_id,
+ uint64_t buffer, uint64_t length)
+{
+ struct sysparam_comp_data *comp_data;
+ int count, rc, i;
+
+ if (!fsp_present())
+ return OPAL_HARDWARE;
+
+ count = ARRAY_SIZE(sysparam_attrs);
+ for (i = 0; i < count; i++)
+ if (sysparam_attrs[i].id == param_id)
+ break;
+ if (i == count)
+ return OPAL_PARAMETER;
+
+ if (length < sysparam_attrs[i].length)
+ return OPAL_PARAMETER;
+ if (!(sysparam_attrs[i].perm & OPAL_SYSPARAM_READ))
+ return OPAL_PERMISSION;
+
+ comp_data = zalloc(sizeof(struct sysparam_comp_data));
+ if (!comp_data)
+ return OPAL_NO_MEM;
+
+ comp_data->param_len = sysparam_attrs[i].length;
+ comp_data->async_token = async_token;
+ rc = fsp_get_sys_param(param_id, (void *)buffer,
+ sysparam_attrs[i].length, fsp_opal_getparam_complete,
+ comp_data);
+ if (rc) {
+ free(comp_data);
+ prerror("%s: Error %d queuing param request\n", __func__, rc);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ return OPAL_ASYNC_COMPLETION;
+}
+
+/* OPAL interface for PowerNV to update the system parameter to FSP */
+static int64_t fsp_opal_set_param(uint64_t async_token, uint32_t param_id,
+ uint64_t buffer, uint64_t length)
+{
+ struct sysparam_comp_data *comp_data;
+ struct fsp_msg *msg;
+ uint64_t tce_token;
+ int count, rc, i;
+
+ if (!fsp_present())
+ return OPAL_HARDWARE;
+
+ count = ARRAY_SIZE(sysparam_attrs);
+ for (i = 0; i < count; i++)
+ if (sysparam_attrs[i].id == param_id)
+ break;
+ if (i == count)
+ return OPAL_PARAMETER;
+
+ if (length < sysparam_attrs[i].length)
+ return OPAL_PARAMETER;
+ if (!(sysparam_attrs[i].perm & OPAL_SYSPARAM_WRITE))
+ return OPAL_PERMISSION;
+
+ fsp_tce_map(PSI_DMA_SET_SYSPARAM, (void *)(buffer & ~0xffful), 0x1000);
+ tce_token = PSI_DMA_SET_SYSPARAM | (buffer & 0xfff);
+
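+ /* The 64-bit TCE token is passed to the FSP as two 32-bit data
+ * words (high then low). */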
+ msg = fsp_mkmsg(FSP_CMD_SET_SPARM_2, 4, param_id, length,
+ tce_token >> 32, tce_token);
+ if (!msg) {
+ prerror("%s: Failed to allocate the message\n", __func__);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ comp_data = zalloc(sizeof(struct sysparam_comp_data));
+ if (!comp_data) {
+ fsp_freemsg(msg);
+ return OPAL_NO_MEM;
+ }
+
+ comp_data->param_len = length;
+ comp_data->async_token = async_token;
+ msg->user_data = comp_data;
+
+ rc = fsp_queue_msg(msg, fsp_opal_setparam_complete);
+ if (rc) {
+ free(comp_data);
+ fsp_freemsg(msg);
+ prerror("%s: Failed to queue the message\n", __func__);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ return OPAL_ASYNC_COMPLETION;
+}
+
+struct sysparam_notify_entry {
+ struct list_node link;
+ sysparam_update_notify notify;
+};
+
+static LIST_HEAD(sysparam_update_notifiers);
+
+/* Add client to notifier chain */
+void sysparam_add_update_notifier(sysparam_update_notify notify)
+{
+ struct sysparam_notify_entry *entry;
+
+ entry = zalloc(sizeof(struct sysparam_notify_entry));
+ assert(entry);
+
+ entry->notify = notify;
+ list_add_tail(&sysparam_update_notifiers, &entry->link);
+}
+
+/* Remove client from notifier chain */
+void sysparam_del_update_notifier(sysparam_update_notify notify)
+{
+ struct sysparam_notify_entry *entry;
+
+ list_for_each(&sysparam_update_notifiers, entry, link) {
+ if (entry->notify == notify) {
+ list_del(&entry->link);
+ free(entry);
+ return;
+ }
+ }
+}
+
+/* Update notification chain */
+static void sysparam_run_update_notifier(struct fsp_msg *msg)
+{
+ bool ret;
+ struct sysparam_notify_entry *entry;
+
+ list_for_each(&sysparam_update_notifiers, entry, link) {
+ ret = entry->notify(msg);
+ if (ret == true)
+ break;
+ }
+}
+
+static bool fsp_sysparam_msg(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ struct fsp_msg *rsp;
+ int rc = -ENOMEM;
+
+ switch(cmd_sub_mod) {
+ case FSP_CMD_SP_SPARM_UPD_0:
+ case FSP_CMD_SP_SPARM_UPD_1:
+ printf("FSP: Got sysparam update, param ID 0x%x\n",
+ fsp_msg_get_data_word(msg, 0));
+
+ sysparam_run_update_notifier(msg);
+
+ rsp = fsp_mkmsg((cmd_sub_mod & 0xffff00) | 0x008000, 0);
+ if (rsp)
+ rc = fsp_queue_msg(rsp, fsp_freemsg);
+ if (rc) {
+ prerror("FSP: Error %d queuing sysparam reply\n", rc);
+ /* What to do here ? R/R ? */
+ fsp_freemsg(rsp);
+ }
+ return true;
+ }
+ return false;
+}
+
+static struct fsp_client fsp_sysparam_client = {
+ .message = fsp_sysparam_msg,
+};
+
+static void add_opal_sysparam_node(void)
+{
+ struct dt_node *sysparams;
+ char *names, *s;
+ __be32 *ids, *lens;
+ uint8_t *perms;
+ unsigned int i, count, size = 0;
+
+ if (!fsp_present())
+ return;
+
+ sysparams = dt_new(opal_node, "sysparams");
+ dt_add_property_string(sysparams, "compatible", "ibm,opal-sysparams");
+
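+ /*
+ * "param-name" is built as NUL-terminated strings packed back to
+ * back; "param-id", "param-len" and "param-perm" are parallel
+ * arrays indexed the same way.
+ */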
+ count = ARRAY_SIZE(sysparam_attrs);
+ for (i = 0; i < count; i++)
+ size = size + strlen(sysparam_attrs[i].name) + 1;
+
+ names = zalloc(size);
+ if (!names) {
+ prerror("%s: Failed to allocate memory for parameter names\n",
+ __func__);
+ return;
+ }
+
+ ids = zalloc(count * sizeof(*ids));
+ if (!ids) {
+ prerror("%s: Failed to allocate memory for parameter ids\n",
+ __func__);
+ goto out_free_name;
+ }
+
+ lens = zalloc(count * sizeof(*lens));
+ if (!lens) {
+ prerror("%s: Failed to allocate memory for parameter length\n",
+ __func__);
+ goto out_free_id;
+ }
+
+ perms = zalloc(count * sizeof(*perms));
+ if (!perms) {
+ prerror("%s: Failed to allocate memory for parameter length\n",
+ __func__);
+ goto out_free_len;
+ }
+
+ s = names;
+ for (i = 0; i < count; i++) {
+ strcpy(s, sysparam_attrs[i].name);
+ s = s + strlen(sysparam_attrs[i].name) + 1;
+
+ ids[i] = cpu_to_be32(sysparam_attrs[i].id);
+ lens[i] = cpu_to_be32(sysparam_attrs[i].length);
+ perms[i] = sysparam_attrs[i].perm;
+ }
+
+ dt_add_property(sysparams, "param-name", names, size);
+ dt_add_property(sysparams, "param-id", ids, count * sizeof(*ids));
+ dt_add_property(sysparams, "param-len", lens, count * sizeof(*lens));
+ dt_add_property(sysparams, "param-perm", perms, count * sizeof(*perms));
+
+ free(perms);
+
+out_free_len:
+ free(lens);
+out_free_id:
+ free(ids);
+out_free_name:
+ free(names);
+}
+
+void fsp_sysparam_init(void)
+{
+ if (!fsp_present())
+ return;
+
+ /* Register change notifications */
+ fsp_register_client(&fsp_sysparam_client, FSP_MCLASS_SERVICE);
+
+ /* Register OPAL interfaces */
+ opal_register(OPAL_GET_PARAM, fsp_opal_get_param, 4);
+ opal_register(OPAL_SET_PARAM, fsp_opal_set_param, 4);
+
+ /* Add device-tree nodes */
+ add_opal_sysparam_node();
+}
diff --git a/roms/skiboot/hw/fsp/fsp.c b/roms/skiboot/hw/fsp/fsp.c
new file mode 100644
index 000000000..2c5f9d71b
--- /dev/null
+++ b/roms/skiboot/hw/fsp/fsp.c
@@ -0,0 +1,2709 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Base FSP (Flexible Service Processor) Support
+ *
+ * FSP is the BMC-like thing in some IBM POWER servers
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <stdarg.h>
+#include <processor.h>
+#include <io.h>
+#include <fsp.h>
+#include <lock.h>
+#include <interrupts.h>
+#include <device.h>
+#include <trace.h>
+#include <timebase.h>
+#include <cpu.h>
+#include <errorlog.h>
+#include <opal.h>
+#include <opal-msg.h>
+#include <ccan/list/list.h>
+
+extern uint32_t hir_trigger;
+
+DEFINE_LOG_ENTRY(OPAL_RC_FSP_POLL_TIMEOUT, OPAL_PLATFORM_ERR_EVT, OPAL_FSP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_RECOVERED_ERR_GENERAL, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_FSP_MBOX_ERR, OPAL_PLATFORM_ERR_EVT, OPAL_FSP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_RECOVERED_ERR_GENERAL, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_FSP_DISR_HIR_MASK, OPAL_PLATFORM_ERR_EVT, OPAL_FSP,
+ OPAL_PLATFORM_FIRMWARE, OPAL_RECOVERED_ERR_GENERAL, OPAL_NA);
+
+/* We make this look like a Surveillance error, even though it really
+ * isn't one.
+ */
+DEFINE_LOG_ENTRY(OPAL_INJECTED_HIR, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE,
+ OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_MISCELLANEOUS_INFO_ONLY);
+
+#define FSP_TRACE_MSG
+#define FSP_TRACE_EVENT
+
+#define FSP_MAX_IOPATH 4
+
+enum fsp_path_state {
+ fsp_path_bad,
+ fsp_path_backup,
+ fsp_path_active,
+};
+
+struct fsp_iopath {
+ enum fsp_path_state state;
+ void *fsp_regs;
+ struct psi *psi;
+};
+
+enum fsp_mbx_state {
+ fsp_mbx_idle, /* Mailbox ready to send */
+ fsp_mbx_send, /* Mailbox sent, waiting for ack */
+ fsp_mbx_crit_op, /* Critical operation in progress */
+ fsp_mbx_prep_for_reset, /* Prepare for reset sent */
+ fsp_mbx_hir_seq_done, /* HIR sequence done, link forced down */
+ fsp_mbx_err, /* Mailbox in error state, waiting for r&r */
+ fsp_mbx_rr, /* Mailbox in r&r */
+};
+
+struct fsp {
+ struct fsp *link;
+ unsigned int index;
+ enum fsp_mbx_state state;
+ struct fsp_msg *pending;
+
+ unsigned int iopath_count;
+ int active_iopath; /* -1: no active IO path */
+ struct fsp_iopath iopath[FSP_MAX_IOPATH];
+};
+
+enum ipl_state {
+ ipl_initial = 0x00000000,
+ ipl_opl_sent = 0x00000001,
+ ipl_got_continue = 0x00000002,
+ ipl_got_new_role = 0x00000004,
+ ipl_got_caps = 0x00000008,
+ ipl_got_fsp_functional = 0x00000010
+};
+static enum ipl_state ipl_state = ipl_initial;
+
+static struct fsp *first_fsp;
+static struct fsp *active_fsp;
+static u16 fsp_curseq = 0x8000;
+static __be64 *fsp_tce_table;
+
+#define FSP_INBOUND_SIZE 0x00100000UL
+static void *fsp_inbound_buf = NULL;
+static u32 fsp_inbound_off;
+
+static struct lock fsp_lock = LOCK_UNLOCKED;
+static struct lock fsp_poll_lock = LOCK_UNLOCKED;
+
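+/* Bitmap of command classes (see fsp_get_class_bit()) that have a
+ * response outstanding; cleared wholesale on FSP reset. */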
+static u64 fsp_cmdclass_resp_bitmask;
+static u64 timeout_timer;
+
+static u64 fsp_hir_timeout;
+
+#define FSP_CRITICAL_OP_TIMEOUT 128
+#define FSP_DRCR_CLEAR_TIMEOUT 128
+
+/* LID numbers. For now we hijack some of pHyp's own until I figure
+ * out the whole business with the MasterLID
+ */
+#define KERNEL_LID_PHYP 0x80a00701
+#define KERNEL_LID_OPAL 0x80f00101
+#define INITRAMFS_LID_OPAL 0x80f00102
+
+/*
+ * We keep track of the last logged values for some things so we print
+ * only on value changes, but also to relieve pressure on the tracer,
+ * which doesn't do a very good job at detecting repeats when called
+ * from many different CPUs.
+ */
+static u32 disr_last_print;
+static u32 drcr_last_print;
+static u32 hstate_last_print;
+
+void fsp_handle_resp(struct fsp_msg *msg);
+
+struct fsp_cmdclass {
+ int timeout;
+ bool busy;
+ struct list_head msgq;
+ struct list_head clientq;
+ struct list_head rr_queue; /* To queue up msgs during R/R */
+ u64 timesent;
+};
+
+static struct fsp_cmdclass fsp_cmdclass_rr;
+
+static struct fsp_cmdclass fsp_cmdclass[FSP_MCLASS_LAST - FSP_MCLASS_FIRST + 1]
+= {
+#define DEF_CLASS(_cl, _to) [_cl - FSP_MCLASS_FIRST] = { .timeout = _to }
+ DEF_CLASS(FSP_MCLASS_SERVICE, 16),
+ DEF_CLASS(FSP_MCLASS_PCTRL_MSG, 16),
+ DEF_CLASS(FSP_MCLASS_PCTRL_ABORTS, 16),
+ DEF_CLASS(FSP_MCLASS_ERR_LOG, 16),
+ DEF_CLASS(FSP_MCLASS_CODE_UPDATE, 40),
+ DEF_CLASS(FSP_MCLASS_FETCH_SPDATA, 16),
+ DEF_CLASS(FSP_MCLASS_FETCH_HVDATA, 16),
+ DEF_CLASS(FSP_MCLASS_NVRAM, 16),
+ DEF_CLASS(FSP_MCLASS_MBOX_SURV, 2),
+ DEF_CLASS(FSP_MCLASS_RTC, 16),
+ DEF_CLASS(FSP_MCLASS_SMART_CHIP, 20),
+ DEF_CLASS(FSP_MCLASS_INDICATOR, 180),
+ DEF_CLASS(FSP_MCLASS_HMC_INTFMSG, 16),
+ DEF_CLASS(FSP_MCLASS_HMC_VT, 16),
+ DEF_CLASS(FSP_MCLASS_HMC_BUFFERS, 16),
+ DEF_CLASS(FSP_MCLASS_SHARK, 16),
+ DEF_CLASS(FSP_MCLASS_MEMORY_ERR, 16),
+ DEF_CLASS(FSP_MCLASS_CUOD_EVENT, 16),
+ DEF_CLASS(FSP_MCLASS_HW_MAINT, 16),
+ DEF_CLASS(FSP_MCLASS_VIO, 16),
+ DEF_CLASS(FSP_MCLASS_SRC_MSG, 16),
+ DEF_CLASS(FSP_MCLASS_DATA_COPY, 16),
+ DEF_CLASS(FSP_MCLASS_TONE, 16),
+ DEF_CLASS(FSP_MCLASS_VIRTUAL_NVRAM, 16),
+ DEF_CLASS(FSP_MCLASS_TORRENT, 16),
+ DEF_CLASS(FSP_MCLASS_NODE_PDOWN, 16),
+ DEF_CLASS(FSP_MCLASS_DIAG, 16),
+ DEF_CLASS(FSP_MCLASS_PCIE_LINK_TOPO, 16),
+ DEF_CLASS(FSP_MCLASS_OCC, 16),
+ DEF_CLASS(FSP_MCLASS_TRUSTED_BOOT, 2),
+ DEF_CLASS(FSP_MCLASS_HBRT, 2),
+};
+
+static void fsp_trace_msg(struct fsp_msg *msg, u8 dir __unused)
+{
+ union trace fsp __unused;
+#ifdef FSP_TRACE_MSG
+ size_t len = offsetof(struct trace_fsp_msg, data[msg->dlen]);
+
+ fsp.fsp_msg.dlen = msg->dlen;
+ fsp.fsp_msg.word0 = cpu_to_be32(msg->word0);
+ fsp.fsp_msg.word1 = cpu_to_be32(msg->word1);
+ fsp.fsp_msg.dir = dir;
+ memcpy(fsp.fsp_msg.data, msg->data.bytes, msg->dlen);
+ trace_add(&fsp, TRACE_FSP_MSG, len);
+#endif /* FSP_TRACE_MSG */
+ assert(msg->dlen <= sizeof(fsp.fsp_msg.data));
+}
+
+static struct fsp *fsp_get_active(void)
+{
+ /* XXX Handle transition between FSPs */
+ return active_fsp;
+}
+
+static u64 fsp_get_class_bit(u8 class)
+{
+ /* Alias classes CE and CF as the FSP has a single queue */
+ if (class == FSP_MCLASS_IPL)
+ class = FSP_MCLASS_SERVICE;
+
+ return 1ul << (class - FSP_MCLASS_FIRST);
+}
+
+static struct fsp_cmdclass *__fsp_get_cmdclass(u8 class)
+{
+ struct fsp_cmdclass *ret;
+
+ /* RR class is special */
+ if (class == FSP_MCLASS_RR_EVENT)
+ return &fsp_cmdclass_rr;
+
+ /* Bound check */
+ if (class < FSP_MCLASS_FIRST || class > FSP_MCLASS_LAST)
+ return NULL;
+
+ /* Alias classes CE and CF as the FSP has a single queue */
+ if (class == FSP_MCLASS_IPL)
+ class = FSP_MCLASS_SERVICE;
+
+ ret = &fsp_cmdclass[class - FSP_MCLASS_FIRST];
+
+ /* Unknown class */
+ if (ret->timeout == 0)
+ return NULL;
+
+ return ret;
+}
+
+static struct fsp_cmdclass *fsp_get_cmdclass(struct fsp_msg *msg)
+{
+ u8 c = msg->word0 & 0xff;
+
+ return __fsp_get_cmdclass(c);
+}
+
+static struct fsp_msg *__fsp_allocmsg(void)
+{
+ return zalloc(sizeof(struct fsp_msg));
+}
+
+struct fsp_msg *fsp_allocmsg(bool alloc_response)
+{
+ struct fsp_msg *msg;
+
+ msg = __fsp_allocmsg();
+ if (!msg)
+ return NULL;
+ if (alloc_response) {
+ msg->resp = __fsp_allocmsg();
+ if (!msg->resp) {
+ free(msg);
+ return NULL;
+ }
+ }
+
+ return msg;
+}
+
+void __fsp_freemsg(struct fsp_msg *msg)
+{
+ free(msg);
+}
+
+void fsp_freemsg(struct fsp_msg *msg)
+{
+ if (msg && msg->resp)
+ __fsp_freemsg(msg->resp);
+ __fsp_freemsg(msg);
+}
+
+void fsp_cancelmsg(struct fsp_msg *msg)
+{
+ bool need_unlock = false;
+ struct fsp_cmdclass* cmdclass = fsp_get_cmdclass(msg);
+
+ if (!fsp_in_rr()) {
+ prerror("FSP: Message cancel allowed only when"
+ "FSP is in reset\n");
+ return;
+ }
+
+ if (!cmdclass)
+ return;
+
+ /* Recursive locking */
+ need_unlock = lock_recursive(&fsp_lock);
+
+ list_del(&msg->link);
+ msg->state = fsp_msg_cancelled;
+
+ if (need_unlock)
+ unlock(&fsp_lock);
+}
+
+static void fsp_wreg(struct fsp *fsp, u32 reg, u32 val)
+{
+ struct fsp_iopath *iop;
+
+ if (fsp->active_iopath < 0)
+ return;
+ iop = &fsp->iopath[fsp->active_iopath];
+ if (iop->state == fsp_path_bad)
+ return;
+ out_be32(iop->fsp_regs + reg, val);
+}
+
+static u32 fsp_rreg(struct fsp *fsp, u32 reg)
+{
+ struct fsp_iopath *iop;
+
+ if (fsp->active_iopath < 0)
+ return 0xffffffff;
+ iop = &fsp->iopath[fsp->active_iopath];
+ if (iop->state == fsp_path_bad)
+ return 0xffffffff;
+ return in_be32(iop->fsp_regs + reg);
+}
+
+static void fsp_reg_dump(void)
+{
+#define FSP_DUMP_ONE(x) \
+ prlog(PR_DEBUG, " %20s: %x\n", #x, fsp_rreg(fsp, x));
+
+ struct fsp *fsp = fsp_get_active();
+
+ if (!fsp)
+ return;
+
+ prlog(PR_DEBUG, "FSP #%d: Register dump (state=%d)\n",
+ fsp->index, fsp->state);
+ FSP_DUMP_ONE(FSP_DRCR_REG);
+ FSP_DUMP_ONE(FSP_DISR_REG);
+ FSP_DUMP_ONE(FSP_MBX1_HCTL_REG);
+ FSP_DUMP_ONE(FSP_MBX1_FCTL_REG);
+ FSP_DUMP_ONE(FSP_MBX2_HCTL_REG);
+ FSP_DUMP_ONE(FSP_MBX2_FCTL_REG);
+ FSP_DUMP_ONE(FSP_SDES_REG);
+ FSP_DUMP_ONE(FSP_HDES_REG);
+ FSP_DUMP_ONE(FSP_HDIR_REG);
+ FSP_DUMP_ONE(FSP_HDIM_SET_REG);
+ FSP_DUMP_ONE(FSP_PDIR_REG);
+ FSP_DUMP_ONE(FSP_PDIM_SET_REG);
+ FSP_DUMP_ONE(FSP_SCRATCH0_REG);
+ FSP_DUMP_ONE(FSP_SCRATCH1_REG);
+ FSP_DUMP_ONE(FSP_SCRATCH2_REG);
+ FSP_DUMP_ONE(FSP_SCRATCH3_REG);
+}
+
+static void fsp_notify_rr_state(u32 state)
+{
+ struct fsp_client *client, *next;
+ struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(FSP_MCLASS_RR_EVENT);
+
+ assert(cmdclass);
+ list_for_each_safe(&cmdclass->clientq, client, next, link)
+ client->message(state, NULL);
+}
+
+static void fsp_reset_cmdclass(void)
+{
+ int i;
+ struct fsp_msg *msg;
+
+ /*
+ * The FSP is in reset and hence we can't expect any response
+ * to outstanding messages that we've already sent. Clear the
+ * bitmap to reflect that.
+ */
+ fsp_cmdclass_resp_bitmask = 0;
+ for (i = 0; i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) {
+ struct fsp_cmdclass *cmdclass = &fsp_cmdclass[i];
+ cmdclass->busy = false;
+ cmdclass->timesent = 0;
+
+ /* Make sure the message queue is empty */
+ while(!list_empty(&cmdclass->msgq)) {
+ msg = list_pop(&cmdclass->msgq, struct fsp_msg,
+ link);
+ list_add_tail(&cmdclass->rr_queue, &msg->link);
+ }
+ }
+}
+
+static bool fsp_in_hir(struct fsp *fsp)
+{
+ switch (fsp->state) {
+ case fsp_mbx_crit_op:
+ case fsp_mbx_prep_for_reset:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool fsp_in_reset(struct fsp *fsp)
+{
+ switch (fsp->state) {
+ case fsp_mbx_hir_seq_done: /* FSP reset triggered */
+ case fsp_mbx_err: /* Will be reset soon */
+ case fsp_mbx_rr: /* Mbx activity stopped pending reset */
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool fsp_in_rr(void)
+{
+ struct fsp *fsp = fsp_get_active();
+ struct fsp_iopath *iop;
+
+ if (fsp->active_iopath < 0)
+ return true;
+
+ iop = &fsp->iopath[fsp->active_iopath];
+
+ if (fsp_in_reset(fsp) || fsp_in_hir(fsp) || !(psi_check_link_active(iop->psi)))
+ return true;
+
+ return false;
+}
+
+static bool fsp_hir_state_timeout(void)
+{
+ u64 now = mftb();
+
+ if (tb_compare(now, fsp_hir_timeout) == TB_AAFTERB)
+ return true;
+
+ return false;
+}
+
+static void fsp_set_hir_timeout(u32 seconds)
+{
+ u64 now = mftb();
+ fsp_hir_timeout = now + secs_to_tb(seconds);
+}
+
+static bool fsp_crit_op_in_progress(struct fsp *fsp)
+{
+ u32 disr = fsp_rreg(fsp, FSP_DISR_REG);
+
+ if (disr & FSP_DISR_CRIT_OP_IN_PROGRESS)
+ return true;
+
+ return false;
+}
+
+/* Notify the FSP that it will be reset soon by writing to the DRCR */
+static void fsp_prep_for_reset(struct fsp *fsp)
+{
+ u32 drcr;
+
+ /*
+ * It's possible that the FSP went into reset by itself between the
+ * time the HIR is triggered and we get here. Check and bail out if so.
+ */
+ if (fsp_in_rr())
+ return;
+
+ drcr = fsp_rreg(fsp, FSP_DRCR_REG);
+
+ prlog(PR_TRACE, "FSP: Writing reset to DRCR\n");
+ drcr_last_print = drcr;
+ fsp_wreg(fsp, FSP_DRCR_REG, (drcr | FSP_PREP_FOR_RESET_CMD));
+ fsp->state = fsp_mbx_prep_for_reset;
+ fsp_set_hir_timeout(FSP_DRCR_CLEAR_TIMEOUT);
+}
+
+static void fsp_hir_poll(struct fsp *fsp, struct psi *psi)
+{
+ u32 drcr;
+
+ if (fsp_in_reset(fsp) || !(psi_check_link_active(psi)))
+ return;
+
+ switch (fsp->state) {
+ case fsp_mbx_crit_op:
+ if (fsp_crit_op_in_progress(fsp)) {
+ if (fsp_hir_state_timeout())
+ prerror("FSP: Critical operation timeout\n");
+ /* XXX What to do next? Check with FSP folks */
+ } else {
+ fsp_prep_for_reset(fsp);
+ }
+ break;
+ case fsp_mbx_prep_for_reset:
+ drcr = fsp_rreg(fsp, FSP_DRCR_REG);
+
+ if (drcr != drcr_last_print) {
+ prlog(PR_TRACE, "FSP: DRCR changed, old = %x,"
+ " new = %x\n",
+ drcr_last_print, drcr);
+ drcr_last_print = drcr;
+ }
+
+ if (drcr & FSP_DRCR_ACK_MASK) {
+ if (fsp_hir_state_timeout()) {
+ prerror("FSP: Ack timeout. Triggering reset\n");
+ psi_reset_fsp(psi);
+ fsp->state = fsp_mbx_hir_seq_done;
+ }
+ } else {
+ prlog(PR_TRACE, "FSP: DRCR ack received."
+ " Triggering reset\n");
+ psi_reset_fsp(psi);
+ fsp->state = fsp_mbx_hir_seq_done;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * This is the main entry for the host initiated reset case.
+ * This gets called when:
+ * a. Surveillance ack is not received in 120 seconds
+ * b. A mailbox command doesn't get a response within the stipulated time.
+ */
+static void __fsp_trigger_reset(void)
+{
+ struct fsp *fsp = fsp_get_active();
+ u32 disr;
+
+ /* Already in one of the error processing states */
+ if (fsp_in_hir(fsp) || fsp_in_reset(fsp))
+ return;
+
+ prerror("FSP: fsp_trigger_reset() entry\n");
+
+ drcr_last_print = 0;
+ /*
+ * Check if we are allowed to reset the FSP. We aren't allowed to
+ * reset the FSP if the FSP_DISR_DBG_IN_PROGRESS is set.
+ */
+ disr = fsp_rreg(fsp, FSP_DISR_REG);
+ if (disr & FSP_DISR_DBG_IN_PROGRESS) {
+ prerror("FSP: Host initiated reset disabled\n");
+ return;
+ }
+
+ /*
+ * Check if some critical operation is in progress as indicated
+ * by FSP_DISR_CRIT_OP_IN_PROGRESS. Timeout is 128 seconds
+ */
+ if (fsp_crit_op_in_progress(fsp)) {
+ prlog(PR_NOTICE, "FSP: Critical operation in progress\n");
+ fsp->state = fsp_mbx_crit_op;
+ fsp_set_hir_timeout(FSP_CRITICAL_OP_TIMEOUT);
+ } else
+ fsp_prep_for_reset(fsp);
+}
+
+static uint32_t fsp_hir_reason_plid;
+
+void fsp_trigger_reset(uint32_t plid)
+{
+ lock(&fsp_lock);
+ fsp_hir_reason_plid = plid;
+ __fsp_trigger_reset();
+ unlock(&fsp_lock);
+}
+
+/*
+ * Called when we trigger a HIR or when the FSP tells us via the DISR's
+ * RR bit that one is impending. We should therefore stop all mbox activity.
+ */
+static void fsp_start_rr(struct fsp *fsp)
+{
+ struct fsp_iopath *iop;
+
+ if (fsp->state == fsp_mbx_rr)
+ return;
+
+ /* We no longer have an active path on that FSP */
+ if (fsp->active_iopath >= 0) {
+ iop = &fsp->iopath[fsp->active_iopath];
+ iop->state = fsp_path_bad;
+ fsp->active_iopath = -1;
+ }
+ fsp->state = fsp_mbx_rr;
+ disr_last_print = 0;
+ hstate_last_print = 0;
+
+ /*
+ * Mark all command classes as non-busy and clear their
+ * timeout, then flush all messages in our staging queue
+ */
+ fsp_reset_cmdclass();
+
+ /* Notify clients. We have to drop the lock here */
+ unlock(&fsp_lock);
+ fsp_notify_rr_state(FSP_RESET_START);
+ lock(&fsp_lock);
+
+ /*
+ * Unlike earlier, we don't trigger the PSI link polling
+ * from this point. We wait for the PSI interrupt to tell
+ * us the FSP is really down and then start the polling there.
+ */
+}
+
+/*
+ * Called on normal/quick shutdown to give up the PSI link
+ */
+void fsp_reset_links(void)
+{
+ struct fsp *fsp = fsp_get_active();
+ struct fsp_iopath *iop;
+
+ if (!fsp)
+ return;
+
+ /* Already in one of the error states? */
+ if (fsp_in_hir(fsp) || fsp_in_reset(fsp))
+ return;
+
+ iop = &fsp->iopath[fsp->active_iopath];
+ prlog(PR_NOTICE, "FSP #%d: Host initiated shutdown."
+ " Giving up the PSI link\n", fsp->index);
+ psi_disable_link(iop->psi);
+ return;
+}
+
+static void fsp_trace_event(struct fsp *fsp, u32 evt,
+ u32 data0, u32 data1, u32 data2, u32 data3)
+{
+ union trace tfsp __unused;
+#ifdef FSP_TRACE_EVENT
+ size_t len = sizeof(struct trace_fsp_event);
+
+ tfsp.fsp_evt.event = cpu_to_be16(evt);
+ tfsp.fsp_evt.fsp_state = cpu_to_be16(fsp->state);
+ tfsp.fsp_evt.data[0] = cpu_to_be32(data0);
+ tfsp.fsp_evt.data[1] = cpu_to_be32(data1);
+ tfsp.fsp_evt.data[2] = cpu_to_be32(data2);
+ tfsp.fsp_evt.data[3] = cpu_to_be32(data3);
+ trace_add(&tfsp, TRACE_FSP_EVENT, len);
+#endif /* FSP_TRACE_EVENT */
+}
+
+static void fsp_handle_errors(struct fsp *fsp)
+{
+ u32 hstate;
+ struct fsp_iopath *iop;
+ struct psi *psi;
+ u32 disr;
+
+ if (fsp->active_iopath < 0) {
+ prerror("FSP #%d: fsp_handle_errors() with no active IOP\n",
+ fsp->index);
+ return;
+ }
+
+ iop = &fsp->iopath[fsp->active_iopath];
+ if (!iop->psi) {
+ prerror("FSP: Active IOP with no PSI link !\n");
+ return;
+ }
+ psi = iop->psi;
+
+ /*
+ * If the link is not up, start R&R immediately, we do call
+ * psi_disable_link() in this case as while the link might
+ * not be up, it might still be enabled and the PSI layer
+ * "active" bit still set
+ */
+ if (!psi_check_link_active(psi)) {
+ /* Start R&R process */
+ fsp_trace_event(fsp, TRACE_FSP_EVT_LINK_DOWN, 0, 0, 0, 0);
+ prerror("FSP #%d: Link down, starting R&R\n", fsp->index);
+
+ fsp_start_rr(fsp);
+ return;
+ }
+
+ /* Link is up, check for other conditions */
+ disr = fsp_rreg(fsp, FSP_DISR_REG);
+
+ /* If in R&R, log values */
+ if (disr != disr_last_print) {
+ fsp_trace_event(fsp, TRACE_FSP_EVT_DISR_CHG, disr, 0, 0, 0);
+
+ prlog(PR_TRACE, "FSP #%d: DISR stat change = 0x%08x\n",
+ fsp->index, disr);
+ disr_last_print = disr;
+ }
+
+ /* On a deferred mbox error, trigger a HIR
+ * Note: We may never get here since the link inactive case is handled
+ * above and the other case is when the iop->psi is NULL, which is
+ * quite rare.
+ */
+ if (fsp->state == fsp_mbx_err) {
+ uint32_t plid;
+ plid = log_simple_error(&e_info(OPAL_RC_FSP_MBOX_ERR),
+ "FSP #%d: Triggering HIR on mbx_err\n",
+ fsp->index);
+ fsp_trigger_reset(plid);
+ return;
+ }
+
+ /*
+ * If we get here as part of normal flow, the FSP is telling
+ * us that there will be an impending R&R, so we stop all mbox
+ * activity. The actual link down trigger is via a PSI
+ * interrupt that may arrive in due course.
+ */
+ if (disr & FSP_DISR_FSP_IN_RR) {
+ /*
+ * If we get here with DEBUG_IN_PROGRESS also set, the
+ * FSP is in debug and we should *not* reset it now
+ */
+ if (disr & FSP_DISR_DBG_IN_PROGRESS)
+ return;
+
+ /*
+ * When Linux comes back up, we still see that bit set for a
+ * while, so just move on, nothing to see here
+ */
+ if (fsp->state == fsp_mbx_rr)
+ return;
+
+ if (fsp_dpo_pending) {
+ /*
+ * If we are about to process a reset when DPO
+ * is pending, it's possible that the host has
+ * gone down, and OPAL is on its way down and
+ * hence will not see the subsequent PSI interrupt.
+ * So, just give up the link here.
+ */
+ prlog(PR_NOTICE, "FSP #%d: FSP reset with DPO pending."
+ " Giving up PSI link\n",
+ fsp->index);
+ psi_disable_link(psi);
+ } else {
+ prlog(PR_NOTICE, "FSP #%d: FSP in Reset."
+ " Waiting for PSI interrupt\n",
+ fsp->index);
+ }
+ fsp_start_rr(fsp);
+ }
+
+ /*
+ * However, if any of the Unit Check, Runtime Terminated or
+ * Flash Terminated bits is also set, the FSP is asking us
+ * to trigger a HIR so it can try to recover via the DRCR route.
+ */
+ if (disr & FSP_DISR_HIR_TRIGGER_MASK) {
+ const char *reason = "Unknown FSP_DISR_HIR_TRIGGER";
+ uint32_t plid;
+ fsp_trace_event(fsp, TRACE_FSP_EVT_SOFT_RR, disr, 0, 0, 0);
+
+ if (disr & FSP_DISR_FSP_UNIT_CHECK)
+ reason = "DISR Unit Check set";
+ else if (disr & FSP_DISR_FSP_RUNTIME_TERM)
+ reason = "DISR Runtime Terminate set";
+ else if (disr & FSP_DISR_FSP_FLASH_TERM)
+ reason = "DISR Flash Terminate set";
+
+ plid = log_simple_error(&e_info(OPAL_RC_FSP_DISR_HIR_MASK),
+ "FSP: %s. Triggering host initiated "
+ "reset.", reason);
+
+ /* Clear all interrupt conditions */
+ fsp_wreg(fsp, FSP_HDIR_REG, FSP_DBIRQ_ALL);
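+ /* If a response was received, the return code is the status byte carried in its word1 */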
+
+ /* Make sure this happened */
+ fsp_rreg(fsp, FSP_HDIR_REG);
+
+ fsp_trigger_reset(plid);
+ return;
+ }
+
+ /*
+ * If we detect an R&R complete indication, acknowledge it
+ */
+ if (disr & FSP_DISR_FSP_RR_COMPLETE) {
+ /*
+ * Acking this bit doesn't make it go away immediately, so
+ * only do it while still in R&R state
+ */
+ if (fsp->state == fsp_mbx_rr) {
+ fsp_trace_event(fsp, TRACE_FSP_EVT_RR_COMPL, 0,0,0,0);
+
+ prlog(PR_NOTICE, "FSP #%d: Detected R&R complete,"
+ " acking\n", fsp->index);
+
+ /* Clear HDATA area */
+ fsp_wreg(fsp, FSP_MBX1_HDATA_AREA, 0xff);
+
+ /* Ack it (XDN) and clear HPEND & counts */
+ fsp_wreg(fsp, FSP_MBX1_HCTL_REG,
+ FSP_MBX_CTL_PTS |
+ FSP_MBX_CTL_XDN |
+ FSP_MBX_CTL_HPEND |
+ FSP_MBX_CTL_HCSP_MASK |
+ FSP_MBX_CTL_DCSP_MASK);
+
+ /*
+ * Mark the mbox as usable again so we can process
+ * incoming messages
+ */
+ fsp->state = fsp_mbx_idle;
+
+ /* Also clear R&R complete bit in DISR */
+ fsp_wreg(fsp, FSP_DISR_REG, FSP_DISR_FSP_RR_COMPLETE);
+
+ psi_enable_fsp_interrupt(psi);
+ }
+ }
+
+ /*
+ * XXX
+ *
+ * Here we detect a number of errors, should we initiate
+ * an R&R ?
+ */
+
+ hstate = fsp_rreg(fsp, FSP_HDES_REG);
+ if (hstate != hstate_last_print) {
+ fsp_trace_event(fsp, TRACE_FSP_EVT_HDES_CHG, hstate, 0, 0, 0);
+
+ prlog(PR_DEBUG, "FSP #%d: HDES stat change = 0x%08x\n",
+ fsp->index, hstate);
+ hstate_last_print = hstate;
+ }
+
+ if (hstate == 0xffffffff)
+ return;
+
+ /* Clear errors */
+ fsp_wreg(fsp, FSP_HDES_REG, FSP_DBERRSTAT_CLR1);
+
+ /*
+ * Most of those errors shouldn't have happened, we just clear
+ * the error state and return. In the long run, we might want
+ * to start retrying commands, switching FSPs or links, etc...
+ *
+ * We currently don't set our mailbox to a permanent error state.
+ */
+ if (hstate & FSP_DBERRSTAT_ILLEGAL1)
+ prerror("FSP #%d: Illegal command error !\n", fsp->index);
+
+ if (hstate & FSP_DBERRSTAT_WFULL1)
+ prerror("FSP #%d: Write to a full mbox !\n", fsp->index);
+
+ if (hstate & FSP_DBERRSTAT_REMPTY1)
+ prerror("FSP #%d: Read from an empty mbox !\n", fsp->index);
+
+ if (hstate & FSP_DBERRSTAT_PAR1)
+ prerror("FSP #%d: Parity error !\n", fsp->index);
+}
+
+/*
+ * This is called by fsp_post_msg() to check if the mbox
+ * is in a state that allows sending of a message
+ *
+ * Due to the various "interesting" contexts fsp_post_msg()
+ * can be called from, including recursive locks from lock
+ * error messages or console code, this should avoid doing
+ * anything more complex than checking a bit of state.
+ *
+ * Specifically, we cannot initiate an R&R and call back into
+ * clients etc... from this function.
+ *
+ * The best we can do is to set the mbox in an error state and
+ * handle it later during a poll or an interrupt.
+ */
+static bool fsp_check_can_send(struct fsp *fsp)
+{
+ struct fsp_iopath *iop;
+ struct psi *psi;
+
+ /* Look for FSP in non-idle state */
+ if (fsp->state != fsp_mbx_idle)
+ return false;
+
+ /* Look for an active IO path */
+ if (fsp->active_iopath < 0)
+ goto mbox_error;
+ iop = &fsp->iopath[fsp->active_iopath];
+ if (!iop->psi) {
+ prerror("FSP: Active IOP with no PSI link !\n");
+ goto mbox_error;
+ }
+ psi = iop->psi;
+
+ /* Check if link has gone down. This will be handled later */
+ if (!psi_check_link_active(psi)) {
+ prerror("FSP #%d: Link seems to be down on send\n", fsp->index);
+ goto mbox_error;
+ }
+
+ /* XXX Do we want to check for other error conditions ? */
+ return true;
+
+ /*
+ * An error of some kind occurred, we'll handle it later
+ * from a more normal "poll" context
+ */
+ mbox_error:
+ fsp->state = fsp_mbx_err;
+ return false;
+}
+
+static bool fsp_post_msg(struct fsp *fsp, struct fsp_msg *msg)
+{
+ u32 ctl, reg;
+ int i, wlen;
+
+ prlog(PR_INSANE, "FSP #%d: fsp_post_msg (w0: 0x%08x w1: 0x%08x)\n",
+ fsp->index, msg->word0, msg->word1);
+
+ /* Note: We used to read HCTL here and only modify some of
+ * the bits in it. This was bogus, because we would write back
+ * the incoming bits as '1' and clear them, causing fsp_poll()
+ * to then miss them. Let's just start with 0, which is how
+ * I suppose the HW intends us to use it.
+ */
+
+ /* Set ourselves as busy */
+ fsp->pending = msg;
+ fsp->state = fsp_mbx_send;
+ msg->state = fsp_msg_sent;
+
+ /* We trace after setting the mailbox state so that if the
+ * tracing recurses, it ends up just queuing the message up
+ */
+ fsp_trace_msg(msg, TRACE_FSP_MSG_OUT);
+
+ /* Build the message in the mailbox */
+ reg = FSP_MBX1_HDATA_AREA;
+ fsp_wreg(fsp, reg, msg->word0); reg += 4;
+ fsp_wreg(fsp, reg, msg->word1); reg += 4;
+ wlen = (msg->dlen + 3) >> 2;
+ for (i = 0; i < wlen; i++) {
+ fsp_wreg(fsp, reg, fsp_msg_get_data_word(msg, i));
+ reg += 4;
+ }
+
+ /* Write the header */
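+ /* The +8 accounts for the two header words (word0/word1) preceding the data */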
+ fsp_wreg(fsp, FSP_MBX1_HHDR0_REG, (msg->dlen + 8) << 16);
+
+ /* Write the control register */
+ ctl = 4 << FSP_MBX_CTL_HCHOST_SHIFT;
+ ctl |= (msg->dlen + 8) << FSP_MBX_CTL_DCHOST_SHIFT;
+ ctl |= FSP_MBX_CTL_PTS | FSP_MBX_CTL_SPPEND;
+ prlog(PR_INSANE, " new ctl: %08x\n", ctl);
+ fsp_wreg(fsp, FSP_MBX1_HCTL_REG, ctl);
+
+ return true;
+}
+
+static void fsp_poke_queue(struct fsp_cmdclass *cmdclass)
+{
+ struct fsp *fsp = fsp_get_active();
+ struct fsp_msg *msg;
+
+ if (!fsp)
+ return;
+ if (!fsp_check_can_send(fsp))
+ return;
+
+ /* From here to the point where fsp_post_msg() sets fsp->state
+ * to !idle we must not cause any re-entrancy (no debug or trace)
+ * in a code path that may hit fsp_post_msg() (it's ok to do so
+ * if we are going to bail out), as we are committed to calling
+ * fsp_post_msg() and so a re-entrancy could cause us to do a
+ * double-send into the mailbox.
+ */
+ if (cmdclass->busy || list_empty(&cmdclass->msgq))
+ return;
+
+ msg = list_top(&cmdclass->msgq, struct fsp_msg, link);
+ assert(msg);
+ cmdclass->busy = true;
+
+ if (!fsp_post_msg(fsp, msg)) {
+ prerror("FSP #%d: Failed to send message\n", fsp->index);
+ cmdclass->busy = false;
+ return;
+ }
+}
+
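+/*
+ * cmd_sub_mod packs the command class in bits 23:16, the sub-command in
+ * bits 15:8 and the modifier in bits 7:0; bit 24 flags that a response
+ * is expected for the message.
+ */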
+static void __fsp_fillmsg(struct fsp_msg *msg, u32 cmd_sub_mod,
+ u8 add_words, va_list list)
+{
+ bool response = !!(cmd_sub_mod & 0x1000000);
+ u8 cmd = (cmd_sub_mod >> 16) & 0xff;
+ u8 sub = (cmd_sub_mod >> 8) & 0xff;
+ u8 mod = cmd_sub_mod & 0xff;
+ int i;
+
+ msg->word0 = cmd & 0xff;
+ msg->word1 = mod << 8 | sub;
+ msg->response = response;
+ msg->dlen = add_words << 2;
+
+ for (i = 0; i < add_words; i++)
+ fsp_msg_set_data_word(msg, i, va_arg(list, unsigned int));
+}
+
+void fsp_fillmsg(struct fsp_msg *msg, u32 cmd_sub_mod, u32 add_words, ...)
+{
+ va_list list;
+
+ va_start(list, add_words);
+ __fsp_fillmsg(msg, cmd_sub_mod, add_words, list);
+ va_end(list);
+}
+
+struct fsp_msg *fsp_mkmsg(u32 cmd_sub_mod, u32 add_words, ...)
+{
+ struct fsp_msg *msg = fsp_allocmsg(!!(cmd_sub_mod & 0x1000000));
+ va_list list;
+
+ if (!msg) {
+ prerror("FSP: Failed to allocate struct fsp_msg\n");
+ return NULL;
+ }
+
+ va_start(list, add_words);
+ __fsp_fillmsg(msg, cmd_sub_mod, add_words, list);
+ va_end(list);
+
+ return msg;
+}
+
+/*
+ * IMPORTANT NOTE: This is *guaranteed* to not call the completion
+ * routine recursively for *any* fsp message, either the
+ * queued one or a previous one. Thus it is *ok* to call
+ * this function with a lock held which will itself be
+ * taken by the completion function.
+ *
+ * Any change to this implementation must respect this
+ * rule. This will be especially true of things like
+ * reset/reload and error handling, if we fail to queue
+ * we must just return an error, not call any completion
+ * from the scope of fsp_queue_msg().
+ */
+int fsp_queue_msg(struct fsp_msg *msg, void (*comp)(struct fsp_msg *msg))
+{
+ struct fsp_cmdclass *cmdclass;
+ struct fsp *fsp = fsp_get_active();
+ bool need_unlock;
+ u16 seq;
+ int rc = 0;
+
+ if (!fsp || !msg)
+ return -1;
+
+ /* Recursive locking */
+ need_unlock = lock_recursive(&fsp_lock);
+
+ /* Grab a new sequence number */
+ seq = fsp_curseq;
+ fsp_curseq = fsp_curseq + 1;
+ if (fsp_curseq == 0)
+ fsp_curseq = 0x8000;
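+ /* When the counter wraps to 0 it is reset to 0x8000, keeping the sequence number non-zero */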
+ msg->word0 = (msg->word0 & 0xffff) | seq << 16;
+
+ /* Set completion */
+ msg->complete = comp;
+
+ /* Clear response state */
+ if (msg->resp)
+ msg->resp->state = fsp_msg_unused;
+
+ /* Queue the message in the appropriate queue */
+ cmdclass = fsp_get_cmdclass(msg);
+ if (!cmdclass) {
+ prerror("FSP: Invalid msg in fsp_queue_msg w0/1=0x%08x/%08x\n",
+ msg->word0, msg->word1);
+ rc = -1;
+ goto unlock;
+ }
+
+ msg->state = fsp_msg_queued;
+
+ /*
+ * If we have initiated or are about to initiate a reset/reload operation,
+ * we stash the message on the R&R backup queue. Otherwise, queue it
+ * normally and poke the HW
+ */
+ if (fsp_in_hir(fsp) || fsp_in_reset(fsp))
+ list_add_tail(&cmdclass->rr_queue, &msg->link);
+ else {
+ list_add_tail(&cmdclass->msgq, &msg->link);
+ fsp_poke_queue(cmdclass);
+ }
+
+ unlock:
+ if (need_unlock)
+ unlock(&fsp_lock);
+
+ return rc;
+}
+
+/* WARNING: This will drop the FSP lock !!! */
+static void fsp_complete_msg(struct fsp_msg *msg)
+{
+ struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg);
+ void (*comp)(struct fsp_msg *msg);
+
+ assert(cmdclass);
+
+ prlog(PR_INSANE, " completing msg, word0: 0x%08x\n", msg->word0);
+
+ comp = msg->complete;
+ list_del_from(&cmdclass->msgq, &msg->link);
+ cmdclass->busy = false;
+ msg->state = fsp_msg_done;
+
+ unlock(&fsp_lock);
+ if (comp)
+ (*comp)(msg);
+ lock(&fsp_lock);
+}
+
+/* WARNING: This will drop the FSP lock !!! */
+static void fsp_complete_send(struct fsp *fsp)
+{
+ struct fsp_msg *msg = fsp->pending;
+ struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg);
+
+ assert(msg);
+ assert(cmdclass);
+
+ fsp->pending = NULL;
+
+ prlog(PR_INSANE, " completing send, word0: 0x%08x, resp: %d\n",
+ msg->word0, msg->response);
+
+ if (msg->response) {
+ u64 setbit = fsp_get_class_bit(msg->word0 & 0xff);
+ msg->state = fsp_msg_wresp;
+ fsp_cmdclass_resp_bitmask |= setbit;
+ cmdclass->timesent = mftb();
+ } else
+ fsp_complete_msg(msg);
+}
+
+static void fsp_alloc_inbound(struct fsp_msg *msg)
+{
+ u16 func_id = fsp_msg_get_data_word(msg, 0) & 0xffff;
+ u32 len = fsp_msg_get_data_word(msg, 1);
+ u32 tce_token = 0, act_len = 0;
+ u8 rc = 0;
+ void *buf;
+ struct fsp_msg *resp;
+
+ prlog(PR_DEBUG, "FSP: Allocate inbound buffer func: %04x len: %d\n",
+ func_id, len);
+
+ lock(&fsp_lock);
+ if ((fsp_inbound_off + len) > FSP_INBOUND_SIZE) {
+ prerror("FSP: Out of space in buffer area !\n");
+ rc = 0xeb;
+ goto reply;
+ }
+
+ if (!fsp_inbound_buf) {
+ fsp_inbound_buf = memalign(TCE_PSIZE, FSP_INBOUND_SIZE);
+ if (!fsp_inbound_buf) {
+ prerror("FSP: could not allocate fsp_inbound_buf!\n");
+ rc = 0xeb;
+ goto reply;
+ }
+ }
+
+ buf = fsp_inbound_buf + fsp_inbound_off;
+ tce_token = PSI_DMA_INBOUND_BUF + fsp_inbound_off;
+ len = (len + TCE_MASK) & ~TCE_MASK;
+ fsp_inbound_off += len;
+ fsp_tce_map(tce_token, buf, len);
+ prlog(PR_DEBUG, "FSP: -> buffer at 0x%p, TCE: 0x%08x, alen: 0x%x\n",
+ buf, tce_token, len);
+ act_len = len;
+
+ reply:
+ unlock(&fsp_lock);
+
+ resp = fsp_mkmsg(FSP_RSP_ALLOC_INBOUND | rc, 3, 0, tce_token, act_len);
+ if (!resp) {
+ prerror("FSP: response message allocation failed\n");
+ return;
+ }
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue response message\n");
+ return;
+ }
+}
+
+void *fsp_inbound_buf_from_tce(u32 tce_token)
+{
+ u32 offset = tce_token - PSI_DMA_INBOUND_BUF;
+
+ if (tce_token < PSI_DMA_INBOUND_BUF || offset >= fsp_inbound_off) {
+ prerror("FSP: TCE token 0x%x out of bounds\n", tce_token);
+ return NULL;
+ }
+ return fsp_inbound_buf + offset;
+}
+
+static void fsp_repost_queued_msgs_post_rr(void)
+{
+ struct fsp_msg *msg;
+ int i;
+
+ for (i = 0; i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) {
+ struct fsp_cmdclass *cmdclass = &fsp_cmdclass[i];
+ bool poke = false;
+
+ while(!list_empty(&cmdclass->rr_queue)) {
+ msg = list_pop(&cmdclass->rr_queue,
+ struct fsp_msg, link);
+ list_add_tail(&cmdclass->msgq, &msg->link);
+ poke = true;
+ }
+ if (poke)
+ fsp_poke_queue(cmdclass);
+ }
+}
+
+static bool fsp_local_command(u32 cmd_sub_mod, struct fsp_msg *msg)
+{
+ u32 cmd = 0;
+ u32 rsp_data = 0;
+ struct fsp_msg *resp;
+
+ switch(cmd_sub_mod) {
+ case FSP_CMD_CONTINUE_IPL:
+ /* We get a CONTINUE_IPL as a response to OPL */
+ prlog(PR_NOTICE, "FSP: Got CONTINUE_IPL !\n");
+ ipl_state |= ipl_got_continue;
+ return true;
+
+ case FSP_CMD_HV_STATE_CHG:
+ prlog(PR_NOTICE, "FSP: Got HV state change request to %d\n",
+ msg->data.bytes[0]);
+
+ /* Send response synchronously for now, we might want to
+ * deal with that sort of stuff asynchronously if/when
+ * we add support for auto-freeing of messages
+ */
+ resp = fsp_mkmsg(FSP_RSP_HV_STATE_CHG, 0);
+ if (!resp)
+ prerror("FSP: Failed to allocate HV state response\n");
+ else {
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue HV state resp\n");
+ }
+ }
+ return true;
+
+ case FSP_CMD_SP_NEW_ROLE:
+ /* FSP is assuming a new role */
+ prlog(PR_INFO, "FSP: FSP assuming new role\n");
+ resp = fsp_mkmsg(FSP_RSP_SP_NEW_ROLE, 0);
+ if (!resp)
+ prerror("FSP: Failed to allocate SP role response\n");
+ else {
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue SP role resp\n");
+ }
+ }
+ ipl_state |= ipl_got_new_role;
+ return true;
+
+ case FSP_CMD_SP_QUERY_CAPS:
+ prlog(PR_INFO, "FSP: FSP query capabilities\n");
+ /* XXX Do something saner. For now do a synchronous
+ * response and hard code our capabilities
+ */
+ resp = fsp_mkmsg(FSP_RSP_SP_QUERY_CAPS, 4, 0x3ff80000, 0, 0, 0);
+ if (!resp)
+ prerror("FSP: Failed to allocate CAPS response\n");
+ else {
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue CAPS resp\n");
+ }
+ }
+ ipl_state |= ipl_got_caps;
+ return true;
+ case FSP_CMD_FSP_FUNCTNAL:
+ prlog(PR_INFO, "FSP: Got FSP Functional\n");
+ ipl_state |= ipl_got_fsp_functional;
+ return true;
+ case FSP_CMD_ALLOC_INBOUND:
+ fsp_alloc_inbound(msg);
+ return true;
+ case FSP_CMD_SP_RELOAD_COMP:
+ if (msg->data.bytes[3] & PPC_BIT8(0)) {
+ fsp_fips_dump_notify(fsp_msg_get_data_word(msg, 1),
+ fsp_msg_get_data_word(msg, 2));
+
+ if (msg->data.bytes[3] & PPC_BIT8(1))
+ prlog(PR_DEBUG, " PLID is %x\n",
+ fsp_msg_get_data_word(msg, 3));
+ }
+ if (msg->data.bytes[3] & PPC_BIT8(2)) {
+ prlog(PR_INFO, "FSP: SP Reset/Reload was NOT done\n");
+ } else {
+ prlog(PR_INFO, "FSP: SP says Reset/Reload complete\n");
+ /* Notify clients that the FSP is back up */
+ fsp_notify_rr_state(FSP_RELOAD_COMPLETE);
+ fsp_repost_queued_msgs_post_rr();
+ }
+ return true;
+ case FSP_CMD_CLOSE_HMC_INTF:
+ /* Close the HMC interface */
+ /* Though Sapphire does not support an HMC connection, the FSP
+ * sends this message when it is trying to open any new
+ * hypervisor session, so we return error 0x51.
+ */
+ cmd = FSP_RSP_CLOSE_HMC_INTF | FSP_STAUS_INVALID_HMC_ID;
+ rsp_data = msg->data.bytes[0] << 24 | msg->data.bytes[1] << 16;
+ rsp_data &= 0xffff0000;
+ resp = fsp_mkmsg(cmd, 1, rsp_data);
+ if (!resp)
+ prerror("FSP: Failed to allocate HMC close response\n");
+ else {
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue HMC close resp\n");
+ }
+ }
+ return true;
+ case FSP_CMD_GET_HIR_PLID:
+ /* Get Platform Log Id with reason for Host Initiated Reset */
+ prlog(PR_DEBUG, "FSP: Sending PLID 0x%x as HIR reason\n",
+ fsp_hir_reason_plid);
+ resp = fsp_mkmsg(FSP_RSP_GET_HIR_PLID, 1, fsp_hir_reason_plid);
+ if (!resp)
+ prerror("FSP: Failed to allocate GET_HIR_PLID response\n");
+ else {
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue GET_HIR_PLID resp\n");
+ }
+ }
+ fsp_hir_reason_plid = 0;
+ return true;
+ }
+ return false;
+}
+
+
+/* This is called without the FSP lock */
+static void fsp_handle_command(struct fsp_msg *msg)
+{
+ struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg);
+ struct fsp_client *client, *next;
+ struct fsp_msg *resp;
+ u32 cmd_sub_mod;
+
+ if (!cmdclass) {
+ prerror("FSP: Got message for unknown class %x\n",
+ msg->word0 & 0xff);
+ goto free;
+ }
+
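+ /* Rebuild cmd_sub_mod: command class from word0, sub-command from the
+ * low byte of word1 and modifier from its high byte
+ */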
+ cmd_sub_mod = (msg->word0 & 0xff) << 16;
+ cmd_sub_mod |= (msg->word1 & 0xff) << 8;
+ cmd_sub_mod |= (msg->word1 >> 8) & 0xff;
+
+ /* Some commands are handled locally */
+ if (fsp_local_command(cmd_sub_mod, msg))
+ goto free;
+
+ /* The rest go to clients */
+ list_for_each_safe(&cmdclass->clientq, client, next, link) {
+ if (client->message(cmd_sub_mod, msg))
+ goto free;
+ }
+
+ prerror("FSP: Unhandled message %06x\n", cmd_sub_mod);
+
+ /* We don't know whether the message expected some kind of
+ * response, so we send one anyway
+ */
+ resp = fsp_mkmsg((cmd_sub_mod & 0xffff00) | 0x008020, 0);
+ if (!resp)
+ prerror("FSP: Failed to allocate default response\n");
+ else {
+ if (fsp_queue_msg(resp, fsp_freemsg)) {
+ fsp_freemsg(resp);
+ prerror("FSP: Failed to queue default response\n");
+ }
+ }
+
+ free:
+ fsp_freemsg(msg);
+}
+
+static void __fsp_fill_incoming(struct fsp *fsp, struct fsp_msg *msg,
+ int dlen, u32 w0, u32 w1)
+{
+ unsigned int wlen, i, reg;
+
+ msg->dlen = dlen - 8;
+ msg->word0 = w0;
+ msg->word1 = w1;
+ wlen = (dlen + 3) >> 2;
+ reg = FSP_MBX1_FDATA_AREA + 8;
+ for (i = 0; i < wlen; i++) {
+ fsp_msg_set_data_word(msg, i, fsp_rreg(fsp, reg));
+ reg += 4;
+ }
+
+ /* Ack it (XDN) and clear HPEND & counts */
+ fsp_wreg(fsp, FSP_MBX1_HCTL_REG,
+ FSP_MBX_CTL_PTS |
+ FSP_MBX_CTL_XDN |
+ FSP_MBX_CTL_HPEND |
+ FSP_MBX_CTL_HCSP_MASK |
+ FSP_MBX_CTL_DCSP_MASK);
+
+ fsp_trace_msg(msg, TRACE_FSP_MSG_IN);
+}
+
+static void __fsp_drop_incoming(struct fsp *fsp)
+{
+ /* Ack it (XDN) and clear HPEND & counts */
+ fsp_wreg(fsp, FSP_MBX1_HCTL_REG,
+ FSP_MBX_CTL_PTS |
+ FSP_MBX_CTL_XDN |
+ FSP_MBX_CTL_HPEND |
+ FSP_MBX_CTL_HCSP_MASK |
+ FSP_MBX_CTL_DCSP_MASK);
+}
+
+/* WARNING: This will drop the FSP lock */
+static void fsp_handle_incoming(struct fsp *fsp)
+{
+ struct fsp_msg *msg;
+ u32 h0, w0, w1;
+ unsigned int dlen;
+ bool special_response = false;
+
+ h0 = fsp_rreg(fsp, FSP_MBX1_FHDR0_REG);
+ dlen = (h0 >> 16) & 0xff;
+
+ w0 = fsp_rreg(fsp, FSP_MBX1_FDATA_AREA);
+ w1 = fsp_rreg(fsp, FSP_MBX1_FDATA_AREA + 4);
+
+ prlog(PR_INSANE, " Incoming: w0: 0x%08x, w1: 0x%08x, dlen: %d\n",
+ w0, w1, dlen);
+
+ /* Some responses are expected out of band */
+ if ((w0 & 0xff) == FSP_MCLASS_HMC_INTFMSG &&
+ ((w1 & 0xff) == 0x8a || ((w1 & 0xff) == 0x8b)))
+ special_response = true;
+
+ /* Check for response bit */
+ if (w1 & 0x80 && !special_response) {
+ struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(w0 & 0xff);
+ struct fsp_msg *req;
+
+ if (!cmdclass) {
+ prerror("FSP: Got response for unknown class %x\n",
+ w0 & 0xff);
+ __fsp_drop_incoming(fsp);
+ return;
+ }
+
+ if (!cmdclass->busy || list_empty(&cmdclass->msgq)) {
+ prerror("FSP #%d: Got orphan response! w0 = 0x%08x w1 = 0x%08x\n",
+ fsp->index, w0, w1);
+ __fsp_drop_incoming(fsp);
+ return;
+ }
+ req = list_top(&cmdclass->msgq, struct fsp_msg, link);
+
+ /* Check if the response seems to match the message */
+ if (req->state != fsp_msg_wresp ||
+ (req->word0 & 0xff) != (w0 & 0xff) ||
+ (req->word1 & 0xff) != (w1 & 0x7f)) {
+ __fsp_drop_incoming(fsp);
+ prerror("FSP #%d: Response doesn't match pending msg. w0 = 0x%08x w1 = 0x%08x\n",
+ fsp->index, w0, w1);
+ return;
+ } else {
+ u64 resetbit = ~fsp_get_class_bit(req->word0 & 0xff);
+ fsp_cmdclass_resp_bitmask &= resetbit;
+ cmdclass->timesent = 0;
+ }
+
+ /* Allocate the response if needed. XXX Should we complete
+ * the original message with some kind of error here ?
+ */
+ if (!req->resp) {
+ req->resp = __fsp_allocmsg();
+ if (!req->resp) {
+ __fsp_drop_incoming(fsp);
+ prerror("FSP #%d: Failed to allocate response\n",
+ fsp->index);
+ return;
+ }
+ }
+
+ /* Populate and complete (will drop the lock) */
+ req->resp->state = fsp_msg_response;
+ __fsp_fill_incoming(fsp, req->resp, dlen, w0, w1);
+ fsp_complete_msg(req);
+ return;
+ }
+
+ /* Allocate an incoming message */
+ msg = __fsp_allocmsg();
+ if (!msg) {
+ __fsp_drop_incoming(fsp);
+ prerror("FSP #%d: Failed to allocate incoming msg\n",
+ fsp->index);
+ return;
+ }
+ msg->state = fsp_msg_incoming;
+ __fsp_fill_incoming(fsp, msg, dlen, w0, w1);
+
+ /* Handle FSP commands. This can recurse into fsp_queue_msg etc.. */
+ unlock(&fsp_lock);
+ fsp_handle_command(msg);
+ lock(&fsp_lock);
+}
+
+static void fsp_check_queues(struct fsp *fsp)
+{
+ int i;
+
+ /* XXX In the long run, we might want to have a queue of
+ * classes waiting to be serviced to speed this up, either
+ * that or a bitmap.
+ */
+ for (i = 0; i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) {
+ struct fsp_cmdclass *cmdclass = &fsp_cmdclass[i];
+
+ if (fsp->state != fsp_mbx_idle)
+ break;
+ if (cmdclass->busy || list_empty(&cmdclass->msgq))
+ continue;
+ fsp_poke_queue(cmdclass);
+ }
+}
+
+static void __fsp_poll(bool interrupt)
+{
+ struct fsp_iopath *iop;
+ struct fsp *fsp = fsp_get_active();
+ u32 ctl, hdir = 0;
+ bool psi_irq;
+
+ /*
+ * The tracer isn't terribly efficient at detecting dups
+ * especially when coming from multiple CPUs so we do our
+ * own change-detection locally
+ */
+ static u32 hdir_last_trace;
+ static u32 ctl_last_trace;
+ static bool psi_irq_last_trace;
+ static bool irq_last_trace;
+
+ if (!fsp)
+ return;
+
+ /* Crazy interrupt handling scheme:
+ *
+ * In order to avoid "losing" interrupts when polling the mbox
+ * we only clear interrupt conditions when called as a result of
+ * an interrupt.
+ *
+ * That way, if a poll clears, for example, the HPEND condition,
+ * the interrupt remains, causing a dummy interrupt later on
+ * thus allowing the OS to be notified of a state change (ie it
+ * doesn't need every poll site to monitor every state change).
+ *
+ * However, this scheme is complicated by the fact that we need
+ * to clear the interrupt condition after we have cleared the
+ * original condition in HCTL, and we might have long stale
+ * interrupts which we do need to eventually get rid of. However
+ * clearing interrupts in such a way is racy, so we need to loop
+ * and re-poll HCTL after having done so or we might miss an
+ * event. It's a latency risk, but unlikely and probably worth it.
+ */
+
+ again:
+ if (fsp->active_iopath < 0) {
+ /* That should never happen */
+ if (interrupt && (fsp->state != fsp_mbx_rr))
+ prerror("FSP: Interrupt with no working IO path\n");
+ return;
+ }
+ iop = &fsp->iopath[fsp->active_iopath];
+
+ /* Check for error state and handle R&R completion */
+ fsp_handle_errors(fsp);
+
+ /* Handle host initiated resets */
+ if (fsp_in_hir(fsp)) {
+ fsp_hir_poll(fsp, iop->psi);
+ return;
+ }
+
+ /*
+ * The above might have triggered an R&R, check that we
+ * are still functional
+ */
+ if ((fsp->active_iopath < 0) || fsp_in_hir(fsp))
+ return;
+ iop = &fsp->iopath[fsp->active_iopath];
+
+ /* Read interrupt status (we may or may not use it) */
+ hdir = fsp_rreg(fsp, FSP_HDIR_REG);
+
+ /* Read control now as well so we can trace them */
+ ctl = fsp_rreg(fsp, FSP_MBX1_HCTL_REG);
+
+ /* Ditto with PSI irq state */
+ psi_irq = psi_poll_fsp_interrupt(iop->psi);
+
+ /* Trace it if anything changes */
+ if (hdir != hdir_last_trace || ctl != ctl_last_trace ||
+ interrupt != irq_last_trace || psi_irq != psi_irq_last_trace) {
+ fsp_trace_event(fsp, TRACE_FSP_EVT_POLL_IRQ,
+ interrupt, hdir, ctl, psi_irq);
+
+ hdir_last_trace = hdir;
+ ctl_last_trace = ctl;
+ irq_last_trace = interrupt;
+ psi_irq_last_trace = psi_irq;
+ }
+
+ /*
+ * an interrupt, it might still latch some bits here (and we found cases
+ * interrupt, it might still latch some bits here (and we found cases
+ * where the MBOX2 XUP would be set). If that happens, clearing HDIR
+ * never works (the bit gets set again immediately) because we don't
+ * clear the condition in HCTL2 and thus we loop forever.
+ */
+ hdir &= FSP_DBIRQ_MBOX1;
+
+ /*
+ * Sanity check: If an interrupt is pending and we are in polling
+ * mode, check that the PSI side is also pending. If some bit is
+ * set, just clear and move on.
+ */
+ if (hdir && !interrupt && !psi_irq) {
+ prerror("FSP: WARNING ! HDIR 0x%08x but no PSI irq !\n", hdir);
+ fsp_wreg(fsp, FSP_HDIR_REG, hdir);
+ }
+
+ /*
+ * We should never have the mbox in error state here unless it
+ * was fine until some printf inside fsp_handle_errors() caused
+ * the console to poke the FSP which detected a brand new error
+ * in the process. Let's be safe rather than sorry and handle that
+ * here
+ */
+ if (fsp_in_hir(fsp) || fsp->state == fsp_mbx_err) {
+ prerror("FSP: Late error state detection\n");
+ goto again;
+ }
+
+ /*
+ * If we are in an R&R state with an active IO path, we
+ * shouldn't be getting interrupts. If we do, just clear
+ * the condition and print a message
+ */
+ if (fsp->state == fsp_mbx_rr) {
+ if (interrupt) {
+ prerror("FSP: Interrupt in RR state [HDIR=0x%08x]\n",
+ hdir);
+ fsp_wreg(fsp, FSP_HDIR_REG, hdir);
+ }
+ return;
+ }
+
+ /* Poll FSP CTL */
+ if (ctl & (FSP_MBX_CTL_XUP | FSP_MBX_CTL_HPEND))
+ prlog(PR_INSANE, "FSP #%d: poll, ctl: %x\n", fsp->index, ctl);
+
+ /* Do we have a pending message waiting to complete ? */
+ if (ctl & FSP_MBX_CTL_XUP) {
+ fsp_wreg(fsp, FSP_MBX1_HCTL_REG, FSP_MBX_CTL_XUP);
+ if (fsp->state == fsp_mbx_send) {
+ /* mbox is free */
+ fsp->state = fsp_mbx_idle;
+
+ /* Complete message (will break the lock) */
+ fsp_complete_send(fsp);
+
+ /* Lock can have been broken, so ctl is now
+ * potentially invalid, let's recheck
+ */
+ goto again;
+ } else {
+ prerror("FSP #%d: Got XUP with no pending message !\n",
+ fsp->index);
+ }
+ }
+
+ if (fsp->state == fsp_mbx_send) {
+ /* XXX Handle send timeouts!!! */
+ }
+
+ /* Is there an incoming message ? This will break the lock as well */
+ if (ctl & FSP_MBX_CTL_HPEND)
+ fsp_handle_incoming(fsp);
+
+ /* Note: Lock may have been broken above, thus ctl might be invalid
+ * now, don't use it any further.
+ */
+
+ /* Check for something else to send */
+ if (fsp->state == fsp_mbx_idle)
+ fsp_check_queues(fsp);
+
+ /* Clear interrupts, and recheck HCTL if any occurred */
+ if (interrupt && hdir) {
+ fsp_wreg(fsp, FSP_HDIR_REG, hdir);
+ goto again;
+ }
+}
+
+void fsp_interrupt(void)
+{
+ lock(&fsp_lock);
+ __fsp_poll(true);
+ unlock(&fsp_lock);
+}
+
+
+int fsp_sync_msg(struct fsp_msg *msg, bool autofree)
+{
+ int rc;
+
+ rc = fsp_queue_msg(msg, NULL);
+ if (rc)
+ goto bail;
+
+ while(fsp_msg_busy(msg)) {
+ if (fsp_in_rr()) {
+ fsp_cancelmsg(msg);
+ rc = -1;
+ goto bail;
+ }
+ cpu_relax();
+ opal_run_pollers();
+ }
+
+ switch(msg->state) {
+ case fsp_msg_done:
+ rc = 0;
+ break;
+ case fsp_msg_timeout:
+ rc = -1; /* XXX to improve */
+ break;
+ default:
+ rc = -1; /* Should not happen... (assert ?) */
+ }
+
+ if (msg->resp)
+ rc = (msg->resp->word1 >> 8) & 0xff;
+ bail:
+ if (autofree)
+ fsp_freemsg(msg);
+ return rc;
+}
+
+void fsp_register_client(struct fsp_client *client, u8 msgclass)
+{
+ struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(msgclass);
+
+ if (!fsp_present())
+ return;
+ assert(cmdclass);
+ list_add_tail(&cmdclass->clientq, &client->link);
+}
+
+void fsp_unregister_client(struct fsp_client *client, u8 msgclass)
+{
+ struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(msgclass);
+
+ if (!fsp_present())
+ return;
+ assert(cmdclass);
+ list_del_from(&cmdclass->clientq, &client->link);
+}
+
+static int fsp_init_mbox(struct fsp *fsp)
+{
+ unsigned int i;
+ u32 reg;
+
+ /*
+ * Note: The documentation contradicts itself as to
+ * whether the HDIM bits should be set or cleared to
+ * enable interrupts
+ *
+ * This seems to work...
+ */
+
+ /* Mask all interrupts */
+ fsp_wreg(fsp, FSP_HDIM_CLR_REG, FSP_DBIRQ_ALL);
+
+ /* Clear all errors */
+ fsp_wreg(fsp, FSP_HDES_REG, FSP_DBERRSTAT_CLR1 | FSP_DBERRSTAT_CLR2);
+
+ /* Initialize data area as the doco says */
+ for (i = 0; i < 0x40; i += 4)
+ fsp_wreg(fsp, FSP_MBX1_HDATA_AREA + i, 0);
+
+ /*
+ * Clear whatever crap may remain in HDCR. Do not write XDN as that
+ * would be interpreted incorrectly as an R&R completion which
+ * we aren't ready to send yet !
+ */
+ fsp_wreg(fsp, FSP_MBX1_HCTL_REG, FSP_MBX_CTL_XUP | FSP_MBX_CTL_HPEND |
+ FSP_MBX_CTL_HCSP_MASK | FSP_MBX_CTL_DCSP_MASK |
+ FSP_MBX_CTL_PTS);
+
+ /* Clear all pending interrupts */
+ fsp_wreg(fsp, FSP_HDIR_REG, FSP_DBIRQ_ALL);
+
+ /* Enable all mbox1 interrupts */
+ fsp_wreg(fsp, FSP_HDIM_SET_REG, FSP_DBIRQ_MBOX1);
+
+ /* Decode what FSP we are connected to */
+ reg = fsp_rreg(fsp, FSP_SCRATCH0_REG);
+ if (reg & PPC_BIT32(0)) { /* Is it a valid connection */
+ if (reg & PPC_BIT32(3))
+ prlog(PR_INFO, "FSP: Connected to FSP-B\n");
+ else
+ prlog(PR_INFO, "FSP: Connected to FSP-A\n");
+ }
+
+ return 0;
+}
+
+/* We use a single fixed TCE table for all PSI interfaces */
+static void fsp_init_tce_table(void)
+{
+ fsp_tce_table = (__be64 *)PSI_TCE_TABLE_BASE;
+
+ memset(fsp_tce_table, 0, PSI_TCE_TABLE_SIZE);
+}
+
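+/*
+ * Each 64-bit TCE entry maps one TCE_PSIZE page into the PSI DMA window;
+ * the low bits (0x3) mark the entry as valid for read and write.
+ */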
+void fsp_tce_map(u32 offset, void *addr, u32 size)
+{
+ u64 raddr = (u64)addr;
+
+ assert(!(offset & TCE_MASK));
+ assert(!(raddr & TCE_MASK));
+ assert(!(size & TCE_MASK));
+
+ size >>= TCE_SHIFT;
+ offset >>= TCE_SHIFT;
+
+ while(size--) {
+ fsp_tce_table[offset++] = cpu_to_be64(raddr | 0x3);
+ raddr += TCE_PSIZE;
+ }
+}
+
+void fsp_tce_unmap(u32 offset, u32 size)
+{
+ assert(!(offset & TCE_MASK));
+ assert(!(size & TCE_MASK));
+
+ size >>= TCE_SHIFT;
+ offset >>= TCE_SHIFT;
+
+ while(size--)
+ fsp_tce_table[offset++] = 0;
+}
+
+static struct fsp *fsp_find_by_index(int index)
+{
+ struct fsp *fsp = first_fsp;
+
+ do {
+ if (fsp->index == index)
+ return fsp;
+ } while (fsp->link != first_fsp);
+
+ return NULL;
+}
+
+static void fsp_init_links(struct dt_node *fsp_node)
+{
+ const struct dt_property *linksprop;
+ int i, index;
+ struct fsp *fsp;
+ struct fsp_iopath *fiop;
+
+ linksprop = dt_find_property(fsp_node, "ibm,psi-links");
+ assert(linksprop);
+
+ index = dt_prop_get_u32(fsp_node, "reg");
+ fsp = fsp_find_by_index(index);
+ if (!fsp) {
+ prerror("FSP: FSP with index %d not found\n", index);
+ return;
+ }
+
+ fsp->state = fsp_mbx_idle;
+
+ /* Iterate all links */
+ for (i = 0; i < fsp->iopath_count; i++) {
+ u64 reg;
+ u32 link;
+
+ link = dt_property_get_cell(linksprop, i);
+ fiop = &fsp->iopath[i];
+ fiop->psi = psi_find_link(link);
+ if (fiop->psi == NULL) {
+ prerror("FSP #%d: Couldn't find PSI link\n",
+ fsp->index);
+ continue;
+ }
+
+ prlog(PR_DEBUG, "FSP #%d: Found PSI HB link to chip %d\n",
+ fsp->index, link);
+
+ psi_fsp_link_in_use(fiop->psi);
+
+ /* Get the FSP register window */
+ reg = in_be64(fiop->psi->regs + PSIHB_FSPBAR);
+ fiop->fsp_regs = (void *)(reg | (1ULL << 63) |
+ dt_prop_get_u32(fsp_node, "reg-offset"));
+ }
+}
+
+static void fsp_update_links_states(struct fsp *fsp)
+{
+ struct fsp_iopath *fiop;
+ unsigned int i;
+
+ /* Iterate all links */
+ for (i = 0; i < fsp->iopath_count; i++) {
+ fiop = &fsp->iopath[i];
+ if (!fiop->psi)
+ fiop->state = fsp_path_bad;
+ else if (fiop->psi->active) {
+ fsp->active_iopath = i;
+ fiop->state = fsp_path_active;
+ } else
+ fiop->state = fsp_path_backup;
+ }
+
+ if (fsp->active_iopath >= 0) {
+ if (!active_fsp || (active_fsp != fsp))
+ active_fsp = fsp;
+
+ fsp_inbound_off = 0;
+ fiop = &fsp->iopath[fsp->active_iopath];
+ psi_init_for_fsp(fiop->psi);
+ fsp_init_mbox(fsp);
+ }
+}
+
+void fsp_reinit_fsp(void)
+{
+ struct fsp *fsp;
+
+ /* Notify all FSPs to check for an updated link state */
+ for (fsp = first_fsp; fsp; fsp = fsp->link)
+ fsp_update_links_states(fsp);
+}
+
+static void fsp_create_fsp(struct dt_node *fsp_node)
+{
+ const struct dt_property *linksprop;
+ struct fsp *fsp;
+ int count, index;
+
+ index = dt_prop_get_u32(fsp_node, "reg");
+ prlog(PR_INFO, "FSP #%d: Found in device-tree, setting up...\n",
+ index);
+
+ linksprop = dt_find_property(fsp_node, "ibm,psi-links");
+ if (!linksprop || linksprop->len < 4) {
+ prerror("FSP #%d: No links !\n", index);
+ return;
+ }
+
+ fsp = zalloc(sizeof(struct fsp));
+ if (!fsp) {
+ prerror("FSP #%d: Can't allocate memory !\n", index);
+ return;
+ }
+
+ fsp->index = index;
+ fsp->active_iopath = -1;
+
+ count = linksprop->len / 4;
+ prlog(PR_DEBUG, "FSP #%d: Found %d IO PATH\n", index, count);
+ if (count > FSP_MAX_IOPATH) {
+ prerror("FSP #%d: WARNING, limited to %d IO PATH\n",
+ index, FSP_MAX_IOPATH);
+ count = FSP_MAX_IOPATH;
+ }
+ fsp->iopath_count = count;
+
+ fsp->link = first_fsp;
+ first_fsp = fsp;
+
+ fsp_init_links(fsp_node);
+ fsp_update_links_states(fsp);
+
+ if (fsp->active_iopath >= 0)
+ psi_enable_fsp_interrupt(fsp->iopath[fsp->active_iopath].psi);
+}
+
+static void fsp_opal_poll(void *data __unused)
+{
+ /* Test the host initiated reset */
+ if (hir_trigger == 0xdeadbeef) {
+ uint32_t plid = log_simple_error(&e_info(OPAL_INJECTED_HIR),
+ "SURV: Injected HIR, initiating FSP R/R\n");
+ fsp_trigger_reset(plid);
+ hir_trigger = 0;
+ }
+
+ if (try_lock(&fsp_lock)) {
+ __fsp_poll(false);
+ unlock(&fsp_lock);
+ }
+}
+
+int fsp_fatal_msg(struct fsp_msg *msg)
+{
+ int rc = 0;
+
+ rc = fsp_queue_msg(msg, NULL);
+ if (rc)
+ return rc;
+
+ while(fsp_msg_busy(msg)) {
+ if (fsp_in_rr()) {
+ fsp_cancelmsg(msg);
+ return -1;
+ }
+
+ cpu_relax();
+ fsp_opal_poll(NULL);
+ }
+
+ switch(msg->state) {
+ case fsp_msg_done:
+ rc = 0;
+ break;
+ case fsp_msg_timeout:
+ rc = -1; /* XXX to improve */
+ break;
+ default:
+ rc = -1; /* Should not happen... (assert ?) */
+ }
+
+ if (msg->resp)
+ rc = (msg->resp->word1 >> 8) & 0xff;
+
+ return rc;
+}
+
+static bool fsp_init_one(const char *compat)
+{
+ struct dt_node *fsp_node;
+ bool inited = false;
+
+ dt_for_each_compatible(dt_root, fsp_node, compat) {
+ if (!inited) {
+ int i;
+
+ /* Initialize the per-class msg queues */
+ for (i = 0;
+ i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) {
+ list_head_init(&fsp_cmdclass[i].msgq);
+ list_head_init(&fsp_cmdclass[i].clientq);
+ list_head_init(&fsp_cmdclass[i].rr_queue);
+ }
+
+ /* Init the queues for RR notifier cmdclass */
+ list_head_init(&fsp_cmdclass_rr.msgq);
+ list_head_init(&fsp_cmdclass_rr.clientq);
+ list_head_init(&fsp_cmdclass_rr.rr_queue);
+
+ /* Register poller */
+ opal_add_poller(fsp_opal_poll, NULL);
+
+ inited = true;
+ }
+
+ /* Create the FSP data structure */
+ fsp_create_fsp(fsp_node);
+ }
+
+ return inited;
+}
+
+void fsp_init(void)
+{
+ prlog(PR_DEBUG, "FSP: Looking for FSP...\n");
+
+ fsp_init_tce_table();
+
+ if (!fsp_init_one("ibm,fsp1") && !fsp_init_one("ibm,fsp2")) {
+ prlog(PR_DEBUG, "FSP: No FSP on this machine\n");
+ return;
+ }
+}
+
+bool fsp_present(void)
+{
+ return first_fsp != NULL;
+}
+
+static void fsp_timeout_poll(void *data __unused)
+{
+ u64 now = mftb();
+ u64 timeout_val = 0;
+ u64 cmdclass_resp_bitmask = fsp_cmdclass_resp_bitmask;
+ struct fsp_cmdclass *cmdclass = NULL;
+ struct fsp_msg *req = NULL;
+ u32 index = 0;
+
+ if (timeout_timer == 0)
+ timeout_timer = now + secs_to_tb(30);
+
+ /* The lowest granularity for a message timeout is 30 secs.
+ * So every 30 secs, check if there is any message
+ * waiting for a response from the FSP
+ */
+ if (tb_compare(now, timeout_timer) == TB_ABEFOREB)
+ return;
+ if (!try_lock(&fsp_poll_lock))
+ return;
+ if (tb_compare(now, timeout_timer) == TB_ABEFOREB) {
+ unlock(&fsp_poll_lock);
+ return;
+ }
+
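+ /* Walk the per-class "response pending" bitmask, one bit per command class */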
+ while (cmdclass_resp_bitmask) {
+ u64 time_sent = 0;
+ u64 time_to_comp = 0;
+
+ if (!(cmdclass_resp_bitmask & 0x1))
+ goto next_bit;
+
+ cmdclass = &fsp_cmdclass[index];
+ timeout_val = secs_to_tb((cmdclass->timeout) * 60);
+ time_sent = cmdclass->timesent;
+ time_to_comp = now - cmdclass->timesent;
+
+ /* Now check if the response has timed out */
+ if (tb_compare(time_to_comp, timeout_val) == TB_AAFTERB) {
+ u32 w0, w1;
+ enum fsp_msg_state mstate;
+
+ /* Take the FSP lock now and re-check */
+ lock(&fsp_lock);
+ if (!(fsp_cmdclass_resp_bitmask & (1ull << index)) ||
+ time_sent != cmdclass->timesent) {
+ unlock(&fsp_lock);
+ goto next_bit;
+ }
+ req = list_top(&cmdclass->msgq, struct fsp_msg, link);
+ if (!req) {
+ printf("FSP: Timeout state mismatch on class %d\n",
+ index);
+ fsp_cmdclass_resp_bitmask &= ~(1ull << index);
+ cmdclass->timesent = 0;
+ unlock(&fsp_lock);
+ goto next_bit;
+ }
+ w0 = req->word0;
+ w1 = req->word1;
+ mstate = req->state;
+ prlog(PR_WARNING, "FSP: Response from FSP timed out,"
+ " cmd = %x subcmd = %x mod = %x state: %d\n",
+ w0 & 0xff, w1 & 0xff, (w1 >> 8) & 0xff, mstate);
+ fsp_reg_dump();
+ fsp_cmdclass_resp_bitmask &= ~(1ull << index);
+ cmdclass->timesent = 0;
+ if (req->resp) {
+ req->resp->state = fsp_msg_timeout;
+ req->resp->word1 = (FSP_STATUS_BUSY << 8) |
+ (req->resp->word1 & 0xff);
+ }
+ fsp_complete_msg(req);
+ __fsp_trigger_reset();
+ unlock(&fsp_lock);
+ fsp_hir_reason_plid = log_simple_error(
+ &e_info(OPAL_RC_FSP_POLL_TIMEOUT),
+ "FSP: Response from FSP timed out,"
+ " cmd = %x subcmd = %x mod = %x state: %d\n",
+ w0 & 0xff, w1 & 0xff, (w1 >> 8) & 0xff, mstate);
+ }
+ next_bit:
+ cmdclass_resp_bitmask = cmdclass_resp_bitmask >> 1;
+ index++;
+ }
+ unlock(&fsp_poll_lock);
+}
+
+void fsp_opl(void)
+{
+ struct dt_node *iplp;
+
+ if (!fsp_present())
+ return;
+
+ /* Send OPL */
+ ipl_state |= ipl_opl_sent;
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_OPL, 0), true);
+ while(!(ipl_state & ipl_got_continue)) {
+ opal_run_pollers();
+ cpu_relax();
+ }
+
+ /* Send continue ACK */
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_CONTINUE_ACK, 0), true);
+
+ /* Wait for various FSP messages */
+ prlog(PR_INFO, "INIT: Waiting for FSP to advertise new role...\n");
+ while(!(ipl_state & ipl_got_new_role)) {
+ cpu_relax();
+ opal_run_pollers();
+ }
+ prlog(PR_INFO, "INIT: Waiting for FSP to request capabilities...\n");
+ while(!(ipl_state & ipl_got_caps)) {
+ cpu_relax();
+ opal_run_pollers();
+ }
+
+ /* Initiate the timeout poller */
+ opal_add_poller(fsp_timeout_poll, NULL);
+
+ /* Tell FSP we are in standby */
+ prlog(PR_INFO, "INIT: Sending HV Functional: Standby...\n");
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_HV_FUNCTNAL, 1, 0x01000000), true);
+
+ /* Wait for FSP functional */
+ prlog(PR_INFO, "INIT: Waiting for FSP functional\n");
+ while(!(ipl_state & ipl_got_fsp_functional)) {
+ cpu_relax();
+ opal_run_pollers();
+ }
+
+ /* Tell FSP we are in running state */
+ prlog(PR_INFO, "INIT: Sending HV Functional: Runtime...\n");
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_HV_FUNCTNAL, 1, 0x02000000), true);
+
+ /*
+ * For the factory reset case, FSP sends us the PCI Bus
+ * Reset request. We don't have to do anything special with
+ * PCI bus numbers here; just send the Power Down message
+ * with modifier 0x02 to FSP.
+ */
+ iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params");
+ if (iplp && dt_find_property(iplp, "pci-busno-reset-ipl")) {
+ prlog(PR_DEBUG, "INIT: PCI Bus Reset requested."
+ " Sending Power Down\n");
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_POWERDOWN_PCIRS, 0), true);
+ }
+
+ /*
+ * Tell FSP we are in running state with all partitions.
+ *
+ * This is needed, otherwise the FSP will not reset its reboot count
+ * on failures. Ideally we should send that when we know the
+ * OS is up but we don't currently have a very good way to do
+ * that so this will do as a stop-gap
+ */
+ prlog(PR_NOTICE, "INIT: Sending HV Functional: Runtime all partitions\n");
+ fsp_sync_msg(fsp_mkmsg(FSP_CMD_HV_FUNCTNAL, 1, 0x04000000), true);
+}
+
+uint32_t fsp_adjust_lid_side(uint32_t lid_no)
+{
+ struct dt_node *iplp;
+ const char *side = NULL;
+
+ iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params");
+ if (iplp)
+ side = dt_prop_get_def(iplp, "cec-ipl-side", NULL);
+ if (!side || !strcmp(side, "temp"))
+ lid_no |= ADJUST_T_SIDE_LID_NO;
+ return lid_no;
+}
+
+struct fsp_fetch_lid_item {
+ enum resource_id id;
+ uint32_t idx;
+
+ uint32_t lid;
+ uint32_t lid_no;
+ uint64_t bsize;
+ uint32_t offset;
+ void *buffer;
+ size_t *length;
+ size_t remaining;
+ size_t chunk_requested;
+ struct list_node link;
+ int result;
+};
+
+/*
+ * We have a queue of things to fetch; once an item has been fetched,
+ * it moves to fsp_fetched_lid until we're asked whether it has been
+ * fetched, at which point it's free()d.
+ *
+ * Everything is protected with fsp_fetch_lock.
+ *
+ * We use PSI_DMA_FETCH TCE entry for this fetching queue. If something
+ * is in the fsp_fetch_lid_queue, it means we're using this TCE entry!
+ *
+ * If we add the first entry to fsp_fetch_lid_queue, we trigger fetching!
+ */
+static LIST_HEAD(fsp_fetch_lid_queue);
+static LIST_HEAD(fsp_fetched_lid);
+static struct lock fsp_fetch_lock = LOCK_UNLOCKED;
+
+/*
+ * Asynchronous fsp fetch data call
+ *
+ * Note:
+ * buffer = PSI DMA address space
+ */
+int fsp_fetch_data_queue(uint8_t flags, uint16_t id, uint32_t sub_id,
+ uint32_t offset, void *buffer, size_t *length,
+ void (*comp)(struct fsp_msg *msg))
+{
+ struct fsp_msg *msg;
+ uint32_t chunk = *length;
+
+ if (!comp)
+ return OPAL_PARAMETER;
+
+ msg = fsp_mkmsg(FSP_CMD_FETCH_SP_DATA, 0x6, flags << 16 | id,
+ sub_id, offset, 0, buffer, chunk);
+ if (!msg) {
+ prerror("FSP: allocation failed!\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (fsp_queue_msg(msg, comp)) {
+ fsp_freemsg(msg);
+ prerror("FSP: Failed to queue fetch data message\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ return OPAL_SUCCESS;
+}
+
+#define CAPP_IDX_VENICE_DD10 0x100ea
+#define CAPP_IDX_VENICE_DD20 0x200ea
+#define CAPP_IDX_MURANO_DD20 0x200ef
+#define CAPP_IDX_MURANO_DD21 0x201ef
+#define CAPP_IDX_NAPLES_DD10 0x100d3
+#define CAPP_IDX_NIMBUS_DD10 0x100d1
+#define CAPP_IDX_NIMBUS_DD20 0x200d1
+#define CAPP_IDX_NIMBUS_DD21 0x201d1
+#define CAPP_IDX_NIMBUS_DD22 0x202d1
+#define CAPP_IDX_NIMBUS_DD23 0x203d1
+
+#define IMA_CATALOG_NIMBUS 0x4e0200
+#define IMA_CATALOG_P10_DD1 0x800100
+#define IMA_CATALOG_P10_DD2 0x800200
+
+
+static struct {
+ enum resource_id id;
+ uint32_t idx;
+ uint32_t lid_no;
+} fsp_lid_map[] = {
+ { RESOURCE_ID_KERNEL, RESOURCE_SUBID_NONE, KERNEL_LID_OPAL },
+ { RESOURCE_ID_INITRAMFS,RESOURCE_SUBID_NONE, INITRAMFS_LID_OPAL },
+ { RESOURCE_ID_IMA_CATALOG,IMA_CATALOG_NIMBUS, 0x80f00103 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_MURANO_DD20, 0x80a02002 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_MURANO_DD21, 0x80a02001 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_VENICE_DD10, 0x80a02003 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_VENICE_DD20, 0x80a02004 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_NAPLES_DD10, 0x80a02005 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD10, 0x80a02006 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD20, 0x80a02007 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD21, 0x80a02007 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD22, 0x80a02007 },
+ { RESOURCE_ID_CAPP, CAPP_IDX_NIMBUS_DD23, 0x80a02007 },
+ { RESOURCE_ID_IMA_CATALOG,IMA_CATALOG_P10_DD1, 0x80f00103 },
+ { RESOURCE_ID_IMA_CATALOG,IMA_CATALOG_P10_DD2, 0x80f00103 },
+};
+
+static void fsp_start_fetching_next_lid(void);
+static void fsp_fetch_lid_next_chunk(struct fsp_fetch_lid_item *last);
+
+static void fsp_fetch_lid_complete(struct fsp_msg *msg)
+{
+ struct fsp_fetch_lid_item *last;
+ uint32_t woffset, wlen;
+ uint8_t rc;
+
+ lock(&fsp_fetch_lock);
+ last = list_top(&fsp_fetch_lid_queue, struct fsp_fetch_lid_item, link);
+ fsp_tce_unmap(PSI_DMA_FETCH, last->bsize);
+
+ woffset = fsp_msg_get_data_word(msg->resp, 1);
+ wlen = fsp_msg_get_data_word(msg->resp, 2);
+ rc = (msg->resp->word1 >> 8) & 0xff;
+
+ /* Fall back to a PHYP LID for kernel loads */
+ if (rc && last->lid_no == KERNEL_LID_OPAL) {
+ const char *ltype = dt_prop_get_def(dt_root, "lid-type", NULL);
+ if (!ltype || strcmp(ltype, "opal")) {
+ prerror("Failed to load in OPAL mode...\n");
+ last->result = OPAL_PARAMETER;
+ last = list_pop(&fsp_fetch_lid_queue,
+ struct fsp_fetch_lid_item, link);
+ list_add_tail(&fsp_fetched_lid, &last->link);
+ fsp_start_fetching_next_lid();
+ unlock(&fsp_fetch_lock);
+ return;
+ }
+ printf("Trying to load as PHYP LID...\n");
+ last->lid = KERNEL_LID_PHYP;
+ /* Retry with different LID */
+ fsp_fetch_lid_next_chunk(last);
+ }
+
+ if (rc != 0 && rc != 2) {
+ last->result = -EIO;
+ last = list_pop(&fsp_fetch_lid_queue, struct fsp_fetch_lid_item, link);
+ prerror("FSP LID %08x load ERROR %d\n", last->lid_no, rc);
+ list_add_tail(&fsp_fetched_lid, &last->link);
+ fsp_start_fetching_next_lid();
+ unlock(&fsp_fetch_lock);
+ return;
+ }
+
+ /*
+ * As per documentation, rc=2 means end of file not reached and
+ * rc=1 means we reached end of file. But it looks like we always
+ * get rc=0 irrespective of whether end of file is reached or not.
+ * The old implementation (fsp_sync_msg) used to rely on
+ * (wlen < chunk) to decide whether we reached end of file.
+ *
+ * Ideally the FSP folks should fix their code as per the documentation,
+ * but until they do, we add the old check (hack) back here.
+ *
+ * Without this hack some systems would load a partial LID and would
+ * not be able to boot into the petitboot kernel.
+ */
+ if (rc == 0 && (wlen < last->chunk_requested))
+ last->result = OPAL_SUCCESS;
+
+ fsp_freemsg(msg);
+
+ last->remaining -= wlen;
+ *(last->length) += wlen;
+ last->buffer += wlen;
+ last->offset += wlen;
+
+ prlog(PR_DEBUG, "FSP: LID %x Chunk read -> rc=0x%02x off: %08x"
+ " twritten: %08x\n", last->lid, rc, woffset, wlen);
+
+ fsp_fetch_lid_next_chunk(last);
+
+ unlock(&fsp_fetch_lock);
+}
+
+static void fsp_fetch_lid_next_chunk(struct fsp_fetch_lid_item *last)
+{
+ uint64_t baddr;
+ uint64_t balign, boff;
+ uint32_t chunk;
+ uint32_t taddr;
+ struct fsp_msg *msg;
+ uint8_t flags = 0;
+ uint16_t id = FSP_DATASET_NONSP_LID;
+ uint32_t sub_id;
+
+ assert(lock_held_by_me(&fsp_fetch_lock));
+
+ if (last->remaining == 0 || last->result == OPAL_SUCCESS) {
+ last->result = OPAL_SUCCESS;
+ last = list_pop(&fsp_fetch_lid_queue,
+ struct fsp_fetch_lid_item, link);
+ list_add_tail(&fsp_fetched_lid, &last->link);
+ fsp_start_fetching_next_lid();
+ return;
+ }
+
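+ /*
+ * The destination buffer is mapped through the fixed PSI_DMA_FETCH
+ * TCE window, so each chunk is limited to what fits in the window
+ * after accounting for the buffer's offset within its first TCE page.
+ */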
+ baddr = (uint64_t)last->buffer;
+ balign = baddr & ~TCE_MASK;
+ boff = baddr & TCE_MASK;
+
+ chunk = last->remaining;
+ if (chunk > (PSI_DMA_FETCH_SIZE - boff))
+ chunk = PSI_DMA_FETCH_SIZE - boff;
+ last->bsize = ((boff + chunk) + TCE_MASK) & ~TCE_MASK;
+ last->chunk_requested = chunk;
+
+ prlog(PR_DEBUG, "FSP: LID %08x chunk 0x%08x bytes balign=%llx"
+ " boff=%llx bsize=%llx\n",
+ last->lid_no, chunk, balign, boff, last->bsize);
+
+ fsp_tce_map(PSI_DMA_FETCH, (void *)balign, last->bsize);
+ taddr = PSI_DMA_FETCH + boff;
+
+ sub_id = last->lid;
+
+ msg = fsp_mkmsg(FSP_CMD_FETCH_SP_DATA, 6,
+ flags << 16 | id, sub_id, last->offset,
+ 0, taddr, chunk);
+
+ if (fsp_queue_msg(msg, fsp_fetch_lid_complete)) {
+ fsp_freemsg(msg);
+ prerror("FSP: Failed to queue fetch data message\n");
+ last->result = OPAL_INTERNAL_ERROR;
+ last = list_pop(&fsp_fetch_lid_queue,
+ struct fsp_fetch_lid_item, link);
+ list_add_tail(&fsp_fetched_lid, &last->link);
+ }
+ last->result = OPAL_BUSY;
+}
+
+static void fsp_start_fetching_next_lid(void)
+{
+ struct fsp_fetch_lid_item *last;
+
+ assert(lock_held_by_me(&fsp_fetch_lock));
+
+ last = list_top(&fsp_fetch_lid_queue, struct fsp_fetch_lid_item, link);
+
+ if (last == NULL)
+ return;
+
+ /* If we're not already fetching */
+ if (last->result == OPAL_EMPTY)
+ fsp_fetch_lid_next_chunk(last);
+}
+
+int fsp_start_preload_resource(enum resource_id id, uint32_t idx,
+ void *buf, size_t *size)
+{
+ struct fsp_fetch_lid_item *resource;
+ uint32_t lid_no = 0;
+ int i;
+
+ resource = malloc(sizeof(struct fsp_fetch_lid_item));
+ assert(resource != NULL);
+
+ resource->id = id;
+ resource->idx = idx;
+
+ resource->offset = 0;
+ resource->buffer = buf;
+ resource->remaining = *size;
+ *size = 0;
+ resource->length = size;
+ resource->result = OPAL_EMPTY;
+
+ for (i = 0; i < ARRAY_SIZE(fsp_lid_map); i++) {
+ if (id != fsp_lid_map[i].id)
+ continue;
+
+ if (fsp_lid_map[i].idx == idx) {
+ lid_no = fsp_lid_map[i].lid_no;
+ break;
+ }
+ }
+ if (lid_no == 0)
+ return OPAL_PARAMETER;
+
+ printf("Trying to load OPAL LID %08x...\n", lid_no);
+ resource->lid_no = lid_no;
+ resource->lid = fsp_adjust_lid_side(lid_no);
+
+ lock(&fsp_fetch_lock);
+ list_add_tail(&fsp_fetch_lid_queue, &resource->link);
+ fsp_start_fetching_next_lid();
+ unlock(&fsp_fetch_lock);
+
+ return OPAL_SUCCESS;
+}
+
+int fsp_resource_loaded(enum resource_id id, uint32_t idx)
+{
+ struct fsp_fetch_lid_item *resource = NULL;
+ struct fsp_fetch_lid_item *r;
+ int rc = OPAL_BUSY;
+
+ lock(&fsp_fetch_lock);
+ list_for_each(&fsp_fetched_lid, r, link) {
+ if (r->id == id && r->idx == idx) {
+ resource = r;
+ break;
+ }
+ }
+
+ if (resource) {
+ rc = resource->result;
+ list_del(&resource->link);
+ free(resource);
+ }
+ unlock(&fsp_fetch_lock);
+
+ return rc;
+}
+
+static int fsp_lid_loaded(uint32_t lid_no)
+{
+ struct fsp_fetch_lid_item *resource = NULL;
+ struct fsp_fetch_lid_item *r;
+ int rc = OPAL_BUSY;
+
+ lock(&fsp_fetch_lock);
+ list_for_each(&fsp_fetched_lid, r, link) {
+ if (r->lid_no == lid_no) {
+ resource = r;
+ break;
+ }
+ }
+
+ if (resource) {
+ rc = resource->result;
+ if (rc == OPAL_SUCCESS) {
+ list_del(&resource->link);
+ free(resource);
+ }
+ }
+ unlock(&fsp_fetch_lock);
+
+ return rc;
+}
+
+int fsp_preload_lid(uint32_t lid_no, char *buf, size_t *size)
+{
+ struct fsp_fetch_lid_item *resource;
+ int r = OPAL_SUCCESS;
+
+ resource = malloc(sizeof(struct fsp_fetch_lid_item));
+ assert(resource != NULL);
+
+ resource->id = -1;
+ resource->idx = -1;
+
+ resource->offset = 0;
+ resource->buffer = buf;
+ resource->remaining = *size;
+ *size = 0;
+ resource->length = size;
+ resource->result = OPAL_EMPTY;
+
+ if (lid_no == 0)
+ return OPAL_PARAMETER;
+
+ printf("Trying to load LID %08x from FSP\n", lid_no);
+ resource->lid_no = lid_no;
+ resource->lid = fsp_adjust_lid_side(lid_no);
+
+ lock(&fsp_fetch_lock);
+ list_add_tail(&fsp_fetch_lid_queue, &resource->link);
+ fsp_start_fetching_next_lid();
+ unlock(&fsp_fetch_lock);
+
+ return r;
+}
+
+int fsp_wait_lid_loaded(uint32_t lid_no)
+{
+ int r;
+ int waited = 0;
+
+ r = fsp_lid_loaded(lid_no);
+
+ while(r == OPAL_BUSY) {
+ opal_run_pollers();
+ time_wait_nopoll(msecs_to_tb(5));
+ waited+=5;
+ cpu_relax();
+ r = fsp_lid_loaded(lid_no);
+ }
+
+ prlog(PR_DEBUG, "FSP: fsp_wait_lid_loaded %x %u ms\n", lid_no, waited);
+
+ return r;
+}
+
+void fsp_used_by_console(void)
+{
+ fsp_lock.in_con_path = true;
+
+ /*
+ * Some other processor might hold it without having
+ * disabled the console locally so let's make sure that
+ * is over by taking/releasing the lock ourselves
+ */
+ lock(&fsp_lock);
+ unlock(&fsp_lock);
+}
diff --git a/roms/skiboot/hw/homer.c b/roms/skiboot/hw/homer.c
new file mode 100644
index 000000000..3ff6ed1ae
--- /dev/null
+++ b/roms/skiboot/hw/homer.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2019 IBM Corp. */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <chip.h>
+#include <mem_region.h>
+#include <hostservices.h>
+
+#define P8_PBA_BAR0 0x2013f00
+#define P8_PBA_BARMASK0 0x2013f04
+
+#define P9_PBA_BAR0 0x5012B00
+#define P9_PBA_BARMASK0 0x5012B04
+
+#define P10_PBA_BAR0 0x01010CDA
+#define P10_PBA_BARMASK0 0x01010CDE
+
+#define PBA_MASK_ALL_BITS 0x000001FFFFF00000ULL /* Bits 23:43 */
+
+enum P8_BAR {
+ P8_BAR_HOMER = 0,
+ P8_BAR_CENTAUR = 1,
+ P8_BAR_SLW = 2,
+ P8_BAR_OCC_COMMON = 3,
+};
+
+enum P9_BAR {
+ P9_BAR_HOMER = 0,
+ P9_BAR_CENTAUR = 1,
+ P9_BAR_OCC_COMMON = 2,
+ P9_BAR_SBE = 3,
+};
+
+enum P10_BAR {
+ P10_BAR_HOMER = 0,
+ P10_BAR_OCMB_THERMAL = 1,
+ P10_BAR_OCC_COMMON = 2,
+ P10_BAR_SBE = 3,
+};
+
+static u64 pba_bar0, pba_barmask0;
+static u8 bar_homer, bar_slw, bar_occ_common;
+
+static bool read_pba_bar(struct proc_chip *chip, unsigned int bar_no,
+ uint64_t *base, uint64_t *size)
+{
+ uint64_t bar, mask;
+ int rc;
+
+ rc = xscom_read(chip->id, pba_bar0 + bar_no, &bar);
+ if (rc) {
+ prerror("SLW: Error %d reading PBA BAR%d on chip %d\n",
+ rc, bar_no, chip->id);
+ return false;
+ }
+ rc = xscom_read(chip->id, pba_barmask0 + bar_no, &mask);
+ if (rc) {
+ prerror("SLW: Error %d reading PBA BAR MASK%d on chip %d\n",
+ rc, bar_no, chip->id);
+ return false;
+ }
+ prlog(PR_DEBUG, " PBA BAR%d : 0x%016llx\n", bar_no, bar);
+ prlog(PR_DEBUG, " PBA MASK%d: 0x%016llx\n", bar_no, mask);
+
+ if (mask == PBA_MASK_ALL_BITS) {
+ /*
+ * This could happen if all HOMER users are not enabled during
+ * early system bringup. Skip using the PBA BAR.
+ */
+ mask = 0;
+ bar = 0;
+ prerror(" PBA MASK%d uninitialized, skipping BAR\n", bar_no);
+ }
+
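+ /* The low 20 bits of the mask are the 1MB granularity: OR them in and add 1 to get the region size */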
+ *base = bar & 0x0ffffffffffffffful;
+ *size = (mask | 0xfffff) + 1;
+
+ return (*base) != 0;
+}
+
+static void homer_init_chip(struct proc_chip *chip)
+{
+ uint64_t hbase = 0, hsize = 0;
+ uint64_t sbase, ssize, obase, osize;
+
+ /*
+ * PBA BARs assigned by HB:
+ *
+ * P8:
+ * 0 : Entire HOMER
+ * 1 : OCC to Centaur path (we don't care)
+ * 2 : SLW image
+ * 3 : OCC Common area
+ *
+ * We need to reserve the memory covered by BAR 0 and BAR 3, however
+ * on earlier HBs, BAR0 isn't set so we need BAR 2 instead in that
+ * case to cover SLW (OCC not running).
+ *
+ * P9:
+ * 0 : Entire HOMER
+ * 1 : OCC to Centaur path (Cumulus only)
+ * 2 : OCC Common area
+ * 3 : SBE communication
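+ *
+ * P10 (per the P10_BAR enum above):
+ * 0 : Entire HOMER
+ * 1 : OCMB thermal area
+ * 2 : OCC Common area
+ * 3 : SBE communication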
+ *
+ */
+ if (read_pba_bar(chip, bar_homer, &hbase, &hsize)) {
+ prlog(PR_DEBUG, " HOMER Image at 0x%llx size %lldMB\n",
+ hbase, hsize / 0x100000);
+
+ if (!mem_range_is_reserved(hbase, hsize)) {
+ prlog(PR_WARNING,
+ "HOMER image is not reserved! Reserving\n");
+ mem_reserve_fw("ibm,homer-image", hbase, hsize);
+ }
+
+ chip->homer_base = hbase;
+ chip->homer_size = hsize;
+ }
+
+ /*
+ * We always read the SLW BAR since we need to grab info about the
+ * SLW image in the struct proc_chip for use by the slw.c code
+ */
+ if (proc_gen == proc_gen_p8 &&
+ read_pba_bar(chip, bar_slw, &sbase, &ssize)) {
+ prlog(PR_DEBUG, " SLW Image at 0x%llx size %lldMB\n",
+ sbase, ssize / 0x100000);
+
+ /*
+ * Only reserve it if we have no homer image or if it
+ * doesn't fit in it (only check the base).
+ */
+ if ((sbase < hbase || sbase > (hbase + hsize) ||
+ (hbase == 0 && sbase > 0)) &&
+ !mem_range_is_reserved(sbase, ssize)) {
+ prlog(PR_WARNING,
+ "SLW image is not reserved! Reserving\n");
+ mem_reserve_fw("ibm,slw-image", sbase, ssize);
+ }
+
+ chip->slw_base = sbase;
+ chip->slw_bar_size = ssize;
+ chip->slw_image_size = ssize; /* will be adjusted later */
+ }
+
+ if (read_pba_bar(chip, bar_occ_common, &obase, &osize)) {
+ prlog(PR_DEBUG, " OCC Common Area at 0x%llx size %lldMB\n",
+ obase, osize / 0x100000);
+ chip->occ_common_base = obase;
+ chip->occ_common_size = osize;
+ }
+}
+
+
+static void host_services_occ_base_setup(void)
+{
+ struct proc_chip *chip;
+ uint64_t occ_common;
+
+ chip = next_chip(NULL); /* First chip */
+ occ_common = (uint64_t) local_alloc(chip->id, OCC_COMMON_SIZE, OCC_COMMON_SIZE);
+
+ for_each_chip(chip) {
+ chip->occ_common_base = occ_common;
+ chip->occ_common_size = OCC_COMMON_SIZE;
+
+ chip->homer_base = (uint64_t) local_alloc(chip->id, HOMER_IMAGE_SIZE,
+ HOMER_IMAGE_SIZE);
+ chip->homer_size = HOMER_IMAGE_SIZE;
+ memset((void *)chip->homer_base, 0, chip->homer_size);
+
+ prlog(PR_DEBUG, "HBRT: Chip %d HOMER base %016llx : %08llx\n",
+ chip->id, chip->homer_base, chip->homer_size);
+ prlog(PR_DEBUG, "HBRT: OCC common base %016llx : %08llx\n",
+ chip->occ_common_base, chip->occ_common_size);
+ }
+}
+
+void homer_init(void)
+{
+ struct proc_chip *chip;
+
+ if (chip_quirk(QUIRK_NO_PBA))
+ return;
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ pba_bar0 = P8_PBA_BAR0;
+ pba_barmask0 = P8_PBA_BARMASK0;
+ bar_homer = P8_BAR_HOMER;
+ bar_slw = P8_BAR_SLW;
+ bar_occ_common = P8_BAR_OCC_COMMON;
+ break;
+ case proc_gen_p9:
+ pba_bar0 = P9_PBA_BAR0;
+ pba_barmask0 = P9_PBA_BARMASK0;
+ bar_homer = P9_BAR_HOMER;
+ bar_occ_common = P9_BAR_OCC_COMMON;
+ break;
+ case proc_gen_p10:
+ pba_bar0 = P10_PBA_BAR0;
+ pba_barmask0 = P10_PBA_BARMASK0;
+ bar_homer = P10_BAR_HOMER;
+ bar_occ_common = P10_BAR_OCC_COMMON;
+ break;
+ default:
+ return;
+ };
+
+ /*
+ * XXX This is temporary, on P8 we look for any configured
+ * SLW/OCC BAR and reserve the memory. Eventually, this will be
+ * done via HostBoot using the device-tree "reserved-ranges"
+ * or we'll load the SLW & OCC images ourselves using Host Services.
+ */
+ for_each_chip(chip) {
+ prlog(PR_DEBUG, "HOMER: Init chip %d\n", chip->id);
+ homer_init_chip(chip);
+ }
+
+ /*
+ * Check if the PBA BARs are already loaded with HOMER and
+ * skip host services.
+ */
+
+ chip = next_chip(NULL);
+ /* Both HOMER images and OCC areas are setup */
+ if (chip->homer_base && chip->occ_common_base) {
+ /* Reserve OCC common area from BAR */
+ if (!mem_range_is_reserved(chip->occ_common_base,
+ chip->occ_common_size)) {
+ prlog(PR_WARNING,
+ "OCC common area is not reserved! Reserving\n");
+ mem_reserve_fw("ibm,occ-common-area",
+ chip->occ_common_base,
+ chip->occ_common_size);
+ }
+ } else if (chip->homer_base) {
+ /*
+ * HOMER is set up but the OCC is not! Do not allocate HOMER
+ * regions. This case is possible during early system
+ * bringup, where the OCC images are not yet operational.
+ */
+ } else {
+ /* Allocate memory for HOMER and OCC common area */
+ host_services_occ_base_setup();
+ }
+}
+
diff --git a/roms/skiboot/hw/imc.c b/roms/skiboot/hw/imc.c
new file mode 100644
index 000000000..cbd68edc4
--- /dev/null
+++ b/roms/skiboot/hw/imc.c
@@ -0,0 +1,1075 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * In-Memory Counters (IMC)
+ * Sometimes called IMA, but that's also a different thing.
+ *
+ * Copyright 2016-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "IMC: " fmt
+#include <skiboot.h>
+#include <xscom.h>
+#include <imc.h>
+#include <chip.h>
+#include <libxz/xz.h>
+#include <device.h>
+#include <p9_stop_api.H>
+
+/*
+ * IMC trace scom values
+ */
+#define IMC_TRACE_CPMC1 0 /* select cpmc1 */
+#define IMC_TRACE_CPMC2 1 /* select cpmc2 */
+#define IMC_TRACE_CPMCLOAD_VAL 0xfa /*
+ * Value to be loaded into cpmc2
+ * at sampling start
+ */
+
+/* Event: CPM_32MHZ_CYC */
+#define IMC_TRACE_CPMC2SEL_VAL 2
+#define IMC_TRACE_CPMC1SEL_VAL 4
+
+#define IMC_TRACE_BUFF_SIZE 0 /*
+ * b'000' - 4K entries * 64 bytes per
+ * entry = 256K buffer size
+ */
+static uint64_t TRACE_IMC_ADDR;
+static uint64_t CORE_IMC_EVENT_MASK_ADDR;
+static uint64_t trace_scom_val;
+/*
+ * Initialise these with the pdbar and htm scom port address array
+ * at run time, based on the processor version.
+ */
+static unsigned int *pdbar_scom_index;
+static unsigned int *htm_scom_index;
+
+/*
+ * Nest IMC PMU names along with their bit values as represented in the
+ * imc_chip_avl_vector (see struct imc_chip_cb in include/imc.h).
+ * nest_pmus[] is an array containing all the possible nest IMC PMU node names.
+ */
+static char const *nest_pmus[] = {
+ "powerbus0",
+ "mcs0",
+ "mcs1",
+ "mcs2",
+ "mcs3",
+ "mcs4",
+ "mcs5",
+ "mcs6",
+ "mcs7",
+ "mba0",
+ "mba1",
+ "mba2",
+ "mba3",
+ "mba4",
+ "mba5",
+ "mba6",
+ "mba7",
+ "cen0",
+ "cen1",
+ "cen2",
+ "cen3",
+ "cen4",
+ "cen5",
+ "cen6",
+ "cen7",
+ "xlink0",
+ "xlink1",
+ "xlink2",
+ "mcd0",
+ "mcd1",
+ "phb0",
+ "phb1",
+ "phb2",
+ "phb3",
+ "phb4",
+ "phb5",
+ "nx",
+ "capp0",
+ "capp1",
+ "vas",
+ "int",
+ "alink0",
+ "alink1",
+ "alink2",
+ "alink3",
+ "nvlink0",
+ "nvlink1",
+ "nvlink2",
+ "nvlink3",
+ "nvlink4",
+ "nvlink5",
+ /* reserved bits : 51 - 63 */
+};
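+
+/*
+ * A minimal sketch of how this table is meant to be read: unit
+ * nest_pmus[i] is advertised only when bit i of the availability
+ * vector is set (big-endian PPC bit numbering), i.e.
+ *
+ * bool available = (avl_vec & PPC_BIT(i)) != 0;
+ *
+ * which mirrors the PPC_BITMASK(i, i) test in
+ * disable_unavailable_units() below.
+ */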
+
+/*
+ * Due to a Nest HW/OCC restriction, the microcode does not support
+ * individual unit events for the nest units mcs0, mcs1 ... mcs7 in
+ * accumulation mode. Events to monitor each mcs unit individually will
+ * be supported only in debug mode (to be added to the microcode in the
+ * future), and will be advertised only once OPAL provides an interface for it.
+ */
+static char const *debug_mode_units[] = {
+ "mcs0",
+ "mcs1",
+ "mcs2",
+ "mcs3",
+ "mcs4",
+ "mcs5",
+ "mcs6",
+ "mcs7",
+};
+
+/*
+ * Combined unit node events are counted when any of the individual
+ * unit is enabled in the availability vector. That is,
+ * ex, mcs01 unit node should be enabled only when mcs0 or mcs1 enabled.
+ * mcs23 unit node should be enabled only when mcs2 or mcs3 is enabled
+ */
+static struct combined_units_node cu_node[] = {
+ { .name = "mcs01", .unit1 = PPC_BIT(1), .unit2 = PPC_BIT(2) },
+ { .name = "mcs23", .unit1 = PPC_BIT(3), .unit2 = PPC_BIT(4) },
+ { .name = "mcs45", .unit1 = PPC_BIT(5), .unit2 = PPC_BIT(6) },
+ { .name = "mcs67", .unit1 = PPC_BIT(7), .unit2 = PPC_BIT(8) },
+};
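+
+/*
+ * Sketch of the combined-unit rule applied later in
+ * disable_unavailable_units(): a node such as "mcs01" is kept when
+ * either of its units is available and removed only when neither is:
+ *
+ * bool keep = (cu_node[i].unit1 & avl_vec) || (cu_node[i].unit2 & avl_vec);
+ */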
+
+static char *compress_buf;
+static size_t compress_buf_size;
+const char **prop_to_fix(struct dt_node *node);
+static const char *props_to_fix[] = {"events", NULL};
+
+static bool is_nest_mem_initialized(struct imc_chip_cb *ptr)
+{
+ /*
+ * A non-zero value in the "Status" field indicates the memory is initialized.
+ */
+ if (!ptr->imc_chip_run_status)
+ return false;
+
+ return true;
+}
+
+/*
+ * A Quad contains 4 cores in Power 9, and there are 4 addresses for
+ * the Core Hardware Trace Macro (CHTM) attached to each core.
+ * So, for core index 0 to core index 3, we have a sequential range of
+ * SCOM port addresses in the arrays below, each for Hardware Trace Macro (HTM)
+ * mode and PDBAR.
+ */
+static unsigned int pdbar_scom_index_p9[] = {
+ 0x1001220B,
+ 0x1001230B,
+ 0x1001260B,
+ 0x1001270B
+};
+static unsigned int htm_scom_index_p9[] = {
+ 0x10012200,
+ 0x10012300,
+ 0x10012600,
+ 0x10012700
+};
+
+static unsigned int pdbar_scom_index_p10[] = {
+ 0x2001868B,
+ 0x2001468B,
+ 0x2001268B,
+ 0x2001168B
+};
+
+static unsigned int htm_scom_index_p10[] = {
+ 0x20018680,
+ 0x20014680,
+ 0x20012680,
+ 0x20011680
+};
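+
+/*
+ * Sketch of how these tables are indexed by the callers below: the
+ * SCOM port for a core is chosen by its position within the quad,
+ *
+ * port_id = phys_core_id % 4;
+ * pdbar_addr = get_imc_scom_addr_for_quad(phys_core_id, pdbar_scom_index[port_id]);
+ *
+ * and likewise for htm_scom_index[].
+ */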
+
+static struct imc_chip_cb *get_imc_cb(uint32_t chip_id)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct imc_chip_cb *cb;
+
+ if (!chip->homer_base)
+ return NULL; /* The No Homers Club */
+
+ cb = (struct imc_chip_cb *)(chip->homer_base + P9_CB_STRUCT_OFFSET);
+ if (!is_nest_mem_initialized(cb))
+ return NULL;
+
+ return cb;
+}
+
+static int pause_microcode_at_boot(void)
+{
+ struct proc_chip *chip;
+ struct imc_chip_cb *cb;
+
+ for_each_chip(chip) {
+ cb = get_imc_cb(chip->id);
+ if (cb)
+ cb->imc_chip_command = cpu_to_be64(NEST_IMC_DISABLE);
+ else
+ return -1; /* ucode is not init-ed */
+ }
+
+ return 0;
+}
+
+/*
+ * Returns the list of property names for the fixup
+ */
+const char **prop_to_fix(struct dt_node *node)
+{
+ if (dt_node_is_compatible(node, "ibm,imc-counters"))
+ return props_to_fix;
+
+ return NULL;
+}
+
+/* Helper to get the IMC device type for a device node */
+static int get_imc_device_type(struct dt_node *node)
+{
+ const struct dt_property *type;
+ u32 val = 0;
+
+ if (!node)
+ return -1;
+
+ type = dt_find_property(node, "type");
+ if (!type)
+ return -1;
+
+ val = dt_prop_get_u32(node, "type");
+ switch (val) {
+ case IMC_COUNTER_CHIP:
+ return IMC_COUNTER_CHIP;
+ case IMC_COUNTER_CORE:
+ return IMC_COUNTER_CORE;
+ case IMC_COUNTER_THREAD:
+ return IMC_COUNTER_THREAD;
+ case IMC_COUNTER_TRACE:
+ return IMC_COUNTER_TRACE;
+ default:
+ break;
+ }
+
+ /* Unknown/Unsupported IMC device type */
+ return -1;
+}
+
+static bool is_nest_node(struct dt_node *node)
+{
+ if (get_imc_device_type(node) == IMC_COUNTER_CHIP)
+ return true;
+
+ return false;
+}
+
+static bool is_imc_device_type_supported(struct dt_node *node)
+{
+ u32 val = get_imc_device_type(node);
+ struct proc_chip *chip = get_chip(this_cpu()->chip_id);
+ uint64_t pvr;
+
+ if ((val == IMC_COUNTER_CHIP) || (val == IMC_COUNTER_CORE) ||
+ (val == IMC_COUNTER_THREAD))
+ return true;
+
+ if (val == IMC_COUNTER_TRACE) {
+ pvr = mfspr(SPR_PVR);
+
+ switch (chip->type) {
+ case PROC_CHIP_P9_NIMBUS:
+ /*
+ * Trace mode is supported in Nimbus DD2.2
+ * and later versions.
+ */
+ if ((PVR_VERS_MAJ(pvr) == 2) &&
+ (PVR_VERS_MIN(pvr) >= 2))
+ return true;
+ break;
+ case PROC_CHIP_P10:
+ return true;
+ default:
+ return false;
+ }
+
+ }
+ return false;
+}
+
+/*
+ * Helper to check for the imc device type in the incoming device tree.
+ * Remove unsupported device node.
+ */
+static void check_imc_device_type(struct dt_node *dev)
+{
+ struct dt_node *node;
+
+ dt_for_each_compatible(dev, node, "ibm,imc-counters") {
+ if (!is_imc_device_type_supported(node)) {
+ /*
+ * ah nice, found a device type which I didn't know.
+ * Remove it and also mark node as NULL, since dt_next
+ * will try to fetch info for "prev" which is removed
+ * by dt_free.
+ */
+ dt_free(node);
+ node = NULL;
+ }
+ }
+
+ return;
+}
+
+static void imc_dt_exports_prop_add(struct dt_node *dev)
+{
+ struct dt_node *node;
+ struct proc_chip *chip;
+ const struct dt_property *type;
+ uint32_t offset = 0, size = 0;
+ uint64_t baddr;
+ char namebuf[32];
+
+
+ dt_for_each_compatible(dev, node, "ibm,imc-counters") {
+ type = dt_find_property(node, "type");
+ if (type && is_nest_node(node)) {
+ offset = dt_prop_get_u32(node, "offset");
+ size = dt_prop_get_u32(node, "size");
+ }
+ }
+
+ /*
+ * Enable only if we have valid values.
+ */
+ if (!size && !offset)
+ return;
+
+ node = dt_find_by_name(opal_node, "exports");
+ if (!node)
+ return;
+
+ for_each_chip(chip) {
+ snprintf(namebuf, sizeof(namebuf), "imc_nest_chip_%x", chip->id);
+ baddr = chip->homer_base;
+ baddr += offset;
+ dt_add_property_u64s(node, namebuf, baddr, size);
+ }
+}
+
+/*
+ * Remove the PMU device nodes from the incoming new subtree, if they are not
+ * available in the hardware. The availability is described by the
+ * control block's imc_chip_avl_vector.
+ * Each bit represents a device unit. If the device is available, then
+ * the bit is set else its unset.
+ */
+static void disable_unavailable_units(struct dt_node *dev)
+{
+ uint64_t avl_vec;
+ struct imc_chip_cb *cb;
+ struct dt_node *target;
+ int i;
+ bool disable_all_nests = false;
+ struct proc_chip *chip;
+
+ /*
+ * Check the state of the ucode on all the chips.
+ * Disable the nest units if the ucode is not initialized
+ * on any of the chips.
+ */
+ for_each_chip(chip) {
+ cb = get_imc_cb(chip->id);
+ if (!cb) {
+ /*
+ * At least currently, if one chip isn't functioning,
+ * none of the IMC Nest units will be functional.
+ * So while you may *think* this should be per chip,
+ * it isn't.
+ */
+ disable_all_nests = true;
+ break;
+ }
+ }
+
+ /* Add a property to "exports" node in opal_node */
+ imc_dt_exports_prop_add(dev);
+
+ /* Fetch the IMC control block structure */
+ cb = get_imc_cb(this_cpu()->chip_id);
+ if (cb && !disable_all_nests)
+ avl_vec = be64_to_cpu(cb->imc_chip_avl_vector);
+ else {
+ avl_vec = 0; /* Remove only nest imc device nodes */
+
+ /* In case of mambo, just fake it */
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ avl_vec = (0xffULL) << 56;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(nest_pmus); i++) {
+ if (!(PPC_BITMASK(i, i) & avl_vec)) {
+ /* Check if the device node exists */
+ target = dt_find_by_name(dev, nest_pmus[i]);
+ if (!target)
+ continue;
+ /* Remove the device node */
+ dt_free(target);
+ }
+ }
+
+ /*
+ * Loop to detect debug mode units and remove them,
+ * since the microcode does not support debug mode functionality yet.
+ */
+ for (i = 0; i < ARRAY_SIZE(debug_mode_units); i++) {
+ target = dt_find_by_name(dev, debug_mode_units[i]);
+ if (!target)
+ continue;
+ /* Remove the device node */
+ dt_free(target);
+ }
+
+ /*
+ * Based on the unit availability vector from the control block,
+ * check and enable combined unit nodes in the device tree.
+ */
+ for (i = 0; i < MAX_NEST_COMBINED_UNITS; i++) {
+ if (!(cu_node[i].unit1 & avl_vec) &&
+ !(cu_node[i].unit2 & avl_vec)) {
+ target = dt_find_by_name(dev, cu_node[i].name);
+ if (!target)
+ continue;
+
+ /* Remove the device node */
+ dt_free(target);
+ }
+ }
+
+ return;
+}
+
+static void disable_imc_type_from_dt(struct dt_node *dev, int imc_type)
+{
+ struct dt_node *node;
+
+ dt_for_each_compatible(dev, node, "ibm,imc-counters") {
+ if (get_imc_device_type(node) == imc_type) {
+ dt_free(node);
+ node = NULL;
+ }
+ }
+
+ return;
+}
+
+/*
+ * Function to queue the loading of imc catalog data
+ * from the IMC pnor partition.
+ */
+void imc_catalog_preload(void)
+{
+ uint32_t pvr = (mfspr(SPR_PVR) & ~(0xf0ff));
+ int ret = OPAL_SUCCESS;
+ compress_buf_size = MAX_COMPRESSED_IMC_DTB_SIZE;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return;
+
+ /* Enable only for power 9/10 */
+ if (proc_gen < proc_gen_p9)
+ return;
+
+ compress_buf = malloc(MAX_COMPRESSED_IMC_DTB_SIZE);
+ if (!compress_buf) {
+ prerror("Memory allocation for catalog failed\n");
+ return;
+ }
+
+ ret = start_preload_resource(RESOURCE_ID_IMA_CATALOG,
+ pvr, compress_buf, &compress_buf_size);
+ if (ret != OPAL_SUCCESS) {
+ prerror("Failed to load IMA_CATALOG: %d\n", ret);
+ free(compress_buf);
+ compress_buf = NULL;
+ }
+
+ return;
+}
+
+static void imc_dt_update_nest_node(struct dt_node *dev)
+{
+ struct proc_chip *chip;
+ __be64 *base_addr = NULL;
+ __be32 *chipids = NULL;
+ int i = 0, nr_chip = nr_chips();
+ struct dt_node *node;
+ const struct dt_property *type;
+
+ /* Add the base_addr and chip-id properties for the nest node */
+ base_addr = malloc(sizeof(u64) * nr_chip);
+ chipids = malloc(sizeof(u32) * nr_chip);
+ for_each_chip(chip) {
+ base_addr[i] = cpu_to_be64(chip->homer_base);
+ chipids[i] = cpu_to_be32(chip->id);
+ i++;
+ }
+
+ dt_for_each_compatible(dev, node, "ibm,imc-counters") {
+ type = dt_find_property(node, "type");
+ if (type && is_nest_node(node)) {
+ dt_add_property(node, "base-addr", base_addr, (i * sizeof(u64)));
+ dt_add_property(node, "chip-id", chipids, (i * sizeof(u32)));
+ }
+ }
+}
+
+static struct xz_decompress *imc_xz;
+
+void imc_decompress_catalog(void)
+{
+ void *decompress_buf = NULL;
+ uint32_t pvr = (mfspr(SPR_PVR) & ~(0xf0ff));
+ int ret;
+
+ /* Check we succeeded in starting the preload */
+ if (compress_buf == NULL)
+ return;
+
+ ret = wait_for_resource_loaded(RESOURCE_ID_IMA_CATALOG, pvr);
+ if (ret != OPAL_SUCCESS) {
+ prerror("IMC Catalog load failed\n");
+ return;
+ }
+
+ /*
+ * Memory for decompression.
+ */
+ decompress_buf = malloc(MAX_DECOMPRESSED_IMC_DTB_SIZE);
+ if (!decompress_buf) {
+ prerror("No memory for decompress_buf \n");
+ return;
+ }
+
+ /*
+ * Decompress the compressed buffer
+ */
+ imc_xz = malloc(sizeof(struct xz_decompress));
+ if (!imc_xz) {
+ prerror("No memory to decompress IMC catalog\n");
+ free(decompress_buf);
+ return;
+ }
+
+ imc_xz->dst = decompress_buf;
+ imc_xz->src = compress_buf;
+ imc_xz->dst_size = MAX_DECOMPRESSED_IMC_DTB_SIZE;
+ imc_xz->src_size = compress_buf_size;
+ xz_start_decompress(imc_xz);
+}
+
+static int setup_imc_scoms(void)
+{
+ switch (proc_gen) {
+ case proc_gen_p9:
+ CORE_IMC_EVENT_MASK_ADDR = CORE_IMC_EVENT_MASK_ADDR_P9;
+ TRACE_IMC_ADDR = TRACE_IMC_ADDR_P9;
+ pdbar_scom_index = pdbar_scom_index_p9;
+ htm_scom_index = htm_scom_index_p9;
+ trace_scom_val = TRACE_IMC_SCOM(IMC_TRACE_CPMC2,
+ IMC_TRACE_CPMCLOAD_VAL,
+ IMC_TRACE_CPMC1SEL_VAL,
+ IMC_TRACE_CPMC2SEL_VAL,
+ IMC_TRACE_BUFF_SIZE);
+ return 0;
+ case proc_gen_p10:
+ CORE_IMC_EVENT_MASK_ADDR = CORE_IMC_EVENT_MASK_ADDR_P10;
+ TRACE_IMC_ADDR = TRACE_IMC_ADDR_P10;
+ pdbar_scom_index = pdbar_scom_index_p10;
+ htm_scom_index = htm_scom_index_p10;
+ trace_scom_val = TRACE_IMC_SCOM(IMC_TRACE_CPMC1,
+ IMC_TRACE_CPMCLOAD_VAL,
+ IMC_TRACE_CPMC1SEL_VAL,
+ IMC_TRACE_CPMC2SEL_VAL,
+ IMC_TRACE_BUFF_SIZE);
+ return 0;
+ default:
+ prerror("%s: Unknown cpu type\n", __func__);
+ break;
+ }
+ return -1;
+}
+
+/*
+ * Load the IMC pnor partition and find the appropriate sub-partition
+ * based on the platform's PVR.
+ * Decompress the sub-partition and link the imc device tree to the
+ * existing device tree.
+ */
+void imc_init(void)
+{
+ struct dt_node *dev;
+ int err_flag = -1;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) {
+ dev = dt_find_compatible_node(dt_root, NULL,
+ "ibm,opal-in-memory-counters");
+ if (!dev)
+ return;
+
+ goto imc_mambo;
+ }
+
+ /* Enable only for power 9/10 */
+ if (proc_gen < proc_gen_p9)
+ return;
+
+ if (!imc_xz)
+ return;
+
+ wait_xz_decompress(imc_xz);
+ if (imc_xz->status != OPAL_SUCCESS) {
+ prerror("IMC: xz_decompress failed\n");
+ goto err;
+ }
+
+ /*
+ * Flow of the data from PNOR to main device tree:
+ *
+ * PNOR -> compressed local buffer (compress_buf)
+ * compressed local buffer -> decompressed local buffer (decompress_buf)
+ * decompressed local buffer -> main device tree
+ * free compressed local buffer
+ */
+
+
+ /* Create a device tree entry for imc counters */
+ dev = dt_new_root("imc-counters");
+ if (!dev) {
+ prerror("IMC: Failed to add an imc-counters root node\n");
+ goto err;
+ }
+
+ /*
+ * Attach the new decompress_buf to the imc-counters node.
+ * dt_expand_node() does sanity checks for fdt_header, piggyback
+ */
+ if (dt_expand_node(dev, imc_xz->dst, 0) < 0) {
+ dt_free(dev);
+ prerror("IMC: dt_expand_node failed\n");
+ goto err;
+ }
+
+imc_mambo:
+ if (setup_imc_scoms()) {
+ prerror("IMC: Failed to setup the scoms\n");
+ goto err;
+ }
+
+ /* Check and remove unsupported imc device types */
+ check_imc_device_type(dev);
+
+ /*
+ * Check and remove unsupported nest unit nodes by the microcode,
+ * from the incoming device tree.
+ */
+ disable_unavailable_units(dev);
+
+ /* Fix the phandle in the incoming device tree */
+ dt_adjust_subtree_phandle(dev, prop_to_fix);
+
+ /* Update the base_addr and chip-id for nest nodes */
+ imc_dt_update_nest_node(dev);
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return;
+
+ /*
+ * IMC nest counters have both in-band (ucode) and out-of-band access.
+ * Since not all nest counter configurations are supported by the ucode,
+ * out-of-band tools are used to characterize the other configurations.
+ *
+ * If the ucode is not paused and the OS does not have IMC driver
+ * support, then out-of-band tools will race with the ucode and end up
+ * reading undesirable values. Hence pause the ucode if it is already
+ * running.
+ */
+ if (pause_microcode_at_boot()) {
+ prerror("IMC: Pausing ucode failed, disabling nest imc\n");
+ disable_imc_type_from_dt(dev, IMC_COUNTER_CHIP);
+ }
+
+ /*
+ * If the dt_attach_root() fails, "imc-counters" node will not be
+ * seen in the device-tree and hence OS should not make any
+ * OPAL_IMC_* calls.
+ */
+ if (!dt_attach_root(dt_root, dev)) {
+ dt_free(dev);
+ prerror("IMC: Failed to attach imc-counter node to dt root\n");
+ goto err;
+ }
+
+ err_flag = OPAL_SUCCESS;
+
+err:
+ if (err_flag != OPAL_SUCCESS)
+ prerror("IMC Devices not added\n");
+
+ free(compress_buf);
+ free(imc_xz->dst);
+ free(imc_xz);
+}
+
+static int stop_api_init(struct proc_chip *chip, int phys_core_id,
+ uint32_t scoms, uint64_t data,
+ const ScomOperation_t operation,
+ const ScomSection_t section,
+ const char *type)
+{
+ int ret;
+
+ prlog(PR_DEBUG, "Configuring stopapi for IMC\n");
+ ret = p9_stop_save_scom((void *)chip->homer_base, scoms,
+ data, operation, section);
+ if (ret) {
+ prerror("IMC %s stopapi ret = %d, scoms = %x (core id = %x)\n",\
+ type, ret, scoms, phys_core_id);
+ if (ret != STOP_SAVE_SCOM_ENTRY_UPDATE_FAILED)
+ wakeup_engine_state = WAKEUP_ENGINE_FAILED;
+ else
+ prerror("SCOM entries are full\n");
+ return OPAL_HARDWARE;
+ }
+
+ return ret;
+}
+
+/* Function to return the scom address for the specified core */
+static uint32_t get_imc_scom_addr_for_core(int core, uint64_t addr)
+{
+ uint32_t scom_addr;
+
+ switch (proc_gen) {
+ case proc_gen_p9:
+ scom_addr = XSCOM_ADDR_P9_EC(core, addr);
+ return scom_addr;
+ case proc_gen_p10:
+ scom_addr = XSCOM_ADDR_P10_EC(core, addr);
+ return scom_addr;
+ default:
+ return 0;
+ }
+}
+
+/* Function to return the scom address for the specified core in the quad */
+static uint32_t get_imc_scom_addr_for_quad(int core, uint64_t addr)
+{
+ uint32_t scom_addr;
+
+ switch (proc_gen) {
+ case proc_gen_p9:
+ scom_addr = XSCOM_ADDR_P9_EQ(core, addr);
+ return scom_addr;
+ case proc_gen_p10:
+ scom_addr = XSCOM_ADDR_P10_EQ(core, addr);
+ return scom_addr;
+ default:
+ return 0;
+ }
+}
+
+static int64_t core_imc_counters_init(uint64_t addr, int port_id,
+ int phys_core_id, struct cpu_thread *c)
+{
+ uint32_t pdbar_addr, event_mask_addr, htm_addr;
+ int ret;
+
+ /* Get the scom address for this core, based on the platform */
+ pdbar_addr = get_imc_scom_addr_for_quad(phys_core_id,
+ pdbar_scom_index[port_id]);
+ event_mask_addr = get_imc_scom_addr_for_core(phys_core_id,
+ CORE_IMC_EVENT_MASK_ADDR);
+
+ /*
+ * The Core IMC hardware mandates initialization of three scoms
+ * to enable or disable the Core IMC engine.
+ *
+ * PDBAR: Scom contains the real address to store per-core
+ * counter data in memory along with other bits.
+ *
+ * EventMask: Scom contains bits to denote the event to multiplex
+ * at different MSR[HV PR] values, along with bits for
+ * sampling duration.
+ *
+ * HTM Scom: scom to enable counter data movement to memory.
+ */
+
+
+ if (xscom_write(c->chip_id, pdbar_addr,
+ (u64)(CORE_IMC_PDBAR_MASK & addr))) {
+ prerror("error in xscom_write for pdbar\n");
+ return OPAL_HARDWARE;
+ }
+
+ if (has_deep_states) {
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) {
+ struct proc_chip *chip = get_chip(c->chip_id);
+
+ ret = stop_api_init(chip, phys_core_id, pdbar_addr,
+ (u64)(CORE_IMC_PDBAR_MASK & addr),
+ P9_STOP_SCOM_REPLACE,
+ P9_STOP_SECTION_EQ_SCOM,
+ "pdbar");
+ if (ret)
+ return ret;
+ ret = stop_api_init(chip, phys_core_id,
+ event_mask_addr,
+ (u64)CORE_IMC_EVENT_MASK,
+ P9_STOP_SCOM_REPLACE,
+ P9_STOP_SECTION_CORE_SCOM,
+ "event_mask");
+ if (ret)
+ return ret;
+ } else {
+ prerror("IMC: Wakeup engine not present!");
+ return OPAL_HARDWARE;
+ }
+ }
+
+ if (xscom_write(c->chip_id, event_mask_addr,
+ (u64)CORE_IMC_EVENT_MASK)) {
+ prerror("error in xscom_write for event mask\n");
+ return OPAL_HARDWARE;
+ }
+
+ /* Get the scom address for htm_mode scom based on the platform */
+ htm_addr = get_imc_scom_addr_for_quad(phys_core_id,
+ htm_scom_index[port_id]);
+ if (xscom_write(c->chip_id, htm_addr,
+ (u64)CORE_IMC_HTM_MODE_DISABLE)) {
+ prerror("error in xscom_write for htm mode\n");
+ return OPAL_HARDWARE;
+ }
+ return OPAL_SUCCESS;
+}
+
+/*
+ * opal_imc_counters_init : This call initializes the IMC engine.
+ *
+ * For Nest IMC, this is a no-op and returns OPAL_SUCCESS at this point.
+ * For Core IMC, this initializes the core IMC engine by setting up the
+ * "PDBAR", "HTM_MODE" and "EVENT_MASK" scoms on a given cpu.
+ */
+static int64_t opal_imc_counters_init(uint32_t type, uint64_t addr, uint64_t cpu_pir)
+{
+ struct cpu_thread *c = find_cpu_by_pir(cpu_pir);
+ int port_id, phys_core_id;
+ int ret;
+ uint32_t htm_addr, trace_addr;
+
+ switch (type) {
+ case OPAL_IMC_COUNTERS_NEST:
+ return OPAL_SUCCESS;
+ case OPAL_IMC_COUNTERS_CORE:
+ if (!c)
+ return OPAL_PARAMETER;
+
+ /*
+ * Core IMC hardware mandates setting of htm_mode and
+ * pdbar in specific scom ports. The port indices are in
+ * pdbar_scom_index[] and htm_scom_index[].
+ */
+ phys_core_id = pir_to_core_id(c->pir);
+ port_id = phys_core_id % 4;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return OPAL_SUCCESS;
+
+ ret = core_imc_counters_init(addr, port_id, phys_core_id, c);
+ if (ret < 0)
+ return ret;
+ /*
+ * If fused cores are supported, do the scoms for the
+ * secondary core as well.
+ */
+ if (this_cpu()->is_fused_core) {
+ struct cpu_thread *c1 = find_cpu_by_pir(cpu_pir ^ 1);
+
+ phys_core_id = pir_to_core_id(c1->pir);
+ port_id = phys_core_id % 4;
+
+ ret = core_imc_counters_init(addr, port_id, phys_core_id, c1);
+ if (ret < 0)
+ return ret;
+ }
+ return ret;
+ case OPAL_IMC_COUNTERS_TRACE:
+ if (!c)
+ return OPAL_PARAMETER;
+
+ phys_core_id = pir_to_core_id(c->pir);
+ port_id = phys_core_id % 4;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return OPAL_SUCCESS;
+
+ trace_addr = get_imc_scom_addr_for_core(phys_core_id,
+ TRACE_IMC_ADDR);
+ htm_addr = get_imc_scom_addr_for_quad(phys_core_id,
+ htm_scom_index[port_id]);
+
+ if (has_deep_states) {
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) {
+ struct proc_chip *chip = get_chip(c->chip_id);
+
+ ret = stop_api_init(chip, phys_core_id,
+ trace_addr,
+ trace_scom_val,
+ P9_STOP_SCOM_REPLACE,
+ P9_STOP_SECTION_CORE_SCOM,
+ "trace_imc");
+ if (ret)
+ return ret;
+ } else {
+ prerror("IMC-trace:Wakeup engine not present!");
+ return OPAL_HARDWARE;
+ }
+ }
+ if (xscom_write(c->chip_id, htm_addr, (u64)CORE_IMC_HTM_MODE_DISABLE)) {
+ prerror("IMC-trace: error in xscom_write for htm mode\n");
+ return OPAL_HARDWARE;
+ }
+ if (xscom_write(c->chip_id, trace_addr, trace_scom_val)) {
+ prerror("IMC-trace: error in xscom_write for trace mode\n");
+ return OPAL_HARDWARE;
+ }
+ return OPAL_SUCCESS;
+
+ }
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_IMC_COUNTERS_INIT, opal_imc_counters_init, 3);
+
+/* opal_imc_counters_control_start: This call starts the nest/core imc engine. */
+static int64_t opal_imc_counters_start(uint32_t type, uint64_t cpu_pir)
+{
+ u64 op;
+ struct cpu_thread *c = find_cpu_by_pir(cpu_pir);
+ struct imc_chip_cb *cb;
+ int port_id, phys_core_id;
+ uint32_t htm_addr;
+
+ if (!c)
+ return OPAL_PARAMETER;
+
+ switch (type) {
+ case OPAL_IMC_COUNTERS_NEST:
+ /* Fetch the IMC control block structure */
+ cb = get_imc_cb(c->chip_id);
+ if (!cb)
+ return OPAL_HARDWARE;
+
+ /* Set the run command */
+ op = NEST_IMC_ENABLE;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return OPAL_SUCCESS;
+
+ /* Write the command to the control block now */
+ cb->imc_chip_command = cpu_to_be64(op);
+
+ return OPAL_SUCCESS;
+ case OPAL_IMC_COUNTERS_CORE:
+ case OPAL_IMC_COUNTERS_TRACE:
+ /*
+ * Core IMC hardware mandates setting of htm_mode in specific
+ * scom ports (port_id are in htm_scom_index[])
+ */
+ phys_core_id = pir_to_core_id(c->pir);
+ port_id = phys_core_id % 4;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return OPAL_SUCCESS;
+
+ htm_addr = get_imc_scom_addr_for_quad(phys_core_id,
+ htm_scom_index[port_id]);
+ /*
+ * Enables the core imc engine by appropriately setting
+ * bits 4-9 of the HTM_MODE scom port. No initialization
+ * is done in this call. This just enables the counters
+ * to count using the previous initialization.
+ */
+ if (xscom_write(c->chip_id, htm_addr, (u64)CORE_IMC_HTM_MODE_ENABLE)) {
+ prerror("IMC OPAL_start: error in xscom_write for htm_mode\n");
+ return OPAL_HARDWARE;
+ }
+
+ return OPAL_SUCCESS;
+ }
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_IMC_COUNTERS_START, opal_imc_counters_start, 2);
+
+/* opal_imc_counters_control_stop: This call stops the nest imc engine. */
+static int64_t opal_imc_counters_stop(uint32_t type, uint64_t cpu_pir)
+{
+ u64 op;
+ struct imc_chip_cb *cb;
+ struct cpu_thread *c = find_cpu_by_pir(cpu_pir);
+ int port_id, phys_core_id;
+ uint32_t htm_addr;
+
+ if (!c)
+ return OPAL_PARAMETER;
+
+ switch (type) {
+ case OPAL_IMC_COUNTERS_NEST:
+ /* Fetch the IMC control block structure */
+ cb = get_imc_cb(c->chip_id);
+ if (!cb)
+ return OPAL_HARDWARE;
+
+ /* Set the run command */
+ op = NEST_IMC_DISABLE;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return OPAL_SUCCESS;
+
+ /* Write the command to the control block */
+ cb->imc_chip_command = cpu_to_be64(op);
+
+ return OPAL_SUCCESS;
+
+ case OPAL_IMC_COUNTERS_CORE:
+ case OPAL_IMC_COUNTERS_TRACE:
+ /*
+ * Core IMC hardware mandates setting of htm_mode in specific
+ * scom ports (port_id are in htm_scom_index[])
+ */
+ phys_core_id = pir_to_core_id(c->pir);
+ port_id = phys_core_id % 4;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS)
+ return OPAL_SUCCESS;
+
+ htm_addr = get_imc_scom_addr_for_quad(phys_core_id,
+ htm_scom_index[port_id]);
+ /*
+ * Disables the core imc engine by clearing
+ * bits 4-9 of the HTM_MODE scom port.
+ */
+ if (xscom_write(c->chip_id, htm_addr, (u64) CORE_IMC_HTM_MODE_DISABLE)) {
+ prerror("error in xscom_write for htm_mode\n");
+ return OPAL_HARDWARE;
+ }
+
+ return OPAL_SUCCESS;
+ }
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_IMC_COUNTERS_STOP, opal_imc_counters_stop, 2);
diff --git a/roms/skiboot/hw/ipmi/Makefile.inc b/roms/skiboot/hw/ipmi/Makefile.inc
new file mode 100644
index 000000000..c6b36a2b3
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/Makefile.inc
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+
+SUBDIRS += hw/ipmi
+
+IPMI_OBJS = ipmi-rtc.o ipmi-power.o ipmi-fru.o ipmi-sel.o
+IPMI_OBJS += ipmi-watchdog.o ipmi-sensor.o ipmi-attn.o ipmi-info.o
+
+IPMI = hw/ipmi/built-in.a
+$(IPMI): $(IPMI_OBJS:%=hw/ipmi/%)
diff --git a/roms/skiboot/hw/ipmi/ipmi-attn.c b/roms/skiboot/hw/ipmi/ipmi-attn.c
new file mode 100644
index 000000000..280b2525f
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-attn.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * When everything is terrible, tell the FSP as much as possible as to why
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <errorlog.h>
+#include <ipmi.h>
+#include <pel.h>
+#include <platform.h>
+#include <processor.h>
+#include <sbe-p9.h>
+#include <skiboot.h>
+#include <stack.h>
+#include <timebase.h>
+#include <xscom.h>
+
+/* Use same attention SRC for BMC based machine */
+DEFINE_LOG_ENTRY(OPAL_RC_ATTN, OPAL_PLATFORM_ERR_EVT,
+ OPAL_ATTN, OPAL_PLATFORM_FIRMWARE,
+ OPAL_ERROR_PANIC, OPAL_ABNORMAL_POWER_OFF);
+
+/* Maximum buffer size to capture backtrace and other useful information */
+#define IPMI_TI_BUFFER_SIZE (IPMI_MAX_PEL_SIZE - PEL_MIN_SIZE)
+static char ti_buffer[IPMI_TI_BUFFER_SIZE];
+
+#define STACK_BUF_ENTRIES 20
+static struct bt_entry bt_buf[STACK_BUF_ENTRIES];
+
+/* Log eSEL event with OPAL backtrace */
+static void ipmi_log_terminate_event(const char *msg)
+{
+ struct bt_metadata metadata;
+ unsigned int ti_len;
+ unsigned int ti_size;
+ struct errorlog *elog_buf;
+
+ /* Fill OPAL version */
+ ti_len = snprintf(ti_buffer, IPMI_TI_BUFFER_SIZE,
+ "OPAL version : %s\n", version);
+
+ /* File information */
+ ti_len += snprintf(ti_buffer + ti_len, IPMI_TI_BUFFER_SIZE - ti_len,
+ "File info : %s\n", msg);
+ ti_size = IPMI_TI_BUFFER_SIZE - ti_len;
+
+ /* Backtrace */
+ backtrace_create(bt_buf, STACK_BUF_ENTRIES, &metadata);
+ metadata.token = OPAL_LAST + 1;
+ backtrace_print(bt_buf, &metadata, ti_buffer + ti_len, &ti_size, true);
+
+ /* Create eSEL event and commit */
+ elog_buf = opal_elog_create(&e_info(OPAL_RC_ATTN), 0);
+ log_append_data(elog_buf, (char *)&ti_buffer, ti_len + ti_size);
+ log_commit(elog_buf);
+}
+
+void __attribute__((noreturn)) ipmi_terminate(const char *msg)
+{
+ /* Log eSEL event */
+ if (ipmi_present())
+ ipmi_log_terminate_event(msg);
+
+ /*
+ * If mpipl is supported then trigger SBE interrupt
+ * to initiate mpipl
+ */
+ p9_sbe_terminate();
+
+ /*
+ * Trigger a software xstop (OPAL TI). It will stop all the CPU threads,
+ * moving them into the quiesced state, and the OCC will collect all FIR
+ * data. Upon the checkstop signal, the BMC then decides whether to
+ * reboot/IPL or not, depending on its AutoReboot policy, if any. This
+ * helps in cases where OPAL is crashing/terminating before the host
+ * reaches runtime. With the OpenBMC AutoReboot policy, in such cases,
+ * it will make sure the system is moved to the Quiesced state after 3
+ * or so IPL attempts. Without OPAL TI, OpenBMC would never know that
+ * OPAL is terminating and the system would go into a never-ending IPL
+ * loop.
+ *
+ * Once the system reaches runtime, OpenBMC resets the boot counter.
+ * Hence the next time the BMC receives an OPAL TI, it will IPL the
+ * system if AutoReboot is enabled. We don't need to worry about self
+ * rebooting.
+ */
+
+ xscom_trigger_xstop();
+ /*
+ * Control will not reach here if software xstop is supported and
+ * enabled. If not supported, fall back to the cec reboot path below.
+ */
+
+ /* Reboot call */
+ if (platform.cec_reboot)
+ platform.cec_reboot();
+
+ while (1)
+ time_wait_ms(100);
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-fru.c b/roms/skiboot/hw/ipmi/ipmi-fru.c
new file mode 100644
index 000000000..86c9ca0ce
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-fru.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Fill out firmware related FRUs (Field Replaceable Units)
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ipmi.h>
+#include <lock.h>
+#include <opal.h>
+#include <device.h>
+
+struct product_info {
+ char *manufacturer;
+ char *product;
+ char *part_no;
+ char *version;
+ char *serial_no;
+ char *asset_tag;
+};
+
+struct common_header {
+ u8 version;
+ u8 internal_offset;
+ u8 chassis_offset;
+ u8 board_offset;
+ u8 product_offset;
+ u8 multirecord_offset;
+ u8 pad;
+ u8 checksum;
+} __packed;
+
+/* The maximum amount of FRU data we can store. */
+#define FRU_DATA_SIZE 256
+
+/* We allocate two bytes at these locations in the data array to track
+ * state. */
+#define WRITE_INDEX 256
+#define REMAINING 257
+
+/* The ASCII string encoding used only has 5 bits to encode length
+ * hence the maximum is 31 characters. */
+#define MAX_STR_LEN 31
+
+static u8 fru_dev_id = 0;
+
+static int fru_insert_string(u8 *buf, char *str)
+{
+ int len = strlen(str);
+
+ /* The ASCII type/length format only supports a string length
+ * between 2 and 31 characters. Zero characters is ok though
+ * as it indicates no data present. */
+ if (len == 1 || len > MAX_STR_LEN)
+ return OPAL_PARAMETER;
+
+ buf[0] = 0xc0 | len;
+ memcpy(&buf[1], str, len);
+
+ return len + 1;
+}
+
+static u8 fru_checksum(u8 *buf, int len)
+{
+ int i;
+ u8 checksum = 0;
+
+ for(i = 0; i < len; i++) {
+ checksum += buf[i];
+ }
+ checksum = ~checksum + 1;
+ return checksum;
+}
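+
+/*
+ * The checksum above is the two's complement of the byte sum, so a
+ * record verifies when all its bytes, including the checksum, sum to
+ * zero modulo 256. A minimal check for illustration:
+ *
+ * u8 sum = 0;
+ * for (i = 0; i < len_including_checksum; i++)
+ * sum += buf[i];
+ * valid = (sum == 0);
+ */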
+
+#define FRU_INSERT_STRING(x, y) \
+ ({ rc = fru_insert_string(x, y); \
+ { if (rc < 1) return OPAL_PARAMETER; } rc; })
+
+static int fru_fill_product_info(u8 *buf, struct product_info *info, size_t size)
+{
+ size_t total_size = 11;
+ int index = 0;
+ int rc;
+
+ total_size += strlen(info->manufacturer);
+ total_size += strlen(info->product);
+ total_size += strlen(info->part_no);
+ total_size += strlen(info->version);
+ total_size += strlen(info->serial_no);
+ total_size += strlen(info->asset_tag);
+ total_size += (8 - (total_size % 8)) % 8;
+ if (total_size > size)
+ return OPAL_PARAMETER;
+
+ buf[index++] = 0x1; /* Version */
+ buf[index++] = total_size / 8; /* Size */
+ buf[index++] = 0; /* Language code (English) */
+
+ index += FRU_INSERT_STRING(&buf[index], info->manufacturer);
+ index += FRU_INSERT_STRING(&buf[index], info->product);
+ index += FRU_INSERT_STRING(&buf[index], info->part_no);
+ index += FRU_INSERT_STRING(&buf[index], info->version);
+ index += FRU_INSERT_STRING(&buf[index], info->serial_no);
+ index += FRU_INSERT_STRING(&buf[index], info->asset_tag);
+
+ buf[index++] = 0xc1; /* End of data marker */
+ memset(&buf[index], 0, total_size - index - 1);
+ index += total_size - index - 1;
+ buf[index] = fru_checksum(buf, index);
+ assert(index == total_size - 1);
+
+ return total_size;
+}
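+
+/*
+ * Worked example of the sizing above: with "IBM" (3), "skiboot" (7),
+ * empty part number, serial number and asset tag strings and a
+ * 9-character version string, total_size = 11 + 3 + 7 + 9 = 30,
+ * padded up to 32, so the area length byte written at buf[1] is
+ * 32 / 8 = 4.
+ */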
+
+static int fru_add(u8 *buf, int size)
+{
+ int len;
+ struct common_header common_hdr;
+ char *short_version;
+ struct product_info info = {
+ .manufacturer = (char *) "IBM",
+ .product = (char *) "skiboot",
+ .part_no = (char *) "",
+ .serial_no = (char *) "",
+ .asset_tag = (char *) "",
+ };
+
+ if (size < sizeof(common_hdr))
+ return OPAL_PARAMETER;
+
+ /* We currently only support adding the version number at the
+ * product information offset. We choose an offset of 64 bytes
+ * because that's what the standard recommends. */
+ common_hdr.version = 1;
+ common_hdr.internal_offset = 0;
+ common_hdr.chassis_offset = 0;
+ common_hdr.board_offset = 0;
+ common_hdr.product_offset = 64/8;
+ common_hdr.multirecord_offset = 0;
+ common_hdr.pad = 0;
+ common_hdr.checksum = fru_checksum((u8 *) &common_hdr, sizeof(common_hdr) - 1);
+ memcpy(buf, &common_hdr, sizeof(common_hdr));
+
+ short_version = strdup(version);
+ info.version = short_version;
+ if (!strncmp(version, "skiboot-", 8))
+ info.version = &short_version[8];
+
+ if (strlen(info.version) >= MAX_STR_LEN) {
+ if (info.version[MAX_STR_LEN] != '\0')
+ info.version[MAX_STR_LEN - 1] = '+';
+ info.version[MAX_STR_LEN] = '\0';
+ }
+
+ len = fru_fill_product_info(&buf[64], &info, size - 64);
+ free(short_version);
+ if (len < 0)
+ return OPAL_PARAMETER;
+
+ return len + 64;
+}
+
+static void fru_write_complete(struct ipmi_msg *msg)
+{
+ u8 write_count = msg->data[0];
+ u16 offset;
+
+ msg->data[WRITE_INDEX] += write_count;
+ msg->data[REMAINING] -= write_count;
+ if (msg->data[REMAINING] == 0)
+ goto out;
+
+ offset = msg->data[WRITE_INDEX];
+ ipmi_init_msg(msg, IPMI_DEFAULT_INTERFACE, IPMI_WRITE_FRU,
+ fru_write_complete, NULL,
+ MIN(msg->data[REMAINING] + 3, IPMI_MAX_REQ_SIZE), 2);
+
+ memmove(&msg->data[3], &msg->data[offset + 3], msg->req_size - 3);
+
+ msg->data[0] = fru_dev_id; /* FRU Device ID */
+ msg->data[1] = offset & 0xff; /* Offset LSB */
+ msg->data[2] = (offset >> 8) & 0xff; /* Offset MSB */
+
+ ipmi_queue_msg(msg);
+
+ return;
+
+out:
+ ipmi_free_msg(msg);
+}
+
+static int fru_write(void)
+{
+ struct ipmi_msg *msg;
+ int len;
+
+ /* We allocate FRU_DATA_SIZE + 5 bytes for the message:
+ * - 3 bytes for the write FRU command header
+ * - FRU_DATA_SIZE bytes for FRU data
+ * - 2 bytes for offset & bytes remaining count
+ */
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_WRITE_FRU,
+ fru_write_complete, NULL, NULL, FRU_DATA_SIZE + 5, 2);
+ if (!msg)
+ return OPAL_RESOURCE;
+
+ msg->data[0] = fru_dev_id; /* FRU Device ID */
+ msg->data[1] = 0x0; /* Offset LSB (we always write a new common header) */
+ msg->data[2] = 0x0; /* Offset MSB */
+ len = fru_add(&msg->data[3], FRU_DATA_SIZE);
+
+ if (len < 0)
+ return len;
+
+ /* Three bytes for the actual FRU Data Command */
+ msg->data[WRITE_INDEX] = 0;
+ msg->data[REMAINING] = len;
+ msg->req_size = MIN(len + 3, IPMI_MAX_REQ_SIZE);
+ return ipmi_queue_msg(msg);
+}
+
+void ipmi_fru_init(u8 dev_id)
+{
+ fru_dev_id = dev_id;
+ fru_write();
+
+ return;
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-info.c b/roms/skiboot/hw/ipmi/ipmi-info.c
new file mode 100644
index 000000000..d93b59d7d
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-info.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Various bits of info retrieved over IPMI
+ *
+ * Copyright 2018-2019 IBM Corp.
+ */
+
+#include <device.h>
+#include <skiboot.h>
+#include <stdlib.h>
+#include <ipmi.h>
+#include <mem_region-malloc.h>
+#include <opal.h>
+#include <timebase.h>
+
+/*
+ * Response data from IPMI Get device ID command (As defined in
+ * Section 20.1 Get Device ID Command - IPMI standard spec).
+ */
+struct ipmi_dev_id {
+ uint8_t dev_id;
+ uint8_t dev_revision;
+ uint8_t fw_rev1;
+ uint8_t fw_rev2;
+ uint8_t ipmi_ver;
+ uint8_t add_dev_support;
+ uint8_t manufactur_id[3];
+ uint8_t product_id[2];
+ uint8_t aux_fw_rev[4];
+};
+static struct ipmi_dev_id *ipmi_dev_id;
+
+/*
+ * Response data from IPMI Chassis Get System Boot Option (As defined in
+ * Section 28.13 Get System Boot Options Command - IPMI standard spec).
+ */
+struct ipmi_sys_boot_opt {
+ uint8_t param_version;
+ uint8_t param_valid;
+ /*
+ * Fields for OEM parameter 0x62. This parameter does not follow
+ * the normal layout and just has a single byte to signal if it
+ * is active or not.
+ */
+ uint8_t flag_set;
+};
+static struct ipmi_sys_boot_opt *ipmi_sys_boot_opt;
+
+/* Got response from BMC? */
+static bool bmc_info_waiting = false;
+static bool bmc_info_valid = false;
+static bool bmc_boot_opt_waiting = false;
+static bool bmc_boot_opt_valid = false;
+
+/* This will free ipmi_dev_id structure */
+void ipmi_dt_add_bmc_info(void)
+{
+ char buf[8];
+ struct dt_node *dt_fw_version;
+
+ while (bmc_info_waiting)
+ time_wait_ms(5);
+
+ if (!bmc_info_valid)
+ return;
+
+ dt_fw_version = dt_find_by_name(dt_root, "ibm,firmware-versions");
+ if (!dt_fw_version) {
+ free(ipmi_dev_id);
+ return;
+ }
+
+ memset(buf, 0, sizeof(buf));
+ snprintf(buf, sizeof(buf), "%x.%02x",
+ ipmi_dev_id->fw_rev1, ipmi_dev_id->fw_rev2);
+ dt_add_property_string(dt_fw_version, "bmc-firmware-version", buf);
+
+ free(ipmi_dev_id);
+}
+
+static void ipmi_get_bmc_info_resp(struct ipmi_msg *msg)
+{
+ bmc_info_waiting = false;
+
+ if (msg->cc != IPMI_CC_NO_ERROR) {
+ prlog(PR_ERR, "IPMI: IPMI_BMC_GET_DEVICE_ID cmd returned error"
+ " [rc : 0x%x]\n", msg->data[0]);
+ return;
+ }
+
+ /* ipmi_dev_id has optional fields */
+ if (msg->resp_size <= sizeof(struct ipmi_dev_id)) {
+ bmc_info_valid = true;
+ memcpy(ipmi_dev_id, msg->data, msg->resp_size);
+ } else {
+ prlog(PR_WARNING, "IPMI: IPMI_BMC_GET_DEVICE_ID unexpected response size\n");
+ }
+
+ ipmi_free_msg(msg);
+}
+
+int ipmi_get_bmc_info_request(void)
+{
+ int rc;
+ struct ipmi_msg *msg;
+
+ ipmi_dev_id = zalloc(sizeof(struct ipmi_dev_id));
+ assert(ipmi_dev_id);
+
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_BMC_GET_DEVICE_ID,
+ ipmi_get_bmc_info_resp, NULL, NULL,
+ 0, sizeof(struct ipmi_dev_id));
+ if (!msg)
+ return OPAL_NO_MEM;
+
+ msg->error = ipmi_get_bmc_info_resp;
+ prlog(PR_INFO, "IPMI: Requesting IPMI_BMC_GET_DEVICE_ID\n");
+ rc = ipmi_queue_msg(msg);
+ if (rc) {
+ prlog(PR_ERR, "IPMI: Failed to queue IPMI_BMC_GET_DEVICE_ID\n");
+ ipmi_free_msg(msg);
+ return rc;
+ }
+
+ bmc_info_waiting = true;
+ return rc;
+}
+
+/* This will free ipmi_sys_boot_opt structure */
+int ipmi_chassis_check_sbe_validation(void)
+{
+ int rc = -1;
+
+ while (bmc_boot_opt_waiting)
+ time_wait_ms(10);
+
+ if (!bmc_boot_opt_valid)
+ goto out;
+
+ if ((ipmi_sys_boot_opt->param_valid & 0x8) != 0)
+ goto out;
+ if (ipmi_sys_boot_opt->param_valid != 0x62)
+ goto out;
+
+ rc = ipmi_sys_boot_opt->flag_set;
+
+out:
+ free(ipmi_sys_boot_opt);
+ return rc;
+}
+
+static void ipmi_get_chassis_boot_opt_resp(struct ipmi_msg *msg)
+{
+ bmc_boot_opt_waiting = false;
+
+ if (msg->cc != IPMI_CC_NO_ERROR) {
+ prlog(PR_INFO, "IPMI: IPMI_CHASSIS_GET_BOOT_OPT cmd returned error"
+ " [rc : 0x%x]\n", msg->data[0]);
+ ipmi_free_msg(msg);
+ return;
+ }
+
+ if (msg->resp_size == sizeof(struct ipmi_sys_boot_opt)) {
+ bmc_boot_opt_valid = true;
+ memcpy(ipmi_sys_boot_opt, msg->data, msg->resp_size);
+ } else {
+ prlog(PR_WARNING, "IPMI: IPMI_CHASSIS_GET_BOOT_OPT unexpected response size\n");
+ }
+
+ ipmi_free_msg(msg);
+}
+
+int ipmi_get_chassis_boot_opt_request(void)
+{
+ int rc;
+ struct ipmi_msg *msg;
+ uint8_t req[] = {
+ 0x62, /* OEM parameter (SBE Validation on astbmc) */
+ 0x00, /* no set selector */
+ 0x00, /* no block selector */
+ };
+
+ ipmi_sys_boot_opt = zalloc(sizeof(struct ipmi_sys_boot_opt));
+ assert(ipmi_sys_boot_opt);
+
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_CHASSIS_GET_BOOT_OPT,
+ ipmi_get_chassis_boot_opt_resp, NULL, req,
+ sizeof(req), sizeof(struct ipmi_sys_boot_opt));
+ if (!msg) {
+ free(ipmi_sys_boot_opt);
+ return OPAL_NO_MEM;
+ }
+
+ msg->error = ipmi_get_chassis_boot_opt_resp;
+ prlog(PR_INFO, "IPMI: Requesting IPMI_CHASSIS_GET_BOOT_OPT\n");
+ rc = ipmi_queue_msg(msg);
+ if (rc) {
+ prlog(PR_ERR, "IPMI: Failed to queue IPMI_CHASSIS_GET_BOOT_OPT\n");
+ free(ipmi_sys_boot_opt);
+ ipmi_free_msg(msg);
+ return rc;
+ }
+
+ bmc_boot_opt_waiting = true;
+ return rc;
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-power.c b/roms/skiboot/hw/ipmi/ipmi-power.c
new file mode 100644
index 000000000..8101a8524
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-power.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Power as in electricity, not POWER as in POWER
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <stdlib.h>
+#include <ipmi.h>
+#include <opal.h>
+#include <timebase.h>
+
+static void ipmi_chassis_control_complete(struct ipmi_msg *msg)
+{
+ uint8_t request = msg->data[0];
+ uint8_t cc = msg->cc;
+
+ ipmi_free_msg(msg);
+ if (cc == IPMI_CC_NO_ERROR)
+ return;
+
+ prlog(PR_INFO, "IPMI: Chassis control request failed. "
+ "request=0x%02x, rc=0x%02x\n", request, cc);
+
+ if (ipmi_chassis_control(request)) {
+ prlog(PR_INFO, "IPMI: Failed to resend chassis control "
+ "request [0x%02x]\n", request);
+ }
+}
+
+int ipmi_chassis_control(uint8_t request)
+{
+ struct ipmi_msg *msg;
+
+ if (!ipmi_present())
+ return OPAL_CLOSED;
+
+ if (request > IPMI_CHASSIS_SOFT_SHUTDOWN)
+ return OPAL_PARAMETER;
+
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_CHASSIS_CONTROL,
+ ipmi_chassis_control_complete, NULL,
+ &request, sizeof(request), 0);
+ if (!msg)
+ return OPAL_HARDWARE;
+ /* Set msg->error callback function */
+ msg->error = ipmi_chassis_control_complete;
+
+ prlog(PR_INFO, "IPMI: sending chassis control request 0x%02x\n",
+ request);
+
+ return ipmi_queue_msg(msg);
+}
+
+int ipmi_set_power_state(uint8_t system, uint8_t device)
+{
+ struct ipmi_msg *msg;
+ struct {
+ uint8_t system;
+ uint8_t device;
+ } power_state;
+
+ if (!ipmi_present())
+ return OPAL_CLOSED;
+
+ power_state.system = system;
+ power_state.device = device;
+
+ if (system != IPMI_PWR_NOCHANGE)
+ power_state.system |= 0x80;
+ if (device != IPMI_PWR_NOCHANGE)
+ power_state.device |= 0x80;
+
+ msg = ipmi_mkmsg_simple(IPMI_SET_POWER_STATE, &power_state,
+ sizeof(power_state));
+
+ if (!msg)
+ return OPAL_HARDWARE;
+
+ prlog(PR_INFO, "IPMI: setting power state: sys %02x, dev %02x\n",
+ power_state.system, power_state.device);
+
+ return ipmi_queue_msg(msg);
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-rtc.c b/roms/skiboot/hw/ipmi/ipmi-rtc.c
new file mode 100644
index 000000000..52da2946c
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-rtc.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Talk to a Real Time Clock (RTC) over IPMI
+ *
+ * Copyright 2013-2015 IBM Corp.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ipmi.h>
+#include <time.h>
+#include <time-utils.h>
+#include <device.h>
+#include <opal.h>
+#include <rtc.h>
+
+static enum {idle, waiting, updated, error} time_status;
+
+static void get_sel_time_error(struct ipmi_msg *msg)
+{
+ time_status = error;
+ ipmi_free_msg(msg);
+}
+
+static void get_sel_time_complete(struct ipmi_msg *msg)
+{
+ struct tm tm;
+ le32 result;
+ time_t time;
+
+ memcpy(&result, msg->data, 4);
+ time = le32_to_cpu(result);
+ gmtime_r(&time, &tm);
+ rtc_cache_update(&tm);
+ time_status = updated;
+ ipmi_free_msg(msg);
+}
+
+static int64_t ipmi_get_sel_time(void)
+{
+ struct ipmi_msg *msg;
+
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_GET_SEL_TIME,
+ get_sel_time_complete, NULL, NULL, 0, 4);
+ if (!msg)
+ return OPAL_HARDWARE;
+
+ msg->error = get_sel_time_error;
+
+ return ipmi_queue_msg(msg);
+}
+
+static int64_t ipmi_set_sel_time(uint32_t _tv)
+{
+ struct ipmi_msg *msg;
+ const le32 tv = cpu_to_le32(_tv);
+
+ msg = ipmi_mkmsg_simple(IPMI_SET_SEL_TIME, (void*)&tv, sizeof(tv));
+ if (!msg)
+ return OPAL_HARDWARE;
+
+ return ipmi_queue_msg(msg);
+}
+
+static int64_t ipmi_opal_rtc_read(__be32 *__ymd, __be64 *__hmsm)
+{
+ int ret = 0;
+ uint32_t ymd;
+ uint64_t hmsm;
+
+ if (!__ymd || !__hmsm)
+ return OPAL_PARAMETER;
+
+ switch(time_status) {
+ case idle:
+ if (ipmi_get_sel_time() < 0)
+ return OPAL_HARDWARE;
+ time_status = waiting;
+ ret = OPAL_BUSY_EVENT;
+ break;
+
+ case waiting:
+ ret = OPAL_BUSY_EVENT;
+ break;
+
+ case updated:
+ rtc_cache_get_datetime(&ymd, &hmsm);
+ *__ymd = cpu_to_be32(ymd);
+ *__hmsm = cpu_to_be64(hmsm);
+ time_status = idle;
+ ret = OPAL_SUCCESS;
+ break;
+
+ case error:
+ time_status = idle;
+ ret = OPAL_HARDWARE;
+ break;
+ }
+
+ return ret;
+}
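+
+/*
+ * Usage note: callers are expected to retry on OPAL_BUSY_EVENT. The
+ * first call (idle state) queues ipmi_get_sel_time(); a later call
+ * returns the cached date/time once the BMC response has updated the
+ * RTC cache via get_sel_time_complete().
+ */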
+
+static int64_t ipmi_opal_rtc_write(uint32_t year_month_day,
+ uint64_t hour_minute_second_millisecond)
+{
+ time_t t;
+ struct tm tm;
+
+ datetime_to_tm(year_month_day, hour_minute_second_millisecond, &tm);
+ t = mktime(&tm);
+ if (ipmi_set_sel_time(t))
+ return OPAL_HARDWARE;
+
+ return OPAL_SUCCESS;
+}
+
+void ipmi_rtc_init(void)
+{
+ struct dt_node *np = dt_new(opal_node, "rtc");
+ dt_add_property_strings(np, "compatible", "ibm,opal-rtc");
+
+ opal_register(OPAL_RTC_READ, ipmi_opal_rtc_read, 2);
+ opal_register(OPAL_RTC_WRITE, ipmi_opal_rtc_write, 2);
+
+ /* Initialise the rtc cache */
+ ipmi_get_sel_time();
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-sel.c b/roms/skiboot/hw/ipmi/ipmi-sel.c
new file mode 100644
index 000000000..215b8ba7d
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-sel.c
@@ -0,0 +1,701 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2018 IBM Corp. */
+
+#define pr_fmt(fmt) "IPMI: " fmt
+#include <ccan/list/list.h>
+#include <ccan/str/str.h>
+#include <compiler.h>
+#include <errno.h>
+#include <skiboot.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ipmi.h>
+#include <device.h>
+#include <opal.h>
+#include <lock.h>
+#include <errorlog.h>
+#include <pel.h>
+#include <opal-msg.h>
+#include <debug_descriptor.h>
+#include <occ.h>
+#include <timebase.h>
+
+/* OEM SEL fields */
+#define SEL_OEM_ID_0 0x55
+#define SEL_OEM_ID_1 0x55
+#define SEL_RECORD_TYPE_OEM 0xC0
+#define SEL_RECORD_TYPE_EVENT 0x02
+
+#define SEL_NETFN_IBM 0x3a
+
+/* OEM SEL Commands */
+/* TODO: Move these to their respective source files */
+#define CMD_AMI_POWER 0x04
+#define CMD_AMI_PNOR_ACCESS 0x07
+#define CMD_AMI_OCC_RESET 0x0e
+#define CMD_HEARTBEAT 0xff
+
+/* XXX: Listed here for completeness, registered in libflash/ipmi-flash.c */
+#define CMD_OP_HIOMAP_EVENT 0x0f
+
+#define SOFT_OFF 0x00
+#define SOFT_REBOOT 0x01
+
+#define RELEASE_PNOR 0x00
+#define REQUEST_PNOR 0x01
+
+/* 32.1 SEL Event Records type */
+#define SEL_REC_TYPE_SYS_EVENT 0x02
+#define SEL_REC_TYPE_AMI_ESEL 0xDF
+
+/* OEM SEL generator ID for AMI */
+#define SEL_GENERATOR_ID_AMI 0x0020
+
+/* IPMI SEL version */
+#define SEL_EVM_VER_1 0x03
+#define SEL_EVM_VER_2 0x04
+
+/*
+ * Sensor type for System events
+ *
+ * Sensor information (type, number, etc) is passed to us via the
+ * device tree. Currently we are using the System Event type to
+ * log OPAL events.
+ */
+#define SENSOR_TYPE_SYS_EVENT 0x12
+
+/*
+ * 42.1 Event/Reading Type Codes
+ *
+ * Note that device hotplug and availability related events
+ * are not defined, as we are not using those event types.
+ */
+#define SEL_EVENT_DIR_TYPE_UNSPECIFIED 0x00
+#define SEL_EVENT_DIR_TYPE_THRESHOLD 0x01
+#define SEL_EVENT_DIR_TYPE_STATE 0x03
+#define SEL_EVENT_DIR_TYPE_PREDICTIVE 0x04
+#define SEL_EVENT_DIR_TYPE_LIMIT 0x05
+#define SEL_EVENT_DIR_TYPE_PERFORMANCE 0x06
+#define SEL_EVENT_DIR_TYPE_TRANSITION 0x07
+#define SEL_EVENT_DIR_TYPE_OEM 0x70
+
+/*
+ * 42.1 Event/Reading Type Codes
+ */
+#define SEL_DATA1_AMI 0xAA
+#define SEL_DATA1_DEASSERTED 0x00
+#define SEL_DATA1_ASSERTED 0x01
+#define SEL_DATA1_OK 0x00
+#define SEL_DATA1_NON_CRIT_FROM_OK 0x01
+#define SEL_DATA1_CRIT_FROM_LESS_SEV 0x02
+#define SEL_DATA1_NON_REC_FROM_LESS_SEV 0x03
+#define SEL_DATA1_NON_CRIT 0x04
+#define SEL_DATA1_CRITICAL 0x05
+#define SEL_DATA1_NON_RECOVERABLE 0X06
+#define SEL_DATA1_MONITOR 0x07
+#define SEL_DATA1_INFORMATIONAL 0x08
+
+/* SEL Record Entry */
+struct sel_record {
+ le16 record_id;
+ uint8_t record_type;
+ le32 timestamp;
+ le16 generator_id;
+ uint8_t evm_ver;
+ uint8_t sensor_type;
+ uint8_t sensor_number;
+ uint8_t event_dir_type;
+ uint8_t event_data1;
+ uint8_t event_data2;
+ uint8_t event_data3;
+} __packed;
+
+static struct sel_record sel_record;
+
+struct oem_sel {
+ /* SEL header */
+ uint8_t id[2];
+ uint8_t type;
+ uint8_t timestamp[4];
+ uint8_t manuf_id[3];
+ /* OEM SEL data (6 bytes) follows */
+ uint8_t netfun;
+ uint8_t cmd;
+ uint8_t data[4];
+};
+
+#define ESEL_HDR_SIZE 7
+
+/* Used for sending PANIC events like abort() path */
+struct ipmi_sel_panic_msg {
+ bool busy;
+ struct ipmi_msg *msg;
+ struct lock lock;
+};
+static struct ipmi_sel_panic_msg ipmi_sel_panic_msg;
+
+static LIST_HEAD(sel_handlers);
+
+/* Forward declaration */
+static void ipmi_elog_poll(struct ipmi_msg *msg);
+
+/*
+ * Allocate IPMI message:
+ * For a normal event, allocate memory using ipmi_mkmsg and for a PANIC
+ * event, use the pre-allocated buffer.
+ */
+static struct ipmi_msg *ipmi_sel_alloc_msg(struct errorlog *elog_buf)
+{
+ struct ipmi_msg *msg = NULL;
+
+ if (elog_buf->event_severity == OPAL_ERROR_PANIC) {
+ /* Called before initialization completes */
+ if (ipmi_sel_panic_msg.msg == NULL) {
+ ipmi_sel_init(); /* Try to allocate IPMI message */
+ if (ipmi_sel_panic_msg.msg == NULL)
+ return NULL;
+ }
+
+ if (ipmi_sel_panic_msg.busy == true)
+ return NULL;
+
+ lock(&ipmi_sel_panic_msg.lock);
+ msg = ipmi_sel_panic_msg.msg;
+ ipmi_sel_panic_msg.busy = true;
+ unlock(&ipmi_sel_panic_msg.lock);
+
+ ipmi_init_msg(msg, IPMI_DEFAULT_INTERFACE, IPMI_RESERVE_SEL,
+ ipmi_elog_poll, elog_buf, IPMI_MAX_REQ_SIZE, 2);
+ } else {
+ msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_RESERVE_SEL,
+ ipmi_elog_poll, elog_buf, NULL,
+ IPMI_MAX_REQ_SIZE, 2);
+ }
+
+ return msg;
+}
+
+static void ipmi_sel_free_msg(struct ipmi_msg *msg)
+{
+ if (msg == ipmi_sel_panic_msg.msg) {
+ lock(&ipmi_sel_panic_msg.lock);
+ ipmi_sel_panic_msg.busy = false;
+ unlock(&ipmi_sel_panic_msg.lock);
+ } else {
+ ipmi_free_msg(msg);
+ }
+
+ msg = NULL;
+}
+
+/* Initialize eSEL record */
+static void ipmi_init_esel_record(void)
+{
+ memset(&sel_record, 0, sizeof(struct sel_record));
+ sel_record.record_type = SEL_REC_TYPE_AMI_ESEL;
+ sel_record.generator_id = cpu_to_le16(SEL_GENERATOR_ID_AMI);
+ sel_record.evm_ver = SEL_EVM_VER_2;
+ sel_record.sensor_type = SENSOR_TYPE_SYS_EVENT;
+ sel_record.sensor_number =
+ ipmi_get_sensor_number(SENSOR_TYPE_SYS_EVENT);
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_OEM;
+ sel_record.event_data1 = SEL_DATA1_AMI;
+}
+
+/* Update required fields in SEL record */
+static void ipmi_update_sel_record(uint8_t event_severity, uint16_t esel_record_id)
+{
+ sel_record.record_type = SEL_REC_TYPE_SYS_EVENT;
+ sel_record.event_data2 = (esel_record_id >> 8) & 0xff;
+ sel_record.event_data3 = esel_record_id & 0xff;
+
+ switch (event_severity) {
+ case OPAL_ERROR_PANIC:
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_TRANSITION;
+ sel_record.event_data1 = SEL_DATA1_CRITICAL;
+ break;
+ case OPAL_UNRECOVERABLE_ERR_GENERAL: /* Fall through */
+ case OPAL_UNRECOVERABLE_ERR_DEGRADE_PERF:
+ case OPAL_UNRECOVERABLE_ERR_LOSS_REDUNDANCY:
+ case OPAL_UNRECOVERABLE_ERR_LOSS_REDUNDANCY_PERF:
+ case OPAL_UNRECOVERABLE_ERR_LOSS_OF_FUNCTION:
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_TRANSITION;
+ sel_record.event_data1 = SEL_DATA1_NON_RECOVERABLE;
+ break;
+ case OPAL_PREDICTIVE_ERR_GENERAL: /* Fall through */
+ case OPAL_PREDICTIVE_ERR_DEGRADED_PERF:
+ case OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT:
+ case OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_BOOT_DEGRADE_PERF:
+ case OPAL_PREDICTIVE_ERR_LOSS_OF_REDUNDANCY:
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_PREDICTIVE;
+ sel_record.event_data1 = SEL_DATA1_NON_CRIT_FROM_OK;
+ break;
+ case OPAL_RECOVERED_ERR_GENERAL:
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_TRANSITION;
+ sel_record.event_data1 = SEL_DATA1_OK;
+ break;
+ case OPAL_INFO:
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_TRANSITION;
+ sel_record.event_data1 = SEL_DATA1_INFORMATIONAL;
+ break;
+ default:
+ sel_record.event_dir_type = SEL_EVENT_DIR_TYPE_STATE;
+ sel_record.event_data1 = SEL_DATA1_ASSERTED;
+ break;
+ }
+}
+
+static void ipmi_elog_error(struct ipmi_msg *msg)
+{
+ if (msg->cc == IPMI_LOST_ARBITRATION_ERR)
+ /* Retry due to SEL erase */
+ ipmi_queue_msg(msg);
+ else {
+ opal_elog_complete(msg->user_data, false);
+ ipmi_sel_free_msg(msg);
+ }
+}
+
+static void ipmi_log_sel_event_error(struct ipmi_msg *msg)
+{
+ if (msg->cc != IPMI_CC_NO_ERROR)
+ prlog(PR_INFO, "SEL: Failed to log SEL event\n");
+
+ ipmi_sel_free_msg(msg);
+}
+
+static void ipmi_log_sel_event_complete(struct ipmi_msg *msg)
+{
+ prlog(PR_INFO, "SEL: New event logged [ID : %x%x]\n", msg->data[1],
+ msg->data[0]);
+
+ ipmi_sel_free_msg(msg);
+}
+
+/* Log SEL event with eSEL record ID */
+static void ipmi_log_sel_event(struct ipmi_msg *msg, uint8_t event_severity,
+ uint16_t esel_record_id)
+{
+ /* Fill required SEL event fields */
+ ipmi_update_sel_record(event_severity, esel_record_id);
+
+ /* Fill IPMI message */
+ ipmi_init_msg(msg, IPMI_DEFAULT_INTERFACE, IPMI_ADD_SEL_EVENT,
+ ipmi_log_sel_event_complete, NULL,
+ sizeof(struct sel_record), 2);
+
+ /* Copy SEL data */
+ memcpy(msg->data, &sel_record, sizeof(struct sel_record));
+
+ msg->error = ipmi_log_sel_event_error;
+ ipmi_queue_msg_head(msg);
+}
+
+/* Goes through the required steps to add a complete eSEL:
+ *
+ * 1. Get a reservation
+ * 2. Add eSEL header
+ * 3. Partially add data to the SEL
+ *
+ * Because a reservation is needed we need to ensure eSELs are added
+ * as a single transaction as concurrent/interleaved adds would cancel
+ * the reservation. We guarantee this by always adding our messages to
+ * the head of the transmission queue, blocking any other messages
+ * being sent until we have completed sending this message.
+ *
+ * There is still a very small chance that we will accidentally
+ * interleave a message if there is another one waiting at the head of
+ * the ipmi queue and another cpu calls the ipmi poller before we
+ * complete. However this should just cause a reservation cancelled
+ * error which we have to deal with anyway (eg. because there may be a
+ * SEL erase in progress) so it shouldn't cause any problems.
+ */
+static void ipmi_elog_poll(struct ipmi_msg *msg)
+{
+ static bool first = false;
+ static char pel_buf[IPMI_MAX_PEL_SIZE];
+ static size_t pel_size;
+ static size_t esel_size;
+ static int esel_index = 0;
+ int pel_index;
+ static unsigned int reservation_id = 0;
+ static unsigned int record_id = 0;
+ struct errorlog *elog_buf = (struct errorlog *) msg->user_data;
+ size_t req_size;
+
+ if (bmc_platform->sw->ipmi_oem_partial_add_esel == 0) {
+ prlog(PR_WARNING, "Dropped eSEL: BMC code is buggy/missing\n");
+ ipmi_sel_free_msg(msg);
+ return;
+ }
+
+ ipmi_init_esel_record();
+ if (msg->cmd == IPMI_CMD(IPMI_RESERVE_SEL)) {
+ first = true;
+ reservation_id = msg->data[0];
+ reservation_id |= msg->data[1] << 8;
+ if (!reservation_id) {
+ /*
+			 * According to the specification we should never
+			 * get here, but just in case we do, we cancel
+			 * sending the message.
+ */
+ prerror("Invalid reservation id");
+ opal_elog_complete(elog_buf, false);
+ ipmi_sel_free_msg(msg);
+ return;
+ }
+
+ pel_size = create_pel_log(elog_buf, pel_buf, IPMI_MAX_PEL_SIZE);
+ esel_size = pel_size + sizeof(struct sel_record);
+ esel_index = 0;
+ record_id = 0;
+ } else {
+ record_id = msg->data[0];
+ record_id |= msg->data[1] << 8;
+ }
+
+ /* Start or continue the IPMI_PARTIAL_ADD_SEL */
+ if (esel_index >= esel_size) {
+ /*
+		 * We're all done. Invalidate the reservation id to
+ * ensure we get an error if we cut in on another eSEL
+ * message.
+ */
+ reservation_id = 0;
+ esel_index = 0;
+
+ /* Log SEL event and free ipmi message */
+ ipmi_log_sel_event(msg, elog_buf->event_severity, record_id);
+
+ opal_elog_complete(elog_buf, true);
+ return;
+ }
+
+ if ((esel_size - esel_index) <= (IPMI_MAX_REQ_SIZE - ESEL_HDR_SIZE)) {
+ /* Last data to send */
+ msg->data[6] = 1;
+ req_size = esel_size - esel_index + ESEL_HDR_SIZE;
+ } else {
+ msg->data[6] = 0;
+ req_size = IPMI_MAX_REQ_SIZE;
+ }
+
+ ipmi_init_msg(msg, IPMI_DEFAULT_INTERFACE,
+ bmc_platform->sw->ipmi_oem_partial_add_esel,
+ ipmi_elog_poll, elog_buf, req_size, 2);
+
+ msg->data[0] = reservation_id & 0xff;
+ msg->data[1] = (reservation_id >> 8) & 0xff;
+ msg->data[2] = record_id & 0xff;
+ msg->data[3] = (record_id >> 8) & 0xff;
+ msg->data[4] = esel_index & 0xff;
+ msg->data[5] = (esel_index >> 8) & 0xff;
+
+ if (first) {
+ first = false;
+ memcpy(&msg->data[ESEL_HDR_SIZE], &sel_record,
+ sizeof(struct sel_record));
+ esel_index = sizeof(struct sel_record);
+ msg->req_size = esel_index + ESEL_HDR_SIZE;
+ } else {
+ pel_index = esel_index - sizeof(struct sel_record);
+ memcpy(&msg->data[ESEL_HDR_SIZE], &pel_buf[pel_index],
+ msg->req_size - ESEL_HDR_SIZE);
+ esel_index += msg->req_size - ESEL_HDR_SIZE;
+ }
+
+ ipmi_queue_msg_head(msg);
+ return;
+}
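+
+/*
+ * For illustration, a single partial-add request as built above is laid
+ * out as follows (the reservation/record id values shown are hypothetical):
+ *
+ *   data[0..1]            reservation id, LSB first (e.g. 0x1234 -> 0x34, 0x12)
+ *   data[2..3]            record id, LSB first (0 on the first chunk)
+ *   data[4..5]            byte offset of this chunk within the eSEL
+ *   data[6]               progress flag, 1 only on the final chunk
+ *   data[ESEL_HDR_SIZE..] payload: the sel_record first, then PEL data
+ */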
+
+int ipmi_elog_commit(struct errorlog *elog_buf)
+{
+ struct ipmi_msg *msg;
+
+	/* Only log events that need attention */
+ if (elog_buf->event_severity <
+ OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT ||
+ elog_buf->elog_origin != ORG_SAPPHIRE) {
+ prlog(PR_INFO, "dropping non severe PEL event\n");
+ opal_elog_complete(elog_buf, true);
+ return 0;
+ }
+
+ /*
+ * We pass a large request size in to mkmsg so that we have a
+ * large enough allocation to reuse the message to pass the
+ * PEL data via a series of partial add commands.
+ */
+ msg = ipmi_sel_alloc_msg(elog_buf);
+ if (!msg) {
+ opal_elog_complete(elog_buf, false);
+ return OPAL_RESOURCE;
+ }
+
+ msg->error = ipmi_elog_error;
+ msg->req_size = 0;
+ if (elog_buf->event_severity == OPAL_ERROR_PANIC) {
+ ipmi_queue_msg_sync(msg);
+
+ /*
+		 * eSEL logs are split into multiple smaller chunks and sent
+		 * to the BMC. Let's wait until we have finished sending all
+		 * the chunks to the BMC.
+ */
+ while (ipmi_sel_panic_msg.busy != false) {
+ if (msg->backend->poll)
+ msg->backend->poll();
+ time_wait_ms(10);
+ }
+ } else {
+ ipmi_queue_msg(msg);
+ }
+
+ return 0;
+}
+
+#define ACCESS_DENIED 0x00
+#define ACCESS_GRANTED 0x01
+
+static void sel_pnor(uint8_t access, void *context __unused)
+{
+ struct ipmi_msg *msg;
+ uint8_t granted = ACCESS_GRANTED;
+
+ switch (access) {
+ case REQUEST_PNOR:
+ prlog(PR_NOTICE, "PNOR access requested\n");
+ if (bmc_platform->sw->ipmi_oem_pnor_access_status == 0) {
+ /**
+ * @fwts-label PNORAccessYeahButNoBut
+ * @fwts-advice OPAL doesn't know that the BMC supports
+ * PNOR access commands. This will be a bug in the OPAL
+ * support for this BMC.
+ */
+ prlog(PR_ERR, "PNOR BUG: access requested but BMC doesn't support request\n");
+ break;
+ }
+
+ granted = flash_reserve();
+ if (granted)
+ occ_pnor_set_owner(PNOR_OWNER_EXTERNAL);
+ /* Ack the request */
+ msg = ipmi_mkmsg_simple(bmc_platform->sw->ipmi_oem_pnor_access_status, &granted, 1);
+ ipmi_queue_msg(msg);
+ break;
+ case RELEASE_PNOR:
+ prlog(PR_NOTICE, "PNOR access released\n");
+ flash_release();
+ occ_pnor_set_owner(PNOR_OWNER_HOST);
+ break;
+ default:
+ /**
+ * @fwts-label InvalidPNORAccessRequest
+ * @fwts-advice In negotiating PNOR access with BMC, we
+ * got an odd/invalid request from the BMC. Likely a bug
+ * in OPAL/BMC interaction.
+ */
+ prlog(PR_ERR, "invalid PNOR access requested: %02x\n",
+ access);
+ }
+}
+
+static void sel_power(uint8_t power, void *context __unused)
+{
+ switch (power) {
+ case SOFT_OFF:
+ prlog(PR_NOTICE, "Soft shutdown requested\n");
+ if (opal_booting() && platform.cec_power_down) {
+ prlog(PR_NOTICE, "Host not up, shutting down now\n");
+ platform.cec_power_down(IPMI_CHASSIS_PWR_DOWN);
+ } else {
+ opal_queue_msg(OPAL_MSG_SHUTDOWN, NULL, NULL,
+ cpu_to_be64(SOFT_OFF));
+ }
+
+ break;
+ case SOFT_REBOOT:
+ prlog(PR_NOTICE, "Soft reboot requested\n");
+ if (opal_booting() && platform.cec_reboot) {
+ prlog(PR_NOTICE, "Host not up, rebooting now\n");
+ platform.cec_reboot();
+ } else {
+ opal_queue_msg(OPAL_MSG_SHUTDOWN, NULL, NULL,
+ cpu_to_be64(SOFT_REBOOT));
+ }
+
+ break;
+ default:
+ prlog(PR_WARNING, "requested bad power state: %02x\n",
+ power);
+ }
+}
+
+static void sel_heartbeat(uint8_t heartbeat, void *context __unused)
+{
+ /* There is only one sub-command so no processing needed */
+ prlog(PR_DEBUG, "BMC issued heartbeat command: %02x\n",
+ heartbeat);
+}
+
+static uint32_t occ_sensor_id_to_chip(uint8_t sensor, uint32_t *chip)
+{
+ struct dt_node *node, *bmc_node, *sensors_node;
+
+ /* Default chip id */
+ *chip = 0;
+
+ bmc_node = dt_find_by_name(dt_root, "bmc");
+ if (!bmc_node)
+ return 0;
+
+ sensors_node = dt_find_by_name(bmc_node, "sensors");
+ if (!sensors_node)
+ return 0;
+
+ node = dt_find_by_name_addr(sensors_node, "sensor", sensor);
+ if (!node) {
+ prlog(PR_DEBUG, "Could not find OCC sensor node. Id : %d\n",
+ (u32)sensor);
+ return 0;
+ }
+
+ if (!dt_has_node_property(node, "ibm,chip-id", NULL)) {
+ prlog(PR_DEBUG, "Could not find chip-id for OCC sensor : %d\n",
+ (u32)sensor);
+ return 0;
+ }
+
+ *chip = dt_get_chip_id(node);
+ return 0;
+}
+
+static void sel_occ_reset(uint8_t sensor, void *context __unused)
+{
+ uint32_t chip;
+ int rc;
+
+ rc = occ_sensor_id_to_chip(sensor, &chip);
+ if (rc) {
+ /**
+ * @fwts-label: SELUnknownOCCReset
+ * @fwts-advice: Likely bug in what sent us the OCC reset.
+ */
+ prlog(PR_ERR, "SEL message to reset an unknown OCC "
+ "(sensor ID 0x%02x)\n", sensor);
+ return;
+ }
+
+ prd_occ_reset(chip);
+}
+
+struct ipmi_sel_handler {
+ uint8_t oem_cmd;
+ void (*fn)(uint8_t data, void *context);
+ void *context;
+ struct list_node node;
+};
+
+int ipmi_sel_register(uint8_t oem_cmd,
+ void (*fn)(uint8_t data, void *context),
+ void *context)
+{
+ struct ipmi_sel_handler *handler;
+
+ list_for_each(&sel_handlers, handler, node) {
+ if (handler->oem_cmd == oem_cmd) {
+ prerror("Handler for SEL command 0x%02x already registered\n",
+ oem_cmd);
+ return -EINVAL;
+ }
+ }
+
+ handler = malloc(sizeof(*handler));
+ if (!handler)
+ return -ENOMEM;
+
+ handler->oem_cmd = oem_cmd;
+ handler->fn = fn;
+ handler->context = context;
+
+ list_add(&sel_handlers, &handler->node);
+
+ return 0;
+}
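+
+/*
+ * A minimal usage sketch. The command number 0x42 and the handler below
+ * are hypothetical, shown only to illustrate how a driver would hook an
+ * OEM SEL command into ipmi_parse_sel().
+ */
+#if 0
+static void my_oem_handler(uint8_t data, void *context __unused)
+{
+	prlog(PR_INFO, "OEM SEL data byte: 0x%02x\n", data);
+}
+
+static void my_driver_init(void)
+{
+	if (ipmi_sel_register(0x42, my_oem_handler, NULL) < 0)
+		prerror("Failed to register OEM SEL handler\n");
+}
+#endif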
+
+void ipmi_sel_init(void)
+{
+ int rc;
+
+ /* Already done */
+ if (ipmi_sel_panic_msg.msg != NULL)
+ return;
+
+ memset(&ipmi_sel_panic_msg, 0, sizeof(struct ipmi_sel_panic_msg));
+ ipmi_sel_panic_msg.msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
+ IPMI_RESERVE_SEL, ipmi_elog_poll,
+ NULL, NULL, IPMI_MAX_REQ_SIZE, 2);
+
+ /* Hackishly register these old-style handlers here for now */
+ /* TODO: Move them to their appropriate source files */
+ rc = ipmi_sel_register(CMD_AMI_POWER, sel_power, NULL);
+ if (rc < 0) {
+ prerror("Failed to register SEL handler for %s",
+ stringify(CMD_AMI_POWER));
+ }
+
+ rc = ipmi_sel_register(CMD_AMI_OCC_RESET, sel_occ_reset, NULL);
+ if (rc < 0) {
+ prerror("Failed to register SEL handler for %s",
+ stringify(CMD_AMI_OCC_RESET));
+ }
+
+ rc = ipmi_sel_register(CMD_AMI_PNOR_ACCESS, sel_pnor, NULL);
+ if (rc < 0) {
+ prerror("Failed to register SEL handler for %s",
+ stringify(CMD_AMI_PNOR_ACCESS));
+ }
+
+ rc = ipmi_sel_register(CMD_HEARTBEAT, sel_heartbeat, NULL);
+ if (rc < 0) {
+ prerror("Failed to register SEL handler for %s",
+ stringify(CMD_HEARTBEAT));
+ }
+}
+
+void ipmi_parse_sel(struct ipmi_msg *msg)
+{
+ struct ipmi_sel_handler *handler;
+ struct oem_sel sel;
+
+ assert(msg->resp_size <= 16);
+
+ memcpy(&sel, msg->data, msg->resp_size);
+
+ /* We do not process system event records */
+ if (sel.type == SEL_RECORD_TYPE_EVENT) {
+ prlog(PR_INFO, "dropping System Event Record SEL\n");
+ return;
+ }
+
+ prlog(PR_DEBUG, "SEL received (%d bytes, netfn %d, cmd %d)\n",
+ msg->resp_size, sel.netfun, sel.cmd);
+
+ /* Only accept OEM SEL messages */
+ if (sel.id[0] != SEL_OEM_ID_0 || sel.id[1] != SEL_OEM_ID_1 ||
+ sel.type != SEL_RECORD_TYPE_OEM) {
+ prlog(PR_WARNING, "unknown SEL %02x%02x (type %02x)\n",
+ sel.id[0], sel.id[1], sel.type);
+ return;
+ }
+
+ list_for_each(&sel_handlers, handler, node) {
+ if (handler->oem_cmd == sel.cmd) {
+ handler->fn(sel.data[0], handler->context);
+ return;
+ }
+ }
+
+ prlog(PR_WARNING, "unknown OEM SEL command %02x received\n", sel.cmd);
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-sensor.c b/roms/skiboot/hw/ipmi/ipmi-sensor.c
new file mode 100644
index 000000000..857b789e4
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-sensor.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2017 IBM Corp. */
+
+#include <device.h>
+#include <ipmi.h>
+#include <opal.h>
+#include <skiboot.h>
+#include <string.h>
+#include <stdbool.h>
+
+#define IPMI_WRITE_SENSOR (1 << 0)
+
+#define FW_PROGRESS_SENSOR_TYPE 0x0F
+#define BOOT_COUNT_SENSOR_TYPE 0xC3
+
+static int16_t sensors[MAX_IPMI_SENSORS];
+
+static bool sensors_present = false;
+
+struct set_sensor_req {
+ u8 sensor_number;
+ u8 operation;
+ u8 sensor_reading;
+ u8 assertion_mask[2];
+ u8 deassertion_mask[2];
+ u8 event_data[3];
+};
+
+static bool ipmi_sensor_type_present(uint8_t sensor_type)
+{
+ const struct dt_property *type_prop;
+ uint8_t type;
+ struct dt_node *node;
+
+ dt_for_each_compatible(dt_root, node, "ibm,ipmi-sensor") {
+ type_prop = dt_find_property(node, "ipmi-sensor-type");
+ if (!type_prop) {
+ prlog(PR_ERR, "IPMI: sensor doesn't have ipmi-sensor-type\n");
+ continue;
+ }
+
+ type = (uint8_t)dt_property_get_cell(type_prop, 0);
+ if (type == sensor_type)
+ return true;
+ }
+ return false;
+}
+
+uint8_t ipmi_get_sensor_number(uint8_t sensor_type)
+{
+ assert(sensor_type < MAX_IPMI_SENSORS);
+ return sensors[sensor_type];
+}
+
+int ipmi_set_boot_count(void)
+{
+ struct set_sensor_req req;
+ struct ipmi_msg *msg;
+ int boot_count_sensor;
+
+ if (!sensors_present)
+ return OPAL_UNSUPPORTED;
+
+ if (!ipmi_present())
+ return OPAL_CLOSED;
+
+ if (!ipmi_sensor_type_present(BOOT_COUNT_SENSOR_TYPE))
+ return OPAL_HARDWARE;
+
+ boot_count_sensor = sensors[BOOT_COUNT_SENSOR_TYPE];
+
+ if (boot_count_sensor < 0) {
+ prlog(PR_DEBUG, "IPMI: boot count set but not present\n");
+ return OPAL_HARDWARE;
+ }
+
+ memset(&req, 0, sizeof(req));
+
+ req.sensor_number = boot_count_sensor;
+ req.operation = IPMI_WRITE_SENSOR;
+ req.sensor_reading = 0x00;
+ req.assertion_mask[0] = 0x02;
+
+ msg = ipmi_mkmsg_simple(IPMI_SET_SENSOR_READING, &req, sizeof(req));
+ if (!msg)
+ return OPAL_HARDWARE;
+
+ printf("IPMI: Resetting boot count on successful boot\n");
+
+ return ipmi_queue_msg(msg);
+}
+
+int ipmi_set_fw_progress_sensor(uint8_t state)
+{
+ struct ipmi_msg *msg;
+ struct set_sensor_req request;
+ int fw_sensor_num;
+
+ if (!sensors_present)
+ return OPAL_UNSUPPORTED;
+
+ if (!ipmi_present())
+ return OPAL_CLOSED;
+
+ if (!ipmi_sensor_type_present(FW_PROGRESS_SENSOR_TYPE))
+ return OPAL_HARDWARE;
+
+ fw_sensor_num = sensors[FW_PROGRESS_SENSOR_TYPE];
+
+ if (fw_sensor_num < 0) {
+ prlog(PR_DEBUG, "IPMI: fw progress set but not present\n");
+ return OPAL_HARDWARE;
+ }
+
+ memset(&request, 0, sizeof(request));
+
+ request.sensor_number = fw_sensor_num;
+ request.operation = 0xa0; /* Set event data bytes, assertion bits */
+ request.assertion_mask[0] = 0x04; /* Firmware progress offset */
+ request.event_data[0] = 0xc2;
+ request.event_data[1] = state;
+
+ prlog(PR_INFO, "IPMI: setting fw progress sensor %02x to %02x\n",
+ request.sensor_number, request.event_data[1]);
+
+ msg = ipmi_mkmsg_simple(IPMI_SET_SENSOR_READING, &request,
+ sizeof(request));
+ if (!msg)
+ return OPAL_HARDWARE;
+
+ return ipmi_queue_msg(msg);
+}
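+
+/*
+ * For example, with a (hypothetical) firmware progress sensor number of
+ * 0x05 and state 0x13, the Set Sensor Reading request built above is,
+ * byte for byte:
+ *
+ *   05 a0 00 04 00 00 00 c2 13 00
+ *
+ * i.e. "set event data bytes and assertion bits", asserting offset 2
+ * (firmware progress) with the progress code carried in event data 2.
+ */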
+
+void ipmi_sensor_init(void)
+{
+ const struct dt_property *type_prop, *num_prop;
+ uint8_t num, type;
+ struct dt_node *n;
+
+ memset(sensors, -1, sizeof(sensors));
+
+ dt_for_each_compatible(dt_root, n, "ibm,ipmi-sensor") {
+ type_prop = dt_find_property(n, "ipmi-sensor-type");
+ if (!type_prop) {
+ prerror("IPMI: sensor doesn't have ipmi-sensor-type\n");
+ continue;
+ }
+
+ num_prop = dt_find_property(n, "reg");
+ if (!num_prop) {
+ prerror("IPMI: sensor doesn't have reg property\n");
+ continue;
+ }
+ num = (uint8_t)dt_property_get_cell(num_prop, 0);
+ type = (uint8_t)dt_property_get_cell(type_prop, 0);
+ assert(type < MAX_IPMI_SENSORS);
+ sensors[type] = num;
+ }
+ sensors_present = true;
+}
diff --git a/roms/skiboot/hw/ipmi/ipmi-watchdog.c b/roms/skiboot/hw/ipmi/ipmi-watchdog.c
new file mode 100644
index 000000000..dc0a9e5b4
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/ipmi-watchdog.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Copyright 2013-2018 IBM Corp.
+ * Copyright 2018 Google Corp.
+ */
+
+#include <stdlib.h>
+#include <ipmi.h>
+#include <lock.h>
+#include <opal.h>
+#include <device.h>
+#include <timer.h>
+#include <timebase.h>
+#include <pool.h>
+#include <skiboot.h>
+
+#define TIMER_USE_DONT_LOG 0x80
+#define TIMER_USE_DONT_STOP 0x40
+#define TIMER_USE_POST 0x02
+
+/* WDT expiration actions */
+#define WDT_PRETIMEOUT_SMI 0x10
+#define WDT_RESET_ACTION 0x01
+#define WDT_NO_ACTION 0x00
+
+/* IPMI defined custom completion codes for the watchdog */
+#define WDT_CC_OK 0x00
+#define WDT_CC_NOT_INITIALIZED 0x80
+
+/* Flags used for IPMI callbacks */
+#define WDT_SET_DO_RESET 0x01
+#define WDT_RESET_NO_REINIT 0x01
+
+/* How long to set the overall watchdog timeout for. In units of
+ * 100ms. If the timer is not reset within this time the watchdog
+ * expiration action will occur. */
+#define WDT_TIMEOUT 600
+
+/* How often to reset the timer using schedule_timer(). Too long and
+we risk the BMC resetting the system due to opal_run_pollers() not
+being called in time, too short and we waste time resetting the wdt
+more frequently than necessary. */
+#define WDT_MARGIN 300
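+
+/* With the defaults above (both values are in the 100ms units used by
+ * the Set Watchdog Timer command) the overall timeout is 600 * 100ms =
+ * 60s and reset_wdt_complete() reschedules a reset every
+ * (WDT_TIMEOUT - WDT_MARGIN) * 100ms = 30s, leaving roughly 30s of
+ * slack for opal_run_pollers() to run before the BMC would act. */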
+
+static struct timer wdt_timer;
+static bool wdt_stopped;
+static bool wdt_ticking;
+
+/* Saved values from the last watchdog set action */
+static uint8_t last_action;
+static uint16_t last_count;
+static uint8_t last_pretimeout;
+
+static void reset_wdt(struct timer *t, void *data, uint64_t now);
+
+static void set_wdt_complete(struct ipmi_msg *msg)
+{
+ const uintptr_t flags = (uintptr_t)msg->user_data;
+
+ if (flags & WDT_SET_DO_RESET) {
+		/* Perform a reset, but make sure it does not create a
+		 * re-init loop in the case where the BMC sends an
+		 * uninitialized error. */
+ reset_wdt(NULL, (void *)WDT_RESET_NO_REINIT, 0);
+ }
+
+ ipmi_free_msg(msg);
+}
+
+static void set_wdt(uint8_t action, uint16_t count, uint8_t pretimeout,
+ bool dont_stop, bool do_reset)
+{
+ struct ipmi_msg *ipmi_msg;
+ uintptr_t completion_flags = 0;
+
+ if (do_reset)
+ completion_flags |= WDT_SET_DO_RESET;
+
+ /* Save the values prior to issuing the set operation so that we can
+ * re-initialize the watchdog in error cases. */
+ last_action = action;
+ last_count = count;
+ last_pretimeout = pretimeout;
+
+ ipmi_msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_SET_WDT,
+ set_wdt_complete, NULL, NULL, 6, 0);
+ if (!ipmi_msg) {
+ prerror("Unable to allocate set wdt message\n");
+ return;
+ }
+ ipmi_msg->error = set_wdt_complete;
+ ipmi_msg->user_data = (void *)completion_flags;
+ ipmi_msg->data[0] = TIMER_USE_POST |
+ TIMER_USE_DONT_LOG |
+ (dont_stop ? TIMER_USE_DONT_STOP : 0);
+ ipmi_msg->data[1] = action; /* Timer Actions */
+ ipmi_msg->data[2] = pretimeout; /* Pre-timeout Interval */
+ ipmi_msg->data[3] = 0; /* Timer Use Flags */
+ ipmi_msg->data[4] = count & 0xff; /* Initial countdown (lsb) */
+ ipmi_msg->data[5] = (count >> 8) & 0xff; /* Initial countdown (msb) */
+ ipmi_queue_msg(ipmi_msg);
+}
+
+static void reset_wdt_complete(struct ipmi_msg *msg)
+{
+ const uintptr_t flags = (uintptr_t)msg->user_data;
+ uint64_t reset_delay_ms = (WDT_TIMEOUT - WDT_MARGIN) * 100;
+
+ if (msg->cc == WDT_CC_NOT_INITIALIZED &&
+ !(flags & WDT_RESET_NO_REINIT)) {
+ /* If our timer was not initialized on the BMC side, we should
+ * perform a single attempt to set it up again. */
+ set_wdt(last_action, last_count, last_pretimeout, true, true);
+ } else if (msg->cc != WDT_CC_OK) {
+ /* Use a short (10s) timeout before performing the next reset
+ * if we encounter an unknown error. This makes sure that we
+ * are able to reset and re-initialize the timer since it might
+ * expire. */
+ reset_delay_ms = 10 * 1000;
+ }
+
+ /* If we are inside of skiboot we need to periodically restart the
+ * timer. Reschedule a reset so it happens before the timeout. */
+ if (wdt_ticking)
+ schedule_timer(&wdt_timer, msecs_to_tb(reset_delay_ms));
+
+ ipmi_free_msg(msg);
+}
+
+static struct ipmi_msg *wdt_reset_mkmsg(void)
+{
+ struct ipmi_msg *ipmi_msg;
+
+ ipmi_msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE, IPMI_RESET_WDT,
+ reset_wdt_complete, NULL, NULL, 0, 0);
+ if (!ipmi_msg) {
+ prerror("Unable to allocate reset wdt message\n");
+ return NULL;
+ }
+ ipmi_msg->error = reset_wdt_complete;
+
+ return ipmi_msg;
+}
+
+static void sync_reset_wdt(void)
+{
+ struct ipmi_msg *ipmi_msg;
+
+ if ((ipmi_msg = wdt_reset_mkmsg()))
+ ipmi_queue_msg_sync(ipmi_msg);
+}
+
+static void reset_wdt(struct timer *t __unused, void *data,
+ uint64_t now __unused)
+{
+ struct ipmi_msg *ipmi_msg;
+
+ if ((ipmi_msg = wdt_reset_mkmsg())) {
+ ipmi_msg->user_data = data;
+ ipmi_queue_msg_head(ipmi_msg);
+ }
+}
+
+void ipmi_wdt_stop(void)
+{
+ if (!wdt_stopped) {
+ /* Make sure the background reset timer is disabled before
+ * stopping the watchdog. If we issue a reset after disabling
+ * the timer, it will be re-enabled. */
+ wdt_ticking = false;
+ cancel_timer(&wdt_timer);
+
+ /* Configure the watchdog to be disabled and do no action
+ * in case the underlying implementation is buggy and times
+ * out anyway. */
+ wdt_stopped = true;
+ set_wdt(WDT_NO_ACTION, 100, 0, false, false);
+ }
+}
+
+void ipmi_wdt_final_reset(void)
+{
+ /* We can safely stop the timer prior to setting up our final
+ * watchdog timeout since we have enough margin before the
+ * timeout. */
+ wdt_ticking = false;
+ cancel_timer(&wdt_timer);
+
+ /*
+ * We're going to wait a little while before requiring
+ * BOOTKERNEL to have IPMI watchdog support so that people
+ * can catch up in their development environments.
+ * If you still read this after 2018, send a patch!
+ */
+#if 0
+ /* Configure the watchdog and make sure it is still enabled */
+ set_wdt(WDT_RESET_ACTION | WDT_PRETIMEOUT_SMI, WDT_TIMEOUT,
+ WDT_MARGIN/10, true, true);
+ sync_reset_wdt();
+#else
+ set_wdt(WDT_NO_ACTION, 100, 0, false, false);
+#endif
+ ipmi_set_boot_count();
+}
+
+void ipmi_wdt_init(void)
+{
+ init_timer(&wdt_timer, reset_wdt, NULL);
+ set_wdt(WDT_RESET_ACTION, WDT_TIMEOUT, 0, true, false);
+
+ /* Start the WDT. We do it synchronously to make sure it has
+ * started before skiboot continues booting. Otherwise we
+ * could crash before the wdt has actually been started. */
+ wdt_ticking = true;
+ sync_reset_wdt();
+
+ return;
+}
diff --git a/roms/skiboot/hw/ipmi/test/Makefile.check b/roms/skiboot/hw/ipmi/test/Makefile.check
new file mode 100644
index 000000000..ceed1ed39
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/test/Makefile.check
@@ -0,0 +1,34 @@
+# -*-Makefile-*-
+IPMI_TEST := hw/ipmi/test/run-fru
+
+LCOV_EXCLUDE += $(IPMI_TEST:%=%.c)
+
+.PHONY : hw-ipmi-check hw-ipmi-coverage
+hw-ipmi-check: $(IPMI_TEST:%=%-check)
+hw-ipmi-coverage: $(IPMI_TEST:%=%-gcov-run)
+
+check: hw-ipmi-check
+coverage: hw-ipmi-coverage
+
+$(IPMI_TEST:%=%-gcov-run) : %-run: %
+ $(call Q, TEST-COVERAGE ,$< , $<)
+
+$(IPMI_TEST:%=%-check) : %-check: %
+ $(call Q, RUN-TEST ,$(VALGRIND) $<, $<)
+
+$(IPMI_TEST) : % : %.c
+ $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) -O0 -g -I include -I . -o $@ $<, $<)
+
+$(IPMI_TEST:%=%-gcov): %-gcov : %.c %
+ $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) $(HOSTGCOVCFLAGS) -I include -I . -I libfdt -lgcov -o $@ $<, $<)
+
+$(IPMI_TEST:%=%-gcov): % : $(%.d:-gcov=)
+
+-include $(wildcard hw/ipmi/test/*.d)
+
+clean: ipmi-test-clean
+
+ipmi-test-clean:
+ $(RM) -f hw/ipmi/test/*.[od] $(IPMI_TEST) $(IPMI_TEST:%=%-gcov)
+ $(RM) -f *.gcda *.gcno skiboot.info
+ $(RM) -rf coverage-report
diff --git a/roms/skiboot/hw/ipmi/test/run-fru.c b/roms/skiboot/hw/ipmi/test/run-fru.c
new file mode 100644
index 000000000..fa79c98a1
--- /dev/null
+++ b/roms/skiboot/hw/ipmi/test/run-fru.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2019 IBM Corp. */
+
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define __TEST__
+
+#include "../ipmi-fru.c"
+
+#include <string.h>
+
+int error = 0;
+
+const char version[] = "a-too-long-version-test-string-is-here";
+
+void ipmi_free_msg(struct ipmi_msg __unused *msg)
+{
+}
+
+void ipmi_init_msg(struct ipmi_msg __unused *msg, int __unused interface,
+ uint32_t __unused code,
+ void __unused (*complete)(struct ipmi_msg *),
+ void __unused *user_data, size_t __unused req_size,
+ size_t __unused resp_size)
+{
+}
+
+struct ipmi_msg *ipmi_mkmsg(int __unused interface, uint32_t __unused code,
+ void __unused (*complete)(struct ipmi_msg *),
+ void __unused *user_data, void __unused *req_data, size_t __unused req_size,
+ size_t __unused resp_size)
+{
+ return NULL;
+}
+
+int ipmi_queue_msg(struct ipmi_msg __unused *msg)
+{
+ return 0;
+}
+
+void _prlog(int __unused log_level, const __unused char* fmt, ...)
+{
+ return;
+}
+
+int main(void)
+{
+ u8 *buf;
+ int len;
+ struct product_info info = {
+ .manufacturer = (char *) "IBM",
+ .product = (char *) "skiboot",
+ .part_no = (char *) "hello",
+ .version = (char *) "12345",
+ .serial_no = (char *) "12345",
+ .asset_tag = (char *) "abcd",
+ };
+ struct product_info invalid_info = {
+ .manufacturer = (char *) "I",
+ .product = (char *) "skiboot",
+ .part_no = (char *) "hello",
+ .version = (char *) "12345",
+ .serial_no = (char *) "12345",
+ .asset_tag = (char *) "abcd",
+ };
+ struct product_info invalid_info2 = {
+ .manufacturer = (char *) "IBM",
+ .product = (char *) "skiboot",
+ .part_no = (char *) "this is a really long string that's more"
+ "than 32 characters, because it turns out that's invalid.",
+ .version = (char *) "12345",
+ .serial_no = (char *) "12345",
+ .asset_tag = (char *) "abcd",
+ };
+
+ buf = malloc(256);
+
+ len = fru_fill_product_info(buf, &info, 40);
+ assert(len == 40);
+ assert(memcmp(buf, "\001\005\000\303IBM\307skiboot\305hello"
+ "\30512345\30512345\304abcd\301-",len) == 0);
+
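+	/*
+	 * Decoding the expected bytes above: 01 05 00 is the product area
+	 * header (format 1, length 5 * 8 = 40 bytes, language code 0);
+	 * each field is then a FRU type/length byte followed by its data
+	 * (0xC3 "IBM", 0xC7 "skiboot", 0xC5 "hello", 0xC5 "12345",
+	 * 0xC5 "12345", 0xC4 "abcd"), terminated by the 0xC1 end-of-fields
+	 * marker and a trailing checksum byte.
+	 */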
+
+ /* Make sure the checksum is right */
+ assert(!fru_checksum(buf, len));
+
+ /* This should fail (not enough space) */
+ assert(fru_fill_product_info(buf, &info, 39) < 0);
+
+ memset(buf, 0, 256);
+ len = fru_fill_product_info(buf, &invalid_info, 40);
+ assert(len == OPAL_PARAMETER);
+
+ memset(buf, 0, 256);
+ len = fru_fill_product_info(buf, &invalid_info2, 256);
+ assert(len == OPAL_PARAMETER);
+
+ memset(buf, 0, 256);
+ assert(fru_add(buf, 256) > 0);
+ assert(0 == memcmp(&buf[64], "\001\a\000\303IBM\307skiboot\300"
+ "\337a-too-long-version-test-string+\300\300\301"
+ "\0\0\0",54));
+
+
+ memset(buf, 0, 256);
+ assert(fru_add(buf, 1) == OPAL_PARAMETER);
+
+ memset(buf, 0, 256);
+ assert(fru_add(buf, 65) == OPAL_PARAMETER);
+
+ free(buf);
+
+ return 0;
+}
diff --git a/roms/skiboot/hw/lpc-mbox.c b/roms/skiboot/hw/lpc-mbox.c
new file mode 100644
index 000000000..f5bb97ea4
--- /dev/null
+++ b/roms/skiboot/hw/lpc-mbox.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * LPC MBOX
+ *
+ * Copyright 2017-2018 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "LPC-MBOX: " fmt
+
+#include <skiboot.h>
+#include <lpc.h>
+#include <console.h>
+#include <opal.h>
+#include <device.h>
+#include <interrupts.h>
+#include <processor.h>
+#include <errorlog.h>
+#include <trace.h>
+#include <timebase.h>
+#include <timer.h>
+#include <cpu.h>
+#include <chip.h>
+#include <io.h>
+
+#include <lpc-mbox.h>
+
+#define MBOX_FLAG_REG 0x0f
+#define MBOX_STATUS_0 0x10
+#define MBOX_STATUS_1 0x11
+#define MBOX_STATUS_1_ATTN (1 << 7)
+#define MBOX_STATUS_1_RESP (1 << 5)
+#define MBOX_BMC_CTRL 0x12
+#define MBOX_CTRL_INT_STATUS (1 << 7)
+#define MBOX_CTRL_INT_MASK (1 << 1)
+#define MBOX_CTRL_INT_PING (1 << 0)
+#define MBOX_CTRL_INT_SEND (MBOX_CTRL_INT_PING | MBOX_CTRL_INT_MASK)
+#define MBOX_HOST_CTRL 0x13
+#define MBOX_BMC_INT_EN_0 0x14
+#define MBOX_BMC_INT_EN_1 0x15
+#define MBOX_HOST_INT_EN_0 0x16
+#define MBOX_HOST_INT_EN_1 0x17
+
+#define MBOX_MAX_QUEUE_LEN 5
+
+struct mbox {
+ uint32_t base;
+ int queue_len;
+ bool irq_ok;
+ uint8_t seq;
+ struct timer poller;
+ void (*callback)(struct bmc_mbox_msg *msg, void *priv);
+ void *drv_data;
+ void (*attn)(uint8_t bits, void *priv);
+ void *attn_data;
+ struct lock lock;
+ uint8_t sequence;
+ unsigned long timeout;
+};
+
+static struct mbox mbox;
+
+/*
+ * MBOX accesses
+ */
+
+static void bmc_mbox_outb(uint8_t val, uint8_t reg)
+{
+ lpc_outb(val, mbox.base + reg);
+}
+
+static uint8_t bmc_mbox_inb(uint8_t reg)
+{
+ return lpc_inb(mbox.base + reg);
+}
+
+static void bmc_mbox_recv_message(struct bmc_mbox_msg *msg)
+{
+ uint8_t *msg_data = (uint8_t *)msg;
+ int i;
+
+ for (i = 0; i < BMC_MBOX_READ_REGS; i++)
+ msg_data[i] = bmc_mbox_inb(i);
+}
+
+/* This needs work, don't write the data bytes that aren't needed */
+static void bmc_mbox_send_message(struct bmc_mbox_msg *msg)
+{
+ uint8_t *msg_data = (uint8_t *)msg;
+ int i;
+
+ if (!lpc_ok())
+ /* We're going to have to handle this better */
+ prlog(PR_ERR, "LPC isn't ok\n");
+
+ for (i = 0; i < BMC_MBOX_WRITE_REGS; i++)
+ bmc_mbox_outb(msg_data[i], i);
+
+ /*
+ * Don't touch the response byte - it's setup to generate an interrupt
+ * to the host (us) when written to, or the host status reg - we don't
+ * currently use it, or the BMC status reg - we're not allowed to.
+ */
+
+ /* Ping */
+ prlog(PR_TRACE, "Sending BMC interrupt\n");
+ bmc_mbox_outb(MBOX_CTRL_INT_SEND, MBOX_HOST_CTRL);
+}
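+
+/*
+ * The overall handshake, as implemented here and in mbox_poll() below:
+ *
+ *  1. the host writes the request into data regs 0..BMC_MBOX_WRITE_REGS-1
+ *  2. the host pings the BMC via MBOX_HOST_CTRL (MBOX_CTRL_INT_SEND)
+ *  3. the BMC writes its response and touches the response byte, which
+ *     latches MBOX_STATUS_1_RESP
+ *  4. mbox_poll(), driven by the LPC interrupt or the poller timer,
+ *     clears the status bit (W1C), reads the message back and hands it
+ *     to the callback registered via bmc_mbox_register_callback()
+ */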
+
+int bmc_mbox_enqueue(struct bmc_mbox_msg *msg, unsigned int timeout_sec)
+{
+ if (!mbox.base) {
+ prlog(PR_CRIT, "Using MBOX without init!\n");
+ return OPAL_WRONG_STATE;
+ }
+
+ lock(&mbox.lock);
+ if (mbox.timeout) {
+ prlog(PR_DEBUG, "MBOX message already in flight\n");
+ if (mftb() > mbox.timeout) {
+ prlog(PR_ERR, "In flight message dropped on the floor\n");
+ } else {
+ unlock(&mbox.lock);
+ return OPAL_BUSY;
+ }
+ }
+
+ mbox.timeout = mftb() + secs_to_tb(timeout_sec);
+ msg->seq = ++mbox.sequence;
+
+ bmc_mbox_send_message(msg);
+ unlock(&mbox.lock);
+
+ schedule_timer(&mbox.poller, mbox.irq_ok ?
+ TIMER_POLL : msecs_to_tb(MBOX_DEFAULT_POLL_MS));
+
+ return 0;
+}
+
+static void mbox_poll(struct timer *t __unused, void *data __unused,
+ uint64_t now __unused)
+{
+ struct bmc_mbox_msg msg;
+
+ if (!lpc_ok())
+ return;
+
+ /*
+ * This status bit being high means that someone touched the
+ * response byte (byte 13).
+	 * There is probably a response for the previously sent command.
+ */
+ lock(&mbox.lock);
+ if (bmc_mbox_inb(MBOX_STATUS_1) & MBOX_STATUS_1_RESP) {
+ /* W1C on that reg */
+ bmc_mbox_outb(MBOX_STATUS_1_RESP, MBOX_STATUS_1);
+
+ prlog(PR_INSANE, "Got a regular interrupt\n");
+
+ bmc_mbox_recv_message(&msg);
+ if (mbox.sequence != msg.seq) {
+ prlog(PR_ERR, "Got a response to a message we no longer care about\n");
+ goto out_response;
+ }
+
+ mbox.timeout = 0;
+ if (mbox.callback)
+ mbox.callback(&msg, mbox.drv_data);
+ else
+ prlog(PR_ERR, "Detected NULL callback for mbox message\n");
+ }
+
+out_response:
+
+ /*
+ * The BMC has touched byte 15 to get our attention as it has
+ * something to tell us.
+ */
+ if (bmc_mbox_inb(MBOX_STATUS_1) & MBOX_STATUS_1_ATTN) {
+ uint8_t action, all;
+
+ /* W1C on that reg */
+ bmc_mbox_outb(MBOX_STATUS_1_ATTN, MBOX_STATUS_1);
+
+ all = action = bmc_mbox_inb(MBOX_FLAG_REG);
+ prlog(PR_TRACE, "Got a status register interrupt with action 0x%02x\n",
+ action);
+ if (action & MBOX_ATTN_BMC_REBOOT) {
+ /*
+ * It's unlikely that something needs to be done at the
+ * driver level. Let libflash deal with it.
+			 * Print something just in case; it is quite a
+			 * significant event.
+ */
+ prlog(PR_WARNING, "BMC reset detected\n");
+ action &= ~MBOX_ATTN_BMC_REBOOT;
+ }
+
+ if (action & MBOX_ATTN_BMC_WINDOW_RESET)
+ action &= ~MBOX_ATTN_BMC_WINDOW_RESET;
+
+ if (action & MBOX_ATTN_BMC_FLASH_LOST)
+ action &= ~MBOX_ATTN_BMC_FLASH_LOST;
+
+ if (action & MBOX_ATTN_BMC_DAEMON_READY)
+ action &= ~MBOX_ATTN_BMC_DAEMON_READY;
+
+ if (action)
+			prlog(PR_ERR, "Got a status bit set that we don't know about: 0x%02x\n",
+ action);
+
+ mbox.attn(all, mbox.attn_data);
+ }
+
+ unlock(&mbox.lock);
+
+ schedule_timer(&mbox.poller,
+ mbox.irq_ok ? TIMER_POLL : msecs_to_tb(MBOX_DEFAULT_POLL_MS));
+}
+
+static void mbox_irq(uint32_t chip_id __unused, uint32_t irq_mask __unused)
+{
+ mbox.irq_ok = true;
+ mbox_poll(NULL, NULL, 0);
+}
+
+static struct lpc_client mbox_lpc_client = {
+ .interrupt = mbox_irq,
+};
+
+static bool mbox_init_hw(void)
+{
+ /* Disable all status interrupts except attentions */
+ bmc_mbox_outb(0x00, MBOX_HOST_INT_EN_0);
+ bmc_mbox_outb(MBOX_STATUS_1_ATTN, MBOX_HOST_INT_EN_1);
+
+ /* Cleanup host interrupt and status */
+ bmc_mbox_outb(MBOX_CTRL_INT_STATUS, MBOX_HOST_CTRL);
+
+ /* Disable host control interrupt for now (will be
+ * re-enabled when needed). Clear BMC interrupts
+ */
+ bmc_mbox_outb(MBOX_CTRL_INT_MASK, MBOX_BMC_CTRL);
+
+ return true;
+}
+
+int bmc_mbox_register_callback(void (*callback)(struct bmc_mbox_msg *msg, void *priv),
+ void *drv_data)
+{
+ mbox.callback = callback;
+ mbox.drv_data = drv_data;
+ return 0;
+}
+
+int bmc_mbox_register_attn(void (*callback)(uint8_t bits, void *priv),
+ void *drv_data)
+{
+ mbox.attn = callback;
+ mbox.attn_data = drv_data;
+ return 0;
+}
+
+uint8_t bmc_mbox_get_attn_reg(void)
+{
+ return bmc_mbox_inb(MBOX_FLAG_REG);
+}
+
+void mbox_init(void)
+{
+ const struct dt_property *prop;
+ struct dt_node *np;
+ uint32_t irq, chip_id;
+
+ if (mbox.base) {
+ prlog(PR_ERR, "Duplicate call to mbox_init()\n");
+ return;
+ }
+
+ prlog(PR_DEBUG, "Attempting mbox init\n");
+ np = dt_find_compatible_node(dt_root, NULL, "mbox");
+ if (!np) {
+		/* Only an ERROR on P9 and above, otherwise just
+		 * a debug message for someone doing development
+ */
+ prlog((proc_gen <= proc_gen_p8) ? PR_DEBUG : PR_ERR,
+ "No device tree entry\n");
+ return;
+ }
+
+ /* Read the interrupts property if any */
+ irq = dt_prop_get_u32_def(np, "interrupts", 0);
+ if (!irq) {
+ prlog(PR_ERR, "No interrupts property\n");
+ return;
+ }
+
+ if (!lpc_present()) {
+ prlog(PR_ERR, "LPC not present\n");
+ return;
+ }
+
+ /* Get IO base */
+ prop = dt_find_property(np, "reg");
+ if (!prop) {
+ prlog(PR_ERR, "Can't find reg property\n");
+ return;
+ }
+ if (dt_property_get_cell(prop, 0) != OPAL_LPC_IO) {
+ prlog(PR_ERR, "Only supports IO addresses\n");
+ return;
+ }
+ mbox.base = dt_property_get_cell(prop, 1);
+
+ if (!mbox_init_hw()) {
+ prlog(PR_DEBUG, "Couldn't init HW\n");
+ return;
+ }
+
+	/* Disable the standard interrupt, we don't care about it */
+ bmc_mbox_outb(MBOX_CTRL_INT_MASK, MBOX_HOST_CTRL);
+
+ /* Clear the status reg bits that we intend to use for interrupts */
+ /* W1C */
+ bmc_mbox_outb(MBOX_STATUS_1_RESP | MBOX_STATUS_1_ATTN, MBOX_STATUS_1);
+
+ mbox.queue_len = 0;
+ mbox.callback = NULL;
+ mbox.drv_data = NULL;
+ mbox.timeout = 0;
+ mbox.sequence = 0;
+ init_lock(&mbox.lock);
+
+ init_timer(&mbox.poller, mbox_poll, NULL);
+
+ chip_id = dt_get_chip_id(np);
+ mbox_lpc_client.interrupts = LPC_IRQ(irq);
+ lpc_register_client(chip_id, &mbox_lpc_client, IRQ_ATTR_TARGET_OPAL);
+
+ /* Enable interrupts */
+ bmc_mbox_outb(MBOX_STATUS_1_ATTN | MBOX_STATUS_1_RESP, MBOX_HOST_INT_EN_1);
+
+ prlog(PR_DEBUG, "Enabled on chip %d, IO port 0x%x, IRQ %d\n",
+ chip_id, mbox.base, irq);
+}
+
+
diff --git a/roms/skiboot/hw/lpc-port80h.c b/roms/skiboot/hw/lpc-port80h.c
new file mode 100644
index 000000000..0d1fee99e
--- /dev/null
+++ b/roms/skiboot/hw/lpc-port80h.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * op_display() but over the 1 byte LPC port 80h just like an original IBM PC
+ *
+ * Copyright 2018-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "Port80h: " fmt
+
+#include <lpc.h>
+#include <op-panel.h>
+#include <chip.h>
+
+/*
+ * Convert our detailed op_display() call into 1 byte for LPC port 80h
+ *
+ * Our layout looks like this:
+ * MSB (bit 7): 1 = Comes from OPAL
+ * bit 6 : 0 = OP_MOD_INIT (the main one), 1 = (see bit 5)
+ * bit 5432 : (if bit 6=0, low nibble of op-panel code)
+ * bit 5432 : (if bit 6=1, other OP_MOD_ values in bits 54:
+ * 00b=OP_MOD_CPU, 01b=OP_MOD_LOCK,
+ * 10b=OP_MOD_MEM, 11b=OP_MOD_CHIPTOD
+ * bits 0,1 from code in bits 32)
+ *
+ * bit 1,0: 00b=OP_LOG, 10b=OP_WARN, 01b=OP_ERROR, 11b=OP_FATAL
+ * i.e. bit 0 indicates ERROR or FATAL.
+ *
+ * If port 80h number has the MSB and LSB set, then you died in OPAL.
+ * Any *odd* number with the MSB set (i.e. > 0x80) indicates error.
+ */
+static inline uint8_t op_display_to_port80(uint8_t last_value, enum op_severity s, enum op_module m, uint16_t c)
+{
+ uint8_t r = 0x80; /* Start with top bit set indicating in OPAL */
+
+ switch(m) {
+ case OP_MOD_INIT:
+ /* bit 6 is zero */
+ /* bits 5432 have low nibble of c */
+ r |= (c & 0x0f) << 2;
+ break;
+ case OP_MOD_CPU:
+ r |= 0x40 | (c & 0x03) << 2;
+ break;
+ case OP_MOD_LOCK:
+ r |= 0x50 | (c & 0x03) << 2;
+ break;
+ case OP_MOD_MEM:
+ r |= 0x60 | (c & 0x03) << 2;
+ break;
+ case OP_MOD_CHIPTOD:
+ r |= 0x70 | (c & 0x03) << 2;
+ break;
+ case OP_MOD_CORE:
+ /*
+		 * The only current user of OP_MOD_CORE is OP_FATAL,
+		 * so take the last value set and tweak the bits for
+		 * OP_FATAL.
+ */
+ r = last_value & 0xFC;
+ break;
+ case OP_MOD_FSP:
+ case OP_MOD_FSPCON:
+ /* Should never be hit, port80h only used on non-FSP! */
+ break;
+ }
+
+ switch(s) {
+ case OP_LOG:
+ break;
+ case OP_WARN:
+ r |= 0x02;
+ break;
+ case OP_ERROR:
+ r |= 0x01;
+ break;
+ case OP_FATAL:
+ r |= 0x03;
+ }
+
+ return r;
+}
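+
+/*
+ * Worked examples (values follow directly from the layout above):
+ *
+ *   op_display_to_port80(last, OP_LOG, OP_MOD_INIT, 0x5)
+ *     = 0x80 | (0x5 << 2)               = 0x94
+ *   op_display_to_port80(last, OP_ERROR, OP_MOD_MEM, 0x2)
+ *     = 0x80 | 0x60 | (0x2 << 2) | 0x01 = 0xe9
+ *
+ * and an OP_FATAL from OP_MOD_CORE keeps the previously displayed
+ * progress bits, only forcing the low two bits to 11b.
+ */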
+
+/*
+ * Convert our detailed op_display() call into 2 bytes for LPC port 81h and 82h
+ *
+ * This looks pretty similar to our port80 code.
+ * Notably we now have more bits to throw progress into.
+ *
+ * Our layout looks like this:
+ * MSB (bit 15): 1 = Comes from OPAL
+ * bit 14 : 0 = OP_MOD_INIT (the main one), 1 = (see bit 13)
+ * bits 13-2 : (if bit 14=0, low 12 bits of op-panel code)
+ * bit 13,12 : (if bit 14=1, other OP_MOD_ values in bits 13 and 12:
+ * 00b=OP_MOD_CPU, 01b=OP_MOD_LOCK,
+ * 10b=OP_MOD_MEM, 11b=OP_MOD_CHIPTOD)
+ * and bits 11-2 are low 10 bits of op-panel code)
+ *
+ * bit 1,0: 00b=OP_LOG, 10b=OP_WARN, 01b=OP_ERROR, 11b=OP_FATAL
+ * i.e. bit 0 indicates ERROR or FATAL.
+ *
+ * If the port 81h/82h number has the MSB and LSB set, then you died in OPAL.
+ * Any *odd* number with the MSB set (i.e. > 0x8000) indicates error.
+ */
+static inline uint16_t op_display_to_port8x(uint16_t last_value, enum op_severity s, enum op_module m, uint16_t c)
+{
+ uint16_t r = 0x8000; /* Start with top bit set indicating in OPAL */
+
+ switch(m) {
+ case OP_MOD_INIT:
+		/* bit 14 is zero */
+ /* bits 13 through 2 have low 12 bits of c */
+ r |= (c & 0xFFF) << 2;
+ break;
+ case OP_MOD_CPU:
+ r |= 0x4000 | (c & 0x03FF) << 2;
+ break;
+ case OP_MOD_LOCK:
+ r |= 0x5000 | (c & 0x03FF) << 2;
+ break;
+ case OP_MOD_MEM:
+ r |= 0x6000 | (c & 0x03FF) << 2;
+ break;
+ case OP_MOD_CHIPTOD:
+ r |= 0x7000 | (c & 0x03FF) << 2;
+ break;
+ case OP_MOD_CORE:
+ /*
+		 * The only current user of OP_MOD_CORE is OP_FATAL,
+		 * so take the last value set and tweak the bits for
+		 * OP_FATAL.
+ */
+ r = last_value & 0xFFFC;
+ break;
+ case OP_MOD_FSP:
+ case OP_MOD_FSPCON:
+ /* Should never be hit, port80h only used on non-FSP! */
+ break;
+ }
+
+ switch(s) {
+ case OP_LOG:
+ break;
+ case OP_WARN:
+ r |= 0x02;
+ break;
+ case OP_ERROR:
+ r |= 0x01;
+ break;
+ case OP_FATAL:
+ r |= 0x03;
+ }
+
+ return r;
+}
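+
+/*
+ * Example (16-bit variant): OP_ERROR from OP_MOD_MEM with code 0x2 gives
+ * 0x8000 | 0x6000 | (0x2 << 2) | 0x01 = 0xe009, which op_display_lpc()
+ * below emits as 0xe0 to port 81h and 0x09 to port 82h.
+ */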
+
+
+void op_display_lpc(enum op_severity s, enum op_module m, uint16_t c)
+{
+ static uint8_t port80_val = 0x80;
+ static uint16_t port8x_val = 0x8000;
+
+ if (chip_quirk(QUIRK_SIMICS))
+ return;
+
+ port80_val = op_display_to_port80(port80_val, s, m, c);
+ port8x_val = op_display_to_port8x(port8x_val, s, m, c);
+
+ lpc_probe_write(OPAL_LPC_IO, 0x80, port80_val, 1);
+ lpc_probe_write(OPAL_LPC_IO, 0x81, port8x_val >> 8, 1);
+ lpc_probe_write(OPAL_LPC_IO, 0x82, port8x_val & 0xff, 1);
+}
+
diff --git a/roms/skiboot/hw/lpc-rtc.c b/roms/skiboot/hw/lpc-rtc.c
new file mode 100644
index 000000000..dc4a484b3
--- /dev/null
+++ b/roms/skiboot/hw/lpc-rtc.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Real Time Clock hanging off LPC
+ *
+ * Copyright 2015 IBM Corp.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ipmi.h>
+#include <time.h>
+#include <time-utils.h>
+#include <device.h>
+#include <opal.h>
+#include <rtc.h>
+#include <lpc.h>
+#include <lock.h>
+#include <timebase.h>
+
+/* Legacy RTC registers */
+#define RTC_REG_SECONDS 0
+#define RTC_REG_MINUTES 2
+#define RTC_REG_HOURS 4
+#define RTC_REG_DAY_OF_WEEK 6
+#define RTC_REG_DAY_OF_MONTH 7
+#define RTC_REG_MONTH 8
+#define RTC_REG_YEAR 9
+#define RTC_REG_A 10
+#define RTC_REG_A_UIP 0x80
+#define RTC_REG_B 11
+#define RTC_REG_B_DIS_UPD 0x80
+#define RTC_REG_B_PIE 0x40
+#define RTC_REG_B_AIE 0x20
+#define RTC_REG_B_UIE 0x10
+#define RTC_REG_B_SQWE 0x08
+#define RTC_REG_B_DM_BINARY 0x04
+#define RTC_REG_B_24H 0x02
+#define RTC_REG_B_DST_EN 0x01
+#define RTC_REG_C 12
+#define RTC_REG_D 13
+#define RTC_REG_D_VALID 0x80
+
+/* Init value is no interrupts, 24H mode, updates enabled */
+#define RTC_REG_B_INIT (RTC_REG_B_24H)
+
+static u32 rtc_port;
+static struct lock rtc_lock = LOCK_UNLOCKED;
+
+static uint8_t rtc_read(uint8_t reg)
+{
+ lpc_outb(reg, rtc_port);
+ return lpc_inb(rtc_port + 1);
+}
+
+static void rtc_write(uint8_t reg, uint8_t val)
+{
+ lpc_outb(reg, rtc_port);
+ lpc_outb(val, rtc_port + 1);
+}
+
+static bool lpc_rtc_read_tm(struct tm *tm)
+{
+ struct tm tm2;
+ unsigned int loops = 0;
+
+	/* Read until two consecutive passes return identical values; this
+ * should deal with update races in all practical cases
+ */
+ for (;;) {
+ tm2 = *tm;
+ tm->tm_sec = rtc_read(RTC_REG_SECONDS);
+ tm->tm_min = rtc_read(RTC_REG_MINUTES);
+ tm->tm_hour = rtc_read(RTC_REG_HOURS);
+ tm->tm_mday = rtc_read(RTC_REG_DAY_OF_MONTH);
+ tm->tm_mon = rtc_read(RTC_REG_MONTH);
+ tm->tm_year = rtc_read(RTC_REG_YEAR);
+ if (loops > 0 && memcmp(&tm2, tm, sizeof(struct tm)) == 0)
+ break;
+ loops++;
+ if (loops > 10) {
+ prerror("RTC: Failed to obtain stable values\n");
+ return false;
+ }
+ }
+ tm->tm_sec = bcd_byte(tm->tm_sec, 0);
+ tm->tm_min = bcd_byte(tm->tm_min, 0);
+ tm->tm_hour = bcd_byte(tm->tm_hour, 0);
+ tm->tm_mday = bcd_byte(tm->tm_mday, 0);
+ tm->tm_mon = bcd_byte(tm->tm_mon, 0) - 1;
+ tm->tm_year = bcd_byte(tm->tm_year, 0);
+
+ /* 2000 wrap */
+ if (tm->tm_year < 69)
+ tm->tm_year += 100;
+
+ /* Base */
+ tm->tm_year += 1900;
+
+ return true;
+}
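+
+/*
+ * Example: a stable snapshot of 23:59:30 on 5 December '31 reads back in
+ * BCD as sec=0x30, min=0x59, hour=0x23, mday=0x05, mon=0x12, year=0x31;
+ * after bcd_byte() and the adjustments above this yields tm_sec=30,
+ * tm_min=59, tm_hour=23, tm_mday=5, tm_mon=11 (0-based) and tm_year=2031
+ * (31 < 69, so the 2000 wrap plus the 1900 base both apply).
+ */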
+
+static void lpc_rtc_write_tm(struct tm *tm __unused)
+{
+ /* XXX */
+}
+
+static void lpc_init_time(void)
+{
+ uint8_t val;
+ struct tm tm;
+ bool valid;
+
+ memset(&tm, 0, sizeof(tm));
+
+ lock(&rtc_lock);
+
+ /* If update is in progress, wait a bit */
+ val = rtc_read(RTC_REG_A);
+ if (val & RTC_REG_A_UIP)
+ time_wait_ms(10);
+
+ /* Read from RTC */
+ valid = lpc_rtc_read_tm(&tm);
+
+ unlock(&rtc_lock);
+
+ /* Update cache */
+ if (valid)
+ rtc_cache_update(&tm);
+}
+
+static void lpc_init_hw(void)
+{
+ lock(&rtc_lock);
+
+ /* Set REG B to a suitable default */
+ rtc_write(RTC_REG_B, RTC_REG_B_INIT);
+
+ unlock(&rtc_lock);
+}
+
+static int64_t lpc_opal_rtc_read(__be32 *__ymd, __be64 *__hmsm)
+{
+ uint8_t val;
+ int64_t rc = OPAL_SUCCESS;
+ struct tm tm;
+ uint32_t ymd;
+ uint64_t hmsm;
+
+ if (!__ymd || !__hmsm)
+ return OPAL_PARAMETER;
+
+ /* Return busy if updating. This is somewhat racy, but will
+	 * do for now; most RTCs nowadays are smart enough to atomically
+ * update. Alternatively we could just read from the cache...
+ */
+ lock(&rtc_lock);
+ val = rtc_read(RTC_REG_A);
+ if (val & RTC_REG_A_UIP) {
+ unlock(&rtc_lock);
+ return OPAL_BUSY_EVENT;
+ }
+
+ /* Read from RTC */
+ if (lpc_rtc_read_tm(&tm))
+ rc = OPAL_SUCCESS;
+ else
+ rc = OPAL_HARDWARE;
+ unlock(&rtc_lock);
+
+ if (rc == OPAL_SUCCESS) {
+ /* Update cache */
+ rtc_cache_update(&tm);
+
+ /* Convert to OPAL time */
+ tm_to_datetime(&tm, &ymd, &hmsm);
+ *__ymd = cpu_to_be32(ymd);
+ *__hmsm = cpu_to_be64(hmsm);
+ }
+
+ return rc;
+}
+
+static int64_t lpc_opal_rtc_write(uint32_t year_month_day,
+ uint64_t hour_minute_second_millisecond)
+{
+ struct tm tm;
+
+ /* Convert to struct tm */
+ datetime_to_tm(year_month_day, hour_minute_second_millisecond, &tm);
+
+ /* Write it out */
+ lock(&rtc_lock);
+ lpc_rtc_write_tm(&tm);
+ unlock(&rtc_lock);
+
+ return OPAL_SUCCESS;
+}
+
+void lpc_rtc_init(void)
+{
+ struct dt_node *rtc_node, *np;
+
+ if (!lpc_present())
+ return;
+
+ /* We support only one */
+ rtc_node = dt_find_compatible_node(dt_root, NULL, "pnpPNP,b00");
+ if (!rtc_node)
+ return;
+
+ /* Get IO base */
+ rtc_port = dt_prop_get_cell_def(rtc_node, "reg", 1, 0);
+ if (!rtc_port) {
+ prerror("RTC: Can't find reg property\n");
+ return;
+ }
+ if (dt_prop_get_cell_def(rtc_node, "reg", 0, 0) != OPAL_LPC_IO) {
+ prerror("RTC: Unsupported address type\n");
+ return;
+ }
+
+ /* Init the HW */
+ lpc_init_hw();
+
+ /* Create OPAL API node and register OPAL calls */
+ np = dt_new(opal_node, "rtc");
+ dt_add_property_strings(np, "compatible", "ibm,opal-rtc");
+
+ opal_register(OPAL_RTC_READ, lpc_opal_rtc_read, 2);
+ opal_register(OPAL_RTC_WRITE, lpc_opal_rtc_write, 2);
+
+ /* Initialise the rtc cache */
+ lpc_init_time();
+}
diff --git a/roms/skiboot/hw/lpc-uart.c b/roms/skiboot/hw/lpc-uart.c
new file mode 100644
index 000000000..834011b37
--- /dev/null
+++ b/roms/skiboot/hw/lpc-uart.c
@@ -0,0 +1,738 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Serial port hanging off LPC
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <lpc.h>
+#include <console.h>
+#include <opal.h>
+#include <device.h>
+#include <interrupts.h>
+#include <processor.h>
+#include <errorlog.h>
+#include <trace.h>
+#include <timebase.h>
+#include <cpu.h>
+#include <chip.h>
+#include <io.h>
+#include <nvram.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_UART_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_UART,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+/* UART reg defs */
+#define REG_RBR 0
+#define REG_THR 0
+#define REG_DLL 0
+#define REG_IER 1
+#define REG_DLM 1
+#define REG_FCR 2
+#define REG_IIR 2
+#define REG_LCR 3
+#define REG_MCR 4
+#define REG_LSR 5
+#define REG_MSR 6
+#define REG_SCR 7
+
+#define LSR_DR 0x01 /* Data ready */
+#define LSR_OE 0x02 /* Overrun */
+#define LSR_PE 0x04 /* Parity error */
+#define LSR_FE 0x08 /* Framing error */
+#define LSR_BI 0x10 /* Break */
+#define LSR_THRE 0x20 /* Xmit holding register empty */
+#define LSR_TEMT 0x40 /* Xmitter empty */
+#define LSR_ERR 0x80 /* Error */
+
+#define LCR_DLAB 0x80 /* DLL access */
+
+#define IER_RX 0x01
+#define IER_THRE 0x02
+#define IER_ALL 0x0f
+
+static struct lock uart_lock = LOCK_UNLOCKED;
+static struct dt_node *uart_node;
+static uint32_t uart_base;
+static uint64_t uart_tx_full_time;
+static bool has_irq = false, irq_ok, rx_full, tx_full;
+static uint8_t tx_room;
+static uint8_t cached_ier;
+static void *mmio_uart_base;
+static int uart_console_policy = UART_CONSOLE_OPAL;
+static int lpc_irq = -1;
+
+void uart_set_console_policy(int policy)
+{
+ uart_console_policy = policy;
+}
+
+static void uart_trace(u8 ctx, u8 cnt, u8 irq_state, u8 in_count)
+{
+ union trace t;
+
+ t.uart.ctx = ctx;
+ t.uart.cnt = cnt;
+ t.uart.irq_state = irq_state;
+ t.uart.in_count = cpu_to_be16(in_count);
+ trace_add(&t, TRACE_UART, sizeof(struct trace_uart));
+}
+
+static inline uint8_t uart_read(unsigned int reg)
+{
+ if (mmio_uart_base)
+ return in_8(mmio_uart_base + reg);
+ else
+ return lpc_inb(uart_base + reg);
+}
+
+static inline void uart_write(unsigned int reg, uint8_t val)
+{
+ if (mmio_uart_base)
+ out_8(mmio_uart_base + reg, val);
+ else
+ lpc_outb(val, uart_base + reg);
+}
+
+static bool uart_check_tx_room(void)
+{
+ if (tx_room)
+ return true;
+
+ if (uart_read(REG_LSR) & LSR_THRE) {
+ /* FIFO is 16 entries */
+ tx_room = 16;
+ tx_full = false;
+ return true;
+ }
+
+ return false;
+}
+
+/* Must be called with UART lock held */
+static void uart_write_thr(uint8_t val)
+{
+ uart_write(REG_THR, val);
+
+ tx_room--;
+ if (tx_room == 0) {
+ if (!uart_check_tx_room())
+ uart_tx_full_time = mftb();
+ }
+}
+
+static bool uart_timed_out(unsigned long msecs)
+{
+ if (uart_check_tx_room())
+ return false;
+
+ if (chip_quirk(QUIRK_SLOW_SIM))
+ msecs *= 5;
+
+ if (tb_compare(mftb(), uart_tx_full_time + msecs_to_tb(msecs)) == TB_AAFTERB)
+ return true;
+
+ return false;
+}
+
+static bool uart_wait_tx_room(void)
+{
+ if (uart_check_tx_room())
+ return true;
+
+ smt_lowest();
+ while (!uart_check_tx_room()) {
+ if (uart_timed_out(100)) {
+ smt_medium();
+ return false;
+ }
+ }
+ smt_medium();
+
+ return true;
+}
+
+static void uart_update_ier(void)
+{
+ uint8_t ier = 0;
+
+ if (!has_irq)
+ return;
+
+ /* If we have never got an interrupt, enable them all,
+ * the first interrupt received will tell us if interrupts
+ * are functional (some boards are missing an EC or FPGA
+ * programming causing LPC interrupts not to work).
+ */
+ if (!irq_ok)
+ ier = IER_ALL;
+ if (!rx_full)
+ ier |= IER_RX;
+ if (tx_full)
+ ier |= IER_THRE;
+ if (ier != cached_ier) {
+ uart_write(REG_IER, ier);
+ cached_ier = ier;
+ }
+}
+
+bool uart_enabled(void)
+{
+ return mmio_uart_base || uart_base;
+}
+
+/*
+ * Internal console driver (output only)
+ */
+static size_t uart_con_write(const char *buf, size_t len)
+{
+ size_t written = 0;
+
+ /* If LPC bus is bad, we just swallow data */
+ if (!lpc_ok() && !mmio_uart_base)
+ return len;
+
+ lock(&uart_lock);
+ while (written < len) {
+ if (!uart_wait_tx_room())
+ break;
+
+ uart_write_thr(buf[written++]);
+ }
+
+ if (!written && uart_timed_out(1000)) {
+ unlock(&uart_lock);
+ return len; /* swallow data */
+ }
+
+ unlock(&uart_lock);
+
+ return written;
+}
+
+static struct con_ops uart_con_driver = {
+ .write = uart_con_write,
+};
+
+/*
+ * OPAL console driver
+ */
+
+/*
+ * We implement a simple buffer to buffer input data as some bugs in
+ * Linux make it fail to read fast enough after we get an interrupt.
+ *
+ * We use it on non-interrupt operations as well while at it because
+ * it doesn't cost us much and might help in a few cases where Linux
+ * is calling opal_poll_events() but not actually reading.
+ *
+ * Most of the time I expect we'll flush it completely to Linux into
+ * its tty flip buffers so I don't bother with a ring buffer.
+ */
+#define IN_BUF_SIZE 0x1000
+static uint8_t *in_buf;
+static uint32_t in_count;
+
+/*
+ * We implement a ring buffer for output data as well to speed things
+ * up a bit. This allows us to have interrupt driven sends. This is only
+ * for the output data coming from the OPAL API, not the internal one
+ * which is already bufferred.
+ */
+#define OUT_BUF_SIZE 0x1000
+static uint8_t *out_buf;
+static uint32_t out_buf_prod;
+static uint32_t out_buf_cons;
+
+/* Asynchronous flush, uart_lock must be held */
+static int64_t uart_con_flush(void)
+{
+ bool tx_was_full = tx_full;
+ uint32_t out_buf_cons_initial = out_buf_cons;
+
+ while(out_buf_prod != out_buf_cons) {
+ if (tx_room == 0) {
+ /*
+ * If the interrupt is not functional,
+ * we force a full synchronous flush,
+ * otherwise the Linux console isn't
+ * usable (too slow).
+ */
+ if (irq_ok)
+ uart_check_tx_room();
+ else
+ uart_wait_tx_room();
+ }
+ if (tx_room == 0) {
+ tx_full = true;
+ break;
+ }
+
+ uart_write_thr(out_buf[out_buf_cons++]);
+ out_buf_cons %= OUT_BUF_SIZE;
+ }
+ if (tx_full != tx_was_full)
+ uart_update_ier();
+ if (out_buf_prod != out_buf_cons) {
+ /* Return busy if nothing was flushed this call */
+ if (out_buf_cons == out_buf_cons_initial) {
+ if (uart_timed_out(1000))
+ return OPAL_TIMEOUT;
+ return OPAL_BUSY;
+ }
+ /* Return partial if there's more to flush */
+ return OPAL_PARTIAL;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static uint32_t uart_tx_buf_space(void)
+{
+ return OUT_BUF_SIZE - 1 -
+ (out_buf_prod + OUT_BUF_SIZE - out_buf_cons) % OUT_BUF_SIZE;
+}
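+
+/*
+ * For example, with OUT_BUF_SIZE = 0x1000, out_buf_prod = 10 and
+ * out_buf_cons = 4090 the ring holds (10 + 0x1000 - 4090) % 0x1000 = 16
+ * bytes, so uart_tx_buf_space() returns 0x1000 - 1 - 16 = 4079. The
+ * "- 1" keeps prod == cons unambiguous as the empty state.
+ */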
+
+static int64_t uart_opal_write(int64_t term_number, __be64 *__length,
+ const uint8_t *buffer)
+{
+ size_t written = 0, len = be64_to_cpu(*__length);
+ int64_t ret = OPAL_SUCCESS;
+
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+
+ lock(&uart_lock);
+
+ /* Copy data to out buffer */
+ while (uart_tx_buf_space() && len--) {
+ out_buf[out_buf_prod++] = *(buffer++);
+ out_buf_prod %= OUT_BUF_SIZE;
+ written++;
+ }
+
+ /* Flush out buffer again */
+ uart_con_flush();
+
+ if (!written && uart_timed_out(1000))
+ ret = OPAL_TIMEOUT;
+ unlock(&uart_lock);
+
+ *__length = cpu_to_be64(written);
+
+ return ret;
+}
+
+static int64_t uart_opal_write_buffer_space(int64_t term_number,
+ __be64 *__length)
+{
+ int64_t ret = OPAL_SUCCESS;
+ int64_t tx_buf_len;
+
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+
+ lock(&uart_lock);
+ tx_buf_len = uart_tx_buf_space();
+
+ if ((tx_buf_len < be64_to_cpu(*__length)) && uart_timed_out(1000))
+ ret = OPAL_TIMEOUT;
+
+ *__length = cpu_to_be64(tx_buf_len);
+ unlock(&uart_lock);
+
+ return ret;
+}
+
+/* Must be called with UART lock held */
+static void uart_read_to_buffer(void)
+{
+ /* As long as there is room in the buffer */
+ while(in_count < IN_BUF_SIZE) {
+ /* Read status register */
+ uint8_t lsr = uart_read(REG_LSR);
+
+ /* Nothing to read ... */
+ if ((lsr & LSR_DR) == 0)
+ break;
+
+ /* Read and add to buffer */
+ in_buf[in_count++] = uart_read(REG_RBR);
+ }
+
+ /* If the buffer is full disable the interrupt */
+ rx_full = (in_count == IN_BUF_SIZE);
+ uart_update_ier();
+}
+
+static void uart_adjust_opal_event(void)
+{
+ if (in_count)
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT,
+ OPAL_EVENT_CONSOLE_INPUT);
+ else
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0);
+}
+
+/* This is called with the console lock held */
+static int64_t uart_opal_read(int64_t term_number, __be64 *__length,
+ uint8_t *buffer)
+{
+ size_t req_count = be64_to_cpu(*__length), read_cnt = 0;
+ uint8_t lsr = 0;
+
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+ if (!in_buf)
+ return OPAL_INTERNAL_ERROR;
+
+ lock(&uart_lock);
+
+ /* Read from buffer first */
+ if (in_count) {
+ read_cnt = in_count;
+ if (req_count < read_cnt)
+ read_cnt = req_count;
+ memcpy(buffer, in_buf, read_cnt);
+ req_count -= read_cnt;
+ if (in_count != read_cnt)
+ memmove(in_buf, in_buf + read_cnt, in_count - read_cnt);
+ in_count -= read_cnt;
+ }
+
+ /*
+ * If there's still room in the user buffer, read from the UART
+ * directly
+ */
+ while(req_count) {
+ lsr = uart_read(REG_LSR);
+ if ((lsr & LSR_DR) == 0)
+ break;
+ buffer[read_cnt++] = uart_read(REG_RBR);
+ req_count--;
+ }
+
+ /* Finally, flush whatever's left in the UART into our buffer */
+ uart_read_to_buffer();
+
+ uart_trace(TRACE_UART_CTX_READ, read_cnt, tx_full, in_count);
+
+ unlock(&uart_lock);
+
+ /* Adjust the OPAL event */
+ uart_adjust_opal_event();
+
+ *__length = cpu_to_be64(read_cnt);
+ return OPAL_SUCCESS;
+}
+
+static int64_t uart_opal_flush(int64_t term_number)
+{
+ int64_t rc;
+
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+
+ lock(&uart_lock);
+ rc = uart_con_flush();
+ unlock(&uart_lock);
+
+ return rc;
+}
+
+static void __uart_do_poll(u8 trace_ctx)
+{
+ if (!in_buf)
+ return;
+
+ lock(&uart_lock);
+ uart_read_to_buffer();
+ uart_con_flush();
+ uart_trace(trace_ctx, 0, tx_full, in_count);
+ unlock(&uart_lock);
+
+ uart_adjust_opal_event();
+}
+
+static void uart_console_poll(void *data __unused)
+{
+ __uart_do_poll(TRACE_UART_CTX_POLL);
+}
+
+static void uart_irq(uint32_t chip_id __unused, uint32_t irq_mask __unused)
+{
+ if (!irq_ok) {
+ prlog(PR_DEBUG, "UART: IRQ functional !\n");
+ irq_ok = true;
+ }
+ __uart_do_poll(TRACE_UART_CTX_IRQ);
+}
+
+/*
+ * Common setup/inits
+ */
+
+static void uart_setup_os_passthrough(void)
+{
+ char *path;
+
+ static struct lpc_client uart_lpc_os_client = {
+ .reset = NULL,
+ .interrupt = NULL,
+ .interrupts = 0
+ };
+
+ dt_add_property_strings(uart_node, "status", "ok");
+ path = dt_get_path(uart_node);
+ dt_add_property_string(dt_chosen, "linux,stdout-path", path);
+ free(path);
+
+ /* Setup LPC client for OS interrupts */
+ if (lpc_irq >= 0) {
+ uint32_t chip_id = dt_get_chip_id(uart_node);
+ uart_lpc_os_client.interrupts = LPC_IRQ(lpc_irq);
+ lpc_register_client(chip_id, &uart_lpc_os_client,
+ IRQ_ATTR_TARGET_LINUX);
+ }
+ prlog(PR_DEBUG, "UART: Enabled as OS pass-through\n");
+}
+
+static void uart_setup_opal_console(void)
+{
+ static struct lpc_client uart_lpc_opal_client = {
+ .interrupt = uart_irq,
+ };
+
+ /* Add the opal console node */
+ add_opal_console_node(0, "raw", OUT_BUF_SIZE);
+
+ dt_add_property_string(dt_chosen, "linux,stdout-path",
+ "/ibm,opal/consoles/serial@0");
+
+ /*
+ * We mark the UART as reserved since we don't want the
+ * kernel to start using it with its own 8250 driver
+ */
+ dt_add_property_strings(uart_node, "status", "reserved");
+
+ /* Allocate an input buffer */
+ in_buf = zalloc(IN_BUF_SIZE);
+ out_buf = zalloc(OUT_BUF_SIZE);
+
+ /* Setup LPC client for OPAL interrupts */
+ if (lpc_irq >= 0) {
+ uint32_t chip_id = dt_get_chip_id(uart_node);
+ uart_lpc_opal_client.interrupts = LPC_IRQ(lpc_irq);
+ lpc_register_client(chip_id, &uart_lpc_opal_client,
+ IRQ_ATTR_TARGET_OPAL);
+ has_irq = true;
+ }
+
+ /*
+ * If the interrupt is enabled, turn on RX interrupts (and
+	 * only these for now)
+ */
+ tx_full = rx_full = false;
+ uart_update_ier();
+
+ /* Start console poller */
+ opal_add_poller(uart_console_poll, NULL);
+}
+
+static void uart_init_opal_console(void)
+{
+ const char *nv_policy;
+
+ /* Update the policy if the corresponding nvram variable
+ * is present
+ */
+ nv_policy = nvram_query_dangerous("uart-con-policy");
+ if (nv_policy) {
+ if (!strcmp(nv_policy, "opal"))
+ uart_console_policy = UART_CONSOLE_OPAL;
+ else if (!strcmp(nv_policy, "os"))
+ uart_console_policy = UART_CONSOLE_OS;
+ else
+ prlog(PR_WARNING,
+ "UART: Unknown console policy in NVRAM: %s\n",
+ nv_policy);
+ }
+ if (uart_console_policy == UART_CONSOLE_OPAL)
+ uart_setup_opal_console();
+ else
+ uart_setup_os_passthrough();
+}
+
+struct opal_con_ops uart_opal_con = {
+ .name = "OPAL UART console",
+ .init = uart_init_opal_console,
+ .read = uart_opal_read,
+ .write = uart_opal_write,
+ .space = uart_opal_write_buffer_space,
+ .flush = uart_opal_flush,
+};
+
+static bool uart_init_hw(unsigned int speed, unsigned int clock)
+{
+ unsigned int dll = (clock / 16) / speed;
+
+ /* Clear line control */
+ uart_write(REG_LCR, 0x00);
+
+ /* Check if the UART responds */
+ uart_write(REG_IER, 0x01);
+ if (uart_read(REG_IER) != 0x01)
+ goto detect_fail;
+ uart_write(REG_IER, 0x00);
+ if (uart_read(REG_IER) != 0x00)
+ goto detect_fail;
+
+ uart_write(REG_LCR, LCR_DLAB);
+ uart_write(REG_DLL, dll & 0xff);
+ uart_write(REG_DLM, dll >> 8);
+ uart_write(REG_LCR, 0x03); /* 8N1 */
+ uart_write(REG_MCR, 0x03); /* RTS/DTR */
+ uart_write(REG_FCR, 0x07); /* clear & en. fifos */
+
+ /*
+ * On some UART implementations[1], we have observed that characters
+ * written to the UART during early boot (where no RX path is used,
+ * so we don't read from RBR) can cause a character timeout interrupt
+ * once we eventually enable interrupts through the IER. This
+ * interrupt can only be cleared by reading from RBR (even though we've
+ * cleared the RX FIFO!).
+ *
+ * Unfortunately though, the LSR[DR] bit does *not* indicate that there
+ * are characters to be read from RBR, so we may never read it, so the
+ * interrupt continuously fires.
+ *
+ * So, manually clear the timeout interrupt by reading the RBR here.
+ * We discard the read data, but that shouldn't matter as we've just
+ * reset the FIFO anyway.
+ *
+ * 1: seen on the AST2500 SUART. I assume this applies to 2400 too.
+ */
+ uart_read(REG_RBR);
+
+ return true;
+
+ detect_fail:
+ prerror("UART: Presence detect failed !\n");
+ return false;
+}
+
+/*
+ * early_uart_init() is similar to uart_init() in that it configures the skiboot
+ * console log to output via a UART. The main differences are that the early
+ * version only works with MMIO UARTs and does not set up interrupts or locks.
+ */
+void early_uart_init(void)
+{
+ struct dt_node *uart_node;
+ u32 clk, baud;
+
+ uart_node = dt_find_compatible_node(dt_root, NULL, "ns16550");
+ if (!uart_node)
+ return;
+
+ /* Try translate the address, if this fails then it's not a MMIO UART */
+ mmio_uart_base = (void *) dt_translate_address(uart_node, 0, NULL);
+ if (!mmio_uart_base)
+ return;
+
+ clk = dt_prop_get_u32(uart_node, "clock-frequency");
+ baud = dt_prop_get_u32(uart_node, "current-speed");
+
+ if (uart_init_hw(baud, clk)) {
+ set_console(&uart_con_driver);
+ prlog(PR_DEBUG, "UART: Using UART at %p\n", mmio_uart_base);
+ } else {
+ prerror("UART: Early init failed!\n");
+ mmio_uart_base = NULL;
+ }
+}
+
+void uart_init(void)
+{
+ const struct dt_property *prop;
+ struct dt_node *n;
+ char *path __unused;
+ const be32 *irqp;
+
+ /* Clean up after early_uart_init() */
+ mmio_uart_base = NULL;
+
+ /* UART lock is in the console path and thus must block
+ * printf re-entrancy
+ */
+ uart_lock.in_con_path = true;
+
+ /* We support only one */
+ uart_node = n = dt_find_compatible_node(dt_root, NULL, "ns16550");
+ if (!n)
+ return;
+
+ /* Read the interrupts property if any */
+ irqp = dt_prop_get_def(n, "interrupts", NULL);
+
+ /* Now check if the UART is on the root bus. This is the case for
+ * directly mapped UARTs in simulation environments
+ */
+ if (n->parent == dt_root) {
+ printf("UART: Found at root !\n");
+ mmio_uart_base = (void *)dt_translate_address(n, 0, NULL);
+ if (!mmio_uart_base) {
+ printf("UART: Failed to translate address !\n");
+ return;
+ }
+
+ /* If it has an interrupts property, we consider this to be
+ * a direct XICS/XIVE interrupt
+ */
+ if (irqp)
+ has_irq = true;
+
+ } else {
+ if (!lpc_present())
+ return;
+
+ /* Get IO base */
+ prop = dt_find_property(n, "reg");
+ if (!prop) {
+ log_simple_error(&e_info(OPAL_RC_UART_INIT),
+ "UART: Can't find reg property\n");
+ return;
+ }
+ if (dt_property_get_cell(prop, 0) != OPAL_LPC_IO) {
+ log_simple_error(&e_info(OPAL_RC_UART_INIT),
+ "UART: Only supports IO addresses\n");
+ return;
+ }
+ uart_base = dt_property_get_cell(prop, 1);
+
+ if (irqp) {
+ lpc_irq = be32_to_cpu(*irqp);
+ prlog(PR_DEBUG, "UART: Using LPC IRQ %d\n", lpc_irq);
+ }
+ }
+
+
+ if (!uart_init_hw(dt_prop_get_u32(n, "current-speed"),
+ dt_prop_get_u32(n, "clock-frequency"))) {
+ prerror("UART: Initialization failed\n");
+ dt_add_property_strings(n, "status", "bad");
+ return;
+ }
+
+ /*
+ * Mark LPC used by the console (will mark the relevant
+ * locks to avoid deadlocks when flushing the console)
+ */
+ lpc_used_by_console();
+
+ /* Install console backend for printf() */
+ set_console(&uart_con_driver);
+}
+
diff --git a/roms/skiboot/hw/lpc.c b/roms/skiboot/hw/lpc.c
new file mode 100644
index 000000000..bf3ab1fae
--- /dev/null
+++ b/roms/skiboot/hw/lpc.c
@@ -0,0 +1,1407 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Low Pin Count (LPC) Bus.
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "LPC: " fmt
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <lock.h>
+#include <chip.h>
+#include <lpc.h>
+#include <timebase.h>
+#include <errorlog.h>
+#include <opal-api.h>
+#include <platform.h>
+#include <psi.h>
+#include <interrupts.h>
+
+//#define DBG_IRQ(fmt...) prerror(fmt)
+#define DBG_IRQ(fmt...) do { } while(0)
+
+DEFINE_LOG_ENTRY(OPAL_RC_LPC_READ, OPAL_PLATFORM_ERR_EVT, OPAL_LPC,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_LPC_WRITE, OPAL_PLATFORM_ERR_EVT, OPAL_LPC,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_LPC_SYNC, OPAL_PLATFORM_ERR_EVT, OPAL_LPC,
+ OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+/* Used exclusively in manufacturing mode */
+DEFINE_LOG_ENTRY(OPAL_RC_LPC_SYNC_PERF, OPAL_PLATFORM_ERR_EVT, OPAL_LPC,
+ OPAL_MISC_SUBSYSTEM, OPAL_UNRECOVERABLE_ERR_DEGRADE_PERF,
+ OPAL_NA);
+
+#define ECCB_CTL 0 /* b0020 -> b00200 */
+#define ECCB_STAT 2 /* b0022 -> b00210 */
+#define ECCB_DATA 3 /* b0023 -> b00218 */
+
+#define ECCB_CTL_MAGIC 0xd000000000000000ul
+#define ECCB_CTL_DATASZ PPC_BITMASK(4,7)
+#define ECCB_CTL_READ PPC_BIT(15)
+#define ECCB_CTL_ADDRLEN PPC_BITMASK(23,25)
+#define ECCB_ADDRLEN_4B 0x4
+#define ECCB_CTL_ADDR PPC_BITMASK(32,63)
+
+#define ECCB_STAT_PIB_ERR PPC_BITMASK(0,5)
+#define ECCB_STAT_RD_DATA PPC_BITMASK(6,37)
+#define ECCB_STAT_BUSY PPC_BIT(44)
+#define ECCB_STAT_ERRORS1 PPC_BITMASK(45,51)
+#define ECCB_STAT_OP_DONE PPC_BIT(52)
+#define ECCB_STAT_ERRORS2 PPC_BITMASK(53,55)
+
+#define ECCB_STAT_ERR_MASK (ECCB_STAT_PIB_ERR | \
+ ECCB_STAT_ERRORS1 | \
+ ECCB_STAT_ERRORS2)
+
+#define ECCB_TIMEOUT 1000000
+
+/* OPB Master LS registers */
+#define OPB_MASTER_LS_IRQ_STAT 0x50
+#define OPB_MASTER_LS_IRQ_MASK 0x54
+#define OPB_MASTER_LS_IRQ_POL 0x58
+#define OPB_MASTER_IRQ_LPC 0x00000800
+
+/* LPC HC registers */
+#define LPC_HC_FW_SEG_IDSEL 0x24
+#define LPC_HC_FW_RD_ACC_SIZE 0x28
+#define LPC_HC_FW_RD_1B 0x00000000
+#define LPC_HC_FW_RD_2B 0x01000000
+#define LPC_HC_FW_RD_4B 0x02000000
+#define LPC_HC_FW_RD_16B 0x04000000
+#define LPC_HC_FW_RD_128B 0x07000000
+#define LPC_HC_IRQSER_CTRL 0x30
+#define LPC_HC_IRQSER_EN 0x80000000
+#define LPC_HC_IRQSER_QMODE 0x40000000
+#define LPC_HC_IRQSER_START_MASK 0x03000000
+#define LPC_HC_IRQSER_START_4CLK 0x00000000
+#define LPC_HC_IRQSER_START_6CLK 0x01000000
+#define LPC_HC_IRQSER_START_8CLK 0x02000000
+#define LPC_HC_IRQSER_AUTO_CLEAR 0x00800000
+#define LPC_HC_IRQMASK 0x34 /* same bit defs as LPC_HC_IRQSTAT */
+#define LPC_HC_IRQSTAT 0x38
+#define LPC_HC_IRQ_SERIRQ0 0x80000000u /* all bits down to ... */
+#define LPC_HC_IRQ_SERIRQ16 0x00008000 /* IRQ16=IOCHK#, IRQ2=SMI# */
+#define LPC_HC_IRQ_SERIRQ_ALL 0xffff8000
+#define LPC_HC_IRQ_LRESET 0x00000400
+#define LPC_HC_IRQ_SYNC_ABNORM_ERR 0x00000080
+#define LPC_HC_IRQ_SYNC_NORESP_ERR 0x00000040
+#define LPC_HC_IRQ_SYNC_NORM_ERR 0x00000020
+#define LPC_HC_IRQ_SYNC_TIMEOUT_ERR 0x00000010
+#define LPC_HC_IRQ_TARG_TAR_ERR 0x00000008
+#define LPC_HC_IRQ_BM_TAR_ERR 0x00000004
+#define LPC_HC_IRQ_BM0_REQ 0x00000002
+#define LPC_HC_IRQ_BM1_REQ 0x00000001
+#define LPC_HC_IRQ_BASE_IRQS ( \
+ LPC_HC_IRQ_LRESET | \
+ LPC_HC_IRQ_SYNC_ABNORM_ERR | \
+ LPC_HC_IRQ_SYNC_NORESP_ERR | \
+ LPC_HC_IRQ_SYNC_NORM_ERR | \
+ LPC_HC_IRQ_SYNC_TIMEOUT_ERR | \
+ LPC_HC_IRQ_TARG_TAR_ERR | \
+ LPC_HC_IRQ_BM_TAR_ERR)
+#define LPC_HC_ERROR_ADDRESS 0x40
+
+#define LPC_NUM_SERIRQ 17
+
+enum {
+ LPC_ROUTE_FREE = 0,
+ LPC_ROUTE_OPAL,
+ LPC_ROUTE_LINUX
+};
+
+struct lpc_error_entry {
+ int64_t rc;
+ const char *description;
+};
+
+struct lpcm {
+ uint32_t chip_id;
+ uint32_t xbase;
+ void *mbase;
+ struct lock lock;
+ uint8_t fw_idsel;
+ uint8_t fw_rdsz;
+ struct list_head clients;
+ bool has_serirq;
+ uint8_t sirq_routes[LPC_NUM_SERIRQ];
+ bool sirq_routed[LPC_NUM_SERIRQ];
+ uint32_t sirq_rmasks[4];
+ uint8_t sirq_ralloc[4];
+ struct dt_node *node;
+};
+
+
+#define LPC_BUS_DEGRADED_PERF_THRESHOLD 5
+
+struct lpc_client_entry {
+ struct list_node node;
+ const struct lpc_client *clt;
+ uint32_t policy;
+};
+
+/* Default LPC bus */
+static int32_t lpc_default_chip_id = -1;
+static bool lpc_irqs_ready;
+
+/*
+ * These are expected to be the same on all chips and should probably
+ * be read (or configured) dynamically. This is how things are configured
+ * today on Tuletta.
+ */
+static uint32_t lpc_io_opb_base = 0xd0010000;
+static uint32_t lpc_mem_opb_base = 0xe0000000;
+static uint32_t lpc_fw_opb_base = 0xf0000000;
+static uint32_t lpc_reg_opb_base = 0xc0012000;
+static uint32_t opb_master_reg_base = 0xc0010000;
+
+static int64_t opb_mmio_write(struct lpcm *lpc, uint32_t addr, uint32_t data,
+ uint32_t sz)
+{
+ switch (sz) {
+ case 1:
+ out_8(lpc->mbase + addr, data);
+ return OPAL_SUCCESS;
+ case 2:
+ out_be16(lpc->mbase + addr, data);
+ return OPAL_SUCCESS;
+ case 4:
+ out_be32(lpc->mbase + addr, data);
+ return OPAL_SUCCESS;
+ }
+ prerror("Invalid data size %d\n", sz);
+ return OPAL_PARAMETER;
+}
+
+static int64_t opb_write(struct lpcm *lpc, uint32_t addr, uint32_t data,
+ uint32_t sz)
+{
+ uint64_t ctl = ECCB_CTL_MAGIC, stat;
+ int64_t rc, tout;
+ uint64_t data_reg;
+
+ if (lpc->mbase)
+ return opb_mmio_write(lpc, addr, data, sz);
+
+ switch(sz) {
+ case 1:
+ data_reg = ((uint64_t)data) << 56;
+ break;
+ case 2:
+ data_reg = ((uint64_t)data) << 48;
+ break;
+ case 4:
+ data_reg = ((uint64_t)data) << 32;
+ break;
+ default:
+ prerror("Invalid data size %d\n", sz);
+ return OPAL_PARAMETER;
+ }
+
+ rc = xscom_write(lpc->chip_id, lpc->xbase + ECCB_DATA, data_reg);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_LPC_WRITE),
+ "LPC: XSCOM write to ECCB DATA error %lld\n", rc);
+ return rc;
+ }
+
+ ctl = SETFIELD(ECCB_CTL_DATASZ, ctl, sz);
+ ctl = SETFIELD(ECCB_CTL_ADDRLEN, ctl, ECCB_ADDRLEN_4B);
+ ctl = SETFIELD(ECCB_CTL_ADDR, ctl, addr);
+ rc = xscom_write(lpc->chip_id, lpc->xbase + ECCB_CTL, ctl);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_LPC_WRITE),
+ "LPC: XSCOM write to ECCB CTL error %lld\n", rc);
+ return rc;
+ }
+
+ for (tout = 0; tout < ECCB_TIMEOUT; tout++) {
+ rc = xscom_read(lpc->chip_id, lpc->xbase + ECCB_STAT,
+ &stat);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_LPC_WRITE),
+ "LPC: XSCOM read from ECCB STAT err %lld\n",
+ rc);
+ return rc;
+ }
+ if (stat & ECCB_STAT_OP_DONE) {
+ if (stat & ECCB_STAT_ERR_MASK) {
+ log_simple_error(&e_info(OPAL_RC_LPC_WRITE),
+ "LPC: Error status: 0x%llx\n", stat);
+ return OPAL_HARDWARE;
+ }
+ return OPAL_SUCCESS;
+ }
+ time_wait_nopoll(100);
+ }
+ log_simple_error(&e_info(OPAL_RC_LPC_WRITE), "LPC: Write timeout !\n");
+ return OPAL_HARDWARE;
+}
+
+static int64_t opb_mmio_read(struct lpcm *lpc, uint32_t addr, uint32_t *data,
+ uint32_t sz)
+{
+ switch (sz) {
+ case 1:
+ *data = in_8(lpc->mbase + addr);
+ return OPAL_SUCCESS;
+ case 2:
+ *data = in_be16(lpc->mbase + addr);
+ return OPAL_SUCCESS;
+ case 4:
+ *data = in_be32(lpc->mbase + addr);
+ return OPAL_SUCCESS;
+ }
+ prerror("Invalid data size %d\n", sz);
+ return OPAL_PARAMETER;
+}
+
+static int64_t opb_read(struct lpcm *lpc, uint32_t addr, uint32_t *data,
+ uint32_t sz)
+{
+ uint64_t ctl = ECCB_CTL_MAGIC | ECCB_CTL_READ, stat;
+ int64_t rc, tout;
+
+ if (lpc->mbase)
+ return opb_mmio_read(lpc, addr, data, sz);
+
+ if (sz != 1 && sz != 2 && sz != 4) {
+ prerror("Invalid data size %d\n", sz);
+ return OPAL_PARAMETER;
+ }
+
+ ctl = SETFIELD(ECCB_CTL_DATASZ, ctl, sz);
+ ctl = SETFIELD(ECCB_CTL_ADDRLEN, ctl, ECCB_ADDRLEN_4B);
+ ctl = SETFIELD(ECCB_CTL_ADDR, ctl, addr);
+ rc = xscom_write(lpc->chip_id, lpc->xbase + ECCB_CTL, ctl);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_LPC_READ),
+ "LPC: XSCOM write to ECCB CTL error %lld\n", rc);
+ return rc;
+ }
+
+ for (tout = 0; tout < ECCB_TIMEOUT; tout++) {
+ rc = xscom_read(lpc->chip_id, lpc->xbase + ECCB_STAT,
+ &stat);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_LPC_READ),
+ "LPC: XSCOM read from ECCB STAT err %lld\n",
+ rc);
+ return rc;
+ }
+ if (stat & ECCB_STAT_OP_DONE) {
+ uint32_t rdata = GETFIELD(ECCB_STAT_RD_DATA, stat);
+ if (stat & ECCB_STAT_ERR_MASK) {
+ log_simple_error(&e_info(OPAL_RC_LPC_READ),
+ "LPC: Error status: 0x%llx\n", stat);
+ return OPAL_HARDWARE;
+ }
+ switch(sz) {
+ case 1:
+ *data = rdata >> 24;
+ break;
+ case 2:
+ *data = rdata >> 16;
+ break;
+ default:
+ *data = rdata;
+ break;
+ }
+ return 0;
+ }
+ time_wait_nopoll(100);
+ }
+ log_simple_error(&e_info(OPAL_RC_LPC_READ), "LPC: Read timeout !\n");
+ return OPAL_HARDWARE;
+}
+
+static int64_t lpc_set_fw_idsel(struct lpcm *lpc, uint8_t idsel)
+{
+ uint32_t val;
+ int64_t rc;
+
+ if (idsel == lpc->fw_idsel)
+ return OPAL_SUCCESS;
+ if (idsel > 0xf)
+ return OPAL_PARAMETER;
+
+ rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_FW_SEG_IDSEL,
+ &val, 4);
+ if (rc) {
+ prerror("Failed to read HC_FW_SEG_IDSEL register !\n");
+ return rc;
+ }
+ val = (val & 0xfffffff0) | idsel;
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_FW_SEG_IDSEL,
+ val, 4);
+ if (rc) {
+ prerror("Failed to write HC_FW_SEG_IDSEL register !\n");
+ return rc;
+ }
+ lpc->fw_idsel = idsel;
+ return OPAL_SUCCESS;
+}
+
+static int64_t lpc_set_fw_rdsz(struct lpcm *lpc, uint8_t rdsz)
+{
+ uint32_t val;
+ int64_t rc;
+
+ if (rdsz == lpc->fw_rdsz)
+ return OPAL_SUCCESS;
+ switch(rdsz) {
+ case 1:
+ val = LPC_HC_FW_RD_1B;
+ break;
+ case 2:
+ val = LPC_HC_FW_RD_2B;
+ break;
+ case 4:
+ val = LPC_HC_FW_RD_4B;
+ break;
+ default:
+ /*
+ * The HW supports 16 and 128 byte reads via a buffer/cache,
+ * but I have never experimented with it and am not
+ * sure it works the way we expect, so let's leave it
+ * at that for now
+ */
+ return OPAL_PARAMETER;
+ }
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_FW_RD_ACC_SIZE,
+ val, 4);
+ if (rc) {
+ prerror("Failed to write LPC_HC_FW_RD_ACC_SIZE !\n");
+ return rc;
+ }
+ lpc->fw_rdsz = rdsz;
+ return OPAL_SUCCESS;
+}
+
+static int64_t lpc_opb_prepare(struct lpcm *lpc,
+ enum OpalLPCAddressType addr_type,
+ uint32_t addr, uint32_t sz,
+ uint32_t *opb_base, bool is_write)
+{
+ uint32_t top = addr + sz;
+ uint8_t fw_idsel;
+ int64_t rc;
+
+ /* Address wraparound */
+ if (top < addr)
+ return OPAL_PARAMETER;
+
+ /*
+ * Bound check access and get the OPB base address for
+ * the window corresponding to the access type
+ */
+ switch(addr_type) {
+ case OPAL_LPC_IO:
+ /* IO space is 64K */
+ if (top > 0x10000)
+ return OPAL_PARAMETER;
+ /* And only supports byte accesses */
+ if (sz != 1)
+ return OPAL_PARAMETER;
+ *opb_base = lpc_io_opb_base;
+ break;
+ case OPAL_LPC_MEM:
+ /* MEM space is 256M */
+ if (top > 0x10000000)
+ return OPAL_PARAMETER;
+ /* And only supports byte accesses */
+ if (sz != 1)
+ return OPAL_PARAMETER;
+ *opb_base = lpc_mem_opb_base;
+ break;
+ case OPAL_LPC_FW:
+ /*
+ * FW space is in segments of 256M controlled
+ * by IDSEL, make sure we don't cross segments
+ */
+ *opb_base = lpc_fw_opb_base;
+ fw_idsel = (addr >> 28);
+ if (((top - 1) >> 28) != fw_idsel)
+ return OPAL_PARAMETER;
+
+ /* Set segment */
+ rc = lpc_set_fw_idsel(lpc, fw_idsel);
+ if (rc)
+ return rc;
+ /* Set read access size */
+ if (!is_write) {
+ rc = lpc_set_fw_rdsz(lpc, sz);
+ if (rc)
+ return rc;
+ }
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+ return OPAL_SUCCESS;
+}
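+
+/*
+ * Worked example (illustrative): an OPAL_LPC_IO byte access to port
+ * 0x3f8 passes the 64K bound check, selects lpc_io_opb_base and ends
+ * up as an OPB access at 0xd0010000 + 0x3f8 = 0xd00103f8. FW-space
+ * accesses additionally program IDSEL (and the read access size for
+ * reads) before the OPB access is issued.
+ */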
+
+#define LPC_ERROR_IDX(x) (__builtin_ffs(x) - 1 - 2)
+#define LPC_ERROR(_sts, _rc, _description) \
+ [LPC_ERROR_IDX(_sts)] = { _rc, _description }
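+/*
+ * Index math, illustrated: LPC_HC_IRQ_SYNC_TIMEOUT_ERR is 0x00000010,
+ * __builtin_ffs() returns 5, so LPC_ERROR_IDX() yields 5 - 1 - 2 = 2
+ * and that error lands in slot 2 of the table below.
+ */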
+static const struct lpc_error_entry lpc_error_table[] = {
+ LPC_ERROR(LPC_HC_IRQ_BM_TAR_ERR, OPAL_WRONG_STATE, "Got bus master TAR error."),
+ LPC_ERROR(LPC_HC_IRQ_TARG_TAR_ERR, OPAL_WRONG_STATE, "Got abnormal TAR error."),
+ LPC_ERROR(LPC_HC_IRQ_SYNC_TIMEOUT_ERR, OPAL_TIMEOUT, "Got SYNC timeout error."),
+ LPC_ERROR(LPC_HC_IRQ_SYNC_NORM_ERR, OPAL_WRONG_STATE, "Got SYNC normal error."),
+ LPC_ERROR(LPC_HC_IRQ_SYNC_NORESP_ERR, OPAL_HARDWARE, "Got SYNC no-response error."),
+ LPC_ERROR(LPC_HC_IRQ_SYNC_ABNORM_ERR, OPAL_WRONG_STATE, "Got SYNC abnormal error."),
+};
+
+static int64_t lpc_probe_prepare(struct lpcm *lpc)
+{
+ const uint32_t irqmask_addr = lpc_reg_opb_base + LPC_HC_IRQMASK;
+ const uint32_t irqstat_addr = lpc_reg_opb_base + LPC_HC_IRQSTAT;
+ uint32_t irqmask;
+ int rc;
+
+ rc = opb_read(lpc, irqmask_addr, &irqmask, 4);
+ if (rc)
+ return rc;
+
+ irqmask &= ~LPC_HC_IRQ_SYNC_NORESP_ERR;
+ rc = opb_write(lpc, irqmask_addr, irqmask, 4);
+ if (rc)
+ return rc;
+
+ return opb_write(lpc, irqstat_addr, LPC_HC_IRQ_SYNC_NORESP_ERR, 4);
+}
+
+static int64_t lpc_probe_test(struct lpcm *lpc)
+{
+ const uint32_t irqmask_addr = lpc_reg_opb_base + LPC_HC_IRQMASK;
+ const uint32_t irqstat_addr = lpc_reg_opb_base + LPC_HC_IRQSTAT;
+ uint32_t irqmask, irqstat;
+ int64_t idx;
+ int rc;
+
+ rc = opb_read(lpc, irqstat_addr, &irqstat, 4);
+ if (rc)
+ return rc;
+
+ rc = opb_write(lpc, irqstat_addr, LPC_HC_IRQ_SYNC_NORESP_ERR, 4);
+ if (rc)
+ return rc;
+
+ rc = opb_read(lpc, irqmask_addr, &irqmask, 4);
+ if (rc)
+ return rc;
+
+ irqmask |= LPC_HC_IRQ_SYNC_NORESP_ERR;
+ rc = opb_write(lpc, irqmask_addr, irqmask, 4);
+ if (rc)
+ return rc;
+
+ if (!(irqstat & LPC_HC_IRQ_BASE_IRQS))
+ return OPAL_SUCCESS;
+
+ /* Ensure we can perform a valid lookup in the error table */
+ idx = LPC_ERROR_IDX(irqstat);
+ if (idx < 0 || idx >= ARRAY_SIZE(lpc_error_table)) {
+ prerror("LPC bus error translation failed with status 0x%x\n",
+ irqstat);
+ return OPAL_PARAMETER;
+ }
+
+ rc = lpc_error_table[idx].rc;
+ return rc;
+}
+
+static int64_t __lpc_write(struct lpcm *lpc, enum OpalLPCAddressType addr_type,
+ uint32_t addr, uint32_t data, uint32_t sz,
+ bool probe)
+{
+ uint32_t opb_base;
+ int64_t rc;
+
+ lock(&lpc->lock);
+ if (probe) {
+ rc = lpc_probe_prepare(lpc);
+ if (rc)
+ goto bail;
+ }
+
+ /*
+ * Convert to an OPB access and handle LPC HC configuration
+ * for FW accesses (IDSEL)
+ */
+ rc = lpc_opb_prepare(lpc, addr_type, addr, sz, &opb_base, true);
+ if (rc)
+ goto bail;
+
+ /* Perform OPB access */
+ rc = opb_write(lpc, opb_base + addr, data, sz);
+ if (rc)
+ goto bail;
+
+ if (probe)
+ rc = lpc_probe_test(lpc);
+ bail:
+ unlock(&lpc->lock);
+ return rc;
+}
+
+static int64_t __lpc_write_sanity(enum OpalLPCAddressType addr_type,
+ uint32_t addr, uint32_t data, uint32_t sz,
+ bool probe)
+{
+ struct proc_chip *chip;
+
+ if (lpc_default_chip_id < 0)
+ return OPAL_PARAMETER;
+ chip = get_chip(lpc_default_chip_id);
+ if (!chip || !chip->lpc)
+ return OPAL_PARAMETER;
+ return __lpc_write(chip->lpc, addr_type, addr, data, sz, probe);
+}
+
+int64_t lpc_write(enum OpalLPCAddressType addr_type, uint32_t addr,
+ uint32_t data, uint32_t sz)
+{
+ return __lpc_write_sanity(addr_type, addr, data, sz, false);
+}
+
+int64_t lpc_probe_write(enum OpalLPCAddressType addr_type, uint32_t addr,
+ uint32_t data, uint32_t sz)
+{
+ return __lpc_write_sanity(addr_type, addr, data, sz, true);
+}
+
+/*
+ * The "OPAL" variant adds the emulation of 2 and 4 byte accesses using
+ * byte accesses for IO and MEM space in order to be compatible with
+ * existing Linux expectations
+ */
+static int64_t opal_lpc_write(uint32_t chip_id, enum OpalLPCAddressType addr_type,
+ uint32_t addr, uint32_t data, uint32_t sz)
+{
+ struct proc_chip *chip;
+ int64_t rc;
+
+ chip = get_chip(chip_id);
+ if (!chip || !chip->lpc)
+ return OPAL_PARAMETER;
+
+ if (addr_type == OPAL_LPC_FW || sz == 1)
+ return __lpc_write(chip->lpc, addr_type, addr, data, sz, false);
+ while(sz--) {
+ rc = __lpc_write(chip->lpc, addr_type, addr, data & 0xff, 1, false);
+ if (rc)
+ return rc;
+ addr++;
+ data >>= 8;
+ }
+ return OPAL_SUCCESS;
+}
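+
+/*
+ * Illustration of the emulation above: a 4-byte OPAL_LPC_IO write of
+ * 0x11223344 to address 0x80 is issued as four byte writes, 0x44 to
+ * 0x80, 0x33 to 0x81, 0x22 to 0x82 and 0x11 to 0x83, i.e. the least
+ * significant byte goes to the lowest LPC address.
+ */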
+
+static int64_t __lpc_read(struct lpcm *lpc, enum OpalLPCAddressType addr_type,
+ uint32_t addr, uint32_t *data, uint32_t sz,
+ bool probe)
+{
+ uint32_t opb_base;
+ int64_t rc;
+
+ lock(&lpc->lock);
+ if (probe) {
+ rc = lpc_probe_prepare(lpc);
+ if (rc)
+ goto bail;
+ }
+
+ /*
+ * Convert to an OPB access and handle LPC HC configuration
+ * for FW accesses (IDSEL and read size)
+ */
+ rc = lpc_opb_prepare(lpc, addr_type, addr, sz, &opb_base, false);
+ if (rc)
+ goto bail;
+
+ /* Perform OPB access */
+ rc = opb_read(lpc, opb_base + addr, data, sz);
+ if (rc)
+ goto bail;
+
+ if (probe)
+ rc = lpc_probe_test(lpc);
+ bail:
+ unlock(&lpc->lock);
+ return rc;
+}
+
+static int64_t __lpc_read_sanity(enum OpalLPCAddressType addr_type,
+ uint32_t addr, uint32_t *data, uint32_t sz,
+ bool probe)
+{
+ struct proc_chip *chip;
+
+ if (lpc_default_chip_id < 0)
+ return OPAL_PARAMETER;
+ chip = get_chip(lpc_default_chip_id);
+ if (!chip || !chip->lpc)
+ return OPAL_PARAMETER;
+ return __lpc_read(chip->lpc, addr_type, addr, data, sz, probe);
+}
+
+int64_t lpc_read(enum OpalLPCAddressType addr_type, uint32_t addr,
+ uint32_t *data, uint32_t sz)
+{
+ return __lpc_read_sanity(addr_type, addr, data, sz, false);
+}
+
+int64_t lpc_probe_read(enum OpalLPCAddressType addr_type, uint32_t addr,
+ uint32_t *data, uint32_t sz)
+{
+ return __lpc_read_sanity(addr_type, addr, data, sz, true);
+}
+
+/*
+ * The "OPAL" variant adds the emulation of 2 and 4 byte accesses using
+ * byte accesses for IO and MEM space in order to be compatible with
+ * existing Linux expectations
+ */
+static int64_t opal_lpc_read(uint32_t chip_id, enum OpalLPCAddressType addr_type,
+ uint32_t addr, __be32 *data, uint32_t sz)
+{
+ struct proc_chip *chip;
+ int64_t rc;
+ uint32_t tmp;
+
+ chip = get_chip(chip_id);
+ if (!chip || !chip->lpc)
+ return OPAL_PARAMETER;
+
+ if (addr_type == OPAL_LPC_FW) {
+ rc = __lpc_read(chip->lpc, addr_type, addr, &tmp, sz, false);
+ if (rc)
+ return rc;
+
+ } else {
+ tmp = 0;
+ while (sz--) {
+ uint32_t byte;
+
+ rc = __lpc_read(chip->lpc, addr_type, addr, &byte, 1, false);
+ if (rc)
+ return rc;
+ tmp = tmp | (byte << (8 * sz));
+ addr++;
+ }
+ }
+
+ *data = cpu_to_be32(tmp);
+
+ return OPAL_SUCCESS;
+}
+
+bool lpc_present(void)
+{
+ return lpc_default_chip_id >= 0;
+}
+
+/* Called with LPC lock held */
+static void lpc_setup_serirq(struct lpcm *lpc)
+{
+ struct lpc_client_entry *ent;
+ uint32_t mask = LPC_HC_IRQ_BASE_IRQS;
+ int rc;
+
+ if (!lpc_irqs_ready)
+ return;
+
+ /* Collect serirq enable bits */
+ list_for_each(&lpc->clients, ent, node)
+ mask |= ent->clt->interrupts & LPC_HC_IRQ_SERIRQ_ALL;
+
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, mask, 4);
+ if (rc) {
+ prerror("Failed to update irq mask\n");
+ return;
+ }
+ DBG_IRQ("IRQ mask set to 0x%08x\n", mask);
+
+ /* Enable the LPC interrupt in the OPB Master */
+ opb_write(lpc, opb_master_reg_base + OPB_MASTER_LS_IRQ_POL, 0, 4);
+ rc = opb_write(lpc, opb_master_reg_base + OPB_MASTER_LS_IRQ_MASK,
+ OPB_MASTER_IRQ_LPC, 4);
+ if (rc)
+ prerror("Failed to enable IRQs in OPB\n");
+
+ /* Check whether we should enable serirq */
+ if (mask & LPC_HC_IRQ_SERIRQ_ALL) {
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSER_CTRL,
+ LPC_HC_IRQSER_EN |
+ LPC_HC_IRQSER_START_4CLK |
+ /*
+ * New mode bit for P9N DD2.0 (ignored otherwise)
+ * when set we no longer have to manually clear
+ * the SerIRQs on EOI.
+ */
+ LPC_HC_IRQSER_AUTO_CLEAR, 4);
+ DBG_IRQ("SerIRQ enabled\n");
+ } else {
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSER_CTRL,
+ 0, 4);
+ DBG_IRQ("SerIRQ disabled\n");
+ }
+ if (rc)
+ prerror("Failed to configure SerIRQ\n");
+ {
+ u32 val;
+ rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, &val, 4);
+ if (rc)
+ prerror("Failed to read back mask\n");
+ else
+ DBG_IRQ("MASK READBACK=%x\n", val);
+
+ rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_IRQSER_CTRL,
+ &val, 4);
+ if (rc)
+ prerror("Failed to read back ctrl\n");
+ else
+ DBG_IRQ("CTRL READBACK=%x\n", val);
+ }
+}
+
+static void lpc_route_serirq(struct lpcm *lpc, uint32_t sirq,
+ uint32_t psi_idx)
+{
+ uint32_t reg, shift, val, psi_old;
+ int64_t rc;
+
+ psi_old = lpc->sirq_routes[sirq];
+ lpc->sirq_rmasks[psi_old] &= ~(LPC_HC_IRQ_SERIRQ0 >> sirq);
+ lpc->sirq_rmasks[psi_idx] |= (LPC_HC_IRQ_SERIRQ0 >> sirq);
+ lpc->sirq_routes[sirq] = psi_idx;
+ lpc->sirq_routed[sirq] = true;
+
+ /* We may not be ready yet ... */
+ if (!lpc->has_serirq)
+ return;
+
+ if (sirq < 14) {
+ reg = 0xc;
+ shift = 4 + (sirq << 1);
+ } else {
+ reg = 0x8;
+ shift = 8 + ((sirq - 14) << 1);
+ }
+ shift = 30-shift;
+ rc = opb_read(lpc, opb_master_reg_base + reg, &val, 4);
+ if (rc)
+ return;
+ val = val & ~(3 << shift);
+ val |= (psi_idx & 3) << shift;
+ opb_write(lpc, opb_master_reg_base + reg, val, 4);
+}
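+
+/*
+ * Worked example of the register/shift math above (illustrative):
+ * SerIRQ 5 lives in OPB master register 0xc with shift = 30 - (4 + 10)
+ * = 16, so its 2-bit PSI route occupies bits 16-17; SerIRQ 15 lives in
+ * register 0x8 with shift = 30 - (8 + 2) = 20, i.e. bits 20-21.
+ */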
+
+static void lpc_alloc_route(struct lpcm *lpc, unsigned int irq,
+ unsigned int policy)
+{
+ unsigned int i, r, c;
+ int route = -1;
+
+ if (policy == IRQ_ATTR_TARGET_OPAL)
+ r = LPC_ROUTE_OPAL;
+ else
+ r = LPC_ROUTE_LINUX;
+
+ prlog(PR_DEBUG, "Routing irq %d, policy: %d (r=%d)\n",
+ irq, policy, r);
+
+ /* Are we already routed ? */
+ if (lpc->sirq_routed[irq] &&
+ r != lpc->sirq_ralloc[lpc->sirq_routes[irq]]) {
+ prerror("irq %d has conflicting policies\n", irq);
+ return;
+ }
+
+ /* First try to find a free route. Leave one for another
+ * policy though
+ */
+ for (i = 0, c = 0; i < 4; i++) {
+ /* Count routes with identical policy */
+ if (lpc->sirq_ralloc[i] == r)
+ c++;
+
+ /* Use the route if it's free and there are no more
+ * than 3 existing routes with that policy
+ */
+ if (lpc->sirq_ralloc[i] == LPC_ROUTE_FREE && c < 4) {
+ lpc->sirq_ralloc[i] = r;
+ route = i;
+ break;
+ }
+ }
+
+ /* If we couldn't get a free one, try to find an existing one
+ * with a matching policy
+ */
+ for (i = 0; route < 0 && i < 4; i++) {
+ if (lpc->sirq_ralloc[i] == r)
+ route = i;
+ }
+
+ /* Still no route ? bail. That should never happen */
+ if (route < 0) {
+ prerror("Can't find a route for irq %d\n", irq);
+ return;
+ }
+
+ /* Program route */
+ lpc_route_serirq(lpc, irq, route);
+
+ prlog(PR_DEBUG, "SerIRQ %d using route %d targeted at %s\n",
+ irq, route, r == LPC_ROUTE_LINUX ? "OS" : "OPAL");
+}
+
+unsigned int lpc_get_irq_policy(uint32_t chip_id, uint32_t psi_idx)
+{
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c || !c->lpc)
+ return IRQ_ATTR_TARGET_LINUX;
+
+ if (c->lpc->sirq_ralloc[psi_idx] == LPC_ROUTE_LINUX)
+ return IRQ_ATTR_TARGET_LINUX;
+ else
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TYPE_LSI;
+}
+
+static void lpc_create_int_map(struct lpcm *lpc, struct dt_node *psi_node)
+{
+ __be32 map[LPC_NUM_SERIRQ * 5], *pmap;
+ uint32_t i;
+
+ if (!psi_node)
+ return;
+ pmap = map;
+ for (i = 0; i < LPC_NUM_SERIRQ; i++) {
+ if (!lpc->sirq_routed[i])
+ continue;
+ *(pmap++) = 0;
+ *(pmap++) = 0;
+ *(pmap++) = cpu_to_be32(i);
+ *(pmap++) = cpu_to_be32(psi_node->phandle);
+ *(pmap++) = cpu_to_be32(lpc->sirq_routes[i] + P9_PSI_IRQ_LPC_SIRQ0);
+ }
+ if (pmap == map)
+ return;
+ dt_add_property(lpc->node, "interrupt-map", map,
+ (pmap - map) * sizeof(uint32_t));
+ dt_add_property_cells(lpc->node, "interrupt-map-mask", 0, 0, 0xff);
+ dt_add_property_cells(lpc->node, "#interrupt-cells", 1);
+}
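+
+/*
+ * Illustrative shape of one interrupt-map entry built above: SerIRQ 4
+ * routed through PSI route 1 becomes the five cells
+ * <0 0 4 psi_phandle (P9_PSI_IRQ_LPC_SIRQ0 + 1)>, and the
+ * interrupt-map-mask of <0 0 0xff> means only the SerIRQ number is
+ * matched when the OS looks up the parent interrupt.
+ */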
+
+void lpc_finalize_interrupts(void)
+{
+ struct proc_chip *chip;
+
+ lpc_irqs_ready = true;
+
+ for_each_chip(chip) {
+ if (chip->lpc && chip->psi &&
+ (chip->type == PROC_CHIP_P9_NIMBUS ||
+ chip->type == PROC_CHIP_P9_CUMULUS ||
+ chip->type == PROC_CHIP_P9P ||
+ chip->type == PROC_CHIP_P10))
+ lpc_create_int_map(chip->lpc, chip->psi->node);
+ }
+}
+
+static void lpc_init_interrupts_one(struct proc_chip *chip)
+{
+ struct lpcm *lpc = chip->lpc;
+ int i, rc;
+
+ lock(&lpc->lock);
+
+ /* First mask them all */
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, 0, 4);
+ if (rc) {
+ prerror("Failed to init interrupts\n");
+ goto bail;
+ }
+
+ switch(chip->type) {
+ case PROC_CHIP_P8_MURANO:
+ case PROC_CHIP_P8_VENICE:
+ /* On Murano/Venice, there is no SerIRQ, so only enable error
+ * interrupts
+ */
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK,
+ LPC_HC_IRQ_BASE_IRQS, 4);
+ if (rc) {
+ prerror("Failed to set interrupt mask\n");
+ goto bail;
+ }
+ opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSER_CTRL, 0, 4);
+ break;
+ case PROC_CHIP_P8_NAPLES:
+ /* On Naples, we support LPC interrupts, enable them based
+ * on what clients request. This will set up the mask and
+ * enable processing
+ */
+ lpc->has_serirq = true;
+ lpc_setup_serirq(lpc);
+ break;
+ case PROC_CHIP_P9_NIMBUS:
+ case PROC_CHIP_P9_CUMULUS:
+ case PROC_CHIP_P9P:
+ case PROC_CHIP_P10:
+ /* On P9, we additionally setup the routing. */
+ lpc->has_serirq = true;
+ for (i = 0; i < LPC_NUM_SERIRQ; i++) {
+ if (lpc->sirq_routed[i])
+ lpc_route_serirq(lpc, i, lpc->sirq_routes[i]);
+ }
+ lpc_setup_serirq(lpc);
+ break;
+ default:
+ ;
+ }
+ bail:
+ unlock(&lpc->lock);
+}
+
+void lpc_init_interrupts(void)
+{
+ struct proc_chip *chip;
+
+ lpc_irqs_ready = true;
+
+ for_each_chip(chip) {
+ if (chip->lpc)
+ lpc_init_interrupts_one(chip);
+ }
+}
+
+static void lpc_dispatch_reset(struct lpcm *lpc)
+{
+ struct lpc_client_entry *ent;
+
+ /* XXX We are going to hit this repeatedly while reset is
+ * asserted which might be sub-optimal. We should instead
+ * detect assertion and start a poller that will wait for
+ * de-assertion. We could notify clients of LPC being
+ * on/off rather than just reset
+ */
+
+ prerror("Got LPC reset on chip 0x%x !\n", lpc->chip_id);
+
+ /* Collect serirq enable bits */
+ list_for_each(&lpc->clients, ent, node) {
+ if (!ent->clt->reset)
+ continue;
+ unlock(&lpc->lock);
+ ent->clt->reset(lpc->chip_id);
+ lock(&lpc->lock);
+ }
+
+ /* Reconfigure serial interrupts */
+ if (lpc->has_serirq)
+ lpc_setup_serirq(lpc);
+}
+
+static void lpc_dispatch_err_irqs(struct lpcm *lpc, uint32_t irqs)
+{
+ const struct lpc_error_entry *err;
+ static int lpc_bus_err_count;
+ struct opal_err_info *info;
+ uint32_t addr;
+ int64_t idx;
+ int rc;
+
+ /* Write back to clear error interrupts; we clear SerIRQ later
+ * as they are handled as level interrupts
+ */
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT,
+ LPC_HC_IRQ_BASE_IRQS, 4);
+ if (rc)
+ prerror("Failed to clear IRQ error latches !\n");
+
+ if (irqs & LPC_HC_IRQ_LRESET) {
+ lpc_dispatch_reset(lpc);
+ return;
+ }
+
+ /* Ensure we can perform a valid lookup in the error table */
+ idx = LPC_ERROR_IDX(irqs);
+ if (idx < 0 || idx >= ARRAY_SIZE(lpc_error_table)) {
+ prerror("LPC bus error translation failed with status 0x%x\n",
+ irqs);
+ return;
+ }
+
+ /* Find and report the error */
+ err = &lpc_error_table[idx];
+ lpc_bus_err_count++;
+ if (manufacturing_mode && (lpc_bus_err_count > LPC_BUS_DEGRADED_PERF_THRESHOLD))
+ info = &e_info(OPAL_RC_LPC_SYNC_PERF);
+ else
+ info = &e_info(OPAL_RC_LPC_SYNC);
+
+ rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_ERROR_ADDRESS, &addr, 4);
+ if (rc)
+ log_simple_error(info, "LPC[%03x]: %s "
+ "Error reading error address register\n",
+ lpc->chip_id, err->description);
+ else
+ log_simple_error(info, "LPC[%03x]: %s Error address reg: "
+ "0x%08x\n",
+ lpc->chip_id, err->description, addr);
+}
+
+static void lpc_dispatch_ser_irqs(struct lpcm *lpc, uint32_t irqs,
+ bool clear_latch)
+{
+ struct lpc_client_entry *ent;
+ uint32_t cirqs;
+ int rc;
+
+ irqs &= LPC_HC_IRQ_SERIRQ_ALL;
+
+ /* Collect serirq enable bits */
+ list_for_each(&lpc->clients, ent, node) {
+ if (!ent->clt->interrupt)
+ continue;
+ cirqs = ent->clt->interrupts & irqs;
+ if (cirqs) {
+ unlock(&lpc->lock);
+ ent->clt->interrupt(lpc->chip_id, cirqs);
+ lock(&lpc->lock);
+ }
+ }
+
+ /* Our SerIRQs are level sensitive; we clear the latch after
+ * we call the handler.
+ */
+ if (!clear_latch)
+ return;
+
+ rc = opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT, irqs, 4);
+ if (rc)
+ prerror("Failed to clear SerIRQ latches !\n");
+}
+
+void lpc_interrupt(uint32_t chip_id)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct lpcm *lpc;
+ uint32_t irqs, opb_irqs;
+ int rc;
+
+ /* No initialized LPC controller on that chip */
+ if (!chip || !chip->lpc)
+ return;
+ lpc = chip->lpc;
+
+ lock(&lpc->lock);
+
+ /* Grab OPB Master LS interrupt status */
+ rc = opb_read(lpc, opb_master_reg_base + OPB_MASTER_LS_IRQ_STAT,
+ &opb_irqs, 4);
+ if (rc) {
+ prerror("Failed to read OPB IRQ state\n");
+ unlock(&lpc->lock);
+ return;
+ }
+
+ DBG_IRQ("OPB IRQ on chip 0x%x, oirqs=0x%08x\n", chip_id, opb_irqs);
+
+ /* Check if it's an LPC interrupt */
+ if (!(opb_irqs & OPB_MASTER_IRQ_LPC)) {
+ /* Something we don't support ? Ack it anyway... */
+ goto bail;
+ }
+
+ /* Handle the lpc interrupt source (errors etc...) */
+ rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT, &irqs, 4);
+ if (rc) {
+ prerror("Failed to read LPC IRQ state\n");
+ goto bail;
+ }
+
+ DBG_IRQ("LPC IRQ on chip 0x%x, irqs=0x%08x\n", chip_id, irqs);
+
+ /* Handle error interrupts */
+ if (irqs & LPC_HC_IRQ_BASE_IRQS)
+ lpc_dispatch_err_irqs(lpc, irqs);
+
+ /* Handle SerIRQ interrupts */
+ if (irqs & LPC_HC_IRQ_SERIRQ_ALL)
+ lpc_dispatch_ser_irqs(lpc, irqs, true);
+ bail:
+ /* Ack it at the OPB level */
+ opb_write(lpc, opb_master_reg_base + OPB_MASTER_LS_IRQ_STAT,
+ opb_irqs, 4);
+ unlock(&lpc->lock);
+}
+
+void lpc_serirq(uint32_t chip_id, uint32_t index)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct lpcm *lpc;
+ uint32_t irqs, rmask;
+ int rc;
+
+ /* No initialized LPC controller on that chip */
+ if (!chip || !chip->lpc)
+ return;
+ lpc = chip->lpc;
+
+ lock(&lpc->lock);
+
+ /* Handle the lpc interrupt source (errors etc...) */
+ rc = opb_read(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT, &irqs, 4);
+ if (rc) {
+ prerror("Failed to read LPC IRQ state\n");
+ goto bail;
+ }
+ rmask = lpc->sirq_rmasks[index];
+
+ DBG_IRQ("IRQ on chip 0x%x, irqs=0x%08x rmask=0x%08x\n",
+ chip_id, irqs, rmask);
+ irqs &= rmask;
+
+ /*
+ * Handle SerIRQ interrupts. Don't clear the latch,
+ * it will be done in our special EOI callback if
+ * necessary on DD1
+ */
+ if (irqs)
+ lpc_dispatch_ser_irqs(lpc, irqs, false);
+
+ bail:
+ unlock(&lpc->lock);
+}
+
+void lpc_all_interrupts(uint32_t chip_id)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct lpcm *lpc;
+
+ /* No initialized LPC controller on that chip */
+ if (!chip || !chip->lpc)
+ return;
+ lpc = chip->lpc;
+
+ /* Dispatch all */
+ lock(&lpc->lock);
+ lpc_dispatch_ser_irqs(lpc, LPC_HC_IRQ_SERIRQ_ALL, false);
+ unlock(&lpc->lock);
+}
+
+static void lpc_init_chip_p8(struct dt_node *xn)
+{
+ uint32_t gcid = dt_get_chip_id(xn);
+ struct proc_chip *chip;
+ struct lpcm *lpc;
+
+ chip = get_chip(gcid);
+ assert(chip);
+
+ lpc = zalloc(sizeof(struct lpcm));
+ assert(lpc);
+ lpc->chip_id = gcid;
+ lpc->xbase = dt_get_address(xn, 0, NULL);
+ lpc->fw_idsel = 0xff;
+ lpc->fw_rdsz = 0xff;
+ lpc->node = xn;
+ list_head_init(&lpc->clients);
+ init_lock(&lpc->lock);
+
+ if (lpc_default_chip_id < 0 ||
+ dt_has_node_property(xn, "primary", NULL)) {
+ lpc_default_chip_id = gcid;
+ }
+
+ /* Mask all interrupts for now */
+ opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, 0, 4);
+
+ printf("LPC[%03x]: Initialized, access via XSCOM @0x%x\n",
+ gcid, lpc->xbase);
+
+ dt_add_property(xn, "interrupt-controller", NULL, 0);
+ dt_add_property_cells(xn, "#interrupt-cells", 1);
+ assert(dt_prop_get_u32(xn, "#address-cells") == 2);
+
+ chip->lpc = lpc;
+}
+
+static void lpc_init_chip_p9(struct dt_node *opb_node)
+{
+ uint32_t gcid = dt_get_chip_id(opb_node);
+ struct dt_node *lpc_node;
+ struct proc_chip *chip;
+ struct lpcm *lpc;
+ u64 addr;
+ u32 val;
+
+ chip = get_chip(gcid);
+ assert(chip);
+
+ /* Grab OPB base address */
+ addr = dt_prop_get_cell(opb_node, "ranges", 1);
+ addr <<= 32;
+ addr |= dt_prop_get_cell(opb_node, "ranges", 2);
+
+ /* Find the "lpc" child node */
+ lpc_node = dt_find_compatible_node(opb_node, NULL, "ibm,power9-lpc");
+ if (!lpc_node)
+ return;
+
+ lpc = zalloc(sizeof(struct lpcm));
+ assert(lpc);
+ lpc->chip_id = gcid;
+ lpc->mbase = (void *)addr;
+ lpc->fw_idsel = 0xff;
+ lpc->fw_rdsz = 0xff;
+ lpc->node = lpc_node;
+ list_head_init(&lpc->clients);
+ init_lock(&lpc->lock);
+
+ if (lpc_default_chip_id < 0 ||
+ dt_has_node_property(opb_node, "primary", NULL)) {
+ lpc_default_chip_id = gcid;
+ }
+
+ /* Mask all interrupts for now */
+ opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQMASK, 0, 4);
+
+ /* Clear any stale LPC bus errors */
+ opb_write(lpc, lpc_reg_opb_base + LPC_HC_IRQSTAT,
+ LPC_HC_IRQ_BASE_IRQS, 4);
+
+ /* Default the routing to PSI SerIRQ 0; this will be updated
+ * later when interrupts are initialized.
+ */
+ opb_read(lpc, opb_master_reg_base + 8, &val, 4);
+ val &= 0xff03ffff;
+ opb_write(lpc, opb_master_reg_base + 8, val, 4);
+ opb_read(lpc, opb_master_reg_base + 0xc, &val, 4);
+ val &= 0xf0000000;
+ opb_write(lpc, opb_master_reg_base + 0xc, val, 4);
+
+ prlog(PR_INFO, "LPC[%03x]: Initialized\n", gcid);
+ prlog(PR_DEBUG, "access via MMIO @%p\n", lpc->mbase);
+
+ chip->lpc = lpc;
+}
+
+void lpc_init(void)
+{
+ struct dt_node *xn;
+ bool has_lpc = false;
+
+ /* Look for P9 first as the DT is compatible for both P8 and P9 */
+ dt_for_each_compatible(dt_root, xn, "ibm,power9-lpcm-opb") {
+ lpc_init_chip_p9(xn);
+ has_lpc = true;
+ }
+
+ if (!has_lpc) {
+ dt_for_each_compatible(dt_root, xn, "ibm,power8-lpc") {
+ lpc_init_chip_p8(xn);
+ has_lpc = true;
+ }
+ }
+ if (lpc_default_chip_id >= 0)
+ prlog(PR_DEBUG, "Default bus on chip 0x%x\n",
+ lpc_default_chip_id);
+
+ if (has_lpc) {
+ opal_register(OPAL_LPC_WRITE, opal_lpc_write, 5);
+ opal_register(OPAL_LPC_READ, opal_lpc_read, 5);
+ }
+}
+
+void lpc_used_by_console(void)
+{
+ struct proc_chip *chip;
+
+ xscom_used_by_console();
+
+ for_each_chip(chip) {
+ struct lpcm *lpc = chip->lpc;
+ if (lpc) {
+ lpc->lock.in_con_path = true;
+ lock(&lpc->lock);
+ unlock(&lpc->lock);
+ }
+ }
+}
+
+bool lpc_ok(void)
+{
+ struct proc_chip *chip;
+
+ if (lpc_default_chip_id < 0)
+ return false;
+ if (!xscom_ok())
+ return false;
+ chip = get_chip(lpc_default_chip_id);
+ if (!chip->lpc)
+ return false;
+ return !lock_held_by_me(&chip->lpc->lock);
+}
+
+void lpc_register_client(uint32_t chip_id,
+ const struct lpc_client *clt,
+ uint32_t policy)
+{
+ struct lpc_client_entry *ent;
+ struct proc_chip *chip;
+ struct lpcm *lpc;
+ bool has_routes;
+
+ chip = get_chip(chip_id);
+ assert(chip);
+ lpc = chip->lpc;
+ if (!lpc) {
+ prerror("Attempt to register client on bad chip 0x%x\n",
+ chip_id);
+ return;
+ }
+
+ has_routes =
+ chip->type == PROC_CHIP_P9_NIMBUS ||
+ chip->type == PROC_CHIP_P9_CUMULUS ||
+ chip->type == PROC_CHIP_P9P ||
+ chip->type == PROC_CHIP_P10;
+
+ if (policy != IRQ_ATTR_TARGET_OPAL && !has_routes) {
+ prerror("Chip doesn't support OS interrupt policy\n");
+ return;
+ }
+
+ ent = malloc(sizeof(*ent));
+ assert(ent);
+ ent->clt = clt;
+ ent->policy = policy;
+ lock(&lpc->lock);
+ list_add(&lpc->clients, &ent->node);
+
+ if (has_routes) {
+ unsigned int i;
+ for (i = 0; i < LPC_NUM_SERIRQ; i++)
+ if (clt->interrupts & LPC_IRQ(i))
+ lpc_alloc_route(lpc, i, policy);
+ }
+
+ if (lpc->has_serirq)
+ lpc_setup_serirq(lpc);
+ unlock(&lpc->lock);
+}
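+
+/*
+ * Usage sketch (illustrative): the UART code earlier in this series
+ * registers a client with interrupts = LPC_IRQ(lpc_irq) and a policy
+ * of IRQ_ATTR_TARGET_OPAL or IRQ_ATTR_TARGET_LINUX; on chips with
+ * SerIRQ routing this goes through lpc_alloc_route() above so the
+ * interrupt is steered to OPAL or the OS via one of the four PSI
+ * routes.
+ */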
diff --git a/roms/skiboot/hw/npu-hw-procedures.c b/roms/skiboot/hw/npu-hw-procedures.c
new file mode 100644
index 000000000..91bbb0f15
--- /dev/null
+++ b/roms/skiboot/hw/npu-hw-procedures.c
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NPU (NVLink1, POWER8NVL) Hardware Procedures
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci.h>
+#include <pci-virt.h>
+#include <interrupts.h>
+#include <npu-regs.h>
+#include <npu.h>
+#include <xscom.h>
+
+typedef uint32_t (*step)(struct npu_dev *);
+
+struct procedure {
+ const char *name;
+ step steps[];
+};
+
+#define DEFINE_PROCEDURE(NAME, STEPS...) \
+ static struct procedure procedure_##NAME = \
+ {.name = #NAME, .steps = {NAME, ##STEPS}}
+
+#define PROCEDURE_INPROGRESS (1 << 31)
+#define PROCEDURE_COMPLETE (1 << 30)
+#define PROCEDURE_NEXT (1 << 29)
+#define PROCEDURE_FAILED 2
+#define PROCEDURE_ABORTED 3
+#define PROCEDURE_UNSUPPORTED 4
+
+/* Mask defining which status bits we want to expose */
+#define PROCEDURE_STATUS_MASK 0xc000000f
+
+/* Accessors for PHY registers. These can go either via MMIO or SCOM. */
+static bool pl_use_scom = true;
+static void phy_write(struct npu_dev *npu_dev, uint64_t addr, uint32_t val)
+{
+ if (pl_use_scom)
+ xscom_write(npu_dev->npu->chip_id, npu_dev->pl_xscom_base | addr, val);
+ else
+ out_be16((void *) npu_dev->pl_base + PL_MMIO_ADDR(addr), val);
+}
+
+static uint16_t phy_read(struct npu_dev *npu_dev, uint64_t addr)
+{
+ uint64_t val;
+
+ if (pl_use_scom)
+ xscom_read(npu_dev->npu->chip_id, npu_dev->pl_xscom_base + addr, &val);
+ else
+ val = in_be16((void *) npu_dev->pl_base + PL_MMIO_ADDR(addr));
+
+ return val & 0xffff;
+}
+
+/* The DL registers can be accessed indirectly via the NTL */
+static void dl_write(struct npu_dev *npu_dev, uint32_t addr, uint32_t val)
+{
+ xscom_write(npu_dev->npu->chip_id,
+ npu_dev->xscom + NX_DL_REG_ADDR, addr);
+ xscom_write(npu_dev->npu->chip_id,
+ npu_dev->xscom + NX_DL_REG_DATA, val);
+}
+
+static uint64_t __unused dl_read(struct npu_dev *npu_dev, uint32_t addr)
+{
+ uint64_t val;
+
+ xscom_write(npu_dev->npu->chip_id,
+ npu_dev->xscom + NX_DL_REG_ADDR, addr);
+ xscom_read(npu_dev->npu->chip_id,
+ npu_dev->xscom + NX_DL_REG_DATA, &val);
+ return val;
+}
+
+/* Our hardware bits are backwards here. The lane vectors are 16-bit
+ * values represented in IBM bit ordering. This means lane 0 is
+ * represented by bit 15 in most of the registers. Internally we keep
+ * this sane (ie. npu_dev->lane_mask[0] == lane 0) as we need sane
+ * numbering for set_lane_reg() anyway. */
+static uint32_t phy_lane_mask(struct npu_dev *npu_dev)
+{
+ /* We only train 8 lanes at a time so we don't do a full
+ * bit-swap */
+ assert(npu_dev->lane_mask == 0xff00 || npu_dev->lane_mask == 0xff);
+
+ return ~npu_dev->lane_mask & 0xffff;
+}
+
+static void set_lane_reg(struct npu_dev *npu_dev, uint64_t base_reg,
+ uint64_t data, uint64_t mask)
+{
+ uint64_t val, i;
+ uint32_t lane_mask = npu_dev->lane_mask;
+
+ for (i = 0; i <= 23; i++) {
+ if (lane_mask & (1ul << i)) {
+ uint64_t tx_rxcal_reg = base_reg + (i << 32);
+ val = phy_read(npu_dev, tx_rxcal_reg);
+ val = (val & ~mask) | data;
+ phy_write(npu_dev, tx_rxcal_reg, val);
+ }
+ }
+}
+
+static uint32_t stop(struct npu_dev *npu_dev __unused)
+{
+ return PROCEDURE_COMPLETE | PROCEDURE_ABORTED;
+}
+DEFINE_PROCEDURE(stop);
+
+static uint32_t nop(struct npu_dev *npu_dev __unused)
+{
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(nop);
+
+/* Procedure 1.2.1 (RESET_NPU_DL) from opt_programmerguide.odt. Also
+ * incorporates AT reset. */
+static uint32_t reset_npu_dl(struct npu_dev *npu_dev)
+{
+ uint64_t val;
+
+ /* Assert NPU reset */
+ xscom_read(npu_dev->npu->chip_id, npu_dev->xscom + NX_NTL_CONTROL, &val);
+ val |= NTL_CONTROL_RESET;
+ xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_NTL_CONTROL, val);
+
+ /* Put the Nvidia logic in reset */
+ dl_write(npu_dev, NDL_CONTROL, 0xe8000000);
+
+ /* Release Nvidia logic from reset */
+ dl_write(npu_dev, NDL_CONTROL, 0);
+
+ /* Release NPU from reset */
+ val &= ~NTL_CONTROL_RESET;
+ xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_NTL_CONTROL, val);
+
+ /* Set up TL credits */
+ xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_CMD_CR, PPC_BIT(0));
+ xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_CMD_D_CR, PPC_BIT(0));
+ xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_RSP_CR, PPC_BIT(15));
+ xscom_write(npu_dev->npu->chip_id, npu_dev->xscom + NX_TL_RSP_D_CR, PPC_BIT(15));
+
+ /* Reset error registers. TODO: are there more we should clear here? */
+ npu_ioda_sel(npu_dev->npu, NPU_IODA_TBL_PESTB, 0, true);
+ for (val = 0; val < NPU_NUM_OF_PES; val++)
+ out_be64(npu_dev->npu->at_regs + NPU_IODA_DATA0, 0);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(reset_npu_dl);
+
+/* Procedures 1.2.3 (reset_lanes) & 1.2.4
+ * (io_register_write_reset_values) */
+static uint32_t phy_reset(struct npu_dev *npu_dev)
+{
+ uint16_t val;
+
+ /* Lower run_lane inputs for lanes to be reset */
+ val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+ val &= ~phy_lane_mask(npu_dev);
+ phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_reset_wait(struct npu_dev *npu_dev)
+{
+ uint16_t val;
+
+ /* Wait for lane busy outputs to go to zero for lanes to be
+ * reset */
+ val = phy_read(npu_dev, RX_LANE_BUSY_VEC_0_15);
+ if (val & phy_lane_mask(npu_dev))
+ return PROCEDURE_INPROGRESS;
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_reset_complete(struct npu_dev *npu_dev)
+{
+ uint16_t val;
+ uint32_t lane_mask = phy_lane_mask(npu_dev);
+
+ /* Set ioreset_vec for the desired lanes bit positions */
+ val = phy_read(npu_dev, RX_IORESET_VEC_0_15);
+ phy_write(npu_dev, RX_IORESET_VEC_0_15, val | lane_mask);
+
+ val = phy_read(npu_dev, TX_IORESET_VEC_0_15);
+ phy_write(npu_dev, TX_IORESET_VEC_0_15, val | lane_mask);
+
+ /* Clear ioreset_vec */
+ val = phy_read(npu_dev, RX_IORESET_VEC_0_15);
+ phy_write(npu_dev, RX_IORESET_VEC_0_15, val & ~lane_mask);
+
+ val = phy_read(npu_dev, TX_IORESET_VEC_0_15);
+ phy_write(npu_dev, TX_IORESET_VEC_0_15, val & ~lane_mask);
+
+ /* Reset RX phase rotators */
+ set_lane_reg(npu_dev, RX_PR_CNTL_PL, RX_PR_RESET, RX_PR_RESET);
+ set_lane_reg(npu_dev, RX_PR_CNTL_PL, 0, RX_PR_RESET);
+
+ /* Restore registers from scominit that may have changed */
+ set_lane_reg(npu_dev, RX_PR_MODE, 0x8, RX_PR_PHASE_STEP);
+ set_lane_reg(npu_dev, RX_A_DAC_CNTL,
+ 0x7 << MASK_TO_LSH(RX_PR_IQ_RES_SEL),
+ RX_PR_IQ_RES_SEL);
+ set_lane_reg(npu_dev, TX_MODE1_PL, 0, TX_LANE_PDWN);
+ set_lane_reg(npu_dev, RX_BANK_CONTROLS, 0, RX_LANE_ANA_PDWN);
+ set_lane_reg(npu_dev, RX_MODE, 0, RX_LANE_DIG_PDWN);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete);
+
+/* Round a fixed-point number. frac is the number of fractional
+ * bits */
+static uint32_t round(uint32_t val, int frac)
+{
+ if (val >> (frac - 1) & 0x1)
+ return (val >> frac) + 1;
+ else
+ return val >> frac;
+}
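+
+/*
+ * Example (illustrative): round(768, 9) tests bit 8 of 768
+ * (0b11_0000_0000), finds it set and returns (768 >> 9) + 1 = 2,
+ * i.e. 1.5 in this fixed-point format rounds up to 2.
+ */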
+
+#define ZCAL_MIN (10 << 3)
+#define ZCAL_MAX (40 << 3)
+#define ZCAL_K0 0x0
+#define ZCAL_M 128
+/* TODO: add a test case for the following values:
+
+ Initial values:
+ zcal_n = 0xda;
+ zcal_p = 0xc7;
+
+ Results:
+ pre_p = 0x0
+ pre_n = 0x0
+ margin_p = 0x0
+ margin_n = 0x0
+ total_en_p = 0x32
+ total_en_n = 0x37
+ */
+
+static uint32_t phy_tx_zcal(struct npu_dev *npu_dev)
+{
+ uint64_t val;
+
+ if (npu_dev->index < 2 && npu_dev->npu->tx_zcal_complete[0])
+ return PROCEDURE_COMPLETE;
+
+ if (npu_dev->index >= 2 && npu_dev->npu->tx_zcal_complete[1])
+ return PROCEDURE_COMPLETE;
+
+ /* Start calibration */
+ val = phy_read(npu_dev, TX_IMPCAL_SWO1_PB);
+ val &= TX_ZCAL_SWO_EN;
+ phy_write(npu_dev, TX_IMPCAL_SWO1_PB, val);
+ phy_write(npu_dev, TX_IMPCAL_SWO2_PB, 0x50 << 2);
+ val = phy_read(npu_dev, TX_IMPCAL_PB);
+ val |= TX_ZCAL_REQ;
+ phy_write(npu_dev, TX_IMPCAL_PB, val);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_tx_zcal_wait(struct npu_dev *npu_dev)
+{
+ uint64_t val;
+
+ val = phy_read(npu_dev, TX_IMPCAL_PB);
+ if (!(val & TX_ZCAL_DONE))
+ return PROCEDURE_INPROGRESS;
+
+ if (val & TX_ZCAL_ERROR)
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_tx_zcal_calculate(struct npu_dev *npu_dev)
+{
+ uint64_t val;
+ uint64_t zcal_n;
+ uint64_t zcal_p;
+ uint64_t margin_n;
+ uint64_t margin_p;
+ uint64_t pre_n;
+ uint64_t pre_p;
+ uint64_t total_en_n;
+ uint64_t total_en_p;
+
+ val = phy_read(npu_dev, TX_IMPCAL_NVAL_PB);
+ zcal_n = GETFIELD(TX_ZCAL_N, val);
+ val = phy_read(npu_dev, TX_IMPCAL_PVAL_PB);
+ zcal_p = GETFIELD(TX_ZCAL_P, val);
+
+ if ((zcal_n < ZCAL_MIN) || (zcal_n > ZCAL_MAX) ||
+ (zcal_p < ZCAL_MIN) || (zcal_p > ZCAL_MAX))
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ margin_n = (0x80 - ZCAL_M) * zcal_n / 2;
+ margin_p = (0x80 - ZCAL_M) * zcal_p / 2;
+ pre_n = (((0x80 * zcal_n) - (2 * margin_n)) * ZCAL_K0) / 0x80;
+ pre_p = (((0x80 * zcal_p) - (2 * margin_p)) * ZCAL_K0) / 0x80;
+
+ total_en_n = 0x80 * zcal_n - (2 * margin_n) - (pre_n & 1023);
+ total_en_p = 0x80 * zcal_p - (2 * margin_p) - (pre_p & 1023);
+
+ pre_p = round(pre_p, 9);
+ pre_n = round(pre_n, 9);
+ margin_p = round(margin_p, 9);
+ margin_n = round(margin_n, 9);
+ total_en_p = round(total_en_p, 9);
+ total_en_n = round(total_en_n, 9);
+
+ val = SETFIELD(TX_FFE_TOTAL_ENABLE_N_ENC, 0, total_en_n);
+ val = SETFIELD(TX_FFE_TOTAL_ENABLE_P_ENC, val, total_en_p);
+ phy_write(npu_dev, TX_FFE_TOTAL_2RSTEP_EN, val);
+
+ val = SETFIELD(TX_FFE_PRE_N_SEL_ENC, 0, pre_n);
+ val = SETFIELD(TX_FFE_PRE_P_SEL_ENC, val, pre_p);
+ phy_write(npu_dev, TX_FFE_PRE_2RSTEP_SEL, val);
+
+ val = SETFIELD(TX_FFE_MARGIN_PD_N_SEL_ENC, 0, margin_n);
+ val = SETFIELD(TX_FFE_MARGIN_PU_P_SEL_ENC, val, margin_p);
+ phy_write(npu_dev, TX_FFE_MARGIN_2RSTEP_SEL, val);
+
+ if (npu_dev->index < 2)
+ npu_dev->npu->tx_zcal_complete[0] = true;
+ else
+ npu_dev->npu->tx_zcal_complete[1] = true;
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate);
+
+static uint32_t phy_enable_tx_rxcal(struct npu_dev *npu_dev)
+{
+ /* Turn common mode on */
+ set_lane_reg(npu_dev, TX_MODE2_PL, TX_RXCAL, TX_RXCAL);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_enable_tx_rxcal);
+
+static uint32_t phy_disable_tx_rxcal(struct npu_dev *npu_dev)
+{
+ /* Turn common mode off */
+ set_lane_reg(npu_dev, TX_MODE2_PL, 0, TX_RXCAL);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_disable_tx_rxcal);
+
+static uint32_t phy_rx_dccal(struct npu_dev *npu_dev)
+{
+ if (phy_read(npu_dev, RX_LANE_BUSY_VEC_0_15)
+ & ~phy_read(npu_dev, RX_INIT_DONE_VEC_0_15))
+ return PROCEDURE_INPROGRESS;
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_dccal_start(struct npu_dev *npu_dev)
+{
+ uint64_t val;
+
+ /* Save EO step control */
+ val = phy_read(npu_dev, RX_EO_STEP_CNTL_PG);
+ npu_dev->procedure_data = val;
+
+ phy_write(npu_dev, RX_EO_STEP_CNTL_PG,
+ RX_EO_ENABLE_LATCH_OFFSET_CAL
+ | RX_EO_ENABLE_CM_COARSE_CAL);
+
+ val = phy_read(npu_dev, RX_RECAL_ABORT_VEC_0_15);
+ val |= phy_lane_mask(npu_dev);
+ phy_write(npu_dev, RX_RECAL_ABORT_VEC_0_15, val);
+
+ val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+ val |= phy_lane_mask(npu_dev);
+ phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_dccal_complete(struct npu_dev *npu_dev)
+{
+ /* Poll for completion on relevant lanes */
+ if ((phy_read(npu_dev, RX_INIT_DONE_VEC_0_15) & phy_lane_mask(npu_dev))
+ != phy_lane_mask(npu_dev))
+ return PROCEDURE_INPROGRESS;
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_dccal_fifo_init(struct npu_dev *npu_dev)
+{
+ uint64_t val;
+
+ val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+ val &= ~phy_lane_mask(npu_dev);
+ phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
+
+ /* Turn off recal abort */
+ val = phy_read(npu_dev, RX_RECAL_ABORT_VEC_0_15);
+ val &= ~phy_lane_mask(npu_dev);
+ phy_write(npu_dev, RX_RECAL_ABORT_VEC_0_15, val);
+
+ /* Restore original settings */
+ phy_write(npu_dev, RX_EO_STEP_CNTL_PG, npu_dev->procedure_data);
+
+ /* FIFO Init */
+ set_lane_reg(npu_dev, TX_MODE2_PL, 0, TX_UNLOAD_CLK_DISABLE);
+ set_lane_reg(npu_dev, TX_CNTL_STAT2, TX_FIFO_INIT, TX_FIFO_INIT);
+ set_lane_reg(npu_dev, TX_MODE2_PL, TX_UNLOAD_CLK_DISABLE,
+ TX_UNLOAD_CLK_DISABLE);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_start, phy_rx_dccal_complete,
+ phy_rx_dccal_fifo_init);
+
+static uint32_t phy_rx_training(struct npu_dev *npu_dev)
+{
+ uint16_t val;
+
+ if (!npu_dev->procedure_data) {
+ val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+ val |= phy_lane_mask(npu_dev);
+ phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
+ }
+
+ npu_dev->procedure_data++;
+ if (npu_dev->procedure_data >= 1000000)
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+ if ((val & phy_lane_mask(npu_dev)) != phy_lane_mask(npu_dev))
+ return PROCEDURE_INPROGRESS;
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_rx_training);
+
+static struct procedure *npu_procedures[] = {
+ &procedure_stop,
+ &procedure_nop,
+ NULL,
+ NULL,
+ &procedure_phy_reset,
+ &procedure_phy_tx_zcal,
+ &procedure_phy_rx_dccal,
+ &procedure_phy_enable_tx_rxcal,
+ &procedure_phy_disable_tx_rxcal,
+ &procedure_phy_rx_training,
+ &procedure_reset_npu_dl,
+
+ /* Placeholders for pre-terminate and terminate procedures */
+ &procedure_nop,
+ &procedure_nop};
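+
+/*
+ * Driver-visible protocol, sketched from the handlers below: the host
+ * writes a procedure index to offset 4 of the vendor-specific config
+ * space area to (re)start it, then polls offset 0 until
+ * PROCEDURE_COMPLETE is set, checking the low status bits for
+ * PROCEDURE_FAILED, PROCEDURE_ABORTED or PROCEDURE_UNSUPPORTED.
+ */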
+
+/* Run a procedure step(s) and return status */
+static uint32_t get_procedure_status(struct npu_dev *dev)
+{
+ uint32_t result;
+ uint16_t procedure = dev->procedure_number;
+ uint16_t step = dev->procedure_step;
+ const char *name = npu_procedures[procedure]->name;
+
+ do {
+ result = npu_procedures[procedure]->steps[step](dev);
+
+ if (result & PROCEDURE_NEXT) {
+ step++;
+ NPUDEVINF(dev, "Running procedure %s step %d\n", name, step);
+ }
+ } while (result & PROCEDURE_NEXT);
+
+ dev->procedure_step = step;
+
+ if (result & PROCEDURE_COMPLETE)
+ NPUDEVINF(dev, "Procedure %s complete\n", name);
+ else if (mftb() > dev->procedure_tb + msecs_to_tb(100)) {
+ NPUDEVINF(dev, "Procedure %s timed out\n", name);
+ result = PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+ }
+
+ /* Mask off internal state bits */
+ dev->procedure_status = result & PROCEDURE_STATUS_MASK;
+
+ return dev->procedure_status;
+}
+
+static int64_t npu_dev_procedure_read(struct npu_dev *dev, uint32_t offset,
+ uint32_t size, uint32_t *data)
+{
+ int64_t rc = OPAL_SUCCESS;
+
+ if (size != 4) {
+ /* Short config reads are not supported */
+ prlog(PR_ERR, "NPU%d: Short read of procedure register\n", dev->npu->phb.opal_id);
+ return OPAL_PARAMETER;
+ }
+
+ *data = 0;
+
+ switch (offset) {
+ case 0:
+ /* Only run the procedure if not already complete */
+ if (dev->procedure_status & PROCEDURE_COMPLETE)
+ *data = dev->procedure_status;
+ else
+ *data = get_procedure_status(dev);
+
+ break;
+
+ case 4:
+ *data = dev->procedure_number;
+ break;
+
+ default:
+ prlog(PR_ERR, "NPU%d: Invalid vendor specific offset 0x%08x\n",
+ dev->npu->phb.opal_id, offset);
+ rc = OPAL_PARAMETER;
+ }
+
+ return rc;
+}
+
+static int64_t npu_dev_procedure_write(struct npu_dev *dev, uint32_t offset,
+ uint32_t size, uint32_t data)
+{
+ const char *name;
+ int64_t rc = OPAL_SUCCESS;
+
+ if (size != 4) {
+ /* Short config writes are not supported */
+ prlog(PR_ERR, "NPU%d: Short write of procedure register\n",
+ dev->npu->phb.opal_id);
+ return OPAL_PARAMETER;
+ }
+
+ switch (offset) {
+ case 0:
+ /* We ignore writes to the status register */
+ NPUDEVINF(dev, "Ignoring writes to status register\n");
+ break;
+
+ case 4:
+ if (data >= ARRAY_SIZE(npu_procedures) ||
+ !npu_procedures[data]) {
+ NPUDEVINF(dev, "Unsupported procedure number %d\n", data);
+ dev->procedure_status = PROCEDURE_COMPLETE
+ | PROCEDURE_UNSUPPORTED;
+ break;
+ }
+
+ name = npu_procedures[data]->name;
+ if (dev->procedure_number == data
+ && !(dev->procedure_status & PROCEDURE_COMPLETE))
+ NPUDEVINF(dev, "Restarting procedure %s\n", name);
+ else
+ NPUDEVINF(dev, "Starting procedure %s\n", name);
+
+ dev->procedure_status = PROCEDURE_INPROGRESS;
+ dev->procedure_number = data;
+ dev->procedure_step = 0;
+ dev->procedure_data = 0;
+ dev->procedure_tb = mftb();
+ break;
+
+ default:
+ NPUDEVINF(dev, "Invalid vendor specific offset 0x%08x\n", offset);
+ rc = OPAL_PARAMETER;
+ }
+
+ return rc;
+}
+
+int64_t npu_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len, uint32_t *data,
+ bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu_dev *ndev = pvd->data;
+
+ if (write)
+ return npu_dev_procedure_write(ndev, offset - pcrf->start,
+ len, *data);
+
+ return npu_dev_procedure_read(ndev, offset - pcrf->start, len, data);
+}
+
+void npu_dev_procedure_reset(struct npu_dev *dev)
+{
+ dev->procedure_status = 0;
+ dev->procedure_number = 0;
+ dev->procedure_step = 0;
+ dev->procedure_data = 0;
+}
diff --git a/roms/skiboot/hw/npu-opal.c b/roms/skiboot/hw/npu-opal.c
new file mode 100644
index 000000000..412ea460e
--- /dev/null
+++ b/roms/skiboot/hw/npu-opal.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <pci.h>
+#include <phb4.h>
+#include <npu2.h>
+#include <npu3.h>
+
+static int64_t opal_npu_init_context(uint64_t phb_id, int pid __unused,
+ uint64_t msr, uint64_t bdf)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+
+ if (!phb)
+ return OPAL_PARAMETER;
+
+ if (phb->phb_type == phb_type_npu_v2)
+ return npu2_init_context(phb, msr, bdf);
+
+ if (phb->phb_type == phb_type_npu_v3)
+ return npu3_init_context(phb, msr, bdf);
+
+ return OPAL_PARAMETER;
+}
+opal_call(OPAL_NPU_INIT_CONTEXT, opal_npu_init_context, 4);
+
+static int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid __unused,
+ uint64_t bdf)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+
+ if (!phb)
+ return OPAL_PARAMETER;
+
+ if (phb->phb_type == phb_type_npu_v2)
+ return npu2_destroy_context(phb, bdf);
+
+ if (phb->phb_type == phb_type_npu_v3)
+ return npu3_destroy_context(phb, bdf);
+
+ return OPAL_PARAMETER;
+}
+opal_call(OPAL_NPU_DESTROY_CONTEXT, opal_npu_destroy_context, 3);
+
+static int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
+ uint64_t lpcr)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+
+ if (!phb)
+ return OPAL_PARAMETER;
+
+ if (phb->phb_type == phb_type_npu_v2)
+ return npu2_map_lpar(phb, bdf, lparid, lpcr);
+
+ if (phb->phb_type == phb_type_npu_v3)
+ return npu3_map_lpar(phb, bdf, lparid, lpcr);
+
+ return OPAL_PARAMETER;
+}
+opal_call(OPAL_NPU_MAP_LPAR, opal_npu_map_lpar, 4);
+
+static int npu_check_relaxed_ordering(struct phb *phb, struct pci_device *pd,
+ void *enable)
+{
+ /*
+	 * IBM PCIe bridge devices (i.e. the root ports) can always allow
+	 * relaxed ordering.
+ */
+ if (pd->vdid == 0x04c11014)
+ pd->allow_relaxed_ordering = true;
+
+ PCIDBG(phb, pd->bdfn, "Checking relaxed ordering config\n");
+ if (pd->allow_relaxed_ordering)
+ return 0;
+
+ PCIDBG(phb, pd->bdfn, "Relaxed ordering not allowed\n");
+ *(bool *)enable = false;
+
+ return 1;
+}
+
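+/*
+ * Apply the relaxed-ordering setting to every NPU PHB (NVLink2 or
+ * NVLink3) for the given chip/PEC, stopping at the first failure so
+ * the caller can roll the setting back.
+ */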
+static int64_t npu_set_relaxed_order(uint32_t gcid, int pec, bool enable)
+{
+ struct phb *phb;
+ int64_t rc;
+
+ for_each_phb(phb) {
+ if (phb->phb_type == phb_type_npu_v2)
+ rc = npu2_set_relaxed_order(phb, gcid, pec, enable);
+ else if (phb->phb_type == phb_type_npu_v3)
+ rc = npu3_set_relaxed_order(phb, gcid, pec, enable);
+ else
+ continue;
+
+ if (rc)
+ return rc;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_npu_set_relaxed_order(uint64_t phb_id, uint16_t bdfn,
+ bool request_enabled)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ struct phb4 *phb4;
+ uint32_t chip_id, pec;
+ struct pci_device *pd;
+ bool enable = true;
+
+ if (!phb || phb->phb_type != phb_type_pcie_v4)
+ return OPAL_PARAMETER;
+
+ phb4 = phb_to_phb4(phb);
+ pec = phb4->pec;
+ chip_id = phb4->chip_id;
+
+ if (chip_id & ~0x1b)
+ return OPAL_PARAMETER;
+
+ pd = pci_find_dev(phb, bdfn);
+ if (!pd)
+ return OPAL_PARAMETER;
+
+ /*
+ * Not changing state, so no need to rescan PHB devices to determine if
+ * we need to enable/disable it
+ */
+ if (pd->allow_relaxed_ordering == request_enabled)
+ return OPAL_SUCCESS;
+
+ pd->allow_relaxed_ordering = request_enabled;
+
+ /*
+ * Walk all devices on this PHB to ensure they all support relaxed
+ * ordering
+ */
+ pci_walk_dev(phb, NULL, npu_check_relaxed_ordering, &enable);
+
+ if (request_enabled && !enable) {
+ /*
+ * Not all devices on this PHB support relaxed-ordering
+ * mode so we can't enable it as requested
+ */
+ prlog(PR_INFO, "Cannot set relaxed ordering for PEC %d on chip %d\n",
+ pec, chip_id);
+ return OPAL_CONSTRAINED;
+ }
+
+ if (npu_set_relaxed_order(chip_id, pec, request_enabled)) {
+ npu_set_relaxed_order(chip_id, pec, false);
+ return OPAL_RESOURCE;
+ }
+
+ phb4->ro_state = request_enabled;
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_NPU_SET_RELAXED_ORDER, opal_npu_set_relaxed_order, 3);
+
+static int64_t opal_npu_get_relaxed_order(uint64_t phb_id,
+ uint16_t bdfn __unused)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ struct phb4 *phb4;
+
+ if (!phb || phb->phb_type != phb_type_pcie_v4)
+ return OPAL_PARAMETER;
+
+ phb4 = phb_to_phb4(phb);
+ return phb4->ro_state;
+}
+opal_call(OPAL_NPU_GET_RELAXED_ORDER, opal_npu_get_relaxed_order, 2);
diff --git a/roms/skiboot/hw/npu.c b/roms/skiboot/hw/npu.c
new file mode 100644
index 000000000..dba7ee50f
--- /dev/null
+++ b/roms/skiboot/hw/npu.c
@@ -0,0 +1,1693 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NVLink1, supported by the NPU (POWER8)
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <pci-virt.h>
+#include <pci-slot.h>
+#include <interrupts.h>
+#include <opal.h>
+#include <opal-api.h>
+#include <cpu.h>
+#include <device.h>
+#include <ccan/str/str.h>
+#include <ccan/array_size/array_size.h>
+#include <ccan/build_assert/build_assert.h>
+#include <affinity.h>
+#include <npu-regs.h>
+#include <npu.h>
+#include <xscom.h>
+#include <string.h>
+
+/*
+ * Terminology:
+ *
+ * Brick - A group of either 8 TX or 8 RX lanes
+ * Link - A group of 8 TX and 8 RX lanes
+ *
+ * Each link is represented in system software as an emulated PCI
+ * device. Garrison has two chips each with 4 links, therefore there
+ * are 8 emulated PCI devices in total.
+ *
+ * +----------------------------------------------------------------+
+ * | PBCQ3 (SCOM Base Address 0x2012c00) |
+ * | PHB3 (SCOM Base Address 0x9012c00) |
+ * +----------------------------------------------------------------+
+ * |||||||| ||||||||
+ * |||||||| ||||||||
+ * |||||||| ||||||||
+ * |||||||| ||||||||
+ * +----------------------------------------------------------------+
+ * | PCIe x8 |
+ * +----------------------------------------------------------------+
+ * | GPU0 |
+ * +--------------------------------+-------------------------------+
+ * | NV Link 1 | NV Link 0 |
+ * +---------------+----------------+---------------+---------------+
+ * | RX | TX | RX | TX |
+ * +---------------+----------------+---------------+---------------+
+ * |||||||| |||||||| |||||||| ||||||||
+ * |||||||| |||||||| |||||||| ||||||||
+ * |||||||| |||||||| |||||||| ||||||||
+ * |||||||| |||||||| |||||||| ||||||||
+ * +---------------+----------------+---------------+---------------+
+ * | TX | RX | TX | RX |
+ * +---------------+----------------+---------------+---------------+
+ * | Lanes [0:7] PHY 0 Lanes [8:15] |
+ * | SCOM Base Address 0x8000080008010c3f |
+ * +--------------------------------+-------------------------------+
+ * | Link 0 NDL/NTL | Link 1 NTL/NDL |
+ * | SCOM Base Address 0x8013c00 | SCOM Base Address 0x8013c40 |
+ * +--------------------------------+-------------------------------+
+ * | |
+ * | Address Translation/AT (shared for all links) |
+ * | SCOM Base Address 0x8013d80 |
+ * | |
+ * +--------------------------------+-------------------------------+
+ * | Link 3 NDL/NTL | Link 4 NTL/NDL |
+ * | SCOM Base Address 0x8013d00 | SCOM Base Address 0x8013d40 |
+ * +--------------------------------+-------------------------------+
+ * | Lanes [8:15] PHY 1 Lanes [0:7] |
+ * | SCOM Base Address 0x8000080008010c7f |
+ * +---------------+----------------+---------------+---------------+
+ * | TX | RX | TX | RX |
+ * +---------------+----------------+---------------+---------------+
+ * |||||||| |||||||| |||||||| ||||||||
+ * |||||||| |||||||| |||||||| ||||||||
+ * |||||||| |||||||| |||||||| ||||||||
+ * |||||||| |||||||| |||||||| ||||||||
+ * +---------------+----------------+---------------+---------------+
+ * | RX | TX | RX | TX |
+ * +---------------+----------------+---------------+---------------+
+ * | NV Link 2 | NV Link 3 |
+ * +--------------------------------+-------------------------------+
+ * | GPU1 |
+ * +----------------------------------------------------------------+
+ * | PCIe x8 |
+ * +----------------------------------------------------------------+
+ * |||||||| ||||||||
+ * |||||||| ||||||||
+ * |||||||| ||||||||
+ * |||||||| ||||||||
+ * +----------------------------------------------------------------+
+ * | PHB2 (SCOM Base Address 0x9012800) |
+ * | PBCQ2 (SCOM Base Address 0x2012800) |
+ * +----------------------------------------------------------------+
+ *
+ */
+
+static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev,
+ uint16_t id);
+
+#define OPAL_NPU_VERSION 0x02
+
+#define PCIE_CAP_START 0x40
+#define PCIE_CAP_END 0x80
+#define VENDOR_CAP_START 0x80
+#define VENDOR_CAP_END 0x90
+
+#define VENDOR_CAP_PCI_DEV_OFFSET 0x0d
+
+/* Returns the scom base for the given link index */
+static uint64_t npu_link_scom_base(struct dt_node *dn, uint32_t scom_base,
+ int index)
+{
+ struct dt_node *link;
+ uint32_t link_index;
+ char namebuf[32];
+
+ snprintf(namebuf, sizeof(namebuf), "link@%x", index);
+ link = dt_find_by_name(dn, namebuf);
+ assert(link);
+ link_index = dt_prop_get_u32(link, "ibm,npu-link-index");
+ return scom_base + (link_index * NPU_LINK_SIZE);
+}
+
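+/*
+ * Decode the size encoded in a link BAR register: the size field gives
+ * a power-of-two multiple of 64KB. For example, an encoded value of 5
+ * yields (1 << 5) * 64KB = 2MB, the PL BAR size in the layout described
+ * in assign_mmio_bars() below.
+ */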
+static uint64_t get_bar_size(uint64_t bar)
+{
+ return (1 << GETFIELD(NX_MMIO_BAR_SIZE, bar)) * 0x10000;
+}
+
+/* Propagate the device BAR settings to the link BAR in hardware */
+static void npu_dev_bar_update(uint32_t gcid, struct npu_dev_bar *bar,
+ bool enable)
+{
+ uint64_t val;
+
+ if (!bar->xscom)
+ return;
+
+ val = bar->base;
+ val = SETFIELD(NX_MMIO_BAR_SIZE, val, ilog2(bar->size / 0x10000));
+ if (enable)
+ val |= NX_MMIO_BAR_ENABLE;
+ xscom_write(gcid, bar->xscom, val);
+}
+
+/* Trap for PCI command (0x4) to enable or disable device's BARs */
+static int64_t npu_dev_cfg_write_cmd(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu_dev *ndev = pvd->data;
+ bool enable;
+
+ if (!write)
+ return OPAL_PARTIAL;
+
+ if (offset != PCI_CFG_CMD)
+ return OPAL_PARAMETER;
+ if (size != 1 && size != 2 && size != 4)
+ return OPAL_PARAMETER;
+
+	/* Update the device BAR; the link BAR will be synchronized
+	 * with hardware automatically.
+ */
+ enable = !!(*data & PCI_CFG_CMD_MEM_EN);
+ npu_dev_bar_update(ndev->npu->chip_id, &ndev->bar, enable);
+
+ /* Normal path to update PCI config buffer */
+ return OPAL_PARTIAL;
+}
+
+/*
+ * Trap for memory BARs: 0xFFs are written to the BAR register
+ * prior to reading back its size.
+ */
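+/*
+ * This mirrors the usual PCI sizing sequence: the OS writes 0xffffffff
+ * to the BAR and then reads it back. The write handler below latches
+ * the value to return in bar->bar_sz; the read handler hands it back
+ * once and then reverts to the normal config path.
+ */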
+static int64_t npu_dev_cfg_bar_read(struct npu_dev *dev __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size,
+ uint32_t *data)
+{
+ struct npu_dev_bar *bar = (struct npu_dev_bar *)(pcrf->data);
+
+ /* Revert to normal path if we weren't trapped for BAR size */
+ if (!bar->trapped)
+ return OPAL_PARTIAL;
+
+ if (offset != pcrf->start &&
+ offset != pcrf->start + 4)
+ return OPAL_PARAMETER;
+ if (size != 4)
+ return OPAL_PARAMETER;
+
+ bar->trapped = false;
+ *data = bar->bar_sz;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_dev_cfg_bar_write(struct npu_dev *dev,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size,
+ uint32_t data)
+{
+ struct pci_virt_device *pvd = dev->pvd;
+ struct npu_dev_bar *bar = (struct npu_dev_bar *)(pcrf->data);
+ uint32_t pci_cmd;
+
+ if (offset != pcrf->start &&
+ offset != pcrf->start + 4)
+ return OPAL_PARAMETER;
+ if (size != 4)
+ return OPAL_PARAMETER;
+
+ /* Return BAR size on next read */
+ if (data == 0xffffffff) {
+ bar->trapped = true;
+ if (offset == pcrf->start)
+ bar->bar_sz = (bar->size & 0xffffffff);
+ else
+ bar->bar_sz = (bar->size >> 32);
+
+ return OPAL_SUCCESS;
+ }
+
+ /* Update BAR base address */
+ if (offset == pcrf->start) {
+ bar->base &= 0xffffffff00000000UL;
+ bar->base |= (data & 0xfffffff0);
+ } else {
+ bar->base &= 0x00000000ffffffffUL;
+ bar->base |= ((uint64_t)data << 32);
+
+ PCI_VIRT_CFG_NORMAL_RD(pvd, PCI_CFG_CMD, 4, &pci_cmd);
+ npu_dev_bar_update(dev->npu->chip_id, bar,
+ !!(pci_cmd & PCI_CFG_CMD_MEM_EN));
+ }
+
+ /* We still depend on the normal path to update the
+ * cached config buffer.
+ */
+ return OPAL_PARAMETER;
+}
+
+static int64_t npu_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len, uint32_t *data,
+ bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu_dev *ndev = pvd->data;
+
+ if (write)
+ return npu_dev_cfg_bar_write(ndev, pcrf, offset, len, *data);
+
+ return npu_dev_cfg_bar_read(ndev, pcrf, offset, len, data);
+}
+
+static int64_t npu_dev_cfg_exp_devcap(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu_dev *ndev = pvd->data;
+
+ assert(write);
+
+ if ((size != 2) || (offset & 1)) {
+ /* Short config writes are not supported */
+ prlog(PR_ERR, "NPU%d: Unsupported write to pcie control register\n",
+ ndev->phb->opal_id);
+ return OPAL_PARAMETER;
+ }
+
+ if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
+ npu_dev_procedure_reset(ndev);
+
+ return OPAL_PARTIAL;
+}
+
+static struct npu_dev *bdfn_to_npu_dev(struct npu *p, uint32_t bdfn)
+{
+ struct pci_virt_device *pvd;
+
+ /* Sanity check */
+ if (bdfn & ~0xff)
+ return NULL;
+
+ pvd = pci_virt_find_device(&p->phb, bdfn);
+ if (pvd)
+ return pvd->data;
+
+ return NULL;
+}
+
+#define NPU_CFG_READ(size, type) \
+static int64_t npu_cfg_read##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type *data) \
+{ \
+ uint32_t val; \
+ int64_t ret; \
+ \
+ ret = pci_virt_cfg_read(phb, bdfn, offset, sizeof(*data), &val); \
+ *data = (type)val; \
+ return ret; \
+}
+#define NPU_CFG_WRITE(size, type) \
+static int64_t npu_cfg_write##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type data) \
+{ \
+ uint32_t val = data; \
+ \
+ return pci_virt_cfg_write(phb, bdfn, offset, sizeof(data), val); \
+}
+
+NPU_CFG_READ(8, u8);
+NPU_CFG_READ(16, u16);
+NPU_CFG_READ(32, u32);
+NPU_CFG_WRITE(8, u8);
+NPU_CFG_WRITE(16, u16);
+NPU_CFG_WRITE(32, u32);
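+
+/*
+ * For reference, NPU_CFG_READ(8, u8) expands to roughly:
+ *
+ *   static int64_t npu_cfg_read8(struct phb *phb, uint32_t bdfn,
+ *                                uint32_t offset, u8 *data)
+ *   {
+ *       uint32_t val;
+ *       int64_t ret;
+ *
+ *       ret = pci_virt_cfg_read(phb, bdfn, offset, sizeof(*data), &val);
+ *       *data = (u8)val;
+ *       return ret;
+ *   }
+ *
+ * The six accessors generated here are wired into npu_ops below.
+ */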
+
+static int __npu_dev_bind_pci_dev(struct phb *phb __unused,
+ struct pci_device *pd,
+ void *data)
+{
+ struct npu_dev *dev = data;
+ struct dt_node *pci_dt_node;
+ char *pcislot;
+
+ /* Ignore non-nvidia PCI devices */
+ if ((pd->vdid & 0xffff) != 0x10de)
+ return 0;
+
+ /* Find the PCI device's slot location */
+ for (pci_dt_node = pd->dn;
+ pci_dt_node && !dt_find_property(pci_dt_node, "ibm,slot-label");
+ pci_dt_node = pci_dt_node->parent);
+
+ if (!pci_dt_node)
+ return 0;
+
+ pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,slot-label");
+
+ prlog(PR_DEBUG, "NPU: comparing GPU %s and NPU %s\n",
+ pcislot, dev->slot_label);
+
+ if (streq(pcislot, dev->slot_label))
+ return 1;
+
+ return 0;
+}
+
+static void npu_dev_bind_pci_dev(struct npu_dev *dev)
+{
+ struct phb *phb;
+ uint32_t i;
+
+ if (dev->pd)
+ return;
+
+ for (i = 0; i < 64; i++) {
+ if (dev->npu->phb.opal_id == i)
+ continue;
+
+ phb = pci_get_phb(i);
+ if (!phb)
+ continue;
+
+ dev->pd = pci_walk_dev(phb, NULL, __npu_dev_bind_pci_dev, dev);
+ if (dev->pd) {
+ dev->phb = phb;
+ /* Found the device, set the bit in config space */
+ PCI_VIRT_CFG_INIT_RO(dev->pvd, VENDOR_CAP_START +
+ VENDOR_CAP_PCI_DEV_OFFSET, 1, 0x01);
+ return;
+ }
+ }
+
+ prlog(PR_INFO, "%s: No PCI device for NPU device %04x:%02x:%02x.%x to bind to. If you expect a GPU to be there, this is a problem.\n",
+ __func__, dev->npu->phb.opal_id,
+ dev->pvd->bdfn >> 8 & 0xff,
+ dev->pvd->bdfn >> 3 & 0x1f,
+ dev->pvd->bdfn & 0x7);
+
+}
+
+static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED;
+
+/* Appends an NPU phandle to the given PCI device node ibm,npu
+ * property */
+static void npu_append_pci_phandle(struct dt_node *dn, u32 phandle)
+{
+ uint32_t *npu_phandles;
+ struct dt_property *pci_npu_phandle_prop;
+ size_t prop_len;
+
+ /* Use a lock to make sure no one else has a reference to an
+ * ibm,npu property (this assumes this is the only function
+ * that holds a reference to it). */
+ lock(&pci_npu_phandle_lock);
+
+ /* This function shouldn't be called unless ibm,npu exists */
+ pci_npu_phandle_prop = (struct dt_property *)
+ dt_require_property(dn, "ibm,npu", -1);
+
+ /* Need to append to the properties */
+ prop_len = pci_npu_phandle_prop->len;
+ prop_len += sizeof(*npu_phandles);
+ dt_resize_property(&pci_npu_phandle_prop, prop_len);
+
+ npu_phandles = (uint32_t *) pci_npu_phandle_prop->prop;
+ npu_phandles[prop_len/sizeof(*npu_phandles) - 1] = phandle;
+ unlock(&pci_npu_phandle_lock);
+}
+
+static int npu_dn_fixup(struct phb *phb,
+ struct pci_device *pd,
+ void *data __unused)
+{
+ struct npu *p = phb_to_npu(phb);
+ struct npu_dev *dev;
+
+ dev = bdfn_to_npu_dev(p, pd->bdfn);
+ assert(dev);
+
+ if (dev->phb || dev->pd)
+ return 0;
+
+ /* NPU devices require a slot location to associate with GPUs */
+ dev->slot_label = dt_prop_get(pd->dn, "ibm,slot-label");
+
+ /* Bind the emulated PCI device with the real one, which can't
+ * be done until the PCI devices are populated. Once the real
+	 * PCI device is identified, we also need to fix the device-tree
+ * for it
+ */
+ npu_dev_bind_pci_dev(dev);
+ if (dev->phb && dev->pd && dev->pd->dn) {
+ if (dt_find_property(dev->pd->dn, "ibm,npu"))
+ npu_append_pci_phandle(dev->pd->dn, pd->dn->phandle);
+ else
+ dt_add_property_cells(dev->pd->dn, "ibm,npu", pd->dn->phandle);
+
+ dt_add_property_cells(pd->dn, "ibm,gpu", dev->pd->dn->phandle);
+ }
+
+ return 0;
+}
+
+static void npu_phb_final_fixup(struct phb *phb)
+{
+ pci_walk_dev(phb, NULL, npu_dn_fixup, NULL);
+}
+
+static void npu_ioda_init(struct npu *p)
+{
+ uint64_t *data64;
+ uint32_t i;
+
+ /* LXIVT - Disable all LSIs */
+ for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) {
+ data64 = &p->lxive_cache[i];
+ *data64 = SETFIELD(NPU_IODA_LXIVT_PRIORITY, 0ul, 0xff);
+ *data64 = SETFIELD(NPU_IODA_LXIVT_SERVER, *data64, 0);
+ }
+
+ /* PCT - Reset to reserved PE# */
+ for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++) {
+ data64 = &p->pce_cache[i];
+ *data64 = SETFIELD(NPU_IODA_PCT_PE, 0ul, 0ul);
+ *data64 |= NPU_IODA_PCT_LINK_ENABLED;
+ }
+
+ /* Clear TVT */
+ memset(p->tve_cache, 0, sizeof(p->tve_cache));
+}
+
+static int64_t npu_ioda_reset(struct phb *phb, bool purge)
+{
+ struct npu *p = phb_to_npu(phb);
+ uint32_t i;
+
+ if (purge) {
+ NPUDBG(p, "Purging all IODA tables...\n");
+ npu_ioda_init(p);
+ }
+
+ /* LIST */
+ npu_ioda_sel(p, NPU_IODA_TBL_LIST, 0, true);
+ for (i = 0; i < 8; i++)
+ out_be64(p->at_regs + NPU_IODA_DATA0, 0x1);
+
+ /* LIXVT */
+ npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++)
+ out_be64(p->at_regs + NPU_IODA_DATA0, p->lxive_cache[i]);
+
+ /* PCT */
+ npu_ioda_sel(p, NPU_IODA_TBL_PCT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++)
+ out_be64(p->at_regs + NPU_IODA_DATA0, p->pce_cache[i]);
+
+ /* TVT */
+ npu_ioda_sel(p, NPU_IODA_TBL_TVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
+ out_be64(p->at_regs + NPU_IODA_DATA0, p->tve_cache[i]);
+
+ return OPAL_SUCCESS;
+}
+
+static int npu_isn_valid(struct npu *p, uint32_t isn)
+{
+ if (p->chip_id != p8_irq_to_chip(isn) || p->index != 0 ||
+ NPU_IRQ_NUM(isn) < NPU_LSI_IRQ_MIN ||
+ NPU_IRQ_NUM(isn) > NPU_LSI_IRQ_MAX) {
+ /**
+ * @fwts-label NPUisnInvalid
+ * @fwts-advice NVLink not functional
+ */
+ prlog(PR_ERR, "NPU%d: isn 0x%x not valid for this NPU\n",
+ p->phb.opal_id, isn);
+ return false;
+ }
+
+ return true;
+}
+
+static int64_t npu_lsi_get_xive(struct irq_source *is, uint32_t isn,
+ uint16_t *server, uint8_t *prio)
+{
+ struct npu *p = is->data;
+ uint32_t irq = NPU_IRQ_NUM(isn);
+ uint64_t lxive;
+
+ if (!npu_isn_valid(p, isn))
+ return OPAL_PARAMETER;
+
+ /* The content is fetched from the cache, which requires
+	 * that the cache be initialized with the default values
+	 * beforehand.
+ */
+ irq -= NPU_LSI_IRQ_MIN;
+ lxive = p->lxive_cache[irq];
+ *server = GETFIELD(NPU_IODA_LXIVT_SERVER, lxive);
+ *prio = GETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_lsi_set_xive(struct irq_source *is, uint32_t isn,
+ uint16_t server, uint8_t prio)
+{
+ struct npu *p = is->data;
+ uint32_t irq = NPU_IRQ_NUM(isn);
+ uint64_t lxive;
+
+ if (!npu_isn_valid(p, isn))
+ return OPAL_PARAMETER;
+
+ /* Figure out LXIVT entry */
+ lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, 0ul, server);
+ lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio);
+
+ /* Cache LXIVT entry */
+ irq -= NPU_LSI_IRQ_MIN;
+ p->lxive_cache[irq] = lxive;
+
+ /* Update to LXIVT entry */
+ npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, irq, false);
+ lxive = in_be64(p->at_regs + NPU_IODA_DATA0);
+ lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, lxive, server);
+ lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio);
+ out_be64(p->at_regs + NPU_IODA_DATA0, lxive);
+
+ return OPAL_SUCCESS;
+}
+
+static void npu_err_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct npu *p = is->data;
+ uint32_t irq = NPU_IRQ_NUM(isn);
+
+ if (!npu_isn_valid(p, isn))
+ return;
+
+	/* There are 4 LSIs used for error reporting: 4/5 for data
+	 * link errors, while 6/7 signal frozen PE detection.
+ */
+ irq -= NPU_LSI_IRQ_MIN;
+ switch (irq) {
+ case 4 ... 5:
+ prerror("Invalid NPU error interrupt received\n");
+ break;
+ case 6 ... 7:
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+ }
+}
+
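+/*
+ * LSI routing: the first four LSIs are the per-link interrupts handed
+ * to Linux, while LSIs 4-7 are the error/frozen-PE interrupts kept by
+ * skiboot (see npu_err_interrupt() above).
+ */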
+static uint64_t npu_lsi_attributes(struct irq_source *is, uint32_t isn)
+{
+ struct npu *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+
+ if (idx >= 4)
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_LSI;
+ return IRQ_ATTR_TARGET_LINUX;
+}
+
+/* Error LSIs (skiboot owned) */
+static const struct irq_source_ops npu_lsi_irq_ops = {
+ .get_xive = npu_lsi_get_xive,
+ .set_xive = npu_lsi_set_xive,
+ .attributes = npu_lsi_attributes,
+ .interrupt = npu_err_interrupt,
+};
+
+static void npu_register_irq(struct npu *p)
+{
+ register_irq_source(&npu_lsi_irq_ops, p, p->base_lsi, 8);
+}
+
+static void npu_hw_init(struct npu *p)
+{
+ /* 3 MMIO setup for AT */
+ out_be64(p->at_regs + NPU_LSI_SOURCE_ID,
+ SETFIELD(NPU_LSI_SRC_ID_BASE, 0ul, NPU_LSI_IRQ_MIN >> 4));
+ BUILD_ASSERT((NPU_LSI_IRQ_MIN & 0x07F0) == NPU_LSI_IRQ_MIN);
+ out_be64(p->at_regs + NPU_INTREP_TIMER, 0x0ul);
+ npu_ioda_reset(&p->phb, false);
+}
+
+static int64_t npu_map_pe_dma_window_real(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint64_t pci_start_addr,
+ uint64_t pci_mem_size)
+{
+ struct npu *p = phb_to_npu(phb);
+ uint64_t end;
+ uint64_t tve;
+
+ /* Sanity check. Each PE has one corresponding TVE */
+ if (pe_number >= NPU_NUM_OF_PES ||
+ window_id != pe_number)
+ return OPAL_PARAMETER;
+
+ if (pci_mem_size) {
+ /* Enable */
+
+ end = pci_start_addr + pci_mem_size;
+
+ /* We have to be 16M aligned */
+ if ((pci_start_addr & 0x00ffffff) ||
+ (pci_mem_size & 0x00ffffff))
+ return OPAL_PARAMETER;
+
+ /*
+ * It *looks* like this is the max we can support (we need
+		 * to verify this). Also we are not checking for rollover,
+		 * but then we aren't trying too hard to protect ourselves
+		 * against a completely broken OS.
+ */
+ if (end > 0x0003ffffffffffffull)
+ return OPAL_PARAMETER;
+
+ /*
+ * Put start address bits 49:24 into TVE[52:53]||[0:23]
+ * and end address bits 49:24 into TVE[54:55]||[24:47]
+ * and set TVE[51]
+ */
+ tve = (pci_start_addr << 16) & (0xffffffull << 48);
+ tve |= (pci_start_addr >> 38) & (3ull << 10);
+ tve |= (end >> 8) & (0xfffffful << 16);
+ tve |= (end >> 40) & (3ull << 8);
+ tve |= PPC_BIT(51);
+ } else {
+ /* Disable */
+ tve = 0;
+ }
+
+ npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false);
+ out_be64(p->at_regs + NPU_IODA_DATA0, tve);
+ p->tve_cache[window_id] = tve;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_map_pe_dma_window(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct npu *p = phb_to_npu(phb);
+ uint64_t tts_encoded;
+ uint64_t data64 = 0;
+
+ /* Sanity check. Each PE has one corresponding TVE */
+ if (pe_number >= NPU_NUM_OF_PES ||
+ window_id != pe_number)
+ return OPAL_PARAMETER;
+
+	/* Special case: a zero TCE table size is used to disable
+ * the TVE.
+ */
+ if (!tce_table_size) {
+ npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false);
+ out_be64(p->at_regs + NPU_IODA_DATA0, 0ul);
+ p->tve_cache[window_id] = 0ul;
+ return OPAL_SUCCESS;
+ }
+
+ /* Additional arguments validation */
+ if (tce_levels < 1 ||
+ tce_levels > 4 ||
+ !is_pow2(tce_table_size) ||
+ tce_table_size < 0x1000)
+ return OPAL_PARAMETER;
+
+ /* TCE table size */
+ data64 = SETFIELD(NPU_IODA_TVT_TTA, 0ul, tce_table_addr >> 12);
+ tts_encoded = ilog2(tce_table_size) - 11;
+ if (tts_encoded > 39)
+ return OPAL_PARAMETER;
+ data64 = SETFIELD(NPU_IODA_TVT_SIZE, data64, tts_encoded);
+
+ /* TCE page size */
+ switch (tce_page_size) {
+ case 0x10000: /* 64K */
+ data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 5);
+ break;
+ case 0x1000000: /* 16M */
+ data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 13);
+ break;
+ case 0x10000000: /* 256M */
+ data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 17);
+ break;
+ case 0x1000: /* 4K */
+ default:
+ data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 1);
+ }
+
+ /* Number of levels */
+ data64 = SETFIELD(NPU_IODA_TVT_LEVELS, data64, tce_levels - 1);
+
+ /* Update to hardware */
+ npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false);
+ out_be64(p->at_regs + NPU_IODA_DATA0, data64);
+ p->tve_cache[window_id] = data64;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_set_pe(struct phb *phb,
+ uint64_t pe_number,
+ uint64_t bdfn,
+ uint8_t bcompare,
+ uint8_t dcompare,
+ uint8_t fcompare,
+ uint8_t action)
+{
+ struct npu *p = phb_to_npu(phb);
+ struct npu_dev *dev;
+ uint32_t link_idx;
+ uint64_t *data64;
+
+ /* Sanity check */
+ if (action != OPAL_MAP_PE &&
+ action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+ if (pe_number >= NPU_NUM_OF_PES)
+ return OPAL_PARAMETER;
+
+	/* All emulated PCI devices are hooked to the root bus, whose
+ * bus number is zero.
+ */
+ dev = bdfn_to_npu_dev(p, bdfn);
+ if (PCI_BUS_NUM(bdfn) || !dev)
+ return OPAL_PARAMETER;
+
+ link_idx = dev->index;
+ dev->pe_number = pe_number;
+
+ /* Separate links will be mapped to different PEs */
+ if (bcompare != OpalPciBusAll ||
+ dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
+ fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ return OPAL_UNSUPPORTED;
+
+ /* Map the link to the corresponding PE */
+ data64 = &p->pce_cache[link_idx];
+ if (action == OPAL_MAP_PE)
+ *data64 = SETFIELD(NPU_IODA_PCT_PE, *data64,
+ pe_number);
+ else
+ *data64 = SETFIELD(NPU_IODA_PCT_PE, *data64,
+ NPU_NUM_OF_PES);
+
+ *data64 |= NPU_IODA_PCT_LINK_ENABLED;
+
+ npu_ioda_sel(p, NPU_IODA_TBL_PCT, link_idx, false);
+ out_be64(p->at_regs + NPU_IODA_DATA0, *data64);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_get_link_state(struct pci_slot *slot __unused, uint8_t *val)
+{
+ /* As we're emulating all PCI stuff, the link bandwidth
+	 * isn't a big deal anyway.
+ */
+ *val = OPAL_SHPC_LINK_UP_x1;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_get_power_state(struct pci_slot *slot __unused, uint8_t *val)
+{
+ *val = PCI_SLOT_POWER_ON;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_hreset(struct pci_slot *slot __unused)
+{
+ prlog(PR_DEBUG, "NPU: driver should call reset procedure here\n");
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_freset(struct pci_slot *slot __unused)
+{
+	/* FIXME: PHB fundamental reset, which needs to be
+ * figured out later. It's used by EEH recovery
+ * upon fenced AT.
+ */
+ return OPAL_SUCCESS;
+}
+
+static struct pci_slot *npu_slot_create(struct phb *phb)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(phb, NULL);
+ if (!slot)
+ return slot;
+
+ /* Elementary functions */
+ slot->ops.get_presence_state = NULL;
+ slot->ops.get_link_state = npu_get_link_state;
+ slot->ops.get_power_state = npu_get_power_state;
+ slot->ops.get_attention_state = NULL;
+ slot->ops.get_latch_state = NULL;
+ slot->ops.set_power_state = NULL;
+ slot->ops.set_attention_state = NULL;
+
+ slot->ops.prepare_link_change = NULL;
+ slot->ops.poll_link = NULL;
+ slot->ops.hreset = npu_hreset;
+ slot->ops.freset = npu_freset;
+ slot->ops.creset = NULL;
+
+ return slot;
+}
+
+static int64_t npu_freeze_status(struct phb *phb,
+ uint64_t pe_number __unused,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type __unused,
+ uint16_t *severity __unused)
+{
+	/* FIXME: When it's called by the skiboot PCI config accessor,
+	 * the PE number is fixed to 0, which is incorrect. We need to
+	 * introduce another PHB callback to translate it. For now,
+ * it keeps the skiboot PCI enumeration going.
+ */
+ struct npu *p = phb_to_npu(phb);
+ if (p->fenced)
+ *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
+ else
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct npu *p = phb_to_npu(phb);
+ int i;
+ uint64_t result = 0;
+ *first_frozen_pe = -1;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ if (p->fenced) {
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ return OPAL_SUCCESS;
+ }
+
+ npu_ioda_sel(p, NPU_IODA_TBL_PESTB, 0, true);
+ for (i = 0; i < NPU_NUM_OF_PES; i++) {
+ result = in_be64(p->at_regs + NPU_IODA_DATA0);
+ if (result > 0) {
+ *first_frozen_pe = i;
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+ break;
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
+
+/* For use in error injection and handling. */
+void npu_set_fence_state(struct npu *p, bool fence)
+{
+ p->fenced = fence;
+
+ if (fence)
+ prlog(PR_ERR, "NPU: Chip %x is fenced, reboot required.\n",
+ p->chip_id);
+ else
+		prlog(PR_WARNING, "NPU: un-fencing is dangerous and should "
+		      "only be used for development purposes.\n");
+}
+
+/* Sets the NPU to trigger an error when a DMA occurs */
+static int64_t npu_err_inject(struct phb *phb, uint64_t pe_number,
+ uint32_t type, uint32_t func __unused,
+ uint64_t addr __unused, uint64_t mask __unused)
+{
+ struct npu *p = phb_to_npu(phb);
+ struct npu_dev *dev = NULL;
+ int i;
+
+ if (pe_number >= NPU_NUM_OF_PES) {
+ prlog(PR_ERR, "NPU: error injection failed, bad PE given\n");
+ return OPAL_PARAMETER;
+ }
+
+ for (i = 0; i < p->total_devices; i++) {
+ if (p->devices[i].pe_number == pe_number) {
+ dev = &p->devices[i];
+ break;
+ }
+ }
+
+ if (!dev) {
+ prlog(PR_ERR, "NPU: couldn't find device with PE%llx\n", pe_number);
+ return OPAL_PARAMETER;
+ }
+
+ /* TODO: extend this to conform to OPAL injection standards */
+ if (type > 1) {
+ prlog(PR_ERR, "NPU: invalid error injection type\n");
+ return OPAL_PARAMETER;
+ } else if (type == 1) {
+ /* Emulate fence mode. */
+ npu_set_fence_state(p, true);
+ } else {
+ /* Cause a freeze with an invalid MMIO read. If the BAR is not
+ * enabled, this will checkstop the machine.
+ */
+ npu_dev_bar_update(p->chip_id, &dev->bar, true);
+ in_be64((void *)dev->bar.base);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static const struct phb_ops npu_ops = {
+ .cfg_read8 = npu_cfg_read8,
+ .cfg_read16 = npu_cfg_read16,
+ .cfg_read32 = npu_cfg_read32,
+ .cfg_write8 = npu_cfg_write8,
+ .cfg_write16 = npu_cfg_write16,
+ .cfg_write32 = npu_cfg_write32,
+ .get_reserved_pe_number = NULL,
+ .device_init = NULL,
+ .phb_final_fixup = npu_phb_final_fixup,
+ .ioda_reset = npu_ioda_reset,
+ .papr_errinjct_reset = NULL,
+ .pci_reinit = NULL,
+ .set_phb_mem_window = NULL,
+ .phb_mmio_enable = NULL,
+ .map_pe_mmio_window = NULL,
+ .map_pe_dma_window = npu_map_pe_dma_window,
+ .map_pe_dma_window_real = npu_map_pe_dma_window_real,
+ .pci_msi_eoi = NULL,
+ .set_xive_pe = NULL,
+ .get_msi_32 = NULL,
+ .get_msi_64 = NULL,
+ .set_pe = npu_set_pe,
+ .set_peltv = NULL,
+ .eeh_freeze_status = npu_freeze_status,
+ .eeh_freeze_clear = NULL,
+ .eeh_freeze_set = NULL,
+ .next_error = npu_eeh_next_error,
+ .err_inject = npu_err_inject,
+ .get_diag_data2 = NULL,
+ .set_capi_mode = NULL,
+ .set_capp_recovery = NULL,
+};
+
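+/*
+ * Carve the NPU MMIO region into per-link BARs. mm_win returns the
+ * range exposed to the kernel (the DLTL BARs) and at_bar returns the
+ * AT region, which skiboot keeps for itself.
+ */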
+static void assign_mmio_bars(uint32_t gcid, uint32_t xscom,
+ struct dt_node *npu_dn, uint64_t mm_win[2],
+ uint64_t at_bar[2])
+{
+ uint64_t mem_start, mem_end;
+ struct npu_dev_bar bar;
+ struct dt_node *link;
+
+ /* Configure BAR selection.
+ *
+ * Currently, each PHY contains 2 links and each link has 2
+ * BARs. The first BAR is assigned to the DLTL region which is
+	 * what the kernel uses. The second BAR is assigned to either
+	 * the PL or AT region, or left unassigned. The PL0/PL1/AT
+	 * MMIO regions are not exposed to the kernel, so we assign
+	 * them at the start of the available memory area followed by
+ * the DLTL regions. So we end up with the following memory
+ * map (assuming we're given a memory region starting at
+ * 0x3fff000000000):
+ *
+ * Link#0-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000420000
+ * Link#0-BAR#1: PL0 BAR ( 2MB) - 0x3fff000000000
+ * Link#1-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000440000
+ * Link#1-BAR#1: AT BAR ( 64KB) - 0x3fff000400000
+ * Link#2-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000460000
+ * Link#2-BAR#1: PL1 BAR ( 2MB) - 0x3fff000200000
+ * Link#3-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000480000
+ * Link#3-BAR#1: UNASSIGNED
+ */
+ xscom_write(gcid, xscom + NPU_AT_SCOM_OFFSET + NX_BAR,
+ 0x0211000043500000UL);
+
+ xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_0,
+ &mem_start);
+ mem_start = GETFIELD(NX_MMIO_BAR_BASE, mem_start) << 12;
+
+ xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 5) + NX_MMIO_BAR_0,
+ &mem_end);
+ mem_end = (GETFIELD(NX_MMIO_BAR_BASE, mem_end) << 12) +
+ get_bar_size(mem_end);
+
+ /* PL0 BAR comes first at 0x3fff000000000 */
+ bar.xscom = npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_1;
+ bar.base = mem_start;
+ bar.size = NX_MMIO_PL_SIZE;
+ npu_dev_bar_update(gcid, &bar, true);
+
+ /* PL1 BAR */
+ bar.xscom = npu_link_scom_base(npu_dn, xscom, 4) + NX_MMIO_BAR_1;
+ bar.base += bar.size;
+ bar.size = NX_MMIO_PL_SIZE;
+ npu_dev_bar_update(gcid, &bar, true);
+
+ /* Then the AT BAR */
+ bar.xscom = npu_link_scom_base(npu_dn, xscom, 1) + NX_MMIO_BAR_1;
+ bar.base += bar.size;
+ bar.size = NX_MMIO_AT_SIZE;
+ at_bar[0] = bar.base;
+ at_bar[1] = NX_MMIO_AT_SIZE;
+ npu_dev_bar_update(gcid, &bar, true);
+
+ /* Now we configure all the DLTL BARs. These are the ones
+ * actually exposed to the kernel. */
+ mm_win[0] = bar.base + bar.size;
+ dt_for_each_node(npu_dn, link) {
+ uint32_t index;
+
+ index = dt_prop_get_u32(link, "ibm,npu-link-index");
+ bar.xscom = npu_link_scom_base(npu_dn, xscom, index) +
+ NX_MMIO_BAR_0;
+ bar.base += bar.size;
+ bar.size = NX_MMIO_DL_SIZE;
+ bar.base = ALIGN_UP(bar.base, bar.size);
+ npu_dev_bar_update(gcid, &bar, false);
+ }
+ mm_win[1] = (bar.base + bar.size) - mm_win[0];
+
+	/* If we weren't given enough room to set up all the BARs we
+	 * require, it's better to crash here than to risk creating
+	 * overlapping BARs, which would xstop the machine randomly in
+	 * the future. */
+ assert(bar.base + bar.size <= mem_end);
+}
+
+/* Probe NPU device node and create PCI root device node
+ * accordingly. The NPU device node should specify the number
+ * of links and the xscom base address used to access them.
+ */
+static void npu_probe_phb(struct dt_node *dn)
+{
+ struct dt_node *np;
+ uint32_t gcid, index, phb_index, xscom;
+ uint64_t at_bar[2], mm_win[2];
+ uint32_t links;
+ char *path;
+
+ /* Retrieve chip id */
+ path = dt_get_path(dn);
+ gcid = dt_get_chip_id(dn);
+ index = dt_prop_get_u32(dn, "ibm,npu-index");
+ phb_index = dt_prop_get_u32(dn, "ibm,phb-index");
+ links = dt_prop_get_u32(dn, "ibm,npu-links");
+ prlog(PR_INFO, "Chip %d Found NPU%d (%d links) at %s\n",
+ gcid, index, links, path);
+ free(path);
+
+ /* Retrieve xscom base addr */
+ xscom = dt_get_address(dn, 0, NULL);
+ prlog(PR_INFO, " XSCOM Base: %08x\n", xscom);
+
+ assign_mmio_bars(gcid, xscom, dn, mm_win, at_bar);
+ prlog(PR_INFO, " AT BAR: %016llx (%lldKB)\n",
+ at_bar[0], at_bar[1] / 0x400);
+
+ /* Create PCI root device node */
+ np = dt_new_addr(dt_root, "pciex", at_bar[0]);
+ assert(np);
+
+ dt_add_property_strings(np, "compatible",
+ "ibm,power8-npu-pciex", "ibm,ioda2-npu-phb");
+ dt_add_property_strings(np, "device_type", "pciex");
+ dt_add_property(np, "reg", at_bar, sizeof(at_bar));
+
+ dt_add_property_cells(np, "ibm,phb-index", phb_index);
+ dt_add_property_cells(np, "ibm,npu-index", index);
+ dt_add_property_cells(np, "ibm,chip-id", gcid);
+ dt_add_property_cells(np, "ibm,xscom-base", xscom);
+ dt_add_property_cells(np, "ibm,npcq", dn->phandle);
+ dt_add_property_cells(np, "ibm,links", links);
+ dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win));
+ dt_add_property_cells(np, "ibm,phb-diag-data-size", 0);
+
+ /* Disable fast reboot - not currently supported */
+ disable_fast_reboot("NVLink device enabled");
+}
+
+static void npu_dev_populate_vendor_cap(struct npu_dev_cap *cap)
+{
+ struct npu_dev *dev = cap->dev;
+ struct pci_virt_device *pvd = dev->pvd;
+ uint32_t offset = cap->start;
+ uint8_t val;
+
+ /* Add length and version information */
+ val = cap->end - cap->start;
+ PCI_VIRT_CFG_INIT_RO(pvd, offset + 2, 1, val);
+ PCI_VIRT_CFG_INIT_RO(pvd, offset + 3, 1, OPAL_NPU_VERSION);
+ offset += 4;
+
+	/* Defaults when the trap can't handle the read/write (e.g. due
+ * to reading/writing less than 4 bytes). */
+ val = 0x0;
+ PCI_VIRT_CFG_INIT_RO(pvd, offset, 4, val);
+ PCI_VIRT_CFG_INIT_RO(pvd, offset + 4, 4, val);
+
+ /* Create a trap for AT/PL procedures */
+ pci_virt_add_filter(pvd, offset, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu_dev_procedure, NULL);
+ offset += 8;
+
+ PCI_VIRT_CFG_INIT_RO(pvd, offset, 1, dev->index);
+}
+
+static void npu_dev_populate_pcie_cap(struct npu_dev_cap *cap)
+{
+ struct npu_dev *dev = cap->dev;
+ struct pci_virt_device *pvd = dev->pvd;
+ uint32_t base = cap->start;
+ uint32_t val;
+
+ /* Sanity check on capability ID */
+ if (cap->id != PCI_CFG_CAP_ID_EXP) {
+ prlog(PR_NOTICE, "%s: Invalid capability ID %d (%d)\n",
+ __func__, cap->id, PCI_CFG_CAP_ID_EXP);
+ return;
+ }
+
+ /* Sanity check on spanned registers */
+ if ((cap->end - cap->start) < PCIE_CAP_START) {
+ prlog(PR_NOTICE, "%s: Invalid reg region [%x, %x] for cap %d\n",
+ __func__, cap->start, cap->end, cap->id);
+ return;
+ }
+
+ /* 0x00 - ID/PCIE capability */
+ val = cap->id;
+ val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20));
+ PCI_VIRT_CFG_INIT_RO(pvd, base, 4, val);
+
+ /* 0x04 - Device capability
+ *
+	 * We should support FLR. Otherwise, passing the device
+	 * through to userland via the Linux VFIO infrastructure
+	 * might be a problem.
+ */
+ val = ((PCIE_MPSS_128) |
+ (PCIE_PHANTOM_NONE << 3) |
+ (PCIE_L0SL_MAX_NO_LIMIT << 6) |
+ (PCIE_L1L_MAX_NO_LIMIT << 9) |
+ (PCICAP_EXP_DEVCAP_FUNC_RESET));
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_DEVCAP, 4, val);
+
+ pci_virt_add_filter(pvd, base + PCICAP_EXP_DEVCTL, 2,
+ PCI_REG_FLAG_WRITE,
+ npu_dev_cfg_exp_devcap, NULL);
+
+ /* 0x08 - Device control and status */
+ PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_DEVCTL, 4, 0x00002810,
+ 0xffff0000, 0x000f0000);
+
+ /* 0x0c - Link capability */
+ val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4));
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_LCAP, 4, val);
+
+ /* 0x10 - Link control and status */
+ PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_LCTL, 4, 0x00130000,
+ 0xfffff000, 0xc0000000);
+
+ /* 0x14 - Slot capability */
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
+
+ /* 0x18 - Slot control and status */
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
+
+ /* 0x1c - Root control and capability */
+ PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_RC, 4, 0x00000000,
+ 0xffffffe0, 0x00000000);
+
+ /* 0x20 - Root status */
+ PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_RSTAT, 4, 0x00000000,
+ 0xffffffff, 0x00010000);
+
+ /* 0x24 - Device capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCIECAP_EXP_DCAP2, 4, 0x00000000);
+
+ /* 0x28 - Device Control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_DCTL2, 4, 0x00070000,
+ 0xffff0000, 0x00000000);
+
+ /* 0x2c - Link capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_LCAP2, 4, 0x00000007);
+
+ /* 0x30 - Link control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_LCTL2, 4, 0x00000003,
+ 0xffff0000, 0x00200000);
+
+ /* 0x34 - Slot capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SCAP2, 4, 0x00000000);
+
+ /* 0x38 - Slot control and status 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SCTL2, 4, 0x00000000);
+}
+
+static struct npu_dev_cap *npu_dev_create_capability(struct npu_dev *dev,
+ void (*populate)(struct npu_dev_cap *),
+ uint16_t id,
+ uint16_t start,
+ uint16_t end)
+{
+ struct npu_dev_cap *cap;
+
+	/* Check if the capability already exists */
+ cap = npu_dev_find_capability(dev, id);
+ if (cap)
+ return cap;
+
+ /* Allocate new one */
+ cap = zalloc(sizeof(struct npu_dev_cap));
+ assert(cap);
+
+ /* Put it into the pool */
+ cap->id = id;
+ cap->start = start;
+ cap->end = end;
+ cap->dev = dev;
+ cap->populate = populate;
+ list_add_tail(&dev->capabilities, &cap->link);
+
+ return cap;
+}
+
+static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev,
+ uint16_t id)
+{
+ struct npu_dev_cap *cap;
+
+ list_for_each(&dev->capabilities, cap, link) {
+ if (cap->id == id)
+ return cap;
+ }
+
+ return NULL;
+}
+
+/*
+ * All capabilities should be put into the device capability
+ * list in ascending order of register offset, for easy
+ * access at a later point.
+ */
+static void npu_dev_create_capabilities(struct npu_dev *dev)
+{
+ list_head_init(&dev->capabilities);
+
+ /* PCI express capability */
+ npu_dev_create_capability(dev, npu_dev_populate_pcie_cap,
+ PCI_CFG_CAP_ID_EXP, PCIE_CAP_START,
+ PCIE_CAP_END);
+
+ /* Vendor specific capability */
+ npu_dev_create_capability(dev, npu_dev_populate_vendor_cap,
+ PCI_CFG_CAP_ID_VENDOR, VENDOR_CAP_START,
+ VENDOR_CAP_END);
+}
+
+static void npu_dev_create_cfg(struct npu_dev *dev)
+{
+ struct pci_virt_device *pvd = dev->pvd;
+ struct npu_dev_cap *cap;
+ uint32_t offset;
+ uint32_t last_cap_offset;
+
+ /* 0x00 - Vendor/Device ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
+
+ /* 0x04 - Command/Status
+ *
+ * Create one trap to trace toggling memory BAR enable bit
+ */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
+ 0xf9000000);
+
+ pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
+ npu_dev_cfg_write_cmd, NULL);
+
+ /* 0x08 - Rev/Class/Cache */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800100);
+
+ /* 0x0c - CLS/Latency Timer/Header/BIST */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
+
+ /* 0x10 - BARs, always 64-bits non-prefetchable
+ *
+ * Each emulated device represents one link and therefore
+ * there is one BAR for the associated DLTL region.
+ */
+
+ /* Low 32-bits */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4,
+ (dev->bar.base & 0xfffffff0) | dev->bar.flags,
+ 0x0000000f, 0x00000000);
+
+ /* High 32-bits */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, (dev->bar.base >> 32),
+ 0x00000000, 0x00000000);
+
+ /*
+	 * Create trap. Writing 0xFFs to the BAR registers should be
+	 * trapped and return the size on the next read.
+ */
+ pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu_dev_cfg_bar, &dev->bar);
+
+ /* 0x18/1c/20/24 - Disabled BAR#2/3/4/5
+ *
+ * Mark those BARs readonly so that 0x0 will be returned when
+ * probing the length and the BARs will be skipped.
+ */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR2, 4, 0x00000000);
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR3, 4, 0x00000000);
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
+
+ /* 0x28 - Cardbus CIS pointer */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
+
+ /* 0x2c - Subsystem ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
+
+ /* 0x30 - ROM BAR
+ *
+ * Force its size to be zero so that the kernel will skip
+	 * probing the ROM BAR. We don't need to emulate the ROM BAR.
+ */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
+
+ /* 0x34 - PCI Capability
+ *
+ * By default, we don't have any capabilities
+ */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
+
+ last_cap_offset = PCI_CFG_CAP - 1;
+ list_for_each(&dev->capabilities, cap, link) {
+ offset = cap->start;
+
+ /* Initialize config space for the capability */
+ if (cap->populate)
+ cap->populate(cap);
+
+ /* Add capability header */
+ PCI_VIRT_CFG_INIT_RO(pvd, offset, 2, cap->id);
+
+ /* Update the next capability pointer */
+ PCI_VIRT_CFG_NORMAL_WR(pvd, last_cap_offset + 1, 1, offset);
+
+ last_cap_offset = offset;
+ }
+
+ /* 0x38 - Reserved */
+ PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
+
+ /* 0x3c - INT line/pin/Minimal grant/Maximal latency */
+ if (!(dev->index % 2))
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100);
+ else
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000200);
+}
+
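+/*
+ * Allocate a bdfn for a link: each link group maps to one PCI device
+ * number and the links within a group become consecutive functions.
+ * For example, the two links of group 1 end up as 0:01.0 (bdfn 0x08)
+ * and 0:01.1 (bdfn 0x09).
+ */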
+static uint32_t npu_allocate_bdfn(struct npu *p, uint32_t group)
+{
+ int i;
+ int bdfn = (group << 3);
+
+ for (i = 0; i < p->total_devices; i++) {
+ if ((p->devices[i].pvd->bdfn & 0xf8) == (bdfn & 0xf8))
+ bdfn++;
+ }
+
+ return bdfn;
+}
+
+static void npu_create_devices(struct dt_node *dn, struct npu *p)
+{
+ struct npu_dev *dev;
+ struct dt_node *npu_dn, *link;
+ uint32_t bdfn, npu_phandle, index = 0;
+ uint64_t buid_reg;
+ uint64_t lsisrcid;
+ uint64_t buid;
+
+
+ /* The bits in the LSI ID Base register are always compared and
+ * can be set to 0 in the buid base and mask fields. The
+ * buid (bus unit id) is the full irq minus the last 4 bits. */
+ lsisrcid = GETFIELD(NPU_LSI_SRC_ID_BASE, NPU_LSI_SRC_ID_BASE);
+ buid = p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) >> 4;
+
+ buid_reg = SETFIELD(NP_IRQ_LEVELS, NP_BUID_ENABLE, ~0);
+ buid_reg = SETFIELD(NP_BUID_MASK, buid_reg, ~lsisrcid);
+ buid_reg = SETFIELD(NP_BUID_BASE, buid_reg, (buid & ~lsisrcid));
+
+	/* Get the npu node whose links we expand here into PCI-like
+	 * devices attached to our emulated phb. */
+ npu_phandle = dt_prop_get_u32(dn, "ibm,npcq");
+ npu_dn = dt_find_by_phandle(dt_root, npu_phandle);
+ assert(npu_dn);
+
+ /* Walk the link@x nodes to initialize devices */
+ p->total_devices = 0;
+ p->phb.scan_map = 0;
+ list_head_init(&p->phb.virt_devices);
+ dt_for_each_compatible(npu_dn, link, "ibm,npu-link") {
+ struct npu_dev_bar *bar;
+ uint32_t group_id;
+ uint64_t val;
+
+ dev = &p->devices[index];
+ dev->index = dt_prop_get_u32(link, "ibm,npu-link-index");
+ dev->xscom = npu_link_scom_base(npu_dn, p->xscom_base,
+ dev->index);
+
+ dev->npu = p;
+ dev->dt_node = link;
+
+ /* We don't support MMIO PHY access yet */
+ dev->pl_base = NULL;
+
+ group_id = dt_prop_get_u32(link, "ibm,npu-group-id");
+ bdfn = npu_allocate_bdfn(p, group_id);
+
+ /* This must be done after calling
+ * npu_allocate_bdfn() */
+ p->total_devices++;
+ p->phb.scan_map |= 0x1 << ((bdfn & 0xf8) >> 3);
+
+ dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy");
+ dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
+
+ /* Setup BUID/ISRN */
+ xscom_write(p->chip_id, dev->xscom + NX_NP_BUID, buid_reg);
+
+ /* Create PCI virtual device */
+ dev->pvd = pci_virt_add_device(&p->phb, bdfn, NPU_DEV_CFG_SIZE, dev);
+ assert(dev->pvd);
+ bar = &dev->bar;
+ bar->flags = (PCI_CFG_BAR_TYPE_MEM |
+ PCI_CFG_BAR_MEM64);
+
+ /* Update BAR info */
+ bar->xscom = dev->xscom + NX_MMIO_BAR_0;
+ xscom_read(p->chip_id, bar->xscom, &val);
+ bar->base = GETFIELD(NX_MMIO_BAR_BASE, val) << 12;
+ bar->size = get_bar_size(val);
+
+ /*
+ * The config space is initialised with the BARs
+ * disabled, so make sure it is actually disabled in
+ * hardware.
+ */
+ npu_dev_bar_update(p->chip_id, bar, false);
+
+ /* Initialize capabilities */
+ npu_dev_create_capabilities(dev);
+
+ /* Initialize config space */
+ npu_dev_create_cfg(dev);
+
+ index++;
+ }
+}
+
+static void npu_add_phb_properties(struct npu *p)
+{
+ struct dt_node *np = p->phb.dt_node;
+ uint32_t icsp = get_ics_phandle();
+ uint64_t tkill, mm_base, mm_size;
+ uint32_t base_lsi = p->base_lsi;
+ uint32_t map[] = {
+ /* Dev 0 INT#A (used by fn0) */
+ 0x0000, 0x0, 0x0, 0x1, icsp, base_lsi + NPU_LSI_INT_DL0, 1,
+ /* Dev 0 INT#B (used by fn1) */
+ 0x0000, 0x0, 0x0, 0x2, icsp, base_lsi + NPU_LSI_INT_DL1, 1,
+ /* Dev 1 INT#A (used by fn0) */
+ 0x0800, 0x0, 0x0, 0x1, icsp, base_lsi + NPU_LSI_INT_DL2, 1,
+ /* Dev 1 INT#B (used by fn1) */
+ 0x0800, 0x0, 0x0, 0x2, icsp, base_lsi + NPU_LSI_INT_DL3, 1,
+ };
+ /* Mask is bus, device and INT# */
+ uint32_t mask[] = {0xf800, 0x0, 0x0, 0x7};
+ char slotbuf[32];
+
+ /* Add various properties that HB doesn't have to
+ * add, some of them simply because they result from
+ * policy decisions made in skiboot rather than in HB
+ * such as the MMIO windows going to PCI, interrupts,
+ * etc.
+ */
+ dt_add_property_cells(np, "#address-cells", 3);
+ dt_add_property_cells(np, "#size-cells", 2);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+ dt_add_property_cells(np, "bus-range", 0, 0xff);
+ dt_add_property_cells(np, "clock-frequency", 0x200, 0);
+ dt_add_property_cells(np, "interrupt-parent", icsp);
+
+ /* DLPL Interrupts, we don't use the standard swizzle */
+ p->phb.lstate.int_size = 0;
+ dt_add_property(np, "interrupt-map", map, sizeof(map));
+ dt_add_property(np, "interrupt-map-mask", mask, sizeof(mask));
+
+ /* NPU PHB properties */
+	/* TODO: Due to an erratum, TCE KILL only works when DMA traffic
+	 * has been stopped. We need to implement the workaround,
+	 * which is to do a TCE kill all instead. */
+ tkill = cleanup_addr((uint64_t)p->at_regs) + NPU_TCE_KILL;
+ dt_add_property_cells(np, "ibm,opal-num-pes",
+ NPU_NUM_OF_PES);
+ dt_add_property_cells(np, "ibm,opal-reserved-pe",
+ 0);
+ dt_add_property_u64(np, "ibm,opal-tce-kill", tkill);
+
+	/* The memory window is exposed as a 32-bit non-prefetchable
+	 * one because the 64-bit prefetchable window is treated
+	 * specially by the kernel.
+ */
+ mm_base = p->mm_base;
+ mm_size = p->mm_size;
+ dt_add_property_cells(np, "ranges", 0x02000000,
+ hi32(mm_base), lo32(mm_base),
+ hi32(mm_base), lo32(mm_base),
+ hi32(mm_size), lo32(mm_size));
+
+ /* Set the slot location on the NPU PHB. This PHB can contain
+ * devices that correlate with multiple physical slots, so
+ * present the chip ID instead.
+ */
+ snprintf(slotbuf, sizeof(slotbuf), "NPU Chip %d", p->chip_id);
+ dt_add_property_string(np, "ibm,io-base-loc-code", slotbuf);
+}
+
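+/*
+ * Instantiate the emulated PHB described by one of the pciex nodes
+ * created in npu_probe_phb(): allocate the npu and per-link device
+ * structures, register the PHB and its LSIs, then initialize the
+ * hardware.
+ */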
+static void npu_create_phb(struct dt_node *dn)
+{
+ const struct dt_property *prop;
+ struct npu *p;
+ struct pci_slot *slot;
+ uint32_t links;
+ void *pmem;
+
+ /* Retrieve number of devices */
+ links = dt_prop_get_u32(dn, "ibm,links");
+ pmem = zalloc(sizeof(struct npu) + links * sizeof(struct npu_dev));
+ assert(pmem);
+
+ /* Populate PHB */
+ p = pmem;
+ p->index = dt_prop_get_u32(dn, "ibm,npu-index");
+ p->chip_id = dt_prop_get_u32(dn, "ibm,chip-id");
+ p->xscom_base = dt_prop_get_u32(dn, "ibm,xscom-base");
+ p->total_devices = links;
+
+ /* TODO: When hardware fences are implemented, detect them here */
+ p->fenced = false;
+
+ /* This is the AT base */
+ p->at_xscom = p->xscom_base + NPU_AT_SCOM_OFFSET;
+ p->at_regs = (void *)dt_get_address(dn, 0, NULL);
+
+ prop = dt_require_property(dn, "ibm,mmio-window", -1);
+ assert(prop->len >= (2 * sizeof(uint64_t)));
+ p->mm_base = ((const uint64_t *)prop->prop)[0];
+ p->mm_size = ((const uint64_t *)prop->prop)[1];
+
+ p->devices = pmem + sizeof(struct npu);
+
+ /* Interrupt */
+ p->base_lsi = p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) +
+ NPU_LSI_IRQ_MIN;
+
+ /* Generic PHB */
+ p->phb.dt_node = dn;
+ p->phb.ops = &npu_ops;
+ p->phb.phb_type = phb_type_pcie_v3;
+
+ /* Populate devices */
+ npu_create_devices(dn, p);
+
+ /* Populate extra properties */
+ npu_add_phb_properties(p);
+
+ /* Create PHB slot */
+ slot = npu_slot_create(&p->phb);
+	if (!slot) {
+ /**
+ * @fwts-label NPUCannotCreatePHBSlot
+ * @fwts-advice Firmware probably ran out of memory creating
+ * NPU slot. NVLink functionality could be broken.
+ */
+ prlog(PR_ERR, "NPU: Cannot create PHB slot\n");
+ }
+
+ /* Register PHB */
+ pci_register_phb(&p->phb, OPAL_DYNAMIC_PHB_ID);
+
+ /* Initialize IODA cache */
+ npu_ioda_init(p);
+
+ /* Register interrupt source */
+ npu_register_irq(p);
+
+ /* Initialize hardware */
+ npu_hw_init(p);
+}
+
+void probe_npu(void)
+{
+ struct dt_node *np;
+
+ /* Scan NPU XSCOM nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power8-npu")
+ npu_probe_phb(np);
+
+ /* Scan newly created PHB nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power8-npu-pciex")
+ npu_create_phb(np);
+}
diff --git a/roms/skiboot/hw/npu2-common.c b/roms/skiboot/hw/npu2-common.c
new file mode 100644
index 000000000..3bc9bcee6
--- /dev/null
+++ b/roms/skiboot/hw/npu2-common.c
@@ -0,0 +1,681 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2019 IBM Corp. */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <pci.h>
+#include <npu2.h>
+#include <npu2-regs.h>
+#include <bitutils.h>
+#include <nvram.h>
+#include <i2c.h>
+#include <interrupts.h>
+#include <xive.h>
+
+#define NPU2_IRQ_BASE_SHIFT 13
+#define NPU2_N_DL_IRQS 35
+#define NPU2_N_DL_IRQS_ALIGN 64
+
+/*
+ * We use the indirect method because it uses the same addresses as
+ * the MMIO offsets (NPU RING)
+ */
+static void npu2_scom_set_addr(uint64_t gcid, uint64_t scom_base,
+ uint64_t addr, uint64_t size)
+{
+ addr = SETFIELD(NPU2_MISC_DA_ADDR, 0ull, addr);
+ addr = SETFIELD(NPU2_MISC_DA_LEN, addr, size);
+ xscom_write(gcid, scom_base + NPU2_MISC_SCOM_IND_SCOM_ADDR, addr);
+}
+
+void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
+ uint64_t reg, uint64_t size,
+ uint64_t val)
+{
+ npu2_scom_set_addr(gcid, scom_base, reg, size);
+ xscom_write(gcid, scom_base + NPU2_MISC_SCOM_IND_SCOM_DATA, val);
+}
+
+uint64_t npu2_scom_read(uint64_t gcid, uint64_t scom_base,
+ uint64_t reg, uint64_t size)
+{
+ uint64_t val;
+
+ npu2_scom_set_addr(gcid, scom_base, reg, size);
+ xscom_read(gcid, scom_base + NPU2_MISC_SCOM_IND_SCOM_DATA, &val);
+
+ return val;
+}
+
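+/*
+ * 4-byte NPU2 registers are carried in the upper word of the 8-byte
+ * indirect SCOM data, hence the << 32 / >> 32 in the helpers below.
+ */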
+void npu2_write_4b(struct npu2 *p, uint64_t reg, uint32_t val)
+{
+ npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_4B,
+ (uint64_t)val << 32);
+}
+
+uint32_t npu2_read_4b(struct npu2 *p, uint64_t reg)
+{
+ return npu2_scom_read(p->chip_id, p->xscom_base, reg,
+ NPU2_MISC_DA_LEN_4B) >> 32;
+}
+
+void npu2_write(struct npu2 *p, uint64_t reg, uint64_t val)
+{
+ npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_8B, val);
+}
+
+uint64_t npu2_read(struct npu2 *p, uint64_t reg)
+{
+ return npu2_scom_read(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_8B);
+}
+
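+/*
+ * Read-modify-write helpers: only the bits set in 'mask' are updated;
+ * all other bits in the register are preserved.
+ */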
+void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask)
+{
+ uint64_t new_val;
+
+ new_val = npu2_read(p, reg);
+ new_val &= ~mask;
+ new_val |= val & mask;
+ npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_8B, new_val);
+}
+
+void npu2_write_mask_4b(struct npu2 *p, uint64_t reg, uint32_t val, uint32_t mask)
+{
+ uint32_t new_val;
+
+ new_val = npu2_read_4b(p, reg);
+ new_val &= ~mask;
+ new_val |= val & mask;
+ npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_4B,
+ (uint64_t)new_val << 32);
+}
+
+typedef struct {
+ const char *name;
+ uint32_t block;
+ uint32_t offset;
+} npu2_scom_dump_t;
+
+static npu2_scom_dump_t npu2_scom_dump_global[] = {
+ /* CQ State Machine */
+ { "CS.SM0.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG0 },
+ { "CS.SM1.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG0 },
+ { "CS.SM2.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG0 },
+ { "CS.SM3.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG0 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG1 },
+ { "CS.SM1.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG1 },
+ { "CS.SM2.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG1 },
+ { "CS.SM3.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG1 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG2 },
+ { "CS.SM1.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG2 },
+ { "CS.SM2.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG2 },
+ { "CS.SM3.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG2 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG3 },
+ { "CS.SM1.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG3 },
+ { "CS.SM2.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG3 },
+ { "CS.SM3.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG3 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG4 },
+ { "CS.SM1.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG4 },
+ { "CS.SM2.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG4 },
+ { "CS.SM3.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG4 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG5 },
+ { "CS.SM1.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG5 },
+ { "CS.SM2.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG5 },
+ { "CS.SM3.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG5 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG6 },
+ { "CS.SM1.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG6 },
+ { "CS.SM2.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG6 },
+ { "CS.SM3.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG6 },
+
+ { "CS.SM0.MISC.CERR_FIRST0", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST0 },
+ { "CS.SM1.MISC.CERR_FIRST0", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST0 },
+ { "CS.SM2.MISC.CERR_FIRST0", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST0 },
+ { "CS.SM3.MISC.CERR_FIRST0", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST0 },
+
+ { "CS.SM0.MISC.CERR_FIRST1", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST1 },
+ { "CS.SM1.MISC.CERR_FIRST1", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST1 },
+ { "CS.SM2.MISC.CERR_FIRST1", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST1 },
+ { "CS.SM3.MISC.CERR_FIRST1", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST1 },
+
+ { "CS.SM0.MISC.CERR_FIRST2", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST2 },
+ { "CS.SM1.MISC.CERR_FIRST2", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST2 },
+ { "CS.SM2.MISC.CERR_FIRST2", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST2 },
+ { "CS.SM3.MISC.CERR_FIRST2", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST2 },
+
+ /* CQ Control */
+ { "CS.CTL.MISC.CERR_MESSAGE0", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG0 },
+ { "CS.CTL.MISC.CERR_MESSAGE1", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG1 },
+ { "CS.CTL.MISC.CERR_FIRST0", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST0 },
+ { "CS.CTL.MISC.CERR_FIRST1", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST1 },
+
+ /* CQ Data */
+ { "DAT.MISC.CERR_ECC_HOLD", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_STATUS },
+ { "DAT.MISC.CERR_ECC_MASK", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_MASK },
+ { "DAT.MISC.CERR_ECC_FIRST", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_FIRST },
+ { "DAT.MISC.REM0", NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG0 },
+ { "DAT.MISC.REM1", NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG1 },
+};
+
+static npu2_scom_dump_t npu2_scom_dump_nvlink[] = {
+ { "NTL0.REGS.CERR_FIRST1", NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST1_OFF },
+ { "NTL1.REGS.CERR_FIRST1", NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST1_OFF },
+ { "NTL0.REGS.CERR_FIRST2", NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST2_OFF },
+ { "NTL1.REGS.CERR_FIRST2", NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST2_OFF },
+};
+
+static npu2_scom_dump_t npu2_scom_dump_ocapi[] = {
+ { "OTL0.MISC.C_ERR_RPT_HOLD0", NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD0 },
+ { "OTL1.MISC.C_ERR_RPT_HOLD0", NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD0 },
+ { "OTL0.MISC.OTL_REM0", NPU2_BLOCK_OTL0, NPU2_OTL_RAS_ERR_MSG0 },
+ { "OTL1.MISC.OTL_REM0", NPU2_BLOCK_OTL1, NPU2_OTL_RAS_ERR_MSG0 },
+ { "OTL0.MISC.ERROR_SIG_RXI", NPU2_BLOCK_OTL0, NPU2_OTL_RXI_ERR_SIG },
+ { "OTL1.MISC.ERROR_SIG_RXI", NPU2_BLOCK_OTL1, NPU2_OTL_RXI_ERR_SIG },
+ { "OTL0.MISC.ERROR_SIG_RXO", NPU2_BLOCK_OTL0, NPU2_OTL_RXO_ERR_SIG },
+ { "OTL1.MISC.ERROR_SIG_RXO", NPU2_BLOCK_OTL1, NPU2_OTL_RXO_ERR_SIG },
+ { "OTL0.MISC.C_ERR_RPT_HOLD1", NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD1 },
+ { "OTL1.MISC.C_ERR_RPT_HOLD1", NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD1 },
+};
+
+static void print_one_npu_reg(struct npu2 *npu, npu2_scom_dump_t *scom, int stack)
+{
+ uint64_t reg, val;
+
+ reg = NPU2_REG_OFFSET(stack, scom->block, scom->offset);
+ val = npu2_scom_read(npu->chip_id, npu->xscom_base,
+ reg, NPU2_MISC_DA_LEN_8B);
+
+ prlog(PR_ERR, "NPU[%d] STCK%d.%s 0x%llx = 0x%016llx\n",
+ npu->chip_id, stack - 4, scom->name, reg, val);
+}
+
+/* same as above, but for direct access registers */
+static void print_one_reg(int chip_id, int brick_index,
+ uint64_t reg_addr, const char *reg_name)
+{
+ uint64_t val;
+
+ xscom_read(chip_id, reg_addr, &val);
+ prlog(PR_ERR, "NPU[%d] %s brick %d 0x%llx = 0x%016llx\n",
+ chip_id, reg_name, brick_index, reg_addr, val);
+}
+
+static void show_nvlink_regs(struct npu2 *npu, int brick_index)
+{
+ uint32_t stack, ntl;
+ int i;
+
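+ /*
+ * Bricks are paired per stack: e.g. brick 0 maps to STCK_0/NTL0 and
+ * brick 3 to STCK_1/NTL1 (NTL block IDs are two apart, hence the "* 2").
+ */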
+ stack = NPU2_STACK_STCK_0 + brick_index / 2;
+ ntl = NPU2_BLOCK_NTL0 + (brick_index % 2) * 2;
+
+ for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_nvlink); i++) {
+ if (npu2_scom_dump_nvlink[i].block == ntl)
+ print_one_npu_reg(npu, &npu2_scom_dump_nvlink[i], stack);
+ }
+}
+
+static void show_opencapi_regs(struct npu2 *npu, int brick_index)
+{
+ uint32_t stack, otl;
+ int i;
+
+ stack = NPU2_STACK_STCK_0 + brick_index / 2;
+ otl = NPU2_BLOCK_OTL0 + (brick_index % 2);
+
+ /* NPU registers */
+ for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_ocapi); i++) {
+ if (npu2_scom_dump_ocapi[i].block == otl)
+ print_one_npu_reg(npu, &npu2_scom_dump_ocapi[i], stack);
+ }
+
+ /* Fabric registers */
+ print_one_reg(npu->chip_id, brick_index,
+ OB_ODL_STATUS(brick_index), "ODL status");
+ print_one_reg(npu->chip_id, brick_index,
+ OB_ODL_TRAINING_STATUS(brick_index), "ODL training status");
+ print_one_reg(npu->chip_id, brick_index,
+ OB_ODL_ENDPOINT_INFO(brick_index), "ODL endpoint info");
+}
+
+static void show_all_regs(struct npu2 *npu, int brick_index)
+{
+ int i, stack, stack_min, stack_max;
+ uint64_t fir_val, mask_val, fir_addr, mask_addr;
+ struct npu2_dev *dev;
+ npu2_scom_dump_t scom_reg;
+
+ if (brick_index != -1) {
+ stack_min = stack_max = NPU2_STACK_STCK_0 + brick_index / 2;
+ } else {
+ stack_min = NPU2_STACK_STCK_0;
+ stack_max = NPU2_STACK_STCK_2;
+ /* Avoid dumping unused stacks for opencapi on Lagrange */
+ if (npu->total_devices == 2)
+ stack_min = stack_max = NPU2_STACK_STCK_1;
+ }
+
+ /* NPU FIRs */
+ for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
+ fir_addr = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET;
+ mask_addr = fir_addr + NPU2_FIR_MASK_OFFSET;
+ xscom_read(npu->chip_id, fir_addr, &fir_val);
+ xscom_read(npu->chip_id, mask_addr, &mask_val);
+ prlog(PR_ERR, "NPU[%d] FIR%d = 0x%016llx (mask 0x%016llx => 0x%016llx)\n",
+ npu->chip_id, i, fir_val, mask_val, fir_val & ~mask_val);
+ }
+
+ /* NPU global, per-stack registers */
+ for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_global); i++) {
+ for (stack = stack_min; stack <= stack_max; stack++)
+ print_one_npu_reg(npu, &npu2_scom_dump_global[i], stack);
+ }
+
+ /*
+ * NPU global registers, stack independent
+ * We have only one for now, so dump it directly
+ */
+ scom_reg.name = "XTS.REG.ERR_HOLD";
+ scom_reg.block = NPU2_BLOCK_XTS;
+ scom_reg.offset = 0;
+ print_one_npu_reg(npu, &scom_reg, NPU2_STACK_MISC);
+
+ /* nvlink- or opencapi-specific registers */
+ for (i = 0; i < npu->total_devices; i++) {
+ dev = &npu->devices[i];
+ if (brick_index == -1 || dev->brick_index == brick_index) {
+ if (dev->type == NPU2_DEV_TYPE_NVLINK)
+ show_nvlink_regs(npu, dev->brick_index);
+ else if (dev->type == NPU2_DEV_TYPE_OPENCAPI)
+ show_opencapi_regs(npu, dev->brick_index);
+ }
+ }
+}
+
+void npu2_dump_scoms(int chip_id)
+{
+ struct npu2 *npu;
+ struct phb *phb;
+ struct npu2_dev *dev;
+
+ /*
+ * Look for the npu2 structure for that chip ID. We can access it
+ * through the array of phbs, looking for an nvlink or opencapi
+ * phb. We can have several entries, but they all point
+ * to the same npu2 structure.
+ */
+ for_each_phb(phb) {
+ npu = NULL;
+ if (phb->phb_type == phb_type_npu_v2) {
+ npu = phb_to_npu2_nvlink(phb);
+ } else if (phb->phb_type == phb_type_npu_v2_opencapi) {
+ dev = phb_to_npu2_dev_ocapi(phb);
+ npu = dev->npu;
+ }
+ if (npu && npu->chip_id == chip_id) {
+ show_all_regs(npu, -1 /* all bricks */);
+ break;
+ }
+ }
+}
+
+static uint64_t npu2_ipi_attributes(struct irq_source *is, uint32_t isn)
+{
+ struct npu2 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+
+ if ((idx == 18) || (idx >= 27 && idx <= 34))
+ /*
+ * level 18: TCE Interrupt - used to detect a frozen PE (nvlink)
+ * level 27-30: OTL interrupt (opencapi)
+ * level 31-34: XSL interrupt (opencapi)
+ */
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_MSI;
+ else
+ return IRQ_ATTR_TARGET_LINUX;
+}
+
+static char *npu2_ipi_name(struct irq_source *is, uint32_t isn)
+{
+ struct npu2 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+ const char *name;
+
+ switch (idx) {
+ case 0: name = "NDL 0 Stall Event (brick 0)"; break;
+ case 1: name = "NDL 0 No-Stall Event (brick 0)"; break;
+ case 2: name = "NDL 1 Stall Event (brick 1)"; break;
+ case 3: name = "NDL 1 No-Stall Event (brick 1)"; break;
+ case 4: name = "NDL 2 Stall Event (brick 2)"; break;
+ case 5: name = "NDL 2 No-Stall Event (brick 2)"; break;
+ case 6: name = "NDL 5 Stall Event (brick 3)"; break;
+ case 7: name = "NDL 5 No-Stall Event (brick 3)"; break;
+ case 8: name = "NDL 4 Stall Event (brick 4)"; break;
+ case 9: name = "NDL 4 No-Stall Event (brick 4)"; break;
+ case 10: name = "NDL 3 Stall Event (brick 5)"; break;
+ case 11: name = "NDL 3 No-Stall Event (brick 5)"; break;
+ case 12: name = "NTL 0 Event"; break;
+ case 13: name = "NTL 1 Event"; break;
+ case 14: name = "NTL 2 Event"; break;
+ case 15: name = "NTL 3 Event"; break;
+ case 16: name = "NTL 4 Event"; break;
+ case 17: name = "NTL 5 Event"; break;
+ case 18: name = "TCE Event"; break;
+ case 19: name = "ATS Event"; break;
+ case 20: name = "CQ Event"; break;
+ case 21: name = "MISC Event"; break;
+ case 22: name = "NMMU Local Xstop"; break;
+ case 23: name = "Translate Fail (brick 2)"; break;
+ case 24: name = "Translate Fail (brick 3)"; break;
+ case 25: name = "Translate Fail (brick 4)"; break;
+ case 26: name = "Translate Fail (brick 5)"; break;
+ case 27: name = "OTL Event (brick 2)"; break;
+ case 28: name = "OTL Event (brick 3)"; break;
+ case 29: name = "OTL Event (brick 4)"; break;
+ case 30: name = "OTL Event (brick 5)"; break;
+ case 31: name = "XSL Event (brick 2)"; break;
+ case 32: name = "XSL Event (brick 3)"; break;
+ case 33: name = "XSL Event (brick 4)"; break;
+ case 34: name = "XSL Event (brick 5)"; break;
+ default: name = "Unknown";
+ }
+ return strdup(name);
+}
+
+static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct npu2 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+ char *irq_name;
+ int brick;
+
+ switch (idx) {
+ case 18:
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+ break;
+ case 27 ... 34:
+ /* opencapi only */
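+ /* levels 27-30 (OTL) and 31-34 (XSL) both map back to bricks 2-5 */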
+ brick = 2 + ((idx - 27) % 4);
+ irq_name = npu2_ipi_name(is, isn);
+ prlog(PR_ERR, "NPU[%d] received error interrupt '%s'\n",
+ p->chip_id, irq_name);
+ free(irq_name);
+ show_all_regs(p, brick);
+ /*
+ * P9 NPU doesn't support recovering a link going down
+ * unexpectedly. So we mark the device as broken and
+ * report it to the OS, so that the error is logged
+ * and the drivers notified.
+ */
+ npu2_opencapi_set_broken(p, brick);
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+ break;
+ default:
+ prerror("OPAL received unknown NPU2 interrupt %d\n", idx);
+ return;
+ }
+}
+
+static const struct irq_source_ops npu2_ipi_ops = {
+ .interrupt = npu2_err_interrupt,
+ .attributes = npu2_ipi_attributes,
+ .name = npu2_ipi_name,
+};
+
+static void setup_irqs(struct npu2 *p)
+{
+ uint64_t reg, val;
+ void *tp;
+
+ p->base_lsi = xive_alloc_ipi_irqs(p->chip_id, NPU2_N_DL_IRQS, NPU2_N_DL_IRQS_ALIGN);
+ if (p->base_lsi == XIVE_IRQ_ERROR) {
+ prlog(PR_ERR, "NPU: Failed to allocate interrupt sources\n");
+ return;
+ }
+ xive_register_ipi_source(p->base_lsi, NPU2_N_DL_IRQS, p, &npu2_ipi_ops);
+
+ /* Set IPI configuration */
+ reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, NPU2_MISC_CFG);
+ val = npu2_read(p, reg);
+ val = SETFIELD(NPU2_MISC_CFG_IPI_PS, val, NPU2_MISC_CFG_IPI_PS_64K);
+ val = SETFIELD(NPU2_MISC_CFG_IPI_OS, val, NPU2_MISC_CFG_IPI_OS_AIX);
+ npu2_write(p, reg, val);
+
+ /* Set IRQ base */
+ reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, NPU2_MISC_IRQ_BASE);
+ tp = xive_get_trigger_port(p->base_lsi);
+ val = ((uint64_t)tp) << NPU2_IRQ_BASE_SHIFT;
+ npu2_write(p, reg, val);
+}
+
+static bool _i2c_presence_detect(struct npu2_dev *dev)
+{
+ uint8_t state, data;
+ int rc;
+
+ rc = i2c_request_send(dev->npu->i2c_port_id_ocapi,
+ platform.ocapi->i2c_presence_addr,
+ SMBUS_READ, 0, 1,
+ &state, 1, 120);
+ if (rc) {
+ OCAPIERR(dev, "error detecting link presence: %d\n", rc);
+ return true; /* assume link exists */
+ }
+
+ OCAPIDBG(dev, "I2C presence detect: 0x%x\n", state);
+
+ switch (dev->link_index) {
+ case 2:
+ data = platform.ocapi->i2c_presence_brick2;
+ break;
+ case 3:
+ data = platform.ocapi->i2c_presence_brick3;
+ break;
+ case 4:
+ data = platform.ocapi->i2c_presence_brick4;
+ break;
+ case 5:
+ data = platform.ocapi->i2c_presence_brick5;
+ break;
+ default:
+ OCAPIERR(dev, "presence detection on invalid link\n");
+ return true;
+ }
+ /* Presence detect bits are active low */
+ return !(state & data);
+}
+
+/*
+ * A default presence detection implementation for platforms like ZZ and Zaius
+ * that don't implement their own. Assumes all devices found will be OpenCAPI.
+ */
+void npu2_i2c_presence_detect(struct npu2 *npu)
+{
+ struct npu2_dev *dev;
+ assert(platform.ocapi);
+ for (int i = 0; i < npu->total_devices; i++) {
+ dev = &npu->devices[i];
+ if (_i2c_presence_detect(dev))
+ dev->type = NPU2_DEV_TYPE_OPENCAPI;
+ else
+ dev->type = NPU2_DEV_TYPE_UNKNOWN;
+ }
+}
+
+static struct npu2 *setup_npu(struct dt_node *dn)
+{
+ struct npu2 *npu;
+ struct npu2_dev *dev;
+ struct dt_node *np;
+ uint32_t num_links;
+ char port_name[17];
+ void *npumem;
+ char *path;
+ int gcid;
+ struct proc_chip *chip;
+ int i = 0;
+
+ /* Retrieve chip ID */
+ path = dt_get_path(dn);
+ gcid = dt_get_chip_id(dn);
+ chip = get_chip(gcid);
+ assert(chip);
+
+ num_links = dt_prop_get_u32(dn, "ibm,npu-links");
+ npumem = zalloc(sizeof(struct npu2) + num_links *
+ sizeof(struct npu2_dev));
+ assert(npumem);
+ npu = npumem;
+
+ npu->dt_node = dn;
+ npu->index = dt_prop_get_u32(dn, "ibm,npu-index");
+ npu->chip_id = gcid;
+ npu->xscom_base = dt_get_address(dn, 0, NULL);
+
+ init_lock(&npu->i2c_lock);
+ npu->i2c_pin_mode = ~0; // input mode by default
+ npu->i2c_pin_wr_state = ~0; // reset is active low
+ if (platform.ocapi) {
+ /* Find I2C port for handling device presence/reset */
+ snprintf(port_name, sizeof(port_name), "p8_%08x_e%dp%d",
+ gcid, platform.ocapi->i2c_engine,
+ platform.ocapi->i2c_port);
+ prlog(PR_DEBUG, "NPU: Looking for I2C port %s\n", port_name);
+
+ dt_for_each_compatible(dt_root, np, "ibm,power9-i2c-port") {
+ if (streq(port_name, dt_prop_get(np, "ibm,port-name"))) {
+ npu->i2c_port_id_ocapi = dt_prop_get_u32(np, "ibm,opal-id");
+ break;
+ }
+ }
+
+ if (!npu->i2c_port_id_ocapi) {
+ prlog(PR_ERR, "NPU: Couldn't find I2C port %s\n",
+ port_name);
+ goto failed;
+ }
+ }
+
+ npu->devices = npumem + sizeof(struct npu2);
+
+ dt_for_each_compatible(dn, np, "ibm,npu-link") {
+ assert(i < num_links);
+ dev = &npu->devices[i];
+ dev->link_index = dt_prop_get_u32(np, "ibm,npu-link-index");
+ /* May be overridden by platform presence detection */
+ dev->brick_index = dev->link_index;
+ /* Will be overridden by presence detection */
+ dev->type = NPU2_DEV_TYPE_UNKNOWN;
+ dev->npu = npu;
+ dev->dt_node = np;
+ dev->pl_xscom_base = dt_prop_get_u64(np, "ibm,npu-phy");
+ dev->lane_mask = dt_prop_get_u32(np, "ibm,npu-lane-mask");
+ dev->link_speed = dt_prop_get_u64(np, "ibm,link-speed");
+ i++;
+ }
+ npu->total_devices = i;
+
+ prlog(PR_INFO, "NPU: Chip %d Found NPU2#%d (%d links) at %s\n",
+ npu->chip_id, npu->index, npu->total_devices, path);
+ prlog(PR_INFO, " SCOM Base: %08llx\n", npu->xscom_base);
+ free(path);
+ return npu;
+
+failed:
+ prlog(PR_ERR, "NPU: Chip %d NPU setup failed\n", gcid);
+ free(path);
+ free(npu);
+ return NULL;
+}
+
+static void setup_devices(struct npu2 *npu)
+{
+ bool nvlink_detected = false, ocapi_detected = false;
+ struct npu2_dev *dev;
+
+ /*
+ * TODO: In future, we'll do brick configuration here to support mixed
+ * setups.
+ */
+ for (int i = 0; i < npu->total_devices; i++) {
+ dev = &npu->devices[i];
+ switch (dev->type) {
+ case NPU2_DEV_TYPE_NVLINK:
+ nvlink_detected = true;
+ dt_add_property_strings(dev->dt_node,
+ "ibm,npu-link-type",
+ "nvlink");
+ break;
+ case NPU2_DEV_TYPE_OPENCAPI:
+ ocapi_detected = true;
+ dt_add_property_strings(dev->dt_node,
+ "ibm,npu-link-type",
+ "opencapi");
+ break;
+ default:
+ prlog(PR_INFO, "NPU: Link %d device not present\n",
+ npu->devices[i].link_index);
+ dt_add_property_strings(dev->dt_node,
+ "ibm,npu-link-type",
+ "unknown");
+ }
+ }
+
+ if (nvlink_detected && ocapi_detected) {
+ prlog(PR_ERR, "NPU: NVLink and OpenCAPI devices on same chip not supported, aborting NPU init\n");
+ return;
+ }
+
+ setup_irqs(npu);
+
+ if (nvlink_detected)
+ npu2_nvlink_init_npu(npu);
+ else if (ocapi_detected)
+ npu2_opencapi_init_npu(npu);
+}
+
+void probe_npu2(void)
+{
+ struct proc_chip *chip = next_chip(NULL);
+ struct npu2 *npu;
+ struct dt_node *np;
+ const char *zcal;
+
+ /* npu2 only */
+ if (!dt_find_compatible_node(dt_root, NULL, "ibm,power9-npu"))
+ return;
+
+ /* Abort if we're running on POWER9C DD1 (P9N DD1 is not supported) */
+ if (chip &&
+ chip->type == PROC_CHIP_P9_CUMULUS &&
+ (chip->ec_level & 0xf0) == 0x10) {
+ prlog(PR_INFO, "NPU2: DD1 not supported\n");
+ return;
+ }
+
+ /* Check for a zcal override */
+ zcal = nvram_query_dangerous("nv_zcal_override");
+ if (zcal) {
+ nv_zcal_nominal = atoi(zcal);
+ prlog(PR_WARNING, "NPU2: Using ZCAL impedance override = %d\n", nv_zcal_nominal);
+ }
+
+ if (!platform.npu2_device_detect) {
+ prlog(PR_INFO, "NPU: Platform does not support NPU\n");
+ return;
+ }
+
+ dt_for_each_compatible(dt_root, np, "ibm,power9-npu") {
+ npu = setup_npu(np);
+ if (!npu)
+ continue;
+ platform.npu2_device_detect(npu);
+ setup_devices(npu);
+ }
+}
diff --git a/roms/skiboot/hw/npu2-hw-procedures.c b/roms/skiboot/hw/npu2-hw-procedures.c
new file mode 100644
index 000000000..fb88dfdf6
--- /dev/null
+++ b/roms/skiboot/hw/npu2-hw-procedures.c
@@ -0,0 +1,1079 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NPU2 (POWER9) Hardware Procedures
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci.h>
+#include <pci-virt.h>
+#include <interrupts.h>
+#include <npu2.h>
+#include <npu2-regs.h>
+#include <xscom.h>
+
+/* Set in npu2.c if there is an nvram override for the zcal settings on this
+ * machine */
+int nv_zcal_nominal = -1;
+
+/* PHY Registers. The documentation for the PHY training is written in
+ * terms of bits within an actual register so we use that
+ * representation here. */
+struct npu2_phy_reg {
+ uint64_t offset;
+ uint64_t start;
+ uint64_t len;
+};
+
+/*
+ * Currently unused, but documented here:
+static struct npu2_phy_reg NPU2_PHY_RX_DATA_DAC_SPARE_MODE = {0x000, 63, 64};
+static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL6 = {0x00c, 63, 64};
+static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL5 = {0x028, 63, 64};
+static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL9 = {0x030, 63, 64};
+static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL5_EO = {0x00a, 63, 64};
+static struct npu2_phy_reg NPU2_PHY_RX_DAC_CNTL4 = {0x026, 63, 64};
+*/
+static struct npu2_phy_reg NPU2_PHY_RX_RUN_LANE = {0x0c8, 48, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_IORESET = {0x096, 63, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_IORESET = {0x113, 48, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_RESET = {0x096, 62, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_LANE_ANA_PDWN = {0x002, 54, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_LANE_DIG_PDWN = {0x088, 48, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_IQ_RES_SEL = {0x004, 59, 3};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_PHASE_STEP = {0x08a, 60, 4};
+static struct npu2_phy_reg NPU2_PHY_TX_LANE_PDWN = {0x101, 48, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_RUN_DCCAL = {0x0c8, 49, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_DCCAL_DONE = {0x0ca, 49, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_LANE_BUSY = {0x0ca, 50, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_B_BANK_CONTROLS = {0x002, 58, 6};
+static struct npu2_phy_reg NPU2_PHY_TX_UNLOAD_CLK_DISABLE = {0x103, 56, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_FIFO_INIT = {0x105, 53, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_RXCAL = {0x103, 57, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_INIT_DONE = {0x0ca, 48, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_EDGE_TRACK_CNTL = {0x092, 48, 2};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_BUMP_SL_1UI = {0x092, 57, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_FW_OFF = {0x08a, 56, 1};
+static struct npu2_phy_reg NPU2_PHY_RX_PR_FW_INERTIA_AMT = {0x08a, 57, 3};
+static struct npu2_phy_reg NPU2_PHY_RX_CFG_LTE_MC = {0x000, 60, 4};
+static struct npu2_phy_reg NPU2_PHY_RX_A_INTEG_COARSE_GAIN = {0x00a, 48, 4};
+static struct npu2_phy_reg NPU2_PHY_RX_A_CTLE_COARSE = {0x00c, 48, 5};
+static struct npu2_phy_reg NPU2_PHY_RX_A_CTLE_GAIN = {0x00c, 53, 4};
+static struct npu2_phy_reg NPU2_PHY_RX_B_INTEG_COARSE_GAIN = {0x026, 48, 4};
+static struct npu2_phy_reg NPU2_PHY_RX_B_CTLE_COARSE = {0x028, 48, 5};
+static struct npu2_phy_reg NPU2_PHY_RX_B_CTLE_GAIN = {0x028, 53, 4};
+static struct npu2_phy_reg NPU2_PHY_RX_E_INTEG_COARSE_GAIN = {0x030, 48, 4};
+static struct npu2_phy_reg NPU2_PHY_RX_E_CTLE_COARSE = {0x032, 48, 5};
+static struct npu2_phy_reg NPU2_PHY_RX_E_CTLE_GAIN = {0x032, 53, 4};
+
+/* These registers are per-PHY, not per lane */
+static struct npu2_phy_reg NPU2_PHY_RX_SPEED_SELECT = {0x262, 51, 2};
+static struct npu2_phy_reg NPU2_PHY_RX_AC_COUPLED = {0x262, 53, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_SWO_EN = {0x3c9, 48, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_REQ = {0x3c1, 49, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_DONE = {0x3c1, 50, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_ERROR = {0x3c1, 51, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_N = {0x3c3, 48, 9};
+static struct npu2_phy_reg NPU2_PHY_TX_ZCAL_P = {0x3c5, 48, 9};
+static struct npu2_phy_reg NPU2_PHY_TX_FFE_BOOST_EN = {0x34b, 59, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_PRE_EN = {0x34d, 51, 5};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_PRE_SELECT = {0x34d, 56, 5};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_PRE_EN = {0x34f, 51, 5};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_PRE_SELECT = {0x34f, 56, 5};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_POST_EN = {0x361, 49, 7};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_POST_SELECT = {0x361, 56, 7};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_POST_EN = {0x363, 49, 7};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_POST_SELECT = {0x363, 56, 7};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_MARGINPU_EN = {0x351, 48, 8};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_MARGINPU_EN = {0x353, 48, 8};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_MARGINPD_EN = {0x351, 56, 8};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_MARGINPD_EN = {0x353, 56, 8};
+static struct npu2_phy_reg NPU2_PHY_TX_MARGINPU_SELECT = {0x355, 48, 8};
+static struct npu2_phy_reg NPU2_PHY_TX_MARGINPD_SELECT = {0x355, 56, 8};
+static struct npu2_phy_reg NPU2_PHY_TX_PSEG_MAIN_EN = {0x357, 51, 7};
+static struct npu2_phy_reg NPU2_PHY_TX_NSEG_MAIN_EN = {0x359, 51, 7};
+/* Currently unused, but documented here
+static struct npu2_phy_reg NPU2_PHY_RX_HIST_MIN_EYE_WIDTH = {0x24e, 54, 8};
+static struct npu2_phy_reg NPU2_PHY_RX_HIST_MIN_EYE_WIDTH_LANE = {0x24e, 49, 5};
+static struct npu2_phy_reg NPU2_PHY_RX_HIST_MIN_EYE_WIDTH_VALID= {0x24e, 48, 1};
+*/
+static struct npu2_phy_reg NPU2_PHY_RX_RC_ENABLE_AUTO_RECAL = {0x25c, 51, 1};
+
+static struct npu2_phy_reg NPU2_PHY_RX_CLKDIST_PDWN = {0x204, 48, 3};
+static struct npu2_phy_reg NPU2_PHY_RX_IREF_PDWN = {0x230, 54, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_CLKDIST_PDWN = {0x305, 48, 3};
+static struct npu2_phy_reg NPU2_PHY_RX_CTL_DATASM_CLKDIST_PDWN = {0x2e0, 60, 1};
+static struct npu2_phy_reg NPU2_PHY_TX_DRV_DATA_PATTERN_GCRMSG = {0x309, 50, 4};
+
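+/*
+ * Build the indirect SCOM address for a PHY register: the register offset
+ * is shifted into the upper address bits and OR'd into the PHY SCOM base,
+ * while the lane number (for per-lane registers, offset < 0x200) goes in
+ * bits 27-31.
+ */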
+#define NPU2_PHY_REG(scom_base, reg, lane) \
+ SETFIELD(PPC_BITMASK(27, 31), ((reg)->offset << 42) | scom_base, lane)
+
+#define NPU2_MAX_PHY_LANE 23
+
+/* This is a bit of a gross hack but it does the job */
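+/*
+ * lane_mask is MSB-first: bit (NPU2_MAX_PHY_LANE - lane) selects lane
+ * "lane", so e.g. a mask of 0x00ffff00 iterates over lanes 0-15.
+ */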
+#define FOR_EACH_LANE(ndev, lane) \
+ for (lane = 0; lane <= NPU2_MAX_PHY_LANE; lane++) \
+ if (!(ndev->lane_mask & (1 << (NPU2_MAX_PHY_LANE - lane)))) \
+ continue; \
+ else
+
+typedef uint32_t (*step)(struct npu2_dev *);
+
+struct procedure {
+ const char *name;
+ step steps[];
+};
+
+#define DEFINE_PROCEDURE(NAME, STEPS...) \
+ static struct procedure procedure_##NAME = \
+ {.name = #NAME, .steps = {NAME, ##STEPS}}
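+
+/*
+ * e.g. DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete)
+ * defines procedure_phy_reset; get_procedure_status() runs the steps in
+ * order, advancing whenever a step returns PROCEDURE_NEXT.
+ */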
+
+#define PROCEDURE_INPROGRESS (1 << 31)
+#define PROCEDURE_COMPLETE (1 << 30)
+#define PROCEDURE_NEXT (1 << 29)
+#define PROCEDURE_FAILED 2
+#define PROCEDURE_ABORTED 3
+#define PROCEDURE_UNSUPPORTED 4
+
+/* Mask defining which status bits we want to expose */
+#define PROCEDURE_STATUS_MASK 0xc000000f
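+/*
+ * e.g. a failed step ends up as PROCEDURE_COMPLETE | PROCEDURE_FAILED,
+ * i.e. 0x40000002 after masking with PROCEDURE_STATUS_MASK.
+ */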
+
+static void phy_write_lane(struct npu2_dev *ndev, struct npu2_phy_reg *reg, int lane, uint64_t val)
+{
+ uint64_t old_val, reg_addr;
+ int rc;
+ uint64_t mask = PPC_BITMASK(reg->start, reg->start + reg->len - 1);
+
+ /* Check to make sure we're not trying to specify a lane to a
+ * non-per-lane register */
+ if (lane >= 0)
+ assert(reg->offset < 0x200);
+ else
+ assert(reg->offset >= 0x200);
+
+ reg_addr = NPU2_PHY_REG(ndev->pl_xscom_base, reg, lane);
+ rc = xscom_read(ndev->npu->chip_id, reg_addr, &old_val);
+ if (rc)
+ NPU2DEVERR(ndev, "error %d reading scom 0x%llx\n", rc, reg_addr);
+ val = SETFIELD(mask, old_val, val);
+ rc = xscom_write(ndev->npu->chip_id, reg_addr, val);
+ if (rc)
+ NPU2DEVERR(ndev, "error %d writing scom 0x%llx\n", rc, reg_addr);
+}
+
+static uint64_t phy_read_lane(struct npu2_dev *ndev, struct npu2_phy_reg *reg, int lane)
+{
+ uint64_t val, reg_addr;
+ int rc;
+ uint64_t mask = PPC_BITMASK(reg->start, reg->start + reg->len - 1);
+
+ /* Check to make sure we're not trying to specify a lane to a
+ * non-per-lane register */
+ if (lane >= 0)
+ assert(reg->offset < 0x200);
+ else
+ assert(reg->offset >= 0x200);
+
+ reg_addr = NPU2_PHY_REG(ndev->pl_xscom_base, reg, lane);
+ rc = xscom_read(ndev->npu->chip_id, reg_addr, &val);
+ if (rc)
+ NPU2DEVERR(ndev, "error %d reading scom 0x%llx\n", rc, reg_addr);
+
+ return GETFIELD(mask, val);
+}
+
+#define phy_write(ndev, reg, val) phy_write_lane(ndev, reg, -1, val)
+#define phy_read(ndev, reg) phy_read_lane(ndev, reg, -1)
+
+static uint32_t stop(struct npu2_dev *npu_dev __unused)
+{
+ return PROCEDURE_COMPLETE | PROCEDURE_ABORTED;
+}
+DEFINE_PROCEDURE(stop);
+
+static uint32_t nop(struct npu2_dev *npu_dev __unused)
+{
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(nop);
+
+/*
+ * Return the obus (0 or 1) of a device
+ *
+ * Using the brick index is dangerous, because it varies for a link
+ * depending on the mode (opencapi or nvlink)
+ */
+static int obus_index(struct npu2_dev *ndev)
+{
+ if ((ndev->pl_xscom_base & 0x3F000000) == 0x09000000)
+ return 0;
+ else
+ return 1;
+}
+
+/*
+ * Return the brick number (0-2) within an obus chiplet.
+ * Only valid for nvlink devices
+ */
+static int obus_brick_index(struct npu2_dev *ndev)
+{
+ int index = ndev->brick_index % 3;
+
+ assert(ndev->type != NPU2_DEV_TYPE_OPENCAPI);
+ /* On the second obus chiplet, index is reversed */
+ if ((ndev->pl_xscom_base & 0x3F000000) != 0x09000000)
+ return 2 - index;
+
+ return index;
+}
+
+static void set_iovalid(struct npu2_dev *ndev, bool raise)
+{
+ uint64_t addr, val, mask;
+ int rc;
+
+ if (ndev->type == NPU2_DEV_TYPE_OPENCAPI)
+ return;
+
+ addr = (ndev->pl_xscom_base & 0x3F000000) | 0x9;
+ mask = PPC_BIT(6 + obus_brick_index(ndev));
+ val = raise ? mask : 0;
+
+ rc = xscom_write_mask(ndev->npu->chip_id, addr, val, mask);
+ if (rc)
+ NPU2DEVERR(ndev, "error %d writing scom 0x%llx\n", rc, addr);
+}
+
+static bool poll_fence_status(struct npu2_dev *ndev, uint64_t val)
+{
+ uint64_t fs;
+ int i;
+
+ for (i = 0; i < 4096; i++) {
+ fs = npu2_read(ndev->npu, NPU2_NTL_CQ_FENCE_STATUS(ndev));
+ if ((fs & 0xc000000000000000UL) == val)
+ return true;
+ }
+
+ NPU2DEVERR(ndev, "NPU2_NTL_CQ_FENCE_STATUS timeout (0x%llx)\n", val);
+ return false;
+}
+
+/* Procedure 1.2.1 - Reset NPU/NDL */
+uint32_t reset_ntl(struct npu2_dev *ndev)
+{
+ uint64_t val, check;
+ int lane, i;
+
+ set_iovalid(ndev, true);
+
+ /* Power on clocks */
+ phy_write(ndev, &NPU2_PHY_RX_CLKDIST_PDWN, 0);
+ phy_write(ndev, &NPU2_PHY_RX_IREF_PDWN, 1);
+ phy_write(ndev, &NPU2_PHY_TX_CLKDIST_PDWN, 0);
+ phy_write(ndev, &NPU2_PHY_RX_CTL_DATASM_CLKDIST_PDWN, 0);
+
+ FOR_EACH_LANE(ndev, lane) {
+ phy_write_lane(ndev, &NPU2_PHY_RX_LANE_ANA_PDWN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_LANE_DIG_PDWN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_TX_LANE_PDWN, lane, 0);
+ }
+
+ /* Clear fence state for the brick */
+ val = npu2_read(ndev->npu, NPU2_MISC_FENCE_STATE);
+ if (val) {
+ NPU2DEVINF(ndev, "Clearing all bricks fence\n");
+ npu2_write(ndev->npu, NPU2_MISC_FENCE_STATE, val);
+ for (i = 0, check = 0; i < 4096; i++) {
+ check = npu2_read(ndev->npu, NPU2_NTL_CQ_FENCE_STATUS(ndev));
+ if (!check)
+ break;
+ }
+ if (check)
+ NPU2DEVERR(ndev, "Clearing NPU2_MISC_FENCE_STATE=0x%llx timeout, current=0x%llx\n",
+ val, check);
+ }
+
+ /* Write PRI */
+ val = SETFIELD(PPC_BITMASK(0,1), 0ull, obus_brick_index(ndev));
+ npu2_write_mask(ndev->npu, NPU2_NTL_PRI_CFG(ndev), val, -1ULL);
+
+ val = NPU2_NTL_MISC_CFG2_NDL_RX_PARITY_ENA;
+ npu2_write_mask(ndev->npu, NPU2_NTL_MISC_CFG2(ndev), 0ull, val);
+
+ /* NTL Reset */
+ val = npu2_read(ndev->npu, NPU2_NTL_MISC_CFG1(ndev));
+ val |= PPC_BIT(8) | PPC_BIT(9);
+ npu2_write(ndev->npu, NPU2_NTL_MISC_CFG1(ndev), val);
+
+ if (!poll_fence_status(ndev, 0xc000000000000000UL))
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t reset_ndl(struct npu2_dev *ndev)
+{
+ uint64_t val;
+
+ val = npu2_read_4b(ndev->npu, NPU2_NTL_DL_CONTROL(ndev));
+ val |= PPC_BIT32(0) | PPC_BIT32(1);
+ npu2_write_4b(ndev->npu, NPU2_NTL_DL_CONTROL(ndev), val);
+
+ val = npu2_read_4b(ndev->npu, NPU2_NTL_DL_CONTROL(ndev));
+ val &= ~(PPC_BIT32(0) | PPC_BIT32(1));
+ npu2_write_4b(ndev->npu, NPU2_NTL_DL_CONTROL(ndev), val);
+
+ val = PPC_BIT32(0);
+ npu2_write_4b(ndev->npu, NPU2_NTL_DL_CONFIG(ndev), val);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t reset_ntl_release(struct npu2_dev *ndev)
+{
+ uint64_t val;
+ uint64_t npu2_fir;
+ uint64_t npu2_fir_addr;
+ int i;
+
+ /* Clear FIR bits */
+ npu2_fir_addr = NPU2_FIR_REGISTER_0;
+ npu2_fir = 0;
+
+ for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
+ xscom_write(ndev->npu->chip_id, npu2_fir_addr, npu2_fir);
+ npu2_fir_addr += NPU2_FIR_OFFSET;
+ }
+
+ val = npu2_read(ndev->npu, NPU2_NTL_MISC_CFG1(ndev));
+ val &= 0xFFBFFFFFFFFFFFFFUL;
+ npu2_write(ndev->npu, NPU2_NTL_MISC_CFG1(ndev), val);
+
+ if (!poll_fence_status(ndev, 0x8000000000000000UL))
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t reset_ntl_finish(struct npu2_dev *ndev)
+{
+ /* Credit Setup */
+ npu2_write(ndev->npu, NPU2_NTL_CRED_HDR_CREDIT_TX(ndev), 0x0200000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_PRB_HDR_CREDIT_TX(ndev), 0x0200000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_ATR_HDR_CREDIT_TX(ndev), 0x0200000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_RSP_HDR_CREDIT_TX(ndev), 0x0200000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_CRED_DATA_CREDIT_TX(ndev), 0x1000000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_RSP_DATA_CREDIT_TX(ndev), 0x1000000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_CRED_HDR_CREDIT_RX(ndev), 0x0000BE0000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_DBD_HDR_CREDIT_RX(ndev), 0x0000640000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_ATSD_HDR_CREDIT_RX(ndev), 0x0000200000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_RSP_HDR_CREDIT_RX(ndev), 0x0000BE0000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_CRED_DATA_CREDIT_RX(ndev), 0x0001000000000000UL);
+ npu2_write(ndev->npu, NPU2_NTL_RSP_DATA_CREDIT_RX(ndev), 0x0001000000000000UL);
+
+ npu2_set_link_flag(ndev, NPU2_DEV_DL_RESET);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(reset_ntl, reset_ndl, reset_ntl_release, reset_ntl_finish);
+
+/* Procedure 1.2.2 - Reset I/O PHY Lanes */
+static uint32_t phy_reset(struct npu2_dev *ndev)
+{
+ int lane;
+
+ set_iovalid(ndev, false);
+
+ /* Power on clocks */
+ phy_write(ndev, &NPU2_PHY_RX_CLKDIST_PDWN, 0);
+ phy_write(ndev, &NPU2_PHY_RX_IREF_PDWN, 1);
+ phy_write(ndev, &NPU2_PHY_TX_CLKDIST_PDWN, 0);
+ phy_write(ndev, &NPU2_PHY_RX_CTL_DATASM_CLKDIST_PDWN, 0);
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_RX_RUN_LANE, lane, 0);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_reset_wait(struct npu2_dev *ndev)
+{
+ int lane;
+
+ /* Wait for all lanes to become inactive */
+ FOR_EACH_LANE(ndev, lane)
+ if (phy_read_lane(ndev, &NPU2_PHY_RX_LANE_BUSY, lane))
+ return PROCEDURE_INPROGRESS;
+
+ FOR_EACH_LANE(ndev, lane) {
+ /* Set lane in reset */
+ phy_write_lane(ndev, &NPU2_PHY_RX_IORESET, lane, 1);
+ phy_write_lane(ndev, &NPU2_PHY_TX_IORESET, lane, 1);
+
+ /* Release lane from reset */
+ phy_write_lane(ndev, &NPU2_PHY_RX_IORESET, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_TX_IORESET, lane, 0);
+
+ /* Reset the phase rotator */
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_RESET, lane, 1);
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_RESET, lane, 0);
+ }
+
+ return PROCEDURE_NEXT;
+}
+
+/* Procedure 1.2.3 - Initialise I/O PHY Registers */
+static uint32_t phy_reset_complete(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane) {
+ phy_write_lane(ndev, &NPU2_PHY_RX_LANE_ANA_PDWN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_LANE_DIG_PDWN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_IQ_RES_SEL, lane, 0x7);
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_PHASE_STEP, lane, 0xc);
+ phy_write_lane(ndev, &NPU2_PHY_TX_LANE_PDWN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_FW_INERTIA_AMT, lane, 4);
+ phy_write_lane(ndev, &NPU2_PHY_RX_CFG_LTE_MC, lane, 3);
+ phy_write_lane(ndev, &NPU2_PHY_RX_A_INTEG_COARSE_GAIN, lane, 11);
+ phy_write_lane(ndev, &NPU2_PHY_RX_B_INTEG_COARSE_GAIN, lane, 11);
+ phy_write_lane(ndev, &NPU2_PHY_RX_E_INTEG_COARSE_GAIN, lane, 11);
+
+ if (ndev->type == NPU2_DEV_TYPE_OPENCAPI) {
+ phy_write_lane(ndev, &NPU2_PHY_RX_A_CTLE_GAIN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_B_CTLE_GAIN, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_E_CTLE_GAIN, lane, 0);
+
+ phy_write_lane(ndev, &NPU2_PHY_RX_A_CTLE_COARSE, lane, 20);
+ phy_write_lane(ndev, &NPU2_PHY_RX_B_CTLE_COARSE, lane, 20);
+ phy_write_lane(ndev, &NPU2_PHY_RX_E_CTLE_COARSE, lane, 20);
+ }
+ }
+
+ set_iovalid(ndev, true);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete);
+
+/* Procedure 1.2.6 - I/O PHY Tx Impedance Calibration */
+static uint32_t phy_tx_zcal(struct npu2_dev *ndev)
+{
+ if (ndev->npu->tx_zcal_complete[obus_index(ndev)])
+ return PROCEDURE_COMPLETE;
+
+ /* Turn off SW enable and enable zcal state machine */
+ phy_write(ndev, &NPU2_PHY_TX_ZCAL_SWO_EN, 0);
+
+ /* Start impedance calibration state machine */
+ phy_write(ndev, &NPU2_PHY_TX_ZCAL_REQ, 1);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_tx_zcal_wait(struct npu2_dev *ndev)
+{
+ int done, error;
+
+ done = phy_read(ndev, &NPU2_PHY_TX_ZCAL_DONE);
+ error = phy_read(ndev, &NPU2_PHY_TX_ZCAL_ERROR);
+
+ /* We have never seen this in the field and it is not expected.
+ * Therefore it's best to error out, which will complain loudly. Nominal
+ * values may be set in nvram to ignore this error. */
+ if (error && nv_zcal_nominal < 0) {
+ NPU2DEVERR(ndev, "ZCAL failed. Nominal values may be used by"
+ " setting nvram variable nv_zcal_override = 50\n");
+ NPU2DEVERR(ndev, "However this may impact link performance\n");
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+ }
+
+ if (!done)
+ return PROCEDURE_INPROGRESS;
+
+ return PROCEDURE_NEXT;
+}
+
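+/*
+ * The *_X2_* limits below (and ZCAL_MIN/ZCAL_MAX) are expressed in 2r
+ * (half-segment) units; therm_with_half() converts such "2r equivalent"
+ * values back into thermometer-coded enable masks.
+ */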
+#define MARGIN_RATIO (0)
+#define FFE_PRE_COEFF (0)
+#define FFE_POST_COEFF (0)
+
+#define PRE_WIDTH (5)
+#define POST_WIDTH (7)
+#define MAIN_WIDTH (7)
+#define ZCAL_MIN (16 * 2)
+#define ZCAL_MAX (33 * 2)
+#define PRECURSOR_X2_MAX (4 * 2 + 1)
+#define POSTCURSOR_X2_MAX (6 * 2 + 1)
+#define MARGIN_X2_MAX (8 * 2)
+#define MAIN_X2_MAX ((6 * 2) + 1)
+#define TOTAL_X2_MAX (PRECURSOR_X2_MAX + POSTCURSOR_X2_MAX + 2*MARGIN_X2_MAX + MAIN_X2_MAX)
+
+static uint32_t therm(uint32_t dec)
+{
+ return ((0x1 << dec) - 1);
+}
+
+static uint32_t therm_with_half(uint32_t dec, uint8_t width)
+{
+ /* If the LSB of the 2r equivalent is on, then we need to set the 2r bit (MSB) */
+ uint32_t half_on = ( dec & 0x1 ) << ( width - 1 );
+
+ /* Shift the 2r equivalent to a 1r value and convert to a thermometer code. */
+ uint32_t x1_equiv = ((1 << (dec >> 1 )) - 1);
+
+ /* Combine 1r equivalent thermometer code + the 2r MSB value. */
+ return half_on | x1_equiv;
+}
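+
+/*
+ * Worked example: therm(3) = 0b111; therm_with_half(7, 4) keeps the 2r LSB
+ * as the MSB (1 << 3 = 0b1000) and adds therm(7 >> 1) = 0b111, giving 0b1111.
+ */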
+
+static uint32_t phy_tx_zcal_calculate(struct npu2_dev *ndev)
+{
+ int p_value, n_value;
+ int ffe_pre_coeff = FFE_PRE_COEFF;
+ int ffe_post_coeff = FFE_POST_COEFF;
+ uint32_t zcal_n;
+ uint32_t zcal_p;
+ uint32_t p_main_enable = MAIN_X2_MAX;
+ uint32_t p_margin_pu_enable = MARGIN_X2_MAX;
+ uint32_t p_margin_pd_enable = MARGIN_X2_MAX;
+ uint32_t p_precursor_select;
+ uint32_t p_postcursor_select;
+ uint32_t margin_pu_select;
+ uint32_t n_main_enable = MAIN_X2_MAX;
+ uint32_t n_margin_pu_enable = MARGIN_X2_MAX;
+ uint32_t n_margin_pd_enable = MARGIN_X2_MAX;
+ uint32_t n_precursor_select;
+ uint32_t n_postcursor_select;
+ uint32_t margin_pd_select;
+ uint32_t margin_select;
+
+ if (nv_zcal_nominal < 0) {
+ /* Convert the value from 8R to 2R by / 4 */
+ zcal_n = phy_read(ndev, &NPU2_PHY_TX_ZCAL_N) / 4;
+ zcal_p = phy_read(ndev, &NPU2_PHY_TX_ZCAL_P) / 4;
+ } else {
+ zcal_n = zcal_p = nv_zcal_nominal;
+ NPU2DEVINF(ndev, "Using nominal values for zcal, performance may be impacted\n");
+ }
+
+ /* Again, if the hardware detects an unexpected condition it's
+ * better just to fail loudly. */
+ if ((zcal_n < ZCAL_MIN) || (zcal_n > ZCAL_MAX) ||
+ (zcal_p < ZCAL_MIN) || (zcal_p > ZCAL_MAX))
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ if (ndev->type == NPU2_DEV_TYPE_OPENCAPI &&
+ platform.ocapi->phy_setup) {
+ ffe_pre_coeff = platform.ocapi->phy_setup->tx_ffe_pre_coeff;
+ ffe_post_coeff = platform.ocapi->phy_setup->tx_ffe_post_coeff;
+ }
+
+ p_value = zcal_p - TOTAL_X2_MAX;
+ p_precursor_select = (p_value * ffe_pre_coeff)/128;
+ p_postcursor_select = (p_value * ffe_post_coeff)/128;
+ margin_pu_select = (p_value * MARGIN_RATIO)/256;
+
+ if (p_value % 2) {
+ p_main_enable--;
+ p_value++;
+ }
+
+ while (p_value < 0) {
+ if (p_main_enable > 1) {
+ p_main_enable -= 2;
+ } else if ((p_margin_pu_enable + p_margin_pd_enable) > 0) {
+ if (p_margin_pu_enable == p_margin_pd_enable)
+ p_margin_pd_enable -= 2;
+ else
+ p_margin_pu_enable -= 2;
+ }
+ p_value += 2;
+ }
+
+ n_value = zcal_n - TOTAL_X2_MAX;
+ n_precursor_select = (n_value * ffe_pre_coeff)/128;
+ n_postcursor_select = (n_value * ffe_post_coeff)/128;
+ margin_pd_select = (p_value * MARGIN_RATIO)/256;
+
+ if (n_value % 2) {
+ n_main_enable--;
+ n_value++;
+ }
+
+ while (n_value < 0) {
+ if (n_main_enable > 1) {
+ n_main_enable -= 2;
+ } else if ((n_margin_pu_enable + n_margin_pd_enable) > 0) {
+ if (n_margin_pu_enable == n_margin_pd_enable)
+ n_margin_pd_enable -= 2;
+ else
+ n_margin_pu_enable -= 2;
+ }
+ n_value += 2;
+ }
+
+ margin_select = therm((margin_pu_select + 1)/2) &
+ therm((margin_pd_select + 1)/2) &
+ therm((p_margin_pu_enable + 1)/2) &
+ therm((p_margin_pd_enable + 1)/2) &
+ therm((n_margin_pu_enable + 1)/2) &
+ therm((n_margin_pd_enable + 1)/2);
+
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_PRE_EN, therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_PRE_SELECT, therm_with_half(p_precursor_select, PRE_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_POST_EN, therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_POST_SELECT, therm_with_half(p_postcursor_select, POST_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_MARGINPU_EN, therm((p_margin_pu_enable + 1)/2));
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_MARGINPD_EN, therm((p_margin_pd_enable + 1)/2));
+ phy_write(ndev, &NPU2_PHY_TX_PSEG_MAIN_EN, therm_with_half(p_main_enable, MAIN_WIDTH));
+
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_PRE_EN, therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_PRE_SELECT, therm_with_half(n_precursor_select, PRE_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_POST_EN, therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_POST_SELECT, therm_with_half(n_postcursor_select, POST_WIDTH));
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_MARGINPU_EN, therm((n_margin_pu_enable + 1)/2));
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_MARGINPD_EN, therm((n_margin_pd_enable + 1)/2));
+ phy_write(ndev, &NPU2_PHY_TX_NSEG_MAIN_EN, therm_with_half(n_main_enable, MAIN_WIDTH));
+
+ phy_write(ndev, &NPU2_PHY_TX_MARGINPU_SELECT, therm(margin_select + 1)/2);
+ phy_write(ndev, &NPU2_PHY_TX_MARGINPD_SELECT, therm(margin_select + 1)/2);
+
+ ndev->npu->tx_zcal_complete[obus_index(ndev)] = 1;
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate);
+
+/* Procedure 1.2.8 - Enable Downstream Link Training */
+static uint32_t phy_enable_tx_rxcal(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_TX_RXCAL, lane, 1);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_enable_tx_rxcal);
+
+/* Procedure 1.2.9 - Disable Downstream Link Training */
+static uint32_t phy_disable_tx_rxcal(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_TX_RXCAL, lane, 0);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_disable_tx_rxcal);
+
+/* Procedure 1.2.4 - I/O PHY DC Calibration */
+static uint32_t phy_rx_dccal(struct npu2_dev *ndev)
+{
+ int lane;
+
+ set_iovalid(ndev, false);
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_FW_OFF, lane, 1);
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_RX_RUN_DCCAL, lane, 1);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_dccal_complete(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane)
+ if (!phy_read_lane(ndev, &NPU2_PHY_RX_DCCAL_DONE, lane))
+ return PROCEDURE_INPROGRESS;
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_RX_RUN_DCCAL, lane, 0);
+
+ FOR_EACH_LANE(ndev, lane) {
+ phy_write_lane(ndev, &NPU2_PHY_RX_B_BANK_CONTROLS, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_EDGE_TRACK_CNTL, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_RX_PR_FW_OFF, lane, 0);
+ }
+
+ set_iovalid(ndev, true);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_clock_sel(struct npu2_dev *ndev)
+{
+ if (ndev->type != NPU2_DEV_TYPE_OPENCAPI) {
+ /*
+ * Change the RX clk mux control to be done by
+ * software instead of HW. This avoids glitches caused
+ * by changing the mux setting.
+ *
+ * Work around a known DL bug by doing these writes
+ * twice.
+ */
+ npu2_write_mask_4b(ndev->npu, NPU2_NTL_DL_CLK_CTRL(ndev),
+ 0x80000002, 0x80000003);
+ npu2_write_mask_4b(ndev->npu, NPU2_NTL_DL_CLK_CTRL(ndev),
+ 0x80000002, 0x80000003);
+
+ npu2_write_mask_4b(ndev->npu, NPU2_NTL_DL_CLK_CTRL(ndev),
+ 0x80000000, 0x80000003);
+ npu2_write_mask_4b(ndev->npu, NPU2_NTL_DL_CLK_CTRL(ndev),
+ 0x80000000, 0x80000003);
+ }
+ return PROCEDURE_NEXT;
+}
+
+/* Procedure 1.2.5 - IO PHY Tx FIFO Init */
+static uint32_t phy_tx_fifo_init(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane) {
+ phy_write_lane(ndev, &NPU2_PHY_TX_UNLOAD_CLK_DISABLE, lane, 0);
+ phy_write_lane(ndev, &NPU2_PHY_TX_FIFO_INIT, lane, 1);
+ phy_write_lane(ndev, &NPU2_PHY_TX_UNLOAD_CLK_DISABLE, lane, 1);
+ }
+
+ return PROCEDURE_COMPLETE;
+}
+
+/* We group TX FIFO init in here mainly because that's what was done
+ * on NVLink1 */
+DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_complete, phy_rx_clock_sel,
+ phy_tx_fifo_init);
+
+/* Procedure 1.2.7 - I/O PHY Upstream Link Training */
+static uint32_t phy_rx_training(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane)
+ phy_write_lane(ndev, &NPU2_PHY_RX_RUN_LANE, lane, 1);
+
+ return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_training_wait(struct npu2_dev *ndev)
+{
+ int lane;
+
+ FOR_EACH_LANE(ndev, lane)
+ if (!phy_read_lane(ndev, &NPU2_PHY_RX_INIT_DONE, lane))
+ return PROCEDURE_INPROGRESS;
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_rx_training, phy_rx_training_wait);
+
+static uint32_t check_credit(struct npu2_dev *ndev, uint64_t reg,
+ const char *reg_name, uint64_t expected)
+{
+ uint64_t val;
+
+ val = npu2_read(ndev->npu, reg);
+ if (val == expected)
+ return 0;
+
+ NPU2DEVERR(ndev, "%s: expected 0x%llx, read 0x%llx\n",
+ reg_name, expected, val);
+
+ return 1;
+}
+
+#define CHECK_CREDIT(ndev, reg, expected) \
+ check_credit(ndev, reg(ndev), #reg, expected);
+
+static uint32_t check_credits(struct npu2_dev *ndev)
+{
+ uint64_t val;
+
+ CHECK_CREDIT(ndev, NPU2_NTL_CRED_HDR_CREDIT_RX, 0x0BE0BE0000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_RSP_HDR_CREDIT_RX, 0x0BE0BE0000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_CRED_DATA_CREDIT_RX, 0x1001000000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_RSP_DATA_CREDIT_RX, 0x1001000000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_DBD_HDR_CREDIT_RX, 0x0640640000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_ATSD_HDR_CREDIT_RX, 0x0200200000000000ULL);
+
+ val = npu2_read(ndev->npu, NPU2_NTL_MISC_CFG1(ndev));
+ val &= 0xFF3FFFFFFFFFFFFFUL;
+ npu2_write(ndev->npu, NPU2_NTL_MISC_CFG1(ndev), val);
+
+ if (!poll_fence_status(ndev, 0x0))
+ return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+ val = NPU2_NTL_MISC_CFG2_NDL_RX_PARITY_ENA;
+ npu2_write_mask(ndev->npu, NPU2_NTL_MISC_CFG2(ndev), val, val);
+
+ return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(check_credits);
+
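+/*
+ * Indexed by the procedure number written through the vendor-specific
+ * config space (npu_dev_procedure_write) or passed to run_procedure():
+ * 4 = phy_reset, 5 = phy_tx_zcal, 6 = phy_rx_dccal, 9 = phy_rx_training,
+ * 10 = reset_ntl, 13 = check_credits.
+ */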
+static struct procedure *npu_procedures[] = {
+ &procedure_stop,
+ &procedure_nop,
+ NULL,
+ NULL,
+ &procedure_phy_reset,
+ &procedure_phy_tx_zcal,
+ &procedure_phy_rx_dccal,
+ &procedure_phy_enable_tx_rxcal,
+ &procedure_phy_disable_tx_rxcal,
+ &procedure_phy_rx_training,
+ &procedure_reset_ntl,
+
+ /* Place holders for pre-terminate and terminate procedures */
+ &procedure_nop,
+ &procedure_nop,
+ &procedure_check_credits
+};
+
+/* Run a procedure step(s) and return status */
+static uint32_t get_procedure_status(struct npu2_dev *dev)
+{
+ uint32_t result;
+ uint16_t procedure = dev->procedure_number;
+ uint16_t step = dev->procedure_step;
+ const char *name = npu_procedures[procedure]->name;
+
+ do {
+ result = npu_procedures[procedure]->steps[step](dev);
+
+ if (result & PROCEDURE_NEXT) {
+ step++;
+ NPU2DEVINF(dev, "Running procedure %s step %d\n", name, step);
+ }
+ } while (result & PROCEDURE_NEXT);
+
+ dev->procedure_step = step;
+
+ if (result & PROCEDURE_COMPLETE)
+ NPU2DEVINF(dev, "Procedure %s complete\n", name);
+ else if (mftb() > dev->procedure_tb + msecs_to_tb(1000)) {
+ NPU2DEVINF(dev, "Procedure %s timed out\n", name);
+ result = PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+ }
+
+ /* Mask off internal state bits */
+ dev->procedure_status = result & PROCEDURE_STATUS_MASK;
+
+ return dev->procedure_status;
+}
+
+static int64_t npu_dev_procedure_read(struct npu2_dev *dev, uint32_t offset,
+ uint32_t size, uint32_t *data)
+{
+ int64_t rc = OPAL_SUCCESS;
+
+ if (size != 4) {
+ /* Short config reads are not supported */
+ prlog(PR_ERR, "NPU%d: Short read of procedure register\n", npu2_dev_to_phb(dev)->opal_id);
+ return OPAL_PARAMETER;
+ }
+
+ *data = 0;
+
+ switch (offset) {
+ case 0:
+ /* Only run the procedure if not already complete */
+ if (dev->procedure_status & PROCEDURE_COMPLETE)
+ *data = dev->procedure_status;
+ else
+ *data = get_procedure_status(dev);
+
+ break;
+
+ case 4:
+ *data = dev->procedure_number;
+ break;
+
+ default:
+ prlog(PR_ERR, "NPU%d: Invalid vendor specific offset 0x%08x\n",
+ npu2_dev_to_phb(dev)->opal_id, offset);
+ rc = OPAL_PARAMETER;
+ }
+
+ return rc;
+}
+
+static int64_t npu_dev_procedure_write(struct npu2_dev *dev, uint32_t offset,
+ uint32_t size, uint32_t data)
+{
+ const char *name;
+ int64_t rc = OPAL_SUCCESS;
+
+ if (size != 4) {
+ /* Short config writes are not supported */
+ prlog(PR_ERR, "NPU%d: Short write of procedure register\n",
+ npu2_dev_to_phb(dev)->opal_id);
+ return OPAL_PARAMETER;
+ }
+
+ switch (offset) {
+ case 0:
+ /* We ignore writes to the status register */
+ NPU2DEVINF(dev, "Ignoring writes to status register\n");
+ break;
+
+ case 4:
+ if (data >= ARRAY_SIZE(npu_procedures) ||
+ !npu_procedures[data]) {
+ NPU2DEVINF(dev, "Unsupported procedure number %d\n", data);
+ dev->procedure_status = PROCEDURE_COMPLETE
+ | PROCEDURE_UNSUPPORTED;
+ break;
+ }
+
+ name = npu_procedures[data]->name;
+ if (dev->procedure_number == data
+ && !(dev->procedure_status & PROCEDURE_COMPLETE))
+ NPU2DEVINF(dev, "Restarting procedure %s\n", name);
+ else
+ NPU2DEVINF(dev, "Starting procedure %s\n", name);
+
+ dev->procedure_status = PROCEDURE_INPROGRESS;
+ dev->procedure_number = data;
+ dev->procedure_step = 0;
+ dev->procedure_tb = mftb();
+ break;
+
+ default:
+ NPU2DEVINF(dev, "Invalid vendor specific offset 0x%08x\n", offset);
+ rc = OPAL_PARAMETER;
+ }
+
+ return rc;
+}
+
+int64_t npu2_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len, uint32_t *data,
+ bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu2_dev *ndev = pvd->data;
+
+ if (write)
+ return npu_dev_procedure_write(ndev, offset - pcrf->start,
+ len, *data);
+
+ return npu_dev_procedure_read(ndev, offset - pcrf->start, len, data);
+}
+
+void npu2_dev_procedure_reset(struct npu2_dev *dev)
+{
+ uint64_t val;
+
+ /* Fence the brick */
+ val = npu2_read(dev->npu, NPU2_NTL_MISC_CFG1(dev));
+ val |= PPC_BIT(8) | PPC_BIT(9);
+ npu2_write(dev->npu, NPU2_NTL_MISC_CFG1(dev), val);
+
+ npu2_clear_link_flag(dev, NPU2_DEV_DL_RESET);
+}
+
+static uint32_t run_procedure(struct npu2_dev *dev, uint16_t procedure_number)
+{
+ struct procedure *proc;
+ const char *name;
+ uint32_t result;
+
+ assert(procedure_number < ARRAY_SIZE(npu_procedures));
+ proc = npu_procedures[procedure_number];
+ assert(proc);
+
+ name = proc->name;
+ NPU2DEVINF(dev, "Running procedure %s\n", name);
+ dev->procedure_status = PROCEDURE_INPROGRESS;
+ dev->procedure_number = procedure_number;
+ dev->procedure_step = 0;
+ dev->procedure_tb = mftb();
+
+ result = get_procedure_status(dev);
+ while (!(result & PROCEDURE_COMPLETE)) {
+ time_wait_ms(1);
+ result = get_procedure_status(dev);
+ }
+ return result;
+}
+
+void npu2_opencapi_bump_ui_lane(struct npu2_dev *dev)
+{
+ uint64_t reg;
+ uint64_t status_xscom;
+ int lane, bit = 7;
+
+ status_xscom = OB_ODL_TRAINING_STATUS(dev->brick_index);
+ xscom_read(dev->npu->chip_id, status_xscom, &reg);
+ reg = GETFIELD(OB_ODL_TRAINING_STATUS_STS_RX_PATTERN_B, reg);
+
+ FOR_EACH_LANE(dev, lane) {
+ if (reg & (1 << bit--))
+ continue;
+ prlog(PR_TRACE, "OCAPI: bumpui bumping lane %d\n", lane);
+ for (int i = 0; i < 4; i++) {
+ phy_write_lane(dev, &NPU2_PHY_RX_PR_BUMP_SL_1UI, lane, 1);
+ phy_write_lane(dev, &NPU2_PHY_RX_PR_BUMP_SL_1UI, lane, 0);
+ }
+ }
+}
+
+void npu2_opencapi_phy_init(struct npu2_dev *dev)
+{
+ if (platform.ocapi->phy_setup) {
+ OCAPIINF(dev, "Enabling platform-specific PHY setup\n");
+ phy_write(dev, &NPU2_PHY_TX_FFE_BOOST_EN,
+ platform.ocapi->phy_setup->tx_ffe_boost_en);
+ }
+
+ run_procedure(dev, 5); /* procedure_phy_tx_zcal */
+ /*
+ * This is only required for OpenCAPI - Hostboot tries to set this
+ * on systems where it can tell a link is OpenCAPI, but for
+ * Witherspoon it needs to be done in skiboot after device detection.
+ */
+ phy_write(dev, &NPU2_PHY_RX_RC_ENABLE_AUTO_RECAL, 0x1);
+ phy_write(dev, &NPU2_PHY_RX_AC_COUPLED, 1);
+
+ switch (dev->link_speed) {
+ case 20000000000UL:
+ OCAPIINF(dev, "Link speed set at 20Gb/s\n");
+ phy_write(dev, &NPU2_PHY_RX_SPEED_SELECT, 1);
+ break;
+ case 25000000000UL:
+ case 25781250000UL:
+ OCAPIINF(dev, "Link speed set at 25.xGb/s\n");
+ phy_write(dev, &NPU2_PHY_RX_SPEED_SELECT, 0);
+ break;
+ default:
+ OCAPIERR(dev, "Invalid link speed!\n");
+ assert(false);
+ }
+}
+
+int npu2_opencapi_phy_reset(struct npu2_dev *dev)
+{
+ int rc;
+
+ rc = run_procedure(dev, 4); /* procedure_phy_reset */
+ if (rc != PROCEDURE_COMPLETE)
+ return -1;
+ rc = run_procedure(dev, 6); /* procedure_phy_rx_dccal */
+ if (rc != PROCEDURE_COMPLETE)
+ return -1;
+ return 0;
+}
+
+void npu2_opencapi_phy_prbs31(struct npu2_dev *dev)
+{
+ phy_write(dev, &NPU2_PHY_TX_DRV_DATA_PATTERN_GCRMSG, 0xD);
+}
diff --git a/roms/skiboot/hw/npu2-opencapi.c b/roms/skiboot/hw/npu2-opencapi.c
new file mode 100644
index 000000000..035c6cdc3
--- /dev/null
+++ b/roms/skiboot/hw/npu2-opencapi.c
@@ -0,0 +1,2370 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Support for OpenCAPI on POWER9 NPUs
+ *
+ * This file provides support for OpenCAPI as implemented on POWER9.
+ *
+ * At present, we initialise the NPU separately from the NVLink code in npu2.c.
+ * As such, we don't currently support mixed NVLink and OpenCAPI configurations
+ * on the same NPU for machines such as Witherspoon.
+ *
+ * Procedure references in this file are to the POWER9 OpenCAPI NPU Workbook
+ * (IBM internal document).
+ *
+ * TODO:
+ * - Support for mixed NVLink and OpenCAPI on the same NPU
+ * - Support for link ganging (one AFU using multiple links)
+ * - Link reset and error handling
+ * - Presence detection
+ * - Consume HDAT NPU information
+ * - LPC Memory support
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <pci-slot.h>
+#include <interrupts.h>
+#include <opal.h>
+#include <opal-api.h>
+#include <npu2.h>
+#include <npu2-regs.h>
+#include <phys-map.h>
+#include <i2c.h>
+#include <nvram.h>
+
+#define NPU_IRQ_LEVELS_XSL 23
+#define MAX_PE_HANDLE ((1 << 15) - 1)
+#define TL_MAX_TEMPLATE 63
+#define TL_RATE_BUF_SIZE 32
+
+#define OCAPI_SLOT_NORMAL PCI_SLOT_STATE_NORMAL
+#define OCAPI_SLOT_LINK PCI_SLOT_STATE_LINK
+#define OCAPI_SLOT_LINK_START (OCAPI_SLOT_LINK + 1)
+#define OCAPI_SLOT_LINK_WAIT (OCAPI_SLOT_LINK + 2)
+#define OCAPI_SLOT_LINK_TRAINED (OCAPI_SLOT_LINK + 3)
+#define OCAPI_SLOT_FRESET PCI_SLOT_STATE_FRESET
+#define OCAPI_SLOT_FRESET_START (OCAPI_SLOT_FRESET + 1)
+#define OCAPI_SLOT_FRESET_INIT (OCAPI_SLOT_FRESET + 2)
+#define OCAPI_SLOT_FRESET_ASSERT_DELAY (OCAPI_SLOT_FRESET + 3)
+#define OCAPI_SLOT_FRESET_DEASSERT_DELAY (OCAPI_SLOT_FRESET + 4)
+#define OCAPI_SLOT_FRESET_INIT_DELAY (OCAPI_SLOT_FRESET + 5)
+
+#define OCAPI_LINK_TRAINING_RETRIES 2
+#define OCAPI_LINK_TRAINING_TIMEOUT 3000 /* ms */
+#define OCAPI_LINK_STATE_TRAINED 0x7
+
+enum npu2_link_training_state {
+ NPU2_TRAIN_DEFAULT, /* fully train the link */
+ NPU2_TRAIN_PRBS31, /* used for Signal Integrity testing */
+ NPU2_TRAIN_NONE, /* used for testing with loopback cable */
+};
+static enum npu2_link_training_state npu2_ocapi_training_state = NPU2_TRAIN_DEFAULT;
+
+static const struct phb_ops npu2_opencapi_ops;
+
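+/*
+ * OpenCAPI link indexes 2/3 sit on NPU stack 1 and 4/5 on stack 2; within
+ * a stack, the even-numbered link uses OTL0 and the odd-numbered one OTL1.
+ */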
+static inline uint64_t index_to_stack(uint64_t index) {
+ switch (index) {
+ case 2:
+ case 3:
+ return NPU2_STACK_STCK_1;
+ break;
+ case 4:
+ case 5:
+ return NPU2_STACK_STCK_2;
+ break;
+ default:
+ assert(false);
+ }
+}
+
+static inline uint64_t index_to_stacku(uint64_t index) {
+ switch (index) {
+ case 2:
+ case 3:
+ return NPU2_STACK_STCK_1U;
+ break;
+ case 4:
+ case 5:
+ return NPU2_STACK_STCK_2U;
+ break;
+ default:
+ assert(false);
+ }
+}
+
+static inline uint64_t index_to_block(uint64_t index) {
+ switch (index) {
+ case 2:
+ case 4:
+ return NPU2_BLOCK_OTL0;
+ break;
+ case 3:
+ case 5:
+ return NPU2_BLOCK_OTL1;
+ break;
+ default:
+ assert(false);
+ }
+}
+
+static uint64_t get_odl_status(uint32_t gcid, uint64_t index)
+{
+ uint64_t reg, status_xscom;
+
+ status_xscom = OB_ODL_STATUS(index);
+ xscom_read(gcid, status_xscom, &reg);
+ return reg;
+}
+
+static uint64_t get_odl_training_status(uint32_t gcid, uint64_t index)
+{
+ uint64_t status_xscom, reg;
+
+ status_xscom = OB_ODL_TRAINING_STATUS(index);
+ xscom_read(gcid, status_xscom, &reg);
+ return reg;
+}
+
+static uint64_t get_odl_endpoint_info(uint32_t gcid, uint64_t index)
+{
+ uint64_t status_xscom, reg;
+
+ status_xscom = OB_ODL_ENDPOINT_INFO(index);
+ xscom_read(gcid, status_xscom, &reg);
+ return reg;
+}
+
+static void disable_nvlink(uint32_t gcid, int index)
+{
+ uint64_t phy_config_scom, reg;
+
+ switch (index) {
+ case 2:
+ case 3:
+ phy_config_scom = OBUS_LL0_IOOL_PHY_CONFIG;
+ break;
+ case 4:
+ case 5:
+ phy_config_scom = OBUS_LL3_IOOL_PHY_CONFIG;
+ break;
+ default:
+ assert(false);
+ }
+ /* Disable NV-Link link layers */
+ xscom_read(gcid, phy_config_scom, &reg);
+ reg &= ~OBUS_IOOL_PHY_CONFIG_NV0_NPU_ENABLED;
+ reg &= ~OBUS_IOOL_PHY_CONFIG_NV1_NPU_ENABLED;
+ reg &= ~OBUS_IOOL_PHY_CONFIG_NV2_NPU_ENABLED;
+ xscom_write(gcid, phy_config_scom, reg);
+}
+
+/* Procedure 13.1.3.1 - select OCAPI vs NVLink for bricks 2-3/4-5 */
+
+static void set_transport_mux_controls(uint32_t gcid, uint32_t scom_base,
+ int index, enum npu2_dev_type type)
+{
+ /* Step 1 - Set Transport MUX controls to select correct OTL or NTL */
+ uint64_t reg;
+ uint64_t field;
+
+ /* TODO: Rework this to select for NVLink too */
+ assert(type == NPU2_DEV_TYPE_OPENCAPI);
+
+ prlog(PR_DEBUG, "OCAPI: %s: Setting transport mux controls\n", __func__);
+
+ /* Optical IO Transport Mux Config for Bricks 0-2 and 4-5 */
+ reg = npu2_scom_read(gcid, scom_base, NPU2_MISC_OPTICAL_IO_CFG0,
+ NPU2_MISC_DA_LEN_8B);
+ switch (index) {
+ case 0:
+ case 1:
+ /* not valid for OpenCAPI */
+ assert(false);
+ break;
+ case 2: /* OTL1.0 */
+ field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_NDLMUX_BRK0TO2, reg);
+ field &= ~0b100;
+ reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_NDLMUX_BRK0TO2, reg,
+ field);
+ field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK0TO1, reg);
+ field |= 0b10;
+ reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK0TO1, reg,
+ field);
+ break;
+ case 3: /* OTL1.1 */
+ field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_NDLMUX_BRK0TO2, reg);
+ field &= ~0b010;
+ reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_NDLMUX_BRK0TO2, reg,
+ field);
+ field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK0TO1, reg);
+ field |= 0b01;
+ reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK0TO1, reg,
+ field);
+ break;
+ case 4: /* OTL2.0 */
+ field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK4TO5, reg);
+ field |= 0b10;
+ reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK4TO5, reg,
+ field);
+ break;
+ case 5: /* OTL2.1 */
+ field = GETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK4TO5, reg);
+ field |= 0b01;
+ reg = SETFIELD(NPU2_MISC_OPTICAL_IO_CFG0_OCMUX_BRK4TO5, reg,
+ field);
+ break;
+ default:
+ assert(false);
+ }
+ npu2_scom_write(gcid, scom_base, NPU2_MISC_OPTICAL_IO_CFG0,
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /*
+ * PowerBus Optical Miscellaneous Config Register - select
+ * OpenCAPI for b4/5 and A-Link for b3
+ */
+ xscom_read(gcid, PU_IOE_PB_MISC_CFG, &reg);
+ switch (index) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ break;
+ case 4:
+ reg = SETFIELD(PU_IOE_PB_MISC_CFG_SEL_04_NPU_NOT_PB, reg, 1);
+ break;
+ case 5:
+ reg = SETFIELD(PU_IOE_PB_MISC_CFG_SEL_05_NPU_NOT_PB, reg, 1);
+ break;
+ }
+ xscom_write(gcid, PU_IOE_PB_MISC_CFG, reg);
+}
+
+static void assert_odl_reset(uint32_t gcid, int index)
+{
+ uint64_t reg, config_xscom;
+
+ config_xscom = OB_ODL_CONFIG(index);
+ /* Reset ODL */
+ reg = OB_ODL_CONFIG_RESET;
+ reg = SETFIELD(OB_ODL_CONFIG_VERSION, reg, 0b000001);
+ reg = SETFIELD(OB_ODL_CONFIG_TRAIN_MODE, reg, 0b0110);
+ reg = SETFIELD(OB_ODL_CONFIG_SUPPORTED_MODES, reg, 0b0010);
+ reg |= OB_ODL_CONFIG_X4_BACKOFF_ENABLE;
+ reg = SETFIELD(OB_ODL_CONFIG_PHY_CNTR_LIMIT, reg, 0b1111);
+ reg |= OB_ODL_CONFIG_DEBUG_ENABLE;
+ reg = SETFIELD(OB_ODL_CONFIG_FWD_PROGRESS_TIMER, reg, 0b0110);
+ xscom_write(gcid, config_xscom, reg);
+}
+
+static void deassert_odl_reset(uint32_t gcid, int index)
+{
+ uint64_t reg, config_xscom;
+
+ config_xscom = OB_ODL_CONFIG(index);
+ xscom_read(gcid, config_xscom, &reg);
+ reg &= ~OB_ODL_CONFIG_RESET;
+ xscom_write(gcid, config_xscom, reg);
+}
+
+static void enable_odl_phy_mux(uint32_t gcid, int index)
+{
+ uint64_t reg;
+ uint64_t phy_config_scom;
+ prlog(PR_DEBUG, "OCAPI: %s: Enabling ODL to PHY MUXes\n", __func__);
+ /* Step 2 - Enable MUXes for ODL to PHY connection */
+ switch (index) {
+ case 2:
+ case 3:
+ phy_config_scom = OBUS_LL0_IOOL_PHY_CONFIG;
+ break;
+ case 4:
+ case 5:
+ phy_config_scom = OBUS_LL3_IOOL_PHY_CONFIG;
+ break;
+ default:
+ assert(false);
+ }
+
+ /*
+ * ODL must be in reset when enabling.
+ * It stays in reset until the link is trained
+ */
+ assert_odl_reset(gcid, index);
+
+ /* PowerBus OLL PHY Training Config Register */
+ xscom_read(gcid, phy_config_scom, &reg);
+
+ /*
+ * Enable ODL to use shared PHYs
+ *
+ * On obus3, OTL0 is connected to ODL1 (and OTL1 to ODL0), so
+ * even though it may look odd at first, we do want to enable ODL0
+ * for links 2 and 5.
+ */
+ switch (index) {
+ case 2:
+ case 5:
+ reg |= OBUS_IOOL_PHY_CONFIG_ODL0_ENABLED;
+ break;
+ case 3:
+ case 4:
+ reg |= OBUS_IOOL_PHY_CONFIG_ODL1_ENABLED;
+ break;
+ }
+
+ /*
+ * Based on the platform, we may have to activate an extra mux
+ * to connect the ODL to the right set of lanes.
+ *
+ * FIXME: to be checked once we have merged with nvlink
+ * code. Need to verify that it's a platform parameter and not
+ * slot-dependent
+ */
+ if (platform.ocapi->odl_phy_swap)
+ reg |= OBUS_IOOL_PHY_CONFIG_ODL_PHY_SWAP;
+ else
+ reg &= ~OBUS_IOOL_PHY_CONFIG_ODL_PHY_SWAP;
+
+ /* Disable A-Link link layers */
+ reg &= ~OBUS_IOOL_PHY_CONFIG_LINK0_OLL_ENABLED;
+ reg &= ~OBUS_IOOL_PHY_CONFIG_LINK1_OLL_ENABLED;
+
+ xscom_write(gcid, phy_config_scom, reg);
+}
+
+static void disable_alink_fp(uint32_t gcid)
+{
+ uint64_t reg = 0;
+
+ prlog(PR_DEBUG, "OCAPI: %s: Disabling A-Link framer/parsers\n", __func__);
+ /* Step 3 - Disable A-Link framers/parsers */
+ /* TODO: Confirm if needed on OPAL system */
+
+ reg |= PU_IOE_PB_FP_CFG_FP0_FMR_DISABLE;
+ reg |= PU_IOE_PB_FP_CFG_FP0_PRS_DISABLE;
+ reg |= PU_IOE_PB_FP_CFG_FP1_FMR_DISABLE;
+ reg |= PU_IOE_PB_FP_CFG_FP1_PRS_DISABLE;
+ xscom_write(gcid, PU_IOE_PB_FP01_CFG, reg);
+ xscom_write(gcid, PU_IOE_PB_FP23_CFG, reg);
+ xscom_write(gcid, PU_IOE_PB_FP45_CFG, reg);
+ xscom_write(gcid, PU_IOE_PB_FP67_CFG, reg);
+}
+
+static void enable_xsl_clocks(uint32_t gcid, uint32_t scom_base, int index)
+{
+ /* Step 5 - Enable Clocks in XSL */
+
+ prlog(PR_DEBUG, "OCAPI: %s: Enable clocks in XSL\n", __func__);
+
+ npu2_scom_write(gcid, scom_base, NPU2_REG_OFFSET(index_to_stack(index),
+ NPU2_BLOCK_XSL,
+ NPU2_XSL_WRAP_CFG),
+ NPU2_MISC_DA_LEN_8B, NPU2_XSL_WRAP_CFG_XSLO_CLOCK_ENABLE);
+}
+
+#define CQ_CTL_STATUS_TIMEOUT 10 /* milliseconds */
+
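+/*
+ * Request a new fence state for a brick and wait for the CQ_CTL
+ * status register to report it, e.g. 0b11 fences the brick ("NPU
+ * Fenced"), 0b10 half-fences it and 0b00 lifts the fence (see
+ * set_npcq_config(), fence_brick() and unfence_brick()).
+ */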
+static int set_fence_control(uint32_t gcid, uint32_t scom_base,
+ int index, uint8_t status)
+{
+ int stack, block;
+ uint64_t reg, status_field;
+ uint8_t status_val;
+ uint64_t fence_control;
+ uint64_t timeout = mftb() + msecs_to_tb(CQ_CTL_STATUS_TIMEOUT);
+
+ stack = index_to_stack(index);
+ block = index_to_block(index);
+
+ fence_control = NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL,
+ block == NPU2_BLOCK_OTL0 ?
+ NPU2_CQ_CTL_FENCE_CONTROL_0 :
+ NPU2_CQ_CTL_FENCE_CONTROL_1);
+
+ reg = SETFIELD(NPU2_CQ_CTL_FENCE_CONTROL_REQUEST_FENCE, 0ull, status);
+ npu2_scom_write(gcid, scom_base, fence_control,
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /* Wait for fence status to update */
+ if (index_to_block(index) == NPU2_BLOCK_OTL0)
+ status_field = NPU2_CQ_CTL_STATUS_BRK0_AM_FENCED;
+ else
+ status_field = NPU2_CQ_CTL_STATUS_BRK1_AM_FENCED;
+
+ do {
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(index_to_stack(index),
+ NPU2_BLOCK_CTL,
+ NPU2_CQ_CTL_STATUS),
+ NPU2_MISC_DA_LEN_8B);
+ status_val = GETFIELD(status_field, reg);
+ if (status_val == status)
+ return OPAL_SUCCESS;
+ time_wait_ms(1);
+ } while (tb_compare(mftb(), timeout) == TB_ABEFOREB);
+
+ /**
+ * @fwts-label OCAPIFenceStatusTimeout
+ * @fwts-advice The NPU fence status did not update as expected. This
+ * could be the result of a firmware or hardware bug. OpenCAPI
+ * functionality could be broken.
+ */
+ prlog(PR_ERR,
+ "OCAPI: Fence status for brick %d stuck: expected 0x%x, got 0x%x\n",
+ index, status, status_val);
+ return OPAL_HARDWARE;
+}
+
+static void set_npcq_config(uint32_t gcid, uint32_t scom_base, int index)
+{
+ uint64_t reg, stack, block;
+
+ prlog(PR_DEBUG, "OCAPI: %s: Set NPCQ Config\n", __func__);
+ /* Step 6 - Set NPCQ configuration */
+ /* CQ_CTL Misc Config Register #0 */
+ stack = index_to_stack(index);
+ block = index_to_block(index);
+
+ /* Enable OTL */
+ npu2_scom_write(gcid, scom_base, NPU2_OTL_CONFIG0(stack, block),
+ NPU2_MISC_DA_LEN_8B, NPU2_OTL_CONFIG0_EN);
+ set_fence_control(gcid, scom_base, index, 0b01);
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL,
+ NPU2_CQ_CTL_MISC_CFG),
+ NPU2_MISC_DA_LEN_8B);
+ /* Set OCAPI mode */
+ reg |= NPU2_CQ_CTL_MISC_CFG_CONFIG_OCAPI_MODE;
+ if (block == NPU2_BLOCK_OTL0)
+ reg |= NPU2_CQ_CTL_MISC_CFG_CONFIG_OTL0_ENABLE;
+ else
+ reg |= NPU2_CQ_CTL_MISC_CFG_CONFIG_OTL1_ENABLE;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL,
+ NPU2_CQ_CTL_MISC_CFG),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /* NPU Fenced */
+ set_fence_control(gcid, scom_base, index, 0b11);
+
+ /* NPU Half Fenced */
+ set_fence_control(gcid, scom_base, index, 0b10);
+
+ /* CQ_DAT Misc Config Register #1 */
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_DAT,
+ NPU2_CQ_DAT_MISC_CFG),
+ NPU2_MISC_DA_LEN_8B);
+ /* Set OCAPI mode for bricks 2-5 */
+ reg |= NPU2_CQ_DAT_MISC_CFG_CONFIG_OCAPI_MODE;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_DAT,
+ NPU2_CQ_DAT_MISC_CFG),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /* CQ_SM Misc Config Register #0 */
+ for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, block,
+ NPU2_CQ_SM_MISC_CFG0),
+ NPU2_MISC_DA_LEN_8B);
+ /* Set OCAPI mode for bricks 2-5 */
+ reg |= NPU2_CQ_SM_MISC_CFG0_CONFIG_OCAPI_MODE;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, block,
+ NPU2_CQ_SM_MISC_CFG0),
+ NPU2_MISC_DA_LEN_8B, reg);
+ }
+}
+
+static void enable_xsl_xts_interfaces(uint32_t gcid, uint32_t scom_base, int index)
+{
+ uint64_t reg;
+
+ prlog(PR_DEBUG, "OCAPI: %s: Enable XSL-XTS Interfaces\n", __func__);
+ /* Step 7 - Enable XSL-XTS interfaces */
+ /* XTS Config Register - Enable XSL-XTS interface */
+ reg = npu2_scom_read(gcid, scom_base, NPU2_XTS_CFG, NPU2_MISC_DA_LEN_8B);
+ reg |= NPU2_XTS_CFG_OPENCAPI;
+ npu2_scom_write(gcid, scom_base, NPU2_XTS_CFG, NPU2_MISC_DA_LEN_8B, reg);
+
+ /* XTS Config2 Register - Enable XSL1/2 */
+ reg = npu2_scom_read(gcid, scom_base, NPU2_XTS_CFG2, NPU2_MISC_DA_LEN_8B);
+ switch (index_to_stack(index)) {
+ case NPU2_STACK_STCK_1:
+ reg |= NPU2_XTS_CFG2_XSL1_ENA;
+ break;
+ case NPU2_STACK_STCK_2:
+ reg |= NPU2_XTS_CFG2_XSL2_ENA;
+ break;
+ }
+ npu2_scom_write(gcid, scom_base, NPU2_XTS_CFG2, NPU2_MISC_DA_LEN_8B, reg);
+}
+
+static void enable_sm_allocation(uint32_t gcid, uint32_t scom_base, int index)
+{
+ uint64_t reg, block;
+ int stack = index_to_stack(index);
+
+ prlog(PR_DEBUG, "OCAPI: %s: Enable State Machine Allocation\n", __func__);
+ /* Step 8 - Enable state-machine allocation */
+ /* Low-Water Marks Registers - Enable state machine allocation */
+ for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, block,
+ NPU2_LOW_WATER_MARKS),
+ NPU2_MISC_DA_LEN_8B);
+ reg |= NPU2_LOW_WATER_MARKS_ENABLE_MACHINE_ALLOC;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, block,
+ NPU2_LOW_WATER_MARKS),
+ NPU2_MISC_DA_LEN_8B, reg);
+ }
+}
+
+static void enable_pb_snooping(uint32_t gcid, uint32_t scom_base, int index)
+{
+ uint64_t reg, block;
+ int stack = index_to_stack(index);
+
+ prlog(PR_DEBUG, "OCAPI: %s: Enable PowerBus snooping\n", __func__);
+ /* Step 9 - Enable PowerBus snooping */
+ /* CQ_SM Misc Config Register #0 - Enable PowerBus snooping */
+ for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, block,
+ NPU2_CQ_SM_MISC_CFG0),
+ NPU2_MISC_DA_LEN_8B);
+ reg |= NPU2_CQ_SM_MISC_CFG0_CONFIG_ENABLE_PBUS;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, block,
+ NPU2_CQ_SM_MISC_CFG0),
+ NPU2_MISC_DA_LEN_8B, reg);
+ }
+}
+
+static void brick_config(uint32_t gcid, uint32_t scom_base, int index)
+{
+ /*
+ * We assume at this point that the PowerBus Hotplug Mode Control
+ * register is correctly set by Hostboot
+ */
+ disable_nvlink(gcid, index);
+ set_transport_mux_controls(gcid, scom_base, index,
+ NPU2_DEV_TYPE_OPENCAPI);
+ enable_odl_phy_mux(gcid, index);
+ disable_alink_fp(gcid);
+ enable_xsl_clocks(gcid, scom_base, index);
+ set_npcq_config(gcid, scom_base, index);
+ enable_xsl_xts_interfaces(gcid, scom_base, index);
+ enable_sm_allocation(gcid, scom_base, index);
+ enable_pb_snooping(gcid, scom_base, index);
+}
+
+/* Procedure 13.1.3.4 - Brick to PE Mapping */
+static void pe_config(struct npu2_dev *dev)
+{
+ /* We currently use a fixed PE assignment per brick */
+ uint64_t val, reg;
+ val = NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE;
+ val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_PE, val, NPU2_OCAPI_PE(dev));
+ val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_BDF, val, 0);
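+ /*
+ * 0x18 below is the per-brick stride between BDF2PE map register
+ * sets in the MISC block (presumably three 8-byte MAP registers
+ * per brick).
+ */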
+ reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC,
+ NPU2_MISC_BRICK0_BDF2PE_MAP0 +
+ (dev->brick_index * 0x18));
+ npu2_write(dev->npu, reg, val);
+}
+
+/* Procedure 13.1.3.5 - TL Configuration */
+static void tl_config(uint32_t gcid, uint32_t scom_base, uint64_t index)
+{
+ uint64_t reg;
+ uint64_t stack = index_to_stack(index);
+ uint64_t block = index_to_block(index);
+
+ prlog(PR_DEBUG, "OCAPI: %s: TL Configuration\n", __func__);
+ /* OTL Config 0 Register */
+ reg = 0;
+ /* OTL Enable */
+ reg |= NPU2_OTL_CONFIG0_EN;
+ /* Block PE Handle from ERAT Index */
+ reg |= NPU2_OTL_CONFIG0_BLOCK_PE_HANDLE;
+ /* OTL Brick ID */
+ reg = SETFIELD(NPU2_OTL_CONFIG0_BRICKID, reg, index - 2);
+ /* ERAT Hash 0 */
+ reg = SETFIELD(NPU2_OTL_CONFIG0_ERAT_HASH_0, reg, 0b011001);
+ /* ERAT Hash 1 */
+ reg = SETFIELD(NPU2_OTL_CONFIG0_ERAT_HASH_1, reg, 0b000111);
+ /* ERAT Hash 2 */
+ reg = SETFIELD(NPU2_OTL_CONFIG0_ERAT_HASH_2, reg, 0b101100);
+ /* ERAT Hash 3 */
+ reg = SETFIELD(NPU2_OTL_CONFIG0_ERAT_HASH_3, reg, 0b100110);
+ npu2_scom_write(gcid, scom_base, NPU2_OTL_CONFIG0(stack, block),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /* OTL Config 1 Register */
+ reg = 0;
+ /*
+ * We leave Template 1-3 bits at 0 to force template 0 as required
+ * for unknown devices.
+ *
+ * The Template 0 transmit rate is set to the most conservative
+ * setting, which will always be supported. The other template
+ * transmit rates are left unset and will be set later by the OS.
+ */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_TEMP0_RATE, reg, 0b1111);
+ /* Extra wait cycles TXI-TXO */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_DRDY_WAIT, reg, 0b001);
+ /* Minimum Frequency to Return TLX Credits to AFU */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_CRET_FREQ, reg, 0b001);
+ /* Frequency to add age to Transmit Requests */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_AGE_FREQ, reg, 0b11000);
+ /* Response High Priority Threshold */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_RS2_HPWAIT, reg, 0b011011);
+ /* 4-slot Request High Priority Threshold */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_RQ4_HPWAIT, reg, 0b011011);
+ /* 6-slot Request High Priority */
+ reg = SETFIELD(NPU2_OTL_CONFIG1_TX_RQ6_HPWAIT, reg, 0b011011);
+ /* Stop the OCAPI Link on Uncorrectable Error
+ * TODO: Confirm final value - disabled for debug */
+
+ npu2_scom_write(gcid, scom_base, NPU2_OTL_CONFIG1(stack, block),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /* TLX Credit Configuration Register */
+ reg = 0;
+ /* VC0/VC3/DCP0/DCP1 credits to send to AFU */
+ reg = SETFIELD(NPU2_OTL_TLX_CREDITS_VC0_CREDITS, reg, 0x40);
+ reg = SETFIELD(NPU2_OTL_TLX_CREDITS_VC3_CREDITS, reg, 0x40);
+ reg = SETFIELD(NPU2_OTL_TLX_CREDITS_DCP0_CREDITS, reg, 0x80);
+ reg = SETFIELD(NPU2_OTL_TLX_CREDITS_DCP1_CREDITS, reg, 0x80);
+ npu2_scom_write(gcid, scom_base, NPU2_OTL_TLX_CREDITS(stack, block),
+ NPU2_MISC_DA_LEN_8B, reg);
+}
+
+/* Detect Nimbus DD2.0 and DD2.01 */
+static int get_nimbus_level(void)
+{
+ struct proc_chip *chip = next_chip(NULL);
+
+ if (chip && chip->type == PROC_CHIP_P9_NIMBUS)
+ return chip->ec_level & 0xff;
+ return -1;
+}
+
+/* Procedure 13.1.3.6 - Address Translation Configuration */
+static void address_translation_config(uint32_t gcid, uint32_t scom_base,
+ uint64_t index)
+{
+ int chip_level;
+ uint64_t reg;
+ uint64_t stack = index_to_stack(index);
+
+ prlog(PR_DEBUG, "OCAPI: %s: Address Translation Configuration\n", __func__);
+ /* PSL_SCNTL_A0 Register */
+ /*
+ * ERAT shared between multiple AFUs
+ *
+ * The workbook has this bit the wrong way around compared to the hardware.
+ *
+ * TODO: handle correctly with link ganging
+ */
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL,
+ NPU2_XSL_PSL_SCNTL_A0),
+ NPU2_MISC_DA_LEN_8B);
+ reg |= NPU2_XSL_PSL_SCNTL_A0_MULTI_AFU_DIAL;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL,
+ NPU2_XSL_PSL_SCNTL_A0),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ chip_level = get_nimbus_level();
+ if (chip_level == 0x20) {
+ /*
+ * Errata HW408041 (section 15.1.10 of NPU workbook)
+ * "RA mismatch when both tlbie and checkout response
+ * are seen in same cycle"
+ */
+ /* XSL_GP Register - Bloom Filter Disable */
+ reg = npu2_scom_read(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, NPU2_XSL_GP),
+ NPU2_MISC_DA_LEN_8B);
+ /* To update XSL_GP, we must first write a magic value to it */
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, NPU2_XSL_GP),
+ NPU2_MISC_DA_LEN_8B, 0x0523790323000000UL);
+ reg &= ~NPU2_XSL_GP_BLOOM_FILTER_ENABLE;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, NPU2_XSL_GP),
+ NPU2_MISC_DA_LEN_8B, reg);
+ }
+
+ if (chip_level == 0x20 || chip_level == 0x21) {
+ /*
+ * DD2.0/2.1 EOA Bug. Fixed in DD2.2
+ */
+ reg = 0x32F8000000000001UL;
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL,
+ NPU2_XSL_DEF),
+ NPU2_MISC_DA_LEN_8B, reg);
+ }
+}
+
+/* TODO: Merge this with NVLink implementation - we don't use the npu2_bar
+ * wrapper for the PHY BARs yet */
+static void write_bar(uint32_t gcid, uint32_t scom_base, uint64_t reg,
+ uint64_t addr, uint64_t size)
+{
+ uint64_t val;
+ int block;
+ switch (NPU2_REG(reg)) {
+ case NPU2_PHY_BAR:
+ val = SETFIELD(NPU2_PHY_BAR_ADDR, 0ul, addr >> 21);
+ val = SETFIELD(NPU2_PHY_BAR_ENABLE, val, 1);
+ break;
+ case NPU2_NTL0_BAR:
+ case NPU2_NTL1_BAR:
+ val = SETFIELD(NPU2_NTL_BAR_ADDR, 0ul, addr >> 16);
+ val = SETFIELD(NPU2_NTL_BAR_SIZE, val, ilog2(size >> 16));
+ val = SETFIELD(NPU2_NTL_BAR_ENABLE, val, 1);
+ break;
+ case NPU2_GENID_BAR:
+ val = SETFIELD(NPU2_GENID_BAR_ADDR, 0ul, addr >> 16);
+ val = SETFIELD(NPU2_GENID_BAR_ENABLE, val, 1);
+ break;
+ default:
+ val = 0ul;
+ }
+
+ for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ npu2_scom_write(gcid, scom_base, NPU2_REG_OFFSET(0, block, reg),
+ NPU2_MISC_DA_LEN_8B, val);
+ prlog(PR_DEBUG, "OCAPI: Setting BAR %llx to %llx\n",
+ NPU2_REG_OFFSET(0, block, reg), val);
+ }
+}
+
+static void setup_global_mmio_bar(uint32_t gcid, uint32_t scom_base,
+ uint64_t reg[])
+{
+ uint64_t addr, size;
+
+ prlog(PR_DEBUG, "OCAPI: patching up PHY0 bar, %s\n", __func__);
+ phys_map_get(gcid, NPU_PHY, 0, &addr, &size);
+ write_bar(gcid, scom_base,
+ NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_PHY_BAR),
+ addr, size);
+ prlog(PR_DEBUG, "OCAPI: patching up PHY1 bar, %s\n", __func__);
+ phys_map_get(gcid, NPU_PHY, 1, &addr, &size);
+ write_bar(gcid, scom_base,
+ NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_PHY_BAR),
+ addr, size);
+
+ prlog(PR_DEBUG, "OCAPI: setup global mmio, %s\n", __func__);
+ phys_map_get(gcid, NPU_REGS, 0, &addr, &size);
+ write_bar(gcid, scom_base,
+ NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_PHY_BAR),
+ addr, size);
+ reg[0] = addr;
+ reg[1] = size;
+}
+
+/* Procedure 13.1.3.8 - AFU MMIO Range BARs */
+static void setup_afu_mmio_bars(uint32_t gcid, uint32_t scom_base,
+ struct npu2_dev *dev)
+{
+ uint64_t stack = index_to_stack(dev->brick_index);
+ uint64_t offset = index_to_block(dev->brick_index) == NPU2_BLOCK_OTL0 ?
+ NPU2_NTL0_BAR : NPU2_NTL1_BAR;
+ uint64_t pa_offset = index_to_block(dev->brick_index) == NPU2_BLOCK_OTL0 ?
+ NPU2_CQ_CTL_MISC_MMIOPA0_CONFIG :
+ NPU2_CQ_CTL_MISC_MMIOPA1_CONFIG;
+ uint64_t addr, size, reg;
+
+ prlog(PR_DEBUG, "OCAPI: %s: Setup AFU MMIO BARs\n", __func__);
+ phys_map_get(gcid, NPU_OCAPI_MMIO, dev->brick_index, &addr, &size);
+
+ prlog(PR_DEBUG, "OCAPI: AFU MMIO set to %llx, size %llx\n", addr, size);
+ write_bar(gcid, scom_base, NPU2_REG_OFFSET(stack, 0, offset), addr,
+ size);
+ dev->bars[0].npu2_bar.base = addr;
+ dev->bars[0].npu2_bar.size = size;
+
+ reg = SETFIELD(NPU2_CQ_CTL_MISC_MMIOPA_ADDR, 0ull, addr >> 16);
+ reg = SETFIELD(NPU2_CQ_CTL_MISC_MMIOPA_SIZE, reg, ilog2(size >> 16));
+ prlog(PR_DEBUG, "OCAPI: PA translation %llx\n", reg);
+ npu2_scom_write(gcid, scom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL,
+ pa_offset),
+ NPU2_MISC_DA_LEN_8B, reg);
+}
+
+/* Procedure 13.1.3.9 - AFU Config BARs */
+static void setup_afu_config_bars(uint32_t gcid, uint32_t scom_base,
+ struct npu2_dev *dev)
+{
+ uint64_t stack = index_to_stack(dev->brick_index);
+ int stack_num = stack - NPU2_STACK_STCK_0;
+ uint64_t addr, size;
+
+ prlog(PR_DEBUG, "OCAPI: %s: Setup AFU Config BARs\n", __func__);
+ phys_map_get(gcid, NPU_GENID, stack_num, &addr, &size);
+ prlog(PR_DEBUG, "OCAPI: Assigning GENID BAR: %016llx\n", addr);
+ write_bar(gcid, scom_base, NPU2_REG_OFFSET(stack, 0, NPU2_GENID_BAR),
+ addr, size);
+ dev->bars[1].npu2_bar.base = addr;
+ dev->bars[1].npu2_bar.size = size;
+}
+
+static void otl_enabletx(uint32_t gcid, uint32_t scom_base,
+ struct npu2_dev *dev)
+{
+ uint64_t stack = index_to_stack(dev->brick_index);
+ uint64_t block = index_to_block(dev->brick_index);
+ uint64_t reg;
+
+ /* OTL Config 2 Register */
+ /* Transmit Enable */
+ OCAPIDBG(dev, "Enabling TX\n");
+ reg = 0;
+ reg |= NPU2_OTL_CONFIG2_TX_SEND_EN;
+ npu2_scom_write(gcid, scom_base, NPU2_OTL_CONFIG2(stack, block),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ reg = npu2_scom_read(gcid, scom_base, NPU2_OTL_VC_CREDITS(stack, block),
+ NPU2_MISC_DA_LEN_8B);
+ OCAPIDBG(dev, "credit counter: %llx\n", reg);
+ /* TODO: Abort if credits are zero */
+}
+
+static uint8_t get_reset_pin(struct npu2_dev *dev)
+{
+ uint8_t pin;
+
+ switch (dev->brick_index) {
+ case 2:
+ pin = platform.ocapi->i2c_reset_brick2;
+ break;
+ case 3:
+ pin = platform.ocapi->i2c_reset_brick3;
+ break;
+ case 4:
+ pin = platform.ocapi->i2c_reset_brick4;
+ break;
+ case 5:
+ pin = platform.ocapi->i2c_reset_brick5;
+ break;
+ default:
+ assert(false);
+ }
+ return pin;
+}
+
+static void assert_adapter_reset(struct npu2_dev *dev)
+{
+ uint8_t pin, data;
+ int rc;
+
+ pin = get_reset_pin(dev);
+ /*
+ * Set the i2c reset pin to output mode.
+ *
+ * On the 9554 device, register 3 is the configuration
+ * register and a pin is in output mode when its bit is 0.
+ */
+ lock(&dev->npu->i2c_lock);
+ dev->npu->i2c_pin_mode &= ~pin;
+ data = dev->npu->i2c_pin_mode;
+
+ rc = i2c_request_send(dev->npu->i2c_port_id_ocapi,
+ platform.ocapi->i2c_reset_addr, SMBUS_WRITE,
+ 0x3, 1,
+ &data, sizeof(data), 120);
+ if (rc)
+ goto err;
+
+ /* register 1 controls the signal, reset is active low */
+ dev->npu->i2c_pin_wr_state &= ~pin;
+ data = dev->npu->i2c_pin_wr_state;
+
+ rc = i2c_request_send(dev->npu->i2c_port_id_ocapi,
+ platform.ocapi->i2c_reset_addr, SMBUS_WRITE,
+ 0x1, 1,
+ &data, sizeof(data), 120);
+ if (rc)
+ goto err;
+ unlock(&dev->npu->i2c_lock);
+ return;
+
+err:
+ unlock(&dev->npu->i2c_lock);
+ /**
+ * @fwts-label OCAPIDeviceResetFailed
+ * @fwts-advice There was an error attempting to send
+ * a reset signal over I2C to the OpenCAPI device.
+ */
+ OCAPIERR(dev, "Error writing I2C reset signal: %d\n", rc);
+}
+
+static void deassert_adapter_reset(struct npu2_dev *dev)
+{
+ uint8_t pin, data;
+ int rc, rc2;
+
+ pin = get_reset_pin(dev);
+
+ /*
+ * All we need to do here is deassert the reset signal by
+ * setting the reset pin to high. However, we cannot leave the
+ * pin in output mode, as it can cause trouble with the
+ * opencapi adapter: when the slot is powered off (on a reboot,
+ * for example), if the i2c controller is actively driving the
+ * reset signal high, it maintains voltage on part of the
+ * fpga and can leak current. This can leave the fpga in an
+ * unspecified state and potentially cause damage.
+ *
+ * The workaround is to set the pin back to input
+ * mode. There are pull-up resistors on the planar on all
+ * platforms to make sure the signal will "naturally" be high,
+ * without the i2c controller actively driving it, so we won't
+ * have problems when the slot is powered off. And it takes
+ * the adapter out of reset.
+ *
+ * To summarize:
+ * 1. set the pin to input mode. That alone is enough to raise
+ * the signal
+ * 2. set the value of the pin to high. The pin is in input
+ * mode, so this has no immediate effect, but it is more
+ * consistent and avoids surprises on the next call to
+ * assert_adapter_reset()
+ */
+ lock(&dev->npu->i2c_lock);
+ dev->npu->i2c_pin_mode |= pin;
+ data = dev->npu->i2c_pin_mode;
+
+ rc = i2c_request_send(dev->npu->i2c_port_id_ocapi,
+ platform.ocapi->i2c_reset_addr, SMBUS_WRITE,
+ 0x3, 1,
+ &data, sizeof(data), 120);
+
+ dev->npu->i2c_pin_wr_state |= pin;
+ data = dev->npu->i2c_pin_wr_state;
+ rc2 = i2c_request_send(dev->npu->i2c_port_id_ocapi,
+ platform.ocapi->i2c_reset_addr, SMBUS_WRITE,
+ 0x1, 1,
+ &data, sizeof(data), 120);
+ unlock(&dev->npu->i2c_lock);
+ if (!rc)
+ rc = rc2;
+ if (rc) {
+ /**
+ * @fwts-label OCAPIDeviceResetFailed
+ * @fwts-advice There was an error attempting to send
+ * a reset signal over I2C to the OpenCAPI device.
+ */
+ OCAPIERR(dev, "Error writing I2C reset signal: %d\n", rc);
+ }
+}
+
+static void setup_perf_counters(struct npu2_dev *dev)
+{
+ uint64_t addr, reg, link;
+
+ /*
+ * setup the DLL perf counters to check CRC errors detected by
+ * the NPU or the adapter.
+ *
+ * Counter 0: link 0/ODL0, CRC error detected by ODL
+ * Counter 1: link 0/ODL0, CRC error detected by DLx
+ * Counter 2: link 1/ODL1, CRC error detected by ODL
+ * Counter 3: link 1/ODL1, CRC error detected by DLx
+ */
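+ /*
+ * Bricks 2 and 5 are wired to ODL0 and bricks 3 and 4 to ODL1
+ * (see enable_odl_phy_mux()), hence the link selection below.
+ */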
+ if ((dev->brick_index == 2) || (dev->brick_index == 5))
+ link = 0;
+ else
+ link = 1;
+
+ addr = OB_DLL_PERF_MONITOR_CONFIG(dev->brick_index);
+ xscom_read(dev->npu->chip_id, addr, &reg);
+ if (link == 0) {
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE, reg,
+ OB_DLL_PERF_MONITOR_CONFIG_LINK0);
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 2, reg,
+ OB_DLL_PERF_MONITOR_CONFIG_LINK0);
+ } else {
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 4, reg,
+ OB_DLL_PERF_MONITOR_CONFIG_LINK1);
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 6, reg,
+ OB_DLL_PERF_MONITOR_CONFIG_LINK1);
+ }
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_SIZE, reg,
+ OB_DLL_PERF_MONITOR_CONFIG_SIZE16);
+ xscom_write(dev->npu->chip_id,
+ OB_DLL_PERF_MONITOR_CONFIG(dev->brick_index), reg);
+ OCAPIDBG(dev, "perf counter config %llx = %llx\n", addr, reg);
+
+ addr = OB_DLL_PERF_MONITOR_SELECT(dev->brick_index);
+ xscom_read(dev->npu->chip_id, addr, &reg);
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_SELECT_COUNTER >> (link * 16),
+ reg, OB_DLL_PERF_MONITOR_SELECT_CRC_ODL);
+ reg = SETFIELD(OB_DLL_PERF_MONITOR_SELECT_COUNTER >> ((link * 16) + 8),
+ reg, OB_DLL_PERF_MONITOR_SELECT_CRC_DLX);
+ xscom_write(dev->npu->chip_id, addr, reg);
+ OCAPIDBG(dev, "perf counter select %llx = %llx\n", addr, reg);
+}
+
+static void check_perf_counters(struct npu2_dev *dev)
+{
+ uint64_t addr, reg, link0, link1;
+
+ addr = OB_DLL_PERF_COUNTER0(dev->brick_index);
+ xscom_read(dev->npu->chip_id, addr, &reg);
+ link0 = GETFIELD(PPC_BITMASK(0, 31), reg);
+ link1 = GETFIELD(PPC_BITMASK(32, 63), reg);
+ if (link0 || link1)
+ OCAPIERR(dev, "CRC error count link0=%08llx link1=%08llx\n",
+ link0, link1);
+}
+
+static void set_init_pattern(uint32_t gcid, struct npu2_dev *dev)
+{
+ uint64_t reg, config_xscom;
+
+ config_xscom = OB_ODL_CONFIG(dev->brick_index);
+ /* Transmit Pattern A */
+ xscom_read(gcid, config_xscom, &reg);
+ reg = SETFIELD(OB_ODL_CONFIG_TRAIN_MODE, reg, 0b0001);
+ xscom_write(gcid, config_xscom, reg);
+}
+
+static void start_training(uint32_t gcid, struct npu2_dev *dev)
+{
+ uint64_t reg, config_xscom;
+
+ config_xscom = OB_ODL_CONFIG(dev->brick_index);
+ /* Start training */
+ xscom_read(gcid, config_xscom, &reg);
+ reg = SETFIELD(OB_ODL_CONFIG_TRAIN_MODE, reg, 0b1000);
+ xscom_write(gcid, config_xscom, reg);
+}
+
+static int64_t npu2_opencapi_get_presence_state(struct pci_slot __unused *slot,
+ uint8_t *val)
+{
+ /*
+ * Presence detection for OpenCAPI is currently done at the start of
+ * NPU initialisation, and we only create slots if a device is present.
+ * As such we will never be asked to get the presence of a slot that's
+ * empty.
+ *
+ * This may change if we ever support surprise hotplug down
+ * the track.
+ */
+ *val = OPAL_PCI_SLOT_PRESENT;
+ return OPAL_SUCCESS;
+}
+
+static void fence_brick(struct npu2_dev *dev)
+{
+ OCAPIDBG(dev, "Fencing brick\n");
+ set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
+ dev->brick_index, 0b11);
+ /* from 13.2.1, Quiesce Fence State */
+ npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
+ PPC_BIT(dev->brick_index + 6));
+}
+
+static void unfence_brick(struct npu2_dev *dev)
+{
+ OCAPIDBG(dev, "Unfencing brick\n");
+ npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
+ PPC_BIT(dev->brick_index));
+
+ set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
+ dev->brick_index, 0b10);
+ set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
+ dev->brick_index, 0b00);
+}
+
+static enum OpalShpcLinkState get_link_width(uint64_t odl_status)
+{
+ uint64_t tx_lanes, rx_lanes, state;
+
+ /*
+ * On P9, the 'trained mode' field of the ODL status is
+ * hard-coded to x8 and is useless for us. We need to look at
+ * the status of the individual lanes.
+ * The link trains at x8, x4 or not at all.
+ */
+ state = GETFIELD(OB_ODL_STATUS_TRAINING_STATE_MACHINE, odl_status);
+ if (state != OCAPI_LINK_STATE_TRAINED)
+ return OPAL_SHPC_LINK_DOWN;
+
+ rx_lanes = GETFIELD(OB_ODL_STATUS_RX_TRAINED_LANES, odl_status);
+ tx_lanes = GETFIELD(OB_ODL_STATUS_TX_TRAINED_LANES, odl_status);
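+ /* 0xff means all 8 lanes trained in that direction */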
+ if ((rx_lanes != 0xFF) || (tx_lanes != 0xFF))
+ return OPAL_SHPC_LINK_UP_x4;
+ else
+ return OPAL_SHPC_LINK_UP_x8;
+}
+
+static int64_t npu2_opencapi_get_link_state(struct pci_slot *slot, uint8_t *val)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+ uint64_t reg;
+
+ reg = get_odl_status(dev->npu->chip_id, dev->brick_index);
+ *val = get_link_width(reg);
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_opencapi_get_power_state(struct pci_slot *slot,
+ uint8_t *val)
+{
+ *val = slot->power_state;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_opencapi_set_power_state(struct pci_slot *slot, uint8_t val)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+
+ switch (val) {
+ case PCI_SLOT_POWER_OFF:
+ OCAPIDBG(dev, "Fake power off\n");
+ fence_brick(dev);
+ assert_adapter_reset(dev);
+ slot->power_state = PCI_SLOT_POWER_OFF;
+ return OPAL_SUCCESS;
+
+ case PCI_SLOT_POWER_ON:
+ if (slot->power_state != PCI_SLOT_POWER_OFF)
+ return OPAL_SUCCESS;
+ OCAPIDBG(dev, "Fake power on\n");
+ slot->power_state = PCI_SLOT_POWER_ON;
+ slot->state = OCAPI_SLOT_NORMAL;
+ return OPAL_SUCCESS;
+
+ default:
+ return OPAL_UNSUPPORTED;
+ }
+}
+
+static void check_trained_link(struct npu2_dev *dev, uint64_t odl_status)
+{
+ if (get_link_width(odl_status) != OPAL_SHPC_LINK_UP_x8) {
+ OCAPIERR(dev, "Link trained in degraded mode (%016llx)\n",
+ odl_status);
+ OCAPIDBG(dev, "Link endpoint info: %016llx\n",
+ get_odl_endpoint_info(dev->npu->chip_id, dev->brick_index));
+ }
+}
+
+static int64_t npu2_opencapi_retry_state(struct pci_slot *slot,
+ uint64_t odl_status)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+ uint32_t chip_id = dev->npu->chip_id;
+
+ if (!slot->link_retries--) {
+ /**
+ * @fwts-label OCAPILinkTrainingFailed
+ * @fwts-advice The OpenCAPI link training procedure failed.
+ * This indicates a hardware or firmware bug. OpenCAPI
+ * functionality will not be available on this link.
+ */
+ OCAPIERR(dev,
+ "Link failed to train, final link status: %016llx\n",
+ odl_status);
+ OCAPIDBG(dev, "Final link training status: %016llx\n",
+ get_odl_training_status(chip_id, dev->brick_index));
+ return OPAL_HARDWARE;
+ }
+
+ OCAPIERR(dev, "Link failed to train, retrying\n");
+ OCAPIDBG(dev, "Link status: %016llx, training status: %016llx\n",
+ odl_status,
+ get_odl_training_status(chip_id, dev->brick_index));
+
+ pci_slot_set_state(slot, OCAPI_SLOT_FRESET_INIT);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
+}
+
+static void npu2_opencapi_prepare_link_change(struct pci_slot *slot __unused,
+ bool up __unused)
+{
+ /*
+ * PCI hotplug wants it defined, but we don't need to do anything
+ */
+}
+
+static int64_t npu2_opencapi_poll_link(struct pci_slot *slot)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+ uint32_t chip_id = dev->npu->chip_id;
+ uint64_t reg;
+
+ switch (slot->state) {
+ case OCAPI_SLOT_NORMAL:
+ case OCAPI_SLOT_LINK_START:
+ OCAPIDBG(dev, "Start polling\n");
+ pci_slot_set_state(slot, OCAPI_SLOT_LINK_WAIT);
+ /* fall-through */
+ case OCAPI_SLOT_LINK_WAIT:
+ reg = get_odl_status(chip_id, dev->brick_index);
+ if (GETFIELD(OB_ODL_STATUS_TRAINING_STATE_MACHINE, reg) ==
+ OCAPI_LINK_STATE_TRAINED) {
+ OCAPIINF(dev, "link trained in %ld ms\n",
+ tb_to_msecs(mftb() - dev->train_start));
+ check_trained_link(dev, reg);
+ pci_slot_set_state(slot, OCAPI_SLOT_LINK_TRAINED);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
+ }
+ if (tb_compare(mftb(), dev->train_timeout) == TB_AAFTERB)
+ return npu2_opencapi_retry_state(slot, reg);
+
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
+
+ case OCAPI_SLOT_LINK_TRAINED:
+ otl_enabletx(chip_id, dev->npu->xscom_base, dev);
+ pci_slot_set_state(slot, OCAPI_SLOT_NORMAL);
+ if (dev->flags & NPU2_DEV_BROKEN) {
+ OCAPIERR(dev, "Resetting a device which hit a previous error. Device recovery is not supported, so future behavior is undefined\n");
+ dev->flags &= ~NPU2_DEV_BROKEN;
+ }
+ check_perf_counters(dev);
+ dev->phb_ocapi.scan_map = 1;
+ return OPAL_SUCCESS;
+
+ default:
+ OCAPIERR(dev, "unexpected slot state %08x\n", slot->state);
+ }
+ pci_slot_set_state(slot, OCAPI_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t npu2_opencapi_creset(struct pci_slot *slot)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+
+ OCAPIERR(dev, "creset not supported\n");
+ return OPAL_UNSUPPORTED;
+}
+
+static int64_t npu2_opencapi_freset(struct pci_slot *slot)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+ uint32_t chip_id = dev->npu->chip_id;
+ uint8_t presence = 1;
+ int rc;
+
+ switch (slot->state) {
+ case OCAPI_SLOT_NORMAL:
+ case OCAPI_SLOT_FRESET_START:
+ OCAPIDBG(dev, "FRESET starts\n");
+
+ if (slot->ops.get_presence_state)
+ slot->ops.get_presence_state(slot, &presence);
+ if (!presence) {
+ /*
+ * FIXME: if there's no card on the link, we
+ * should consider powering off the unused
+ * lanes to save energy
+ */
+ OCAPIINF(dev, "no card detected\n");
+ return OPAL_SUCCESS;
+ }
+ slot->link_retries = OCAPI_LINK_TRAINING_RETRIES;
+ /* fall-through */
+ case OCAPI_SLOT_FRESET_INIT:
+ fence_brick(dev);
+ assert_odl_reset(chip_id, dev->brick_index);
+ assert_adapter_reset(dev);
+ pci_slot_set_state(slot,
+ OCAPI_SLOT_FRESET_ASSERT_DELAY);
+ /* assert for 5ms */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(5));
+
+ case OCAPI_SLOT_FRESET_ASSERT_DELAY:
+ rc = npu2_opencapi_phy_reset(dev);
+ if (rc) {
+ OCAPIERR(dev, "FRESET: couldn't reset PHY state\n");
+ return OPAL_HARDWARE;
+ }
+ deassert_odl_reset(chip_id, dev->brick_index);
+ deassert_adapter_reset(dev);
+ pci_slot_set_state(slot,
+ OCAPI_SLOT_FRESET_DEASSERT_DELAY);
+ /* give the device 250ms to be ready */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(250));
+
+ case OCAPI_SLOT_FRESET_DEASSERT_DELAY:
+ unfence_brick(dev);
+ set_init_pattern(chip_id, dev);
+ pci_slot_set_state(slot,
+ OCAPI_SLOT_FRESET_INIT_DELAY);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(5));
+
+ case OCAPI_SLOT_FRESET_INIT_DELAY:
+ /* Bump lanes - this improves training reliability */
+ npu2_opencapi_bump_ui_lane(dev);
+ start_training(chip_id, dev);
+ dev->train_start = mftb();
+ dev->train_timeout = dev->train_start + msecs_to_tb(OCAPI_LINK_TRAINING_TIMEOUT);
+ pci_slot_set_state(slot, OCAPI_SLOT_LINK_START);
+ return slot->ops.poll_link(slot);
+
+ default:
+ OCAPIERR(dev, "FRESET: unexpected slot state %08x\n",
+ slot->state);
+ }
+ pci_slot_set_state(slot, OCAPI_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t npu2_opencapi_hreset(struct pci_slot *slot)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
+
+ OCAPIERR(dev, "hreset not supported\n");
+ return OPAL_UNSUPPORTED;
+}
+
+static void make_slot_hotpluggable(struct pci_slot *slot, struct phb *phb)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
+ char name[40];
+ const char *label = NULL;
+
+ /*
+ * Add a few definitions to the DT so that the linux PCI
+ * hotplug framework can find the slot and identify it as
+ * hot-pluggable.
+ *
+ * The "ibm,slot-label" property is used by linux as the slot name
+ */
+ slot->pluggable = 1;
+ pci_slot_add_dt_properties(slot, phb->dt_node);
+
+ if (platform.ocapi->ocapi_slot_label)
+ label = platform.ocapi->ocapi_slot_label(dev->npu->chip_id,
+ dev->brick_index);
+
+ if (!label) {
+ snprintf(name, sizeof(name), "OPENCAPI-%04x",
+ (int)PCI_SLOT_PHB_INDEX(slot->id));
+ label = name;
+ }
+ dt_add_property_string(phb->dt_node, "ibm,slot-label", label);
+}
+
+static struct pci_slot *npu2_opencapi_slot_create(struct phb *phb)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(phb, NULL);
+ if (!slot)
+ return slot;
+
+ /* TODO: Figure out other slot functions */
+ slot->ops.get_presence_state = npu2_opencapi_get_presence_state;
+ slot->ops.get_link_state = npu2_opencapi_get_link_state;
+ slot->ops.get_power_state = npu2_opencapi_get_power_state;
+ slot->ops.get_attention_state = NULL;
+ slot->ops.get_latch_state = NULL;
+ slot->ops.set_power_state = npu2_opencapi_set_power_state;
+ slot->ops.set_attention_state = NULL;
+
+ slot->ops.prepare_link_change = npu2_opencapi_prepare_link_change;
+ slot->ops.poll_link = npu2_opencapi_poll_link;
+ slot->ops.creset = npu2_opencapi_creset;
+ slot->ops.freset = npu2_opencapi_freset;
+ slot->ops.hreset = npu2_opencapi_hreset;
+
+ return slot;
+}
+
+static int64_t npu2_opencapi_pcicfg_check(struct npu2_dev *dev, uint32_t offset,
+ uint32_t size)
+{
+ if (!dev || offset > 0xfff || (offset & (size - 1)))
+ return OPAL_PARAMETER;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_opencapi_pcicfg_read(struct phb *phb, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ void *data)
+{
+ uint64_t cfg_addr;
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
+ uint64_t genid_base;
+ int64_t rc;
+
+ rc = npu2_opencapi_pcicfg_check(dev, offset, size);
+ if (rc)
+ return rc;
+
+ genid_base = dev->bars[1].npu2_bar.base +
+ (index_to_block(dev->brick_index) == NPU2_BLOCK_OTL1 ? 256 : 0);
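+ /*
+ * Each OTL has a 256-byte window in the GENID BAR: the config
+ * address register sits at offset 0 and the data register at
+ * offset 128.
+ */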
+
+ cfg_addr = NPU2_CQ_CTL_CONFIG_ADDR_ENABLE;
+ cfg_addr = SETFIELD(NPU2_CQ_CTL_CONFIG_ADDR_BUS_NUMBER |
+ NPU2_CQ_CTL_CONFIG_ADDR_DEVICE_NUMBER |
+ NPU2_CQ_CTL_CONFIG_ADDR_FUNCTION_NUMBER,
+ cfg_addr, bdfn);
+ cfg_addr = SETFIELD(NPU2_CQ_CTL_CONFIG_ADDR_REGISTER_NUMBER,
+ cfg_addr, offset & ~3u);
+
+ out_be64((beint64_t *)genid_base, cfg_addr);
+ sync();
+
+ switch (size) {
+ case 1:
+ *((uint8_t *)data) =
+ in_8((volatile uint8_t *)(genid_base + 128 + (offset & 3)));
+ break;
+ case 2:
+ *((uint16_t *)data) =
+ in_le16((volatile leint16_t *)(genid_base + 128 + (offset & 2)));
+ break;
+ case 4:
+ *((uint32_t *)data) = in_le32((volatile leint32_t *)(genid_base + 128));
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+#define NPU2_OPENCAPI_PCI_CFG_READ(size, type) \
+static int64_t npu2_opencapi_pcicfg_read##size(struct phb *phb, \
+ uint32_t bdfn, \
+ uint32_t offset, \
+ type *data) \
+{ \
+ /* Initialize data in case of error */ \
+ *data = (type)0xffffffff; \
+ return npu2_opencapi_pcicfg_read(phb, bdfn, offset, \
+ sizeof(type), data); \
+}
+
+static int64_t npu2_opencapi_pcicfg_write(struct phb *phb, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ uint32_t data)
+{
+ uint64_t cfg_addr;
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
+ uint64_t genid_base;
+ int64_t rc;
+
+ rc = npu2_opencapi_pcicfg_check(dev, offset, size);
+ if (rc)
+ return rc;
+
+ genid_base = dev->bars[1].npu2_bar.base +
+ (index_to_block(dev->brick_index) == NPU2_BLOCK_OTL1 ? 256 : 0);
+
+ cfg_addr = NPU2_CQ_CTL_CONFIG_ADDR_ENABLE;
+ cfg_addr = SETFIELD(NPU2_CQ_CTL_CONFIG_ADDR_BUS_NUMBER |
+ NPU2_CQ_CTL_CONFIG_ADDR_DEVICE_NUMBER |
+ NPU2_CQ_CTL_CONFIG_ADDR_FUNCTION_NUMBER,
+ cfg_addr, bdfn);
+ cfg_addr = SETFIELD(NPU2_CQ_CTL_CONFIG_ADDR_REGISTER_NUMBER,
+ cfg_addr, offset & ~3u);
+
+ out_be64((beint64_t *)genid_base, cfg_addr);
+ sync();
+
+ switch (size) {
+ case 1:
+ out_8((volatile uint8_t *)(genid_base + 128 + (offset & 3)),
+ data);
+ break;
+ case 2:
+ out_le16((volatile leint16_t *)(genid_base + 128 + (offset & 2)),
+ data);
+ break;
+ case 4:
+ out_le32((volatile leint32_t *)(genid_base + 128), data);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+#define NPU2_OPENCAPI_PCI_CFG_WRITE(size, type) \
+static int64_t npu2_opencapi_pcicfg_write##size(struct phb *phb, \
+ uint32_t bdfn, \
+ uint32_t offset, \
+ type data) \
+{ \
+ return npu2_opencapi_pcicfg_write(phb, bdfn, offset, \
+ sizeof(type), data); \
+}
+
+NPU2_OPENCAPI_PCI_CFG_READ(8, u8)
+NPU2_OPENCAPI_PCI_CFG_READ(16, u16)
+NPU2_OPENCAPI_PCI_CFG_READ(32, u32)
+NPU2_OPENCAPI_PCI_CFG_WRITE(8, u8)
+NPU2_OPENCAPI_PCI_CFG_WRITE(16, u16)
+NPU2_OPENCAPI_PCI_CFG_WRITE(32, u32)
+
+static int64_t npu2_opencapi_ioda_reset(struct phb __unused *phb,
+ bool __unused purge)
+{
+ /* Not relevant to OpenCAPI - we do this just to silence the error */
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_opencapi_set_pe(struct phb *phb,
+ uint64_t pe_num,
+ uint64_t __unused bdfn,
+ uint8_t __unused bcompare,
+ uint8_t __unused dcompare,
+ uint8_t __unused fcompare,
+ uint8_t action)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
+ /*
+ * Ignored on OpenCAPI - we use fixed PE assignments. May need
+ * addressing when we support dual-link devices.
+ *
+ * We nonetheless store the PE reported by the OS so that we
+ * can send it back in case of error. If there are several PCI
+ * functions on the device, the OS can define several PEs; we
+ * only keep one and let the OS deal with it.
+ */
+ if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+
+ if (action == OPAL_UNMAP_PE)
+ pe_num = -1;
+ dev->linux_pe = pe_num;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_opencapi_freeze_status(struct phb *phb __unused,
+ uint64_t pe_number __unused,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ if (severity)
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_opencapi_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
+
+ if (!first_frozen_pe || !pci_error_type || !severity)
+ return OPAL_PARAMETER;
+
+ if (dev->flags & NPU2_DEV_BROKEN) {
+ OCAPIDBG(dev, "Reporting device as broken\n");
+ *first_frozen_pe = dev->linux_pe;
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ } else {
+ *first_frozen_pe = -1;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ }
+ return OPAL_SUCCESS;
+}
+
+static int npu2_add_mmio_regs(struct phb *phb, struct pci_device *pd,
+ void *data __unused)
+{
+ uint32_t irq;
+ struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
+ uint64_t block = index_to_block(dev->brick_index);
+ uint64_t stacku = index_to_stacku(dev->brick_index);
+ uint64_t dsisr, dar, tfc, handle;
+
+ /*
+ * Pass the hw irq number for the translation fault irq.
+ * Irq levels 23 -> 26 are for translation faults, 1 per brick.
+ */
+ irq = dev->npu->base_lsi + NPU_IRQ_LEVELS_XSL;
+ if (stacku == NPU2_STACK_STCK_2U)
+ irq += 2;
+ if (block == NPU2_BLOCK_OTL1)
+ irq++;
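+ /* i.e. bricks 2, 3, 4, 5 get XSL irq levels 23, 24, 25, 26 */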
+
+ /*
+ * Add the addresses of the registers needed by the OS to handle
+ * faults. The OS accesses them by mmio.
+ */
+ dsisr = (uint64_t) dev->npu->regs + NPU2_OTL_OSL_DSISR(stacku, block);
+ dar = (uint64_t) dev->npu->regs + NPU2_OTL_OSL_DAR(stacku, block);
+ tfc = (uint64_t) dev->npu->regs + NPU2_OTL_OSL_TFC(stacku, block);
+ handle = (uint64_t) dev->npu->regs + NPU2_OTL_OSL_PEHANDLE(stacku,
+ block);
+ dt_add_property_cells(pd->dn, "ibm,opal-xsl-irq", irq);
+ dt_add_property_cells(pd->dn, "ibm,opal-xsl-mmio",
+ hi32(dsisr), lo32(dsisr),
+ hi32(dar), lo32(dar),
+ hi32(tfc), lo32(tfc),
+ hi32(handle), lo32(handle));
+ return 0;
+}
+
+static void npu2_opencapi_final_fixup(struct phb *phb)
+{
+ pci_walk_dev(phb, NULL, npu2_add_mmio_regs, NULL);
+}
+
+static void mask_nvlink_fir(struct npu2 *p)
+{
+ uint64_t reg;
+
+ /*
+ * From section 13.1.3.10 of the NPU workbook: "the NV-Link
+ * Datalink Layer Stall and NoStall signals are used for a
+ * different purpose when the link is configured for
+ * OpenCAPI. Therefore, the corresponding bits in NPU FIR
+ * Register 1 must be masked and configured to NOT cause the
+ * NPU to go into Freeze or Fence mode or send an Interrupt."
+ *
+ * FIXME: will need to revisit when mixing nvlink with
+ * opencapi. Assumes an opencapi-only setup on both PHYs for
+ * now.
+ */
+
+ /* Mask FIRs */
+ xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR1_MASK, &reg);
+ reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0xFFF);
+ xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR1_MASK, reg);
+
+ /* freeze disable */
+ reg = npu2_scom_read(p->chip_id, p->xscom_base,
+ NPU2_MISC_FREEZE_ENABLE1, NPU2_MISC_DA_LEN_8B);
+ reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0);
+ npu2_scom_write(p->chip_id, p->xscom_base,
+ NPU2_MISC_FREEZE_ENABLE1, NPU2_MISC_DA_LEN_8B, reg);
+
+ /* fence disable */
+ reg = npu2_scom_read(p->chip_id, p->xscom_base,
+ NPU2_MISC_FENCE_ENABLE1, NPU2_MISC_DA_LEN_8B);
+ reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0);
+ npu2_scom_write(p->chip_id, p->xscom_base,
+ NPU2_MISC_FENCE_ENABLE1, NPU2_MISC_DA_LEN_8B, reg);
+
+ /* irq disable */
+ reg = npu2_scom_read(p->chip_id, p->xscom_base,
+ NPU2_MISC_IRQ_ENABLE1, NPU2_MISC_DA_LEN_8B);
+ reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0);
+ npu2_scom_write(p->chip_id, p->xscom_base,
+ NPU2_MISC_IRQ_ENABLE1, NPU2_MISC_DA_LEN_8B, reg);
+}
+
+static int enable_interrupts(struct npu2 *p)
+{
+ uint64_t reg, xsl_fault, xstop_override, xsl_mask;
+
+ /*
+ * We need to:
+ * - enable translation interrupts for all bricks
+ * - override most brick-fatal errors from FIR2 to send an
+ * interrupt instead of the default action of checkstopping
+ * the system, since we can just fence the brick and keep
+ * the system alive.
+ * - the exception to the above is 2 FIRs for XSL errors
+ * resulting from bad AFU behavior, for which we don't want to
+ * checkstop but can't configure to send an error interrupt
+ * either, as the XSL errors are reported on 2 links (the
+ * XSL is shared between 2 links). Instead, we mask
+ * them. The XSL errors will result in an OTL error, which
+ * is reported only once, for the correct link.
+ *
+ * FIR bits configured to trigger an interrupt must have their
+ * default action masked
+ */
+ xsl_fault = PPC_BIT(0) | PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3);
+ xstop_override = 0x0FFFEFC00F91B000;
+ xsl_mask = NPU2_CHECKSTOP_REG2_XSL_XLAT_REQ_WHILE_SPAP_INVALID |
+ NPU2_CHECKSTOP_REG2_XSL_INVALID_PEE;
+
+ xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR2_MASK, &reg);
+ reg |= xsl_fault | xstop_override | xsl_mask;
+ xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR2_MASK, reg);
+
+ reg = npu2_scom_read(p->chip_id, p->xscom_base, NPU2_MISC_IRQ_ENABLE2,
+ NPU2_MISC_DA_LEN_8B);
+ reg |= xsl_fault | xstop_override;
+ npu2_scom_write(p->chip_id, p->xscom_base, NPU2_MISC_IRQ_ENABLE2,
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /*
+ * Make sure the brick is fenced on those errors.
+ * Fencing is incompatible with freezing, but there's no
+ * freeze defined for FIR2, so we don't have to worry about it
+ *
+ * For the 2 XSL bits we ignore, we need to make sure they
+ * don't fence the link, as the NPU logic could allow it even
+ * when masked.
+ */
+ reg = npu2_scom_read(p->chip_id, p->xscom_base, NPU2_MISC_FENCE_ENABLE2,
+ NPU2_MISC_DA_LEN_8B);
+ reg |= xstop_override;
+ reg &= ~NPU2_CHECKSTOP_REG2_XSL_XLAT_REQ_WHILE_SPAP_INVALID;
+ reg &= ~NPU2_CHECKSTOP_REG2_XSL_INVALID_PEE;
+ npu2_scom_write(p->chip_id, p->xscom_base, NPU2_MISC_FENCE_ENABLE2,
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ mask_nvlink_fir(p);
+ return 0;
+}
+
+static void setup_debug_training_state(struct npu2_dev *dev)
+{
+ npu2_opencapi_phy_reset(dev);
+
+ switch (npu2_ocapi_training_state) {
+ case NPU2_TRAIN_PRBS31:
+ OCAPIINF(dev, "sending PRBS31 pattern per NVRAM setting\n");
+ npu2_opencapi_phy_prbs31(dev);
+ break;
+
+ case NPU2_TRAIN_NONE:
+ OCAPIINF(dev, "link not trained per NVRAM setting\n");
+ break;
+ default:
+ assert(false);
+ }
+}
+
+static void setup_device(struct npu2_dev *dev)
+{
+ struct dt_node *dn_phb;
+ struct pci_slot *slot;
+ uint64_t mm_win[2];
+
+ /* Populate PHB device node */
+ phys_map_get(dev->npu->chip_id, NPU_OCAPI_MMIO, dev->brick_index, &mm_win[0],
+ &mm_win[1]);
+ prlog(PR_DEBUG, "OCAPI: Setting MMIO window to %016llx + %016llx\n",
+ mm_win[0], mm_win[1]);
+ dn_phb = dt_new_addr(dt_root, "pciex", mm_win[0]);
+ assert(dn_phb);
+ dt_add_property_strings(dn_phb,
+ "compatible",
+ "ibm,power9-npu-opencapi-pciex",
+ "ibm,ioda2-npu2-opencapi-phb");
+
+ dt_add_property_cells(dn_phb, "#address-cells", 3);
+ dt_add_property_cells(dn_phb, "#size-cells", 2);
+ dt_add_property_cells(dn_phb, "#interrupt-cells", 1);
+ dt_add_property_cells(dn_phb, "bus-range", 0, 0xff);
+ dt_add_property_cells(dn_phb, "clock-frequency", 0x200, 0);
+ dt_add_property_cells(dn_phb, "interrupt-parent", get_ics_phandle());
+
+ dt_add_property_strings(dn_phb, "device_type", "pciex");
+ dt_add_property(dn_phb, "reg", mm_win, sizeof(mm_win));
+ dt_add_property_cells(dn_phb, "ibm,npu-index", dev->npu->index);
+ dt_add_property_cells(dn_phb, "ibm,phb-index",
+ npu2_get_phb_index(dev->brick_index));
+ dt_add_property_cells(dn_phb, "ibm,chip-id", dev->npu->chip_id);
+ dt_add_property_cells(dn_phb, "ibm,xscom-base", dev->npu->xscom_base);
+ dt_add_property_cells(dn_phb, "ibm,npcq", dev->npu->dt_node->phandle);
+ dt_add_property_cells(dn_phb, "ibm,links", 1);
+ dt_add_property(dn_phb, "ibm,mmio-window", mm_win, sizeof(mm_win));
+ dt_add_property_cells(dn_phb, "ibm,phb-diag-data-size", 0);
+
+ /*
+ * We ignore whatever PE numbers Linux tries to set, so we just
+ * advertise enough that Linux won't complain
+ */
+ dt_add_property_cells(dn_phb, "ibm,opal-num-pes", NPU2_MAX_PE_NUM);
+ dt_add_property_cells(dn_phb, "ibm,opal-reserved-pe", NPU2_RESERVED_PE_NUM);
+
+ dt_add_property_cells(dn_phb, "ranges", 0x02000000,
+ hi32(mm_win[0]), lo32(mm_win[0]),
+ hi32(mm_win[0]), lo32(mm_win[0]),
+ hi32(mm_win[1]), lo32(mm_win[1]));
+
+ dev->phb_ocapi.dt_node = dn_phb;
+ dev->phb_ocapi.ops = &npu2_opencapi_ops;
+ dev->phb_ocapi.phb_type = phb_type_npu_v2_opencapi;
+ dev->phb_ocapi.scan_map = 0;
+
+ dev->bdfn = 0;
+ dev->linux_pe = -1;
+
+ /* TODO: Procedure 13.1.3.7 - AFU Memory Range BARs */
+ /* Procedure 13.1.3.8 - AFU MMIO Range BARs */
+ setup_afu_mmio_bars(dev->npu->chip_id, dev->npu->xscom_base, dev);
+ /* Procedure 13.1.3.9 - AFU Config BARs */
+ setup_afu_config_bars(dev->npu->chip_id, dev->npu->xscom_base, dev);
+ setup_perf_counters(dev);
+ npu2_opencapi_phy_init(dev);
+
+ set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, dev->brick_index, 0b00);
+
+ pci_register_phb(&dev->phb_ocapi, OPAL_DYNAMIC_PHB_ID);
+
+ if (npu2_ocapi_training_state != NPU2_TRAIN_DEFAULT) {
+ setup_debug_training_state(dev);
+ } else {
+ slot = npu2_opencapi_slot_create(&dev->phb_ocapi);
+ if (!slot) {
+ /**
+ * @fwts-label OCAPICannotCreatePHBSlot
+ * @fwts-advice Firmware probably ran out of memory creating
+ * NPU slot. OpenCAPI functionality could be broken.
+ */
+ prlog(PR_ERR, "OCAPI: Cannot create PHB slot\n");
+ }
+ make_slot_hotpluggable(slot, &dev->phb_ocapi);
+ }
+ return;
+}
+
+static void read_nvram_training_state(void)
+{
+ const char *state;
+
+ state = nvram_query_dangerous("opencapi-link-training");
+ if (state) {
+ if (!strcmp(state, "prbs31"))
+ npu2_ocapi_training_state = NPU2_TRAIN_PRBS31;
+ else if (!strcmp(state, "none"))
+ npu2_ocapi_training_state = NPU2_TRAIN_NONE;
+ else
+ prlog(PR_WARNING,
+ "OCAPI: invalid training state in NVRAM: %s\n",
+ state);
+ }
+}
+
+int npu2_opencapi_init_npu(struct npu2 *npu)
+{
+ struct npu2_dev *dev;
+ uint64_t reg[2];
+
+ assert(platform.ocapi);
+ read_nvram_training_state();
+
+ /* TODO: Test OpenCAPI with fast reboot and make it work */
+ disable_fast_reboot("OpenCAPI device enabled");
+
+ setup_global_mmio_bar(npu->chip_id, npu->xscom_base, reg);
+
+ npu->regs = (void *)reg[0];
+
+ for (int i = 0; i < npu->total_devices; i++) {
+ dev = &npu->devices[i];
+ if (dev->type != NPU2_DEV_TYPE_OPENCAPI)
+ continue;
+
+ prlog(PR_INFO, "OCAPI: Configuring link index %d, brick %d\n",
+ dev->link_index, dev->brick_index);
+
+ /* Procedure 13.1.3.1 - Select OCAPI vs NVLink */
+ brick_config(npu->chip_id, npu->xscom_base, dev->brick_index);
+
+ /* Procedure 13.1.3.4 - Brick to PE Mapping */
+ pe_config(dev);
+
+ /* Procedure 13.1.3.5 - Transaction Layer Configuration */
+ tl_config(npu->chip_id, npu->xscom_base, dev->brick_index);
+
+ /* Procedure 13.1.3.6 - Address Translation Configuration */
+ address_translation_config(npu->chip_id, npu->xscom_base, dev->brick_index);
+ }
+
+ enable_interrupts(npu);
+
+ for (int i = 0; i < npu->total_devices; i++) {
+ dev = &npu->devices[i];
+ if (dev->type != NPU2_DEV_TYPE_OPENCAPI)
+ continue;
+ setup_device(dev);
+ }
+
+ return 0;
+}
+
+static const struct phb_ops npu2_opencapi_ops = {
+ .cfg_read8 = npu2_opencapi_pcicfg_read8,
+ .cfg_read16 = npu2_opencapi_pcicfg_read16,
+ .cfg_read32 = npu2_opencapi_pcicfg_read32,
+ .cfg_write8 = npu2_opencapi_pcicfg_write8,
+ .cfg_write16 = npu2_opencapi_pcicfg_write16,
+ .cfg_write32 = npu2_opencapi_pcicfg_write32,
+ .device_init = NULL,
+ .phb_final_fixup = npu2_opencapi_final_fixup,
+ .ioda_reset = npu2_opencapi_ioda_reset,
+ .papr_errinjct_reset = NULL,
+ .pci_reinit = NULL,
+ .set_phb_mem_window = NULL,
+ .phb_mmio_enable = NULL,
+ .map_pe_mmio_window = NULL,
+ .map_pe_dma_window = NULL,
+ .map_pe_dma_window_real = NULL,
+ .pci_msi_eoi = NULL,
+ .set_xive_pe = NULL,
+ .get_msi_32 = NULL,
+ .get_msi_64 = NULL,
+ .set_pe = npu2_opencapi_set_pe,
+ .set_peltv = NULL,
+ .eeh_freeze_status = npu2_opencapi_freeze_status,
+ .eeh_freeze_clear = NULL,
+ .eeh_freeze_set = NULL,
+ .next_error = npu2_opencapi_eeh_next_error,
+ .err_inject = NULL,
+ .get_diag_data2 = NULL,
+ .set_capi_mode = NULL,
+ .set_capp_recovery = NULL,
+ .tce_kill = NULL,
+};
+
+void npu2_opencapi_set_broken(struct npu2 *npu, int brick)
+{
+ struct phb *phb;
+ struct npu2_dev *dev;
+
+ for_each_phb(phb) {
+ if (phb->phb_type == phb_type_npu_v2_opencapi) {
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (dev->npu == npu &&
+ dev->brick_index == brick)
+ dev->flags |= NPU2_DEV_BROKEN;
+ }
+ }
+}
+
+static int64_t opal_npu_spa_setup(uint64_t phb_id, uint32_t __unused bdfn,
+ uint64_t addr, uint64_t PE_mask)
+{
+ uint64_t stack, block, offset, reg;
+ struct phb *phb = pci_get_phb(phb_id);
+ struct npu2_dev *dev;
+ int rc;
+
+ if (!phb || phb->phb_type != phb_type_npu_v2_opencapi)
+ return OPAL_PARAMETER;
+
+ /* 4k aligned */
+ if (addr & 0xFFF)
+ return OPAL_PARAMETER;
+
+ if (PE_mask > 15)
+ return OPAL_PARAMETER;
+
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ block = index_to_block(dev->brick_index);
+ stack = index_to_stack(dev->brick_index);
+ if (block == NPU2_BLOCK_OTL1)
+ offset = NPU2_XSL_PSL_SPAP_A1;
+ else
+ offset = NPU2_XSL_PSL_SPAP_A0;
+
+ lock(&dev->npu->lock);
+ /*
+ * set the SPAP used by the device
+ */
+ reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, offset),
+ NPU2_MISC_DA_LEN_8B);
+ if ((addr && (reg & NPU2_XSL_PSL_SPAP_EN)) ||
+ (!addr && !(reg & NPU2_XSL_PSL_SPAP_EN))) {
+ rc = OPAL_BUSY;
+ goto out;
+ }
+ /* SPA is disabled by passing a NULL address */
+ reg = addr;
+ if (addr)
+ reg = addr | NPU2_XSL_PSL_SPAP_EN;
+
+ npu2_scom_write(dev->npu->chip_id, dev->npu->xscom_base,
+ NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, offset),
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ /*
+ * set the PE mask that the OS uses for PASID -> PE handle
+ * conversion
+ */
+ reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base,
+ NPU2_OTL_CONFIG0(stack, block), NPU2_MISC_DA_LEN_8B);
+ reg &= ~NPU2_OTL_CONFIG0_PE_MASK;
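+ /* PE_mask is 4 bits wide (0-15); the shift places it in PPC bits 4-7 */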
+ reg |= (PE_mask << (63-7));
+ npu2_scom_write(dev->npu->chip_id, dev->npu->xscom_base,
+ NPU2_OTL_CONFIG0(stack, block), NPU2_MISC_DA_LEN_8B,
+ reg);
+ rc = OPAL_SUCCESS;
+out:
+ unlock(&dev->npu->lock);
+ return rc;
+}
+opal_call(OPAL_NPU_SPA_SETUP, opal_npu_spa_setup, 4);
+
+static int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t __unused bdfn,
+ uint64_t PE_handle)
+{
+ uint64_t cc_inv, stack, block, reg, rc;
+ uint32_t retries = 5;
+ struct phb *phb = pci_get_phb(phb_id);
+ struct npu2_dev *dev;
+
+ if (!phb || phb->phb_type != phb_type_npu_v2_opencapi)
+ return OPAL_PARAMETER;
+
+ if (PE_handle > MAX_PE_HANDLE)
+ return OPAL_PARAMETER;
+
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ block = index_to_block(dev->brick_index);
+ stack = index_to_stack(dev->brick_index);
+ cc_inv = NPU2_REG_OFFSET(stack, NPU2_BLOCK_XSL, NPU2_XSL_PSL_LLCMD_A0);
+
+ lock(&dev->npu->lock);
+ reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base, cc_inv,
+ NPU2_MISC_DA_LEN_8B);
+ if (reg & PPC_BIT(16)) {
+ rc = OPAL_BUSY;
+ goto out;
+ }
+
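+ /*
+ * Kick the invalidation: PPC bit 15 issues the command for the
+ * given PE handle and bit 48 targets the second link (OTL1).
+ * Bit 16, polled below, stays set while the command is pending.
+ */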
+ reg = PE_handle | PPC_BIT(15);
+ if (block == NPU2_BLOCK_OTL1)
+ reg |= PPC_BIT(48);
+ npu2_scom_write(dev->npu->chip_id, dev->npu->xscom_base, cc_inv,
+ NPU2_MISC_DA_LEN_8B, reg);
+
+ rc = OPAL_HARDWARE;
+ while (retries--) {
+ reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base,
+ cc_inv, NPU2_MISC_DA_LEN_8B);
+ if (!(reg & PPC_BIT(16))) {
+ rc = OPAL_SUCCESS;
+ break;
+ }
+ /* the bit is expected to flip in less than 200us */
+ time_wait_us(200);
+ }
+out:
+ unlock(&dev->npu->lock);
+ return rc;
+}
+opal_call(OPAL_NPU_SPA_CLEAR_CACHE, opal_npu_spa_clear_cache, 3);
+
+static int get_template_rate(unsigned int templ, char *rate_buf)
+{
+ int shift, idx, val;
+
+ /*
+ * Each rate is encoded over 4 bits (0->15), with 15 being the
+ * slowest. The buffer is a succession of rates for all the
+ * templates. The first 4 bits are for template 63, followed
+ * by 4 bits for template 62, ... etc. So the rate for
+ * template 0 is at the very end of the buffer.
+ */
+ idx = (TL_MAX_TEMPLATE - templ) / 2;
+ shift = 4 * (1 - ((TL_MAX_TEMPLATE - templ) % 2));
+ val = rate_buf[idx] >> shift;
+ return val;
+}
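+
+/*
+ * A worked example of the decode above (illustrative only, assuming
+ * TL_MAX_TEMPLATE is 63 and the rate buffer spans 64 templates x 4
+ * bits = 32 bytes):
+ *
+ *   templ = 63: idx = 0,  shift = 4 -> high nibble of rate_buf[0]
+ *   templ = 62: idx = 0,  shift = 0 -> low nibble of rate_buf[0]
+ *   templ = 0:  idx = 31, shift = 0 -> low nibble of rate_buf[31]
+ */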
+
+static bool is_template_supported(unsigned int templ, long capabilities)
+{
+ return !!(capabilities & (1ull << templ));
+}
+
+static int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t __unused bdfn,
+ long capabilities, uint64_t rate_phys, int rate_sz)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ struct npu2_dev *dev;
+ uint64_t stack, block, reg, templ_rate;
+ int i, rate_pos;
+ char *rate = (char *) rate_phys;
+
+ if (!phb || phb->phb_type != phb_type_npu_v2_opencapi)
+ return OPAL_PARAMETER;
+ if (!opal_addr_valid(rate) || rate_sz != TL_RATE_BUF_SIZE)
+ return OPAL_PARAMETER;
+
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ block = index_to_block(dev->brick_index);
+ stack = index_to_stack(dev->brick_index);
+ /*
+ * The 'capabilities' argument defines what TL template the
+ * device can receive. OpenCAPI 3.0 and 4.0 define 64 templates, so
+ * that's one bit per template.
+ *
+ * For each template, the device processing time may vary, so
+ * the device advertises at what rate a message of a given
+ * template can be sent. That's encoded in the 'rate' buffer.
+ *
+ * On P9, the NPU only knows about TL templates 0 -> 3.
+ * Per the spec, template 0 must be supported.
+ */
+ if (!is_template_supported(0, capabilities))
+ return OPAL_PARAMETER;
+
+ reg = npu2_scom_read(dev->npu->chip_id, dev->npu->xscom_base,
+ NPU2_OTL_CONFIG1(stack, block),
+ NPU2_MISC_DA_LEN_8B);
+ reg &= ~(NPU2_OTL_CONFIG1_TX_TEMP1_EN | NPU2_OTL_CONFIG1_TX_TEMP2_EN |
+ NPU2_OTL_CONFIG1_TX_TEMP3_EN);
+ for (i = 0; i < 4; i++) {
+ /* Skip template 0 as it is implicitly enabled */
+ if (i && is_template_supported(i, capabilities))
+ reg |= PPC_BIT(i);
+ /* The tx rate should still be set for template 0 */
+ templ_rate = get_template_rate(i, rate);
+ rate_pos = 8 + i * 4;
+ reg = SETFIELD(PPC_BITMASK(rate_pos, rate_pos + 3), reg,
+ templ_rate);
+ }
+ npu2_scom_write(dev->npu->chip_id, dev->npu->xscom_base,
+ NPU2_OTL_CONFIG1(stack, block), NPU2_MISC_DA_LEN_8B,
+ reg);
+ OCAPIDBG(dev, "OTL configuration 1 register set to %llx\n", reg);
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_NPU_TL_SET, opal_npu_tl_set, 5);
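+
+/*
+ * For illustration: with the loop above, a 'capabilities' value of
+ * 0xb (templates 0, 1 and 3 supported) sets PPC_BIT(1) and PPC_BIT(3)
+ * in OTL config1 (template 0 is implicitly enabled), while the four
+ * 4-bit rates land at bits 8-11, 12-15, 16-19 and 20-23 for
+ * templates 0 to 3 respectively.
+ */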
+
+static void set_mem_bar(struct npu2_dev *dev, uint64_t base, uint64_t size)
+{
+ uint64_t stack, val, reg, bar_offset, pa_config_offset;
+ uint8_t memsel;
+
+ stack = index_to_stack(dev->brick_index);
+ switch (dev->brick_index) {
+ case 2:
+ case 4:
+ bar_offset = NPU2_GPU0_MEM_BAR;
+ pa_config_offset = NPU2_CQ_CTL_MISC_PA0_CONFIG;
+ break;
+ case 3:
+ case 5:
+ bar_offset = NPU2_GPU1_MEM_BAR;
+ pa_config_offset = NPU2_CQ_CTL_MISC_PA1_CONFIG;
+ break;
+ default:
+ assert(false);
+ }
+
+ assert((!size && !base) || (size && base));
+
+ /*
+ * Memory select configuration:
+ * - 0b000 - BAR disabled
+ * - 0b001 - match 0b00, 0b01
+ * - 0b010 - match 0b01, 0b10
+ * - 0b011 - match 0b00, 0b10
+ * - 0b100 - match 0b00
+ * - 0b101 - match 0b01
+ * - 0b110 - match 0b10
+ * - 0b111 - match 0b00, 0b01, 0b10
+ */
+ memsel = GETFIELD(PPC_BITMASK(13, 14), base);
+ if (size)
+ val = SETFIELD(NPU2_MEM_BAR_EN | NPU2_MEM_BAR_SEL_MEM, 0ULL, 0b100 + memsel);
+ else
+ val = 0;
+
+ /* Base address - 12 bits, 1G aligned */
+ val = SETFIELD(NPU2_MEM_BAR_NODE_ADDR, val, GETFIELD(PPC_BITMASK(22, 33), base));
+
+ /* GCID */
+ val = SETFIELD(NPU2_MEM_BAR_GROUP, val, GETFIELD(PPC_BITMASK(15, 18), base));
+ val = SETFIELD(NPU2_MEM_BAR_CHIP, val, GETFIELD(PPC_BITMASK(19, 21), base));
+
+ /* Other settings */
+ val = SETFIELD(NPU2_MEM_BAR_POISON, val, 1);
+ val = SETFIELD(NPU2_MEM_BAR_GRANULE, val, 0);
+ val = SETFIELD(NPU2_MEM_BAR_BAR_SIZE, val, ilog2(size >> 30));
+ val = SETFIELD(NPU2_MEM_BAR_MODE, val, 0);
+
+ for (int block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ reg = NPU2_REG_OFFSET(stack, block, bar_offset);
+ npu2_write(dev->npu, reg, val);
+ }
+
+ /* Set PA config */
+ if (size)
+ val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_MEMSELMATCH, 0ULL, 0b100 + memsel);
+ else
+ val = 0;
+ val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_GRANULE, val, 0);
+ val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_SIZE, val, ilog2(size >> 30));
+ val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_MODE, val, 0);
+ val = SETFIELD(NPU2_CQ_CTL_MISC_PA_CONFIG_MASK, val, 0);
+ reg = NPU2_REG_OFFSET(stack, NPU2_BLOCK_CTL, pa_config_offset);
+ npu2_write(dev->npu, reg, val);
+}
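+
+/*
+ * For illustration, the BAR value built above slices the (big-endian
+ * numbered) bits of 'base' as follows: bits 13-14 pick the memory
+ * select match (written as 0b100 + memsel), bits 15-18 the group,
+ * bits 19-21 the chip, and bits 22-33 the 1GB-aligned node address.
+ * 'size' must therefore be a power-of-two number of gigabytes, since
+ * only ilog2(size >> 30) is written to the BAR_SIZE field.
+ */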
+
+static int64_t alloc_mem_bar(struct npu2_dev *dev, uint64_t size, uint64_t *bar)
+{
+ uint64_t phys_map_base, phys_map_size, val;
+ int rc = OPAL_SUCCESS;
+
+ lock(&dev->npu->lock);
+
+ if (dev->lpc_mem_base) {
+ OCAPIERR(dev, "LPC allocation failed - BAR already in use\n");
+ rc = OPAL_RESOURCE;
+ goto out;
+ }
+
+ /*
+ * The supported chip address extension mask is 1100 100 (mask
+ * off 2 bits from group ID and 1 bit from chip ID).
+ *
+ * Fall back to only permitting a single allocation if we
+ * don't see this mask value.
+ */
+ xscom_read(dev->npu->chip_id, PB_CENT_MODE, &val);
+ if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 0b1100100) {
+ phys_map_get(dev->npu->chip_id, OCAPI_MEM,
+ dev->brick_index - 2, &phys_map_base,
+ &phys_map_size);
+ } else {
+ bool in_use = false;
+
+ for (int i = 0; i < dev->npu->total_devices; i++) {
+ if (dev->npu->devices[i].lpc_mem_base)
+ in_use = true;
+ }
+
+ if (in_use) {
+ OCAPIERR(dev, "LPC allocation failed - single device per chip limit, FW upgrade required (pb_cent_mode=0x%016llx)\n", val);
+ rc = OPAL_RESOURCE;
+ goto out;
+ }
+
+ phys_map_get(dev->npu->chip_id, OCAPI_MEM, 0, &phys_map_base,
+ &phys_map_size);
+ }
+
+ if (size > phys_map_size) {
+ /**
+ * @fwts-label OCAPIInvalidLPCMemoryBARSize
+ * @fwts-advice The operating system requested an unsupported
+ * amount of OpenCAPI LPC memory. This is possibly a kernel
+ * bug, or you may need to upgrade your firmware.
+ */
+ OCAPIERR(dev, "Invalid LPC memory BAR allocation size requested: 0x%llx bytes (limit 0x%llx)\n",
+ size, phys_map_size);
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ /* Minimum BAR size is 1 GB */
+ if (size < (1 << 30)) {
+ size = 1 << 30;
+ }
+
+ if (!is_pow2(size)) {
+ size = 1ull << (ilog2(size) + 1);
+ }
+
+ set_mem_bar(dev, phys_map_base, size);
+ *bar = phys_map_base;
+ dev->lpc_mem_base = phys_map_base;
+ dev->lpc_mem_size = size;
+
+out:
+ unlock(&dev->npu->lock);
+ return rc;
+}
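+
+/*
+ * A couple of illustrative allocations for the rounding above: a
+ * request of 512MB is raised to the 1GB minimum, and a request of
+ * 0x60000000 bytes (1.5GB) is rounded up to the next power of two,
+ * i.e. 2GB, before the BAR is programmed.
+ */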
+
+static int64_t release_mem_bar(struct npu2_dev *dev)
+{
+ int rc = OPAL_SUCCESS;
+
+ lock(&dev->npu->lock);
+
+ if (!dev->lpc_mem_base) {
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ set_mem_bar(dev, 0, 0);
+ dev->lpc_mem_base = 0;
+ dev->lpc_mem_size = 0;
+
+out:
+ unlock(&dev->npu->lock);
+ return rc;
+}
+
+static int64_t opal_npu_mem_alloc(uint64_t phb_id, uint32_t __unused bdfn,
+ uint64_t size, __be64 *__bar)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ struct npu2_dev *dev;
+ uint64_t bar;
+ int64_t rc;
+
+ if (!phb || phb->phb_type != phb_type_npu_v2_opencapi)
+ return OPAL_PARAMETER;
+
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ if (!opal_addr_valid(__bar))
+ return OPAL_PARAMETER;
+
+ rc = alloc_mem_bar(dev, size, &bar);
+ if (rc == OPAL_SUCCESS)
+ *__bar = cpu_to_be64(bar);
+
+ return rc;
+}
+opal_call(OPAL_NPU_MEM_ALLOC, opal_npu_mem_alloc, 4);
+
+static int64_t opal_npu_mem_release(uint64_t phb_id, uint32_t __unused bdfn)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ struct npu2_dev *dev;
+
+ if (!phb || phb->phb_type != phb_type_npu_v2_opencapi)
+ return OPAL_PARAMETER;
+
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ return release_mem_bar(dev);
+}
+opal_call(OPAL_NPU_MEM_RELEASE, opal_npu_mem_release, 2);
diff --git a/roms/skiboot/hw/npu2.c b/roms/skiboot/hw/npu2.c
new file mode 100644
index 000000000..cf57eeb0c
--- /dev/null
+++ b/roms/skiboot/hw/npu2.c
@@ -0,0 +1,2323 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NPU - NVlink and OpenCAPI
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci-cfg.h>
+#include <pci.h>
+#include <pci-slot.h>
+#include <pci-virt.h>
+#include <opal.h>
+#include <opal-api.h>
+#include <cpu.h>
+#include <device.h>
+#include <ccan/str/str.h>
+#include <ccan/array_size/array_size.h>
+#include <affinity.h>
+#include <npu2.h>
+#include <lock.h>
+#include <xscom.h>
+#include <bitutils.h>
+#include <chip.h>
+#include <phys-map.h>
+#include <nvram.h>
+#include <xscom-p9-regs.h>
+#include <phb4.h>
+#include <cache-p9.h>
+
+#define VENDOR_CAP_START 0x80
+#define VENDOR_CAP_END 0x90
+#define VENDOR_CAP_LEN 0x10
+#define VENDOR_CAP_VERSION 0x01
+#define VENDOR_CAP_PCI_DEV_OFFSET 0x0d
+
+/*
+ * NPU2 BAR layout definition. We have 3 stacks and each of them
+ * contains 2 bricks. So every NPU2 has 6 bricks in total. There are 2
+ * PHY BARs and each of them is shared by 3 bricks. Every brick has
+ * one NTL BAR and two bricks share one GENID BAR. There is also a
+ * global MMIO BAR. We only expose DL and GENID BARs to the OS and all
+ * other BARs will be hidden in skiboot.
+ *
+ * Before the global MMIO BAR is configured, scom is the only way to
+ * access the BAR registers. At NPU2 PHB probing time, we rely on scom
+ * to assign all BARs until the global MMIO BAR is established.
+ *
+ * We need to access 4 SM registers in the same stack in order to
+ * configure one particular BAR.
+ */
+
+/* Set a specific flag in the vendor config space */
+void npu2_set_link_flag(struct npu2_dev *ndev, uint8_t flag)
+{
+ ndev->nvlink.link_flags |= flag;
+ PCI_VIRT_CFG_INIT_RO(ndev->nvlink.pvd, VENDOR_CAP_START +
+ VENDOR_CAP_PCI_DEV_OFFSET, 1, ndev->nvlink.link_flags);
+}
+
+void npu2_clear_link_flag(struct npu2_dev *ndev, uint8_t flag)
+{
+ ndev->nvlink.link_flags &= ~flag;
+ PCI_VIRT_CFG_INIT_RO(ndev->nvlink.pvd, VENDOR_CAP_START +
+ VENDOR_CAP_PCI_DEV_OFFSET, 1, ndev->nvlink.link_flags);
+}
+
+static inline void npu2_ioda_sel(struct npu2 *p, uint32_t table,
+ uint32_t index, bool autoinc)
+{
+ out_be64(p->regs + NPU2_ATS_IODA_TBL,
+ (autoinc ? NPU2_ATS_IODA_TBL_AUTOINC : 0ul) |
+ SETFIELD(NPU2_ATS_IODA_TBL_SELECT, 0ul, table) |
+ SETFIELD(NPU2_ATS_IODA_TBL_INDEX, 0ul, index));
+}
+
+static struct npu2_dev *npu2_bdf_to_dev(struct npu2 *p,
+ uint32_t bdfn)
+{
+ struct pci_virt_device *pvd;
+
+ /* All emulated devices are attached to the root bus */
+ if (bdfn & ~0xff)
+ return NULL;
+
+ pvd = pci_virt_find_device(&p->phb_nvlink, bdfn);
+ if (pvd)
+ return pvd->data;
+
+ return NULL;
+}
+
+static inline void npu2_get_bar(uint32_t gcid, struct npu2_bar *bar)
+{
+ phys_map_get(gcid, bar->type, bar->index, &bar->base, &bar->size);
+}
+
+static void npu2_read_bar(struct npu2 *p, struct npu2_bar *bar)
+{
+ uint64_t reg, val;
+ int enabled;
+
+ reg = NPU2_REG_OFFSET(0, NPU2_BLOCK_SM_0, bar->reg);
+ val = npu2_read(p, reg);
+
+ switch (NPU2_REG(bar->reg)) {
+ case NPU2_PHY_BAR:
+ bar->base = GETFIELD(NPU2_PHY_BAR_ADDR, val) << 21;
+ enabled = GETFIELD(NPU2_PHY_BAR_ENABLE, val);
+
+ if (NPU2_REG_STACK(reg) == NPU2_STACK_STCK_2)
+ /* This is the global MMIO BAR */
+ bar->size = 0x1000000;
+ else
+ bar->size = 0x200000;
+ break;
+ case NPU2_NTL0_BAR:
+ case NPU2_NTL1_BAR:
+ bar->base = GETFIELD(NPU2_NTL_BAR_ADDR, val) << 16;
+ enabled = GETFIELD(NPU2_NTL_BAR_ENABLE, val);
+ bar->size = 0x10000 << GETFIELD(NPU2_NTL_BAR_SIZE, val);
+ break;
+ case NPU2_GENID_BAR:
+ bar->base = GETFIELD(NPU2_GENID_BAR_ADDR, val) << 16;
+ enabled = GETFIELD(NPU2_GENID_BAR_ENABLE, val);
+ bar->size = 0x20000;
+ break;
+ default:
+ bar->base = 0ul;
+ enabled = 0;
+ bar->size = 0;
+ break;
+ }
+
+ bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, bar->flags, enabled);
+}
+
+static void npu2_write_bar(struct npu2 *p,
+ struct npu2_bar *bar,
+ uint32_t gcid,
+ uint32_t scom)
+{
+ uint64_t reg, val, enable = !!(bar->flags & NPU2_BAR_FLAG_ENABLED);
+ int block;
+
+ switch (NPU2_REG(bar->reg)) {
+ case NPU2_PHY_BAR:
+ val = SETFIELD(NPU2_PHY_BAR_ADDR, 0ul, bar->base >> 21);
+ val = SETFIELD(NPU2_PHY_BAR_ENABLE, val, enable);
+ break;
+ case NPU2_NTL0_BAR:
+ case NPU2_NTL1_BAR:
+ val = SETFIELD(NPU2_NTL_BAR_ADDR, 0ul, bar->base >> 16);
+ val = SETFIELD(NPU2_NTL_BAR_ENABLE, val, enable);
+ val = SETFIELD(NPU2_NTL_BAR_SIZE, val, 1);
+ break;
+ case NPU2_GENID_BAR:
+ val = SETFIELD(NPU2_GENID_BAR_ADDR, 0ul, bar->base >> 16);
+ val = SETFIELD(NPU2_GENID_BAR_ENABLE, val, enable);
+ break;
+ default:
+ val = 0ul;
+ }
+
+ for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ reg = NPU2_REG_OFFSET(0, block, bar->reg);
+ if (p)
+ npu2_write(p, reg, val);
+ else
+ npu2_scom_write(gcid, scom, reg, NPU2_MISC_DA_LEN_8B, val);
+ }
+}
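+
+/*
+ * Note on the size encodings used above: npu2_read_bar() decodes an
+ * NTL BAR size as 0x10000 << NPU2_NTL_BAR_SIZE, while npu2_write_bar()
+ * always writes a size field of 1, i.e. a 0x20000 (128KB) NTL window.
+ */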
+
+/* Trap for PCI command (0x4) to enable or disable device's BARs */
+static int64_t npu2_cfg_write_cmd(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu2_dev *ndev = pvd->data;
+ struct npu2_bar *ntl_npu_bar, *genid_npu_bar;
+ bool enabled;
+
+ if (!write)
+ return OPAL_PARTIAL;
+
+ if (offset != PCI_CFG_CMD)
+ return OPAL_PARAMETER;
+ if (size != 1 && size != 2 && size != 4)
+ return OPAL_PARAMETER;
+
+ /*
+ * Enable or disable NTL and GENID BAR. Two bricks share
+ * one GENID BAR, which is exposed via the first brick.
+ */
+ enabled = !!(*data & PCI_CFG_CMD_MEM_EN);
+ ntl_npu_bar = &ndev->bars[0].npu2_bar;
+ genid_npu_bar = &ndev->bars[1].npu2_bar;
+
+ ntl_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, ntl_npu_bar->flags, enabled);
+ npu2_write_bar(ndev->npu, ntl_npu_bar, 0, 0);
+
+ /*
+ * Enable/disable the GENID BAR. Two bricks share one GENID
+ * BAR which is exposed via the first brick so we need to
+ * track the enables separately.
+ */
+ if (NPU2DEV_BRICK(ndev))
+ genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED1, genid_npu_bar->flags,
+ enabled);
+ else
+ genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED0, genid_npu_bar->flags,
+ enabled);
+
+ /* Enable the BAR if either device requests it enabled, otherwise disable it */
+ genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, genid_npu_bar->flags,
+ !!(genid_npu_bar->flags & (NPU2_BAR_FLAG_ENABLED0 |
+ NPU2_BAR_FLAG_ENABLED1)));
+ npu2_write_bar(ndev->npu, genid_npu_bar, 0, 0);
+
+ return OPAL_PARTIAL;
+}
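+
+/*
+ * To summarise the shared-enable handling above: each brick records
+ * its own request in NPU2_BAR_FLAG_ENABLED0/ENABLED1, and the real
+ * NPU2_BAR_FLAG_ENABLED bit written to hardware is the logical OR of
+ * the two, so the shared GENID BAR stays enabled as long as either
+ * emulated device has memory decoding turned on.
+ */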
+
+static int64_t npu2_cfg_read_bar(struct npu2_dev *dev __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size,
+ uint32_t *data)
+{
+ struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data;
+
+ if (!(bar->flags & NPU2_PCIE_BAR_FLAG_TRAPPED))
+ return OPAL_PARTIAL;
+
+ if ((size != 4) ||
+ (offset != pcrf->start && offset != pcrf->start + 4))
+ return OPAL_PARAMETER;
+
+ if (bar->flags & NPU2_PCIE_BAR_FLAG_SIZE_HI)
+ *data = bar->npu2_bar.size >> 32;
+ else
+ *data = bar->npu2_bar.size;
+ bar->flags &= ~(NPU2_PCIE_BAR_FLAG_TRAPPED | NPU2_PCIE_BAR_FLAG_SIZE_HI);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_cfg_write_bar(struct npu2_dev *dev,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size,
+ uint32_t data)
+{
+ struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data;
+ struct npu2_bar old_bar, *npu2_bar = &bar->npu2_bar;
+
+ if ((size != 4) ||
+ (offset != pcrf->start && offset != pcrf->start + 4))
+ return OPAL_PARAMETER;
+
+ /* Return BAR size on next read */
+ if (data == 0xffffffff) {
+ bar->flags |= NPU2_PCIE_BAR_FLAG_TRAPPED;
+ if (offset == pcrf->start + 4)
+ bar->flags |= NPU2_PCIE_BAR_FLAG_SIZE_HI;
+
+ return OPAL_SUCCESS;
+ }
+
+ if (offset == pcrf->start) {
+ npu2_bar->base &= 0xffffffff00000000UL;
+ npu2_bar->base |= (data & 0xfffffff0);
+ } else {
+ npu2_bar->base &= 0x00000000ffffffffUL;
+ npu2_bar->base |= ((uint64_t)data << 32);
+
+ if (NPU2_REG(npu2_bar->reg) == NPU2_GENID_BAR && NPU2DEV_BRICK(dev))
+ npu2_bar->base -= 0x10000;
+
+ old_bar.reg = npu2_bar->reg;
+ npu2_read_bar(dev->npu, &old_bar);
+
+ /* Only allow changing the base address if the BAR is not enabled */
+ if ((npu2_bar->flags & NPU2_BAR_FLAG_ENABLED) &&
+ (npu2_bar->base != old_bar.base)) {
+ npu2_bar->base = old_bar.base;
+ return OPAL_HARDWARE;
+ }
+
+ npu2_write_bar(dev->npu, &bar->npu2_bar, 0, 0);
+ }
+
+ /* To update the config cache */
+ return OPAL_PARTIAL;
+}
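+
+/*
+ * Example of the BAR sizing handshake emulated above: when the OS
+ * writes 0xffffffff to either half of the 64-bit BAR, the TRAPPED
+ * flag is set and the next config read of that half returns the
+ * corresponding half of the BAR size instead of the base. For a
+ * 128KB NTL BAR that read returns 0x00020000 from the low word and 0
+ * from the high word.
+ */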
+
+static int64_t npu2_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len, uint32_t *data,
+ bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu2_dev *ndev = (struct npu2_dev *) pvd->data;
+
+ if (write)
+ return npu2_cfg_write_bar(ndev, pcrf, offset, len, *data);
+
+ return npu2_cfg_read_bar(ndev, pcrf, offset, len, data);
+}
+
+static int64_t npu2_dev_cfg_exp_devcap(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu2_dev *ndev = pvd->data;
+ int rc;
+
+ assert(write);
+
+ if ((size != 2) || (offset & 1)) {
+ /* Short config writes are not supported */
+ prlog(PR_ERR, "NPU%d: Unsupported write to pcie control register\n",
+ ndev->nvlink.phb->opal_id);
+ return OPAL_PARAMETER;
+ }
+
+ if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
+ npu2_dev_procedure_reset(ndev);
+
+ rc = purge_l2_l3_caches();
+ if (rc)
+ return rc;
+
+ return OPAL_PARTIAL;
+}
+
+#define NPU2_CFG_READ(size, type) \
+static int64_t npu2_cfg_read##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type *data) \
+{ \
+ uint32_t val; \
+ int64_t ret; \
+ \
+ ret = pci_virt_cfg_read(phb, bdfn, offset, \
+ sizeof(*data), &val); \
+ *data = (type)val; \
+ return ret; \
+}
+#define NPU2_CFG_WRITE(size, type) \
+static int64_t npu2_cfg_write##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type data) \
+{ \
+ uint32_t val = data; \
+ int64_t ret; \
+ \
+ ret = pci_virt_cfg_write(phb, bdfn, offset, \
+ sizeof(data), val); \
+ return ret; \
+}
+
+NPU2_CFG_READ(8, u8);
+NPU2_CFG_READ(16, u16);
+NPU2_CFG_READ(32, u32);
+NPU2_CFG_WRITE(8, u8);
+NPU2_CFG_WRITE(16, u16);
+NPU2_CFG_WRITE(32, u32);
+
+static int __npu2_dev_bind_pci_dev(struct phb *phb __unused,
+ struct pci_device *pd,
+ void *data)
+{
+ struct npu2_dev *dev = data;
+ struct dt_node *pci_dt_node;
+ char *pcislot;
+
+ /* Ignore non-NVIDIA PCI devices */
+ if ((pd->vdid & 0xffff) != 0x10de)
+ return 0;
+
+ /* Find the PCI device's slot location */
+ for (pci_dt_node = pd->dn;
+ pci_dt_node && !dt_find_property(pci_dt_node, "ibm,loc-code");
+ pci_dt_node = pci_dt_node->parent);
+
+ if (!pci_dt_node)
+ return 0;
+
+ pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,loc-code");
+
+ NPU2DEVDBG(dev, "Comparing GPU '%s' and NPU2 '%s'\n",
+ pcislot, dev->nvlink.slot_label);
+
+ if (streq(pcislot, dev->nvlink.slot_label))
+ return 1;
+
+ return 0;
+}
+
+static int64_t npu2_gpu_bridge_sec_bus_reset(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t len,
+ uint32_t *data, bool write)
+{
+ struct pci_device *pd = dev;
+ struct pci_device *gpu;
+ struct phb *npphb;
+ struct npu2 *npu;
+ struct dt_node *np;
+ struct npu2_dev *ndev;
+ int i;
+
+ assert(write);
+
+ if ((len != 2) || (offset & 1)) {
+ /* Short config writes are not supported */
+ PCIERR(pd->phb, pd->bdfn,
+ "Unsupported write to bridge control register\n");
+ return OPAL_PARAMETER;
+ }
+
+ gpu = list_top(&pd->children, struct pci_device, link);
+ if (gpu && (*data & PCI_CFG_BRCTL_SECONDARY_RESET)) {
+ int64_t rc;
+
+ dt_for_each_compatible(dt_root, np, "ibm,power9-npu-pciex") {
+ npphb = pci_get_phb(dt_prop_get_cell(np,
+ "ibm,opal-phbid", 1));
+ if (!npphb || npphb->phb_type != phb_type_npu_v2)
+ continue;
+
+ npu = phb_to_npu2_nvlink(npphb);
+ for (i = 0; i < npu->total_devices; ++i) {
+ ndev = &npu->devices[i];
+ if (ndev->nvlink.pd == gpu)
+ npu2_dev_procedure_reset(ndev);
+ }
+ }
+
+ rc = purge_l2_l3_caches();
+ if (rc)
+ return rc;
+ }
+
+ return OPAL_PARTIAL;
+}
+
+static void npu2_dev_bind_pci_dev(struct npu2_dev *dev)
+{
+ struct phb *phb;
+ uint32_t i;
+
+ if (dev->nvlink.pd)
+ return;
+
+ for (i = 0; i < 64; i++) {
+ if (dev->npu->phb_nvlink.opal_id == i)
+ continue;
+
+ phb = pci_get_phb(i);
+ if (!phb)
+ continue;
+
+ dev->nvlink.pd = pci_walk_dev(phb, NULL, __npu2_dev_bind_pci_dev, dev);
+ if (dev->nvlink.pd) {
+ dev->nvlink.phb = phb;
+ /* Found the device, set the bit in config space */
+ npu2_set_link_flag(dev, NPU2_DEV_PCI_LINKED);
+
+ /*
+ * We define a custom sec bus reset handler for a slot
+ * with an NVLink-connected GPU to prevent HMIs which
+ * will otherwise happen if we reset the GPU before
+ * resetting the NVLinks.
+ */
+ if (dev->nvlink.pd->parent &&
+ dev->nvlink.pd->parent->slot)
+ pci_add_cfg_reg_filter(dev->nvlink.pd->parent,
+ PCI_CFG_BRCTL, 2,
+ PCI_REG_FLAG_WRITE,
+ npu2_gpu_bridge_sec_bus_reset);
+ return;
+ }
+ }
+
+ NPU2DEVINF(dev, "No PCI device found for slot '%s'\n",
+ dev->nvlink.slot_label);
+}
+
+static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED;
+
+static void npu2_append_phandle(struct dt_node *dn,
+ u32 phandle)
+{
+ struct dt_property *prop;
+ uint32_t *npu_phandles;
+ size_t len;
+
+ /*
+ * Use a lock to make sure no one else has a reference to an
+ * ibm,npu property (this assumes this is the only function
+ * that holds a reference to it)
+ */
+ lock(&pci_npu_phandle_lock);
+
+ /* This function shouldn't be called unless ibm,npu exists */
+ prop = (struct dt_property *)dt_require_property(dn, "ibm,npu", -1);
+
+ /* Need to append to the properties */
+ len = prop->len + sizeof(*npu_phandles);
+ dt_resize_property(&prop, len);
+
+ npu_phandles = (uint32_t *)prop->prop;
+ npu_phandles[len / sizeof(*npu_phandles) - 1] = phandle;
+ unlock(&pci_npu_phandle_lock);
+}
+
+static struct dt_node *npu2_create_memory_dn(uint64_t addr, uint64_t size)
+{
+ struct dt_node *mem;
+ static u32 chip_id = 255;
+
+ mem = dt_find_by_name_addr(dt_root, "memory", addr);
+ if (mem)
+ return mem;
+
+ mem = dt_new_addr(dt_root, "memory", addr);
+ if (!mem)
+ return NULL;
+ dt_add_property_string(mem, "device_type", "memory");
+ dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory");
+ dt_add_property_u64s(mem, "reg", addr, size);
+ dt_add_property_cells(mem, "ibm,chip-id", chip_id);
+ dt_add_property_u64s(mem, "linux,usable-memory", addr, 0);
+ dt_add_property_cells(mem, "ibm,associativity", 4, chip_id, chip_id, chip_id, chip_id);
+ chip_id--;
+
+ assert(chip_id);
+ return mem;
+}
+
+/* There are potentially multiple links per GPU, so look up the GPU memory based
+ * on the bdfn. */
+static void npu2_get_gpu_base(struct npu2_dev *ndev, uint64_t *addr, uint64_t *size)
+{
+ struct npu2 *p = ndev->npu;
+ int group;
+
+ group = PCI_DEV(ndev->bdfn);
+ phys_map_get(ndev->npu->chip_id, p->gpu_map_type, group, addr, size);
+}
+
+static void npu2_dn_fixup_gmb(struct dt_node *pd_dn, struct npu2_dev *ndev)
+{
+ uint64_t gpu_base, gpu_size, gta;
+ struct dt_node *mem_dn;
+
+ npu2_get_gpu_base(ndev, &gpu_base, &gpu_size);
+ mem_dn = npu2_create_memory_dn(gpu_base, gpu_size);
+ assert(mem_dn);
+ dt_add_property_cells(pd_dn, "memory-region", mem_dn->phandle);
+
+ /* Coral mode address compression. This is documented in Figure 3.5,
+ * "P9->GPU RA Compression (Coral)", of the NPU2 workbook. */
+ gta = ((gpu_base >> 42) & 0x1) << 42;
+ gta |= ((gpu_base >> 45) & 0x3) << 43;
+ gta |= ((gpu_base >> 49) & 0x3) << 45;
+ gta |= gpu_base & ((1UL << 43) - 1);
+
+ dt_add_property_u64s(pd_dn, "ibm,device-tgt-addr", gta);
+}
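+
+/*
+ * The compression above can be read as follows (bits counted from the
+ * least significant bit): GTA bits 0-42 are a straight copy of real
+ * address bits 0-42, RA bits 45-46 move down to GTA bits 43-44 and RA
+ * bits 49-50 move down to GTA bits 45-46; RA bits 43-44 and 47-48 are
+ * dropped.
+ */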
+
+static int npu2_assign_gmb(struct npu2_dev *ndev)
+{
+ struct npu2 *p = ndev->npu;
+ int peers, mode;
+ uint32_t bdfn;
+ uint64_t base, size, reg, val, gmb;
+
+ /* Need to work out the number of link peers. This amounts to
+ * working out the maximum function number. So start at the
+ * highest bdfn (fn = 6) and count back until we find an
+ * npu2_dev. */
+ for (bdfn = (ndev->bdfn & ~0x7) | NPU2_LINKS_PER_CHIP;
+ PCI_FUNC(bdfn) != 0x7; bdfn = (bdfn & ~0x7) | (PCI_FUNC(bdfn) - 1))
+ if (npu2_bdf_to_dev(p, bdfn))
+ break;
+ peers = PCI_FUNC(bdfn);
+
+ npu2_get_gpu_base(ndev, &base, &size);
+
+ NPU2DBG(p, "Setting BAR region dt:%llx\n", base);
+ val = SETFIELD(NPU2_MEM_BAR_EN, 0ULL, 1);
+ val = SETFIELD(NPU2_MEM_BAR_SEL_MEM, val, base >> (63-14));
+ val = SETFIELD(NPU2_MEM_BAR_GROUP, val, base >> (63-18));
+ val = SETFIELD(NPU2_MEM_BAR_CHIP, val, base >> (63-21));
+ val = SETFIELD(NPU2_MEM_BAR_NODE_ADDR, val, base >> (63-33));
+ val = SETFIELD(NPU2_MEM_BAR_POISON, val, 1);
+ val = SETFIELD(NPU2_MEM_BAR_GRANULE, val, 0);
+
+ /* We don't know how much memory the GPU has, so we may as well just
+ * pass the whole aperture through at this point. */
+ val = SETFIELD(NPU2_MEM_BAR_BAR_SIZE, val, ilog2(size >> 30));
+
+ switch (peers) {
+ case 0:
+ mode = 0;
+ break;
+ case 1:
+ mode = 1;
+ break;
+ case 2:
+ mode = 3;
+ break;
+ case 3:
+ mode = 6;
+ break;
+ case 5:
+ mode = 10;
+ break;
+ default:
+ /* Hardware does not support this configuration */
+ assert(0);
+ }
+
+ mode += PCI_FUNC(ndev->bdfn);
+ val = SETFIELD(NPU2_MEM_BAR_MODE, val, mode);
+
+ gmb = NPU2_GPU0_MEM_BAR;
+ if (NPU2DEV_BRICK(ndev))
+ gmb = NPU2_GPU1_MEM_BAR;
+
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
+ NPU2_BLOCK_SM_0, gmb);
+
+ npu2_write(p, reg, val);
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
+ NPU2_BLOCK_SM_1, gmb);
+ npu2_write(p, reg, val);
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
+ NPU2_BLOCK_SM_2, gmb);
+ npu2_write(p, reg, val);
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
+ NPU2_BLOCK_SM_3, gmb);
+ npu2_write(p, reg, val);
+
+ return 0;
+}
+
+static int npu2_dn_fixup(struct phb *phb,
+ struct pci_device *pd,
+ void *data __unused)
+{
+ struct npu2 *p = phb_to_npu2_nvlink(phb);
+ struct npu2_dev *dev;
+ uint32_t speed;
+ const char *label;
+
+ dev = npu2_bdf_to_dev(p, pd->bdfn);
+ assert(dev);
+ if (dev->nvlink.phb || dev->nvlink.pd)
+ return 0;
+
+ npu2_assign_gmb(dev);
+ npu2_dn_fixup_gmb(pd->dn, dev);
+ dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dt_node->phandle);
+
+ /*
+ * NVLink supports multiple speeds and device drivers need to know what
+ * speed has been set by firmware. Hostboot does the inits that set the
+ * link speed and tells us via HDAT, and we need to copy that from the
+ * link node.
+ */
+ speed = dt_prop_get_u32_def(dev->dt_node, "nvidia,link-speed", 0xff);
+ if (speed != 0xff)
+ dt_add_property_cells(pd->dn, "ibm,nvlink-speed", speed);
+
+ /*
+ * NPU2 devices have a slot label that indicates which GPU slot
+ * this NPU is connected to. Add a location code to the NVlink
+ * device node based on the slot label.
+ */
+ label = dt_prop_get_def(dev->dt_node, "ibm,slot-label", NULL);
+ if (!label) {
+ /**
+ * @fwts-label NPUNoPHBSlotLabel
+ * @fwts-advice No GPU/NPU2 slot information was found.
+ * NVLink2 functionality will not work.
+ */
+ prlog(PR_ERR, "NPU: Cannot find GPU slot information\n");
+ return 0;
+ }
+ dt_add_property_string(pd->dn, "ibm,loc-code", label);
+
+ dev->nvlink.slot_label = label;
+
+ /*
+ * Bind the emulated PCI device with the real one, which can't
+ * be done until the PCI devices are populated. Once the real
+ * PCI device is identified, we also need to fix the device-tree
+ * for it.
+ */
+ npu2_dev_bind_pci_dev(dev);
+ if (dev->nvlink.phb && dev->nvlink.pd && dev->nvlink.pd->dn) {
+ if (dt_find_property(dev->nvlink.pd->dn, "ibm,npu"))
+ npu2_append_phandle(dev->nvlink.pd->dn, pd->dn->phandle);
+ else
+ dt_add_property_cells(dev->nvlink.pd->dn, "ibm,npu", pd->dn->phandle);
+
+ dt_add_property_cells(pd->dn, "ibm,gpu", dev->nvlink.pd->dn->phandle);
+ dev->nvlink.gpu_bdfn = dev->nvlink.pd->bdfn;
+ }
+
+ return 0;
+}
+
+static int npu2_links_per_gpu(struct phb *phb,
+ struct pci_device *pd,
+ void *data)
+{
+ struct npu2 *p = phb_to_npu2_nvlink(phb);
+ struct npu2_dev *dev;
+ int *nlinks = (int *)data;
+
+ dev = npu2_bdf_to_dev(p, pd->bdfn);
+ assert(dev);
+
+ if (dev->nvlink.phb && dev->nvlink.pd && dev->nvlink.pd->dn) {
+ const struct dt_property *prop;
+ int n;
+
+ /* The link count is the number of phandles in "ibm,npu" */
+ prop = dt_find_property(dev->nvlink.pd->dn, "ibm,npu");
+ if (!prop)
+ return 0;
+
+ /* Count could vary by GPU, so find the max */
+ n = prop->len / sizeof(uint32_t);
+ if (n > *nlinks)
+ *nlinks = n;
+ }
+
+ return 0;
+}
+
+static void npu2_phb_fixup_scominit(struct dt_node *dn, int links_per_gpu)
+{
+ uint32_t gcid = dt_get_chip_id(dn);
+ uint64_t val, mask;
+
+ /*
+ * MRBSP settings for 2- and 3-link GPU systems. These can improve
+ * GPU peer-to-peer fully ordered write performance.
+ */
+ if (links_per_gpu == 3) {
+ val = PPC_BIT(30) | PPC_BIT(34) | PPC_BIT(36) | PPC_BIT(37) |
+ PPC_BIT(44) | PPC_BIT(45);
+ mask = PPC_BITMASK(28,39) | PPC_BITMASK(44,47);
+ } else if (links_per_gpu == 2) {
+ val = PPC_BIT(46) | PPC_BIT(47);
+ mask = PPC_BITMASK(44,47);
+ } else
+ return;
+
+ xscom_write_mask(gcid, 0x50110c0, val, mask);
+ xscom_write_mask(gcid, 0x50112c0, val, mask);
+ xscom_write_mask(gcid, 0x50114c0, val, mask);
+}
+
+static void npu2_phb_final_fixup(struct phb *phb)
+{
+ int links_per_gpu = 0;
+ struct dt_node *np;
+
+ pci_walk_dev(phb, NULL, npu2_dn_fixup, NULL);
+
+ /*
+ * Now that the emulated devices are bound to the real ones, we can
+ * determine links_per_gpu and do some final init.
+ */
+ pci_walk_dev(phb, NULL, npu2_links_per_gpu, &links_per_gpu);
+ dt_for_each_compatible(dt_root, np, "ibm,power9-npu")
+ npu2_phb_fixup_scominit(np, links_per_gpu);
+}
+
+static void npu2_init_ioda_cache(struct npu2 *p)
+{
+ /* TVT */
+ memset(p->tve_cache, 0, sizeof(p->tve_cache));
+}
+
+static int64_t npu2_ioda_reset(struct phb *phb, bool purge)
+{
+ struct npu2 *p = phb_to_npu2_nvlink(phb);
+ uint32_t i;
+
+ if (purge) {
+ NPU2DBG(p, "Purging all IODA tables...\n");
+ npu2_init_ioda_cache(p);
+ }
+
+ /* TVT */
+ npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
+ out_be64(p->regs + NPU2_ATS_IODA_DATA, p->tve_cache[i]);
+
+ return OPAL_SUCCESS;
+}
+
+static void npu2_write_mcd(struct npu2 *p, uint64_t pcb_addr, uint64_t addr,
+ uint64_t size)
+{
+ uint64_t val;
+
+ NPU2DBG(p, "Setting MCD addr:%llx\n", pcb_addr);
+ assert(is_pow2(size));
+
+ val = MCD_BANK_CN_VALID;
+ val = SETFIELD(MCD_BANK_CN_SIZE, val, (size >> 25) - 1);
+ val = SETFIELD(MCD_BANK_CN_ADDR, val, addr >> 25);
+ xscom_write(p->chip_id, pcb_addr, val);
+}
+
+static void npu2_mcd_init(struct npu2 *p)
+{
+ int i;
+ uint64_t size, addr, gpu_min_addr, gpu_max_addr, total_size;
+
+ /* Init memory cache directory (MCD) registers. */
+ phys_map_get(p->chip_id, p->gpu_map_type, NPU2_LINKS_PER_CHIP - 1,
+ &gpu_min_addr, NULL);
+ phys_map_get(p->chip_id, p->gpu_map_type, 0, &gpu_max_addr, &size);
+ gpu_max_addr += size;
+
+ /* We assume GPU memory is contiguous from the first possible GPU to the
+ * last and that the size is the same, so it's best to check that. */
+ for (i = 0; i < NPU2_LINKS_PER_CHIP; i++) {
+ uint64_t tmp;
+ phys_map_get(p->chip_id, p->gpu_map_type, i, &addr, &tmp);
+ assert((addr >= gpu_min_addr) && (addr + tmp <= gpu_max_addr));
+ assert(tmp == size);
+ }
+
+ /* We have two MCDs, so if necessary we can split the region covered
+ * across both when total_size is not a power of two. */
+ total_size = gpu_max_addr - gpu_min_addr;
+ size = 1ull << ilog2(total_size);
+
+ /* Allocate the biggest chunk first as we assume gpu_max_addr has the
+ * highest alignment. */
+ addr = gpu_max_addr - size;
+ npu2_write_mcd(p, MCD0_BANK0_CN3, addr, size);
+ total_size -= size;
+ if (total_size) {
+ /* total_size was not a power of two, but the remainder should
+ * be if all GPUs were assigned the same size. */
+ assert(is_pow2(total_size));
+ size = 1ull << ilog2(total_size);
+ addr -= size;
+ assert(addr <= gpu_min_addr);
+ npu2_write_mcd(p, MCD1_BANK0_CN3, addr, size);
+ }
+}
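+
+/*
+ * A hypothetical example of the split above: with six GPUs of 64GB
+ * each the region is 384GB, which is not a power of two, so MCD0
+ * covers the top 256GB and MCD1 the 128GB below it. Each MCD bank is
+ * programmed in 32MB units, since npu2_write_mcd() encodes the size
+ * as (size >> 25) - 1 and the address as addr >> 25.
+ */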
+
+static void npu2_hw_init(struct npu2 *p)
+{
+ uint64_t reg, val;
+ int s, b;
+
+ npu2_ioda_reset(&p->phb_nvlink, false);
+
+ /* Enable XTS retry mode */
+ val = npu2_read(p, NPU2_XTS_CFG);
+ npu2_write(p, NPU2_XTS_CFG, val | NPU2_XTS_CFG_MMIOSD | NPU2_XTS_CFG_TRY_ATR_RO);
+
+ val = npu2_read(p, NPU2_XTS_CFG2);
+ npu2_write(p, NPU2_XTS_CFG2, val | NPU2_XTS_CFG2_NO_FLUSH_ENA);
+
+ /*
+ * There are three different ways we configure the MCD and memory map.
+ * 1) Old way
+ * Skiboot configures the MCD and puts GPUs at 4TB and below
+ * 2) New way with MCD
+ * Hostboot configures the MCD and skiboot puts GPU at 4TB and above
+ * 3) New way without MCD
+ * No one configures the MCD and skiboot puts GPU at 4TB and below
+ *
+ * 1) Will go away eventually as it's a configuration that can
+ * cause an xstop or data integrity problems. We are keeping
+ * it around to support existing hostboot. Print an error
+ * message if used.
+ * 2) Is for smaller memory configurations and will be used
+ * initially for GPUs on Witherspoon. Supports only up to
+ * 512GB of memory and 4 GPUs per socket.
+ * 3) Is for fully populated configurations of 4TB of memory
+ * and 6 GPUs per socket. May have performance impacts.
+ *
+ * The different configurations can be detected via the following scoms:
+ * 1) 0x5011c0c bit 2 = 1, 0x5011c0a bits 42:48 = 0
+ * 2) 0x5011c0c bit 2 = 1, 0x5011c0a bits 42:48 = 7
+ * 3) 0x5011c0c bit 2 = 0, 0x5011c0a bits 42:48 = 0
+ */
+
+ /* Get 0x05011c0c bit 2 = 1 */
+ xscom_read(p->chip_id, PB_CENT_HP_MODE_CURR, &val);
+ if ((val & PB_CFG_CHG_RATE_GP_MASTER) != 0) {
+ /* Get 0x05011c0a bits 42:48 */
+ xscom_read(p->chip_id, PB_CENT_MODE, &val);
+ if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 0) {
+ /* 1) */
+ NPU2DBG(p, "Using old memory map + MCD enabled in skiboot\n");
+ NPU2ERR(p, "!!! Old firmware detected. Update hostboot for new MCD mapping !!!\n");
+ p->gpu_map_type = GPU_MEM_4T_DOWN;
+ npu2_mcd_init(p);
+ } else if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 7) {
+ /* 2) */
+ NPU2DBG(p, "Using small memory map + MCD enabled\n");
+ p->gpu_map_type = GPU_MEM_4T_UP;
+ } else
+ NPU2ERR(p, "!!! Unsupported NPU2 configuration. "
+ "0x%llx!!!\n", val);
+ } else {
+ /* 3) */
+ NPU2DBG(p, "Using large memory map + MCD disabled\n");
+ p->gpu_map_type = GPU_MEM_4T_DOWN;
+ }
+
+ /* Static initialization of every relaxed-ordering cfg[2] register */
+ val = NPU2_RELAXED_ORDERING_CMD_CL_DMA_W |
+ NPU2_RELAXED_ORDERING_CMD_CL_DMA_W_HP |
+ NPU2_RELAXED_ORDERING_CMD_CL_DMA_INJ |
+ NPU2_RELAXED_ORDERING_CMD_PR_DMA_INJ |
+ NPU2_RELAXED_ORDERING_CMD_DMA_PR_W |
+ NPU2_RELAXED_ORDERING_CMD_CL_RD_NC_F0 |
+ NPU2_RELAXED_ORDERING_SOURCE4_RDENA;
+
+ for (s = NPU2_STACK_STCK_0; s <= NPU2_STACK_STCK_2; s++) {
+ for (b = NPU2_BLOCK_SM_0; b <= NPU2_BLOCK_SM_3; b++) {
+ reg = NPU2_REG_OFFSET(s, b, NPU2_RELAXED_ORDERING_CFG(2));
+ npu2_write(p, reg, val);
+ }
+ }
+}
+
+static int64_t npu2_map_pe_dma_window_real(struct phb *phb,
+ uint64_t pe_num,
+ uint16_t window_id,
+ uint64_t pci_start_addr __unused,
+ uint64_t pci_mem_size __unused)
+{
+ struct npu2 *p = phb_to_npu2_nvlink(phb);
+ uint64_t tve;
+
+ /* Sanity check. Each PE has one corresponding TVE */
+ if (pe_num >= NPU2_MAX_PE_NUM ||
+ window_id != pe_num)
+ return OPAL_PARAMETER;
+
+ if (pci_mem_size) {
+ /* GPUs need to be able to access the MMIO memory space as well.
+ * On POWER9 this is above the top of RAM, so disable the TVT
+ * range check, allowing access to all memory addresses. */
+ tve = 0;
+ } else {
+ /* Disable */
+ tve = PPC_BIT(51);
+ }
+
+ npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
+ out_be64(p->regs + NPU2_ATS_IODA_DATA, tve);
+ p->tve_cache[window_id] = tve;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_map_pe_dma_window(struct phb *phb,
+ uint64_t pe_num,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct npu2 *p = phb_to_npu2_nvlink(phb);
+ uint64_t tts_encoded;
+ uint64_t data64 = 0;
+
+ /* Sanity check. Each PE has one corresponding TVE */
+ if (pe_num >= NPU2_MAX_PE_NUM ||
+ window_id != pe_num)
+ return OPAL_PARAMETER;
+
+ /*
+ * Special condition: a zero TCE table size is used to disable
+ * the TVE.
+ */
+ if (!tce_table_size) {
+ npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
+ out_be64(p->regs + NPU2_ATS_IODA_DATA, 0ul);
+ p->tve_cache[window_id] = 0ul;
+ return OPAL_SUCCESS;
+ }
+
+ /* Additional arguments validation */
+ if (tce_levels < 1 ||
+ tce_levels > 4 ||
+ !is_pow2(tce_table_size) ||
+ tce_table_size < 0x1000)
+ return OPAL_PARAMETER;
+
+ /* TCE table size */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_TTA, 0ul, tce_table_addr >> 12);
+ tts_encoded = ilog2(tce_table_size) - 11;
+ if (tts_encoded > 39)
+ return OPAL_PARAMETER;
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_SIZE, data64, tts_encoded);
+
+ /* TCE page size */
+ switch (tce_page_size) {
+ case 0x10000: /* 64K */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 5);
+ break;
+ case 0x1000000: /* 16M */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 13);
+ break;
+ case 0x10000000: /* 256M */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 17);
+ break;
+ case 0x1000: /* 4K */
+ default:
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 1);
+ }
+
+ /* Number of levels */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_LEVEL, data64, tce_levels - 1);
+
+ /* Update to hardware */
+ npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
+ out_be64(p->regs + NPU2_ATS_IODA_DATA, data64);
+ p->tve_cache[window_id] = data64;
+
+ return OPAL_SUCCESS;
+}
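+
+/*
+ * For illustration, both TVT size fields above follow an
+ * ilog2(x) - 11 encoding: a 4KB TCE table (or page) encodes as 1, a
+ * 64KB one as 5, 16MB as 13 and 256MB as 17. npu2_tce_kill() below
+ * reverses the page-size encoding with 1 << (11 + PSIZE) when
+ * validating OPAL_PCI_TCE_KILL_PAGES requests.
+ */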
+
+static int64_t npu2_set_pe(struct phb *phb,
+ uint64_t pe_num,
+ uint64_t bdfn,
+ uint8_t bcompare,
+ uint8_t dcompare,
+ uint8_t fcompare,
+ uint8_t action)
+{
+ struct npu2 *p;
+ struct npu2_dev *dev;
+ uint64_t reg, val;
+
+ /* Sanity check */
+ if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+ if (pe_num >= NPU2_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+ if (bdfn >> 8)
+ return OPAL_PARAMETER;
+ if (bcompare != OpalPciBusAll ||
+ dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
+ fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ return OPAL_UNSUPPORTED;
+ if (phb->phb_type != phb_type_npu_v2)
+ return OPAL_PARAMETER;
+
+ p = phb_to_npu2_nvlink(phb);
+ if (!p)
+ return OPAL_PARAMETER;
+
+ dev = npu2_bdf_to_dev(p, bdfn);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ val = NPU2_CQ_BRICK_BDF2PE_MAP_ENABLE;
+ val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_PE, val, pe_num);
+ val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_BDF, val, dev->nvlink.gpu_bdfn);
+
+ if (!NPU2DEV_BRICK(dev))
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->brick_index/2,
+ NPU2_BLOCK_CTL, NPU2_CQ_BRICK0_BDF2PE_MAP0);
+ else
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->brick_index/2,
+ NPU2_BLOCK_CTL, NPU2_CQ_BRICK1_BDF2PE_MAP0);
+
+ npu2_write(p, reg, val);
+ val = NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE;
+ val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_PE, val, pe_num);
+ val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_BDF, val, dev->nvlink.gpu_bdfn);
+ reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC,
+ NPU2_MISC_BRICK0_BDF2PE_MAP0 + (dev->brick_index * 0x18));
+ npu2_write(p, reg, val);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_get_link_state(struct pci_slot *slot __unused, uint8_t *val)
+{
+ /*
+ * As we're emulating all PCI stuff, the link bandwidth
+ * isn't a big deal anyway.
+ */
+ *val = OPAL_SHPC_LINK_UP_x1;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_get_power_state(struct pci_slot *slot __unused, uint8_t *val)
+{
+ *val = PCI_SLOT_POWER_ON;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_hreset(struct pci_slot *slot __unused)
+{
+ struct npu2 *p;
+ int i;
+ struct npu2_dev *ndev;
+
+ p = phb_to_npu2_nvlink(slot->phb);
+ NPU2INF(p, "Hreset PHB state\n");
+
+ for (i = 0; i < p->total_devices; i++) {
+ ndev = &p->devices[i];
+ if (ndev) {
+ NPU2DEVINF(ndev, "Resetting device\n");
+ reset_ntl(ndev);
+ }
+ }
+ return purge_l2_l3_caches();
+}
+
+static int64_t npu2_freset(struct pci_slot *slot __unused)
+{
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_creset(struct pci_slot *slot)
+{
+ struct npu2 *p;
+ int i;
+ struct npu2_dev *ndev;
+
+ p = phb_to_npu2_nvlink(slot->phb);
+ NPU2INF(p, "Creset PHB state\n");
+
+ for (i = 0; i < p->total_devices; i++) {
+ ndev = &p->devices[i];
+ if (ndev) {
+ NPU2DEVINF(ndev, "Resetting device\n");
+ reset_ntl(ndev);
+ }
+ }
+ return OPAL_SUCCESS;
+}
+
+static struct pci_slot *npu2_slot_create(struct phb *phb)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(phb, NULL);
+ if (!slot)
+ return slot;
+
+ /* Elementary functions */
+ slot->ops.get_presence_state = NULL;
+ slot->ops.get_link_state = npu2_get_link_state;
+ slot->ops.get_power_state = npu2_get_power_state;
+ slot->ops.get_attention_state = NULL;
+ slot->ops.get_latch_state = NULL;
+ slot->ops.set_power_state = NULL;
+ slot->ops.set_attention_state = NULL;
+
+ slot->ops.prepare_link_change = NULL;
+ slot->ops.poll_link = NULL;
+ slot->ops.hreset = npu2_hreset;
+ slot->ops.freset = npu2_freset;
+ slot->ops.creset = npu2_creset;
+
+ return slot;
+}
+
+int64_t npu2_freeze_status(struct phb *phb __unused,
+ uint64_t pe_number __unused,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ /*
+ * FIXME: When it's called by the skiboot PCI config accessor,
+ * the PE number is fixed to 0, which is incorrect. We need to
+ * introduce another PHB callback to translate it. For now,
+ * it keeps the skiboot PCI enumeration going.
+ */
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ if (severity)
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct npu2 *p = phb_to_npu2_nvlink(phb);
+ int i;
+ uint64_t result = 0;
+
+ if (!first_frozen_pe || !pci_error_type || !severity)
+ return OPAL_PARAMETER;
+
+ *first_frozen_pe = -1;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ for (i = 0; i < NPU2_MAX_PE_NUM; i++) {
+ result = npu2_read(p, NPU2_MISC_PESTB(i));
+ if (result > 0) {
+ *first_frozen_pe = i;
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+ break;
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_tce_kill(struct phb *phb, uint32_t kill_type,
+ uint64_t pe_number, uint32_t tce_size,
+ uint64_t dma_addr, uint32_t npages)
+{
+ struct npu2 *npu = phb_to_npu2_nvlink(phb);
+ uint32_t tce_page_size;
+ uint64_t val;
+
+ if (pe_number > NPU2_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ sync();
+ switch(kill_type) {
+ case OPAL_PCI_TCE_KILL_PAGES:
+ tce_page_size = 1ULL << (
+ 11 + GETFIELD(npu->tve_cache[pe_number],
+ NPU2_ATS_IODA_TBL_TVT_PSIZE));
+ if (tce_page_size != tce_size) {
+ NPU2ERR(npu, "npu2_tce_kill: Unexpected TCE size (got 0x%x expected 0x%x)\n",
+ tce_size, tce_page_size);
+ return OPAL_PARAMETER;
+ }
+
+ if (npages < 128) {
+ while (npages--) {
+ val = SETFIELD(NPU2_ATS_TCE_KILL_PENUM, dma_addr, pe_number);
+ npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ONE | val);
+ dma_addr += tce_size;
+ }
+ break;
+ }
+ /*
+ * If there are too many TCEs, do not bother with the loop above and
+ * simply flush everything; it is going to be a lot faster.
+ */
+ /* Fall through */
+ case OPAL_PCI_TCE_KILL_PE:
+ /*
+ * NPU2 doesn't support killing a PE so fall through
+ * and do a kill all instead.
+ */
+ case OPAL_PCI_TCE_KILL_ALL:
+ npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ALL);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
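+
+/*
+ * Behaviour sketch of the kill path above: fewer than 128 pages are
+ * invalidated one TCE at a time via NPU2_ATS_TCE_KILL_ONE, anything
+ * larger (and the PE and ALL variants) falls through to a single
+ * NPU2_ATS_TCE_KILL_ALL write.
+ */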
+
+static const struct phb_ops npu_ops = {
+ .cfg_read8 = npu2_cfg_read8,
+ .cfg_read16 = npu2_cfg_read16,
+ .cfg_read32 = npu2_cfg_read32,
+ .cfg_write8 = npu2_cfg_write8,
+ .cfg_write16 = npu2_cfg_write16,
+ .cfg_write32 = npu2_cfg_write32,
+ .device_init = NULL,
+ .phb_final_fixup = npu2_phb_final_fixup,
+ .ioda_reset = npu2_ioda_reset,
+ .papr_errinjct_reset = NULL,
+ .pci_reinit = NULL,
+ .set_phb_mem_window = NULL,
+ .phb_mmio_enable = NULL,
+ .map_pe_mmio_window = NULL,
+ .map_pe_dma_window = npu2_map_pe_dma_window,
+ .map_pe_dma_window_real = npu2_map_pe_dma_window_real,
+ .pci_msi_eoi = NULL,
+ .set_xive_pe = NULL,
+ .get_msi_32 = NULL,
+ .get_msi_64 = NULL,
+ .set_pe = npu2_set_pe,
+ .set_peltv = NULL,
+ .eeh_freeze_status = npu2_freeze_status,
+ .eeh_freeze_clear = NULL,
+ .eeh_freeze_set = NULL,
+ .next_error = npu2_eeh_next_error,
+ .err_inject = NULL,
+ .get_diag_data2 = NULL,
+ .set_capi_mode = NULL,
+ .set_capp_recovery = NULL,
+ .tce_kill = npu2_tce_kill,
+};
+
+static void assign_mmio_bars(uint64_t gcid, uint32_t scom, uint64_t reg[2], uint64_t mm_win[2])
+{
+ uint32_t i;
+ struct npu2_bar *bar;
+ struct npu2_bar npu2_bars[] = {
+ /* NPU_REGS must be first in this list */
+ { .type = NPU_REGS, .index = 0,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_PHY_BAR),
+ .flags = NPU2_BAR_FLAG_ENABLED },
+ { .type = NPU_PHY, .index = 0,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_PHY_BAR),
+ .flags = NPU2_BAR_FLAG_ENABLED },
+ { .type = NPU_PHY, .index = 1,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_PHY_BAR),
+ .flags = NPU2_BAR_FLAG_ENABLED },
+ { .type = NPU_NTL, .index = 0,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL0_BAR) },
+ { .type = NPU_NTL, .index = 1,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL1_BAR) },
+ { .type = NPU_NTL, .index = 2,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL0_BAR) },
+ { .type = NPU_NTL, .index = 3,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL1_BAR) },
+ { .type = NPU_NTL, .index = 4,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL0_BAR) },
+ { .type = NPU_NTL, .index = 5,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL1_BAR) },
+ { .type = NPU_GENID, .index = 0,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_GENID_BAR) },
+ { .type = NPU_GENID, .index = 1,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_GENID_BAR) },
+ { .type = NPU_GENID, .index = 2,
+ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_GENID_BAR) },
+ };
+
+ for (i = 0; i < ARRAY_SIZE(npu2_bars); i++) {
+ bar = &npu2_bars[i];
+ npu2_get_bar(gcid, bar);
+ npu2_write_bar(NULL, bar, gcid, scom);
+ }
+
+ /* Global MMIO BAR */
+ reg[0] = npu2_bars[0].base;
+ reg[1] = npu2_bars[0].size;
+
+ /* NTL and GENID BARs are exposed to the kernel via the MM
+ * window */
+ mm_win[0] = npu2_bars[3].base;
+ mm_win[1] = npu2_bars[ARRAY_SIZE(npu2_bars) - 1].base +
+ npu2_bars[ARRAY_SIZE(npu2_bars) - 1].size -
+ mm_win[0];
+}
+
+/*
+ * Set up the NPU for NVLink and create the PCI root device node
+ * accordingly.
+ */
+int npu2_nvlink_init_npu(struct npu2 *npu)
+{
+ struct dt_node *np;
+ uint64_t reg[2], mm_win[2], val, mask;
+
+ /* TODO: Clean this up with register names, etc. when we get
+ * time. This just turns NVLink mode on in each brick and should
+ * get replaced with a patch from ajd once we've worked out how
+ * things are going to work there.
+ *
+ * Obviously if the year is now 2020 that didn't happen and you
+ * should fix this :-) */
+
+ val = PPC_BIT(58);
+ mask = PPC_BIT(58) | /* CONFIG_NVLINK_MODE */
+ PPC_BIT(40); /* CONFIG_ENABLE_SNARF_CPM */
+
+ /*
+ * V100 GPUs are known to violate the NVLink2 protocol if some GPU memory
+ * mapped by a CPU was also "linear-block" mapped by a GPU. When this
+ * happens, it breaks the NPU2 cache coherency state machine and
+ * throws a machine checkstop. Disabling snarfing fixes this, so let's
+ * disable it by default.
+ */
+ if (nvram_query_eq_dangerous("opal-npu2-snarf-cpm", "enable")) {
+ prlog(PR_WARNING, "NPU2#%d: enabling Probe.I.MO snarfing, a bad GPU driver may crash the system!\n",
+ npu->index);
+ val |= PPC_BIT(40); /* CONFIG_ENABLE_SNARF_CPM */
+ }
+
+ xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM0_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM1_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM2_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM3_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM0_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM1_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM2_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM3_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM0_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM1_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM2_MISC_CONFIG0,
+ val, mask);
+ xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM3_MISC_CONFIG0,
+ val, mask);
+
+ xscom_write_mask(npu->chip_id, 0x50110c0, PPC_BIT(53), PPC_BIT(53));
+ xscom_write_mask(npu->chip_id, 0x50112c0, PPC_BIT(53), PPC_BIT(53));
+ xscom_write_mask(npu->chip_id, 0x50114c0, PPC_BIT(53), PPC_BIT(53));
+ xscom_write_mask(npu->chip_id, 0x50110f1, PPC_BIT(41), PPC_BIT(41));
+ xscom_write_mask(npu->chip_id, 0x50112f1, PPC_BIT(41), PPC_BIT(41));
+ xscom_write_mask(npu->chip_id, 0x50114f1, PPC_BIT(41), PPC_BIT(41));
+
+ val = NPU2_NTL_MISC_CFG2_BRICK_ENABLE |
+ NPU2_NTL_MISC_CFG2_NDL_TX_PARITY_ENA |
+ NPU2_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA |
+ NPU2_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA;
+ xscom_write_mask(npu->chip_id, 0x5011110, val, val);
+ xscom_write_mask(npu->chip_id, 0x5011130, val, val);
+ xscom_write_mask(npu->chip_id, 0x5011310, val, val);
+ xscom_write_mask(npu->chip_id, 0x5011330, val, val);
+ xscom_write_mask(npu->chip_id, 0x5011510, val, val);
+ xscom_write_mask(npu->chip_id, 0x5011530, val, val);
+
+ val = PPC_BIT(6) | PPC_BIT(7) | PPC_BIT(11);
+ xscom_write_mask(npu->chip_id, 0x5011009, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011039, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011069, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011099, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011209, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011239, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011269, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011299, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011409, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011439, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011469, val, PPC_BITMASK(6,11));
+ xscom_write_mask(npu->chip_id, 0x5011499, val, PPC_BITMASK(6,11));
+
+ /* Reassign the BARs */
+ assign_mmio_bars(npu->chip_id, npu->xscom_base, reg, mm_win);
+ npu->regs = (void *)reg[0];
+ npu->mm_base = mm_win[0];
+ npu->mm_size = mm_win[1];
+
+ if (reg[0] && reg[1])
+ prlog(PR_INFO, " Global MMIO BAR: %016llx (%lldMB)\n",
+ reg[0], reg[1] >> 20);
+ else
+ prlog(PR_ERR, " Global MMIO BAR: Disabled\n");
+
+ /* Populate PCI root device node */
+ np = dt_new_addr(dt_root, "pciex", reg[0]);
+ assert(np);
+ dt_add_property_strings(np,
+ "compatible",
+ "ibm,power9-npu-pciex",
+ "ibm,ioda2-npu2-phb");
+ dt_add_property_strings(np, "device_type", "pciex");
+ dt_add_property(np, "reg", reg, sizeof(reg));
+ dt_add_property_cells(np, "ibm,phb-index", npu2_get_phb_index(0));
+ dt_add_property_cells(np, "ibm,npu-index", npu->index);
+ dt_add_property_cells(np, "ibm,chip-id", npu->chip_id);
+ dt_add_property_cells(np, "ibm,xscom-base", npu->xscom_base);
+ dt_add_property_cells(np, "ibm,npcq", npu->dt_node->phandle);
+ dt_add_property_cells(np, "ibm,links", npu->total_devices);
+ dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win));
+ dt_add_property_cells(np, "ibm,phb-diag-data-size", 0);
+
+ /* Disable fast reboot - not currently supported */
+ disable_fast_reboot("NVLink device enabled");
+
+ npu2_nvlink_create_phb(npu, np);
+
+ return 0;
+}
+
+static uint32_t npu2_populate_pcie_cap(struct npu2_dev *dev,
+ uint32_t start,
+ uint32_t prev_cap)
+{
+ struct pci_virt_device *pvd = dev->nvlink.pvd;
+ uint32_t val;
+
+ /* Add capability list */
+ PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP);
+
+ /* 0x00 - ID/PCIE capability */
+ val = PCI_CFG_CAP_ID_EXP;
+ val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20));
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val);
+
+ /* 0x04 - Device capability
+ *
+ * We should support FLR. Otherwise, it might have
+ * problems passing through to userland via the Linux
+ * VFIO infrastructure.
+ */
+ val = ((PCIE_MPSS_128) |
+ (PCIE_PHANTOM_NONE << 3) |
+ (PCIE_L0SL_MAX_NO_LIMIT << 6) |
+ (PCIE_L1L_MAX_NO_LIMIT << 9) |
+ (PCICAP_EXP_DEVCAP_FUNC_RESET));
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val);
+
+ pci_virt_add_filter(pvd, start + PCICAP_EXP_DEVCTL, 2,
+ PCI_REG_FLAG_WRITE,
+ npu2_dev_cfg_exp_devcap, NULL);
+
+ /* 0x08 - Device control and status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810,
+ 0xffff0000, 0x000f0000);
+
+ /* 0x0c - Link capability */
+ val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4));
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val);
+
+ /* 0x10 - Link control and status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000,
+ 0xfffff000, 0xc0000000);
+
+ /* 0x14 - Slot capability */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
+
+ /* 0x18 - Slot control and status */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
+
+ /* 0x1c - Root control and capability */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000,
+ 0xffffffe0, 0x00000000);
+
+ /* 0x20 - Root status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000,
+ 0xffffffff, 0x00010000);
+
+ /* 0x24 - Device capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCIECAP_EXP_DCAP2, 4, 0x00000000);
+
+ /* 0x28 - Device Control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000,
+ 0xffff0000, 0x00000000);
+
+ /* 0x2c - Link capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007);
+
+ /* 0x30 - Link control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003,
+ 0xffff0000, 0x00200000);
+
+ /* 0x34 - Slot capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000);
+
+ /* 0x38 - Slot control and status 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000);
+
+ return start + PCICAP_EXP_SCTL2 + 8;
+}
+
+static uint32_t npu2_populate_vendor_cap(struct npu2_dev *dev,
+ uint32_t start,
+ uint32_t prev_cap)
+{
+ struct pci_virt_device *pvd = dev->nvlink.pvd;
+
+ /* Capability list */
+ PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR);
+
+ /* Length and version */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, VENDOR_CAP_LEN);
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, VENDOR_CAP_VERSION);
+
+ /*
+ * Defaults when the trap can't handle the read/write (e.g. due
+ * to reading/writing less than 4 bytes).
+ */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0);
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0);
+
+ /* Add NVLink2 PHY procedures trap */
+ pci_virt_add_filter(pvd, start + 4, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu2_dev_procedure,
+ NULL);
+
+ /* Link index */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, dev->link_index);
+
+ return start + VENDOR_CAP_LEN;
+}
+
+static void npu2_populate_cfg(struct npu2_dev *dev)
+{
+ struct pci_virt_device *pvd = dev->nvlink.pvd;
+ struct npu2_pcie_bar *bar;
+ uint32_t pos;
+
+ /* 0x00 - Vendor/Device ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
+
+ /* 0x04 - Command/Status */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
+ 0xf9000000);
+
+ pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
+ npu2_cfg_write_cmd, NULL);
+
+ /* 0x08 - Rev/Class/Cache */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800101);
+
+ /* 0x0c - CLS/Latency Timer/Header/BIST */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
+
+ /* 0x10/14 - BAR#0, NTL BAR */
+ bar = &dev->bars[0];
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4,
+ (bar->npu2_bar.base & 0xfffffff0) | (bar->flags & 0xF),
+ 0x0000000f, 0x00000000);
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, (bar->npu2_bar.base >> 32),
+ 0x00000000, 0x00000000);
+ pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu2_dev_cfg_bar, bar);
+
+ /* 0x18/1c - BAR#1, GENID BAR */
+ bar = &dev->bars[1];
+ if (NPU2DEV_BRICK(dev) == 0)
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, (bar->npu2_bar.base & 0xfffffff0) |
+ (bar->flags & 0xF),
+ 0x0000000f, 0x00000000);
+ else
+ /* Brick 1 gets the upper portion of the generation id register */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, ((bar->npu2_bar.base + 0x10000) & 0xfffffff0) |
+ (bar->flags & 0xF),
+ 0x0000000f, 0x00000000);
+
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, (bar->npu2_bar.base >> 32), 0x00000000,
+ 0x00000000);
+ pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu2_dev_cfg_bar, bar);
+
+ /* 0x20/0x24 - BARs, disabled */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
+
+ /* 0x28 - Cardbus CIS pointer */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
+
+ /* 0x2c - Subsystem ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
+
+ /* 0x30 - ROM BAR, zero sized */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
+
+ /* 0x34 - PCI Capability */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
+
+ /* 0x38 - Reserved */
+ PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
+
+ /* 0x3c - INT line/pin/Minimal grant/Maximal latency */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); /* INT A */
+
+ /* PCIE and vendor specific capability */
+ pos = npu2_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP);
+ pos = npu2_populate_vendor_cap(dev, pos, 0x41);
+ PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0);
+}
+
+static uint32_t npu_allocate_bdfn(struct npu2 *p, uint32_t group)
+{
+ int i;
+ int bdfn = (group << 3);
+
+ for (i = 0; i < p->total_devices; i++) {
+ if ((p->devices[i].bdfn & 0xf8) == (bdfn & 0xf8))
+ bdfn++;
+ }
+
+ return bdfn;
+}
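+
+/*
+ * Illustrative note (not part of the original patch): the group ID forms
+ * the PCI device number (bdfn bits 7:3) and each link already allocated
+ * in that group bumps the function number (bits 2:0). For example, with
+ * two devices already in group 2, the next call returns
+ * (2 << 3) + 2 = 0x12, i.e. device 2, function 2.
+ */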
+
+static void npu2_populate_devices(struct npu2 *p,
+ struct dt_node *dn)
+{
+ struct npu2_dev *dev;
+ struct dt_node *npu2_dn, *link;
+ uint32_t npu_phandle, index = 0;
+ int stack;
+
+ /*
+ * Get the NPU node whose links we expand here into PCI-like
+ * devices attached to our emulated PHB.
+ */
+ npu_phandle = dt_prop_get_u32(dn, "ibm,npcq");
+ npu2_dn = dt_find_by_phandle(dt_root, npu_phandle);
+ assert(npu2_dn);
+
+ /* Walk the link@x nodes to initialize devices */
+ p->total_devices = 0;
+ p->phb_nvlink.scan_map = 0;
+ dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") {
+ uint32_t group_id;
+ struct npu2_bar *npu2_bar;
+
+ dev = &p->devices[index];
+ dev->type = NPU2_DEV_TYPE_NVLINK;
+ dev->npu = p;
+ dev->dt_node = link;
+ dev->link_index = dt_prop_get_u32(link, "ibm,npu-link-index");
+ dev->brick_index = dev->link_index;
+
+ group_id = dt_prop_get_u32(link, "ibm,npu-group-id");
+ dev->bdfn = npu_allocate_bdfn(p, group_id);
+
+ /* This must be done after calling
+ * npu_allocate_bdfn() */
+ p->total_devices++;
+ p->phb_nvlink.scan_map |= 0x1 << ((dev->bdfn & 0xf8) >> 3);
+
+ dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy");
+ dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
+
+ /* Populate BARs. BAR0/1 is the NTL bar. */
+ stack = NPU2_STACK_STCK_0 + NPU2DEV_STACK(dev);
+ npu2_bar = &dev->bars[0].npu2_bar;
+ npu2_bar->type = NPU_NTL;
+ npu2_bar->index = dev->brick_index;
+ npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2DEV_BRICK(dev) == 0 ?
+ NPU2_NTL0_BAR : NPU2_NTL1_BAR);
+ npu2_get_bar(p->chip_id, npu2_bar);
+
+ dev->bars[0].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;
+
+ /* BAR2/3 is the GENID bar. */
+ npu2_bar = &dev->bars[1].npu2_bar;
+ npu2_bar->type = NPU_GENID;
+ npu2_bar->index = NPU2DEV_STACK(dev);
+ npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2_GENID_BAR);
+ npu2_get_bar(p->chip_id, npu2_bar);
+
+ /* The GENID is a single physical BAR that we split
+ * for each emulated device */
+ npu2_bar->size = 0x10000;
+ if (NPU2DEV_BRICK(dev))
+ npu2_bar->base += 0x10000;
+ dev->bars[1].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;
+
+ /* Initialize PCI virtual device */
+ dev->nvlink.pvd = pci_virt_add_device(&p->phb_nvlink, dev->bdfn, 0x100, dev);
+ if (dev->nvlink.pvd)
+ npu2_populate_cfg(dev);
+
+ index++;
+ }
+}
+
+static void npu2_add_interrupt_map(struct npu2 *p,
+ struct dt_node *dn)
+{
+ struct dt_node *npu2_dn, *link, *phb_dn;
+ uint32_t npu2_phandle, index = 0, i;
+ uint32_t icsp = get_ics_phandle();
+ uint32_t *map;
+ size_t map_size;
+ uint32_t mask[] = {0xff00, 0x0, 0x0, 0x7};
+
+ assert(p->phb_nvlink.dt_node);
+ phb_dn = p->phb_nvlink.dt_node;
+
+ npu2_phandle = dt_prop_get_u32(dn, "ibm,npcq");
+ npu2_dn = dt_find_by_phandle(dt_root, npu2_phandle);
+ assert(npu2_dn);
+ map_size = 7 * sizeof(*map) * p->total_devices;
+ map = malloc(map_size);
+ index = 0;
+ dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") {
+ i = index * 7;
+ map[i + 0] = (p->devices[index].bdfn << 8);
+ map[i + 1] = 0;
+ map[i + 2] = 0;
+
+ map[i + 3] = 1; /* INT A */
+ map[i + 4] = icsp; /* interrupt-parent */
+ map[i + 5] = p->base_lsi + (index * 2) + 1; /* NDL No-Stall Event */
+ map[i + 6] = 0; /* 0 = EDGE, 1 = LEVEL. */
+ index++;
+ }
+ dt_add_property(phb_dn, "interrupt-map", map, map_size);
+ free(map);
+ dt_add_property(phb_dn, "interrupt-map-mask", mask, sizeof(mask));
+}
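+
+/*
+ * Illustrative note (not part of the original patch): each interrupt-map
+ * entry built above is 7 cells wide, following the standard OF encoding
+ * given the #address-cells = 3 and #interrupt-cells = 1 values set in
+ * npu2_add_phb_properties(): 3 cells of child unit address
+ * (bdfn << 8, 0, 0), 1 cell of child interrupt specifier (1 = INT A),
+ * the interrupt-parent phandle, and 2 cells of parent specifier
+ * (LSI number, 0 = edge trigger).
+ */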
+
+static void npu2_add_phb_properties(struct npu2 *p)
+{
+ struct dt_node *np = p->phb_nvlink.dt_node;
+ uint32_t icsp = get_ics_phandle();
+ uint64_t mm_base, mm_size;
+
+ /*
+ * Add various properties that HB doesn't have to
+ * add, some of them simply because they result from
+ * policy decisions made in skiboot rather than in HB
+ * such as the MMIO windows going to PCI, interrupts,
+ * etc.
+ */
+ dt_add_property_cells(np, "#address-cells", 3);
+ dt_add_property_cells(np, "#size-cells", 2);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+ dt_add_property_cells(np, "bus-range", 0, 0xff);
+ dt_add_property_cells(np, "clock-frequency", 0x200, 0);
+ dt_add_property_cells(np, "interrupt-parent", icsp);
+
+ /* NPU2 PHB properties */
+ dt_add_property_cells(np, "ibm,opal-num-pes",
+ NPU2_MAX_PE_NUM);
+ dt_add_property_cells(np, "ibm,opal-reserved-pe",
+ NPU2_RESERVED_PE_NUM);
+ dt_add_property_cells(np, "ibm,supported-tce-sizes",
+ 12, // 4K
+ 16, // 64K
+ 24, // 16M
+ 28); // 256M
+
+ dt_add_property_u64s(np, "ibm,mmio-atsd",
+ MMIO_ATSD_ADDR(p->regs, 0),
+ MMIO_ATSD_ADDR(p->regs, 1),
+ MMIO_ATSD_ADDR(p->regs, 2),
+ MMIO_ATSD_ADDR(p->regs, 3),
+ MMIO_ATSD_ADDR(p->regs, 4),
+ MMIO_ATSD_ADDR(p->regs, 5),
+ MMIO_ATSD_ADDR(p->regs, 6),
+ MMIO_ATSD_ADDR(p->regs, 7));
+
+ /*
+ * The memory window is exposed as a 64-bit non-prefetchable
+ * window because the kernel treats 64-bit prefetchable windows
+ * specially.
+ */
+ mm_base = p->mm_base;
+ mm_size = p->mm_size;
+ dt_add_property_cells(np, "ranges", 0x02000000,
+ hi32(mm_base), lo32(mm_base),
+ hi32(mm_base), lo32(mm_base),
+ hi32(mm_size), lo32(mm_size));
+}
+
+void npu2_nvlink_create_phb(struct npu2 *npu, struct dt_node *dn)
+{
+ struct pci_slot *slot;
+
+ /* Generic PHB */
+ npu->phb_nvlink.dt_node = dn;
+ npu->phb_nvlink.ops = &npu_ops;
+ npu->phb_nvlink.phb_type = phb_type_npu_v2;
+ init_lock(&npu->lock);
+ init_lock(&npu->phb_nvlink.lock);
+ list_head_init(&npu->phb_nvlink.devices);
+ list_head_init(&npu->phb_nvlink.virt_devices);
+
+ npu2_populate_devices(npu, dn);
+ npu2_add_interrupt_map(npu, dn);
+ npu2_add_phb_properties(npu);
+
+ slot = npu2_slot_create(&npu->phb_nvlink);
+ if (!slot)
+ {
+ /**
+ * @fwts-label NPUCannotCreatePHBSlot
+ * @fwts-advice Firmware probably ran out of memory creating
+ * NPU2 slot. NVLink functionality could be broken.
+ */
+ prlog(PR_ERR, "NPU: Cannot create PHB slot\n");
+ }
+
+ pci_register_phb(&npu->phb_nvlink, OPAL_DYNAMIC_PHB_ID);
+
+ npu2_init_ioda_cache(npu);
+ npu2_hw_init(npu);
+}
+
+/*
+ * Search a table for an entry with matching value under mask. Returns
+ * the index and the current value in *value.
+ */
+static int npu_table_search(struct npu2 *p, uint64_t table_addr, int stride,
+ int table_size, uint64_t *value, uint64_t mask)
+{
+ int i;
+ uint64_t val;
+
+ assert(value);
+
+ for (i = 0; i < table_size; i++) {
+ val = npu2_read(p, table_addr + i*stride);
+ if ((val & mask) == *value) {
+ *value = val;
+ return i;
+ }
+ }
+
+ return -1;
+}
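+
+/*
+ * Illustrative note (not part of the original patch): passing a mask of
+ * -1UL with *value set to 0 turns this into a "find a free slot" search,
+ * since only an all-zero entry can match. npu2_map_lpar() below relies
+ * on exactly that to pick an unused XTS_BDF_MAP entry.
+ */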
+
+/*
+ * Allocate a context ID and initialise the tables with the relevant
+ * information. Returns the ID on success or an error if one
+ * couldn't be allocated.
+ */
+#define NPU2_VALID_ATS_MSR_BITS (MSR_DR | MSR_HV | MSR_PR | MSR_SF)
+int64_t npu2_init_context(struct phb *phb, uint64_t msr, uint64_t bdf)
+{
+ struct npu2 *p;
+ uint64_t xts_bdf, old_xts_bdf_pid, xts_bdf_pid;
+ int id;
+
+ /*
+ * MSR bits should be masked by the caller to allow for future
+ * expansion if required.
+ */
+ if (msr & ~NPU2_VALID_ATS_MSR_BITS)
+ return OPAL_UNSUPPORTED;
+
+ /*
+ * Need to get LPARSHORT.
+ */
+ p = phb_to_npu2_nvlink(phb);
+ lock(&p->lock);
+ xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, bdf);
+ if (npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
+ &xts_bdf, NPU2_XTS_BDF_MAP_BDF) < 0) {
+ NPU2ERR(p, "LPARID not associated with any GPU\n");
+ id = OPAL_PARAMETER;
+ goto out;
+ }
+
+ id = GETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf);
+ NPU2DBG(p, "Found LPARSHORT = 0x%x for BDF = 0x%03llx\n", id, bdf);
+
+ /* Enable this mapping for both real and virtual addresses */
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATRGPA0, 0UL, 1);
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATRGPA1, xts_bdf_pid, 1);
+
+ /* Enables TLBIE/MMIOSD forwarding for this entry */
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATSD, xts_bdf_pid, 1);
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_LPARSHORT, xts_bdf_pid, id);
+
+ /* Set the relevant MSR bits */
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_DR, xts_bdf_pid,
+ !!(msr & MSR_DR));
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_HV, xts_bdf_pid,
+ !!(msr & MSR_HV));
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_PR, xts_bdf_pid,
+ !!(msr & MSR_PR));
+
+ /* We don't support anything other than 64-bit so we can safely hardcode
+ * it here */
+ xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_SF, xts_bdf_pid, 1);
+
+ /*
+ * Throw an error if the wildcard entry for this bdf is already set
+ * with different msr bits.
+ */
+ old_xts_bdf_pid = npu2_read(p, NPU2_XTS_PID_MAP + id*0x20);
+ if (old_xts_bdf_pid) {
+ if (GETFIELD(NPU2_XTS_PID_MAP_MSR, old_xts_bdf_pid) !=
+ GETFIELD(NPU2_XTS_PID_MAP_MSR, xts_bdf_pid)) {
+ NPU2ERR(p, "%s: Unexpected MSR value\n", __func__);
+ id = OPAL_PARAMETER;
+ goto out;
+ } else if (!p->ctx_ref[id]) {
+ NPU2ERR(p, "%s: Unexpected mapping\n", __func__);
+ id = OPAL_INTERNAL_ERROR;
+ goto out;
+ }
+ }
+
+ /* Write the entry */
+ if (!p->ctx_ref[id]) {
+ NPU2DBG(p, "XTS_PID_MAP[%03d] = 0x%08llx\n", id, xts_bdf_pid);
+ npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, xts_bdf_pid);
+
+ if (!GETFIELD(NPU2_XTS_BDF_MAP_VALID, xts_bdf)) {
+ xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_VALID, xts_bdf, 1);
+ npu2_write(p, NPU2_XTS_BDF_MAP + id*8, xts_bdf);
+ }
+ }
+ ++p->ctx_ref[id];
+
+out:
+ unlock(&p->lock);
+ return id;
+}
+
+int64_t npu2_destroy_context(struct phb *phb, uint64_t bdf)
+{
+ struct npu2 *p;
+ uint64_t xts_bdf;
+ int rc = OPAL_PARAMETER, id;
+
+ p = phb_to_npu2_nvlink(phb);
+ lock(&p->lock);
+
+ /* Need to find lparshort for this bdf */
+ xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, bdf);
+ if (npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
+ &xts_bdf, NPU2_XTS_BDF_MAP_BDF) < 0) {
+ NPU2ERR(p, "LPARID not associated with any GPU\n");
+ } else {
+ /*
+ * The bdf/pid table contains wildcard entries and MSR bits
+ * which we need to clear when switching a device from
+ * a host to a guest or vice versa.
+ */
+ id = GETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf);
+ if (p->ctx_ref[id]) {
+ --p->ctx_ref[id];
+ if (!p->ctx_ref[id]) {
+ NPU2DBG(p, "XTS_PID_MAP[%03d] = 0 (destroy)\n",
+ id);
+ npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, 0);
+ }
+ rc = OPAL_SUCCESS;
+ }
+ }
+ unlock(&p->lock);
+ return rc;
+}
+
+/*
+ * Map the given virtual bdf to lparid with given lpcr.
+ */
+int64_t npu2_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
+ uint64_t lpcr)
+{
+ struct npu2 *p;
+ struct npu2_dev *ndev = NULL;
+ uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
+ int i;
+ int id;
+ static uint64_t atsd_lpar_regs[] = {
+ NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
+ NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
+ NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
+ NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID
+ };
+
+ if (lpcr)
+ /* The LPCR bits are only required for hash based ATS,
+ * which we don't currently support but may need to in
+ * future. */
+ return OPAL_UNSUPPORTED;
+
+ p = phb_to_npu2_nvlink(phb);
+ lock(&p->lock);
+
+ /* Find any existing entries and update them */
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0L, bdf);
+ id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
+ &xts_bdf_lpar, NPU2_XTS_BDF_MAP_BDF);
+ if (id < 0) {
+ /* No existing mapping found, find space for a new one */
+ xts_bdf_lpar = 0;
+ id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
+ &xts_bdf_lpar, -1UL);
+ }
+
+ if (id < 0) {
+ /* Unable to find a free mapping */
+ NPU2ERR(p, "No free XTS_BDF[] entry\n");
+ rc = OPAL_RESOURCE;
+ goto out;
+ }
+
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_UNFILT, 0UL, 1);
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BDF, xts_bdf_lpar, bdf);
+
+ /* We only support radix for the moment */
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_XLAT, xts_bdf_lpar, 0x3);
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid);
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
+
+ /* Need to find an NVLink to send the ATSDs for this device over */
+ for (i = 0; i < p->total_devices; i++) {
+ if (p->devices[i].nvlink.gpu_bdfn == bdf) {
+ ndev = &p->devices[i];
+ break;
+ }
+ }
+
+ if (!ndev) {
+ NPU2ERR(p, "Unable to find nvlink for bdf %llx\n", bdf);
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ /*
+ * We need to allocate an ATSD per NVLink bridge if possible;
+ * use the ibm,npu-link-index property for that.
+ */
+ atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
+ if (!lparid)
+ atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
+
+ if (ndev->link_index < ARRAY_SIZE(atsd_lpar_regs))
+ npu2_write(p, atsd_lpar_regs[ndev->link_index], atsd_lpar);
+ else
+ NPU2ERR(p, "Unable to assign ATSD for link index %u\n",
+ ndev->link_index);
+
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_STACK, xts_bdf_lpar,
+ 0x4 >> (ndev->brick_index / 2));
+ xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BRICK, xts_bdf_lpar,
+ (ndev->brick_index % 2));
+
+ NPU2DBG(p, "XTS_BDF_MAP[%03d] = 0x%08llx\n", id, xts_bdf_lpar);
+ npu2_write(p, NPU2_XTS_BDF_MAP + id*8, xts_bdf_lpar);
+
+ /* Reset wildcard in the PID map and the refcounter */
+ if (npu2_read(p, NPU2_XTS_PID_MAP + id*0x20) || p->ctx_ref[id]) {
+ prlog(PR_INFO, "Resetting PID MAP for LPID %lld\n", lparid);
+ p->ctx_ref[id] = 0;
+ npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, 0);
+ }
+
+out:
+ unlock(&p->lock);
+ return rc;
+}
+
+static inline uint32_t npu2_relaxed_ordering_source_grpchp(uint32_t gcid)
+{
+ if (gcid & ~0x1b)
+ return OPAL_PARAMETER;
+
+ /* Repack 0bGGGGCCC to 0bGGCC */
+ return ((gcid & 0x18) >> 1) | (gcid & 0x3);
+}
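+
+/*
+ * Illustrative note (not part of the original patch): a worked example of
+ * the repack above. For gcid 0b01010 (group 0b01, chip 0b10), the check
+ * against ~0x1b passes, (gcid & 0x18) >> 1 gives 0b0100 and (gcid & 0x3)
+ * gives 0b10, so the packed GRPCHP value is 0b0110.
+ */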
+
+static uint64_t npu2_relaxed_ordering_cfg_read(struct npu2_dev *ndev, int n)
+{
+ uint64_t reg = NPU2_SM_REG_OFFSET(ndev, 0, NPU2_RELAXED_ORDERING_CFG(n));
+
+ return npu2_read(ndev->npu, reg);
+}
+
+static void npu2_relaxed_ordering_cfg_write(struct npu2_dev *ndev, int n,
+ uint64_t val)
+{
+ uint64_t reg;
+ int sm;
+
+ /* Set every register on our stack */
+ for (sm = NPU2_BLOCK_SM_0; sm <= NPU2_BLOCK_SM_3; sm++) {
+ reg = NPU2_SM_REG_OFFSET(ndev, sm, NPU2_RELAXED_ORDERING_CFG(n));
+ npu2_write(ndev->npu, reg, val);
+ }
+}
+
+/*
+ * Parse the value of a relaxed ordering config register. Returns SOURCE0 or
+ * SOURCE1 register mask if relaxed ordering is set for the given chip/pec.
+ * Returns 0 if unset.
+ */
+static uint64_t npu2_relaxed_ordering_cfg_enabled(uint64_t val, uint32_t gcid,
+ int pec)
+{
+ uint32_t src, grpchp;
+ uint64_t mask;
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ mask = NPU2_RELAXED_ORDERING_SOURCE(i);
+ src = GETFIELD(mask, val);
+
+ if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, src))
+ continue;
+
+ if (GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src) != pec)
+ continue;
+
+ grpchp = GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src);
+ if (grpchp == npu2_relaxed_ordering_source_grpchp(gcid))
+ return mask;
+
+ if (grpchp == 0xf) /* match all */
+ return mask;
+ }
+
+ return 0;
+}
+
+static int npu2_enable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
+ int pec)
+{
+ uint64_t val, mask;
+ uint32_t src;
+ int rc = OPAL_RESOURCE;
+ int i;
+
+ NPU2DEVINF(ndev, "Enabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
+ lock(&ndev->npu->lock);
+
+ for (i = 0; i < 2; i++) {
+ val = npu2_relaxed_ordering_cfg_read(ndev, i);
+ if (!npu2_relaxed_ordering_cfg_enabled(val, gcid, pec))
+ continue;
+
+ /* Already enabled */
+ rc = OPAL_SUCCESS;
+ goto out;
+ }
+
+ src = NPU2_RELAXED_ORDERING_SOURCE_WRENA |
+ NPU2_RELAXED_ORDERING_SOURCE_RDENA;
+ src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src, pec);
+ src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src,
+ npu2_relaxed_ordering_source_grpchp(gcid));
+ src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMIN, src, 0);
+ src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMAX, src, 23);
+ src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMIN, src, 0);
+ src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMAX, src, 47);
+
+ /* Find somewhere to write this config */
+ for (i = 0; i < 2; i++) {
+ val = npu2_relaxed_ordering_cfg_read(ndev, i);
+
+ if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA << 32, val))
+ mask = NPU2_RELAXED_ORDERING_SOURCE(0);
+ else if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, val))
+ mask = NPU2_RELAXED_ORDERING_SOURCE(1);
+ else
+ continue;
+
+ val = SETFIELD(mask, val, src);
+ npu2_relaxed_ordering_cfg_write(ndev, i, val);
+
+ rc = OPAL_SUCCESS;
+ break;
+ }
+
+out:
+ unlock(&ndev->npu->lock);
+ return rc;
+}
+
+static void npu2_disable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
+ int pec)
+{
+ uint64_t val, mask;
+ int i;
+
+ NPU2DEVINF(ndev, "Disabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
+ lock(&ndev->npu->lock);
+
+ for (i = 0; i < 2; i++) {
+ val = npu2_relaxed_ordering_cfg_read(ndev, i);
+
+ mask = npu2_relaxed_ordering_cfg_enabled(val, gcid, pec);
+ if (!mask)
+ continue;
+
+ val = SETFIELD(mask, val, 0);
+ npu2_relaxed_ordering_cfg_write(ndev, i, val);
+ }
+
+ unlock(&ndev->npu->lock);
+}
+
+/*
+ * Enable or disable relaxed ordering on all nvlinks for a given PEC. May leave
+ * relaxed ordering partially enabled if there are insufficient HW resources to
+ * enable it on all links.
+ */
+int64_t npu2_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
+ bool enable)
+{
+ struct npu2 *npu = phb_to_npu2_nvlink(phb);
+ struct npu2_dev *ndev;
+ int64_t rc = OPAL_SUCCESS;
+
+ for (int i = 0; i < npu->total_devices; i++) {
+ ndev = &npu->devices[i];
+ if (enable)
+ rc = npu2_enable_relaxed_ordering(ndev, gcid, pec);
+ else
+ npu2_disable_relaxed_ordering(ndev, gcid, pec);
+
+ if (rc != OPAL_SUCCESS) {
+ NPU2DEVINF(ndev, "Insufficient resources to activate relaxed ordering mode\n");
+ return OPAL_RESOURCE;
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
diff --git a/roms/skiboot/hw/npu3-hw-procedures.c b/roms/skiboot/hw/npu3-hw-procedures.c
new file mode 100644
index 000000000..098e6e467
--- /dev/null
+++ b/roms/skiboot/hw/npu3-hw-procedures.c
@@ -0,0 +1,792 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <npu3.h>
+#include <npu3-regs.h>
+#include <timebase.h>
+#include <xscom.h>
+#include <xscom-p9-regs.h>
+
+#define NPU3DEVLOG(l, dev, fmt, a...) \
+ prlog(l, "NPU[%d:%d:%d]: " fmt, \
+ (dev)->npu->chip_id, \
+ (dev)->npu->index, \
+ (dev)->index, ##a)
+#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
+#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
+#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
+
+/*
+ * The documentation for the PHY training is written in terms of bits within an
+ * actual register so we use that representation here.
+ */
+struct npu3_phy_reg {
+ uint64_t offset;
+ uint64_t mask;
+};
+
+static struct npu3_phy_reg
+NPU3_PHY_RX_RUN_LANE = { 0x0c8, PPC_BIT(48) },
+NPU3_PHY_RX_IORESET = { 0x096, PPC_BIT(63) },
+NPU3_PHY_TX_IORESET = { 0x113, PPC_BIT(48) },
+NPU3_PHY_RX_PR_RESET = { 0x096, PPC_BIT(62) },
+NPU3_PHY_RX_LANE_ANA_PDWN = { 0x002, PPC_BIT(54) },
+NPU3_PHY_RX_LANE_DIG_PDWN = { 0x088, PPC_BIT(48) },
+NPU3_PHY_RX_PR_PHASE_STEP = { 0x08a, PPC_BITMASK(60, 63) },
+NPU3_PHY_TX_LANE_PDWN = { 0x101, PPC_BIT(48) },
+NPU3_PHY_RX_RUN_DCCAL = { 0x0c8, PPC_BIT(49) },
+NPU3_PHY_RX_DCCAL_DONE = { 0x0ca, PPC_BIT(49) },
+NPU3_PHY_RX_LANE_BUSY = { 0x0ca, PPC_BIT(50) },
+NPU3_PHY_RX_B_BANK_CONTROLS = { 0x002, PPC_BITMASK(58, 63) },
+NPU3_PHY_TX_UNLOAD_CLK_DISABLE = { 0x103, PPC_BIT(56) },
+NPU3_PHY_TX_FIFO_INIT = { 0x105, PPC_BIT(53) },
+NPU3_PHY_TX_RXCAL = { 0x103, PPC_BIT(57) },
+NPU3_PHY_RX_INIT_DONE = { 0x0ca, PPC_BIT(48) },
+NPU3_PHY_RX_PR_EDGE_TRACK_CNTL = { 0x092, PPC_BITMASK(48, 49) },
+NPU3_PHY_RX_PR_FW_OFF = { 0x08a, PPC_BIT(56) },
+NPU3_PHY_RX_PR_FW_INERTIA_AMT = { 0x08a, PPC_BITMASK(57, 59) },
+NPU3_PHY_RX_CFG_LTE_MC = { 0x000, PPC_BITMASK(60, 63) },
+NPU3_PHY_RX_A_INTEG_COARSE_GAIN = { 0x00a, PPC_BITMASK(48, 51) },
+NPU3_PHY_RX_B_INTEG_COARSE_GAIN = { 0x026, PPC_BITMASK(48, 51) },
+NPU3_PHY_RX_E_INTEG_COARSE_GAIN = { 0x030, PPC_BITMASK(48, 51) },
+
+/* These registers are per-PHY, not per lane */
+NPU3_PHY_TX_ZCAL_SWO_EN = { 0x3c9, PPC_BIT(48) },
+NPU3_PHY_TX_ZCAL_REQ = { 0x3c1, PPC_BIT(49) },
+NPU3_PHY_TX_ZCAL_DONE = { 0x3c1, PPC_BIT(50) },
+NPU3_PHY_TX_ZCAL_ERROR = { 0x3c1, PPC_BIT(51) },
+NPU3_PHY_TX_ZCAL_N = { 0x3c3, PPC_BITMASK(48, 56) },
+NPU3_PHY_TX_ZCAL_P = { 0x3c5, PPC_BITMASK(48, 56) },
+NPU3_PHY_TX_PSEG_PRE_EN = { 0x34d, PPC_BITMASK(51, 55) },
+NPU3_PHY_TX_PSEG_PRE_SELECT = { 0x34d, PPC_BITMASK(56, 60) },
+NPU3_PHY_TX_NSEG_PRE_EN = { 0x34f, PPC_BITMASK(51, 55) },
+NPU3_PHY_TX_NSEG_PRE_SELECT = { 0x34f, PPC_BITMASK(56, 60) },
+NPU3_PHY_TX_PSEG_POST_EN = { 0x361, PPC_BITMASK(49, 55) },
+NPU3_PHY_TX_PSEG_POST_SELECT = { 0x361, PPC_BITMASK(56, 62) },
+NPU3_PHY_TX_NSEG_POST_EN = { 0x363, PPC_BITMASK(49, 55) },
+NPU3_PHY_TX_NSEG_POST_SELECT = { 0x363, PPC_BITMASK(56, 62) },
+NPU3_PHY_TX_PSEG_MARGINPU_EN = { 0x351, PPC_BITMASK(48, 55) },
+NPU3_PHY_TX_NSEG_MARGINPU_EN = { 0x353, PPC_BITMASK(48, 55) },
+NPU3_PHY_TX_PSEG_MARGINPD_EN = { 0x351, PPC_BITMASK(56, 63) },
+NPU3_PHY_TX_NSEG_MARGINPD_EN = { 0x353, PPC_BITMASK(56, 63) },
+NPU3_PHY_TX_MARGINPU_SELECT = { 0x355, PPC_BITMASK(48, 55) },
+NPU3_PHY_TX_MARGINPD_SELECT = { 0x355, PPC_BITMASK(56, 63) },
+NPU3_PHY_TX_PSEG_MAIN_EN = { 0x357, PPC_BITMASK(51, 57) },
+NPU3_PHY_TX_NSEG_MAIN_EN = { 0x359, PPC_BITMASK(51, 57) },
+NPU3_PHY_RX_CLKDIST_PDWN = { 0x204, PPC_BITMASK(48, 50) },
+NPU3_PHY_RX_IREF_PDWN = { 0x230, PPC_BIT(54) },
+NPU3_PHY_TX_CLKDIST_PDWN = { 0x305, PPC_BITMASK(48, 50) },
+NPU3_PHY_RX_CTL_DATASM_CLKDIST_PDWN = { 0x2e0, PPC_BIT(60) };
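+
+/*
+ * Illustrative note (not part of the original patch): each entry above
+ * pairs an indirect SCOM offset with the PPC bitmask of the field inside
+ * it. npu3_phy_scom() below treats offsets below 0x200 as per-lane
+ * registers and offsets at or above 0x200 as per-PHY registers.
+ */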
+
+static uint64_t npu3_phy_scom(struct npu3_dev *dev, struct npu3_phy_reg *reg,
+ int lane)
+{
+ uint64_t scom;
+
+ /* Don't specify a lane for a non-per-lane register */
+ if (lane >= 0)
+ assert(reg->offset < 0x200);
+ else
+ assert(reg->offset >= 0x200);
+
+ scom = OB_INDIRECT(dev->ob_chiplet);
+ scom = SETFIELD(PPC_BITMASK(12, 21), scom, reg->offset);
+
+ if (lane > 0)
+ scom = SETFIELD(PPC_BITMASK(27, 31), scom, lane);
+
+ return scom;
+}
+
+static void npu3_phy_write_lane(struct npu3_dev *dev, struct npu3_phy_reg *reg,
+ int lane, uint64_t val)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t scom, scom_val;
+
+ scom = npu3_phy_scom(dev, reg, lane);
+
+ xscom_read(npu->chip_id, scom, &scom_val);
+ scom_val = SETFIELD(reg->mask, scom_val, val);
+ xscom_write(npu->chip_id, scom, scom_val);
+}
+
+static uint64_t npu3_phy_read_lane(struct npu3_dev *dev,
+ struct npu3_phy_reg *reg,
+ int lane)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t scom, scom_val;
+
+ scom = npu3_phy_scom(dev, reg, lane);
+ xscom_read(npu->chip_id, scom, &scom_val);
+
+ return GETFIELD(reg->mask, scom_val);
+}
+
+static inline void npu3_phy_write(struct npu3_dev *dev,
+ struct npu3_phy_reg *reg,
+ uint64_t val)
+{
+ npu3_phy_write_lane(dev, reg, -1, val);
+}
+
+static inline uint64_t npu3_phy_read(struct npu3_dev *dev,
+ struct npu3_phy_reg *reg)
+{
+ return npu3_phy_read_lane(dev, reg, -1);
+}
+
+struct procedure {
+ const char *name;
+ uint32_t (*steps[])(struct npu3_dev *);
+};
+
+#define DEFINE_PROCEDURE(NAME, STEPS...) \
+static struct procedure procedure_##NAME = { \
+ .name = #NAME, \
+ .steps = { NAME, ##STEPS } \
+}
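+
+/*
+ * Illustrative note (not part of the original patch): for instance,
+ * DEFINE_PROCEDURE(stop) expands to
+ *
+ *   static struct procedure procedure_stop = {
+ *           .name = "stop",
+ *           .steps = { stop },
+ *   };
+ *
+ * and npu3_dev_procedure_run() below keeps calling the steps in order
+ * while they return NPU3_PROC_NEXT.
+ */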
+
+static uint32_t stop(struct npu3_dev *npu_dev __unused)
+{
+ return NPU3_PROC_COMPLETE | NPU3_PROC_ABORTED;
+}
+
+DEFINE_PROCEDURE(stop);
+
+static uint32_t nop(struct npu3_dev *npu_dev __unused)
+{
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(nop);
+
+static void set_iovalid(struct npu3_dev *dev, bool raise)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t reg, val;
+
+ reg = OB_CPLT_CONF1(dev->ob_chiplet);
+
+ xscom_read(npu->chip_id, reg, &val);
+ val = SETFIELD(OB_CPLT_CONF1_NV_IOVALID(dev->index), val, raise);
+ xscom_write(npu->chip_id, reg, val);
+}
+
+#define NPU3_PHY_LANES 24
+
+#define npu3_for_each_lane(lane, dev) \
+ for (lane = 0; lane < NPU3_PHY_LANES; lane++) \
+ if (dev->phy_lane_mask & PPC_BIT32(lane)) \
+
+static uint32_t phy_reset(struct npu3_dev *dev)
+{
+ uint32_t lane;
+
+ set_iovalid(dev, false);
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_LANE, lane, 0);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t phy_reset_wait(struct npu3_dev *dev)
+{
+ int lane;
+
+ /* Wait for all lanes to become inactive */
+ npu3_for_each_lane(lane, dev)
+ if (npu3_phy_read_lane(dev, &NPU3_PHY_RX_LANE_BUSY, lane))
+ return NPU3_PROC_INPROGRESS;
+
+ npu3_for_each_lane(lane, dev) {
+ /* Set lane in reset */
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_IORESET, lane, 1);
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_IORESET, lane, 1);
+
+ /* Release lane from reset */
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_IORESET, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_IORESET, lane, 0);
+
+ /* Reset the phase rotator */
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_RESET, lane, 1);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_RESET, lane, 0);
+ }
+
+ return NPU3_PROC_NEXT;
+}
+
+/* Procedure 1.2.3 - Initialise I/O PHY Registers */
+static uint32_t phy_reset_complete(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev) {
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_ANA_PDWN, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_DIG_PDWN, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_PHASE_STEP, lane, 0xc);
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_LANE_PDWN, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_INERTIA_AMT, lane, 4);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_CFG_LTE_MC, lane, 3);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_A_INTEG_COARSE_GAIN, lane, 11);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_B_INTEG_COARSE_GAIN, lane, 11);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_E_INTEG_COARSE_GAIN, lane, 11);
+ }
+
+ set_iovalid(dev, true);
+
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete);
+
+/* Procedure 1.2.6 - I/O PHY Tx Impedance Calibration */
+static uint32_t phy_tx_zcal(struct npu3_dev *dev)
+{
+ if (dev->npu->tx_zcal_complete)
+ return NPU3_PROC_COMPLETE;
+
+ /* Turn off SW enable and enable zcal state machine */
+ npu3_phy_write(dev, &NPU3_PHY_TX_ZCAL_SWO_EN, 0);
+
+ /* Start impedance calibration state machine */
+ npu3_phy_write(dev, &NPU3_PHY_TX_ZCAL_REQ, 1);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t phy_tx_zcal_wait(struct npu3_dev *dev)
+{
+ if (npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_ERROR))
+ return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
+
+ if (!npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_DONE))
+ return NPU3_PROC_INPROGRESS;
+
+ return NPU3_PROC_NEXT;
+}
+
+#define MARGIN_RATIO 0
+#define FFE_PRE_COEFF 0
+#define FFE_POST_COEFF 0
+
+#define PRE_WIDTH 5
+#define POST_WIDTH 7
+#define MAIN_WIDTH 7
+#define ZCAL_MIN (16 * 2)
+#define ZCAL_MAX (33 * 2)
+#define PRECURSOR_X2_MAX (4 * 2 + 1)
+#define POSTCURSOR_X2_MAX (6 * 2 + 1)
+#define MARGIN_X2_MAX (8 * 2)
+#define MAIN_X2_MAX (6 * 2 + 1)
+#define TOTAL_X2_MAX (PRECURSOR_X2_MAX + POSTCURSOR_X2_MAX + \
+ 2 * MARGIN_X2_MAX + MAIN_X2_MAX)
+
+static uint32_t therm(uint32_t dec)
+{
+ return (0x1 << dec) - 1;
+}
+
+static uint32_t therm_with_half(uint32_t dec, uint8_t width)
+{
+ /* If the LSB of the 2r equivalent is on, then we need to set the 2r bit (MSB) */
+ uint32_t half_on = (dec & 0x1) << (width - 1);
+
+ /* Shift the 2r equivalent to a 1r value and convert to a thermometer code. */
+ uint32_t x1_equiv = ((1 << (dec >> 1)) - 1);
+
+ /* Combine 1r equivalent thermometer code + the 2r MSB value. */
+ return half_on | x1_equiv;
+}
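+
+/*
+ * Minimal sketch (not part of the original patch; the helper name is
+ * hypothetical): worked values for the encoders above. therm(3) yields
+ * 0b111; therm_with_half(5, 5) sets the half-strength MSB (0b10000) plus
+ * a 1R thermometer code of 0b00011, giving 0b10011.
+ */
+static void __unused npu3_therm_example(void)
+{
+ assert(therm(3) == 0x7);
+ assert(therm_with_half(5, 5) == 0x13);
+}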
+
+static uint32_t phy_tx_zcal_calculate(struct npu3_dev *dev)
+{
+ int p_value, n_value;
+ uint32_t zcal_n;
+ uint32_t zcal_p;
+ uint32_t p_main_enable = MAIN_X2_MAX;
+ uint32_t p_margin_pu_enable = MARGIN_X2_MAX;
+ uint32_t p_margin_pd_enable = MARGIN_X2_MAX;
+ uint32_t p_precursor_select;
+ uint32_t p_postcursor_select;
+ uint32_t margin_pu_select;
+ uint32_t n_main_enable = MAIN_X2_MAX;
+ uint32_t n_margin_pu_enable = MARGIN_X2_MAX;
+ uint32_t n_margin_pd_enable = MARGIN_X2_MAX;
+ uint32_t n_precursor_select;
+ uint32_t n_postcursor_select;
+ uint32_t margin_pd_select;
+ uint32_t margin_select;
+
+ /* Convert the value from 8R to 2R by / 4 */
+ zcal_n = npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_N) / 4;
+ zcal_p = npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_P) / 4;
+
+ /*
+ * If the hardware detects an unexpected condition, it's better
+ * just to fail loudly.
+ */
+ if (zcal_n < ZCAL_MIN || zcal_n > ZCAL_MAX ||
+ zcal_p < ZCAL_MIN || zcal_p > ZCAL_MAX)
+ return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
+
+ p_value = zcal_p - TOTAL_X2_MAX;
+ p_precursor_select = p_value * FFE_PRE_COEFF / 128;
+ p_postcursor_select = p_value * FFE_POST_COEFF / 128;
+ margin_pu_select = p_value * MARGIN_RATIO / 256;
+
+ if (p_value % 2) {
+ p_main_enable--;
+ p_value++;
+ }
+
+ while (p_value < 0) {
+ if (p_main_enable > 1) {
+ p_main_enable -= 2;
+ } else if (p_margin_pu_enable + p_margin_pd_enable > 0) {
+ if (p_margin_pu_enable == p_margin_pd_enable)
+ p_margin_pd_enable -= 2;
+ else
+ p_margin_pu_enable -= 2;
+ }
+ p_value += 2;
+ }
+
+ n_value = zcal_n - TOTAL_X2_MAX;
+ n_precursor_select = n_value * FFE_PRE_COEFF / 128;
+ n_postcursor_select = n_value * FFE_POST_COEFF / 128;
+ margin_pd_select = p_value * MARGIN_RATIO / 256;
+
+ if (n_value % 2) {
+ n_main_enable--;
+ n_value++;
+ }
+
+ while (n_value < 0) {
+ if (n_main_enable > 1) {
+ n_main_enable -= 2;
+ } else if (n_margin_pu_enable + n_margin_pd_enable > 0) {
+ if (n_margin_pu_enable == n_margin_pd_enable)
+ n_margin_pd_enable -= 2;
+ else
+ n_margin_pu_enable -= 2;
+ }
+ n_value += 2;
+ }
+
+ margin_select = therm((margin_pu_select + 1) / 2) &
+ therm((margin_pd_select + 1) / 2) &
+ therm((p_margin_pu_enable + 1) / 2) &
+ therm((p_margin_pd_enable + 1) / 2) &
+ therm((n_margin_pu_enable + 1) / 2) &
+ therm((n_margin_pd_enable + 1) / 2);
+
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_PRE_EN, therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_PRE_SELECT, therm_with_half(p_precursor_select, PRE_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_POST_EN, therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_POST_SELECT, therm_with_half(p_postcursor_select, POST_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MARGINPU_EN, therm((p_margin_pu_enable + 1) / 2));
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MARGINPD_EN, therm((p_margin_pd_enable + 1) / 2));
+ npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MAIN_EN, therm_with_half(p_main_enable, MAIN_WIDTH));
+
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_PRE_EN, therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_PRE_SELECT, therm_with_half(n_precursor_select, PRE_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_POST_EN, therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_POST_SELECT, therm_with_half(n_postcursor_select, POST_WIDTH));
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MARGINPU_EN, therm((n_margin_pu_enable + 1) / 2));
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MARGINPD_EN, therm((n_margin_pd_enable + 1) / 2));
+ npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MAIN_EN, therm_with_half(n_main_enable, MAIN_WIDTH));
+
+ npu3_phy_write(dev, &NPU3_PHY_TX_MARGINPU_SELECT, therm(margin_select + 1) / 2);
+ npu3_phy_write(dev, &NPU3_PHY_TX_MARGINPD_SELECT, therm(margin_select + 1) / 2);
+
+ dev->npu->tx_zcal_complete = true;
+
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate);
+
+/* Procedure 1.2.4 - I/O PHY DC Calibration */
+static uint32_t phy_rx_dccal(struct npu3_dev *dev)
+{
+ int lane;
+
+ set_iovalid(dev, false);
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_OFF, lane, 1);
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_DCCAL, lane, 1);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t phy_rx_dccal_complete(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev)
+ if (!npu3_phy_read_lane(dev, &NPU3_PHY_RX_DCCAL_DONE, lane))
+ return NPU3_PROC_INPROGRESS;
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_DCCAL, lane, 0);
+
+ npu3_for_each_lane(lane, dev) {
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_B_BANK_CONTROLS, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_EDGE_TRACK_CNTL, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_OFF, lane, 0);
+ }
+
+ return NPU3_PROC_NEXT;
+}
+
+/* Procedure 1.2.5 - IO PHY Tx FIFO Init */
+static uint32_t phy_tx_fifo_init(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev) {
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_UNLOAD_CLK_DISABLE, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_FIFO_INIT, lane, 1);
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_UNLOAD_CLK_DISABLE, lane, 1);
+ }
+
+ set_iovalid(dev, true);
+
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_complete, phy_tx_fifo_init);
+
+/* Procedure 1.2.8 - Enable Downstream Link Training */
+static uint32_t phy_enable_tx_rxcal(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_RXCAL, lane, 1);
+
+ return NPU3_PROC_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_enable_tx_rxcal);
+
+/* Procedure 1.2.9 - Disable Downstream Link Training */
+static uint32_t phy_disable_tx_rxcal(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_RXCAL, lane, 0);
+
+ return NPU3_PROC_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_disable_tx_rxcal);
+
+/* Procedure 1.2.7 - I/O PHY Upstream Link Training */
+static uint32_t phy_rx_training(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev)
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_LANE, lane, 1);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t phy_rx_training_wait(struct npu3_dev *dev)
+{
+ int lane;
+
+ npu3_for_each_lane(lane, dev)
+ if (!npu3_phy_read_lane(dev, &NPU3_PHY_RX_INIT_DONE, lane))
+ return NPU3_PROC_INPROGRESS;
+
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(phy_rx_training, phy_rx_training_wait);
+
+static void npu3_dev_fence_set(struct npu3_dev *dev, uint8_t state)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t val;
+
+ val = npu3_read(npu, NPU3_NTL_MISC_CFG1(dev->index));
+ val = SETFIELD(NPU3_NTL_MISC_CFG1_NTL_RESET, val, state);
+ npu3_write(npu, NPU3_NTL_MISC_CFG1(dev->index), val);
+}
+
+static uint8_t npu3_dev_fence_get(struct npu3_dev *dev)
+{
+ uint64_t val;
+
+ val = npu3_read(dev->npu, NPU3_NTL_CQ_FENCE_STATUS(dev->index));
+ return GETFIELD(NPU3_NTL_CQ_FENCE_STATUS_FIELD, val);
+}
+
+/* Procedure 1.2.1 - Reset NPU/NDL */
+static uint32_t reset_ntl(struct npu3_dev *dev)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t val;
+ int lane;
+
+ set_iovalid(dev, true);
+
+ /* Power on clocks */
+ npu3_phy_write(dev, &NPU3_PHY_RX_CLKDIST_PDWN, 0);
+ npu3_phy_write(dev, &NPU3_PHY_RX_IREF_PDWN, 1);
+ npu3_phy_write(dev, &NPU3_PHY_TX_CLKDIST_PDWN, 0);
+ npu3_phy_write(dev, &NPU3_PHY_RX_CTL_DATASM_CLKDIST_PDWN, 0);
+
+ npu3_for_each_lane(lane, dev) {
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_ANA_PDWN, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_DIG_PDWN, lane, 0);
+ npu3_phy_write_lane(dev, &NPU3_PHY_TX_LANE_PDWN, lane, 0);
+ }
+
+ /* Write PRI */
+ val = SETFIELD(NPU3_NTL_PRI_CFG_NDL, 0ull, dev->index);
+ npu3_write(npu, NPU3_NTL_PRI_CFG(dev->index), val);
+
+ /* Disable parity checking */
+ val = npu3_read(npu, NPU3_NTL_MISC_CFG2(dev->index));
+ val &= ~(NPU3_NTL_MISC_CFG2_NDL_RX_PARITY_ENA |
+ NPU3_NTL_MISC_CFG2_NDL_TX_PARITY_ENA |
+ NPU3_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA);
+ npu3_write(npu, NPU3_NTL_MISC_CFG2(dev->index), val);
+
+ if (dev->type == NPU3_DEV_TYPE_NVLINK)
+ npu3_pvd_flag_clear(dev, NPU3_DEV_DL_RESET);
+
+ npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_FULL);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t reset_ndl(struct npu3_dev *dev)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t reg;
+ uint32_t val32;
+
+ if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_FULL)
+ return NPU3_PROC_INPROGRESS;
+
+ reg = NPU3_DLPL_CTL(dev->index);
+ val32 = npu3_read_4b(npu, reg);
+ val32 |= NPU3_DLPL_CTL_RESET_RX | NPU3_DLPL_CTL_RESET_MISC;
+ npu3_write_4b(npu, reg, val32);
+
+ val32 = npu3_read_4b(npu, reg);
+ val32 &= ~(NPU3_DLPL_CTL_RESET_RX | NPU3_DLPL_CTL_RESET_MISC);
+ npu3_write_4b(npu, reg, val32);
+
+ reg = NPU3_DLPL_CFG(dev->index);
+ val32 = NPU3_DLPL_CFG_PRI_BYTESWAP;
+ npu3_write_4b(npu, reg, val32);
+
+ /* Clear FIR bits */
+ for (uint32_t i = 0; i < NPU3_FIR_MAX; i++)
+ xscom_write(npu->chip_id, npu->xscom_base + NPU3_FIR(i), 0ull);
+
+ npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_HALF);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t reset_ntl_release(struct npu3_dev *dev)
+{
+ struct npu3 *npu = dev->npu;
+ uint32_t i = dev->index;
+
+ if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_HALF)
+ return NPU3_PROC_INPROGRESS;
+
+ /* Credit setup */
+ npu3_write(npu, NPU3_NTL_CREQ_HDR_CRED_SND(i), 0x0200000000000000);
+ npu3_write(npu, NPU3_NTL_PRB_HDR_CRED_SND(i), 0x0200000000000000);
+ npu3_write(npu, NPU3_NTL_ATR_HDR_CRED_SND(i), 0x0200000000000000);
+ npu3_write(npu, NPU3_NTL_RSP_HDR_CRED_SND(i), 0x0200000000000000);
+ npu3_write(npu, NPU3_NTL_CREQ_DAT_CRED_SND(i), 0x1000000000000000);
+ npu3_write(npu, NPU3_NTL_RSP_DAT_CRED_SND(i), 0x1000000000000000);
+
+ npu3_write(npu, NPU3_NTL_CREQ_HDR_CRED_RCV(i), 0x0000be0000000000);
+ npu3_write(npu, NPU3_NTL_DGD_HDR_CRED_RCV(i), 0x0000640000000000);
+ npu3_write(npu, NPU3_NTL_ATSD_HDR_CRED_RCV(i), 0x0000200000000000);
+ npu3_write(npu, NPU3_NTL_RSP_HDR_CRED_RCV(i), 0x0000be0000000000);
+ npu3_write(npu, NPU3_NTL_CREQ_DAT_CRED_RCV(i), 0x0001000000000000);
+ npu3_write(npu, NPU3_NTL_RSP_DAT_CRED_RCV(i), 0x0001000000000000);
+
+ npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_NONE);
+
+ return NPU3_PROC_NEXT;
+}
+
+static uint32_t reset_ntl_finish(struct npu3_dev *dev) {
+ struct npu3 *npu = dev->npu;
+ uint64_t val;
+
+ if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_NONE)
+ return NPU3_PROC_INPROGRESS;
+
+ /* Enable parity checking */
+ val = npu3_read(npu, NPU3_NTL_MISC_CFG2(dev->index));
+ val |= NPU3_NTL_MISC_CFG2_NDL_RX_PARITY_ENA |
+ NPU3_NTL_MISC_CFG2_NDL_TX_PARITY_ENA |
+ NPU3_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA;
+ npu3_write(npu, NPU3_NTL_MISC_CFG2(dev->index), val);
+
+ if (dev->type == NPU3_DEV_TYPE_NVLINK)
+ npu3_pvd_flag_set(dev, NPU3_DEV_DL_RESET);
+
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(reset_ntl, reset_ndl, reset_ntl_release, reset_ntl_finish);
+
+static int npu3_dev_regcmp(struct npu3_dev *dev, uint64_t reg,
+ const char *reg_name, uint64_t expected)
+{
+ uint64_t val;
+
+ val = npu3_read(dev->npu, reg);
+ if (val == expected)
+ return 0;
+
+ NPU3DEVERR(dev, "%s: expected 0x%llx, read 0x%llx\n",
+ reg_name, expected, val);
+
+ return 1;
+}
+
+#define REGCMP(reg, expected) \
+ npu3_dev_regcmp(dev, reg(dev->index), #reg, expected)
+
+static uint32_t check_credits(struct npu3_dev *dev)
+{
+ /* Use bitwise OR to prevent short-circuit evaluation */
+ if (REGCMP(NPU3_NTL_CREQ_HDR_CRED_RCV, 0x0be0be0000000000ull) |
+ REGCMP(NPU3_NTL_DGD_HDR_CRED_RCV, 0x0640640000000000ull) |
+ REGCMP(NPU3_NTL_ATSD_HDR_CRED_RCV, 0x0200200000000000ull) |
+ REGCMP(NPU3_NTL_RSP_HDR_CRED_RCV, 0x0be0be0000000000ull) |
+ REGCMP(NPU3_NTL_CREQ_DAT_CRED_RCV, 0x1001000000000000ull) |
+ REGCMP(NPU3_NTL_RSP_DAT_CRED_RCV, 0x1001000000000000ull))
+ return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
+
+ return NPU3_PROC_COMPLETE;
+}
+
+DEFINE_PROCEDURE(check_credits);
+
+static struct procedure *procedures[] = {
+ [0] = &procedure_stop,
+ [1] = &procedure_nop,
+ [4] = &procedure_phy_reset,
+ [5] = &procedure_phy_tx_zcal,
+ [6] = &procedure_phy_rx_dccal,
+ [7] = &procedure_phy_enable_tx_rxcal,
+ [8] = &procedure_phy_disable_tx_rxcal,
+ [9] = &procedure_phy_rx_training,
+ [10] = &procedure_reset_ntl,
+ [11] = &procedure_nop, /* Placeholder for pre-terminate */
+ [12] = &procedure_nop, /* Placeholder for terminate */
+ [13] = &procedure_check_credits,
+};
+
+void npu3_dev_procedure_init(struct npu3_dev *dev, uint32_t pnum)
+{
+ struct npu3_procedure *proc = &dev->proc;
+ const char *name;
+
+ if (pnum >= ARRAY_SIZE(procedures) || !procedures[pnum]) {
+ NPU3DEVERR(dev, "Unsupported procedure number %d\n", pnum);
+ proc->status = NPU3_PROC_COMPLETE | NPU3_PROC_UNSUPPORTED;
+ return;
+ }
+
+ name = procedures[pnum]->name;
+
+ if (proc->number == pnum && !(proc->status & NPU3_PROC_COMPLETE))
+ NPU3DEVINF(dev, "Restarting procedure %s\n", name);
+ else
+ NPU3DEVINF(dev, "Starting procedure %s\n", name);
+
+ proc->status = NPU3_PROC_INPROGRESS;
+ proc->number = pnum;
+ proc->step = 0;
+ proc->timeout = mftb() + msecs_to_tb(1000);
+}
+
+static uint32_t npu3_dev_procedure_run_step(struct npu3_dev *dev)
+{
+ struct npu3_procedure *proc = &dev->proc;
+ uint32_t result;
+
+ result = procedures[proc->number]->steps[proc->step](dev);
+ if (result & NPU3_PROC_NEXT) {
+ proc->step++;
+
+ NPU3DEVINF(dev, "Running procedure %s step %d\n",
+ procedures[proc->number]->name, proc->step);
+ }
+
+ return result;
+}
+
+static void npu3_dev_procedure_run(struct npu3_dev *dev)
+{
+ struct npu3_procedure *proc = &dev->proc;
+ const char *name;
+ uint32_t result;
+
+ do {
+ result = npu3_dev_procedure_run_step(dev);
+ } while (result & NPU3_PROC_NEXT);
+
+ name = procedures[proc->number]->name;
+
+ if (result & NPU3_PROC_COMPLETE) {
+ NPU3DEVINF(dev, "Procedure %s complete\n", name);
+ } else if (tb_compare(mftb(), proc->timeout) == TB_AAFTERB) {
+ NPU3DEVINF(dev, "Procedure %s timed out\n", name);
+ result = NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
+ }
+
+ /* Mask off internal state bits */
+ proc->status = result & NPU3_PROC_STATUS_MASK;
+}
+
+uint32_t npu3_dev_procedure_status(struct npu3_dev *dev)
+{
+ /* Run the procedure if not already complete */
+ if (!(dev->proc.status & NPU3_PROC_COMPLETE))
+ npu3_dev_procedure_run(dev);
+
+ return dev->proc.status;
+}
+
+int64_t npu3_dev_reset(struct npu3_dev *dev)
+{
+ unsigned long timeout;
+
+ reset_ntl(dev);
+ timeout = mftb() + msecs_to_tb(1000);
+
+ while (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_FULL) {
+ if (tb_compare(mftb(), timeout) == TB_AAFTERB) {
+ NPU3DEVINF(dev, "Device reset timed out\n");
+ return OPAL_BUSY;
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
diff --git a/roms/skiboot/hw/npu3-nvlink.c b/roms/skiboot/hw/npu3-nvlink.c
new file mode 100644
index 000000000..920864b32
--- /dev/null
+++ b/roms/skiboot/hw/npu3-nvlink.c
@@ -0,0 +1,1828 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <device.h>
+#include <phys-map.h>
+#include <npu3.h>
+#include <npu3-regs.h>
+#include <pci-virt.h>
+#include <xscom.h>
+#include <xscom-p9-regs.h>
+#include <interrupts.h>
+#include <pci-cfg.h>
+#include <pci-slot.h>
+#include <cache-p9.h>
+
+#define NPU3LOG(l, npu, fmt, a...) \
+ prlog(l, "NPU#%04x[%d:%d]: " fmt, \
+ (npu)->nvlink.phb.opal_id, \
+ (npu)->chip_id, \
+ (npu)->index, ##a)
+#define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a)
+#define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a)
+#define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a)
+
+#define NPU3DEVLOG(l, dev, fmt, a...) \
+ prlog(l, "NPU#%04x:%02x:%02x.%x " fmt, \
+ (dev)->npu->nvlink.phb.opal_id, \
+ PCI_BUS_NUM((dev)->nvlink.pvd->bdfn), \
+ PCI_DEV((dev)->nvlink.pvd->bdfn), \
+ PCI_FUNC((dev)->nvlink.pvd->bdfn), ##a)
+#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
+#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
+#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
+
+#define NPU3_CFG_READ(size, type) \
+static int64_t npu3_cfg_read##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type *data) \
+{ \
+ uint32_t val; \
+ int64_t ret; \
+ \
+ ret = pci_virt_cfg_read(phb, bdfn, offset, \
+ sizeof(*data), &val); \
+ *data = (type)val; \
+ return ret; \
+}
+
+#define NPU3_CFG_WRITE(size, type) \
+static int64_t npu3_cfg_write##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type data) \
+{ \
+ uint32_t val = data; \
+ int64_t ret; \
+ \
+ ret = pci_virt_cfg_write(phb, bdfn, offset, \
+ sizeof(data), val); \
+ return ret; \
+}
+
+NPU3_CFG_READ(8, u8);
+NPU3_CFG_READ(16, u16);
+NPU3_CFG_READ(32, u32);
+NPU3_CFG_WRITE(8, u8);
+NPU3_CFG_WRITE(16, u16);
+NPU3_CFG_WRITE(32, u32);
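+
+/*
+ * Illustrative note (not part of the original patch): each instantiation
+ * above generates a config accessor of the form
+ *
+ *   static int64_t npu3_cfg_read32(struct phb *phb, uint32_t bdfn,
+ *                                  uint32_t offset, u32 *data);
+ *
+ * which forwards to pci_virt_cfg_read()/pci_virt_cfg_write() with the
+ * access size taken from the data type.
+ */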
+
+static int64_t npu3_eeh_freeze_status(struct phb *phb __unused,
+ uint64_t pe_num __unused,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ /*
+ * FIXME: When it's called by skiboot PCI config accessor,
+ * the PE number is fixed to 0, which is incorrect. We need to
+ * introduce another PHB callback to translate it. For now,
+ * it keeps the skiboot PCI enumeration going.
+ */
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+
+ if (severity)
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ return OPAL_SUCCESS;
+}
+
+/* Number of PEs supported */
+#define NPU3_MAX_PE_NUM 16
+#define NPU3_RESERVED_PE_NUM 15
+
+static int64_t npu3_ioda_reset(struct phb *phb, bool purge __unused)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ uint64_t val;
+
+ val = NPU3_ATS_IODA_ADDR_AUTO_INC;
+ val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, val,
+ NPU3_ATS_IODA_ADDR_TBL_TVT);
+ npu3_write(npu, NPU3_ATS_IODA_ADDR, val);
+
+ for (uint32_t i = 0; i < NPU3_MAX_PE_NUM; i++)
+ npu3_write(npu, NPU3_ATS_IODA_DATA, 0ull);
+
+ return OPAL_SUCCESS;
+}
+
+static inline void npu3_ioda_sel(struct npu3 *npu, uint32_t table,
+ uint32_t index)
+{
+ uint64_t val;
+
+ val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, 0ull, table);
+ val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_ADDR, val, index);
+ npu3_write(npu, NPU3_ATS_IODA_ADDR, val);
+}
+
+static int64_t npu3_map_pe_dma_window(struct phb *phb,
+ uint64_t pe_num,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ uint64_t tts_encoded, val;
+ uint32_t page_size;
+
+ /* Each PE has one corresponding TVE */
+ if (window_id != pe_num || pe_num >= NPU3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
+
+ /* TCE table size zero is used to disable the TVE */
+ if (!tce_table_size) {
+ npu3_write(npu, NPU3_ATS_IODA_DATA, 0ull);
+ return OPAL_SUCCESS;
+ }
+
+ /* TCE table size */
+ if (!is_pow2(tce_table_size) || tce_table_size < 0x1000)
+ return OPAL_PARAMETER;
+
+ tts_encoded = ilog2(tce_table_size) - 11;
+ if (tts_encoded > 39)
+ return OPAL_PARAMETER;
+
+ val = SETFIELD(NPU3_ATS_IODA_TVT_TABLE_SIZE, 0ull, tts_encoded);
+
+ /* Number of levels */
+ if (tce_levels < 1 || tce_levels > 4)
+ return OPAL_PARAMETER;
+
+ val = SETFIELD(NPU3_ATS_IODA_TVT_TABLE_LEVEL, val, tce_levels - 1);
+
+ /* TCE page size */
+ switch (tce_page_size) {
+ case 256 << 20:
+ page_size = 17;
+ break;
+ case 16 << 20:
+ page_size = 13;
+ break;
+ case 64 << 10:
+ page_size = 5;
+ break;
+ default:
+ page_size = 1;
+ }
+
+ val = SETFIELD(NPU3_ATS_IODA_TVT_PAGE_SIZE, val, page_size);
+ val = SETFIELD(NPU3_ATS_IODA_TVT_XLAT_ADDR, val, tce_table_addr >> 12);
+ npu3_write(npu, NPU3_ATS_IODA_DATA, val);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_map_pe_dma_window_real(struct phb *phb,
+ uint64_t pe_num,
+ uint16_t window_id,
+ uint64_t pci_start_addr __unused,
+ uint64_t pci_mem_size __unused)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ uint64_t val;
+
+ /* Each PE has one corresponding TVE */
+ if (window_id != pe_num || pe_num >= NPU3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ if (pci_mem_size) {
+ /*
+ * GPUs need to be able to access the MMIO memory space as well.
+ * On POWER9 this is above the top of RAM, so disable the TVT
+ * range check, allowing access to all memory addresses.
+ */
+ val = 0;
+ } else {
+ /* Disable */
+ val = PPC_BIT(51);
+ }
+
+ npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
+ npu3_write(npu, NPU3_ATS_IODA_DATA, val);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ uint64_t val;
+ uint32_t pe_num;
+
+ if (!first_frozen_pe || !pci_error_type || !severity)
+ return OPAL_PARAMETER;
+
+ *first_frozen_pe = -1;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ for (pe_num = 0; pe_num < NPU3_MAX_PE_NUM; pe_num++) {
+ val = npu3_read(npu, NPU3_MISC_PESTB_DATA(pe_num));
+ if (!GETFIELD(NPU3_MISC_PESTB_DATA_DMA_STOPPED_STATE, val))
+ continue;
+
+ *first_frozen_pe = pe_num;
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+ break;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static struct npu3_dev *npu3_bdfn_to_dev(struct npu3 *npu, uint32_t bdfn)
+{
+ struct pci_virt_device *pvd;
+
+ /* All emulated devices are attached to root bus */
+ if (bdfn & ~0xff)
+ return NULL;
+
+ pvd = pci_virt_find_device(&npu->nvlink.phb, bdfn);
+ if (pvd)
+ return pvd->data;
+
+ return NULL;
+}
+
+static int npu3_match_gpu(struct phb *phb __unused, struct pci_device *pd,
+ void *data)
+{
+ const char *slot = data;
+ struct dt_node *dn;
+ char *loc_code;
+
+ /* Ignore non-NVIDIA devices */
+ if (PCI_VENDOR_ID(pd->vdid) != 0x10de)
+ return 0;
+
+ /* Find the PCI device's slot location */
+ for (dn = pd->dn;
+ dn && !dt_find_property(dn, "ibm,loc-code");
+ dn = dn->parent);
+
+ if (!dn)
+ return 0;
+
+ loc_code = (char *)dt_prop_get(dn, "ibm,loc-code");
+ if (streq(loc_code, slot))
+ return 1;
+
+ return 0;
+}
+
+static void npu3_dev_find_gpu(struct npu3_dev *dev)
+{
+ const char *slot = dev->nvlink.loc_code;
+ struct phb *phb;
+ struct pci_device *gpu;
+
+ if (!slot)
+ return;
+
+ for_each_phb(phb) {
+ gpu = pci_walk_dev(phb, NULL, npu3_match_gpu, (void *)slot);
+ if (!gpu)
+ continue;
+
+ dev->nvlink.gpu = gpu;
+ return;
+ }
+
+ NPU3DEVINF(dev, "No PCI device found for slot '%s'\n", slot);
+}
+
+#define VENDOR_CAP_START 0x80
+#define VENDOR_CAP_LINK_FLAG_OFFSET 0x0d
+
+void npu3_pvd_flag_set(struct npu3_dev *dev, uint8_t flag)
+{
+ uint32_t offset = VENDOR_CAP_START + VENDOR_CAP_LINK_FLAG_OFFSET;
+ uint32_t flags;
+
+ PCI_VIRT_CFG_RDONLY_RD(dev->nvlink.pvd, offset, 1, &flags);
+ flags |= flag;
+ PCI_VIRT_CFG_INIT_RO(dev->nvlink.pvd, offset, 1, flags);
+}
+
+void npu3_pvd_flag_clear(struct npu3_dev *dev, uint8_t flag)
+{
+ uint32_t offset = VENDOR_CAP_START + VENDOR_CAP_LINK_FLAG_OFFSET;
+ uint32_t flags;
+
+ PCI_VIRT_CFG_RDONLY_RD(dev->nvlink.pvd, offset, 1, &flags);
+ flags &= ~flag;
+ PCI_VIRT_CFG_INIT_RO(dev->nvlink.pvd, offset, 1, flags);
+}
+
+static struct lock npu3_phandle_lock = LOCK_UNLOCKED;
+
+static void npu3_append_phandle(struct dt_node *dn, const char *name,
+ uint32_t phandle)
+{
+ struct dt_property *prop;
+ uint32_t *phandles;
+ size_t len;
+
+ prop = __dt_find_property(dn, name);
+ if (!prop) {
+ dt_add_property_cells(dn, name, phandle);
+ return;
+ }
+
+ /*
+ * Make sure no one else has a reference to the property. Assume
+ * this is the only function that holds a reference to it.
+ */
+ lock(&npu3_phandle_lock);
+
+ /* Need to append to the property */
+ len = prop->len + sizeof(*phandles);
+ dt_resize_property(&prop, len);
+
+ phandles = (uint32_t *)prop->prop;
+ phandles[len / sizeof(*phandles) - 1] = phandle;
+
+ unlock(&npu3_phandle_lock);
+}
+
+static void npu3_dev_fixup_dt(struct npu3_dev *dev)
+{
+ struct pci_device *pd = dev->nvlink.pd;
+ struct pci_device *gpu = dev->nvlink.gpu;
+
+ dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dn->phandle);
+ dt_add_property_string(pd->dn, "ibm,loc-code", dev->nvlink.loc_code);
+ if (dev->link_speed != 0xff)
+ dt_add_property_cells(pd->dn, "ibm,nvlink-speed",
+ lo32(dev->link_speed));
+
+ if (!gpu)
+ return;
+
+ npu3_append_phandle(gpu->dn, "ibm,npu", pd->dn->phandle);
+ dt_add_property_cells(pd->dn, "ibm,gpu", gpu->dn->phandle);
+}
+
+static int64_t npu3_gpu_bridge_sec_bus_reset(void *pdev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t len,
+ uint32_t *data, bool write)
+{
+ struct pci_device *pd = pdev;
+ struct pci_device *gpu;
+ struct npu3 *npu;
+ struct npu3_dev *dev;
+ bool purge = false;
+
+ if (!write)
+ return OPAL_PARAMETER;
+
+ if (len != 2 || offset & 1) {
+ PCIERR(pd->phb, pd->bdfn,
+ "Unsupported write to bridge control register\n");
+ return OPAL_PARAMETER;
+ }
+
+ if (!(*data & PCI_CFG_BRCTL_SECONDARY_RESET))
+ return OPAL_PARTIAL;
+
+ gpu = list_top(&pd->children, struct pci_device, link);
+ if (!gpu)
+ return OPAL_PARTIAL;
+
+ npu3_for_each_nvlink_npu(npu)
+ npu3_for_each_nvlink_dev(dev, npu)
+ if (dev->nvlink.gpu == gpu)
+ if (!npu3_dev_reset(dev))
+ purge = true;
+
+ if (purge)
+ purge_l2_l3_caches();
+
+ return OPAL_PARTIAL;
+}
+
+static int npu3_dev_bind(struct phb *phb, struct pci_device *pd,
+ void *data __unused)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ struct npu3_dev *dev = npu3_bdfn_to_dev(npu, pd->bdfn);
+ struct pci_device *gpu;
+
+ dev->nvlink.pd = pd;
+
+ /* The slot label indicates which GPU this link is connected to */
+ dev->nvlink.loc_code = dt_prop_get_def(dev->dn, "ibm,slot-label", NULL);
+ if (!dev->nvlink.loc_code) {
+ /**
+ * @fwts-label NPUNoPHBSlotLabel
+ * @fwts-advice No GPU/NPU slot information was found.
+ * NVLink3 functionality will not work.
+ */
+ NPU3DEVERR(dev, "Cannot find GPU slot information\n");
+ }
+
+ npu3_dev_find_gpu(dev);
+ npu3_dev_fixup_dt(dev);
+
+ gpu = dev->nvlink.gpu;
+ if (!gpu)
+ return 0;
+
+ /* When a GPU is reset, ensure all of its links are reset too */
+ if (gpu->parent && gpu->parent->slot)
+ pci_add_cfg_reg_filter(gpu->parent, PCI_CFG_BRCTL, 2,
+ PCI_REG_FLAG_WRITE,
+ npu3_gpu_bridge_sec_bus_reset);
+
+ npu3_pvd_flag_set(dev, NPU3_DEV_PCI_LINKED);
+
+ return 0;
+}
+
+struct npu3 *npu3_next_nvlink_npu(struct npu3 *npu, uint32_t chip_id)
+{
+ uint64_t phb_id = 0;
+ struct phb *phb;
+
+ if (npu)
+ phb_id = npu->nvlink.phb.opal_id + 1;
+
+ for (; (phb = __pci_next_phb_idx(&phb_id));) {
+ if (phb->phb_type != phb_type_npu_v3)
+ continue;
+
+ npu = npu3_phb_to_npu(phb);
+ if (npu->chip_id == chip_id || chip_id == NPU3_ANY_CHIP)
+ return npu;
+ }
+
+ return NULL;
+}
+
+static struct npu3 *npu3_last_npu(void)
+{
+ static struct npu3 *last = NULL;
+ struct npu3 *npu;
+
+ if (last)
+ return last;
+
+ npu3_for_each_nvlink_npu(npu)
+ last = npu;
+
+ return last;
+}
+
+static uint32_t npu3_gpu_links(struct pci_device *gpu)
+{
+ const struct dt_property *prop;
+
+ if (!gpu)
+ return 0;
+
+ /* The link count is the number of phandles in "ibm,npu" */
+ prop = dt_find_property(gpu->dn, "ibm,npu");
+ if (!prop)
+ return 0;
+
+ return prop->len / sizeof(uint32_t);
+}
+
+static uint32_t npu3_links_per_gpu(void)
+{
+ struct npu3 *npu;
+ struct npu3_dev *dev;
+ uint32_t links = 0;
+
+ /* Use the first GPU we find to figure this out */
+ npu3_for_each_nvlink_npu(npu) {
+ npu3_for_each_nvlink_dev(dev, npu) {
+ links = npu3_gpu_links(dev->nvlink.gpu);
+ if (links)
+ goto out;
+ }
+ }
+
+out:
+ prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, links);
+
+ return links;
+}
+
+int32_t npu3_dev_gpu_index(struct npu3_dev *dev)
+{
+ const char *slot;
+ char *p = NULL;
+ int ret;
+
+ slot = dev->nvlink.loc_code;
+ if (!slot)
+ return -1;
+
+ if (memcmp(slot, "GPU", 3))
+ return -1;
+
+ ret = strtol(slot + 3, &p, 10);
+ if (*p || p == slot + 3)
+ return -1;
+
+ return ret;
+}
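+
+/*
+ * Illustrative note (not part of the original patch): a slot label of
+ * "GPU4" yields 4; labels that don't start with "GPU" or don't end in a
+ * plain decimal number yield -1.
+ */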
+
+static uint32_t npu3_chip_possible_gpu_links(void)
+{
+ struct proc_chip *chip;
+ struct npu3 *npu;
+ struct npu3_dev *dev;
+ uint32_t possible = 0;
+
+ for_each_chip(chip) {
+ npu3_for_each_chip_nvlink_npu(npu, chip->id)
+ npu3_for_each_nvlink_dev(dev, npu)
+ if (npu3_dev_gpu_index(dev) != -1)
+ possible++;
+
+ if (possible)
+ break;
+ }
+
+ prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, possible);
+
+ return possible;
+}
+
+uint32_t npu3_chip_possible_gpus(void)
+{
+ static uint32_t possible = -1;
+ uint32_t links_per_gpu;
+
+ /* Static value, same for all chips; only do this once */
+ if (possible != -1)
+ return possible;
+
+ possible = 0;
+
+ links_per_gpu = npu3_links_per_gpu();
+ if (links_per_gpu)
+ possible = npu3_chip_possible_gpu_links() / links_per_gpu;
+
+ prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, possible);
+
+ return possible;
+}
+
+static void npu3_dev_assign_gmb(struct npu3_dev *dev, uint64_t addr,
+ uint64_t size)
+{
+ uint32_t mode;
+ uint64_t val;
+
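+ /*
+ * Pick a base BAR mode from the number of links feeding this GPU,
+ * then offset it by this link's PCI function number.
+ */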
+ switch (npu3_gpu_links(dev->nvlink.gpu)) {
+ case 0:
+ return;
+ case 1:
+ mode = 0;
+ break;
+ case 2:
+ mode = 1;
+ break;
+ case 3:
+ mode = 3;
+ break;
+ case 4:
+ mode = 6;
+ break;
+ case 6:
+ mode = 10;
+ break;
+ default:
+ /* Hardware does not support this configuration */
+ assert(0);
+ }
+
+ mode += PCI_FUNC(dev->nvlink.pvd->bdfn);
+
+ val = NPU3_GPU_MEM_BAR_ENABLE |
+ NPU3_GPU_MEM_BAR_POISON;
+ val = SETFIELD(NPU3_GPU_MEM_BAR_ADDR, val, addr >> 30);
+ val = SETFIELD(NPU3_GPU_MEM_BAR_SIZE, val, size >> 30);
+ val = SETFIELD(NPU3_GPU_MEM_BAR_MODE, val, mode);
+
+ npu3_write(dev->npu, NPU3_GPU_MEM_BAR(dev->index), val);
+}
+
+static struct dt_node *npu3_create_memory_dn(struct npu3_dev *dev,
+ uint32_t gpu_index, uint64_t addr,
+ uint64_t size)
+{
+ uint32_t nid = 255 - gpu_index;
+ struct dt_node *mem;
+
+ mem = dt_find_by_name_addr(dt_root, "memory", addr);
+ if (mem)
+ return mem;
+
+ mem = dt_new_addr(dt_root, "memory", addr);
+ assert(mem);
+
+ dt_add_property_string(mem, "device_type", "memory");
+ dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory");
+ dt_add_property_u64s(mem, "reg", addr, size);
+ dt_add_property_u64s(mem, "linux,usable-memory", addr, 0);
+ dt_add_property_cells(mem, "ibm,chip-id", nid);
+ dt_add_property_cells(mem, "ibm,associativity", 4, nid, nid, nid, nid);
+
+ NPU3INF(dev->npu, "%s mem: 0x%016llx (nid %d)\n", dev->nvlink.loc_code,
+ addr, nid);
+
+ return mem;
+}
+
+static void npu3_dev_init_gpu_mem(struct npu3_dev *dev)
+{
+ struct pci_device *pd = dev->nvlink.pd;
+ struct npu3 *npu = dev->npu;
+ struct dt_node *mem;
+ uint64_t addr, size, gta;
+ uint32_t gpu_index;
+
+ if (!dev->nvlink.gpu)
+ return;
+
+ gpu_index = npu3_dev_gpu_index(dev) % npu3_chip_possible_gpus();
+ phys_map_get(npu->chip_id, GPU_MEM_4T_DOWN, gpu_index, &addr, &size);
+
+ npu3_dev_assign_gmb(dev, addr, size);
+ mem = npu3_create_memory_dn(dev, gpu_index, addr, size);
+
+ /*
+ * Coral mode address compression. This is documented in Figure 3.5 of
+ * the NPU workbook; "P9->GPU RA Compression (Coral)".
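+ * Bit 42 of the real address stays in place, bits 45:46 are packed
+ * down into bits 43:44, bits 49:50 into bits 45:46, and the low 43
+ * bits pass through unchanged.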
+ */
+ gta = (addr >> 42 & 0x1) << 42;
+ gta |= (addr >> 45 & 0x3) << 43;
+ gta |= (addr >> 49 & 0x3) << 45;
+ gta |= addr & ((1ul << 43) - 1);
+
+ dt_add_property_cells(pd->dn, "memory-region", mem->phandle);
+ dt_add_property_u64s(pd->dn, "ibm,device-tgt-addr", gta);
+}
+
+static void npu3_final_fixup(void)
+{
+ struct npu3 *npu;
+ struct npu3_dev *dev;
+
+ npu3_for_each_nvlink_npu(npu)
+ npu3_for_each_nvlink_dev(dev, npu)
+ npu3_dev_init_gpu_mem(dev);
+}
+
+static void npu3_phb_final_fixup(struct phb *phb)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+
+ pci_walk_dev(phb, NULL, npu3_dev_bind, NULL);
+
+ /*
+ * After every npu's devices are bound, do gpu-related fixup. This
+ * counts on npu3_last_npu() walking the phbs in the same order as
+ * the PHB final fixup loop in __pci_init_slots().
+ */
+ if (npu == npu3_last_npu())
+ npu3_final_fixup();
+}
+
+static int64_t npu3_set_pe(struct phb *phb,
+ uint64_t pe_num,
+ uint64_t bdfn,
+ uint8_t bcompare,
+ uint8_t dcompare,
+ uint8_t fcompare,
+ uint8_t action)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ struct npu3_dev *dev;
+ uint64_t val;
+
+ dev = npu3_bdfn_to_dev(npu, bdfn);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+
+ if (pe_num >= NPU3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ if (bcompare != OpalPciBusAll ||
+ dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
+ fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ return OPAL_UNSUPPORTED;
+
+ if (!dev->nvlink.gpu)
+ return OPAL_SUCCESS;
+
+ val = NPU3_CTL_BDF2PE_CFG_ENABLE;
+ val = SETFIELD(NPU3_CTL_BDF2PE_CFG_PE, val, pe_num);
+ val = SETFIELD(NPU3_CTL_BDF2PE_CFG_BDF, val, dev->nvlink.gpu->bdfn);
+ npu3_write(npu, NPU3_CTL_BDF2PE_CFG(pe_num), val);
+
+ val = NPU3_MISC_BDF2PE_CFG_ENABLE;
+ val = SETFIELD(NPU3_MISC_BDF2PE_CFG_PE, val, pe_num);
+ val = SETFIELD(NPU3_MISC_BDF2PE_CFG_BDF, val, dev->nvlink.gpu->bdfn);
+ npu3_write(npu, NPU3_MISC_BDF2PE_CFG(pe_num), val);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_tce_kill_pages(struct npu3 *npu,
+ uint64_t pe_num,
+ uint32_t tce_size,
+ uint64_t dma_addr,
+ uint32_t npages)
+{
+ uint32_t check_tce_size;
+ uint64_t val;
+
+ if (pe_num >= NPU3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
+ val = npu3_read(npu, NPU3_ATS_IODA_DATA);
+
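+ /* The TVT entry encodes the IO page size as 2KB << PAGE_SIZE field */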
+ check_tce_size = 0x800 << GETFIELD(NPU3_ATS_IODA_TVT_PAGE_SIZE, val);
+ if (check_tce_size != tce_size) {
+ NPU3ERR(npu, "%s: Unexpected TCE size (got 0x%x, expected 0x%x)\n",
+ __func__, tce_size, check_tce_size);
+
+ return OPAL_PARAMETER;
+ }
+
+ val = NPU3_ATS_TCE_KILL_ONE;
+ val = SETFIELD(NPU3_ATS_TCE_KILL_PE_NUMBER, val, pe_num);
+
+ while (npages--) {
+ val = SETFIELD(NPU3_ATS_TCE_KILL_ADDRESS, val, dma_addr >> 12);
+ npu3_write(npu, NPU3_ATS_TCE_KILL, val);
+
+ dma_addr += tce_size;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_tce_kill(struct phb *phb,
+ uint32_t kill_type,
+ uint64_t pe_num,
+ uint32_t tce_size,
+ uint64_t dma_addr,
+ uint32_t npages)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+
+ sync();
+
+ switch(kill_type) {
+ case OPAL_PCI_TCE_KILL_PAGES:
+ return npu3_tce_kill_pages(npu, pe_num, tce_size,
+ dma_addr, npages);
+ case OPAL_PCI_TCE_KILL_PE:
+ /*
+ * NPU doesn't support killing a PE so fall through
+ * and do a kill all instead.
+ */
+ case OPAL_PCI_TCE_KILL_ALL:
+ npu3_write(npu, NPU3_ATS_TCE_KILL, NPU3_ATS_TCE_KILL_ALL);
+ return OPAL_SUCCESS;
+ }
+
+ return OPAL_PARAMETER;
+}
+
+static const struct phb_ops npu_ops = {
+ .cfg_read8 = npu3_cfg_read8,
+ .cfg_read16 = npu3_cfg_read16,
+ .cfg_read32 = npu3_cfg_read32,
+ .cfg_write8 = npu3_cfg_write8,
+ .cfg_write16 = npu3_cfg_write16,
+ .cfg_write32 = npu3_cfg_write32,
+ .eeh_freeze_status = npu3_eeh_freeze_status,
+ .ioda_reset = npu3_ioda_reset,
+ .map_pe_dma_window = npu3_map_pe_dma_window,
+ .map_pe_dma_window_real = npu3_map_pe_dma_window_real,
+ .next_error = npu3_next_error,
+ .phb_final_fixup = npu3_phb_final_fixup,
+ .set_pe = npu3_set_pe,
+ .tce_kill = npu3_tce_kill,
+};
+
+static int64_t npu3_reset(struct pci_slot *slot)
+{
+ struct npu3 *npu = npu3_phb_to_npu(slot->phb);
+ struct npu3_dev *dev;
+ int64_t rc = OPAL_SUCCESS;
+ bool purge = false;
+
+ npu3_for_each_nvlink_dev(dev, npu) {
+ rc = npu3_dev_reset(dev);
+ if (rc)
+ break;
+
+ purge = true;
+ }
+
+ /* No devices reset; don't purge, just return */
+ if (!purge)
+ return rc;
+
+ /* All devices reset */
+ if (!rc)
+ return purge_l2_l3_caches();
+
+ /* Some devices successfully reset; purge, but still return error */
+ purge_l2_l3_caches();
+ return rc;
+}
+
+static int64_t npu3_freset(struct pci_slot *slot __unused)
+{
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_get_link_state(struct pci_slot *slot __unused,
+ uint8_t *val)
+{
+ *val = OPAL_SHPC_LINK_UP_x1;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_get_power_state(struct pci_slot *slot __unused,
+ uint8_t *val)
+{
+ *val = PCI_SLOT_POWER_ON;
+ return OPAL_SUCCESS;
+}
+
+static void npu3_create_phb_slot(struct npu3 *npu)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(&npu->nvlink.phb, NULL);
+ if (!slot)
+ return;
+
+ /* Elementary functions */
+ slot->ops.creset = npu3_reset;
+ slot->ops.freset = npu3_freset;
+ slot->ops.hreset = npu3_reset;
+ slot->ops.get_link_state = npu3_get_link_state;
+ slot->ops.get_power_state = npu3_get_power_state;
+}
+
+static void npu3_create_phb(struct npu3 *npu)
+{
+ struct phb *phb = &npu->nvlink.phb;
+
+ phb->phb_type = phb_type_npu_v3;
+ phb->ops = &npu_ops;
+ phb->dt_node = dt_new_addr(dt_root, "pciex", npu->regs[0]);
+ assert(phb->dt_node);
+
+ list_head_init(&phb->virt_devices);
+ pci_register_phb(phb, npu3_get_opal_id(npu->chip_id,
+ npu3_get_phb_index(npu->index)));
+ npu3_create_phb_slot(npu);
+ npu3_ioda_reset(phb, true);
+}
+
+static void npu3_dev_init_hw(struct npu3_dev *dev)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t reg, val;
+
+ reg = NPU3_RELAXED_CFG2(dev->index);
+ val = npu3_read(npu, reg);
+ val |= NPU3_RELAXED_CFG2_CMD_CL_DMA_W |
+ NPU3_RELAXED_CFG2_CMD_CL_DMA_W_HP |
+ NPU3_RELAXED_CFG2_CMD_CL_DMA_INJ |
+ NPU3_RELAXED_CFG2_CMD_PR_DMA_INJ |
+ NPU3_RELAXED_CFG2_CMD_DMA_PR_W |
+ NPU3_RELAXED_CFG2_CMD_CL_RD_NC_F0 |
+ NPU3_RELAXED_CFG2_SRC_RDENA(0);
+ npu3_write(npu, reg, val);
+
+ reg = NPU3_NTL_MISC_CFG2(dev->index);
+ val = npu3_read(npu, reg);
+ val |= NPU3_NTL_MISC_CFG2_BRICK_ENABLE |
+ NPU3_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA;
+ npu3_write(npu, reg, val);
+}
+
+static void npu3_init_hw(struct npu3 *npu)
+{
+ struct npu3_dev *dev;
+ uint64_t reg, val;
+
+ reg = NPU3_XTS_CFG;
+ val = npu3_read(npu, reg);
+ val |= NPU3_XTS_CFG_MMIOSD | NPU3_XTS_CFG_TRY_ATR_RO;
+ npu3_write(npu, reg, val);
+
+ reg = NPU3_XTS_CFG2;
+ val = npu3_read(npu, reg);
+ val |= NPU3_XTS_CFG2_NO_FLUSH_ENA;
+ npu3_write(npu, reg, val);
+
+ reg = NPU3_RELAXED_SRC(0);
+ val = NPU3_RELAXED_SRC_MASK_NPU;
+ npu3_write(npu, reg, val);
+
+ npu3_for_each_nvlink_dev(dev, npu)
+ npu3_dev_init_hw(dev);
+}
+
+/* PCI command register (BAR enable/disable) */
+static int64_t npu3_cfg_cmd(void *pvd,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
+
+ if (!write)
+ return OPAL_PARTIAL;
+
+ if (offset != PCI_CFG_CMD)
+ return OPAL_PARAMETER;
+
+ if (size != 1 && size != 2 && size != 4)
+ return OPAL_PARAMETER;
+
+ npu3_dev_enable_bars(dev, !!(*data & PCI_CFG_CMD_MEM_EN));
+
+ return OPAL_PARTIAL;
+}
+
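+/*
+ * Standard PCI BAR sizing: software writes all 1's to a BAR and reads it
+ * back to discover the BAR size. The write arms a "trap" bit so that the
+ * next read of the same dword returns the size instead of the address.
+ */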
+static int64_t npu3_cfg_bar_write(struct npu3_bar *bar, uint64_t mask,
+ uint32_t data)
+{
+ if (data != 0xffffffff)
+ return OPAL_HARDWARE;
+
+ /* Return BAR size on next read */
+ bar->trap |= mask;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_cfg_bar_read(struct npu3_bar *bar, uint64_t mask,
+ uint32_t *data)
+{
+ if (!(bar->trap & mask))
+ return OPAL_PARTIAL;
+
+ *data = GETFIELD(mask, bar->size);
+ bar->trap &= ~mask;
+
+ return OPAL_SUCCESS;
+}
+
+/* PCI BAR registers (NTL/GENID) */
+static int64_t npu3_cfg_bar(void *pvd __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size, uint32_t *data,
+ bool write)
+{
+ struct npu3_bar *bar = (struct npu3_bar *)pcrf->data;
+ uint64_t mask;
+
+ if (size != 4)
+ return OPAL_PARAMETER;
+
+ if (offset == pcrf->start)
+ mask = 0xffffffff;
+ else if (offset == pcrf->start + 4)
+ mask = 0xffffffffull << 32;
+ else
+ return OPAL_PARAMETER;
+
+ if (write)
+ return npu3_cfg_bar_write(bar, mask, *data);
+
+ return npu3_cfg_bar_read(bar, mask, data);
+}
+
+/* PCI control register */
+static int64_t npu3_cfg_devctl(void *pvd,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
+
+ if (!write)
+ return OPAL_HARDWARE;
+
+ if (size != 2 || offset & 1) {
+ NPU3DEVERR(dev, "Unsupported write to pcie control register\n");
+ return OPAL_PARAMETER;
+ }
+
+ if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
+ if (!npu3_dev_reset(dev))
+ purge_l2_l3_caches();
+
+ return OPAL_PARTIAL;
+}
+
+static uint32_t npu3_cfg_populate_pcie_cap(struct npu3_dev *dev, uint32_t start,
+ uint32_t prev_cap)
+{
+ struct pci_virt_device *pvd = dev->nvlink.pvd;
+ uint32_t val;
+
+ /* Add capability list */
+ PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP);
+
+ /* 0x00 - ID/PCIE capability */
+ val = PCI_CFG_CAP_ID_EXP;
+ val |= 0x2 << 16 | PCIE_TYPE_ENDPOINT << 20;
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val);
+
+ /* 0x04 - Device capability */
+ val = PCIE_MPSS_128 |
+ PCIE_PHANTOM_NONE << 3 |
+ PCIE_L0SL_MAX_NO_LIMIT << 6 |
+ PCIE_L1L_MAX_NO_LIMIT << 9 |
+ PCICAP_EXP_DEVCAP_FUNC_RESET;
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val);
+
+ pci_virt_add_filter(pvd, start + PCICAP_EXP_DEVCTL, 2,
+ PCI_REG_FLAG_WRITE,
+ npu3_cfg_devctl, NULL);
+
+ /* 0x08 - Device control and status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810,
+ 0xffff0000, 0x000f0000);
+
+ /* 0x0c - Link capability */
+ val = PCIE_LSPEED_VECBIT_2 | PCIE_LWIDTH_1X << 4;
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val);
+
+ /* 0x10 - Link control and status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000,
+ 0xfffff000, 0xc0000000);
+
+ /* 0x14 - Slot capability */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
+
+ /* 0x18 - Slot control and status */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
+
+ /* 0x1c - Root control and capability */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000,
+ 0xffffffe0, 0x00000000);
+
+ /* 0x20 - Root status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000,
+ 0xffffffff, 0x00010000);
+
+ /* 0x24 - Device capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCIECAP_EXP_DCAP2, 4, 0x00000000);
+
+ /* 0x28 - Device Control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000,
+ 0xffff0000, 0x00000000);
+
+ /* 0x2c - Link capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007);
+
+ /* 0x30 - Link control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003,
+ 0xffff0000, 0x00200000);
+
+ /* 0x34 - Slot capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000);
+
+ /* 0x38 - Slot control and status 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000);
+
+ return start + PCICAP_EXP_SCTL2 + 8;
+}
+
+static int64_t npu3_dev_procedure_write(struct npu3_dev *dev, uint32_t offset,
+ uint32_t data)
+{
+ switch (offset) {
+ case 0:
+ NPU3DEVINF(dev, "Ignoring write to status register\n");
+ break;
+ case 4:
+ npu3_dev_procedure_init(dev, data);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu3_dev_procedure_read(struct npu3_dev *dev, uint32_t offset,
+ uint32_t *data)
+{
+ switch (offset) {
+ case 0:
+ *data = npu3_dev_procedure_status(dev);
+ break;
+ case 4:
+ *data = dev->proc.number;
+ break;
+ default:
+ *data = 0;
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+/* Hardware procedure control/status registers */
+static int64_t npu3_dev_procedure(void *pvd, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
+
+ if (size != 4)
+ return OPAL_PARAMETER;
+
+ offset -= pcrf->start;
+
+ if (write)
+ return npu3_dev_procedure_write(dev, offset, *data);
+
+ return npu3_dev_procedure_read(dev, offset, data);
+}
+
+/* PPE SRAM access is indirect via CSAR/CSDR */
+static void npu3_dev_ppe_sram_sel(struct npu3_dev *dev, uint32_t reg)
+{
+ uint64_t val;
+
+ val = SETFIELD(OB_PPE_CSAR_SRAM_ADDR, 0ull, reg);
+ xscom_write(dev->npu->chip_id, OB_PPE_CSAR(dev->ob_chiplet), val);
+}
+
+static void npu3_dev_ppe_sram_write(struct npu3_dev *dev, uint32_t reg,
+ uint64_t val)
+{
+ npu3_dev_ppe_sram_sel(dev, reg);
+ xscom_write(dev->npu->chip_id, OB_PPE_CSDR(dev->ob_chiplet), val);
+}
+
+static uint64_t npu3_dev_ppe_sram_read(struct npu3_dev *dev, uint32_t reg)
+{
+ uint64_t val;
+
+ npu3_dev_ppe_sram_sel(dev, reg);
+ xscom_read(dev->npu->chip_id, OB_PPE_CSDR(dev->ob_chiplet), &val);
+
+ return val;
+}
+
+/* Software-implemented autonomous link training (SALT) */
+static int64_t npu3_dev_salt(void *pvd, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size, uint32_t *data,
+ bool write)
+{
+ struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
+ unsigned long timeout;
+ uint32_t cmd_reg;
+ uint64_t val;
+
+ if (size != 4 || offset != pcrf->start)
+ return OPAL_PARAMETER;
+
+ /* The config register before this one holds CMD_REG */
+ PCI_VIRT_CFG_NORMAL_RD(pvd, pcrf->start - 4, 4, &cmd_reg);
+ if (cmd_reg == 0xffffffff)
+ return OPAL_PARAMETER;
+
+ /* Check for another command in progress */
+ val = npu3_dev_ppe_sram_read(dev, OB_PPE_SALT_CMD);
+ if (GETFIELD(OB_PPE_SALT_CMD_READY, val)) {
+ NPU3DEVINF(dev, "SALT_CMD 0x%x: Not ready\n", cmd_reg);
+ return OPAL_BUSY;
+ }
+
+ val = OB_PPE_SALT_CMD_READY;
+ val = SETFIELD(OB_PPE_SALT_CMD_RW, val, write);
+ val = SETFIELD(OB_PPE_SALT_CMD_LINKNUM, val, npu3_chip_dev_index(dev));
+ val = SETFIELD(OB_PPE_SALT_CMD_REG, val, cmd_reg);
+ if (write)
+ val = SETFIELD(OB_PPE_SALT_CMD_DATA, val, *data);
+
+ npu3_dev_ppe_sram_write(dev, OB_PPE_SALT_CMD, val);
+
+ /* Wait for the go bit to clear */
+ timeout = mftb() + msecs_to_tb(1000);
+
+ while (GETFIELD(OB_PPE_SALT_CMD_READY, val)) {
+ if (tb_compare(mftb(), timeout) == TB_AAFTERB) {
+ NPU3DEVINF(dev, "SALT_CMD 0x%x: Timeout\n", cmd_reg);
+ return OPAL_BUSY;
+ }
+
+ val = npu3_dev_ppe_sram_read(dev, OB_PPE_SALT_CMD);
+ }
+
+ if (GETFIELD(OB_PPE_SALT_CMD_ERR, val))
+ NPU3DEVINF(dev, "SALT_CMD 0x%x: Error\n", cmd_reg);
+
+ if (!write)
+ *data = GETFIELD(OB_PPE_SALT_CMD_DATA, val);
+
+ return OPAL_SUCCESS;
+}
+
+#define VENDOR_CAP_LEN 0x1c
+#define VENDOR_CAP_VERSION 0x02
+
+static uint32_t npu3_cfg_populate_vendor_cap(struct npu3_dev *dev,
+ uint32_t start, uint32_t prev_cap)
+{
+ struct pci_virt_device *pvd = dev->nvlink.pvd;
+
+ /* Capabilities list */
+ PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR);
+
+ /* Length and version */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, VENDOR_CAP_LEN);
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, VENDOR_CAP_VERSION);
+
+ /*
+ * Defaults when the trap can't handle the read/write (e.g. due to
+ * reading/writing less than 4 bytes).
+ */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0);
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0);
+
+ /* PHY procedure trap */
+ pci_virt_add_filter(pvd, start + 4, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu3_dev_procedure, NULL);
+
+ /* Link index */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, npu3_chip_dev_index(dev));
+
+ /* SALT registers */
+ PCI_VIRT_CFG_INIT(pvd, start + 0x10, 4, 0xffffffff, 0, 0);
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 0x14, 4, 0);
+
+ pci_virt_add_filter(pvd, start + 0x14, 4,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu3_dev_salt, NULL);
+
+ return start + VENDOR_CAP_LEN;
+}
+
+static void npu3_cfg_populate(struct npu3_dev *dev)
+{
+ struct pci_virt_device *pvd = dev->nvlink.pvd;
+ uint64_t addr;
+ uint32_t pos;
+
+ /* 0x00 - Vendor/Device ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
+
+ /* 0x04 - Command/Status */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
+ 0xf9000000);
+
+ pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
+ npu3_cfg_cmd, NULL);
+
+ /* 0x08 - Rev/Class/Cache */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800102);
+
+ /* 0x0c - CLS/Latency Timer/Header/BIST */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
+
+ /* 0x10/14 - NTL BAR */
+ addr = SETFIELD(0xf, dev->ntl_bar.addr,
+ PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64);
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4, lo32(addr), 0xf, 0);
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, hi32(addr), 0, 0);
+
+ pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu3_cfg_bar, &dev->ntl_bar);
+
+ /* 0x18/1c - GENID BAR */
+ addr = SETFIELD(0xf, dev->genid_bar.addr,
+ PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64);
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, lo32(addr), 0xf, 0);
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, hi32(addr), 0, 0);
+
+ pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu3_cfg_bar, &dev->genid_bar);
+
+ /* 0x20/0x24 - BARs, disabled */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
+
+ /* 0x28 - Cardbus CIS pointer */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
+
+ /* 0x2c - Subsystem ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
+
+ /* 0x30 - ROM BAR, zero sized */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
+
+ /* 0x34 - PCI Capability */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
+
+ /* 0x38 - Reserved */
+ PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
+
+ /* 0x3c - INT line/pin/Minimal grant/Maximal latency */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); /* INT A */
+
+ /* PCIE and vendor specific capability */
+ pos = npu3_cfg_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP);
+ pos = npu3_cfg_populate_vendor_cap(dev, pos, 0x41);
+ PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0);
+}
+
+static void npu3_dev_create_pvd(struct npu3_dev *dev)
+{
+ struct npu3 *npu = dev->npu;
+ struct phb *phb = &npu->nvlink.phb;
+
+ dev->nvlink.pvd = pci_virt_add_device(phb, dev->index, 0x100, dev);
+ if (!dev->nvlink.pvd)
+ return;
+
+ phb->scan_map |= 0x1 << GETFIELD(0xf8, dev->nvlink.pvd->bdfn);
+ npu3_cfg_populate(dev);
+}
+
+static void npu3_dt_add_mmio_atsd(struct npu3 *npu)
+{
+ struct dt_node *dn = npu->nvlink.phb.dt_node;
+ uint64_t mmio_atsd[NPU3_XTS_ATSD_MAX];
+
+ for (uint32_t i = 0; i < NPU3_XTS_ATSD_MAX; i++)
+ mmio_atsd[i] = npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(i);
+
+ dt_add_property(dn, "ibm,mmio-atsd", mmio_atsd, sizeof(mmio_atsd));
+}
+
+static void npu3_dt_add_mmio_window(struct npu3 *npu)
+{
+ struct dt_node *dn = npu->nvlink.phb.dt_node;
+ uint32_t ntl0_index = npu->index * NPU3_LINKS_PER_NPU;
+ uint64_t addr, size, win[2];
+
+ /* Device MMIO window (NTL/GENID regs only) */
+ phys_map_get(npu->chip_id, NPU_NTL, ntl0_index, &win[0], NULL);
+ phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, &size);
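+ /* Window length runs from the first NTL BAR of this NPU to the end of its GENID BAR */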
+ win[1] = addr + size - win[0];
+
+ dt_add_property(dn, "ibm,mmio-window", win, sizeof(win));
+ dt_add_property_cells(dn, "ranges", 0x02000000,
+ hi32(win[0]), lo32(win[0]),
+ hi32(win[0]), lo32(win[0]),
+ hi32(win[1]), lo32(win[1]));
+}
+
+/* NDL No-Stall Event level */
+static uint32_t npu3_dev_interrupt_level(struct npu3_dev *dev)
+{
+ const uint32_t level[12] = { 1, 3, 5, 7, 9, 11,
+ 43, 45, 47, 49, 51, 53 };
+
+ return level[npu3_chip_dev_index(dev)];
+}
+
+static void npu3_dt_add_interrupts(struct npu3 *npu)
+{
+ struct dt_node *dn = npu->nvlink.phb.dt_node;
+ uint32_t *map, icsp, i = 0;
+ struct npu3_dev *dev;
+ size_t map_size = 0;
+
+ npu3_for_each_nvlink_dev(dev, npu)
+ map_size += sizeof(*map) * 7;
+
+ if (!map_size)
+ return;
+
+ icsp = get_ics_phandle();
+ map = zalloc(map_size);
+ assert(map);
+
+ npu3_for_each_nvlink_dev(dev, npu) {
+ map[i] = dev->nvlink.pvd->bdfn << 8;
+ map[i + 3] = 1; /* INT A */
+ map[i + 4] = icsp; /* interrupt-parent */
+ map[i + 5] = npu->irq_base + npu3_dev_interrupt_level(dev);
+ map[i + 6] = 0; /* 0 = EDGE, 1 = LEVEL */
+ i += 7;
+ }
+
+ dt_add_property_cells(dn, "interrupt-parent", icsp);
+ dt_add_property(dn, "interrupt-map", map, map_size);
+ dt_add_property_cells(dn, "interrupt-map-mask", 0xff00, 0x0, 0x0, 0x7);
+
+ free(map);
+}
+
+/* Populate PCI root device node */
+static void npu3_dt_add_props(struct npu3 *npu)
+{
+ struct dt_node *dn = npu->nvlink.phb.dt_node;
+
+ dt_add_property_cells(dn, "#address-cells", 3);
+ dt_add_property_cells(dn, "#size-cells", 2);
+ dt_add_property_cells(dn, "#interrupt-cells", 1);
+ dt_add_property_cells(dn, "bus-range", 0, 0xff);
+ dt_add_property_cells(dn, "clock-frequency", 0x200, 0);
+
+ dt_add_property_strings(dn, "device_type", "pciex");
+
+ /*
+ * To the OS, npu2 and npu3 are both ibm,ioda2-npu2-phb. The added
+ * ibm,ioda2-npu3-phb allows for possible quirks.
+ */
+ dt_add_property_strings(dn, "compatible",
+ "ibm,power9-npu-pciex",
+ "ibm,ioda2-npu2-phb",
+ "ibm,ioda2-npu3-phb");
+
+ dt_add_property_cells(dn, "ibm,phb-index",
+ npu3_get_phb_index(npu->index));
+ dt_add_property_cells(dn, "ibm,phb-diag-data-size", 0);
+ dt_add_property_cells(dn, "ibm,opal-num-pes", NPU3_MAX_PE_NUM);
+ dt_add_property_cells(dn, "ibm,opal-reserved-pe", NPU3_RESERVED_PE_NUM);
+ dt_add_property_cells(dn, "ibm,supported-tce-sizes",
+ 12, /* 4K */
+ 16, /* 64K */
+ 24, /* 16M */
+ 28); /* 256M */
+
+ dt_add_property_cells(dn, "ibm,chip-id", npu->chip_id);
+ dt_add_property_cells(dn, "ibm,npu-index", npu->index);
+ dt_add_property_cells(dn, "ibm,npcq", npu->dt_node->phandle);
+ dt_add_property_cells(dn, "ibm,xscom-base", npu->xscom_base);
+ dt_add_property_cells(dn, "ibm,links", NPU3_LINKS_PER_NPU);
+
+ dt_add_property(dn, "reg", npu->regs, sizeof(npu->regs));
+
+ npu3_dt_add_mmio_atsd(npu);
+ npu3_dt_add_mmio_window(npu);
+ npu3_dt_add_interrupts(npu);
+}
+
+void npu3_init_nvlink(struct npu3 *npu)
+{
+ struct npu3_dev *dev;
+
+ if (!npu3_next_dev(npu, NULL, NPU3_DEV_TYPE_NVLINK))
+ return;
+
+ npu3_init_hw(npu);
+ npu3_create_phb(npu);
+
+ npu3_for_each_nvlink_dev(dev, npu)
+ npu3_dev_create_pvd(dev);
+
+ npu3_dt_add_props(npu);
+
+ /* TODO: Sort out if/why we still can't enable this */
+ disable_fast_reboot("NVLink device enabled");
+}
+
+static int64_t npu3_init_context_pid(struct npu3 *npu, uint32_t index,
+ uint64_t msr)
+{
+ uint64_t map, old_map;
+
+ /* Unfiltered XTS mode; index is lparshort */
+ map = SETFIELD(NPU3_XTS_PID_MAP_LPARSHORT, 0ull, index);
+
+ /* Enable this mapping for both real and virtual addresses */
+ map |= NPU3_XTS_PID_MAP_VALID_ATRGPA0 | NPU3_XTS_PID_MAP_VALID_ATRGPA1;
+
+ /* Enable TLBIE/MMIOSD forwarding for this entry */
+ map |= NPU3_XTS_PID_MAP_VALID_ATSD;
+
+ /* Set the relevant MSR bits */
+ if (msr & MSR_DR)
+ map |= NPU3_XTS_PID_MAP_MSR_DR;
+
+ if (msr & MSR_HV)
+ map |= NPU3_XTS_PID_MAP_MSR_HV;
+
+ if (msr & MSR_PR)
+ map |= NPU3_XTS_PID_MAP_MSR_PR;
+
+ /* We don't support anything other than 64-bit so hardcode it here */
+ map |= NPU3_XTS_PID_MAP_MSR_SF;
+
+ old_map = npu3_read(npu, NPU3_XTS_PID_MAP(index));
+
+ /* Error out if this entry is already set with different msr bits */
+ if (old_map && GETFIELD(NPU3_XTS_PID_MAP_MSR, old_map) !=
+ GETFIELD(NPU3_XTS_PID_MAP_MSR, map)) {
+ NPU3ERR(npu, "%s: Unexpected MSR value\n", __func__);
+ return OPAL_PARAMETER;
+ }
+
+ if (!old_map) {
+ NPU3DBG(npu, "XTS_PID_MAP[%03d] = 0x%08llx\n", index, map);
+ npu3_write(npu, NPU3_XTS_PID_MAP(index), map);
+ }
+
+ npu->nvlink.ctx_ref[index]++;
+
+ return OPAL_SUCCESS;
+}
+
+#define NPU3_VALID_ATS_MSR_BITS (MSR_DR | MSR_HV | MSR_PR | MSR_SF)
+
+/*
+ * Allocate a context ID and initialize the tables with the relevant
+ * information. Returns the ID or error if one couldn't be allocated.
+ */
+int64_t npu3_init_context(struct phb *phb, uint64_t msr, uint64_t bdf)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ uint32_t lparshort, i;
+ uint64_t map;
+ int64_t rc;
+
+ /*
+ * MSR bits should be masked by the caller to allow for future
+ * expansion if required.
+ */
+ if (msr & ~NPU3_VALID_ATS_MSR_BITS)
+ return OPAL_UNSUPPORTED;
+
+ lock(&npu->lock);
+
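+ /* Find the XTS_BDF_MAP entry that npu3_map_lpar() installed for this bdf */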
+ for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
+ map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
+
+ if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
+ break;
+ }
+
+ if (i == NPU3_XTS_BDF_MAP_MAX) {
+ NPU3ERR(npu, "LPARID not associated with any GPU\n");
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ lparshort = GETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map);
+ NPU3DBG(npu, "Found LPARSHORT 0x%x for bdf %02llx:%02llx.%llx\n",
+ lparshort, PCI_BUS_NUM(bdf), PCI_DEV(bdf), PCI_FUNC(bdf));
+
+ rc = npu3_init_context_pid(npu, lparshort, msr);
+ if (rc)
+ goto out;
+
+ if (!(map & NPU3_XTS_BDF_MAP_VALID)) {
+ map |= NPU3_XTS_BDF_MAP_VALID;
+ npu3_write(npu, NPU3_XTS_BDF_MAP(i), map);
+ }
+
+ rc = lparshort;
+
+out:
+ unlock(&npu->lock);
+ return rc;
+}
+
+static int64_t npu3_destroy_context_pid(struct npu3 *npu, uint32_t index)
+{
+ if (!npu->nvlink.ctx_ref[index])
+ return OPAL_PARAMETER;
+
+ /* Only destroy when refcount hits 0 */
+ if (--npu->nvlink.ctx_ref[index])
+ return OPAL_PARTIAL;
+
+ NPU3DBG(npu, "XTS_PID_MAP[%03d] = 0 (destroy)\n", index);
+ npu3_write(npu, NPU3_XTS_PID_MAP(index), 0ull);
+
+ return OPAL_SUCCESS;
+}
+
+int64_t npu3_destroy_context(struct phb *phb, uint64_t bdf)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ uint32_t lparshort, i;
+ int64_t map, rc;
+
+ lock(&npu->lock);
+
+ for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
+ map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
+
+ if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
+ break;
+ }
+
+ if (i == NPU3_XTS_BDF_MAP_MAX) {
+ NPU3ERR(npu, "LPARID not associated with any GPU\n");
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ lparshort = GETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map);
+ rc = npu3_destroy_context_pid(npu, lparshort);
+
+out:
+ unlock(&npu->lock);
+ return rc;
+}
+
+/* Map the given virtual bdf to lparid with given lpcr */
+int64_t npu3_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
+ uint64_t lpcr)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ struct npu3_dev *dev;
+ int64_t rc = OPAL_SUCCESS;
+ uint64_t map, val;
+ uint32_t i;
+
+ /*
+ * The LPCR bits are only required for hash based ATS, which we don't
+ * currently support, but may need to in the future.
+ */
+ if (lpcr)
+ return OPAL_UNSUPPORTED;
+
+ lock(&npu->lock);
+
+ /* Update the entry if it already exists */
+ for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
+ map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
+
+ if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
+ break;
+ }
+
+ if (i == NPU3_XTS_BDF_MAP_MAX) {
+ /* No existing mapping found, find space for a new one */
+ for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++)
+ if (!npu3_read(npu, NPU3_XTS_BDF_MAP(i)))
+ break;
+ }
+
+ if (i == NPU3_XTS_BDF_MAP_MAX) {
+ NPU3ERR(npu, "No free XTS_BDF[] entry\n");
+ rc = OPAL_RESOURCE;
+ goto out;
+ }
+
+ map = NPU3_XTS_BDF_MAP_UNFILT;
+ map = SETFIELD(NPU3_XTS_BDF_MAP_BDF, map, bdf);
+ map = SETFIELD(NPU3_XTS_BDF_MAP_LPARID, map, lparid);
+ map = SETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map, i);
+
+ /* We only support radix at the moment */
+ map = SETFIELD(NPU3_XTS_BDF_MAP_XLAT, map, 0x3);
+
+ /* Find a link on which to send ATSDs for this device */
+ npu3_for_each_nvlink_dev(dev, npu)
+ if (dev->nvlink.gpu->bdfn == bdf)
+ break;
+
+ if (!dev || dev->nvlink.gpu->bdfn != bdf) {
+ NPU3ERR(npu, "Can't find a link for bdf %02llx:%02llx.%llx\n",
+ PCI_BUS_NUM(bdf), PCI_DEV(bdf), PCI_FUNC(bdf));
+ rc = OPAL_PARAMETER;
+ goto out;
+ }
+
+ map = SETFIELD(NPU3_XTS_BDF_MAP_BRICK, map, dev->index);
+
+ NPU3DBG(npu, "XTS_BDF_MAP[%03d] = 0x%08llx\n", i, map);
+ npu3_write(npu, NPU3_XTS_BDF_MAP(i), map);
+
+ /* We need to allocate an ATSD per link */
+ val = SETFIELD(NPU3_XTS_ATSD_HYP_LPARID, 0ull, lparid);
+ if (!lparid)
+ val |= NPU3_XTS_ATSD_HYP_MSR_HV;
+
+ npu3_write(npu, NPU3_XTS_ATSD_HYP(dev->index), val);
+
+out:
+ unlock(&npu->lock);
+ return rc;
+}
+
+static int64_t npu3_relaxed_order_enable(struct npu3 *npu, uint64_t src)
+{
+ struct npu3_dev *dev;
+ uint32_t i;
+
+ for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
+ if (npu3_read(npu, NPU3_RELAXED_SRC(i)) == src)
+ return OPAL_SUCCESS; /* Already enabled */
+
+ /* Find somewhere to write this source */
+ for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
+ if (!npu3_read(npu, NPU3_RELAXED_SRC(i)))
+ break;
+
+ if (i == NPU3_RELAXED_SRC_MAX) {
+ NPU3ERR(npu, "Insufficient resources to activate relaxed ordering mode\n");
+ return OPAL_RESOURCE;
+ }
+
+ npu3_write(npu, NPU3_RELAXED_SRC(i), src);
+
+ npu3_for_each_nvlink_dev(dev, npu) {
+ uint64_t val = npu3_read(npu, NPU3_RELAXED_CFG2(dev->index));
+
+ val |= NPU3_RELAXED_CFG2_SRC_WRENA(i) |
+ NPU3_RELAXED_CFG2_SRC_RDENA(i);
+ npu3_write(npu, NPU3_RELAXED_CFG2(dev->index), val);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static void npu3_relaxed_order_disable(struct npu3 *npu, uint64_t src)
+{
+ struct npu3_dev *dev;
+ uint32_t i;
+
+ for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
+ if (npu3_read(npu, NPU3_RELAXED_SRC(i)) == src)
+ break;
+
+ if (i == NPU3_RELAXED_SRC_MAX)
+ return; /* Already disabled */
+
+ npu3_for_each_nvlink_dev(dev, npu) {
+ uint64_t val = npu3_read(npu, NPU3_RELAXED_CFG2(dev->index));
+
+ val &= ~NPU3_RELAXED_CFG2_SRC_WRENA(i);
+ val &= ~NPU3_RELAXED_CFG2_SRC_RDENA(i);
+ npu3_write(npu, NPU3_RELAXED_CFG2(dev->index), val);
+ }
+
+ npu3_write(npu, NPU3_RELAXED_SRC(i), 0ull);
+}
+
+/* Enable or disable relaxed ordering on all nvlinks for a given PEC. */
+int64_t npu3_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
+ bool enable)
+{
+ struct npu3 *npu = npu3_phb_to_npu(phb);
+ int64_t rc = OPAL_SUCCESS;
+ uint64_t src;
+
+ NPU3INF(npu, "%s relaxed ordering for PEC %d on chip %d\n",
+ enable ? "Enabling" : "Disabling",
+ pec, gcid);
+
+ lock(&npu->lock);
+
+ src = SETFIELD(NPU3_RELAXED_SRC_GRPCHP, 0ull, gcid);
+ src = SETFIELD(NPU3_RELAXED_SRC_PEC, src, pec);
+ src = SETFIELD(NPU3_RELAXED_SRC_RDSTART, src, 0);
+ src = SETFIELD(NPU3_RELAXED_SRC_RDEND, src, 47);
+ src = SETFIELD(NPU3_RELAXED_SRC_WRSTART, src, 0);
+ src = SETFIELD(NPU3_RELAXED_SRC_WREND, src, 23);
+
+ if (enable)
+ rc = npu3_relaxed_order_enable(npu, src);
+ else
+ npu3_relaxed_order_disable(npu, src);
+
+ unlock(&npu->lock);
+ return rc;
+}
diff --git a/roms/skiboot/hw/npu3.c b/roms/skiboot/hw/npu3.c
new file mode 100644
index 000000000..03461373e
--- /dev/null
+++ b/roms/skiboot/hw/npu3.c
@@ -0,0 +1,549 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#include <io.h>
+#include <xscom.h>
+#include <npu3.h>
+#include <npu3-regs.h>
+#include <nvram.h>
+#include <interrupts.h>
+#include <xive.h>
+
+#define NPU3LOG(l, npu, fmt, a...) \
+ prlog(l, "NPU[%d:%d]: " fmt, (npu)->chip_id, (npu)->index, ##a)
+#define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a)
+#define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a)
+#define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a)
+
+#define NPU3DEVLOG(l, dev, fmt, a...) \
+ prlog(l, "NPU[%d:%d:%d]: " fmt, \
+ (dev)->npu->chip_id, \
+ (dev)->npu->index, \
+ (dev)->index, ##a)
+#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
+#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
+#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
+
+static void npu3_dt_create_link(struct dt_node *npu, uint32_t npu_index,
+ uint32_t dev_index)
+{
+ struct dt_node *link;
+ uint32_t phy_lane_mask, ob_chiplet;
+
+ link = dt_new_addr(npu, "link", dev_index);
+
+ dt_add_property_string(link, "compatible", "ibm,npu-link");
+ dt_add_property_cells(link, "reg", dev_index);
+ dt_add_property_cells(link, "ibm,npu-link-index", dev_index);
+
+ switch (npu_index) {
+ case 0:
+ /* fall through */
+ case 2:
+ ob_chiplet = npu_index ? 3 : 0;
+
+ switch (dev_index) {
+ case 0:
+ phy_lane_mask = PPC_BITMASK32(0, 3);
+ break;
+ case 1:
+ phy_lane_mask = PPC_BITMASK32(13, 16);
+ break;
+ case 2:
+ phy_lane_mask = PPC_BITMASK32(7, 10);
+ break;
+ case 3:
+ phy_lane_mask = PPC_BITMASK32(20, 23);
+ break;
+ }
+
+ break;
+ case 1:
+ switch (dev_index) {
+ case 0:
+ ob_chiplet = 1;
+ phy_lane_mask = PPC_BITMASK32(0, 3);
+ break;
+ case 1:
+ ob_chiplet = 2;
+ phy_lane_mask = PPC_BITMASK32(0, 3);
+ break;
+ case 2:
+ ob_chiplet = 1;
+ phy_lane_mask = PPC_BITMASK32(7, 10);
+ break;
+ case 3:
+ ob_chiplet = 2;
+ phy_lane_mask = PPC_BITMASK32(7, 10);
+ break;
+ }
+
+ break;
+ default:
+ return;
+ }
+
+ dt_add_property_cells(link, "ibm,npu-phy", ob_chiplet);
+ dt_add_property_cells(link, "ibm,npu-lane-mask", phy_lane_mask);
+}
+
+static void npu3_dt_create_npu(struct dt_node *xscom, uint32_t npu_index)
+{
+ const uint32_t npu_base[] = { 0x5011000, 0x5011400, 0x3011c00 };
+ struct dt_node *npu;
+
+ npu = dt_new_addr(xscom, "npu", npu_base[npu_index]);
+
+ dt_add_property_cells(npu, "#size-cells", 0);
+ dt_add_property_cells(npu, "#address-cells", 1);
+ dt_add_property_cells(npu, "reg", npu_base[npu_index], 0x2c);
+ dt_add_property_string(npu, "compatible", "ibm,power9-npu3");
+ dt_add_property_cells(npu, "ibm,npu-index", npu_index);
+
+ for (uint32_t i = 0; i < NPU3_LINKS_PER_NPU; i++)
+ npu3_dt_create_link(npu, npu_index, i);
+}
+
+/* This can be removed when/if we decide to use HDAT instead */
+static bool npu3_dt_create(void)
+{
+ struct proc_chip *chip = next_chip(NULL);
+ struct dt_node *xscom;
+
+ /* npu3 chips only */
+ if (proc_gen < proc_gen_p9 ||
+ chip->type == PROC_CHIP_P9_NIMBUS ||
+ chip->type == PROC_CHIP_P9_CUMULUS)
+ return false;
+
+ dt_for_each_compatible(dt_root, xscom, "ibm,xscom")
+ for (uint32_t i = 0; i < 3; i++)
+ npu3_dt_create_npu(xscom, i);
+
+ return true;
+}
+
+static struct npu3 *npu3_create(struct dt_node *dn)
+{
+ struct npu3 *npu;
+ struct dt_node *link;
+ struct npu3_dev *dev;
+ char *path;
+ uint32_t i;
+
+ npu = zalloc(sizeof(*npu));
+ assert(npu);
+
+ init_lock(&npu->lock);
+
+ npu->dt_node = dn;
+ npu->index = dt_prop_get_u32(dn, "ibm,npu-index");
+ npu->xscom_base = dt_get_address(dn, 0, NULL);
+
+ npu->chip_id = dt_get_chip_id(dn);
+ assert(get_chip(npu->chip_id));
+
+ dt_for_each_compatible(dn, link, "ibm,npu-link") {
+ i = dt_prop_get_u32(link, "ibm,npu-link-index");
+ assert(i < NPU3_LINKS_PER_NPU);
+
+ dev = &npu->devices[i];
+ dev->index = i;
+ dev->npu = npu;
+ dev->dn = link;
+ dev->ob_chiplet = dt_prop_get_u32(link, "ibm,npu-phy");
+ dev->phy_lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
+ dev->proc.status = NPU3_PROC_COMPLETE;
+ };
+
+ path = dt_get_path(dn);
+ NPU3INF(npu, "Found %s\n", path);
+ NPU3INF(npu, "SCOM base: 0x%llx\n", npu->xscom_base);
+ free(path);
+
+ return npu;
+}
+
+struct npu3_dev *npu3_next_dev(struct npu3 *npu, struct npu3_dev *dev,
+ enum npu3_dev_type type)
+{
+ uint32_t i = 0;
+
+ if (dev)
+ i = dev->index + 1;
+
+ for (; i < NPU3_LINKS_PER_NPU; i++) {
+ dev = &npu->devices[i];
+
+ if (dev->type == type || type == NPU3_DEV_TYPE_ANY)
+ return dev;
+ }
+
+ return NULL;
+}
+
+static void npu3_device_detect_fixup(struct npu3_dev *dev)
+{
+ struct dt_node *dn = dev->dn;
+
+ if (dev->type == NPU3_DEV_TYPE_NVLINK) {
+ dt_add_property_strings(dn, "ibm,npu-link-type", "nvlink");
+ dev->link_speed = dt_prop_get_u32_def(
+ dn, "nvidia,link-speed", 0xff);
+ return;
+ }
+
+ NPU3DEVDBG(dev, "Link type unknown\n");
+ dt_add_property_strings(dn, "ibm,npu-link-type", "unknown");
+}
+
+/*
+ * We use the indirect method because it uses the same addresses as
+ * the MMIO offsets (NPU RING)
+ */
+static void npu3_scom_sel(struct npu3 *npu, uint64_t reg, uint64_t size)
+{
+ uint64_t val;
+
+ val = SETFIELD(NPU3_MISC_DA_ADDR, 0ull, reg);
+ val = SETFIELD(NPU3_MISC_DA_LEN, val, size);
+ xscom_write(npu->chip_id,
+ npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_ADDR,
+ val);
+}
+
+static void npu3_scom_write(struct npu3 *npu, uint64_t reg, uint64_t size,
+ uint64_t val)
+{
+ npu3_scom_sel(npu, reg, size);
+ xscom_write(npu->chip_id,
+ npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA,
+ val);
+}
+
+static uint64_t npu3_scom_read(struct npu3 *npu, uint64_t reg, uint64_t size)
+{
+ uint64_t val;
+
+ npu3_scom_sel(npu, reg, size);
+ xscom_read(npu->chip_id,
+ npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA,
+ &val);
+
+ return val;
+}
+
+void npu3_write(struct npu3 *npu, uint64_t reg, uint64_t val)
+{
+ void *mmio = (void *)npu->regs[0];
+
+ if (mmio)
+ out_be64(mmio + reg, val);
+ else
+ npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_8B, val);
+
+ /* CQ_SM writes should be mirrored in all four blocks */
+ if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0))
+ return;
+
+ for (uint32_t i = 1; i < 4; i++)
+ npu3_write(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg),
+ val);
+}
+
+uint64_t npu3_read(struct npu3 *npu, uint64_t reg)
+{
+ void *mmio = (void *)npu->regs[0];
+
+ if (mmio)
+ return in_be64(mmio + reg);
+
+ return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_8B);
+}
+
+void npu3_write_4b(struct npu3 *npu, uint64_t reg, uint32_t val)
+{
+ void *mmio = (void *)npu->regs[0];
+
+ if (mmio)
+ out_be32(mmio + reg, val);
+ else
+ npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_4B,
+ (uint64_t)val << 32);
+
+ if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0))
+ return;
+
+ for (uint32_t i = 1; i < 4; i++)
+ npu3_write_4b(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg),
+ val);
+}
+
+uint32_t npu3_read_4b(struct npu3 *npu, uint64_t reg)
+{
+ void *mmio = (void *)npu->regs[0];
+
+ if (mmio)
+ return in_be32(mmio + reg);
+
+ return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_4B) >> 32;
+}
+
+static void npu3_misc_config(struct npu3 *npu)
+{
+ struct npu3_dev *dev;
+ uint32_t typemap = 0;
+ uint64_t reg, val;
+
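+ /*
+ * One mask bit per brick: set for NVLink bricks; the complement is
+ * used below to flag the remaining bricks as OpenCAPI mode.
+ */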
+ npu3_for_each_nvlink_dev(dev, npu)
+ typemap |= 0x10 >> dev->index;
+
+ reg = NPU3_MCP_MISC_CFG0;
+ val = npu3_read(npu, reg);
+ val |= NPU3_MCP_MISC_CFG0_ENABLE_PBUS;
+ val &= ~NPU3_MCP_MISC_CFG0_ENABLE_SNARF_CPM;
+ val = SETFIELD(NPU3_MCP_MISC_CFG0_NVLINK_MODE, val, typemap);
+ val = SETFIELD(NPU3_MCP_MISC_CFG0_OCAPI_MODE, val, ~typemap);
+ npu3_write(npu, reg, val);
+
+ reg = NPU3_SNP_MISC_CFG0;
+ val = npu3_read(npu, reg);
+ val |= NPU3_SNP_MISC_CFG0_ENABLE_PBUS;
+ val = SETFIELD(NPU3_SNP_MISC_CFG0_NVLINK_MODE, val, typemap);
+ val = SETFIELD(NPU3_SNP_MISC_CFG0_OCAPI_MODE, val, ~typemap);
+ npu3_write(npu, reg, val);
+
+ reg = NPU3_CTL_MISC_CFG2;
+ val = npu3_read(npu, reg);
+ val = SETFIELD(NPU3_CTL_MISC_CFG2_NVLINK_MODE, val, typemap);
+ val = SETFIELD(NPU3_CTL_MISC_CFG2_OCAPI_MODE, val, ~typemap);
+ npu3_write(npu, reg, val);
+
+ reg = NPU3_DAT_MISC_CFG1;
+ val = npu3_read(npu, reg);
+ val = SETFIELD(NPU3_DAT_MISC_CFG1_NVLINK_MODE, val, typemap);
+ val = SETFIELD(NPU3_DAT_MISC_CFG1_OCAPI_MODE, val, ~typemap);
+ npu3_write(npu, reg, val);
+}
+
+static void npu3_assign_bars(struct npu3 *npu)
+{
+ struct npu3_dev *dev;
+ uint64_t addr, size, val;
+
+ /* Global MMIO bar (per npu) */
+ phys_map_get(npu->chip_id, NPU_REGS, npu->index, &addr, &size);
+ val = SETFIELD(NPU3_MMIO_BAR_ADDR, 0ull, addr >> 24);
+ val |= NPU3_MMIO_BAR_ENABLE;
+ npu3_write(npu, NPU3_MMIO_BAR, val);
+
+ NPU3INF(npu, "MMIO base: 0x%016llx (%lldMB)\n", addr, size >> 20);
+ npu->regs[0] = addr;
+ npu->regs[1] = size;
+
+ /* NTL bar (per device) */
+ npu3_for_each_dev(dev, npu) {
+ phys_map_get(npu->chip_id, NPU_NTL, npu3_chip_dev_index(dev),
+ &addr, &size);
+ val = SETFIELD(NPU3_NTL_BAR_ADDR, 0ull, addr >> 16);
+ val = SETFIELD(NPU3_NTL_BAR_SIZE, val, ilog2(size >> 16));
+ npu3_write(npu, NPU3_NTL_BAR(dev->index), val);
+
+ dev->ntl_bar.addr = addr;
+ dev->ntl_bar.size = size;
+ }
+
+ /* GENID bar (logically divided per device) */
+ phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, NULL);
+ val = SETFIELD(NPU3_GENID_BAR_ADDR, 0ull, addr >> 19);
+ npu3_write(npu, NPU3_GENID_BAR, val);
+
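+ /* Carve the single GENID window into one 64K slice per device */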
+ npu3_for_each_dev(dev, npu) {
+ dev->genid_bar.addr = addr + (dev->index << 16);
+ dev->genid_bar.size = 64 << 10;
+ }
+}
+
+void npu3_dev_enable_bars(struct npu3_dev *dev, bool enable)
+{
+ struct npu3 *npu = dev->npu;
+ uint64_t reg, val;
+
+ if (dev->ntl_bar.enable == enable) /* No state change */
+ return;
+
+ dev->ntl_bar.enable = enable;
+ dev->genid_bar.enable = enable;
+
+ reg = NPU3_NTL_BAR(dev->index);
+ val = npu3_read(npu, reg);
+ val = SETFIELD(NPU3_NTL_BAR_ENABLE, val, enable);
+ npu3_write(npu, reg, val);
+
+ /*
+ * Generation IDs are a single space in the hardware but we split them
+ * per device. Only disable in hardware if every device has disabled.
+ */
+ if (!enable)
+ npu3_for_each_dev(dev, npu)
+ if (dev->genid_bar.enable)
+ return;
+
+ reg = NPU3_GENID_BAR;
+ val = npu3_read(npu, reg);
+ val = SETFIELD(NPU3_GENID_BAR_ENABLE, val, enable);
+ npu3_write(npu, reg, val);
+}
+
+static uint64_t npu3_ipi_attributes(struct irq_source *is, uint32_t isn)
+{
+ struct npu3 *npu = is->data;
+ uint32_t level = isn - npu->irq_base;
+
+ /* TCE interrupt is used to detect a frozen PE */
+ if (level == 18)
+ return IRQ_ATTR_TARGET_OPAL |
+ IRQ_ATTR_TARGET_RARE |
+ IRQ_ATTR_TYPE_MSI;
+
+ return IRQ_ATTR_TARGET_LINUX;
+}
+
+static void npu3_ipi_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct npu3 *npu = is->data;
+ uint32_t level = isn - npu->irq_base;
+
+ if (level != 18) {
+ NPU3ERR(npu, "Received unknown interrupt %d\n", level);
+ return;
+ }
+
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, OPAL_EVENT_PCI_ERROR);
+}
+
+#define NPU3_IRQ_LEVELS 60
+
+static char *npu3_ipi_name(struct irq_source *is, uint32_t isn)
+{
+ struct npu3 *npu = is->data;
+ uint32_t level = isn - npu->irq_base;
+ static const char *names[NPU3_IRQ_LEVELS] = {
+ [0] = "NDL 0 Stall Event (brick 0)",
+ [1] = "NDL 0 No-Stall Event (brick 0)",
+ [2] = "NDL 1 Stall Event (brick 1)",
+ [3] = "NDL 1 No-Stall Event (brick 1)",
+ [4] = "NDL 2 Stall Event (brick 2)",
+ [5] = "NDL 2 No-Stall Event (brick 2)",
+ [6] = "NDL 3 Stall Event (brick 3)",
+ [7] = "NDL 3 No-Stall Event (brick 3)",
+ [8] = "NDL 4 Stall Event (brick 4)",
+ [9] = "NDL 4 No-Stall Event (brick 4)",
+ [10] = "NDL 5 Stall Event (brick 5)",
+ [11] = "NDL 5 No-Stall Event (brick 5)",
+ [12] = "NTL 0 Event",
+ [13] = "NTL 1 Event",
+ [14] = "NTL 2 Event",
+ [15] = "NTL 3 Event",
+ [16] = "NTL 4 Event",
+ [17] = "NTL 5 Event",
+ [18] = "TCE Event",
+ [19] = "ATS Event",
+ [20] = "CQ Event",
+ [21] = "MISC Event",
+ [41] = "Memory Controller Event",
+ [42] = "NDL 6 Stall Event (brick 6)",
+ [43] = "NDL 6 No-Stall Event (brick 6)",
+ [44] = "NDL 7 Stall Event (brick 7)",
+ [45] = "NDL 7 No-Stall Event (brick 7)",
+ [46] = "NDL 8 Stall Event (brick 8)",
+ [47] = "NDL 8 No-Stall Event (brick 8)",
+ [48] = "NDL 9 Stall Event (brick 9)",
+ [49] = "NDL 9 No-Stall Event (brick 9)",
+ [50] = "NDL 10 Stall Event (brick 10)",
+ [51] = "NDL 10 No-Stall Event (brick 10)",
+ [52] = "NDL 11 Stall Event (brick 11)",
+ [53] = "NDL 11 No-Stall Event (brick 11)",
+ [54] = "NTL 6 Event",
+ [55] = "NTL 7 Event",
+ [56] = "NTL 8 Event",
+ [57] = "NTL 9 Event",
+ [58] = "NTL 10 Event",
+ [59] = "NTL 11 Event",
+ };
+
+ if (level >= NPU3_IRQ_LEVELS || !names[level])
+ return strdup("Unknown");
+
+ return strdup(names[level]);
+}
+
+static const struct irq_source_ops npu3_ipi_ops = {
+ .attributes = npu3_ipi_attributes,
+ .interrupt = npu3_ipi_interrupt,
+ .name = npu3_ipi_name,
+};
+
+static void npu3_setup_irqs(struct npu3 *npu)
+{
+ uint64_t reg, val;
+ uint32_t base;
+
+ base = xive_alloc_ipi_irqs(npu->chip_id, NPU3_IRQ_LEVELS, 64);
+ if (base == XIVE_IRQ_ERROR) {
+ NPU3ERR(npu, "Failed to allocate interrupt sources\n");
+ return;
+ }
+
+ xive_register_ipi_source(base, NPU3_IRQ_LEVELS, npu, &npu3_ipi_ops);
+
+ /* Set IPI configuration */
+ reg = NPU3_MISC_CFG;
+ val = npu3_read(npu, reg);
+ val = SETFIELD(NPU3_MISC_CFG_IPI_PS, val, NPU3_MISC_CFG_IPI_PS_64K);
+ val = SETFIELD(NPU3_MISC_CFG_IPI_OS, val, NPU3_MISC_CFG_IPI_OS_AIX);
+ npu3_write(npu, reg, val);
+
+ /* Set IRQ base */
+ reg = NPU3_MISC_INT_BAR;
+ val = SETFIELD(NPU3_MISC_INT_BAR_ADDR, 0ull,
+ (uint64_t)xive_get_trigger_port(base) >> 12);
+ npu3_write(npu, reg, val);
+
+ npu->irq_base = base;
+}
+
+static void npu3_init(struct npu3 *npu)
+{
+ struct npu3_dev *dev;
+
+ platform.npu3_device_detect(npu);
+ npu3_for_each_dev(dev, npu)
+ npu3_device_detect_fixup(dev);
+
+ npu3_misc_config(npu);
+ npu3_assign_bars(npu);
+ npu3_setup_irqs(npu);
+ npu3_init_nvlink(npu);
+}
+
+void probe_npu3(void)
+{
+ struct dt_node *dn;
+ struct npu3 *npu;
+
+ if (!npu3_dt_create())
+ return;
+
+ if (!platform.npu3_device_detect) {
+ prlog(PR_INFO, "NPU: Platform does not support NPU\n");
+ return;
+ }
+
+ dt_for_each_compatible(dt_root, dn, "ibm,power9-npu3") {
+ npu = npu3_create(dn);
+ npu3_init(npu);
+ }
+}
diff --git a/roms/skiboot/hw/nx-842.c b/roms/skiboot/hw/nx-842.c
new file mode 100644
index 000000000..0cb87dcc8
--- /dev/null
+++ b/roms/skiboot/hw/nx-842.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NX unit 842 compression accelerator
+ *
+ * Copyright 2015-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <nx.h>
+#include <vas.h>
+
+/* Configuration settings */
+#define CFG_842_FC_ENABLE (0x1f) /* enable all 842 functions */
+#define CFG_842_ENABLE (1) /* enable 842 engines */
+#define DMA_CSB_WR NX_DMA_CSB_WR_CI
+#define DMA_COMPLETION_MODE NX_DMA_COMPLETION_MODE_CI
+#define DMA_CPB_WR NX_DMA_CPB_WR_CI_PAD
+#define DMA_OUTPUT_DATA_WR NX_DMA_OUTPUT_DATA_WR_CI
+#define EE_1 (1) /* enable engine 842 1 */
+#define EE_0 (1) /* enable engine 842 0 */
+
+static int nx_cfg_842(u32 gcid, u64 xcfg)
+{
+ u64 cfg, ci, ct;
+ int rc, instance = gcid + 1;
+
+ BUILD_ASSERT(MAX_CHIPS < NX_842_CFG_CI_MAX);
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM 842 config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ ct = GETFIELD(NX_842_CFG_CT, cfg);
+ if (!ct)
+ prlog(PR_INFO, "NX%d: 842 CT set to %u\n", gcid, NX_CT_842);
+ else if (ct == NX_CT_842)
+ prlog(PR_INFO, "NX%d: 842 CT already set to %u\n",
+ gcid, NX_CT_842);
+ else
+ prlog(PR_INFO, "NX%d: 842 CT already set to %u, "
+ "changing to %u\n", gcid, (unsigned int)ct, NX_CT_842);
+ ct = NX_CT_842;
+ cfg = SETFIELD(NX_842_CFG_CT, cfg, ct);
+
+ /* Coprocessor Instance must be shifted left.
+ * See hw doc Section 5.5.1.
+ */
+ ci = GETFIELD(NX_842_CFG_CI, cfg) >> NX_842_CFG_CI_LSHIFT;
+ if (!ci)
+ prlog(PR_INFO, "NX%d: 842 CI set to %d\n", gcid, instance);
+ else if (ci == instance)
+ prlog(PR_INFO, "NX%d: 842 CI already set to %u\n", gcid,
+ (unsigned int)ci);
+ else
+ prlog(PR_INFO, "NX%d: 842 CI already set to %u, "
+ "changing to %d\n", gcid, (unsigned int)ci, instance);
+ ci = instance;
+ cfg = SETFIELD(NX_842_CFG_CI, cfg, ci << NX_842_CFG_CI_LSHIFT);
+
+ /* Enable all functions */
+ cfg = SETFIELD(NX_842_CFG_FC_ENABLE, cfg, CFG_842_FC_ENABLE);
+
+ cfg = SETFIELD(NX_842_CFG_ENABLE, cfg, CFG_842_ENABLE);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: 842 CT %u CI %u config failure %d\n",
+ gcid, (unsigned int)ct, (unsigned int)ci, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: 842 Config 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_842_umac(struct dt_node *node, u32 gcid, u32 pb_base)
+{
+ int rc;
+ u64 umac_bar, umac_notify;
+ struct dt_node *nx_node;
+ static u32 nx842_tid = 1; /* tid counter within coprocessor type */
+
+ nx_node = dt_new(node, "ibm,842-high-fifo");
+ umac_bar = pb_base + NX_P9_842_HIGH_PRI_RX_FIFO_BAR;
+ umac_notify = pb_base + NX_P9_842_HIGH_PRI_RX_FIFO_NOTIFY_MATCH;
+ rc = nx_cfg_rx_fifo(nx_node, "ibm,p9-nx-842", "High", gcid,
+ NX_CT_842, nx842_tid++, umac_bar,
+ umac_notify);
+ if (rc)
+ return rc;
+
+ nx_node = dt_new(node, "ibm,842-normal-fifo");
+ umac_bar = pb_base + NX_P9_842_NORMAL_PRI_RX_FIFO_BAR;
+ umac_notify = pb_base + NX_P9_842_NORMAL_PRI_RX_FIFO_NOTIFY_MATCH;
+ rc = nx_cfg_rx_fifo(nx_node, "ibm,p9-nx-842", "Normal", gcid,
+ NX_CT_842, nx842_tid++, umac_bar,
+ umac_notify);
+
+ return rc;
+}
+
+static int nx_cfg_842_dma(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM DMA config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ cfg = SETFIELD(NX_DMA_CFG_842_COMPRESS_PREFETCH, cfg,
+ DMA_COMPRESS_PREFETCH);
+ cfg = SETFIELD(NX_DMA_CFG_842_DECOMPRESS_PREFETCH, cfg,
+ DMA_DECOMPRESS_PREFETCH);
+ cfg = SETFIELD(NX_DMA_CFG_842_COMPRESS_MAX_RR, cfg,
+ DMA_COMPRESS_MAX_RR);
+ cfg = SETFIELD(NX_DMA_CFG_842_DECOMPRESS_MAX_RR, cfg,
+ DMA_DECOMPRESS_MAX_RR);
+ cfg = SETFIELD(NX_DMA_CFG_842_SPBC, cfg,
+ DMA_SPBC);
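+ /* The CSB/CPB/completion-mode write settings only apply to pre-P9 NX */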
+ if (proc_gen < proc_gen_p9) {
+ cfg = SETFIELD(NX_DMA_CFG_842_CSB_WR, cfg,
+ DMA_CSB_WR);
+ cfg = SETFIELD(NX_DMA_CFG_842_COMPLETION_MODE, cfg,
+ DMA_COMPLETION_MODE);
+ cfg = SETFIELD(NX_DMA_CFG_842_CPB_WR, cfg,
+ DMA_CPB_WR);
+ cfg = SETFIELD(NX_DMA_CFG_842_OUTPUT_DATA_WR, cfg,
+ DMA_OUTPUT_DATA_WR);
+ }
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: DMA config failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: DMA 0x%016lx\n", gcid,
+ (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_842_ee(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM EE config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ cfg = SETFIELD(NX_EE_CFG_CH1, cfg, EE_1);
+ cfg = SETFIELD(NX_EE_CFG_CH0, cfg, EE_0);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: Engine Enable failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: Engine Enable 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+void nx_enable_842(struct dt_node *node, u32 gcid, u32 pb_base)
+{
+ u64 cfg_dma, cfg_842, cfg_ee;
+ int rc;
+
+ if (dt_node_is_compatible(node, "ibm,power8-nx")) {
+ cfg_dma = pb_base + NX_P8_DMA_CFG;
+ cfg_842 = pb_base + NX_P8_842_CFG;
+ cfg_ee = pb_base + NX_P8_EE_CFG;
+ } else {
+ prerror("NX%d: ERROR: Unknown NX type!\n", gcid);
+ return;
+ }
+
+ rc = nx_cfg_842_dma(gcid, cfg_dma);
+ if (rc)
+ return;
+
+ rc = nx_cfg_842(gcid, cfg_842);
+ if (rc)
+ return;
+
+ rc = nx_cfg_842_ee(gcid, cfg_ee);
+ if (rc)
+ return;
+
+ prlog(PR_INFO, "NX%d: 842 Coprocessor Enabled\n", gcid);
+
+ dt_add_property_cells(node, "ibm,842-coprocessor-type", NX_CT_842);
+ dt_add_property_cells(node, "ibm,842-coprocessor-instance", gcid + 1);
+}
+
+void p9_nx_enable_842(struct dt_node *node, u32 gcid, u32 pb_base)
+{
+ u64 cfg_dma, cfg_ee;
+ int rc;
+
+ cfg_dma = pb_base + NX_P9_DMA_CFG;
+ cfg_ee = pb_base + NX_P9_EE_CFG;
+
+ rc = nx_cfg_842_dma(gcid, cfg_dma);
+ if (rc)
+ return;
+
+ rc = nx_cfg_842_umac(node, gcid, pb_base);
+ if (rc)
+ return;
+
+ rc = nx_cfg_842_ee(gcid, cfg_ee);
+ if (rc)
+ return;
+
+ prlog(PR_INFO, "NX%d: 842 Coprocessor Enabled\n", gcid);
+
+}
diff --git a/roms/skiboot/hw/nx-compress.c b/roms/skiboot/hw/nx-compress.c
new file mode 100644
index 000000000..9b3c6717d
--- /dev/null
+++ b/roms/skiboot/hw/nx-compress.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NX has 842 and GZIP (P9) accelerators
+ *
+ * Copyright 2015-2018 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <nx.h>
+#include <vas.h>
+#include <opal.h>
+
+static int nx_cfg_umac_tx_wc(u32 gcid, u64 xcfg)
+{
+ int rc = 0;
+ u64 cfg;
+
+ cfg = vas_get_wcbs_bar(gcid);
+ if (!cfg) {
+ prerror("NX%d: ERROR finding WC Backing store BAR\n", gcid);
+ return -ENOMEM;
+ }
+
+ /*
+ * NOTE: Write the entire bar address to SCOM. VAS/NX will extract
+ * the relevant (NX_P9_UMAC_TX_WINDOW_CONTEXT_ADDR) bits.
+ * IOW, _don't_ just write the bit field like:
+ *
+ * cfg = SETFIELD(NX_P9_UMAC_TX_WINDOW_CONTEXT_ADDR, 0ULL, cfg);
+ */
+ rc = xscom_write(gcid, xcfg, cfg);
+
+ if (rc)
+ prerror("NX%d: ERROR: UMAC SEND WC BAR, %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: UMAC SEND WC BAR, 0x%016lx, "
+ "xcfg 0x%llx\n",
+ gcid, (unsigned long)cfg, xcfg);
+
+ return rc;
+}
+
+static int nx_cfg_dma_vas_mmio(u32 gcid, u64 xcfg)
+{
+ int rc = 0;
+ u64 cfg;
+
+ cfg = vas_get_hvwc_mmio_bar(gcid);
+ /*
+ * NOTE: Write the entire bar address to SCOM. VAS/NX will extract
+ * the relevant (NX_P9_UMAC_VAS_MMIO_ADDR) bits. IOW, _don't_
+ * just write the bit field like:
+ *
+ * cfg = SETFIELD(NX_P9_DMA_VAS_MMIO_ADDR, 0ULL, cfg);
+ */
+ rc = xscom_write(gcid, xcfg, cfg);
+
+ if (rc)
+ prerror("NX%d: ERROR: DMA VAS MMIO BAR, %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: DMA VAS MMIO BAR, 0x%016lx, xcfg 0x%llx\n",
+ gcid, (unsigned long)cfg, xcfg);
+
+ return rc;
+}
+
+static int nx_cfg_umac_vas_mmio(u32 gcid, u64 xcfg)
+{
+ int rc = 0;
+ u64 cfg;
+
+ cfg = vas_get_hvwc_mmio_bar(gcid);
+ /*
+ * NOTE: Write the entire bar address to SCOM. VAS/NX will extract
+ * the relevant (NX_P9_UMAC_VAS_MMIO_ADDR) bits. IOW, _don't_
+ * just write the bit field like:
+ *
+ * cfg = SETFIELD(NX_P9_UMAC_VAS_MMIO_ADDR, 0ULL, cfg);
+ */
+ rc = xscom_write(gcid, xcfg, cfg);
+
+ if (rc)
+ prerror("NX%d: ERROR: UMAC VAS MMIO BAR, %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: UMAC VAS MMIO BAR, 0x%016lx, "
+ "xcfg 0x%llx\n",
+ gcid, (unsigned long)cfg, xcfg);
+
+ return rc;
+}
+
+static int nx_cfg_umac_status_ctrl(u32 gcid, u64 xcfg)
+{
+ u64 uctrl;
+ int rc;
+#define CRB_ENABLE 1
+
+ rc = xscom_read(gcid, xcfg, &uctrl);
+ if (rc)
+ return rc;
+
+ uctrl = SETFIELD(NX_P9_UMAC_STATUS_CTRL_CRB_ENABLE, uctrl, CRB_ENABLE);
+ rc = xscom_write(gcid, xcfg, uctrl);
+ if (rc)
+ prerror("NX%d: ERROR: Setting UMAC Status Control failure %d\n",
+ gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: Setting UMAC Status Control 0x%016lx\n",
+ gcid, (unsigned long)uctrl);
+
+ return rc;
+}
+
+static int nx_cfg_vas_rma_bar(u32 gcid, u64 xcfg)
+{
+ int rc = 0;
+ u64 cfg;
+
+ cfg = vas_get_rma_bar(gcid);
+ /*
+ * NOTE: Write the entire bar address to SCOM. VAS/NX will extract
+ * the relevant (NX_P10_VAS_RMA_WRITE_BAR) bits. IOW, _don't_
+ * just write the bit field like:
+ * cfg = SETFIELD(NX_P10_VAS_RMA_WRITE_BAR, 0ULL, cfg);
+ */
+ rc = xscom_write(gcid, xcfg, cfg);
+
+ if (rc)
+ prerror("NX%d: ERROR: VAS RMA WRITE BAR, %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: VAS RMA WRITE BAR, 0x%016lx, "
+ "xcfg 0x%llx\n", gcid, (unsigned long)cfg,
+ xcfg);
+
+ return rc;
+}
+
+int nx_cfg_rx_fifo(struct dt_node *node, const char *compat,
+ const char *priority, u32 gcid, u32 pid, u32 tid,
+ u64 umac_bar, u64 umac_notify)
+{
+ u64 cfg;
+ int rc, size;
+ uint64_t fifo;
+ u32 lpid = 0xfff; /* All 1's for 12 bits in UMAC notify match reg */
+#define MATCH_ENABLE 1
+
+ fifo = (uint64_t) local_alloc(gcid, RX_FIFO_SIZE, RX_FIFO_SIZE);
+ assert(fifo);
+
+ /*
+ * When configuring the address of the Rx FIFO into the Receive FIFO
+ * BAR, we should _NOT_ shift the address into bits 8:53. Instead we
+ * should copy the address as is and VAS/NX will extract relevant bits.
+ */
+ /*
+ * Section 5.21 of P9 NX Workbook Version 2.42 shows Receive FIFO BAR
+ * 54:56 represents FIFO size
+ * 000 = 1KB, 8 CRBs
+ * 001 = 2KB, 16 CRBs
+ * 010 = 4KB, 32 CRBs
+ * 011 = 8KB, 64 CRBs
+ * 100 = 16KB, 128 CRBs
+ * 101 = 32KB, 256 CRBs
+ *	 110, 111 = reserved
+ */
+ size = RX_FIFO_SIZE / 1024;
+ cfg = SETFIELD(NX_P9_RX_FIFO_BAR_SIZE, fifo, ilog2(size));
+
+ rc = xscom_write(gcid, umac_bar, cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: Setting UMAC FIFO bar failure %d\n",
+ gcid, rc);
+ return rc;
+ } else
+ prlog(PR_DEBUG, "NX%d: Setting UMAC FIFO bar 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ rc = xscom_read(gcid, umac_notify, &cfg);
+ if (rc)
+ return rc;
+
+ /*
+ * VAS issues asb_notify with the unique ID to identify the target
+ * co-processor/engine. Logical partition ID (lpid), process ID (pid),
+ * and thread ID (tid) combination is used to define the unique ID
+	 * in the system. Export these values in the device tree so that the
+	 * driver can configure the RxFIFO with VAS. Set these values in the
+	 * RxFIFO notify match register for each engine, which compares the ID
+	 * against each request.
+	 * To define a unique identification, 0xfff (1's for 12 bits),
+ * co-processor type, and counter within coprocessor type are used
+ * for lpid, pid, and tid respectively.
+ */
+ cfg = SETFIELD(NX_P9_RX_FIFO_NOTIFY_MATCH_LPID, cfg, lpid);
+ cfg = SETFIELD(NX_P9_RX_FIFO_NOTIFY_MATCH_PID, cfg, pid);
+ cfg = SETFIELD(NX_P9_RX_FIFO_NOTIFY_MATCH_TID, cfg, tid);
+ cfg = SETFIELD(NX_P9_RX_FIFO_NOTIFY_MATCH_MATCH_ENABLE, cfg,
+ MATCH_ENABLE);
+
+ rc = xscom_write(gcid, umac_notify, cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: Setting UMAC notify match failure %d\n",
+ gcid, rc);
+ return rc;
+ } else
+ prlog(PR_DEBUG, "NX%d: Setting UMAC notify match 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ dt_add_property_string(node, "compatible", compat);
+ dt_add_property_string(node, "priority", priority);
+ dt_add_property_u64(node, "rx-fifo-address", fifo);
+ dt_add_property_cells(node, "rx-fifo-size", RX_FIFO_SIZE);
+ dt_add_property_cells(node, "lpid", lpid);
+ dt_add_property_cells(node, "pid", pid);
+ dt_add_property_cells(node, "tid", tid);
+
+ return 0;
+}
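A worked example of the Receive FIFO BAR size encoding documented above, assuming (purely for illustration) an RX_FIFO_SIZE of 32 KB; the real value and the real ilog2() helper come from skiboot headers, and fifo_size_field() below is a hypothetical stand-in:

    #include <stdint.h>

    /* 32 KB FIFO: 32768 / 1024 = 32 and log2(32) = 5 = 0b101, i.e. the
     * "101 = 32KB, 256 CRBs" row of the table above. */
    static inline uint64_t fifo_size_field(uint64_t fifo_bytes)
    {
            uint64_t kb = fifo_bytes / 1024;

            return 63 - __builtin_clzll(kb);   /* log2 for powers of two */
    }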
+
+static int nx_init_fifo_ctrl(u32 gcid, u64 fifo_ctrl)
+{
+ u64 cfg;
+ int rc = 0;
+
+ rc = xscom_read(gcid, fifo_ctrl, &cfg);
+ if (rc)
+ return rc;
+
+ cfg = SETFIELD(NX_P9_RX_FIFO_CTRL_READ_OFFSET, cfg, 0);
+ cfg = SETFIELD(NX_P9_RX_FIFO_CTRL_QUEUED, cfg, 0);
+
+ rc = xscom_write(gcid, fifo_ctrl, cfg);
+
+ return rc;
+}
+
+
+static int opal_nx_coproc_init(u32 gcid, u32 ct)
+{
+ struct proc_chip *chip;
+ u64 fifo, fifo_hi;
+ u32 nx_base;
+ int rc;
+
+ if (proc_gen < proc_gen_p9)
+ return OPAL_UNSUPPORTED;
+
+ chip = get_chip(gcid);
+ if (!chip)
+ return OPAL_PARAMETER;
+
+ nx_base = chip->nx_base;
+ if (!nx_base)
+ return OPAL_PARAMETER;
+
+ switch (ct) {
+ case NX_CT_842:
+ fifo_hi = nx_base + NX_P9_842_HIGH_PRI_RX_FIFO_CTRL;
+ fifo = nx_base + NX_P9_842_NORMAL_PRI_RX_FIFO_CTRL;
+ break;
+ case NX_CT_GZIP:
+ fifo_hi = nx_base + NX_P9_GZIP_HIGH_PRI_RX_FIFO_CTRL;
+ fifo = nx_base + NX_P9_GZIP_NORMAL_PRI_RX_FIFO_CTRL;
+ break;
+ default:
+ prlog(PR_EMERG, "OPAL: Unknown NX coprocessor type\n");
+ return OPAL_PARAMETER;
+ }
+
+ rc = nx_init_fifo_ctrl(gcid, fifo_hi);
+
+ if (!rc)
+ rc = nx_init_fifo_ctrl(gcid, fifo);
+
+ return rc;
+}
+
+opal_call(OPAL_NX_COPROC_INIT, opal_nx_coproc_init, 2);
+
+void nx_create_compress_node(struct dt_node *node)
+{
+ u32 gcid, pb_base;
+ struct proc_chip *chip;
+ int rc;
+
+ gcid = dt_get_chip_id(node);
+ pb_base = dt_get_address(node, 0, NULL);
+
+ chip = get_chip(gcid);
+ chip->nx_base = pb_base;
+
+ prlog(PR_INFO, "NX%d: 842 at 0x%x\n", gcid, pb_base);
+
+ /*
+	 * P10 is compatible with ibm,power9-nx, so the same
+	 * compatible string is used there.
+ */
+ if (dt_node_is_compatible(node, "ibm,power9-nx")) {
+ u64 cfg_mmio, cfg_txwc, cfg_uctrl, cfg_dma;
+
+ prlog(PR_DEBUG, "Found ibm,power9-nx\n");
+ cfg_mmio = pb_base + NX_P9_UMAC_VAS_MMIO_BAR;
+ cfg_dma = pb_base + NX_P9_DMA_VAS_MMIO_BAR;
+ cfg_txwc = pb_base + NX_P9_UMAC_TX_WINDOW_CONTEXT_BAR;
+ cfg_uctrl = pb_base + NX_P9_UMAC_STATUS_CTRL;
+
+ rc = nx_cfg_umac_vas_mmio(gcid, cfg_mmio);
+ if (rc)
+ return;
+
+ rc = nx_cfg_dma_vas_mmio(gcid, cfg_dma);
+ if (rc)
+ return;
+
+ rc = nx_cfg_umac_tx_wc(gcid, cfg_txwc);
+ if (rc)
+ return;
+
+ rc = nx_cfg_umac_status_ctrl(gcid, cfg_uctrl);
+ if (rc)
+ return;
+
+ if (proc_gen > proc_gen_p9) {
+ u64 cfg_rma = pb_base + NX_P10_VAS_RMA_WRITE_BAR;
+
+ rc = nx_cfg_vas_rma_bar(gcid, cfg_rma);
+ if (rc)
+ return;
+ }
+
+ p9_nx_enable_842(node, gcid, pb_base);
+ p9_nx_enable_gzip(node, gcid, pb_base);
+ } else
+ nx_enable_842(node, gcid, pb_base);
+}
diff --git a/roms/skiboot/hw/nx-crypto.c b/roms/skiboot/hw/nx-crypto.c
new file mode 100644
index 000000000..8b8ff5ee5
--- /dev/null
+++ b/roms/skiboot/hw/nx-crypto.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NX Cryptographic accelerators
+ *
+ * Copyright 2015-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <nx.h>
+
+/* Configuration settings */
+#define CFG_SYM_FC_ENABLE (0) /* disable all sym functions */
+#define CFG_SYM_ENABLE (0) /* disable sym engines */
+#define CFG_ASYM_FC_ENABLE (0) /* disable all asym functions */
+#define CFG_ASYM_ENABLE (0) /* disable asym engines */
+#define CFG_CRB_IQ_SYM (0) /* don't use any extra input queues */
+#define CFG_CRB_IQ_ASYM (0) /* don't use any extra input queues */
+#define AES_SHA_MAX_RR (1) /* valid range: 1-8 */
+#define AES_SHA_CSB_WR NX_DMA_CSB_WR_PDMA
+#define AES_SHA_COMPLETION_MODE NX_DMA_COMPLETION_MODE_PDMA
+#define AES_SHA_CPB_WR NX_DMA_CPB_WR_DMA_NOPAD
+#define AES_SHA_OUTPUT_DATA_WR NX_DMA_OUTPUT_DATA_WR_DMA
+#define AMF_MAX_RR (1) /* valid range: 1-8 */
+#define AMF_CSB_WR NX_DMA_CSB_WR_PDMA
+#define AMF_COMPLETION_MODE NX_DMA_COMPLETION_MODE_PDMA
+#define AMF_CPB_WR (0) /* CPB WR not done with AMF */
+#define AMF_OUTPUT_DATA_WR NX_DMA_OUTPUT_DATA_WR_DMA
+#define EE_CH7 (0) /* disable engine AMF 3(P8) */
+#define EE_CH6 (0) /* disable engine AMF 2(P8) */
+#define EE_CH5 (0) /* disable engine AMF 1(P8) */
+#define EE_CH4 (0) /* disable engine SYM AMF 0(P8) */
+#define EE_CH3 (0) /* disable engine SYM 1 */
+#define EE_CH2 (0) /* disable engine SYM 0 */
+
+static int nx_cfg_sym(u32 gcid, u64 xcfg)
+{
+ u64 cfg, ci, ct;
+ int rc, instance = gcid + 1;
+
+ BUILD_ASSERT(MAX_CHIPS < NX_SYM_CFG_CI_MAX);
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM SYM config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ ct = GETFIELD(NX_SYM_CFG_CT, cfg);
+ if (!ct)
+ prlog(PR_INFO, "NX%d: SYM CT set to %u\n", gcid, NX_CT_SYM);
+ else if (ct == NX_CT_SYM)
+ prlog(PR_INFO, "NX%d: SYM CT already set to %u\n",
+ gcid, NX_CT_SYM);
+ else
+ prlog(PR_INFO, "NX%d: SYM CT already set to %u, "
+ "changing to %u\n", gcid, (unsigned int)ct, NX_CT_SYM);
+ ct = NX_CT_SYM;
+ cfg = SETFIELD(NX_SYM_CFG_CT, cfg, ct);
+
+ /* Coprocessor Instance must be shifted left.
+ * See hw doc Section 5.5.1.
+ */
+ ci = GETFIELD(NX_SYM_CFG_CI, cfg) >> NX_SYM_CFG_CI_LSHIFT;
+ if (!ci)
+ prlog(PR_INFO, "NX%d: SYM CI set to %d\n", gcid, instance);
+ else if (ci == instance)
+ prlog(PR_INFO, "NX%d: SYM CI already set to %u\n", gcid,
+ (unsigned int)ci);
+ else
+ prlog(PR_INFO, "NX%d: SYM CI already set to %u, "
+ "changing to %d\n", gcid, (unsigned int)ci, instance);
+ ci = instance;
+ cfg = SETFIELD(NX_SYM_CFG_CI, cfg, ci << NX_SYM_CFG_CI_LSHIFT);
+
+ cfg = SETFIELD(NX_SYM_CFG_FC_ENABLE, cfg, CFG_SYM_FC_ENABLE);
+
+ cfg = SETFIELD(NX_SYM_CFG_ENABLE, cfg, CFG_SYM_ENABLE);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: SYM CT %u CI %u config failure %d\n",
+ gcid, (unsigned int)ct, (unsigned int)ci, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: SYM Config 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_asym(u32 gcid, u64 xcfg)
+{
+ u64 cfg, ci, ct;
+ int rc, instance = gcid + 1;
+
+ BUILD_ASSERT(MAX_CHIPS < NX_ASYM_CFG_CI_MAX);
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM ASYM config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ ct = GETFIELD(NX_ASYM_CFG_CT, cfg);
+ if (!ct)
+ prlog(PR_INFO, "NX%d: ASYM CT set to %u\n",
+ gcid, NX_CT_ASYM);
+ else if (ct == NX_CT_ASYM)
+ prlog(PR_INFO, "NX%d: ASYM CT already set to %u\n",
+ gcid, NX_CT_ASYM);
+ else
+ prlog(PR_INFO, "NX%d: ASYM CT already set to %u, "
+ "changing to %u\n", gcid, (unsigned int)ct, NX_CT_ASYM);
+ ct = NX_CT_ASYM;
+ cfg = SETFIELD(NX_ASYM_CFG_CT, cfg, ct);
+
+ /* Coprocessor Instance must be shifted left.
+ * See hw doc Section 5.5.1.
+ */
+ ci = GETFIELD(NX_ASYM_CFG_CI, cfg) >> NX_ASYM_CFG_CI_LSHIFT;
+ if (!ci)
+ prlog(PR_INFO, "NX%d: ASYM CI set to %d\n", gcid, instance);
+ else if (ci == instance)
+ prlog(PR_INFO, "NX%d: ASYM CI already set to %u\n", gcid,
+ (unsigned int)ci);
+ else
+ prlog(PR_INFO, "NX%d: ASYM CI already set to %u, "
+ "changing to %d\n", gcid, (unsigned int)ci, instance);
+ ci = instance;
+ cfg = SETFIELD(NX_ASYM_CFG_CI, cfg, ci << NX_ASYM_CFG_CI_LSHIFT);
+
+ cfg = SETFIELD(NX_ASYM_CFG_FC_ENABLE, cfg, CFG_ASYM_FC_ENABLE);
+
+ cfg = SETFIELD(NX_ASYM_CFG_ENABLE, cfg, CFG_ASYM_ENABLE);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: ASYM CT %u CI %u config failure %d\n",
+ gcid, (unsigned int)ct, (unsigned int)ci, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: ASYM Config 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_dma(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM DMA config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ cfg = SETFIELD(NX_DMA_CFG_AES_SHA_MAX_RR, cfg,
+ AES_SHA_MAX_RR);
+ cfg = SETFIELD(NX_DMA_CFG_AES_SHA_CSB_WR, cfg,
+ AES_SHA_CSB_WR);
+ cfg = SETFIELD(NX_DMA_CFG_AES_SHA_COMPLETION_MODE, cfg,
+ AES_SHA_COMPLETION_MODE);
+ cfg = SETFIELD(NX_DMA_CFG_AES_SHA_CPB_WR, cfg,
+ AES_SHA_CPB_WR);
+ cfg = SETFIELD(NX_DMA_CFG_AES_SHA_OUTPUT_DATA_WR, cfg,
+ AES_SHA_OUTPUT_DATA_WR);
+
+ cfg = SETFIELD(NX_DMA_CFG_AMF_MAX_RR, cfg,
+ AMF_MAX_RR);
+ cfg = SETFIELD(NX_DMA_CFG_AMF_CSB_WR, cfg,
+ AMF_CSB_WR);
+ cfg = SETFIELD(NX_DMA_CFG_AMF_COMPLETION_MODE, cfg,
+ AMF_COMPLETION_MODE);
+ cfg = SETFIELD(NX_DMA_CFG_AMF_CPB_WR, cfg,
+ AMF_CPB_WR);
+ cfg = SETFIELD(NX_DMA_CFG_AMF_OUTPUT_DATA_WR, cfg,
+ AMF_OUTPUT_DATA_WR);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: DMA config failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: DMA 0x%016lx\n", gcid,
+ (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_iq(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM CRB IQ config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ cfg = SETFIELD(NX_CRB_IQ_SYM, cfg, CFG_CRB_IQ_SYM);
+ cfg = SETFIELD(NX_CRB_IQ_ASYM, cfg, CFG_CRB_IQ_ASYM);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: CRB Input Queue failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: CRB Input Queue 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_ee(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM EE config read failure %d\n",
+ gcid, rc);
+ return rc;
+ }
+
+ cfg = SETFIELD(NX_EE_CFG_CH7, cfg, EE_CH7);
+ cfg = SETFIELD(NX_EE_CFG_CH6, cfg, EE_CH6);
+ cfg = SETFIELD(NX_EE_CFG_CH5, cfg, EE_CH5);
+ cfg = SETFIELD(NX_EE_CFG_CH4, cfg, EE_CH4);
+ cfg = SETFIELD(NX_EE_CFG_CH3, cfg, EE_CH3);
+ cfg = SETFIELD(NX_EE_CFG_CH2, cfg, EE_CH2);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: Engine Enable failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: Engine Enable 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+void nx_create_crypto_node(struct dt_node *node)
+{
+ u32 gcid;
+ u32 pb_base;
+ u64 cfg_dma, cfg_sym, cfg_asym, cfg_iq, cfg_ee;
+ int rc;
+
+ gcid = dt_get_chip_id(node);
+ pb_base = dt_get_address(node, 0, NULL);
+
+ prlog(PR_INFO, "NX%d: Crypto at 0x%x\n", gcid, pb_base);
+
+ if (dt_node_is_compatible(node, "ibm,power8-nx")) {
+ cfg_dma = pb_base + NX_P8_DMA_CFG;
+ cfg_sym = pb_base + NX_P8_SYM_CFG;
+ cfg_asym = pb_base + NX_P8_ASYM_CFG;
+ cfg_iq = pb_base + NX_P8_CRB_IQ;
+ cfg_ee = pb_base + NX_P8_EE_CFG;
+ } else if (dt_node_is_compatible(node, "ibm,power9-nx")) {
+ prlog(PR_INFO, "NX%d: POWER9 nx-crypto not yet supported\n",
+ gcid);
+ return;
+ } else {
+ prerror("NX%d: ERROR: Unknown NX type!\n", gcid);
+ return;
+ }
+
+ rc = nx_cfg_dma(gcid, cfg_dma);
+ if (rc)
+ return;
+
+ rc = nx_cfg_sym(gcid, cfg_sym);
+ if (rc)
+ return;
+
+ rc = nx_cfg_asym(gcid, cfg_asym);
+ if (rc)
+ return;
+
+ rc = nx_cfg_iq(gcid, cfg_iq);
+ if (rc)
+ return;
+
+ rc = nx_cfg_ee(gcid, cfg_ee);
+ if (rc)
+ return;
+
+ prlog(PR_INFO, "NX%d: Crypto Coprocessors Disabled (not supported)\n", gcid);
+}
diff --git a/roms/skiboot/hw/nx-gzip.c b/roms/skiboot/hw/nx-gzip.c
new file mode 100644
index 000000000..9bc491e70
--- /dev/null
+++ b/roms/skiboot/hw/nx-gzip.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NX GZIP (P9) accelerator support
+ *
+ * Copyright 2016-2017 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <nx.h>
+
+#define EE (1) /* enable gzip engine */
+
+static int nx_cfg_gzip_umac(struct dt_node *node, u32 gcid, u32 pb_base)
+{
+ int rc;
+ u64 umac_bar, umac_notify;
+ struct dt_node *nx_node;
+ static u32 nxgzip_tid = 1; /* tid counter within coprocessor type */
+
+ nx_node = dt_new(node, "ibm,gzip-high-fifo");
+ umac_bar = pb_base + NX_P9_GZIP_HIGH_PRI_RX_FIFO_BAR;
+ umac_notify = pb_base + NX_P9_GZIP_HIGH_PRI_RX_FIFO_NOTIFY_MATCH;
+
+ rc = nx_cfg_rx_fifo(nx_node, "ibm,p9-nx-gzip", "High", gcid,
+ NX_CT_GZIP, nxgzip_tid++, umac_bar,
+ umac_notify);
+ if (rc)
+ return rc;
+
+ nx_node = dt_new(node, "ibm,gzip-normal-fifo");
+ umac_bar = pb_base + NX_P9_GZIP_NORMAL_PRI_RX_FIFO_BAR;
+ umac_notify = pb_base + NX_P9_GZIP_NORMAL_PRI_RX_FIFO_NOTIFY_MATCH;
+
+ rc = nx_cfg_rx_fifo(nx_node, "ibm,p9-nx-gzip", "Normal", gcid,
+ NX_CT_GZIP, nxgzip_tid++, umac_bar,
+ umac_notify);
+
+ return rc;
+}
+
+static int nx_cfg_gzip_dma(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc)
+ return rc;
+
+ cfg = SETFIELD(NX_DMA_CFG_GZIP_COMPRESS_PREFETCH, cfg,
+ DMA_COMPRESS_PREFETCH);
+ cfg = SETFIELD(NX_DMA_CFG_GZIP_DECOMPRESS_PREFETCH, cfg,
+ DMA_DECOMPRESS_PREFETCH);
+
+ cfg = SETFIELD(NX_DMA_CFG_GZIP_COMPRESS_MAX_RR, cfg,
+ DMA_COMPRESS_MAX_RR);
+ cfg = SETFIELD(NX_DMA_CFG_GZIP_DECOMPRESS_MAX_RR, cfg,
+ DMA_DECOMPRESS_MAX_RR);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: DMA config failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: DMA 0x%016lx\n", gcid,
+ (unsigned long)cfg);
+
+ return rc;
+}
+
+static int nx_cfg_gzip_ee(u32 gcid, u64 xcfg)
+{
+ u64 cfg;
+ int rc;
+
+ rc = xscom_read(gcid, xcfg, &cfg);
+ if (rc)
+ return rc;
+
+ cfg = SETFIELD(NX_P9_EE_CFG_CH4, cfg, EE);
+
+ rc = xscom_write(gcid, xcfg, cfg);
+ if (rc)
+ prerror("NX%d: ERROR: Engine Enable failure %d\n", gcid, rc);
+ else
+ prlog(PR_DEBUG, "NX%d: Engine Enable 0x%016lx\n",
+ gcid, (unsigned long)cfg);
+
+ return rc;
+}
+
+void p9_nx_enable_gzip(struct dt_node *node, u32 gcid, u32 pb_base)
+{
+ u64 cfg_dma, cfg_ee;
+ int rc;
+
+ prlog(PR_INFO, "NX%d: gzip at 0x%x\n", gcid, pb_base);
+
+ cfg_dma = pb_base + NX_P9_DMA_CFG;
+ cfg_ee = pb_base + NX_P9_EE_CFG;
+
+ rc = nx_cfg_gzip_dma(gcid, cfg_dma);
+ if (rc)
+ return;
+
+ rc = nx_cfg_gzip_ee(gcid, cfg_ee);
+ if (rc)
+ return;
+
+ rc = nx_cfg_gzip_umac(node, gcid, pb_base);
+ if (rc)
+ return;
+
+ prlog(PR_INFO, "NX%d: gzip Coprocessor Enabled\n", gcid);
+}
diff --git a/roms/skiboot/hw/nx-rng.c b/roms/skiboot/hw/nx-rng.c
new file mode 100644
index 000000000..274b33211
--- /dev/null
+++ b/roms/skiboot/hw/nx-rng.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NX Hardware Random Number Generator
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <nx.h>
+#include <chip.h>
+#include <phys-map.h>
+#include <xscom-p9-regs.h>
+
+/*
+ * On P9 the DARN instruction is used to access the HW RNG. There is still
+ * an NX RNG BAR, but it is used to configure which NX a core will source
+ * random numbers from rather than being a MMIO window.
+ */
+static void nx_init_p9_rng(uint32_t chip_id)
+{
+ uint64_t bar, tmp;
+
+ if (chip_quirk(QUIRK_NO_RNG))
+ return;
+
+ phys_map_get(chip_id, NX_RNG, 0, &bar, NULL);
+ xscom_write(chip_id, P9X_NX_MMIO_BAR, bar | P9X_NX_MMIO_BAR_EN);
+
+ /* Read config register for pace info */
+ xscom_read(chip_id, P9X_NX_RNG_CFG, &tmp);
+ prlog(PR_INFO, "NX RNG[%x] pace:%lli\n", chip_id, 0xffff & (tmp >> 2));
+}
+
+void nx_create_rng_node(struct dt_node *node)
+{
+ u64 bar, cfg;
+ u64 xbar, xcfg;
+ u32 pb_base;
+ u32 gcid;
+ u64 rng_addr, rng_len, len, addr_mask;
+ struct dt_node *rng;
+ int rc;
+
+ gcid = dt_get_chip_id(node);
+ pb_base = dt_get_address(node, 0, NULL);
+
+ if (dt_node_is_compatible(node, "ibm,power8-nx")) {
+ xbar = pb_base + NX_P8_RNG_BAR;
+ xcfg = pb_base + NX_P8_RNG_CFG;
+ addr_mask = NX_P8_RNG_BAR_ADDR;
+ } else if (dt_node_is_compatible(node, "ibm,power9-nx")) {
+ nx_init_p9_rng(gcid);
+ return;
+ } else {
+ prerror("NX%d: Unknown NX type!\n", gcid);
+ return;
+ }
+
+ rc = xscom_read(gcid, xbar, &bar); /* Get RNG BAR */
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM RNG BAR read failure %d\n",
+ gcid, rc);
+ return;
+ }
+
+ rc = xscom_read(gcid, xcfg, &cfg); /* Get RNG CFG */
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM RNG config read failure %d\n",
+ gcid, rc);
+ return;
+ }
+
+ /*
+ * We mask in-place rather than using GETFIELD for the base address
+ * as we happen to *know* that it's properly aligned in the register.
+ *
+	 * FIXME? Always assume BAR gets a valid address from FSP
+ */
+ rng_addr = bar & addr_mask;
+ len = GETFIELD(NX_RNG_BAR_SIZE, bar);
+ if (len > 4) {
+ prerror("NX%d: Corrupted bar size %lld\n", gcid, len);
+ return;
+ }
+ rng_len = (u64[]){ 0x1000, /* 4K */
+ 0x10000, /* 64K */
+ 0x400000000UL, /* 16G*/
+ 0x100000, /* 1M */
+ 0x1000000 /* 16M */} [len];
+
+
+ prlog(PR_INFO, "NX%d: RNG BAR set to 0x%016llx..0x%016llx\n",
+ gcid, rng_addr, rng_addr + rng_len - 1);
+
+ /* RNG must be enabled before MMIO is enabled */
+ rc = xscom_write(gcid, xcfg, cfg | NX_RNG_CFG_ENABLE);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM RNG config enable failure %d\n",
+ gcid, rc);
+ return;
+ }
+
+ /* The BAR needs to be enabled too */
+ rc = xscom_write(gcid, xbar, bar | NX_RNG_BAR_ENABLE);
+ if (rc) {
+ prerror("NX%d: ERROR: XSCOM RNG config enable failure %d\n",
+ gcid, rc);
+ return;
+ }
+
+ rng = dt_new_addr(dt_root, "hwrng", rng_addr);
+ if (!rng)
+ return;
+
+ dt_add_property_strings(rng, "compatible", "ibm,power-rng");
+ dt_add_property_u64s(rng, "reg", rng_addr, rng_len);
+ dt_add_property_cells(rng, "ibm,chip-id", gcid);
+}
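For example, a size field of 1 read back from the RNG BAR selects 0x10000 above, i.e. a 64 KB MMIO window starting at rng_addr. A compact, illustrative restatement of that decode (rng_window_sizes/rng_window_len are hypothetical helpers, not part of the patch):

    #include <stdint.h>

    static const uint64_t rng_window_sizes[] = {
            0x1000,          /* 0: 4K  */
            0x10000,         /* 1: 64K */
            0x400000000UL,   /* 2: 16G */
            0x100000,        /* 3: 1M  */
            0x1000000,       /* 4: 16M */
    };

    /* Returns 0 for an out-of-range field, mirroring the len > 4 check. */
    static inline uint64_t rng_window_len(uint64_t size_field)
    {
            return size_field > 4 ? 0 : rng_window_sizes[size_field];
    }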
diff --git a/roms/skiboot/hw/nx.c b/roms/skiboot/hw/nx.c
new file mode 100644
index 000000000..fdadf53c7
--- /dev/null
+++ b/roms/skiboot/hw/nx.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * NX Accelerator unit support
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <cpu.h>
+#include <nx.h>
+#include <chip.h>
+#include <xscom-p9-regs.h>
+#include <xscom-p10-regs.h>
+#include <phys-map.h>
+#include <vas.h>
+#include <p9_stop_api.H>
+
+static void darn_init(void)
+{
+ struct dt_node *nx;
+ struct proc_chip *chip;
+ struct cpu_thread *c;
+ uint64_t bar, default_bar;
+
+ if (chip_quirk(QUIRK_NO_RNG))
+ return;
+
+ /*
+ * To allow the DARN instruction to function there must be at least
+ * one NX available in the system. Otherwise using DARN will result
+ * in a checkstop. I suppose we could mask the FIR...
+ */
+ dt_for_each_compatible(dt_root, nx, "ibm,power9-nx")
+ break;
+ assert(nx);
+
+ phys_map_get(dt_get_chip_id(nx), NX_RNG, 0, &default_bar, NULL);
+
+ for_each_chip(chip) {
+ /* is this NX enabled? */
+ xscom_read(chip->id, P9X_NX_MMIO_BAR, &bar);
+ if (!(bar & ~P9X_NX_MMIO_BAR_EN))
+ bar = default_bar;
+
+ for_each_available_core_in_chip(c, chip->id) {
+ uint64_t addr;
+
+ if (proc_gen == proc_gen_p9) {
+ addr = XSCOM_ADDR_P9_EX(pir_to_core_id(c->pir),
+ P9X_EX_NCU_DARN_BAR);
+ xscom_write(chip->id, addr,
+ bar | P9X_EX_NCU_DARN_BAR_EN);
+ } else if (proc_gen >= proc_gen_p10) {
+ addr = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir),
+ P10_NCU_DARN_BAR);
+ xscom_write(chip->id, addr,
+ bar | P10_NCU_DARN_BAR_EN);
+ /* Init for sibling core also */
+ if (c->is_fused_core) {
+ addr = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir + 1),
+ P10_NCU_DARN_BAR);
+ xscom_write(chip->id, addr,
+ bar | P10_NCU_DARN_BAR_EN);
+ }
+ }
+ }
+ }
+}
+
+void nx_p9_rng_late_init(void)
+{
+ struct cpu_thread *c;
+ uint64_t rc;
+
+ if (proc_gen < proc_gen_p9)
+ return;
+ if (chip_quirk(QUIRK_NO_RNG))
+ return;
+
+ prlog(PR_INFO, "SLW: Configuring self-restore for P9X_EX_NCU_DARN_BAR\n");
+ for_each_present_cpu(c) {
+ if(cpu_is_thread0(c)) {
+ struct proc_chip *chip = get_chip(c->chip_id);
+ uint64_t addr, bar;
+
+ phys_map_get(chip->id, NX_RNG, 0, &bar, NULL);
+ addr = XSCOM_ADDR_P9_EX(pir_to_core_id(c->pir),
+ P9X_EX_NCU_DARN_BAR);
+ /* Bail out if wakeup engine has already failed */
+ if ( wakeup_engine_state != WAKEUP_ENGINE_PRESENT) {
+ prlog(PR_ERR,"DARN BAR p9_stop_api fail detected\n");
+ break;
+ }
+ rc = p9_stop_save_scom((void *)chip->homer_base,
+ addr, bar | P9X_EX_NCU_DARN_BAR_EN,
+ P9_STOP_SCOM_REPLACE,
+ P9_STOP_SECTION_EQ_SCOM);
+ if (rc) {
+ prlog(PR_ERR,
+ "p9_stop_api for DARN_BAR failed rc= %lld",
+ rc);
+ prlog(PR_ERR, "Disabling deep stop states\n");
+ wakeup_engine_state = WAKEUP_ENGINE_FAILED;
+ break;
+ }
+ }
+ }
+}
+
+static void nx_init_one(struct dt_node *node)
+{
+ nx_create_rng_node(node);
+
+ if (!vas_nx_enabled())
+ return;
+
+ nx_create_crypto_node(node);
+
+ nx_create_compress_node(node);
+}
+
+void nx_init(void)
+{
+ struct dt_node *node;
+
+ dt_for_each_compatible(dt_root, node, "ibm,power-nx") {
+ nx_init_one(node);
+ }
+
+ dt_for_each_compatible(dt_root, node, "ibm,power9-nx") {
+ nx_init_one(node);
+ }
+
+ if (proc_gen >= proc_gen_p9)
+ darn_init();
+}
diff --git a/roms/skiboot/hw/occ-sensor.c b/roms/skiboot/hw/occ-sensor.c
new file mode 100644
index 000000000..6efaf908b
--- /dev/null
+++ b/roms/skiboot/hw/occ-sensor.c
@@ -0,0 +1,640 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * OCC (On Chip Controller) exports a bunch of sensors
+ *
+ * Copyright 2017-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <opal.h>
+#include <chip.h>
+#include <sensor.h>
+#include <device.h>
+#include <cpu.h>
+#include <occ.h>
+
+enum sensor_attr {
+ SENSOR_SAMPLE,
+ SENSOR_SAMPLE_MIN, /* OCC's min/max */
+ SENSOR_SAMPLE_MAX,
+ SENSOR_CSM_MIN, /* CSM's min/max */
+ SENSOR_CSM_MAX,
+ SENSOR_ACCUMULATOR,
+ MAX_SENSOR_ATTR,
+};
+
+#define HWMON_SENSORS_MASK (OCC_SENSOR_TYPE_CURRENT | \
+ OCC_SENSOR_TYPE_VOLTAGE | \
+ OCC_SENSOR_TYPE_TEMPERATURE | \
+ OCC_SENSOR_TYPE_POWER)
+
+/*
+ * Standard HWMON linux interface expects the below units for the
+ * environment sensors:
+ * - Current : milliampere
+ * - Voltage : millivolt
+ * - Temperature : millidegree Celsius (scaled in kernel)
+ * - Power : microWatt (scaled in kernel)
+ * - Energy : microJoule
+ */
+
+/*
+ * OCC sensor units are obtained after scaling the sensor values.
+ * https://github.com/open-power/occ/blob/master/src/occ_405/sensor/sensor_info.c
+ */
+
+static struct str_map {
+ const char *occ_str;
+ const char *opal_str;
+} str_maps[] = {
+ {"PWRSYS", "System"},
+ /* Bulk power of the system: Watt */
+ {"PWRFAN", "Fan"},
+ /* Power consumption of the system fans: Watt */
+ {"PWRIO", "IO"},
+ /* Power consumption of the IO subsystem: Watt */
+ {"PWRSTORE", "Storage"},
+	/* Power consumption of the storage subsystem: Watt */
+ {"PWRGPU", "GPU"},
+ /* Power consumption for GPUs per socket read from APSS: Watt */
+ {"PWRAPSSCH", "APSS"},
+ /* Power Provided by APSS channel x (where x=0…15): Watt */
+ {"PWRPROC", ""},
+ /* Power consumption for this Processor: Watt */
+ {"PWRVDD", "Vdd"},
+ /* Power consumption for this Processor's Vdd(AVSBus readings): Watt */
+ {"PWRVDN", "Vdn"},
+	/* Power consumption for this Processor's Vdn (nest)
+ * Calculated from AVSBus readings: Watt */
+ {"PWRMEM", "Memory"},
+	/* Power consumption for Memory for this Processor read from APSS:
+ * Watt */
+ {"CURVDD", "Vdd"},
+ /* Processor Vdd Current (read from AVSBus): Ampere */
+ {"CURVDN", "Vdn"},
+ /* Processor Vdn Current (read from AVSBus): Ampere */
+ {"VOLTVDDSENSE", "Vdd Remote Sense"},
+ /* Vdd Voltage at the remote sense.
+ * AVS reading adjusted for loadline: millivolt */
+ {"VOLTVDNSENSE", "Vdn Remote Sense"},
+ /* Vdn Voltage at the remote sense.
+ * AVS reading adjusted for loadline: millivolt */
+ {"VOLTVDD", "Vdd"},
+ /* Processor Vdd Voltage (read from AVSBus): millivolt */
+ {"VOLTVDN", "Vdn"},
+ /* Processor Vdn Voltage (read from AVSBus): millivolt */
+ {"TEMPC", "Core"},
+ /* Average temperature of core DTS sensors for Processor's Core y:
+ * Celsius */
+ {"TEMPQ", "Quad"},
+ /* Average temperature of quad (in cache) DTS sensors for
+ * Processor’s Quad y: Celsius */
+ {"TEMPNEST", "Nest"},
+ /* Average temperature of nest DTS sensors: Celsius */
+ {"TEMPPROCTHRMC", "Core"},
+ /* The combined weighted core/quad temperature for processor core y:
+ * Celsius */
+ {"TEMPDIMM", "DIMM"},
+ /* DIMM temperature for DIMM x: Celsius */
+ {"TEMPGPU", "GPU"},
+ /* GPU x (0..2) board temperature: Celsius */
+ /* TEMPGPUxMEM: GPU x hottest HBM temperature (individual memory
+ * temperatures are not available): Celsius */
+ {"TEMPVDD", "VRM VDD"},
+ /* VRM Vdd temperature: Celsius */
+};
+
+static u64 occ_sensor_base;
+
+static inline
+struct occ_sensor_data_header *get_sensor_header_block(int occ_num)
+{
+ return (struct occ_sensor_data_header *)
+ (occ_sensor_base + occ_num * OCC_SENSOR_DATA_BLOCK_SIZE);
+}
+
+static inline
+struct occ_sensor_name *get_names_block(struct occ_sensor_data_header *hb)
+{
+ return ((struct occ_sensor_name *)((u64)hb + be32_to_cpu(hb->names_offset)));
+}
+
+static inline u32 sensor_handler(int occ_num, int sensor_id, int attr)
+{
+ return sensor_make_handler(SENSOR_OCC, occ_num, sensor_id, attr);
+}
+
+/*
+ * The scaling factor for the sensors is encoded in the below format:
+ * (((UINT32)mantissa << 8) | (UINT32)((UINT8) 256 + (UINT8)exp))
+ * https://github.com/open-power/occ/blob/master/src/occ_405/sensor/sensor.h
+ */
+static void scale_sensor(struct occ_sensor_name *md, u64 *sensor)
+{
+ u32 factor = be32_to_cpu(md->scale_factor);
+ int i;
+ s8 exp;
+
+ if (be16_to_cpu(md->type) == OCC_SENSOR_TYPE_CURRENT)
+ *sensor *= 1000; //convert to mA
+
+ *sensor *= factor >> 8;
+ exp = factor & 0xFF;
+
+ if (exp > 0) {
+ for (i = labs(exp); i > 0; i--)
+ *sensor *= 10;
+ } else {
+ for (i = labs(exp); i > 0; i--)
+ *sensor /= 10;
+ }
+}
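A worked example of the scale-factor encoding described above, with values chosen purely for illustration: mantissa = 1 and exponent = -3 encode as (1 << 8) | (uint8_t)(256 - 3) = 0x1FD, so scale_sensor() multiplies the raw sample by 1 (factor >> 8) and, because the low byte reads back as -3, divides it by 10^3 (scale_example() is a hypothetical stand-alone sketch):

    #include <stdint.h>

    static uint64_t scale_example(void)
    {
            uint32_t factor = (1u << 8) | (uint8_t)(256 - 3); /* 0x1FD */
            int8_t exp = factor & 0xFF;                       /* -3    */
            uint64_t sample = 12345 * (factor >> 8);          /* x 1   */

            /* exp < 0, so divide by 10^|exp|: 12345 / 1000 = 12 */
            return exp < 0 ? sample / 1000 : sample;
    }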
+
+static void scale_energy(struct occ_sensor_name *md, u64 *sensor)
+{
+ u32 factor = be32_to_cpu(md->freq);
+ int i;
+ s8 exp;
+
+ *sensor *= 1000000; //convert to uJ
+
+ *sensor /= factor >> 8;
+ exp = factor & 0xFF;
+
+ if (exp > 0) {
+ for (i = labs(exp); i > 0; i--)
+ *sensor /= 10;
+ } else {
+ for (i = labs(exp); i > 0; i--)
+ *sensor *= 10;
+ }
+}
+
+static u64 read_sensor(struct occ_sensor_record *sensor, int attr)
+{
+ switch (attr) {
+ case SENSOR_SAMPLE:
+ return be16_to_cpu(sensor->sample);
+ case SENSOR_SAMPLE_MIN:
+ return be16_to_cpu(sensor->sample_min);
+ case SENSOR_SAMPLE_MAX:
+ return be16_to_cpu(sensor->sample_max);
+ case SENSOR_CSM_MIN:
+ return be16_to_cpu(sensor->csm_min);
+ case SENSOR_CSM_MAX:
+ return be16_to_cpu(sensor->csm_max);
+ case SENSOR_ACCUMULATOR:
+ return be64_to_cpu(sensor->accumulator);
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static void *select_sensor_buffer(struct occ_sensor_data_header *hb, int id)
+{
+ struct occ_sensor_name *md;
+ u8 *ping, *pong;
+ void *buffer = NULL;
+ u32 reading_offset;
+
+ if (!hb)
+ return NULL;
+
+ md = get_names_block(hb);
+
+ ping = (u8 *)((u64)hb + be32_to_cpu(hb->reading_ping_offset));
+ pong = (u8 *)((u64)hb + be32_to_cpu(hb->reading_pong_offset));
+ reading_offset = be32_to_cpu(md[id].reading_offset);
+
+ /* Check which buffer is valid and read the data from that.
+ * Ping Pong Action
+ * 0 0 Return with error
+ * 0 1 Read Pong
+ * 1 0 Read Ping
+ * 1 1 Read the buffer with latest timestamp
+ */
+
+ if (*ping && *pong) {
+ u64 tping, tpong;
+ u64 ping_buf = (u64)ping + reading_offset;
+ u64 pong_buf = (u64)pong + reading_offset;
+
+ tping = be64_to_cpu(((struct occ_sensor_record *)ping_buf)->timestamp);
+ tpong = be64_to_cpu(((struct occ_sensor_record *)pong_buf)->timestamp);
+
+ if (tping > tpong)
+ buffer = ping;
+ else
+ buffer = pong;
+ } else if (*ping && !*pong) {
+ buffer = ping;
+ } else if (!*ping && *pong) {
+ buffer = pong;
+ } else if (!*ping && !*pong) {
+ prlog(PR_DEBUG, "OCC: Both ping and pong sensor buffers are invalid\n");
+ return NULL;
+ }
+
+ assert(buffer);
+ buffer = (void *)((u64)buffer + reading_offset);
+
+ return buffer;
+}
+
+int occ_sensor_read(u32 handle, __be64 *data)
+{
+ struct occ_sensor_data_header *hb;
+ struct occ_sensor_name *md;
+ u16 id = sensor_get_rid(handle);
+ u8 occ_num = sensor_get_frc(handle);
+ u8 attr = sensor_get_attr(handle);
+ u64 d;
+ void *buff;
+
+ if (occ_num > MAX_OCCS)
+ return OPAL_PARAMETER;
+
+ if (attr > MAX_SENSOR_ATTR)
+ return OPAL_PARAMETER;
+
+ if (is_occ_reset())
+ return OPAL_HARDWARE;
+
+ hb = get_sensor_header_block(occ_num);
+
+ if (hb->valid != 1)
+ return OPAL_HARDWARE;
+
+ if (id > be16_to_cpu(hb->nr_sensors))
+ return OPAL_PARAMETER;
+
+ buff = select_sensor_buffer(hb, id);
+ if (!buff)
+ return OPAL_HARDWARE;
+
+ d = read_sensor(buff, attr);
+ if (!d)
+ goto out_success;
+
+ md = get_names_block(hb);
+ if (be16_to_cpu(md[id].type) == OCC_SENSOR_TYPE_POWER && attr == SENSOR_ACCUMULATOR)
+ scale_energy(&md[id], &d);
+ else
+ scale_sensor(&md[id], &d);
+
+out_success:
+ *data = cpu_to_be64(d);
+
+ return OPAL_SUCCESS;
+}
+
+static bool occ_sensor_sanity(struct occ_sensor_data_header *hb, int chipid)
+{
+ if (hb->valid != 0x01) {
+ prerror("OCC: Chip %d sensor data invalid\n", chipid);
+ return false;
+ }
+
+ if (hb->version != 0x01) {
+ prerror("OCC: Chip %d unsupported sensor header block version %d\n",
+ chipid, hb->version);
+ return false;
+ }
+
+ if (hb->reading_version != 0x01) {
+ prerror("OCC: Chip %d unsupported sensor record format %d\n",
+ chipid, hb->reading_version);
+ return false;
+ }
+
+ if (hb->names_version != 0x01) {
+ prerror("OCC: Chip %d unsupported sensor names format %d\n",
+ chipid, hb->names_version);
+ return false;
+ }
+
+ if (hb->name_length != sizeof(struct occ_sensor_name)) {
+ prerror("OCC: Chip %d unsupported sensor names length %d\n",
+ chipid, hb->name_length);
+ return false;
+ }
+
+ if (!hb->nr_sensors) {
+ prerror("OCC: Chip %d has no sensors\n", chipid);
+ return false;
+ }
+
+ if (!hb->names_offset ||
+ !hb->reading_ping_offset ||
+ !hb->reading_pong_offset) {
+ prerror("OCC: Chip %d Invalid sensor buffer pointers\n",
+ chipid);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * parse_entity: Parses OCC sensor name to return the entity number like
+ * chipid, core-id, dimm-no, gpu-no. 'end' is used to
+ * get the subentity strings. Returns -1 if no number is found.
+ *		TEMPC4 --> returns 4, end points to "" (nothing after the digits)
+ *		TEMPGPU2DRAM1 --> returns 2, end = "DRAM1"
+ *		PWRSYS --> returns -1, end is left unmodified
+ */
+static int parse_entity(const char *name, char **end)
+{
+ while (*name != '\0') {
+ if (isdigit(*name))
+ break;
+ name++;
+ }
+
+ if (*name)
+ return strtol(name, end, 10);
+ else
+ return -1;
+}
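Call-pattern sketch for the helper above (accurate to the code as written: when no digit is found the function returns -1 without touching *end):

    char *end = NULL;
    int n;

    n = parse_entity("TEMPGPU2DRAM1", &end); /* n == 2,  end -> "DRAM1"         */
    n = parse_entity("TEMPC4", &end);        /* n == 4,  end -> "" (string end) */
    n = parse_entity("PWRSYS", &end);        /* n == -1, end left unchanged     */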
+
+static void add_sensor_label(struct dt_node *node, struct occ_sensor_name *md,
+ int chipid)
+{
+ char sname[30] = "";
+ char prefix[30] = "";
+ uint16_t location = be16_to_cpu(md->location);
+ int i;
+
+ if (location != OCC_SENSOR_LOC_SYSTEM)
+ snprintf(prefix, sizeof(prefix), "%s %d ", "Chip", chipid);
+
+ for (i = 0; i < ARRAY_SIZE(str_maps); i++)
+ if (!strncmp(str_maps[i].occ_str, md->name,
+ strlen(str_maps[i].occ_str))) {
+ char *end;
+ int num = -1;
+
+ if (location != OCC_SENSOR_LOC_CORE)
+ num = parse_entity(md->name, &end);
+
+ if (num != -1) {
+ snprintf(sname, sizeof(sname), "%s%s %d %s",
+ prefix, str_maps[i].opal_str, num,
+ end);
+ } else {
+ snprintf(sname, sizeof(sname), "%s%s", prefix,
+ str_maps[i].opal_str);
+ }
+ dt_add_property_string(node, "label", sname);
+ return;
+ }
+
+ /* Fallback to OCC literal if mapping is not found */
+ if (location == OCC_SENSOR_LOC_SYSTEM) {
+ dt_add_property_string(node, "label", md->name);
+ } else {
+ snprintf(sname, sizeof(sname), "%s%s", prefix, md->name);
+ dt_add_property_string(node, "label", sname);
+ }
+}
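Illustrative label outcomes for chipid 0, following the str_maps table and the core-location special case above (sensor names taken from the comments in str_maps):

    /*  "TEMPC4"    (core location)   -> "Chip 0 Core" (the core id is
     *                                    exported via ibm,pir instead)
     *  "TEMPDIMM3" (memory location) -> "Chip 0 DIMM 3" (plus a trailing
     *                                    space from the empty sub-entity)
     *  "PWRSYS"    (system location) -> "System"
     */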
+
+static const char *get_sensor_type_string(enum occ_sensor_type type)
+{
+ switch (type) {
+ case OCC_SENSOR_TYPE_POWER:
+ return "power";
+ case OCC_SENSOR_TYPE_TEMPERATURE:
+ return "temp";
+ case OCC_SENSOR_TYPE_CURRENT:
+ return "curr";
+ case OCC_SENSOR_TYPE_VOLTAGE:
+ return "in";
+ default:
+ break;
+ }
+
+ return "unknown";
+}
+
+static const char *get_sensor_loc_string(enum occ_sensor_location loc)
+{
+ switch (loc) {
+ case OCC_SENSOR_LOC_SYSTEM:
+ return "sys";
+ case OCC_SENSOR_LOC_PROCESSOR:
+ return "proc";
+ case OCC_SENSOR_LOC_MEMORY:
+ return "mem";
+ case OCC_SENSOR_LOC_VRM:
+ return "vrm";
+ case OCC_SENSOR_LOC_CORE:
+ return "core";
+ case OCC_SENSOR_LOC_QUAD:
+ return "quad";
+ case OCC_SENSOR_LOC_GPU:
+ return "gpu";
+ default:
+ break;
+ }
+
+ return "unknown";
+}
+
+/*
+ * Power sensors can be zero-valued on a few platforms, such as Zaius
+ * and Romulus, which do not have an APSS. At the moment there is no
+ * HDAT/DT property to indicate whether an APSS is present, so for now
+ * skip zero-valued power sensors.
+ */
+static bool check_sensor_sample(struct occ_sensor_data_header *hb, u32 offset)
+{
+ struct occ_sensor_record *ping, *pong;
+
+ ping = (struct occ_sensor_record *)((u64)hb
+ + be32_to_cpu(hb->reading_ping_offset) + offset);
+ pong = (struct occ_sensor_record *)((u64)hb
+ + be32_to_cpu(hb->reading_pong_offset) + offset);
+ return ping->sample || pong->sample;
+}
+
+static void add_sensor_node(const char *loc, const char *type, int i, int attr,
+ struct occ_sensor_name *md, __be32 *phandle, u32 *ptype,
+ u32 pir, u32 occ_num, u32 chipid)
+{
+ char name[30];
+ struct dt_node *node;
+ u32 handler;
+
+ snprintf(name, sizeof(name), "%s-%s", loc, type);
+ handler = sensor_handler(occ_num, i, attr);
+ node = dt_new_addr(sensor_node, name, handler);
+ dt_add_property_string(node, "sensor-type", type);
+ dt_add_property_cells(node, "sensor-data", handler);
+ dt_add_property_cells(node, "reg", handler);
+ dt_add_property_string(node, "occ_label", md->name);
+ add_sensor_label(node, md, chipid);
+
+ if (be16_to_cpu(md->location) == OCC_SENSOR_LOC_CORE)
+ dt_add_property_cells(node, "ibm,pir", pir);
+
+ *ptype = be16_to_cpu(md->type);
+
+ if (attr == SENSOR_SAMPLE) {
+ handler = sensor_handler(occ_num, i, SENSOR_CSM_MAX);
+ dt_add_property_cells(node, "sensor-data-max", handler);
+
+ handler = sensor_handler(occ_num, i, SENSOR_CSM_MIN);
+ dt_add_property_cells(node, "sensor-data-min", handler);
+ }
+
+ dt_add_property_string(node, "compatible", "ibm,opal-sensor");
+ *phandle = cpu_to_be32(node->phandle);
+}
+
+bool occ_sensors_init(void)
+{
+ struct proc_chip *chip;
+ struct dt_node *sg, *exports;
+ int occ_num = 0, i;
+ bool has_gpu = false;
+
+	/* OCC inband sensors are only supported on P9/P10 */
+ if (proc_gen < proc_gen_p9)
+ return false;
+
+ /* Sensors are copied to BAR2 OCC Common Area */
+ chip = next_chip(NULL);
+ if (!chip->occ_common_base) {
+ prerror("OCC: Unassigned OCC Common Area. No sensors found\n");
+ return false;
+ }
+
+ occ_sensor_base = chip->occ_common_base + OCC_SENSOR_DATA_BLOCK_OFFSET;
+
+ sg = dt_new(opal_node, "sensor-groups");
+ if (!sg) {
+ prerror("OCC: Failed to create sensor groups node\n");
+ return false;
+ }
+ dt_add_property_string(sg, "compatible", "ibm,opal-sensor-group");
+ dt_add_property_cells(sg, "#address-cells", 1);
+ dt_add_property_cells(sg, "#size-cells", 0);
+
+ /*
+ * On POWER9, ibm,ioda2-npu2-phb indicates the presence of a
+ * GPU NVlink.
+ */
+ if (dt_find_compatible_node(dt_root, NULL, "ibm,ioda2-npu2-phb")) {
+
+ for_each_chip(chip) {
+ int max_gpus_per_chip = 3, i;
+
+ for(i = 0; i < max_gpus_per_chip; i++) {
+ has_gpu = occ_get_gpu_presence(chip, i);
+
+ if (has_gpu)
+ break;
+ }
+
+ if (has_gpu)
+ break;
+ }
+ }
+
+ for_each_chip(chip) {
+ struct occ_sensor_data_header *hb;
+ struct occ_sensor_name *md;
+ __be32 *phandles;
+ u32 *ptype, phcount = 0;
+ unsigned int nr_sensors;
+
+ hb = get_sensor_header_block(occ_num);
+ md = get_names_block(hb);
+
+ /* Sanity check of the Sensor Data Header Block */
+ if (!occ_sensor_sanity(hb, chip->id))
+ continue;
+
+ nr_sensors = be16_to_cpu(hb->nr_sensors);
+
+ phandles = malloc(nr_sensors * sizeof(__be32));
+ assert(phandles);
+ ptype = malloc(nr_sensors * sizeof(u32));
+ assert(ptype);
+
+ for (i = 0; i < nr_sensors; i++) {
+ const char *type_name, *loc;
+ struct cpu_thread *c = NULL;
+ uint32_t pir = 0;
+ uint16_t type = be16_to_cpu(md[i].type);
+ uint16_t location = be16_to_cpu(md[i].location);
+
+ if (md[i].structure_type != OCC_SENSOR_READING_FULL)
+ continue;
+
+ if (!(type & HWMON_SENSORS_MASK))
+ continue;
+
+ if (location == OCC_SENSOR_LOC_GPU && !has_gpu)
+ continue;
+
+ if (type == OCC_SENSOR_TYPE_POWER &&
+ !check_sensor_sample(hb, be32_to_cpu(md[i].reading_offset)))
+ continue;
+
+ if (location == OCC_SENSOR_LOC_CORE) {
+ int num = parse_entity(md[i].name, NULL);
+
+ for_each_available_core_in_chip(c, chip->id)
+ if (pir_to_core_id(c->pir) == num)
+ break;
+ if (!c)
+ continue;
+ pir = c->pir;
+ }
+
+ type_name = get_sensor_type_string(type);
+ loc = get_sensor_loc_string(location);
+
+ add_sensor_node(loc, type_name, i, SENSOR_SAMPLE, &md[i],
+ &phandles[phcount], &ptype[phcount],
+ pir, occ_num, chip->id);
+ phcount++;
+
+ /* Add energy sensors */
+ if (type == OCC_SENSOR_TYPE_POWER &&
+ md[i].structure_type == OCC_SENSOR_READING_FULL) {
+ add_sensor_node(loc, "energy", i,
+ SENSOR_ACCUMULATOR, &md[i],
+ &phandles[phcount], &ptype[phcount],
+ pir, occ_num, chip->id);
+ phcount++;
+ }
+
+ }
+ occ_num++;
+ occ_add_sensor_groups(sg, phandles, ptype, phcount, chip->id);
+ free(phandles);
+ free(ptype);
+ }
+	/* Delete the sensor-groups node if no sensors were added */
+ if (list_empty(&sg->children)) {
+ dt_free(sg);
+ }
+
+ if (!occ_num)
+ return false;
+
+ exports = dt_find_by_path(dt_root, "/ibm,opal/firmware/exports");
+ if (!exports) {
+ prerror("OCC: dt node /ibm,opal/firmware/exports not found\n");
+ return false;
+ }
+
+ dt_add_property_u64s(exports, "occ_inband_sensors", occ_sensor_base,
+ OCC_SENSOR_DATA_BLOCK_SIZE * occ_num);
+
+ return true;
+}
diff --git a/roms/skiboot/hw/occ.c b/roms/skiboot/hw/occ.c
new file mode 100644
index 000000000..8d7bcbec9
--- /dev/null
+++ b/roms/skiboot/hw/occ.c
@@ -0,0 +1,2339 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Interface with the On Chip Controller,
+ * which enforces power and thermal management
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <xscom-p8-regs.h>
+#include <io.h>
+#include <cpu.h>
+#include <chip.h>
+#include <mem_region.h>
+#include <timebase.h>
+#include <errorlog.h>
+#include <opal-api.h>
+#include <opal-msg.h>
+#include <timer.h>
+#include <i2c.h>
+#include <powercap.h>
+#include <psr.h>
+#include <sensor.h>
+#include <occ.h>
+#include <psi.h>
+
+/* OCC Communication Area for PStates */
+
+#define P8_HOMER_OPAL_DATA_OFFSET 0x1F8000
+#define P9_HOMER_OPAL_DATA_OFFSET 0x0E2000
+
+#define OPAL_DYNAMIC_DATA_OFFSET 0x0B80
+/* relative to HOMER_OPAL_DATA_OFFSET */
+
+#define MAX_PSTATES 256
+#define MAX_P8_CORES 12
+#define MAX_P9_CORES 24
+#define MAX_P10_CORES 32
+
+#define MAX_OPAL_CMD_DATA_LENGTH 4090
+#define MAX_OCC_RSP_DATA_LENGTH 8698
+
+#define P8_PIR_CORE_MASK 0xFFF8
+#define P9_PIR_QUAD_MASK 0xFFF0
+#define P10_PIR_CHIP_MASK 0x0000
+#define FREQ_MAX_IN_DOMAIN 0
+#define FREQ_MOST_RECENTLY_SET 1
+
+/**
+ * OCC-OPAL Shared Memory Region
+ *
+ * Reference document :
+ * https://github.com/open-power/docs/blob/master/occ/OCC_OpenPwr_FW_Interfaces.pdf
+ *
+ * Supported layout versions:
+ * - 0x01, 0x02 : P8
+ * https://github.com/open-power/occ/blob/master_p8/src/occ/proc/proc_pstate.h
+ *
+ * - 0x90 : P9
+ * https://github.com/open-power/occ/blob/master/src/occ_405/proc/proc_pstate.h
+ * In 0x90 the data is separated into :-
+ * -- Static Data (struct occ_pstate_table): Data is written once by OCC
+ * -- Dynamic Data (struct occ_dynamic_data): Data is updated at runtime
+ *
+ * struct occ_pstate_table - Pstate table layout
+ * @valid: Indicates if data is valid
+ * @version: Layout version [Major/Minor]
+ * @v2.throttle: Reason for limiting the max pstate
+ * @v9.occ_role: OCC role (Master/Slave)
+ * @v#.pstate_min: Minimum pstate ever allowed
+ * @v#.pstate_nom: Nominal pstate
+ * @v#.pstate_turbo: Maximum turbo pstate
+ * @v#.pstate_ultra_turbo: Maximum ultra turbo pstate and the maximum
+ * pstate ever allowed
+ * @v#.pstates: Pstate-id and frequency list from Pmax to Pmin
+ * @v#.pstates.id: Pstate-id
+ * @v#.pstates.flags: Pstate-flag(reserved)
+ * @v2.pstates.vdd: Voltage Identifier
+ * @v2.pstates.vcs: Voltage Identifier
+ * @v#.pstates.freq_khz: Frequency in KHz
+ * @v#.core_max[1..N]: Max pstate with N active cores
+ * @spare/reserved/pad: Unused data
+ */
+struct occ_pstate_table {
+ u8 valid;
+ u8 version;
+ union __packed {
+ struct __packed { /* Version 0x01 and 0x02 */
+ u8 throttle;
+ s8 pstate_min;
+ s8 pstate_nom;
+ s8 pstate_turbo;
+ s8 pstate_ultra_turbo;
+ u8 spare;
+ u64 reserved;
+ struct __packed {
+ s8 id;
+ u8 flags;
+ u8 vdd;
+ u8 vcs;
+ __be32 freq_khz;
+ } pstates[MAX_PSTATES];
+ s8 core_max[MAX_P8_CORES];
+ u8 pad[100];
+ } v2;
+ struct __packed { /* Version 0x90 */
+ u8 occ_role;
+ u8 pstate_min;
+ u8 pstate_nom;
+ u8 pstate_turbo;
+ u8 pstate_ultra_turbo;
+ u8 spare;
+ u64 reserved1;
+ u64 reserved2;
+ struct __packed {
+ u8 id;
+ u8 flags;
+ u16 reserved;
+ __be32 freq_khz;
+ } pstates[MAX_PSTATES];
+ u8 core_max[MAX_P9_CORES];
+ u8 pad[56];
+ } v9;
+ struct __packed { /* Version 0xA0 */
+ u8 occ_role;
+ u8 pstate_min;
+ u8 pstate_fixed_freq;
+ u8 pstate_base;
+ u8 pstate_ultra_turbo;
+ u8 pstate_fmax;
+ u8 minor;
+ u8 pstate_bottom_throttle;
+ u8 spare;
+ u8 spare1;
+ u32 reserved_32;
+ u64 reserved_64;
+ struct __packed {
+ u8 id;
+ u8 valid;
+ u16 reserved;
+ __be32 freq_khz;
+ } pstates[MAX_PSTATES];
+ u8 core_max[MAX_P10_CORES];
+ u8 pad[48];
+ } v10;
+ };
+} __packed;
+
+/**
+ * OPAL-OCC Command Response Interface
+ *
+ * OPAL-OCC Command Buffer
+ *
+ * ---------------------------------------------------------------------
+ * | OPAL | Cmd | OPAL | | Cmd Data | Cmd Data | OPAL |
+ * | Cmd | Request | OCC | Reserved | Length | Length | Cmd |
+ * | Flags | ID | Cmd | | (MSB) | (LSB) | Data... |
+ * ---------------------------------------------------------------------
+ * | ….OPAL Command Data up to max of Cmd Data Length 4090 bytes |
+ * | |
+ * ---------------------------------------------------------------------
+ *
+ * OPAL Command Flag
+ *
+ * -----------------------------------------------------------------
+ * | Bit 7 | Bit 6 | Bit 5 | Bit 4 | Bit 3 | Bit 2 | Bit 1 | Bit 0 |
+ * | (msb) | | | | | | | (lsb) |
+ * -----------------------------------------------------------------
+ * |Cmd | | | | | | | |
+ * |Ready | | | | | | | |
+ * -----------------------------------------------------------------
+ *
+ * struct opal_command_buffer - Defines the layout of OPAL command buffer
+ * @flag: Provides general status of the command
+ * @request_id: Token to identify request
+ * @cmd: Command sent
+ * @data_size: Command data length
+ * @data: Command specific data
+ * @spare: Unused byte
+ */
+struct opal_command_buffer {
+ u8 flag;
+ u8 request_id;
+ u8 cmd;
+ u8 spare;
+ u16 data_size;
+ u8 data[MAX_OPAL_CMD_DATA_LENGTH];
+} __packed;
+
+/**
+ * OPAL-OCC Response Buffer
+ *
+ * ---------------------------------------------------------------------
+ * | OCC | Cmd | OPAL | Response | Rsp Data | Rsp Data | OPAL |
+ * | Rsp | Request | OCC | Status | Length | Length | Rsp |
+ * | Flags | ID | Cmd | | (MSB) | (LSB) | Data... |
+ * ---------------------------------------------------------------------
+ * | ….OPAL Response Data up to max of Rsp Data Length 8698 bytes |
+ * | |
+ * ---------------------------------------------------------------------
+ *
+ * OCC Response Flag
+ *
+ * -----------------------------------------------------------------
+ * | Bit 7 | Bit 6 | Bit 5 | Bit 4 | Bit 3 | Bit 2 | Bit 1 | Bit 0 |
+ * | (msb) | | | | | | | (lsb) |
+ * -----------------------------------------------------------------
+ * | | | | | | |OCC in | Rsp |
+ * | | | | | | |progress|Ready |
+ * -----------------------------------------------------------------
+ *
+ * struct occ_response_buffer - Defines the layout of OCC response buffer
+ * @flag: Provides general status of the response
+ * @request_id: Token to identify request
+ * @cmd: Command requested
+ * @status: Indicates success/failure status of
+ * the command
+ * @data_size: Response data length
+ * @data: Response specific data
+ */
+struct occ_response_buffer {
+ u8 flag;
+ u8 request_id;
+ u8 cmd;
+ u8 status;
+ u16 data_size;
+ u8 data[MAX_OCC_RSP_DATA_LENGTH];
+} __packed;
+
+/**
+ * OCC-OPAL Shared Memory Interface Dynamic Data Vx90
+ *
+ * struct occ_dynamic_data - Contains runtime attributes
+ * @occ_state: Current state of OCC
+ * @major_version: Major version number
+ * @minor_version: Minor version number (backwards compatible)
+ * Version 1 indicates GPU presence populated
+ * @gpus_present: Bitmask of GPUs present (on systems where GPU
+ * presence is detected through APSS)
+ * @cpu_throttle: Reason for limiting the max pstate
+ * @mem_throttle: Reason for throttling memory
+ * @quick_pwr_drop: Indicates if QPD is asserted
+ * @pwr_shifting_ratio: Indicates the current percentage of power to
+ * take away from the CPU vs GPU when shifting
+ * power to maintain a power cap. Value of 100
+ * means take all power from CPU.
+ * @pwr_cap_type: Indicates type of power cap in effect
+ * @hard_min_pwr_cap: Hard minimum system power cap in Watts.
+ * Guaranteed unless hardware failure
+ * @max_pwr_cap: Maximum allowed system power cap in Watts
+ * @cur_pwr_cap: Current system power cap
+ * @soft_min_pwr_cap: Soft powercap minimum. OCC may or may not be
+ * able to maintain this
+ * @spare/reserved: Unused data
+ * @cmd: Opal Command Buffer
+ * @rsp: OCC Response Buffer
+ */
+struct occ_dynamic_data {
+ u8 occ_state;
+ u8 major_version;
+ u8 minor_version;
+ u8 gpus_present;
+ struct __packed { /* Version 0x90 */
+ u8 spare1;
+ } v9;
+ struct __packed { /* Version 0xA0 */
+ u8 wof_enabled;
+ } v10;
+ u8 cpu_throttle;
+ u8 mem_throttle;
+ u8 quick_pwr_drop;
+ u8 pwr_shifting_ratio;
+ u8 pwr_cap_type;
+ u16 hard_min_pwr_cap;
+ u16 max_pwr_cap;
+ u16 cur_pwr_cap;
+ u16 soft_min_pwr_cap;
+ u8 pad[110];
+ struct opal_command_buffer cmd;
+ struct occ_response_buffer rsp;
+} __packed;
+
+static bool occ_reset;
+static struct lock occ_lock = LOCK_UNLOCKED;
+static unsigned long homer_opal_data_offset;
+
+DEFINE_LOG_ENTRY(OPAL_RC_OCC_PSTATE_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_OCC,
+ OPAL_CEC_HARDWARE, OPAL_INFO,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_OCC_TIMEOUT, OPAL_PLATFORM_ERR_EVT, OPAL_OCC,
+ OPAL_CEC_HARDWARE, OPAL_UNRECOVERABLE_ERR_GENERAL,
+ OPAL_NA);
+
+/*
+ * POWER9 and newer platforms have pstate values which are unsigned
+ * positive values. They form a continuous set of unsigned integers
+ * [0 to +N] where Pmax is 0 and Pmin is N. The linear ordering of
+ * pstates for P9 has changed compared to P8, which has negative
+ * pstate values advertised as [0 to -N] where Pmax is 0 and
+ * Pmin is -N. The following routine helps to abstract pstate
+ * comparison with pmax and perform sanity checks on pstate limits.
+ */
+
+/**
+ * cmp_pstates: Compares the given two pstates and determines which
+ * among them is associated with a higher pstate.
+ *
+ * @a,@b: The pstate ids of the pstates being compared.
+ *
+ * Returns: -1 : If pstate associated with @a is smaller than
+ * the pstate associated with @b.
+ * 0 : If pstates associated with @a and @b are equal.
+ * 1 : If pstate associated with @a is greater than
+ * the pstate associated with @b.
+ */
+static int cmp_pstates(int a, int b)
+{
+ /* P8 has 0 to -N (pmax to pmin), P9 has 0 to +N (pmax to pmin) */
+ if (a > b)
+ return (proc_gen == proc_gen_p8)? 1 : -1;
+ else if (a < b)
+ return (proc_gen == proc_gen_p8)? -1 : 1;
+
+ return 0;
+}
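Two concrete cases of the ordering convention described in the comment block above:

    /* P8:     pstates run 0 (Pmax) down to -N (Pmin); -2 is a higher
     *         pstate than -5, and a > b, so cmp_pstates(-2, -5) == 1.
     * P9/P10: pstates run 0 (Pmax) up to +N (Pmin); 2 is a higher
     *         pstate than 5, and a < b, so cmp_pstates(2, 5) == 1.
     */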
+
+static inline
+struct occ_pstate_table *get_occ_pstate_table(struct proc_chip *chip)
+{
+ return (struct occ_pstate_table *)
+ (chip->homer_base + homer_opal_data_offset);
+}
+
+static inline
+struct occ_dynamic_data *get_occ_dynamic_data(struct proc_chip *chip)
+{
+ return (struct occ_dynamic_data *)
+ (chip->homer_base + homer_opal_data_offset +
+ OPAL_DYNAMIC_DATA_OFFSET);
+}
+
+/*
+ * On Chips which have at least one active EX unit, check the
+ * HOMER area for pstate-table valid bit on versions 0x1 and 0x2, or
+ * HOMER dynamic area occ_state on version 0x90.
+ */
+static bool wait_for_all_occ_init(void)
+{
+ struct proc_chip *chip;
+ struct dt_node *xn;
+ struct occ_pstate_table *occ_data;
+ struct occ_dynamic_data *occ_dyn_data;
+ int tries;
+ uint64_t start_time, end_time;
+ uint32_t timeout = 0;
+
+ if (platform.occ_timeout)
+ timeout = platform.occ_timeout();
+
+ start_time = mftb();
+ for_each_chip(chip) {
+ u8 version;
+
+ /*
+		 * If the chip doesn't have any EX units present, then the OCC
+ * will not update the pstate-table. So, skip the
+ * check.
+ */
+ if (!chip->ex_present) {
+ prlog(PR_DEBUG, "OCC: Chip %02x has no active EX units. Skipping check\n",
+ chip->id);
+ continue;
+ }
+
+ /* Check for valid homer address */
+ if (!chip->homer_base) {
+ /**
+ * @fwts-label OCCInvalidHomerBase
+ * @fwts-advice The HOMER base address for a chip
+ * was not valid. This means that OCC (On Chip
+ * Controller) will be non-functional and CPU
+ * frequency scaling will not be functional. CPU may
+ * be set to a safe, low frequency. Power savings in
+ * CPU idle or CPU hotplug may be impacted.
+ */
+ prlog(PR_ERR,"OCC: Chip: %x homer_base is not valid\n",
+ chip->id);
+ return false;
+ }
+
+ /* Get PState table address */
+ occ_data = get_occ_pstate_table(chip);
+
+ /*
+ * Wait for the OCC to set an appropriate version bit.
+		 * The wait is needed since on some platforms (such as P8
+ * Tuletta), OCC is not loaded before OPAL boot. Hence
+ * initialization can take a while.
+ *
+ * Note: Checking for occ_data->version == (0x01/0x02/0x90/0xA0)
+ * is ok because we clear all of
+ * homer_base+size before passing memory to host
+ * services. This ensures occ_data->version == 0x0
+ * before OCC load.
+ */
+ tries = timeout * 10;
+ while (tries--) {
+ version = occ_data->version;
+
+ if (version == 0x01 || version == 0x02 ||
+ version == 0x90 || version == 0xA0)
+ break;
+
+ time_wait_ms(100);
+ }
+
+ version = occ_data->version;
+ switch (version) {
+ case 0x1:
+ case 0x2:
+ /*
+ * OCC-OPAL interface version 0x1 and 0x2 do not have
+			 * the dynamic data. Hence the only way to figure out
+ * if the OCC is up or not is to check the valid-bit
+ * in the pstate table.
+ */
+ if (occ_data->valid != 1) {
+ /**
+ * @fwts-label OCCInvalidPStateTable
+ * @fwts-advice The pstate table for a chip
+ * was not valid. This means that OCC (On Chip
+ * Controller) will be non-functional and CPU
+ * frequency scaling will not be functional. CPU may
+ * be set to a low, safe frequency. This means
+ * that CPU idle states and CPU frequency scaling
+ * may not be functional.
+ */
+ prlog(PR_ERR, "OCC: Chip: %x PState table is not valid\n",
+ chip->id);
+ return false;
+ }
+ break;
+
+ case 0x90:
+ /*
+ * OCC-OPAL interface version 0x90 has a
+ * dynamic data section. This has an
+ * occ_state field whose values inform about
+ * the state of the OCC.
+ *
+ * 0x00 = OCC not running. No communication
+ * allowed.
+ *
+ * 0x01 = Standby. No communication allowed.
+ *
+ * 0x02 = Observation State. Communication
+ * allowed and is command dependent.
+ *
+ * 0x03 = Active State. Communication allowed
+ * and is command dependent.
+ *
+ * 0x04 = Safe State. No communication
+ * allowed. Just like CPU throttle
+ * status, some failures will not allow
+ * for OCC to update state to safe.
+ *
+ * 0x05 = Characterization State.
+ * Communication allowed and is command
+ * dependent.
+ *
+ * We will error out if OCC is not in the
+ * Active State.
+ *
+ * XXX : Should we error out only if no
+ * communication is allowed with the
+ * OCC ?
+ */
+ occ_dyn_data = get_occ_dynamic_data(chip);
+ if (occ_dyn_data->occ_state != 0x3) {
+ /**
+ * @fwts-label OCCInactive
+ * @fwts-advice The OCC for a chip was not active.
+ * This means that CPU frequency scaling will
+ * not be functional. CPU may be set to a low,
+ * safe frequency. This means that CPU idle
+ * states and CPU frequency scaling may not be
+ * functional.
+ */
+ prlog(PR_ERR, "OCC: Chip: %x: OCC not active\n",
+ chip->id);
+ return false;
+ }
+ break;
+
+ case 0xA0:
+ /*
+			 * OCC-OPAL interface version 0xA0 also has a
+ * dynamic data section. This has an
+ * occ_state field whose values inform about
+ * the state of the OCC.
+ *
+ * 0x00 = OCC not running. No communication
+ * allowed.
+ *
+ * 0x01 = Standby. No communication allowed.
+ *
+ * 0x02 = Observation State. Communication
+ * allowed and is command dependent.
+ *
+ * 0x03 = Active State. Communication allowed
+ * and is command dependent.
+ *
+ * 0x04 = Safe State. No communication
+ * allowed. Just like CPU throttle
+ * status, some failures will not allow
+ * for OCC to update state to safe.
+ *
+ * 0x05 = Characterization State.
+ * Communication allowed and is command
+ * dependent.
+ *
+ * We will error out if OCC is not in the
+ * Active State.
+ *
+ * XXX : Should we error out only if no
+ * communication is allowed with the
+ * OCC ?
+ */
+ occ_dyn_data = get_occ_dynamic_data(chip);
+ if (occ_dyn_data->occ_state != 0x3) {
+ /**
+ * @fwts-label OCCInactive
+ * @fwts-advice The OCC for a chip was not active.
+ * This means that CPU frequency scaling will
+ * not be functional. CPU may be set to a low,
+ * safe frequency. This means that CPU idle
+ * states and CPU frequency scaling may not be
+ * functional.
+ */
+ prlog(PR_ERR, "OCC: Chip: %x: OCC not active\n",
+ chip->id);
+ return false;
+ }
+ break;
+
+ default:
+ prlog(PR_ERR, "OCC: Unknown OCC-OPAL interface version.\n");
+ return false;
+ }
+
+ if (!chip->occ_functional)
+ chip->occ_functional = true;
+
+ prlog(PR_DEBUG, "OCC: Chip %02x Data (%016llx) = %016llx\n",
+ chip->id, (uint64_t)occ_data, be64_to_cpu(*(__be64 *)occ_data));
+
+ if (version == 0x90 || version == 0xA0) {
+ occ_dyn_data = get_occ_dynamic_data(chip);
+ prlog(PR_DEBUG, "OCC: Chip %02x Dynamic Data (%016llx) = %016llx\n",
+ chip->id, (uint64_t)occ_dyn_data,
+ be64_to_cpu(*(__be64 *)occ_dyn_data));
+ }
+ }
+
+ end_time = mftb();
+ prlog(PR_NOTICE, "OCC: All Chip Rdy after %lu ms\n",
+ tb_to_msecs(end_time - start_time));
+
+ dt_for_each_compatible(dt_root, xn, "ibm,xscom") {
+ const struct dt_property *p;
+ p = dt_find_property(xn, "ibm,occ-functional-state");
+ if (!p)
+ dt_add_property_cells(xn, "ibm,occ-functional-state",
+ 0x1);
+ }
+ return true;
+}
+
+/*
+ * OCC provides pstate table entries in continuous descending order.
+ * Parse the pstate table to skip pstate_ids that are greater
+ * than Pmax. If a pstate_id is equal to Pmin then add it to
+ * the list and break from the loop as this is the last valid
+ * element in the pstate table.
+ */
+static void parse_pstates_v2(struct occ_pstate_table *data, __be32 *dt_id,
+ __be32 *dt_freq, int nr_pstates, int pmax, int pmin)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) {
+ if (cmp_pstates(data->v2.pstates[i].id, pmax) > 0)
+ continue;
+
+ dt_id[j] = cpu_to_be32(data->v2.pstates[i].id);
+ dt_freq[j] = cpu_to_be32(be32_to_cpu(data->v2.pstates[i].freq_khz) / 1000);
+ j++;
+
+ if (data->v2.pstates[i].id == pmin)
+ break;
+ }
+
+ if (j != nr_pstates)
+ prerror("OCC: Expected pstates(%d) is not equal to parsed pstates(%d)\n",
+ nr_pstates, j);
+}
+
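+/*
+ * Worked example for the parser above (illustrative numbers only,
+ * using P8-style signed ids where a larger value means a higher
+ * performance pstate): given a descending table { 4, 3, 2, 1, 0, -1 }
+ * with pmax = 2 and pmin = -1, ids 4 and 3 are skipped as being above
+ * pmax, ids 2, 1, 0 and -1 are copied into dt_id[]/dt_freq[], and the
+ * loop stops after adding pmin, so j ends up equal to nr_pstates (4).
+ * The v9/v10 variants below walk the table the same way.
+ */
+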
+static void parse_pstates_v9(struct occ_pstate_table *data, __be32 *dt_id,
+ __be32 *dt_freq, int nr_pstates, int pmax, int pmin)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) {
+ if (cmp_pstates(data->v9.pstates[i].id, pmax) > 0)
+ continue;
+
+ dt_id[j] = cpu_to_be32(data->v9.pstates[i].id);
+ dt_freq[j] = cpu_to_be32(be32_to_cpu(data->v9.pstates[i].freq_khz) / 1000);
+ j++;
+
+ if (data->v9.pstates[i].id == pmin)
+ break;
+ }
+
+ if (j != nr_pstates)
+ prerror("OCC: Expected pstates(%d) is not equal to parsed pstates(%d)\n",
+ nr_pstates, j);
+}
+
+static void parse_pstates_v10(struct occ_pstate_table *data, __be32 *dt_id,
+ __be32 *dt_freq, int nr_pstates, int pmax, int pmin)
+{
+ int i, j;
+ int invalid = 0;
+
+ for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) {
+ if (cmp_pstates(data->v10.pstates[i].id, pmax) > 0)
+ continue;
+
+ if (!data->v10.pstates[i].valid) {
+ prlog(PR_WARNING, "OCC: Found Invalid pstate with index %d. Skipping it.\n", i);
+ invalid++;
+ continue;
+ }
+
+ dt_id[j] = cpu_to_be32(data->v10.pstates[i].id);
+ dt_freq[j] = cpu_to_be32(be32_to_cpu(data->v10.pstates[i].freq_khz) / 1000);
+ j++;
+
+ if (data->v10.pstates[i].id == pmin)
+ break;
+ }
+
+ if ((j + invalid) != nr_pstates) {
+ prerror("OCC: Expected pstates(%d) not equal to (Parsed pstates(%d) + Invalid Pstates (%d))\n",
+ nr_pstates, j, invalid);
+ }
+}
+
+static void parse_vid(struct occ_pstate_table *occ_data,
+ struct dt_node *node, u8 nr_pstates,
+ int pmax, int pmin)
+{
+ u8 *dt_vdd, *dt_vcs;
+ int i, j;
+
+ dt_vdd = malloc(nr_pstates);
+ assert(dt_vdd);
+ dt_vcs = malloc(nr_pstates);
+ assert(dt_vcs);
+
+ for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) {
+ if (cmp_pstates(occ_data->v2.pstates[i].id, pmax) > 0)
+ continue;
+
+ dt_vdd[j] = occ_data->v2.pstates[i].vdd;
+ dt_vcs[j] = occ_data->v2.pstates[i].vcs;
+ j++;
+
+ if (occ_data->v2.pstates[i].id == pmin)
+ break;
+ }
+
+ dt_add_property(node, "ibm,pstate-vdds", dt_vdd, nr_pstates);
+ dt_add_property(node, "ibm,pstate-vcss", dt_vcs, nr_pstates);
+
+ free(dt_vdd);
+ free(dt_vcs);
+}
+
+/* Add device tree properties to describe pstates */
+/* Return nominal pstate to set in each core */
+static bool add_cpu_pstate_properties(struct dt_node *power_mgt,
+ int *pstate_nom)
+{
+ struct proc_chip *chip;
+ uint64_t occ_data_area;
+ struct occ_pstate_table *occ_data = NULL;
+ struct occ_dynamic_data *occ_dyn_data;
+ /* Arrays for device tree */
+ __be32 *dt_id, *dt_freq;
+ int pmax, pmin, pnom;
+ u8 nr_pstates;
+ bool ultra_turbo_supported;
+ int i, major, minor;
+
+ prlog(PR_DEBUG, "OCC: CPU pstate state device tree init\n");
+
+ /*
+ * Find the first chip with an OCC that has a valid
+ * pstate table
+ */
+ for_each_chip(chip) {
+ occ_data = get_occ_pstate_table(chip);
+
+ /* Dump first 16 bytes of PState table */
+ occ_data_area = (uint64_t)occ_data;
+ prlog(PR_DEBUG, "OCC: Chip %02d :Data (%16llx) = %16llx %16llx\n",
+ chip->id, occ_data_area,
+ be64_to_cpu(*(__be64 *)occ_data_area),
+ be64_to_cpu(*(__be64 *)(occ_data_area + 8)));
+
+ if (occ_data->valid)
+ break;
+ /*
+ * XXX : Error out if !occ_data->valid but Chip has at
+ * least one EX Unit?
+ */
+ }
+
+ assert(occ_data);
+ if (!occ_data->valid) {
+ /**
+ * @fwts-label OCCInvalidPStateTableDT
+ * @fwts-advice The pstate tables for none of the chips
+ * are valid. This means that OCC (On Chip
+ * Controller) will be non-functional. This means
+ * that CPU idle states and CPU frequency scaling
+ * will not be functional as OPAL doesn't populate
+ * the device tree with pstates in this case.
+ */
+ prlog(PR_ERR, "OCC: PState table is not valid\n");
+ return false;
+ }
+
+ /*
+ * Workload-Optimized-Frequency (WOF), or Ultra-Turbo, is supported
+ * from version 0x02 onwards. If WOF is disabled, the max
+ * ultra_turbo pstate will be equal to the max turbo pstate.
+ */
+ ultra_turbo_supported = true;
+
+ major = occ_data->version >> 4;
+ minor = occ_data->version & 0xF;
+
+ /* Parse Pmax, Pmin and Pnominal */
+ switch (major) {
+ case 0:
+ if (proc_gen >= proc_gen_p9) {
+ /**
+ * @fwts-label OCCInvalidVersion02
+ * @fwts-advice The PState table layout version is not
+ * supported in P9. So OPAL will not parse the PState
+ * table. CPU frequency scaling will not be functional
+ * as frequency and pstate-ids are not added to DT.
+ */
+ prerror("OCC: Version %x is not supported in P9\n",
+ occ_data->version);
+ return false;
+ }
+ if (minor == 0x1)
+ ultra_turbo_supported = false;
+ pmin = occ_data->v2.pstate_min;
+ pnom = occ_data->v2.pstate_nom;
+ if (ultra_turbo_supported)
+ pmax = occ_data->v2.pstate_ultra_turbo;
+ else
+ pmax = occ_data->v2.pstate_turbo;
+ break;
+ case 0x9:
+ if (proc_gen == proc_gen_p8) {
+ /**
+ * @fwts-label OCCInvalidVersion90
+ * @fwts-advice The PState table layout version is not
+ * supported in P8. So OPAL will not parse the PState
+ * table. CPU frequency scaling will not be functional
+ * as frequency and pstate-ids are not added to DT.
+ */
+ prerror("OCC: Version %x is not supported in P8\n",
+ occ_data->version);
+ return false;
+ }
+ pmin = occ_data->v9.pstate_min;
+ pnom = occ_data->v9.pstate_nom;
+ pmax = occ_data->v9.pstate_ultra_turbo;
+ break;
+ case 0xA:
+ pmin = occ_data->v10.pstate_min;
+ pnom = occ_data->v10.pstate_fixed_freq;
+ occ_dyn_data = get_occ_dynamic_data(chip);
+ if (occ_dyn_data->v10.wof_enabled)
+ pmax = occ_data->v10.pstate_ultra_turbo;
+ else
+ pmax = occ_data->v10.pstate_fmax;
+ break;
+ default:
+ /**
+ * @fwts-label OCCUnsupportedVersion
+ * @fwts-advice The PState table layout version is not
+ * supported. So OPAL will not parse the PState table.
+ * CPU frequency scaling will not be functional as OPAL
+ * doesn't populate the device tree with pstates.
+ */
+ prerror("OCC: Unsupported pstate table layout version %d\n",
+ occ_data->version);
+ return false;
+ }
+
+ /* Sanity check for pstate limits */
+ if (cmp_pstates(pmin, pmax) > 0) {
+ /**
+ * @fwts-label OCCInvalidPStateLimits
+ * @fwts-advice The min pstate is greater than the
+ * max pstate, this could be due to corrupted/invalid
+ * data in OCC-OPAL shared memory region. So OPAL has
+ * not added pstates to device tree. This means that
+ * CPU Frequency management will not be functional in
+ * the host.
+ */
+ prerror("OCC: Invalid pstate limits. Pmin(%d) > Pmax (%d)\n",
+ pmin, pmax);
+ return false;
+ }
+
+ if (cmp_pstates(pnom, pmax) > 0) {
+ /**
+ * @fwts-label OCCInvalidNominalPState
+ * @fwts-advice The nominal pstate is greater than the
+ * max pstate, this could be due to corrupted/invalid
+ * data in OCC-OPAL shared memory region. So OPAL has
+ * limited the nominal pstate to max pstate.
+ */
+ prerror("OCC: Clipping nominal pstate(%d) to Pmax(%d)\n",
+ pnom, pmax);
+ pnom = pmax;
+ }
+
+ nr_pstates = labs(pmax - pmin) + 1;
+ prlog(PR_DEBUG, "OCC: Version %x Min %d Nom %d Max %d Nr States %d\n",
+ occ_data->version, pmin, pnom, pmax, nr_pstates);
+ if (((major == 0x9 || major == 0xA) && nr_pstates <= 1) ||
+ (major == 0 && (nr_pstates <= 1 || nr_pstates > 128))) {
+ /**
+ * @fwts-label OCCInvalidPStateRange
+ * @fwts-advice The number of pstates is outside the valid
+ * range (currently <=1 or > 128 on p8, >255 on P9), so OPAL
+ * has not added pstates to the device tree. This means that
+ * OCC (On Chip Controller) will be non-functional. This means
+ * that CPU idle states and CPU frequency scaling
+ * will not be functional.
+ */
+ prerror("OCC: OCC range is not valid; No of pstates = %d\n",
+ nr_pstates);
+ return false;
+ }
+
+ dt_id = malloc(nr_pstates * sizeof(__be32));
+ assert(dt_id);
+ dt_freq = malloc(nr_pstates * sizeof(__be32));
+ assert(dt_freq);
+
+ switch (major) {
+ case 0:
+ parse_pstates_v2(occ_data, dt_id, dt_freq, nr_pstates,
+ pmax, pmin);
+ break;
+ case 0x9:
+ parse_pstates_v9(occ_data, dt_id, dt_freq, nr_pstates,
+ pmax, pmin);
+ break;
+ case 0xA:
+ parse_pstates_v10(occ_data, dt_id, dt_freq, nr_pstates,
+ pmax, pmin);
+ break;
+ default:
+ return false;
+ }
+
+ /* Add the device-tree entries */
+ dt_add_property(power_mgt, "ibm,pstate-ids", dt_id,
+ nr_pstates * sizeof(__be32));
+ dt_add_property(power_mgt, "ibm,pstate-frequencies-mhz", dt_freq,
+ nr_pstates * sizeof(__be32));
+ dt_add_property_cells(power_mgt, "ibm,pstate-min", pmin);
+ dt_add_property_cells(power_mgt, "ibm,pstate-nominal", pnom);
+ dt_add_property_cells(power_mgt, "ibm,pstate-max", pmax);
+
+ free(dt_freq);
+ free(dt_id);
+
+ /*
+ * Parse and add WOF properties: turbo, ultra-turbo and core_max array.
+ * core_max[1..n] array provides the max sustainable pstate that can be
+ * achieved with i active cores in the chip.
+ */
+ if (ultra_turbo_supported) {
+ int pturbo, pultra_turbo;
+ u8 nr_cores = get_available_nr_cores_in_chip(chip->id);
+ __be32 *dt_cmax;
+
+ dt_cmax = malloc(nr_cores * sizeof(u32));
+ assert(dt_cmax);
+ switch (major) {
+ case 0:
+ pturbo = occ_data->v2.pstate_turbo;
+ pultra_turbo = occ_data->v2.pstate_ultra_turbo;
+ for (i = 0; i < nr_cores; i++)
+ dt_cmax[i] = cpu_to_be32(occ_data->v2.core_max[i]);
+ break;
+ case 0x9:
+ pturbo = occ_data->v9.pstate_turbo;
+ pultra_turbo = occ_data->v9.pstate_ultra_turbo;
+ for (i = 0; i < nr_cores; i++)
+ dt_cmax[i] = cpu_to_be32(occ_data->v9.core_max[i]);
+ break;
+ case 0xA:
+ pturbo = occ_data->v10.pstate_base;
+ pultra_turbo = occ_data->v10.pstate_ultra_turbo;
+ for (i = 0; i < nr_cores; i++)
+ dt_cmax[i] = cpu_to_be32(occ_data->v10.core_max[i]);
+ break;
+ default:
+ return false;
+ }
+
+ if (cmp_pstates(pturbo, pmax) > 0) {
+ prerror("OCC: Clipping turbo pstate(%d) to Pmax(%d)\n",
+ pturbo, pmax);
+ dt_add_property_cells(power_mgt, "ibm,pstate-turbo",
+ pmax);
+ } else {
+ dt_add_property_cells(power_mgt, "ibm,pstate-turbo",
+ pturbo);
+ }
+
+ dt_add_property_cells(power_mgt, "ibm,pstate-ultra-turbo",
+ pultra_turbo);
+ dt_add_property(power_mgt, "ibm,pstate-core-max", dt_cmax,
+ nr_cores * sizeof(u32));
+
+ dt_add_property_cells(power_mgt, "ibm,pstate-base", pturbo);
+ free(dt_cmax);
+ }
+
+ if (major == 0x9 || major == 0xA)
+ goto out;
+
+ dt_add_property_cells(power_mgt, "#address-cells", 2);
+ dt_add_property_cells(power_mgt, "#size-cells", 1);
+
+ /* Add chip specific pstate properties */
+ for_each_chip(chip) {
+ struct dt_node *occ_node;
+
+ occ_data = get_occ_pstate_table(chip);
+ occ_node = dt_new_addr(power_mgt, "occ", (uint64_t)occ_data);
+ if (!occ_node) {
+ /**
+ * @fwts-label OCCDTFailedNodeCreation
+ * @fwts-advice Failed to create
+ * /ibm,opal/power-mgt/occ. Per-chip pstate properties
+ * are not added to Device Tree.
+ */
+ prerror("OCC: Failed to create /ibm,opal/power-mgt/occ@%llx\n",
+ (uint64_t)occ_data);
+ return false;
+ }
+
+ dt_add_property_cells(occ_node, "reg",
+ hi32((uint64_t)occ_data),
+ lo32((uint64_t)occ_data),
+ OPAL_DYNAMIC_DATA_OFFSET +
+ sizeof(struct occ_dynamic_data));
+ dt_add_property_cells(occ_node, "ibm,chip-id", chip->id);
+
+ /*
+ * Parse and add pstate Voltage Identifiers (VID) to DT which
+ * are provided by OCC in version 0x01 and 0x02
+ */
+ parse_vid(occ_data, occ_node, nr_pstates, pmax, pmin);
+ }
+out:
+ /* Return pstate to set for each core */
+ *pstate_nom = pnom;
+ return true;
+}
+
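+/*
+ * For illustration, a successful add_cpu_pstate_properties() run
+ * leaves a device tree fragment roughly like this (values elided):
+ *
+ *   /ibm,opal/power-mgt {
+ *           ibm,pstate-ids = < ... >;
+ *           ibm,pstate-frequencies-mhz = < ... >;
+ *           ibm,pstate-min = < ... >;
+ *           ibm,pstate-nominal = < ... >;
+ *           ibm,pstate-max = < ... >;
+ *   };
+ *
+ * plus ibm,pstate-turbo, ibm,pstate-ultra-turbo, ibm,pstate-base and
+ * ibm,pstate-core-max when ultra-turbo/WOF is supported, and per-chip
+ * occ@ nodes with VIDs on the v1/v2 (P8) layout.
+ */
+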
+/*
+ * Prepare chip for pstate transitions
+ */
+
+static bool cpu_pstates_prepare_core(struct proc_chip *chip,
+ struct cpu_thread *c,
+ int pstate_nom)
+{
+ uint32_t core = pir_to_core_id(c->pir);
+ uint64_t tmp, pstate;
+ int rc;
+
+ /*
+ * Currently Fastsleep init clears EX_PM_SPR_OVERRIDE_EN.
+ * Need to ensure only relevant bits are inited
+ */
+
+ /* Init PM GP1 for SCOM based PSTATE control to set nominal freq
+ *
+ * Use the OR SCOM to set the required bits in PM_GP1 register
+ * since the OCC might be manipulating the PM_GP1 register as well.
+ */
+ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SET_GP1),
+ EX_PM_SETUP_GP1_PM_SPR_OVERRIDE_EN);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "OCC: Failed to write PM_GP1 in pstates init\n");
+ return false;
+ }
+
+ /* Set new pstate to core */
+ rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMCR), &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "OCC: Failed to read PM_PPMCR from OCC in pstates init\n");
+ return false;
+ }
+ tmp = tmp & ~0xFFFF000000000000ULL;
+ pstate = ((uint64_t) pstate_nom) & 0xFF;
+ tmp = tmp | (pstate << 56) | (pstate << 48);
+ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMCR), tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "OCC: Failed to write PM_PPMCR in pstates init\n");
+ return false;
+ }
+ time_wait_ms(1); /* Wait for PState to change */
+ /*
+ * Init PM GP1 for SPR based PSTATE control.
+ * Once the OCC is active, EX_PM_SETUP_GP1_DPLL_FREQ_OVERRIDE_EN will be
+ * cleared by the OCC, so Sapphire need not clear it.
+ * However, wait for the DVFS state machine to become idle after the
+ * min->nominal transition initiated above; otherwise the switchover to
+ * SPR control could fail.
+ *
+ * Use the AND SCOM to clear the required bits in the PM_GP1 register
+ * since the OCC might be manipulating the PM_GP1 register as well.
+ */
+ tmp = ~EX_PM_SETUP_GP1_PM_SPR_OVERRIDE_EN;
+ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_CLEAR_GP1),
+ tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "OCC: Failed to write PM_GP1 in pstates init\n");
+ return false;
+ }
+
+ /* Just debug */
+ rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMSR), &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "OCC: Failed to read PM_PPMSR from OCC"
+ "in pstates init\n");
+ return false;
+ }
+ prlog(PR_DEBUG, "OCC: Chip %x Core %x PPMSR %016llx\n",
+ chip->id, core, tmp);
+
+ /*
+ * If PMSR is still in transition at this point due to PState change
+ * initiated above, then the switchover to SPR may not work.
+ * ToDo: Check for DVFS state machine idle before change.
+ */
+
+ return true;
+}
+
+static bool occ_opal_msg_outstanding = false;
+static void occ_msg_consumed(void *data __unused, int status __unused)
+{
+ lock(&occ_lock);
+ occ_opal_msg_outstanding = false;
+ unlock(&occ_lock);
+}
+
+static inline u8 get_cpu_throttle(struct proc_chip *chip)
+{
+ struct occ_pstate_table *pdata = get_occ_pstate_table(chip);
+ struct occ_dynamic_data *data;
+
+ switch (pdata->version >> 4) {
+ case 0:
+ return pdata->v2.throttle;
+ case 0x9:
+ case 0xA:
+ data = get_occ_dynamic_data(chip);
+ return data->cpu_throttle;
+ default:
+ return 0;
+ };
+}
+
+bool is_occ_reset(void)
+{
+ return occ_reset;
+}
+
+static void occ_throttle_poll(void *data __unused)
+{
+ struct proc_chip *chip;
+ struct occ_pstate_table *occ_data;
+ struct opal_occ_msg occ_msg;
+ int rc;
+
+ if (!try_lock(&occ_lock))
+ return;
+ if (occ_reset) {
+ int inactive = 0;
+
+ for_each_chip(chip) {
+ occ_data = get_occ_pstate_table(chip);
+ if (occ_data->valid != 1) {
+ inactive = 1;
+ break;
+ }
+ }
+ if (!inactive) {
+ /*
+ * Queue OCC_THROTTLE with throttle status as 0 to
+ * indicate all OCCs are active after a reset.
+ */
+ occ_msg.type = cpu_to_be64(OCC_THROTTLE);
+ occ_msg.chip = 0;
+ occ_msg.throttle_status = 0;
+ rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, NULL,
+ sizeof(struct opal_occ_msg),
+ &occ_msg);
+ if (!rc)
+ occ_reset = false;
+ }
+ } else {
+ if (occ_opal_msg_outstanding)
+ goto done;
+ for_each_chip(chip) {
+ u8 throttle;
+
+ occ_data = get_occ_pstate_table(chip);
+ throttle = get_cpu_throttle(chip);
+ if ((occ_data->valid == 1) &&
+ (chip->throttle != throttle) &&
+ (throttle <= OCC_MAX_THROTTLE_STATUS)) {
+ occ_msg.type = cpu_to_be64(OCC_THROTTLE);
+ occ_msg.chip = cpu_to_be64(chip->id);
+ occ_msg.throttle_status = cpu_to_be64(throttle);
+ rc = _opal_queue_msg(OPAL_MSG_OCC, NULL,
+ occ_msg_consumed,
+ sizeof(struct opal_occ_msg),
+ &occ_msg);
+ if (!rc) {
+ chip->throttle = throttle;
+ occ_opal_msg_outstanding = true;
+ break;
+ }
+ }
+ }
+ }
+done:
+ unlock(&occ_lock);
+}
+
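+/*
+ * Note on the poller above: it serves two purposes. While occ_reset is
+ * set it waits for every chip's pstate table to become valid again and
+ * then queues a single OCC_THROTTLE message with a zero status to tell
+ * the host that all OCCs are back. Otherwise it reports per-chip
+ * throttle status changes, one message at a time, using
+ * occ_opal_msg_outstanding to avoid queueing a new notification before
+ * the previous one has been consumed.
+ */
+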
+/* OPAL-OCC Command/Response Interface */
+
+enum occ_state {
+ OCC_STATE_NOT_RUNNING = 0x00,
+ OCC_STATE_STANDBY = 0x01,
+ OCC_STATE_OBSERVATION = 0x02,
+ OCC_STATE_ACTIVE = 0x03,
+ OCC_STATE_SAFE = 0x04,
+ OCC_STATE_CHARACTERIZATION = 0x05,
+};
+
+enum occ_role {
+ OCC_ROLE_SLAVE = 0x0,
+ OCC_ROLE_MASTER = 0x1,
+};
+
+enum occ_cmd {
+ OCC_CMD_CLEAR_SENSOR_DATA,
+ OCC_CMD_SET_POWER_CAP,
+ OCC_CMD_SET_POWER_SHIFTING_RATIO,
+ OCC_CMD_SELECT_SENSOR_GROUP,
+};
+
+struct opal_occ_cmd_info {
+ enum occ_cmd cmd;
+ u8 cmd_value;
+ u16 cmd_size;
+ u16 rsp_size;
+ int timeout_ms;
+ u16 state_mask;
+ u8 role_mask;
+};
+
+static struct opal_occ_cmd_info occ_cmds[] = {
+ { OCC_CMD_CLEAR_SENSOR_DATA,
+ 0xD0, 4, 4, 1000,
+ PPC_BIT16(OCC_STATE_OBSERVATION) |
+ PPC_BIT16(OCC_STATE_ACTIVE) |
+ PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+ PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
+ },
+ { OCC_CMD_SET_POWER_CAP,
+ 0xD1, 2, 2, 1000,
+ PPC_BIT16(OCC_STATE_OBSERVATION) |
+ PPC_BIT16(OCC_STATE_ACTIVE) |
+ PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+ PPC_BIT8(OCC_ROLE_MASTER)
+ },
+ { OCC_CMD_SET_POWER_SHIFTING_RATIO,
+ 0xD2, 1, 1, 1000,
+ PPC_BIT16(OCC_STATE_OBSERVATION) |
+ PPC_BIT16(OCC_STATE_ACTIVE) |
+ PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+ PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
+ },
+ { OCC_CMD_SELECT_SENSOR_GROUP,
+ 0xD3, 2, 2, 1000,
+ PPC_BIT16(OCC_STATE_OBSERVATION) |
+ PPC_BIT16(OCC_STATE_ACTIVE) |
+ PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+ PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
+ },
+};
+
+enum occ_response_status {
+ OCC_RSP_SUCCESS = 0x00,
+ OCC_RSP_INVALID_COMMAND = 0x11,
+ OCC_RSP_INVALID_CMD_DATA_LENGTH = 0x12,
+ OCC_RSP_INVALID_DATA = 0x13,
+ OCC_RSP_INTERNAL_ERROR = 0x15,
+};
+
+#define OCC_FLAG_RSP_READY 0x01
+#define OCC_FLAG_CMD_IN_PROGRESS 0x02
+#define OPAL_FLAG_CMD_READY 0x80
+
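+/*
+ * Rough sketch of the command/response handshake implied by the flags
+ * above: OPAL fills the command buffer and sets OPAL_FLAG_CMD_READY
+ * (see write_occ_cmd() below); the OCC sets OCC_FLAG_CMD_IN_PROGRESS
+ * while it is working on the command and OCC_FLAG_RSP_READY once the
+ * response buffer holds a valid reply, at which point handle_occ_rsp()
+ * completes the caller's async token. A timer bounds the whole
+ * exchange in case the OCC never answers.
+ */
+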
+struct opal_occ_cmd_data {
+ u8 *data;
+ enum occ_cmd cmd;
+};
+
+static struct cmd_interface {
+ struct lock queue_lock;
+ struct timer timeout;
+ struct opal_occ_cmd_data *cdata;
+ struct opal_command_buffer *cmd;
+ struct occ_response_buffer *rsp;
+ u8 *occ_state;
+ u8 *valid;
+ u32 chip_id;
+ u32 token;
+ u16 enabled_sensor_mask;
+ u8 occ_role;
+ u8 request_id;
+ bool cmd_in_progress;
+ bool retry;
+} *chips;
+
+static int nr_occs;
+
+static inline struct cmd_interface *get_chip_cmd_interface(int chip_id)
+{
+ int i;
+
+ for (i = 0; i < nr_occs; i++)
+ if (chips[i].chip_id == chip_id)
+ return &chips[i];
+
+ return NULL;
+}
+
+static inline bool occ_in_progress(struct cmd_interface *chip)
+{
+ return (chip->rsp->flag == OCC_FLAG_CMD_IN_PROGRESS);
+}
+
+static int write_occ_cmd(struct cmd_interface *chip)
+{
+ struct opal_command_buffer *cmd = chip->cmd;
+ enum occ_cmd ocmd = chip->cdata->cmd;
+
+ if (!chip->retry && occ_in_progress(chip)) {
+ chip->cmd_in_progress = false;
+ return OPAL_BUSY;
+ }
+
+ cmd->flag = chip->rsp->flag = 0;
+ cmd->cmd = occ_cmds[ocmd].cmd_value;
+ cmd->request_id = chip->request_id++;
+ cmd->data_size = occ_cmds[ocmd].cmd_size;
+ memcpy(&cmd->data, chip->cdata->data, cmd->data_size);
+ cmd->flag = OPAL_FLAG_CMD_READY;
+
+ schedule_timer(&chip->timeout,
+ msecs_to_tb(occ_cmds[ocmd].timeout_ms));
+
+ return OPAL_ASYNC_COMPLETION;
+}
+
+static int64_t opal_occ_command(struct cmd_interface *chip, int token,
+ struct opal_occ_cmd_data *cdata)
+{
+ int rc;
+
+ if (!(*chip->valid) ||
+ (!(PPC_BIT16(*chip->occ_state) & occ_cmds[cdata->cmd].state_mask)))
+ return OPAL_HARDWARE;
+
+ if (!(PPC_BIT8(chip->occ_role) & occ_cmds[cdata->cmd].role_mask))
+ return OPAL_PERMISSION;
+
+ lock(&chip->queue_lock);
+ if (chip->cmd_in_progress) {
+ rc = OPAL_BUSY;
+ goto out;
+ }
+
+ chip->cdata = cdata;
+ chip->token = token;
+ chip->cmd_in_progress = true;
+ chip->retry = false;
+ rc = write_occ_cmd(chip);
+out:
+ unlock(&chip->queue_lock);
+ return rc;
+}
+
+static inline bool sanity_check_opal_cmd(struct opal_command_buffer *cmd,
+ struct cmd_interface *chip)
+{
+ return ((cmd->cmd == occ_cmds[chip->cdata->cmd].cmd_value) &&
+ (cmd->request_id == chip->request_id - 1) &&
+ (cmd->data_size == occ_cmds[chip->cdata->cmd].cmd_size));
+}
+
+static inline bool check_occ_rsp(struct opal_command_buffer *cmd,
+ struct occ_response_buffer *rsp)
+{
+ if (cmd->cmd != rsp->cmd) {
+ prlog(PR_DEBUG, "OCC: Command value mismatch in OCC response"
+ "rsp->cmd = %d cmd->cmd = %d\n", rsp->cmd, cmd->cmd);
+ return false;
+ }
+
+ if (cmd->request_id != rsp->request_id) {
+ prlog(PR_DEBUG, "OCC: Request ID mismatch in OCC response"
+ "rsp->request_id = %d cmd->request_id = %d\n",
+ rsp->request_id, cmd->request_id);
+ return false;
+ }
+
+ return true;
+}
+
+static inline void queue_occ_rsp_msg(int token, int rc)
+{
+ int ret;
+
+ ret = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
+ cpu_to_be64(token),
+ cpu_to_be64(rc));
+ if (ret)
+ prerror("OCC: Failed to queue OCC response status message\n");
+}
+
+static void occ_cmd_timeout_handler(struct timer *t __unused, void *data,
+ uint64_t now __unused)
+{
+ struct cmd_interface *chip = data;
+
+ lock(&chip->queue_lock);
+ if (!chip->cmd_in_progress)
+ goto exit;
+
+ if (!chip->retry) {
+ prlog(PR_DEBUG, "OCC: Command timeout, retrying\n");
+ chip->retry = true;
+ write_occ_cmd(chip);
+ } else {
+ chip->cmd_in_progress = false;
+ queue_occ_rsp_msg(chip->token, OPAL_TIMEOUT);
+ prlog(PR_DEBUG, "OCC: Command timeout after retry\n");
+ }
+exit:
+ unlock(&chip->queue_lock);
+}
+
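+/*
+ * Both the timeout path above and the command/response mismatch path
+ * in handle_occ_rsp() below retry a command exactly once; a second
+ * failure completes the caller's async token with OPAL_TIMEOUT or
+ * OPAL_INTERNAL_ERROR respectively.
+ */
+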
+static int read_occ_rsp(struct occ_response_buffer *rsp)
+{
+ switch (rsp->status) {
+ case OCC_RSP_SUCCESS:
+ return OPAL_SUCCESS;
+ case OCC_RSP_INVALID_COMMAND:
+ prlog(PR_DEBUG, "OCC: Rsp status: Invalid command\n");
+ break;
+ case OCC_RSP_INVALID_CMD_DATA_LENGTH:
+ prlog(PR_DEBUG, "OCC: Rsp status: Invalid command data length\n");
+ break;
+ case OCC_RSP_INVALID_DATA:
+ prlog(PR_DEBUG, "OCC: Rsp status: Invalid command data\n");
+ break;
+ case OCC_RSP_INTERNAL_ERROR:
+ prlog(PR_DEBUG, "OCC: Rsp status: OCC internal error\n");
+ break;
+ default:
+ break;
+ }
+
+ /* Clear the OCC response flag */
+ rsp->flag = 0;
+ return OPAL_INTERNAL_ERROR;
+}
+
+static void handle_occ_rsp(uint32_t chip_id)
+{
+ struct cmd_interface *chip;
+ struct opal_command_buffer *cmd;
+ struct occ_response_buffer *rsp;
+
+ chip = get_chip_cmd_interface(chip_id);
+ if (!chip)
+ return;
+
+ cmd = chip->cmd;
+ rsp = chip->rsp;
+
+ /* Read rsp */
+ if (rsp->flag != OCC_FLAG_RSP_READY)
+ return;
+ lock(&chip->queue_lock);
+ if (!chip->cmd_in_progress)
+ goto exit;
+
+ cancel_timer(&chip->timeout);
+ if (!sanity_check_opal_cmd(cmd, chip) ||
+ !check_occ_rsp(cmd, rsp)) {
+ if (!chip->retry) {
+ prlog(PR_DEBUG, "OCC: Command-response mismatch, retrying\n");
+ chip->retry = true;
+ write_occ_cmd(chip);
+ } else {
+ chip->cmd_in_progress = false;
+ queue_occ_rsp_msg(chip->token, OPAL_INTERNAL_ERROR);
+ prlog(PR_DEBUG, "OCC: Command-response mismatch\n");
+ }
+ goto exit;
+ }
+
+ if (rsp->cmd == occ_cmds[OCC_CMD_SELECT_SENSOR_GROUP].cmd_value &&
+ rsp->status == OCC_RSP_SUCCESS)
+ chip->enabled_sensor_mask = *(u16 *)chip->cdata->data;
+
+ chip->cmd_in_progress = false;
+ queue_occ_rsp_msg(chip->token, read_occ_rsp(chip->rsp));
+exit:
+ unlock(&chip->queue_lock);
+}
+
+bool occ_get_gpu_presence(struct proc_chip *chip, int gpu_num)
+{
+ struct occ_dynamic_data *ddata;
+ static int max_retries = 20;
+ static bool found = false;
+
+ assert(gpu_num <= 2);
+
+ ddata = get_occ_dynamic_data(chip);
+ while (!found && max_retries) {
+ if (ddata->major_version == 0 && ddata->minor_version >= 1) {
+ found = true;
+ break;
+ }
+ time_wait_ms(100);
+ max_retries--;
+ ddata = get_occ_dynamic_data(chip);
+ }
+
+ if (!found) {
+ prlog(PR_INFO, "OCC: No GPU slot presence, assuming GPU present\n");
+ return true;
+ }
+
+ return (bool)(ddata->gpus_present & 1 << gpu_num);
+}
+
+static void occ_add_powercap_sensors(struct dt_node *power_mgt);
+static void occ_add_psr_sensors(struct dt_node *power_mgt);
+
+static void occ_cmd_interface_init(void)
+{
+ struct occ_dynamic_data *data;
+ struct occ_pstate_table *pdata;
+ struct dt_node *power_mgt;
+ struct proc_chip *chip;
+ int i = 0, major;
+
+ /* Check if the OCC data is valid */
+ for_each_chip(chip) {
+ pdata = get_occ_pstate_table(chip);
+ if (!pdata->valid)
+ return;
+ }
+
+ chip = next_chip(NULL);
+ pdata = get_occ_pstate_table(chip);
+ major = pdata->version >> 4;
+ if (major != 0x9 && major != 0xA)
+ return;
+
+ for_each_chip(chip)
+ nr_occs++;
+
+ chips = malloc(sizeof(*chips) * nr_occs);
+ assert(chips);
+
+ for_each_chip(chip) {
+ pdata = get_occ_pstate_table(chip);
+ data = get_occ_dynamic_data(chip);
+ chips[i].chip_id = chip->id;
+ chips[i].occ_state = &data->occ_state;
+ chips[i].valid = &pdata->valid;
+ chips[i].cmd = &data->cmd;
+ chips[i].rsp = &data->rsp;
+ switch (major) {
+ case 0x9:
+ chips[i].occ_role = pdata->v9.occ_role;
+ break;
+ case 0xA:
+ chips[i].occ_role = pdata->v10.occ_role;
+ break;
+ }
+ init_lock(&chips[i].queue_lock);
+ chips[i].cmd_in_progress = false;
+ chips[i].request_id = 0;
+ chips[i].enabled_sensor_mask = OCC_ENABLED_SENSOR_MASK;
+ init_timer(&chips[i].timeout, occ_cmd_timeout_handler,
+ &chips[i]);
+ i++;
+ }
+
+ power_mgt = dt_find_by_path(dt_root, "/ibm,opal/power-mgt");
+ if (!power_mgt) {
+ prerror("OCC: dt node /ibm,opal/power-mgt not found\n");
+ return;
+ }
+
+ /* Add powercap sensors to DT */
+ occ_add_powercap_sensors(power_mgt);
+
+ /* Add power-shifting-ratio CPU-GPU sensors to DT */
+ occ_add_psr_sensors(power_mgt);
+}
+
+/* Powercap interface */
+enum sensor_powercap_occ_attr {
+ POWERCAP_OCC_SOFT_MIN,
+ POWERCAP_OCC_MAX,
+ POWERCAP_OCC_CUR,
+ POWERCAP_OCC_HARD_MIN,
+};
+
+static void occ_add_powercap_sensors(struct dt_node *power_mgt)
+{
+ struct dt_node *pcap, *node;
+ u32 handle;
+
+ pcap = dt_new(power_mgt, "powercap");
+ if (!pcap) {
+ prerror("OCC: Failed to create powercap node\n");
+ return;
+ }
+
+ dt_add_property_string(pcap, "compatible", "ibm,opal-powercap");
+ node = dt_new(pcap, "system-powercap");
+ if (!node) {
+ prerror("OCC: Failed to create system powercap node\n");
+ return;
+ }
+
+ handle = powercap_make_handle(POWERCAP_CLASS_OCC, POWERCAP_OCC_CUR);
+ dt_add_property_cells(node, "powercap-current", handle);
+
+ handle = powercap_make_handle(POWERCAP_CLASS_OCC,
+ POWERCAP_OCC_SOFT_MIN);
+ dt_add_property_cells(node, "powercap-min", handle);
+
+ handle = powercap_make_handle(POWERCAP_CLASS_OCC, POWERCAP_OCC_MAX);
+ dt_add_property_cells(node, "powercap-max", handle);
+
+ handle = powercap_make_handle(POWERCAP_CLASS_OCC,
+ POWERCAP_OCC_HARD_MIN);
+ dt_add_property_cells(node, "powercap-hard-min", handle);
+
+}
+
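+/*
+ * The resulting device tree fragment looks roughly like this (the
+ * actual property values are the opaque handles generated by
+ * powercap_make_handle() above):
+ *
+ *   power-mgt {
+ *           powercap {
+ *                   compatible = "ibm,opal-powercap";
+ *                   system-powercap {
+ *                           powercap-current = < handle >;
+ *                           powercap-min = < handle >;
+ *                           powercap-max = < handle >;
+ *                           powercap-hard-min = < handle >;
+ *                   };
+ *           };
+ *   };
+ */
+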
+int occ_get_powercap(u32 handle, u32 *pcap)
+{
+ struct occ_pstate_table *pdata;
+ struct occ_dynamic_data *ddata;
+ struct proc_chip *chip;
+
+ chip = next_chip(NULL);
+ pdata = get_occ_pstate_table(chip);
+ ddata = get_occ_dynamic_data(chip);
+
+ if (!pdata->valid)
+ return OPAL_HARDWARE;
+
+ switch (powercap_get_attr(handle)) {
+ case POWERCAP_OCC_SOFT_MIN:
+ *pcap = ddata->soft_min_pwr_cap;
+ break;
+ case POWERCAP_OCC_MAX:
+ *pcap = ddata->max_pwr_cap;
+ break;
+ case POWERCAP_OCC_CUR:
+ *pcap = ddata->cur_pwr_cap;
+ break;
+ case POWERCAP_OCC_HARD_MIN:
+ *pcap = ddata->hard_min_pwr_cap;
+ break;
+ default:
+ *pcap = 0;
+ return OPAL_UNSUPPORTED;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static u16 pcap_cdata;
+static struct opal_occ_cmd_data pcap_data = {
+ .data = (u8 *)&pcap_cdata,
+ .cmd = OCC_CMD_SET_POWER_CAP,
+};
+
+int __attribute__((__const__)) occ_set_powercap(u32 handle, int token, u32 pcap)
+{
+ struct occ_dynamic_data *ddata;
+ struct proc_chip *chip;
+ int i;
+
+ if (powercap_get_attr(handle) != POWERCAP_OCC_CUR)
+ return OPAL_PERMISSION;
+
+ if (!chips)
+ return OPAL_HARDWARE;
+
+ for (i = 0; i < nr_occs; i++)
+ if (chips[i].occ_role == OCC_ROLE_MASTER)
+ break;
+
+ if (!(*chips[i].valid))
+ return OPAL_HARDWARE;
+
+ chip = get_chip(chips[i].chip_id);
+ ddata = get_occ_dynamic_data(chip);
+
+ if (pcap == ddata->cur_pwr_cap)
+ return OPAL_SUCCESS;
+
+ if (pcap && (pcap > ddata->max_pwr_cap ||
+ pcap < ddata->soft_min_pwr_cap))
+ return OPAL_PARAMETER;
+
+ pcap_cdata = pcap;
+ return opal_occ_command(&chips[i], token, &pcap_data);
+};
+
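+/*
+ * Usage note: occ_set_powercap() is asynchronous. When the new cap is
+ * accepted the call returns OPAL_ASYNC_COMPLETION and the final status
+ * is delivered later through the async token via queue_occ_rsp_msg(),
+ * once the master OCC responds (or the command times out). Only the
+ * "current" powercap attribute can be set here; min/max/hard-min are
+ * read-only.
+ */
+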
+/* Power-Shifting Ratio */
+enum psr_type {
+ PSR_TYPE_CPU_TO_GPU, /* 0% Cap GPU first, 100% Cap CPU first */
+};
+
+int occ_get_psr(u32 handle, u32 *ratio)
+{
+ struct occ_dynamic_data *ddata;
+ struct proc_chip *chip;
+ u8 i = psr_get_rid(handle);
+
+ if (psr_get_type(handle) != PSR_TYPE_CPU_TO_GPU)
+ return OPAL_UNSUPPORTED;
+
+ if (i > nr_occs)
+ return OPAL_UNSUPPORTED;
+
+ if (!(*chips[i].valid))
+ return OPAL_HARDWARE;
+
+ chip = get_chip(chips[i].chip_id);
+ ddata = get_occ_dynamic_data(chip);
+ *ratio = ddata->pwr_shifting_ratio;
+ return OPAL_SUCCESS;
+}
+
+static u8 psr_cdata;
+static struct opal_occ_cmd_data psr_data = {
+ .data = &psr_cdata,
+ .cmd = OCC_CMD_SET_POWER_SHIFTING_RATIO,
+};
+
+int occ_set_psr(u32 handle, int token, u32 ratio)
+{
+ struct occ_dynamic_data *ddata;
+ struct proc_chip *chip;
+ u8 i = psr_get_rid(handle);
+
+ if (psr_get_type(handle) != PSR_TYPE_CPU_TO_GPU)
+ return OPAL_UNSUPPORTED;
+
+ if (ratio > 100)
+ return OPAL_PARAMETER;
+
+ if (i > nr_occs)
+ return OPAL_UNSUPPORTED;
+
+ if (!(*chips[i].valid))
+ return OPAL_HARDWARE;
+
+ chip = get_chip(chips[i].chip_id);
+ ddata = get_occ_dynamic_data(chip);
+ if (ratio == ddata->pwr_shifting_ratio)
+ return OPAL_SUCCESS;
+
+ psr_cdata = ratio;
+ return opal_occ_command(&chips[i], token, &psr_data);
+}
+
+static void occ_add_psr_sensors(struct dt_node *power_mgt)
+{
+ struct dt_node *node;
+ int i;
+
+ node = dt_new(power_mgt, "psr");
+ if (!node) {
+ prerror("OCC: Failed to create power-shifting-ratio node\n");
+ return;
+ }
+
+ dt_add_property_string(node, "compatible",
+ "ibm,opal-power-shift-ratio");
+ dt_add_property_cells(node, "#address-cells", 1);
+ dt_add_property_cells(node, "#size-cells", 0);
+ for (i = 0; i < nr_occs; i++) {
+ struct dt_node *cnode;
+ char name[20];
+ u32 handle = psr_make_handle(PSR_CLASS_OCC, i,
+ PSR_TYPE_CPU_TO_GPU);
+
+ cnode = dt_new_addr(node, "cpu-to-gpu", handle);
+ if (!cnode) {
+ prerror("OCC: Failed to create power-shifting-ratio node\n");
+ return;
+ }
+
+ snprintf(name, 20, "cpu_to_gpu_%d", chips[i].chip_id);
+ dt_add_property_string(cnode, "label", name);
+ dt_add_property_cells(cnode, "handle", handle);
+ dt_add_property_cells(cnode, "reg", chips[i].chip_id);
+ }
+}
+
+/* OCC clear sensor limits CSM/Profiler/Job-scheduler */
+
+enum occ_sensor_limit_group {
+ OCC_SENSOR_LIMIT_GROUP_CSM = 0x10,
+ OCC_SENSOR_LIMIT_GROUP_PROFILER = 0x20,
+ OCC_SENSOR_LIMIT_GROUP_JOB_SCHED = 0x40,
+};
+
+static u32 sensor_limit;
+static struct opal_occ_cmd_data slimit_data = {
+ .data = (u8 *)&sensor_limit,
+ .cmd = OCC_CMD_CLEAR_SENSOR_DATA,
+};
+
+int occ_sensor_group_clear(u32 group_hndl, int token)
+{
+ u32 limit = sensor_get_rid(group_hndl);
+ u8 i = sensor_get_attr(group_hndl);
+
+ if (i > nr_occs)
+ return OPAL_UNSUPPORTED;
+
+ switch (limit) {
+ case OCC_SENSOR_LIMIT_GROUP_CSM:
+ case OCC_SENSOR_LIMIT_GROUP_PROFILER:
+ case OCC_SENSOR_LIMIT_GROUP_JOB_SCHED:
+ break;
+ default:
+ return OPAL_UNSUPPORTED;
+ }
+
+ if (!(*chips[i].valid))
+ return OPAL_HARDWARE;
+
+ sensor_limit = limit << 24;
+ return opal_occ_command(&chips[i], token, &slimit_data);
+}
+
+static u16 sensor_enable;
+static struct opal_occ_cmd_data sensor_mask_data = {
+ .data = (u8 *)&sensor_enable,
+ .cmd = OCC_CMD_SELECT_SENSOR_GROUP,
+};
+
+int occ_sensor_group_enable(u32 group_hndl, int token, bool enable)
+{
+ u16 type = sensor_get_rid(group_hndl);
+ u8 i = sensor_get_attr(group_hndl);
+
+ if (i > nr_occs)
+ return OPAL_UNSUPPORTED;
+
+ switch (type) {
+ case OCC_SENSOR_TYPE_GENERIC:
+ case OCC_SENSOR_TYPE_CURRENT:
+ case OCC_SENSOR_TYPE_VOLTAGE:
+ case OCC_SENSOR_TYPE_TEMPERATURE:
+ case OCC_SENSOR_TYPE_UTILIZATION:
+ case OCC_SENSOR_TYPE_TIME:
+ case OCC_SENSOR_TYPE_FREQUENCY:
+ case OCC_SENSOR_TYPE_POWER:
+ case OCC_SENSOR_TYPE_PERFORMANCE:
+ break;
+ default:
+ return OPAL_UNSUPPORTED;
+ }
+
+ if (!(*chips[i].valid))
+ return OPAL_HARDWARE;
+
+ if (enable && (type & chips[i].enabled_sensor_mask))
+ return OPAL_SUCCESS;
+ else if (!enable && !(type & chips[i].enabled_sensor_mask))
+ return OPAL_SUCCESS;
+
+ sensor_enable = enable ? type | chips[i].enabled_sensor_mask :
+ ~type & chips[i].enabled_sensor_mask;
+
+ return opal_occ_command(&chips[i], token, &sensor_mask_data);
+}
+
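+/*
+ * Worked example for the mask update above, assuming the
+ * OCC_SENSOR_TYPE_* values are single bit flags (which is how the
+ * arithmetic treats them): if enabled_sensor_mask currently contains
+ * OCC_SENSOR_TYPE_POWER | OCC_SENSOR_TYPE_TEMPERATURE and a request
+ * comes in to disable the power group, the command data becomes
+ * ~POWER & mask, i.e. just OCC_SENSOR_TYPE_TEMPERATURE; once the OCC
+ * acknowledges the command, handle_occ_rsp() stores that value back
+ * into enabled_sensor_mask.
+ */
+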
+void occ_add_sensor_groups(struct dt_node *sg, __be32 *phandles, u32 *ptype,
+ int nr_phandles, int chipid)
+{
+ struct group_info {
+ int type;
+ const char *str;
+ u32 ops;
+ } groups[] = {
+ { OCC_SENSOR_LIMIT_GROUP_CSM, "csm",
+ OPAL_SENSOR_GROUP_CLEAR
+ },
+ { OCC_SENSOR_LIMIT_GROUP_PROFILER, "profiler",
+ OPAL_SENSOR_GROUP_CLEAR
+ },
+ { OCC_SENSOR_LIMIT_GROUP_JOB_SCHED, "js",
+ OPAL_SENSOR_GROUP_CLEAR
+ },
+ { OCC_SENSOR_TYPE_GENERIC, "generic",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_CURRENT, "curr",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_VOLTAGE, "in",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_TEMPERATURE, "temp",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_UTILIZATION, "utilization",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_TIME, "time",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_FREQUENCY, "frequency",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_POWER, "power",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ { OCC_SENSOR_TYPE_PERFORMANCE, "performance",
+ OPAL_SENSOR_GROUP_ENABLE
+ },
+ };
+ int i, j;
+
+ /*
+ * Don't add sensor groups if the cmd-interface is not initialized
+ */
+ if (!chips)
+ return;
+
+ for (i = 0; i < nr_occs; i++)
+ if (chips[i].chip_id == chipid)
+ break;
+
+ for (j = 0; j < ARRAY_SIZE(groups); j++) {
+ struct dt_node *node;
+ char name[20];
+ u32 handle;
+
+ snprintf(name, 20, "occ-%s", groups[j].str);
+ handle = sensor_make_handler(SENSOR_OCC, 0,
+ groups[j].type, i);
+ node = dt_new_addr(sg, name, handle);
+ if (!node) {
+ prerror("Failed to create sensor group nodes\n");
+ return;
+ }
+
+ dt_add_property_cells(node, "sensor-group-id", handle);
+ dt_add_property_string(node, "type", groups[j].str);
+
+ if (groups[j].type == OCC_SENSOR_TYPE_CURRENT ||
+ groups[j].type == OCC_SENSOR_TYPE_VOLTAGE ||
+ groups[j].type == OCC_SENSOR_TYPE_TEMPERATURE ||
+ groups[j].type == OCC_SENSOR_TYPE_POWER) {
+ dt_add_property_string(node, "sensor-type",
+ groups[j].str);
+ dt_add_property_string(node, "compatible",
+ "ibm,opal-sensor");
+ }
+
+ dt_add_property_cells(node, "ibm,chip-id", chipid);
+ dt_add_property_cells(node, "reg", handle);
+ if (groups[j].ops == OPAL_SENSOR_GROUP_ENABLE) {
+ __be32 *_phandles;
+ int k, pcount = 0;
+
+ _phandles = malloc(sizeof(u32) * nr_phandles);
+ assert(_phandles);
+ for (k = 0; k < nr_phandles; k++)
+ if (ptype[k] == groups[j].type)
+ _phandles[pcount++] = phandles[k];
+ if (pcount)
+ dt_add_property(node, "sensors", _phandles,
+ pcount * sizeof(u32));
+ free(_phandles);
+ } else {
+ dt_add_property(node, "sensors", phandles,
+ nr_phandles * sizeof(u32));
+ }
+ dt_add_property_cells(node, "ops", groups[j].ops);
+ }
+}
+
+/* CPU-OCC PState init */
+/* Called after OCC init on P8, P9 and P10 */
+void occ_pstates_init(void)
+{
+ struct proc_chip *chip;
+ struct cpu_thread *c;
+ struct dt_node *power_mgt;
+ int pstate_nom;
+ u32 freq_domain_mask;
+ u8 domain_runs_at;
+ static bool occ_pstates_initialized;
+
+ power_mgt = dt_find_by_path(dt_root, "/ibm,opal/power-mgt");
+ if (!power_mgt) {
+ /**
+ * @fwts-label OCCDTNodeNotFound
+ * @fwts-advice Device tree node /ibm,opal/power-mgt not
+ * found. OPAL didn't add pstate information to device tree.
+ * Probably a firmware bug.
+ */
+ prlog(PR_ERR, "OCC: dt node /ibm,opal/power-mgt not found\n");
+ return;
+ }
+
+ /* Handle fast reboots */
+ if (occ_pstates_initialized) {
+ struct dt_node *child;
+ int i;
+ const char *props[] = {
+ "ibm,pstate-core-max",
+ "ibm,pstate-frequencies-mhz",
+ "ibm,pstate-ids",
+ "ibm,pstate-max",
+ "ibm,pstate-min",
+ "ibm,pstate-nominal",
+ "ibm,pstate-turbo",
+ "ibm,pstate-ultra-turbo",
+ "ibm,pstate-base",
+ "#address-cells",
+ "#size-cells",
+ };
+
+ for (i = 0; i < ARRAY_SIZE(props); i++)
+ dt_check_del_prop(power_mgt, props[i]);
+
+ dt_for_each_child(power_mgt, child)
+ if (!strncmp(child->name, "occ", 3))
+ dt_free(child);
+ }
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ homer_opal_data_offset = P8_HOMER_OPAL_DATA_OFFSET;
+ break;
+ case proc_gen_p9:
+ case proc_gen_p10:
+ homer_opal_data_offset = P9_HOMER_OPAL_DATA_OFFSET;
+ break;
+ default:
+ return;
+ }
+
+ chip = next_chip(NULL);
+ if (!chip->homer_base) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "OCC: No HOMER detected, assuming no pstates\n");
+ return;
+ }
+
+ /* Wait for all OCC to boot up */
+ if (!wait_for_all_occ_init()) {
+ log_simple_error(&e_info(OPAL_RC_OCC_TIMEOUT),
+ "OCC: Initialization on all chips did not complete"
+ "(timed out)\n");
+ return;
+ }
+
+ /*
+ * Check boundary conditions and add device tree nodes
+ * and return nominal pstate to set for the core
+ */
+ if (!add_cpu_pstate_properties(power_mgt, &pstate_nom)) {
+ log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
+ "Skiping core cpufreq init due to OCC error\n");
+ } else if (proc_gen == proc_gen_p8) {
+ /*
+ * Setup host based pstates and set nominal frequency only in
+ * P8.
+ */
+ for_each_chip(chip)
+ for_each_available_core_in_chip(c, chip->id)
+ cpu_pstates_prepare_core(chip, c, pstate_nom);
+ }
+
+ if (occ_pstates_initialized)
+ return;
+
+ /* Add opal_poller to poll OCC throttle status of each chip */
+ for_each_chip(chip)
+ chip->throttle = 0;
+ opal_add_poller(occ_throttle_poll, NULL);
+ occ_pstates_initialized = true;
+
+ /* Init OPAL-OCC command-response interface */
+ occ_cmd_interface_init();
+
+ /* TODO Firmware plumbing required so as to have two modes to set
+ * PMCR based on max in domain or most recently used. As of today,
+ * it is always max in domain for P9.
+ */
+ domain_runs_at = 0;
+ freq_domain_mask = 0;
+ if (proc_gen == proc_gen_p8) {
+ freq_domain_mask = P8_PIR_CORE_MASK;
+ domain_runs_at = FREQ_MOST_RECENTLY_SET;
+ } else if (proc_gen == proc_gen_p9) {
+ freq_domain_mask = P9_PIR_QUAD_MASK;
+ domain_runs_at = FREQ_MAX_IN_DOMAIN;
+ } else if (proc_gen == proc_gen_p10) {
+ freq_domain_mask = P10_PIR_CHIP_MASK;
+ domain_runs_at = FREQ_MAX_IN_DOMAIN;
+ } else {
+ assert(0);
+ }
+
+ dt_add_property_cells(power_mgt, "freq-domain-mask", freq_domain_mask);
+ dt_add_property_cells(power_mgt, "domain-runs-at", domain_runs_at);
+}
+
+int find_master_and_slave_occ(uint64_t **master, uint64_t **slave,
+ int *nr_masters, int *nr_slaves)
+{
+ struct proc_chip *chip;
+ int nr_chips = 0, i;
+ uint64_t chipids[MAX_CHIPS];
+
+ for_each_chip(chip) {
+ chipids[nr_chips++] = chip->id;
+ }
+
+ chip = next_chip(NULL);
+ /*
+ * Proc0 is the master OCC for Tuleta/Alpine boxes.
+ * Hostboot expects the pair of chips for MURANO, so pass the sibling
+ * chip id along with proc0 to hostboot.
+ */
+ *nr_masters = (chip->type == PROC_CHIP_P8_MURANO) ? 2 : 1;
+ *master = (uint64_t *)malloc(*nr_masters * sizeof(uint64_t));
+
+ if (!*master) {
+ printf("OCC: master array alloc failure\n");
+ return -ENOMEM;
+ }
+
+ if (nr_chips - *nr_masters > 0) {
+ *nr_slaves = nr_chips - *nr_masters;
+ *slave = (uint64_t *)malloc(*nr_slaves * sizeof(uint64_t));
+ if (!*slave) {
+ printf("OCC: slave array alloc failure\n");
+ return -ENOMEM;
+ }
+ }
+
+ for (i = 0; i < nr_chips; i++) {
+ if (i < *nr_masters) {
+ *(*master + i) = chipids[i];
+ continue;
+ }
+ *(*slave + i - *nr_masters) = chipids[i];
+ }
+ return 0;
+}
+
+
+int occ_msg_queue_occ_reset(void)
+{
+ struct opal_occ_msg occ_msg = { CPU_TO_BE64(OCC_RESET), 0, 0 };
+ struct proc_chip *chip;
+ int rc;
+
+ lock(&occ_lock);
+ rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, NULL,
+ sizeof(struct opal_occ_msg), &occ_msg);
+ if (rc) {
+ prlog(PR_INFO, "OCC: Failed to queue OCC_RESET message\n");
+ goto out;
+ }
+ /*
+ * Set 'valid' byte of occ_pstate_table to 0 since OCC
+ * may not clear this byte on a reset.
+ * OCC will set the 'valid' byte to 1 when it becomes
+ * active again.
+ */
+ for_each_chip(chip) {
+ struct occ_pstate_table *occ_data;
+
+ occ_data = get_occ_pstate_table(chip);
+ occ_data->valid = 0;
+ chip->throttle = 0;
+ }
+ occ_reset = true;
+out:
+ unlock(&occ_lock);
+ return rc;
+}
+
+#define PV_OCC_GP0 0x01000000
+#define PV_OCC_GP0_AND 0x01000004
+#define PV_OCC_GP0_OR 0x01000005
+#define PV_OCC_GP0_PNOR_OWNER PPC_BIT(18) /* 1 = OCC / Host, 0 = BMC */
+
+static void occ_pnor_set_one_owner(uint32_t chip_id, enum pnor_owner owner)
+{
+ uint64_t reg, mask;
+
+ if (owner == PNOR_OWNER_HOST) {
+ reg = PV_OCC_GP0_OR;
+ mask = PV_OCC_GP0_PNOR_OWNER;
+ } else {
+ reg = PV_OCC_GP0_AND;
+ mask = ~PV_OCC_GP0_PNOR_OWNER;
+ }
+
+ xscom_write(chip_id, reg, mask);
+}
+
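+/*
+ * The OR/AND forms of the GP0 register give an atomic set/clear of
+ * individual bits without a read-modify-write: writing the owner bit
+ * to the OR address above hands the PNOR to the host/OCC side, while
+ * writing its complement to the AND address clears the bit and hands
+ * the flash back to the BMC.
+ */
+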
+void occ_pnor_set_owner(enum pnor_owner owner)
+{
+ struct proc_chip *chip;
+
+ for_each_chip(chip)
+ occ_pnor_set_one_owner(chip->id, owner);
+}
+
+
+#define P8_OCB_OCI_OCCMISC 0x6a020
+#define P8_OCB_OCI_OCCMISC_AND 0x6a021
+#define P8_OCB_OCI_OCCMISC_OR 0x6a022
+
+#define P9_OCB_OCI_OCCMISC 0x6c080
+#define P9_OCB_OCI_OCCMISC_CLEAR 0x6c081
+#define P9_OCB_OCI_OCCMISC_OR 0x6c082
+
+#define OCB_OCI_OCIMISC_IRQ PPC_BIT(0)
+#define OCB_OCI_OCIMISC_IRQ_TMGT PPC_BIT(1)
+#define OCB_OCI_OCIMISC_IRQ_SLW_TMR PPC_BIT(14)
+#define OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY PPC_BIT(15)
+
+#define P8_OCB_OCI_OCIMISC_MASK (OCB_OCI_OCIMISC_IRQ_TMGT | \
+ OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY | \
+ OCB_OCI_OCIMISC_IRQ_SLW_TMR)
+
+#define OCB_OCI_OCIMISC_IRQ_I2C PPC_BIT(2)
+#define OCB_OCI_OCIMISC_IRQ_SHMEM PPC_BIT(3)
+#define P9_OCB_OCI_OCIMISC_MASK (OCB_OCI_OCIMISC_IRQ_TMGT | \
+ OCB_OCI_OCIMISC_IRQ_I2C | \
+ OCB_OCI_OCIMISC_IRQ_SHMEM | \
+ OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY)
+
+void occ_send_dummy_interrupt(void)
+{
+ struct psi *psi;
+ struct proc_chip *chip = get_chip(this_cpu()->chip_id);
+
+ /* Emulators don't do this */
+ if (chip_quirk(QUIRK_NO_OCC_IRQ))
+ return;
+
+ /* Find a functional PSI. This ensures an interrupt even if
+ * the psihb on the current chip is not configured */
+ if (chip->psi)
+ psi = chip->psi;
+ else
+ psi = psi_find_functional_chip();
+
+ if (!psi) {
+ prlog_once(PR_WARNING, "PSI: no functional PSI HB found, "
+ "no self interrupts delivered\n");
+ return;
+ }
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ xscom_write(psi->chip_id, P8_OCB_OCI_OCCMISC_OR,
+ OCB_OCI_OCIMISC_IRQ |
+ OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY);
+ break;
+ case proc_gen_p9:
+ xscom_write(psi->chip_id, P9_OCB_OCI_OCCMISC_OR,
+ OCB_OCI_OCIMISC_IRQ |
+ OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY);
+ break;
+ case proc_gen_p10:
+ xscom_write(psi->chip_id, P9_OCB_OCI_OCCMISC_OR,
+ OCB_OCI_OCIMISC_IRQ |
+ OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY);
+ break;
+ default:
+ break;
+ }
+}
+
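+/*
+ * The "dummy" source bit written above is not tied to any hardware
+ * event; raising OCB_OCI_OCIMISC_IRQ with only that bit set is, in
+ * effect, a way for OPAL to interrupt itself through the OCC/PSI
+ * interrupt path. The handlers below clear it along with the other
+ * source bits but do not dispatch anything for it.
+ */
+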
+void occ_p8_interrupt(uint32_t chip_id)
+{
+ uint64_t ireg;
+ int64_t rc;
+
+ /* The OCC interrupt is used to mux up to 15 different sources */
+ rc = xscom_read(chip_id, P8_OCB_OCI_OCCMISC, &ireg);
+ if (rc) {
+ prerror("OCC: Failed to read interrupt status !\n");
+ /* Should we mask it in the XIVR ? */
+ return;
+ }
+ prlog(PR_TRACE, "OCC: IRQ received: %04llx\n", ireg >> 48);
+
+ /* Clear the bits */
+ xscom_write(chip_id, P8_OCB_OCI_OCCMISC_AND, ~ireg);
+
+ /* Dispatch */
+ if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT)
+ prd_tmgt_interrupt(chip_id);
+ if (ireg & OCB_OCI_OCIMISC_IRQ_SLW_TMR)
+ check_timers(true);
+
+ /* We may have masked-out OCB_OCI_OCIMISC_IRQ in the previous
+ * OCCMISC_AND write. Check if there are any new source bits set,
+ * and trigger another interrupt if so.
+ */
+ rc = xscom_read(chip_id, P8_OCB_OCI_OCCMISC, &ireg);
+ if (!rc && (ireg & P8_OCB_OCI_OCIMISC_MASK))
+ xscom_write(chip_id, P8_OCB_OCI_OCCMISC_OR,
+ OCB_OCI_OCIMISC_IRQ);
+}
+
+void occ_p9_interrupt(uint32_t chip_id)
+{
+ u64 ireg;
+ s64 rc;
+
+ /* The OCC interrupt is used to mux up to 15 different sources */
+ rc = xscom_read(chip_id, P9_OCB_OCI_OCCMISC, &ireg);
+ if (rc) {
+ prerror("OCC: Failed to read interrupt status !\n");
+ return;
+ }
+ prlog(PR_TRACE, "OCC: IRQ received: %04llx\n", ireg >> 48);
+
+ /* Clear the bits */
+ xscom_write(chip_id, P9_OCB_OCI_OCCMISC_CLEAR, ireg);
+
+ /* Dispatch */
+ if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT)
+ prd_tmgt_interrupt(chip_id);
+
+ if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM) {
+ occ_throttle_poll(NULL);
+ handle_occ_rsp(chip_id);
+ }
+
+ if (ireg & OCB_OCI_OCIMISC_IRQ_I2C)
+ p9_i2c_bus_owner_change(chip_id);
+
+ /* We may have masked-out OCB_OCI_OCIMISC_IRQ in the previous
+ * OCCMISC_AND write. Check if there are any new source bits set,
+ * and trigger another interrupt if so.
+ */
+ rc = xscom_read(chip_id, P9_OCB_OCI_OCCMISC, &ireg);
+ if (!rc && (ireg & P9_OCB_OCI_OCIMISC_MASK))
+ xscom_write(chip_id, P9_OCB_OCI_OCCMISC_OR,
+ OCB_OCI_OCIMISC_IRQ);
+}
diff --git a/roms/skiboot/hw/ocmb.c b/roms/skiboot/hw/ocmb.c
new file mode 100644
index 000000000..bc470d0ab
--- /dev/null
+++ b/roms/skiboot/hw/ocmb.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Open Capi Memory Buffer chip
+ *
+ * Copyright 2020 IBM Corp.
+ */
+
+
+#define pr_fmt(fmt) "OCMB: " fmt
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <device.h>
+#include <ocmb.h>
+#include <io.h>
+#include <inttypes.h>
+
+struct ocmb_range {
+ uint64_t start;
+ uint64_t end;
+ uint64_t flags;
+
+ /* flags come from hdat */
+#define ACCESS_8B PPC_BIT(0)
+#define ACCESS_4B PPC_BIT(1)
+#define ACCESS_SIZE_MASK (ACCESS_8B | ACCESS_4B)
+};
+
+struct ocmb {
+ struct scom_controller scom;
+ int range_count;
+ struct ocmb_range ranges[];
+};
+
+static const struct ocmb_range *find_range(const struct ocmb *o, uint64_t offset)
+{
+ int i;
+ uint64_t addr = offset & ~(HRMOR_BIT);
+
+ for (i = 0; i < o->range_count; i++) {
+ uint64_t start = o->ranges[i].start;
+ uint64_t end = o->ranges[i].end;
+
+ if (addr >= start && addr <= end)
+ return &o->ranges[i];
+ }
+
+ return NULL;
+}
+
+static int64_t ocmb_fake_scom_write(struct scom_controller *f,
+ uint32_t __unused chip_id,
+ uint64_t offset, uint64_t val)
+{
+ const struct ocmb *o = f->private;
+ const struct ocmb_range *r;
+
+ r = find_range(o, offset);
+ if (!r) {
+ prerror("no matching address range!\n");
+ return OPAL_XSCOM_ADDR_ERROR;
+ }
+
+ switch (r->flags & ACCESS_SIZE_MASK) {
+ case ACCESS_8B:
+ if (offset & 0x7)
+ return OPAL_XSCOM_ADDR_ERROR;
+ out_be64((void *) offset, val);
+ break;
+
+ case ACCESS_4B:
+ if (offset & 0x3)
+ return OPAL_XSCOM_ADDR_ERROR;
+ out_be32((void *) offset, val);
+ break;
+ default:
+ prerror("bad flags? %llx\n", r->flags);
+ return OPAL_XSCOM_ADDR_ERROR;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t ocmb_fake_scom_read(struct scom_controller *f,
+ uint32_t chip_id __unused,
+ uint64_t offset, uint64_t *val)
+{
+ const struct ocmb *o = f->private;
+ const struct ocmb_range *r = NULL;
+
+ r = find_range(o, offset);
+ if (!r) {
+ prerror("no matching address range!\n");
+ return OPAL_XSCOM_ADDR_ERROR;
+ }
+
+
+ switch (r->flags & ACCESS_SIZE_MASK) {
+ case ACCESS_8B:
+ if (offset & 0x7)
+ return OPAL_XSCOM_ADDR_ERROR;
+ *val = in_be64((void *) offset);
+ break;
+
+ case ACCESS_4B:
+ if (offset & 0x3)
+ return OPAL_XSCOM_ADDR_ERROR;
+ *val = in_be32((void *) offset);
+ break;
+ default:
+ prerror("bad flags? %llx\n", r->flags);
+ return OPAL_XSCOM_ADDR_ERROR;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static bool ocmb_probe_one(struct dt_node *ocmb_node)
+{
+ uint64_t chip_id = dt_prop_get_u32(ocmb_node, "ibm,chip-id");
+ const struct dt_property *flags;
+ int i = 0, num = 0;
+ struct ocmb *ocmb;
+
+ num = dt_count_addresses(ocmb_node);
+
+ ocmb = zalloc(sizeof(*ocmb) + sizeof(*ocmb->ranges) * num);
+ if (!ocmb)
+ return false;
+
+ ocmb->scom.private = ocmb;
+ ocmb->scom.part_id = chip_id;
+ ocmb->scom.write = ocmb_fake_scom_write;
+ ocmb->scom.read = ocmb_fake_scom_read;
+ ocmb->range_count = num;
+
+ flags = dt_require_property(ocmb_node, "flags", sizeof(u64) * num);
+
+ for (i = 0; i < num; i++) {
+ uint64_t start, size;
+
+ start = dt_get_address(ocmb_node, i, &size);
+
+ ocmb->ranges[i].start = start;
+ ocmb->ranges[i].end = start + size - 1;
+ ocmb->ranges[i].flags = dt_property_get_u64(flags, i);
+
+ prlog(PR_DEBUG, "Added range: %" PRIx64 " - [%llx - %llx]\n",
+ chip_id, start, start + size - 1);
+ }
+
+ if (scom_register(&ocmb->scom))
+ prerror("Error registering fake scom\n");
+
+ dt_add_property(ocmb_node, "scom-controller", NULL, 0);
+ prlog(PR_NOTICE, "Added scom controller for %s\n", ocmb_node->name);
+
+ return true;
+}
+
+void ocmb_init(void)
+{
+ struct dt_node *dn;
+
+ dt_for_each_compatible(dt_root, dn, "ibm,explorer")
+ ocmb_probe_one(dn);
+}
diff --git a/roms/skiboot/hw/p8-i2c.c b/roms/skiboot/hw/p8-i2c.c
new file mode 100644
index 000000000..45815858e
--- /dev/null
+++ b/roms/skiboot/hw/p8-i2c.c
@@ -0,0 +1,1688 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * P8 i2c master
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#undef DEBUG
+
+#include <opal.h>
+#include <skiboot.h>
+#include <mem_region-malloc.h>
+#include <lock.h>
+#include <chip.h>
+#include <i2c.h>
+#include <xscom.h>
+#include <timebase.h>
+#include <timer.h>
+#include <opal-msg.h>
+#include <errorlog.h>
+#include <centaur.h>
+#include <debug_descriptor.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_I2C_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_I2C,
+ OPAL_IO_SUBSYSTEM, OPAL_PREDICTIVE_ERR_DEGRADED_PERF,
+ OPAL_NA);
+DEFINE_LOG_ENTRY(OPAL_RC_I2C_START_REQ, OPAL_INPUT_OUTPUT_ERR_EVT, OPAL_I2C,
+ OPAL_IO_SUBSYSTEM, OPAL_INFO, OPAL_NA);
+DEFINE_LOG_ENTRY(OPAL_RC_I2C_TIMEOUT, OPAL_INPUT_OUTPUT_ERR_EVT, OPAL_I2C,
+ OPAL_IO_SUBSYSTEM, OPAL_INFO, OPAL_NA);
+DEFINE_LOG_ENTRY(OPAL_RC_I2C_TRANSFER, OPAL_INPUT_OUTPUT_ERR_EVT, OPAL_I2C,
+ OPAL_IO_SUBSYSTEM, OPAL_INFO, OPAL_NA);
+DEFINE_LOG_ENTRY(OPAL_RC_I2C_RESET, OPAL_INPUT_OUTPUT_ERR_EVT, OPAL_I2C,
+ OPAL_IO_SUBSYSTEM, OPAL_INFO, OPAL_NA);
+
+#ifdef DEBUG
+#define DBG(fmt...) prlog(PR_ERR, "I2C: " fmt)
+#define I2C_TIMEOUT_IRQ_MS 100 /* 100ms/byte timeout */
+#define I2C_TIMEOUT_POLL_MS 4000 /* 4s/byte timeout */
+#else
+#define DBG(fmt...) prlog(PR_TRACE, "I2C: " fmt)
+#define I2C_TIMEOUT_IRQ_MS 1 /* 1ms/byte timeout */
+#define I2C_TIMEOUT_POLL_MS 4000 /* 4s/byte timeout */
+#endif
+
+/* How long to keep the sensor cache disabled after an access
+ * in milliseconds
+ */
+#define SENSOR_CACHE_EN_DELAY 10
+
+#define USEC_PER_SEC 1000000
+#define USEC_PER_MSEC 1000
+#define I2C_RESET_DELAY_MS 5 /* 5 msecs */
+#define I2C_FIFO_HI_LVL 4
+#define I2C_FIFO_LO_LVL 4
+
+/*
+ * I2C registers set.
+ * Below is the offset of registers from base which is stored in the
+ * 'struct p8_i2c_master'
+ */
+
+/* I2C FIFO register */
+#define I2C_FIFO_REG 0x4
+#define I2C_FIFO PPC_BITMASK(0, 7)
+
+/* I2C command register */
+#define I2C_CMD_REG 0x5
+#define I2C_CMD_WITH_START PPC_BIT(0)
+#define I2C_CMD_WITH_ADDR PPC_BIT(1)
+#define I2C_CMD_READ_CONT PPC_BIT(2)
+#define I2C_CMD_WITH_STOP PPC_BIT(3)
+#define I2C_CMD_INTR_STEERING PPC_BITMASK(6,7) /* P9 */
+#define I2C_CMD_INTR_STEER_HOST 1
+#define I2C_CMD_INTR_STEER_OCC 2
+#define I2C_CMD_DEV_ADDR PPC_BITMASK(8, 14)
+#define I2C_CMD_READ_NOT_WRITE PPC_BIT(15)
+#define I2C_CMD_LEN_BYTES PPC_BITMASK(16, 31)
+#define I2C_MAX_TFR_LEN 0xfff0ull
+
+/* I2C mode register */
+#define I2C_MODE_REG 0x6
+#define I2C_MODE_BIT_RATE_DIV PPC_BITMASK(0, 15)
+#define I2C_MODE_PORT_NUM PPC_BITMASK(16, 21)
+#define I2C_MODE_ENHANCED PPC_BIT(28)
+#define I2C_MODE_DIAGNOSTIC PPC_BIT(29)
+#define I2C_MODE_PACING_ALLOW PPC_BIT(30)
+#define I2C_MODE_WRAP PPC_BIT(31)
+
+/* I2C watermark register */
+#define I2C_WATERMARK_REG 0x7
+#define I2C_WATERMARK_HIGH PPC_BITMASK(16, 19)
+#define I2C_WATERMARK_LOW PPC_BITMASK(24, 27)
+
+/*
+ * I2C interrupt mask and condition registers
+ *
+ * NB: The function of 0x9 and 0xa changes depending on whether you're reading
+ * or writing to them. When read they return the interrupt condition bits
+ * and on writes they update the interrupt mask register.
+ *
+ * The bit definitions are the same for all the interrupt registers.
+ */
+#define I2C_INTR_MASK_REG 0x8
+
+#define I2C_INTR_RAW_COND_REG 0x9 /* read */
+#define I2C_INTR_MASK_OR_REG 0x9 /* write */
+
+#define I2C_INTR_COND_REG 0xa /* read */
+#define I2C_INTR_MASK_AND_REG 0xa /* write */
+
+#define I2C_INTR_ALL PPC_BITMASK(16, 31)
+#define I2C_INTR_INVALID_CMD PPC_BIT(16)
+#define I2C_INTR_LBUS_PARITY_ERR PPC_BIT(17)
+#define I2C_INTR_BKEND_OVERRUN_ERR PPC_BIT(18)
+#define I2C_INTR_BKEND_ACCESS_ERR PPC_BIT(19)
+#define I2C_INTR_ARBT_LOST_ERR PPC_BIT(20)
+#define I2C_INTR_NACK_RCVD_ERR PPC_BIT(21)
+#define I2C_INTR_DATA_REQ PPC_BIT(22)
+#define I2C_INTR_CMD_COMP PPC_BIT(23)
+#define I2C_INTR_STOP_ERR PPC_BIT(24)
+#define I2C_INTR_I2C_BUSY PPC_BIT(25)
+#define I2C_INTR_NOT_I2C_BUSY PPC_BIT(26)
+#define I2C_INTR_SCL_EQ_1 PPC_BIT(28)
+#define I2C_INTR_SCL_EQ_0 PPC_BIT(29)
+#define I2C_INTR_SDA_EQ_1 PPC_BIT(30)
+#define I2C_INTR_SDA_EQ_0 PPC_BIT(31)
+
+/* I2C status register */
+#define I2C_RESET_I2C_REG 0xb
+#define I2C_RESET_ERRORS 0xc
+#define I2C_STAT_REG 0xb
+#define I2C_STAT_INVALID_CMD PPC_BIT(0)
+#define I2C_STAT_LBUS_PARITY_ERR PPC_BIT(1)
+#define I2C_STAT_BKEND_OVERRUN_ERR PPC_BIT(2)
+#define I2C_STAT_BKEND_ACCESS_ERR PPC_BIT(3)
+#define I2C_STAT_ARBT_LOST_ERR PPC_BIT(4)
+#define I2C_STAT_NACK_RCVD_ERR PPC_BIT(5)
+#define I2C_STAT_DATA_REQ PPC_BIT(6)
+#define I2C_STAT_CMD_COMP PPC_BIT(7)
+#define I2C_STAT_STOP_ERR PPC_BIT(8)
+#define I2C_STAT_UPPER_THRS PPC_BITMASK(9, 15)
+#define I2C_STAT_ANY_I2C_INTR PPC_BIT(16)
+#define I2C_STAT_PORT_HISTORY_BUSY PPC_BIT(19)
+#define I2C_STAT_SCL_INPUT_LEVEL PPC_BIT(20)
+#define I2C_STAT_SDA_INPUT_LEVEL PPC_BIT(21)
+#define I2C_STAT_PORT_BUSY PPC_BIT(22)
+#define I2C_STAT_INTERFACE_BUSY PPC_BIT(23)
+#define I2C_STAT_FIFO_ENTRY_COUNT PPC_BITMASK(24, 31)
+
+#define I2C_STAT_ANY_ERR (I2C_STAT_INVALID_CMD | I2C_STAT_LBUS_PARITY_ERR | \
+ I2C_STAT_BKEND_OVERRUN_ERR | \
+ I2C_STAT_BKEND_ACCESS_ERR | I2C_STAT_ARBT_LOST_ERR | \
+ I2C_STAT_NACK_RCVD_ERR | I2C_STAT_STOP_ERR)
+
+
+#define I2C_INTR_ACTIVE \
+ ((I2C_STAT_ANY_ERR >> 16) | I2C_INTR_CMD_COMP | I2C_INTR_DATA_REQ)
+
+/* Pseudo-status used for timeouts */
+#define I2C_STAT_PSEUDO_TIMEOUT PPC_BIT(63)
+
+
+/* I2C extended status register */
+#define I2C_EXTD_STAT_REG 0xc
+#define I2C_EXTD_STAT_FIFO_SIZE PPC_BITMASK(0, 7)
+#define I2C_EXTD_STAT_MSM_CURSTATE PPC_BITMASK(11, 15)
+#define I2C_EXTD_STAT_SCL_IN_SYNC PPC_BIT(16)
+#define I2C_EXTD_STAT_SDA_IN_SYNC PPC_BIT(17)
+#define I2C_EXTD_STAT_S_SCL PPC_BIT(18)
+#define I2C_EXTD_STAT_S_SDA PPC_BIT(19)
+#define I2C_EXTD_STAT_M_SCL PPC_BIT(20)
+#define I2C_EXTD_STAT_M_SDA PPC_BIT(21)
+#define I2C_EXTD_STAT_HIGH_WATER PPC_BIT(22)
+#define I2C_EXTD_STAT_LOW_WATER PPC_BIT(23)
+#define I2C_EXTD_STAT_I2C_BUSY PPC_BIT(24)
+#define I2C_EXTD_STAT_SELF_BUSY PPC_BIT(25)
+#define I2C_EXTD_STAT_I2C_VERSION PPC_BITMASK(27, 31)
+
+/* I2C residual front end/back end length */
+#define I2C_RESIDUAL_LEN_REG 0xd
+#define I2C_RESIDUAL_FRONT_END PPC_BITMASK(0, 15)
+#define I2C_RESIDUAL_BACK_END PPC_BITMASK(16, 31)
+
+/* Port busy register */
+#define I2C_PORT_BUSY_REG 0xe
+#define I2C_SET_S_SCL_REG 0xd
+#define I2C_RESET_S_SCL_REG 0xf
+#define I2C_SET_S_SDA_REG 0x10
+#define I2C_RESET_S_SDA_REG 0x11
+
+enum p8_i2c_master_type {
+ I2C_POWER8,
+ I2C_CENTAUR,
+ MAX_I2C_TYPE,
+};
+
+struct p8_i2c_master {
+ struct dt_node *dt_node;
+ struct lock lock; /* Lock to guard the members */
+ enum p8_i2c_master_type type; /* P8 vs. Centaur */
+ uint64_t start_time; /* Request start time */
+ uint64_t last_update;
+ uint64_t poll_interval; /* Polling interval */
+ uint64_t xscom_base; /* xscom base of i2cm */
+ uint32_t fifo_size; /* Maximum size of FIFO */
+ uint32_t chip_id; /* Chip the i2cm sits on */
+ uint32_t engine_id; /* Engine# on chip */
+ uint8_t obuf[4]; /* Offset buffer */
+ uint32_t bytes_sent;
+ bool irq_ok; /* Interrupt working ? */
+ bool occ_cache_dis; /* I have disabled the cache */
+ bool occ_lock_acquired; /* Acquired lock from OCC */
+ enum request_state {
+ state_idle,
+ state_occache_dis,
+ state_offset,
+ state_data,
+ state_error,
+ state_recovery,
+ } state;
+ struct list_head req_list; /* Request queue head */
+ struct timer poller;
+ struct timer timeout;
+ struct timer recovery;
+ struct timer sensor_cache;
+ uint8_t recovery_pass;
+ struct list_node link;
+ struct list_head ports;
+};
+
+struct p8_i2c_master_port {
+ struct i2c_bus bus; /* Abstract bus struct for the client */
+ struct p8_i2c_master *master;
+ uint32_t port_num;
+ uint32_t bit_rate_div; /* Divisor to set bus speed */
+ uint64_t byte_timeout; /* Timeout per byte */
+ uint64_t poll_interval; /* Polling interval */
+ struct list_node link;
+};
+
+static int occ_i2c_unlock(struct p8_i2c_master *master);
+
+static int64_t i2cm_read_reg(struct p8_i2c_master *m, int reg, uint64_t *val)
+{
+ return xscom_read(m->chip_id, m->xscom_base + reg, val);
+}
+
+static int64_t i2cm_write_reg(struct p8_i2c_master *m, int reg, uint64_t val)
+{
+ return xscom_write(m->chip_id, m->xscom_base + reg, val);
+}
+
+static void p8_i2c_print_debug_info(struct p8_i2c_master_port *port,
+ struct i2c_request *req, uint64_t end_time)
+{
+ struct p8_i2c_master *master = port->master;
+ uint64_t cmd, mode, stat, estat, intm, intc;
+
+ /* Print master and request structure bits */
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Chip %08x Eng. %d Port %d--\n"
+ " xscom_base=0x%016llx\tstate=%d\tbytes_sent=%d\n",
+ master->chip_id, master->engine_id, port->port_num,
+ master->xscom_base, master->state, master->bytes_sent);
+
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Request info--\n"
+ " addr=0x%04x\toffset_bytes=%d\toffset=%d\tlen=%d\n",
+ req->dev_addr, req->offset_bytes, req->offset,
+ req->rw_len);
+
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: "
+ " start_time=%016llx end_time=%016llx (duration=%016llx)\n",
+ master->start_time, end_time, end_time - master->start_time);
+
+ /* initialise to some fake value in case of read errors */
+ cmd = mode = stat = estat = intm = intc = 0xDEAD;
+
+ /* Dump the current state of i2c registers */
+ i2cm_read_reg(master, I2C_CMD_REG, &cmd);
+ i2cm_read_reg(master, I2C_MODE_REG, &mode);
+ i2cm_read_reg(master, I2C_STAT_REG, &stat);
+ i2cm_read_reg(master, I2C_EXTD_STAT_REG, &estat);
+ i2cm_read_reg(master, I2C_INTR_MASK_REG, &intm);
+ i2cm_read_reg(master, I2C_INTR_RAW_COND_REG, &intc);
+
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Register dump--\n"
+ " cmd:0x%016llx\tmode:0x%016llx\tstat:0x%016llx\n"
+ " estat:0x%016llx\tintm:0x%016llx\tintc:0x%016llx\n",
+ cmd, mode, stat, estat, intm, intc);
+
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Error bits set: %s%s%s%s%s%s%s\n",
+ (stat & I2C_STAT_NACK_RCVD_ERR) ? "nack, " : "",
+ (stat & I2C_STAT_INVALID_CMD) ? "cmd invalid, " : "",
+ (stat & I2C_STAT_LBUS_PARITY_ERR) ? "internal parity, " : "",
+ (stat & I2C_STAT_BKEND_OVERRUN_ERR) ? "backend overrun, " : "",
+ (stat & I2C_STAT_BKEND_ACCESS_ERR) ? "backend access, " : "",
+ (stat & I2C_STAT_ARBT_LOST_ERR) ? "arbitration loss, " : "",
+ (stat & I2C_STAT_STOP_ERR) ? "stop error, " : "");
+}
+
+static bool p8_i2c_has_irqs(struct p8_i2c_master *master)
+{
+ struct proc_chip *chip;
+
+ /* Centaur I2C doesn't have interrupts */
+ if (master->type == I2C_CENTAUR)
+ return false;
+
+ chip = get_chip(master->chip_id);
+
+ /* The i2c interrupts were only added in Murano DD2.1 and Venice
+ * DD2.0. When operating without interrupts, we need to bump the
+ * timeouts as we rely solely on the polls from Linux, which can
+ * be up to 2s apart !
+ */
+ if (proc_gen >= proc_gen_p9)
+ return true;
+ else if (chip->type == PROC_CHIP_P8_MURANO)
+ return chip->ec_level >= 0x21;
+ else if (chip->type == PROC_CHIP_P8_VENICE)
+ return chip->ec_level >= 0x20;
+
+ return true;
+}
+
+static int p8_i2c_enable_irqs(struct p8_i2c_master *master)
+{
+ int rc;
+
+ /* enable interrupts we're interested in */
+ rc = i2cm_write_reg(master, I2C_INTR_MASK_OR_REG, I2C_INTR_ACTIVE);
+ if (rc)
+ prlog(PR_ERR, "I2C: Failed to enable the interrupts\n");
+
+ return rc;
+}
+
+static void p8_i2c_reset_timeout(struct p8_i2c_master *master,
+ struct i2c_request *req)
+{
+ uint64_t now = mftb();
+
+ master->last_update = now;
+ schedule_timer_at(&master->timeout, now + msecs_to_tb(req->timeout));
+}
+
+static int p8_i2c_prog_watermark(struct p8_i2c_master *master)
+{
+ uint64_t watermark;
+ int rc;
+
+ rc = xscom_read(master->chip_id, master->xscom_base + I2C_WATERMARK_REG,
+ &watermark);
+ if (rc) {
+ prlog(PR_ERR, "I2C: Failed to read the WATERMARK_REG\n");
+ return rc;
+ }
+
+ /* Set the high/low watermark */
+ watermark = SETFIELD(I2C_WATERMARK_HIGH, watermark, I2C_FIFO_HI_LVL);
+ watermark = SETFIELD(I2C_WATERMARK_LOW, watermark, I2C_FIFO_LO_LVL);
+ rc = xscom_write(master->chip_id, master->xscom_base +
+ I2C_WATERMARK_REG, watermark);
+ if (rc)
+ prlog(PR_ERR, "I2C: Failed to set high/low watermark level\n");
+
+ return rc;
+}
+
+static int p8_i2c_prog_mode(struct p8_i2c_master_port *port, bool enhanced_mode)
+{
+ struct p8_i2c_master *master = port->master;
+ uint64_t mode, omode;
+ int rc;
+
+ rc = xscom_read(master->chip_id, master->xscom_base +
+ I2C_MODE_REG, &mode);
+ if (rc) {
+ prlog(PR_ERR, "I2C: Failed to read the MODE_REG\n");
+ return rc;
+ }
+ omode = mode;
+ mode = SETFIELD(I2C_MODE_PORT_NUM, mode, port->port_num);
+ mode = SETFIELD(I2C_MODE_BIT_RATE_DIV, mode, port->bit_rate_div);
+ if (enhanced_mode)
+ mode |= I2C_MODE_ENHANCED;
+ else
+ mode &= ~I2C_MODE_ENHANCED;
+ if (mode == omode)
+ return 0;
+
+ rc = xscom_write(master->chip_id, master->xscom_base + I2C_MODE_REG,
+ mode);
+ if (rc)
+ prlog(PR_ERR, "I2C: Failed to write the MODE_REG\n");
+
+ return rc;
+}
+
+static void p8_i2c_complete_request(struct p8_i2c_master *master,
+ struct i2c_request *req, int ret)
+{
+ /* We only complete the current top level request */
+ assert(req == list_top(&master->req_list, struct i2c_request, link));
+
+ cancel_timer_async(&master->timeout);
+
+ list_del(&req->link);
+ master->state = state_idle;
+ req->result = ret;
+ req->req_state = i2c_req_done;
+
+ /* Schedule re-enabling of sensor cache */
+ if (master->occ_cache_dis)
+ schedule_timer(&master->sensor_cache,
+ msecs_to_tb(SENSOR_CACHE_EN_DELAY));
+
+ /* If we're done with i2c master, allow OCC to use it */
+ if (master->occ_lock_acquired && list_empty(&master->req_list))
+ occ_i2c_unlock(master);
+
+ unlock(&master->lock);
+ if (req->completion)
+ req->completion(ret, req);
+ /* req might have been freed at this point */
+ lock(&master->lock);
+}
+
+
+static int p8_i2c_engine_reset(struct p8_i2c_master_port *port)
+{
+ struct p8_i2c_master *master = port->master;
+ int rc;
+
+ /* Reset the i2c engine */
+ rc = xscom_write(master->chip_id, master->xscom_base +
+ I2C_RESET_I2C_REG, 0);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_RESET), "I2C: Failed "
+ "to reset the i2c engine\n");
+ return rc;
+ }
+
+ /* Reprogram the watermark and mode */
+ rc = p8_i2c_prog_watermark(port->master);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_RESET), "I2C: Failed to "
+ "program the WATERMARK_REG\n");
+ return rc;
+ }
+
+ rc = p8_i2c_prog_mode(port, false);
+ if (rc)
+ log_simple_error(&e_info(OPAL_RC_I2C_RESET), "I2C: Failed to "
+ "program the MODE_REG\n");
+
+ return rc;
+}
+
+static void p8_i2c_translate_error(struct i2c_request *req, uint64_t status)
+{
+ /* Assuming there are not more than one type of error simultaneously */
+ if (status & I2C_STAT_NACK_RCVD_ERR)
+ req->result = OPAL_I2C_NACK_RCVD;
+ else if (status & I2C_STAT_INVALID_CMD)
+ req->result = OPAL_I2C_INVALID_CMD;
+ else if (status & I2C_STAT_LBUS_PARITY_ERR)
+ req->result = OPAL_I2C_LBUS_PARITY;
+ else if (status & I2C_STAT_BKEND_OVERRUN_ERR)
+ req->result = OPAL_I2C_BKEND_OVERRUN;
+ else if (status & I2C_STAT_BKEND_ACCESS_ERR)
+ req->result = OPAL_I2C_BKEND_ACCESS;
+ else if (status & I2C_STAT_ARBT_LOST_ERR)
+ req->result = OPAL_I2C_ARBT_LOST;
+ else if (status & I2C_STAT_STOP_ERR)
+ req->result = OPAL_I2C_STOP_ERR;
+ else if (status & I2C_STAT_PSEUDO_TIMEOUT)
+ req->result = OPAL_I2C_TIMEOUT;
+}
+
+static int p8_i2c_reset_port(struct p8_i2c_master_port *p)
+{
+ struct p8_i2c_master *master = p->master;
+ int reset_loops, rc;
+ uint64_t status;
+
+ /* FIXME: this should be per-port rather than per-master */
+ master->state = state_error;
+
+ /*
+ * Put the master into enhanced STOP mode when recovering the
+ * port. This causes the master to send additional STOP conditions
+ * to work around some particularly stupid I2C devices and it's
+ * required on secure I2C masters since they will not send a bare
+ * stop condition.
+ */
+ rc = p8_i2c_prog_mode(p, true);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_RESET),
+ "I2C: Failed to enable enhanced mode\n");
+ return -1;
+ }
+
+ rc = xscom_write(master->chip_id, master->xscom_base +
+ I2C_CMD_REG, I2C_CMD_WITH_STOP);
+ if (rc)
+ goto err;
+
+ /* Wait for COMMAND COMPLETE */
+ for (reset_loops = 0; reset_loops < 10; reset_loops++) {
+ time_wait_ms(10);
+
+ rc = xscom_read(master->chip_id,
+ master->xscom_base + I2C_STAT_REG,
+ &status);
+ if (rc)
+ goto err;
+
+ if (status & I2C_STAT_CMD_COMP)
+ break;
+ }
+
+ if (status & I2C_STAT_CMD_COMP)
+ return 0;
+err:
+ prerror("I2C: Failed to reset c%de%dp%d\n",
+ master->chip_id, master->engine_id, p->port_num);
+ return -1;
+}
+
+static void p8_i2c_status_error(struct p8_i2c_master_port *port,
+ struct i2c_request *req,
+ uint64_t status, uint64_t end_time)
+{
+ struct p8_i2c_master *master = port->master;
+ int rc;
+
+ /* Display any error other than I2C_STAT_NACK_RCVD_ERR or a
+ * timeout, since getting NACKs is normal if Linux is probing
+ * the bus and timeouts will have already logged something.
+ */
+ if (!(status & (I2C_STAT_NACK_RCVD_ERR | I2C_STAT_PSEUDO_TIMEOUT))) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Transfer error occurred\n");
+ p8_i2c_print_debug_info(port, req, end_time);
+ } else if (status == I2C_STAT_PSEUDO_TIMEOUT) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TIMEOUT),
+ "I2C: request timed out!\n");
+ p8_i2c_print_debug_info(port, req, end_time);
+ }
+
+ p8_i2c_translate_error(req, status);
+
+ rc = p8_i2c_engine_reset(port);
+ if (rc)
+ goto exit;
+
+ if (status & (I2C_STAT_LBUS_PARITY_ERR | I2C_STAT_ARBT_LOST_ERR |
+ I2C_STAT_STOP_ERR)) {
+ /*
+ * Don't bother issuing a STOP command for these errors;
+ * just get rid of the current request and start off with
+ * the next one in the list
+ */
+ p8_i2c_complete_request(master, req, req->result);
+ } else {
+ if (p8_i2c_reset_port(port))
+ goto exit;
+ /* Enable the interrupt */
+ p8_i2c_enable_irqs(master);
+ }
+ return;
+
+exit:
+ p8_i2c_complete_request(master, req, req->result);
+}
+
+static int p8_i2c_fifo_read(struct p8_i2c_master *master,
+ uint8_t *buf, uint32_t count)
+{
+ uint64_t fifo;
+ uint32_t i;
+ int rc = 0;
+
+ for (i = 0; i < count; i++, buf++) {
+ rc = xscom_read(master->chip_id, master->xscom_base +
+ I2C_FIFO_REG, &fifo);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Failed to read the fifo\n");
+ break;
+ }
+
+ *buf = GETFIELD(I2C_FIFO, fifo);
+ }
+ return rc;
+}
+
+static int p8_i2c_fifo_write(struct p8_i2c_master *master,
+ uint8_t *buf, uint32_t count)
+{
+ uint64_t fifo;
+ uint32_t i;
+ int rc = 0;
+
+ for (i = 0; i < count; i++, buf++) {
+ fifo = SETFIELD(I2C_FIFO, 0ull, *buf);
+ rc = xscom_write(master->chip_id, master->xscom_base +
+ I2C_FIFO_REG, fifo);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Failed to write the fifo\n");
+ break;
+ }
+ }
+ return rc;
+}
+
+static void p8_i2c_status_data_request(struct p8_i2c_master *master,
+ struct i2c_request *req,
+ uint64_t status)
+{
+ uint32_t fifo_count, fifo_free, count;
+ uint8_t *buf;
+ int rc = 0;
+
+ fifo_count = GETFIELD(I2C_STAT_FIFO_ENTRY_COUNT, status);
+ fifo_free = master->fifo_size - fifo_count;
+
+ DBG("Data request, state=%d fifo_count=%d/%d bytes_sent=%d\n",
+ master->state, fifo_count, master->fifo_size, master->bytes_sent);
+
+ switch(master->state) {
+ case state_offset:
+ /* We assume the offset can always be written in one go */
+ if (fifo_free < req->offset_bytes) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Fifo too small for offset !\n");
+ rc = OPAL_HARDWARE;
+ } else {
+ rc = p8_i2c_fifo_write(master, master->obuf,
+ req->offset_bytes);
+ }
+
+ /* For reads, wait for the address phase to complete */
+ if (rc || req->op != SMBUS_WRITE)
+ break;
+
+ /* For writes, transition to data phase now */
+ master->state = state_data;
+ fifo_free -= req->offset_bytes;
+ /* Fall through */
+ case state_data:
+ /* Sanity check */
+ if (master->bytes_sent >= req->rw_len) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: "
+ "Data req with no data to send sent=%d "
+ "req=%d\n", master->bytes_sent,
+ req->rw_len);
+ rc = OPAL_HARDWARE;
+ break;
+ }
+
+ /* Get next chunk */
+ buf = req->rw_buf + master->bytes_sent;
+ count = req->rw_len - master->bytes_sent;
+
+ /* Check direction */
+ if (req->op == I2C_READ || req->op == SMBUS_READ) {
+ if (count > fifo_count)
+ count = fifo_count;
+ rc = p8_i2c_fifo_read(master, buf, count);
+ } else {
+ if (count > fifo_free)
+ count = fifo_free;
+ rc = p8_i2c_fifo_write(master, buf, count);
+ }
+ if (rc == 0)
+ master->bytes_sent += count;
+ break;
+ default:
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Invalid "
+ "state %d in data req !\n", master->state);
+ rc = OPAL_WRONG_STATE;
+ }
+
+ if (rc) {
+ p8_i2c_complete_request(master, req, rc);
+ } else {
+ p8_i2c_enable_irqs(master);
+ p8_i2c_reset_timeout(master, req);
+ }
+}
+
+static void p8_i2c_complete_offset(struct p8_i2c_master *master,
+ struct i2c_request *req)
+{
+ uint64_t cmd;
+ int rc = 0;
+
+ DBG("Completing offset phase\n");
+
+ /* If it's a write, we should only get here for empty
+ * write commands
+ */
+ if (req->op == SMBUS_WRITE && req->rw_len != 0) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Write "
+ "completion in offset state !\n");
+ rc = OPAL_HARDWARE;
+ goto complete;
+ }
+
+ /* Switch to data phase */
+ master->state = state_data;
+
+ /* If it's not a read command, or there are no data to read,
+ * then we complete the command
+ */
+ if (req->op != SMBUS_READ || req->rw_len == 0)
+ goto complete;
+
+ /* Otherwise, let's start the data phase */
+ cmd = I2C_CMD_WITH_START | I2C_CMD_WITH_ADDR |
+ I2C_CMD_WITH_STOP | I2C_CMD_READ_NOT_WRITE;
+ cmd = SETFIELD(I2C_CMD_DEV_ADDR, cmd, req->dev_addr);
+ cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd, req->rw_len);
+ cmd = SETFIELD(I2C_CMD_INTR_STEERING, cmd, I2C_CMD_INTR_STEER_HOST);
+
+ DBG("Command: %016llx, state: %d\n", cmd, master->state);
+
+ /* Send command */
+ rc = xscom_write(master->chip_id, master->xscom_base + I2C_CMD_REG,
+ cmd);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Failed "
+ "to write the CMD_REG\n");
+ goto complete;
+ }
+
+ /* Enable the interrupts */
+ p8_i2c_enable_irqs(master);
+ p8_i2c_reset_timeout(master, req);
+ return;
+
+ complete:
+ p8_i2c_complete_request(master, req, rc);
+}
+
+static void p8_i2c_status_cmd_completion(struct p8_i2c_master *master,
+ struct i2c_request *req,
+ uint64_t end_time __unused)
+{
+ int rc;
+
+ DBG("Command completion, state=%d bytes_sent=%d\n",
+ master->state, master->bytes_sent);
+ DBG(" start_time=%016llx end_time=%016llx (duration=%016llx)\n",
+ master->start_time, end_time, end_time - master->start_time);
+
+ /* If we completed an offset, we probably need to transition
+ * to a data read; check that it all makes sense
+ */
+ if (master->state == state_offset) {
+ p8_i2c_complete_offset(master, req);
+ return;
+ }
+
+ /* If we are not already in error state, check if we have
+ * completed our data transfer properly
+ */
+ if (master->state != state_error && master->bytes_sent != req->rw_len) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER), "I2C: Request "
+ "complete with residual data req=%d done=%d\n",
+ req->rw_len, master->bytes_sent);
+ /* Should we error out here ? */
+ }
+ rc = master->state == state_error ? req->result : OPAL_SUCCESS;
+ p8_i2c_complete_request(master, req, rc);
+}
+
+static void p8_i2c_check_status(struct p8_i2c_master *master)
+{
+ struct p8_i2c_master_port *port;
+ uint64_t status, deadline, now;
+ struct i2c_request *req;
+ int rc;
+
+ /*
+ * When idle or waiting for the occ to release the bus there's
+ * nothing to check. Also ignore recovery state, as the bus
+ * can be reset in that state, and a request can think it's
+ * complete when it just means the reset is complete.
+ * Error states are handled when starting a new request.
+ */
+ if (master->state == state_idle || master->state == state_occache_dis ||
+ master->state == state_recovery)
+ return;
+
+ /* A non-idle master should always have a pending request */
+ req = list_top(&master->req_list, struct i2c_request, link);
+ if (!req) {
+ prerror("I2C: Master is not idle and has no pending request\n");
+ return;
+ }
+
+ rc = i2cm_read_reg(master, I2C_STAT_REG, &status);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Failed to read the STAT_REG\n");
+ return;
+ }
+
+ /* mask interrupts while we're mucking with the master */
+ rc = i2cm_write_reg(master, I2C_INTR_MASK_AND_REG, ~I2C_INTR_ALL);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_TRANSFER),
+ "I2C: Failed to disable the interrupts\n");
+ return;
+ }
+
+ /* Get port for current request */
+ port = container_of(req->bus, struct p8_i2c_master_port, bus);
+ now = mftb();
+
+ deadline = master->last_update + msecs_to_tb(req->timeout);
+
+ if (status & I2C_STAT_ANY_ERR)
+ p8_i2c_status_error(port, req, status & I2C_STAT_ANY_ERR, now);
+ else if (status & I2C_STAT_DATA_REQ)
+ p8_i2c_status_data_request(master, req, status);
+ else if (status & I2C_STAT_CMD_COMP)
+ p8_i2c_status_cmd_completion(master, req, now);
+ else if (tb_compare(now, deadline) == TB_AAFTERB)
+ p8_i2c_status_error(port, req, I2C_STAT_PSEUDO_TIMEOUT, now);
+ else
+ p8_i2c_enable_irqs(master);
+}
+
+static int p8_i2c_check_initial_status(struct p8_i2c_master_port *port)
+{
+ struct p8_i2c_master *master = port->master;
+ uint64_t status, estat;
+ int rc;
+
+ master->recovery_pass++;
+
+ /* Read status register */
+ rc = xscom_read(master->chip_id, master->xscom_base + I2C_STAT_REG,
+ &status);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Failed "
+ "to read the STAT_REG\n");
+ return rc;
+ }
+
+ rc = xscom_read(master->chip_id,
+ master->xscom_base + I2C_EXTD_STAT_REG,
+ &estat);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Failed "
+ "to read the EXTD_STAT_REG\n");
+ return rc;
+ }
+ if (estat & (I2C_EXTD_STAT_I2C_BUSY | I2C_EXTD_STAT_SELF_BUSY)) {
+ DBG("Initial estat busy ! %016llx\n", estat);
+ /* Just a warning for now */
+ }
+
+ /* Any errors left over from a previous transfer ? Try to recover */
+ if (status & I2C_STAT_ANY_ERR) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: "
+ "Initial error status 0x%016llx\n", status);
+
+ if (master->recovery_pass > 1) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: "
+ "Error stuck, aborting !!\n");
+ return OPAL_HARDWARE;
+ }
+
+ /* Mark state as "recovery" to block any other activity */
+ master->state = state_recovery;
+
+ /* Reset the engine */
+ p8_i2c_engine_reset(port);
+
+ /* Delay 5ms for bus to settle */
+ schedule_timer(&master->recovery, msecs_to_tb(5));
+ return OPAL_BUSY;
+ }
+
+ /* Still busy ? */
+ if (!(status & I2C_STAT_CMD_COMP)) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Initial "
+ "command complete not set\n");
+
+ if (master->recovery_pass > 5) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: "
+ "Command stuck, aborting !!\n");
+ return OPAL_HARDWARE;
+ }
+
+
+ master->state = state_recovery;
+
+ /* Delay 5ms for bus to settle */
+ schedule_timer(&master->recovery, msecs_to_tb(5));
+ return OPAL_BUSY;
+ }
+
+ master->recovery_pass = 0;
+ return 0;
+}
+
+/*
+ * On POWER9, the OCC may also wish to use some of the i2cm engines,
+ * to do things like read sensor data. There are a couple of shared
+ * registers used to negotiate locking of the i2cm engines with the OCC.
+ * See occ/src/occ_405/lock/lock.c
+ */
+static bool occ_uses_master(struct p8_i2c_master *master)
+{
+ /* OCC uses I2CM Engines 1,2 and 3, only on POWER9/10 */
+ if (master->type == I2C_POWER8 && proc_gen >= proc_gen_p9)
+ return master->engine_id >= 1;
+
+ return false;
+}
+
+static uint32_t occflg;
+#define OCCFLG_BASE 0
+#define OCCFLG_CLEAR 1
+#define OCCFLG_SET 2
+
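+/*
+ * Layout of the lock bits in the OCC flag register, as implied by the
+ * busflag calculation below: each engine gets a host/OCC pair, i.e.
+ * engine 1 uses PPC bits 16 (host) and 17 (OCC), engine 2 bits 18/19,
+ * engine 3 bits 20/21. The "busflag >> 1" test in occ_i2c_lock() is
+ * therefore checking whether the OCC currently holds the same engine.
+ */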
+static int occ_i2c_lock(struct p8_i2c_master *master)
+{
+ u64 occflags, busflag;
+ int rc;
+
+ if (!occ_uses_master(master) || !occflg)
+ return 0;
+
+ if (master->occ_lock_acquired)
+ return 0;
+
+ rc = xscom_read(master->chip_id, occflg, &occflags);
+ if (rc) {
+ prerror("I2C: Failed to read OCC FLAG register\n");
+ return rc;
+ }
+
+ assert(master->engine_id > 0);
+
+ busflag = PPC_BIT(16 + (master->engine_id - 1) * 2);
+
+ DBG("I2C: c%de%d: occflags = %llx (locks = %x:%x:%x)\n",
+ master->chip_id, master->engine_id, (u64) occflags,
+ (u32) GETFIELD(PPC_BITMASK(16, 17), occflags),
+ (u32) GETFIELD(PPC_BITMASK(18, 19), occflags),
+ (u32) GETFIELD(PPC_BITMASK(20, 21), occflags));
+
+ rc = xscom_write(master->chip_id, occflg + OCCFLG_SET, busflag);
+ if (rc) {
+ prerror("I2C: Failed to write OCC FLAG register\n");
+ return rc;
+ }
+
+ /* If the OCC also has this bus locked then wait for IRQ */
+ if (occflags & (busflag >> 1)) {
+ DBG("I2C: c%de%d: Master in use by OCC\n",
+ master->chip_id, master->engine_id);
+ return 1;
+ }
+
+ master->occ_lock_acquired = true;
+
+ return 0;
+}
+
+static int occ_i2c_unlock(struct p8_i2c_master *master)
+{
+ u64 busflag, occflags;
+ int rc;
+
+ if (!occ_uses_master(master) || !occflg)
+ return 0;
+
+ rc = xscom_read(master->chip_id, occflg, &occflags);
+ if (rc) {
+ prerror("I2C: Failed to read OCC Flag register\n");
+ return rc;
+ }
+
+ busflag = PPC_BIT(16 + (master->engine_id - 1) * 2);
+
+ if (!(occflags & busflag)) {
+ DBG("I2C: spurious unlock for c%de%d, already cleared (flags = %.16llx)\n",
+ master->chip_id, master->engine_id, occflags);
+ }
+
+ rc = xscom_write(master->chip_id, occflg + OCCFLG_CLEAR, busflag);
+ if (rc)
+ prerror("I2C: Failed to write OCC Flag register\n");
+
+ master->occ_lock_acquired = false;
+
+ return rc;
+}
+
+static int p8_i2c_start_request(struct p8_i2c_master *master,
+ struct i2c_request *req)
+{
+ struct p8_i2c_master_port *port;
+ uint64_t cmd;
+ int64_t rc;
+
+ DBG("Starting req %d len=%d addr=%02x (offset=%x)\n",
+ req->op, req->rw_len, req->dev_addr, req->offset);
+
+ /* Get port */
+ port = container_of(req->bus, struct p8_i2c_master_port, bus);
+
+ /* Check if we need to disable the OCC cache first */
+ if (master->type == I2C_CENTAUR && !master->occ_cache_dis) {
+ DBG("Disabling OCC cache...\n");
+ rc = centaur_disable_sensor_cache(master->chip_id);
+
+ if (rc < 0) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ),
+ "I2C: Failed "
+ "to disable the sensor cache\n");
+ return rc;
+ }
+ master->occ_cache_dis = true;
+
+ /* Do we need to wait ? */
+ if (rc > 0) {
+ DBG("Waiting %lld\n", rc);
+ master->state = state_occache_dis;
+ schedule_timer(&master->recovery, rc);
+ return 0;
+ }
+ }
+
+ /*
+ * On P9 we need to set the "I2C master in use" bit so we don't
+ * conflict with the OCC's use of the i2c master.
+ */
+ rc = occ_i2c_lock(master);
+ if (rc < 0) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ),
+ "I2C: Failed to get I2CM lock from OCC\n");
+ return rc;
+ }
+ if (rc > 0) {
+ /* Wait for OCC IRQ */
+ master->state = state_occache_dis;
+ schedule_timer(&master->recovery, msecs_to_tb(10));
+ return 0;
+ }
+
+ /* Convert the offset if needed */
+ if (req->offset_bytes) {
+ int i;
+
+ for (i = 0; i < req->offset_bytes; i++) {
+ uint8_t b;
+
+ b = req->offset >> (8 * (req->offset_bytes - i - 1));
+ master->obuf[i] = b;
+ }
+ DBG("Offset %d bytes: %02x %02x %02x %02x\n",
+ req->offset_bytes, master->obuf[0], master->obuf[1],
+ master->obuf[2], master->obuf[3]);
+ }
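+ /*
+ * e.g. offset_bytes = 2 and offset = 0x1234 leave obuf[] holding
+ * { 0x12, 0x34 }, i.e. the device offset in big-endian byte order
+ * as it will be pushed into the FIFO during the offset phase.
+ */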
+
+ /* Program mode register */
+ rc = p8_i2c_prog_mode(port, false);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Failed "
+ "to program the MODE_REG\n");
+ return rc;
+ }
+
+ /* Check status */
+ rc = p8_i2c_check_initial_status(port);
+ if (rc != OPAL_BUSY)
+ master->recovery_pass = 0;
+ if (rc)
+ return rc;
+
+ /* program the watermark register */
+ rc = p8_i2c_prog_watermark(master);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_INIT),
+ "I2C: Failed to program the WATERMARK_REG\n");
+ return rc;
+ }
+
+ /* Initialize bytes_sent */
+ master->bytes_sent = 0;
+
+ /* Set up the command register */
+ cmd = I2C_CMD_WITH_START | I2C_CMD_WITH_ADDR;
+ cmd = SETFIELD(I2C_CMD_DEV_ADDR, cmd, req->dev_addr);
+ cmd = SETFIELD(I2C_CMD_INTR_STEERING, cmd, I2C_CMD_INTR_STEER_HOST);
+ switch (req->op) {
+ case I2C_READ:
+ cmd |= I2C_CMD_READ_NOT_WRITE;
+ /* Fall through */
+ case I2C_WRITE:
+ cmd |= I2C_CMD_WITH_STOP;
+ cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd, req->rw_len);
+ master->state = state_data;
+ break;
+ case SMBUS_READ:
+ cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd, req->offset_bytes);
+ master->state = state_offset;
+ break;
+ case SMBUS_WRITE:
+ cmd |= I2C_CMD_WITH_STOP;
+ cmd = SETFIELD(I2C_CMD_LEN_BYTES, cmd,
+ req->rw_len + req->offset_bytes);
+ master->state = state_offset;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
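+ /*
+ * Note: for SMBUS_READ the command built above only covers sending
+ * the offset bytes; the actual read (with its own START/STOP and
+ * the read length) is issued later from p8_i2c_complete_offset()
+ * once the offset phase completes.
+ */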
+ DBG("Command: %016llx, state: %d\n", cmd, master->state);
+
+ master->start_time = mftb();
+
+ /* Send command */
+ rc = xscom_write(master->chip_id, master->xscom_base + I2C_CMD_REG,
+ cmd);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_START_REQ), "I2C: Failed "
+ "to write the CMD_REG\n");
+ return rc;
+ }
+
+ /* Enable the interrupts */
+ p8_i2c_enable_irqs(master);
+
+ /* Run a poll timer for the boot case, or when interrupts
+ * aren't working
+ */
+ if (!opal_booting() && master->irq_ok)
+ master->poll_interval = TIMER_POLL;
+ else
+ master->poll_interval = port->poll_interval;
+ schedule_timer(&master->poller, master->poll_interval);
+
+ /* If we don't have a user-set timeout then use the master's default */
+ if (!req->timeout)
+ req->timeout = port->byte_timeout;
+
+ /* Start the timeout */
+ p8_i2c_reset_timeout(master, req);
+
+ return OPAL_SUCCESS;
+}
+
+static void p8_i2c_check_work(struct p8_i2c_master *master)
+{
+ struct i2c_request *req;
+ int rc;
+
+ while (master->state == state_idle && !list_empty(&master->req_list)) {
+ req = list_top(&master->req_list, struct i2c_request, link);
+ rc = p8_i2c_start_request(master, req);
+ if (rc) {
+ /*
+ * If it didn't work the first three times then
+ * odds are it's not going to work on the 4th.
+ */
+ if (rc && req->retries > 3)
+ p8_i2c_complete_request(master, req, rc);
+ else
+ req->retries++;
+ }
+ }
+}
+
+/* OCC IRQ Handler for I2C Ownership Change */
+void p9_i2c_bus_owner_change(u32 chip_id)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct p8_i2c_master *master = NULL;
+
+ assert(chip);
+ list_for_each(&chip->i2cms, master, link) {
+ lock(&master->lock);
+
+ /* spurious */
+ if (master->state != state_occache_dis)
+ goto done;
+
+ /* Can we now lock this master? */
+ if (occ_i2c_lock(master))
+ goto done;
+
+ /* clear the existing wait timer */
+ cancel_timer_async(&master->recovery);
+
+ /* re-start the request now that we own the master */
+ master->state = state_idle;
+
+ p8_i2c_check_work(master);
+ p8_i2c_check_status(master);
+done:
+ unlock(&master->lock);
+ }
+}
+
+static int p8_i2c_queue_request(struct i2c_request *req)
+{
+ struct i2c_bus *bus = req->bus;
+ struct p8_i2c_master_port *port =
+ container_of(bus, struct p8_i2c_master_port, bus);
+ struct p8_i2c_master *master = port->master;
+ int rc = 0;
+
+ /* Parameter check */
+ if (req->rw_len > I2C_MAX_TFR_LEN) {
+ prlog(PR_ERR, "I2C: Too large transfer %d bytes\n", req->rw_len);
+ return OPAL_PARAMETER;
+ }
+
+ if (req->offset_bytes > 4) {
+ prlog(PR_ERR, "I2C: Invalid offset size %d\n", req->offset_bytes);
+ return OPAL_PARAMETER;
+ }
+ lock(&master->lock);
+ list_add_tail(&master->req_list, &req->link);
+ p8_i2c_check_work(master);
+ unlock(&master->lock);
+
+ return rc;
+}
+
+static uint64_t p8_i2c_run_request(struct i2c_request *req)
+{
+ struct i2c_bus *bus = req->bus;
+ struct p8_i2c_master_port *port =
+ container_of(bus, struct p8_i2c_master_port, bus);
+ struct p8_i2c_master *master = port->master;
+ uint64_t poll_interval = 0;
+
+ lock(&master->lock);
+ p8_i2c_check_status(master);
+ p8_i2c_check_work(master);
+ poll_interval = master->poll_interval;
+ unlock(&master->lock);
+
+ return poll_interval;
+}
+
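+/*
+ * As a rough illustration: with the default 150 MHz local bus clock and
+ * a 100 kHz bus speed (the defaults used in p8_i2c_init_one_port()),
+ * the divisor below works out to (150000000 / 100000 - 1) / 4 = 374.
+ */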
+static inline uint32_t p8_i2c_get_bit_rate_divisor(uint32_t lb_freq,
+ uint32_t bus_speed)
+{
+ assert(bus_speed > 0);
+ return (((lb_freq / bus_speed) - 1) / 4);
+}
+
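+/*
+ * For example, at 100 kHz this gives 8 * 1000000 / (10 * 100000) = 8
+ * usecs between polls (assuming USEC_PER_SEC is 1000000).
+ */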
+static inline uint64_t p8_i2c_get_poll_interval(uint32_t bus_speed)
+{
+ uint64_t usec;
+
+ assert(bus_speed > 0);
+
+ /* Polling Interval = 8 * (1/bus_speed) * (1/10) -> convert to uSec */
+ usec = ((8 * USEC_PER_SEC) / (10 * bus_speed));
+ return usecs_to_tb(usec);
+}
+
+static void p8_i2c_timeout(struct timer *t __unused, void *data,
+ uint64_t __unused now)
+{
+ struct p8_i2c_master *master = data;
+
+ lock(&master->lock);
+
+ DBG("timeout on c%de%d\n", master->chip_id, master->engine_id);
+
+ /*
+ * Run through the usual status checks. It's possible to get spurious
+ * timeouts due to races between the interrupt/poller paths and the
+ * timeout handler. So we do all the checking, all the time.
+ */
+ p8_i2c_check_status(master);
+ p8_i2c_check_work(master);
+
+ unlock(&master->lock);
+}
+
+static void p8_i2c_recover(struct timer *t __unused, void *data,
+ uint64_t now __unused)
+{
+ struct p8_i2c_master *master = data;
+
+ lock(&master->lock);
+
+ /*
+ * The recovery timer can race with the OCC interrupt. If the interrupt
+ * comes in just before this is called, then we'll get a spurious
+ * timeout which we need to ignore.
+ */
+ if (master->state != state_recovery &&
+ master->state != state_occache_dis) {
+ unlock(&master->lock);
+ return;
+ }
+
+ master->state = state_idle;
+
+ /* We may or may not still have work pending; re-enable the sensor cache
+ * immediately if we don't (we just waited the recovery time, so there is
+ * little point in waiting longer).
+ */
+ if (master->occ_cache_dis && list_empty(&master->req_list)) {
+ DBG("Re-enabling OCC cache after recovery\n");
+ centaur_enable_sensor_cache(master->chip_id);
+ master->occ_cache_dis = false;
+ }
+
+ if (master->occ_lock_acquired && list_empty(&master->req_list))
+ occ_i2c_unlock(master);
+
+ /* Re-check for new work */
+ p8_i2c_check_work(master);
+ unlock(&master->lock);
+}
+
+static void p8_i2c_enable_scache(struct timer *t __unused, void *data,
+ uint64_t now __unused)
+{
+ struct p8_i2c_master *master = data;
+
+ lock(&master->lock);
+
+ /* Check if we are still idle */
+ if (master->state == state_idle && master->occ_cache_dis) {
+ DBG("Re-enabling OCC cache\n");
+ centaur_enable_sensor_cache(master->chip_id);
+ master->occ_cache_dis = false;
+ }
+ unlock(&master->lock);
+}
+
+static void p8_i2c_poll(struct timer *t __unused, void *data, uint64_t now)
+{
+ struct p8_i2c_master *master = data;
+
+ /*
+ * This is called when the interrupt isn't functional, or
+ * generally from the opal pollers, so it runs fast while booting
+ * and slowly once Linux is up.
+ */
+
+ /* Lockless fast bailout */
+ if (master->state == state_idle)
+ return;
+
+ lock(&master->lock);
+ p8_i2c_check_status(master);
+ if (master->state != state_idle)
+ schedule_timer_at(&master->poller, now + master->poll_interval);
+ p8_i2c_check_work(master);
+ unlock(&master->lock);
+}
+
+void p8_i2c_interrupt(uint32_t chip_id)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct p8_i2c_master *master = NULL;
+
+ assert(chip);
+ list_for_each(&chip->i2cms, master, link) {
+
+ /* Lockless fast bailout (shared interrupt) */
+ if (master->state == state_idle)
+ continue;
+
+ lock(&master->lock);
+
+ /* Run the state machine */
+ p8_i2c_check_status(master);
+
+ /* Check for new work */
+ p8_i2c_check_work(master);
+
+ unlock(&master->lock);
+ }
+}
+
+static const char *compat[] = {
+ "ibm,power8-i2cm",
+ "ibm,centaur-i2cm"
+};
+
+static void p8_i2c_add_bus_prop(struct p8_i2c_master_port *port)
+{
+ const struct dt_property *c, *p;
+ struct dt_node *np = port->bus.dt_node;
+ char name[32];
+
+ c = dt_find_property(np, "compatible");
+ p = dt_find_property(np, "ibm,port-name");
+
+ if (!c) {
+ if (port->master->type == I2C_POWER8)
+ dt_add_property_strings(np, "compatible",
+ "ibm,power8-i2c-port",
+ "ibm,opal-i2c");
+ else if (port->master->type == I2C_CENTAUR)
+ dt_add_property_strings(np, "compatible",
+ "ibm,centaur-i2c-port",
+ "ibm,opal-i2c");
+ }
+
+ if (!p) {
+ if (port->master->type == I2C_POWER8)
+ snprintf(name, sizeof(name), "p8_%08x_e%dp%d",
+ port->master->chip_id, port->master->engine_id,
+ port->port_num);
+ else if (port->master->type == I2C_CENTAUR)
+ snprintf(name, sizeof(name), "cen_%08x_e%dp%d",
+ port->master->chip_id, port->master->engine_id,
+ port->port_num);
+
+ dt_add_property_string(np, "ibm,port-name", name);
+ }
+}
+
+static struct p8_i2c_master_port *p8_i2c_init_one_port(struct p8_i2c_master *m,
+ struct dt_node *n)
+{
+ struct p8_i2c_master_port *port;
+ uint64_t def_timeout, lb_freq;
+ uint32_t speed, div;
+
+ port = zalloc(sizeof(*port));
+ if (!port)
+ return NULL;
+
+ def_timeout = m->irq_ok ? I2C_TIMEOUT_IRQ_MS : I2C_TIMEOUT_POLL_MS;
+
+ lb_freq = dt_prop_get_u32_def(m->dt_node, "clock-frequency", 150000000);
+ speed = dt_prop_get_u32_def(n, "bus-frequency", 100000);
+ div = p8_i2c_get_bit_rate_divisor(lb_freq, speed);
+
+ /* p8-i2c stuff */
+ port->master = m;
+ port->bit_rate_div = div;
+ port->poll_interval = p8_i2c_get_poll_interval(speed);
+ port->port_num = dt_prop_get_u32(n, "reg");
+ port->byte_timeout = dt_prop_get_u32_def(n, "timeout-ms", def_timeout);
+ list_add_tail(&m->ports, &port->link);
+
+ /* core i2c stuff */
+ port->bus.dt_node = n;
+ port->bus.queue_req = p8_i2c_queue_request;
+ port->bus.run_req = p8_i2c_run_request;
+ i2c_add_bus(&port->bus);
+
+ /* add the bus name and compatible (if needed) */
+ p8_i2c_add_bus_prop(port);
+
+ prlog(PR_INFO, " P%d: <%s> %d kHz\n", port->port_num,
+ (char *) dt_prop_get(n, "ibm,port-name"), speed / 1000);
+
+ return port;
+}
+
+static struct p8_i2c_master *p8_i2c_init_one(struct dt_node *i2cm,
+ enum p8_i2c_master_type type)
+{
+ struct p8_i2c_master *master;
+ struct list_head *chip_list;
+ struct dt_node *i2cm_port;
+ uint64_t ex_stat;
+ uint32_t lb_freq;
+ int64_t rc;
+
+ master = zalloc(sizeof(*master));
+ if (!master) {
+ log_simple_error(&e_info(OPAL_RC_I2C_INIT),
+ "I2C: Failed to allocate master "
+ "structure\n");
+ return NULL;
+ }
+ master->type = type;
+
+ /* Local bus speed in Hz */
+ lb_freq = dt_prop_get_u32(i2cm, "clock-frequency");
+
+ /* Initialise the i2c master structure */
+ master->state = state_idle;
+ master->chip_id = dt_get_chip_id(i2cm);
+ master->engine_id = dt_prop_get_u32(i2cm, "chip-engine#");
+ master->xscom_base = dt_get_address(i2cm, 0, NULL);
+ master->dt_node = i2cm;
+ if (master->type == I2C_CENTAUR) {
+ struct centaur_chip *centaur = get_centaur(master->chip_id);
+ if (centaur == NULL) {
+ log_simple_error(&e_info(OPAL_RC_I2C_INIT),
+ "I2C: Failed to get centaur 0x%x ",
+ master->chip_id);
+ free(master);
+ return NULL;
+ }
+ chip_list = &centaur->i2cms;
+
+ /* Detect bad device-tree from HostBoot giving us bogus
+ * i2c masters
+ */
+ if (master->engine_id > 0) {
+ prlog(PR_ERR, "I2C: Skipping Centaur Master #1\n");
+ free(master);
+ return NULL;
+ }
+ } else {
+ struct proc_chip *chip = get_chip(master->chip_id);
+ assert(chip);
+ chip_list = &chip->i2cms;
+ }
+ init_timer(&master->timeout, p8_i2c_timeout, master);
+ init_timer(&master->poller, p8_i2c_poll, master);
+ init_timer(&master->recovery, p8_i2c_recover, master);
+ init_timer(&master->sensor_cache, p8_i2c_enable_scache, master);
+
+ master->irq_ok = p8_i2c_has_irqs(master);
+
+ prlog(PR_INFO, "I2C: Chip %08x Eng. %d Clock %d MHz %s\n",
+ master->chip_id, master->engine_id, lb_freq / 1000000,
+ master->irq_ok ? "" : "(no interrupt)");
+
+ /* Disable OCC cache during inits */
+ if (master->type == I2C_CENTAUR) {
+ rc = centaur_disable_sensor_cache(master->chip_id);
+ if (rc < 0) {
+ log_simple_error(&e_info(OPAL_RC_I2C_INIT), "I2C: "
+ "Error %lld disabling sensor cache\n",
+ rc);
+ /* Ignore error and move on ... */
+ } else
+ time_wait(rc);
+ }
+ rc = xscom_read(master->chip_id, master->xscom_base +
+ I2C_EXTD_STAT_REG, &ex_stat);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_I2C_INIT), "I2C: "
+ "Failed to read EXTD_STAT_REG\n");
+ if (master->type == I2C_CENTAUR)
+ centaur_enable_sensor_cache(master->chip_id);
+
+ free(master);
+ return NULL;
+ }
+
+ master->fifo_size = GETFIELD(I2C_EXTD_STAT_FIFO_SIZE, ex_stat);
+ list_head_init(&master->req_list);
+ list_head_init(&master->ports);
+
+ /* Re-enable the sensor cache, we aren't touching HW anymore */
+ if (master->type == I2C_CENTAUR)
+ centaur_enable_sensor_cache(master->chip_id);
+
+ /* Add master to chip's list */
+ list_add_tail(chip_list, &master->link);
+
+ /* initialise ports */
+ dt_for_each_child(i2cm, i2cm_port)
+ p8_i2c_init_one_port(master, i2cm_port);
+
+ return master;
+}
+
+void p8_i2c_init(void)
+{
+ struct dt_node *i2cm;
+ int i;
+
+ /* setup the handshake reg */
+ if (proc_gen <= proc_gen_p9)
+ occflg = 0x6C08A;
+ else if (proc_gen == proc_gen_p10)
+ occflg = 0x6C0AC;
+ else
+ return;
+
+ prlog(PR_INFO, "I2C: OCC flag reg: %x\n", occflg);
+
+ for (i = 0; i < MAX_I2C_TYPE; i++) {
+ dt_for_each_compatible(dt_root, i2cm, compat[i])
+ p8_i2c_init_one(i2cm, i);
+ }
+}
+
+struct i2c_bus *p8_i2c_find_bus_by_port(uint32_t chip_id, int eng, int port_num)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct p8_i2c_master *m, *master = NULL;
+ struct p8_i2c_master_port *port;
+
+ if (!chip)
+ return NULL;
+
+ list_for_each(&chip->i2cms, m, link) {
+ if (m->engine_id == eng) {
+ master = m;
+ break;
+ }
+ }
+
+ if (!master)
+ return NULL;
+
+ list_for_each(&master->ports, port, link)
+ if (port->port_num == port_num)
+ return &port->bus;
+
+ return NULL;
+}
+
+/* Adds a new i2c port to the DT and initialises it */
+struct i2c_bus *p8_i2c_add_bus(uint32_t chip_id, int eng_id, int port_id,
+ uint32_t bus_speed)
+{
+ struct proc_chip *c = get_chip(chip_id);
+ struct p8_i2c_master *m, *master = NULL;
+ struct p8_i2c_master_port *port;
+ struct dt_node *pn;
+
+ if (!c) {
+ prerror("I2C: Unable to add i2c bus: c%de%dp%d: chip doesn't exist\n",
+ chip_id, eng_id, port_id);
+ return NULL;
+ }
+
+ list_for_each(&c->i2cms, m, link) {
+ if (m->engine_id == eng_id) {
+ master = m;
+ break;
+ }
+ }
+
+ if (!master) {
+ struct dt_node *mn;
+
+ mn = p8_i2c_add_master_node(c->devnode, eng_id);
+ if (!mn) {
+ prerror("I2C: Unable to add DT node for I2CM c%xe%d\n",
+ chip_id, eng_id);
+ return NULL;
+ }
+
+ master = p8_i2c_init_one(mn, I2C_POWER8);
+ if (!master) {
+ prerror("I2C: Unable to initialise I2CM c%xe%d\n",
+ chip_id, eng_id);
+ return NULL;
+ }
+ }
+
+ list_for_each(&master->ports, port, link)
+ if (port->port_num == port_id)
+ return &port->bus;
+
+ pn = __p8_i2c_add_port_node(master->dt_node, port_id, bus_speed);
+ if (!pn) {
+ prerror("I2C: Unable to add dt node for bus c%xe%dp%d\n",
+ chip_id, eng_id, port_id);
+ return NULL;
+ }
+
+ port = p8_i2c_init_one_port(master, pn);
+ if (!port) {
+ prerror("I2C: Unable to init bus c%xe%dp%d\n",
+ chip_id, eng_id, port_id);
+ return NULL;
+ }
+
+ return &port->bus;
+}
diff --git a/roms/skiboot/hw/phb3.c b/roms/skiboot/hw/phb3.c
new file mode 100644
index 000000000..8af6b6164
--- /dev/null
+++ b/roms/skiboot/hw/phb3.c
@@ -0,0 +1,5052 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * PHB3: PCI Host Bridge 3, in POWER8
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci-cfg.h>
+#include <pci.h>
+#include <pci-slot.h>
+#include <vpd.h>
+#include <interrupts.h>
+#include <opal.h>
+#include <cpu.h>
+#include <device.h>
+#include <ccan/str/str.h>
+#include <ccan/array_size/array_size.h>
+#include <xscom.h>
+#include <affinity.h>
+#include <phb3.h>
+#include <phb3-regs.h>
+#include <phb3-capp.h>
+#include <capp.h>
+#include <fsp.h>
+#include <chip.h>
+#include <chiptod.h>
+
+/* Enable this to disable error interrupts for debug purposes */
+#undef DISABLE_ERR_INTS
+
+static void phb3_init_hw(struct phb3 *p, bool first_init);
+
+#define PHBDBG(p, fmt, a...) prlog(PR_DEBUG, "PHB#%04x: " fmt, \
+ (p)->phb.opal_id, ## a)
+#define PHBINF(p, fmt, a...) prlog(PR_INFO, "PHB#%04x: " fmt, \
+ (p)->phb.opal_id, ## a)
+#define PHBERR(p, fmt, a...) prlog(PR_ERR, "PHB#%04x: " fmt, \
+ (p)->phb.opal_id, ## a)
+
+#define PE_CAPP_EN 0x9013c03
+
+#define PE_REG_OFFSET(p) \
+ ((PHB3_IS_NAPLES(p) && (p)->index) ? 0x40 : 0x0)
+
+/* Helper to select an IODA table entry */
+static inline void phb3_ioda_sel(struct phb3 *p, uint32_t table,
+ uint32_t addr, bool autoinc)
+{
+ out_be64(p->regs + PHB_IODA_ADDR,
+ (autoinc ? PHB_IODA_AD_AUTOINC : 0) |
+ SETFIELD(PHB_IODA_AD_TSEL, 0ul, table) |
+ SETFIELD(PHB_IODA_AD_TADR, 0ul, addr));
+}
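+
+/*
+ * With autoinc set, successive accesses to the IODA data register step
+ * through consecutive table entries, so a table dump or restore only
+ * needs a single select. A rough sketch of the usual pattern:
+ *
+ *	phb3_ioda_sel(p, table, 0, true);
+ *	for (i = 0; i < count; i++)
+ *		entries[i] = in_be64(p->regs + PHB_IODA_DATA0);
+ */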
+
+static void phb3_eeh_dump_regs(struct phb3 *p,
+ struct OpalIoPhb3ErrorData *regs);
+
+/* Check if AIB is fenced via PBCQ NFIR */
+static bool phb3_fenced(struct phb3 *p)
+{
+ uint64_t nfir;
+
+ /* We probably still have working xscom access */
+ xscom_read(p->chip_id, p->pe_xscom + 0x0, &nfir);
+ if (nfir & PPC_BIT(16)) {
+ p->flags |= PHB3_AIB_FENCED;
+
+ phb3_eeh_dump_regs(p, NULL);
+ return true;
+ }
+ return false;
+}
+
+static int64_t phb3_pcicfg_rc_pref_window(void *dev __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len,
+ uint32_t *data, bool write)
+{
+ uint8_t *pdata;
+ uint32_t i;
+
+ /* Cache whatever we received */
+ if (write) {
+ pdata = &pcrf->data[offset - pcrf->start];
+ for (i = 0; i < len; i++, pdata++)
+ *pdata = (uint8_t)(*data >> (8 * i));
+ return OPAL_SUCCESS;
+ }
+
+ /* Return whatever we cached */
+ *data = 0;
+ pdata = &pcrf->data[offset - pcrf->start + len - 1];
+ for (i = len; i > 0; i--, pdata--) {
+ *data = (*data) << 8;
+ if (offset + i == PCI_CFG_PREF_MEM_BASE) {
+ *data |= ((*pdata & 0xf0) | 0x1);
+ continue;
+ }
+
+ *data |= *pdata;
+ }
+
+ return OPAL_SUCCESS;
+}
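+
+/*
+ * In effect this filter emulates the root port's prefetchable window
+ * registers purely in software: writes are cached in pcrf->data and
+ * reads are served back from that cache, with the capability nibble of
+ * the prefetchable memory base forced to 0x1 so the window advertises
+ * 64-bit addressing support.
+ */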
+
+/*
+ * Configuration space access
+ *
+ * The PHB lock is assumed to be already held
+ */
+static int64_t phb3_pcicfg_check(struct phb3 *p, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ uint8_t *pe)
+{
+ uint32_t sm = size - 1;
+
+ if (offset > 0xfff || bdfn > 0xffff)
+ return OPAL_PARAMETER;
+ if (offset & sm)
+ return OPAL_PARAMETER;
+
+ /* The root bus only has a device at 0 and we get into an
+ * error state if we try to probe beyond that, so let's
+ * avoid that and just return an error to Linux
+ */
+ if (PCI_BUS_NUM(bdfn) == 0 && (bdfn & 0xff))
+ return OPAL_HARDWARE;
+
+ /* Check PHB state */
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /* Fetch the PE# from cache */
+ *pe = p->rte_cache[bdfn];
+
+ return OPAL_SUCCESS;
+}
+
+static void phb3_link_update(struct phb *phb, uint16_t data)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint32_t new_spd, new_wid;
+ uint32_t old_spd, old_wid;
+ uint16_t old_data;
+ uint64_t lreg;
+ int i;
+
+ /* Read the old speed and width */
+ pci_cfg_read16(phb, 0, 0x5a, &old_data);
+
+ /* Decode the register values */
+ new_spd = data & PCICAP_EXP_LSTAT_SPEED;
+ new_wid = (data & PCICAP_EXP_LSTAT_WIDTH) >> 4;
+ old_spd = old_data & PCICAP_EXP_LSTAT_SPEED;
+ old_wid = (old_data & PCICAP_EXP_LSTAT_WIDTH) >> 4;
+
+ /* Apply maximums */
+ if (new_wid > 16)
+ new_wid = 16;
+ if (new_wid < 1)
+ new_wid = 1;
+ if (new_spd > 3)
+ new_spd = 3;
+ if (new_spd < 1)
+ new_spd = 1;
+
+ PHBINF(p, "Link change request: speed %d->%d, width %d->%d\n",
+ old_spd, new_spd, old_wid, new_wid);
+
+ /* Check if width needs to be changed */
+ if (old_wid != new_wid) {
+ PHBINF(p, "Changing width...\n");
+ lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT);
+ lreg = SETFIELD(PHB_PCIE_LM_TGT_LINK_WIDTH, lreg, new_wid);
+ lreg |= PHB_PCIE_LM_CHG_LINK_WIDTH;
+ out_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT, lreg);
+ for (i=0; i<10;i++) {
+ lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT);
+ if (lreg & PHB_PCIE_LM_DL_WCHG_PENDING)
+ break;
+ time_wait_ms_nopoll(1);
+ }
+ if (!(lreg & PHB_PCIE_LM_DL_WCHG_PENDING))
+ PHBINF(p, "Timeout waiting for width change start\n");
+ for (i=0; i<100;i++) {
+ lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT);
+ if (!(lreg & PHB_PCIE_LM_DL_WCHG_PENDING))
+ break;
+ time_wait_ms_nopoll(1);
+ }
+ if (lreg & PHB_PCIE_LM_DL_WCHG_PENDING)
+ PHBINF(p, "Timeout waiting for width change end\n");
+ }
+ /* Check if speed needs to be changed */
+ if (old_spd != new_spd) {
+ PHBINF(p, "Changing speed...\n");
+ lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT);
+ if (lreg & PPC_BIT(19)) {
+ uint16_t lctl2;
+ PHBINF(p, " Bit19 set ! working around...\n");
+ pci_cfg_read16(phb, 0, 0x78, &lctl2);
+ PHBINF(p, " LCTL2=%04x\n", lctl2);
+ lctl2 &= ~PCICAP_EXP_LCTL2_HWAUTSPDIS;
+ pci_cfg_write16(phb, 0, 0x78, lctl2);
+ }
+ lreg = in_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT);
+ lreg = SETFIELD(PHB_PCIE_LM_TGT_SPEED, lreg, new_spd);
+ lreg |= PHB_PCIE_LM_CHG_SPEED;
+ out_be64(p->regs + PHB_PCIE_LINK_MANAGEMENT, lreg);
+ }
+}
+
+static int64_t phb3_pcicfg_rc_link_speed(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t len,
+ uint32_t *data, bool write)
+{
+ struct pci_device *pd = dev;
+
+ /* Hack for link speed changes. We intercept attempts at writing
+ * the link control/status register
+ */
+ if (write && len == 4 && offset == 0x58) {
+ phb3_link_update(pd->phb, (*data) >> 16);
+ return OPAL_SUCCESS;
+ }
+ if (write && len == 2 && offset == 0x5a) {
+ phb3_link_update(pd->phb, *(uint16_t *)data);
+ return OPAL_SUCCESS;
+ }
+
+ return OPAL_PARTIAL;
+}
+
+#define PHB3_PCI_CFG_READ(size, type) \
+static int64_t phb3_pcicfg_read##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type *data) \
+{ \
+ struct phb3 *p = phb_to_phb3(phb); \
+ uint64_t addr, val64; \
+ int64_t rc; \
+ uint8_t pe; \
+ bool use_asb = false; \
+ \
+ /* Initialize data in case of error */ \
+ *data = (type)0xffffffff; \
+ \
+ rc = phb3_pcicfg_check(p, bdfn, offset, sizeof(type), &pe); \
+ if (rc) \
+ return rc; \
+ \
+ if (p->flags & PHB3_AIB_FENCED) { \
+ if (!(p->flags & PHB3_CFG_USE_ASB)) \
+ return OPAL_HARDWARE; \
+ use_asb = true; \
+ } else if ((p->flags & PHB3_CFG_BLOCKED) && bdfn != 0) { \
+ return OPAL_HARDWARE; \
+ } \
+ \
+ rc = pci_handle_cfg_filters(phb, bdfn, offset, sizeof(type), \
+ (uint32_t *)data, false); \
+ if (rc != OPAL_PARTIAL) \
+ return rc; \
+ \
+ addr = PHB_CA_ENABLE; \
+ addr = SETFIELD(PHB_CA_BDFN, addr, bdfn); \
+ addr = SETFIELD(PHB_CA_REG, addr, offset); \
+ addr = SETFIELD(PHB_CA_PE, addr, pe); \
+ if (use_asb) { \
+ phb3_write_reg_asb(p, PHB_CONFIG_ADDRESS, addr); \
+ sync(); \
+ val64 = bswap_64(phb3_read_reg_asb(p, PHB_CONFIG_DATA)); \
+ *data = (type)(val64 >> (8 * (offset & (4 - sizeof(type))))); \
+ } else { \
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, addr); \
+ *data = in_le##size(p->regs + PHB_CONFIG_DATA + \
+ (offset & (4 - sizeof(type)))); \
+ } \
+ \
+ return OPAL_SUCCESS; \
+}
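+
+/*
+ * In both the read and write accessors the "offset & (4 - sizeof(type))"
+ * term picks the byte lane within the 32-bit config data window: for
+ * instance, a 16-bit access at offset 0x6 uses lane 2, so the MMIO path
+ * accesses PHB_CONFIG_DATA + 2 while the ASB path shifts the 64-bit
+ * value by 16 bits.
+ */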
+
+#define PHB3_PCI_CFG_WRITE(size, type) \
+static int64_t phb3_pcicfg_write##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type data) \
+{ \
+ struct phb3 *p = phb_to_phb3(phb); \
+ uint64_t addr, val64 = 0; \
+ int64_t rc; \
+ uint8_t pe; \
+ bool use_asb = false; \
+ \
+ rc = phb3_pcicfg_check(p, bdfn, offset, sizeof(type), &pe); \
+ if (rc) \
+ return rc; \
+ \
+ if (p->flags & PHB3_AIB_FENCED) { \
+ if (!(p->flags & PHB3_CFG_USE_ASB)) \
+ return OPAL_HARDWARE; \
+ use_asb = true; \
+ } else if ((p->flags & PHB3_CFG_BLOCKED) && bdfn != 0) { \
+ return OPAL_HARDWARE; \
+ } \
+ \
+ rc = pci_handle_cfg_filters(phb, bdfn, offset, sizeof(type), \
+ (uint32_t *)&data, true); \
+ if (rc != OPAL_PARTIAL) \
+ return rc; \
+ \
+ addr = PHB_CA_ENABLE; \
+ addr = SETFIELD(PHB_CA_BDFN, addr, bdfn); \
+ addr = SETFIELD(PHB_CA_REG, addr, offset); \
+ addr = SETFIELD(PHB_CA_PE, addr, pe); \
+ if (use_asb) { \
+ val64 = data; \
+ val64 = bswap_64(val64 << 8 * (offset & (4 - sizeof(type)))); \
+ phb3_write_reg_asb(p, PHB_CONFIG_ADDRESS, addr); \
+ sync(); \
+ phb3_write_reg_asb(p, PHB_CONFIG_DATA, val64); \
+ } else { \
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, addr); \
+ out_le##size(p->regs + PHB_CONFIG_DATA + \
+ (offset & (4 - sizeof(type))), data); \
+ } \
+ \
+ return OPAL_SUCCESS; \
+}
+
+PHB3_PCI_CFG_READ(8, u8)
+PHB3_PCI_CFG_READ(16, u16)
+PHB3_PCI_CFG_READ(32, u32)
+PHB3_PCI_CFG_WRITE(8, u8)
+PHB3_PCI_CFG_WRITE(16, u16)
+PHB3_PCI_CFG_WRITE(32, u32)
+
+static int64_t phb3_get_reserved_pe_number(struct phb *phb __unused)
+{
+ return PHB3_RESERVED_PE_NUM;
+}
+
+static inline void phb3_enable_ecrc(struct phb *phb, bool enable)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint32_t ctl;
+
+ if (p->aercap <= 0)
+ return;
+
+ pci_cfg_read32(phb, 0, p->aercap + PCIECAP_AER_CAPCTL, &ctl);
+ if (enable) {
+ ctl |= (PCIECAP_AER_CAPCTL_ECRCG_EN |
+ PCIECAP_AER_CAPCTL_ECRCC_EN);
+ } else {
+ ctl &= ~(PCIECAP_AER_CAPCTL_ECRCG_EN |
+ PCIECAP_AER_CAPCTL_ECRCC_EN);
+ }
+
+ pci_cfg_write32(phb, 0, p->aercap + PCIECAP_AER_CAPCTL, ctl);
+}
+
+static void phb3_root_port_init(struct phb *phb, struct pci_device *dev,
+ int ecap, int aercap)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint16_t bdfn = dev->bdfn;
+ uint16_t val16;
+ uint32_t val32;
+
+ /* Use PHB's callback so that the UTL events will be masked
+ * or unmasked when the link is down or up.
+ */
+ if (dev->slot && dev->slot->ops.prepare_link_change &&
+ phb->slot && phb->slot->ops.prepare_link_change)
+ dev->slot->ops.prepare_link_change =
+ phb->slot->ops.prepare_link_change;
+
+ /* Mask UTL link down event if root slot supports surprise
+ * hotplug as the event should be handled by hotplug driver
+ * instead of EEH subsystem.
+ */
+ if (dev->slot && dev->slot->surprise_pluggable)
+ out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad42800000000000UL);
+
+ /* Enable SERR and parity checking */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16);
+ val16 |= (PCI_CFG_CMD_SERR_EN | PCI_CFG_CMD_PERR_RESP);
+ pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16);
+
+ /* Enable reporting various errors */
+ if (!ecap) return;
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16);
+ val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT |
+ PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT |
+ PCICAP_EXP_DEVCTL_UR_REPORT);
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16);
+
+ if (!aercap) return;
+
+ /* Mask various unrecoverable errors. The link surprise down
+ * event should be masked when its PCI slot supports surprise
+ * hotplug, since it should be handled by the PCI hotplug driver
+ * instead of the EEH subsystem.
+ */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, &val32);
+ val32 |= (PCIECAP_AER_UE_MASK_POISON_TLP |
+ PCIECAP_AER_UE_MASK_COMPL_TIMEOUT |
+ PCIECAP_AER_UE_MASK_COMPL_ABORT |
+ PCIECAP_AER_UE_MASK_ECRC);
+ if (dev->slot && dev->slot->surprise_pluggable)
+ val32 |= PCIECAP_AER_UE_MASK_SURPRISE_DOWN;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, val32);
+
+ /* Report various unrecoverable errors as fatal errors */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, &val32);
+ val32 |= (PCIECAP_AER_UE_SEVERITY_DLLP |
+ PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN |
+ PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_SEVERITY_UNEXP_COMPL |
+ PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW |
+ PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32);
+
+ /* Mask various recoverable errors */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, &val32);
+ val32 |= PCIECAP_AER_CE_MASK_ADV_NONFATAL;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32);
+
+ /* Enable ECRC check */
+ phb3_enable_ecrc(phb, true);
+
+ /* Enable all error reporting */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, &val32);
+ val32 |= (PCIECAP_AER_RERR_CMD_FE |
+ PCIECAP_AER_RERR_CMD_NFE |
+ PCIECAP_AER_RERR_CMD_CE);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, val32);
+}
+
+static void phb3_switch_port_init(struct phb *phb,
+ struct pci_device *dev,
+ int ecap, int aercap)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint16_t bdfn = dev->bdfn;
+ uint16_t val16;
+ uint32_t val32;
+
+ /* Enable SERR and parity checking and disable INTx */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16);
+ val16 |= (PCI_CFG_CMD_PERR_RESP |
+ PCI_CFG_CMD_SERR_EN |
+ PCI_CFG_CMD_INTx_DIS);
+ pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16);
+
+ /* Disable parity error and enable system error */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_BRCTL, &val16);
+ val16 &= ~PCI_CFG_BRCTL_PERR_RESP_EN;
+ val16 |= PCI_CFG_BRCTL_SERR_EN;
+ pci_cfg_write16(phb, bdfn, PCI_CFG_BRCTL, val16);
+
+ /* Enable reporting various errors */
+ if (!ecap) return;
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16);
+ val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT |
+ PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT);
+ /* HW279570 - Disable reporting of correctable errors */
+ val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT;
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16);
+
+ /* Unmask all unrecoverable errors for the upstream port. For
+ * downstream ports, surprise link down is masked because it
+ * should be handled by the hotplug driver instead of the EEH
+ * subsystem.
+ */
+ if (!aercap) return;
+ if (dev->dev_type == PCIE_TYPE_SWITCH_DNPORT &&
+ dev->slot && dev->slot->surprise_pluggable)
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK,
+ PCIECAP_AER_UE_MASK_SURPRISE_DOWN);
+ else
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, 0x0);
+
+ /* Severity of unrecoverable errors */
+ if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT)
+ val32 = (PCIECAP_AER_UE_SEVERITY_DLLP |
+ PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN |
+ PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW |
+ PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP |
+ PCIECAP_AER_UE_SEVERITY_INTERNAL);
+ else
+ val32 = (PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_SEVERITY_INTERNAL);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32);
+
+ /*
+ * Mask various correctable errors
+ *
+ * On Murano and Venice DD1.0 we disable emission of corrected
+ * error messages to the PHB completely to work around erratum
+ * HW257476, which causes the loss of tags.
+ */
+ if (p->rev < PHB3_REV_MURANO_DD20)
+ val32 = 0xffffffff;
+ else
+ val32 = PCIECAP_AER_CE_MASK_ADV_NONFATAL;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32);
+
+ /* Enable ECRC generation and disable ECRC check */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32);
+ val32 |= PCIECAP_AER_CAPCTL_ECRCG_EN;
+ val32 &= ~PCIECAP_AER_CAPCTL_ECRCC_EN;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32);
+}
+
+static void phb3_endpoint_init(struct phb *phb,
+ struct pci_device *dev,
+ int ecap, int aercap)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint16_t bdfn = dev->bdfn;
+ uint16_t val16;
+ uint32_t val32;
+
+ /* Enable SERR and parity checking */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16);
+ val16 |= (PCI_CFG_CMD_PERR_RESP |
+ PCI_CFG_CMD_SERR_EN);
+ pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16);
+
+ /* Enable reporting various errors */
+ if (!ecap) return;
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16);
+ val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT;
+ val16 |= (PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT |
+ PCICAP_EXP_DEVCTL_UR_REPORT);
+ /* HW279570 - Disable reporting of correctable errors */
+ val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT;
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16);
+
+ /*
+ * On Murano and Venice DD1.0 we disable emission of corrected
+ * error messages to the PHB completely to work around erratum
+ * HW257476, which causes the loss of tags.
+ */
+ if (p->rev < PHB3_REV_MURANO_DD20)
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK,
+ 0xffffffff);
+
+ /* Enable ECRC generation and check */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32);
+ val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN |
+ PCIECAP_AER_CAPCTL_ECRCC_EN);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32);
+}
+
+static int64_t phb3_pcicfg_no_dstate(void *dev __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len __unused,
+ uint32_t *data __unused, bool write)
+{
+ uint32_t loff = offset - pcrf->start;
+
+ /* Disable D-state change on children of the PHB. For now we
+ * simply block all writes to the PM control/status
+ */
+ if (write && loff >= 4 && loff < 6)
+ return OPAL_SUCCESS;
+
+ return OPAL_PARTIAL;
+}
+
+static void phb3_check_device_quirks(struct phb *phb, struct pci_device *dev)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+
+ if (dev->primary_bus != 0 &&
+ dev->primary_bus != 1)
+ return;
+
+ if (dev->primary_bus == 1) {
+ u64 modectl;
+
+ /*
+ * For these adapters, if they are directly under the PHB, we
+ * adjust the disable_wr_scope_group bit for performance
+ *
+ * 15b3:1003 Mellanox Travis3-EN (CX3)
+ * 15b3:1011 Mellanox HydePark (ConnectIB)
+ * 15b3:1013 Mellanox GlacierPark (CX4)
+ */
+ xscom_read(p->chip_id, p->pe_xscom + 0x0b, &modectl);
+ if (PCI_VENDOR_ID(dev->vdid) == 0x15b3 &&
+ (PCI_DEVICE_ID(dev->vdid) == 0x1003 ||
+ PCI_DEVICE_ID(dev->vdid) == 0x1011 ||
+ PCI_DEVICE_ID(dev->vdid) == 0x1013))
+ modectl |= PPC_BIT(14);
+ else
+ modectl &= ~PPC_BIT(14);
+ xscom_write(p->chip_id, p->pe_xscom + 0x0b, modectl);
+
+ /*
+ * Naples has a problem with D-states, at least on Mellanox CX4,
+ * so disable changing the D-state on Naples like we do for PHB4.
+ */
+ if (PHB3_IS_NAPLES(p) &&
+ pci_has_cap(dev, PCI_CFG_CAP_ID_PM, false)) {
+ pci_add_cfg_reg_filter(dev,
+ pci_cap(dev, PCI_CFG_CAP_ID_PM, false),
+ 8,
+ PCI_REG_FLAG_WRITE,
+ phb3_pcicfg_no_dstate);
+ }
+ } else if (dev->primary_bus == 0) {
+ /*
+ * Emulate the prefetchable window of the root port
+ * when the corresponding HW registers are readonly.
+ *
+ * 1014:03dc Root port on P8/P8E/P8NVL
+ */
+ if (PCI_VENDOR_ID(dev->vdid) == 0x1014 &&
+ PCI_DEVICE_ID(dev->vdid) == 0x03dc) {
+ uint32_t pref_hi, tmp;
+
+ pci_cfg_read32(phb, dev->bdfn,
+ PCI_CFG_PREF_MEM_BASE_U32, &pref_hi);
+ pci_cfg_write32(phb, dev->bdfn,
+ PCI_CFG_PREF_MEM_BASE_U32, ~pref_hi);
+ pci_cfg_read32(phb, dev->bdfn,
+ PCI_CFG_PREF_MEM_BASE_U32, &tmp);
+ pci_cfg_write32(phb, dev->bdfn,
+ PCI_CFG_PREF_MEM_BASE_U32, pref_hi);
+ if (tmp == pref_hi)
+ pci_add_cfg_reg_filter(dev,
+ PCI_CFG_PREF_MEM_BASE_U32, 12,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ phb3_pcicfg_rc_pref_window);
+ /* Add filter to control link speed */
+ pci_add_cfg_reg_filter(dev,
+ 0x58, 4,
+ PCI_REG_FLAG_WRITE,
+ phb3_pcicfg_rc_link_speed);
+ }
+ }
+}
+
+static inline int phb3_should_disable_ecrc(struct pci_device *pd)
+{
+ /*
+ * When we have a PMC PCIe switch, we need to disable ECRC on the
+ * root port. Otherwise, the adapters behind the switch downstream
+ * ports might not be probed successfully.
+ */
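+ /* vdid 0x854611f8 packs device ID 0x8546 with vendor ID 0x11f8 (PMC-Sierra) */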
+ if (pd->vdid == 0x854611f8)
+ return true;
+
+ return false;
+}
+
+static int phb3_device_init(struct phb *phb,
+ struct pci_device *dev,
+ void *data)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ int ecap, aercap;
+
+ /* Some special adapter tweaks for devices directly under the PHB */
+ phb3_check_device_quirks(phb, dev);
+
+ /* Common initialization for the device */
+ pci_device_init(phb, dev);
+
+ ecap = pci_cap(dev, PCI_CFG_CAP_ID_EXP, false);
+ aercap = pci_cap(dev, PCIECAP_ID_AER, true);
+ if (dev->dev_type == PCIE_TYPE_ROOT_PORT)
+ phb3_root_port_init(phb, dev, ecap, aercap);
+ else if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT ||
+ dev->dev_type == PCIE_TYPE_SWITCH_DNPORT)
+ phb3_switch_port_init(phb, dev, ecap, aercap);
+ else
+ phb3_endpoint_init(phb, dev, ecap, aercap);
+
+ /*
+ * Check if we need to disable ECRC functionality on the root port.
+ * This only happens when the PCI topology changes, meaning it's
+ * skipped when reinitializing a PCI device after an EEH reset.
+ */
+ if (!data && phb3_should_disable_ecrc(dev)) {
+ if (p->no_ecrc_devs++ == 0)
+ phb3_enable_ecrc(phb, false);
+ }
+
+ return 0;
+}
+
+static void phb3_device_remove(struct phb *phb, struct pci_device *pd)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+
+ if (!phb3_should_disable_ecrc(pd) || p->no_ecrc_devs == 0)
+ return;
+
+ if (--p->no_ecrc_devs == 0)
+ phb3_enable_ecrc(phb, true);
+}
+
+static int64_t phb3_pci_reinit(struct phb *phb, uint64_t scope, uint64_t data)
+{
+ struct pci_device *pd;
+ uint16_t bdfn = data;
+ int ret;
+
+ if (scope != OPAL_REINIT_PCI_DEV)
+ return OPAL_PARAMETER;
+
+ pd = pci_find_dev(phb, bdfn);
+ if (!pd)
+ return OPAL_PARAMETER;
+
+ ret = phb3_device_init(phb, pd, pd);
+ if (ret)
+ return OPAL_HARDWARE;
+
+ return OPAL_SUCCESS;
+}
+
+/* Clear IODA cache tables */
+static void phb3_init_ioda_cache(struct phb3 *p)
+{
+ uint32_t i;
+ uint64_t *data64;
+
+ /*
+ * RTT and PELTV. RTE should be 0xFF's to indicate
+ * invalid PE# for the corresponding RID.
+ *
+ * Note: Instead we set all RTE entries to 0x00 to
+ * work around a problem where PE lookups might be
+ * done before Linux has established valid PE's
+ * (during PCI probing). We can revisit that once/if
+ * Linux has been fixed to always setup valid PEs.
+ *
+ * The value 0x00 corresponds to the default PE# Linux
+ * uses to check for config space freezes before it
+ * has assigned PE# to busses.
+ *
+ * WARNING: Additionally, we need to be careful: there's
+ * a HW issue where, if we get an MSI on an RTT entry that
+ * is FF, things will go bad. We need to ensure we never
+ * leave a live FF RTT entry, even temporarily, when
+ * resetting for EEH etc... (HW278969).
+ */
+ for (i = 0; i < ARRAY_SIZE(p->rte_cache); i++)
+ p->rte_cache[i] = PHB3_RESERVED_PE_NUM;
+ memset(p->peltv_cache, 0x0, sizeof(p->peltv_cache));
+
+ /* Disable all LSI */
+ for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) {
+ data64 = &p->lxive_cache[i];
+ *data64 = SETFIELD(IODA2_LXIVT_PRIORITY, 0ul, 0xff);
+ *data64 = SETFIELD(IODA2_LXIVT_SERVER, *data64, 0x0);
+ }
+
+ /* Disable all MSIs */
+ for (i = 0; i < ARRAY_SIZE(p->ive_cache); i++) {
+ data64 = &p->ive_cache[i];
+ *data64 = SETFIELD(IODA2_IVT_PRIORITY, 0ul, 0xff);
+ *data64 = SETFIELD(IODA2_IVT_SERVER, *data64, 0x0);
+ }
+
+ /* Clear TVT */
+ memset(p->tve_cache, 0x0, sizeof(p->tve_cache));
+ /* Clear M32 domain */
+ memset(p->m32d_cache, 0x0, sizeof(p->m32d_cache));
+ /* Clear M64 domain */
+ memset(p->m64b_cache, 0x0, sizeof(p->m64b_cache));
+}
+
+/* phb3_ioda_reset - Reset the IODA tables
+ *
+ * @purge: If true, the cache is cleared and the cleared values
+ * are applied to HW. If false, the cached values are
+ * applied to HW
+ *
+ * This resets the IODA tables in the PHB. It is called at
+ * initialization time, on PHB reset, and can be called
+ * explicitly from OPAL.
+ */
+static int64_t phb3_ioda_reset(struct phb *phb, bool purge)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t server, prio;
+ uint64_t *pdata64, data64;
+ uint32_t i;
+
+ if (purge) {
+ prlog(PR_DEBUG, "PHB%x: Purging all IODA tables...\n",
+ p->phb.opal_id);
+ phb3_init_ioda_cache(p);
+ }
+
+ /* Init_27..28 - LIXVT */
+ phb3_ioda_sel(p, IODA2_TBL_LXIVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) {
+ data64 = p->lxive_cache[i];
+ server = GETFIELD(IODA2_LXIVT_SERVER, data64);
+ prio = GETFIELD(IODA2_LXIVT_PRIORITY, data64);
+ data64 = SETFIELD(IODA2_LXIVT_SERVER, data64, server);
+ data64 = SETFIELD(IODA2_LXIVT_PRIORITY, data64, prio);
+ out_be64(p->regs + PHB_IODA_DATA0, data64);
+ }
+
+ /* Init_29..30 - MRT */
+ phb3_ioda_sel(p, IODA2_TBL_MRT, 0, true);
+ for (i = 0; i < 8; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+
+ /* Init_31..32 - TVT */
+ phb3_ioda_sel(p, IODA2_TBL_TVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]);
+
+ /* Init_33..34 - M64BT */
+ phb3_ioda_sel(p, IODA2_TBL_M64BT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->m64b_cache); i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->m64b_cache[i]);
+
+ /* Init_35..36 - M32DT */
+ phb3_ioda_sel(p, IODA2_TBL_M32DT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->m32d_cache); i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->m32d_cache[i]);
+
+ /* Load RTE, PELTV */
+ if (p->tbl_rtt)
+ memcpy((void *)p->tbl_rtt, p->rte_cache, RTT_TABLE_SIZE);
+ if (p->tbl_peltv)
+ memcpy((void *)p->tbl_peltv, p->peltv_cache, PELTV_TABLE_SIZE);
+
+ /* Load IVT */
+ if (p->tbl_ivt) {
+ pdata64 = (uint64_t *)p->tbl_ivt;
+ for (i = 0; i < IVT_TABLE_ENTRIES; i++)
+ pdata64[i * IVT_TABLE_STRIDE] = p->ive_cache[i];
+ }
+
+ /* Invalidate RTE, IVE, TCE cache */
+ out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL);
+ out_be64(p->regs + PHB_IVC_INVALIDATE, PHB_IVC_INVALIDATE_ALL);
+ out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_ALL);
+
+ /* Clear RBA */
+ if (p->rev >= PHB3_REV_MURANO_DD20) {
+ phb3_ioda_sel(p, IODA2_TBL_RBA, 0, true);
+ for (i = 0; i < 32; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, 0x0ul);
+ }
+
+ /* Clear PEST & PEEV */
+ for (i = 0; i < PHB3_MAX_PE_NUM; i++) {
+ uint64_t pesta, pestb;
+
+ phb3_ioda_sel(p, IODA2_TBL_PESTA, i, false);
+ pesta = in_be64(p->regs + PHB_IODA_DATA0);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ phb3_ioda_sel(p, IODA2_TBL_PESTB, i, false);
+ pestb = in_be64(p->regs + PHB_IODA_DATA0);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+
+ if ((pesta & IODA2_PESTA_MMIO_FROZEN) ||
+ (pestb & IODA2_PESTB_DMA_STOPPED))
+ PHBDBG(p, "Frozen PE#%x (%s - %s)\n",
+ i, (pesta & IODA2_PESTA_MMIO_FROZEN) ? "DMA" : "",
+ (pestb & IODA2_PESTB_DMA_STOPPED) ? "MMIO" : "");
+ }
+
+ phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true);
+ for (i = 0; i < 4; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Clear anything we have in the PAPR Error Injection registers. The
+ * spec says PAPR error injection should be one-shot, without a
+ * "sticky" bit, but that's false according to the experiments
+ * I ran. So we have to clear it at an appropriate point in the
+ * kernel to avoid an endlessly frozen PE.
+ */
+static int64_t phb3_papr_errinjct_reset(struct phb *phb)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, 0x0ul);
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_ADDR, 0x0ul);
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_MASK, 0x0ul);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_set_phb_mem_window(struct phb *phb,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint64_t addr,
+ uint64_t __unused pci_addr,
+ uint64_t size)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t data64;
+
+ /*
+ * By design, PHB3 doesn't support IODT any more.
+ * Besides, we can't enable the M32 BAR here either. So
+ * this function is only used for M64 mapping, and each
+ * BAR is supposed to be shared by all PEs.
+ */
+ switch (window_type) {
+ case OPAL_IO_WINDOW_TYPE:
+ case OPAL_M32_WINDOW_TYPE:
+ return OPAL_UNSUPPORTED;
+ case OPAL_M64_WINDOW_TYPE:
+ if (window_num >= 16)
+ return OPAL_PARAMETER;
+
+ data64 = p->m64b_cache[window_num];
+ if (data64 & IODA2_M64BT_SINGLE_PE) {
+ if ((addr & 0x1FFFFFFul) ||
+ (size & 0x1FFFFFFul))
+ return OPAL_PARAMETER;
+ } else {
+ if ((addr & 0xFFFFFul) ||
+ (size & 0xFFFFFul))
+ return OPAL_PARAMETER;
+ }
+
+ /* size should be 2^N */
+ if (!size || size & (size-1))
+ return OPAL_PARAMETER;
+
+ /* address should be size aligned */
+ if (addr & (size - 1))
+ return OPAL_PARAMETER;
+
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
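+ /*
+ * The base/mask granularity is 32MB for single-PE BARs and 1MB for
+ * shared BARs; for example, a shared 256MB window has size >> 20 =
+ * 0x100, giving a mask field of 0x40000000 - 0x100 = 0x3FFFFF00.
+ */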
+ if (data64 & IODA2_M64BT_SINGLE_PE) {
+ data64 = SETFIELD(IODA2_M64BT_SINGLE_BASE, data64,
+ addr >> 25);
+ data64 = SETFIELD(IODA2_M64BT_SINGLE_MASK, data64,
+ 0x20000000 - (size >> 25));
+ } else {
+ data64 = SETFIELD(IODA2_M64BT_BASE, data64,
+ addr >> 20);
+ data64 = SETFIELD(IODA2_M64BT_MASK, data64,
+ 0x40000000 - (size >> 20));
+ }
+ p->m64b_cache[window_num] = data64;
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * For one specific M64 BAR, it can be shared by all PEs,
+ * or owned by single PE exclusively.
+ */
+static int64_t phb3_phb_mmio_enable(struct phb *phb,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint16_t enable)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t data64, base, mask;
+
+ /*
+ * By design, PHB3 doesn't support IODT any more.
+ * Besides, we can't enable the M32 BAR here either. So
+ * this function is only used for M64 mapping, and each
+ * BAR is supposed to be shared by all PEs.
+ */
+ switch (window_type) {
+ case OPAL_IO_WINDOW_TYPE:
+ case OPAL_M32_WINDOW_TYPE:
+ return OPAL_UNSUPPORTED;
+ case OPAL_M64_WINDOW_TYPE:
+ if (window_num >= 16 ||
+ enable > OPAL_ENABLE_M64_NON_SPLIT)
+ return OPAL_PARAMETER;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ /*
+ * We need to check the base/mask while enabling
+ * the M64 BAR. Otherwise, an invalid base/mask
+ * might fence the AIB unintentionally.
+ */
+ data64 = p->m64b_cache[window_num];
+ switch (enable) {
+ case OPAL_DISABLE_M64:
+ data64 &= ~IODA2_M64BT_SINGLE_PE;
+ data64 &= ~IODA2_M64BT_ENABLE;
+ break;
+ case OPAL_ENABLE_M64_SPLIT:
+ if (data64 & IODA2_M64BT_SINGLE_PE)
+ return OPAL_PARAMETER;
+ base = GETFIELD(IODA2_M64BT_BASE, data64);
+ base = (base << 20);
+ mask = GETFIELD(IODA2_M64BT_MASK, data64);
+ if (base < p->mm0_base || !mask)
+ return OPAL_PARTIAL;
+
+ data64 |= IODA2_M64BT_ENABLE;
+ break;
+ case OPAL_ENABLE_M64_NON_SPLIT:
+ if (!(data64 & IODA2_M64BT_SINGLE_PE))
+ return OPAL_PARAMETER;
+ base = GETFIELD(IODA2_M64BT_SINGLE_BASE, data64);
+ base = (base << 25);
+ mask = GETFIELD(IODA2_M64BT_SINGLE_MASK, data64);
+ if (base < p->mm0_base || !mask)
+ return OPAL_PARTIAL;
+
+ data64 |= IODA2_M64BT_SINGLE_PE;
+ data64 |= IODA2_M64BT_ENABLE;
+ break;
+ }
+
+ /* Update HW and cache */
+ phb3_ioda_sel(p, IODA2_TBL_M64BT, window_num, false);
+ out_be64(p->regs + PHB_IODA_DATA0, data64);
+ p->m64b_cache[window_num] = data64;
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_map_pe_mmio_window(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint16_t segment_num)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t data64, *cache;
+
+ if (pe_number >= PHB3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ /*
+ * PHB3 doesn't support IODT any more. On the other
+ * hand, PHB3 supports M64DT with much more flexibility;
+ * we need to figure that out later. At least, we never
+ * use M64DT in the kernel.
+ */
+ switch(window_type) {
+ case OPAL_IO_WINDOW_TYPE:
+ return OPAL_UNSUPPORTED;
+ case OPAL_M32_WINDOW_TYPE:
+ if (window_num != 0 || segment_num >= PHB3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ cache = &p->m32d_cache[segment_num];
+ phb3_ioda_sel(p, IODA2_TBL_M32DT, segment_num, false);
+ out_be64(p->regs + PHB_IODA_DATA0,
+ SETFIELD(IODA2_M32DT_PE, 0ull, pe_number));
+ *cache = SETFIELD(IODA2_M32DT_PE, 0ull, pe_number);
+
+ break;
+ case OPAL_M64_WINDOW_TYPE:
+ if (window_num >= 16)
+ return OPAL_PARAMETER;
+ cache = &p->m64b_cache[window_num];
+ data64 = *cache;
+
+ /* The BAR shouldn't be enabled yet */
+ if (data64 & IODA2_M64BT_ENABLE)
+ return OPAL_PARTIAL;
+
+ data64 |= IODA2_M64BT_SINGLE_PE;
+ data64 = SETFIELD(IODA2_M64BT_PE_HI, data64, pe_number >> 5);
+ data64 = SETFIELD(IODA2_M64BT_PE_LOW, data64, pe_number);
+ *cache = data64;
+
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_map_pe_dma_window(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t tts_encoded;
+ uint64_t data64 = 0;
+
+ /*
+ * Sanity check. We currently only support "2 windows per PE" mode,
+ * i.e. only bit 59 of the PCI address is used to select the window.
+ */
+ if (pe_number >= PHB3_MAX_PE_NUM ||
+ (window_id >> 1) != pe_number)
+ return OPAL_PARAMETER;
+
+ /*
+ * tce_table_size == 0 is used to disable an entry, in this case
+ * we ignore other arguments
+ */
+ if (tce_table_size == 0) {
+ phb3_ioda_sel(p, IODA2_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ p->tve_cache[window_id] = 0;
+ return OPAL_SUCCESS;
+ }
+
+ /* Additional arguments validation */
+ if (tce_levels < 1 || tce_levels > 5 ||
+ !is_pow2(tce_table_size) ||
+ tce_table_size < 0x1000)
+ return OPAL_PARAMETER;
+
+ /* Encode TCE table size */
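+ /*
+ * The encoded size is log2(bytes) - 11, so the minimum 4KB table
+ * encodes as 1 and, e.g., a 64KB table encodes as 5.
+ */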
+ data64 = SETFIELD(IODA2_TVT_TABLE_ADDR, 0ul, tce_table_addr >> 12);
+ tts_encoded = ilog2(tce_table_size) - 11;
+ if (tts_encoded > 31)
+ return OPAL_PARAMETER;
+ data64 = SETFIELD(IODA2_TVT_TCE_TABLE_SIZE, data64, tts_encoded);
+
+ /* Encode TCE page size */
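+ /* The IO_PSIZE values below follow log2(page size) - 11 */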
+ switch (tce_page_size) {
+ case 0x1000: /* 4K */
+ data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 1);
+ break;
+ case 0x10000: /* 64K */
+ data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 5);
+ break;
+ case 0x1000000: /* 16M */
+ data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 13);
+ break;
+ case 0x10000000: /* 256M */
+ data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 17);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ /* Encode number of levels */
+ data64 = SETFIELD(IODA2_TVT_NUM_LEVELS, data64, tce_levels - 1);
+
+ phb3_ioda_sel(p, IODA2_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, data64);
+ p->tve_cache[window_id] = data64;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_map_pe_dma_window_real(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint64_t pci_start_addr,
+ uint64_t pci_mem_size)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t end;
+ uint64_t tve;
+
+ if (pe_number >= PHB3_MAX_PE_NUM ||
+ (window_id >> 1) != pe_number)
+ return OPAL_PARAMETER;
+
+ if (pci_mem_size) {
+ /* Enable */
+
+ /*
+ * Check that the start address has the right TVE index,
+ * we only support the 1 bit mode where each PE has 2
+ * TVEs
+ */
+ if ((pci_start_addr >> 59) != (window_id & 1))
+ return OPAL_PARAMETER;
+ pci_start_addr &= ((1ull << 59) - 1);
+ end = pci_start_addr + pci_mem_size;
+
+ /* We have to be 16M aligned */
+ if ((pci_start_addr & 0x00ffffff) ||
+ (pci_mem_size & 0x00ffffff))
+ return OPAL_PARAMETER;
+
+ /*
+ * It *looks* like this is the max we can support (we need
+ * to verify this). Also, we are not checking for rollover,
+ * but then we aren't trying too hard to protect ourselves
+ * against a completely broken OS.
+ */
+ if (end > 0x0003ffffffffffffull)
+ return OPAL_PARAMETER;
+
+ /*
+ * Put start address bits 49:24 into TVE[52:53]||[0:23]
+ * and end address bits 49:24 into TVE[54:55]||[24:47]
+ * and set TVE[51]
+ */
+ tve = (pci_start_addr << 16) & (0xffffffull << 48);
+ tve |= (pci_start_addr >> 38) & (3ull << 10);
+ tve |= (end >> 8) & (0xfffffful << 16);
+ tve |= (end >> 40) & (3ull << 8);
+ tve |= PPC_BIT(51);
+ } else {
+ /* Disable */
+ tve = 0;
+ }
+
+ phb3_ioda_sel(p, IODA2_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, tve);
+ p->tve_cache[window_id] = tve;
+
+ return OPAL_SUCCESS;
+}
+
+static bool phb3_pci_msi_check_q(struct phb3 *p, uint32_t ive_num)
+{
+ uint64_t ive, ivc, ffi, state;
+ uint8_t *q_byte;
+
+ /* Each IVE has 16-bytes or 128-bytes */
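+ /* IVT_TABLE_STRIDE is counted in 64-bit words, hence the "* 8" */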
+ ive = p->tbl_ivt + (ive_num * IVT_TABLE_STRIDE * 8);
+ q_byte = (uint8_t *)(ive + 5);
+
+ /*
+ * Handle the Q bit. If the Q bit isn't visible yet,
+ * do a CI load to force a flush so it shows up.
+ */
+ if (!(*q_byte & 0x1)) {
+ /* Read from random PHB reg to force flush */
+ in_be64(p->regs + PHB_IVC_UPDATE);
+
+ /* Order with subsequent read of Q */
+ sync();
+
+ /* Q still not set, bail out */
+ if (!(*q_byte & 0x1))
+ return false;
+ }
+
+ /* Lock FFI and send interrupt */
+ while (1) {
+ state = in_be64(p->regs + PHB_FFI_LOCK);
+ if (!state)
+ break;
+ if (state == ~0ULL) /* PHB Fenced */
+ return false;
+ }
+
+ /* Clear Q bit and update IVC */
+ *q_byte = 0;
+ ivc = SETFIELD(PHB_IVC_UPDATE_SID, 0ul, ive_num) |
+ PHB_IVC_UPDATE_ENABLE_Q;
+ out_be64(p->regs + PHB_IVC_UPDATE, ivc);
+
+ /*
+ * Resend the interrupt. Note the lock clear bit isn't documented in
+ * the PHB3 spec and thus is probably unnecessary, but it's in
+ * IODA2, so let's be safe here; it won't hurt to set it.
+ */
+ ffi = SETFIELD(PHB_FFI_REQUEST_ISN, 0ul, ive_num) | PHB_FFI_LOCK_CLEAR;
+ out_be64(p->regs + PHB_FFI_REQUEST, ffi);
+
+ return true;
+}
+
+static void phb3_pci_msi_flush_ive(struct phb3 *p, uint32_t ive_num)
+{
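+ /* dcbf flushes the cache line holding this in-memory IVE */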
+ asm volatile("dcbf %0,%1"
+ :
+ : "b" (p->tbl_ivt), "r" (ive_num * IVT_TABLE_STRIDE * 8)
+ : "memory");
+}
+
+static int64_t phb3_pci_msi_eoi(struct phb *phb,
+ uint32_t hwirq)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint32_t ive_num = PHB3_IRQ_NUM(hwirq);
+ uint64_t ive, ivc;
+ uint8_t *p_byte, gp, gen, newgen;
+
+ /* OS might not configure IVT yet */
+ if (!p->tbl_ivt)
+ return OPAL_HARDWARE;
+
+ /* Each IVE has 16-bytes or 128-bytes */
+ ive = p->tbl_ivt + (ive_num * IVT_TABLE_STRIDE * 8);
+ p_byte = (uint8_t *)(ive + 4);
+
+ /* Read generation and P */
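+ /* The generation count is a 2-bit field, so it wraps modulo 4 */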
+ gp = *p_byte;
+ gen = (gp >> 1) & 3;
+ newgen = (gen + 1) & 3;
+
+ /* Increment generation count and clear P */
+ *p_byte = newgen << 1;
+
+ /* If at this point:
+ * - the IVC is invalid (due to high IRQ load) and
+ * - we get a new interrupt on this hwirq.
+ * Due to the new interrupt, the IVC will fetch from the IVT.
+ * This IVC reload will result in P set and gen=n+1. This
+ * interrupt may not actually be delivered at this point
+ * though.
+ *
+ * Software will then try to clear P in the IVC (out_be64
+ * below). This could cause an interrupt to be lost because P
+ * is cleared in the IVC without the new interrupt being
+ * delivered.
+ *
+ * To avoid this race, we increment the generation count in
+ * the IVT when we clear P. When software writes the IVC with
+ * P cleared but with gen=n, the IVC won't actually clear P
+ * because gen doesn't match what it just cached from the IVT.
+ * Hence we don't lose P being set.
+ */
+
+ /* Update the P bit in the IVC if the gen count matches */
+ ivc = SETFIELD(PHB_IVC_UPDATE_SID, 0ul, ive_num) |
+ PHB_IVC_UPDATE_ENABLE_P |
+ PHB_IVC_UPDATE_ENABLE_GEN |
+ PHB_IVC_UPDATE_ENABLE_CON |
+ SETFIELD(PHB_IVC_UPDATE_GEN_MATCH, 0ul, gen) |
+ SETFIELD(PHB_IVC_UPDATE_GEN, 0ul, newgen);
+ /* out_be64 has a sync to order with the IVT update above */
+ out_be64(p->regs + PHB_IVC_UPDATE, ivc);
+
+ /* Handle Q bit */
+ phb3_pci_msi_check_q(p, ive_num);
+
+ phb3_pci_msi_flush_ive(p, ive_num);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_set_ive_pe(struct phb *phb,
+ uint64_t pe_number,
+ uint32_t ive_num)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t *cache, ivep, data64;
+ uint16_t *pe_word;
+
+ /* OS should enable the BAR in advance */
+ if (!p->tbl_ivt)
+ return OPAL_HARDWARE;
+
+ /* Each IVE reserves 128 bytes */
+ if (pe_number >= PHB3_MAX_PE_NUM ||
+ ive_num >= IVT_TABLE_ENTRIES)
+ return OPAL_PARAMETER;
+
+ /* Update IVE cache */
+ cache = &p->ive_cache[ive_num];
+ *cache = SETFIELD(IODA2_IVT_PE, *cache, pe_number);
+
+ /* Update in-memory IVE without clobbering P and Q */
+ ivep = p->tbl_ivt + (ive_num * IVT_TABLE_STRIDE * 8);
+ pe_word = (uint16_t *)(ivep + 6);
+ *pe_word = pe_number;
+
+ /* Invalidate IVC */
+ data64 = SETFIELD(PHB_IVC_INVALIDATE_SID, 0ul, ive_num);
+ out_be64(p->regs + PHB_IVC_INVALIDATE, data64);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_get_msi_32(struct phb *phb __unused,
+ uint64_t pe_number,
+ uint32_t ive_num,
+ uint8_t msi_range,
+ uint32_t *msi_address,
+ uint32_t *message_data)
+{
+ /*
+ * Sanity check. We needn't check mve_number (PE#)
+ * on PHB3 since the interrupt source is purely determined
+ * by its DMA address and data, but the check isn't
+ * harmful.
+ */
+ if (pe_number >= PHB3_MAX_PE_NUM ||
+ ive_num >= IVT_TABLE_ENTRIES ||
+ msi_range != 1 || !msi_address|| !message_data)
+ return OPAL_PARAMETER;
+
+ /*
+ * DMA address and data will form the IVE index.
+ * For more details, please refer to IODA2 spec.
+ */
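+ /* The low 5 bits of the IVE number come from the MSI data; the
+ * remaining bits are carried in the MSI address. */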
+ *msi_address = 0xFFFF0000 | ((ive_num << 4) & 0xFFFFFE0F);
+ *message_data = ive_num & 0x1F;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_get_msi_64(struct phb *phb __unused,
+ uint64_t pe_number,
+ uint32_t ive_num,
+ uint8_t msi_range,
+ uint64_t *msi_address,
+ uint32_t *message_data)
+{
+ /* Sanity check */
+ if (pe_number >= PHB3_MAX_PE_NUM ||
+ ive_num >= IVT_TABLE_ENTRIES ||
+ msi_range != 1 || !msi_address || !message_data)
+ return OPAL_PARAMETER;
+
+ /*
+ * DMA address and data will form the IVE index.
+ * For more details, please refer to IODA2 spec.
+ */
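+ /* Same split as the 32-bit case: the MSI data carries the low 5
+ * bits of the IVE number. */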
+ *msi_address = (0x1ul << 60) | ((ive_num << 4) & 0xFFFFFFFFFFFFFE0Ful);
+ *message_data = ive_num & 0x1F;
+
+ return OPAL_SUCCESS;
+}
+
+static bool phb3_err_check_pbcq(struct phb3 *p)
+{
+ uint64_t nfir, mask, wof, val64;
+ int32_t class, bit;
+ uint64_t severity[PHB3_ERR_CLASS_LAST] = {
+ 0x0000000000000000UL, /* NONE */
+ 0x018000F800000000UL, /* DEAD */
+ 0x7E7DC70000000000UL, /* FENCED */
+ 0x0000000000000000UL, /* ER */
+ 0x0000000000000000UL /* INF */
+ };
+
+ /*
+ * Read the NFIR to see if XSCOM is working properly.
+ * If XSCOM doesn't work, there's nothing more we can
+ * do with the PHB, so mark it dead.
+ */
+ xscom_read(p->chip_id, p->pe_xscom + 0x0, &nfir);
+ if (nfir == 0xffffffffffffffffUL) {
+ p->err.err_src = PHB3_ERR_SRC_NONE;
+ p->err.err_class = PHB3_ERR_CLASS_DEAD;
+ phb3_set_err_pending(p, true);
+ return true;
+ }
+
+ /*
+ * Check WOF. We need to handle unmasked errors first.
+ * We can run into the situation (on the simulator)
+ * where FIR bits are asserted, but WOF has nothing.
+ * In that case, we should check the FIR as well.
+ */
+ xscom_read(p->chip_id, p->pe_xscom + 0x3, &mask);
+ xscom_read(p->chip_id, p->pe_xscom + 0x8, &wof);
+ if (wof & ~mask)
+ wof &= ~mask;
+ if (!wof) {
+ if (nfir & ~mask)
+ nfir &= ~mask;
+ if (!nfir)
+ return false;
+ wof = nfir;
+ }
+
+ /* We shouldn't hit class PHB3_ERR_CLASS_NONE */
+ for (class = PHB3_ERR_CLASS_NONE;
+ class < PHB3_ERR_CLASS_LAST;
+ class++) {
+ val64 = wof & severity[class];
+ if (!val64)
+ continue;
+
+ for (bit = 0; bit < 64; bit++) {
+ if (val64 & PPC_BIT(bit)) {
+ p->err.err_src = PHB3_ERR_SRC_PBCQ;
+ p->err.err_class = class;
+ p->err.err_bit = 63 - bit;
+ phb3_set_err_pending(p, true);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+static bool phb3_err_check_lem(struct phb3 *p)
+{
+ uint64_t fir, wof, mask, val64;
+ int32_t class, bit;
+ uint64_t severity[PHB3_ERR_CLASS_LAST] = {
+ 0x0000000000000000UL, /* NONE */
+ 0x0000000000000000UL, /* DEAD */
+ 0xADB670C980ADD151UL, /* FENCED */
+ 0x000800107F500A2CUL, /* ER */
+ 0x42018E2200002482UL /* INF */
+ };
+
+ /*
+ * Read the FIR. If XSCOM or ASB is frozen, there's no
+ * point going further; just mark the PHB as dead.
+ */
+ fir = phb3_read_reg_asb(p, PHB_LEM_FIR_ACCUM);
+ if (fir == 0xffffffffffffffffUL) {
+ p->err.err_src = PHB3_ERR_SRC_PHB;
+ p->err.err_class = PHB3_ERR_CLASS_DEAD;
+ phb3_set_err_pending(p, true);
+ return true;
+ }
+
+ /*
+ * Check WOF for unmasked errors first. In some situations,
+ * when running skiboot on the simulator, FIR bits are
+ * already asserted but WOF is still zero. In that case,
+ * we check the FIR directly.
+ */
+ wof = phb3_read_reg_asb(p, PHB_LEM_WOF);
+ mask = phb3_read_reg_asb(p, PHB_LEM_ERROR_MASK);
+ if (wof & ~mask)
+ wof &= ~mask;
+ if (!wof) {
+ if (fir & ~mask)
+ fir &= ~mask;
+ if (!fir)
+ return false;
+ wof = fir;
+ }
+
+ /* We shouldn't hit PHB3_ERR_CLASS_NONE */
+ for (class = PHB3_ERR_CLASS_NONE;
+ class < PHB3_ERR_CLASS_LAST;
+ class++) {
+ val64 = wof & severity[class];
+ if (!val64)
+ continue;
+
+ for (bit = 0; bit < 64; bit++) {
+ if (val64 & PPC_BIT(bit)) {
+ p->err.err_src = PHB3_ERR_SRC_PHB;
+ p->err.err_class = class;
+ p->err.err_bit = 63 - bit;
+ phb3_set_err_pending(p, true);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/*
+ * This function can be called during error recovery for the INF
+ * and ER classes. For the INF case, it's expected to be called
+ * when grabbing the error log. We call it explicitly
+ * when clearing frozen PE state for the ER case.
+ */
+static void phb3_err_ER_clear(struct phb3 *p)
+{
+ uint32_t val32;
+ uint64_t val64;
+ uint64_t fir = in_be64(p->regs + PHB_LEM_FIR_ACCUM);
+
+ /* Rec 1: Grab the PCI config lock */
+ /* Removed... unnecessary. We have our own lock here */
+
+ /* Rec 2/3/4: Take all inbound transactions */
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000001c00000000ul);
+ out_be32(p->regs + PHB_CONFIG_DATA, 0x10000000);
+
+ /* Rec 5/6/7: Clear pending non-fatal errors */
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000005000000000ul);
+ val32 = in_be32(p->regs + PHB_CONFIG_DATA);
+ out_be32(p->regs + PHB_CONFIG_DATA, (val32 & 0xe0700000) | 0x0f000f00);
+
+ /* Rec 8/9/10: Clear pending fatal errors for AER */
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000010400000000ul);
+ out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff);
+
+ /* Rec 11/12/13: Clear pending non-fatal errors for AER */
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000011000000000ul);
+ out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff);
+
+ /* Rec 22/23/24: Clear root port errors */
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000013000000000ul);
+ out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff);
+
+ /* Rec 25/26/27: Enable IO and MMIO bar */
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000004000000000ul);
+ out_be32(p->regs + PHB_CONFIG_DATA, 0x470100f8);
+
+ /* Rec 28: Release the PCI config lock */
+ /* Removed... unnecessary. We have our own lock here */
+
+ /* Rec 29...34: Clear UTL errors */
+ val64 = in_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS);
+ out_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS, val64);
+ val64 = in_be64(p->regs + UTL_PCIE_PORT_STATUS);
+ out_be64(p->regs + UTL_PCIE_PORT_STATUS, val64);
+ val64 = in_be64(p->regs + UTL_RC_STATUS);
+ out_be64(p->regs + UTL_RC_STATUS, val64);
+
+ /* Rec 39...66: Clear PHB error trap */
+ val64 = in_be64(p->regs + PHB_ERR_STATUS);
+ out_be64(p->regs + PHB_ERR_STATUS, val64);
+ out_be64(p->regs + PHB_ERR1_STATUS, 0x0ul);
+ out_be64(p->regs + PHB_ERR_LOG_0, 0x0ul);
+ out_be64(p->regs + PHB_ERR_LOG_1, 0x0ul);
+
+ val64 = in_be64(p->regs + PHB_OUT_ERR_STATUS);
+ out_be64(p->regs + PHB_OUT_ERR_STATUS, val64);
+ out_be64(p->regs + PHB_OUT_ERR1_STATUS, 0x0ul);
+ out_be64(p->regs + PHB_OUT_ERR_LOG_0, 0x0ul);
+ out_be64(p->regs + PHB_OUT_ERR_LOG_1, 0x0ul);
+
+ val64 = in_be64(p->regs + PHB_INA_ERR_STATUS);
+ out_be64(p->regs + PHB_INA_ERR_STATUS, val64);
+ out_be64(p->regs + PHB_INA_ERR1_STATUS, 0x0ul);
+ out_be64(p->regs + PHB_INA_ERR_LOG_0, 0x0ul);
+ out_be64(p->regs + PHB_INA_ERR_LOG_1, 0x0ul);
+
+ val64 = in_be64(p->regs + PHB_INB_ERR_STATUS);
+ out_be64(p->regs + PHB_INB_ERR_STATUS, val64);
+ out_be64(p->regs + PHB_INB_ERR1_STATUS, 0x0ul);
+ out_be64(p->regs + PHB_INB_ERR_LOG_0, 0x0ul);
+ out_be64(p->regs + PHB_INB_ERR_LOG_1, 0x0ul);
+
+ /* Rec 67/68: Clear FIR/WOF */
+ out_be64(p->regs + PHB_LEM_FIR_AND_MASK, ~fir);
+ out_be64(p->regs + PHB_LEM_WOF, 0x0ul);
+}
+
+static void phb3_read_phb_status(struct phb3 *p,
+ struct OpalIoPhb3ErrorData *stat)
+{
+ uint16_t val;
+ uint64_t *pPEST;
+ uint64_t val64 = 0;
+ uint32_t i;
+
+ memset(stat, 0, sizeof(struct OpalIoPhb3ErrorData));
+
+ /* Error data common part */
+ stat->common.version = OPAL_PHB_ERROR_DATA_VERSION_1;
+ stat->common.ioType = OPAL_PHB_ERROR_DATA_TYPE_PHB3;
+ stat->common.len = sizeof(struct OpalIoPhb3ErrorData);
+
+ /*
+ * We read some registers using config space through AIB.
+ *
+ * We access the other registers through ASB when possible, so that
+ * they can still be reached through a fence if one is present.
+ */
+
+ /* Use ASB to access PCICFG if the PHB has been fenced */
+ p->flags |= PHB3_CFG_USE_ASB;
+
+ /* Grab RC bridge control, make it 32-bit */
+ phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &val);
+ stat->brdgCtl = val;
+
+ /* Grab UTL status registers */
+ stat->portStatusReg = hi32(phb3_read_reg_asb(p, UTL_PCIE_PORT_STATUS));
+ stat->rootCmplxStatus = hi32(phb3_read_reg_asb(p, UTL_RC_STATUS));
+ stat->busAgentStatus = hi32(phb3_read_reg_asb(p, UTL_SYS_BUS_AGENT_STATUS));
+
+ /*
+ * Grab various RC PCIe capability registers. All device, slot
+ * and link status are 16-bit, so we grab the pair control+status
+ * for each of them
+ */
+ phb3_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_DEVCTL,
+ &stat->deviceStatus);
+ phb3_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_SLOTCTL,
+ &stat->slotStatus);
+ phb3_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL,
+ &stat->linkStatus);
+
+ /*
+ * I assume those are the standard config space header; cmd & status
+ * together make 32 bits. Secondary status is 16-bit, so I'll clear
+ * the top half on that one.
+ */
+ phb3_pcicfg_read32(&p->phb, 0, PCI_CFG_CMD, &stat->devCmdStatus);
+ phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_SECONDARY_STATUS, &val);
+ stat->devSecStatus = val;
+
+ /* Grab a bunch of AER regs */
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_RERR_STA,
+ &stat->rootErrorStatus);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_UE_STATUS,
+ &stat->uncorrErrorStatus);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_STATUS,
+ &stat->corrErrorStatus);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG0,
+ &stat->tlpHdr1);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG1,
+ &stat->tlpHdr2);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG2,
+ &stat->tlpHdr3);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG3,
+ &stat->tlpHdr4);
+ phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_SRCID,
+ &stat->sourceId);
+
+ /* Restore to AIB */
+ p->flags &= ~PHB3_CFG_USE_ASB;
+
+ /* PEC NFIR */
+ xscom_read(p->chip_id, p->pe_xscom + 0x0, &stat->nFir);
+ xscom_read(p->chip_id, p->pe_xscom + 0x3, &stat->nFirMask);
+ xscom_read(p->chip_id, p->pe_xscom + 0x8, &stat->nFirWOF);
+
+ /* PHB3 inbound and outbound error Regs */
+ stat->phbPlssr = phb3_read_reg_asb(p, PHB_CPU_LOADSTORE_STATUS);
+ stat->phbCsr = phb3_read_reg_asb(p, PHB_DMA_CHAN_STATUS);
+ stat->lemFir = phb3_read_reg_asb(p, PHB_LEM_FIR_ACCUM);
+ stat->lemErrorMask = phb3_read_reg_asb(p, PHB_LEM_ERROR_MASK);
+ stat->lemWOF = phb3_read_reg_asb(p, PHB_LEM_WOF);
+ stat->phbErrorStatus = phb3_read_reg_asb(p, PHB_ERR_STATUS);
+ stat->phbFirstErrorStatus = phb3_read_reg_asb(p, PHB_ERR1_STATUS);
+ stat->phbErrorLog0 = phb3_read_reg_asb(p, PHB_ERR_LOG_0);
+ stat->phbErrorLog1 = phb3_read_reg_asb(p, PHB_ERR_LOG_1);
+ stat->mmioErrorStatus = phb3_read_reg_asb(p, PHB_OUT_ERR_STATUS);
+ stat->mmioFirstErrorStatus = phb3_read_reg_asb(p, PHB_OUT_ERR1_STATUS);
+ stat->mmioErrorLog0 = phb3_read_reg_asb(p, PHB_OUT_ERR_LOG_0);
+ stat->mmioErrorLog1 = phb3_read_reg_asb(p, PHB_OUT_ERR_LOG_1);
+ stat->dma0ErrorStatus = phb3_read_reg_asb(p, PHB_INA_ERR_STATUS);
+ stat->dma0FirstErrorStatus = phb3_read_reg_asb(p, PHB_INA_ERR1_STATUS);
+ stat->dma0ErrorLog0 = phb3_read_reg_asb(p, PHB_INA_ERR_LOG_0);
+ stat->dma0ErrorLog1 = phb3_read_reg_asb(p, PHB_INA_ERR_LOG_1);
+ stat->dma1ErrorStatus = phb3_read_reg_asb(p, PHB_INB_ERR_STATUS);
+ stat->dma1FirstErrorStatus = phb3_read_reg_asb(p, PHB_INB_ERR1_STATUS);
+ stat->dma1ErrorLog0 = phb3_read_reg_asb(p, PHB_INB_ERR_LOG_0);
+ stat->dma1ErrorLog1 = phb3_read_reg_asb(p, PHB_INB_ERR_LOG_1);
+
+ /*
+ * Grab PESTA & B content. The error bit (bit#0) should
+ * be fetched from IODA and the left content from memory
+ * resident tables.
+ */
+ pPEST = (uint64_t *)p->tbl_pest;
+ val64 = PHB_IODA_AD_AUTOINC;
+ val64 = SETFIELD(PHB_IODA_AD_TSEL, val64, IODA2_TBL_PESTA);
+ phb3_write_reg_asb(p, PHB_IODA_ADDR, val64);
+ for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
+ stat->pestA[i] = phb3_read_reg_asb(p, PHB_IODA_DATA0);
+ stat->pestA[i] |= pPEST[2 * i];
+ }
+
+ val64 = PHB_IODA_AD_AUTOINC;
+ val64 = SETFIELD(PHB_IODA_AD_TSEL, val64, IODA2_TBL_PESTB);
+ phb3_write_reg_asb(p, PHB_IODA_ADDR, val64);
+ for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
+ stat->pestB[i] = phb3_read_reg_asb(p, PHB_IODA_DATA0);
+ stat->pestB[i] |= pPEST[2 * i + 1];
+ }
+}
+
+static void phb3_eeh_dump_regs(struct phb3 *p, struct OpalIoPhb3ErrorData *regs)
+{
+ struct OpalIoPhb3ErrorData *s;
+ unsigned int i;
+
+ if (!verbose_eeh)
+ return;
+
+ if (!regs) {
+ s = zalloc(sizeof(struct OpalIoPhb3ErrorData));
+ if (!s) {
+ PHBERR(p, "Failed to allocate error info !\n");
+ return;
+ }
+
+ phb3_read_phb_status(p, s);
+ } else {
+ s = regs;
+ }
+
+ PHBERR(p, "Error detected!\n");
+
+ PHBERR(p, " portStatusReg = %08x\n", s->portStatusReg);
+ PHBERR(p, " rootCmplxStatus = %08x\n", s->rootCmplxStatus);
+ PHBERR(p, " busAgentStatus = %08x\n", s->busAgentStatus);
+
+ PHBERR(p, " errorClass = %016llx\n", s->errorClass);
+ PHBERR(p, " correlator = %016llx\n", s->correlator);
+
+ PHBERR(p, " brdgCtl = %08x\n", s->brdgCtl);
+ PHBERR(p, " deviceStatus = %08x\n", s->deviceStatus);
+ PHBERR(p, " slotStatus = %08x\n", s->slotStatus);
+ PHBERR(p, " linkStatus = %08x\n", s->linkStatus);
+ PHBERR(p, " devCmdStatus = %08x\n", s->devCmdStatus);
+ PHBERR(p, " devSecStatus = %08x\n", s->devSecStatus);
+ PHBERR(p, " rootErrorStatus = %08x\n", s->rootErrorStatus);
+ PHBERR(p, " corrErrorStatus = %08x\n", s->corrErrorStatus);
+ PHBERR(p, " uncorrErrorStatus = %08x\n", s->uncorrErrorStatus);
+
+ /* Byte swap TLP headers so they are the same as the PCIe spec */
+ PHBERR(p, " tlpHdr1 = %08x\n", bswap_32(s->tlpHdr1));
+ PHBERR(p, " tlpHdr2 = %08x\n", bswap_32(s->tlpHdr2));
+ PHBERR(p, " tlpHdr3 = %08x\n", bswap_32(s->tlpHdr3));
+ PHBERR(p, " tlpHdr4 = %08x\n", bswap_32(s->tlpHdr4));
+ PHBERR(p, " sourceId = %08x\n", s->sourceId);
+
+ PHBERR(p, " nFir = %016llx\n", s->nFir);
+ PHBERR(p, " nFirMask = %016llx\n", s->nFirMask);
+ PHBERR(p, " nFirWOF = %016llx\n", s->nFirWOF);
+ PHBERR(p, " phbPlssr = %016llx\n", s->phbPlssr);
+ PHBERR(p, " phbCsr = %016llx\n", s->phbCsr);
+ PHBERR(p, " lemFir = %016llx\n", s->lemFir);
+ PHBERR(p, " lemErrorMask = %016llx\n", s->lemErrorMask);
+ PHBERR(p, " lemWOF = %016llx\n", s->lemWOF);
+
+ PHBERR(p, " phbErrorStatus = %016llx\n", s->phbErrorStatus);
+ PHBERR(p, " phbFirstErrorStatus = %016llx\n", s->phbFirstErrorStatus);
+ PHBERR(p, " phbErrorLog0 = %016llx\n", s->phbErrorLog0);
+ PHBERR(p, " phbErrorLog1 = %016llx\n", s->phbErrorLog1);
+
+ PHBERR(p, " mmioErrorStatus = %016llx\n", s->mmioErrorStatus);
+ PHBERR(p, "mmioFirstErrorStatus = %016llx\n", s->mmioFirstErrorStatus);
+ PHBERR(p, " mmioErrorLog0 = %016llx\n", s->mmioErrorLog0);
+ PHBERR(p, " mmioErrorLog1 = %016llx\n", s->mmioErrorLog1);
+
+ PHBERR(p, " dma0ErrorStatus = %016llx\n", s->dma0ErrorStatus);
+ PHBERR(p, "dma0FirstErrorStatus = %016llx\n", s->dma0FirstErrorStatus);
+ PHBERR(p, " dma0ErrorLog0 = %016llx\n", s->dma0ErrorLog0);
+ PHBERR(p, " dma0ErrorLog1 = %016llx\n", s->dma0ErrorLog1);
+
+ PHBERR(p, " dma1ErrorStatus = %016llx\n", s->dma1ErrorStatus);
+ PHBERR(p, "dma1FirstErrorStatus = %016llx\n", s->dma1FirstErrorStatus);
+ PHBERR(p, " dma1ErrorLog0 = %016llx\n", s->dma1ErrorLog0);
+ PHBERR(p, " dma1ErrorLog1 = %016llx\n", s->dma1ErrorLog1);
+
+ for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
+ if (!s->pestA[i] && !s->pestB[i])
+ continue;
+ PHBERR(p, " PEST[%03x] = %016llx %016llx\n",
+ i, s->pestA[i], s->pestB[i]);
+ }
+
+ if (s != regs)
+ free(s);
+}
+
+static int64_t phb3_msi_get_xive(struct irq_source *is, uint32_t isn,
+ uint16_t *server, uint8_t *prio)
+{
+ struct phb3 *p = is->data;
+ uint32_t chip, index, irq;
+ uint64_t ive;
+
+ chip = p8_irq_to_chip(isn);
+ index = p8_irq_to_phb(isn);
+ irq = PHB3_IRQ_NUM(isn);
+
+ if (chip != p->chip_id ||
+ index != p->index ||
+ irq > PHB3_MSI_IRQ_MAX)
+ return OPAL_PARAMETER;
+
+ /*
+ * Each IVE has 16 bytes in cache. Note that the kernel
+ * should strip the link bits from server field.
+ */
+ ive = p->ive_cache[irq];
+ *server = GETFIELD(IODA2_IVT_SERVER, ive);
+ *prio = GETFIELD(IODA2_IVT_PRIORITY, ive);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_msi_set_xive(struct irq_source *is, uint32_t isn,
+ uint16_t server, uint8_t prio)
+{
+ struct phb3 *p = is->data;
+ uint32_t chip, index;
+ uint64_t *cache, ive_num, data64, m_server, m_prio, ivc;
+ uint32_t *ive;
+
+ chip = p8_irq_to_chip(isn);
+ index = p8_irq_to_phb(isn);
+ ive_num = PHB3_IRQ_NUM(isn);
+
+ if (p->broken || !p->tbl_rtt)
+ return OPAL_HARDWARE;
+ if (chip != p->chip_id ||
+ index != p->index ||
+ ive_num > PHB3_MSI_IRQ_MAX)
+ return OPAL_PARAMETER;
+
+ phb_lock(&p->phb);
+
+ /*
+ * We need to strip the link bits from the server. As Milton told
+ * me, the server number is laid out as follows, with the remaining
+ * bits unused: node/chip/core/thread/link = 2/3/4/3/2
+ *
+ * Note: the caller has already folded the link bits into the server.
+ */
+ m_server = server;
+ m_prio = prio;
+
+ cache = &p->ive_cache[ive_num];
+ *cache = SETFIELD(IODA2_IVT_SERVER, *cache, m_server);
+ *cache = SETFIELD(IODA2_IVT_PRIORITY, *cache, m_prio);
+
+ /*
+ * Update the IVT and IVC. We need to use the IVC update register
+ * to do that. Each IVE in the table is 128 bytes.
+ */
+ ive = (uint32_t *)(p->tbl_ivt + ive_num * IVT_TABLE_STRIDE * 8);
+ data64 = PHB_IVC_UPDATE_ENABLE_SERVER | PHB_IVC_UPDATE_ENABLE_PRI;
+ data64 = SETFIELD(PHB_IVC_UPDATE_SID, data64, ive_num);
+ data64 = SETFIELD(PHB_IVC_UPDATE_SERVER, data64, m_server);
+ data64 = SETFIELD(PHB_IVC_UPDATE_PRI, data64, m_prio);
+
+ /*
+ * We don't use SETFIELD because we are doing a 32-bit access
+ * in order to avoid touching the P and Q bits
+ */
+ *ive = (m_server << 8) | m_prio;
+ out_be64(p->regs + PHB_IVC_UPDATE, data64);
+
+ if (prio != 0xff) {
+ /*
+ * Handle Q bit if we're going to enable the
+ * interrupt. The OS should make sure the interrupt
+ * handler has been installed already.
+ */
+ if (phb3_pci_msi_check_q(p, ive_num))
+ phb3_pci_msi_flush_ive(p, ive_num);
+ } else {
+ /* Read from random PHB reg to force flush */
+ in_be64(p->regs + PHB_IVC_UPDATE);
+
+ /* Order with subsequent read of Q */
+ sync();
+
+ /* Clear P, Q and Gen, preserve PE# */
+ ive[1] &= 0x0000ffff;
+
+ /*
+ * Update the IVC with a match against the old gen
+ * count. No need to worry about racing with P being
+ * set in the cache since IRQ is masked at this point.
+ */
+ ivc = SETFIELD(PHB_IVC_UPDATE_SID, 0ul, ive_num) |
+ PHB_IVC_UPDATE_ENABLE_P |
+ PHB_IVC_UPDATE_ENABLE_Q |
+ PHB_IVC_UPDATE_ENABLE_GEN;
+ out_be64(p->regs + PHB_IVC_UPDATE, ivc);
+ }
+
+ phb_unlock(&p->phb);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_lsi_get_xive(struct irq_source *is, uint32_t isn,
+ uint16_t *server, uint8_t *prio)
+{
+ struct phb3 *p = is->data;
+ uint32_t chip, index, irq;
+ uint64_t lxive;
+
+ chip = p8_irq_to_chip(isn);
+ index = p8_irq_to_phb(isn);
+ irq = PHB3_IRQ_NUM(isn);
+
+ if (chip != p->chip_id ||
+ index != p->index ||
+ irq < PHB3_LSI_IRQ_MIN ||
+ irq > PHB3_LSI_IRQ_MAX)
+ return OPAL_PARAMETER;
+
+ lxive = p->lxive_cache[irq - PHB3_LSI_IRQ_MIN];
+ *server = GETFIELD(IODA2_LXIVT_SERVER, lxive);
+ *prio = GETFIELD(IODA2_LXIVT_PRIORITY, lxive);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_lsi_set_xive(struct irq_source *is, uint32_t isn,
+ uint16_t server, uint8_t prio)
+{
+ struct phb3 *p = is->data;
+ uint32_t chip, index, irq, entry;
+ uint64_t lxive;
+
+ chip = p8_irq_to_chip(isn);
+ index = p8_irq_to_phb(isn);
+ irq = PHB3_IRQ_NUM(isn);
+
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ if (chip != p->chip_id ||
+ index != p->index ||
+ irq < PHB3_LSI_IRQ_MIN ||
+ irq > PHB3_LSI_IRQ_MAX)
+ return OPAL_PARAMETER;
+
+ lxive = SETFIELD(IODA2_LXIVT_SERVER, 0ul, server);
+ lxive = SETFIELD(IODA2_LXIVT_PRIORITY, lxive, prio);
+
+ phb_lock(&p->phb);
+
+ /*
+ * We cache the arguments because we have to mangle
+ * them in order to hijack 3 bits of priority to extend
+ * the server number.
+ */
+ entry = irq - PHB3_LSI_IRQ_MIN;
+ p->lxive_cache[entry] = lxive;
+
+ /* We use HRT entry 0 always for now */
+ phb3_ioda_sel(p, IODA2_TBL_LXIVT, entry, false);
+ lxive = in_be64(p->regs + PHB_IODA_DATA0);
+ lxive = SETFIELD(IODA2_LXIVT_SERVER, lxive, server);
+ lxive = SETFIELD(IODA2_LXIVT_PRIORITY, lxive, prio);
+ out_be64(p->regs + PHB_IODA_DATA0, lxive);
+
+ phb_unlock(&p->phb);
+
+ return OPAL_SUCCESS;
+}
+
+static void phb3_err_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct phb3 *p = is->data;
+
+ PHBDBG(p, "Got interrupt 0x%08x\n", isn);
+
+ /* Update pending event */
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+
+ /* If the PHB is broken, go away */
+ if (p->broken)
+ return;
+
+ /*
+ * Mark the PHB as having a pending error so that the OS
+ * can handle it at a later point.
+ */
+ phb3_set_err_pending(p, true);
+}
+
+static uint64_t phb3_lsi_attributes(struct irq_source *is, uint32_t isn)
+{
+#ifndef DISABLE_ERR_INTS
+ struct phb3 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+
+ if (idx == PHB3_LSI_PCIE_INF || idx == PHB3_LSI_PCIE_ER)
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_LSI;
+#endif
+ return IRQ_ATTR_TARGET_LINUX;
+}
+
+/* MSIs (OS owned) */
+static const struct irq_source_ops phb3_msi_irq_ops = {
+ .get_xive = phb3_msi_get_xive,
+ .set_xive = phb3_msi_set_xive,
+};
+
+/* LSIs (OS owned) */
+static const struct irq_source_ops phb3_lsi_irq_ops = {
+ .get_xive = phb3_lsi_get_xive,
+ .set_xive = phb3_lsi_set_xive,
+ .attributes = phb3_lsi_attributes,
+ .interrupt = phb3_err_interrupt,
+};
+
+static int64_t phb3_set_pe(struct phb *phb,
+ uint64_t pe_number,
+ uint64_t bdfn,
+ uint8_t bcompare,
+ uint8_t dcompare,
+ uint8_t fcompare,
+ uint8_t action)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t mask, val, tmp, idx;
+ int32_t all = 0;
+ uint16_t *rte;
+
+ /* Sanity check */
+ if (!p->tbl_rtt)
+ return OPAL_HARDWARE;
+ if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+ if (pe_number >= PHB3_MAX_PE_NUM || bdfn > 0xffff ||
+ bcompare > OpalPciBusAll ||
+ dcompare > OPAL_COMPARE_RID_DEVICE_NUMBER ||
+ fcompare > OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ return OPAL_PARAMETER;
+
+ /* Figure out the RID range */
+ if (bcompare == OpalPciBusAny) {
+ mask = 0x0;
+ val = 0x0;
+ all = 0x1;
+ } else {
+ tmp = ((0x1 << (bcompare + 1)) - 1) << (15 - bcompare);
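+ /* e.g. bcompare = 7 gives mask 0xff00, i.e. the full bus number field */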
+ mask = tmp;
+ val = bdfn & tmp;
+ }
+
+ if (dcompare == OPAL_IGNORE_RID_DEVICE_NUMBER)
+ all = (all << 1) | 0x1;
+ else {
+ mask |= 0xf8;
+ val |= (bdfn & 0xf8);
+ }
+
+ if (fcompare == OPAL_IGNORE_RID_FUNCTION_NUMBER)
+ all = (all << 1) | 0x1;
+ else {
+ mask |= 0x7;
+ val |= PCI_FUNC(bdfn);
+ }
+
+ /* Map or unmap the RTT range */
+ if (all == 0x7) {
+ if (action == OPAL_MAP_PE) {
+ for (idx = 0; idx < RTT_TABLE_ENTRIES; idx++)
+ p->rte_cache[idx] = pe_number;
+ } else {
+ for ( idx = 0; idx < ARRAY_SIZE(p->rte_cache); idx++)
+ p->rte_cache[idx] = PHB3_RESERVED_PE_NUM;
+ }
+ memcpy((void *)p->tbl_rtt, p->rte_cache, RTT_TABLE_SIZE);
+ } else {
+ rte = (uint16_t *)p->tbl_rtt;
+ for (idx = 0; idx < RTT_TABLE_ENTRIES; idx++, rte++) {
+ if ((idx & mask) != val)
+ continue;
+ if (action == OPAL_MAP_PE)
+ p->rte_cache[idx] = pe_number;
+ else
+ p->rte_cache[idx] = PHB3_RESERVED_PE_NUM;
+ *rte = p->rte_cache[idx];
+ }
+ }
+
+ /* Invalidate the entire RTC */
+ out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_set_peltv(struct phb *phb,
+ uint32_t parent_pe,
+ uint32_t child_pe,
+ uint8_t state)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint8_t *peltv;
+ uint32_t idx, mask;
+
+ /* Sanity check */
+ if (!p->tbl_peltv)
+ return OPAL_HARDWARE;
+ if (parent_pe >= PHB3_MAX_PE_NUM || child_pe >= PHB3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ /* Find index for parent PE */
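+ /*
+ * Worked example (assuming PHB3_MAX_PE_NUM is 256): each parent PE
+ * owns a 256-bit (32-byte) vector, so child PE 10 lands in byte 1
+ * of that vector with mask 0x20.
+ */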
+ idx = parent_pe * (PHB3_MAX_PE_NUM / 8);
+ idx += (child_pe / 8);
+ mask = 0x1 << (7 - (child_pe % 8));
+
+ peltv = (uint8_t *)p->tbl_peltv;
+ peltv += idx;
+ if (state) {
+ *peltv |= mask;
+ p->peltv_cache[idx] |= mask;
+ } else {
+ *peltv &= ~mask;
+ p->peltv_cache[idx] &= ~mask;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static void phb3_prepare_link_change(struct pci_slot *slot,
+ bool is_up)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ struct pci_device *pd = slot->pd;
+ uint32_t reg32;
+
+ p->has_link = is_up;
+ if (!is_up) {
+ if (!pd || !pd->slot || !pd->slot->surprise_pluggable) {
+ /* Mask PCIE port interrupts */
+ out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN,
+ 0xad42800000000000UL);
+
+ pci_cfg_read32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_UE_MASK, &reg32);
+ reg32 |= PCIECAP_AER_UE_MASK_SURPRISE_DOWN;
+ pci_cfg_write32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_UE_MASK, reg32);
+ }
+
+ /* Mask AER receiver error */
+ phb3_pcicfg_read32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_CE_MASK, &reg32);
+ reg32 |= PCIECAP_AER_CE_RECVR_ERR;
+ phb3_pcicfg_write32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_CE_MASK, reg32);
+
+ /* Block PCI-CFG access */
+ p->flags |= PHB3_CFG_BLOCKED;
+ } else {
+ /* Clear AER receiver error status */
+ phb3_pcicfg_write32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_CE_STATUS,
+ PCIECAP_AER_CE_RECVR_ERR);
+
+ /* Unmask receiver error status in AER */
+ phb3_pcicfg_read32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_CE_MASK, &reg32);
+ reg32 &= ~PCIECAP_AER_CE_RECVR_ERR;
+ phb3_pcicfg_write32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_CE_MASK, reg32);
+
+ /* Clear spurious errors and enable PCIE port interrupts */
+ out_be64(p->regs + UTL_PCIE_PORT_STATUS,
+ 0xffdfffffffffffffUL);
+
+ if (!pd || !pd->slot || !pd->slot->surprise_pluggable) {
+ out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN,
+ 0xad52800000000000UL);
+
+ pci_cfg_read32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_UE_MASK, &reg32);
+ reg32 &= ~PCIECAP_AER_UE_MASK_SURPRISE_DOWN;
+ pci_cfg_write32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_UE_MASK, reg32);
+ }
+
+ /* Don't block PCI-CFG */
+ p->flags &= ~PHB3_CFG_BLOCKED;
+
+ /*
+ * We might lose the bus numbers during the reset operation
+ * and we need to restore them. Otherwise, some adapters (e.g.
+ * IPR) can't be probed properly by the kernel. We don't need
+ * to restore bus numbers for every kind of reset; however,
+ * it's not harmful to always restore them, which
+ * simplifies the logic.
+ */
+ pci_restore_bridge_buses(slot->phb, slot->pd);
+ if (slot->phb->ops->device_init)
+ pci_walk_dev(slot->phb, slot->pd,
+ slot->phb->ops->device_init, NULL);
+ }
+}
+
+static int64_t phb3_get_presence_state(struct pci_slot *slot, uint8_t *val)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ uint64_t hp_override;
+
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /*
+ * On P8, the slot status isn't wired up properly, so we have
+ * to use the hotplug override A/B bits.
+ */
+ hp_override = in_be64(p->regs + PHB_HOTPLUG_OVERRIDE);
+ if ((hp_override & PHB_HPOVR_PRESENCE_A) &&
+ (hp_override & PHB_HPOVR_PRESENCE_B))
+ *val = OPAL_PCI_SLOT_EMPTY;
+ else
+ *val = OPAL_PCI_SLOT_PRESENT;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_get_link_state(struct pci_slot *slot, uint8_t *val)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ uint64_t reg;
+ uint16_t state;
+ int64_t rc;
+
+ /* Check whether the link is up and, if so, report the negotiated width */
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!(reg & PHB_PCIE_DLP_TC_DL_LINKACT)) {
+ *val = 0;
+ return OPAL_SUCCESS;
+ }
+
+ rc = phb3_pcicfg_read16(&p->phb, 0,
+ p->ecap + PCICAP_EXP_LSTAT, &state);
+ if (rc != OPAL_SUCCESS) {
+ PHBERR(p, "%s: Error %lld getting link state\n", __func__, rc);
+ return OPAL_HARDWARE;
+ }
+
+ if (state & PCICAP_EXP_LSTAT_DLLL_ACT)
+ *val = ((state & PCICAP_EXP_LSTAT_WIDTH) >> 4);
+ else
+ *val = 0;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_retry_state(struct pci_slot *slot)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+
+ if (slot->retry_state == PCI_SLOT_STATE_NORMAL)
+ return OPAL_WRONG_STATE;
+
+ PHBDBG(p, "Retry state %08x\n", slot->retry_state);
+ slot->delay_tgt_tb = 0;
+ pci_slot_set_state(slot, slot->retry_state);
+ slot->retry_state = PCI_SLOT_STATE_NORMAL;
+ return slot->ops.run_sm(slot);
+}
+
+static int64_t phb3_poll_link(struct pci_slot *slot)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ uint64_t reg;
+ int64_t rc;
+
+ switch (slot->state) {
+ case PHB3_SLOT_NORMAL:
+ case PHB3_SLOT_LINK_START:
+ PHBDBG(p, "LINK: Start polling\n");
+ slot->retries = PHB3_LINK_ELECTRICAL_RETRIES;
+ pci_slot_set_state(slot, PHB3_SLOT_LINK_WAIT_ELECTRICAL);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ case PHB3_SLOT_LINK_WAIT_ELECTRICAL:
+ /*
+ * Wait for the link electrical connection to be
+ * established (shorter timeout). This allows us to
+ * work around spurious presence detect on some machines
+ * without waiting 10s each time.
+ *
+ * Note: We *also* check for the full link up bit here
+ * because simics doesn't seem to implement the electrical
+ * link bit at all
+ */
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (reg & (PHB_PCIE_DLP_INBAND_PRESENCE |
+ PHB_PCIE_DLP_TC_DL_LINKACT)) {
+ PHBDBG(p, "LINK: Electrical link detected\n");
+ pci_slot_set_state(slot, PHB3_SLOT_LINK_WAIT);
+ slot->retries = PHB3_LINK_WAIT_RETRIES;
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ }
+
+ if (slot->retries-- == 0) {
+ PHBDBG(p, "LINK: Timeout waiting for electrical link\n");
+ PHBDBG(p, "LINK: DLP train control: 0x%016llx\n", reg);
+ rc = phb3_retry_state(slot);
+ if (rc >= OPAL_SUCCESS)
+ return rc;
+
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return OPAL_SUCCESS;
+ }
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ case PHB3_SLOT_LINK_WAIT:
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (reg & PHB_PCIE_DLP_TC_DL_LINKACT) {
+ PHBDBG(p, "LINK: Link is up\n");
+ if (slot->ops.prepare_link_change)
+ slot->ops.prepare_link_change(slot, true);
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return OPAL_SUCCESS;
+ }
+
+ if (slot->retries-- == 0) {
+ PHBDBG(p, "LINK: Timeout waiting for link up\n");
+ PHBDBG(p, "LINK: DLP train control: 0x%016llx\n", reg);
+ rc = phb3_retry_state(slot);
+ if (rc >= OPAL_SUCCESS)
+ return rc;
+
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return OPAL_SUCCESS;
+ }
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ default:
+ PHBERR(p, "LINK: Unexpected slot state %08x\n",
+ slot->state);
+ }
+
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t phb3_hreset(struct pci_slot *slot)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ uint16_t brctl;
+ uint8_t presence = 1;
+
+ switch (slot->state) {
+ case PHB3_SLOT_NORMAL:
+ PHBDBG(p, "HRESET: Starts\n");
+ if (slot->ops.get_presence_state)
+ slot->ops.get_presence_state(slot, &presence);
+ if (!presence) {
+ PHBDBG(p, "HRESET: No device\n");
+ return OPAL_SUCCESS;
+ }
+
+ PHBDBG(p, "HRESET: Prepare for link down\n");
+ if (slot->ops.prepare_link_change)
+ slot->ops.prepare_link_change(slot, false);
+ /* fall through */
+ case PHB3_SLOT_HRESET_START:
+ PHBDBG(p, "HRESET: Assert\n");
+
+ phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl);
+ brctl |= PCI_CFG_BRCTL_SECONDARY_RESET;
+ phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl);
+ pci_slot_set_state(slot, PHB3_SLOT_HRESET_DELAY);
+
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ case PHB3_SLOT_HRESET_DELAY:
+ PHBDBG(p, "HRESET: Deassert\n");
+
+ phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl);
+ brctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET;
+ phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl);
+
+ /*
+ * Due to some oddball adapters bouncing the link
+ * training a couple of times, we wait for a full second
+ * before we start checking the link status; otherwise
+ * we can get a spurious link down interrupt which
+ * causes us to EEH immediately.
+ */
+ pci_slot_set_state(slot, PHB3_SLOT_HRESET_DELAY2);
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ case PHB3_SLOT_HRESET_DELAY2:
+ pci_slot_set_state(slot, PHB3_SLOT_LINK_START);
+ return slot->ops.poll_link(slot);
+ default:
+ PHBERR(p, "Unexpected slot state %08x\n", slot->state);
+ }
+
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t phb3_freset(struct pci_slot *slot)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ uint8_t presence = 1;
+ uint64_t reg;
+
+ switch(slot->state) {
+ case PHB3_SLOT_NORMAL:
+ PHBDBG(p, "FRESET: Starts\n");
+
+ /* Nothing to do without adapter connected */
+ if (slot->ops.get_presence_state)
+ slot->ops.get_presence_state(slot, &presence);
+ if (!presence) {
+ PHBDBG(p, "FRESET: No device\n");
+ return OPAL_SUCCESS;
+ }
+
+ PHBDBG(p, "FRESET: Prepare for link down\n");
+ slot->retry_state = PHB3_SLOT_FRESET_START;
+ if (slot->ops.prepare_link_change)
+ slot->ops.prepare_link_change(slot, false);
+ /* fall through */
+ case PHB3_SLOT_FRESET_START:
+ if (!p->skip_perst) {
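+ /*
+ * Bit 0x2000000000000000 (PPC bit 2) of PHB_RESET drives PERST:
+ * cleared to assert the reset, set again to deassert it.
+ */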
+ PHBDBG(p, "FRESET: Assert\n");
+ reg = in_be64(p->regs + PHB_RESET);
+ reg &= ~0x2000000000000000ul;
+ out_be64(p->regs + PHB_RESET, reg);
+ pci_slot_set_state(slot,
+ PHB3_SLOT_FRESET_ASSERT_DELAY);
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ }
+
+ /* To skip the assert during boot time */
+ PHBDBG(p, "FRESET: Assert skipped\n");
+ pci_slot_set_state(slot, PHB3_SLOT_FRESET_ASSERT_DELAY);
+ p->skip_perst = false;
+ /* fall through */
+ case PHB3_SLOT_FRESET_ASSERT_DELAY:
+ PHBDBG(p, "FRESET: Deassert\n");
+ reg = in_be64(p->regs + PHB_RESET);
+ reg |= 0x2000000000000000ul;
+ out_be64(p->regs + PHB_RESET, reg);
+ pci_slot_set_state(slot,
+ PHB3_SLOT_FRESET_DEASSERT_DELAY);
+
+ /* CAPP FPGA requires 1s to flash before polling link */
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ case PHB3_SLOT_FRESET_DEASSERT_DELAY:
+ pci_slot_set_state(slot, PHB3_SLOT_LINK_START);
+ return slot->ops.poll_link(slot);
+ default:
+ PHBERR(p, "Unexpected slot state %08x\n", slot->state);
+ }
+
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t load_capp_ucode(struct phb3 *p)
+{
+ int64_t rc;
+
+ if (p->index > PHB3_CAPP_MAX_PHB_INDEX(p))
+ return OPAL_HARDWARE;
+
+ /* 0x434150504c494448 = 'CAPPLIDH' in ASCII */
+ rc = capp_load_ucode(p->chip_id, p->phb.opal_id, p->index,
+ 0x434150504c494448UL, PHB3_CAPP_REG_OFFSET(p),
+ CAPP_APC_MASTER_ARRAY_ADDR_REG,
+ CAPP_APC_MASTER_ARRAY_WRITE_REG,
+ CAPP_SNP_ARRAY_ADDR_REG,
+ CAPP_SNP_ARRAY_WRITE_REG);
+ return rc;
+}
+
+static void do_capp_recovery_scoms(struct phb3 *p)
+{
+ uint64_t reg;
+ uint32_t offset;
+
+ PHBDBG(p, "Doing CAPP recovery scoms\n");
+
+ offset = PHB3_CAPP_REG_OFFSET(p);
+ /* disable snoops */
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0);
+ load_capp_ucode(p);
+ /* clear err rpt reg*/
+ xscom_write(p->chip_id, CAPP_ERR_RPT_CLR + offset, 0);
+ /* clear capp fir */
+ xscom_write(p->chip_id, CAPP_FIR + offset, 0);
+
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+ reg &= ~(PPC_BIT(0) | PPC_BIT(1));
+ xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg);
+}
+
+/*
+ * Disable CAPI mode on a PHB.
+ *
+ * Must be done while PHB is fenced and in recovery. Leaves CAPP in recovery -
+ * we can't come out of recovery until the PHB has been reinitialised.
+ *
+ * We don't reset generic error registers here - we rely on phb3_init_hw() to
+ * do that.
+ *
+ * Sets PHB3_CAPP_DISABLING flag when complete.
+ */
+static void disable_capi_mode(struct phb3 *p)
+{
+ struct proc_chip *chip = get_chip(p->chip_id);
+ uint64_t reg;
+ uint32_t offset = PHB3_CAPP_REG_OFFSET(p);
+
+ lock(&capi_lock);
+
+ xscom_read(p->chip_id, PE_CAPP_EN + PE_REG_OFFSET(p), &reg);
+ if (!(reg & PPC_BIT(0))) {
+ /* Not in CAPI mode, no action required */
+ goto out;
+ }
+
+ PHBDBG(p, "CAPP: Disabling CAPI mode\n");
+ if (!(chip->capp_phb3_attached_mask & (1 << p->index)))
+ PHBERR(p, "CAPP: CAPP attached mask not set!\n");
+
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+ if (!(reg & PPC_BIT(0))) {
+ PHBERR(p, "CAPP: not in recovery, can't disable CAPI mode!\n");
+ goto out;
+ }
+
+ /* Snoop CAPI Configuration Register - disable snooping */
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0ull);
+
+ /* APC Master PB Control Register - disable examining cResps */
+ xscom_read(p->chip_id, APC_MASTER_PB_CTRL + offset, &reg);
+ reg &= ~PPC_BIT(3);
+ xscom_write(p->chip_id, APC_MASTER_PB_CTRL + offset, reg);
+
+ /* APC Master Config Register - de-select PHBs */
+ xscom_read(p->chip_id, APC_MASTER_CAPI_CTRL + offset, &reg);
+ reg &= ~PPC_BITMASK(1, 3);
+ xscom_write(p->chip_id, APC_MASTER_CAPI_CTRL + offset, reg);
+
+ /* PE Bus AIB Mode Bits */
+ xscom_read(p->chip_id, p->pci_xscom + 0xf, &reg);
+ reg |= PPC_BITMASK(7, 8); /* Ch2 command credit */
+ reg &= ~PPC_BITMASK(40, 42); /* Disable HOL blocking */
+ xscom_write(p->chip_id, p->pci_xscom + 0xf, reg);
+
+ /* PCI Hardware Configuration 0 Register - all store queues free */
+ xscom_read(p->chip_id, p->pe_xscom + 0x18, &reg);
+ reg &= ~PPC_BIT(14);
+ reg |= PPC_BIT(15);
+ xscom_write(p->chip_id, p->pe_xscom + 0x18, reg);
+
+ /*
+ * PCI Hardware Configuration 1 Register - enable read response
+ * arrival/address request ordering
+ */
+ xscom_read(p->chip_id, p->pe_xscom + 0x19, &reg);
+ reg |= PPC_BITMASK(17,18);
+ xscom_write(p->chip_id, p->pe_xscom + 0x19, reg);
+
+ /*
+ * AIB TX Command Credit Register - set AIB credit values back to
+ * normal
+ */
+ xscom_read(p->chip_id, p->pci_xscom + 0xd, &reg);
+ reg |= PPC_BIT(42);
+ reg &= ~PPC_BITMASK(43, 47);
+ xscom_write(p->chip_id, p->pci_xscom + 0xd, reg);
+
+ /* AIB TX Credit Init Timer - reset timer */
+ xscom_write(p->chip_id, p->pci_xscom + 0xc, 0xff00000000000000UL);
+
+ /*
+ * PBCQ Mode Control Register - set dcache handling to normal, not CAPP
+ * mode
+ */
+ xscom_read(p->chip_id, p->pe_xscom + 0xb, &reg);
+ reg &= ~PPC_BIT(25);
+ xscom_write(p->chip_id, p->pe_xscom + 0xb, reg);
+
+ /* Registers touched by phb3_init_capp_regs() */
+
+ /* CAPP Transport Control Register */
+ xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, 0x0001000000000000UL);
+
+ /* Canned pResp Map Register 0/1/2 */
+ xscom_write(p->chip_id, CANNED_PRESP_MAP0 + offset, 0);
+ xscom_write(p->chip_id, CANNED_PRESP_MAP1 + offset, 0);
+ xscom_write(p->chip_id, CANNED_PRESP_MAP2 + offset, 0);
+
+ /* Flush SUE State Map Register */
+ xscom_write(p->chip_id, FLUSH_SUE_STATE_MAP + offset, 0);
+
+ /* CAPP Epoch and Recovery Timers Control Register */
+ xscom_write(p->chip_id, CAPP_EPOCH_TIMER_CTRL + offset, 0);
+
+ /* PE Secure CAPP Enable Register - we're all done! Disable CAPP mode! */
+ xscom_write(p->chip_id, PE_CAPP_EN + PE_REG_OFFSET(p), 0ull);
+
+ /* Trigger CAPP recovery scoms after reinit */
+ p->flags |= PHB3_CAPP_DISABLING;
+
+ chip->capp_phb3_attached_mask &= ~(1 << p->index);
+
+out:
+ unlock(&capi_lock);
+}
+
+static int64_t phb3_creset(struct pci_slot *slot)
+{
+ struct phb3 *p = phb_to_phb3(slot->phb);
+ uint64_t cqsts, val;
+
+ switch (slot->state) {
+ case PHB3_SLOT_NORMAL:
+ case PHB3_SLOT_CRESET_START:
+ PHBDBG(p, "CRESET: Starts\n");
+
+ /* do steps 3-5 of capp recovery procedure */
+ if (p->flags & PHB3_CAPP_RECOVERY)
+ do_capp_recovery_scoms(p);
+
+ /*
+ * Users might be doing error injection through the PBCQ
+ * Error Inject Control Register. Without clearing that, we
+ * will get recursive errors during recovery and it will
+ * eventually fail.
+ */
+ xscom_write(p->chip_id, p->pe_xscom + 0xa, 0x0ul);
+
+ /*
+ * We might have escalated a frozen state on a non-existing PE
+ * to a fenced PHB. In that case the PHB isn't fenced at the
+ * hardware level and it's not safe to do an ETU reset, so we
+ * have to force-fence the PHB prior to the ETU reset.
+ */
+ if (!phb3_fenced(p))
+ xscom_write(p->chip_id, p->pe_xscom + 0x2, 0x000000f000000000ull);
+
+ /* Now that we're guaranteed to be fenced, disable CAPI mode */
+ if (!(p->flags & PHB3_CAPP_RECOVERY))
+ disable_capi_mode(p);
+
+ /* Clear errors in NFIR and raise ETU reset */
+ xscom_read(p->chip_id, p->pe_xscom + 0x0, &p->nfir_cache);
+
+ xscom_read(p->chip_id, p->spci_xscom + 1, &val);/* HW275117 */
+ xscom_write(p->chip_id, p->pci_xscom + 0xa,
+ 0x8000000000000000UL);
+ pci_slot_set_state(slot, PHB3_SLOT_CRESET_WAIT_CQ);
+ slot->retries = 500;
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+ case PHB3_SLOT_CRESET_WAIT_CQ:
+ xscom_read(p->chip_id, p->pe_xscom + 0x1c, &val);
+ xscom_read(p->chip_id, p->pe_xscom + 0x1d, &val);
+ xscom_read(p->chip_id, p->pe_xscom + 0x1e, &val);
+ xscom_read(p->chip_id, p->pe_xscom + 0xf, &cqsts);
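+ /* The top two bits of the CQ status flag outstanding transactions */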
+ if (!(cqsts & 0xC000000000000000UL)) {
+ PHBDBG(p, "CRESET: No pending transactions\n");
+ xscom_write(p->chip_id, p->pe_xscom + 0x1, ~p->nfir_cache);
+
+ pci_slot_set_state(slot, PHB3_SLOT_CRESET_REINIT);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ }
+
+ if (slot->retries-- == 0) {
+ PHBERR(p, "Timeout waiting for pending transaction\n");
+ goto error;
+ }
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+ case PHB3_SLOT_CRESET_REINIT:
+ PHBDBG(p, "CRESET: Reinitialization\n");
+
+ /*
+ * Clear AIB fenced state. Otherwise, we can't access the
+ * PCI config space of root complex when reinitializing
+ * the PHB.
+ */
+ p->flags &= ~PHB3_AIB_FENCED;
+ p->flags &= ~PHB3_CAPP_RECOVERY;
+ phb3_init_hw(p, false);
+
+ if (p->flags & PHB3_CAPP_DISABLING) {
+ do_capp_recovery_scoms(p);
+ p->flags &= ~PHB3_CAPP_DISABLING;
+ }
+
+ pci_slot_set_state(slot, PHB3_SLOT_CRESET_FRESET);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ case PHB3_SLOT_CRESET_FRESET:
+ pci_slot_set_state(slot, PHB3_SLOT_NORMAL);
+ return slot->ops.freset(slot);
+ default:
+ PHBERR(p, "CRESET: Unexpected slot state %08x\n",
+ slot->state);
+ }
+
+error:
+ return OPAL_HARDWARE;
+}
+
+/*
+ * Initialize root complex slot, which is mainly used to
+ * do fundamental reset before PCI enumeration in PCI core.
+ * When probing root complex and building its real slot,
+ * the operations will be copied over.
+ */
+static struct pci_slot *phb3_slot_create(struct phb *phb)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(phb, NULL);
+ if (!slot)
+ return slot;
+
+ /* Elementary functions */
+ slot->ops.get_presence_state = phb3_get_presence_state;
+ slot->ops.get_link_state = phb3_get_link_state;
+ slot->ops.get_power_state = NULL;
+ slot->ops.get_attention_state = NULL;
+ slot->ops.get_latch_state = NULL;
+ slot->ops.set_power_state = NULL;
+ slot->ops.set_attention_state = NULL;
+
+ /*
+ * For PHB slots, we have to split the fundamental reset
+ * into 2 steps. We might not have the first step which
+ * is to power off/on the slot, or it's controlled by
+ * individual platforms.
+ */
+ slot->ops.prepare_link_change = phb3_prepare_link_change;
+ slot->ops.poll_link = phb3_poll_link;
+ slot->ops.hreset = phb3_hreset;
+ slot->ops.freset = phb3_freset;
+ slot->ops.creset = phb3_creset;
+
+ return slot;
+}
+
+static int64_t phb3_eeh_freeze_status(struct phb *phb, uint64_t pe_number,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t peev_bit = PPC_BIT(pe_number & 0x3f);
+ uint64_t peev, pesta, pestb;
+
+ /* Defaults: not frozen */
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+
+ /* Check dead */
+ if (p->broken) {
+ *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ if (severity)
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ return OPAL_HARDWARE;
+ }
+
+ /* Check fence and CAPP recovery */
+ if (phb3_fenced(p) || (p->flags & PHB3_CAPP_RECOVERY)) {
+ *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ if (severity)
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ return OPAL_SUCCESS;
+ }
+
+ /* Check the PEEV */
+ phb3_ioda_sel(p, IODA2_TBL_PEEV, pe_number / 64, false);
+ peev = in_be64(p->regs + PHB_IODA_DATA0);
+ if (!(peev & peev_bit))
+ return OPAL_SUCCESS;
+
+ /* Indicate that we have an ER pending */
+ phb3_set_err_pending(p, true);
+ if (severity)
+ *severity = OPAL_EEH_SEV_PE_ER;
+
+ /* Read the PESTA & PESTB */
+ phb3_ioda_sel(p, IODA2_TBL_PESTA, pe_number, false);
+ pesta = in_be64(p->regs + PHB_IODA_DATA0);
+ phb3_ioda_sel(p, IODA2_TBL_PESTB, pe_number, false);
+ pestb = in_be64(p->regs + PHB_IODA_DATA0);
+
+ /* Convert them */
+ if (pesta & IODA2_PESTA_MMIO_FROZEN)
+ *freeze_state |= OPAL_EEH_STOPPED_MMIO_FREEZE;
+ if (pestb & IODA2_PESTB_DMA_STOPPED)
+ *freeze_state |= OPAL_EEH_STOPPED_DMA_FREEZE;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_eeh_freeze_clear(struct phb *phb, uint64_t pe_number,
+ uint64_t eeh_action_token)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t err, peev[4];
+ int32_t i;
+ bool frozen_pe = false;
+
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /* Check the error summary. If nothing is set, move on to
+ * clearing the PESTs, which can contain a freeze state from a
+ * previous error or one set explicitly by the user.
+ */
+ err = in_be64(p->regs + PHB_ETU_ERR_SUMMARY);
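+ /* An all-ones read usually means the ETU is inaccessible because the PHB is fenced */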
+ if (err == 0xffffffffffffffffUL) {
+ if (phb3_fenced(p)) {
+ PHBERR(p, "eeh_freeze_clear on fenced PHB\n");
+ return OPAL_HARDWARE;
+ }
+ }
+ if (err != 0)
+ phb3_err_ER_clear(p);
+
+ /*
+ * We also have the PEEV in system memory; accessing it there
+ * directly would give better performance.
+ */
+ if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO) {
+ phb3_ioda_sel(p, IODA2_TBL_PESTA, pe_number, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ }
+ if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_DMA) {
+ phb3_ioda_sel(p, IODA2_TBL_PESTB, pe_number, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ }
+
+ /* Update ER pending indication */
+ phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true);
+ for (i = 0; i < ARRAY_SIZE(peev); i++) {
+ peev[i] = in_be64(p->regs + PHB_IODA_DATA0);
+ if (peev[i]) {
+ frozen_pe = true;
+ break;
+ }
+ }
+ if (frozen_pe) {
+ p->err.err_src = PHB3_ERR_SRC_PHB;
+ p->err.err_class = PHB3_ERR_CLASS_ER;
+ p->err.err_bit = -1;
+ phb3_set_err_pending(p, true);
+ } else
+ phb3_set_err_pending(p, false);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_eeh_freeze_set(struct phb *phb, uint64_t pe_number,
+ uint64_t eeh_action_token)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t data;
+
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ if (pe_number >= PHB3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ if (eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_MMIO &&
+ eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_DMA &&
+ eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_ALL)
+ return OPAL_PARAMETER;
+
+ if (eeh_action_token & OPAL_EEH_ACTION_SET_FREEZE_MMIO) {
+ phb3_ioda_sel(p, IODA2_TBL_PESTA, pe_number, false);
+ data = in_be64(p->regs + PHB_IODA_DATA0);
+ data |= IODA2_PESTA_MMIO_FROZEN;
+ out_be64(p->regs + PHB_IODA_DATA0, data);
+ }
+
+ if (eeh_action_token & OPAL_EEH_ACTION_SET_FREEZE_DMA) {
+ phb3_ioda_sel(p, IODA2_TBL_PESTB, pe_number, false);
+ data = in_be64(p->regs + PHB_IODA_DATA0);
+ data |= IODA2_PESTB_DMA_STOPPED;
+ out_be64(p->regs + PHB_IODA_DATA0, data);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ uint64_t fir, peev[4];
+ uint32_t cfg32;
+ int32_t i, j;
+
+ /* If the PHB is broken, we needn't go forward */
+ if (p->broken) {
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ return OPAL_SUCCESS;
+ }
+
+ if ((p->flags & PHB3_CAPP_RECOVERY)) {
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ return OPAL_SUCCESS;
+ }
+
+ /*
+ * Check if we already have pending errors. If so, gather
+ * more information about them, trying the PBCQ prior to
+ * the PHB.
+ */
+ if (phb3_err_pending(p) &&
+ !phb3_err_check_pbcq(p) &&
+ !phb3_err_check_lem(p))
+ phb3_set_err_pending(p, false);
+
+ /* Clear result */
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ *first_frozen_pe = (uint64_t)-1;
+
+ /* Check frozen PEs */
+ if (!phb3_err_pending(p)) {
+ phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true);
+ for (i = 0; i < ARRAY_SIZE(peev); i++) {
+ peev[i] = in_be64(p->regs + PHB_IODA_DATA0);
+ if (peev[i]) {
+ p->err.err_src = PHB3_ERR_SRC_PHB;
+ p->err.err_class = PHB3_ERR_CLASS_ER;
+ p->err.err_bit = -1;
+ phb3_set_err_pending(p, true);
+ break;
+ }
+ }
+ }
+
+ /* Mapping errors */
+ if (phb3_err_pending(p)) {
+ /*
+ * If the frozen PE was caused by a malformed TLP, we need
+ * to reset the PHB, so convert the ER to a PHB-fatal error
+ * in that case.
+ */
+ if (p->err.err_class == PHB3_ERR_CLASS_ER) {
+ fir = phb3_read_reg_asb(p, PHB_LEM_FIR_ACCUM);
+ if (fir & PPC_BIT(60)) {
+ phb3_pcicfg_read32(&p->phb, 0,
+ p->aercap + PCIECAP_AER_UE_STATUS, &cfg32);
+ if (cfg32 & PCIECAP_AER_UE_MALFORMED_TLP)
+ p->err.err_class = PHB3_ERR_CLASS_FENCED;
+ }
+ }
+
+ switch (p->err.err_class) {
+ case PHB3_ERR_CLASS_DEAD:
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ break;
+ case PHB3_ERR_CLASS_FENCED:
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ break;
+ case PHB3_ERR_CLASS_ER:
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+
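+ /* Scan the PEEV bitmap for the first frozen PE */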
+ phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true);
+ for (i = 0; i < ARRAY_SIZE(peev); i++)
+ peev[i] = in_be64(p->regs + PHB_IODA_DATA0);
+ for (i = ARRAY_SIZE(peev) - 1; i >= 0; i--) {
+ for (j = 0; j < 64; j++) {
+ if (peev[i] & PPC_BIT(j)) {
+ *first_frozen_pe = i * 64 + j;
+ break;
+ }
+ }
+
+ if (*first_frozen_pe != (uint64_t)(-1))
+ break;
+ }
+
+ /* No frozen PE ? */
+ if (*first_frozen_pe == (uint64_t)-1) {
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ phb3_set_err_pending(p, false);
+ }
+
+ break;
+ case PHB3_ERR_CLASS_INF:
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_INF;
+ break;
+ default:
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ phb3_set_err_pending(p, false);
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_err_inject_finalize(struct phb3 *p, uint64_t addr,
+ uint64_t mask, uint64_t ctrl,
+ bool is_write)
+{
+ if (is_write)
+ ctrl |= PHB_PAPR_ERR_INJ_CTL_WR;
+ else
+ ctrl |= PHB_PAPR_ERR_INJ_CTL_RD;
+
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_ADDR, addr);
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_MASK, mask);
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, ctrl);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_err_inject_mem32(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ uint64_t base, len, segstart, segsize;
+ uint64_t a, m;
+ uint64_t ctrl = PHB_PAPR_ERR_INJ_CTL_OUTB;
+ uint32_t index;
+
+ segsize = (M32_PCI_SIZE / PHB3_MAX_PE_NUM);
+ a = base = len = 0x0ull;
+
+ for (index = 0; index < PHB3_MAX_PE_NUM; index++) {
+ if (GETFIELD(IODA2_M32DT_PE, p->m32d_cache[index]) != pe_number)
+ continue;
+
+ /* Obviously, we can't support discontiguous segments; we
+ * have to pick the first batch of contiguous segments in
+ * that case.
+ */
+ segstart = p->mm1_base + segsize * index;
+ if (!len) {
+ base = segstart;
+ len = segsize;
+ } else if ((base + len) == segstart) {
+ len += segsize;
+ }
+
+ /* Check whether the specified address is a valid one */
+ if (addr >= segstart && addr < (segstart + segsize)) {
+ a = addr;
+ break;
+ }
+ }
+
+ /* No MM32 segments assigned to the PE */
+ if (!len)
+ return OPAL_PARAMETER;
+
+ /* Specified address is out of range */
+ if (!a) {
+ a = base;
+ len = len & ~(len - 1);
+ m = ~(len - 1);
+ } else {
+ m = mask;
+ }
+
+ a = SETFIELD(PHB_PAPR_ERR_INJ_ADDR_MMIO, 0x0ull, a);
+ m = SETFIELD(PHB_PAPR_ERR_INJ_MASK_MMIO, 0x0ull, m);
+
+ return phb3_err_inject_finalize(p, a, m, ctrl, is_write);
+}
+
+static int64_t phb3_err_inject_mem64(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ uint64_t base, len, segstart, segsize;
+ uint64_t cache, a, m;
+ uint64_t ctrl = PHB_PAPR_ERR_INJ_CTL_OUTB;
+ uint32_t index, s_index, e_index;
+
+ /* By default, the PE is a PCI device dependent one */
+ s_index = 0;
+ e_index = ARRAY_SIZE(p->m64b_cache) - 2;
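+ /* Device-dependent PEs may use any but the last M64 BAR; bus-dependent PEs use only the last one */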
+ for (index = 0; index < RTT_TABLE_ENTRIES; index++) {
+ if (p->rte_cache[index] != pe_number)
+ continue;
+
+ if (index + 8 >= RTT_TABLE_ENTRIES)
+ break;
+
+ /* PCI bus dependent PE */
+ if (p->rte_cache[index + 8] == pe_number) {
+ s_index = e_index = ARRAY_SIZE(p->m64b_cache) - 1;
+ break;
+ }
+ }
+
+ a = base = len = 0x0ull;
+ for (index = s_index; !len && index <= e_index; index++) {
+ cache = p->m64b_cache[index];
+ if (!(cache & IODA2_M64BT_ENABLE))
+ continue;
+
+ if (cache & IODA2_M64BT_SINGLE_PE) {
+ if (GETFIELD(IODA2_M64BT_PE_HI, cache) != (pe_number >> 5) ||
+ GETFIELD(IODA2_M64BT_PE_LOW, cache) != (pe_number & 0x1f))
+ continue;
+
+ segstart = GETFIELD(IODA2_M64BT_SINGLE_BASE, cache);
+ segstart <<= 25; /* 32MB aligned */
+ segsize = GETFIELD(IODA2_M64BT_SINGLE_MASK, cache);
+ segsize = (0x2000000ull - segsize) << 25;
+ } else {
+ segstart = GETFIELD(IODA2_M64BT_BASE, cache);
+ segstart <<= 20; /* 1MB aligned */
+ segsize = GETFIELD(IODA2_M64BT_MASK, cache);
+ segsize = (0x40000000ull - segsize) << 20;
+
+ segsize /= PHB3_MAX_PE_NUM;
+ segstart = segstart + segsize * pe_number;
+ }
+
+ /* The first window always wins, based on the ascending
+ * search priority the 16 BARs have. We use this feature
+ * to assign resources to SRIOV VFs.
+ */
+ if (!len) {
+ base = segstart;
+ len = segsize;
+ }
+
+ /* The specified address is a valid one */
+ if (addr >= segstart && addr < (segstart + segsize)) {
+ a = addr;
+ }
+ }
+
+ /* No MM64 segments assigned to the PE */
+ if (!len)
+ return OPAL_PARAMETER;
+
+ /* Address specified or calculated */
+ if (!a) {
+ a = base;
+ len = len & ~(len - 1);
+ m = ~(len - 1);
+ } else {
+ m = mask;
+ }
+
+ a = SETFIELD(PHB_PAPR_ERR_INJ_ADDR_MMIO, 0x0ull, a);
+ m = SETFIELD(PHB_PAPR_ERR_INJ_MASK_MMIO, 0x0ull, m);
+
+ return phb3_err_inject_finalize(p, a, m, ctrl, is_write);
+}
+
+static int64_t phb3_err_inject_cfg(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ uint64_t a, m, prefer;
+ uint64_t ctrl = PHB_PAPR_ERR_INJ_CTL_CFG;
+ int bdfn;
+ bool is_bus_pe;
+
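+ /* 0xffff acts as a sentinel meaning "no config address found yet" */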
+ a = 0xffffull;
+ prefer = 0xffffull;
+ m = PHB_PAPR_ERR_INJ_MASK_CFG_ALL;
+ for (bdfn = 0; bdfn < RTT_TABLE_ENTRIES; bdfn++) {
+ if (p->rte_cache[bdfn] != pe_number)
+ continue;
+
+ /* The PE can be associated with a PCI bus or a device */
+ is_bus_pe = false;
+ if ((bdfn + 8) < RTT_TABLE_ENTRIES &&
+ p->rte_cache[bdfn + 8] == pe_number)
+ is_bus_pe = true;
+
+ /* Figure out the PCI config address */
+ if (prefer == 0xffffull) {
+ if (is_bus_pe) {
+ m = PHB_PAPR_ERR_INJ_MASK_CFG;
+ prefer = SETFIELD(m, 0x0ull, PCI_BUS_NUM(bdfn));
+ } else {
+ m = PHB_PAPR_ERR_INJ_MASK_CFG_ALL;
+ prefer = SETFIELD(m, 0x0ull, bdfn);
+ }
+ }
+
+ /* Check whether the input address is valid */
+ if (!is_bus_pe &&
+ GETFIELD(PHB_PAPR_ERR_INJ_MASK_CFG_ALL, addr) == bdfn) {
+ a = addr;
+ break;
+ }
+
+ if (is_bus_pe &&
+ GETFIELD(PHB_PAPR_ERR_INJ_MASK_CFG, addr) == PCI_BUS_NUM(bdfn)) {
+ a = addr;
+ break;
+ }
+ }
+
+ /* Invalid PE number */
+ if (prefer == 0xffffull)
+ return OPAL_PARAMETER;
+
+ /* Specified address is out of range */
+ if (a == 0xffffull)
+ a = prefer;
+ else
+ m = mask;
+
+ return phb3_err_inject_finalize(p, a, m, ctrl, is_write);
+}
+
+static int64_t phb3_err_inject_dma(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write, bool is_64bits)
+{
+ uint32_t index, page_size;
+ uint64_t tve, table_entries;
+ uint64_t base, start, end, len, a, m;
+ uint64_t ctrl = PHB_PAPR_ERR_INJ_CTL_INB;
+
+ /* TVE index and base address */
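+ /* Each PE owns a pair of TVEs: the even entry maps its 32-bit DMA window, the odd one its 64-bit window */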
+ if (!is_64bits) {
+ index = (pe_number << 1);
+ base = 0x0ull;
+ } else {
+ index = ((pe_number << 1) + 1);
+ base = (0x1ull << 59);
+ }
+
+ /* Raw data of table entries and page size */
+ tve = p->tve_cache[index];
+ table_entries = GETFIELD(IODA2_TVT_TCE_TABLE_SIZE, tve);
+ table_entries = (0x1ull << (table_entries + 8));
+ page_size = GETFIELD(IODA2_TVT_IO_PSIZE, tve);
+ if (!page_size && !(tve & PPC_BIT(51)))
+ return OPAL_UNSUPPORTED;
+
+ /* Check the page size */
+ switch (page_size) {
+ case 0: /* bypass */
+ start = ((tve & (0x3ull << 10)) << 14) |
+ ((tve & (0xffffffull << 40)) >> 40);
+ end = ((tve & (0x3ull << 8)) << 16) |
+ ((tve & (0xffffffull << 16)) >> 16);
+
+ /* 16MB aligned size */
+ len = (end - start) << 24;
+ break;
+ case 5: /* 64KB */
+ len = table_entries * 0x10000ull;
+ break;
+ case 13: /* 16MB */
+ len = table_entries * 0x1000000ull;
+ break;
+ case 17: /* 256MB */
+ len = table_entries * 0x10000000ull;
+ break;
+ case 1: /* 4KB */
+ default:
+ len = table_entries * 0x1000ull;
+ }
+
+ /* The specified address is in range */
+ if (addr && addr >= base && addr < (base + len)) {
+ a = addr;
+ m = mask;
+ } else {
+ a = base;
+ len = len & ~(len - 1);
+ m = ~(len - 1);
+ }
+
+ return phb3_err_inject_finalize(p, a, m, ctrl, is_write);
+}
+
+static int64_t phb3_err_inject_dma32(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ return phb3_err_inject_dma(p, pe_number, addr, mask, is_write, false);
+}
+
+static int64_t phb3_err_inject_dma64(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ return phb3_err_inject_dma(p, pe_number, addr, mask, is_write, true);
+}
+
+static int64_t phb3_err_inject(struct phb *phb, uint64_t pe_number,
+ uint32_t type, uint32_t func,
+ uint64_t addr, uint64_t mask)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ int64_t (*handler)(struct phb3 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask, bool is_write);
+ bool is_write;
+
+ /* How could we get here without a valid RTT? */
+ if (!p->tbl_rtt)
+ return OPAL_HARDWARE;
+
+ /* We can't inject error to the reserved PE */
+ if (pe_number == PHB3_RESERVED_PE_NUM || pe_number >= PHB3_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ /* Clear leftover from last time */
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, 0x0ul);
+
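+ /* Pick the handler based on the target space (MMIO, config or DMA) and 32/64-bit flavour */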
+ switch (func) {
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_DATA:
+ is_write = false;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb3_err_inject_mem64;
+ else
+ handler = phb3_err_inject_mem32;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_MEM_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_MEM_DATA:
+ is_write = true;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb3_err_inject_mem64;
+ else
+ handler = phb3_err_inject_mem32;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_CFG_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_CFG_DATA:
+ is_write = false;
+ handler = phb3_err_inject_cfg;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_CFG_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_CFG_DATA:
+ is_write = true;
+ handler = phb3_err_inject_cfg;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_DATA:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_MASTER:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_TARGET:
+ is_write = false;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb3_err_inject_dma64;
+ else
+ handler = phb3_err_inject_dma32;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_DATA:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_MASTER:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_TARGET:
+ is_write = true;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb3_err_inject_dma64;
+ else
+ handler = phb3_err_inject_dma32;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return handler(p, pe_number, addr, mask, is_write);
+}
+
+static int64_t phb3_get_diag_data(struct phb *phb,
+ void *diag_buffer,
+ uint64_t diag_buffer_len)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ struct OpalIoPhb3ErrorData *data = diag_buffer;
+ bool fenced;
+
+ if (diag_buffer_len < sizeof(struct OpalIoPhb3ErrorData))
+ return OPAL_PARAMETER;
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /*
+ * Dummy check for fence so that phb3_read_phb_status knows
+ * whether to use ASB or AIB
+ */
+ fenced = phb3_fenced(p);
+ phb3_read_phb_status(p, data);
+
+ if (!fenced)
+ phb3_eeh_dump_regs(p, data);
+
+ /*
+ * We probably got here because of errors (INF class). In
+ * that case, we need to clear the error explicitly.
+ */
+ if (phb3_err_pending(p) &&
+ p->err.err_class == PHB3_ERR_CLASS_INF &&
+ p->err.err_src == PHB3_ERR_SRC_PHB) {
+ phb3_err_ER_clear(p);
+ phb3_set_err_pending(p, false);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_get_capp_info(int chip_id, struct phb *phb,
+ struct capp_info *info)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ struct proc_chip *chip = get_chip(p->chip_id);
+ uint32_t offset;
+
+ if (chip_id != p->chip_id)
+ return OPAL_PARAMETER;
+
+ if (!((1 << p->index) & chip->capp_phb3_attached_mask))
+ return OPAL_PARAMETER;
+
+ offset = PHB3_CAPP_REG_OFFSET(p);
+
+ if (PHB3_IS_NAPLES(p)) {
+ if (p->index == 0)
+ info->capp_index = 0;
+ else
+ info->capp_index = 1;
+ } else
+ info->capp_index = 0;
+ info->phb_index = p->index;
+ info->capp_fir_reg = CAPP_FIR + offset;
+ info->capp_fir_mask_reg = CAPP_FIR_MASK + offset;
+ info->capp_fir_action0_reg = CAPP_FIR_ACTION0 + offset;
+ info->capp_fir_action1_reg = CAPP_FIR_ACTION1 + offset;
+ info->capp_err_status_ctrl_reg = CAPP_ERR_STATUS_CTRL + offset;
+
+ return OPAL_SUCCESS;
+}
+
+static void phb3_init_capp_regs(struct phb3 *p, bool dma_mode)
+{
+ uint64_t reg;
+ uint32_t offset;
+ uint64_t read_buffers = 0;
+
+ offset = PHB3_CAPP_REG_OFFSET(p);
+ xscom_read(p->chip_id, APC_MASTER_PB_CTRL + offset, &reg);
+ reg &= ~PPC_BITMASK(10, 11);
+ reg |= PPC_BIT(3);
+ if (dma_mode) {
+ /* In DMA mode, the CAPP only owns some of the PHB read buffers */
+ read_buffers = 0x1;
+
+ /*
+ * HW301991 - XSL sends PTE updates with nodal scope instead of
+ * group scope. The workaround is to force all commands to
+ * unlimited scope by setting bit 4. This may have a slight
+ * performance impact, but it would be negligible on the XSL.
+ * To avoid the possibility it might impact other cards, key it
+ * off DMA mode since the XSL based Mellanox CX4 is the only
+ * card to use this mode in P8 timeframe:
+ */
+ reg |= PPC_BIT(4);
+ }
+ reg |= read_buffers << PPC_BITLSHIFT(11);
+ xscom_write(p->chip_id, APC_MASTER_PB_CTRL + offset, reg);
+
+ /* Dynamically work out which PHB to connect to port 0 of the CAPP.
+ * Here is the table from the CAPP workbook:
+ * APC_MASTER CAPP CAPP
+ * bits 1:3 port0 port1
+ * 000 disabled disabled
+ * * 001 PHB2 disabled
+ * * 010 PHB1 disabled
+ * 011 PHB1 PHB2
+ * * 100 PHB0 disabled
+ * 101 PHB0 PHB2
+ * 110 PHB0 PHB1
+ *
+ * We don't use port1 so only those starred above are used.
+ * Hence reduce table to:
+ * PHB0 -> APC MASTER(bits 1:3) = 0b100
+ * PHB1 -> APC MASTER(bits 1:3) = 0b010
+ * PHB2 -> APC MASTER(bits 1:3) = 0b001
+ *
+ * Note: Naples has two CAPP units, statically mapped:
+ * CAPP0/PHB0 -> APC MASTER(bits 1:3) = 0b100
+ * CAPP1/PHB1 -> APC MASTER(bits 1:3) = 0b010
+ */
+ reg = 0x4000000000000000ULL >> p->index;
+ reg |= 0x0070000000000000UL;
+ xscom_write(p->chip_id, APC_MASTER_CAPI_CTRL + offset, reg);
+ PHBINF(p, "CAPP: port attached\n");
+
+ /* tlb and mmio */
+ xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, 0x4028000104000000UL);
+
+ xscom_write(p->chip_id, CANNED_PRESP_MAP0 + offset, 0);
+ xscom_write(p->chip_id, CANNED_PRESP_MAP1 + offset, 0xFFFFFFFF00000000UL);
+ xscom_write(p->chip_id, CANNED_PRESP_MAP2 + offset, 0);
+
+ /* error recovery */
+ xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, 0);
+
+ xscom_write(p->chip_id, FLUSH_SUE_STATE_MAP + offset,
+ 0x1DC20B6600000000UL);
+ xscom_write(p->chip_id, CAPP_EPOCH_TIMER_CTRL + offset,
+ 0xC0000000FFF0FFE0UL);
+ xscom_write(p->chip_id, FLUSH_UOP_CONFIG1 + offset,
+ 0xB188280728000000UL);
+ xscom_write(p->chip_id, FLUSH_UOP_CONFIG2 + offset, 0xB188400F00000000UL);
+
+ reg = 0xA1F0000000000000UL;
+ reg |= read_buffers << PPC_BITLSHIFT(39);
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, reg);
+}
+
+/* override some inits with CAPI defaults */
+static void phb3_init_capp_errors(struct phb3 *p)
+{
+ out_be64(p->regs + PHB_ERR_AIB_FENCE_ENABLE, 0xffffffdd8c80ffc0UL);
+ out_be64(p->regs + PHB_OUT_ERR_AIB_FENCE_ENABLE, 0x9cf3fe08f8dc700fUL);
+ out_be64(p->regs + PHB_INA_ERR_AIB_FENCE_ENABLE, 0xffff57fbff01ffdeUL);
+ out_be64(p->regs + PHB_INB_ERR_AIB_FENCE_ENABLE, 0xfcffe0fbff7ff0ecUL);
+ out_be64(p->regs + PHB_LEM_ERROR_MASK, 0x40018e2400022482UL);
+}
+
+/*
+ * Enable CAPI mode on a PHB
+ *
+ * Changes to this init sequence may require updating disable_capi_mode().
+ */
+static int64_t enable_capi_mode(struct phb3 *p, uint64_t pe_number, bool dma_mode)
+{
+ uint64_t reg;
+ int i;
+
+ xscom_read(p->chip_id, PE_CAPP_EN + PE_REG_OFFSET(p), &reg);
+ if (reg & PPC_BIT(0)) {
+ PHBDBG(p, "Already in CAPP mode\n");
+ }
+
+ /* poll cqstat */
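+ /* Wait up to ~5s (500000 x 10us) for outstanding transactions to drain */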
+ for (i = 0; i < 500000; i++) {
+ xscom_read(p->chip_id, p->pe_xscom + 0xf, &reg);
+ if (!(reg & 0xC000000000000000UL))
+ break;
+ time_wait_us(10);
+ }
+ if (reg & 0xC000000000000000UL) {
+ PHBERR(p, "CAPP: Timeout waiting for pending transaction\n");
+ return OPAL_HARDWARE;
+ }
+
+ /* pb aib capp enable */
+ reg = PPC_BIT(0); /* capp enable */
+ if (dma_mode)
+ reg |= PPC_BIT(1); /* capp dma mode */
+ xscom_write(p->chip_id, p->spci_xscom + 0x3, reg);
+
+ /* FIXME security timer bar
+ xscom_write(p->chip_id, p->spci_xscom + 0x4, 0x8000000000000000ull);
+ */
+
+ /* aib mode */
+ xscom_read(p->chip_id, p->pci_xscom + 0xf, &reg);
+ reg &= ~PPC_BITMASK(6,7);
+ reg |= PPC_BIT(8);
+ reg |= PPC_BITMASK(40, 41);
+ reg &= ~PPC_BIT(42);
+ xscom_write(p->chip_id, p->pci_xscom + 0xf, reg);
+
+ /* pci hwconf0 */
+ xscom_read(p->chip_id, p->pe_xscom + 0x18, &reg);
+ reg |= PPC_BIT(14);
+ reg &= ~PPC_BIT(15);
+ xscom_write(p->chip_id, p->pe_xscom + 0x18, reg);
+
+ /* pci hwconf1 */
+ xscom_read(p->chip_id, p->pe_xscom + 0x19, &reg);
+ reg &= ~PPC_BITMASK(17,18);
+ xscom_write(p->chip_id, p->pe_xscom + 0x19, reg);
+
+ /* aib tx cmd cred */
+ xscom_read(p->chip_id, p->pci_xscom + 0xd, &reg);
+ if (dma_mode) {
+ /*
+ * In DMA mode, increase AIB credit value for ch 2 (DMA read)
+ * for performance reasons
+ */
+ reg &= ~PPC_BITMASK(42, 47);
+ reg |= PPC_BITMASK(43, 45);
+ } else {
+ reg &= ~PPC_BITMASK(42, 46);
+ reg |= PPC_BIT(47);
+ }
+ xscom_write(p->chip_id, p->pci_xscom + 0xd, reg);
+
+ xscom_write(p->chip_id, p->pci_xscom + 0xc, 0xff00000000000000ull);
+
+ /* pci mode ctl */
+ xscom_read(p->chip_id, p->pe_xscom + 0xb, &reg);
+ reg |= PPC_BIT(25);
+ xscom_write(p->chip_id, p->pe_xscom + 0xb, reg);
+
+ /* set tve no translate mode allow mmio window */
+ memset(p->tve_cache, 0x0, sizeof(p->tve_cache));
+ if (dma_mode) {
+ /*
+ * CAPP DMA mode needs access to all of memory, set address
+ * range to 0x0000000000000000: 0x0002FFFFFFFFFFF
+ */
+ p->tve_cache[pe_number * 2] = 0x000000FFFFFF0200ULL;
+ } else {
+ /* Allow address range 0x0002000000000000: 0x0002FFFFFFFFFFF */
+ p->tve_cache[pe_number * 2] = 0x000000FFFFFF0a00ULL;
+ }
+
+ phb3_ioda_sel(p, IODA2_TBL_TVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]);
+
+ /* set m64 bar to pass mmio window */
+ memset(p->m64b_cache, 0x0, sizeof(p->m64b_cache));
+ p->m64b_cache[0] = PPC_BIT(0); /*enable*/
+ p->m64b_cache[0] |= PPC_BIT(1); /*single pe*/
+ p->m64b_cache[0] |= (p->mm0_base << 12) | ((pe_number & 0x3e0) << 27); /*base and upper pe*/
+ p->m64b_cache[0] |= 0x3fffc000 | (pe_number & 0x1f); /*mask and lower pe*/
+
+ p->m64b_cache[1] = PPC_BIT(0); /*enable*/
+ p->m64b_cache[1] |= PPC_BIT(1); /*single pe*/
+ p->m64b_cache[1] |= (0x0002000000000000ULL << 12) | ((pe_number & 0x3e0) << 27); /*base and upper pe*/
+ p->m64b_cache[1] |= 0x3f000000 | (pe_number & 0x1f); /*mask and lower pe*/
+
+ phb3_ioda_sel(p, IODA2_TBL_M64BT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->m64b_cache); i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->m64b_cache[i]);
+
+ out_be64(p->regs + PHB_PHB3_CONFIG, PHB_PHB3C_64B_TCE_EN);
+ out_be64(p->regs + PHB_PHB3_CONFIG, PHB_PHB3C_64BIT_MSI_EN);
+
+ phb3_init_capp_errors(p);
+
+ phb3_init_capp_regs(p, dma_mode);
+
+ if (!chiptod_capp_timebase_sync(p->chip_id, CAPP_TFMR, CAPP_TB,
+ PHB3_CAPP_REG_OFFSET(p))) {
+ PHBERR(p, "CAPP: Failed to sync timebase\n");
+ return OPAL_HARDWARE;
+ }
+
+ /* set callbacks to handle HMI events */
+ capi_ops.get_capp_info = &phb3_get_capp_info;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb3_set_capi_mode(struct phb *phb, uint64_t mode,
+ uint64_t pe_number)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+ struct proc_chip *chip = get_chip(p->chip_id);
+ uint64_t reg;
+ uint64_t read_buffers;
+ uint32_t offset;
+ u8 mask;
+
+ if (!capp_ucode_loaded(chip, p->index)) {
+ PHBERR(p, "CAPP: ucode not loaded\n");
+ return OPAL_RESOURCE;
+ }
+
+ lock(&capi_lock);
+ if (PHB3_IS_NAPLES(p)) {
+ /* Naples has two CAPP units, statically mapped. */
+ chip->capp_phb3_attached_mask |= 1 << p->index;
+ } else {
+ /*
+ * Check if the CAPP port is being used by any other PHB.
+ * Check and set chip->capp_phb3_attached_mask atomically in
+ * case two phb3_set_capi_mode() calls race.
+ */
+ mask = ~(1 << p->index);
+ if (chip->capp_phb3_attached_mask & mask) {
+ PHBERR(p,
+ "CAPP: port already in use by another PHB:%x\n",
+ chip->capp_phb3_attached_mask);
+ unlock(&capi_lock);
+ return false;
+ }
+ chip->capp_phb3_attached_mask = 1 << p->index;
+ }
+ unlock(&capi_lock);
+
+ offset = PHB3_CAPP_REG_OFFSET(p);
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+ if ((reg & PPC_BIT(5))) {
+ PHBERR(p, "CAPP: recovery failed (%016llx)\n", reg);
+ return OPAL_HARDWARE;
+ } else if ((reg & PPC_BIT(0)) && (!(reg & PPC_BIT(1)))) {
+ PHBDBG(p, "CAPP: recovery in progress\n");
+ return OPAL_BUSY;
+ }
+
+ switch (mode) {
+ case OPAL_PHB_CAPI_MODE_PCIE:
+ /* Switching back to PCIe mode requires a creset */
+ return OPAL_UNSUPPORTED;
+
+ case OPAL_PHB_CAPI_MODE_CAPI:
+ return enable_capi_mode(p, pe_number, false);
+
+ case OPAL_PHB_CAPI_MODE_DMA:
+ return enable_capi_mode(p, pe_number, true);
+
+ case OPAL_PHB_CAPI_MODE_SNOOP_OFF:
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset,
+ 0x0000000000000000);
+ return OPAL_SUCCESS;
+
+ case OPAL_PHB_CAPI_MODE_SNOOP_ON:
+ xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset,
+ 0x0000000000000000);
+ /*
+ * Make sure the PHB read buffers being snooped match those
+ * being used so we don't need another mode to set SNOOP+DMA
+ */
+ xscom_read(p->chip_id, APC_MASTER_PB_CTRL + offset, &reg);
+ read_buffers = (reg >> PPC_BITLSHIFT(11)) & 0x3;
+ reg = 0xA1F0000000000000UL;
+ reg |= read_buffers << PPC_BITLSHIFT(39);
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, reg);
+
+ return OPAL_SUCCESS;
+ }
+
+ return OPAL_UNSUPPORTED;
+}
+
+static int64_t phb3_set_capp_recovery(struct phb *phb)
+{
+ struct phb3 *p = phb_to_phb3(phb);
+
+ if (p->flags & PHB3_CAPP_RECOVERY)
+ return 0;
+
+ /* set opal event flag to indicate eeh condition */
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+
+ p->flags |= PHB3_CAPP_RECOVERY;
+
+ return 0;
+}
+
+static const struct phb_ops phb3_ops = {
+ .cfg_read8 = phb3_pcicfg_read8,
+ .cfg_read16 = phb3_pcicfg_read16,
+ .cfg_read32 = phb3_pcicfg_read32,
+ .cfg_write8 = phb3_pcicfg_write8,
+ .cfg_write16 = phb3_pcicfg_write16,
+ .cfg_write32 = phb3_pcicfg_write32,
+ .get_reserved_pe_number = phb3_get_reserved_pe_number,
+ .device_init = phb3_device_init,
+ .device_remove = phb3_device_remove,
+ .ioda_reset = phb3_ioda_reset,
+ .papr_errinjct_reset = phb3_papr_errinjct_reset,
+ .pci_reinit = phb3_pci_reinit,
+ .set_phb_mem_window = phb3_set_phb_mem_window,
+ .phb_mmio_enable = phb3_phb_mmio_enable,
+ .map_pe_mmio_window = phb3_map_pe_mmio_window,
+ .map_pe_dma_window = phb3_map_pe_dma_window,
+ .map_pe_dma_window_real = phb3_map_pe_dma_window_real,
+ .pci_msi_eoi = phb3_pci_msi_eoi,
+ .set_xive_pe = phb3_set_ive_pe,
+ .get_msi_32 = phb3_get_msi_32,
+ .get_msi_64 = phb3_get_msi_64,
+ .set_pe = phb3_set_pe,
+ .set_peltv = phb3_set_peltv,
+ .eeh_freeze_status = phb3_eeh_freeze_status,
+ .eeh_freeze_clear = phb3_eeh_freeze_clear,
+ .eeh_freeze_set = phb3_eeh_freeze_set,
+ .next_error = phb3_eeh_next_error,
+ .err_inject = phb3_err_inject,
+ .get_diag_data2 = phb3_get_diag_data,
+ .set_capi_mode = phb3_set_capi_mode,
+ .set_capp_recovery = phb3_set_capp_recovery,
+};
+
+/*
+ * We should access those registers at the stage since the
+ * AIB isn't ready yet.
+ */
+static void phb3_setup_aib(struct phb3 *p)
+{
+ /* Init_2 - AIB TX Channel Mapping Register */
+ phb3_write_reg_asb(p, PHB_AIB_TX_CHAN_MAPPING, 0x0211230000000000UL);
+
+ /* Init_3 - AIB RX command credit register */
+ if (p->rev >= PHB3_REV_VENICE_DD20)
+ phb3_write_reg_asb(p, PHB_AIB_RX_CMD_CRED, 0x0020000100020001UL);
+ else
+ phb3_write_reg_asb(p, PHB_AIB_RX_CMD_CRED, 0x0020000100010001UL);
+
+ /* Init_4 - AIB rx data credit register */
+ if (p->rev >= PHB3_REV_VENICE_DD20)
+ phb3_write_reg_asb(p, PHB_AIB_RX_DATA_CRED, 0x0020002000010001UL);
+ else
+ phb3_write_reg_asb(p, PHB_AIB_RX_DATA_CRED, 0x0020002000000001UL);
+
+ /* Init_5 - AIB rx credit init timer register */
+ phb3_write_reg_asb(p, PHB_AIB_RX_CRED_INIT_TIMER, 0x0f00000000000000UL);
+
+ /* Init_6 - AIB Tag Enable register */
+ phb3_write_reg_asb(p, PHB_AIB_TAG_ENABLE, 0xffffffff00000000UL);
+
+ /* Init_7 - TCE Tag Enable register */
+ phb3_write_reg_asb(p, PHB_TCE_TAG_ENABLE, 0xffffffff00000000UL);
+}
+
+static void phb3_init_ioda2(struct phb3 *p)
+{
+ /* Init_14 - LSI Source ID */
+ out_be64(p->regs + PHB_LSI_SOURCE_ID,
+ SETFIELD(PHB_LSI_SRC_ID, 0ul, 0xff));
+
+ /* Init_15 - IVT BAR / Length
+ * Init_16 - RBA BAR
+ * - RTT BAR
+ * Init_17 - PELT-V BAR
+ */
+ out_be64(p->regs + PHB_RTT_BAR,
+ p->tbl_rtt | PHB_RTT_BAR_ENABLE);
+ out_be64(p->regs + PHB_PELTV_BAR,
+ p->tbl_peltv | PHB_PELTV_BAR_ENABLE);
+ out_be64(p->regs + PHB_IVT_BAR,
+ p->tbl_ivt | 0x800 | PHB_IVT_BAR_ENABLE);
+
+ /* DD2.0 and subsequent chips don't have a memory
+ * resident RBA.
+ */
+ if (p->rev >= PHB3_REV_MURANO_DD20)
+ out_be64(p->regs + PHB_RBA_BAR, 0x0ul);
+ else
+ out_be64(p->regs + PHB_RBA_BAR,
+ p->tbl_rba | PHB_RBA_BAR_ENABLE);
+
+ /* Init_18..21 - Setup M32 */
+ out_be64(p->regs + PHB_M32_BASE_ADDR, p->mm1_base);
+ out_be64(p->regs + PHB_M32_BASE_MASK, ~(M32_PCI_SIZE - 1));
+ out_be64(p->regs + PHB_M32_START_ADDR, M32_PCI_START);
+
+ /* Init_22 - Setup PEST BAR */
+ out_be64(p->regs + PHB_PEST_BAR,
+ p->tbl_pest | PHB_PEST_BAR_ENABLE);
+
+ /* Init_23 - PCIE Outbound upper address */
+ out_be64(p->regs + PHB_M64_UPPER_BITS, 0);
+
+ /* Init_24 - Interrupt represent timers
+ * The register doesn't take effect on Murano DD1.0
+ */
+ if (p->rev >= PHB3_REV_NAPLES_DD10)
+ out_be64(p->regs + PHB_INTREP_TIMER, 0x0014000000000000UL);
+ else if (p->rev >= PHB3_REV_MURANO_DD20)
+ out_be64(p->regs + PHB_INTREP_TIMER, 0x0004000000000000UL);
+ else
+ out_be64(p->regs + PHB_INTREP_TIMER, 0);
+
+ /* Init_25 - PHB3 Configuration Register. Clear TCE cache then
+ * configure the PHB
+ */
+ out_be64(p->regs + PHB_PHB3_CONFIG, PHB_PHB3C_64B_TCE_EN);
+ out_be64(p->regs + PHB_PHB3_CONFIG,
+ PHB_PHB3C_M32_EN | PHB_PHB3C_32BIT_MSI_EN |
+ PHB_PHB3C_64BIT_MSI_EN);
+
+ /* Init_26 - At least 512ns delay according to spec */
+ time_wait_us(2);
+
+ /* Init_27..36 - On-chip IODA tables init */
+ phb3_ioda_reset(&p->phb, false);
+}
+
+static bool phb3_wait_dlp_reset(struct phb3 *p)
+{
+ unsigned int i;
+ uint64_t val;
+
+ /*
+ * Firmware cannot access the UTL core regs or PCI config space
+ * until the cores are out of DL_PGRESET.
+ * DL_PGRESET should be polled until it is inactive with a value
+ * of '0'. The recommended polling frequency is once every 1ms.
+ * Firmware should poll at least 200 attempts before giving up.
+ * MMIO Stores to the link are silently dropped by the UTL core if
+ * the link is down.
+ * MMIO Loads to the link will be dropped by the UTL core and will
+ * eventually time-out and will return an all ones response if the
+ * link is down.
+ */
+#define DLP_RESET_ATTEMPTS 40000
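+/* 40000 polls at 10us apart is ~400ms, well beyond the recommended 200 x 1ms minimum */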
+
+ PHBDBG(p, "Waiting for DLP PG reset to complete...\n");
+ for (i = 0; i < DLP_RESET_ATTEMPTS; i++) {
+ val = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!(val & PHB_PCIE_DLP_TC_DL_PGRESET))
+ break;
+ time_wait_us(10);
+ }
+ if (val & PHB_PCIE_DLP_TC_DL_PGRESET) {
+ PHBERR(p, "Timeout waiting for DLP PG reset !\n");
+ return false;
+ }
+ return true;
+}
+
+/* phb3_init_rc_cfg - Initialize the Root Complex config space
+ */
+static bool phb3_init_rc_cfg(struct phb3 *p)
+{
+ int64_t ecap, aercap;
+
+ /* XXX Handle errors ? */
+
+ /* Init_45..46:
+ *
+ * Set primary bus to 0, secondary to 1 and subordinate to 0xff
+ */
+ phb3_pcicfg_write32(&p->phb, 0, PCI_CFG_PRIMARY_BUS, 0x00ff0100);
+
+ /* Init_47..52
+ *
+ * IO and Memory base & limits are set to base > limit, which
+ * allows all inbounds.
+ *
+ * XXX This has the potential of confusing the OS which might
+ * think that nothing is forwarded downstream. We probably need
+ * to fix this to match the IO and M32 PHB windows
+ */
+ phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_IO_BASE, 0x0010);
+ phb3_pcicfg_write32(&p->phb, 0, PCI_CFG_MEM_BASE, 0x00000010);
+ phb3_pcicfg_write32(&p->phb, 0, PCI_CFG_PREF_MEM_BASE, 0x00000010);
+
+ /* Init_53..54 - Setup bridge control enable forwarding of CORR, FATAL,
+ * and NONFATAL errors
+ */
+ phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, PCI_CFG_BRCTL_SERR_EN);
+
+ /* Init_55..56
+ *
+ * PCIE Device control/status, enable error reporting, disable relaxed
+ * ordering, set MPS to 128 (see note), clear errors.
+ *
+ * Note: The doc recommends setting MPS to 4K. This has proved to have
+ * some issues as it requires specific clamping of MRSS on devices and
+ * we've found devices in the field that misbehave when doing that.
+ *
+ * We currently leave it all to 128 bytes (minimum setting) at init
+ * time. The generic PCIe probing later on might apply a different
+ * value, or the kernel will, but we play it safe at early init
+ */
+ if (p->ecap <= 0) {
+ ecap = pci_find_cap(&p->phb, 0, PCI_CFG_CAP_ID_EXP);
+ if (ecap < 0) {
+ PHBERR(p, "Can't locate PCI-E capability\n");
+ return false;
+ }
+ p->ecap = ecap;
+ } else {
+ ecap = p->ecap;
+ }
+
+ phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVSTAT,
+ PCICAP_EXP_DEVSTAT_CE |
+ PCICAP_EXP_DEVSTAT_NFE |
+ PCICAP_EXP_DEVSTAT_FE |
+ PCICAP_EXP_DEVSTAT_UE);
+
+ phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVCTL,
+ PCICAP_EXP_DEVCTL_CE_REPORT |
+ PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT |
+ PCICAP_EXP_DEVCTL_UR_REPORT |
+ SETFIELD(PCICAP_EXP_DEVCTL_MPS, 0, PCIE_MPS_128B));
+
+ /* Init_57..58
+ *
+ * Root Control Register. Enable error reporting
+ *
+ * Note: Added CRS visibility.
+ */
+ phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_RC,
+ PCICAP_EXP_RC_SYSERR_ON_CE |
+ PCICAP_EXP_RC_SYSERR_ON_NFE |
+ PCICAP_EXP_RC_SYSERR_ON_FE |
+ PCICAP_EXP_RC_CRS_VISIBLE);
+
+ /* Init_59..60
+ *
+ * Device Control 2. Enable ARI fwd, set timer to RTOS timer
+ */
+ phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DCTL2,
+ SETFIELD(PCICAP_EXP_DCTL2_CMPTOUT, 0, 0xf) |
+ PCICAP_EXP_DCTL2_ARI_FWD);
+
+ /* Init_61..76
+ *
+ * AER inits
+ */
+ if (p->aercap <= 0) {
+ aercap = pci_find_ecap(&p->phb, 0, PCIECAP_ID_AER, NULL);
+ if (aercap < 0) {
+ PHBERR(p, "Can't locate AER capability\n");
+ return false;
+ }
+ p->aercap = aercap;
+ } else {
+ aercap = p->aercap;
+ }
+
+ /* Clear all UE status */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_STATUS,
+ 0xffffffff);
+ /* Disable some error reporting as per the PHB3 spec */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_MASK,
+ PCIECAP_AER_UE_POISON_TLP |
+ PCIECAP_AER_UE_COMPL_TIMEOUT |
+ PCIECAP_AER_UE_COMPL_ABORT |
+ PCIECAP_AER_UE_ECRC);
+ /* Report some errors as fatal */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_SEVERITY,
+ PCIECAP_AER_UE_DLP |
+ PCIECAP_AER_UE_SURPRISE_DOWN |
+ PCIECAP_AER_UE_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_UNEXP_COMPL |
+ PCIECAP_AER_UE_RECV_OVFLOW |
+ PCIECAP_AER_UE_MALFORMED_TLP);
+ /* Clear all CE status */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CE_STATUS,
+ 0xffffffff);
+ /* Disable some error reporting as per the PHB3 spec */
+ /* Note: When link down, also disable rcvr errors */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CE_MASK,
+ PCIECAP_AER_CE_ADV_NONFATAL |
+ (p->has_link ? 0 : PCIECAP_AER_CE_RECVR_ERR));
+
+ /* Enable or disable ECRC generation & checking */
+ phb3_enable_ecrc(&p->phb, !p->no_ecrc_devs);
+
+ /* Enable reporting in root error control */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_RERR_CMD,
+ PCIECAP_AER_RERR_CMD_FE |
+ PCIECAP_AER_RERR_CMD_NFE |
+ PCIECAP_AER_RERR_CMD_CE);
+ /* Clear root error status */
+ phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_RERR_STA,
+ 0xffffffff);
+
+ return true;
+}
+
+static void phb3_init_utl(struct phb3 *p)
+{
+ /* Init_77..79: Clear spurious errors and assign errors to the
+ * right "interrupt" signal
+ */
+ out_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS, 0xffffffffffffffffUL);
+ out_be64(p->regs + UTL_SYS_BUS_AGENT_ERR_SEVERITY, 0x5000000000000000UL);
+ out_be64(p->regs + UTL_SYS_BUS_AGENT_IRQ_EN, 0xfcc0000000000000UL);
+
+ /* Init_80..81: Setup tag allocations
+ *
+ * Stick to HW defaults. May differ between PHB implementations.
+ */
+
+ /* Init_82: PCI Express port control
+ * SW283991: Set Outbound Non-Posted request timeout to 16ms (RTOS).
+ */
+ out_be64(p->regs + UTL_PCIE_PORT_CONTROL, 0x8588007000000000UL);
+
+ /* Init_83..85: Clean & setup port errors */
+ out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0xffdfffffffffffffUL);
+ out_be64(p->regs + UTL_PCIE_PORT_ERROR_SEV, 0x5039000000000000UL);
+
+ if (p->has_link)
+ out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad52800000000000UL);
+ else
+ out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad42800000000000UL);
+
+ /* Init_86 : Cleanup RC errors */
+ out_be64(p->regs + UTL_RC_STATUS, 0xffffffffffffffffUL);
+}
+
+static void phb3_init_errors(struct phb3 *p)
+{
+ /* Init_88: LEM Error Mask : Temporarily disable error interrupts */
+ out_be64(p->regs + PHB_LEM_ERROR_MASK, 0xffffffffffffffffUL);
+
+ /* Init_89..97: Disable all error interrupts until end of init */
+ out_be64(p->regs + PHB_ERR_STATUS, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_ERR1_STATUS, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_ERR_LEM_ENABLE, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_ERR_FREEZE_ENABLE, 0x0000000080800000UL);
+ out_be64(p->regs + PHB_ERR_AIB_FENCE_ENABLE, 0xffffffdd0c00ffc0UL);
+ out_be64(p->regs + PHB_ERR_LOG_0, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_ERR_LOG_1, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_ERR_STATUS_MASK, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_ERR1_STATUS_MASK, 0x0000000000000000UL);
+
+ /* Init_98_106: Configure MMIO error traps & clear old state
+ *
+ * Don't enable BAR multi-hit detection in bit 41.
+ */
+ out_be64(p->regs + PHB_OUT_ERR_STATUS, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_OUT_ERR1_STATUS, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_OUT_ERR_LEM_ENABLE, 0xfdffffffffbfffffUL);
+ out_be64(p->regs + PHB_OUT_ERR_FREEZE_ENABLE, 0x0000420800000000UL);
+ out_be64(p->regs + PHB_OUT_ERR_AIB_FENCE_ENABLE, 0x9cf3bc00f89c700fUL);
+ out_be64(p->regs + PHB_OUT_ERR_LOG_0, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_OUT_ERR_LOG_1, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_OUT_ERR_STATUS_MASK, 0x0000000000400000UL);
+ out_be64(p->regs + PHB_OUT_ERR1_STATUS_MASK, 0x0000000000400000UL);
+
+ /* Init_107_115: Configure DMA_A error traps & clear old state */
+ out_be64(p->regs + PHB_INA_ERR_STATUS, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_INA_ERR1_STATUS, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INA_ERR_LEM_ENABLE, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_INA_ERR_FREEZE_ENABLE, 0xc00003a901006000UL);
+ out_be64(p->regs + PHB_INA_ERR_AIB_FENCE_ENABLE, 0x3fff5452fe019fdeUL);
+ out_be64(p->regs + PHB_INA_ERR_LOG_0, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INA_ERR_LOG_1, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INA_ERR_STATUS_MASK, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INA_ERR1_STATUS_MASK, 0x0000000000000000UL);
+
+ /* Init_116_124: Configure DMA_B error traps & clear old state */
+ out_be64(p->regs + PHB_INB_ERR_STATUS, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_INB_ERR1_STATUS, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INB_ERR_LEM_ENABLE, 0xffffffffffffffffUL);
+
+ /*
+ * Workaround for errata HW257476, turn correctable messages into
+ * ER freezes on Murano and Venice DD1.0
+ */
+ if (p->rev < PHB3_REV_MURANO_DD20)
+ out_be64(p->regs + PHB_INB_ERR_FREEZE_ENABLE,
+ 0x0000600000000070UL);
+ else
+ out_be64(p->regs + PHB_INB_ERR_FREEZE_ENABLE,
+ 0x0000600000000060UL);
+
+ out_be64(p->regs + PHB_INB_ERR_AIB_FENCE_ENABLE, 0xfcff80fbff7ff08cUL);
+ out_be64(p->regs + PHB_INB_ERR_LOG_0, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INB_ERR_LOG_1, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INB_ERR_STATUS_MASK, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_INB_ERR1_STATUS_MASK, 0x0000000000000000UL);
+
+ /* Init_125..128: Cleanup & configure LEM */
+ out_be64(p->regs + PHB_LEM_FIR_ACCUM, 0x0000000000000000UL);
+ out_be64(p->regs + PHB_LEM_ACTION0, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_LEM_ACTION1, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_LEM_WOF, 0x0000000000000000UL);
+}
+
+static int64_t phb3_fixup_pec_inits(struct phb3 *p)
+{
+ int64_t rc;
+ uint64_t val;
+
+ /* These fixups handle some timer updates that HB doesn't yet do
+ * to work around problems with some adapters or external drawers
+ * (SW283991)
+ */
+
+ /* PCI Hardware Configuration 0 Register */
+ rc = xscom_read(p->chip_id, p->pe_xscom + 0x18, &val);
+ if (rc) {
+ PHBERR(p, "Can't read CS0 !\n");
+ return rc;
+ }
+ val = val & 0x0f0fffffffffffffull;
+ val = val | 0x1010000000000000ull;
+ rc = xscom_write(p->chip_id, p->pe_xscom + 0x18, val);
+ if (rc) {
+ PHBERR(p, "Can't write CS0 !\n");
+ return rc;
+ }
+ return 0;
+}
+
+static void phb3_init_hw(struct phb3 *p, bool first_init)
+{
+ uint64_t val;
+
+ PHBDBG(p, "Initializing PHB...\n");
+
+ /* Fixups for PEC inits */
+ if (phb3_fixup_pec_inits(p)) {
+ PHBERR(p, "Failed to init PEC, PHB appears broken\n");
+ goto failed;
+ }
+
+ /* Lift reset */
+ xscom_read(p->chip_id, p->spci_xscom + 1, &val);/* HW275117 */
+ xscom_write(p->chip_id, p->pci_xscom + 0xa, 0);
+
+ /* XXX FIXME, turn that into a state machine or a worker thread */
+ time_wait_ms(100);
+
+ /* Grab version and fit it in an int */
+ val = phb3_read_reg_asb(p, PHB_VERSION);
+ if (val == 0 || val == 0xffffffffffffffffUL) {
+ PHBERR(p, "Failed to read version, PHB appears broken\n");
+ goto failed;
+ }
+
+ p->rev = ((val >> 16) & 0x00ff0000) | (val & 0xffff);
+ PHBDBG(p, "Core revision 0x%x\n", p->rev);
+
+ /* Setup AIB credits etc... */
+ phb3_setup_aib(p);
+
+ /* Init_8 - PCIE System Configuration Register
+ *
+ * Use default values, clear bit 15 (SYS_EC00_SLOT) to avoid incorrect
+ * slot power limit message and adjust max speed based on system
+ * config. Don't hard wire default value as some bits are different
+ * between implementations.
+ */
+ val = in_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG);
+ PHBDBG(p, "Default system config: 0x%016llx\n", val);
+ val = SETFIELD(PHB_PCIE_SCONF_SLOT, val, 0);
+ val = SETFIELD(PHB_PCIE_SCONF_MAXLINKSPEED, val, p->max_link_speed);
+ out_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG, val);
+ PHBDBG(p, "New system config : 0x%016llx\n",
+ in_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG));
+
+ /* Init_9..12 - PCIE DLP Lane EQ control */
+ if (p->lane_eq) {
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL0,
+ be64_to_cpu(p->lane_eq[0]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL1,
+ be64_to_cpu(p->lane_eq[1]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL2,
+ be64_to_cpu(p->lane_eq[2]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL3,
+ be64_to_cpu(p->lane_eq[3]));
+ }
+
+ /* Init_XX - (PHB2 errata)
+ *
+ * Set proper credits, needs adjustment due to wrong defaults
+ * on PHB2 before we lift the reset. This only applies to Murano
+ * and Venice
+ */
+ if (p->index == 2 && p->rev < PHB3_REV_NAPLES_DD10)
+ out_be64(p->regs + PHB_PCIE_SYS_LINK_INIT, 0x9008133332120000UL);
+
+ /* Init_13 - PCIE Reset */
+ /*
+ * Lift the PHB resets but not PERST, this will be lifted
+ * later by the initial PERST state machine
+ */
+ PHBDBG(p, "PHB_RESET is 0x%016llx\n", in_be64(p->regs + PHB_RESET));
+ out_be64(p->regs + PHB_RESET, 0xd000000000000000UL);
+
+ /* Architected IODA2 inits */
+ phb3_init_ioda2(p);
+
+ /* Init_37..42 - Clear UTL & DLP error logs */
+ out_be64(p->regs + PHB_PCIE_UTL_ERRLOG1, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_PCIE_UTL_ERRLOG2, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_PCIE_UTL_ERRLOG3, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_PCIE_UTL_ERRLOG4, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_PCIE_DLP_ERRLOG1, 0xffffffffffffffffUL);
+ out_be64(p->regs + PHB_PCIE_DLP_ERRLOG2, 0xffffffffffffffffUL);
+
+ /* Init_43 - Wait for UTL core to come out of reset */
+ if (!phb3_wait_dlp_reset(p))
+ goto failed;
+
+ /* Init_44 - Clear port status */
+ out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0xffffffffffffffffUL);
+
+ /* Init_45..76: Init root complex config space */
+ if (!phb3_init_rc_cfg(p))
+ goto failed;
+
+ /* Init_77..86 : Init UTL */
+ phb3_init_utl(p);
+
+ /*
+ * Init_87: PHB Control register. Various PHB settings
+ * Enable IVC for Murano DD2.0 or later one
+ */
+#ifdef IVT_TABLE_IVE_16B
+ val = 0xf3a80e4b00000000UL;
+#else
+ val = 0xf3a80ecb00000000UL;
+#endif
+ if (p->rev >= PHB3_REV_MURANO_DD20)
+ val |= 0x0000010000000000UL;
+ if (first_init && p->rev >= PHB3_REV_NAPLES_DD10) {
+ /* Enable 32-bit bypass support on Naples and tell the OS
+ * about it
+ */
+ val |= 0x0010000000000000UL;
+ dt_add_property(p->phb.dt_node,
+ "ibm,32-bit-bypass-supported", NULL, 0);
+ }
+ out_be64(p->regs + PHB_CONTROL, val);
+
+ /* Init_88..128 : Setup error registers */
+ phb3_init_errors(p);
+
+ /* Init_129: Read error summary */
+ val = in_be64(p->regs + PHB_ETU_ERR_SUMMARY);
+ if (val) {
+ PHBERR(p, "Errors detected during PHB init: 0x%16llx\n", val);
+ goto failed;
+ }
+
+ /* NOTE: At this point the spec waits for the link to come up. We
+ * don't bother as we are doing a PERST soon.
+ */
+
+ /* XXX I don't know why the spec does this now and not earlier, so
+ * to be sure to get it right we might want to move it to the freset
+ * state machine, though the generic PCI layer will probably do
+ * this anyway (ie, enable MEM, etc... in the RC)
+ *
+ * Note: The spec enables IO, but PHB3 doesn't do IO space, so we
+ * leave that clear.
+ */
+ phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_CMD,
+ PCI_CFG_CMD_MEM_EN |
+ PCI_CFG_CMD_BUS_MASTER_EN |
+ PCI_CFG_CMD_PERR_RESP |
+ PCI_CFG_CMD_SERR_EN);
+
+ /* Clear errors */
+ phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_STAT,
+ PCI_CFG_STAT_SENT_TABORT |
+ PCI_CFG_STAT_RECV_TABORT |
+ PCI_CFG_STAT_RECV_MABORT |
+ PCI_CFG_STAT_SENT_SERR |
+ PCI_CFG_STAT_RECV_PERR);
+
+ /* Init_136 - Re-enable error interrupts */
+
+ /* TBD: Should we mask any of these for PERST ? */
+ out_be64(p->regs + PHB_ERR_IRQ_ENABLE, 0x0000002280b80000UL);
+ out_be64(p->regs + PHB_OUT_ERR_IRQ_ENABLE, 0x600c42fc042080f0UL);
+ out_be64(p->regs + PHB_INA_ERR_IRQ_ENABLE, 0xc000a3a901826020UL);
+ out_be64(p->regs + PHB_INB_ERR_IRQ_ENABLE, 0x0000600000800070UL);
+ out_be64(p->regs + PHB_LEM_ERROR_MASK, 0x42498e367f502eaeUL);
+
+ /*
+ * Init_141 - Enable DMA address speculation
+ *
+ * Errata#20131017: Disable speculation until Murano DD2.0
+ *
+ * Note: We keep IVT speculation disabled (bit 4). It should work with
+ * Murano DD2.0 and later but lacks sufficient testing. We will re-enable
+ * it once that has been done.
+ */
+ if (p->rev >= PHB3_REV_MURANO_DD20)
+ out_be64(p->regs + PHB_TCE_SPEC_CTL, 0xf000000000000000UL);
+ else
+ out_be64(p->regs + PHB_TCE_SPEC_CTL, 0x0ul);
+
+ /* Errata#20131017: avoid TCE queue overflow */
+ if (p->rev == PHB3_REV_MURANO_DD20)
+ phb3_write_reg_asb(p, PHB_TCE_WATERMARK, 0x0003000000030302UL);
+
+ /* Init_142 - PHB3 - Timeout Control Register 1
+ * SW283991: Increase timeouts
+ */
+ out_be64(p->regs + PHB_TIMEOUT_CTRL1, 0x1715152016200000UL);
+
+ /* Init_143 - PHB3 - Timeout Control Register 2 */
+ out_be64(p->regs + PHB_TIMEOUT_CTRL2, 0x2320d71600000000UL);
+
+ /* Mark the PHB as functional which enables all the various sequences */
+ p->broken = false;
+
+ PHBDBG(p, "Initialization complete\n");
+
+ return;
+
+ failed:
+ PHBERR(p, "Initialization failed\n");
+ p->broken = true;
+}
+
+static void phb3_allocate_tables(struct phb3 *p)
+{
+ uint16_t *rte;
+ uint32_t i;
+
+ /* XXX Our current memalign implementation sucks,
+ *
+ * It will do the job, however it doesn't support freeing
+ * the memory and wastes space by always allocating twice
+ * as much as requested (size + alignment)
+ */
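+ /* For illustration: each table below is requested with an alignment
+ * equal to its size, so in the worst case each allocation can consume
+ * roughly twice the table size (e.g. up to 2 * RTT_TABLE_SIZE).
+ */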
+ p->tbl_rtt = (uint64_t)local_alloc(p->chip_id, RTT_TABLE_SIZE, RTT_TABLE_SIZE);
+ assert(p->tbl_rtt);
+ rte = (uint16_t *)(p->tbl_rtt);
+ for (i = 0; i < RTT_TABLE_ENTRIES; i++, rte++)
+ *rte = PHB3_RESERVED_PE_NUM;
+
+ p->tbl_peltv = (uint64_t)local_alloc(p->chip_id, PELTV_TABLE_SIZE, PELTV_TABLE_SIZE);
+ assert(p->tbl_peltv);
+ memset((void *)p->tbl_peltv, 0, PELTV_TABLE_SIZE);
+
+ p->tbl_pest = (uint64_t)local_alloc(p->chip_id, PEST_TABLE_SIZE, PEST_TABLE_SIZE);
+ assert(p->tbl_pest);
+ memset((void *)p->tbl_pest, 0, PEST_TABLE_SIZE);
+
+ p->tbl_ivt = (uint64_t)local_alloc(p->chip_id, IVT_TABLE_SIZE, IVT_TABLE_SIZE);
+ assert(p->tbl_ivt);
+ memset((void *)p->tbl_ivt, 0, IVT_TABLE_SIZE);
+
+ p->tbl_rba = (uint64_t)local_alloc(p->chip_id, RBA_TABLE_SIZE, RBA_TABLE_SIZE);
+ assert(p->tbl_rba);
+ memset((void *)p->tbl_rba, 0, RBA_TABLE_SIZE);
+}
+
+static void phb3_add_properties(struct phb3 *p)
+{
+ struct dt_node *np = p->phb.dt_node;
+ uint32_t lsibase, icsp = get_ics_phandle();
+ uint64_t m32b, m64b, m64s, reg, tkill;
+
+ reg = cleanup_addr((uint64_t)p->regs);
+
+ /* Add various properties that HB doesn't have to
+ * add, some of them simply because they result from
+ * policy decisions made in skiboot rather than in HB
+ * such as the MMIO windows going to PCI, interrupts,
+ * etc...
+ */
+ dt_add_property_cells(np, "#address-cells", 3);
+ dt_add_property_cells(np, "#size-cells", 2);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+ dt_add_property_cells(np, "bus-range", 0, 0xff);
+ dt_add_property_cells(np, "clock-frequency", 0x200, 0); /* ??? */
+
+ dt_add_property_cells(np, "interrupt-parent", icsp);
+
+ /* XXX FIXME: add slot-name */
+ //dt_property_cell("bus-width", 8); /* Figure it out from VPD ? */
+
+ /* "ranges", we only expose M32 (PHB3 doesn't do IO)
+ *
+ * Note: The kernel expects us to have chopped off 64k from the
+ * M32 size (for the 32-bit MSIs). If we don't do that, it will
+ * get confused; OPAL does the chopping here.
+ */
+ m32b = cleanup_addr(p->mm1_base);
+ m64b = cleanup_addr(p->mm0_base);
+ m64s = p->mm0_size;
+ dt_add_property_cells(np, "ranges",
+ /* M32 space */
+ 0x02000000, 0x00000000, M32_PCI_START,
+ hi32(m32b), lo32(m32b), 0, M32_PCI_SIZE - 0x10000);
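+ /* Each "ranges" entry above is 7 cells: a 3-cell PCI (child) address,
+ * a 2-cell CPU (parent) address and a 2-cell size, matching the
+ * #address-cells = 3 / #size-cells = 2 values set earlier.
+ */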
+
+ /* XXX FIXME: add opal-memwin32, dmawins, etc... */
+ dt_add_property_u64s(np, "ibm,opal-m64-window", m64b, m64b, m64s);
+ dt_add_property(np, "ibm,opal-single-pe", NULL, 0);
+ //dt_add_property_cells(np, "ibm,opal-msi-ports", 2048);
+ dt_add_property_cells(np, "ibm,opal-num-pes", 256);
+ dt_add_property_cells(np, "ibm,opal-reserved-pe",
+ PHB3_RESERVED_PE_NUM);
+ dt_add_property_cells(np, "ibm,opal-msi-ranges",
+ p->base_msi, PHB3_MSI_IRQ_COUNT);
+ tkill = reg + PHB_TCE_KILL;
+ dt_add_property_cells(np, "ibm,opal-tce-kill",
+ hi32(tkill), lo32(tkill));
+ dt_add_property_cells(np, "ibm,supported-tce-sizes",
+ 12, // 4K
+ 16, // 64K
+ 24, // 16M
+ 28); // 256M
+
+ /*
+ * Indicate to Linux that the architected IODA2 MSI EOI method
+ * is supported
+ */
+ dt_add_property_string(np, "ibm,msi-eoi-method", "ioda2");
+
+ /* Indicate to Linux that CAPP timebase sync is supported */
+ dt_add_property_string(np, "ibm,capp-timebase-sync", NULL);
+
+ /* The interrupt maps will be generated in the RC node by the
+ * PCI code based on the content of this structure:
+ */
+ lsibase = p->base_lsi;
+ p->phb.lstate.int_size = 2;
+ p->phb.lstate.int_val[0][0] = lsibase + PHB3_LSI_PCIE_INTA;
+ p->phb.lstate.int_val[0][1] = 1;
+ p->phb.lstate.int_val[1][0] = lsibase + PHB3_LSI_PCIE_INTB;
+ p->phb.lstate.int_val[1][1] = 1;
+ p->phb.lstate.int_val[2][0] = lsibase + PHB3_LSI_PCIE_INTC;
+ p->phb.lstate.int_val[2][1] = 1;
+ p->phb.lstate.int_val[3][0] = lsibase + PHB3_LSI_PCIE_INTD;
+ p->phb.lstate.int_val[3][1] = 1;
+ p->phb.lstate.int_parent[0] = icsp;
+ p->phb.lstate.int_parent[1] = icsp;
+ p->phb.lstate.int_parent[2] = icsp;
+ p->phb.lstate.int_parent[3] = icsp;
+
+ /* Indicators for variable tables */
+ dt_add_property_cells(np, "ibm,opal-rtt-table",
+ hi32(p->tbl_rtt), lo32(p->tbl_rtt), RTT_TABLE_SIZE);
+ dt_add_property_cells(np, "ibm,opal-peltv-table",
+ hi32(p->tbl_peltv), lo32(p->tbl_peltv), PELTV_TABLE_SIZE);
+ dt_add_property_cells(np, "ibm,opal-pest-table",
+ hi32(p->tbl_pest), lo32(p->tbl_pest), PEST_TABLE_SIZE);
+ dt_add_property_cells(np, "ibm,opal-ivt-table",
+ hi32(p->tbl_ivt), lo32(p->tbl_ivt), IVT_TABLE_SIZE);
+ dt_add_property_cells(np, "ibm,opal-ive-stride",
+ IVT_TABLE_STRIDE);
+ dt_add_property_cells(np, "ibm,opal-rba-table",
+ hi32(p->tbl_rba), lo32(p->tbl_rba), RBA_TABLE_SIZE);
+
+ dt_add_property_cells(np, "ibm,phb-diag-data-size",
+ sizeof(struct OpalIoPhb3ErrorData));
+}
+
+static bool phb3_calculate_windows(struct phb3 *p)
+{
+ const struct dt_property *prop;
+
+ /* Get PBCQ MMIO windows from device-tree */
+ prop = dt_require_property(p->phb.dt_node,
+ "ibm,mmio-window", -1);
+ assert(prop->len >= (2 * sizeof(uint64_t)));
+
+ p->mm0_base = ((const uint64_t *)prop->prop)[0];
+ p->mm0_size = ((const uint64_t *)prop->prop)[1];
+ if (prop->len > 16) {
+ p->mm1_base = ((const uint64_t *)prop->prop)[2];
+ p->mm1_size = ((const uint64_t *)prop->prop)[3];
+ }
+
+ /* Sort them so that 0 is big and 1 is small */
+ if (p->mm1_size && p->mm1_size > p->mm0_size) {
+ uint64_t b = p->mm0_base;
+ uint64_t s = p->mm0_size;
+ p->mm0_base = p->mm1_base;
+ p->mm0_size = p->mm1_size;
+ p->mm1_base = b;
+ p->mm1_size = s;
+ }
+
+ /* If 1 is too small, ditch it */
+ if (p->mm1_size < M32_PCI_SIZE)
+ p->mm1_size = 0;
+
+ /* If 1 doesn't exist, carve it out of 0 */
+ if (p->mm1_size == 0) {
+ p->mm0_size /= 2;
+ p->mm1_base = p->mm0_base + p->mm0_size;
+ p->mm1_size = p->mm0_size;
+ }
+
+ /* Crop mm1 to our desired size */
+ if (p->mm1_size > M32_PCI_SIZE)
+ p->mm1_size = M32_PCI_SIZE;
+
+ return true;
+}
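+
+/* For illustration (hypothetical sizes): with a single 64G PBCQ window,
+ * the code above halves it into a 32G M64 (mm0) and a 32G mm1, then
+ * crops mm1 down to M32_PCI_SIZE for use as the M32 window.
+ */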
+
+/*
+ * Trigger a creset to disable CAPI mode on kernel shutdown.
+ *
+ * This helper is called repeatedly by the host sync notifier mechanism, which
+ * relies on the kernel to regularly poll the OPAL_SYNC_HOST_REBOOT call as it
+ * shuts down.
+ *
+ * This is a somewhat hacky abuse of the host sync notifier mechanism, but the
+ * alternatives require a new API call which won't work for older kernels.
+ */
+static bool phb3_host_sync_reset(void *data)
+{
+ struct phb3 *p = (struct phb3 *)data;
+ struct pci_slot *slot = p->phb.slot;
+ struct proc_chip *chip = get_chip(p->chip_id);
+ int64_t rc;
+
+ switch (slot->state) {
+ case PHB3_SLOT_NORMAL:
+ lock(&capi_lock);
+ rc = (chip->capp_phb3_attached_mask & (1 << p->index)) ?
+ OPAL_PHB_CAPI_MODE_CAPI :
+ OPAL_PHB_CAPI_MODE_PCIE;
+ unlock(&capi_lock);
+
+ if (rc == OPAL_PHB_CAPI_MODE_PCIE)
+ return true;
+
+ PHBINF(p, "PHB in CAPI mode, resetting\n");
+ p->flags &= ~PHB3_CAPP_RECOVERY;
+ phb3_creset(slot);
+ return false;
+ default:
+ rc = slot->ops.run_sm(slot);
+ return rc <= OPAL_SUCCESS;
+ }
+}
+
+static void phb3_create(struct dt_node *np)
+{
+ const struct dt_property *prop;
+ struct phb3 *p = zalloc(sizeof(struct phb3));
+ struct pci_slot *slot;
+ size_t lane_eq_len;
+ struct dt_node *iplp;
+ struct proc_chip *chip;
+ int opal_id;
+ char *path;
+
+ assert(p);
+
+ /* Populate base stuff */
+ p->index = dt_prop_get_u32(np, "ibm,phb-index");
+ p->chip_id = dt_prop_get_u32(np, "ibm,chip-id");
+ p->regs = (void *)dt_get_address(np, 0, NULL);
+ p->base_msi = PHB3_MSI_IRQ_BASE(p->chip_id, p->index);
+ p->base_lsi = PHB3_LSI_IRQ_BASE(p->chip_id, p->index);
+ p->phb.dt_node = np;
+ p->phb.ops = &phb3_ops;
+ p->phb.phb_type = phb_type_pcie_v3;
+ p->phb.scan_map = 0x1; /* Only device 0 to scan */
+
+ if (!phb3_calculate_windows(p))
+ return;
+
+ /* Get the various XSCOM register bases from the device-tree */
+ prop = dt_require_property(np, "ibm,xscom-bases", 3 * sizeof(uint32_t));
+ p->pe_xscom = ((const uint32_t *)prop->prop)[0];
+ p->spci_xscom = ((const uint32_t *)prop->prop)[1];
+ p->pci_xscom = ((const uint32_t *)prop->prop)[2];
+
+ /*
+ * We skip the initial PERST assertion requested by the generic code
+ * when doing a cold boot because we are coming out of cold boot already
+ * so we save boot time that way. The PERST state machine will still
+ * handle waiting for the link to come up, it will just avoid actually
+ * asserting & deasserting the PERST output
+ *
+ * For a hot IPL, we still do a PERST
+ *
+ * Note: In the absence of the property (i.e. FSP-less), we stick to the
+ * old behaviour and set skip_perst to true
+ */
+ p->skip_perst = true; /* Default */
+
+ iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params");
+ if (iplp) {
+ const char *ipl_type = dt_prop_get_def(iplp, "cec-major-type", NULL);
+ if (ipl_type && (!strcmp(ipl_type, "hot")))
+ p->skip_perst = false;
+ }
+
+ /* By default link is assumed down */
+ p->has_link = false;
+
+ /* We register the PHB before we initialize it so we
+ * get a useful OPAL ID for it. We use a different numbering here
+ * between Naples and Venice/Murano in order to leave room for the
+ * NPU on Naples.
+ */
+ chip = next_chip(NULL); /* Just need any chip */
+ if (chip && chip->type == PROC_CHIP_P8_NAPLES)
+ opal_id = p->chip_id * 8 + p->index;
+ else
+ opal_id = p->chip_id * 4 + p->index;
+ pci_register_phb(&p->phb, opal_id);
+ slot = phb3_slot_create(&p->phb);
+ if (!slot)
+ PHBERR(p, "Cannot create PHB slot\n");
+
+ /* Hello ! */
+ path = dt_get_path(np);
+ PHBINF(p, "Found %s @[%d:%d]\n", path, p->chip_id, p->index);
+ PHBINF(p, " M32 [0x%016llx..0x%016llx]\n",
+ p->mm1_base, p->mm1_base + p->mm1_size - 1);
+ PHBINF(p, " M64 [0x%016llx..0x%016llx]\n",
+ p->mm0_base, p->mm0_base + p->mm0_size - 1);
+ free(path);
+
+ /* Find base location code from root node */
+ p->phb.base_loc_code = dt_prop_get_def(dt_root,
+ "ibm,io-base-loc-code", NULL);
+ if (!p->phb.base_loc_code)
+ PHBDBG(p, "Base location code not found !\n");
+
+ /* Priority order: NVRAM -> dt -> GEN3 */
+ p->max_link_speed = 3;
+ if (dt_has_node_property(np, "ibm,max-link-speed", NULL))
+ p->max_link_speed = dt_prop_get_u32(np, "ibm,max-link-speed");
+ if (pcie_max_link_speed)
+ p->max_link_speed = pcie_max_link_speed;
+ if (p->max_link_speed > 3) /* clamp to 3 */
+ p->max_link_speed = 3;
+ PHBINF(p, "Max link speed: GEN%i\n", p->max_link_speed);
+
+ /* Check for lane equalization values from HB or HDAT */
+ p->lane_eq = dt_prop_get_def_size(np, "ibm,lane-eq", NULL, &lane_eq_len);
+ if (p->lane_eq && lane_eq_len != (8 * 4)) {
+ PHBERR(p, "Device-tree has ibm,lane-eq with wrong len %ld\n",
+ lane_eq_len);
+ p->lane_eq = NULL;
+ }
+ if (p->lane_eq) {
+ PHBDBG(p, "Override lane equalization settings:\n");
+ PHBDBG(p, " 0x%016llx 0x%016llx\n",
+ be64_to_cpu(p->lane_eq[0]), be64_to_cpu(p->lane_eq[1]));
+ PHBDBG(p, " 0x%016llx 0x%016llx\n",
+ be64_to_cpu(p->lane_eq[2]), be64_to_cpu(p->lane_eq[3]));
+ }
+
+ /*
+ * Grab CEC IO VPD load info from the root of the device-tree,
+ * on P8 there's a single such VPD for the whole machine
+ */
+ prop = dt_find_property(dt_root, "ibm,io-vpd");
+ if (!prop) {
+ /* LX VPD Lid not already loaded */
+ if (platform.vpd_iohub_load)
+ platform.vpd_iohub_load(dt_root);
+ }
+
+ /* Allocate the SkiBoot internal in-memory tables for the PHB */
+ phb3_allocate_tables(p);
+
+ phb3_add_properties(p);
+
+ /* Clear IODA2 cache */
+ phb3_init_ioda_cache(p);
+
+ /* Register interrupt sources */
+ register_irq_source(&phb3_msi_irq_ops, p, p->base_msi,
+ PHB3_MSI_IRQ_COUNT);
+ register_irq_source(&phb3_lsi_irq_ops, p, p->base_lsi, 8);
+
+ /* Get the HW up and running */
+ phb3_init_hw(p, true);
+
+ /* Load capp microcode into capp unit */
+ load_capp_ucode(p);
+
+ opal_add_host_sync_notifier(phb3_host_sync_reset, p);
+
+ /* Platform additional setup */
+ if (platform.pci_setup_phb)
+ platform.pci_setup_phb(&p->phb, p->index);
+}
+
+static void phb3_probe_pbcq(struct dt_node *pbcq)
+{
+ uint32_t spci_xscom, pci_xscom, pe_xscom, gcid, pno;
+ uint64_t val, phb_bar, bar_en;
+ uint64_t mmio0_bar, mmio0_bmask, mmio0_sz;
+ uint64_t mmio1_bar, mmio1_bmask, mmio1_sz;
+ uint64_t reg[2];
+ uint64_t mmio_win[4];
+ unsigned int mmio_win_sz;
+ struct dt_node *np;
+ char *path;
+ uint64_t capp_ucode_base;
+ unsigned int max_link_speed;
+
+ gcid = dt_get_chip_id(pbcq);
+ pno = dt_prop_get_u32(pbcq, "ibm,phb-index");
+ path = dt_get_path(pbcq);
+ prlog(PR_NOTICE, "Chip %d Found PBCQ%d at %s\n", gcid, pno, path);
+ free(path);
+
+ pe_xscom = dt_get_address(pbcq, 0, NULL);
+ pci_xscom = dt_get_address(pbcq, 1, NULL);
+ spci_xscom = dt_get_address(pbcq, 2, NULL);
+ prlog(PR_DEBUG, "PHB3[%x:%x]: X[PE]=0x%08x X[PCI]=0x%08x"
+ " X[SPCI]=0x%08x\n",
+ gcid, pno, pe_xscom, pci_xscom, spci_xscom);
+
+ /* Check if CAPP mode */
+ if (xscom_read(gcid, spci_xscom + 0x03, &val)) {
+ prerror("PHB3[%x:%x]: Cannot read AIB CAPP ENABLE\n",
+ gcid, pno);
+ return;
+ }
+ if (val >> 63) {
+ prerror("PHB3[%x:%x]: Ignoring bridge in CAPP mode\n",
+ gcid, pno);
+ return;
+ }
+
+ /* Get PE BARs, assume only 0 and 2 are used for now */
+ xscom_read(gcid, pe_xscom + 0x42, &phb_bar);
+ phb_bar >>= 14;
+ prlog(PR_DEBUG, "PHB3[%x:%x] REGS = 0x%016llx [4k]\n",
+ gcid, pno, phb_bar);
+ if (phb_bar == 0) {
+ prerror("PHB3[%x:%x]: No PHB BAR set !\n", gcid, pno);
+ return;
+ }
+
+ /* Dbl check PHB BAR */
+ xscom_read(gcid, spci_xscom + 1, &val);/* HW275117 */
+ xscom_read(gcid, pci_xscom + 0x0b, &val);
+ val >>= 14;
+ prlog(PR_DEBUG, "PHB3[%x:%x] PCIBAR = 0x%016llx\n", gcid, pno, val);
+ if (phb_bar != val) {
+ prerror("PHB3[%x:%x] PCIBAR invalid, fixing up...\n",
+ gcid, pno);
+ xscom_read(gcid, spci_xscom + 1, &val);/* HW275117 */
+ xscom_write(gcid, pci_xscom + 0x0b, phb_bar << 14);
+ }
+
+ /* Check MMIO BARs */
+ xscom_read(gcid, pe_xscom + 0x40, &mmio0_bar);
+ xscom_read(gcid, pe_xscom + 0x43, &mmio0_bmask);
+ mmio0_bmask &= 0xffffffffc0000000ull;
+ mmio0_sz = ((~mmio0_bmask) >> 14) + 1;
+ mmio0_bar >>= 14;
+ prlog(PR_DEBUG, "PHB3[%x:%x] MMIO0 = 0x%016llx [0x%016llx]\n",
+ gcid, pno, mmio0_bar, mmio0_sz);
+ xscom_read(gcid, pe_xscom + 0x41, &mmio1_bar);
+ xscom_read(gcid, pe_xscom + 0x44, &mmio1_bmask);
+ mmio1_bmask &= 0xffffffffc0000000ull;
+ mmio1_sz = ((~mmio1_bmask) >> 14) + 1;
+ mmio1_bar >>= 14;
+ prlog(PR_DEBUG, "PHB3[%x:%x] MMIO1 = 0x%016llx [0x%016llx]\n",
+ gcid, pno, mmio1_bar, mmio1_sz);
+
+ /* Check BAR enable
+ *
+ * XXX BARs aren't always enabled by HB, so we assume that a BAR
+ * is valid if its value is non-zero
+ */
+ xscom_read(gcid, pe_xscom + 0x45, &bar_en);
+ prlog(PR_DEBUG, "PHB3[%x:%x] BAREN = 0x%016llx\n",
+ gcid, pno, bar_en);
+
+ /* Always enable PHB BAR */
+ bar_en |= 0x2000000000000000ull;
+
+ /* Build MMIO windows list */
+ mmio_win_sz = 0;
+ if (mmio0_bar) {
+ mmio_win[mmio_win_sz++] = mmio0_bar;
+ mmio_win[mmio_win_sz++] = mmio0_sz;
+ bar_en |= 0x8000000000000000ul;
+ }
+ if (mmio1_bar) {
+ mmio_win[mmio_win_sz++] = mmio1_bar;
+ mmio_win[mmio_win_sz++] = mmio1_sz;
+ bar_en |= 0x4000000000000000ul;
+ }
+
+ /* No MMIO windows ? Barf ! */
+ if (mmio_win_sz == 0) {
+ prerror("PHB3[%x:%x]: No MMIO windows enabled !\n",
+ gcid, pno);
+ return;
+ }
+
+ /* Set the interrupt routing stuff, 8 relevant bits in mask
+ * (11 bits per PHB)
+ */
+ val = p8_chip_irq_phb_base(gcid, pno);
+ val = (val << 45);
+ xscom_write(gcid, pe_xscom + 0x1a, val);
+ xscom_write(gcid, pe_xscom + 0x1b, 0xff00000000000000ul);
+
+ /* Configure LSI location to the top of the map */
+ xscom_write(gcid, pe_xscom + 0x1f, 0xff00000000000000ul);
+
+ /* Now add IRSN message bits to BAR enable and write it */
+ bar_en |= 0x1800000000000000ul;
+ xscom_write(gcid, pe_xscom + 0x45, bar_en);
+
+ prlog(PR_DEBUG, "PHB3[%x:%x] NEWBAREN = 0x%016llx\n",
+ gcid, pno, bar_en);
+
+ xscom_read(gcid, pe_xscom + 0x1a, &val);
+ prlog(PR_DEBUG, "PHB3[%x:%x] IRSNC = 0x%016llx\n",
+ gcid, pno, val);
+ xscom_read(gcid, pe_xscom + 0x1b, &val);
+ prlog(PR_DEBUG, "PHB3[%x:%x] IRSNM = 0x%016llx\n",
+ gcid, pno, val);
+ prlog(PR_DEBUG, "PHB3[%x:%x] LSI = 0x%016llx\n",
+ gcid, pno, val);
+
+ /* Create PHB node */
+ reg[0] = phb_bar;
+ reg[1] = 0x1000;
+
+ np = dt_new_addr(dt_root, "pciex", reg[0]);
+ if (!np)
+ return;
+
+ dt_add_property_strings(np, "compatible", "ibm,power8-pciex",
+ "ibm,ioda2-phb");
+ dt_add_property_strings(np, "device_type", "pciex");
+ dt_add_property(np, "reg", reg, sizeof(reg));
+
+ /* Everything else is handled later by skiboot, we just
+ * stick a few hints here
+ */
+ dt_add_property_cells(np, "ibm,xscom-bases",
+ pe_xscom, spci_xscom, pci_xscom);
+ dt_add_property(np, "ibm,mmio-window", mmio_win, 8 * mmio_win_sz);
+ dt_add_property_cells(np, "ibm,phb-index", pno);
+ dt_add_property_cells(np, "ibm,pbcq", pbcq->phandle);
+ dt_add_property_cells(np, "ibm,chip-id", gcid);
+ if (dt_has_node_property(pbcq, "ibm,use-ab-detect", NULL))
+ dt_add_property(np, "ibm,use-ab-detect", NULL, 0);
+ if (dt_has_node_property(pbcq, "ibm,hub-id", NULL))
+ dt_add_property_cells(np, "ibm,hub-id",
+ dt_prop_get_u32(pbcq, "ibm,hub-id"));
+ if (dt_has_node_property(pbcq, "ibm,loc-code", NULL)) {
+ const char *lc = dt_prop_get(pbcq, "ibm,loc-code");
+ dt_add_property_string(np, "ibm,loc-code", lc);
+ }
+ if (dt_has_node_property(pbcq, "ibm,lane-eq", NULL)) {
+ size_t leq_size;
+ const void *leq = dt_prop_get_def_size(pbcq, "ibm,lane-eq",
+ NULL, &leq_size);
+ if (leq != NULL && leq_size == 4 * 8)
+ dt_add_property(np, "ibm,lane-eq", leq, leq_size);
+ }
+ if (dt_has_node_property(pbcq, "ibm,capp-ucode", NULL)) {
+ capp_ucode_base = dt_prop_get_u32(pbcq, "ibm,capp-ucode");
+ dt_add_property_cells(np, "ibm,capp-ucode", capp_ucode_base);
+ }
+ if (dt_has_node_property(pbcq, "ibm,max-link-speed", NULL)) {
+ max_link_speed = dt_prop_get_u32(pbcq, "ibm,max-link-speed");
+ dt_add_property_cells(np, "ibm,max-link-speed", max_link_speed);
+ }
+ dt_add_property_cells(np, "ibm,capi-flags",
+ OPAL_PHB_CAPI_FLAG_SNOOP_CONTROL);
+
+ add_chip_dev_associativity(np);
+}
+
+
+void probe_phb3(void)
+{
+ struct dt_node *np;
+
+ /* Look for PBCQ XSCOM nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power8-pbcq")
+ phb3_probe_pbcq(np);
+
+ /* Look for newly created PHB nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power8-pciex")
+ phb3_create(np);
+}
+
+
diff --git a/roms/skiboot/hw/phb4.c b/roms/skiboot/hw/phb4.c
new file mode 100644
index 000000000..79083d4a1
--- /dev/null
+++ b/roms/skiboot/hw/phb4.c
@@ -0,0 +1,6400 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * PHB4: PCI Host Bridge 4, in POWER9
+ *
+ * Copyright 2013-2019 IBM Corp.
+ * Copyright 2018 Raptor Engineering, LLC
+ */
+
+/*
+ *
+ * FIXME:
+ * More stuff for EEH support:
+ * - PBCQ error reporting interrupt
+ * - I2C-based power management (replacing SHPC)
+ * - Directly detect fenced PHB through one dedicated HW reg
+ */
+
+/*
+ * This is a simplified view of the PHB4 reset and link training steps
+ *
+ * Step 1:
+ * - Check for hotplug status:
+ * o PHB_PCIE_HOTPLUG_STATUS bit PHB_PCIE_HPSTAT_PRESENCE
+ * o If not set -> Bail out (Slot is empty)
+ *
+ * Step 2:
+ * - Do complete PHB reset:
+ * o PHB/ETU reset procedure
+ *
+ * Step 3:
+ * - Drive PERST active (skip if already asserted, i.e. after cold reboot)
+ * - Wait 250ms (for cards to reset)
+ * o powervm has used 250ms for a long time without any problems
+ *
+ * Step 4:
+ * - Drive PERST inactive
+ *
+ * Step 5:
+ * - Look for inband presence:
+ * o From PERST we have two stages to get inband presence detected
+ * 1) Devices must enter Detect state within 20 ms of the end of
+ * Fundamental Reset
+ * 2) Receiver detect pulses occur every 12ms
+ * - Hence the minimum wait time is 20 + 12 = 32ms
+ * o Unfortunately, we've seen cards take 440ms
+ * o Hence we are conservative and poll here for 1000ms (> 440ms)
+ * - If no inband presence after 1000ms -> Bail out (Slot is broken)
+ * o PHB_PCIE_DLP_TRAIN_CTL bit PHB_PCIE_DLP_INBAND_PRESENCE
+ *
+ * Step 6:
+ * - Look for link training done:
+ * o PHB_PCIE_DLP_TRAIN_CTL bit PHB_PCIE_DLP_TL_LINKACT
+ * - If not set after 2000ms, Retry (3 times) -> Goto Step 2
+ * o a phy lockup could cause link training failure, hence going back
+ * to a complete PHB reset on retry
+ * o not expected to happen very often
+ *
+ * Step 7:
+ * - Wait for 1 sec (before touching device config space):
+ * - From PCIe spec:
+ * Root Complex and/or system software must allow at least 1.0 s after
+ * a Conventional Reset of a device, before it may determine that a
+ * device which fails to return a Successful Completion status for a
+ * valid Configuration Request is a broken device.
+ *
+ * Step 8:
+ * - Sanity check for fence and link still up:
+ * o If fenced or link down, Retry (3 times) -> Goto Step 2
+ * o This is not necessary but takes no time and can be useful
+ * o Once we leave here, it is much harder to recover from errors
+ *
+ * Step 9:
+ * - Check for optimised link for directly attached devices:
+ * o Wait for CRS (so we can read device config space)
+ * o Check chip and device are in the allowlist. If not, Goto Step 10
+ * o If trained link speed is degraded, retry -> Goto Step 2
+ * o If trained link width is degraded, retry -> Goto Step 2
+ * o If still degraded after 3 retries. Give up, Goto Step 10.
+ *
+ * Step 10:
+ * - PHB good, start probing config space.
+ * o core/pci.c: pci_reset_phb() -> pci_scan_phb()
+ */
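+
+/*
+ * Broadly, steps 2..9 above are driven by the PCI slot reset state
+ * machine implemented later in this file, while step 10 is handled by
+ * the generic PCI core (core/pci.c).
+ */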
+
+
+#undef NO_ASB
+#undef LOG_CFG
+
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <pci-slot.h>
+#include <vpd.h>
+#include <interrupts.h>
+#include <opal.h>
+#include <cpu.h>
+#include <device.h>
+#include <ccan/str/str.h>
+#include <ccan/array_size/array_size.h>
+#include <xscom.h>
+#include <affinity.h>
+#include <phb4.h>
+#include <phb4-regs.h>
+#include <phb4-capp.h>
+#include <capp.h>
+#include <fsp.h>
+#include <chip.h>
+#include <chiptod.h>
+#include <xive.h>
+#include <xscom-p9-regs.h>
+#include <phys-map.h>
+#include <nvram.h>
+
+/* Enable this to disable error interrupts for debug purposes */
+#undef DISABLE_ERR_INTS
+
+static void phb4_init_hw(struct phb4 *p);
+
+#define PHBDBG(p, fmt, a...) prlog(PR_DEBUG, "PHB#%04x[%d:%d]: " fmt, \
+ (p)->phb.opal_id, (p)->chip_id, \
+ (p)->index, ## a)
+#define PHBINF(p, fmt, a...) prlog(PR_INFO, "PHB#%04x[%d:%d]: " fmt, \
+ (p)->phb.opal_id, (p)->chip_id, \
+ (p)->index, ## a)
+#define PHBNOTICE(p, fmt, a...) prlog(PR_NOTICE, "PHB#%04x[%d:%d]: " fmt, \
+ (p)->phb.opal_id, (p)->chip_id, \
+ (p)->index, ## a)
+#define PHBERR(p, fmt, a...) prlog(PR_ERR, "PHB#%04x[%d:%d]: " fmt, \
+ (p)->phb.opal_id, (p)->chip_id, \
+ (p)->index, ## a)
+#ifdef LOG_CFG
+#define PHBLOGCFG(p, fmt, a...) PHBDBG(p, fmt, ## a)
+#else
+#define PHBLOGCFG(p, fmt, a...) do {} while (0)
+#endif
+
+static bool pci_eeh_mmio;
+static bool pci_retry_all;
+static int rx_err_max = PHB4_RX_ERR_MAX;
+
+static inline bool is_phb4(void)
+{
+ return (proc_gen == proc_gen_p9);
+}
+
+static inline bool is_phb5(void)
+{
+ return (proc_gen == proc_gen_p10);
+}
+
+/* PQ offloading on the XIVE IC. */
+static inline bool phb_pq_disable(struct phb4 *p __unused)
+{
+ if (is_phb5())
+ return xive2_cap_phb_pq_disable();
+
+ return false;
+}
+
+/*
+ * Use the ESB page of the XIVE IC for event notification. Latency
+ * improvement.
+ */
+static inline bool phb_abt_mode(struct phb4 *p __unused)
+{
+ if (is_phb5())
+ return xive2_cap_phb_abt();
+
+ return false;
+}
+
+static inline bool phb_can_store_eoi(struct phb4 *p)
+{
+ if (is_phb5())
+ /* PQ offloading is required for StoreEOI */
+ return XIVE2_STORE_EOI_ENABLED && phb_pq_disable(p);
+
+ return XIVE_STORE_EOI_ENABLED;
+}
+
+/* Note: The "ASB" name is historical, practically this means access via
+ * the XSCOM backdoor
+ */
+static inline uint64_t phb4_read_reg_asb(struct phb4 *p, uint32_t offset)
+{
+#ifdef NO_ASB
+ return in_be64(p->regs + offset);
+#else
+ int64_t rc;
+ uint64_t addr, val;
+
+ /* Address register: must use 4 bytes for built-in config space.
+ *
+ * This path isn't usable for outbound configuration space
+ */
+ if (((offset & 0xfffffffc) == PHB_CONFIG_DATA) && (offset & 3)) {
+ PHBERR(p, "XSCOM unaligned access to CONFIG_DATA unsupported\n");
+ return -1ull;
+ }
+ addr = XETU_HV_IND_ADDR_VALID | offset;
+ if ((offset >= 0x1000 && offset < 0x1800) || (offset == PHB_CONFIG_DATA))
+ addr |= XETU_HV_IND_ADDR_4B;
+ rc = xscom_write(p->chip_id, p->etu_xscom + XETU_HV_IND_ADDRESS, addr);
+ if (rc != 0) {
+ PHBERR(p, "XSCOM error addressing register 0x%x\n", offset);
+ return -1ull;
+ }
+ rc = xscom_read(p->chip_id, p->etu_xscom + XETU_HV_IND_DATA, &val);
+ if (rc != 0) {
+ PHBERR(p, "XSCOM error reading register 0x%x\n", offset);
+ return -1ull;
+ }
+ return val;
+#endif
+}
+
+static inline void phb4_write_reg_asb(struct phb4 *p,
+ uint32_t offset, uint64_t val)
+{
+#ifdef NO_ASB
+ out_be64(p->regs + offset, val);
+#else
+ int64_t rc;
+ uint64_t addr;
+
+ /* Address register: must use 4 bytes for built-in config space.
+ *
+ * This path isn't usable for outbound configuration space
+ */
+ if (((offset & 0xfffffffc) == PHB_CONFIG_DATA) && (offset & 3)) {
+ PHBERR(p, "XSCOM access to CONFIG_DATA unsupported\n");
+ return;
+ }
+ addr = XETU_HV_IND_ADDR_VALID | offset;
+ if ((offset >= 0x1000 && offset < 0x1800) || (offset == PHB_CONFIG_DATA))
+ addr |= XETU_HV_IND_ADDR_4B;
+ rc = xscom_write(p->chip_id, p->etu_xscom + XETU_HV_IND_ADDRESS, addr);
+ if (rc != 0) {
+ PHBERR(p, "XSCOM error addressing register 0x%x\n", offset);
+ return;
+ }
+ rc = xscom_write(p->chip_id, p->etu_xscom + XETU_HV_IND_DATA, val);
+ if (rc != 0) {
+ PHBERR(p, "XSCOM error writing register 0x%x\n", offset);
+ return;
+ }
+#endif
+}
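+
+/*
+ * Both ASB helpers above use the same two-step indirect sequence: write
+ * XETU_HV_IND_ADDRESS with the VALID bit, the register offset and, for
+ * the 4-byte windows, the 4B flag, then access XETU_HV_IND_DATA for the
+ * actual data transfer.
+ */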
+
+static uint64_t phb4_read_reg(struct phb4 *p, uint32_t offset)
+{
+ /* No register accesses are permitted while in reset */
+ if (p->flags & PHB4_ETU_IN_RESET)
+ return -1ull;
+
+ if (p->flags & PHB4_CFG_USE_ASB)
+ return phb4_read_reg_asb(p, offset);
+ else
+ return in_be64(p->regs + offset);
+}
+
+static void phb4_write_reg(struct phb4 *p, uint32_t offset, uint64_t val)
+{
+ /* No register accesses are permitted while in reset */
+ if (p->flags & PHB4_ETU_IN_RESET)
+ return;
+
+ if (p->flags & PHB4_CFG_USE_ASB)
+ phb4_write_reg_asb(p, offset, val);
+ else
+ return out_be64(p->regs + offset, val);
+}
+
+/* Helper to select an IODA table entry */
+static inline void phb4_ioda_sel(struct phb4 *p, uint32_t table,
+ uint32_t addr, bool autoinc)
+{
+ phb4_write_reg(p, PHB_IODA_ADDR,
+ (autoinc ? PHB_IODA_AD_AUTOINC : 0) |
+ SETFIELD(PHB_IODA_AD_TSEL, 0ul, table) |
+ SETFIELD(PHB_IODA_AD_TADR, 0ul, addr));
+}
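+
+/*
+ * Typical usage sketch: select a table with auto-increment and then
+ * stream entries through the data register, e.g.
+ *
+ *   phb4_ioda_sel(p, IODA3_TBL_MBT, 0, true);
+ *   out_be64(p->regs + PHB_IODA_DATA0, mbt0);
+ *   out_be64(p->regs + PHB_IODA_DATA0, mbt1);
+ *
+ * as done in phb4_ioda_reset() below.
+ */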
+
+/*
+ * Configuration space access
+ *
+ * The PHB lock is assumed to be already held
+ */
+static int64_t phb4_pcicfg_check(struct phb4 *p, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ uint16_t *pe)
+{
+ uint32_t sm = size - 1;
+
+ if (offset > 0xfff || bdfn > 0xffff)
+ return OPAL_PARAMETER;
+ if (offset & sm)
+ return OPAL_PARAMETER;
+
+ /* The root bus only has a device at 0 and we get into an
+ * error state if we try to probe beyond that, so let's
+ * avoid that and just return an error to Linux
+ */
+ if (PCI_BUS_NUM(bdfn) == 0 && (bdfn & 0xff))
+ return OPAL_HARDWARE;
+
+ /* Check PHB state */
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /* Fetch the PE# from cache */
+ *pe = be16_to_cpu(p->tbl_rtt[bdfn]);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_rc_read(struct phb4 *p, uint32_t offset, uint8_t sz,
+ void *data, bool use_asb)
+{
+ uint32_t reg = offset & ~3;
+ uint32_t oval;
+
+ /* Some registers are handled locally */
+ switch (reg) {
+ /* Bridge base/limit registers are cached here as HW
+ * doesn't implement them (it hard codes values that
+ * will confuse a proper PCI implementation).
+ */
+ case PCI_CFG_MEM_BASE: /* Includes PCI_CFG_MEM_LIMIT */
+ oval = p->rc_cache[(reg - 0x20) >> 2] & 0xfff0fff0;
+ break;
+ case PCI_CFG_PREF_MEM_BASE: /* Includes PCI_CFG_PREF_MEM_LIMIT */
+ oval = p->rc_cache[(reg - 0x20) >> 2] & 0xfff0fff0;
+ oval |= 0x00010001;
+ break;
+ case PCI_CFG_IO_BASE_U16: /* Includes PCI_CFG_IO_LIMIT_U16 */
+ oval = 0;
+ break;
+ case PCI_CFG_PREF_MEM_BASE_U32:
+ case PCI_CFG_PREF_MEM_LIMIT_U32:
+ oval = p->rc_cache[(reg - 0x20) >> 2];
+ break;
+ default:
+ oval = 0xffffffff; /* default if offset too big */
+ if (reg < PHB_RC_CONFIG_SIZE) {
+ if (use_asb)
+ oval = bswap_32(phb4_read_reg_asb(p, PHB_RC_CONFIG_BASE
+ + reg));
+ else
+ oval = in_le32(p->regs + PHB_RC_CONFIG_BASE + reg);
+ }
+ }
+
+ /* Apply any post-read fixups */
+ switch (reg) {
+ case PCI_CFG_IO_BASE:
+ oval |= 0x01f1; /* Set IO base < limit to disable the window */
+ break;
+ }
+
+ switch (sz) {
+ case 1:
+ offset &= 3;
+ *((uint8_t *)data) = (oval >> (offset << 3)) & 0xff;
+ PHBLOGCFG(p, "000 CFG08 Rd %02x=%02x\n",
+ offset, *((uint8_t *)data));
+ break;
+ case 2:
+ offset &= 2;
+ *((uint16_t *)data) = (oval >> (offset << 3)) & 0xffff;
+ PHBLOGCFG(p, "000 CFG16 Rd %02x=%04x\n",
+ offset, *((uint16_t *)data));
+ break;
+ case 4:
+ *((uint32_t *)data) = oval;
+ PHBLOGCFG(p, "000 CFG32 Rd %02x=%08x\n",
+ offset, *((uint32_t *)data));
+ break;
+ default:
+ assert(false);
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_rc_write(struct phb4 *p, uint32_t offset, uint8_t sz,
+ uint32_t val, bool use_asb)
+{
+ uint32_t reg = offset & ~3;
+ uint32_t old, mask, shift, oldold;
+ int64_t rc;
+
+ if (reg > PHB_RC_CONFIG_SIZE)
+ return OPAL_SUCCESS;
+
+ /* If size isn't 4-bytes, do a RMW cycle */
+ if (sz < 4) {
+ rc = phb4_rc_read(p, reg, 4, &old, use_asb);
+ if (rc != OPAL_SUCCESS)
+ return rc;
+
+ /*
+ * Since we have to Read-Modify-Write here, we need to filter
+ * out registers that have write-1-to-clear bits to prevent
+ * clearing stuff we shouldn't be. So for any register this
+ * applies to, mask out those bits.
+ */
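+ /* For instance, a 1-byte write to the I/O base register at 0x1C
+ * becomes a RMW of the whole dword; without the masking below, the
+ * W1C error bits of the secondary status (dword bits 24..31) would
+ * be written back as 1 and cleared behind the caller's back.
+ */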
+ oldold = old;
+ switch(reg) {
+ case 0x1C: /* Secondary status */
+ old &= 0x00ffffff; /* mask out 24-31 */
+ break;
+ case 0x50: /* EC - Device status */
+ old &= 0xfff0ffff; /* mask out 16-19 */
+ break;
+ case 0x58: /* EC - Link status */
+ old &= 0x3fffffff; /* mask out 30-31 */
+ break;
+ case 0x78: /* EC - Link status 2 */
+ old &= 0xf000ffff; /* mask out 16-27 */
+ break;
+ /* These registers *only* have write-1-to-clear bits */
+ case 0x104: /* AER - Uncorr. error status */
+ case 0x110: /* AER - Corr. error status */
+ case 0x130: /* AER - Root error status */
+ case 0x180: /* P16 - status */
+ case 0x184: /* P16 - LDPM status */
+ case 0x188: /* P16 - FRDPM status */
+ case 0x18C: /* P16 - SRDPM status */
+ old &= 0x00000000;
+ break;
+ }
+
+ if (old != oldold) {
+ PHBLOGCFG(p, "Rewrote %x to %x for reg %x for W1C\n",
+ oldold, old, reg);
+ }
+
+ if (sz == 1) {
+ shift = (offset & 3) << 3;
+ mask = 0xff << shift;
+ val = (old & ~mask) | ((val & 0xff) << shift);
+ } else {
+ shift = (offset & 2) << 3;
+ mask = 0xffff << shift;
+ val = (old & ~mask) | ((val & 0xffff) << shift);
+ }
+ }
+
+ /* Some registers are handled locally */
+ switch (reg) {
+ /* See comment in phb4_rc_read() */
+ case PCI_CFG_MEM_BASE: /* Includes PCI_CFG_MEM_LIMIT */
+ case PCI_CFG_PREF_MEM_BASE: /* Includes PCI_CFG_PREF_MEM_LIMIT */
+ case PCI_CFG_PREF_MEM_BASE_U32:
+ case PCI_CFG_PREF_MEM_LIMIT_U32:
+ p->rc_cache[(reg - 0x20) >> 2] = val;
+ break;
+ case PCI_CFG_IO_BASE_U16: /* Includes PCI_CFG_IO_LIMIT_U16 */
+ break;
+ default:
+ /* Workaround PHB config space enable */
+ PHBLOGCFG(p, "000 CFG%02d Wr %02x=%08x\n", 8 * sz, reg, val);
+ if (use_asb)
+ phb4_write_reg_asb(p, PHB_RC_CONFIG_BASE + reg, val);
+ else
+ out_le32(p->regs + PHB_RC_CONFIG_BASE + reg, val);
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_pcicfg_read(struct phb4 *p, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ void *data)
+{
+ uint64_t addr, val64;
+ int64_t rc;
+ uint16_t pe;
+ bool use_asb = false;
+
+ rc = phb4_pcicfg_check(p, bdfn, offset, size, &pe);
+ if (rc)
+ return rc;
+
+ if (p->flags & PHB4_AIB_FENCED) {
+ if (!(p->flags & PHB4_CFG_USE_ASB))
+ return OPAL_HARDWARE;
+ if (bdfn != 0)
+ return OPAL_HARDWARE;
+ use_asb = true;
+ } else if ((p->flags & PHB4_CFG_BLOCKED) && bdfn != 0) {
+ return OPAL_HARDWARE;
+ }
+
+ /* Handle per-device filters */
+ rc = pci_handle_cfg_filters(&p->phb, bdfn, offset, size,
+ (uint32_t *)data, false);
+ if (rc != OPAL_PARTIAL)
+ return rc;
+
+ /* Handle root complex MMIO based config space */
+ if (bdfn == 0)
+ return phb4_rc_read(p, offset, size, data, use_asb);
+
+ addr = PHB_CA_ENABLE;
+ addr = SETFIELD(PHB_CA_BDFN, addr, bdfn);
+ addr = SETFIELD(PHB_CA_REG, addr, offset & ~3u);
+ addr = SETFIELD(PHB_CA_PE, addr, pe);
+ if (use_asb) {
+ phb4_write_reg_asb(p, PHB_CONFIG_ADDRESS, addr);
+ sync();
+ val64 = bswap_64(phb4_read_reg_asb(p, PHB_CONFIG_DATA));
+ switch(size) {
+ case 1:
+ *((uint8_t *)data) = val64 >> (8 * (offset & 3));
+ break;
+ case 2:
+ *((uint16_t *)data) = val64 >> (8 * (offset & 2));
+ break;
+ case 4:
+ *((uint32_t *)data) = val64;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+ } else {
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, addr);
+ switch(size) {
+ case 1:
+ *((uint8_t *)data) =
+ in_8(p->regs + PHB_CONFIG_DATA + (offset & 3));
+ PHBLOGCFG(p, "%03x CFG08 Rd %02x=%02x\n",
+ bdfn, offset, *((uint8_t *)data));
+ break;
+ case 2:
+ *((uint16_t *)data) =
+ in_le16(p->regs + PHB_CONFIG_DATA + (offset & 2));
+ PHBLOGCFG(p, "%03x CFG16 Rd %02x=%04x\n",
+ bdfn, offset, *((uint16_t *)data));
+ break;
+ case 4:
+ *((uint32_t *)data) = in_le32(p->regs + PHB_CONFIG_DATA);
+ PHBLOGCFG(p, "%03x CFG32 Rd %02x=%08x\n",
+ bdfn, offset, *((uint32_t *)data));
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+ }
+ return OPAL_SUCCESS;
+}
+
+
+#define PHB4_PCI_CFG_READ(size, type) \
+static int64_t phb4_pcicfg_read##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type *data) \
+{ \
+ struct phb4 *p = phb_to_phb4(phb); \
+ \
+ /* Initialize data in case of error */ \
+ *data = (type)0xffffffff; \
+ return phb4_pcicfg_read(p, bdfn, offset, sizeof(type), data); \
+}
+
+static int64_t phb4_pcicfg_write(struct phb4 *p, uint32_t bdfn,
+ uint32_t offset, uint32_t size,
+ uint32_t data)
+{
+ uint64_t addr;
+ int64_t rc;
+ uint16_t pe;
+ bool use_asb = false;
+
+ rc = phb4_pcicfg_check(p, bdfn, offset, size, &pe);
+ if (rc)
+ return rc;
+
+ if (p->flags & PHB4_AIB_FENCED) {
+ if (!(p->flags & PHB4_CFG_USE_ASB))
+ return OPAL_HARDWARE;
+ if (bdfn != 0)
+ return OPAL_HARDWARE;
+ use_asb = true;
+ } else if ((p->flags & PHB4_CFG_BLOCKED) && bdfn != 0) {
+ return OPAL_HARDWARE;
+ }
+
+ /* Handle per-device filters */
+ rc = pci_handle_cfg_filters(&p->phb, bdfn, offset, size,
+ (uint32_t *)&data, true);
+ if (rc != OPAL_PARTIAL)
+ return rc;
+
+ /* Handle root complex MMIO based config space */
+ if (bdfn == 0)
+ return phb4_rc_write(p, offset, size, data, use_asb);
+
+ addr = PHB_CA_ENABLE;
+ addr = SETFIELD(PHB_CA_BDFN, addr, bdfn);
+ addr = SETFIELD(PHB_CA_REG, addr, offset & ~3u);
+ addr = SETFIELD(PHB_CA_PE, addr, pe);
+ if (use_asb) {
+ /* We don't support ASB config space writes */
+ return OPAL_UNSUPPORTED;
+ } else {
+ out_be64(p->regs + PHB_CONFIG_ADDRESS, addr);
+ switch(size) {
+ case 1:
+ out_8(p->regs + PHB_CONFIG_DATA + (offset & 3), data);
+ break;
+ case 2:
+ out_le16(p->regs + PHB_CONFIG_DATA + (offset & 2), data);
+ break;
+ case 4:
+ out_le32(p->regs + PHB_CONFIG_DATA, data);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+ }
+ PHBLOGCFG(p, "%03x CFG%d Wr %02x=%08x\n", bdfn, 8 * size, offset, data);
+ return OPAL_SUCCESS;
+}
+
+#define PHB4_PCI_CFG_WRITE(size, type) \
+static int64_t phb4_pcicfg_write##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type data) \
+{ \
+ struct phb4 *p = phb_to_phb4(phb); \
+ \
+ return phb4_pcicfg_write(p, bdfn, offset, sizeof(type), data); \
+}
+
+PHB4_PCI_CFG_READ(8, u8)
+PHB4_PCI_CFG_READ(16, u16)
+PHB4_PCI_CFG_READ(32, u32)
+PHB4_PCI_CFG_WRITE(8, u8)
+PHB4_PCI_CFG_WRITE(16, u16)
+PHB4_PCI_CFG_WRITE(32, u32)
+
+static int64_t phb4_get_reserved_pe_number(struct phb *phb)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+
+ return PHB4_RESERVED_PE_NUM(p);
+}
+
+
+static void phb4_root_port_init(struct phb *phb, struct pci_device *dev,
+ int ecap, int aercap)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ struct pci_slot *slot = dev->slot;
+ uint16_t bdfn = dev->bdfn;
+ uint16_t val16;
+ uint32_t val32;
+
+ /*
+ * Use the PHB's callback so that UTL events will be masked or
+ * unmasked when the link is down or up.
+ */
+ if (dev->slot && dev->slot->ops.prepare_link_change &&
+ phb->slot && phb->slot->ops.prepare_link_change)
+ dev->slot->ops.prepare_link_change =
+ phb->slot->ops.prepare_link_change;
+
+ // FIXME: check recommended init values for phb4
+
+ /*
+ * Enable the bridge slot capability in the root port's config
+ * space. This should probably be done *before* we start
+ * scanning config space, but we need a pci_device struct to
+ * exist before we do a slot lookup so *faaaaaaaaaaaaaart*
+ */
+ if (slot && slot->pluggable && slot->power_limit) {
+ uint64_t val;
+
+ val = in_be64(p->regs + PHB_PCIE_SCR);
+ val |= PHB_PCIE_SCR_SLOT_CAP;
+ out_be64(p->regs + PHB_PCIE_SCR, val);
+
+ /* update the cached slotcap */
+ pci_cfg_read32(phb, bdfn, ecap + PCICAP_EXP_SLOTCAP,
+ &slot->slot_cap);
+ }
+
+ /* Enable SERR and parity checking */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16);
+ val16 |= (PCI_CFG_CMD_SERR_EN | PCI_CFG_CMD_PERR_RESP |
+ PCI_CFG_CMD_MEM_EN);
+ pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16);
+
+ /* Enable reporting various errors */
+ if (!ecap) return;
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16);
+ val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT |
+ PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT |
+ PCICAP_EXP_DEVCTL_UR_REPORT);
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16);
+
+ if (!aercap) return;
+
+ /* Mask various unrecoverable errors */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, &val32);
+ val32 |= (PCIECAP_AER_UE_MASK_POISON_TLP |
+ PCIECAP_AER_UE_MASK_COMPL_TIMEOUT |
+ PCIECAP_AER_UE_MASK_COMPL_ABORT |
+ PCIECAP_AER_UE_MASK_ECRC);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, val32);
+
+ /* Report various unrecoverable errors as fatal errors */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, &val32);
+ val32 |= (PCIECAP_AER_UE_SEVERITY_DLLP |
+ PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN |
+ PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_SEVERITY_UNEXP_COMPL |
+ PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW |
+ PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32);
+
+ /* Mask various recoverable errors */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, &val32);
+ val32 |= PCIECAP_AER_CE_MASK_ADV_NONFATAL;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32);
+
+ /* Enable ECRC check */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32);
+ val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN |
+ PCIECAP_AER_CAPCTL_ECRCC_EN);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32);
+
+ /* Enable all error reporting */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, &val32);
+ val32 |= (PCIECAP_AER_RERR_CMD_FE |
+ PCIECAP_AER_RERR_CMD_NFE |
+ PCIECAP_AER_RERR_CMD_CE);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, val32);
+}
+
+static void phb4_switch_port_init(struct phb *phb,
+ struct pci_device *dev,
+ int ecap, int aercap)
+{
+ uint16_t bdfn = dev->bdfn;
+ uint16_t val16;
+ uint32_t val32;
+
+ // FIXME: update AER settings for phb4
+
+ /* Enable SERR and parity checking and disable INTx */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16);
+ val16 |= (PCI_CFG_CMD_PERR_RESP |
+ PCI_CFG_CMD_SERR_EN |
+ PCI_CFG_CMD_INTx_DIS);
+ pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16);
+
+ /* Disable parity error response and enable system error reporting */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_BRCTL, &val16);
+ val16 &= ~PCI_CFG_BRCTL_PERR_RESP_EN;
+ val16 |= PCI_CFG_BRCTL_SERR_EN;
+ pci_cfg_write16(phb, bdfn, PCI_CFG_BRCTL, val16);
+
+ /* Enable reporting various errors */
+ if (!ecap) return;
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16);
+ val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT |
+ PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT);
+ /* HW279570 - Disable reporting of correctable errors */
+ val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT;
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16);
+
+ /* Unmask all unrecoverable errors */
+ if (!aercap) return;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, 0x0);
+
+ /* Severity of unrecoverable errors */
+ if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT)
+ val32 = (PCIECAP_AER_UE_SEVERITY_DLLP |
+ PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN |
+ PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW |
+ PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP |
+ PCIECAP_AER_UE_SEVERITY_INTERNAL);
+ else
+ val32 = (PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT |
+ PCIECAP_AER_UE_SEVERITY_INTERNAL);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32);
+
+ /*
+ * Mask various correctable errors
+ */
+ val32 = PCIECAP_AER_CE_MASK_ADV_NONFATAL;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32);
+
+ /* Enable ECRC generation and disable ECRC check */
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32);
+ val32 |= PCIECAP_AER_CAPCTL_ECRCG_EN;
+ val32 &= ~PCIECAP_AER_CAPCTL_ECRCC_EN;
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32);
+}
+
+static void phb4_endpoint_init(struct phb *phb,
+ struct pci_device *dev,
+ int ecap, int aercap)
+{
+ uint16_t bdfn = dev->bdfn;
+ uint16_t val16;
+ uint32_t val32;
+
+ /* Enable SERR and parity checking */
+ pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16);
+ val16 |= (PCI_CFG_CMD_PERR_RESP |
+ PCI_CFG_CMD_SERR_EN);
+ pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16);
+
+ /* Enable reporting various errors */
+ if (!ecap) return;
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16);
+ val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT;
+ val16 |= (PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT |
+ PCICAP_EXP_DEVCTL_UR_REPORT);
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16);
+
+ /* Enable ECRC generation and check */
+ if (!aercap)
+ return;
+
+ pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32);
+ val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN |
+ PCIECAP_AER_CAPCTL_ECRCC_EN);
+ pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32);
+}
+
+static int64_t phb4_pcicfg_no_dstate(void *dev __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len __unused,
+ uint32_t *data __unused, bool write)
+{
+ uint32_t loff = offset - pcrf->start;
+
+ /* Disable D-state change on children of the PHB. For now we
+ * simply block all writes to the PM control/status
+ */
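+ /* Offsets 4..5 within the PM capability hold the PMCSR, which is
+ * where the D-state is set.
+ */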
+ if (write && loff >= 4 && loff < 6)
+ return OPAL_SUCCESS;
+
+ return OPAL_PARTIAL;
+}
+
+void phb4_pec2_dma_engine_realloc(struct phb4 *p)
+{
+ uint64_t reg;
+
+ /*
+ * Allocate 16 extra dma read engines to stack 0, to boost dma
+ * performance for devices on stack 0 of PEC2, i.e. PHB3.
+ * It comes at a price of reduced read engine allocation for
+ * devices on stack 1 and 2. The engine allocation becomes
+ * 48/8/8 instead of the default 32/16/16.
+ *
+ * The reallocation magic value should be 0xffff0000ff008000,
+ * but per the PCI designers, dma engine 32 (bit 0) has a
+ * quirk, and 0x7fff80007F008000 has the same effect (engine
+ * 32 goes to PHB4).
+ */
+ if (p->index != 3) /* shared slot on PEC2 */
+ return;
+
+ PHBINF(p, "Allocating an extra 16 dma read engines on PEC2 stack0\n");
+ reg = 0x7fff80007F008000ULL;
+ xscom_write(p->chip_id,
+ p->pci_xscom + XPEC_PCI_PRDSTKOVR, reg);
+ xscom_write(p->chip_id,
+ p->pe_xscom + XPEC_NEST_READ_STACK_OVERRIDE, reg);
+}
+
+static void phb4_check_device_quirks(struct pci_device *dev)
+{
+ /* Some special adapter tweaks for devices directly under the PHB */
+ if (dev->primary_bus != 1)
+ return;
+
+ /* PM quirk */
+ if (!pci_has_cap(dev, PCI_CFG_CAP_ID_PM, false))
+ return;
+
+ pci_add_cfg_reg_filter(dev,
+ pci_cap(dev, PCI_CFG_CAP_ID_PM, false), 8,
+ PCI_REG_FLAG_WRITE,
+ phb4_pcicfg_no_dstate);
+}
+
+static int phb4_device_init(struct phb *phb, struct pci_device *dev,
+ void *data __unused)
+{
+ int ecap, aercap;
+
+ /* Setup special device quirks */
+ phb4_check_device_quirks(dev);
+
+ /* Common initialization for the device */
+ pci_device_init(phb, dev);
+
+ ecap = pci_cap(dev, PCI_CFG_CAP_ID_EXP, false);
+ aercap = pci_cap(dev, PCIECAP_ID_AER, true);
+ if (dev->dev_type == PCIE_TYPE_ROOT_PORT)
+ phb4_root_port_init(phb, dev, ecap, aercap);
+ else if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT ||
+ dev->dev_type == PCIE_TYPE_SWITCH_DNPORT)
+ phb4_switch_port_init(phb, dev, ecap, aercap);
+ else
+ phb4_endpoint_init(phb, dev, ecap, aercap);
+
+ return 0;
+}
+
+static int64_t phb4_pci_reinit(struct phb *phb, uint64_t scope, uint64_t data)
+{
+ struct pci_device *pd;
+ uint16_t bdfn = data;
+ int ret;
+
+ if (scope != OPAL_REINIT_PCI_DEV)
+ return OPAL_PARAMETER;
+
+ pd = pci_find_dev(phb, bdfn);
+ if (!pd)
+ return OPAL_PARAMETER;
+
+ ret = phb4_device_init(phb, pd, NULL);
+ if (ret)
+ return OPAL_HARDWARE;
+
+ return OPAL_SUCCESS;
+}
+
+/* Default value for MBT0, see comments in init_ioda_cache() */
+static uint64_t phb4_default_mbt0(struct phb4 *p, unsigned int bar_idx)
+{
+ uint64_t mbt0;
+
+ switch (p->mbt_size - bar_idx - 1) {
+ case 0:
+ mbt0 = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_MDT);
+ mbt0 = SETFIELD(IODA3_MBT0_MDT_COLUMN, mbt0, 3);
+ break;
+ case 1:
+ mbt0 = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_MDT);
+ mbt0 = SETFIELD(IODA3_MBT0_MDT_COLUMN, mbt0, 2);
+ break;
+ case 2:
+ mbt0 = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_MDT);
+ mbt0 = SETFIELD(IODA3_MBT0_MDT_COLUMN, mbt0, 1);
+ break;
+ default:
+ mbt0 = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_PE_SEG);
+ }
+ return mbt0;
+}
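+
+/*
+ * For illustration, with a hypothetical mbt_size of 32 the mapping above
+ * gives: BAR 31 -> MDT column 3, BAR 30 -> column 2, BAR 29 -> column 1,
+ * and BARs 1..28 default to fully segmented mode (segment# == PE#).
+ */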
+
+/*
+ * Clear the saved (cached) IODA state.
+ *
+ * The caches here are used to save the configuration of the IODA tables
+ * done by the OS. When the PHB is reset it loses all of its internal state
+ * so we need to keep a copy to restore from. This function re-initialises
+ * the saved state to sane defaults.
+ */
+static void phb4_init_ioda_cache(struct phb4 *p)
+{
+ uint32_t i;
+
+ /*
+ * The RTT entries (RTE) are supposed to be initialised to
+ * 0xFF which indicates an invalid PE# for that RTT index
+ * (the bdfn). However, we set them to the reserved PE# since
+ * Linux needs to find the devices first by scanning config
+ * space and this occurs before PEs have been assigned.
+ */
+ for (i = 0; i < RTT_TABLE_ENTRIES; i++)
+ p->tbl_rtt[i] = cpu_to_be16(PHB4_RESERVED_PE_NUM(p));
+ memset(p->tbl_peltv, 0x0, p->tbl_peltv_size);
+ memset(p->tve_cache, 0x0, sizeof(p->tve_cache));
+
+ /* XXX Should we mask them ? */
+ memset(p->mist_cache, 0x0, sizeof(p->mist_cache));
+
+ /* Configure MBT entries 1...N */
+
+ /* Column 0 is left 0 and will be used for M32 and configured
+ * by the OS. We use MDT columns 1..3 for the last 3 BARs, thus
+ * allowing Linux to remap those, and set up all the other ones
+ * for now in mode 00 (segment# == PE#). By default those
+ * columns are set to map the same way.
+ */
+ for (i = 0; i < p->max_num_pes; i++) {
+ p->mdt_cache[i] = SETFIELD(IODA3_MDT_PE_B, 0ull, i);
+ p->mdt_cache[i] |= SETFIELD(IODA3_MDT_PE_C, 0ull, i);
+ p->mdt_cache[i] |= SETFIELD(IODA3_MDT_PE_D, 0ull, i);
+ }
+
+ /* Initialize MBT entries for BARs 1...N */
+ for (i = 1; i < p->mbt_size; i++) {
+ p->mbt_cache[i][0] = phb4_default_mbt0(p, i);
+ p->mbt_cache[i][1] = 0;
+ }
+
+ /* Initialize M32 BAR using MBT entry 0, MDT column A */
+ p->mbt_cache[0][0] = SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_MDT);
+ p->mbt_cache[0][0] |= SETFIELD(IODA3_MBT0_MDT_COLUMN, 0ull, 0);
+ p->mbt_cache[0][0] |= IODA3_MBT0_TYPE_M32 | (p->mm1_base & IODA3_MBT0_BASE_ADDR);
+ p->mbt_cache[0][1] = IODA3_MBT1_ENABLE | ((~(M32_PCI_SIZE - 1)) & IODA3_MBT1_MASK);
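+ /* The pair above encodes the M32 window: MBT0 carries the mode, MDT
+ * column select, type and base address, while MBT1 carries the enable
+ * bit and the address mask, i.e. ~(M32_PCI_SIZE - 1).
+ */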
+}
+
+static int64_t phb4_wait_bit(struct phb4 *p, uint32_t reg,
+ uint64_t mask, uint64_t want_val)
+{
+ uint64_t val;
+
+ /* Wait for all pending TCE kills to complete
+ *
+ * XXX Add timeout...
+ */
+ /* XXX SIMICS is nasty... */
+ if ((reg == PHB_TCE_KILL || reg == PHB_DMA_READ_WRITE_SYNC) &&
+ chip_quirk(QUIRK_SIMICS))
+ return OPAL_SUCCESS;
+
+ for (;;) {
+ val = in_be64(p->regs + reg);
+ if (val == 0xffffffffffffffffull) {
+ /* XXX Fenced ? */
+ return OPAL_HARDWARE;
+ }
+ if ((val & mask) == want_val)
+ break;
+
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_tce_kill(struct phb *phb, uint32_t kill_type,
+ uint64_t pe_number, uint32_t tce_size,
+ uint64_t dma_addr, uint32_t npages)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t val;
+ int64_t rc;
+
+ sync();
+ switch(kill_type) {
+ case OPAL_PCI_TCE_KILL_PAGES:
+ while (npages--) {
+ /* Wait for a slot in the HW kill queue */
+ rc = phb4_wait_bit(p, PHB_TCE_KILL,
+ PHB_TCE_KILL_ALL |
+ PHB_TCE_KILL_PE |
+ PHB_TCE_KILL_ONE, 0);
+ if (rc)
+ return rc;
+ val = SETFIELD(PHB_TCE_KILL_PENUM, dma_addr, pe_number);
+
+ /* Set appropriate page size */
+ switch(tce_size) {
+ case 0x1000:
+ if (dma_addr & 0xf000000000000fffull)
+ return OPAL_PARAMETER;
+ break;
+ case 0x10000:
+ if (dma_addr & 0xf00000000000ffffull)
+ return OPAL_PARAMETER;
+ val |= PHB_TCE_KILL_PSEL | PHB_TCE_KILL_64K;
+ break;
+ case 0x200000:
+ if (dma_addr & 0xf0000000001fffffull)
+ return OPAL_PARAMETER;
+ val |= PHB_TCE_KILL_PSEL | PHB_TCE_KILL_2M;
+ break;
+ case 0x40000000:
+ if (dma_addr & 0xf00000003fffffffull)
+ return OPAL_PARAMETER;
+ val |= PHB_TCE_KILL_PSEL | PHB_TCE_KILL_1G;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+ /* Perform kill */
+ out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_ONE | val);
+ /* Next page */
+ dma_addr += tce_size;
+ }
+ break;
+ case OPAL_PCI_TCE_KILL_PE:
+ /* Wait for a slot in the HW kill queue */
+ rc = phb4_wait_bit(p, PHB_TCE_KILL,
+ PHB_TCE_KILL_ALL |
+ PHB_TCE_KILL_PE |
+ PHB_TCE_KILL_ONE, 0);
+ if (rc)
+ return rc;
+ /* Perform kill */
+ out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_PE |
+ SETFIELD(PHB_TCE_KILL_PENUM, 0ull, pe_number));
+ break;
+ case OPAL_PCI_TCE_KILL_ALL:
+ /* Wait for a slot in the HW kill queue */
+ rc = phb4_wait_bit(p, PHB_TCE_KILL,
+ PHB_TCE_KILL_ALL |
+ PHB_TCE_KILL_PE |
+ PHB_TCE_KILL_ONE, 0);
+ if (rc)
+ return rc;
+ /* Perform kill */
+ out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_ALL);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ /* Start DMA sync process */
+ if (is_phb5()){
+ val = in_be64(p->regs + PHB_DMA_READ_WRITE_SYNC) &
+ (PHB_DMA_READ_SYNC_COMPLETE |
+ PHB_DMA_WRITE_SYNC_COMPLETE);
+ out_be64(p->regs + PHB_DMA_READ_WRITE_SYNC,
+ val | PHB_DMA_READ_SYNC_START);
+
+ } else {
+ out_be64(p->regs + PHB_DMA_READ_WRITE_SYNC,
+ PHB_DMA_READ_SYNC_START);
+ }
+
+ /* Wait for kill to complete */
+ rc = phb4_wait_bit(p, PHB_Q_DMA_R, PHB_Q_DMA_R_TCE_KILL_STATUS, 0);
+ if (rc)
+ return rc;
+
+ /* Wait for DMA sync to complete */
+ return phb4_wait_bit(p, PHB_DMA_READ_WRITE_SYNC,
+ PHB_DMA_READ_SYNC_COMPLETE,
+ PHB_DMA_READ_SYNC_COMPLETE);
+}
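+
+/*
+ * Note on the per-page kill path above: the DMA address must be naturally
+ * aligned to the TCE page size and must have bits 60..63 clear; the masks
+ * such as 0xf00000000000ffffull check both conditions at once.
+ */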
+
+/* phb4_ioda_reset - Reset the IODA tables
+ *
+ * @purge: If true, the cache is cleared and the cleared values
+ * are applied to HW. If false, the cached values are
+ * applied to HW
+ *
+ * This resets the IODA tables in the PHB. It is called at
+ * initialization time, on PHB reset, and can be called
+ * explicitly from OPAL
+ */
+static int64_t phb4_ioda_reset(struct phb *phb, bool purge)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint32_t i;
+ uint64_t val;
+
+ if (purge) {
+ PHBDBG(p, "Purging all IODA tables...\n");
+ if (phb->slot)
+ phb->slot->link_retries = PHB4_LINK_LINK_RETRIES;
+ phb4_init_ioda_cache(p);
+ }
+
+ /* Init_30..31 - Errata workaround, clear PESTA entry 0 */
+ phb4_ioda_sel(p, IODA3_TBL_PESTA, 0, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+
+ /* Init_32..33 - MIST */
+ phb4_ioda_sel(p, IODA3_TBL_MIST, 0, true);
+ val = in_be64(p->regs + PHB_IODA_ADDR);
+ val = SETFIELD(PHB_IODA_AD_MIST_PWV, val, 0xf);
+ out_be64(p->regs + PHB_IODA_ADDR, val);
+ for (i = 0; i < (p->num_irqs/4); i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->mist_cache[i]);
+
+ /* Init_34..35 - MRT */
+ phb4_ioda_sel(p, IODA3_TBL_MRT, 0, true);
+ for (i = 0; i < p->mrt_size; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+
+ /* Init_36..37 - TVT */
+ phb4_ioda_sel(p, IODA3_TBL_TVT, 0, true);
+ for (i = 0; i < p->tvt_size; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]);
+
+ /* Init_38..39 - MBT */
+ phb4_ioda_sel(p, IODA3_TBL_MBT, 0, true);
+ for (i = 0; i < p->mbt_size; i++) {
+ out_be64(p->regs + PHB_IODA_DATA0, p->mbt_cache[i][0]);
+ out_be64(p->regs + PHB_IODA_DATA0, p->mbt_cache[i][1]);
+ }
+
+ /* Init_40..41 - MDT */
+ phb4_ioda_sel(p, IODA3_TBL_MDT, 0, true);
+ for (i = 0; i < p->max_num_pes; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->mdt_cache[i]);
+
+ /* Additional OPAL specific inits */
+
+ /* Clear PEST & PEEV */
+ for (i = 0; i < p->max_num_pes; i++) {
+ phb4_ioda_sel(p, IODA3_TBL_PESTA, i, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ phb4_ioda_sel(p, IODA3_TBL_PESTB, i, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ }
+
+ phb4_ioda_sel(p, IODA3_TBL_PEEV, 0, true);
+ for (i = 0; i < p->max_num_pes/64; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+
+ /* Invalidate RTE, TCE cache */
+ out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL);
+
+ return phb4_tce_kill(&p->phb, OPAL_PCI_TCE_KILL_ALL, 0, 0, 0, 0);
+}
+
+/*
+ * Clear anything we have in the PAPR Error Injection registers. The
+ * spec says PAPR error injection should be one-shot, without a "sticky"
+ * bit, but experiments show otherwise. So we have to clear the registers
+ * at the appropriate point in the kernel to avoid an endlessly frozen
+ * PE.
+ */
+static int64_t phb4_papr_errinjct_reset(struct phb *phb)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, 0x0ul);
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_ADDR, 0x0ul);
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_MASK, 0x0ul);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_set_phb_mem_window(struct phb *phb,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint64_t addr,
+ uint64_t pci_addr __unused,
+ uint64_t size)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t mbt0, mbt1;
+
+ /*
+ * We have a unified MBT for all BARs on PHB4.
+ *
+ * So we use it as follows:
+ *
+ * - M32 is hard wired to be MBT[0] and uses MDT column 0
+ * for remapping.
+ *
+ * - MBT[1..n] are available to the OS, currently only as
+ * fully segmented or single PE (we don't yet expose the
+ * new segmentation modes).
+ *
+ * - We configure the last 3 BARs to columns 1..3, initially
+ * set to segment# == PE#. We will need to provide some
+ * extensions to the existing APIs to enable remapping of
+ * segments on those BARs (and only those) as the current
+ * API forces single segment mode.
+ */
+ switch (window_type) {
+ case OPAL_IO_WINDOW_TYPE:
+ case OPAL_M32_WINDOW_TYPE:
+ return OPAL_UNSUPPORTED;
+ case OPAL_M64_WINDOW_TYPE:
+ if (window_num == 0 || window_num >= p->mbt_size) {
+ PHBERR(p, "%s: Invalid window %d\n",
+ __func__, window_num);
+ return OPAL_PARAMETER;
+ }
+
+ mbt0 = p->mbt_cache[window_num][0];
+ mbt1 = p->mbt_cache[window_num][1];
+
+ /* XXX For now we assume the 4K minimum alignment,
+ * todo: check with the HW folks what the exact limits
+ * are based on the segmentation model.
+ */
+ if ((addr & 0xFFFul) || (size & 0xFFFul)) {
+ PHBERR(p, "%s: Bad addr/size alignment %llx/%llx\n",
+ __func__, addr, size);
+ return OPAL_PARAMETER;
+ }
+
+ /* size should be 2^N */
+ if (!size || size & (size-1)) {
+ PHBERR(p, "%s: size not a power of 2: %llx\n",
+ __func__, size);
+ return OPAL_PARAMETER;
+ }
+
+ /* address should be size aligned */
+ if (addr & (size - 1)) {
+ PHBERR(p, "%s: addr not size aligned %llx/%llx\n",
+ __func__, addr, size);
+ return OPAL_PARAMETER;
+ }
+
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ /* The BAR shouldn't be enabled yet */
+ if (mbt0 & IODA3_MBT0_ENABLE)
+ return OPAL_PARTIAL;
+
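+	/*
+	 * Worked example of the encoding below: a 256MB window at
+	 * 0x600000000 puts 0x600000 (addr >> 12) in the base field and
+	 * ~((0x10000000 >> 12) - 1) = ~0xffff in the mask field, i.e.
+	 * ones in every mask bit above the 256MB span (SETFIELD clips
+	 * the value to the field width).
+	 */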
+ /* Apply the settings */
+ mbt0 = SETFIELD(IODA3_MBT0_BASE_ADDR, mbt0, addr >> 12);
+ mbt1 = SETFIELD(IODA3_MBT1_MASK, mbt1, ~((size >> 12) -1));
+ p->mbt_cache[window_num][0] = mbt0;
+ p->mbt_cache[window_num][1] = mbt1;
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * For one specific M64 BAR, it can be shared by all PEs,
+ * or owned by single PE exclusively.
+ */
+static int64_t phb4_phb_mmio_enable(struct phb *phb,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint16_t enable)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t mbt0, mbt1, base, mask;
+
+ /*
+	 * By design, PHB4 doesn't support IODT any more and the M32
+	 * BAR can't be enabled here either. So this function only
+	 * handles M64 mapping, and each BAR is supposed to be shared
+	 * by all PEs.
+ *
+ * TODO: Add support for some of the new PHB4 split modes
+ */
+ switch (window_type) {
+ case OPAL_IO_WINDOW_TYPE:
+ case OPAL_M32_WINDOW_TYPE:
+ return OPAL_UNSUPPORTED;
+ case OPAL_M64_WINDOW_TYPE:
+ /* Window 0 is reserved for M32 */
+ if (window_num == 0 || window_num >= p->mbt_size ||
+ enable > OPAL_ENABLE_M64_NON_SPLIT) {
+ PHBDBG(p,
+ "phb4_phb_mmio_enable wrong args (window %d enable %d)\n",
+ window_num, enable);
+ return OPAL_PARAMETER;
+ }
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ /*
+	 * We need to check the base/mask while enabling the M64 BAR.
+	 * Otherwise, an invalid base/mask might unintentionally cause
+	 * the AIB to be fenced.
+ */
+ mbt0 = p->mbt_cache[window_num][0];
+ mbt1 = p->mbt_cache[window_num][1];
+
+ if (enable == OPAL_DISABLE_M64) {
+ /* Reset the window to disabled & default mode */
+ mbt0 = phb4_default_mbt0(p, window_num);
+ mbt1 = 0;
+ } else {
+ /* Verify that the mode is valid and consistent */
+ if (enable == OPAL_ENABLE_M64_SPLIT) {
+ uint64_t mode = GETFIELD(IODA3_MBT0_MODE, mbt0);
+ if (mode != IODA3_MBT0_MODE_PE_SEG &&
+ mode != IODA3_MBT0_MODE_MDT)
+ return OPAL_PARAMETER;
+ } else if (enable == OPAL_ENABLE_M64_NON_SPLIT) {
+ if (GETFIELD(IODA3_MBT0_MODE, mbt0) !=
+ IODA3_MBT0_MODE_SINGLE_PE)
+ return OPAL_PARAMETER;
+ } else
+ return OPAL_PARAMETER;
+
+ base = GETFIELD(IODA3_MBT0_BASE_ADDR, mbt0);
+ base = (base << 12);
+ mask = GETFIELD(IODA3_MBT1_MASK, mbt1);
+ if (base < p->mm0_base || !mask)
+ return OPAL_PARTIAL;
+
+ mbt0 |= IODA3_MBT0_ENABLE;
+ mbt1 |= IODA3_MBT1_ENABLE;
+ }
+
+ /* Update HW and cache */
+ p->mbt_cache[window_num][0] = mbt0;
+ p->mbt_cache[window_num][1] = mbt1;
+ phb4_ioda_sel(p, IODA3_TBL_MBT, window_num << 1, true);
+ out_be64(p->regs + PHB_IODA_DATA0, mbt0);
+ out_be64(p->regs + PHB_IODA_DATA0, mbt1);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_map_pe_mmio_window(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint16_t segment_num)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t mbt0, mbt1, mdt0;
+
+ if (pe_number >= p->num_pes)
+ return OPAL_PARAMETER;
+
+ /*
+	 * We support a combined MDT that has 4 columns. We let the OS
+	 * use column 0 for M32.
+ *
+ * We configure the 3 last BARs to map column 3..1 which by default
+ * are set to map segment# == pe#, but can be remapped here if we
+ * extend this function.
+ *
+ * The problem is that the current API was "hijacked" so that an
+ * attempt at remapping any segment of an M64 has the effect of
+ * turning it into a single-PE mode BAR. So if we want to support
+	 * remapping we'll have to work around this, for example by creating
+ * a new API or a new window type...
+ */
+ switch(window_type) {
+ case OPAL_IO_WINDOW_TYPE:
+ return OPAL_UNSUPPORTED;
+ case OPAL_M32_WINDOW_TYPE:
+ if (window_num != 0 || segment_num >= p->num_pes)
+ return OPAL_PARAMETER;
+
+ mdt0 = p->mdt_cache[segment_num];
+ mdt0 = SETFIELD(IODA3_MDT_PE_A, mdt0, pe_number);
+ phb4_ioda_sel(p, IODA3_TBL_MDT, segment_num, false);
+ out_be64(p->regs + PHB_IODA_DATA0, mdt0);
+ break;
+ case OPAL_M64_WINDOW_TYPE:
+ if (window_num == 0 || window_num >= p->mbt_size)
+ return OPAL_PARAMETER;
+
+ mbt0 = p->mbt_cache[window_num][0];
+ mbt1 = p->mbt_cache[window_num][1];
+
+ /* The BAR shouldn't be enabled yet */
+ if (mbt0 & IODA3_MBT0_ENABLE)
+ return OPAL_PARTIAL;
+
+ /* Set to single PE mode and configure the PE */
+ mbt0 = SETFIELD(IODA3_MBT0_MODE, mbt0,
+ IODA3_MBT0_MODE_SINGLE_PE);
+ mbt1 = SETFIELD(IODA3_MBT1_SINGLE_PE_NUM, mbt1, pe_number);
+ p->mbt_cache[window_num][0] = mbt0;
+ p->mbt_cache[window_num][1] = mbt1;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_map_pe_dma_window(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t tts_encoded;
+ uint64_t data64 = 0;
+
+ /*
+ * We configure the PHB in 2 TVE per PE mode to match phb3.
+ * Current Linux implementation *requires* the two windows per
+ * PE.
+ *
+ * Note: On DD2.0 this is the normal mode of operation.
+ */
+
+ /*
+ * Sanity check. We currently only support "2 window per PE" mode
+	 * i.e., only bit 59 of the PCI address is used to select the window
+ */
+ if (pe_number >= p->num_pes || (window_id >> 1) != pe_number)
+ return OPAL_PARAMETER;
+
+ /*
+ * tce_table_size == 0 is used to disable an entry, in this case
+ * we ignore other arguments
+ */
+ if (tce_table_size == 0) {
+ phb4_ioda_sel(p, IODA3_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ p->tve_cache[window_id] = 0;
+ return OPAL_SUCCESS;
+ }
+
+ /* Additional arguments validation */
+ if (tce_levels < 1 || tce_levels > 5 ||
+ !is_pow2(tce_table_size) ||
+ tce_table_size < 0x1000)
+ return OPAL_PARAMETER;
+
+ /* Encode TCE table size */
+ data64 = SETFIELD(IODA3_TVT_TABLE_ADDR, 0ul, tce_table_addr >> 12);
+ tts_encoded = ilog2(tce_table_size) - 11;
+ if (tts_encoded > 31)
+ return OPAL_PARAMETER;
+ data64 = SETFIELD(IODA3_TVT_TCE_TABLE_SIZE, data64, tts_encoded);
+
+ /* Encode TCE page size */
+ switch (tce_page_size) {
+ case 0x1000: /* 4K */
+ data64 = SETFIELD(IODA3_TVT_IO_PSIZE, data64, 1);
+ break;
+ case 0x10000: /* 64K */
+ data64 = SETFIELD(IODA3_TVT_IO_PSIZE, data64, 5);
+ break;
+ case 0x200000: /* 2M */
+ data64 = SETFIELD(IODA3_TVT_IO_PSIZE, data64, 10);
+ break;
+ case 0x40000000: /* 1G */
+ data64 = SETFIELD(IODA3_TVT_IO_PSIZE, data64, 19);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
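+	/*
+	 * Worked example: a single-level 64KB TCE table of 64KB pages
+	 * encodes TCE_TABLE_SIZE = ilog2(0x10000) - 11 = 5, IO_PSIZE = 5
+	 * and (below) NUM_LEVELS = 0.
+	 */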
+
+ /* Encode number of levels */
+ data64 = SETFIELD(IODA3_TVT_NUM_LEVELS, data64, tce_levels - 1);
+
+ phb4_ioda_sel(p, IODA3_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, data64);
+ p->tve_cache[window_id] = data64;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_map_pe_dma_window_real(struct phb *phb,
+ uint64_t pe_number,
+ uint16_t window_id,
+ uint64_t pci_start_addr,
+ uint64_t pci_mem_size)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t end = pci_start_addr + pci_mem_size;
+ uint64_t tve;
+
+ if (pe_number >= p->num_pes ||
+ (window_id >> 1) != pe_number)
+ return OPAL_PARAMETER;
+
+ if (pci_mem_size) {
+ /* Enable */
+
+ /*
+		 * Check that the start address has the right TVE index;
+		 * we only support the 1-bit mode where each PE has 2
+		 * TVEs.
+ */
+ if ((pci_start_addr >> 59) != (window_id & 1))
+ return OPAL_PARAMETER;
+ pci_start_addr &= ((1ull << 59) - 1);
+ end = pci_start_addr + pci_mem_size;
+
+ /* We have to be 16M aligned */
+ if ((pci_start_addr & 0x00ffffff) ||
+ (pci_mem_size & 0x00ffffff))
+ return OPAL_PARAMETER;
+
+ /*
+		 * It *looks* like this is the max we can support (we need
+		 * to verify this). Also, we are not checking for rollover,
+		 * but then we aren't trying too hard to protect ourselves
+		 * against a completely broken OS.
+ */
+ if (end > 0x0003ffffffffffffull)
+ return OPAL_PARAMETER;
+
+ /*
+ * Put start address bits 49:24 into TVE[52:53]||[0:23]
+ * and end address bits 49:24 into TVE[54:55]||[24:47]
+ * and set TVE[51]
+ */
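+		/*
+		 * Because start and size are 16M aligned (checked above),
+		 * bits 23:0 of both start and end are zero, so bits 49:24
+		 * of each fully describe the window and fit in the split
+		 * fields below.
+		 */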
+ tve = (pci_start_addr << 16) & (0xffffffull << 40);
+ tve |= (pci_start_addr >> 38) & (3ull << 10);
+ tve |= (end >> 8) & (0xfffffful << 16);
+ tve |= (end >> 40) & (3ull << 8);
+ tve |= PPC_BIT(51) | IODA3_TVT_NON_TRANSLATE_50;
+ } else {
+ /* Disable */
+ tve = 0;
+ }
+
+ phb4_ioda_sel(p, IODA3_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, tve);
+ p->tve_cache[window_id] = tve;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_set_option(struct phb *phb, enum OpalPhbOption opt,
+ uint64_t setting)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t data64;
+
+ data64 = phb4_read_reg(p, PHB_CTRLR);
+ switch (opt) {
+ case OPAL_PHB_OPTION_TVE1_4GB:
+ if (setting > 1)
+ return OPAL_PARAMETER;
+
+ PHBDBG(p, "4GB bypass mode = %lld\n", setting);
+ if (setting)
+ data64 |= PPC_BIT(24);
+ else
+ data64 &= ~PPC_BIT(24);
+ break;
+ case OPAL_PHB_OPTION_MMIO_EEH_DISABLE:
+ if (setting > 1)
+ return OPAL_PARAMETER;
+
+ PHBDBG(p, "MMIO EEH Disable = %lld\n", setting);
+ if (setting)
+ data64 |= PPC_BIT(14);
+ else
+ data64 &= ~PPC_BIT(14);
+ break;
+ default:
+ return OPAL_UNSUPPORTED;
+ }
+ phb4_write_reg(p, PHB_CTRLR, data64);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_get_option(struct phb *phb, enum OpalPhbOption opt,
+ __be64 *setting)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t data64;
+
+ data64 = phb4_read_reg(p, PHB_CTRLR);
+ switch (opt) {
+ case OPAL_PHB_OPTION_TVE1_4GB:
+ *setting = cpu_to_be64((data64 & PPC_BIT(24)) ? 1 : 0);
+ break;
+ case OPAL_PHB_OPTION_MMIO_EEH_DISABLE:
+ *setting = cpu_to_be64((data64 & PPC_BIT(14)) ? 1 : 0);
+ break;
+ default:
+ return OPAL_UNSUPPORTED;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_set_ive_pe(struct phb *phb,
+ uint64_t pe_number,
+ uint32_t ive_num)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint32_t mist_idx;
+ uint32_t mist_quad;
+ uint32_t mist_shift;
+ uint64_t val;
+
+ if (pe_number >= p->num_pes || ive_num >= (p->num_irqs - 8))
+ return OPAL_PARAMETER;
+
+ mist_idx = ive_num >> 2;
+ mist_quad = ive_num & 3;
+ mist_shift = (3 - mist_quad) << 4;
+ p->mist_cache[mist_idx] &= ~(0x0fffull << mist_shift);
+ p->mist_cache[mist_idx] |= ((uint64_t)pe_number) << mist_shift;
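+	/*
+	 * Worked example: for ive_num 0x46, mist_idx is 0x11, mist_quad
+	 * is 2 and mist_shift is 16, so the PE# lands in the low bits of
+	 * the third 16-bit quad (quads are packed MSB first: quad 0 at
+	 * shift 48, quad 3 at shift 0). The matching single-quad write
+	 * enable used below is 8 >> 2 = 2.
+	 */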
+
+ /* Note: This has the side effect of clearing P/Q, so this
+ * shouldn't be called while the interrupt is "hot"
+ */
+
+ phb4_ioda_sel(p, IODA3_TBL_MIST, mist_idx, false);
+
+ /* We need to inject the appropriate MIST write enable bit
+ * in the IODA table address register
+ */
+ val = in_be64(p->regs + PHB_IODA_ADDR);
+ val = SETFIELD(PHB_IODA_AD_MIST_PWV, val, 8 >> mist_quad);
+ out_be64(p->regs + PHB_IODA_ADDR, val);
+
+ /* Write entry */
+ out_be64(p->regs + PHB_IODA_DATA0, p->mist_cache[mist_idx]);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_get_msi_32(struct phb *phb,
+ uint64_t pe_number,
+ uint32_t ive_num,
+ uint8_t msi_range,
+ uint32_t *msi_address,
+ uint32_t *message_data)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+
+ /*
+	 * Sanity check. As on PHB3, we needn't check the mve_number
+	 * (PE#) since the interrupt source is purely determined by its
+	 * DMA address and data, but the check isn't harmful.
+ */
+ if (pe_number >= p->num_pes ||
+ ive_num >= (p->num_irqs - 8) ||
+ msi_range != 1 || !msi_address|| !message_data)
+ return OPAL_PARAMETER;
+
+ /*
+ * DMA address and data will form the IVE index.
+ * For more details, please refer to IODA2 spec.
+ */
+ *msi_address = 0xFFFF0000 | ((ive_num << 4) & 0xFFFFFE0F);
+ *message_data = ive_num & 0x1F;
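+	/*
+	 * Worked example: ive_num 0x123 yields msi_address 0xFFFF1200
+	 * and message_data 0x3; the low 5 bits of the IVE number travel
+	 * in the MSI data and the remaining bits in address bits 9 and
+	 * above.
+	 */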
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_get_msi_64(struct phb *phb,
+ uint64_t pe_number,
+ uint32_t ive_num,
+ uint8_t msi_range,
+ uint64_t *msi_address,
+ uint32_t *message_data)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+
+ /* Sanity check */
+ if (pe_number >= p->num_pes ||
+ ive_num >= (p->num_irqs - 8) ||
+ msi_range != 1 || !msi_address || !message_data)
+ return OPAL_PARAMETER;
+
+ /*
+ * DMA address and data will form the IVE index.
+ * For more details, please refer to IODA2 spec.
+ */
+ *msi_address = (0x1ul << 60) | ((ive_num << 4) & 0xFFFFFFFFFFFFFE0Ful);
+ *message_data = ive_num & 0x1F;
+
+ return OPAL_SUCCESS;
+}
+
+static void phb4_rc_err_clear(struct phb4 *p)
+{
+ /* Init_47 - Clear errors */
+ phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_SECONDARY_STATUS, 0xffff);
+
+ if (p->ecap <= 0)
+ return;
+
+ phb4_pcicfg_write16(&p->phb, 0, p->ecap + PCICAP_EXP_DEVSTAT,
+ PCICAP_EXP_DEVSTAT_CE |
+ PCICAP_EXP_DEVSTAT_NFE |
+ PCICAP_EXP_DEVSTAT_FE |
+ PCICAP_EXP_DEVSTAT_UE);
+
+ if (p->aercap <= 0)
+ return;
+
+ /* Clear all UE status */
+ phb4_pcicfg_write32(&p->phb, 0, p->aercap + PCIECAP_AER_UE_STATUS,
+ 0xffffffff);
+ /* Clear all CE status */
+ phb4_pcicfg_write32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_STATUS,
+ 0xffffffff);
+ /* Clear root error status */
+ phb4_pcicfg_write32(&p->phb, 0, p->aercap + PCIECAP_AER_RERR_STA,
+ 0xffffffff);
+}
+
+static void phb4_err_clear_regb(struct phb4 *p)
+{
+ uint64_t val64;
+
+ val64 = phb4_read_reg(p, PHB_REGB_ERR_STATUS);
+ phb4_write_reg(p, PHB_REGB_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_REGB_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_REGB_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_REGB_ERR_LOG_1, 0x0ul);
+}
+
+/*
+ * The function can be called during error recovery for all classes of
+ * errors. This is new to PHB4; previous revisions had separate
+ * sequences for INF/ER/Fatal errors.
+ *
+ * "Rec #" in this function refer to "Recov_#" steps in the
+ * PHB4 INF recovery sequence.
+ */
+static void phb4_err_clear(struct phb4 *p)
+{
+ uint64_t val64;
+ uint64_t fir = phb4_read_reg(p, PHB_LEM_FIR_ACCUM);
+
+ /* Rec 1: Acquire the PCI config lock (we don't need to do this) */
+
+ /* Rec 2...15: Clear error status in RC config space */
+ phb4_rc_err_clear(p);
+
+ /* Rec 16...23: Clear PBL errors */
+ val64 = phb4_read_reg(p, PHB_PBL_ERR_STATUS);
+ phb4_write_reg(p, PHB_PBL_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_PBL_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_PBL_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_PBL_ERR_LOG_1, 0x0ul);
+
+ /* Rec 24...31: Clear REGB errors */
+ phb4_err_clear_regb(p);
+
+ /* Rec 32...59: Clear PHB error trap */
+ val64 = phb4_read_reg(p, PHB_TXE_ERR_STATUS);
+ phb4_write_reg(p, PHB_TXE_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_TXE_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_TXE_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_TXE_ERR_LOG_1, 0x0ul);
+
+ val64 = phb4_read_reg(p, PHB_RXE_ARB_ERR_STATUS);
+ phb4_write_reg(p, PHB_RXE_ARB_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_RXE_ARB_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_RXE_ARB_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_RXE_ARB_ERR_LOG_1, 0x0ul);
+
+ val64 = phb4_read_reg(p, PHB_RXE_MRG_ERR_STATUS);
+ phb4_write_reg(p, PHB_RXE_MRG_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_RXE_MRG_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_RXE_MRG_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_RXE_MRG_ERR_LOG_1, 0x0ul);
+
+ val64 = phb4_read_reg(p, PHB_RXE_TCE_ERR_STATUS);
+ phb4_write_reg(p, PHB_RXE_TCE_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_RXE_TCE_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_RXE_TCE_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_RXE_TCE_ERR_LOG_1, 0x0ul);
+
+ val64 = phb4_read_reg(p, PHB_ERR_STATUS);
+ phb4_write_reg(p, PHB_ERR_STATUS, val64);
+ phb4_write_reg(p, PHB_ERR1_STATUS, 0x0ul);
+ phb4_write_reg(p, PHB_ERR_LOG_0, 0x0ul);
+ phb4_write_reg(p, PHB_ERR_LOG_1, 0x0ul);
+
+ /* Rec 61/62: Clear FIR/WOF */
+ phb4_write_reg(p, PHB_LEM_FIR_AND_MASK, ~fir);
+ phb4_write_reg(p, PHB_LEM_WOF, 0x0ul);
+
+ /* Rec 63: Update LEM mask to its initial value */
+ phb4_write_reg(p, PHB_LEM_ERROR_MASK, 0x0ul);
+
+ /* Rec 64: Clear the PCI config lock (we don't need to do this) */
+}
+
+static void phb4_read_phb_status(struct phb4 *p,
+ struct OpalIoPhb4ErrorData *stat)
+{
+ uint32_t i;
+ __be64 *pPEST;
+ uint16_t __16;
+ uint32_t __32;
+ uint64_t __64;
+
+ memset(stat, 0, sizeof(struct OpalIoPhb4ErrorData));
+
+ /* Error data common part */
+ stat->common.version = cpu_to_be32(OPAL_PHB_ERROR_DATA_VERSION_1);
+ stat->common.ioType = cpu_to_be32(OPAL_PHB_ERROR_DATA_TYPE_PHB4);
+ stat->common.len = cpu_to_be32(sizeof(struct OpalIoPhb4ErrorData));
+
+ /* Use ASB for config space if the PHB is fenced */
+ if (p->flags & PHB4_AIB_FENCED)
+ p->flags |= PHB4_CFG_USE_ASB;
+
+ /* Grab RC bridge control, make it 32-bit */
+ phb4_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &__16);
+ stat->brdgCtl = cpu_to_be32(__16);
+
+ /*
+ * Grab various RC PCIe capability registers. All device, slot
+ * and link status are 16-bit, so we grab the pair control+status
+ * for each of them
+ */
+ phb4_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_DEVCTL, &__32);
+ stat->deviceStatus = cpu_to_be32(__32);
+ phb4_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_SLOTCTL, &__32);
+ stat->slotStatus = cpu_to_be32(__32);
+ phb4_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL, &__32);
+ stat->linkStatus = cpu_to_be32(__32);
+
+ /*
+	 * I assume those are from the standard config space header: cmd &
+	 * status together make a 32-bit value. Secondary status is 16-bit,
+	 * so only the bottom half of that field is populated.
+ */
+ phb4_pcicfg_read32(&p->phb, 0, PCI_CFG_CMD, &__32);
+ stat->devCmdStatus = cpu_to_be32(__32);
+ phb4_pcicfg_read16(&p->phb, 0, PCI_CFG_SECONDARY_STATUS, &__16);
+ stat->devSecStatus = cpu_to_be32(__16);
+
+ /* Grab a bunch of AER regs */
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_RERR_STA, &__32);
+ stat->rootErrorStatus = cpu_to_be32(__32);
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_UE_STATUS, &__32);
+ stat->uncorrErrorStatus = cpu_to_be32(__32);
+
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_STATUS, &__32);
+ stat->corrErrorStatus = cpu_to_be32(__32);
+
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG0, &__32);
+ stat->tlpHdr1 = cpu_to_be32(__32);
+
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG1, &__32);
+ stat->tlpHdr2 = cpu_to_be32(__32);
+
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG2, &__32);
+ stat->tlpHdr3 = cpu_to_be32(__32);
+
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG3, &__32);
+ stat->tlpHdr4 = cpu_to_be32(__32);
+
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_SRCID, &__32);
+ stat->sourceId = cpu_to_be32(__32);
+
+
+ /* PEC NFIR, same as P8/PHB3 */
+ xscom_read(p->chip_id, p->pe_stk_xscom + 0x0, &__64);
+ stat->nFir = cpu_to_be64(__64);
+ xscom_read(p->chip_id, p->pe_stk_xscom + 0x3, &__64);
+ stat->nFirMask = cpu_to_be64(__64);
+ xscom_read(p->chip_id, p->pe_stk_xscom + 0x8, &__64);
+ stat->nFirWOF = cpu_to_be64(__64);
+
+ /* PHB4 inbound and outbound error Regs */
+ stat->phbPlssr = cpu_to_be64(phb4_read_reg_asb(p, PHB_CPU_LOADSTORE_STATUS));
+ stat->phbCsr = cpu_to_be64(phb4_read_reg_asb(p, PHB_DMA_CHAN_STATUS));
+ stat->lemFir = cpu_to_be64(phb4_read_reg_asb(p, PHB_LEM_FIR_ACCUM));
+ stat->lemErrorMask = cpu_to_be64(phb4_read_reg_asb(p, PHB_LEM_ERROR_MASK));
+ stat->lemWOF = cpu_to_be64(phb4_read_reg_asb(p, PHB_LEM_WOF));
+ stat->phbErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_ERR_STATUS));
+ stat->phbFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_ERR1_STATUS));
+ stat->phbErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_ERR_LOG_0));
+ stat->phbErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_ERR_LOG_1));
+ stat->phbTxeErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_TXE_ERR_STATUS));
+ stat->phbTxeFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_TXE_ERR1_STATUS));
+ stat->phbTxeErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_TXE_ERR_LOG_0));
+ stat->phbTxeErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_TXE_ERR_LOG_1));
+ stat->phbRxeArbErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_ARB_ERR_STATUS));
+ stat->phbRxeArbFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_ARB_ERR1_STATUS));
+ stat->phbRxeArbErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_ARB_ERR_LOG_0));
+ stat->phbRxeArbErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_ARB_ERR_LOG_1));
+ stat->phbRxeMrgErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_MRG_ERR_STATUS));
+ stat->phbRxeMrgFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_MRG_ERR1_STATUS));
+ stat->phbRxeMrgErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_MRG_ERR_LOG_0));
+ stat->phbRxeMrgErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_MRG_ERR_LOG_1));
+ stat->phbRxeTceErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_TCE_ERR_STATUS));
+ stat->phbRxeTceFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_TCE_ERR1_STATUS));
+ stat->phbRxeTceErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_TCE_ERR_LOG_0));
+ stat->phbRxeTceErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_RXE_TCE_ERR_LOG_1));
+
+ /* PHB4 REGB error registers */
+ stat->phbPblErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_PBL_ERR_STATUS));
+ stat->phbPblFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_PBL_ERR1_STATUS));
+ stat->phbPblErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_PBL_ERR_LOG_0));
+ stat->phbPblErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_PBL_ERR_LOG_1));
+
+ stat->phbPcieDlpErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_PCIE_DLP_ERR_STATUS));
+ stat->phbPcieDlpErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_PCIE_DLP_ERRLOG1));
+ stat->phbPcieDlpErrorLog2 = cpu_to_be64(phb4_read_reg_asb(p, PHB_PCIE_DLP_ERRLOG2));
+
+ stat->phbRegbErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_REGB_ERR_STATUS));
+ stat->phbRegbFirstErrorStatus = cpu_to_be64(phb4_read_reg_asb(p, PHB_REGB_ERR1_STATUS));
+ stat->phbRegbErrorLog0 = cpu_to_be64(phb4_read_reg_asb(p, PHB_REGB_ERR_LOG_0));
+ stat->phbRegbErrorLog1 = cpu_to_be64(phb4_read_reg_asb(p, PHB_REGB_ERR_LOG_1));
+
+ /*
+ * Grab PESTA & B content. The error bit (bit#0) should
+	 * be fetched from IODA and the remaining content from the
+	 * memory-resident tables.
+ */
+ pPEST = (__be64 *)p->tbl_pest;
+ phb4_ioda_sel(p, IODA3_TBL_PESTA, 0, true);
+ for (i = 0; i < p->max_num_pes; i++) {
+ stat->pestA[i] = cpu_to_be64(phb4_read_reg_asb(p, PHB_IODA_DATA0));
+ stat->pestA[i] |= pPEST[2 * i];
+ }
+
+ phb4_ioda_sel(p, IODA3_TBL_PESTB, 0, true);
+ for (i = 0; i < p->max_num_pes; i++) {
+ stat->pestB[i] = cpu_to_be64(phb4_read_reg_asb(p, PHB_IODA_DATA0));
+ stat->pestB[i] |= pPEST[2 * i + 1];
+ }
+}
+
+static void __unused phb4_dump_peltv(struct phb4 *p)
+{
+ int stride = p->max_num_pes / 64;
+ uint64_t *tbl = (void *) p->tbl_peltv;
+ unsigned int pe;
+
+ PHBERR(p, "PELT-V: base addr: %p size: %llx (%d PEs, stride = %d)\n",
+ tbl, p->tbl_peltv_size, p->max_num_pes, stride);
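+	/*
+	 * With 512 PEs the stride is 8, i.e. each PE owns a 512-bit
+	 * (8 x 64-bit word) PELT-V entry; with 256 PEs it is 4.
+	 */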
+
+ for (pe = 0; pe < p->max_num_pes; pe++) {
+ unsigned int i, j;
+ uint64_t sum = 0;
+
+ i = pe * stride;
+
+ /*
+		 * Only print an entry if there are bits set in the PE's
+		 * PELT-V entry. There are a few hundred possible PEs and
+		 * generally only a handful will be in use.
+ */
+
+ for (j = 0; j < stride; j++)
+ sum |= tbl[i + j];
+ if (!sum)
+ continue; /* unused PE, skip it */
+
+ if (p->max_num_pes == 512) {
+ PHBERR(p, "PELT-V[%03x] = "
+ "%016llx %016llx %016llx %016llx"
+ "%016llx %016llx %016llx %016llx\n", pe,
+ tbl[i + 0], tbl[i + 1], tbl[i + 2], tbl[i + 3],
+ tbl[i + 4], tbl[i + 5], tbl[i + 6], tbl[i + 7]);
+ } else if (p->max_num_pes == 256) {
+ PHBERR(p, "PELT-V[%03x] = "
+ "%016llx %016llx %016llx %016llx\n", pe,
+ tbl[i + 0], tbl[i + 1], tbl[i + 2], tbl[i + 3]);
+ }
+ }
+}
+
+static void __unused phb4_dump_ioda_table(struct phb4 *p, int table)
+{
+ const char *name;
+ int entries, i;
+
+ switch (table) {
+ case IODA3_TBL_LIST:
+ name = "LIST";
+ entries = 8;
+ break;
+ case IODA3_TBL_MIST:
+ name = "MIST";
+ entries = 1024;
+ break;
+ case IODA3_TBL_RCAM:
+ name = "RCAM";
+ entries = 128;
+ break;
+ case IODA3_TBL_MRT:
+ name = "MRT";
+ entries = 16;
+ break;
+ case IODA3_TBL_PESTA:
+ name = "PESTA";
+ entries = 512;
+ break;
+ case IODA3_TBL_PESTB:
+ name = "PESTB";
+ entries = 512;
+ break;
+ case IODA3_TBL_TVT:
+ name = "TVT";
+ entries = 512;
+ break;
+ case IODA3_TBL_TCAM:
+ name = "TCAM";
+ entries = 1024;
+ break;
+ case IODA3_TBL_TDR:
+ name = "TDR";
+ entries = 1024;
+ break;
+ case IODA3_TBL_MBT: /* special case, see below */
+ name = "MBT";
+ entries = 64;
+ break;
+ case IODA3_TBL_MDT:
+ name = "MDT";
+ entries = 512;
+ break;
+ case IODA3_TBL_PEEV:
+ name = "PEEV";
+ entries = 8;
+ break;
+ default:
+ PHBERR(p, "Invalid IODA table %d!\n", table);
+ return;
+ }
+
+ PHBERR(p, "Start %s dump (only non-zero entries are printed):\n", name);
+
+ phb4_ioda_sel(p, table, 0, true);
+
+ /*
+ * Each entry in the MBT is 16 bytes. Every other table has 8 byte
+	 * entries, so we special case the MBT to keep the output readable.
+ */
+ if (table == IODA3_TBL_MBT) {
+ for (i = 0; i < 32; i++) {
+ uint64_t v1 = phb4_read_reg_asb(p, PHB_IODA_DATA0);
+ uint64_t v2 = phb4_read_reg_asb(p, PHB_IODA_DATA0);
+
+ if (!v1 && !v2)
+ continue;
+ PHBERR(p, "MBT[%03x] = %016llx %016llx\n", i, v1, v2);
+ }
+ } else {
+ for (i = 0; i < entries; i++) {
+ uint64_t v = phb4_read_reg_asb(p, PHB_IODA_DATA0);
+
+ if (!v)
+ continue;
+ PHBERR(p, "%s[%03x] = %016llx\n", name, i, v);
+ }
+ }
+
+ PHBERR(p, "End %s dump\n", name);
+}
+
+static void phb4_eeh_dump_regs(struct phb4 *p)
+{
+ struct OpalIoPhb4ErrorData *s;
+ uint16_t reg;
+ unsigned int i;
+
+ if (!verbose_eeh)
+ return;
+
+ s = zalloc(sizeof(struct OpalIoPhb4ErrorData));
+ if (!s) {
+ PHBERR(p, "Failed to allocate error info !\n");
+ return;
+ }
+ phb4_read_phb_status(p, s);
+
+ PHBERR(p, " brdgCtl = %08x\n", be32_to_cpu(s->brdgCtl));
+
+ /* PHB4 cfg regs */
+ PHBERR(p, " deviceStatus = %08x\n", be32_to_cpu(s->deviceStatus));
+ PHBERR(p, " slotStatus = %08x\n", be32_to_cpu(s->slotStatus));
+ PHBERR(p, " linkStatus = %08x\n", be32_to_cpu(s->linkStatus));
+ PHBERR(p, " devCmdStatus = %08x\n", be32_to_cpu(s->devCmdStatus));
+ PHBERR(p, " devSecStatus = %08x\n", be32_to_cpu(s->devSecStatus));
+ PHBERR(p, " rootErrorStatus = %08x\n", be32_to_cpu(s->rootErrorStatus));
+ PHBERR(p, " corrErrorStatus = %08x\n", be32_to_cpu(s->corrErrorStatus));
+ PHBERR(p, " uncorrErrorStatus = %08x\n", be32_to_cpu(s->uncorrErrorStatus));
+
+ /* Two non OPAL API registers that are useful */
+ phb4_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_DEVCTL, &reg);
+ PHBERR(p, " devctl = %08x\n", reg);
+ phb4_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_DEVSTAT,
+ &reg);
+ PHBERR(p, " devStat = %08x\n", reg);
+
+ /* Byte swap TLP headers so they are the same as the PCIe spec */
+ PHBERR(p, " tlpHdr1 = %08x\n", cpu_to_le32(be32_to_cpu(s->tlpHdr1)));
+ PHBERR(p, " tlpHdr2 = %08x\n", cpu_to_le32(be32_to_cpu(s->tlpHdr2)));
+ PHBERR(p, " tlpHdr3 = %08x\n", cpu_to_le32(be32_to_cpu(s->tlpHdr3)));
+ PHBERR(p, " tlpHdr4 = %08x\n", cpu_to_le32(be32_to_cpu(s->tlpHdr4)));
+ PHBERR(p, " sourceId = %08x\n", be32_to_cpu(s->sourceId));
+ PHBERR(p, " nFir = %016llx\n", be64_to_cpu(s->nFir));
+ PHBERR(p, " nFirMask = %016llx\n", be64_to_cpu(s->nFirMask));
+ PHBERR(p, " nFirWOF = %016llx\n", be64_to_cpu(s->nFirWOF));
+ PHBERR(p, " phbPlssr = %016llx\n", be64_to_cpu(s->phbPlssr));
+ PHBERR(p, " phbCsr = %016llx\n", be64_to_cpu(s->phbCsr));
+ PHBERR(p, " lemFir = %016llx\n", be64_to_cpu(s->lemFir));
+ PHBERR(p, " lemErrorMask = %016llx\n", be64_to_cpu(s->lemErrorMask));
+ PHBERR(p, " lemWOF = %016llx\n", be64_to_cpu(s->lemWOF));
+ PHBERR(p, " phbErrorStatus = %016llx\n", be64_to_cpu(s->phbErrorStatus));
+ PHBERR(p, " phbFirstErrorStatus = %016llx\n", be64_to_cpu(s->phbFirstErrorStatus));
+ PHBERR(p, " phbErrorLog0 = %016llx\n", be64_to_cpu(s->phbErrorLog0));
+ PHBERR(p, " phbErrorLog1 = %016llx\n", be64_to_cpu(s->phbErrorLog1));
+ PHBERR(p, " phbTxeErrorStatus = %016llx\n", be64_to_cpu(s->phbTxeErrorStatus));
+ PHBERR(p, " phbTxeFirstErrorStatus = %016llx\n", be64_to_cpu(s->phbTxeFirstErrorStatus));
+ PHBERR(p, " phbTxeErrorLog0 = %016llx\n", be64_to_cpu(s->phbTxeErrorLog0));
+ PHBERR(p, " phbTxeErrorLog1 = %016llx\n", be64_to_cpu(s->phbTxeErrorLog1));
+ PHBERR(p, " phbRxeArbErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeArbErrorStatus));
+ PHBERR(p, "phbRxeArbFrstErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeArbFirstErrorStatus));
+ PHBERR(p, " phbRxeArbErrorLog0 = %016llx\n", be64_to_cpu(s->phbRxeArbErrorLog0));
+ PHBERR(p, " phbRxeArbErrorLog1 = %016llx\n", be64_to_cpu(s->phbRxeArbErrorLog1));
+ PHBERR(p, " phbRxeMrgErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeMrgErrorStatus));
+ PHBERR(p, "phbRxeMrgFrstErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeMrgFirstErrorStatus));
+ PHBERR(p, " phbRxeMrgErrorLog0 = %016llx\n", be64_to_cpu(s->phbRxeMrgErrorLog0));
+ PHBERR(p, " phbRxeMrgErrorLog1 = %016llx\n", be64_to_cpu(s->phbRxeMrgErrorLog1));
+ PHBERR(p, " phbRxeTceErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeTceErrorStatus));
+ PHBERR(p, "phbRxeTceFrstErrorStatus = %016llx\n", be64_to_cpu(s->phbRxeTceFirstErrorStatus));
+ PHBERR(p, " phbRxeTceErrorLog0 = %016llx\n", be64_to_cpu(s->phbRxeTceErrorLog0));
+ PHBERR(p, " phbRxeTceErrorLog1 = %016llx\n", be64_to_cpu(s->phbRxeTceErrorLog1));
+ PHBERR(p, " phbPblErrorStatus = %016llx\n", be64_to_cpu(s->phbPblErrorStatus));
+ PHBERR(p, " phbPblFirstErrorStatus = %016llx\n", be64_to_cpu(s->phbPblFirstErrorStatus));
+ PHBERR(p, " phbPblErrorLog0 = %016llx\n", be64_to_cpu(s->phbPblErrorLog0));
+ PHBERR(p, " phbPblErrorLog1 = %016llx\n", be64_to_cpu(s->phbPblErrorLog1));
+ PHBERR(p, " phbPcieDlpErrorLog1 = %016llx\n", be64_to_cpu(s->phbPcieDlpErrorLog1));
+ PHBERR(p, " phbPcieDlpErrorLog2 = %016llx\n", be64_to_cpu(s->phbPcieDlpErrorLog2));
+ PHBERR(p, " phbPcieDlpErrorStatus = %016llx\n", be64_to_cpu(s->phbPcieDlpErrorStatus));
+
+ PHBERR(p, " phbRegbErrorStatus = %016llx\n", be64_to_cpu(s->phbRegbErrorStatus));
+ PHBERR(p, " phbRegbFirstErrorStatus = %016llx\n", be64_to_cpu(s->phbRegbFirstErrorStatus));
+ PHBERR(p, " phbRegbErrorLog0 = %016llx\n", be64_to_cpu(s->phbRegbErrorLog0));
+ PHBERR(p, " phbRegbErrorLog1 = %016llx\n", be64_to_cpu(s->phbRegbErrorLog1));
+
+ for (i = 0; i < p->max_num_pes; i++) {
+ if (!s->pestA[i] && !s->pestB[i])
+ continue;
+ PHBERR(p, " PEST[%03x] = %016llx %016llx\n",
+ i, be64_to_cpu(s->pestA[i]), be64_to_cpu(s->pestB[i]));
+ }
+ free(s);
+}
+
+static int64_t phb4_set_pe(struct phb *phb,
+ uint64_t pe_number,
+ uint64_t bdfn,
+ uint8_t bcompare,
+ uint8_t dcompare,
+ uint8_t fcompare,
+ uint8_t action)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t mask, idx;
+
+ /* Sanity check */
+ if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+ if (pe_number >= p->num_pes || bdfn > 0xffff ||
+ bcompare > OpalPciBusAll ||
+ dcompare > OPAL_COMPARE_RID_DEVICE_NUMBER ||
+ fcompare > OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ return OPAL_PARAMETER;
+
+ /* match everything by default */
+ mask = 0;
+
+ /* Figure out the RID range */
+ if (bcompare != OpalPciBusAny)
+ mask = ((0x1 << (bcompare + 1)) - 1) << (15 - bcompare);
+
+ if (dcompare == OPAL_COMPARE_RID_DEVICE_NUMBER)
+ mask |= 0xf8;
+
+ if (fcompare == OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ mask |= 0x7;
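+	/*
+	 * For example, matching the full RID (bcompare == OpalPciBusAll,
+	 * i.e. all bus bits, plus device and function compares) gives
+	 * mask = 0xff00 | 0xf8 | 0x7 = 0xffff, so only the single RTT
+	 * entry equal to bdfn is updated below.
+	 */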
+
+ if (action == OPAL_UNMAP_PE)
+ pe_number = PHB4_RESERVED_PE_NUM(p);
+
+ /* Map or unmap the RTT range */
+ for (idx = 0; idx < RTT_TABLE_ENTRIES; idx++)
+ if ((idx & mask) == (bdfn & mask))
+ p->tbl_rtt[idx] = cpu_to_be16(pe_number);
+
+ /* Invalidate the RID Translation Cache (RTC) inside the PHB */
+ out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_set_peltv(struct phb *phb,
+ uint32_t parent_pe,
+ uint32_t child_pe,
+ uint8_t state)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint32_t idx, mask;
+
+ /* Sanity check */
+ if (parent_pe >= p->num_pes || child_pe >= p->num_pes)
+ return OPAL_PARAMETER;
+
+ /* Find index for parent PE */
+ idx = parent_pe * (p->max_num_pes / 8);
+ idx += (child_pe / 8);
+ mask = 0x1 << (7 - (child_pe % 8));
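+	/*
+	 * Worked example with 512 PEs: parent_pe 2, child_pe 9 gives
+	 * idx = 2 * 64 + 1 = 129 and mask = 1 << (7 - 1) = 0x40, i.e.
+	 * the child-9 bit of parent 2's PELT-V entry (children are
+	 * packed MSB first within each byte).
+	 */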
+
+ if (state)
+ p->tbl_peltv[idx] |= mask;
+ else
+ p->tbl_peltv[idx] &= ~mask;
+
+ return OPAL_SUCCESS;
+}
+
+static void phb4_prepare_link_change(struct pci_slot *slot, bool is_up)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint32_t reg32;
+
+ p->has_link = is_up;
+
+ if (is_up) {
+ /* Clear AER receiver error status */
+ phb4_pcicfg_write32(&p->phb, 0, p->aercap +
+ PCIECAP_AER_CE_STATUS,
+ PCIECAP_AER_CE_RECVR_ERR);
+ /* Unmask receiver error status in AER */
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap +
+ PCIECAP_AER_CE_MASK, &reg32);
+ reg32 &= ~PCIECAP_AER_CE_RECVR_ERR;
+ phb4_pcicfg_write32(&p->phb, 0, p->aercap +
+ PCIECAP_AER_CE_MASK, reg32);
+
+ /* Don't block PCI-CFG */
+ p->flags &= ~PHB4_CFG_BLOCKED;
+
+ /* Re-enable link down errors */
+ out_be64(p->regs + PHB_PCIE_MISC_STRAP,
+ 0x0000060000000000ull);
+
+ /* Re-enable error status indicators that trigger irqs */
+ out_be64(p->regs + PHB_REGB_ERR_INF_ENABLE,
+ 0x2130006efca8bc00ull);
+ out_be64(p->regs + PHB_REGB_ERR_ERC_ENABLE,
+ 0x0080000000000000ull);
+ out_be64(p->regs + PHB_REGB_ERR_FAT_ENABLE,
+ 0xde0fff91035743ffull);
+
+ } else {
+ /* Mask AER receiver error */
+ phb4_pcicfg_read32(&p->phb, 0, p->aercap +
+ PCIECAP_AER_CE_MASK, &reg32);
+ reg32 |= PCIECAP_AER_CE_RECVR_ERR;
+ phb4_pcicfg_write32(&p->phb, 0, p->aercap +
+ PCIECAP_AER_CE_MASK, reg32);
+
+ /* Clear error link enable & error link down kill enable */
+ out_be64(p->regs + PHB_PCIE_MISC_STRAP, 0);
+
+ /* Disable all error status indicators that trigger irqs */
+ out_be64(p->regs + PHB_REGB_ERR_INF_ENABLE, 0);
+ out_be64(p->regs + PHB_REGB_ERR_ERC_ENABLE, 0);
+ out_be64(p->regs + PHB_REGB_ERR_FAT_ENABLE, 0);
+
+ /* Block PCI-CFG access */
+ p->flags |= PHB4_CFG_BLOCKED;
+ }
+}
+
+static int64_t phb4_get_presence_state(struct pci_slot *slot, uint8_t *val)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint64_t hps, dtctl;
+
+ /* Test for PHB in error state ? */
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /* Check hotplug status */
+ hps = in_be64(p->regs + PHB_PCIE_HOTPLUG_STATUS);
+ if (!(hps & PHB_PCIE_HPSTAT_PRESENCE)) {
+ *val = OPAL_PCI_SLOT_PRESENT;
+ } else {
+ /*
+ * If it says not present but link is up, then we assume
+ * we are on a broken simulation environment and still
+ * return a valid presence. Otherwise, not present.
+ */
+ dtctl = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (dtctl & PHB_PCIE_DLP_TL_LINKACT) {
+ PHBERR(p, "Presence detect 0 but link set !\n");
+ *val = OPAL_PCI_SLOT_PRESENT;
+ } else {
+ *val = OPAL_PCI_SLOT_EMPTY;
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_get_link_info(struct pci_slot *slot, uint8_t *speed,
+ uint8_t *width)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint64_t reg;
+ uint16_t state;
+ int64_t rc;
+ uint8_t s;
+
+ /* Link is up, let's find the actual speed */
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!(reg & PHB_PCIE_DLP_TL_LINKACT)) {
+ *width = 0;
+ if (speed)
+ *speed = 0;
+ return OPAL_SUCCESS;
+ }
+
+ rc = phb4_pcicfg_read16(&p->phb, 0,
+ p->ecap + PCICAP_EXP_LSTAT, &state);
+ if (rc != OPAL_SUCCESS) {
+ PHBERR(p, "%s: Error %lld getting link state\n", __func__, rc);
+ return OPAL_HARDWARE;
+ }
+
+ if (state & PCICAP_EXP_LSTAT_DLLL_ACT) {
+ *width = ((state & PCICAP_EXP_LSTAT_WIDTH) >> 4);
+ s = state & PCICAP_EXP_LSTAT_SPEED;
+ } else {
+ *width = 0;
+ s = 0;
+ }
+
+ if (speed)
+ *speed = s;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_get_link_state(struct pci_slot *slot, uint8_t *val)
+{
+ return phb4_get_link_info(slot, NULL, val);
+}
+
+static int64_t phb4_retry_state(struct pci_slot *slot)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+
+ /* Mark link as down */
+ phb4_prepare_link_change(slot, false);
+
+ /* Last attempt to activate link */
+ if (slot->link_retries == 1) {
+ if (slot->state == PHB4_SLOT_LINK_WAIT) {
+ PHBERR(p, "Falling back to GEN1 training\n");
+ p->max_link_speed = 1;
+ }
+ }
+
+ if (!slot->link_retries--) {
+ switch (slot->state) {
+ case PHB4_SLOT_LINK_WAIT_ELECTRICAL:
+ PHBERR(p, "Presence detected but no electrical link\n");
+ break;
+ case PHB4_SLOT_LINK_WAIT:
+ PHBERR(p, "Electrical link detected but won't train\n");
+ break;
+ case PHB4_SLOT_LINK_STABLE:
+			PHBERR(p, "Link trained but was degraded or unstable\n");
+ break;
+ default:
+ PHBERR(p, "Unknown link issue\n");
+ }
+ return OPAL_HARDWARE;
+ }
+
+ pci_slot_set_state(slot, PHB4_SLOT_CRESET_START);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
+}
+
+static uint64_t phb4_train_info(struct phb4 *p, uint64_t reg, unsigned long dt)
+{
+ uint64_t ltssm_state = GETFIELD(PHB_PCIE_DLP_LTSSM_TRC, reg);
+ char s[80];
+
+ snprintf(s, sizeof(s), "TRACE:0x%016llx % 2lims",
+ reg, tb_to_msecs(dt));
+
+ if (reg & PHB_PCIE_DLP_TL_LINKACT)
+ snprintf(s, sizeof(s), "%s trained ", s);
+ else if (reg & PHB_PCIE_DLP_TRAINING)
+ snprintf(s, sizeof(s), "%s training", s);
+ else if (reg & PHB_PCIE_DLP_INBAND_PRESENCE)
+ snprintf(s, sizeof(s), "%s presence", s);
+ else
+ snprintf(s, sizeof(s), "%s ", s);
+
+ snprintf(s, sizeof(s), "%s GEN%lli:x%02lli:", s,
+ GETFIELD(PHB_PCIE_DLP_LINK_SPEED, reg),
+ GETFIELD(PHB_PCIE_DLP_LINK_WIDTH, reg));
+
+ switch (ltssm_state) {
+ case PHB_PCIE_DLP_LTSSM_RESET:
+ snprintf(s, sizeof(s), "%sreset", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_DETECT:
+ snprintf(s, sizeof(s), "%sdetect", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_POLLING:
+ snprintf(s, sizeof(s), "%spolling", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_CONFIG:
+ snprintf(s, sizeof(s), "%sconfig", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_L0:
+ snprintf(s, sizeof(s), "%sL0", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_REC:
+ snprintf(s, sizeof(s), "%srecovery", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_L1:
+ snprintf(s, sizeof(s), "%sL1", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_L2:
+ snprintf(s, sizeof(s), "%sL2", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_HOTRESET:
+ snprintf(s, sizeof(s), "%shotreset", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_DISABLED:
+ snprintf(s, sizeof(s), "%sdisabled", s);
+ break;
+ case PHB_PCIE_DLP_LTSSM_LOOPBACK:
+ snprintf(s, sizeof(s), "%sloopback", s);
+ break;
+ default:
+		snprintf(s, sizeof(s), "%sinvalid", s);
+ }
+ PHBNOTICE(p, "%s\n", s);
+
+ return ltssm_state;
+}
+
+static void phb4_dump_pec_err_regs(struct phb4 *p)
+{
+ uint64_t nfir_p_wof, nfir_n_wof, err_aib;
+ uint64_t err_rpt0, err_rpt1;
+
+ /* Read the PCI and NEST FIRs and dump them. Also cache PCI/NEST FIRs */
+ xscom_read(p->chip_id,
+ p->pci_stk_xscom + XPEC_PCI_STK_PCI_FIR, &p->pfir_cache);
+ xscom_read(p->chip_id,
+ p->pci_stk_xscom + XPEC_PCI_STK_PCI_FIR_WOF, &nfir_p_wof);
+ xscom_read(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_PCI_NFIR, &p->nfir_cache);
+ xscom_read(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_PCI_NFIR_WOF, &nfir_n_wof);
+ xscom_read(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_ERR_RPT0, &err_rpt0);
+ xscom_read(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_ERR_RPT1, &err_rpt1);
+ xscom_read(p->chip_id,
+ p->pci_stk_xscom + XPEC_PCI_STK_PBAIB_ERR_REPORT, &err_aib);
+
+ PHBERR(p, " PCI FIR=%016llx\n", p->pfir_cache);
+ PHBERR(p, " PCI FIR WOF=%016llx\n", nfir_p_wof);
+ PHBERR(p, " NEST FIR=%016llx\n", p->nfir_cache);
+ PHBERR(p, " NEST FIR WOF=%016llx\n", nfir_n_wof);
+ PHBERR(p, " ERR RPT0=%016llx\n", err_rpt0);
+ PHBERR(p, " ERR RPT1=%016llx\n", err_rpt1);
+ PHBERR(p, " AIB ERR=%016llx\n", err_aib);
+}
+
+static void phb4_dump_capp_err_regs(struct phb4 *p)
+{
+ uint64_t fir, apc_master_err, snoop_err, transport_err;
+ uint64_t tlbi_err, capp_err_status;
+ uint64_t offset = PHB4_CAPP_REG_OFFSET(p);
+
+ xscom_read(p->chip_id, CAPP_FIR + offset, &fir);
+ xscom_read(p->chip_id, CAPP_APC_MASTER_ERR_RPT + offset,
+ &apc_master_err);
+ xscom_read(p->chip_id, CAPP_SNOOP_ERR_RTP + offset, &snoop_err);
+ xscom_read(p->chip_id, CAPP_TRANSPORT_ERR_RPT + offset, &transport_err);
+ xscom_read(p->chip_id, CAPP_TLBI_ERR_RPT + offset, &tlbi_err);
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &capp_err_status);
+
+ PHBERR(p, " CAPP FIR=%016llx\n", fir);
+ PHBERR(p, "CAPP APC MASTER ERR=%016llx\n", apc_master_err);
+ PHBERR(p, " CAPP SNOOP ERR=%016llx\n", snoop_err);
+ PHBERR(p, " CAPP TRANSPORT ERR=%016llx\n", transport_err);
+ PHBERR(p, " CAPP TLBI ERR=%016llx\n", tlbi_err);
+ PHBERR(p, " CAPP ERR STATUS=%016llx\n", capp_err_status);
+}
+
+/* Check if AIB is fenced via PBCQ NFIR */
+static bool phb4_fenced(struct phb4 *p)
+{
+
+ /* Already fenced ? */
+ if (p->flags & PHB4_AIB_FENCED)
+ return true;
+
+ /*
+ * An all 1's from the PHB indicates a PHB freeze/fence. We
+	 * don't really differentiate them at this point.
+ */
+	if (in_be64(p->regs + PHB_CPU_LOADSTORE_STATUS) != 0xfffffffffffffffful)
+ return false;
+
+ /* Mark ourselves fenced */
+ p->flags |= PHB4_AIB_FENCED;
+
+ PHBERR(p, "PHB Freeze/Fence detected !\n");
+ phb4_dump_pec_err_regs(p);
+
+ /*
+ * dump capp error registers in case phb was fenced due to capp.
+ * Expect p->nfir_cache already updated in phb4_dump_pec_err_regs()
+ */
+ if (p->nfir_cache & XPEC_NEST_STK_PCI_NFIR_CXA_PE_CAPP)
+ phb4_dump_capp_err_regs(p);
+
+ phb4_eeh_dump_regs(p);
+
+ return true;
+}
+
+static bool phb4_check_reg(struct phb4 *p, uint64_t reg)
+{
+ if (reg == 0xffffffffffffffffUL)
+ return !phb4_fenced(p);
+ return true;
+}
+
+static void phb4_get_info(struct phb *phb, uint16_t bdfn, uint8_t *speed,
+ uint8_t *width)
+{
+ int32_t ecap;
+ uint32_t cap;
+
+ ecap = pci_find_cap(phb, bdfn, PCI_CFG_CAP_ID_EXP);
+ pci_cfg_read32(phb, bdfn, ecap + PCICAP_EXP_LCAP, &cap);
+ *width = (cap & PCICAP_EXP_LCAP_MAXWDTH) >> 4;
+ *speed = cap & PCICAP_EXP_LCAP_MAXSPD;
+}
+
+#define PVR_POWER9_CUMULUS 0x00002000
+
+static bool phb4_chip_retry_workaround(void)
+{
+ unsigned int pvr;
+
+ if (pci_retry_all)
+ return true;
+
+ /* Chips that need this retry are:
+ * - CUMULUS DD1.0
+ * - NIMBUS DD2.0 (and DD1.0, but it is unsupported so no check).
+ */
+ pvr = mfspr(SPR_PVR);
+ if (pvr & PVR_POWER9_CUMULUS) {
+ if ((PVR_VERS_MAJ(pvr) == 1) && (PVR_VERS_MIN(pvr) == 0))
+ return true;
+ } else { /* NIMBUS */
+ if ((PVR_VERS_MAJ(pvr) == 2) && (PVR_VERS_MIN(pvr) == 0))
+ return true;
+ }
+ return false;
+}
+
+struct pci_card_id {
+ uint16_t vendor;
+ uint16_t device;
+};
+
+static struct pci_card_id retry_allowlist[] = {
+ { 0x1000, 0x005d }, /* LSI Logic MegaRAID SAS-3 3108 */
+ { 0x1000, 0x00c9 }, /* LSI MPT SAS-3 */
+ { 0x104c, 0x8241 }, /* TI xHCI USB */
+ { 0x1077, 0x2261 }, /* QLogic ISP2722-based 16/32Gb FC */
+ { 0x10b5, 0x8725 }, /* PLX Switch: p9dsu, witherspoon */
+ { 0x10b5, 0x8748 }, /* PLX Switch: ZZ */
+ { 0x11f8, 0xf117 }, /* PMC-Sierra/MicroSemi NV1604 */
+ { 0x15b3, 0x1013 }, /* Mellanox ConnectX-4 */
+ { 0x15b3, 0x1017 }, /* Mellanox ConnectX-5 */
+ { 0x15b3, 0x1019 }, /* Mellanox ConnectX-5 Ex */
+ { 0x1a03, 0x1150 }, /* ASPEED AST2500 Switch */
+ { 0x8086, 0x10fb }, /* Intel x520 10G Eth */
+ { 0x9005, 0x028d }, /* MicroSemi PM8069 */
+};
+
+#define VENDOR(vdid) ((vdid) & 0xffff)
+#define DEVICE(vdid) (((vdid) >> 16) & 0xffff)
+
+static bool phb4_adapter_in_allowlist(uint32_t vdid)
+{
+ int i;
+
+ if (pci_retry_all)
+ return true;
+
+ for (i = 0; i < ARRAY_SIZE(retry_allowlist); i++)
+ if ((retry_allowlist[i].vendor == VENDOR(vdid)) &&
+ (retry_allowlist[i].device == DEVICE(vdid)))
+ return true;
+
+ return false;
+}
+
+static struct pci_card_id lane_eq_disable[] = {
+ { 0x10de, 0x17fd }, /* Nvidia GM200GL [Tesla M40] */
+ { 0x10de, 0x1db4 }, /* Nvidia GV100 */
+};
+
+static bool phb4_lane_eq_retry_allowlist(uint32_t vdid)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lane_eq_disable); i++)
+ if ((lane_eq_disable[i].vendor == VENDOR(vdid)) &&
+ (lane_eq_disable[i].device == DEVICE(vdid)))
+ return true;
+ return false;
+}
+
+static void phb4_lane_eq_change(struct phb4 *p, uint32_t vdid)
+{
+ p->lane_eq_en = !phb4_lane_eq_retry_allowlist(vdid);
+}
+
+static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint64_t reg;
+ uint32_t id;
+ uint16_t bdfn, lane_errs;
+ uint8_t trained_speed, dev_speed, target_speed, rx_errs;
+ uint8_t trained_width, dev_width, target_width;
+ bool optimal_speed, optimal_width, optimal, retry_enabled, rx_err_ok;
+
+
+ /* Current trained state */
+ phb4_get_link_info(slot, &trained_speed, &trained_width);
+
+ /* Get device capability */
+ bdfn = 0x0100; /* bus=1 dev=0 device=0 */
+ /* Since this is the first access, we need to wait for CRS */
+ if (!pci_wait_crs(slot->phb, bdfn , &id))
+ return true;
+ phb4_get_info(slot->phb, bdfn, &dev_speed, &dev_width);
+
+ /* Work out if we are optimally trained */
+ target_speed = MIN(p->max_link_speed, dev_speed);
+ optimal_speed = (trained_speed >= target_speed);
+ target_width = MIN(p->max_link_width, dev_width);
+ optimal_width = (trained_width >= target_width);
+ optimal = optimal_width && optimal_speed;
+ retry_enabled = (phb4_chip_retry_workaround() &&
+ phb4_adapter_in_allowlist(id)) ||
+ phb4_lane_eq_retry_allowlist(id);
+ reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_COUNTERS);
+ rx_errs = GETFIELD(PHB_PCIE_DLP_RX_ERR_CNT, reg);
+ rx_err_ok = (rx_errs < rx_err_max);
+ reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_STATUS);
+ lane_errs = GETFIELD(PHB_PCIE_DLP_LANE_ERR, reg);
+
+ PHBDBG(p, "LINK: Card [%04x:%04x] %s Retry:%s\n", VENDOR(id),
+ DEVICE(id), optimal ? "Optimal" : "Degraded",
+ retry_enabled ? "enabled" : "disabled");
+ PHBDBG(p, "LINK: Speed Train:GEN%i PHB:GEN%i DEV:GEN%i%s\n",
+ trained_speed, p->max_link_speed, dev_speed,
+ optimal_speed ? "" : " *");
+ PHBDBG(p, "LINK: Width Train:x%02i PHB:x%02i DEV:x%02i%s\n",
+ trained_width, p->max_link_width, dev_width,
+ optimal_width ? "" : " *");
+ PHBDBG(p, "LINK: RX Errors Now:%i Max:%i Lane:0x%04x%s\n",
+ rx_errs, rx_err_max, lane_errs, rx_err_ok ? "" : " *");
+
+ if (vdid)
+ *vdid = id;
+
+ /* Always do RX error retry irrespective of chip and card */
+ if (!rx_err_ok)
+ return false;
+
+ if (!retry_enabled)
+ return true;
+
+ return optimal;
+}
+
+/*
+ * This is a trace function to watch what's happening during PCIe link
+ * training. If any errors are detected it simply returns so the
+ * normal code can deal with it.
+ */
+static void phb4_link_trace(struct phb4 *p, uint64_t target_state, int max_ms)
+{
+ unsigned long now, end, start = mftb(), state = 0;
+ uint64_t trwctl, reg, reglast = -1;
+ bool enabled;
+
+ /*
+	 * Enable the DLP trace outputs. If we don't, the LTSSM state in
+	 * PHB_PCIE_DLP_TRAIN_CTL won't be updated and will always read zero.
+ */
+ trwctl = phb4_read_reg(p, PHB_PCIE_DLP_TRWCTL);
+ enabled = !!(trwctl & PHB_PCIE_DLP_TRWCTL_EN);
+ if (!enabled) {
+ phb4_write_reg(p, PHB_PCIE_DLP_TRWCTL,
+ trwctl | PHB_PCIE_DLP_TRWCTL_EN);
+ }
+
+ end = start + msecs_to_tb(max_ms);
+ now = start;
+
+ do {
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (reg != reglast)
+ state = phb4_train_info(p, reg, now - start);
+ reglast = reg;
+
+ if (!phb4_check_reg(p, reg)) {
+ PHBNOTICE(p, "TRACE: PHB fenced.\n");
+ goto out;
+ }
+
+ if (tb_compare(now, end) == TB_AAFTERB) {
+ PHBNOTICE(p, "TRACE: Timed out after %dms\n", max_ms);
+ goto out;
+ }
+
+ now = mftb();
+ } while (state != target_state);
+
+ PHBNOTICE(p, "TRACE: Reached target state\n");
+
+out:
+ /*
+ * The trace enable bit is a clock gate for the tracing logic. Turn
+ * it off to save power if we're not using it otherwise.
+ */
+ if (!enabled)
+ phb4_write_reg(p, PHB_PCIE_DLP_TRWCTL, trwctl);
+}
+
+/*
+ * This helper is called repeatedly by the host sync notifier mechanism, which
+ * relies on the kernel to regularly poll the OPAL_SYNC_HOST_REBOOT call as it
+ * shuts down.
+ */
+static bool phb4_host_sync_reset(void *data)
+{
+ struct phb4 *p = (struct phb4 *)data;
+ struct phb *phb = &p->phb;
+ int64_t rc = 0;
+
+ /* Make sure no-one modifies the phb flags while we are active */
+ phb_lock(phb);
+
+ /* Make sure CAPP is attached to the PHB */
+ if (p->capp)
+ /* Call phb ops to disable capi */
+ rc = phb->ops->set_capi_mode(phb, OPAL_PHB_CAPI_MODE_PCIE,
+ p->capp->attached_pe);
+ else
+ rc = OPAL_SUCCESS;
+
+ /* Continue kicking state-machine if in middle of a mode transition */
+ if (rc == OPAL_BUSY)
+ rc = phb->slot->ops.run_sm(phb->slot);
+
+ phb_unlock(phb);
+
+ return rc <= OPAL_SUCCESS;
+}
+
+/*
+ * Notification from the pci-core that a pci slot state machine completed.
+ * We use this callback to mark the CAPP disabled if we were waiting for it.
+ */
+static int64_t phb4_slot_sm_run_completed(struct pci_slot *slot, uint64_t err)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+
+ /* Check if we are disabling the capp */
+ if (p->flags & PHB4_CAPP_DISABLE) {
+
+		/* Unset struct capp so that we don't fall into a creset loop */
+ p->flags &= ~(PHB4_CAPP_DISABLE);
+ p->capp->phb = NULL;
+ p->capp->attached_pe = phb4_get_reserved_pe_number(&p->phb);
+
+		/* Remove the host sync notifier if we are done. */
+ opal_del_host_sync_notifier(phb4_host_sync_reset, p);
+ if (err) {
+ /* Force a CEC ipl reboot */
+ disable_fast_reboot("CAPP: reset failed");
+ PHBERR(p, "CAPP: Unable to reset. Error=%lld\n", err);
+ } else {
+ PHBINF(p, "CAPP: reset complete\n");
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_poll_link(struct pci_slot *slot)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint64_t reg;
+ uint32_t vdid;
+
+ switch (slot->state) {
+ case PHB4_SLOT_NORMAL:
+ case PHB4_SLOT_LINK_START:
+ PHBDBG(p, "LINK: Start polling\n");
+ slot->retries = PHB4_LINK_ELECTRICAL_RETRIES;
+ pci_slot_set_state(slot, PHB4_SLOT_LINK_WAIT_ELECTRICAL);
+ /* Polling early here has no chance of a false positive */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
+ case PHB4_SLOT_LINK_WAIT_ELECTRICAL:
+ /*
+ * Wait for the link electrical connection to be
+ * established (shorter timeout). This allows us to
+		 * work around spurious presence detect on some machines
+ * without waiting 10s each time
+ *
+ * Note: We *also* check for the full link up bit here
+ * because simics doesn't seem to implement the electrical
+ * link bit at all
+ */
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!phb4_check_reg(p, reg)) {
+ PHBERR(p, "PHB fence waiting for electrical link\n");
+ return phb4_retry_state(slot);
+ }
+
+ if (reg & (PHB_PCIE_DLP_INBAND_PRESENCE |
+ PHB_PCIE_DLP_TL_LINKACT)) {
+ PHBDBG(p, "LINK: Electrical link detected\n");
+ pci_slot_set_state(slot, PHB4_SLOT_LINK_WAIT);
+ slot->retries = PHB4_LINK_WAIT_RETRIES;
+ /* No wait here since already have an elec link */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
+ }
+
+ if (slot->retries-- == 0) {
+ PHBDBG(p, "LINK: No in-band presence\n");
+ return OPAL_SUCCESS;
+ }
+ /* Retry */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+ case PHB4_SLOT_LINK_WAIT:
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!phb4_check_reg(p, reg)) {
+ PHBERR(p, "LINK: PHB fence waiting for link training\n");
+ return phb4_retry_state(slot);
+ }
+ if (reg & PHB_PCIE_DLP_TL_LINKACT) {
+ PHBDBG(p, "LINK: Link is up\n");
+ phb4_prepare_link_change(slot, true);
+ pci_slot_set_state(slot, PHB4_SLOT_LINK_STABLE);
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ }
+
+ if (slot->retries-- == 0) {
+ PHBERR(p, "LINK: Timeout waiting for link up\n");
+ PHBDBG(p, "LINK: DLP train control: 0x%016llx\n", reg);
+ return phb4_retry_state(slot);
+ }
+ /* Retry */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+ case PHB4_SLOT_LINK_STABLE:
+ /* Sanity check link */
+ if (phb4_fenced(p)) {
+			PHBERR(p, "LINK: PHB fenced waiting for stability\n");
+ return phb4_retry_state(slot);
+ }
+ reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!phb4_check_reg(p, reg)) {
+ PHBERR(p, "LINK: PHB fence reading training control\n");
+ return phb4_retry_state(slot);
+ }
+ if (reg & PHB_PCIE_DLP_TL_LINKACT) {
+ PHBDBG(p, "LINK: Link is stable\n");
+ if (!phb4_link_optimal(slot, &vdid)) {
+ PHBDBG(p, "LINK: Link degraded\n");
+ if (slot->link_retries) {
+ phb4_lane_eq_change(p, vdid);
+ return phb4_retry_state(slot);
+ }
+ /*
+ * Link is degraded but no more retries, so
+ * settle for what we have :-(
+ */
+ PHBERR(p, "LINK: Degraded but no more retries\n");
+ }
+ pci_restore_slot_bus_configs(slot);
+ pci_slot_set_state(slot, PHB4_SLOT_NORMAL);
+ return OPAL_SUCCESS;
+ }
+		PHBERR(p, "LINK: Went down waiting for stability\n");
+ PHBDBG(p, "LINK: DLP train control: 0x%016llx\n", reg);
+ return phb4_retry_state(slot);
+ default:
+ PHBERR(p, "LINK: Unexpected slot state %08x\n",
+ slot->state);
+ }
+
+ pci_slot_set_state(slot, PHB4_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static unsigned int phb4_get_max_link_speed(struct phb4 *p, struct dt_node *np)
+{
+ unsigned int max_link_speed, hw_max_link_speed;
+ struct proc_chip *chip;
+ chip = get_chip(p->chip_id);
+
+ hw_max_link_speed = 4;
+ if (is_phb5() && (p->index == 0 || p->index == 3))
+ hw_max_link_speed = 5;
+
+ /* Priority order: NVRAM -> dt -> GEN3 dd2.00 -> hw default */
+ max_link_speed = hw_max_link_speed;
+ if (p->rev == PHB4_REV_NIMBUS_DD20 &&
+ ((0xf & chip->ec_level) == 0) && chip->ec_rev == 0)
+ max_link_speed = 3;
+ if (np) {
+ if (dt_has_node_property(np, "ibm,max-link-speed", NULL)) {
+ max_link_speed = dt_prop_get_u32(np, "ibm,max-link-speed");
+ p->dt_max_link_speed = max_link_speed;
+ }
+ else {
+ p->dt_max_link_speed = 0;
+ }
+ }
+ else {
+ if (p->dt_max_link_speed > 0) {
+ max_link_speed = p->dt_max_link_speed;
+ }
+ }
+ if (pcie_max_link_speed)
+ max_link_speed = pcie_max_link_speed;
+ if (max_link_speed > hw_max_link_speed)
+ max_link_speed = hw_max_link_speed;
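+	/*
+	 * For example, a PHB5 stack 0 or 3 with no NVRAM or device-tree
+	 * override ends up at GEN5; other PHB5 stacks and PHB4 default
+	 * to GEN4 (or GEN3 on NIMBUS DD2.00 per the check above).
+	 */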
+
+ return max_link_speed;
+}
+
+static unsigned int __phb4_get_max_link_width(struct phb4 *p)
+{
+ uint64_t addr, reg;
+ unsigned int lane_config, width = 16;
+
+ /*
+ * On P9, only PEC2 is configurable (no-/bi-/tri-furcation)
+ */
+ switch (p->pec) {
+ case 0:
+ width = 16;
+ break;
+ case 1:
+ width = 8;
+ break;
+ case 2:
+ addr = XPEC_P9_PCI_CPLT_CONF1 + 2 * XPEC_PCI_CPLT_OFFSET;
+ xscom_read(p->chip_id, addr, &reg);
+ lane_config = GETFIELD(XPEC_P9_PCI_LANE_CFG, reg);
+
+ if (lane_config == 0b10 && p->index >= 4)
+ width = 4;
+ else
+ width = 8;
+ }
+ return width;
+}
+
+static unsigned int __phb5_get_max_link_width(struct phb4 *p)
+{
+ uint64_t addr, reg;
+ unsigned int lane_config, width = 16;
+
+ /*
+ * On P10, the 2 PECs are identical and each can have a
+ * different furcation, so we always need to check the PEC
+ * config
+ */
+ addr = XPEC_P10_PCI_CPLT_CONF1 + p->pec * XPEC_PCI_CPLT_OFFSET;
+ xscom_read(p->chip_id, addr, &reg);
+ lane_config = GETFIELD(XPEC_P10_PCI_LANE_CFG, reg);
+
+ switch (lane_config) {
+ case 0b00:
+ width = 16;
+ break;
+ case 0b01:
+ width = 8;
+ break;
+ case 0b10:
+ if (p->index == 0 || p->index == 3)
+ width = 8;
+ else
+ width = 4;
+ break;
+ default:
+ PHBERR(p, "Unexpected PEC lane config value %#x\n",
+ lane_config);
+ }
+ return width;
+}
+
+static unsigned int phb4_get_max_link_width(struct phb4 *p)
+{
+ if (is_phb5())
+ return __phb5_get_max_link_width(p);
+ else
+ return __phb4_get_max_link_width(p);
+}
+
+static void phb4_assert_perst(struct pci_slot *slot, bool assert)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint16_t linkctl;
+ uint64_t reg;
+
+ /*
+ * Disable the link before asserting PERST. The Cursed RAID card
+ * in ozrom1 (9005:028c) has problems coming back if PERST is asserted
+ * while link is active. To work around the problem we assert the link
+ * disable bit before asserting PERST. Asserting the secondary reset
+	 * bit in the brctl register also works.
+ */
+ phb4_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL, &linkctl);
+ reg = phb4_read_reg(p, PHB_PCIE_CRESET);
+
+ if (assert) {
+ linkctl |= PCICAP_EXP_LCTL_LINK_DIS;
+ reg &= ~PHB_PCIE_CRESET_PERST_N;
+ } else {
+ linkctl &= ~PCICAP_EXP_LCTL_LINK_DIS;
+ reg |= PHB_PCIE_CRESET_PERST_N;
+ }
+
+ phb4_write_reg(p, PHB_PCIE_CRESET, reg);
+ phb4_pcicfg_write16(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL, linkctl);
+}
+
+static void set_sys_disable_detect(struct phb4 *p, bool set)
+{
+ uint64_t val;
+
+ val = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (set)
+ val |= PHB_PCIE_DLP_SYS_DISABLEDETECT;
+ else
+ val &= ~PHB_PCIE_DLP_SYS_DISABLEDETECT;
+ out_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL, val);
+}
+
+static int64_t phb4_hreset(struct pci_slot *slot)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ uint16_t brctl;
+ uint8_t presence = 1;
+
+ switch (slot->state) {
+ case PHB4_SLOT_NORMAL:
+ PHBDBG(p, "HRESET: Starts\n");
+ if (slot->ops.get_presence_state)
+ slot->ops.get_presence_state(slot, &presence);
+ if (!presence) {
+ PHBDBG(p, "HRESET: No device\n");
+ return OPAL_SUCCESS;
+ }
+
+ /* Workaround for HW551382 */
+ if (is_phb5()) {
+ PHBINF(p, "HRESET: Workaround for HW551382\n");
+ set_sys_disable_detect(p, true);
+ }
+
+ PHBDBG(p, "HRESET: Prepare for link down\n");
+ phb4_prepare_link_change(slot, false);
+ /* fall through */
+ case PHB4_SLOT_HRESET_START:
+ PHBDBG(p, "HRESET: Assert\n");
+
+ phb4_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl);
+ brctl |= PCI_CFG_BRCTL_SECONDARY_RESET;
+ phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl);
+ pci_slot_set_state(slot, PHB4_SLOT_HRESET_DELAY);
+
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ case PHB4_SLOT_HRESET_DELAY:
+ PHBDBG(p, "HRESET: Deassert\n");
+
+ /* Clear link errors before we deassert reset */
+ phb4_err_clear_regb(p);
+
+ phb4_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl);
+ brctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET;
+ phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl);
+
+ /*
+ * Due to some oddball adapters bouncing the link
+ * training a couple of times, we wait for a full second
+ * before we start checking the link status, otherwise
+ * we can get a spurious link down interrupt which
+ * causes us to EEH immediately.
+ */
+ pci_slot_set_state(slot, PHB4_SLOT_HRESET_DELAY2);
+ return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
+ case PHB4_SLOT_HRESET_DELAY2:
+ if (is_phb5())
+ set_sys_disable_detect(p, false);
+ pci_slot_set_state(slot, PHB4_SLOT_LINK_START);
+ return slot->ops.poll_link(slot);
+ default:
+ PHBERR(p, "Unexpected slot state %08x\n", slot->state);
+ }
+
+ pci_slot_set_state(slot, PHB4_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t phb4_freset(struct pci_slot *slot)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+
+ switch(slot->state) {
+ case PHB4_SLOT_NORMAL:
+ case PHB4_SLOT_FRESET_START:
+ PHBDBG(p, "FRESET: Starts\n");
+
+ /* Reset max link speed for training */
+ p->max_link_speed = phb4_get_max_link_speed(p, NULL);
+
+ PHBDBG(p, "FRESET: Prepare for link down\n");
+ phb4_prepare_link_change(slot, false);
+
+ if (!p->skip_perst) {
+ /* Workaround for HW551382 */
+ if (is_phb5()) {
+ PHBINF(p, "FRESET: Workaround for HW551382\n");
+ set_sys_disable_detect(p, true);
+ }
+
+ PHBDBG(p, "FRESET: Assert\n");
+ phb4_assert_perst(slot, true);
+ pci_slot_set_state(slot, PHB4_SLOT_FRESET_ASSERT_DELAY);
+
+ /* 250ms assert time aligns with powernv */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(250));
+ }
+
+ /* To skip the assert during boot time */
+ PHBDBG(p, "FRESET: Assert skipped\n");
+ pci_slot_set_state(slot, PHB4_SLOT_FRESET_ASSERT_DELAY);
+ p->skip_perst = false;
+ /* fall through */
+ case PHB4_SLOT_FRESET_ASSERT_DELAY:
+ /* Clear link errors before we deassert PERST */
+ phb4_err_clear_regb(p);
+
+ PHBDBG(p, "FRESET: Deassert\n");
+ phb4_assert_perst(slot, false);
+
+ if (pci_tracing)
+ phb4_link_trace(p, PHB_PCIE_DLP_LTSSM_L0, 3000);
+
+ if (is_phb5())
+ set_sys_disable_detect(p, false);
+
+ pci_slot_set_state(slot, PHB4_SLOT_LINK_START);
+ return slot->ops.poll_link(slot);
+ default:
+ PHBERR(p, "Unexpected slot state %08x\n", slot->state);
+ }
+
+ pci_slot_set_state(slot, PHB4_SLOT_NORMAL);
+ return OPAL_HARDWARE;
+}
+
+static int64_t load_capp_ucode(struct phb4 *p)
+{
+ int64_t rc;
+
+ if (p->index != CAPP0_PHB_INDEX && p->index != CAPP1_PHB_INDEX)
+ return OPAL_HARDWARE;
+
+ /* 0x434150504c494448 = 'CAPPLIDH' in ASCII */
+ rc = capp_load_ucode(p->chip_id, p->phb.opal_id, p->index,
+ 0x434150504c494448UL, PHB4_CAPP_REG_OFFSET(p),
+ CAPP_APC_MASTER_ARRAY_ADDR_REG,
+ CAPP_APC_MASTER_ARRAY_WRITE_REG,
+ CAPP_SNP_ARRAY_ADDR_REG,
+ CAPP_SNP_ARRAY_WRITE_REG);
+ return rc;
+}
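+
+/*
+ * A minimal sketch of how the 'CAPPLIDH' name quoted above maps onto the
+ * 64-bit LID identifier passed to capp_load_ucode(): each ASCII character
+ * occupies one byte, most significant byte first. This helper is purely
+ * illustrative and is not used by the code above.
+ */
+static inline uint64_t capp_lid_name_to_id(void)
+{
+	const char name[8] = { 'C', 'A', 'P', 'P', 'L', 'I', 'D', 'H' };
+	uint64_t lid = 0;
+	int i;
+
+	/* Pack the 8 characters, first character in the top byte */
+	for (i = 0; i < 8; i++)
+		lid = (lid << 8) | (uint64_t)(unsigned char)name[i];
+
+	/* lid == 0x434150504c494448UL, i.e. 'CAPPLIDH' */
+	return lid;
+}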
+
+static int do_capp_recovery_scoms(struct phb4 *p)
+{
+ uint64_t rc, reg, end;
+ uint64_t offset = PHB4_CAPP_REG_OFFSET(p);
+
+
+ /* Get the status of CAPP recovery */
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+
+ /* No recovery in progress, ignore */
+ if ((reg & PPC_BIT(0)) == 0) {
+ PHBDBG(p, "CAPP: No recovery in progress\n");
+ return OPAL_SUCCESS;
+ }
+
+ PHBDBG(p, "CAPP: Waiting for recovery to complete\n");
+ /* recovery timer failure period 168ms */
+ end = mftb() + msecs_to_tb(168);
+ while ((reg & (PPC_BIT(1) | PPC_BIT(5) | PPC_BIT(9))) == 0) {
+
+ time_wait_ms(5);
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+
+ if (tb_compare(mftb(), end) != TB_ABEFOREB) {
+ PHBERR(p, "CAPP: Capp recovery Timed-out.\n");
+ end = 0;
+ break;
+ }
+ }
+
+ /* Check if the recovery failed or passed */
+ if (reg & PPC_BIT(1)) {
+ uint64_t act0, act1, mask, fir;
+
+ /* Use the Action0/1 and mask to only clear the bits
+ * that cause local checkstop. Other bits need the
+ * attention of the PRD daemon.
+ */
+ xscom_read(p->chip_id, CAPP_FIR_ACTION0 + offset, &act0);
+ xscom_read(p->chip_id, CAPP_FIR_ACTION1 + offset, &act1);
+ xscom_read(p->chip_id, CAPP_FIR_MASK + offset, &mask);
+ xscom_read(p->chip_id, CAPP_FIR + offset, &fir);
+
+ fir = ~(fir & ~mask & act0 & act1);
+ PHBDBG(p, "Doing CAPP recovery scoms\n");
+
+ /* update capp fir clearing bits causing local checkstop */
+ PHBDBG(p, "Resetting CAPP Fir with mask 0x%016llX\n", fir);
+ xscom_write(p->chip_id, CAPP_FIR_CLEAR + offset, fir);
+
+ /* disable snoops */
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0);
+ load_capp_ucode(p);
+
+ /* clear err rpt reg*/
+ xscom_write(p->chip_id, CAPP_ERR_RPT_CLR + offset, 0);
+
+ /* clear capp fir */
+ xscom_write(p->chip_id, CAPP_FIR + offset, 0);
+
+ /* Just reset bits 0 and 1 and don't touch any other bit */
+ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+ reg &= ~(PPC_BIT(0) | PPC_BIT(1));
+ xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg);
+
+ PHBDBG(p, "CAPP recovery complete\n");
+ rc = OPAL_SUCCESS;
+
+ } else {
+ /* We will most likely checkstop here due to the FIR ACTION
+ * for a failed recovery, so this message would never be logged.
+ * But if we still get here, return an error forcing a
+ * fence of the PHB.
+ */
+ if (reg & PPC_BIT(5))
+ PHBERR(p, "CAPP: Capp recovery Failed\n");
+ else if (reg & PPC_BIT(9))
+ PHBERR(p, "CAPP: Capp recovery hang detected\n");
+ else if (end != 0)
+ PHBERR(p, "CAPP: Unknown recovery failure\n");
+
+ PHBDBG(p, "CAPP: Err/Status-reg=0x%016llx\n", reg);
+ rc = OPAL_HARDWARE;
+ }
+
+ return rc;
+}
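+
+/*
+ * A minimal sketch of the mask arithmetic used in the recovery path above:
+ * the bits cleared from the FIR are those that are currently set, are not
+ * masked, and have both ACTION0 and ACTION1 set (the local-checkstop bits,
+ * per the comment above). The value written to CAPP_FIR_CLEAR is the
+ * complement, so only those positions see a 0. Illustrative only.
+ */
+static inline uint64_t capp_fir_clear_pattern(uint64_t fir, uint64_t mask,
+					       uint64_t act0, uint64_t act1)
+{
+	/* Bits to clear: set in the FIR, unmasked, ACTION0=1 and ACTION1=1 */
+	uint64_t to_clear = fir & ~mask & act0 & act1;
+
+	/* Same expression as "fir = ~(fir & ~mask & act0 & act1)" above */
+	return ~to_clear;
+}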
+
+/*
+ * Disable CAPI mode on a PHB. Must be done while PHB is fenced and
+ * not in recovery.
+ */
+static void disable_capi_mode(struct phb4 *p)
+{
+ uint64_t reg;
+ struct capp *capp = p->capp;
+
+ PHBINF(p, "CAPP: Deactivating\n");
+
+ /* Check if CAPP attached to the PHB and active */
+ if (!capp || capp->phb != &p->phb) {
+ PHBDBG(p, "CAPP: Not attached to this PHB!\n");
+ return;
+ }
+
+ xscom_read(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, &reg);
+ if (!(reg & PPC_BIT(0))) {
+ /* Not in CAPI mode, no action required */
+ PHBERR(p, "CAPP: Not enabled!\n");
+ return;
+ }
+
+ /* CAPP should already be out of recovery in this function */
+ capp_xscom_read(capp, CAPP_ERR_STATUS_CTRL, &reg);
+ if (reg & PPC_BIT(0)) {
+ PHBERR(p, "CAPP: Can't disable while still in recovery!\n");
+ return;
+ }
+
+ PHBINF(p, "CAPP: Disabling CAPI mode\n");
+
+ /* First Phase Reset CAPP Registers */
+ /* CAPP is about to be disabled: mark TLBI_FENCED and tlbi_psl_is_dead */
+ capp_xscom_write(capp, CAPP_ERR_STATUS_CTRL, PPC_BIT(3) | PPC_BIT(4));
+
+ /* Flush SUE uOP1 Register */
+ if (p->rev != PHB4_REV_NIMBUS_DD10)
+ capp_xscom_write(capp, FLUSH_SUE_UOP1, 0);
+
+ /* Release DMA/STQ engines */
+ capp_xscom_write(capp, APC_FSM_READ_MASK, 0ull);
+ capp_xscom_write(capp, XPT_FSM_RMM, 0ull);
+
+ /* Disable snoop */
+ capp_xscom_write(capp, SNOOP_CAPI_CONFIG, 0);
+
+ /* Clear flush SUE state map register */
+ capp_xscom_write(capp, FLUSH_SUE_STATE_MAP, 0);
+
+ /* Disable epoch timer */
+ capp_xscom_write(capp, EPOCH_RECOVERY_TIMERS_CTRL, 0);
+
+ /* CAPP Transport Control Register */
+ capp_xscom_write(capp, TRANSPORT_CONTROL, PPC_BIT(15));
+
+ /* Disable snooping */
+ capp_xscom_write(capp, SNOOP_CONTROL, 0);
+ capp_xscom_write(capp, SNOOP_CAPI_CONFIG, 0);
+
+ /* APC Master PB Control Register - disable examining cResps */
+ capp_xscom_write(capp, APC_MASTER_PB_CTRL, 0);
+
+ /* APC Master Config Register - de-select PHBs */
+ xscom_write_mask(p->chip_id, capp->capp_xscom_offset +
+ APC_MASTER_CAPI_CTRL, 0, PPC_BITMASK(2, 3));
+
+ /* Clear all error registers */
+ capp_xscom_write(capp, CAPP_ERR_RPT_CLR, 0);
+ capp_xscom_write(capp, CAPP_FIR, 0);
+ capp_xscom_write(capp, CAPP_FIR_ACTION0, 0);
+ capp_xscom_write(capp, CAPP_FIR_ACTION1, 0);
+ capp_xscom_write(capp, CAPP_FIR_MASK, 0);
+
+ /* Second Phase Reset PEC/PHB Registers */
+
+ /* Reset the stack overrides if any */
+ xscom_write(p->chip_id, p->pci_xscom + XPEC_PCI_PRDSTKOVR, 0);
+ xscom_write(p->chip_id, p->pe_xscom +
+ XPEC_NEST_READ_STACK_OVERRIDE, 0);
+
+ /* PE Bus AIB Mode Bits. Disable Tracing. Leave HOL Blocking as it is */
+ if (!(p->rev == PHB4_REV_NIMBUS_DD10) && p->index == CAPP1_PHB_INDEX)
+ xscom_write_mask(p->chip_id,
+ p->pci_xscom + XPEC_PCI_PBAIB_HW_CONFIG, 0,
+ PPC_BIT(30));
+
+ /* Reset for PCI to PB data movement */
+ xscom_write_mask(p->chip_id, p->pe_xscom + XPEC_NEST_PBCQ_HW_CONFIG,
+ 0, XPEC_NEST_PBCQ_HW_CONFIG_PBINIT);
+
+ /* Disable CAPP mode in PEC CAPP Control Register */
+ xscom_write(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, 0ull);
+}
+
+static int64_t phb4_creset(struct pci_slot *slot)
+{
+ struct phb4 *p = phb_to_phb4(slot->phb);
+ struct capp *capp = p->capp;
+ uint64_t pbcq_status;
+ uint64_t creset_time, wait_time;
+
+ /* Don't even try fixing a broken PHB */
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ switch (slot->state) {
+ case PHB4_SLOT_NORMAL:
+ case PHB4_SLOT_CRESET_START:
+ PHBDBG(p, "CRESET: Starts\n");
+
+ p->creset_start_time = mftb();
+
+ /* Workaround for HW551382 */
+ if (is_phb5()) {
+ PHBINF(p, "CRESET: Workaround for HW551382\n");
+ set_sys_disable_detect(p, true);
+ }
+
+ phb4_prepare_link_change(slot, false);
+ /* Clear error inject register, preventing recursive errors */
+ xscom_write(p->chip_id, p->pe_xscom + 0x2, 0x0);
+
+ /* Prevent HMI when PHB gets fenced as we are disabling CAPP */
+ if (p->flags & PHB4_CAPP_DISABLE &&
+ capp && capp->phb == slot->phb) {
+ /* Since no HMI, So set the recovery flag manually. */
+ p->flags |= PHB4_CAPP_RECOVERY;
+ xscom_write_mask(p->chip_id, capp->capp_xscom_offset +
+ CAPP_FIR_MASK,
+ PPC_BIT(31), PPC_BIT(31));
+ }
+
+ /* Force fence on the PHB to work around a non-existent PE */
+ if (!phb4_fenced(p))
+ xscom_write(p->chip_id, p->pe_stk_xscom + 0x2,
+ 0x0000002000000000UL);
+
+ /*
+ * Force use of ASB for register access until the PHB has
+ * been fully reset.
+ */
+ p->flags |= PHB4_CFG_USE_ASB | PHB4_AIB_FENCED;
+
+ /* Assert PERST before clearing errors */
+ phb4_assert_perst(slot, true);
+
+ /* Clear errors, following the proper sequence */
+ phb4_err_clear(p);
+
+ /* Actual reset */
+ p->flags |= PHB4_ETU_IN_RESET;
+ xscom_write(p->chip_id, p->pci_stk_xscom + XPEC_PCI_STK_ETU_RESET,
+ 0x8000000000000000UL);
+
+ /* Read errors in PFIR and NFIR */
+ xscom_read(p->chip_id, p->pci_stk_xscom + 0x0, &p->pfir_cache);
+ xscom_read(p->chip_id, p->pe_stk_xscom + 0x0, &p->nfir_cache);
+
+ pci_slot_set_state(slot, PHB4_SLOT_CRESET_WAIT_CQ);
+ slot->retries = 500;
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+ case PHB4_SLOT_CRESET_WAIT_CQ:
+
+ // Wait until operations are complete
+ xscom_read(p->chip_id, p->pe_stk_xscom + 0xc, &pbcq_status);
+ if (!(pbcq_status & 0xC000000000000000UL)) {
+ PHBDBG(p, "CRESET: No pending transactions\n");
+
+ /* capp recovery */
+ if ((p->flags & PHB4_CAPP_RECOVERY) &&
+ (do_capp_recovery_scoms(p) != OPAL_SUCCESS))
+ goto error;
+
+ if (p->flags & PHB4_CAPP_DISABLE)
+ disable_capi_mode(p);
+
+ /* Clear errors in PFIR and NFIR */
+ xscom_write(p->chip_id, p->pci_stk_xscom + 0x1,
+ ~p->pfir_cache);
+ xscom_write(p->chip_id, p->pe_stk_xscom + 0x1,
+ ~p->nfir_cache);
+
+ /* Re-read errors in PFIR and NFIR and reset any new
+ * error reported.
+ */
+ xscom_read(p->chip_id, p->pci_stk_xscom +
+ XPEC_PCI_STK_PCI_FIR, &p->pfir_cache);
+ xscom_read(p->chip_id, p->pe_stk_xscom +
+ XPEC_NEST_STK_PCI_NFIR, &p->nfir_cache);
+
+ if (p->pfir_cache || p->nfir_cache) {
+ PHBERR(p, "CRESET: PHB still fenced !!\n");
+ phb4_dump_pec_err_regs(p);
+
+ /* Reset the PHB errors */
+ xscom_write(p->chip_id, p->pci_stk_xscom +
+ XPEC_PCI_STK_PCI_FIR, 0);
+ xscom_write(p->chip_id, p->pe_stk_xscom +
+ XPEC_NEST_STK_PCI_NFIR, 0);
+ }
+
+ /* Clear PHB from reset */
+ xscom_write(p->chip_id,
+ p->pci_stk_xscom + XPEC_PCI_STK_ETU_RESET, 0x0);
+ p->flags &= ~PHB4_ETU_IN_RESET;
+
+ pci_slot_set_state(slot, PHB4_SLOT_CRESET_REINIT);
+ /* After lifting PHB reset, wait while logic settles */
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(10));
+ }
+
+ if (slot->retries-- == 0) {
+ PHBERR(p, "Timeout waiting for pending transaction\n");
+ goto error;
+ }
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(100));
+ case PHB4_SLOT_CRESET_REINIT:
+ PHBDBG(p, "CRESET: Reinitialization\n");
+ p->flags &= ~PHB4_AIB_FENCED;
+ p->flags &= ~PHB4_CAPP_RECOVERY;
+ p->flags &= ~PHB4_CFG_USE_ASB;
+ phb4_init_hw(p);
+ pci_slot_set_state(slot, PHB4_SLOT_CRESET_FRESET);
+
+ /*
+ * The PERST is sticky across resets, but LINK_DIS isn't.
+ * Re-assert it here now that we've reset the PHB.
+ */
+ phb4_assert_perst(slot, true);
+
+ /*
+ * wait either 100ms (for the ETU logic) or until we've had
+ * PERST asserted for 250ms.
+ */
+ creset_time = tb_to_msecs(mftb() - p->creset_start_time);
+ if (creset_time < 250)
+ wait_time = MAX(100, 250 - creset_time);
+ else
+ wait_time = 100;
+ PHBDBG(p, "CRESET: wait_time = %lld\n", wait_time);
+ return pci_slot_set_sm_timeout(slot, msecs_to_tb(wait_time));
+
+ case PHB4_SLOT_CRESET_FRESET:
+ /*
+ * We asserted PERST at the beginning of the CRESET and we
+ * have waited long enough, so we can skip it in the freset
+ * procedure.
+ */
+ p->skip_perst = true;
+ pci_slot_set_state(slot, PHB4_SLOT_NORMAL);
+ return slot->ops.freset(slot);
+ default:
+ PHBERR(p, "CRESET: Unexpected slot state %08x, resetting...\n",
+ slot->state);
+ pci_slot_set_state(slot, PHB4_SLOT_NORMAL);
+ return slot->ops.creset(slot);
+
+ }
+
+error:
+ /* Mark the PHB as dead and expect it to be removed */
+ p->broken = true;
+ return OPAL_HARDWARE;
+}
+
+/*
+ * Initialize the root complex slot, which is mainly used to do a
+ * fundamental reset before PCI enumeration in the PCI core.
+ * When probing the root complex and building its real slot,
+ * the operations will be copied over.
+ */
+static struct pci_slot *phb4_slot_create(struct phb *phb)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(phb, NULL);
+ if (!slot)
+ return slot;
+
+ /* Elementary functions */
+ slot->ops.get_presence_state = phb4_get_presence_state;
+ slot->ops.get_link_state = phb4_get_link_state;
+ slot->ops.get_power_state = NULL;
+ slot->ops.get_attention_state = NULL;
+ slot->ops.get_latch_state = NULL;
+ slot->ops.set_power_state = NULL;
+ slot->ops.set_attention_state = NULL;
+
+ /*
+ * For PHB slots, we have to split the fundamental reset
+ * into 2 steps. We might not have the first step, which
+ * is to power the slot off and on, or it may be controlled
+ * by individual platforms.
+ */
+ slot->ops.prepare_link_change = phb4_prepare_link_change;
+ slot->ops.poll_link = phb4_poll_link;
+ slot->ops.hreset = phb4_hreset;
+ slot->ops.freset = phb4_freset;
+ slot->ops.creset = phb4_creset;
+ slot->ops.completed_sm_run = phb4_slot_sm_run_completed;
+ slot->link_retries = PHB4_LINK_LINK_RETRIES;
+
+ return slot;
+}
+
+static void phb4_int_unmask_all(struct phb4 *p)
+{
+ /* Init_126..130 - Re-enable error interrupts */
+ out_be64(p->regs + PHB_ERR_IRQ_ENABLE, 0xca8880cc00000000ull);
+
+ if (is_phb5())
+ out_be64(p->regs + PHB_TXE_ERR_IRQ_ENABLE, 0x200850be08200020ull);
+ else
+ out_be64(p->regs + PHB_TXE_ERR_IRQ_ENABLE, 0x2008400e08200000ull);
+ out_be64(p->regs + PHB_RXE_ARB_ERR_IRQ_ENABLE, 0xc40038fc01804070ull);
+ out_be64(p->regs + PHB_RXE_MRG_ERR_IRQ_ENABLE, 0x00006100008000a8ull);
+ out_be64(p->regs + PHB_RXE_TCE_ERR_IRQ_ENABLE, 0x60510050c0000000ull);
+}
+
+/*
+ * Mask the IRQ for any currently set error bits. This prevents the PHB's ERR
+ * and INF interrupts from being re-fired before the kernel can handle the
+ * underlying condition.
+ */
+static void phb4_int_mask_active(struct phb4 *p)
+{
+ const uint64_t error_regs[] = {
+ PHB_ERR_STATUS,
+ PHB_TXE_ERR_STATUS,
+ PHB_RXE_ARB_ERR_STATUS,
+ PHB_RXE_MRG_ERR_STATUS,
+ PHB_RXE_TCE_ERR_STATUS
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(error_regs); i++) {
+ uint64_t stat, mask;
+
+ /* The IRQ mask reg is always offset 0x20 from the status reg */
+ stat = phb4_read_reg(p, error_regs[i]);
+ mask = phb4_read_reg(p, error_regs[i] + 0x20);
+
+ phb4_write_reg(p, error_regs[i] + 0x20, mask & ~stat);
+ }
+}
+
+static uint64_t phb4_get_pesta(struct phb4 *p, uint64_t pe_number)
+{
+ uint64_t pesta;
+ __be64 *pPEST;
+
+ pPEST = (__be64 *)p->tbl_pest;
+
+ phb4_ioda_sel(p, IODA3_TBL_PESTA, pe_number, false);
+ pesta = phb4_read_reg(p, PHB_IODA_DATA0);
+ if (pesta & IODA3_PESTA_MMIO_FROZEN)
+ pesta |= be64_to_cpu(pPEST[2*pe_number]);
+
+ return pesta;
+}
+
+/* Check if the chip requires escalating a freeze to fence on MMIO loads */
+static bool phb4_escalation_required(void)
+{
+ uint64_t pvr = mfspr(SPR_PVR);
+
+ /* Only on Power9 */
+ if (proc_gen != proc_gen_p9)
+ return false;
+
+ /*
+ * Escalation is required on the following chip versions:
+ * - Cumulus DD1.0
+ * - Nimbus DD2.0, DD2.1 (and DD1.0, but it is unsupported so no check).
+ */
+ if (pvr & PVR_POWER9_CUMULUS) {
+ if (PVR_VERS_MAJ(pvr) == 1 && PVR_VERS_MIN(pvr) == 0)
+ return true;
+ } else { /* Nimbus */
+ if (PVR_VERS_MAJ(pvr) == 2 && PVR_VERS_MIN(pvr) < 2)
+ return true;
+ }
+
+ return false;
+}
+
+static bool phb4_freeze_escalate(uint64_t pesta)
+{
+ if ((GETFIELD(IODA3_PESTA_TRANS_TYPE, pesta) ==
+ IODA3_PESTA_TRANS_TYPE_MMIOLOAD) &&
+ (pesta & (IODA3_PESTA_CA_CMPLT_TMT | IODA3_PESTA_UR)))
+ return true;
+ return false;
+}
+
+static int64_t phb4_eeh_freeze_status(struct phb *phb, uint64_t pe_number,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t peev_bit = PPC_BIT(pe_number & 0x3f);
+ uint64_t peev, pesta, pestb;
+
+ /* Defaults: not frozen */
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+
+ /* Check dead */
+ if (p->broken) {
+ *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ if (severity)
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ return OPAL_HARDWARE;
+ }
+
+ /* Check fence and CAPP recovery */
+ if (phb4_fenced(p) || (p->flags & PHB4_CAPP_RECOVERY)) {
+ *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ if (severity)
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ return OPAL_SUCCESS;
+ }
+
+ /* Check the PEEV */
+ phb4_ioda_sel(p, IODA3_TBL_PEEV, pe_number / 64, false);
+ peev = in_be64(p->regs + PHB_IODA_DATA0);
+ if (!(peev & peev_bit))
+ return OPAL_SUCCESS;
+
+ /* Indicate that we have an ER pending */
+ phb4_set_err_pending(p, true);
+ if (severity)
+ *severity = OPAL_EEH_SEV_PE_ER;
+
+ /* Read the full PESTA */
+ pesta = phb4_get_pesta(p, pe_number);
+ /* Check if we need to escalate to fence */
+ if (phb4_escalation_required() && phb4_freeze_escalate(pesta)) {
+ PHBERR(p, "Escalating freeze to fence PESTA[%lli]=%016llx\n",
+ pe_number, pesta);
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ }
+
+ /* Read the PESTB in the PHB */
+ phb4_ioda_sel(p, IODA3_TBL_PESTB, pe_number, false);
+ pestb = phb4_read_reg(p, PHB_IODA_DATA0);
+
+ /* Convert PESTA/B to freeze_state */
+ if (pesta & IODA3_PESTA_MMIO_FROZEN)
+ *freeze_state |= OPAL_EEH_STOPPED_MMIO_FREEZE;
+ if (pestb & IODA3_PESTB_DMA_STOPPED)
+ *freeze_state |= OPAL_EEH_STOPPED_DMA_FREEZE;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_eeh_freeze_clear(struct phb *phb, uint64_t pe_number,
+ uint64_t eeh_action_token)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t err, peev;
+ int32_t i;
+ bool frozen_pe = false;
+
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /* Check the summary. If nothing is set, move on to clearing the
+ * PESTs, which can contain a freeze state from a previous error
+ * or one set explicitly by the user
+ */
+ err = in_be64(p->regs + PHB_ETU_ERR_SUMMARY);
+ if (err == 0xffffffffffffffffUL) {
+ if (phb4_fenced(p)) {
+ PHBERR(p, "eeh_freeze_clear on fenced PHB\n");
+ return OPAL_HARDWARE;
+ }
+ }
+ if (err != 0)
+ phb4_err_clear(p);
+
+ /*
+ * The PEEV also lives in system memory; accessing it there
+ * directly would perform better.
+ */
+ if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO) {
+ phb4_ioda_sel(p, IODA3_TBL_PESTA, pe_number, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ }
+ if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_DMA) {
+ phb4_ioda_sel(p, IODA3_TBL_PESTB, pe_number, false);
+ out_be64(p->regs + PHB_IODA_DATA0, 0);
+ }
+
+
+ /* Update ER pending indication */
+ phb4_ioda_sel(p, IODA3_TBL_PEEV, 0, true);
+ for (i = 0; i < p->num_pes/64; i++) {
+ peev = in_be64(p->regs + PHB_IODA_DATA0);
+ if (peev) {
+ frozen_pe = true;
+ break;
+ }
+ }
+ if (frozen_pe) {
+ p->err.err_src = PHB4_ERR_SRC_PHB;
+ p->err.err_class = PHB4_ERR_CLASS_ER;
+ p->err.err_bit = -1;
+ phb4_set_err_pending(p, true);
+ } else
+ phb4_set_err_pending(p, false);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_eeh_freeze_set(struct phb *phb, uint64_t pe_number,
+ uint64_t eeh_action_token)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t data;
+
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ if (pe_number >= p->num_pes)
+ return OPAL_PARAMETER;
+
+ if (eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_MMIO &&
+ eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_DMA &&
+ eeh_action_token != OPAL_EEH_ACTION_SET_FREEZE_ALL)
+ return OPAL_PARAMETER;
+
+ if (eeh_action_token & OPAL_EEH_ACTION_SET_FREEZE_MMIO) {
+ phb4_ioda_sel(p, IODA3_TBL_PESTA, pe_number, false);
+ data = in_be64(p->regs + PHB_IODA_DATA0);
+ data |= IODA3_PESTA_MMIO_FROZEN;
+ out_be64(p->regs + PHB_IODA_DATA0, data);
+ }
+
+ if (eeh_action_token & OPAL_EEH_ACTION_SET_FREEZE_DMA) {
+ phb4_ioda_sel(p, IODA3_TBL_PESTB, pe_number, false);
+ data = in_be64(p->regs + PHB_IODA_DATA0);
+ data |= IODA3_PESTB_DMA_STOPPED;
+ out_be64(p->regs + PHB_IODA_DATA0, data);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t peev, pesta;
+ uint32_t peev_size = p->num_pes/64;
+ int32_t i, j;
+
+ /* If the PHB is broken, we needn't go forward */
+ if (p->broken) {
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ return OPAL_SUCCESS;
+ }
+
+ if ((p->flags & PHB4_CAPP_RECOVERY)) {
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ return OPAL_SUCCESS;
+ }
+
+ /*
+ * Check if we already have pending errors. If so, gather
+ * more information about them, checking the PBCQ before
+ * the PHB.
+ */
+ if (phb4_err_pending(p) /*&&
+ !phb4_err_check_pbcq(p) &&
+ !phb4_err_check_lem(p) */)
+ phb4_set_err_pending(p, false);
+
+ /* Clear result */
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ *first_frozen_pe = (uint64_t)-1;
+
+ /* Check frozen PEs */
+ if (!phb4_err_pending(p)) {
+ phb4_ioda_sel(p, IODA3_TBL_PEEV, 0, true);
+ for (i = 0; i < peev_size; i++) {
+ peev = in_be64(p->regs + PHB_IODA_DATA0);
+ if (peev) {
+ p->err.err_src = PHB4_ERR_SRC_PHB;
+ p->err.err_class = PHB4_ERR_CLASS_ER;
+ p->err.err_bit = -1;
+ phb4_set_err_pending(p, true);
+ break;
+ }
+ }
+ }
+
+ if (!phb4_err_pending(p))
+ return OPAL_SUCCESS;
+ /*
+ * If the frozen PE is caused by a malfunctioning TLP, we
+ * need to reset the PHB, so convert the ER into a PHB-fatal
+ * error in that case.
+ */
+ if (p->err.err_class == PHB4_ERR_CLASS_ER) {
+ for (i = peev_size - 1; i >= 0; i--) {
+ phb4_ioda_sel(p, IODA3_TBL_PEEV, i, false);
+ peev = in_be64(p->regs + PHB_IODA_DATA0);
+ for (j = 0; j < 64; j++) {
+ if (peev & PPC_BIT(j)) {
+ *first_frozen_pe = i * 64 + j;
+ break;
+ }
+ }
+ if (*first_frozen_pe != (uint64_t)(-1))
+ break;
+ }
+ }
+
+ if (*first_frozen_pe != (uint64_t)(-1)) {
+ pesta = phb4_get_pesta(p, *first_frozen_pe);
+ if (phb4_escalation_required() && phb4_freeze_escalate(pesta)) {
+ PHBINF(p, "Escalating freeze to fence. PESTA[%lli]=%016llx\n",
+ *first_frozen_pe, pesta);
+ p->err.err_class = PHB4_ERR_CLASS_FENCED;
+ }
+ }
+
+ switch (p->err.err_class) {
+ case PHB4_ERR_CLASS_DEAD:
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_DEAD;
+ break;
+ case PHB4_ERR_CLASS_FENCED:
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ break;
+ case PHB4_ERR_CLASS_ER:
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+
+ /* No frozen PE ? */
+ if (*first_frozen_pe == (uint64_t)-1) {
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ phb4_set_err_pending(p, false);
+ }
+
+ break;
+ case PHB4_ERR_CLASS_INF:
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_INF;
+ break;
+ default:
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+ phb4_set_err_pending(p, false);
+ }
+
+ /*
+ * Unmask all our error interrupts once all pending errors
+ * have been handled.
+ */
+ if (!phb4_err_pending(p))
+ phb4_int_unmask_all(p);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_err_inject_finalize(struct phb4 *phb, uint64_t addr,
+ uint64_t mask, uint64_t ctrl,
+ bool is_write)
+{
+ if (is_write)
+ ctrl |= PHB_PAPR_ERR_INJ_CTL_WR;
+ else
+ ctrl |= PHB_PAPR_ERR_INJ_CTL_RD;
+
+ out_be64(phb->regs + PHB_PAPR_ERR_INJ_ADDR, addr);
+ out_be64(phb->regs + PHB_PAPR_ERR_INJ_MASK, mask);
+ out_be64(phb->regs + PHB_PAPR_ERR_INJ_CTL, ctrl);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t phb4_err_inject_mem32(struct phb4 *phb __unused,
+ uint64_t pe_number __unused,
+ uint64_t addr __unused,
+ uint64_t mask __unused,
+ bool is_write __unused)
+{
+ return OPAL_UNSUPPORTED;
+}
+
+static int64_t phb4_err_inject_mem64(struct phb4 *phb __unused,
+ uint64_t pe_number __unused,
+ uint64_t addr __unused,
+ uint64_t mask __unused,
+ bool is_write __unused)
+{
+ return OPAL_UNSUPPORTED;
+}
+
+static int64_t phb4_err_inject_cfg(struct phb4 *phb, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ uint64_t a, m, prefer, ctrl;
+ int bdfn;
+ bool is_bus_pe = false;
+
+ a = 0xffffull;
+ prefer = 0xffffull;
+ m = PHB_PAPR_ERR_INJ_MASK_CFG_ALL;
+ ctrl = PHB_PAPR_ERR_INJ_CTL_CFG;
+
+ for (bdfn = 0; bdfn < RTT_TABLE_ENTRIES; bdfn++) {
+ if (be16_to_cpu(phb->tbl_rtt[bdfn]) != pe_number)
+ continue;
+
+ /* The PE can be associated with PCI bus or device */
+ is_bus_pe = false;
+ if ((bdfn + 8) < RTT_TABLE_ENTRIES &&
+ be16_to_cpu(phb->tbl_rtt[bdfn + 8]) == pe_number)
+ is_bus_pe = true;
+
+ /* Figure out the PCI config address */
+ if (prefer == 0xffffull) {
+ if (is_bus_pe) {
+ m = PHB_PAPR_ERR_INJ_MASK_CFG;
+ prefer = SETFIELD(m, 0x0ull, PCI_BUS_NUM(bdfn));
+ } else {
+ m = PHB_PAPR_ERR_INJ_MASK_CFG_ALL;
+ prefer = SETFIELD(m, 0x0ull, bdfn);
+ }
+ }
+
+ /* Check whether the input address is valid */
+ if (!is_bus_pe &&
+ GETFIELD(PHB_PAPR_ERR_INJ_MASK_CFG_ALL, addr) == bdfn) {
+ a = addr;
+ break;
+ }
+
+ if (is_bus_pe &&
+ GETFIELD(PHB_PAPR_ERR_INJ_MASK_CFG, addr) == PCI_BUS_NUM(bdfn)) {
+ a = addr;
+ break;
+ }
+ }
+
+ /* Invalid PE number */
+ if (prefer == 0xffffull)
+ return OPAL_PARAMETER;
+
+ /* Specified address is out of range */
+ if (a == 0xffffull)
+ a = prefer;
+ else
+ m = mask;
+
+ return phb4_err_inject_finalize(phb, a, m, ctrl, is_write);
+}
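+
+/*
+ * A minimal sketch of the bdfn layout assumed by the RTT scan above: the
+ * routing table is indexed by the 16-bit bus/device/function number, so
+ * bdfn + 8 addresses the same function number on the next device of the
+ * same bus; if that entry maps to the same PE, the PE is treated as
+ * covering the whole bus. The helpers below only illustrate the field
+ * split (the bus field is what PCI_BUS_NUM() extracts above).
+ */
+static inline unsigned int bdfn_bus(unsigned int bdfn)
+{
+	return (bdfn >> 8) & 0xff;
+}
+
+static inline unsigned int bdfn_dev(unsigned int bdfn)
+{
+	return (bdfn >> 3) & 0x1f;
+}
+
+static inline unsigned int bdfn_fn(unsigned int bdfn)
+{
+	return bdfn & 0x7;
+}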
+
+static int64_t phb4_err_inject_dma(struct phb4 *phb __unused,
+ uint64_t pe_number __unused,
+ uint64_t addr __unused,
+ uint64_t mask __unused,
+ bool is_write __unused,
+ bool is_64bits __unused)
+{
+ return OPAL_UNSUPPORTED;
+}
+
+static int64_t phb4_err_inject_dma32(struct phb4 *phb, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ return phb4_err_inject_dma(phb, pe_number, addr, mask, is_write, false);
+}
+
+static int64_t phb4_err_inject_dma64(struct phb4 *phb, uint64_t pe_number,
+ uint64_t addr, uint64_t mask,
+ bool is_write)
+{
+ return phb4_err_inject_dma(phb, pe_number, addr, mask, is_write, true);
+}
+
+
+static int64_t phb4_err_inject(struct phb *phb, uint64_t pe_number,
+ uint32_t type, uint32_t func,
+ uint64_t addr, uint64_t mask)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ int64_t (*handler)(struct phb4 *p, uint64_t pe_number,
+ uint64_t addr, uint64_t mask, bool is_write);
+ bool is_write;
+
+ /* We can't inject errors into the reserved PE */
+ if (pe_number == PHB4_RESERVED_PE_NUM(p) || pe_number >= p->num_pes)
+ return OPAL_PARAMETER;
+
+ /* Clear leftover from last time */
+ out_be64(p->regs + PHB_PAPR_ERR_INJ_CTL, 0x0ul);
+
+ switch (func) {
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_DATA:
+ is_write = false;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb4_err_inject_mem64;
+ else
+ handler = phb4_err_inject_mem32;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_MEM_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_MEM_DATA:
+ is_write = true;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb4_err_inject_mem64;
+ else
+ handler = phb4_err_inject_mem32;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_CFG_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_LD_CFG_DATA:
+ is_write = false;
+ handler = phb4_err_inject_cfg;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_CFG_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_ST_CFG_DATA:
+ is_write = true;
+ handler = phb4_err_inject_cfg;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_DATA:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_MASTER:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_RD_TARGET:
+ is_write = false;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb4_err_inject_dma64;
+ else
+ handler = phb4_err_inject_dma32;
+ break;
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_ADDR:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_DATA:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_MASTER:
+ case OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_TARGET:
+ is_write = true;
+ if (type == OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64)
+ handler = phb4_err_inject_dma64;
+ else
+ handler = phb4_err_inject_dma32;
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return handler(p, pe_number, addr, mask, is_write);
+}
+
+static int64_t phb4_get_diag_data(struct phb *phb,
+ void *diag_buffer,
+ uint64_t diag_buffer_len)
+{
+ bool fenced;
+ struct phb4 *p = phb_to_phb4(phb);
+ struct OpalIoPhb4ErrorData *data = diag_buffer;
+
+ if (diag_buffer_len < sizeof(struct OpalIoPhb4ErrorData))
+ return OPAL_PARAMETER;
+ if (p->broken)
+ return OPAL_HARDWARE;
+
+ /*
+ * Dummy check for fence so that phb4_read_phb_status knows
+ * whether to use ASB or AIB
+ */
+ fenced = phb4_fenced(p);
+ phb4_read_phb_status(p, data);
+
+ if (!fenced)
+ phb4_eeh_dump_regs(p);
+
+ /*
+ * We most likely got here because of an INF-class error.
+ * In that case, we need to clear the error explicitly.
+ */
+ if (phb4_err_pending(p) &&
+ p->err.err_class == PHB4_ERR_CLASS_INF &&
+ p->err.err_src == PHB4_ERR_SRC_PHB) {
+ phb4_err_clear(p);
+ phb4_set_err_pending(p, false);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static uint64_t tve_encode_50b_noxlate(uint64_t start_addr, uint64_t end_addr)
+{
+ uint64_t tve;
+
+ /*
+ * Put start address bits 49:24 into TVE[52:53]||[0:23]
+ * and end address bits 49:24 into TVE[54:55]||[24:47]
+ * and set TVE[51]
+ */
+ tve = (start_addr << 16) & (0xffffffull << 40);
+ tve |= (start_addr >> 38) & (3ull << 10);
+ tve |= (end_addr >> 8) & (0xfffffful << 16);
+ tve |= (end_addr >> 40) & (3ull << 8);
+ tve |= PPC_BIT(51) | IODA3_TVT_NON_TRANSLATE_50;
+ return tve;
+}
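+
+/*
+ * A minimal decode sketch for the packing above: it recovers bits 49:24 of
+ * the start and end addresses from a TVE built by tve_encode_50b_noxlate().
+ * The low 24 bits of each address are not stored in the TVE (16MB
+ * granularity), so they come back as zero. Illustrative only; the PHB
+ * hardware performs the equivalent range check itself.
+ */
+static inline void tve_decode_50b_noxlate(uint64_t tve, uint64_t *start_addr,
+					  uint64_t *end_addr)
+{
+	/* Start bits 47:24 come from TVE[0:23] (top 24 bits of the value) */
+	*start_addr = (tve >> 16) & (0xffffffull << 24);
+	/* Start bits 49:48 come from TVE[52:53] */
+	*start_addr |= (tve & (3ull << 10)) << 38;
+	/* End bits 47:24 come from TVE[24:47] */
+	*end_addr = (tve << 8) & (0xffffffull << 24);
+	/* End bits 49:48 come from TVE[54:55] */
+	*end_addr |= (tve & (3ull << 8)) << 40;
+}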
+
+static bool phb4_is_dd20(struct phb4 *p)
+{
+ struct proc_chip *chip = get_chip(p->chip_id);
+
+ if (p->rev == PHB4_REV_NIMBUS_DD20 && ((0xf & chip->ec_level) == 0))
+ return true;
+ return false;
+}
+
+static int64_t phb4_get_capp_info(int chip_id, struct phb *phb,
+ struct capp_info *info)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint32_t offset;
+
+ /* Not even supposed to be here on P10, but doesn't hurt */
+ if (is_phb5())
+ return OPAL_UNSUPPORTED;
+
+ if (chip_id != p->chip_id)
+ return OPAL_PARAMETER;
+
+ /* Check if CAPP is attached to the PHB */
+ if (p->capp == NULL || p->capp->phb != phb)
+ return OPAL_PARAMETER;
+
+ offset = PHB4_CAPP_REG_OFFSET(p);
+
+ if (p->index == CAPP0_PHB_INDEX)
+ info->capp_index = 0;
+ if (p->index == CAPP1_PHB_INDEX)
+ info->capp_index = 1;
+ info->phb_index = p->index;
+ info->capp_fir_reg = CAPP_FIR + offset;
+ info->capp_fir_mask_reg = CAPP_FIR_MASK + offset;
+ info->capp_fir_action0_reg = CAPP_FIR_ACTION0 + offset;
+ info->capp_fir_action1_reg = CAPP_FIR_ACTION1 + offset;
+ info->capp_err_status_ctrl_reg = CAPP_ERR_STATUS_CTRL + offset;
+
+ return OPAL_SUCCESS;
+}
+
+static void phb4_init_capp_regs(struct phb4 *p, uint32_t capp_eng)
+{
+ uint64_t addr, reg;
+ uint32_t offset;
+ uint8_t link_width_x16 = 1;
+
+ offset = PHB4_CAPP_REG_OFFSET(p);
+
+ /* Calculate the phb link width if card is attached to PEC2 */
+ if (p->index == CAPP1_PHB_INDEX) {
+ /* Check if PEC2 is in x8 or x16 mode.
+ * PEC0 is always in x16
+ */
+ addr = XPEC_P9_PCI_CPLT_CONF1 + 2 * XPEC_PCI_CPLT_OFFSET;
+ xscom_read(p->chip_id, addr, &reg);
+ link_width_x16 = ((reg & XPEC_P9_PCI_IOVALID_MASK) ==
+ XPEC_P9_PCI_IOVALID_X16);
+ }
+
+ /* APC Master PowerBus Control Register */
+ xscom_read(p->chip_id, APC_MASTER_PB_CTRL + offset, &reg);
+ reg |= PPC_BIT(0); /* enable cResp exam */
+ reg |= PPC_BIT(3); /* disable vg not sys */
+ reg |= PPC_BIT(12);/* HW417025: disable capp virtual machines */
+ reg |= PPC_BIT(2); /* disable nn rn */
+ reg |= PPC_BIT(4); /* disable g */
+ reg |= PPC_BIT(5); /* disable ln */
+ xscom_write(p->chip_id, APC_MASTER_PB_CTRL + offset, reg);
+
+ /* Set PHB mode, HPC Dir State and P9 mode */
+ xscom_write(p->chip_id, APC_MASTER_CAPI_CTRL + offset,
+ 0x1772000000000000UL);
+ PHBINF(p, "CAPP: port attached\n");
+
+ /* Set snoop ttype decoding, dir size to 512K */
+ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0x9000000000000000UL);
+
+ /* Use Read Epsilon Tier2 for all scopes.
+ * Set Tier2 Read Epsilon.
+ */
+ xscom_read(p->chip_id, SNOOP_CONTROL + offset, &reg);
+ reg |= PPC_BIT(0);
+ reg |= PPC_BIT(35);
+ reg |= PPC_BIT(45);
+ reg |= PPC_BIT(46);
+ reg |= PPC_BIT(47);
+ reg |= PPC_BIT(50);
+ xscom_write(p->chip_id, SNOOP_CONTROL + offset, reg);
+
+ /* Transport Control Register */
+ xscom_read(p->chip_id, TRANSPORT_CONTROL + offset, &reg);
+ if (p->index == CAPP0_PHB_INDEX) {
+ reg |= PPC_BIT(1); /* Send Packet Timer Value */
+ reg |= PPC_BITMASK(10, 13); /* Send Packet Timer Value */
+ reg &= ~PPC_BITMASK(14, 17); /* Set Max LPC CI store buffer to zeros */
+ reg &= ~PPC_BITMASK(18, 21); /* Set Max tlbi divider */
+ if (capp_eng & CAPP_MIN_STQ_ENGINES) {
+ /* 2 CAPP msg engines */
+ reg |= PPC_BIT(58);
+ reg |= PPC_BIT(59);
+ reg |= PPC_BIT(60);
+ }
+ if (capp_eng & CAPP_MAX_STQ_ENGINES) {
+ /* 14 CAPP msg engines */
+ reg |= PPC_BIT(60);
+ }
+ reg |= PPC_BIT(62);
+ }
+ if (p->index == CAPP1_PHB_INDEX) {
+ reg |= PPC_BIT(4); /* Send Packet Timer Value */
+ reg &= ~PPC_BIT(10); /* Set CI Store Buffer Threshold=5 */
+ reg |= PPC_BIT(11); /* Set CI Store Buffer Threshold=5 */
+ reg &= ~PPC_BIT(12); /* Set CI Store Buffer Threshold=5 */
+ reg |= PPC_BIT(13); /* Set CI Store Buffer Threshold=5 */
+ reg &= ~PPC_BITMASK(14, 17); /* Set Max LPC CI store buffer to zeros */
+ reg &= ~PPC_BITMASK(18, 21); /* Set Max tlbi divider */
+ if (capp_eng & CAPP_MIN_STQ_ENGINES) {
+ /* 2 CAPP msg engines */
+ reg |= PPC_BIT(59);
+ reg |= PPC_BIT(60);
+
+ } else if (capp_eng & CAPP_MAX_STQ_ENGINES) {
+
+ if (link_width_x16)
+ /* 14 CAPP msg engines */
+ reg |= PPC_BIT(60) | PPC_BIT(62);
+ else
+ /* 6 CAPP msg engines */
+ reg |= PPC_BIT(60);
+ }
+ }
+ xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, reg);
+
+ /* The transport control register needs to be loaded in two
+ * steps. Once the register values have been set, we have to
+ * write bit 63 to a '1', which loads the register values into
+ * the ci store buffer logic.
+ */
+ xscom_read(p->chip_id, TRANSPORT_CONTROL + offset, &reg);
+ reg |= PPC_BIT(63);
+ xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, reg);
+
+ /* Enable epoch timer */
+ xscom_write(p->chip_id, EPOCH_RECOVERY_TIMERS_CTRL + offset,
+ 0xC0000000FFF8FFE0UL);
+
+ /* Flush SUE State Map Register */
+ xscom_write(p->chip_id, FLUSH_SUE_STATE_MAP + offset,
+ 0x08020A0000000000UL);
+
+ /* Flush SUE uOP1 Register */
+ xscom_write(p->chip_id, FLUSH_SUE_UOP1 + offset,
+ 0xDCE0280428000000);
+
+ /* capp owns PHB read buffers */
+ if (p->index == CAPP0_PHB_INDEX) {
+ /* max PHB read buffers 0-47 */
+ reg = 0xFFFFFFFFFFFF0000UL;
+ if (capp_eng & CAPP_MAX_DMA_READ_ENGINES)
+ reg = 0xF000000000000000UL;
+ xscom_write(p->chip_id, APC_FSM_READ_MASK + offset, reg);
+ xscom_write(p->chip_id, XPT_FSM_RMM + offset, reg);
+ }
+ if (p->index == CAPP1_PHB_INDEX) {
+
+ if (capp_eng & CAPP_MAX_DMA_READ_ENGINES) {
+ reg = 0xF000000000000000ULL;
+ } else if (link_width_x16) {
+ /* 0-47 (Read machines) are available for
+ * capp use
+ */
+ reg = 0x0000FFFFFFFFFFFFULL;
+ } else {
+ /* Set 30 Read machines for CAPP Minus
+ * 20-27 for DMA
+ */
+ reg = 0xFFFFF00E00000000ULL;
+ }
+ xscom_write(p->chip_id, APC_FSM_READ_MASK + offset, reg);
+ xscom_write(p->chip_id, XPT_FSM_RMM + offset, reg);
+ }
+
+ /* CAPP FIR Action 0 */
+ xscom_write(p->chip_id, CAPP_FIR_ACTION0 + offset, 0x0b1c000104060000UL);
+
+ /* CAPP FIR Action 1 */
+ xscom_write(p->chip_id, CAPP_FIR_ACTION1 + offset, 0x2b9c0001240E0000UL);
+
+ /* CAPP FIR MASK */
+ xscom_write(p->chip_id, CAPP_FIR_MASK + offset, 0x80031f98d8717000UL);
+
+ /* Mask the CAPP PSL Credit Timeout Register error */
+ xscom_write_mask(p->chip_id, CAPP_FIR_MASK + offset,
+ PPC_BIT(46), PPC_BIT(46));
+
+ /* Deassert TLBI_FENCED and tlbi_psl_is_dead */
+ xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, 0);
+}
+
+/* override some inits with CAPI defaults */
+static void phb4_init_capp_errors(struct phb4 *p)
+{
+ /* Init_77: TXE Error AIB Fence Enable Register */
+ if (phb4_is_dd20(p))
+ out_be64(p->regs + 0x0d30, 0xdfffbf0ff7ddfff0ull);
+ else
+ out_be64(p->regs + 0x0d30, 0xdff7bf0ff7ddfff0ull);
+ /* Init_86: RXE_ARB Error AIB Fence Enable Register */
+ out_be64(p->regs + 0x0db0, 0xfbffd7bbfb7fbfefull);
+
+ /* Init_95: RXE_MRG Error AIB Fence Enable Register */
+ out_be64(p->regs + 0x0e30, 0xfffffeffff7fff57ull);
+
+ /* Init_104: RXE_TCE Error AIB Fence Enable Register */
+ out_be64(p->regs + 0x0eb0, 0xffaeffafffffffffull);
+
+ /* Init_113: PHB Error AIB Fence Enable Register */
+ out_be64(p->regs + 0x0cb0, 0x35777073ff000000ull);
+}
+
+/*
+ * The capi, NBW and ASN indicators are used only on P9 to flag some
+ * types of incoming traffic for the PHB and have been removed on P10.
+ *
+ * The capi indicator is over the 8 most significant bits (and
+ * not 16). We stay away from bits 59 (TVE select), 60 and 61 (MSI)
+ *
+ * For the mask, we keep bit 59 in, as capi messages must hit TVE#0.
+ * Bit 56 is not part of the mask, so that a NBW message (see below)
+ * is also considered a capi message.
+ */
+#define CAPIIND 0x0200
+#define CAPIMASK 0xFE00
+
+/*
+ * Non-Blocking Write messages are a subset of capi messages, so the
+ * indicator is the same as capi + an extra bit (56) to differentiate.
+ * Mask is the same as capi + the extra bit
+ */
+#define NBWIND 0x0300
+#define NBWMASK 0xFF00
+
+/*
+ * The ASN indicator is used for tunneled operations (as_notify and
+ * atomics). Tunneled operation messages can be sent in PCI mode as
+ * well as CAPI mode.
+ *
+ * The format of those messages is specific and, for as_notify
+ * messages, the address field is hijacked to encode the LPID/PID/TID
+ * of the target thread, so those messages should not go through
+ * translation. They must hit TVE#1. Therefore bit 59 is part of the
+ * indicator.
+ */
+#define ASNIND 0x0C00
+#define ASNMASK 0xFF00
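+
+/*
+ * A minimal sketch of how these compare/mask pairs are meant to classify
+ * incoming traffic: the most-significant 16 address bits are compared
+ * against the indicator under the mask, matching the way PHB_CAPI_CMPM
+ * and PHB_ASN_CMPM are programmed below. With the values above, an NBW
+ * address (top bits 0x03xx) also matches the capi indicator because bit
+ * 56 is left out of CAPIMASK, while an ASN address (0x0Cxx) does not.
+ * Illustrative only.
+ */
+static inline bool phb_addr_matches_ind(uint64_t pci_addr, uint16_t ind,
+					uint16_t mask)
+{
+	uint16_t top = pci_addr >> 48;
+
+	return (top & mask) == ind;
+}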
+
+/* Power Bus Common Queue Registers
+ * All PBCQ and PBAIB registers are accessed via SCOM
+ * NestBase = 4010C00 for PEC0
+ * 4011000 for PEC1
+ * 4011400 for PEC2
+ * PCIBase = D010800 for PE0
+ * E010800 for PE1
+ * F010800 for PE2
+ *
+ * Some registers are shared amongst all of the stacks and will only
+ * have 1 copy. Other registers are implemented one per stack.
+ * Registers that are duplicated will have an additional offset
+ * of “StackBase” so that they have a unique address.
+ * Stackoffset = 00000040 for Stack0
+ * = 00000080 for Stack1
+ * = 000000C0 for Stack2
+ */
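+
+/*
+ * A minimal sketch of the addressing scheme described above: a per-stack
+ * nest register lives at NestBase(PEC) + StackOffset(stack) + register
+ * offset. The base and stack offsets are the values quoted in the comment;
+ * this helper is illustrative only and not used by the code below.
+ */
+static inline uint64_t pec_stack_nest_addr(unsigned int pec,
+					   unsigned int stack,
+					   uint64_t reg_offset)
+{
+	static const uint64_t nest_base[3] = {
+		0x4010C00, 0x4011000, 0x4011400,
+	};
+	static const uint64_t stack_offset[3] = {
+		0x00000040, 0x00000080, 0x000000C0,
+	};
+
+	return nest_base[pec] + stack_offset[stack] + reg_offset;
+}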
+static int64_t enable_capi_mode(struct phb4 *p, uint64_t pe_number,
+ uint32_t capp_eng)
+{
+ uint64_t addr, reg, start_addr, end_addr, stq_eng, dma_eng;
+ uint64_t mbt0, mbt1;
+ int i, window_num = -1;
+
+ /* CAPP Control Register */
+ xscom_read(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, &reg);
+ if (reg & PPC_BIT(0)) {
+ PHBDBG(p, "Already in CAPP mode\n");
+ }
+
+ for (i = 0; i < 500000; i++) {
+ /* PBCQ General Status Register */
+ xscom_read(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_PBCQ_STAT,
+ &reg);
+ if (!(reg & 0xC000000000000000UL))
+ break;
+ time_wait_us(10);
+ }
+ if (reg & 0xC000000000000000UL) {
+ PHBERR(p, "CAPP: Timeout waiting for pending transaction\n");
+ return OPAL_HARDWARE;
+ }
+
+ stq_eng = 0x0000000000000000ULL;
+ dma_eng = 0x0000000000000000ULL;
+ if (p->index == CAPP0_PHB_INDEX) {
+ /* PBCQ is operating as a x16 stack
+ * - The maximum number of engines given to CAPP will be
+ * 14 and will be assigned in the order of STQ 15 to 2.
+ * - 0-47 (Read machines) are available for capp use.
+ */
+ stq_eng = 0x000E000000000000ULL; /* 14 CAPP msg engines */
+ dma_eng = 0x0000FFFFFFFFFFFFULL; /* 48 CAPP Read machines */
+ }
+
+ if (p->index == CAPP1_PHB_INDEX) {
+ /* Check if PEC is in x8 or x16 mode */
+ addr = XPEC_P9_PCI_CPLT_CONF1 + 2 * XPEC_PCI_CPLT_OFFSET;
+ xscom_read(p->chip_id, addr, &reg);
+ if ((reg & XPEC_P9_PCI_IOVALID_MASK) == XPEC_P9_PCI_IOVALID_X16) {
+ /* PBCQ is operating as a x16 stack
+ * - The maximum number of engines given to CAPP will be
+ * 14 and will be assigned in the order of STQ 15 to 2.
+ * - 0-47 (Read machines) are available for capp use.
+ */
+ stq_eng = 0x000E000000000000ULL;
+ dma_eng = 0x0000FFFFFFFFFFFFULL;
+ } else {
+
+ /* PBCQ is operating as a x8 stack
+ * - The maximum number of engines given to CAPP should
+ * be 6 and will be assigned in the order of 7 to 2.
+ * - 0-30 (Read machines) are available for capp use.
+ */
+ stq_eng = 0x0006000000000000ULL;
+ /* 30 Read machines for CAPP Minus 20-27 for DMA */
+ dma_eng = 0x0000FFFFF00E0000ULL;
+ }
+ }
+
+ if (capp_eng & CAPP_MIN_STQ_ENGINES)
+ stq_eng = 0x0002000000000000ULL; /* 2 capp msg engines */
+
+ /* CAPP Control Register. Enable CAPP Mode */
+ reg = 0x8000000000000000ULL; /* PEC works in CAPP Mode */
+ reg |= stq_eng;
+ if (capp_eng & CAPP_MAX_DMA_READ_ENGINES)
+ dma_eng = 0x0000F00000000000ULL; /* 4 CAPP Read machines */
+ reg |= dma_eng;
+ xscom_write(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, reg);
+
+ /* PEC2 has 3 ETUs + 16 PCI lanes that can operate in x16,
+ * x8+x8 (bifurcated) or x8+x4+x4 (trifurcated) mode. When a
+ * Mellanox CX5 card is attached to stack0 of this PEC (indicated by
+ * a request to allocate CAPP_MAX_DMA_READ_ENGINES), we tweak the default
+ * DMA-read engine allocation to maximize DMA read performance
+ */
+ if ((p->index == CAPP1_PHB_INDEX) &&
+ (capp_eng & CAPP_MAX_DMA_READ_ENGINES))
+ phb4_pec2_dma_engine_realloc(p);
+
+ /* PCI to PB data movement ignores the PB init signal. */
+ xscom_write_mask(p->chip_id, p->pe_xscom + XPEC_NEST_PBCQ_HW_CONFIG,
+ XPEC_NEST_PBCQ_HW_CONFIG_PBINIT,
+ XPEC_NEST_PBCQ_HW_CONFIG_PBINIT);
+
+ /* If pump mode is enabled, don't do nodal broadcasts. */
+ xscom_read(p->chip_id, PB_CENT_HP_MODE_CURR, &reg);
+ if (reg & PB_CFG_PUMP_MODE) {
+ reg = XPEC_NEST_PBCQ_HW_CONFIG_DIS_NODAL;
+ reg |= XPEC_NEST_PBCQ_HW_CONFIG_DIS_RNNN;
+ xscom_write_mask(p->chip_id,
+ p->pe_xscom + XPEC_NEST_PBCQ_HW_CONFIG,
+ reg, reg);
+ }
+
+ /* PEC Phase 4 (PHB) registers adjustment
+ * Inbound CAPP traffic: The CAPI can send both CAPP packets and
+ * I/O packets. A PCIe packet is identified as a CAPP packet in
+ * the PHB if the PCIe address matches either the CAPI
+ * Compare/Mask register or its NBW Compare/Mask register.
+ */
+
+ /*
+ * Bit [0:7] XSL_DSNCTL[capiind]
+ * Init_26 - CAPI Compare/Mask
+ */
+ out_be64(p->regs + PHB_CAPI_CMPM,
+ ((u64)CAPIIND << 48) |
+ ((u64)CAPIMASK << 32) | PHB_CAPI_CMPM_ENABLE);
+
+ /* PB AIB Hardware Control Register
+ * Wait 32 PCI clocks for a credit to become available
+ * before rejecting.
+ */
+ xscom_read(p->chip_id, p->pci_xscom + XPEC_PCI_PBAIB_HW_CONFIG, &reg);
+ reg |= PPC_BITMASK(40, 42);
+ if (p->index == CAPP1_PHB_INDEX)
+ reg |= PPC_BIT(30);
+ xscom_write(p->chip_id, p->pci_xscom + XPEC_PCI_PBAIB_HW_CONFIG, reg);
+
+ /* non-translate/50-bit mode */
+ out_be64(p->regs + PHB_NXLATE_PREFIX, 0x0000000000000000Ull);
+
+ /* Set the TVEs to no-translate mode and allow the MMIO window */
+ memset(p->tve_cache, 0x0, sizeof(p->tve_cache));
+
+ /*
+ * In 50-bit non-translate mode, the fields of the TVE are
+ * used to perform an address range check. In this mode TCE
+ * Table Size(0) must be a '1' (TVE[51] = 1)
+ * PCI Addr(49:24) >= TVE[52:53]+TVE[0:23] and
+ * PCI Addr(49:24) < TVE[54:55]+TVE[24:47]
+ *
+ * TVE[51] = 1
+ * TVE[56] = 1: 50-bit Non-Translate Mode Enable
+ * TVE[0:23] = 0x000000
+ * TVE[24:47] = 0xFFFFFF
+ *
+ * capi dma mode: CAPP DMA mode needs access to all of memory
+ * capi mode: Allow address range (bit 14 = 1)
+ * 0x0002000000000000: 0x0002FFFFFFFFFFFF
+ * TVE[52:53] = '10' and TVE[54:55] = '10'
+ */
+
+ /* TVT#0: CAPI window + DMA, all memory */
+ start_addr = 0ull;
+ end_addr = 0x0003ffffffffffffull;
+ p->tve_cache[pe_number * 2] =
+ tve_encode_50b_noxlate(start_addr, end_addr);
+
+ /* TVT#1: CAPI window + DMA, all memory, in bypass mode */
+ start_addr = (1ull << 59);
+ end_addr = start_addr + 0x0003ffffffffffffull;
+ p->tve_cache[pe_number * 2 + 1] =
+ tve_encode_50b_noxlate(start_addr, end_addr);
+
+ phb4_ioda_sel(p, IODA3_TBL_TVT, 0, true);
+ for (i = 0; i < p->tvt_size; i++)
+ out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]);
+
+ /*
+ * Since TVT#0 is in by-pass mode, disable 32-bit MSI, as a
+ * DMA write targeting 0x00000000FFFFxxxx would be interpreted
+ * as a 32-bit MSI
+ */
+ reg = in_be64(p->regs + PHB_PHB4_CONFIG);
+ reg &= ~PHB_PHB4C_32BIT_MSI_EN;
+ out_be64(p->regs + PHB_PHB4_CONFIG, reg);
+
+ /* Set the MBT BAR to pass the CAPI MMIO window and keep the
+ * other MMIO values
+ */
+ mbt0 = IODA3_MBT0_ENABLE | IODA3_MBT0_TYPE_M64 |
+ SETFIELD(IODA3_MBT0_MODE, 0ull, IODA3_MBT0_MODE_SINGLE_PE) |
+ SETFIELD(IODA3_MBT0_MDT_COLUMN, 0ull, 0) |
+ (0x0002000000000000ULL & IODA3_MBT0_BASE_ADDR);
+
+ mbt1 = IODA3_MBT1_ENABLE |
+ (0x00ff000000000000ULL & IODA3_MBT1_MASK) |
+ SETFIELD(IODA3_MBT1_SINGLE_PE_NUM, 0ull, pe_number);
+
+ for (i = 0; i < p->mbt_size; i++) {
+ /* search if the capi mmio window is already present */
+ if ((p->mbt_cache[i][0] == mbt0) &&
+ (p->mbt_cache[i][1] == mbt1))
+ break;
+
+ /* search a free entry */
+ if ((window_num == -1) &&
+ ((!(p->mbt_cache[i][0] & IODA3_MBT0_ENABLE)) &&
+ (!(p->mbt_cache[i][1] & IODA3_MBT1_ENABLE))))
+ window_num = i;
+ }
+
+ if (window_num >= 0 && i == p->mbt_size) {
+ /* no capi mmio window found, so add it */
+ p->mbt_cache[window_num][0] = mbt0;
+ p->mbt_cache[window_num][1] = mbt1;
+
+ phb4_ioda_sel(p, IODA3_TBL_MBT, window_num << 1, true);
+ out_be64(p->regs + PHB_IODA_DATA0, mbt0);
+ out_be64(p->regs + PHB_IODA_DATA0, mbt1);
+ } else if (i == p->mbt_size) {
+ /* mbt cache full, this case should never happen */
+ PHBERR(p, "CAPP: Failed to add CAPI mmio window\n");
+ } else {
+ /* duplicate entry. Nothing to do */
+ }
+
+ phb4_init_capp_errors(p);
+
+ phb4_init_capp_regs(p, capp_eng);
+
+ if (!chiptod_capp_timebase_sync(p->chip_id, CAPP_TFMR,
+ CAPP_TB,
+ PHB4_CAPP_REG_OFFSET(p)))
+ PHBERR(p, "CAPP: Failed to sync timebase\n");
+
+ /* set callbacks to handle HMI events */
+ capi_ops.get_capp_info = &phb4_get_capp_info;
+
+ return OPAL_SUCCESS;
+}
+
+
+static int64_t phb4_init_capp(struct phb4 *p)
+{
+ struct capp *capp;
+ int rc;
+
+ if (p->index != CAPP0_PHB_INDEX &&
+ p->index != CAPP1_PHB_INDEX)
+ return OPAL_UNSUPPORTED;
+
+ capp = zalloc(sizeof(struct capp));
+ if (capp == NULL)
+ return OPAL_NO_MEM;
+
+ if (p->index == CAPP0_PHB_INDEX) {
+ capp->capp_index = 0;
+ capp->capp_xscom_offset = 0;
+
+ } else if (p->index == CAPP1_PHB_INDEX) {
+ capp->capp_index = 1;
+ capp->capp_xscom_offset = CAPP1_REG_OFFSET;
+ }
+
+ capp->attached_pe = phb4_get_reserved_pe_number(&p->phb);
+ capp->chip_id = p->chip_id;
+
+ /* Load capp microcode into the capp unit */
+ rc = load_capp_ucode(p);
+
+ if (rc == OPAL_SUCCESS)
+ p->capp = capp;
+ else
+ free(capp);
+
+ return rc;
+}
+
+static int64_t phb4_set_capi_mode(struct phb *phb, uint64_t mode,
+ uint64_t pe_number)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ struct proc_chip *chip = get_chip(p->chip_id);
+ struct capp *capp = p->capp;
+ uint64_t reg, ret;
+
+ /* No CAPI on P10. OpenCAPI only */
+ if (is_phb5())
+ return OPAL_UNSUPPORTED;
+
+ /* Can't do a mode switch while CAPP is in recovery mode */
+ ret = capp_xscom_read(capp, CAPP_ERR_STATUS_CTRL, &reg);
+ if (ret != OPAL_SUCCESS)
+ return ret;
+
+ if ((reg & PPC_BIT(0)) && (!(reg & PPC_BIT(1)))) {
+ PHBDBG(p, "CAPP: recovery in progress\n");
+ return OPAL_BUSY;
+ }
+
+
+ switch (mode) {
+
+ case OPAL_PHB_CAPI_MODE_DMA: /* Enabled by default on p9 */
+ case OPAL_PHB_CAPI_MODE_SNOOP_ON:
+ /* nothing to do on P9 if CAPP is already enabled */
+ ret = p->capp->phb ? OPAL_SUCCESS : OPAL_UNSUPPORTED;
+ break;
+
+ case OPAL_PHB_CAPI_MODE_SNOOP_OFF:
+ ret = p->capp->phb ? OPAL_UNSUPPORTED : OPAL_SUCCESS;
+ break;
+
+ case OPAL_PHB_CAPI_MODE_PCIE:
+ if (p->flags & PHB4_CAPP_DISABLE) {
+ /* We are in middle of a CAPP disable */
+ ret = OPAL_BUSY;
+
+ } else if (capp->phb) {
+ /* Kick start a creset */
+ p->flags |= PHB4_CAPP_DISABLE;
+ PHBINF(p, "CAPP: PCIE mode needs a cold-reset\n");
+ /* Kick off the pci state machine */
+ ret = phb4_creset(phb->slot);
+ ret = ret > 0 ? OPAL_BUSY : ret;
+
+ } else {
+ /* PHB already in PCI mode */
+ ret = OPAL_SUCCESS;
+ }
+ break;
+
+ case OPAL_PHB_CAPI_MODE_CAPI: /* Fall Through */
+ case OPAL_PHB_CAPI_MODE_DMA_TVT1:
+ /* Make sure that PHB is not disabling CAPP */
+ if (p->flags & PHB4_CAPP_DISABLE) {
+ PHBERR(p, "CAPP: Disable in progress\n");
+ ret = OPAL_BUSY;
+ break;
+ }
+
+ /* Check if ucode is available */
+ if (!capp_ucode_loaded(chip, p->index)) {
+ PHBERR(p, "CAPP: ucode not loaded\n");
+ ret = OPAL_RESOURCE;
+ break;
+ }
+
+ /*
+ * Mark the CAPP as attached to the PHB right away so that
+ * if an MCE happens during CAPP init we can handle it.
+ * In case of an error in CAPP init we remove the PHB
+ * from the attached_mask later.
+ */
+ capp->phb = phb;
+ capp->attached_pe = pe_number;
+
+ if (mode == OPAL_PHB_CAPI_MODE_DMA_TVT1)
+ ret = enable_capi_mode(p, pe_number,
+ CAPP_MIN_STQ_ENGINES |
+ CAPP_MAX_DMA_READ_ENGINES);
+
+ else
+ ret = enable_capi_mode(p, pe_number,
+ CAPP_MAX_STQ_ENGINES |
+ CAPP_MIN_DMA_READ_ENGINES);
+ if (ret == OPAL_SUCCESS) {
+ /* register notification on system shutdown */
+ opal_add_host_sync_notifier(&phb4_host_sync_reset, p);
+
+ } else {
+ /* In case of an error mark the PHB detached */
+ capp->phb = NULL;
+ capp->attached_pe = phb4_get_reserved_pe_number(phb);
+ }
+ break;
+
+ default:
+ ret = OPAL_UNSUPPORTED;
+ break;
+ };
+
+ return ret;
+}
+
+static void phb4_p2p_set_initiator(struct phb4 *p, uint16_t pe_number)
+{
+ uint64_t tve;
+ uint16_t window_id = (pe_number << 1) + 1;
+
+ /*
+ * Initiator needs access to the MMIO space of the target,
+ * which is well beyond the 'normal' memory area. Set its TVE
+ * with no range checking.
+ */
+ PHBDBG(p, "Setting TVE#1 for peer-to-peer for pe %d\n", pe_number);
+ tve = PPC_BIT(51);
+ phb4_ioda_sel(p, IODA3_TBL_TVT, window_id, false);
+ out_be64(p->regs + PHB_IODA_DATA0, tve);
+ p->tve_cache[window_id] = tve;
+}
+
+static void phb4_p2p_set_target(struct phb4 *p, bool enable)
+{
+ uint64_t val;
+
+ /*
+ * Enabling p2p on a target PHB reserves an outbound (as seen
+ * from the CPU) store queue for p2p
+ */
+ PHBDBG(p, "%s peer-to-peer\n", (enable ? "Enabling" : "Disabling"));
+ xscom_read(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_PBCQ_MODE, &val);
+ if (enable)
+ val |= XPEC_NEST_STK_PBCQ_MODE_P2P;
+ else
+ val &= ~XPEC_NEST_STK_PBCQ_MODE_P2P;
+ xscom_write(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_PBCQ_MODE, val);
+}
+
+static void phb4_set_p2p(struct phb *phb, uint64_t mode, uint64_t flags,
+ uint16_t pe_number)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+
+ switch (mode) {
+ case OPAL_PCI_P2P_INITIATOR:
+ if (flags & OPAL_PCI_P2P_ENABLE)
+ phb4_p2p_set_initiator(p, pe_number);
+ /*
+ * When disabling p2p on the initiator, we should
+ * reset the TVE to its default bypass setting, but it
+ * is more easily done from the OS, as it knows the
+ * start and end address and there's already an
+ * OPAL call for it, so let Linux handle it.
+ */
+ break;
+ case OPAL_PCI_P2P_TARGET:
+ phb4_p2p_set_target(p, !!(flags & OPAL_PCI_P2P_ENABLE));
+ break;
+ default:
+ assert(0);
+ }
+}
+
+static int64_t phb4_set_capp_recovery(struct phb *phb)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+
+ if (p->flags & PHB4_CAPP_RECOVERY)
+ return 0;
+
+ /* set opal event flag to indicate eeh condition */
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+
+ p->flags |= PHB4_CAPP_RECOVERY;
+
+ return 0;
+}
+
+/*
+ * Return the address out of a PBCQ Tunnel Bar register.
+ */
+static void phb4_get_tunnel_bar(struct phb *phb, uint64_t *addr)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t val;
+
+ xscom_read(p->chip_id, p->pe_stk_xscom + XPEC_NEST_STK_TUNNEL_BAR,
+ &val);
+ *addr = val >> 8;
+}
+
+/*
+ * Set PBCQ Tunnel Bar register.
+ * Store addr bits [8:50] in PBCQ Tunnel Bar register bits [0:42].
+ * Note that addr bits [8:50] must also match PSL_TNR_ADDR[8:50].
+ * Reset register if val == 0.
+ *
+ * This interface is required to let device drivers set the Tunnel Bar
+ * value of their choice.
+ *
+ * Compatibility with older versions of Linux that do not set the
+ * Tunnel Bar with phb4_set_tunnel_bar() is ensured by enable_capi_mode(),
+ * which will set the default value that used to be assumed.
+ */
+static int64_t phb4_set_tunnel_bar(struct phb *phb, uint64_t addr)
+{
+ struct phb4 *p = phb_to_phb4(phb);
+ uint64_t mask = 0x00FFFFFFFFFFE000ULL;
+
+ if (!addr) {
+ /* Reset register */
+ xscom_write(p->chip_id,
+ p->pe_stk_xscom + XPEC_NEST_STK_TUNNEL_BAR, addr);
+ return OPAL_SUCCESS;
+ }
+ if ((addr & ~mask))
+ return OPAL_PARAMETER;
+ if (!(addr & mask))
+ return OPAL_PARAMETER;
+
+ xscom_write(p->chip_id, p->pe_stk_xscom + XPEC_NEST_STK_TUNNEL_BAR,
+ (addr & mask) << 8);
+ return OPAL_SUCCESS;
+}
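+
+/*
+ * A minimal round-trip sketch of the packing above: address bits [8:50]
+ * (the mask 0x00FFFFFFFFFFE000) are shifted up by 8 into Tunnel Bar bits
+ * [0:42] on write, and shifted back down on read. Illustrative only.
+ */
+static inline bool tunnel_bar_round_trips(uint64_t addr)
+{
+	const uint64_t mask = 0x00FFFFFFFFFFE000ULL;
+	uint64_t reg_val, readback;
+
+	/* Addresses with bits outside [8:50] are rejected by the setter */
+	if (addr & ~mask)
+		return false;
+
+	reg_val = (addr & mask) << 8;	/* as written by phb4_set_tunnel_bar() */
+	readback = reg_val >> 8;	/* as read by phb4_get_tunnel_bar() */
+
+	return readback == addr;
+}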
+
+static const struct phb_ops phb4_ops = {
+ .cfg_read8 = phb4_pcicfg_read8,
+ .cfg_read16 = phb4_pcicfg_read16,
+ .cfg_read32 = phb4_pcicfg_read32,
+ .cfg_write8 = phb4_pcicfg_write8,
+ .cfg_write16 = phb4_pcicfg_write16,
+ .cfg_write32 = phb4_pcicfg_write32,
+ .get_reserved_pe_number = phb4_get_reserved_pe_number,
+ .device_init = phb4_device_init,
+ .device_remove = NULL,
+ .ioda_reset = phb4_ioda_reset,
+ .papr_errinjct_reset = phb4_papr_errinjct_reset,
+ .pci_reinit = phb4_pci_reinit,
+ .set_phb_mem_window = phb4_set_phb_mem_window,
+ .phb_mmio_enable = phb4_phb_mmio_enable,
+ .map_pe_mmio_window = phb4_map_pe_mmio_window,
+ .map_pe_dma_window = phb4_map_pe_dma_window,
+ .map_pe_dma_window_real = phb4_map_pe_dma_window_real,
+ .set_option = phb4_set_option,
+ .get_option = phb4_get_option,
+ .set_xive_pe = phb4_set_ive_pe,
+ .get_msi_32 = phb4_get_msi_32,
+ .get_msi_64 = phb4_get_msi_64,
+ .set_pe = phb4_set_pe,
+ .set_peltv = phb4_set_peltv,
+ .eeh_freeze_status = phb4_eeh_freeze_status,
+ .eeh_freeze_clear = phb4_eeh_freeze_clear,
+ .eeh_freeze_set = phb4_eeh_freeze_set,
+ .next_error = phb4_eeh_next_error,
+ .err_inject = phb4_err_inject,
+ .get_diag_data2 = phb4_get_diag_data,
+ .tce_kill = phb4_tce_kill,
+ .set_capi_mode = phb4_set_capi_mode,
+ .set_p2p = phb4_set_p2p,
+ .set_capp_recovery = phb4_set_capp_recovery,
+ .get_tunnel_bar = phb4_get_tunnel_bar,
+ .set_tunnel_bar = phb4_set_tunnel_bar,
+};
+
+static void phb4_init_ioda3(struct phb4 *p)
+{
+ if (is_phb5()) {
+ /*
+ * When ABT is on, the MSIs on the PHB use the PQ state bits
+ * of the IC and MSI triggers from the PHB are forwarded
+ * directly to the IC ESB page. However, the LSIs are still
+ * controlled locally on the PHB and LSI triggers use a
+ * special offset for trigger injection.
+ */
+ if (phb_abt_mode(p)) {
+ uint64_t mmio_base = xive2_get_esb_base(p->base_msi);
+
+ PHBDBG(p, "Using ABT mode. ESB: 0x%016llx\n", mmio_base);
+
+ /* Init_18 - Interrupt Notify Base Address */
+ out_be64(p->regs + PHB_INT_NOTIFY_ADDR,
+ PHB_INT_NOTIFY_ADDR_64K | mmio_base);
+
+ /* Interrupt Notify Base Index is unused */
+ } else {
+ p->irq_port = xive2_get_notify_port(p->chip_id,
+ XIVE_HW_SRC_PHBn(p->index));
+
+ PHBDBG(p, "Using IC notif page at 0x%016llx\n",
+ p->irq_port);
+
+ /* Init_18 - Interrupt Notify Base Address */
+ out_be64(p->regs + PHB_INT_NOTIFY_ADDR, p->irq_port);
+
+ /* Init_19 - Interrupt Notify Base Index */
+ out_be64(p->regs + PHB_INT_NOTIFY_INDEX,
+ xive2_get_notify_base(p->base_msi));
+ }
+
+ } else { /* p9 */
+ p->irq_port = xive_get_notify_port(p->chip_id,
+ XIVE_HW_SRC_PHBn(p->index));
+ /* Init_18 - Interrupt Notify Base Address */
+ out_be64(p->regs + PHB_INT_NOTIFY_ADDR, p->irq_port);
+
+ /* Init_19 - Interrupt Notify Base Index */
+ out_be64(p->regs + PHB_INT_NOTIFY_INDEX,
+ xive_get_notify_base(p->base_msi));
+ }
+
+ /* Init_19x - Not in spec: Initialize source ID */
+ PHBDBG(p, "Reset state SRC_ID: %016llx\n",
+ in_be64(p->regs + PHB_LSI_SOURCE_ID));
+ out_be64(p->regs + PHB_LSI_SOURCE_ID,
+ SETFIELD(PHB_LSI_SRC_ID, 0ull, (p->num_irqs - 1) >> 3));
+
+ /* Init_20 - RTT BAR */
+ out_be64(p->regs + PHB_RTT_BAR, (u64) p->tbl_rtt | PHB_RTT_BAR_ENABLE);
+
+ /* Init_21 - PELT-V BAR */
+ out_be64(p->regs + PHB_PELTV_BAR,
+ (u64) p->tbl_peltv | PHB_PELTV_BAR_ENABLE);
+
+ /* Init_22 - Setup M32 starting address */
+ out_be64(p->regs + PHB_M32_START_ADDR, M32_PCI_START);
+
+ /* Init_23 - Setup PEST BAR */
+ out_be64(p->regs + PHB_PEST_BAR,
+ p->tbl_pest | PHB_PEST_BAR_ENABLE);
+
+ /* Init_24 - CRW Base Address Reg */
+ /* See enable_capi_mode() */
+
+ if (is_phb4()) {
+ /* Init_25 - ASN Compare/Mask - P9 only */
+ out_be64(p->regs + PHB_ASN_CMPM, ((u64)ASNIND << 48) |
+ ((u64)ASNMASK << 32) | PHB_ASN_CMPM_ENABLE);
+ }
+
+ /* Init_26 - CAPI Compare/Mask */
+ /* See enable_capi_mode() */
+ /* if CAPP being disabled then reset CAPI Compare/Mask Register */
+ if (p->flags & PHB4_CAPP_DISABLE)
+ out_be64(p->regs + PHB_CAPI_CMPM, 0);
+
+ /* Init_27 - PCIE Outbound upper address */
+ out_be64(p->regs + PHB_M64_UPPER_BITS, 0);
+
+ /* Init_28 - PHB4 Configuration */
+ out_be64(p->regs + PHB_PHB4_CONFIG,
+ PHB_PHB4C_32BIT_MSI_EN |
+ PHB_PHB4C_64BIT_MSI_EN);
+
+ /* Init_29 - At least 256ns delay according to spec. Do a dummy
+ * read first to flush posted writes
+ */
+ in_be64(p->regs + PHB_PHB4_CONFIG);
+ time_wait_us(2);
+
+ /* Init_30..41 - On-chip IODA tables init */
+ phb4_ioda_reset(&p->phb, false);
+}
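+
+/*
+ * Example of the Init_19x source ID computation above (the interrupt
+ * count is hypothetical): if the PHB reports 2048 interrupts, then
+ * (2048 - 1) >> 3 = 255 is the value programmed into PHB_LSI_SRC_ID.
+ */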
+
+/* phb4_init_rc_cfg - Initialize the Root Complex config space
+ */
+static bool phb4_init_rc_cfg(struct phb4 *p)
+{
+ int64_t ecap, aercap;
+
+ /* XXX Handle errors ? */
+
+ /* Init_46:
+ *
+ * Set primary bus to 0, secondary to 1 and subordinate to 0xff
+ */
+ phb4_pcicfg_write32(&p->phb, 0, PCI_CFG_PRIMARY_BUS, 0x00ff0100);
+
+ /* Init_47 - Clear errors */
+ /* see phb4_rc_err_clear() called below */
+
+ /* Init_48
+ *
+ * PCIE Device control/status, enable error reporting, disable relaxed
+ * ordering, set MPS to 128 (see note), clear errors.
+ *
+ * Note: The doc recommends setting MPS to 512. This has proved to have
+ * some issues as it requires specific clamping of MRRS on devices and
+ * we've found devices in the field that misbehave when doing that.
+ *
+ * We currently leave it at 128 bytes (the minimum setting) at init
+ * time. The generic PCIe probing later on, or the kernel, might apply
+ * a different value, but we play it safe at early init.
+ */
+ if (p->ecap <= 0) {
+ ecap = pci_find_cap(&p->phb, 0, PCI_CFG_CAP_ID_EXP);
+ if (ecap < 0) {
+ PHBERR(p, "Can't locate PCI-E capability\n");
+ return false;
+ }
+ p->ecap = ecap;
+ } else {
+ ecap = p->ecap;
+ }
+
+ phb4_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVCTL,
+ PCICAP_EXP_DEVCTL_CE_REPORT |
+ PCICAP_EXP_DEVCTL_NFE_REPORT |
+ PCICAP_EXP_DEVCTL_FE_REPORT |
+ PCICAP_EXP_DEVCTL_UR_REPORT |
+ SETFIELD(PCICAP_EXP_DEVCTL_MPS, 0, PCIE_MPS_128B));
+
+ /* Init_49 - Device Control/Status 2 */
+ phb4_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DCTL2,
+ SETFIELD(PCICAP_EXP_DCTL2_CMPTOUT, 0, 0x5) |
+ PCICAP_EXP_DCTL2_ARI_FWD);
+
+ /* Init_50..54
+ *
+ * AER inits
+ */
+ if (p->aercap <= 0) {
+ aercap = pci_find_ecap(&p->phb, 0, PCIECAP_ID_AER, NULL);
+ if (aercap < 0) {
+ PHBERR(p, "Can't locate AER capability\n");
+ return false;
+ }
+ p->aercap = aercap;
+ } else {
+ aercap = p->aercap;
+ }
+
+ /* Disable some error reporting as per the PHB4 spec */
+ phb4_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_MASK,
+ PCIECAP_AER_UE_POISON_TLP |
+ PCIECAP_AER_UE_COMPL_TIMEOUT |
+ PCIECAP_AER_UE_COMPL_ABORT);
+
+ /* Enable ECRC generation & checking */
+ phb4_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CAPCTL,
+ PCIECAP_AER_CAPCTL_ECRCG_EN |
+ PCIECAP_AER_CAPCTL_ECRCC_EN);
+
+ phb4_rc_err_clear(p);
+
+ return true;
+}
+
+static void phb4_init_errors(struct phb4 *p)
+{
+ /* Init_55..63 - PBL errors */
+ out_be64(p->regs + 0x1900, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x1908, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1920, 0x000000004d1780f8ull);
+ out_be64(p->regs + 0x1928, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1930, 0xffffffffb2f87f07ull);
+ out_be64(p->regs + 0x1940, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1948, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1950, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1958, 0x0000000000000000ull);
+
+ /* Init_64..72 - REGB errors */
+ out_be64(p->regs + 0x1c00, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x1c08, 0x0000000000000000ull);
+ /* Enable/disable error status indicators that trigger irqs */
+ if (p->has_link) {
+ out_be64(p->regs + 0x1c20, 0x2130006efca8bc00ull);
+ out_be64(p->regs + 0x1c30, 0xde1fff91035743ffull);
+ } else {
+ out_be64(p->regs + 0x1c20, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1c30, 0x0000000000000000ull);
+ }
+ out_be64(p->regs + 0x1c28, 0x0080000000000000ull);
+ out_be64(p->regs + 0x1c40, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1c48, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1c50, 0x0000000000000000ull);
+ out_be64(p->regs + 0x1c58, 0x0040000000000000ull);
+
+ /* Init_73..81 - TXE errors */
+ out_be64(p->regs + 0x0d08, 0x0000000000000000ull);
+
+ /* Errata: Clear bit 17, otherwise a CFG write UR/CA will incorrectly
+ * freeze a "random" PE (whichever PE last did an MMIO)
+ */
+ if (is_phb5()) {
+ out_be64(p->regs + 0x0d28, 0x0000500a00000000ull);
+ out_be64(p->regs + 0x0d00, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0d18, 0xffffff0fffffffffull);
+ out_be64(p->regs + 0x0d30, 0xdff7af41f7ddffdfull);
+ } else {
+ out_be64(p->regs + 0x0d28, 0x0000000a00000000ull);
+ if (phb4_is_dd20(p)) {
+ out_be64(p->regs + 0x0d00, 0xf3acff0ff7ddfff0ull);
+ out_be64(p->regs + 0x0d18, 0xf3acff0ff7ddfff0ull);
+ out_be64(p->regs + 0x0d30, 0xdfffbd05f7ddfff0ull); /* XXX CAPI has diff. value */
+ } else {
+ out_be64(p->regs + 0x0d00, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0d18, 0xffffff0fffffffffull);
+ out_be64(p->regs + 0x0d30, 0xdff7bd05f7ddfff0ull);
+ }
+ }
+
+ out_be64(p->regs + 0x0d40, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0d48, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0d50, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0d58, 0x0000000000000000ull);
+
+ /* Init_82..90 - RXE_ARB errors */
+ out_be64(p->regs + 0x0d80, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0d88, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0d98, 0xfffffffffbffffffull);
+ out_be64(p->regs + 0x0da8, 0xc00018b801000060ull);
+ /*
+ * Errata ER20161123 says we should set the top two bits in
+ * 0x0db0 but this causes config space accesses which don't
+ * get a response to fence the PHB. This breaks probing,
+ * hence we don't set them here.
+ */
+ out_be64(p->regs + 0x0db0, 0x3bffd703fa7fbf8full); /* XXX CAPI has diff. value */
+ out_be64(p->regs + 0x0dc0, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0dc8, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0dd0, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0dd8, 0x0000000004000000ull);
+
+ /* Init_91..99 - RXE_MRG errors */
+ out_be64(p->regs + 0x0e00, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0e08, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0e18, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0e28, 0x0000600000000000ull);
+ out_be64(p->regs + 0x0e30, 0xfffffeffff7fff57ull);
+ out_be64(p->regs + 0x0e40, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0e48, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0e50, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0e58, 0x0000000000000000ull);
+
+ /* Init_100..108 - RXE_TCE errors */
+ out_be64(p->regs + 0x0e80, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0e88, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0e98, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0ea8, 0x60000000c0000000ull);
+ out_be64(p->regs + 0x0eb0, 0x9faeffaf3fffffffull); /* XXX CAPI has diff. value */
+ out_be64(p->regs + 0x0ec0, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0ec8, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0ed0, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0ed8, 0x0000000000000000ull);
+
+ /* Init_109..117 - RXPHB errors */
+ out_be64(p->regs + 0x0c80, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0c88, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0c98, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0ca8, 0x0000004000000000ull);
+ out_be64(p->regs + 0x0cb0, 0x35777033ff000000ull); /* XXX CAPI has diff. value */
+ out_be64(p->regs + 0x0cc0, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0cc8, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0cd0, 0x0000000000000000ull);
+ out_be64(p->regs + 0x0cd8, 0x0000000000000000ull);
+
+ /* Init_118..121 - LEM */
+ out_be64(p->regs + 0x0c00, 0x0000000000000000ull);
+ if (phb4_is_dd20(p)) {
+ out_be64(p->regs + 0x0c30, 0xf3ffffffffffffffull);
+ out_be64(p->regs + 0x0c38, 0xf3ffffffffffffffull);
+ } else {
+ out_be64(p->regs + 0x0c30, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x0c38, 0xffffffffffffffffull);
+ }
+ out_be64(p->regs + 0x0c40, 0x0000000000000000ull);
+}
+
+
+static bool phb4_wait_dlp_reset(struct phb4 *p)
+{
+ unsigned int i;
+ uint64_t val;
+
+ /*
+ * Firmware cannot access the UTL core regs or PCI config space
+ * until the cores are out of DL_PGRESET.
+ * DL_PGRESET should be polled until it is inactive with a value
+ * of '0'. The recommended polling frequency is once every 1ms.
+ * Firmware should make at least 200 polling attempts before
+ * giving up.
+ * MMIO stores to the link are silently dropped by the UTL core if
+ * the link is down.
+ * MMIO loads to the link will be dropped by the UTL core and will
+ * eventually time out, returning an all-ones response if the
+ * link is down.
+ */
+#define DLP_RESET_ATTEMPTS 200
+
+ PHBDBG(p, "Waiting for DLP PG reset to complete...\n");
+ for (i = 0; i < DLP_RESET_ATTEMPTS; i++) {
+ val = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL);
+ if (!(val & PHB_PCIE_DLP_DL_PGRESET))
+ break;
+ time_wait_ms(1);
+ }
+ if (val & PHB_PCIE_DLP_DL_PGRESET) {
+ PHBERR(p, "Timeout waiting for DLP PG reset !\n");
+ return false;
+ }
+ return true;
+}
+
+static void phb4_init_hw(struct phb4 *p)
+{
+ uint64_t val, creset;
+
+ PHBDBG(p, "Initializing PHB...\n");
+
+ /* Init_1 - Sync reset
+ *
+ * At this point we assume the PHB has already been reset.
+ */
+
+ /* Init_2 - Mask FIRs */
+ out_be64(p->regs + PHB_LEM_ERROR_MASK, 0xffffffffffffffffull);
+
+ /* Init_3 - TCE tag enable */
+ out_be64(p->regs + PHB_TCE_TAG_ENABLE, 0xffffffffffffffffull);
+
+ /* Init_4 - PCIE System Configuration Register
+ *
+ * Adjust max speed based on system config
+ */
+ val = in_be64(p->regs + PHB_PCIE_SCR);
+ PHBDBG(p, "Default system config: 0x%016llx\n", val);
+ val = SETFIELD(PHB_PCIE_SCR_MAXLINKSPEED, val, p->max_link_speed);
+ out_be64(p->regs + PHB_PCIE_SCR, val);
+ PHBDBG(p, "New system config : 0x%016llx\n",
+ in_be64(p->regs + PHB_PCIE_SCR));
+
+ /* Init_5 - deassert CFG reset */
+ creset = in_be64(p->regs + PHB_PCIE_CRESET);
+ PHBDBG(p, "Initial PHB CRESET is 0x%016llx\n", creset);
+ creset &= ~PHB_PCIE_CRESET_CFG_CORE;
+ out_be64(p->regs + PHB_PCIE_CRESET, creset);
+
+ /* Init_6..13 - PCIE DLP Lane EQ control */
+ if (p->lane_eq) {
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL0, be64_to_cpu(p->lane_eq[0]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL1, be64_to_cpu(p->lane_eq[1]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL2, be64_to_cpu(p->lane_eq[2]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL3, be64_to_cpu(p->lane_eq[3]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL40, be64_to_cpu(p->lane_eq[4]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL41, be64_to_cpu(p->lane_eq[5]));
+ if (is_phb5()) {
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL50, be64_to_cpu(p->lane_eq[6]));
+ out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL51, be64_to_cpu(p->lane_eq[7]));
+ }
+ }
+ if (!p->lane_eq_en) {
+ /* Read-modify-write to set the two EQ bypass bits */
+ PHBDBG(p, "LINK: Disabling Lane EQ\n");
+ val = in_be64(p->regs + PHB_PCIE_DLP_CTL);
+ val |= PHB_PCIE_DLP_CTL_BYPASS_PH2 | PHB_PCIE_DLP_CTL_BYPASS_PH3;
+ out_be64(p->regs + PHB_PCIE_DLP_CTL, val);
+ }
+
+ if (is_phb5()) {
+ /* disable scaled flow control for now. SW527785 */
+ PHBDBG(p, "LINK: Disabling scaled flow control\n");
+ val = in_be64(p->regs + PHB_PCIE_DLP_CTL);
+ val |= PHB_PCIE_DLP_CTL_SFC_DISABLE;
+ out_be64(p->regs + PHB_PCIE_DLP_CTL, val);
+
+ /* lane equalization settings need to be tuned on P10 */
+ out_be64(p->regs + PHB_PCIE_PDL_PHY_EQ_CNTL,
+ 0x80F4FFFFFF0F9C00);
+ }
+
+ /* Init_14 - Clear link training */
+ phb4_pcicfg_write32(&p->phb, 0, 0x78,
+ 0x07FE0000 | p->max_link_speed);
+
+ /* Init_15 - deassert cores reset */
+ /*
+ * Lift the PHB resets but not PERST; that will be lifted
+ * later by the initial PERST state machine.
+ */
+ creset &= ~(PHB_PCIE_CRESET_TLDLP | PHB_PCIE_CRESET_PBL);
+ creset |= PHB_PCIE_CRESET_PIPE_N;
+ out_be64(p->regs + PHB_PCIE_CRESET, creset);
+
+ /* Init_16 - Wait for DLP PGRESET to clear */
+ if (!phb4_wait_dlp_reset(p))
+ goto failed;
+
+ /* Init_17 - PHB Control */
+ val = PHB_CTRLR_IRQ_PGSZ_64K;
+ val |= PHB_CTRLR_TCE_CLB_DISABLE; // HW557787 circumvention
+ val |= SETFIELD(PHB_CTRLR_TVT_ADDR_SEL, 0ull, TVT_2_PER_PE);
+ if (phb_pq_disable(p))
+ val |= PHB_CTRLR_IRQ_PQ_DISABLE;
+ if (phb_abt_mode(p))
+ val |= PHB_CTRLR_IRQ_ABT_MODE;
+ if (phb_can_store_eoi(p)) {
+ val |= PHB_CTRLR_IRQ_STORE_EOI;
+ PHBDBG(p, "store EOI is enabled\n");
+ }
+
+ if (!pci_eeh_mmio)
+ val |= PHB_CTRLR_MMIO_EEH_DISABLE;
+
+ out_be64(p->regs + PHB_CTRLR, val);
+
+ /* Init_18..41 - Architected IODA3 inits */
+ phb4_init_ioda3(p);
+
+ /* Init_42..45 - Clear DLP error logs */
+ out_be64(p->regs + 0x1aa0, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x1aa8, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x1ab0, 0xffffffffffffffffull);
+ out_be64(p->regs + 0x1ab8, 0x0);
+
+
+ /* Init_46..54 : Init root complex config space */
+ if (!phb4_init_rc_cfg(p))
+ goto failed;
+
+ /* Init_55..121 : Setup error registers */
+ phb4_init_errors(p);
+
+ /* Init_122..123 : Wait for link
+ * NOTE: At this point the spec waits for the link to come up. We
+ * don't bother as we are doing a PERST soon.
+ */
+
+ /* Init_124 : NBW. XXX TODO */
+ /* See enable_capi_mode() */
+
+ /* Init_125 : Setup PCI command/status on root complex
+ * It's unclear why the spec does this now and not earlier, so to be
+ * sure to get it right we might want to move it to the freset state
+ * machine, though the generic PCI layer will probably do this anyway
+ * (ie, enable MEM, etc... in the RC).
+ */
+ phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_CMD,
+ PCI_CFG_CMD_MEM_EN |
+ PCI_CFG_CMD_BUS_MASTER_EN);
+
+ /* Clear errors */
+ phb4_pcicfg_write16(&p->phb, 0, PCI_CFG_STAT,
+ PCI_CFG_STAT_SENT_TABORT |
+ PCI_CFG_STAT_RECV_TABORT |
+ PCI_CFG_STAT_RECV_MABORT |
+ PCI_CFG_STAT_SENT_SERR |
+ PCI_CFG_STAT_RECV_PERR);
+
+ /* Init_126..130 - Re-enable error interrupts */
+ phb4_int_unmask_all(p);
+
+ /* Init_131 - Re-enable LEM error mask */
+ out_be64(p->regs + PHB_LEM_ERROR_MASK, 0x0000000000000000ull);
+
+
+ /* Init_132 - Enable DMA address speculation */
+ out_be64(p->regs + PHB_TCE_SPEC_CTL, 0x0000000000000000ull);
+
+ /* Init_133 - Timeout Control Register 1 */
+ out_be64(p->regs + PHB_TIMEOUT_CTRL1, 0x0015150000150000ull);
+
+ /* Init_134 - Timeout Control Register 2 */
+ out_be64(p->regs + PHB_TIMEOUT_CTRL2, 0x0000151500000000ull);
+
+ /* Init_135 - PBL Timeout Control Register */
+ out_be64(p->regs + PHB_PBL_TIMEOUT_CTRL, 0x2013000000000000ull);
+
+ /* Mark the PHB as functional which enables all the various sequences */
+ p->broken = false;
+
+ PHBDBG(p, "Initialization complete\n");
+
+ return;
+
+ failed:
+ PHBERR(p, "Initialization failed\n");
+ p->broken = true;
+}
+
+/* FIXME: Use SCOMs rather than MMIO in case we are fenced */
+static bool phb4_read_capabilities(struct phb4 *p)
+{
+ uint64_t val;
+
+ /* XXX Should make sure ETU is out of reset ! */
+
+ /* Grab version and fit it in an int */
+ val = phb4_read_reg_asb(p, PHB_VERSION);
+ if (val == 0 || val == 0xffffffffffffffffUL) {
+ PHBERR(p, "Failed to read version, PHB appears broken\n");
+ return false;
+ }
+
+ p->rev = ((val >> 16) & 0x00ff0000) | (val & 0xffff);
+ PHBDBG(p, "Core revision 0x%x\n", p->rev);
+
+ /* Read EEH capabilities */
+ val = in_be64(p->regs + PHB_PHB4_EEH_CAP);
+ if (val == 0xffffffffffffffffUL) {
+ PHBERR(p, "Failed to read EEH cap, PHB appears broken\n");
+ return false;
+ }
+ p->max_num_pes = val >> 52;
+ if (p->max_num_pes >= 512) {
+ p->mrt_size = 16;
+ p->mbt_size = 32;
+ p->tvt_size = 1024;
+ } else {
+ p->mrt_size = 8;
+ p->mbt_size = 16;
+ p->tvt_size = 512;
+ }
+
+ val = in_be64(p->regs + PHB_PHB4_IRQ_CAP);
+ if (val == 0xffffffffffffffffUL) {
+ PHBERR(p, "Failed to read IRQ cap, PHB appears broken\n");
+ return false;
+ }
+ p->num_irqs = val & 0xffff;
+
+ /* This works for 512 PEs. FIXME calculate for any hardware
+ * size returned above
+ */
+ p->tbl_peltv_size = PELTV_TABLE_SIZE_MAX;
+
+ p->tbl_pest_size = p->max_num_pes*16;
+
+ PHBDBG(p, "Found %d max PEs and %d IRQs \n",
+ p->max_num_pes, p->num_irqs);
+
+ return true;
+}
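+
+/*
+ * Illustrative example (actual values depend on the hardware): if
+ * PHB_PHB4_EEH_CAP reads back with 0x200 in its top 12 bits, then
+ * max_num_pes = 0x200 = 512, which selects the larger geometry above
+ * (mrt_size 16, mbt_size 32, tvt_size 1024) and a PEST table of
+ * 512 * 16 = 8KB.
+ */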
+
+static void phb4_allocate_tables(struct phb4 *p)
+{
+ uint32_t i;
+
+ /* XXX Our current memalign implementation sucks,
+ *
+ * It will do the job; however, it doesn't support freeing
+ * the memory and it wastes space by always allocating twice
+ * as much as requested (size + alignment).
+ */
+ p->tbl_rtt = local_alloc(p->chip_id, RTT_TABLE_SIZE, RTT_TABLE_SIZE);
+ assert(p->tbl_rtt);
+ for (i = 0; i < RTT_TABLE_ENTRIES; i++)
+ p->tbl_rtt[i] = cpu_to_be16(PHB4_RESERVED_PE_NUM(p));
+
+ p->tbl_peltv = local_alloc(p->chip_id, p->tbl_peltv_size, p->tbl_peltv_size);
+ assert(p->tbl_peltv);
+ memset(p->tbl_peltv, 0, p->tbl_peltv_size);
+
+ p->tbl_pest = (uint64_t)local_alloc(p->chip_id, p->tbl_pest_size, p->tbl_pest_size);
+ assert(p->tbl_pest);
+ memset((void *)p->tbl_pest, 0, p->tbl_pest_size);
+}
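+
+/*
+ * Note (for illustration): every RTT entry starts out pointing at the
+ * reserved PE number, so any RID that has not yet been assigned via
+ * phb4_set_pe() is funnelled into the reserved PE rather than into a
+ * random one.
+ */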
+
+static void phb4_add_properties(struct phb4 *p)
+{
+ struct dt_node *np = p->phb.dt_node;
+ uint32_t lsibase, icsp = get_ics_phandle();
+ uint64_t m32b, m64b, m64s;
+
+ /* Add various properties that HB doesn't have to
+ * add, some of them simply because they result from
+ * policy decisions made in skiboot rather than in HB,
+ * such as the MMIO windows going to PCI, interrupts,
+ * etc...
+ */
+ dt_add_property_cells(np, "#address-cells", 3);
+ dt_add_property_cells(np, "#size-cells", 2);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+ dt_add_property_cells(np, "bus-range", 0, 0xff);
+ dt_add_property_cells(np, "clock-frequency", 0x200, 0); /* ??? */
+
+ dt_add_property_cells(np, "interrupt-parent", icsp);
+
+ /* XXX FIXME: add slot-name */
+ //dt_property_cell("bus-width", 8); /* Figure it out from VPD ? */
+
+ /* "ranges", we only expose M32 (PHB4 doesn't do IO)
+ *
+ * Note: The kernel expects us to have chopped off 64k from the
+ * M32 size (for the 32-bit MSIs). If we don't do that, it will
+ * get confused (OPAL does it)
+ */
+ m32b = cleanup_addr(p->mm1_base);
+ m64b = cleanup_addr(p->mm0_base);
+ m64s = p->mm0_size;
+ dt_add_property_cells(np, "ranges",
+ /* M32 space */
+ 0x02000000, 0x00000000, M32_PCI_START,
+ hi32(m32b), lo32(m32b), 0, M32_PCI_SIZE - 0x10000);
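+
+ /*
+ * For reference, the cells above follow the standard PCI "ranges"
+ * layout implied by the #address-cells/#size-cells values set
+ * earlier: a 3-cell PCI address whose first cell 0x02000000 marks
+ * 32-bit memory space, a 2-cell CPU address (hi32/lo32 of m32b) and
+ * a 2-cell size, here M32_PCI_SIZE minus the 64k reserved for the
+ * 32-bit MSIs as noted above.
+ */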
+
+ /* XXX FIXME: add opal-memwin32, dmawins, etc... */
+ dt_add_property_u64s(np, "ibm,opal-m64-window", m64b, m64b, m64s);
+ dt_add_property(np, "ibm,opal-single-pe", NULL, 0);
+ dt_add_property_cells(np, "ibm,opal-num-pes", p->num_pes);
+ dt_add_property_cells(np, "ibm,opal-reserved-pe",
+ PHB4_RESERVED_PE_NUM(p));
+ dt_add_property_cells(np, "ibm,opal-msi-ranges",
+ p->base_msi, p->num_irqs - 8);
+ /* M64 ranges start at 1 as MBT0 is used for M32 */
+ dt_add_property_cells(np, "ibm,opal-available-m64-ranges",
+ 1, p->mbt_size - 1);
+ dt_add_property_cells(np, "ibm,supported-tce-sizes",
+ 12, // 4K
+ 16, // 64K
+ 21, // 2M
+ 30); // 1G
+
+ /* Tell Linux about alignment limits for segment splits.
+ *
+ * XXX We currently only expose splits of 1 and "num PEs",
+ */
+ dt_add_property_cells(np, "ibm,opal-m64-segment-splits",
+ /* Full split, number of segments: */
+ p->num_pes,
+ /* Encoding passed to the enable call */
+ OPAL_ENABLE_M64_SPLIT,
+ /* Alignment/size restriction in #bits */
+ /* XXX VERIFY VALUE */
+ 12,
+ /* Unused */
+ 0,
+ /* single PE, number of segments: */
+ 1,
+ /* Encoding passed to the enable call */
+ OPAL_ENABLE_M64_NON_SPLIT,
+ /* Alignment/size restriction in #bits */
+ /* XXX VERIFY VALUE */
+ 12,
+ /* Unused */
+ 0);
+
+ /* The interrupt maps will be generated in the RC node by the
+ * PCI code based on the content of this structure:
+ */
+ lsibase = p->base_lsi;
+ p->phb.lstate.int_size = 2;
+ p->phb.lstate.int_val[0][0] = lsibase + PHB4_LSI_PCIE_INTA;
+ p->phb.lstate.int_val[0][1] = 1;
+ p->phb.lstate.int_val[1][0] = lsibase + PHB4_LSI_PCIE_INTB;
+ p->phb.lstate.int_val[1][1] = 1;
+ p->phb.lstate.int_val[2][0] = lsibase + PHB4_LSI_PCIE_INTC;
+ p->phb.lstate.int_val[2][1] = 1;
+ p->phb.lstate.int_val[3][0] = lsibase + PHB4_LSI_PCIE_INTD;
+ p->phb.lstate.int_val[3][1] = 1;
+ p->phb.lstate.int_parent[0] = icsp;
+ p->phb.lstate.int_parent[1] = icsp;
+ p->phb.lstate.int_parent[2] = icsp;
+ p->phb.lstate.int_parent[3] = icsp;
+
+ /* Indicators for variable tables */
+ dt_add_property_cells(np, "ibm,opal-rtt-table",
+ hi32((u64) p->tbl_rtt), lo32((u64) p->tbl_rtt), RTT_TABLE_SIZE);
+
+ dt_add_property_cells(np, "ibm,opal-peltv-table",
+ hi32((u64) p->tbl_peltv), lo32((u64) p->tbl_peltv),
+ p->tbl_peltv_size);
+
+ dt_add_property_cells(np, "ibm,opal-pest-table",
+ hi32(p->tbl_pest), lo32(p->tbl_pest), p->tbl_pest_size);
+
+ dt_add_property_cells(np, "ibm,phb-diag-data-size",
+ sizeof(struct OpalIoPhb4ErrorData));
+
+ /* Indicate to Linux that CAPP timebase sync is supported */
+ dt_add_property_string(np, "ibm,capp-timebase-sync", NULL);
+
+ /* Tell Linux Compare/Mask indication values */
+ dt_add_property_cells(np, "ibm,phb-indications", CAPIIND, ASNIND,
+ NBWIND);
+}
+
+static bool phb4_calculate_windows(struct phb4 *p)
+{
+ const struct dt_property *prop;
+
+ /* Get PBCQ MMIO windows from device-tree */
+ prop = dt_require_property(p->phb.dt_node,
+ "ibm,mmio-windows", -1);
+ assert(prop->len >= (2 * sizeof(uint64_t)));
+
+ p->mm0_base = dt_property_get_u64(prop, 0);
+ p->mm0_size = dt_property_get_u64(prop, 1);
+ if (prop->len > 16) {
+ p->mm1_base = dt_property_get_u64(prop, 2);
+ p->mm1_size = dt_property_get_u64(prop, 3);
+ }
+
+ /* Sort them so that 0 is big and 1 is small */
+ if (p->mm1_size && p->mm1_size > p->mm0_size) {
+ uint64_t b = p->mm0_base;
+ uint64_t s = p->mm0_size;
+ p->mm0_base = p->mm1_base;
+ p->mm0_size = p->mm1_size;
+ p->mm1_base = b;
+ p->mm1_size = s;
+ }
+
+ /* If 1 is too small, ditch it */
+ if (p->mm1_size < M32_PCI_SIZE)
+ p->mm1_size = 0;
+
+ /* If 1 doesn't exist, carve it out of 0 */
+ if (p->mm1_size == 0) {
+ p->mm0_size /= 2;
+ p->mm1_base = p->mm0_base + p->mm0_size;
+ p->mm1_size = p->mm0_size;
+ }
+
+ /* Crop mm1 to our desired size */
+ if (p->mm1_size > M32_PCI_SIZE)
+ p->mm1_size = M32_PCI_SIZE;
+
+ return true;
+}
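+
+/*
+ * Illustrative example (window sizes are platform dependent): given a
+ * single 512GB window from the device-tree, the code above carves it in
+ * two, keeping the lower 256GB as mm0 (M64) and the upper 256GB as mm1,
+ * then crops mm1 down to M32_PCI_SIZE for the M32 space.
+ */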
+
+static void phb4_err_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct phb4 *p = is->data;
+
+ PHBDBG(p, "Got interrupt 0x%08x\n", isn);
+
+ /* mask the interrupt conditions to prevent it from re-firing */
+ phb4_int_mask_active(p);
+
+ /* Update pending event */
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+
+ /* If the PHB is broken, go away */
+ if (p->broken)
+ return;
+
+ /*
+ * Mark the PHB as having a pending error so that the OS
+ * can handle it at a later point.
+ */
+ phb4_set_err_pending(p, true);
+}
+
+static uint64_t phb4_lsi_attributes(struct irq_source *is __unused,
+ uint32_t isn __unused)
+{
+#ifndef DISABLE_ERR_INTS
+ struct phb4 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+
+ if (idx == PHB4_LSI_PCIE_INF || idx == PHB4_LSI_PCIE_ER)
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_LSI;
+#endif
+ return IRQ_ATTR_TARGET_LINUX;
+}
+
+static char *phb4_lsi_name(struct irq_source *is, uint32_t isn)
+{
+ struct phb4 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+ char buf[32];
+
+ if (idx == PHB4_LSI_PCIE_INF)
+ snprintf(buf, 32, "phb#%04x-inf", p->phb.opal_id);
+ else if (idx == PHB4_LSI_PCIE_ER)
+ snprintf(buf, 32, "phb#%04x-err", p->phb.opal_id);
+ else
+ assert(0); /* PCIe LSIs should never be directed to OPAL */
+
+ return strdup(buf);
+}
+
+static const struct irq_source_ops phb4_lsi_ops = {
+ .interrupt = phb4_err_interrupt,
+ .attributes = phb4_lsi_attributes,
+ .name = phb4_lsi_name,
+};
+
+static __be64 lane_eq_default[8] = {
+ CPU_TO_BE64(0x5454545454545454UL), CPU_TO_BE64(0x5454545454545454UL),
+ CPU_TO_BE64(0x5454545454545454UL), CPU_TO_BE64(0x5454545454545454UL),
+ CPU_TO_BE64(0x7777777777777777UL), CPU_TO_BE64(0x7777777777777777UL),
+ CPU_TO_BE64(0x7777777777777777UL), CPU_TO_BE64(0x7777777777777777UL),
+};
+
+static __be64 lane_eq_phb5_default[8] = {
+ CPU_TO_BE64(0x4444444444444444UL), CPU_TO_BE64(0x4444444444444444UL),
+ CPU_TO_BE64(0x4444444444444444UL), CPU_TO_BE64(0x4444444444444444UL),
+ CPU_TO_BE64(0x4444444444444444UL), CPU_TO_BE64(0x4444444444444444UL),
+ CPU_TO_BE64(0x9999999999999999UL), CPU_TO_BE64(0x9999999999999999UL),
+};
+
+static void phb4_create(struct dt_node *np)
+{
+ const struct dt_property *prop;
+ struct phb4 *p;
+ struct pci_slot *slot;
+ size_t lane_eq_len, lane_eq_len_req;
+ struct dt_node *iplp;
+ char *path;
+ uint32_t irq_base, irq_flags;
+ int i, eq_reg_count;
+ int chip_id;
+
+ chip_id = dt_prop_get_u32(np, "ibm,chip-id");
+ p = local_alloc(chip_id, sizeof(struct phb4), 8);
+ assert(p);
+ memset(p, 0x0, sizeof(struct phb4));
+
+ /* Populate base stuff */
+ p->index = dt_prop_get_u32(np, "ibm,phb-index");
+ p->chip_id = chip_id;
+ p->pec = dt_prop_get_u32(np, "ibm,phb-pec-index");
+ p->regs = (void *)dt_get_address(np, 0, NULL);
+ p->int_mmio = (void *)dt_get_address(np, 1, NULL);
+ p->phb.dt_node = np;
+ p->phb.ops = &phb4_ops;
+ p->phb.phb_type = phb_type_pcie_v4;
+ p->phb.scan_map = 0x1; /* Only device 0 to scan */
+
+ if (!phb4_calculate_windows(p))
+ return;
+
+ /* Get the various XSCOM register bases from the device-tree */
+ prop = dt_require_property(np, "ibm,xscom-bases", 5 * sizeof(uint32_t));
+ p->pe_xscom = dt_property_get_cell(prop, 0);
+ p->pe_stk_xscom = dt_property_get_cell(prop, 1);
+ p->pci_xscom = dt_property_get_cell(prop, 2);
+ p->pci_stk_xscom = dt_property_get_cell(prop, 3);
+ p->etu_xscom = dt_property_get_cell(prop, 4);
+
+ /*
+ * We skip the initial PERST assertion requested by the generic code
+ * when doing a cold boot because we are coming out of a cold boot
+ * already, which saves boot time. The PERST state machine will still
+ * handle waiting for the link to come up; it will just avoid actually
+ * asserting & deasserting the PERST output.
+ *
+ * For a hot IPL, we still do a PERST.
+ *
+ * Note: In the absence of the property (ie, FSP-less), we stick to the
+ * old behaviour and set skip_perst to true.
+ */
+ p->skip_perst = true; /* Default */
+
+ iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params");
+ if (iplp) {
+ const char *ipl_type = dt_prop_get_def(iplp, "cec-major-type", NULL);
+ if (ipl_type && (!strcmp(ipl_type, "hot")))
+ p->skip_perst = false;
+ }
+
+ /* By default link is assumed down */
+ p->has_link = false;
+
+ /* We register the PHB before we initialize it so we
+ * get a useful OPAL ID for it
+ */
+ pci_register_phb(&p->phb, phb4_get_opal_id(p->chip_id, p->index));
+
+ /* Create slot structure */
+ slot = phb4_slot_create(&p->phb);
+ if (!slot)
+ PHBERR(p, "Cannot create PHB slot\n");
+
+ /* Hello ! */
+ path = dt_get_path(np);
+ PHBINF(p, "Found %s @%p\n", path, p->regs);
+ PHBINF(p, " M32 [0x%016llx..0x%016llx]\n",
+ p->mm1_base, p->mm1_base + p->mm1_size - 1);
+ PHBINF(p, " M64 [0x%016llx..0x%016llx]\n",
+ p->mm0_base, p->mm0_base + p->mm0_size - 1);
+ free(path);
+
+ /* Find base location code from root node */
+ p->phb.base_loc_code = dt_prop_get_def(dt_root,
+ "ibm,io-base-loc-code", NULL);
+ if (!p->phb.base_loc_code)
+ PHBDBG(p, "Base location code not found !\n");
+
+ /*
+ * Grab CEC IO VPD load info from the root of the device-tree;
+ * on P8 there's a single such VPD for the whole machine.
+ */
+ prop = dt_find_property(dt_root, "ibm,io-vpd");
+ if (!prop) {
+ /* LX VPD Lid not already loaded */
+ if (platform.vpd_iohub_load)
+ platform.vpd_iohub_load(dt_root);
+ }
+
+ /* Obtain information about the PHB from the hardware directly */
+ if (!phb4_read_capabilities(p))
+ goto failed;
+
+ p->max_link_speed = phb4_get_max_link_speed(p, np);
+ p->max_link_width = phb4_get_max_link_width(p);
+ PHBINF(p, "Max link speed: GEN%i, max link width %i\n",
+ p->max_link_speed, p->max_link_width);
+
+ /* Check for lane equalization values from HB or HDAT */
+ p->lane_eq_en = true;
+ p->lane_eq = dt_prop_get_def_size(np, "ibm,lane-eq", NULL, &lane_eq_len);
+ if (is_phb5())
+ eq_reg_count = 8;
+ else
+ eq_reg_count = 6;
+ lane_eq_len_req = eq_reg_count * 8;
+ if (p->lane_eq) {
+ if (lane_eq_len < lane_eq_len_req) {
+ PHBERR(p, "Device-tree has ibm,lane-eq too short: %ld"
+ " (want %ld)\n", lane_eq_len, lane_eq_len_req);
+ p->lane_eq = NULL;
+ }
+ } else {
+ PHBDBG(p, "Using default lane equalization settings\n");
+ if (is_phb5())
+ p->lane_eq = lane_eq_phb5_default;
+ else
+ p->lane_eq = lane_eq_default;
+ }
+ if (p->lane_eq) {
+ PHBDBG(p, "Override lane equalization settings:\n");
+ for (i = 0 ; i < lane_eq_len_req/(8 * 2) ; i++)
+ PHBDBG(p, " 0x%016llx 0x%016llx\n",
+ be64_to_cpu(p->lane_eq[2 * i]),
+ be64_to_cpu(p->lane_eq[2 * i + 1]));
+ }
+
+ /* Allocate a block of interrupts. We need to know if it needs
+ * 2K or 4K interrupts ... for now we just use 4K, but that
+ * needs to be fixed.
+ */
+ if (is_phb5())
+ irq_base = xive2_alloc_hw_irqs(p->chip_id, p->num_irqs, p->num_irqs);
+ else
+ irq_base = xive_alloc_hw_irqs(p->chip_id, p->num_irqs, p->num_irqs);
+ if (irq_base == XIVE_IRQ_ERROR) {
+ PHBERR(p, "Failed to allocate %d interrupt sources\n",
+ p->num_irqs);
+ goto failed;
+ }
+ p->base_msi = irq_base;
+ p->base_lsi = irq_base + p->num_irqs - 8;
+ p->num_pes = p->max_num_pes;
+
+ /* Allocate the SkiBoot internal in-memory tables for the PHB */
+ phb4_allocate_tables(p);
+
+ phb4_add_properties(p);
+
+ /* Clear IODA3 cache */
+ phb4_init_ioda_cache(p);
+
+ /* Get the HW up and running */
+ phb4_init_hw(p);
+
+ /* init capp that might get attached to the phb */
+ if (is_phb4())
+ phb4_init_capp(p);
+
+ /* Compute XIVE source flags depending on PHB revision */
+ irq_flags = 0;
+ if (phb_can_store_eoi(p))
+ irq_flags |= XIVE_SRC_STORE_EOI;
+ else
+ irq_flags |= XIVE_SRC_TRIGGER_PAGE;
+
+ if (is_phb5()) {
+ /*
+ * Register sources with XIVE. If offloading is on, use the
+ * ESB pages of the XIVE IC for the MSI sources instead of the
+ * ESB pages of the PHB.
+ */
+ if (phb_pq_disable(p) || phb_abt_mode(p)) {
+ xive2_register_esb_source(p->base_msi, p->num_irqs - 8);
+ } else {
+ xive2_register_hw_source(p->base_msi,
+ p->num_irqs - 8, 16,
+ p->int_mmio, irq_flags,
+ NULL, NULL);
+ }
+
+ /*
+ * LSI sources always use the ESB pages of the PHB.
+ */
+ xive2_register_hw_source(p->base_lsi, 8, 16,
+ p->int_mmio + ((p->num_irqs - 8) << 16),
+ XIVE_SRC_LSI | irq_flags, p, &phb4_lsi_ops);
+ } else {
+ /* Register all interrupt sources with XIVE */
+ xive_register_hw_source(p->base_msi, p->num_irqs - 8, 16,
+ p->int_mmio, irq_flags, NULL, NULL);
+
+ xive_register_hw_source(p->base_lsi, 8, 16,
+ p->int_mmio + ((p->num_irqs - 8) << 16),
+ XIVE_SRC_LSI, p, &phb4_lsi_ops);
+ }
+
+ /* Platform additional setup */
+ if (platform.pci_setup_phb)
+ platform.pci_setup_phb(&p->phb, p->index);
+
+ dt_add_property_string(np, "status", "okay");
+
+ return;
+
+ failed:
+ p->broken = true;
+
+ /* Tell Linux it's broken */
+ dt_add_property_string(np, "status", "error");
+}
+
+static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
+ uint32_t nest_base, uint32_t pci_base)
+{
+ enum phys_map_type phys_mmio64, phys_mmio32, phys_xive_esb, phys_reg_spc;
+ uint32_t pci_stack, nest_stack, etu_base, gcid, phb_num, stk_index;
+ uint64_t val, phb_bar = 0, irq_bar = 0, bar_en;
+ uint64_t mmio0_bar = 0, mmio0_bmask, mmio0_sz;
+ uint64_t mmio1_bar = 0, mmio1_bmask, mmio1_sz;
+ void *foo;
+ __be64 mmio_win[4];
+ unsigned int mmio_win_sz;
+ struct dt_node *np;
+ char *path;
+ uint64_t capp_ucode_base;
+ unsigned int max_link_speed;
+ int rc;
+
+ assert(is_phb5() || is_phb4()); /* Sanity check */
+
+ gcid = dt_get_chip_id(stk_node);
+ stk_index = dt_prop_get_u32(stk_node, "reg");
+ phb_num = dt_prop_get_u32(stk_node, "ibm,phb-index");
+ path = dt_get_path(stk_node);
+ if (is_phb5()) {
+ phys_mmio64 = PHB5_64BIT_MMIO;
+ phys_mmio32 = PHB5_32BIT_MMIO;
+ phys_xive_esb = PHB5_XIVE_ESB;
+ phys_reg_spc = PHB5_REG_SPC;
+ prlog(PR_INFO, "PHB: Chip %d Found PHB5 PBCQ%d Stack %d at %s\n",
+ gcid, pec_index, stk_index, path);
+ } else {
+ phys_mmio64 = PHB4_64BIT_MMIO;
+ phys_mmio32 = PHB4_32BIT_MMIO;
+ phys_xive_esb = PHB4_XIVE_ESB;
+ phys_reg_spc = PHB4_REG_SPC;
+ prlog(PR_INFO, "PHB: Chip %d Found PHB4 PBCQ%d Stack %d at %s\n",
+ gcid, pec_index, stk_index, path);
+ }
+ free(path);
+
+ pci_stack = pci_base + 0x40 * (stk_index + 1);
+ nest_stack = nest_base + 0x40 * (stk_index + 1);
+ etu_base = pci_base + 0x100 + 0x40 * stk_index;
+
+ prlog(PR_DEBUG, "PHB[%d:%d] X[PE]=0x%08x/0x%08x X[PCI]=0x%08x/0x%08x X[ETU]=0x%08x\n",
+ gcid, phb_num, nest_base, nest_stack, pci_base, pci_stack, etu_base);
+
+ /* Default BAR enables */
+ bar_en = 0;
+
+ /* Initialize PHB register BAR */
+ phys_map_get(gcid, phys_reg_spc, phb_num, &phb_bar, NULL);
+ rc = xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR,
+ phb_bar << 8);
+
+ /* A scom error here probably indicates a defective/garded PHB */
+ if (rc != OPAL_SUCCESS) {
+ prerror("PHB[%d:%d] Unable to set PHB BAR. Error=%d\n",
+ gcid, phb_num, rc);
+ return;
+ }
+
+ bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
+
+ /* Same with INT BAR (ESB) */
+ phys_map_get(gcid, phys_xive_esb, phb_num, &irq_bar, NULL);
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_IRQ_BAR, irq_bar << 8);
+ bar_en |= XPEC_NEST_STK_BAR_EN_INT;
+
+
+ /* Same with MMIO windows */
+ phys_map_get(gcid, phys_mmio64, phb_num, &mmio0_bar, &mmio0_sz);
+ mmio0_bmask = (~(mmio0_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0, mmio0_bar << 8);
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0_MASK, mmio0_bmask << 8);
+
+ phys_map_get(gcid, phys_mmio32, phb_num, &mmio1_bar, &mmio1_sz);
+ mmio1_bmask = (~(mmio1_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1, mmio1_bar << 8);
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1_MASK, mmio1_bmask << 8);
+
+ /* Build MMIO windows list */
+ mmio_win_sz = 0;
+ if (mmio0_bar) {
+ mmio_win[mmio_win_sz++] = cpu_to_be64(mmio0_bar);
+ mmio_win[mmio_win_sz++] = cpu_to_be64(mmio0_sz);
+ bar_en |= XPEC_NEST_STK_BAR_EN_MMIO0;
+ }
+ if (mmio1_bar) {
+ mmio_win[mmio_win_sz++] = cpu_to_be64(mmio1_bar);
+ mmio_win[mmio_win_sz++] = cpu_to_be64(mmio1_sz);
+ bar_en |= XPEC_NEST_STK_BAR_EN_MMIO1;
+ }
+
+ /* Set the appropriate enables */
+ xscom_read(gcid, nest_stack + XPEC_NEST_STK_BAR_EN, &val);
+ val |= bar_en;
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_BAR_EN, val);
+
+ /* No MMIO windows ? Barf ! */
+ if (mmio_win_sz == 0) {
+ prerror("PHB[%d:%d] No MMIO windows enabled !\n", gcid, phb_num);
+ return;
+ }
+
+ /* Clear errors in PFIR and NFIR */
+ xscom_write(gcid, pci_stack + XPEC_PCI_STK_PCI_FIR, 0);
+ xscom_write(gcid, nest_stack + XPEC_NEST_STK_PCI_NFIR, 0);
+
+ /* Check ETU reset */
+ xscom_read(gcid, pci_stack + XPEC_PCI_STK_ETU_RESET, &val);
+ prlog_once(PR_DEBUG, "ETU reset: %llx\n", val);
+ xscom_write(gcid, pci_stack + XPEC_PCI_STK_ETU_RESET, 0);
+ time_wait_ms(1);
+
+ // show we can read phb mmio space
+ foo = (void *)(phb_bar + 0x800); // phb version register
+ prlog_once(PR_DEBUG, "Version reg: 0x%016llx\n", in_be64(foo));
+
+ /* Create PHB node */
+ np = dt_new_addr(dt_root, "pciex", phb_bar);
+ if (!np)
+ return;
+
+ if (is_phb5())
+ dt_add_property_strings(np, "compatible", "ibm,power10-pciex", "ibm,ioda3-phb");
+ else
+ dt_add_property_strings(np, "compatible", "ibm,power9-pciex", "ibm,ioda3-phb");
+ dt_add_property_strings(np, "device_type", "pciex");
+ dt_add_property_u64s(np, "reg",
+ phb_bar, 0x1000,
+ irq_bar, 0x10000000);
+
+ /* Everything else is handled later by skiboot; we just
+ * stick a few hints here.
+ */
+ dt_add_property_cells(np, "ibm,xscom-bases",
+ nest_base, nest_stack, pci_base, pci_stack, etu_base);
+ dt_add_property(np, "ibm,mmio-windows", mmio_win, 8 * mmio_win_sz);
+ dt_add_property_cells(np, "ibm,phb-index", phb_num);
+ dt_add_property_cells(np, "ibm,phb-pec-index", pec_index);
+ dt_add_property_cells(np, "ibm,phb-stack", stk_node->phandle);
+ dt_add_property_cells(np, "ibm,phb-stack-index", stk_index);
+ dt_add_property_cells(np, "ibm,chip-id", gcid);
+
+ /* read the hub-id out of the pbcq node */
+ if (dt_has_node_property(stk_node->parent, "ibm,hub-id", NULL)) {
+ uint32_t hub_id;
+
+ hub_id = dt_prop_get_u32(stk_node->parent, "ibm,hub-id");
+ dt_add_property_cells(np, "ibm,hub-id", hub_id);
+ }
+
+ if (dt_has_node_property(stk_node->parent, "ibm,loc-code", NULL)) {
+ const char *lc = dt_prop_get(stk_node->parent, "ibm,loc-code");
+ dt_add_property_string(np, "ibm,loc-code", lc);
+ }
+ if (dt_has_node_property(stk_node, "ibm,lane-eq", NULL)) {
+ size_t leq_size;
+ const void *leq = dt_prop_get_def_size(stk_node, "ibm,lane-eq",
+ NULL, &leq_size);
+ if (leq != NULL && leq_size >= 6 * 8)
+ dt_add_property(np, "ibm,lane-eq", leq, leq_size);
+ }
+ if (dt_has_node_property(stk_node, "ibm,capp-ucode", NULL)) {
+ capp_ucode_base = dt_prop_get_u32(stk_node, "ibm,capp-ucode");
+ dt_add_property_cells(np, "ibm,capp-ucode", capp_ucode_base);
+ }
+ if (dt_has_node_property(stk_node, "ibm,max-link-speed", NULL)) {
+ max_link_speed = dt_prop_get_u32(stk_node, "ibm,max-link-speed");
+ dt_add_property_cells(np, "ibm,max-link-speed", max_link_speed);
+ }
+ dt_add_property_cells(np, "ibm,capi-flags",
+ OPAL_PHB_CAPI_FLAG_SNOOP_CONTROL);
+
+ add_chip_dev_associativity(np);
+}
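+
+/*
+ * Worked example of the BAR programming above (addresses taken from the
+ * nimbus physical map below, chip 0, PHB 0): phys_map_get() returns
+ * 0x000600c3c0000000 for PHB4_REG_SPC, and the SCOM register is written
+ * with that value shifted left by 8, i.e. 0x0600c3c000000000. Likewise
+ * the 256GB 64-bit MMIO window gives mmio0_bmask =
+ * ~(0x4000000000 - 1) & 0x00FFFFFFFFFFFFFF = 0x00FFFFC000000000.
+ */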
+
+static void phb4_probe_pbcq(struct dt_node *pbcq)
+{
+ uint32_t nest_base, pci_base, pec_index;
+ struct dt_node *stk;
+
+ /* REMOVEME: force this for now until we stabilise PCIe */
+ verbose_eeh = 1;
+
+ nest_base = dt_get_address(pbcq, 0, NULL);
+ pci_base = dt_get_address(pbcq, 1, NULL);
+ pec_index = dt_prop_get_u32(pbcq, "ibm,pec-index");
+
+ dt_for_each_child(pbcq, stk) {
+ if (dt_node_is_enabled(stk))
+ phb4_probe_stack(stk, pec_index, nest_base, pci_base);
+ }
+}
+
+void probe_phb4(void)
+{
+ struct dt_node *np;
+ const char *s;
+
+ pci_eeh_mmio = !nvram_query_eq_dangerous("pci-eeh-mmio", "disabled");
+ pci_retry_all = nvram_query_eq_dangerous("pci-retry-all", "true");
+ s = nvram_query_dangerous("phb-rx-err-max");
+ if (s) {
+ rx_err_max = atoi(s);
+
+ /* Clip to uint8_t used by hardware */
+ rx_err_max = MAX(rx_err_max, 0);
+ rx_err_max = MIN(rx_err_max, 255);
+ }
+
+ if (is_phb5()) {
+ prlog(PR_DEBUG, "PHB5: Maximum RX errors during training: %d\n", rx_err_max);
+ /* Look for PBCQ XSCOM nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power10-pbcq")
+ phb4_probe_pbcq(np);
+
+ /* Look for newly created PHB nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power10-pciex")
+ phb4_create(np);
+ } else {
+ prlog(PR_DEBUG, "PHB4: Maximum RX errors during training: %d\n", rx_err_max);
+ /* Look for PBCQ XSCOM nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power9-pbcq")
+ phb4_probe_pbcq(np);
+
+ /* Look for newly created PHB nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power9-pciex")
+ phb4_create(np);
+ }
+}
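+
+/*
+ * Illustrative note (exact tooling is platform dependent): the NVRAM
+ * overrides queried above are usually set from the host, e.g. with
+ * something like "nvram -p ibm,skiboot --update-config phb-rx-err-max=32",
+ * and take effect on the next boot.
+ */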
diff --git a/roms/skiboot/hw/phys-map.c b/roms/skiboot/hw/phys-map.c
new file mode 100644
index 000000000..d6ff99fd8
--- /dev/null
+++ b/roms/skiboot/hw/phys-map.c
@@ -0,0 +1,445 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Physical memory map
+ *
+ * Copyright 2017-2019 IBM Corp.
+ */
+
+#include <phys-map.h>
+#include <chip.h>
+#include <skiboot.h>
+#include <opal-api.h>
+#include <stack.h>
+#include <inttypes.h>
+
+struct phys_map_entry {
+ enum phys_map_type type;
+ int index;
+ uint64_t addr;
+ uint64_t size;
+};
+
+struct phys_map_info {
+ int chip_select_shift;
+ const struct phys_map_entry *table;
+};
+
+static const struct phys_map_info *phys_map;
+
+static const struct phys_map_entry phys_map_table_p10[] = {
+ /* System memory up to 4TB minus GPU memory */
+ { SYSTEM_MEM, 0, 0x0000000000000000ull, 0x0000034000000000ull },
+
+ /* TODO: Figure out GPU memory */
+
+ /* 0 TB offset @ MMIO 0x0006000000000000ull */
+ { PHB5_64BIT_MMIO, 0, 0x0006000000000000ull, 0x0000004000000000ull },
+ { PHB5_64BIT_MMIO, 1, 0x0006004000000000ull, 0x0000004000000000ull },
+ { PHB5_64BIT_MMIO, 2, 0x0006008000000000ull, 0x0000004000000000ull },
+ { PHB5_32BIT_MMIO, 0, 0x000600c000000000ull, 0x0000000080000000ull },
+ { PHB5_32BIT_MMIO, 1, 0x000600c080000000ull, 0x0000000080000000ull },
+ { PHB5_32BIT_MMIO, 2, 0x000600c100000000ull, 0x0000000080000000ull },
+ { PHB5_32BIT_MMIO, 3, 0x000600c180000000ull, 0x0000000080000000ull },
+ { PHB5_32BIT_MMIO, 4, 0x000600c200000000ull, 0x0000000080000000ull },
+ { PHB5_32BIT_MMIO, 5, 0x000600c280000000ull, 0x0000000080000000ull },
+ { PHB5_XIVE_ESB , 0, 0x000600c300000000ull, 0x0000000020000000ull },
+ { PHB5_XIVE_ESB , 1, 0x000600c320000000ull, 0x0000000020000000ull },
+ { PHB5_XIVE_ESB , 2, 0x000600c340000000ull, 0x0000000020000000ull },
+ { PHB5_XIVE_ESB , 3, 0x000600c360000000ull, 0x0000000020000000ull },
+ { PHB5_XIVE_ESB , 4, 0x000600c380000000ull, 0x0000000020000000ull },
+ { PHB5_XIVE_ESB , 5, 0x000600c3a0000000ull, 0x0000000020000000ull },
+ { PHB5_REG_SPC , 0, 0x000600c3c0000000ull, 0x0000000000100000ull },
+ { PHB5_REG_SPC , 1, 0x000600c3c0100000ull, 0x0000000000100000ull },
+ { PHB5_REG_SPC , 2, 0x000600c3c0200000ull, 0x0000000000100000ull },
+ { PHB5_REG_SPC , 3, 0x000600c3c0300000ull, 0x0000000000100000ull },
+ { PHB5_REG_SPC , 4, 0x000600c3c0400000ull, 0x0000000000100000ull },
+ { PHB5_REG_SPC , 5, 0x000600c3c0500000ull, 0x0000000000100000ull },
+ { RESV , 0, 0x000600c3c0600000ull, 0x0000003c3fa00000ull },
+
+ /* 1 TB offset */
+ { RESV , 1, 0x0006010000000000ull, 0x0000010000000000ull },
+
+ /* 2 TB offset */
+ { PHB5_64BIT_MMIO, 3, 0x0006020000000000ull, 0x0000004000000000ull },
+ { PHB5_64BIT_MMIO, 4, 0x0006024000000000ull, 0x0000004000000000ull },
+ { PHB5_64BIT_MMIO, 5, 0x0006028000000000ull, 0x0000004000000000ull },
+ { RESV , 2, 0x000602c000000000ull, 0x0000004000000000ull },
+
+ /* 3 TB offset */
+ { LPC_BUS , 0, 0x0006030000000000ull, 0x0000000100000000ull },
+ { FSP_MMIO , 0, 0x0006030100000000ull, 0x0000000100000000ull },
+ { XIVE_IC , 0, 0x0006030200000000ull, 0x0000000002000000ull },
+ { PSIHB_ESB , 0, 0x0006030202000000ull, 0x0000000000100000ull },
+ { RESV , 3, 0x0006030202100000ull, 0x0000000000f00000ull },
+ { PSIHB_REG , 0, 0x0006030203000000ull, 0x0000000000100000ull },
+ { RESV , 4, 0x0006030203100000ull, 0x0000000000080000ull },
+ { XIVE_TM , 0, 0x0006030203180000ull, 0x0000000000040000ull },
+ { RESV , 5, 0x00060302031c0000ull, 0x0000000000010000ull },
+ { NX_RNG , 0, 0x00060302031d0000ull, 0x0000000000010000ull },
+ { RESV , 6, 0x00060302031e0000ull, 0x0000000004e20000ull },
+ { XIVE_NVC , 0, 0x0006030208000000ull, 0x0000000008000000ull },
+ { RESV , 7, 0x0006030210000000ull, 0x00000000ee000000ull },
+ { VAS_HYP_WIN , 0, 0x00060302fe000000ull, 0x0000000002000000ull },
+ { VAS_USER_WIN , 0, 0x0006030300000000ull, 0x0000000100000000ull },
+
+ /* TODO: MC, OCMB, PAU */
+ { RESV , 8, 0x0006030400000000ull, 0x000000f800000000ull },
+ { XSCOM , 0, 0x000603fc00000000ull, 0x0000000400000000ull },
+
+ /* 4 TB offset */
+ { XIVE_NVPG , 0, 0x0006040000000000ull, 0x0000010000000000ull },
+
+ /* 5 - 7 TB offset */
+ /* for P10 the END and ESB regions are separate in the MMIO
+ * table */
+ { XIVE_ESB , 0, 0x0006050000000000ull, 0x0000010000000000ull },
+ { XIVE_END , 0, 0x0006060000000000ull, 0x0000020000000000ull },
+
+ /* 8 - 13 TB offset */
+ { RESV , 9, 0x0006080000000000ull, 0x0000060000000000ull },
+
+ /* 14 TB offset */
+ { RESV ,10, 0x00060e0000000000ull, 0x0000008000000000ull },
+
+ { NULL_MAP, 0, 0, 0 },
+};
+
+static const struct phys_map_entry phys_map_table_nimbus[] = {
+
+ /* System memory upto 4TB minus GPU memory */
+ { SYSTEM_MEM, 0, 0x0000000000000000ull, 0x0000034000000000ull },
+ /* GPU memory from 4TB - 128GB*GPU */
+ { GPU_MEM_4T_DOWN, 5, 0x0000034000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 4, 0x0000036000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 3, 0x0000038000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 2, 0x000003a000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 1, 0x000003c000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 0, 0x000003e000000000ull, 0x0000002000000000ull },
+ /* GPU memory from 4TB + 128GB*GPU. 4 GPUs only */
+ { GPU_MEM_4T_UP, 0, 0x0000040000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_UP, 1, 0x0000042000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_UP, 2, 0x0000044000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_UP, 3, 0x0000046000000000ull, 0x0000002000000000ull },
+
+ /*
+ * OpenCAPI LPC Memory
+ *
+ * With chip address extension enabled, we allocate 4TB ranges
+ * (in the second non-mirrored region) for each OpenCAPI link
+ * by varying the upper 2 bits of the group ID.
+ *
+ * We don't currently support >4TB ranges.
+ */
+ { OCAPI_MEM, 0, 0x0002000000000000ull, 0x0000040000000000ull },
+ { OCAPI_MEM, 1, 0x0002800000000000ull, 0x0000040000000000ull },
+ { OCAPI_MEM, 2, 0x0003000000000000ull, 0x0000040000000000ull },
+ { OCAPI_MEM, 3, 0x0003800000000000ull, 0x0000040000000000ull },
+
+ /* 0 TB offset @ MMIO 0x0006000000000000ull */
+ { PHB4_64BIT_MMIO, 0, 0x0006000000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 1, 0x0006004000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 2, 0x0006008000000000ull, 0x0000004000000000ull },
+ { PHB4_32BIT_MMIO, 0, 0x000600c000000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 1, 0x000600c080000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 2, 0x000600c100000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 3, 0x000600c180000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 4, 0x000600c200000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 5, 0x000600c280000000ull, 0x0000000080000000ull },
+ { PHB4_XIVE_ESB , 0, 0x000600c300000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB , 1, 0x000600c320000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB , 2, 0x000600c340000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB , 3, 0x000600c360000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB , 4, 0x000600c380000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB , 5, 0x000600c3a0000000ull, 0x0000000020000000ull },
+ { PHB4_REG_SPC , 0, 0x000600c3c0000000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC , 1, 0x000600c3c0100000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC , 2, 0x000600c3c0200000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC , 3, 0x000600c3c0300000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC , 4, 0x000600c3c0400000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC , 5, 0x000600c3c0500000ull, 0x0000000000100000ull },
+ { RESV , 0, 0x000600c3c0600000ull, 0x0000000c3fa00000ull },
+ { NPU_OCAPI_MMIO , 0, 0x000600d000000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO , 1, 0x000600d800000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO , 2, 0x000600e000000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO , 3, 0x000600e800000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO , 4, 0x000600f000000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO , 5, 0x000600f800000000ull, 0x0000000800000000ull },
+
+ /* 1 TB offset @ MMIO 0x0006000000000000ull */
+ { XIVE_VC , 0, 0x0006010000000000ull, 0x0000008000000000ull },
+ { XIVE_PC , 0, 0x0006018000000000ull, 0x0000001000000000ull },
+ { VAS_USER_WIN , 0, 0x0006019000000000ull, 0x0000000100000000ull },
+ { VAS_HYP_WIN , 0, 0x0006019100000000ull, 0x0000000002000000ull },
+ { RESV , 1, 0x0006019102000000ull, 0x000000001e000000ull },
+ { OCAB_XIVE_ESB , 0, 0x0006019120000000ull, 0x0000000020000000ull },
+ { RESV , 3, 0x0006019140000000ull, 0x0000006ec0000000ull },
+
+ /* 2 TB offset @ MMIO 0x0006000000000000ull */
+ { PHB4_64BIT_MMIO, 3, 0x0006020000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 4, 0x0006024000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 5, 0x0006028000000000ull, 0x0000004000000000ull },
+ { RESV , 4, 0x000602c000000000ull, 0x0000004000000000ull },
+
+ /* 3 TB offset @ MMIO 0x0006000000000000ull */
+ { LPC_BUS , 0, 0x0006030000000000ull, 0x0000000100000000ull },
+ { FSP_MMIO , 0, 0x0006030100000000ull, 0x0000000100000000ull },
+ { NPU_REGS , 0, 0x0006030200000000ull, 0x0000000001000000ull },
+ { NPU_USR , 0, 0x0006030201000000ull, 0x0000000000200000ull },
+ { NPU_PHY , 0, 0x0006030201200000ull, 0x0000000000200000ull },
+ { NPU_PHY , 1, 0x0006030201400000ull, 0x0000000000200000ull },
+ { NPU_NTL , 0, 0x0006030201600000ull, 0x0000000000020000ull },
+ { NPU_NTL , 1, 0x0006030201620000ull, 0x0000000000020000ull },
+ { NPU_NTL , 2, 0x0006030201640000ull, 0x0000000000020000ull },
+ { NPU_NTL , 3, 0x0006030201660000ull, 0x0000000000020000ull },
+ { NPU_NTL , 4, 0x0006030201680000ull, 0x0000000000020000ull },
+ { NPU_NTL , 5, 0x00060302016a0000ull, 0x0000000000020000ull },
+ { NPU_GENID , 0, 0x00060302016c0000ull, 0x0000000000020000ull },
+ { NPU_GENID , 1, 0x00060302016e0000ull, 0x0000000000020000ull },
+ { NPU_GENID , 2, 0x0006030201700000ull, 0x0000000000020000ull },
+ { RESV , 5, 0x0006030201720000ull, 0x00000000018e0000ull },
+ { PSIHB_REG , 0, 0x0006030203000000ull, 0x0000000000100000ull },
+ { XIVE_IC , 0, 0x0006030203100000ull, 0x0000000000080000ull },
+ { XIVE_TM , 0, 0x0006030203180000ull, 0x0000000000040000ull },
+ { PSIHB_ESB , 0, 0x00060302031c0000ull, 0x0000000000010000ull },
+ { NX_RNG , 0, 0x00060302031d0000ull, 0x0000000000010000ull },
+ { RESV , 6, 0x00060302031e0000ull, 0x000000001ce20000ull },
+ { CENTAUR_SCOM , 0, 0x0006030220000000ull, 0x0000000020000000ull },
+ { RESV , 7, 0x0006030240000000ull, 0x000000f9c0000000ull },
+ { XSCOM , 0, 0x000603fc00000000ull, 0x0000000400000000ull },
+
+ /* NULL entry at end */
+ { NULL_MAP, 0, 0, 0 },
+};
+
+static const struct phys_map_info phys_map_nimbus = {
+ .chip_select_shift = 42,
+ .table = phys_map_table_nimbus,
+};
+
+static const struct phys_map_entry phys_map_table_axone[] = {
+
+ /* System memory up to 4TB minus GPU memory */
+ { SYSTEM_MEM, 0, 0x0000000000000000ull, 0x0000034000000000ull },
+ /* GPU memory from 4TB - 128GB*GPU */
+ { GPU_MEM_4T_DOWN, 5, 0x0000034000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 4, 0x0000036000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 3, 0x0000038000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 2, 0x000003a000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 1, 0x000003c000000000ull, 0x0000002000000000ull },
+ { GPU_MEM_4T_DOWN, 0, 0x000003e000000000ull, 0x0000002000000000ull },
+
+ /* 0 TB offset @ MMIO 0x0006000000000000ull */
+ { PHB4_64BIT_MMIO, 0, 0x0006000000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 1, 0x0006004000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 2, 0x0006008000000000ull, 0x0000004000000000ull },
+ { PHB4_32BIT_MMIO, 0, 0x000600c000000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 1, 0x000600c080000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 2, 0x000600c100000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 3, 0x000600c180000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 4, 0x000600c200000000ull, 0x0000000080000000ull },
+ { PHB4_32BIT_MMIO, 5, 0x000600c280000000ull, 0x0000000080000000ull },
+ { PHB4_XIVE_ESB, 0, 0x000600c300000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB, 1, 0x000600c320000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB, 2, 0x000600c340000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB, 3, 0x000600c360000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB, 4, 0x000600c380000000ull, 0x0000000020000000ull },
+ { PHB4_XIVE_ESB, 5, 0x000600c3a0000000ull, 0x0000000020000000ull },
+ { PHB4_REG_SPC, 0, 0x000600c3c0000000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC, 1, 0x000600c3c0100000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC, 2, 0x000600c3c0200000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC, 3, 0x000600c3c0300000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC, 4, 0x000600c3c0400000ull, 0x0000000000100000ull },
+ { PHB4_REG_SPC, 5, 0x000600c3c0500000ull, 0x0000000000100000ull },
+ { RESV, 0, 0x000600c3c0600000ull, 0x0000000c3fa00000ull },
+ { NPU_OCAPI_MMIO, 0, 0x000600d000000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO, 1, 0x000600d800000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO, 2, 0x000600e000000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO, 3, 0x000600e800000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO, 4, 0x000600f000000000ull, 0x0000000800000000ull },
+ { NPU_OCAPI_MMIO, 5, 0x000600f800000000ull, 0x0000000800000000ull },
+
+ /* 1 TB offset @ MMIO 0x0006000000000000ull */
+ { XIVE_VC, 0, 0x0006010000000000ull, 0x0000008000000000ull },
+ { XIVE_PC, 0, 0x0006018000000000ull, 0x0000004000000000ull },
+ { VAS_USER_WIN, 0, 0x000601c000000000ull, 0x0000000100000000ull },
+ { VAS_HYP_WIN, 0, 0x000601c100000000ull, 0x0000000002000000ull },
+ { RESV, 1, 0x000601c102000000ull, 0x0000003efe000000ull },
+
+ /* 2 TB offset @ MMIO 0x0006000000000000ull */
+ { PHB4_64BIT_MMIO, 3, 0x0006020000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 4, 0x0006024000000000ull, 0x0000004000000000ull },
+ { PHB4_64BIT_MMIO, 5, 0x0006028000000000ull, 0x0000004000000000ull },
+ { RESV, 2, 0x000602c000000000ull, 0x0000004000000000ull },
+
+ /* 3 TB offset @ MMIO 0x0006000000000000ull */
+ { LPC_BUS, 0, 0x0006030000000000ull, 0x0000000100000000ull },
+ { FSP_MMIO, 0, 0x0006030100000000ull, 0x0000000100000000ull },
+ { RESV, 3, 0x0006030200000000ull, 0x0000000003000000ull },
+ { PSIHB_REG, 0, 0x0006030203000000ull, 0x0000000000100000ull },
+ { XIVE_IC, 0, 0x0006030203100000ull, 0x0000000000080000ull },
+ { XIVE_TM, 0, 0x0006030203180000ull, 0x0000000000040000ull },
+ { PSIHB_ESB, 0, 0x00060302031c0000ull, 0x0000000000010000ull },
+ { NX_RNG, 0, 0x00060302031d0000ull, 0x0000000000010000ull },
+ { RESV, 4, 0x00060302031e0000ull, 0x00000001fce20000ull },
+ { MC_OCMB_CFG, 0, 0x0006030400000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 1, 0x0006030480000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 0, 0x0006030500000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 1, 0x0006030580000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 2, 0x0006030600000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 3, 0x0006030680000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 2, 0x0006030700000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 3, 0x0006030780000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 4, 0x0006030800000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 5, 0x0006030880000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 4, 0x0006030900000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 5, 0x0006030980000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 6, 0x0006030a00000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 7, 0x0006030a80000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 6, 0x0006030b00000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 7, 0x0006030b80000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 8, 0x0006030c00000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 9, 0x0006030c80000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 8, 0x0006030d00000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 9, 0x0006030d80000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 10, 0x0006030e00000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 11, 0x0006030e80000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 10, 0x0006030f00000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 11, 0x0006030f80000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 12, 0x0006031000000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 13, 0x0006031080000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 12, 0x0006031100000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 13, 0x0006031180000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 14, 0x0006031200000000ull, 0x0000000080000000ull },
+ { MC_OCMB_CFG, 15, 0x0006031280000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 14, 0x0006031300000000ull, 0x0000000080000000ull },
+ { MC_OCMB_MMIO, 15, 0x0006031380000000ull, 0x0000000080000000ull },
+ { RESV, 5, 0x0006031400000000ull, 0x000000d800000000ull },
+ { NPU_REGS, 0, 0x000603ec00000000ull, 0x0000000001000000ull },
+ { NPU_REGS, 1, 0x000603ec01000000ull, 0x0000000001000000ull },
+ { NPU_REGS, 2, 0x000603ec02000000ull, 0x0000000001000000ull },
+ { NPU_NTL, 0, 0x000603ec03000000ull, 0x0000000000020000ull },
+ { NPU_NTL, 1, 0x000603ec03020000ull, 0x0000000000020000ull },
+ { NPU_NTL, 2, 0x000603ec03040000ull, 0x0000000000020000ull },
+ { NPU_NTL, 3, 0x000603ec03060000ull, 0x0000000000020000ull },
+ { NPU_GENID, 0, 0x000603ec03080000ull, 0x0000000000080000ull },
+ { NPU_NTL, 4, 0x000603ec03100000ull, 0x0000000000020000ull },
+ { NPU_NTL, 5, 0x000603ec03120000ull, 0x0000000000020000ull },
+ { NPU_NTL, 6, 0x000603ec03140000ull, 0x0000000000020000ull },
+ { NPU_NTL, 7, 0x000603ec03160000ull, 0x0000000000020000ull },
+ { NPU_GENID, 1, 0x000603ec03180000ull, 0x0000000000080000ull },
+ { NPU_NTL, 8, 0x000603ec03200000ull, 0x0000000000020000ull },
+ { NPU_NTL, 9, 0x000603ec03220000ull, 0x0000000000020000ull },
+ { NPU_NTL, 10, 0x000603ec03240000ull, 0x0000000000020000ull },
+ { NPU_NTL, 11, 0x000603ec03260000ull, 0x0000000000020000ull },
+ { NPU_GENID, 2, 0x000603ec03280000ull, 0x0000000000080000ull },
+ { RESV, 6, 0x000603ec03300000ull, 0x0000000ffcd00000ull },
+ { XSCOM, 0, 0x000603fc00000000ull, 0x0000000400000000ull },
+
+ /* NULL entry at end */
+ { NULL_MAP, 0, 0, 0 },
+};
+
+static const struct phys_map_info phys_map_axone = {
+ .chip_select_shift = 42,
+ .table = phys_map_table_axone,
+};
+
+static const struct phys_map_info phys_map_p10 = {
+ .chip_select_shift = 44,
+ .table = phys_map_table_p10,
+};
+
+static inline bool phys_map_entry_null(const struct phys_map_entry *e)
+{
+ if (e->type == NULL_MAP)
+ return true;
+ return false;
+}
+
+
+/* This crashes skiboot on error as any bad calls here are almost
+ * certainly a developer error
+ */
+void __phys_map_get(uint64_t topology_idx, uint64_t gcid, enum phys_map_type type,
+ int index, uint64_t *addr, uint64_t *size) {
+ const struct phys_map_entry *e;
+ uint64_t a;
+
+ if (!phys_map)
+ goto error;
+
+ /* Find entry in table */
+ for (e = phys_map->table; ; e++) {
+
+ /* End of table */
+ if (phys_map_entry_null(e))
+ goto error;
+
+ /* Is this our entry? */
+ if (e->type != type)
+ continue;
+ if (e->index != index)
+ continue;
+
+ /* Found entry! */
+ break;
+ }
+ a = e->addr;
+ a += topology_idx << (phys_map->chip_select_shift);
+
+ if (addr)
+ *addr = a;
+ if (size)
+ *size = e->size;
+
+ prlog(PR_TRACE, "Assigning BAR [%"PRIx64"] type:%02i index:%x "
+ "0x%016"PRIx64" for 0x%016"PRIx64"\n",
+ gcid, type, index, a, e->size);
+
+ return;
+
+error:
+ /* Something has gone really wrong */
+ prlog(PR_EMERG, "ERROR: Failed to lookup BAR type:%i index:%i\n",
+ type, index);
+ assert(0);
+}
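+
+/*
+ * Worked example (illustrative only): with the P10 map, chip_select_shift
+ * is 44, so topology index 1 adds 1ull << 44 = 0x0000100000000000 to the
+ * table base.  Looking up LPC_BUS (base 0x0006030000000000) for topology
+ * index 1 therefore returns 0x0006130000000000.
+ */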
+
+void phys_map_get(uint64_t gcid, enum phys_map_type type,
+ int index, uint64_t *addr, uint64_t *size)
+{
+ struct proc_chip *chip;
+ uint64_t topology_idx = gcid;
+
+ if (proc_gen >= proc_gen_p10) {
+ chip = get_chip(gcid);
+ topology_idx = chip->primary_topology;
+ }
+
+ return __phys_map_get(topology_idx, gcid, type, index, addr, size);
+}
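+
+/*
+ * Typical usage (a minimal sketch; the surrounding caller code is
+ * hypothetical): a unit driver looks its BAR up by type and instance
+ * index, e.g.
+ *
+ *	uint64_t addr, size;
+ *
+ *	phys_map_get(chip->id, PSIHB_REG, 0, &addr, &size);
+ *
+ * A bad type/index combination asserts rather than returning an error,
+ * so callers do not check a return value.
+ */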
+
+void phys_map_init(unsigned long pvr)
+{
+ const char *name = "unused";
+
+ phys_map = NULL;
+
+ if (proc_gen == proc_gen_p9) {
+ switch(PVR_TYPE(pvr)) {
+ case PVR_TYPE_P9P:
+ name = "axone";
+ phys_map = &phys_map_axone;
+ break;
+ default:
+ name = "nimbus";
+ phys_map = &phys_map_nimbus;
+ }
+ } else if (proc_gen == proc_gen_p10) {
+ name = "p10";
+ phys_map = &phys_map_p10;
+ }
+
+ prlog(PR_DEBUG, "Assigning physical memory map table for %s\n", name);
+
+}
diff --git a/roms/skiboot/hw/prd.c b/roms/skiboot/hw/prd.c
new file mode 100644
index 000000000..45d765457
--- /dev/null
+++ b/roms/skiboot/hw/prd.c
@@ -0,0 +1,789 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * PRD: Processor Runtime Diagnostics
+ *
+ * Copyright 2014-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <opal.h>
+#include <lock.h>
+#include <xscom.h>
+#include <chip.h>
+#include <opal-msg.h>
+#include <fsp.h>
+#include <mem_region.h>
+#include <prd-fw-msg.h>
+#include <hostservices.h>
+
+enum events {
+ EVENT_ATTN = 1 << 0,
+ EVENT_OCC_ERROR = 1 << 1,
+ EVENT_OCC_RESET = 1 << 2,
+ EVENT_SBE_PASSTHROUGH = 1 << 3,
+ EVENT_FSP_OCC_RESET = 1 << 4,
+ EVENT_FSP_OCC_LOAD_START = 1 << 5,
+};
+
+static uint8_t events[MAX_CHIPS];
+static uint64_t ipoll_status[MAX_CHIPS];
+static uint8_t _prd_msg_buf[sizeof(struct opal_prd_msg) +
+ sizeof(struct prd_fw_msg)];
+static struct opal_prd_msg *prd_msg = (struct opal_prd_msg *)&_prd_msg_buf;
+static struct opal_prd_msg *prd_msg_fsp_req;
+static struct opal_prd_msg *prd_msg_fsp_notify;
+static bool prd_msg_inuse, prd_active;
+static struct dt_node *prd_node;
+static bool prd_enabled = false;
+
+/* Locking:
+ *
+ * The events lock serialises access to the events, ipoll_status,
+ * prd_msg_inuse, and prd_active variables.
+ *
+ * The ipoll_lock protects against concurrent updates to the ipoll registers.
+ *
+ * The ipoll_lock may be acquired with events_lock held. This order must
+ * be preserved.
+ */
+static struct lock events_lock = LOCK_UNLOCKED;
+static struct lock ipoll_lock = LOCK_UNLOCKED;
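+
+/*
+ * Ordering sketch (illustrative, mirroring prd_psi_interrupt() below):
+ * a path that needs both locks must take events_lock first, e.g.
+ *
+ *	lock(&events_lock);
+ *	lock(&ipoll_lock);
+ *	...update the ipoll mask and the events[] bits...
+ *	unlock(&ipoll_lock);
+ *	unlock(&events_lock);
+ */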
+
+static uint64_t prd_ipoll_mask_reg;
+static uint64_t prd_ipoll_status_reg;
+static uint64_t prd_ipoll_mask;
+
+/* PRD registers */
+#define PRD_P8_IPOLL_REG_MASK 0x01020013
+#define PRD_P8_IPOLL_REG_STATUS 0x01020014
+#define PRD_P8_IPOLL_XSTOP PPC_BIT(0) /* Xstop for host/core/millicode */
+#define PRD_P8_IPOLL_RECOV PPC_BIT(1) /* Recoverable */
+#define PRD_P8_IPOLL_SPEC_ATTN PPC_BIT(2) /* Special attention */
+#define PRD_P8_IPOLL_HOST_ATTN PPC_BIT(3) /* Host attention */
+#define PRD_P8_IPOLL_MASK PPC_BITMASK(0, 3)
+
+#define PRD_P9_IPOLL_REG_MASK 0x000F0033
+#define PRD_P9_IPOLL_REG_STATUS 0x000F0034
+#define PRD_P9_IPOLL_XSTOP PPC_BIT(0) /* Xstop for host/core/millicode */
+#define PRD_P9_IPOLL_RECOV PPC_BIT(1) /* Recoverable */
+#define PRD_P9_IPOLL_SPEC_ATTN PPC_BIT(2) /* Special attention */
+#define PRD_P9_IPOLL_UNIT_CS PPC_BIT(3) /* Unit Xstop */
+#define PRD_P9_IPOLL_HOST_ATTN PPC_BIT(4) /* Host attention */
+#define PRD_P9_IPOLL_MASK_INTR PPC_BIT(5) /* Host interrupt */
+#define PRD_P9_IPOLL_MASK PPC_BITMASK(0, 5)
+
+static void send_next_pending_event(void);
+
+static void prd_msg_consumed(void *data, int status)
+{
+ struct opal_prd_msg *msg = data;
+ uint32_t proc;
+ int notify_status = OPAL_SUCCESS;
+ uint8_t event = 0;
+
+ lock(&events_lock);
+ switch (msg->hdr.type) {
+ case OPAL_PRD_MSG_TYPE_ATTN:
+ proc = be64_to_cpu(msg->attn.proc);
+
+ /* If other ipoll events have been received in the time
+ * between prd_msg creation and consumption, we'll need to
+ * raise a separate ATTN message for those. So, we only
+ * clear the event if we don't have any further ipoll_status
+ * bits.
+ */
+ ipoll_status[proc] &= ~be64_to_cpu(msg->attn.ipoll_status);
+ if (!ipoll_status[proc])
+ event = EVENT_ATTN;
+
+ break;
+ case OPAL_PRD_MSG_TYPE_OCC_ERROR:
+ proc = be64_to_cpu(msg->occ_error.chip);
+ event = EVENT_OCC_ERROR;
+ break;
+ case OPAL_PRD_MSG_TYPE_OCC_RESET:
+ proc = be64_to_cpu(msg->occ_reset.chip);
+ event = EVENT_OCC_RESET;
+ break;
+ case OPAL_PRD_MSG_TYPE_FIRMWARE_RESPONSE:
+ if (prd_msg_fsp_req) {
+ free(prd_msg_fsp_req);
+ prd_msg_fsp_req = NULL;
+ }
+ break;
+ case OPAL_PRD_MSG_TYPE_FIRMWARE_NOTIFY:
+ if (prd_msg_fsp_notify) {
+ free(prd_msg_fsp_notify);
+ prd_msg_fsp_notify = NULL;
+ }
+ if (status != 0) {
+ prlog(PR_DEBUG,
+ "PRD: Failed to send FSP -> HBRT message\n");
+ notify_status = FSP_STATUS_GENERIC_ERROR;
+ }
+ if (platform.prd && platform.prd->msg_response)
+ platform.prd->msg_response(notify_status);
+ break;
+ case OPAL_PRD_MSG_TYPE_SBE_PASSTHROUGH:
+ proc = be64_to_cpu(msg->sbe_passthrough.chip);
+ event = EVENT_SBE_PASSTHROUGH;
+ break;
+ case OPAL_PRD_MSG_TYPE_FSP_OCC_RESET:
+ proc = be64_to_cpu(msg->occ_reset.chip);
+ event = EVENT_FSP_OCC_RESET;
+ break;
+ case OPAL_PRD_MSG_TYPE_FSP_OCC_LOAD_START:
+ proc = be64_to_cpu(msg->occ_reset.chip);
+ event = EVENT_FSP_OCC_LOAD_START;
+ break;
+ default:
+ prlog(PR_ERR, "PRD: invalid msg consumed, type: 0x%x\n",
+ msg->hdr.type);
+ }
+
+ if (event)
+ events[proc] &= ~event;
+ prd_msg_inuse = false;
+ send_next_pending_event();
+ unlock(&events_lock);
+}
+
+/*
+ * The OPAL_MSG_PRD interface can handle message sizes <= OPAL_MSG_FIXED_PARAMS_SIZE.
+ * However, the kernel prd driver has a bug where it does not copy partial data to
+ * user space. Hence, use the OPAL_MSG_PRD interface only if the message size is
+ * <= sizeof(struct opal_prd_msg); otherwise fall back to OPAL_MSG_PRD2.
+ */
+static inline int opal_queue_prd_msg(struct opal_prd_msg *msg)
+{
+ enum opal_msg_type msg_type = OPAL_MSG_PRD2;
+
+ if (be16_to_cpu(msg->hdr.size) <= 0x20)
+ msg_type = OPAL_MSG_PRD;
+
+ return _opal_queue_msg(msg_type, msg, prd_msg_consumed,
+ be16_to_cpu(msg->hdr.size), msg);
+}
+
+static int populate_ipoll_msg(struct opal_prd_msg *msg, uint32_t proc)
+{
+ uint64_t ipoll_mask;
+ int rc;
+
+ lock(&ipoll_lock);
+ rc = xscom_read(proc, prd_ipoll_mask_reg, &ipoll_mask);
+ unlock(&ipoll_lock);
+
+ if (rc) {
+ prlog(PR_ERR, "PRD: Unable to read ipoll status (chip %d)!\n",
+ proc);
+ return -1;
+ }
+
+ msg->attn.proc = cpu_to_be64(proc);
+ msg->attn.ipoll_status = cpu_to_be64(ipoll_status[proc]);
+ msg->attn.ipoll_mask = cpu_to_be64(ipoll_mask);
+ return 0;
+}
+
+static void send_next_pending_event(void)
+{
+ struct proc_chip *chip;
+ uint32_t proc;
+ int rc;
+ uint8_t event;
+
+ assert(!prd_msg_inuse);
+
+ if (!prd_active)
+ return;
+
+ event = 0;
+
+ for_each_chip(chip) {
+ proc = chip->id;
+ if (events[proc]) {
+ event = events[proc];
+ break;
+ }
+ }
+
+ if (!event)
+ return;
+
+ prd_msg->token = 0;
+ prd_msg->hdr.size = cpu_to_be16(sizeof(*prd_msg));
+
+ if (event & EVENT_ATTN) {
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_ATTN;
+ populate_ipoll_msg(prd_msg, proc);
+ } else if (event & EVENT_OCC_ERROR) {
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_OCC_ERROR;
+ prd_msg->occ_error.chip = cpu_to_be64(proc);
+ } else if (event & EVENT_OCC_RESET) {
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_OCC_RESET;
+ prd_msg->occ_reset.chip = cpu_to_be64(proc);
+ occ_msg_queue_occ_reset();
+ } else if (event & EVENT_SBE_PASSTHROUGH) {
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_SBE_PASSTHROUGH;
+ prd_msg->sbe_passthrough.chip = cpu_to_be64(proc);
+ } else if (event & EVENT_FSP_OCC_RESET) {
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_FSP_OCC_RESET;
+ prd_msg->occ_reset.chip = cpu_to_be64(proc);
+ } else if (event & EVENT_FSP_OCC_LOAD_START) {
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_FSP_OCC_LOAD_START;
+ prd_msg->occ_reset.chip = cpu_to_be64(proc);
+ }
+
+ /*
+ * We always need to handle PSI interrupts, but if PRD is
+ * disabled then we shouldn't propagate PRD events to the host.
+ */
+ if (prd_enabled) {
+ rc = opal_queue_prd_msg(prd_msg);
+ if (!rc)
+ prd_msg_inuse = true;
+ }
+}
+
+static void __prd_event(uint32_t proc, uint8_t event)
+{
+ events[proc] |= event;
+ if (!prd_msg_inuse)
+ send_next_pending_event();
+}
+
+static void prd_event(uint32_t proc, uint8_t event)
+{
+ lock(&events_lock);
+ __prd_event(proc, event);
+ unlock(&events_lock);
+}
+
+static int __ipoll_update_mask(uint32_t proc, bool set, uint64_t bits)
+{
+ uint64_t mask;
+ int rc;
+
+ rc = xscom_read(proc, prd_ipoll_mask_reg, &mask);
+ if (rc)
+ return rc;
+
+ if (set)
+ mask |= bits;
+ else
+ mask &= ~bits;
+
+ return xscom_write(proc, prd_ipoll_mask_reg, mask);
+}
+
+static int ipoll_record_and_mask_pending(uint32_t proc)
+{
+ uint64_t status;
+ int rc;
+
+ lock(&ipoll_lock);
+ rc = xscom_read(proc, prd_ipoll_status_reg, &status);
+ status &= prd_ipoll_mask;
+ if (!rc)
+ __ipoll_update_mask(proc, true, status);
+ unlock(&ipoll_lock);
+
+ if (!rc)
+ ipoll_status[proc] |= status;
+
+ return rc;
+}
+
+/* Entry point for interrupts */
+void prd_psi_interrupt(uint32_t proc)
+{
+ int rc;
+
+ lock(&events_lock);
+
+ rc = ipoll_record_and_mask_pending(proc);
+ if (rc)
+ prlog(PR_ERR, "PRD: Failed to update IPOLL mask\n");
+
+ __prd_event(proc, EVENT_ATTN);
+
+ unlock(&events_lock);
+}
+
+void prd_tmgt_interrupt(uint32_t proc)
+{
+ prd_event(proc, EVENT_OCC_ERROR);
+}
+
+void prd_occ_reset(uint32_t proc)
+{
+ prd_event(proc, EVENT_OCC_RESET);
+}
+
+void prd_fsp_occ_reset(uint32_t proc)
+{
+ prd_event(proc, EVENT_FSP_OCC_RESET);
+}
+
+void prd_sbe_passthrough(uint32_t proc)
+{
+ prd_event(proc, EVENT_SBE_PASSTHROUGH);
+}
+
+void prd_fsp_occ_load_start(uint32_t proc)
+{
+ prd_event(proc, EVENT_FSP_OCC_LOAD_START);
+}
+
+void prd_fw_resp_fsp_response(int status)
+{
+ struct prd_fw_msg *fw_resp;
+ uint64_t fw_resp_len_old;
+ int rc;
+ uint16_t hdr_size;
+
+ lock(&events_lock);
+
+ /* In case of failure, return code is passed via generic_resp */
+ if (status != 0) {
+ fw_resp = (struct prd_fw_msg *)prd_msg_fsp_req->fw_resp.data;
+ fw_resp->type = cpu_to_be64(PRD_FW_MSG_TYPE_RESP_GENERIC);
+ fw_resp->generic_resp.status = cpu_to_be64(status);
+
+ fw_resp_len_old = be64_to_cpu(prd_msg_fsp_req->fw_resp.len);
+ prd_msg_fsp_req->fw_resp.len = cpu_to_be64(PRD_FW_MSG_BASE_SIZE +
+ sizeof(fw_resp->generic_resp));
+
+ /* Update prd message size */
+ hdr_size = be16_to_cpu(prd_msg_fsp_req->hdr.size);
+ hdr_size -= fw_resp_len_old;
+ hdr_size += be64_to_cpu(prd_msg_fsp_req->fw_resp.len);
+ prd_msg_fsp_req->hdr.size = cpu_to_be16(hdr_size);
+ }
+
+ rc = opal_queue_prd_msg(prd_msg_fsp_req);
+ if (!rc)
+ prd_msg_inuse = true;
+ unlock(&events_lock);
+}
+
+int prd_hbrt_fsp_msg_notify(void *data, u32 dsize)
+{
+ struct prd_fw_msg *fw_notify;
+ int size, fw_notify_size;
+ int rc = FSP_STATUS_GENERIC_ERROR;
+
+ if (!prd_enabled) {
+ prlog(PR_NOTICE, "PRD: %s: PRD daemon is not ready\n",
+ __func__);
+ return rc;
+ }
+
+ /* Calculate prd message size */
+ fw_notify_size = PRD_FW_MSG_BASE_SIZE + dsize;
+ size = sizeof(prd_msg->hdr) + sizeof(prd_msg->token) +
+ sizeof(prd_msg->fw_notify) + fw_notify_size;
+
+ if (size > OPAL_PRD_MSG_SIZE_MAX) {
+ prlog(PR_DEBUG, "PRD: FSP - HBRT notify message size (0x%x)"
+ " is bigger than prd interface can handle\n", size);
+ return rc;
+ }
+
+ lock(&events_lock);
+
+ /* FSP - HBRT messages are serialized */
+ if (prd_msg_fsp_notify) {
+ prlog(PR_DEBUG, "PRD: FSP - HBRT notify message is busy\n");
+ goto unlock_events;
+ }
+
+ /* Handle message allocation */
+ prd_msg_fsp_notify = zalloc(size);
+ if (!prd_msg_fsp_notify) {
+ prlog(PR_DEBUG,
+ "PRD: %s: Failed to allocate memory.\n", __func__);
+ goto unlock_events;
+ }
+
+ prd_msg_fsp_notify->hdr.type = OPAL_PRD_MSG_TYPE_FIRMWARE_NOTIFY;
+ prd_msg_fsp_notify->hdr.size = cpu_to_be16(size);
+ prd_msg_fsp_notify->token = 0;
+ prd_msg_fsp_notify->fw_notify.len = cpu_to_be64(fw_notify_size);
+ fw_notify = (void *)prd_msg_fsp_notify->fw_notify.data;
+ fw_notify->type = cpu_to_be64(PRD_FW_MSG_TYPE_HBRT_FSP);
+ memcpy(&(fw_notify->mbox_msg), data, dsize);
+
+ if (!prd_active) {
+ // save the message, we'll deliver it when prd starts
+ rc = FSP_STATUS_BUSY;
+ goto unlock_events;
+ }
+
+ rc = opal_queue_prd_msg(prd_msg_fsp_notify);
+ if (!rc)
+ prd_msg_inuse = true;
+
+unlock_events:
+ unlock(&events_lock);
+ return rc;
+}
+
+/* incoming message handlers */
+static int prd_msg_handle_attn_ack(struct opal_prd_msg *msg)
+{
+ int rc;
+
+ lock(&ipoll_lock);
+ rc = __ipoll_update_mask(be64_to_cpu(msg->attn_ack.proc), false,
+ be64_to_cpu(msg->attn_ack.ipoll_ack) & prd_ipoll_mask);
+ unlock(&ipoll_lock);
+
+ if (rc)
+ prlog(PR_ERR, "PRD: Unable to unmask ipoll!\n");
+
+ return rc;
+}
+
+static int prd_msg_handle_init(struct opal_prd_msg *msg)
+{
+ struct proc_chip *chip;
+
+ lock(&ipoll_lock);
+ for_each_chip(chip) {
+ __ipoll_update_mask(chip->id, false,
+ be64_to_cpu(msg->init.ipoll) & prd_ipoll_mask);
+ }
+ unlock(&ipoll_lock);
+
+ /* we're transitioning from inactive to active; send any pending tmgt
+ * interrupts */
+ lock(&events_lock);
+ prd_active = true;
+
+ if (prd_msg_fsp_notify) {
+ if (!opal_queue_prd_msg(prd_msg_fsp_notify))
+ prd_msg_inuse = true;
+ }
+ if (!prd_msg_inuse)
+ send_next_pending_event();
+ unlock(&events_lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int prd_msg_handle_fini(void)
+{
+ struct proc_chip *chip;
+
+ lock(&events_lock);
+ prd_active = false;
+ unlock(&events_lock);
+
+ lock(&ipoll_lock);
+ for_each_chip(chip) {
+ __ipoll_update_mask(chip->id, true, prd_ipoll_mask);
+ }
+ unlock(&ipoll_lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int prd_msg_handle_firmware_req(struct opal_prd_msg *msg)
+{
+ unsigned long fw_req_len, fw_resp_len, data_len;
+ struct prd_fw_msg *fw_req, *fw_resp;
+ int rc;
+ uint64_t resp_msg_size;
+
+ fw_req_len = be64_to_cpu(msg->fw_req.req_len);
+ fw_resp_len = be64_to_cpu(msg->fw_req.resp_len);
+ fw_req = (struct prd_fw_msg *)msg->fw_req.data;
+
+ /* do we have a full firmware message? */
+ if (fw_req_len < sizeof(struct prd_fw_msg))
+ return -EINVAL;
+
+ /* does the total (outer) PRD message len provide enough data for the
+ * claimed (inner) FW message?
+ */
+ if (be16_to_cpu(msg->hdr.size) < fw_req_len +
+ offsetof(struct opal_prd_msg, fw_req.data))
+ return -EINVAL;
+
+ /* is there enough response buffer for a base response? Type-specific
+ * responses may be larger, but anything less than BASE_SIZE is
+ * invalid. */
+ if (fw_resp_len < PRD_FW_MSG_BASE_SIZE)
+ return -EINVAL;
+
+ /* prepare a response message. */
+ lock(&events_lock);
+ prd_msg_inuse = true;
+ prd_msg->token = 0;
+ prd_msg->hdr.type = OPAL_PRD_MSG_TYPE_FIRMWARE_RESPONSE;
+ fw_resp = (void *)prd_msg->fw_resp.data;
+
+ switch (be64_to_cpu(fw_req->type)) {
+ case PRD_FW_MSG_TYPE_REQ_NOP:
+ fw_resp->type = cpu_to_be64(PRD_FW_MSG_TYPE_RESP_NOP);
+ prd_msg->fw_resp.len = cpu_to_be64(PRD_FW_MSG_BASE_SIZE);
+ prd_msg->hdr.size = cpu_to_be16(sizeof(*prd_msg));
+ rc = 0;
+ break;
+ case PRD_FW_MSG_TYPE_ERROR_LOG:
+ if (platform.prd == NULL ||
+ platform.prd->send_error_log == NULL) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+
+ rc = platform.prd->send_error_log(be32_to_cpu(fw_req->errorlog.plid),
+ be32_to_cpu(fw_req->errorlog.size),
+ fw_req->errorlog.data);
+ /* Return generic response to HBRT */
+ fw_resp->type = cpu_to_be64(PRD_FW_MSG_TYPE_RESP_GENERIC);
+ fw_resp->generic_resp.status = cpu_to_be64(rc);
+ prd_msg->fw_resp.len = cpu_to_be64(PRD_FW_MSG_BASE_SIZE +
+ sizeof(fw_resp->generic_resp));
+ prd_msg->hdr.size = cpu_to_be16(sizeof(*prd_msg));
+ rc = 0;
+ break;
+ case PRD_FW_MSG_TYPE_HBRT_FSP:
+ if (platform.prd == NULL ||
+ platform.prd->send_hbrt_msg == NULL) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+
+ /*
+ * HBRT -> FSP messages are serialized. Just to be sure, check
+ * whether the fsp_req message buffer is free.
+ */
+ if (prd_msg_fsp_req) {
+ prlog(PR_DEBUG, "PRD: HBRT - FSP message is busy\n");
+ rc = OPAL_BUSY;
+ break;
+ }
+
+ /*
+ * FSP interface doesn't tell us the response data size.
+ * Hence pass response length = request length.
+ */
+ resp_msg_size = sizeof(msg->hdr) + sizeof(msg->token) +
+ sizeof(msg->fw_resp) + fw_req_len;
+
+ if (resp_msg_size > OPAL_PRD_MSG_SIZE_MAX) {
+ prlog(PR_DEBUG, "PRD: HBRT - FSP response size (0x%llx)"
+ " is bigger than prd interface can handle\n",
+ resp_msg_size);
+ rc = OPAL_INTERNAL_ERROR;
+ break;
+ }
+
+ /*
+ * We will use fsp_queue_msg() to pass the HBRT data to the FSP.
+ * We cannot directly map the kernel-passed data, as the kernel
+ * will release that memory as soon as we return control.
+ * The FSP also uses the same memory to pass its response back
+ * to HBRT. Hence, copy the data into local memory and then pass
+ * that memory to the FSP via TCE mapping.
+ */
+ prd_msg_fsp_req = zalloc(resp_msg_size);
+ if (!prd_msg_fsp_req) {
+ prlog(PR_DEBUG, "PRD: Failed to allocate memory "
+ "for HBRT - FSP message\n");
+ rc = OPAL_RESOURCE;
+ break;
+ }
+
+ /* Update message header */
+ prd_msg_fsp_req->hdr.type = OPAL_PRD_MSG_TYPE_FIRMWARE_RESPONSE;
+ prd_msg_fsp_req->hdr.size = cpu_to_be16(resp_msg_size);
+ prd_msg_fsp_req->token = 0;
+ prd_msg_fsp_req->fw_resp.len = cpu_to_be64(fw_req_len);
+
+ /* copy HBRT data to local memory */
+ fw_resp = (struct prd_fw_msg *)prd_msg_fsp_req->fw_resp.data;
+ memcpy(fw_resp, fw_req, fw_req_len);
+
+ /* Update response type */
+ fw_resp->type = cpu_to_be64(PRD_FW_MSG_TYPE_HBRT_FSP);
+
+ /* Get MBOX message size */
+ data_len = fw_req_len - PRD_FW_MSG_BASE_SIZE;
+
+ /* We have to wait until FSP responds */
+ prd_msg_inuse = false;
+ /* Unlock to avoid recursive lock issue */
+ unlock(&events_lock);
+
+ /* Send message to FSP */
+ rc = platform.prd->send_hbrt_msg(&(fw_resp->mbox_msg), data_len);
+
+ /*
+ * Callback handler from hservice_send_hbrt_msg will take
+ * care of sending response to HBRT. So just send return
+ * code to Linux.
+ */
+ if (rc == OPAL_SUCCESS)
+ return rc;
+
+ lock(&events_lock);
+ if (prd_msg_fsp_req) {
+ free(prd_msg_fsp_req);
+ prd_msg_fsp_req = NULL;
+ }
+ break;
+ default:
+ prlog(PR_DEBUG, "PRD: Unsupported fw_request type : 0x%llx\n",
+ be64_to_cpu(fw_req->type));
+ rc = -ENOSYS;
+ }
+
+ if (!rc) {
+ rc = opal_queue_prd_msg(prd_msg);
+ if (rc)
+ prd_msg_inuse = false;
+ } else {
+ prd_msg_inuse = false;
+ }
+
+ unlock(&events_lock);
+
+ return rc;
+}
+
+/* Entry from the host above */
+static int64_t opal_prd_msg(struct opal_prd_msg *msg)
+{
+ int rc;
+
+ /* fini is a little special: the kernel (which may not have the entire
+ * opal_prd_msg definition) can send a FINI message, so we don't check
+ * the full size */
+ if (be16_to_cpu(msg->hdr.size) >= sizeof(struct opal_prd_msg_header) &&
+ msg->hdr.type == OPAL_PRD_MSG_TYPE_FINI)
+ return prd_msg_handle_fini();
+
+ if (be16_to_cpu(msg->hdr.size) < sizeof(*msg))
+ return OPAL_PARAMETER;
+
+ switch (msg->hdr.type) {
+ case OPAL_PRD_MSG_TYPE_INIT:
+ rc = prd_msg_handle_init(msg);
+ break;
+ case OPAL_PRD_MSG_TYPE_ATTN_ACK:
+ rc = prd_msg_handle_attn_ack(msg);
+ break;
+ case OPAL_PRD_MSG_TYPE_OCC_RESET_NOTIFY:
+ rc = occ_msg_queue_occ_reset();
+ break;
+ case OPAL_PRD_MSG_TYPE_FIRMWARE_REQUEST:
+ rc = prd_msg_handle_firmware_req(msg);
+ break;
+ case OPAL_PRD_MSG_TYPE_FSP_OCC_RESET_STATUS:
+ if (platform.prd == NULL ||
+ platform.prd->fsp_occ_reset_status == NULL) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+ rc = platform.prd->fsp_occ_reset_status(
+ be64_to_cpu(msg->fsp_occ_reset_status.chip),
+ be64_to_cpu(msg->fsp_occ_reset_status.status));
+ break;
+ case OPAL_PRD_MSG_TYPE_CORE_SPECIAL_WAKEUP:
+ if (platform.prd == NULL ||
+ platform.prd->wakeup == NULL) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+ rc = platform.prd->wakeup(be32_to_cpu(msg->spl_wakeup.core),
+ be32_to_cpu(msg->spl_wakeup.mode));
+ break;
+ case OPAL_PRD_MSG_TYPE_FSP_OCC_LOAD_START_STATUS:
+ if (platform.prd == NULL ||
+ platform.prd->fsp_occ_load_start_status == NULL) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+ rc = platform.prd->fsp_occ_load_start_status(
+ be64_to_cpu(msg->fsp_occ_reset_status.chip),
+ be64_to_cpu(msg->fsp_occ_reset_status.status));
+ break;
+ default:
+ prlog(PR_DEBUG, "PRD: Unsupported prd message type : 0x%x\n",
+ msg->hdr.type);
+ rc = OPAL_UNSUPPORTED;
+ }
+
+ return rc;
+}
+
+
+/*
+ * Initialise the Opal backend for the PRD daemon. This must be called from
+ * platform probe or init function.
+ */
+void prd_init(void)
+{
+ struct proc_chip *chip;
+
+ switch (proc_gen) {
+ case proc_gen_p8:
+ prd_ipoll_mask_reg = PRD_P8_IPOLL_REG_MASK;
+ prd_ipoll_status_reg = PRD_P8_IPOLL_REG_STATUS;
+ prd_ipoll_mask = PRD_P8_IPOLL_MASK;
+ break;
+ case proc_gen_p9:
+ prd_ipoll_mask_reg = PRD_P9_IPOLL_REG_MASK;
+ prd_ipoll_status_reg = PRD_P9_IPOLL_REG_STATUS;
+ prd_ipoll_mask = PRD_P9_IPOLL_MASK;
+ break;
+ case proc_gen_p10: /* IPOLL regs are the same for p9 and p10 */
+ prd_ipoll_mask_reg = PRD_P9_IPOLL_REG_MASK;
+ prd_ipoll_status_reg = PRD_P9_IPOLL_REG_STATUS;
+ prd_ipoll_mask = PRD_P9_IPOLL_MASK;
+ break;
+ default:
+ assert(0);
+ }
+
+ /* mask everything */
+ lock(&ipoll_lock);
+ for_each_chip(chip) {
+ __ipoll_update_mask(chip->id, true, prd_ipoll_mask);
+ }
+ unlock(&ipoll_lock);
+
+ prd_enabled = true;
+ opal_register(OPAL_PRD_MSG, opal_prd_msg, 1);
+
+ prd_node = dt_new(opal_node, "diagnostics");
+ dt_add_property_strings(prd_node, "compatible", "ibm,opal-prd");
+}
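+
+/*
+ * Illustrative only (the platform hook below is hypothetical): a platform
+ * typically calls prd_init() from its probe or init function, e.g.
+ *
+ *	static void myplat_init(void)
+ *	{
+ *		...
+ *		prd_init();
+ *	}
+ */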
+
+void prd_register_reserved_memory(void)
+{
+ struct mem_region *region;
+
+ if (!prd_node)
+ return;
+
+ lock(&mem_region_lock);
+ for (region = mem_region_next(NULL); region;
+ region = mem_region_next(region)) {
+
+ if (region->type != REGION_FW_RESERVED)
+ continue;
+
+ if (!region->node)
+ continue;
+
+ if (!dt_find_property(region->node, "ibm,prd-label")) {
+ dt_add_property_string(region->node, "ibm,prd-label",
+ region->name);
+ }
+ }
+ unlock(&mem_region_lock);
+}
diff --git a/roms/skiboot/hw/psi.c b/roms/skiboot/hw/psi.c
new file mode 100644
index 000000000..de074ce4a
--- /dev/null
+++ b/roms/skiboot/hw/psi.c
@@ -0,0 +1,1079 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * PSI (Processor Service Interface) host bridge handling code
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <io.h>
+#include <psi.h>
+#include <fsp.h>
+#include <opal.h>
+#include <interrupts.h>
+#include <cpu.h>
+#include <dio-p9.h>
+#include <trace.h>
+#include <xscom.h>
+#include <chip.h>
+#include <lpc.h>
+#include <i2c.h>
+#include <timebase.h>
+#include <platform.h>
+#include <errorlog.h>
+#include <xive.h>
+#include <sbe-p9.h>
+#include <phys-map.h>
+#include <occ.h>
+
+static LIST_HEAD(psis);
+static u64 psi_link_timer;
+static u64 psi_link_timeout;
+static bool psi_link_poll_active;
+
+static void psi_activate_phb(struct psi *psi);
+
+struct lock psi_lock = LOCK_UNLOCKED;
+
+DEFINE_LOG_ENTRY(OPAL_RC_PSI_TIMEOUT, OPAL_PLATFORM_ERR_EVT, OPAL_PSI,
+ OPAL_PLATFORM_FIRMWARE,
+ OPAL_UNRECOVERABLE_ERR_LOSS_OF_FUNCTION, OPAL_NA);
+
+void psi_set_link_polling(bool active)
+{
+ printf("PSI: %sing link polling\n",
+ active ? "start" : "stopp");
+ psi_link_poll_active = active;
+}
+
+void psi_disable_link(struct psi *psi)
+{
+ lock(&psi_lock);
+
+ /*
+ * Note: This can be called with the link already down but
+ * not detected as such yet by this layer since psi_check_link_active()
+ * operates locklessly and thus won't update the PSI structure. This
+ * is a non-issue; the only consequence is log messages that first
+ * mention the link having gone down and then it being disabled.
+ */
+ if (psi->active) {
+ u64 reg;
+ psi->active = false;
+
+ /* Mask errors in SEMR */
+ reg = in_be64(psi->regs + PSIHB_SEMR);
+ reg &= ((0xfffull << 36) | (0xfffull << 20));
+ out_be64(psi->regs + PSIHB_SEMR, reg);
+ printf("PSI: SEMR set to %llx\n", reg);
+
+ /* Reset all the error bits in PSIHB_CR and
+ * disable FSP interrupts
+ */
+ reg = in_be64(psi->regs + PSIHB_CR);
+ reg &= ~(0x7ffull << 20);
+ reg &= ~PSIHB_CR_PSI_LINK_ENABLE; /* flip link enable */
+ /*
+ * Ensure no commands/spurious interrupts reach
+ * the processor, by flipping the command enable.
+ */
+ reg &= ~PSIHB_CR_FSP_CMD_ENABLE;
+ reg &= ~PSIHB_CR_FSP_IRQ_ENABLE;
+ reg &= ~PSIHB_CR_FSP_IRQ; /* Clear interrupt state too */
+ printf("PSI[0x%03x]: Disabling link!\n", psi->chip_id);
+ out_be64(psi->regs + PSIHB_CR, reg);
+ printf("PSI: PSIHB_CR (error bits) set to %llx\n",
+ in_be64(psi->regs + PSIHB_CR));
+ psi_set_link_polling(true);
+ }
+
+ unlock(&psi_lock);
+}
+
+/*
+ * Resetting the FSP is a multi step sequence:
+ * 1. Read the PSIHBCR
+ * 2. Set the PSIHBCR[6] -- write register back.
+ * 3. Read PSIHBCR again
+ * 4. Reset PSIHBCR[6] -- write register back.
+ */
+void psi_reset_fsp(struct psi *psi)
+{
+ lock(&psi_lock);
+
+ if (psi->active) {
+ u64 reg;
+
+ printf("PSI: Driving FSP reset via PSI\n");
+ reg = in_be64(psi->regs + PSIHB_CR);
+ reg &= ~(0xfffull << 20); /* Reset error bits */
+ reg |= PSIHB_CR_FSP_RESET; /* FSP reset trigger start */
+ out_be64(psi->regs + PSIHB_CR, reg);
+ printf("PSI[0x%03x]: FSP reset start PSIHBCR set to %llx\n",
+ psi->chip_id, in_be64(psi->regs + PSIHB_CR));
+
+ reg = in_be64(psi->regs + PSIHB_CR);
+ reg &= ~PSIHB_CR_FSP_RESET; /* Clear FSP reset bit */
+ out_be64(psi->regs + PSIHB_CR, reg); /* Complete reset */
+ printf("PSI[0x%03x]: FSP reset complete. PSIHBCR set to %llx\n",
+ psi->chip_id, in_be64(psi->regs + PSIHB_CR));
+ }
+ unlock(&psi_lock);
+
+ /* Now bring down the PSI link too... */
+ psi_disable_link(psi);
+}
+
+bool psi_check_link_active(struct psi *psi)
+{
+ u64 val = in_be64(psi->regs + PSIHB_CR);
+
+ /*
+ * Unlocked, used during fsp_poke_msg so we really want
+ * to avoid fancy link re-entrancy and deadlocks here
+ */
+ if (!psi->active)
+ return false;
+ return (val & PSIHB_CR_PSI_LINK_ENABLE) &&
+ (val & PSIHB_CR_FSP_LINK_ACTIVE);
+}
+
+struct psi *psi_find_link(uint32_t chip_id)
+{
+ struct psi *psi;
+
+ list_for_each(&psis, psi, list) {
+ if (psi->chip_id == chip_id)
+ return psi;
+ }
+ return NULL;
+}
+
+#define PSI_LINK_CHECK_INTERVAL 10 /* Interval in secs */
+#define PSI_LINK_RECOVERY_TIMEOUT 1800 /* 30 minutes */
+
+static void psi_link_poll(void *data __unused)
+{
+ struct psi *psi;
+ u64 now;
+
+ if (!psi_link_poll_active)
+ return;
+
+ now = mftb();
+ if (psi_link_timer == 0 ||
+ (tb_compare(now, psi_link_timer) == TB_AAFTERB) ||
+ (tb_compare(now, psi_link_timer) == TB_AEQUALB)) {
+
+ lock(&psi_lock);
+
+ list_for_each(&psis, psi, list) {
+ u64 val;
+
+ if (psi->active)
+ continue;
+
+ val = in_be64(psi->regs + PSIHB_CR);
+
+ printf("PSI[0x%03x]: Poll CR=0x%016llx\n",
+ psi->chip_id, val);
+
+ if ((val & PSIHB_CR_PSI_LINK_ENABLE) &&
+ (val & PSIHB_CR_FSP_LINK_ACTIVE)) {
+ printf("PSI[0x%03x]: Found active link!\n",
+ psi->chip_id);
+ psi_link_timeout = 0;
+ psi->active = true;
+ psi_activate_phb(psi);
+ psi_set_link_polling(false);
+ unlock(&psi_lock);
+ if (platform.psi && platform.psi->link_established)
+ platform.psi->link_established();
+ return;
+ }
+ }
+ if (!psi_link_timeout)
+ psi_link_timeout =
+ now + secs_to_tb(PSI_LINK_RECOVERY_TIMEOUT);
+
+ if (tb_compare(now, psi_link_timeout) == TB_AAFTERB) {
+ log_simple_error(&e_info(OPAL_RC_PSI_TIMEOUT),
+ "PSI: Link timeout -- loss of FSP\n");
+ /* Reset the link timeout and continue looking */
+ psi_link_timeout = 0;
+ }
+
+ /* Poll every 10 seconds */
+ psi_link_timer = now + secs_to_tb(PSI_LINK_CHECK_INTERVAL);
+
+ unlock(&psi_lock);
+ }
+}
+
+void psi_enable_fsp_interrupt(struct psi *psi)
+{
+ /* Enable FSP interrupts in the GXHB */
+ lock(&psi_lock);
+ out_be64(psi->regs + PSIHB_CR,
+ in_be64(psi->regs + PSIHB_CR) | PSIHB_CR_FSP_IRQ_ENABLE);
+ unlock(&psi_lock);
+}
+
+/* Multiple bits can be set on errors */
+static void decode_psihb_error(u64 val)
+{
+ if (val & PSIHB_CR_PSI_ERROR)
+ printf("PSI: PSI Reported Error\n");
+ if (val & PSIHB_CR_PSI_LINK_INACTIVE)
+ printf("PSI: PSI Link Inactive Transition\n");
+ if (val & PSIHB_CR_FSP_ACK_TIMEOUT)
+ printf("PSI: FSP Ack Timeout\n");
+ if (val & PSIHB_CR_MMIO_LOAD_TIMEOUT)
+ printf("PSI: MMIO Load Timeout\n");
+ if (val & PSIHB_CR_MMIO_LENGTH_ERROR)
+ printf("PSI: MMIO Length Error\n");
+ if (val & PSIHB_CR_MMIO_ADDRESS_ERROR)
+ printf("PSI: MMIO Address Error\n");
+ if (val & PSIHB_CR_MMIO_TYPE_ERROR)
+ printf("PSI: MMIO Type Error\n");
+ if (val & PSIHB_CR_UE)
+ printf("PSI: UE Detected\n");
+ if (val & PSIHB_CR_PARITY_ERROR)
+ printf("PSI: Internal Parity Error\n");
+ if (val & PSIHB_CR_SYNC_ERR_ALERT1)
+ printf("PSI: Sync Error Alert1\n");
+ if (val & PSIHB_CR_SYNC_ERR_ALERT2)
+ printf("PSI: Sync Error Alert2\n");
+ if (val & PSIHB_CR_FSP_COMMAND_ERROR)
+ printf("PSI: FSP Command Error\n");
+}
+
+
+static void handle_psi_interrupt(struct psi *psi, u64 val)
+{
+ printf("PSI[0x%03x]: PSI mgmnt interrupt CR=0x%016llx\n",
+ psi->chip_id, val);
+
+ if (val & (0xfffull << 20)) {
+ decode_psihb_error(val);
+ psi_disable_link(psi);
+ } else if (val & (0x1full << 11))
+ printf("PSI: FSP error detected\n");
+}
+
+static void psi_spurious_fsp_irq(struct psi *psi)
+{
+ u64 reg, bit;
+
+ prlog(PR_NOTICE, "PSI: Spurious interrupt, attempting clear\n");
+
+ if (proc_gen == proc_gen_p10) {
+ reg = PSIHB_XSCOM_P10_HBCSR_CLR;
+ bit = PSIHB_XSCOM_P10_HBSCR_FSP_IRQ;
+ } else if (proc_gen == proc_gen_p9) {
+ reg = PSIHB_XSCOM_P9_HBCSR_CLR;
+ bit = PSIHB_XSCOM_P9_HBSCR_FSP_IRQ;
+ } else if (proc_gen == proc_gen_p8) {
+ reg = PSIHB_XSCOM_P8_HBCSR_CLR;
+ bit = PSIHB_XSCOM_P8_HBSCR_FSP_IRQ;
+ } else {
+ assert(false);
+ }
+ xscom_write(psi->chip_id, psi->xscom_base + reg, bit);
+}
+
+bool psi_poll_fsp_interrupt(struct psi *psi)
+{
+ return !!(in_be64(psi->regs + PSIHB_CR) & PSIHB_CR_FSP_IRQ);
+}
+
+static void psihb_interrupt(struct irq_source *is, uint32_t isn __unused)
+{
+ struct psi *psi = is->data;
+ u64 val;
+
+ val = in_be64(psi->regs + PSIHB_CR);
+
+ if (psi_link_poll_active) {
+ printf("PSI[0x%03x]: PSI interrupt CR=0x%016llx (A=%d)\n",
+ psi->chip_id, val, psi->active);
+ }
+
+ /* Handle PSI interrupts first in case it's a link down */
+ if (val & PSIHB_CR_PSI_IRQ) {
+ handle_psi_interrupt(psi, val);
+
+ /*
+ * If the link went down, re-read PSIHB_CR as
+ * the FSP interrupt might have been cleared.
+ */
+ if (!psi->active)
+ val = in_be64(psi->regs + PSIHB_CR);
+ }
+
+
+ /*
+ * We avoid forwarding FSP interrupts if the link isn't
+ * active. They should be masked anyway but it looks
+ * like the CR bit can remain set.
+ */
+ if (val & PSIHB_CR_FSP_IRQ) {
+ /*
+ * We have seen cases of a flood of FSP mailbox interrupts
+ * when the link is down; see if we can clear
+ * the condition.
+ */
+ if (!psi->active)
+ psi_spurious_fsp_irq(psi);
+ else {
+ if (platform.psi && platform.psi->fsp_interrupt)
+ platform.psi->fsp_interrupt();
+ }
+ }
+
+ if (platform.psi && platform.psi->psihb_interrupt)
+ platform.psi->psihb_interrupt();
+}
+
+
+static const uint32_t psi_p8_irq_to_xivr[P8_IRQ_PSI_IRQ_COUNT] = {
+ [P8_IRQ_PSI_FSP] = PSIHB_XIVR_FSP,
+ [P8_IRQ_PSI_OCC] = PSIHB_XIVR_OCC,
+ [P8_IRQ_PSI_FSI] = PSIHB_XIVR_FSI,
+ [P8_IRQ_PSI_LPC] = PSIHB_XIVR_LPC,
+ [P8_IRQ_PSI_LOCAL_ERR] = PSIHB_XIVR_LOCAL_ERR,
+ [P8_IRQ_PSI_EXTERNAL]= PSIHB_XIVR_HOST_ERR,
+};
+
+static void psi_cleanup_irq(struct psi *psi)
+{
+ uint32_t irq;
+ uint64_t xivr, xivr_p;
+
+ for (irq = 0; irq < P8_IRQ_PSI_IRQ_COUNT; irq++) {
+ prlog(PR_DEBUG, "PSI[0x%03x]: Cleaning up IRQ %d\n",
+ psi->chip_id, irq);
+
+ xivr_p = psi_p8_irq_to_xivr[irq];
+ xivr = in_be64(psi->regs + xivr_p);
+ xivr |= (0xffull << 32);
+ out_be64(psi->regs + xivr_p, xivr);
+ time_wait_ms_nopoll(10);
+ xivr = in_be64(psi->regs + xivr_p);
+ if (xivr & PPC_BIT(39)) {
+ printf(" Need EOI !\n");
+ icp_send_eoi(psi->interrupt + irq);
+ }
+ }
+}
+
+/* Called on a fast reset, make sure we aren't stuck with
+ * an accepted and never EOId PSI interrupt
+ */
+void psi_irq_reset(void)
+{
+ struct psi *psi;
+
+ printf("PSI: Hot reset!\n");
+
+ assert(proc_gen == proc_gen_p8);
+
+ list_for_each(&psis, psi, list) {
+ psi_cleanup_irq(psi);
+ }
+}
+
+static int64_t psi_p8_set_xive(struct irq_source *is, uint32_t isn,
+ uint16_t server, uint8_t priority)
+{
+ struct psi *psi = is->data;
+ uint64_t xivr_p, xivr;
+ uint32_t irq_idx = isn & 7;
+
+ if (irq_idx >= P8_IRQ_PSI_IRQ_COUNT)
+ return OPAL_PARAMETER;
+ xivr_p = psi_p8_irq_to_xivr[irq_idx];
+
+ /* Populate the XIVR */
+ xivr = (uint64_t)server << 40;
+ xivr |= (uint64_t)priority << 32;
+ xivr |= (uint64_t)(isn & 7) << 29;
+
+ out_be64(psi->regs + xivr_p, xivr);
+
+ return OPAL_SUCCESS;
+}
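+
+/*
+ * Worked example (illustrative values): for server 0x0001, priority 0xff
+ * and source index 2, the XIVR built above is
+ *	(0x0001ull << 40) | (0xffull << 32) | (2ull << 29)
+ *	= 0x000001ff40000000
+ */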
+
+static int64_t psi_p8_get_xive(struct irq_source *is, uint32_t isn __unused,
+ uint16_t *server, uint8_t *priority)
+{
+ struct psi *psi = is->data;
+ uint64_t xivr_p, xivr;
+ uint32_t irq_idx = isn & 7;
+
+ if (irq_idx >= P8_IRQ_PSI_IRQ_COUNT)
+ return OPAL_PARAMETER;
+
+ xivr_p = psi_p8_irq_to_xivr[irq_idx];
+
+ /* Read & decode the XIVR */
+ xivr = in_be64(psi->regs + xivr_p);
+
+ *server = (xivr >> 40) & 0xffff;
+ *priority = (xivr >> 32) & 0xff;
+
+ return OPAL_SUCCESS;
+}
+
+static void psihb_p8_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct psi *psi = is->data;
+ uint32_t idx = isn - psi->interrupt;
+
+ switch (idx) {
+ case P8_IRQ_PSI_FSP:
+ psihb_interrupt(is, isn);
+ break;
+ case P8_IRQ_PSI_OCC:
+ occ_p8_interrupt(psi->chip_id);
+ break;
+ case P8_IRQ_PSI_FSI:
+ printf("PSI: FSI irq received\n");
+ break;
+ case P8_IRQ_PSI_LPC:
+ lpc_interrupt(psi->chip_id);
+
+ /*
+ * i2c interrupts are ORed with the LPC ones on
+ * Murano DD2.1 and Venice DD2.0
+ */
+ p8_i2c_interrupt(psi->chip_id);
+ break;
+ case P8_IRQ_PSI_LOCAL_ERR:
+ prd_psi_interrupt(psi->chip_id);
+ break;
+ case P8_IRQ_PSI_EXTERNAL:
+ if (platform.external_irq)
+ platform.external_irq(psi->chip_id);
+ break;
+ }
+
+ /*
+ * TODO: Per Vicente Chung, CRESPs don't generate interrupts,
+ * and are just informational. Need to define the policy
+ * to handle them.
+ */
+}
+
+static uint64_t psi_p8_irq_attributes(struct irq_source *is, uint32_t isn)
+{
+ struct psi *psi = is->data;
+ uint32_t idx = isn - psi->interrupt;
+ uint64_t attr;
+
+ if (psi->no_lpc_irqs && idx == P8_IRQ_PSI_LPC)
+ return IRQ_ATTR_TARGET_LINUX;
+
+ /* Only direct external interrupts to OPAL if we have a handler */
+ if (idx == P8_IRQ_PSI_EXTERNAL && !platform.external_irq)
+ return IRQ_ATTR_TARGET_LINUX;
+
+ attr = IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TYPE_LSI;
+ if (idx == P8_IRQ_PSI_EXTERNAL || idx == P8_IRQ_PSI_LPC ||
+ idx == P8_IRQ_PSI_FSP)
+ attr |= IRQ_ATTR_TARGET_FREQUENT;
+ return attr;
+}
+
+static char *psi_p8_irq_name(struct irq_source *is, uint32_t isn)
+{
+ struct psi *psi = is->data;
+ uint32_t idx = isn - psi->interrupt;
+ char tmp[30];
+
+ static const char *names[P8_IRQ_PSI_IRQ_COUNT] = {
+ "fsp",
+ "occ",
+ "fsi",
+ "lpchc",
+ "local_err",
+ "external",
+ };
+
+ if (idx >= P8_IRQ_PSI_IRQ_COUNT)
+ return NULL;
+
+ snprintf(tmp, sizeof(tmp), "psi#%x:%s",
+ psi->chip_id, names[idx]);
+
+ return strdup(tmp);
+}
+
+static const struct irq_source_ops psi_p8_irq_ops = {
+ .get_xive = psi_p8_get_xive,
+ .set_xive = psi_p8_set_xive,
+ .interrupt = psihb_p8_interrupt,
+ .attributes = psi_p8_irq_attributes,
+ .name = psi_p8_irq_name,
+};
+
+static const char *psi_p9_irq_names[P9_PSI_NUM_IRQS] = {
+ "fsp",
+ "occ",
+ "fsi",
+ "lpchc",
+ "local_err",
+ "global_err",
+ "external",
+ "lpc_serirq_mux0", /* Have a callback to get name ? */
+ "lpc_serirq_mux1", /* Have a callback to get name ? */
+ "lpc_serirq_mux2", /* Have a callback to get name ? */
+ "lpc_serirq_mux3", /* Have a callback to get name ? */
+ "i2c",
+ "dio",
+ "psu"
+};
+
+static void psi_p9_mask_all(struct psi *psi)
+{
+ struct irq_source *is;
+ int isn;
+
+ /* Mask all sources */
+ is = irq_find_source(psi->interrupt);
+ for (isn = is->start; isn < is->end; isn++)
+ xive_source_mask(is, isn);
+}
+
+static void psi_p9_mask_unhandled_irq(struct irq_source *is, uint32_t isn)
+{
+ struct psi *psi = is->data;
+ int idx = isn - psi->interrupt;
+ const char *name;
+
+ if (idx < ARRAY_SIZE(psi_p9_irq_names))
+ name = psi_p9_irq_names[idx];
+ else
+ name = "unknown!";
+
+ prerror("PSI[0x%03x]: Masking unhandled LSI %d (%s)\n",
+ psi->chip_id, idx, name);
+
+ /*
+ * All the PSI interrupts are LSIs and will be constantly re-fired
+ * unless the underlying interrupt condition is cleared. If we don't
+ * have a handler for the interrupt then it needs to be masked to
+ * prevent the IRQ from locking up the thread which handles it.
+ */
+ switch (proc_gen) {
+ case proc_gen_p9:
+ xive_source_mask(is, isn);
+ break;
+ case proc_gen_p10:
+ xive2_source_mask(is, isn);
+ return;
+ default:
+ assert(false);
+ }
+
+}
+
+static void psihb_p9_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct psi *psi = is->data;
+ uint32_t idx = isn - psi->interrupt;
+
+ switch (idx) {
+ case P9_PSI_IRQ_PSI:
+ psihb_interrupt(is, isn);
+ break;
+ case P9_PSI_IRQ_OCC:
+ occ_p9_interrupt(psi->chip_id);
+ break;
+ case P9_PSI_IRQ_LPCHC:
+ lpc_interrupt(psi->chip_id);
+ break;
+ case P9_PSI_IRQ_LOCAL_ERR:
+ prd_psi_interrupt(psi->chip_id);
+ break;
+ case P9_PSI_IRQ_EXTERNAL:
+ if (platform.external_irq)
+ platform.external_irq(psi->chip_id);
+ else
+ psi_p9_mask_unhandled_irq(is, isn);
+ break;
+ case P9_PSI_IRQ_LPC_SIRQ0:
+ case P9_PSI_IRQ_LPC_SIRQ1:
+ case P9_PSI_IRQ_LPC_SIRQ2:
+ case P9_PSI_IRQ_LPC_SIRQ3:
+ lpc_serirq(psi->chip_id, idx - P9_PSI_IRQ_LPC_SIRQ0);
+ break;
+ case P9_PSI_IRQ_SBE_I2C:
+ p8_i2c_interrupt(psi->chip_id);
+ break;
+ case P9_PSI_IRQ_DIO:
+ printf("PSI: DIO irq received\n");
+ dio_interrupt_handler(psi->chip_id);
+ break;
+ case P9_PSI_IRQ_PSU:
+ p9_sbe_interrupt(psi->chip_id);
+ break;
+
+ default:
+ psi_p9_mask_unhandled_irq(is, isn);
+ }
+}
+
+static uint64_t psi_p9_irq_attributes(struct irq_source *is,
+ uint32_t isn)
+{
+ struct psi *psi = is->data;
+ unsigned int idx = isn & 0xf;
+ bool is_lpc_serirq;
+
+ is_lpc_serirq =
+ (idx == P9_PSI_IRQ_LPC_SIRQ0 ||
+ idx == P9_PSI_IRQ_LPC_SIRQ1 ||
+ idx == P9_PSI_IRQ_LPC_SIRQ2 ||
+ idx == P9_PSI_IRQ_LPC_SIRQ3);
+
+ /* If LPC interrupts are disabled, route them to Linux
+ * (who will not request them since they aren't referenced
+ * in the device tree)
+ */
+ if (is_lpc_serirq && psi->no_lpc_irqs)
+ return IRQ_ATTR_TARGET_LINUX;
+
+ /* For serirq, check the LPC layer for policy */
+ if (is_lpc_serirq)
+ return lpc_get_irq_policy(psi->chip_id, idx - P9_PSI_IRQ_LPC_SIRQ0);
+
+ /* Only direct external interrupts to OPAL if we have a handler */
+ if (idx == P9_PSI_IRQ_EXTERNAL && !platform.external_irq)
+ return IRQ_ATTR_TARGET_LINUX | IRQ_ATTR_TYPE_LSI;
+
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TYPE_LSI;
+}
+
+static char *psi_p9_irq_name(struct irq_source *is, uint32_t isn)
+{
+ struct psi *psi = is->data;
+ uint32_t idx = isn - psi->interrupt;
+ char tmp[30];
+
+ if (idx >= ARRAY_SIZE(psi_p9_irq_names))
+ return NULL;
+
+ snprintf(tmp, sizeof(tmp), "psi#%x:%s",
+ psi->chip_id, psi_p9_irq_names[idx]);
+
+ return strdup(tmp);
+}
+
+static const struct irq_source_ops psi_p9_irq_ops = {
+ .interrupt = psihb_p9_interrupt,
+ .attributes = psi_p9_irq_attributes,
+ .name = psi_p9_irq_name,
+};
+
+static void psi_init_p8_interrupts(struct psi *psi)
+{
+ uint32_t irq;
+ uint64_t xivr_p;
+
+ /* On P8 we get a block of 8, set up the base/mask
+ * and mask all the sources for now
+ */
+ out_be64(psi->regs + PSIHB_IRSN,
+ SETFIELD(PSIHB_IRSN_COMP, 0ul, psi->interrupt) |
+ SETFIELD(PSIHB_IRSN_MASK, 0ul, 0x7fff8ul) |
+ PSIHB_IRSN_DOWNSTREAM_EN |
+ PSIHB_IRSN_UPSTREAM_EN);
+
+ for (irq = 0; irq < P8_IRQ_PSI_IRQ_COUNT; irq++) {
+ xivr_p = psi_p8_irq_to_xivr[irq];
+ out_be64(psi->regs + xivr_p, (0xffull << 32) | (irq << 29));
+ }
+
+ /*
+ * Register the IRQ sources FSP, OCC, FSI, LPC
+ * and Local Error. Host Error is actually the
+ * external interrupt and the policy for that comes
+ * from the platform
+ */
+ register_irq_source(&psi_p8_irq_ops, psi,
+ psi->interrupt, P8_IRQ_PSI_IRQ_COUNT);
+}
+
+static void psi_init_p9_interrupts(struct psi *psi)
+{
+ struct proc_chip *chip;
+ u64 val;
+
+ /* Grab chip */
+ chip = get_chip(psi->chip_id);
+ if (!chip)
+ return;
+
+ /* Configure the CI BAR */
+ phys_map_get(chip->id, PSIHB_ESB, 0, &val, NULL);
+ val |= PSIHB_ESB_CI_VALID;
+ out_be64(psi->regs + PSIHB_ESB_CI_BASE, val);
+
+ val = in_be64(psi->regs + PSIHB_ESB_CI_BASE);
+ psi->esb_mmio = (void *)(val & ~PSIHB_ESB_CI_VALID);
+ prlog(PR_DEBUG, "PSI[0x%03x]: ESB MMIO at @%p\n",
+ psi->chip_id, psi->esb_mmio);
+
+ /* Register sources */
+ prlog(PR_DEBUG,
+ "PSI[0x%03x]: Interrupts sources registered for P9 DD2.x\n",
+ psi->chip_id);
+ xive_register_hw_source(psi->interrupt, P9_PSI_NUM_IRQS,
+ 12, psi->esb_mmio, XIVE_SRC_LSI,
+ psi, &psi_p9_irq_ops);
+
+ psi_p9_mask_all(psi);
+
+ /* Setup interrupt offset */
+ val = xive_get_notify_base(psi->interrupt);
+ val <<= 32;
+ out_be64(psi->regs + PSIHB_IVT_OFFSET, val);
+
+ /* Grab and configure the notification port */
+ val = xive_get_notify_port(psi->chip_id, XIVE_HW_SRC_PSI);
+ val |= PSIHB_ESB_NOTIF_VALID;
+ out_be64(psi->regs + PSIHB_ESB_NOTIF_ADDR, val);
+
+ /* Reset irq handling and switch to ESB mode */
+ out_be64(psi->regs + PSIHB_INTERRUPT_CONTROL, PSIHB_IRQ_RESET);
+ out_be64(psi->regs + PSIHB_INTERRUPT_CONTROL, 0);
+}
+
+/*
+ * P9 and P10 have the same PSIHB interface
+ */
+static const struct irq_source_ops psi_p10_irq_ops = {
+ .interrupt = psihb_p9_interrupt,
+ .attributes = psi_p9_irq_attributes,
+ .name = psi_p9_irq_name,
+};
+
+#define PSIHB10_CAN_STORE_EOI(x) XIVE2_STORE_EOI_ENABLED
+
+static void psi_init_p10_interrupts(struct psi *psi)
+{
+ struct proc_chip *chip;
+ u64 val;
+ uint32_t esb_shift = 16;
+ uint32_t flags = XIVE_SRC_LSI;
+ struct irq_source *is;
+ int isn;
+
+ /* Grab chip */
+ chip = get_chip(psi->chip_id);
+ if (!chip)
+ return;
+
+ /* Configure the CI BAR */
+ phys_map_get(chip->id, PSIHB_ESB, 0, &val, NULL);
+ val |= PSIHB_ESB_CI_VALID;
+ if (esb_shift == 16)
+ val |= PSIHB10_ESB_CI_64K;
+ out_be64(psi->regs + PSIHB_ESB_CI_BASE, val);
+
+ val = in_be64(psi->regs + PSIHB_ESB_CI_BASE);
+ psi->esb_mmio = (void *)(val & ~(PSIHB_ESB_CI_VALID|PSIHB10_ESB_CI_64K));
+ prlog(PR_DEBUG, "PSI[0x%03x]: ESB MMIO at @%p\n",
+ psi->chip_id, psi->esb_mmio);
+
+ /* Store EOI */
+ if (PSIHB10_CAN_STORE_EOI(psi)) {
+ val = in_be64(psi->regs + PSIHB_CR);
+ val |= PSIHB10_CR_STORE_EOI;
+ out_be64(psi->regs + PSIHB_CR, val);
+ prlog(PR_DEBUG, "PSI[0x%03x]: store EOI is enabled\n",
+ psi->chip_id);
+ flags |= XIVE_SRC_STORE_EOI;
+ }
+
+ /* Register sources */
+ prlog(PR_DEBUG,
+ "PSI[0x%03x]: Interrupts sources registered for P10 DD%i.%i\n",
+ psi->chip_id, 0xf & (chip->ec_level >> 4), chip->ec_level & 0xf);
+
+ xive2_register_hw_source(psi->interrupt, P9_PSI_NUM_IRQS,
+ esb_shift, psi->esb_mmio, flags,
+ psi, &psi_p10_irq_ops);
+
+ /* Mask all sources */
+ is = irq_find_source(psi->interrupt);
+ for (isn = is->start; isn < is->end; isn++)
+ xive2_source_mask(is, isn);
+
+ /* Setup interrupt offset */
+ val = xive2_get_notify_base(psi->interrupt);
+ val <<= 32;
+ out_be64(psi->regs + PSIHB_IVT_OFFSET, val);
+
+ /* Grab and configure the notification port */
+ val = xive2_get_notify_port(psi->chip_id, XIVE_HW_SRC_PSI);
+ val |= PSIHB_ESB_NOTIF_VALID;
+ out_be64(psi->regs + PSIHB_ESB_NOTIF_ADDR, val);
+
+ /* Reset irq handling and switch to ESB mode */
+ out_be64(psi->regs + PSIHB_INTERRUPT_CONTROL, PSIHB_IRQ_RESET);
+ out_be64(psi->regs + PSIHB_INTERRUPT_CONTROL, 0);
+}
+
+static void psi_init_interrupts(struct psi *psi)
+{
+ /* Configure the interrupt BUID and mask it */
+ switch (proc_gen) {
+ case proc_gen_p8:
+ psi_init_p8_interrupts(psi);
+ break;
+ case proc_gen_p9:
+ psi_init_p9_interrupts(psi);
+ break;
+ case proc_gen_p10:
+ psi_init_p10_interrupts(psi);
+ break;
+ default:
+ /* Unknown: just no interrupts */
+ prerror("PSI: Unknown interrupt type\n");
+ }
+}
+
+static void psi_activate_phb(struct psi *psi)
+{
+ u64 reg;
+
+ /*
+ * Disable interrupt emission in the control register;
+ * it will be re-enabled later, once the mailbox
+ * interrupt has been enabled.
+ */
+ reg = in_be64(psi->regs + PSIHB_CR);
+ reg &= ~PSIHB_CR_FSP_IRQ_ENABLE;
+ out_be64(psi->regs + PSIHB_CR, reg);
+
+ /* Enable interrupts in the mask register. We enable everything
+ * except for bit "FSP command error detected" which the doc
+ * (P7 BookIV) says should be masked for normal ops. It also
+ * seems to be masked under OPAL.
+ */
+ reg = 0x0000010000100000ull;
+ out_be64(psi->regs + PSIHB_SEMR, reg);
+
+#if 0
+ /* Dump the GXHB registers */
+ printf(" PSIHB_BBAR : %llx\n",
+ in_be64(psi->regs + PSIHB_BBAR));
+ printf(" PSIHB_FSPBAR : %llx\n",
+ in_be64(psi->regs + PSIHB_FSPBAR));
+ printf(" PSIHB_FSPMMR : %llx\n",
+ in_be64(psi->regs + PSIHB_FSPMMR));
+ printf(" PSIHB_TAR : %llx\n",
+ in_be64(psi->regs + PSIHB_TAR));
+ printf(" PSIHB_CR : %llx\n",
+ in_be64(psi->regs + PSIHB_CR));
+ printf(" PSIHB_SEMR : %llx\n",
+ in_be64(psi->regs + PSIHB_SEMR));
+ printf(" PSIHB_XIVR : %llx\n",
+ in_be64(psi->regs + PSIHB_XIVR));
+#endif
+}
+
+static void psi_create_p9_int_map(struct psi *psi, struct dt_node *np)
+{
+ __be32 map[P9_PSI_NUM_IRQS][4];
+ int i;
+
+ for (i = 0; i < P9_PSI_NUM_IRQS; i++) {
+ map[i][0] = cpu_to_be32(i);
+ map[i][1] = cpu_to_be32(get_ics_phandle());
+ map[i][2] = cpu_to_be32(psi->interrupt + i);
+ map[i][3] = cpu_to_be32(1);
+ }
+ dt_add_property(np, "interrupt-map", map, sizeof(map));
+ dt_add_property_cells(np, "#address-cells", 0);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+}
+
+static void psi_create_mm_dtnode(struct psi *psi)
+{
+ struct dt_node *np;
+ uint64_t addr = (uint64_t)psi->regs;
+
+ np = dt_new_addr(dt_root, "psi", addr);
+ if (!np)
+ return;
+
+ /* Hard wire size to 4G */
+ dt_add_property_u64s(np, "reg", addr, 0x100000000ull);
+ switch (proc_gen) {
+ case proc_gen_p8:
+ dt_add_property_strings(np, "compatible", "ibm,psi",
+ "ibm,power8-psi");
+ break;
+ case proc_gen_p9:
+ case proc_gen_p10:
+ dt_add_property_strings(np, "compatible", "ibm,psi",
+ "ibm,power9-psi");
+ psi_create_p9_int_map(psi, np);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ dt_add_property_cells(np, "interrupt-parent", get_ics_phandle());
+ dt_add_property_cells(np, "interrupts", psi->interrupt, 1);
+ dt_add_property_cells(np, "ibm,chip-id", psi->chip_id);
+ psi->node = np;
+}
+
+static struct psi *alloc_psi(struct proc_chip *chip, uint64_t base)
+{
+ struct psi *psi;
+
+ psi = zalloc(sizeof(struct psi));
+ if (!psi) {
+ prerror("PSI: Could not allocate memory\n");
+ return NULL;
+ }
+ psi->xscom_base = base;
+ psi->chip_id = chip->id;
+ return psi;
+}
+
+static struct psi *psi_probe_p8(struct proc_chip *chip, u64 base)
+{
+ struct psi *psi = NULL;
+ uint64_t rc, val;
+
+ rc = xscom_read(chip->id, base + PSIHB_XSCOM_P8_BASE, &val);
+ if (rc) {
+ prerror("PSI[0x%03x]: Error %llx reading PSIHB BAR\n",
+ chip->id, rc);
+ return NULL;
+ }
+ if (val & PSIHB_XSCOM_P8_HBBAR_EN) {
+ psi = alloc_psi(chip, base);
+ if (!psi)
+ return NULL;
+ psi->regs = (void *)(val & ~PSIHB_XSCOM_P8_HBBAR_EN);
+ psi->interrupt = get_psi_interrupt(chip->id);
+ } else
+ printf("PSI[0x%03x]: Working chip not found\n", chip->id);
+
+ return psi;
+}
+
+static struct psi *psi_probe_p9(struct proc_chip *chip, u64 base)
+{
+ struct psi *psi = NULL;
+ uint64_t addr;
+
+ phys_map_get(chip->id, PSIHB_REG, 0, &addr, NULL);
+ xscom_write(chip->id, base + PSIHB_XSCOM_P9_BASE,
+ addr | PSIHB_XSCOM_P9_HBBAR_EN);
+
+ psi = alloc_psi(chip, base);
+ if (!psi)
+ return NULL;
+ psi->regs = (void *)addr;
+ psi->interrupt = xive_alloc_hw_irqs(chip->id, P9_PSI_NUM_IRQS, 16);
+ return psi;
+}
+
+static struct psi *psi_probe_p10(struct proc_chip *chip, u64 base)
+{
+ struct psi *psi = NULL;
+ uint64_t addr;
+
+ phys_map_get(chip->id, PSIHB_REG, 0, &addr, NULL);
+ xscom_write(chip->id, base + PSIHB_XSCOM_P9_BASE,
+ addr | PSIHB_XSCOM_P9_HBBAR_EN);
+
+ psi = alloc_psi(chip, base);
+ if (!psi)
+ return NULL;
+ psi->regs = (void *)addr;
+ psi->interrupt = xive2_alloc_hw_irqs(chip->id, P9_PSI_NUM_IRQS, 16);
+ return psi;
+}
+
+static bool psi_init_psihb(struct dt_node *psihb)
+{
+ uint32_t chip_id = dt_get_chip_id(psihb);
+ struct proc_chip *chip = get_chip(chip_id);
+ struct psi *psi = NULL;
+ u64 base, val;
+
+ if (!chip) {
+ prerror("PSI: Can't find chip!\n");
+ return false;
+ }
+
+ base = dt_get_address(psihb, 0, NULL);
+
+ if (dt_node_is_compatible(psihb, "ibm,power8-psihb-x"))
+ psi = psi_probe_p8(chip, base);
+ else if (dt_node_is_compatible(psihb, "ibm,power9-psihb-x"))
+ psi = psi_probe_p9(chip, base);
+ else if (dt_node_is_compatible(psihb, "ibm,power10-psihb-x"))
+ psi = psi_probe_p10(chip, base);
+ else {
+ prerror("PSI: Unknown processor type\n");
+ return false;
+ }
+ if (!psi)
+ return false;
+
+ list_add(&psis, &psi->list);
+
+ val = in_be64(psi->regs + PSIHB_CR);
+ if (val & PSIHB_CR_FSP_LINK_ACTIVE) {
+ lock(&psi_lock);
+ psi->active = true;
+ unlock(&psi_lock);
+ }
+ chip->psi = psi;
+
+ if (dt_has_node_property(psihb, "no-lpc-interrupts", NULL))
+ psi->no_lpc_irqs = true;
+
+ psi_activate_phb(psi);
+ psi_init_interrupts(psi);
+ psi_create_mm_dtnode(psi);
+
+ prlog(PR_INFO, "PSI[0x%03x]: Found PSI bridge [active=%d]\n",
+ psi->chip_id, psi->active);
+ return true;
+}
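+
+/*
+ * For reference, the shape of device-tree node this code matches (the
+ * address and values below are illustrative placeholders, not real HDAT
+ * output):
+ *
+ *	psihb@<xscom-offset> {
+ *		compatible = "ibm,power9-psihb-x", "ibm,psihb-x";
+ *		reg = <xscom-offset length>;
+ *		ibm,chip-id = <0>;
+ *	};
+ */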
+
+void psi_fsp_link_in_use(struct psi *psi __unused)
+{
+ static bool poller_created = false;
+
+ /* Do this once only */
+ if (!poller_created) {
+ poller_created = true;
+ opal_add_poller(psi_link_poll, NULL);
+ }
+}
+
+struct psi *psi_find_functional_chip(void)
+{
+ return list_top(&psis, struct psi, list);
+}
+
+void psi_init(void)
+{
+ struct dt_node *np;
+
+ dt_for_each_compatible(dt_root, np, "ibm,psihb-x")
+ psi_init_psihb(np);
+}
+
+
diff --git a/roms/skiboot/hw/sbe-p8.c b/roms/skiboot/hw/sbe-p8.c
new file mode 100644
index 000000000..73fa5f1f2
--- /dev/null
+++ b/roms/skiboot/hw/sbe-p8.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * POWER8 Self Boot Engine (SLW - SLeep/Winkle)
+ *
+ * Copyright 2013-2018 IBM Corp.
+ */
+
+#include <device.h>
+#include <sbe-p8.h>
+#include <skiboot.h>
+#include <timebase.h>
+#include <xscom.h>
+
+/* SLW timer related stuff */
+static bool sbe_has_timer;
+static uint64_t sbe_timer_inc;
+static uint64_t sbe_timer_target;
+static uint32_t sbe_timer_chip;
+static uint64_t sbe_last_gen;
+static uint64_t sbe_last_gen_stamp;
+
+static void p8_sbe_dump_timer_ffdc(void)
+{
+ uint64_t i, val;
+ int64_t rc;
+
+ static const uint32_t dump_regs[] = {
+ 0xe0000, 0xe0001, 0xe0002, 0xe0003,
+ 0xe0004, 0xe0005, 0xe0006, 0xe0007,
+ 0xe0008, 0xe0009, 0xe000a, 0xe000b,
+ 0xe000c, 0xe000d, 0xe000e, 0xe000f,
+ 0xe0010, 0xe0011, 0xe0012, 0xe0013,
+ 0xe0014, 0xe0015, 0xe0016, 0xe0017,
+ 0xe0018, 0xe0019,
+ 0x5001c,
+ 0x50038, 0x50039, 0x5003a, 0x5003b
+ };
+
+ /**
+ * @fwts-label SLWRegisterDump
+ * @fwts-advice An error condition occurred in the sleep/winkle
+ * engine's timer state machine. Dumping debug information to
+ * help root-cause the issue. OPAL/skiboot may be stuck on some
+ * operation that requires the SLW timer state machine (e.g.
+ * core powersaving).
+ */
+ prlog(PR_DEBUG, "SLW: Register state:\n");
+
+ for (i = 0; i < ARRAY_SIZE(dump_regs); i++) {
+ uint32_t reg = dump_regs[i];
+ rc = xscom_read(sbe_timer_chip, reg, &val);
+ if (rc) {
+ prlog(PR_DEBUG, "SLW: XSCOM error %lld reading"
+ " reg 0x%x\n", rc, reg);
+ break;
+ }
+ prlog(PR_DEBUG, "SLW: %5x = %016llx\n", reg, val);
+ }
+}
+
+/* This is called with the timer lock held, so there is no
+ * issue with re-entrancy or concurrence
+ */
+void p8_sbe_update_timer_expiry(uint64_t new_target)
+{
+ uint64_t count, gen, gen2, req, now;
+ int64_t rc;
+
+ if (!sbe_has_timer || new_target == sbe_timer_target)
+ return;
+
+ sbe_timer_target = new_target;
+
+ _xscom_lock();
+ now = mftb();
+ /* Calculate how many increments from now, rounded up */
+ if (now < new_target)
+ count = (new_target - now + sbe_timer_inc - 1) / sbe_timer_inc;
+ else
+ count = 1;
+
+ /* Max counter is 24-bit */
+ if (count > 0xffffff)
+ count = 0xffffff;
+ /* Fabricate update request */
+ req = (1ull << 63) | (count << 32);
+
+ prlog(PR_TRACE, "SLW: TMR expiry: 0x%llx, req: %016llx\n", count, req);
+
+ do {
+ /* Grab generation and spin if odd */
+ for (;;) {
+ rc = _xscom_read(sbe_timer_chip, 0xE0006, &gen, false);
+ if (rc) {
+ prerror("SLW: Error %lld reading tmr gen "
+ " count\n", rc);
+ _xscom_unlock();
+ return;
+ }
+ if (!(gen & 1))
+ break;
+ if (tb_compare(now + msecs_to_tb(1), mftb()) == TB_ABEFOREB) {
+ /**
+ * @fwts-label SLWTimerStuck
+ * @fwts-advice The SLeep/Winkle Engine (SLW)
+ * failed to increment the generation number
+ * within our timeout period (it *should* have
+ * done so within ~10us, not >1ms). OPAL uses
+ * the SLW timer to schedule some operations,
+ * but can fall back to the (much less frequent)
+ * OPAL poller, which, although it does not affect
+ * functionality, runs *much* less frequently.
+ * This could have the effect of slow I2C
+ * operations (for example). It may also mean
+ * that you see an increase in jitter, due
+ * to slow interactions with the SLW.
+ * This error may also occur if the machine
+ * is connected to via soft FSI.
+ */
+ prerror("SLW: timer stuck, falling back to OPAL pollers. You will likely have slower I2C and may have experienced increased jitter.\n");
+ prlog(PR_DEBUG, "SLW: Stuck with odd generation !\n");
+ _xscom_unlock();
+ sbe_has_timer = false;
+ p8_sbe_dump_timer_ffdc();
+ return;
+ }
+ }
+
+ rc = _xscom_write(sbe_timer_chip, 0x5003A, req, false);
+ if (rc) {
+ prerror("SLW: Error %lld writing tmr request\n", rc);
+ _xscom_unlock();
+ return;
+ }
+
+ /* Re-check gen count */
+ rc = _xscom_read(sbe_timer_chip, 0xE0006, &gen2, false);
+ if (rc) {
+ prerror("SLW: Error %lld re-reading tmr gen "
+ " count\n", rc);
+ _xscom_unlock();
+ return;
+ }
+ } while(gen != gen2);
+ _xscom_unlock();
+
+ /* Check if the timer is working. If at least 1ms has elapsed
+ * since the last call to this function, check that the gen
+ * count has changed
+ */
+ if (tb_compare(sbe_last_gen_stamp + msecs_to_tb(1), now)
+ == TB_ABEFOREB) {
+ if (sbe_last_gen == gen) {
+ prlog(PR_ERR,
+ "SLW: Timer appears to not be running !\n");
+ sbe_has_timer = false;
+ p8_sbe_dump_timer_ffdc();
+ }
+ sbe_last_gen = gen;
+ sbe_last_gen_stamp = mftb();
+ }
+
+ prlog(PR_TRACE, "SLW: gen: %llx\n", gen);
+}
+
+bool p8_sbe_timer_ok(void)
+{
+ return sbe_has_timer;
+}
+
+void p8_sbe_init_timer(void)
+{
+ struct dt_node *np;
+ int64_t rc;
+ uint32_t tick_us;
+
+ np = dt_find_compatible_node(dt_root, NULL, "ibm,power8-sbe-timer");
+ if (!np)
+ return;
+
+ sbe_timer_chip = dt_get_chip_id(np);
+ tick_us = dt_prop_get_u32(np, "tick-time-us");
+ sbe_timer_inc = usecs_to_tb(tick_us);
+ sbe_timer_target = ~0ull;
+
+ rc = xscom_read(sbe_timer_chip, 0xE0006, &sbe_last_gen);
+ if (rc) {
+ prerror("SLW: Error %lld reading tmr gen count\n", rc);
+ return;
+ }
+ sbe_last_gen_stamp = mftb();
+
+ prlog(PR_INFO, "SLW: Timer facility on chip %d, resolution %dus\n",
+ sbe_timer_chip, tick_us);
+ sbe_has_timer = true;
+}
diff --git a/roms/skiboot/hw/sbe-p9.c b/roms/skiboot/hw/sbe-p9.c
new file mode 100644
index 000000000..898a1fb56
--- /dev/null
+++ b/roms/skiboot/hw/sbe-p9.c
@@ -0,0 +1,1040 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ *
+ * P9 OPAL - SBE communication driver
+ *
+ * SBE firmware at https://github.com/open-power/sbe
+ *
+ * The P9 chip has a Self Boot Engine (SBE). OPAL uses the SBE for various
+ * purposes like the timer, SCOM access, MPIPL, etc. Every chip has an SBE and
+ * OPAL can communicate with the SBE on all chips. Based on the message type it
+ * selects the appropriate SBE (e.g. schedule a timer on any chip).
+ *
+ * OPAL communicates with the SBE via a set of data and control registers
+ * provided by the PSU block in the P9 chip:
+ * - Four 8 byte registers for the Host to send command packets to the SBE.
+ * - Four 8 byte registers for the SBE to send response packets to the Host.
+ * - Two doorbell registers (one on each side) to alert either party
+ *   when data is placed in the above mentioned data registers. Once the
+ *   Host/SBE reads the incoming data, it should clear the doorbell register.
+ *   The interrupt is disabled as soon as the doorbell register is cleared.
+ *
+ * OPAL - SBE message format:
+ * - OPAL communicates with the SBE via a set of well defined commands.
+ * - Reg0 contains the message header (command class, subclass, flags etc).
+ * - Reg1-3 contain the actual data. If the data is large then the indirect
+ *   method is used (the data is passed via memory and the memory address/size
+ *   is passed in Reg1-3).
+ * - Every message has a defined timeout. The SBE must respond within the
+ *   specified time, otherwise OPAL discards the message and sends an error
+ *   message to the caller.
+ *
+ * Constraints:
+ * - Only one command is accepted in the command buffer until the response for
+ *   the command is enqueued in the response buffer by the SBE.
+ *
+ * Copyright 2017-2019 IBM Corp.
+ */
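+
+/*
+ * Illustrative usage sketch (not part of the driver): callers build a
+ * direct-data command with p9_sbe_mkmsg() and either queue it with
+ * p9_sbe_queue_msg() (asynchronous, with a completion callback) or block
+ * on it with p9_sbe_sync_msg(). The control flag below is one used elsewhere
+ * in this file; the cmd/reg/chip_id values are placeholders.
+ *
+ *	struct p9_sbe_msg *msg;
+ *	int rc;
+ *
+ *	msg = p9_sbe_mkmsg(cmd, SBE_CMD_CTRL_RESP_REQ, reg1, reg2, reg3);
+ *	if (msg)
+ *		rc = p9_sbe_sync_msg(chip_id, msg, true);
+ *
+ * Passing true as the last argument frees the message once it completes.
+ */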
+
+#define pr_fmt(fmt) "SBE: " fmt
+
+#include <chip.h>
+#include <errorlog.h>
+#include <lock.h>
+#include <opal.h>
+#include <opal-dump.h>
+#include <sbe-p9.h>
+#include <skiboot.h>
+#include <timebase.h>
+#include <timer.h>
+#include <trace.h>
+#include <xscom.h>
+
+enum p9_sbe_mbox_state {
+ sbe_mbox_idle = 0, /* Ready to send message */
+ sbe_mbox_send, /* Message sent, waiting for ack/response */
+ sbe_mbox_rr, /* SBE in R/R */
+};
+
+struct p9_sbe {
+ /* Chip ID to send message */
+ u32 chip_id;
+
+ /* List to hold SBE queue messages */
+ struct list_head msg_list;
+
+ struct lock lock;
+
+ enum p9_sbe_mbox_state state;
+
+ /* SBE MBOX message sequence number */
+ u16 cur_seq;
+};
+
+/* Default SBE chip ID */
+static int sbe_default_chip_id = -1;
+
+/* Is SBE timer running? */
+static bool sbe_has_timer = false;
+static bool sbe_timer_in_progress = false;
+static bool has_new_target = false;
+
+/* Inflight and next timer in TB */
+static uint64_t sbe_last_gen_stamp;
+static uint64_t sbe_timer_target;
+
+/* Timer lock */
+static struct lock sbe_timer_lock;
+
+/*
+ * The minimum timeout value for P9 is 500 microseconds. Beyond that, the
+ * SBE timer can handle a granularity of 1 microsecond.
+ */
+#define SBE_TIMER_DEFAULT_US 500
+static uint64_t sbe_timer_def_tb;
+
+/*
+ * Rate limit continuous timer updates.
+ * We can update the inflight timer if the new timer request is earlier than
+ * the inflight one. Limit such updates so that the SBE gets time to handle
+ * FIFO side requests.
+ */
+#define SBE_TIMER_UPDATE_MAX 2
+static uint32_t timer_update_cnt = 0;
+
+/* Timer control message */
+static struct p9_sbe_msg *timer_ctrl_msg;
+
+#define SBE_STATUS_PRI_SHIFT 0x30
+#define SBE_STATUS_SEC_SHIFT 0x20
+
+/* Forward declaration */
+static void p9_sbe_timeout_poll_one(struct p9_sbe *sbe);
+static void p9_sbe_timer_schedule(void);
+
+/* bit 0-15 : Primary status code */
+static inline u16 p9_sbe_get_primary_rc(struct p9_sbe_msg *resp)
+{
+ return (resp->reg[0] >> SBE_STATUS_PRI_SHIFT);
+}
+
+static inline void p9_sbe_set_primary_rc(struct p9_sbe_msg *resp, u64 rc)
+{
+ resp->reg[0] |= (rc << SBE_STATUS_PRI_SHIFT);
+}
+
+static u64 p9_sbe_rreg(u32 chip_id, u64 reg)
+{
+ u64 data = 0;
+ int rc;
+
+ rc = xscom_read(chip_id, reg, &data);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_DEBUG, "XSCOM error %d reading reg 0x%llx\n", rc, reg);
+ return 0xffffffff;
+ }
+
+ return data;
+}
+
+static void p9_sbe_reg_dump(u32 chip_id)
+{
+#define SBE_DUMP_REG_ONE(chip_id, x) \
+ prlog(PR_DEBUG, " %20s: %016llx\n", #x, p9_sbe_rreg(chip_id, x))
+
+ prlog(PR_DEBUG, "MBOX register dump for chip : %x\n", chip_id);
+ SBE_DUMP_REG_ONE(chip_id, PSU_SBE_DOORBELL_REG_RW);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG0);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG1);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG2);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG3);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_DOORBELL_REG_RW);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG4);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG5);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG6);
+ SBE_DUMP_REG_ONE(chip_id, PSU_HOST_SBE_MBOX_REG7);
+}
+
+void p9_sbe_freemsg(struct p9_sbe_msg *msg)
+{
+ if (msg && msg->resp)
+ free(msg->resp);
+ free(msg);
+}
+
+static void p9_sbe_fillmsg(struct p9_sbe_msg *msg, u16 cmd,
+ u16 ctrl_flag, u64 reg1, u64 reg2, u64 reg3)
+{
+ bool response = !!(ctrl_flag & SBE_CMD_CTRL_RESP_REQ);
+ u16 flag;
+
+ /*
+	 * Always set the ack required flag. The SBE will interrupt OPAL once it
+	 * reads the message from the mailbox register. If OPAL is expecting a
+	 * response, it will update the message timeout, otherwise it will send
+	 * the next message.
+ */
+ flag = ctrl_flag | SBE_CMD_CTRL_ACK_REQ;
+
+	/* The sequence ID is filled in by p9_sbe_queue_msg() */
+ msg->reg[0] = ((u64)flag << 32) | cmd;
+ msg->reg[1] = reg1;
+ msg->reg[2] = reg2;
+ msg->reg[3] = reg3;
+ msg->state = sbe_msg_unused;
+ msg->response = response;
+}
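+
+/*
+ * Resulting reg[0] layout (as built above, with the sequence number ORed in
+ * later by p9_sbe_queue_msg()), from low bits to high bits:
+ *
+ *	command (16 bits) | sequence number (16 bits) | control flags (16 bits)
+ *
+ * e.g. flags 0x0005, sequence 0x0001 and a (hypothetical) command value of
+ * 0xaabb would give reg[0] = 0x000000050001aabb.
+ */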
+
+static struct p9_sbe_msg *p9_sbe_allocmsg(bool alloc_resp)
+{
+ struct p9_sbe_msg *msg;
+
+ msg = zalloc(sizeof(struct p9_sbe_msg));
+ if (!msg) {
+ prlog(PR_ERR, "Failed to allocate SBE message\n");
+ return NULL;
+ }
+ if (alloc_resp) {
+ msg->resp = zalloc(sizeof(struct p9_sbe_msg));
+ if (!msg->resp) {
+ prlog(PR_ERR, "Failed to allocate SBE resp message\n");
+ free(msg);
+ return NULL;
+ }
+ }
+
+ return msg;
+}
+
+/*
+ * Handles "command with direct data" format only.
+ *
+ * Note: All mbox messages of interest to us use the direct data format. If we
+ * ever need the indirect data format then we may have to enhance this function.
+ */
+struct p9_sbe_msg *p9_sbe_mkmsg(u16 cmd, u16 ctrl_flag,
+ u64 reg1, u64 reg2, u64 reg3)
+{
+ struct p9_sbe_msg *msg;
+
+ msg = p9_sbe_allocmsg(!!(ctrl_flag & SBE_CMD_CTRL_RESP_REQ));
+ if (!msg)
+ return NULL;
+
+ p9_sbe_fillmsg(msg, cmd, ctrl_flag, reg1, reg2, reg3);
+ return msg;
+}
+
+static inline bool p9_sbe_mbox_busy(struct p9_sbe *sbe)
+{
+ return (sbe->state != sbe_mbox_idle);
+}
+
+static inline bool p9_sbe_msg_busy(struct p9_sbe_msg *msg)
+{
+ switch (msg->state) {
+ case sbe_msg_queued:
+ /* fall through */
+ case sbe_msg_sent:
+ case sbe_msg_wresp:
+ return true;
+ default: /* + sbe_msg_unused, sbe_msg_done,
+ sbe_msg_timeout, sbe_msg_error */
+ break;
+ }
+ return false;
+}
+
+static inline struct p9_sbe *p9_sbe_get_sbe(u32 chip_id)
+{
+ struct proc_chip *chip;
+
+ /* Default to SBE on master chip */
+ if (chip_id == -1) {
+ if (sbe_default_chip_id == -1)
+ return NULL;
+
+ chip = get_chip(sbe_default_chip_id);
+ } else {
+ chip = get_chip(chip_id);
+ }
+ if (chip == NULL || chip->sbe == NULL)
+ return NULL;
+
+ return chip->sbe;
+}
+
+static int p9_sbe_msg_send(struct p9_sbe *sbe, struct p9_sbe_msg *msg)
+{
+ int rc, i;
+ u64 addr, *data;
+
+ addr = PSU_HOST_SBE_MBOX_REG0;
+ data = &msg->reg[0];
+
+ for (i = 0; i < NR_HOST_SBE_MBOX_REG; i++) {
+ rc = xscom_write(sbe->chip_id, addr, *data);
+ if (rc)
+ return rc;
+
+ addr++;
+ data++;
+ }
+
+ rc = xscom_write(sbe->chip_id, PSU_SBE_DOORBELL_REG_OR,
+ HOST_SBE_MSG_WAITING);
+ if (rc != OPAL_SUCCESS)
+ return rc;
+
+ prlog(PR_TRACE, "Message queued [chip id = 0x%x]:\n", sbe->chip_id);
+ for (i = 0; i < 4; i++)
+ prlog(PR_TRACE, " Reg%d : %016llx\n", i, msg->reg[i]);
+
+ msg->timeout = mftb() + msecs_to_tb(SBE_CMD_TIMEOUT_MAX);
+ sbe->state = sbe_mbox_send;
+ msg->state = sbe_msg_sent;
+ return rc;
+}
+
+static int p9_sbe_msg_receive(u32 chip_id, struct p9_sbe_msg *resp)
+{
+ int i;
+ int rc = OPAL_SUCCESS;
+ u64 addr, *data;
+
+ addr = PSU_HOST_SBE_MBOX_REG4;
+ data = &resp->reg[0];
+
+ for (i = 0; i < NR_HOST_SBE_MBOX_REG; i++) {
+ rc = xscom_read(chip_id, addr, data);
+ if (rc)
+ return rc;
+
+ addr++;
+ data++;
+ }
+ return rc;
+}
+
+/* WARNING: This will drop sbe->lock */
+static void p9_sbe_msg_complete(struct p9_sbe *sbe, struct p9_sbe_msg *msg,
+ enum p9_sbe_msg_state msg_state)
+{
+ void (*comp)(struct p9_sbe_msg *msg);
+
+ prlog(PR_TRACE, "Completing msg [chip id = %x], reg0 : 0x%llx\n",
+ sbe->chip_id, msg->reg[0]);
+
+ comp = msg->complete;
+ list_del(&msg->link);
+ sync();
+ msg->state = msg_state;
+
+ if (comp) {
+ unlock(&sbe->lock);
+ comp(msg);
+ lock(&sbe->lock);
+ }
+}
+
+/* WARNING: This will drop sbe->lock */
+static void p9_sbe_send_complete(struct p9_sbe *sbe)
+{
+ struct p9_sbe_msg *msg;
+
+ if (list_empty(&sbe->msg_list))
+ return;
+
+ msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link);
+ /* Need response */
+ if (msg->response) {
+ msg->state = sbe_msg_wresp;
+ } else {
+ sbe->state = sbe_mbox_idle;
+ p9_sbe_msg_complete(sbe, msg, sbe_msg_done);
+ }
+}
+
+/* WARNING: This will drop sbe->lock */
+static void p9_sbe_process_queue(struct p9_sbe *sbe)
+{
+ int rc, retry_cnt = 0;
+ struct p9_sbe_msg *msg = NULL;
+
+ if (p9_sbe_mbox_busy(sbe))
+ return;
+
+ while (!list_empty(&sbe->msg_list)) {
+ msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link);
+ /* Send message */
+ rc = p9_sbe_msg_send(sbe, msg);
+ if (rc == OPAL_SUCCESS)
+ return;
+
+ prlog(PR_ERR, "Failed to send message to SBE [chip id = %x]\n",
+ sbe->chip_id);
+ if (msg->resp) {
+ p9_sbe_set_primary_rc(msg->resp,
+ SBE_STATUS_PRI_GENERIC_ERR);
+ }
+ p9_sbe_msg_complete(sbe, msg, sbe_msg_error);
+
+ /*
+		 * We repeatedly failed to send the message to the SBE.
+		 * Let's stop sending messages for now.
+ */
+ if (retry_cnt++ >= 3) {
+ prlog(PR_ERR, "Temporarily stopped sending "
+ "message to SBE\n");
+ return;
+ }
+ }
+}
+
+/*
+ * WARNING:
+ * Only one command is accepted in the command buffer until response
+ * to the command is enqueued in the response buffer by SBE.
+ *
+ * Head of msg_list contains in-flight message. Hence we should always
+ * add new message to tail of the list.
+ */
+int p9_sbe_queue_msg(u32 chip_id, struct p9_sbe_msg *msg,
+ void (*comp)(struct p9_sbe_msg *msg))
+{
+ struct p9_sbe *sbe;
+
+ if (!msg)
+ return OPAL_PARAMETER;
+
+ sbe = p9_sbe_get_sbe(chip_id);
+ if (!sbe)
+ return OPAL_HARDWARE;
+
+ lock(&sbe->lock);
+ /* Set completion and update sequence number */
+ msg->complete = comp;
+ msg->state = sbe_msg_queued;
+ msg->reg[0] = msg->reg[0] | ((u64)sbe->cur_seq << 16);
+ sbe->cur_seq++;
+
+ /* Reset sequence number */
+ if (sbe->cur_seq == 0xffff)
+ sbe->cur_seq = 1;
+
+ /* Add message to queue */
+ list_add_tail(&sbe->msg_list, &msg->link);
+ p9_sbe_process_queue(sbe);
+ unlock(&sbe->lock);
+
+ return OPAL_SUCCESS;
+}
+
+int p9_sbe_sync_msg(u32 chip_id, struct p9_sbe_msg *msg, bool autofree)
+{
+ int rc;
+ struct p9_sbe *sbe;
+
+ rc = p9_sbe_queue_msg(chip_id, msg, NULL);
+ if (rc)
+ goto free_msg;
+
+ sbe = p9_sbe_get_sbe(chip_id);
+ if (!sbe) {
+ rc = OPAL_HARDWARE;
+ goto free_msg;
+ }
+
+ while (p9_sbe_msg_busy(msg)) {
+ cpu_relax();
+ p9_sbe_timeout_poll_one(sbe);
+ }
+
+ if (msg->state == sbe_msg_done)
+ rc = SBE_STATUS_PRI_SUCCESS;
+ else
+ rc = SBE_STATUS_PRI_GENERIC_ERR;
+
+ if (msg->response && msg->resp)
+ rc = p9_sbe_get_primary_rc(msg->resp);
+
+free_msg:
+ if (autofree)
+ p9_sbe_freemsg(msg);
+
+ return rc;
+}
+
+/* Remove SBE message from queue. It will not remove inflight message */
+int p9_sbe_cancelmsg(u32 chip_id, struct p9_sbe_msg *msg)
+{
+ struct p9_sbe *sbe;
+
+ sbe = p9_sbe_get_sbe(chip_id);
+ if (!sbe)
+ return OPAL_PARAMETER;
+
+ lock(&sbe->lock);
+ if (msg->state != sbe_msg_queued) {
+ unlock(&sbe->lock);
+ return OPAL_BUSY;
+ }
+
+ list_del(&msg->link);
+ msg->state = sbe_msg_done;
+ unlock(&sbe->lock);
+ return OPAL_SUCCESS;
+}
+
+static void p9_sbe_handle_response(u32 chip_id, struct p9_sbe_msg *msg)
+{
+ u16 send_seq, resp_seq;
+ int rc;
+
+ if (msg == NULL || msg->resp == NULL)
+ return;
+
+ memset(msg->resp, 0, sizeof(struct p9_sbe_msg));
+
+ rc = p9_sbe_msg_receive(chip_id, msg->resp);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "Failed to read response message "
+ "[chip id = %x]\n", chip_id);
+ p9_sbe_set_primary_rc(msg->resp, SBE_STATUS_PRI_GENERIC_ERR);
+ return;
+ }
+
+ /* Validate sequence number */
+ send_seq = (msg->reg[0] >> 16) & 0xffff;
+ resp_seq = (msg->resp->reg[0] >> 16) & 0xffff;
+ if (send_seq != resp_seq) {
+ /*
+ * XXX Handle SBE R/R.
+		 * Let's send a sequence error to the caller until SBE reset works.
+ */
+ prlog(PR_ERR, "Invalid sequence id [chip id = %x]\n", chip_id);
+ p9_sbe_set_primary_rc(msg->resp, SBE_STATUS_PRI_SEQ_ERR);
+ return;
+ }
+}
+
+static int p9_sbe_clear_interrupt(struct p9_sbe *sbe, u64 bits)
+{
+ int rc;
+ u64 val;
+
+ /* Clear doorbell register */
+ val = SBE_HOST_RESPONSE_MASK & ~bits;
+ rc = xscom_write(sbe->chip_id, PSU_HOST_DOORBELL_REG_AND, val);
+ if (rc) {
+ prlog(PR_ERR, "Failed to clear SBE to Host doorbell "
+ "interrupt [chip id = %x]\n", sbe->chip_id);
+ }
+ return rc;
+}
+
+/* WARNING: This will drop sbe->lock */
+static void p9_sbe_timer_response(struct p9_sbe *sbe)
+{
+ if (sbe->chip_id != sbe_default_chip_id)
+ return;
+
+ sbe_timer_in_progress = false;
+ /* Drop lock and call timers */
+ unlock(&sbe->lock);
+
+ lock(&sbe_timer_lock);
+ /*
+	 * Once we get the timer expiry interrupt (even if it is a spurious
+	 * interrupt) we can schedule the next timer request.
+ */
+ timer_update_cnt = 0;
+ unlock(&sbe_timer_lock);
+
+ check_timers(true);
+ lock(&sbe->lock);
+}
+
+/* WARNING: This will drop sbe->lock */
+static void __p9_sbe_interrupt(struct p9_sbe *sbe)
+{
+ bool has_response;
+ int rc;
+ u64 data = 0, val;
+ struct p9_sbe_msg *msg = NULL;
+
+again:
+ /* Read doorbell register */
+ rc = xscom_read(sbe->chip_id, PSU_HOST_DOORBELL_REG_RW, &data);
+ if (rc) {
+ prlog(PR_ERR, "Failed to read SBE to Host doorbell register "
+ "[chip id = %x]\n", sbe->chip_id);
+ p9_sbe_reg_dump(sbe->chip_id);
+ return;
+ }
+
+ /* Completed processing all the bits */
+ if (!data)
+ return;
+
+ /* SBE came back from reset */
+ if (data & SBE_HOST_RESET) {
+ /* Clear all bits and restart sending message */
+ rc = p9_sbe_clear_interrupt(sbe, data);
+ if (rc)
+ return;
+
+ prlog(PR_NOTICE,
+ "Back from reset [chip id = %x]\n", sbe->chip_id);
+ /* Reset SBE MBOX state */
+ sbe->state = sbe_mbox_idle;
+
+ /* Reset message state */
+ if (!list_empty(&sbe->msg_list)) {
+ msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link);
+ msg->state = sbe_msg_queued;
+ }
+ return;
+ }
+
+ /* Process ACK message before response */
+ if (data & SBE_HOST_MSG_READ) {
+ rc = p9_sbe_clear_interrupt(sbe, SBE_HOST_MSG_READ);
+ if (rc)
+ return;
+ p9_sbe_send_complete(sbe);
+ goto again;
+ }
+
+ /* Read SBE response before clearing doorbell register */
+ if (data & SBE_HOST_RESPONSE_WAITING) {
+ if (!list_empty(&sbe->msg_list)) {
+ msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link);
+ p9_sbe_handle_response(sbe->chip_id, msg);
+ has_response = true;
+ } else {
+ has_response = false;
+ prlog(PR_DEBUG,
+ "Got response with no pending message\n");
+ }
+
+ rc = p9_sbe_clear_interrupt(sbe, SBE_HOST_RESPONSE_WAITING);
+ if (rc)
+ return;
+
+ /* Reset SBE MBOX state */
+ sbe->state = sbe_mbox_idle;
+ if (has_response)
+ p9_sbe_msg_complete(sbe, msg, sbe_msg_done);
+
+ goto again;
+ }
+
+ /* SBE passthrough command, call prd handler */
+ if (data & SBE_HOST_PASSTHROUGH) {
+ rc = p9_sbe_clear_interrupt(sbe, SBE_HOST_PASSTHROUGH);
+ if (rc)
+ return;
+ prd_sbe_passthrough(sbe->chip_id);
+ goto again;
+ }
+
+ /* Timer expired */
+ if (data & SBE_HOST_TIMER_EXPIRY) {
+ rc = p9_sbe_clear_interrupt(sbe, SBE_HOST_TIMER_EXPIRY);
+ if (rc)
+ return;
+ p9_sbe_timer_response(sbe);
+ goto again;
+ }
+
+ /* Unhandled bits */
+ val = data & ~(SBE_HOST_RESPONSE_MASK);
+ if (val) {
+ prlog(PR_ERR, "Unhandled interrupt bit [chip id = %x] : "
+ " %016llx\n", sbe->chip_id, val);
+ rc = p9_sbe_clear_interrupt(sbe, data);
+ if (rc)
+ return;
+ goto again;
+ }
+}
+
+void p9_sbe_interrupt(uint32_t chip_id)
+{
+ struct proc_chip *chip;
+ struct p9_sbe *sbe;
+
+ chip = get_chip(chip_id);
+ if (chip == NULL || chip->sbe == NULL)
+ return;
+
+ sbe = chip->sbe;
+ lock(&sbe->lock);
+ __p9_sbe_interrupt(sbe);
+ p9_sbe_process_queue(sbe);
+ unlock(&sbe->lock);
+}
+
+/*
+ * Check if the timer is working: if at least 10ms have elapsed since the
+ * last scheduled timer expiry, assume the timer is stuck.
+ */
+static void p9_sbe_timer_poll(struct p9_sbe *sbe)
+{
+ if (sbe->chip_id != sbe_default_chip_id)
+ return;
+
+ if (!sbe_has_timer || !sbe_timer_in_progress)
+ return;
+
+ if (tb_compare(mftb(), sbe_last_gen_stamp + msecs_to_tb(10))
+ != TB_AAFTERB)
+ return;
+
+ prlog(PR_ERR, "Timer stuck, falling back to OPAL pollers.\n");
+ prlog(PR_ERR, "You will likely have slower I2C and may have "
+ "experienced increased jitter.\n");
+ p9_sbe_reg_dump(sbe->chip_id);
+ sbe_has_timer = false;
+ sbe_timer_in_progress = false;
+}
+
+static void p9_sbe_timeout_poll_one(struct p9_sbe *sbe)
+{
+ struct p9_sbe_msg *msg;
+
+ if (sbe->chip_id == sbe_default_chip_id) {
+ if (list_empty_nocheck(&sbe->msg_list) &&
+ !sbe_timer_in_progress)
+ return;
+ } else {
+ if (list_empty_nocheck(&sbe->msg_list))
+ return;
+ }
+
+ lock(&sbe->lock);
+
+ /*
+ * In some cases there will be a delay in calling OPAL interrupt
+	 * handler routine (opal_handle_interrupt). In such cases it's
+	 * possible that the SBE has responded, but OPAL didn't act on it.
+ * Hence check for SBE response.
+ */
+ __p9_sbe_interrupt(sbe);
+ p9_sbe_timer_poll(sbe);
+
+ if (list_empty(&sbe->msg_list))
+ goto out;
+
+ /*
+	 * For some reason OPAL didn't send the message to the SBE.
+	 * Let's try to send it again.
+ */
+ if (!p9_sbe_mbox_busy(sbe)) {
+ p9_sbe_process_queue(sbe);
+ goto out;
+ }
+
+ msg = list_top(&sbe->msg_list, struct p9_sbe_msg, link);
+ if (tb_compare(mftb(), msg->timeout) != TB_AAFTERB)
+ goto out;
+
+ /* Message timeout */
+ prlog(PR_ERR, "Message timeout [chip id = %x], cmd = %llx, "
+ "subcmd = %llx\n", sbe->chip_id,
+ (msg->reg[0] >> 8) & 0xff, msg->reg[0] & 0xff);
+ p9_sbe_reg_dump(sbe->chip_id);
+ if (msg->resp) {
+ p9_sbe_set_primary_rc(msg->resp,
+ SBE_STATUS_PRI_GENERIC_ERR);
+ }
+
+ /* XXX Handle SBE R/R. Reset SBE state until SBE R/R works. */
+ sbe->state = sbe_mbox_idle;
+ p9_sbe_msg_complete(sbe, msg, sbe_msg_timeout);
+ p9_sbe_process_queue(sbe);
+
+out:
+ unlock(&sbe->lock);
+}
+
+static void p9_sbe_timeout_poll(void *user_data __unused)
+{
+ struct p9_sbe *sbe;
+ struct proc_chip *chip;
+
+ for_each_chip(chip) {
+ if (chip->sbe == NULL)
+ continue;
+ sbe = chip->sbe;
+ p9_sbe_timeout_poll_one(sbe);
+ }
+}
+
+static void p9_sbe_timer_resp(struct p9_sbe_msg *msg)
+{
+ if (msg->state != sbe_msg_done) {
+ prlog(PR_DEBUG, "Failed to schedule timer [chip id %x]\n",
+ sbe_default_chip_id);
+ } else {
+ /* Update last scheduled timer value */
+ sbe_last_gen_stamp = mftb() +
+ usecs_to_tb(timer_ctrl_msg->reg[1]);
+ sbe_timer_in_progress = true;
+ }
+
+ if (!has_new_target)
+ return;
+
+ lock(&sbe_timer_lock);
+ if (has_new_target) {
+ if (!p9_sbe_msg_busy(timer_ctrl_msg)) {
+ has_new_target = false;
+ p9_sbe_timer_schedule();
+ }
+ }
+ unlock(&sbe_timer_lock);
+}
+
+static void p9_sbe_timer_schedule(void)
+{
+ int rc;
+ u32 tick_us = SBE_TIMER_DEFAULT_US;
+ u64 tb_cnt, now = mftb();
+
+ if (sbe_timer_in_progress) {
+ if (sbe_timer_target >= sbe_last_gen_stamp)
+ return;
+
+ if (now >= sbe_last_gen_stamp)
+ return;
+
+ /* Remaining time of inflight timer <= sbe_timer_def_tb */
+ if ((sbe_last_gen_stamp - now) <= sbe_timer_def_tb)
+ return;
+ }
+
+ /* Stop sending timer update chipop until inflight timer expires */
+ if (timer_update_cnt > SBE_TIMER_UPDATE_MAX)
+ return;
+ timer_update_cnt++;
+
+ if (now < sbe_timer_target) {
+ /* Calculate how many microseconds from now, rounded up */
+ if ((sbe_timer_target - now) > sbe_timer_def_tb) {
+ tb_cnt = sbe_timer_target - now + usecs_to_tb(1) - 1;
+ tick_us = tb_to_usecs(tb_cnt);
+ }
+ }
+
+	/* Clear the sequence number; p9_sbe_queue_msg() will add a new sequence ID */
+ timer_ctrl_msg->reg[0] &= ~(PPC_BITMASK(32, 47));
+ /* Update timeout value */
+ timer_ctrl_msg->reg[1] = tick_us;
+ rc = p9_sbe_queue_msg(sbe_default_chip_id, timer_ctrl_msg,
+ p9_sbe_timer_resp);
+ if (rc != OPAL_SUCCESS) {
+ prlog(PR_ERR, "Failed to start timer [chip id = %x]\n",
+ sbe_default_chip_id);
+ return;
+ }
+}
+
+/*
+ * This is called with the timer lock held, so there is no
+ * issue with re-entrancy or concurrency
+ */
+void p9_sbe_update_timer_expiry(uint64_t new_target)
+{
+ if (!sbe_has_timer || new_target == sbe_timer_target)
+ return;
+
+ lock(&sbe_timer_lock);
+ /* Timer message is in flight. Record new timer and schedule later */
+ if (p9_sbe_msg_busy(timer_ctrl_msg) || has_new_target) {
+ if (new_target < sbe_timer_target) {
+ sbe_timer_target = new_target;
+ has_new_target = true;
+ }
+ } else {
+ sbe_timer_target = new_target;
+ p9_sbe_timer_schedule();
+ }
+ unlock(&sbe_timer_lock);
+}
+
+/* Initialize SBE timer */
+static void p9_sbe_timer_init(void)
+{
+ timer_ctrl_msg = p9_sbe_mkmsg(SBE_CMD_CONTROL_TIMER,
+ CONTROL_TIMER_START, 0, 0, 0);
+ assert(timer_ctrl_msg);
+ init_lock(&sbe_timer_lock);
+ sbe_has_timer = true;
+ sbe_timer_target = mftb();
+ sbe_last_gen_stamp = ~0ull;
+ sbe_timer_def_tb = usecs_to_tb(SBE_TIMER_DEFAULT_US);
+ prlog(PR_INFO, "Timer facility on chip %x\n", sbe_default_chip_id);
+}
+
+bool p9_sbe_timer_ok(void)
+{
+ return sbe_has_timer;
+}
+
+static void p9_sbe_stash_chipop_resp(struct p9_sbe_msg *msg)
+{
+ int rc = p9_sbe_get_primary_rc(msg->resp);
+ struct p9_sbe *sbe = (void *)msg->user_data;
+
+ if (rc == SBE_STATUS_PRI_SUCCESS) {
+ prlog(PR_DEBUG, "Sent stash MPIPL config [chip id =0x%x]\n",
+ sbe->chip_id);
+ } else {
+ prlog(PR_ERR, "Failed to send stash MPIPL config "
+ "[chip id = 0x%x, rc = %d]\n", sbe->chip_id, rc);
+ }
+
+ p9_sbe_freemsg(msg);
+}
+
+static void p9_sbe_send_relocated_base_single(struct p9_sbe *sbe, u64 reloc_base)
+{
+ u8 key = SBE_STASH_KEY_SKIBOOT_BASE;
+ u16 cmd = SBE_CMD_STASH_MPIPL_CONFIG;
+ u16 flag = SBE_CMD_CTRL_RESP_REQ;
+ struct p9_sbe_msg *msg;
+
+ msg = p9_sbe_mkmsg(cmd, flag, key, reloc_base, 0);
+ if (!msg) {
+ prlog(PR_ERR, "Message allocation failed\n");
+ return;
+ }
+
+ msg->user_data = (void *)sbe;
+ if (p9_sbe_queue_msg(sbe->chip_id, msg, p9_sbe_stash_chipop_resp)) {
+ prlog(PR_ERR, "Failed to queue stash MPIPL config message\n");
+ }
+}
+
+/* Send relocated skiboot base address to all SBE */
+void p9_sbe_send_relocated_base(uint64_t reloc_base)
+{
+ struct proc_chip *chip;
+
+ for_each_chip(chip) {
+ if (chip->sbe == NULL)
+ continue;
+
+ p9_sbe_send_relocated_base_single(chip->sbe, reloc_base);
+ }
+}
+
+void p9_sbe_init(void)
+{
+ struct dt_node *xn;
+ struct proc_chip *chip;
+ struct p9_sbe *sbe;
+
+ if (proc_gen < proc_gen_p9)
+ return;
+
+ dt_for_each_compatible(dt_root, xn, "ibm,xscom") {
+ sbe = zalloc(sizeof(struct p9_sbe));
+ assert(sbe);
+ sbe->chip_id = dt_get_chip_id(xn);
+ sbe->cur_seq = 1;
+ sbe->state = sbe_mbox_idle;
+ list_head_init(&sbe->msg_list);
+ init_lock(&sbe->lock);
+
+ chip = get_chip(sbe->chip_id);
+ assert(chip);
+ chip->sbe = sbe;
+
+ if (dt_has_node_property(xn, "primary", NULL)) {
+ sbe_default_chip_id = sbe->chip_id;
+ prlog(PR_DEBUG, "Master chip id : %x\n", sbe->chip_id);
+ }
+ }
+
+ if (sbe_default_chip_id == -1) {
+ prlog(PR_ERR, "Master chip ID not found.\n");
+ return;
+ }
+
+ /* Initiate SBE timer */
+ p9_sbe_timer_init();
+
+ /* Initiate SBE timeout poller */
+ opal_add_poller(p9_sbe_timeout_poll, NULL);
+}
+
+/* Terminate and initiate MPIPL */
+void p9_sbe_terminate(void)
+{
+ uint32_t primary_chip = -1;
+ int rc;
+ u64 wait_tb;
+ struct proc_chip *chip;
+
+ /* Return if MPIPL is not supported */
+ if (!is_mpipl_enabled())
+ return;
+
+ /* Save crashing CPU details */
+ opal_mpipl_save_crashing_pir();
+
+ /* Unregister flash. It will request BMC MBOX reset */
+ if (!flash_unregister()) {
+ prlog(PR_DEBUG, "Failed to reset BMC MBOX\n");
+ return;
+ }
+
+ /*
+ * Send S0 interrupt to all SBE. Sequence:
+ * - S0 interrupt on secondary chip SBE
+ * - S0 interrupt on Primary chip SBE
+ */
+ for_each_chip(chip) {
+ if (dt_has_node_property(chip->devnode, "primary", NULL)) {
+ primary_chip = chip->id;
+ continue;
+ }
+
+ rc = xscom_write(chip->id,
+ SBE_CONTROL_REG_RW, SBE_CONTROL_REG_S0);
+ /* Initiate normal reboot */
+ if (rc) {
+ prlog(PR_ERR, "Failed to write S0 interrupt [chip id = %x]\n",
+ chip->id);
+ return;
+ }
+ }
+
+ /* Initiate normal reboot */
+ if (primary_chip == -1) {
+ prlog(PR_ERR, "Primary chip ID not found.\n");
+ return;
+ }
+
+ rc = xscom_write(primary_chip,
+ SBE_CONTROL_REG_RW, SBE_CONTROL_REG_S0);
+ if (rc) {
+ prlog(PR_ERR, "Failed to write S0 interrupt [chip id = %x]\n",
+ primary_chip);
+ return;
+ }
+
+ /* XXX We expect SBE to act on interrupt, quiesce the system and start
+ * MPIPL flow. Currently we do not have a way to detect SBE state.
+ * Hence wait for max time SBE takes to respond and then trigger
+ * normal reboot.
+ */
+ prlog(PR_NOTICE, "Initiated MPIPL, waiting for SBE to respond...\n");
+ wait_tb = mftb() + msecs_to_tb(SBE_CMD_TIMEOUT_MAX);
+ while (mftb() < wait_tb) {
+ cpu_relax();
+ }
+
+ prlog(PR_ERR, "SBE did not respond within timeout period (%d secs).\n",
+ SBE_CMD_TIMEOUT_MAX / 1000);
+ prlog(PR_ERR, "Falling back to normal reboot\n");
+}
diff --git a/roms/skiboot/hw/sfc-ctrl.c b/roms/skiboot/hw/sfc-ctrl.c
new file mode 100644
index 000000000..34b5b8e20
--- /dev/null
+++ b/roms/skiboot/hw/sfc-ctrl.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2014 IBM Corp. */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <lpc.h>
+#include <sfc-ctrl.h>
+
+#include <libflash/libflash.h>
+#include <libflash/libflash-priv.h>
+
+/* Offset of SFC registers in FW space */
+#define SFC_CMDREG_OFFSET 0x00000c00
+/* Offset of SFC command buffer in FW space */
+#define SFC_CMDBUF_OFFSET 0x00000d00
+/* Offset of flash MMIO mapping in FW space */
+#define SFC_MMIO_OFFSET 0x0c000000
+
+
+/*
+ * Register definitions
+ */
+#define SFC_REG_CONF 0x10 /* CONF: Direct Access Configuration */
+#define SFC_REG_CONF_FRZE (1 << 3)
+#define SFC_REG_CONF_ECCEN (1 << 2)
+#define SFC_REG_CONF_DRCD (1 << 1)
+#define SFC_REG_CONF_FLRLD (1 << 0)
+
+#define SFC_REG_STATUS 0x0C /* STATUS : Status Reg */
+#define SFC_REG_STATUS_NX_ON_SHFT 28
+#define SFC_REG_STATUS_RWP (1 << 27)
+#define SFC_REG_STATUS_FOURBYTEAD (1 << 26)
+#define SFC_REG_STATUS_ILLEGAL (1 << 4)
+#define SFC_REG_STATUS_ECCERRCNTN (1 << 3)
+#define SFC_REG_STATUS_ECCUEN (1 << 2)
+#define SFC_REG_STATUS_DONE (1 << 0)
+
+#define SFC_REG_CMD 0x40 /* CMD : Command */
+#define SFC_REG_CMD_OPCODE_SHFT 9
+#define SFC_REG_CMD_LENGTH_SHFT 0
+
+#define SFC_REG_SPICLK 0x3C /* SPICLK: SPI clock rate config */
+#define SFC_REG_SPICLK_OUTDLY_SHFT 24
+#define SFC_REG_SPICLK_INSAMPDLY_SHFT 16
+#define SFC_REG_SPICLK_CLKHI_SHFT 8
+#define SFC_REG_SPICLK_CLKLO_SHFT 0
+
+#define SFC_REG_ADR 0x44 /* ADR : Address */
+#define SFC_REG_ERASMS 0x48 /* ERASMS : Small Erase Block Size */
+#define SFC_REG_ERASLGS 0x4C /* ERALGS : Large Erase Block Size */
+#define SFC_REG_CONF4 0x54 /* CONF4 : SPI Op Code for Small Erase */
+#define SFC_REG_CONF5 0x58 /* CONF5 : Small Erase Size config reg */
+
+#define SFC_REG_CONF8 0x64 /* CONF8 : Read Command */
+#define SFC_REG_CONF8_CSINACTIVERD_SHFT 18
+#define SFC_REG_CONF8_DUMMY_SHFT 8
+#define SFC_REG_CONF8_READOP_SHFT 0
+
+#define SFC_REG_ADRCBF 0x80 /* ADRCBF : First Intf NOR Addr Offset */
+#define SFC_REG_ADRCMF 0x84 /* ADRCMF : First Intf NOR Allocation */
+#define SFC_REG_ADRCBS 0x88 /* ADRCBS : Second Intf NOR Addr Offset */
+#define SFC_REG_ADRCMS 0x8C /* ADRCMS : Second Intf NOR Allocation */
+#define SFC_REG_OADRNB	0x90	/* OADRNB : Direct Access OPB Window Base Address */
+#define SFC_REG_OADRNS	0x94	/* OADRNS : Direct Access OPB Window Size */
+
+#define SFC_REG_CHIPIDCONF 0x9C /* CHIPIDCONF : config ChipId CMD */
+#define SFC_REG_CHIPIDCONF_OPCODE_SHFT 24
+#define SFC_REG_CHIPIDCONF_READ (1 << 23)
+#define SFC_REG_CHIPIDCONF_WRITE (1 << 22)
+#define SFC_REG_CHIPIDCONF_USE_ADDR (1 << 21)
+#define SFC_REG_CHIPIDCONF_DUMMY_SHFT 16
+#define SFC_REG_CHIPIDCONF_LEN_SHFT 0
+
+/*
+ * SFC Opcodes
+ */
+#define SFC_OP_READRAW 0x03 /* Read Raw */
+#define SFC_OP_WRITERAW 0x02 /* Write Raw */
+#define SFC_OP_ERASM 0x32 /* Erase Small */
+#define SFC_OP_ERALG 0x34 /* Erase Large */
+#define SFC_OP_ENWRITPROT	0x53	/* Enable Write Protect */
+#define SFC_OP_CHIPID 0x1F /* Get Chip ID */
+#define SFC_OP_STATUS 0x05 /* Get Status */
+#define SFC_OP_TURNOFF 0x5E /* Turn Off */
+#define SFC_OP_TURNON 0x50 /* Turn On */
+#define SFC_OP_ABORT 0x6F /* Super-Abort */
+#define SFC_OP_START4BA 0x37 /* Start 4BA */
+#define SFC_OP_END4BA 0x69 /* End 4BA */
+
+/* Command buffer size */
+#define SFC_CMDBUF_SIZE 256
+
+struct sfc_ctrl {
+ /* Erase sizes */
+ uint32_t small_er_size;
+ uint32_t large_er_size;
+
+ /* Current 4b mode */
+ bool mode_4b;
+
+ /* Callbacks */
+ struct spi_flash_ctrl ops;
+};
+
+/* Command register support */
+static inline int sfc_reg_read(uint8_t reg, uint32_t *val)
+{
+ int rc;
+
+ *val = 0xffffffff;
+ rc = lpc_fw_read32(val, SFC_CMDREG_OFFSET + reg);
+ if (rc)
+ return rc;
+ return 0;
+}
+
+static inline int sfc_reg_write(uint8_t reg, uint32_t val)
+{
+ return lpc_fw_write32(val, SFC_CMDREG_OFFSET + reg);
+}
+
+static int sfc_buf_write(uint32_t len, const void *data)
+{
+ __be32 tmp;
+ uint32_t off = 0;
+ int rc;
+
+ if (len > SFC_CMDBUF_SIZE)
+ return FLASH_ERR_PARM_ERROR;
+
+ while (len >= 4) {
+ tmp = cpu_to_be32(*(const uint32_t *)data);
+ rc = lpc_fw_write32((u32)tmp, SFC_CMDBUF_OFFSET + off);
+ if (rc)
+ return rc;
+ off += 4;
+ len -= 4;
+ data += 4;
+ }
+ if (!len)
+ return 0;
+
+ /* lpc_fw_write operates on BE values so that's what we layout
+ * in memory with memcpy. The swap in the register on LE doesn't
+ * matter, the result in memory will be in the right order.
+ */
+ tmp = cpu_to_be32(-1);
+ memcpy(&tmp, data, len); /* XXX: is this right? */
+ return lpc_fw_write32((u32)tmp, SFC_CMDBUF_OFFSET + off);
+}
+
+static int sfc_buf_read(uint32_t len, void *data)
+{
+ uint32_t tmp, off = 0;
+ int rc;
+
+ if (len > SFC_CMDBUF_SIZE)
+ return FLASH_ERR_PARM_ERROR;
+
+ while (len >= 4) {
+ rc = lpc_fw_read32(data, SFC_CMDBUF_OFFSET + off);
+ if (rc)
+ return rc;
+ off += 4;
+ len -= 4;
+ data += 4;
+ }
+ if (!len)
+ return 0;
+
+ rc = lpc_fw_read32(&tmp, SFC_CMDBUF_OFFSET + off);
+ if (rc)
+ return rc;
+ /* We know tmp contains a big endian value, so memcpy is
+ * our friend here
+ */
+ memcpy(data, &tmp, len);
+ return 0;
+}
+
+/* Polls until SFC indicates command is complete */
+static int sfc_poll_complete(void)
+{
+ uint32_t status, timeout;
+ struct timespec ts;
+
+ /*
+	 * A full 256-byte read/write command will take at least
+	 * 126us. Smaller commands are faster but we use fewer of
+	 * them, so let's sleep in increments of 100us.
+ */
+ ts.tv_sec = 0;
+ ts.tv_nsec = 100000;
+
+ /*
+ * Use a 1s timeout which should be sufficient for the
+ * commands we use
+ */
+ timeout = 10000;
+
+ do {
+ int rc;
+
+ rc = sfc_reg_read(SFC_REG_STATUS, &status);
+ if (rc)
+ return rc;
+ if (status & SFC_REG_STATUS_DONE)
+ break;
+ if (--timeout == 0)
+ return FLASH_ERR_CTRL_TIMEOUT;
+ nanosleep(&ts, NULL);
+ } while (true);
+
+ return 0;
+}
+
+static int sfc_exec_command(uint8_t opcode, uint32_t length)
+{
+ int rc = 0;
+ uint32_t cmd_reg = 0;
+
+ if (opcode > 0x7f || length > 0x1ff)
+ return FLASH_ERR_PARM_ERROR;
+
+ /* Write command register to start execution */
+ cmd_reg |= (opcode << SFC_REG_CMD_OPCODE_SHFT);
+ cmd_reg |= (length << SFC_REG_CMD_LENGTH_SHFT);
+ rc = sfc_reg_write(SFC_REG_CMD, cmd_reg);
+ if (rc)
+ return rc;
+
+ /* Wait for command to complete */
+ return sfc_poll_complete();
+}
+
+static int sfc_chip_id(struct spi_flash_ctrl *ctrl, uint8_t *id_buf,
+ uint32_t *id_size)
+{
+ uint32_t idconf;
+ int rc;
+
+ (void)ctrl;
+
+ if ((*id_size) < 3)
+ return FLASH_ERR_PARM_ERROR;
+
+ /*
+ * XXX This will not work in locked down mode but we assume that
+ * in this case, the chip ID command is already properly programmed
+ * and the SFC will ignore this. However I haven't verified...
+ */
+ idconf = ((uint64_t)CMD_RDID) << SFC_REG_CHIPIDCONF_OPCODE_SHFT;
+ idconf |= SFC_REG_CHIPIDCONF_READ;
+ idconf |= (3ul << SFC_REG_CHIPIDCONF_LEN_SHFT);
+ (void)sfc_reg_write(SFC_REG_CHIPIDCONF, idconf);
+
+ /* Perform command */
+ rc = sfc_exec_command(SFC_OP_CHIPID, 0);
+ if (rc)
+ return rc;
+
+ /* Read chip ID */
+ rc = sfc_buf_read(3, id_buf);
+ if (rc)
+ return rc;
+ *id_size = 3;
+
+ return 0;
+}
+
+
+static int sfc_read(struct spi_flash_ctrl *ctrl, uint32_t pos,
+ void *buf, uint32_t len)
+{
+ (void)ctrl;
+
+ while(len) {
+ uint32_t chunk = len;
+ int rc;
+
+ if (chunk > SFC_CMDBUF_SIZE)
+ chunk = SFC_CMDBUF_SIZE;
+ rc = sfc_reg_write(SFC_REG_ADR, pos);
+ if (rc)
+ return rc;
+ rc = sfc_exec_command(SFC_OP_READRAW, chunk);
+ if (rc)
+ return rc;
+ rc = sfc_buf_read(chunk, buf);
+ if (rc)
+ return rc;
+ len -= chunk;
+ pos += chunk;
+ buf += chunk;
+ }
+ return 0;
+}
+
+static int sfc_write(struct spi_flash_ctrl *ctrl, uint32_t addr,
+ const void *buf, uint32_t size)
+{
+ uint32_t chunk;
+ int rc;
+
+ (void)ctrl;
+
+ while(size) {
+ /* We shall not cross a page boundary */
+ chunk = 0x100 - (addr & 0xff);
+ if (chunk > size)
+ chunk = size;
+
+ /* Write to SFC write buffer */
+ rc = sfc_buf_write(chunk, buf);
+ if (rc)
+ return rc;
+
+ /* Program address */
+ rc = sfc_reg_write(SFC_REG_ADR, addr);
+ if (rc)
+ return rc;
+
+ /* Send command */
+ rc = sfc_exec_command(SFC_OP_WRITERAW, chunk);
+ if (rc)
+ return rc;
+
+ addr += chunk;
+ buf += chunk;
+ size -= chunk;
+ }
+ return 0;
+}
+
+static int sfc_erase(struct spi_flash_ctrl *ctrl, uint32_t addr,
+ uint32_t size)
+{
+ struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops);
+ uint32_t sm_mask = ct->small_er_size - 1;
+ uint32_t lg_mask = ct->large_er_size - 1;
+ uint32_t chunk;
+ uint8_t cmd;
+ int rc;
+
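+	/*
+	 * Illustrative example (sizes assumed): with a 4KB small and a 64KB
+	 * large erase size, erasing 0x10000 bytes at 0x20000 is done with a
+	 * single large erase, erasing 0x2000 bytes at 0x21000 falls back to
+	 * two small erases, and anything not aligned to the small erase size
+	 * is rejected with FLASH_ERR_ERASE_BOUNDARY.
+	 */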
+ while(size) {
+ /* Choose erase size for this chunk */
+ if (((addr | size) & lg_mask) == 0) {
+ chunk = ct->large_er_size;
+ cmd = SFC_OP_ERALG;
+ } else if (((addr | size) & sm_mask) == 0) {
+ chunk = ct->small_er_size;
+ cmd = SFC_OP_ERASM;
+ } else
+ return FLASH_ERR_ERASE_BOUNDARY;
+
+ rc = sfc_reg_write(SFC_REG_ADR, addr);
+ if (rc)
+ return rc;
+ rc = sfc_exec_command(cmd, 0);
+ if (rc)
+ return rc;
+ addr += chunk;
+ size -= chunk;
+ }
+ return 0;
+}
+
+static int sfc_setup(struct spi_flash_ctrl *ctrl, uint32_t *tsize)
+{
+ struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops);
+ struct flash_info *info = ctrl->finfo;
+ uint32_t er_flags;
+
+ (void)tsize;
+
+ /* Keep non-erase related flags */
+ er_flags = ~FL_ERASE_ALL;
+
+ /* Add supported erase sizes */
+ if (ct->small_er_size == 0x1000 || ct->large_er_size == 0x1000)
+ er_flags |= FL_ERASE_4K;
+ if (ct->small_er_size == 0x8000 || ct->large_er_size == 0x8000)
+ er_flags |= FL_ERASE_32K;
+ if (ct->small_er_size == 0x10000 || ct->large_er_size == 0x10000)
+ er_flags |= FL_ERASE_64K;
+
+ /* Mask the flags out */
+ info->flags &= er_flags;
+
+ return 0;
+}
+
+static int sfc_set_4b(struct spi_flash_ctrl *ctrl, bool enable)
+{
+ struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops);
+ int rc;
+
+ rc = sfc_exec_command(enable ? SFC_OP_START4BA : SFC_OP_END4BA, 0);
+ if (rc)
+ return rc;
+ ct->mode_4b = enable;
+ return 0;
+}
+
+static void sfc_validate_er_size(uint32_t *size)
+{
+ if (*size == 0)
+ return;
+
+ /* We only support 4k, 32k and 64k */
+ if (*size != 0x1000 && *size != 0x8000 && *size != 0x10000) {
+ FL_ERR("SFC: Erase size %d bytes unsupported\n", *size);
+ *size = 0;
+ }
+}
+
+static int sfc_init(struct sfc_ctrl *ct)
+{
+ int rc;
+ uint32_t status;
+
+ /*
+ * Assumptions: The controller has been fully initialized
+ * by an earlier FW layer setting the chip ID command, the
+ * erase sizes, and configuring the timings for reads and
+ * writes.
+ *
+ * This driver is meant to be usable if the configuration
+ * is in lock down.
+ *
+ * If that wasn't the case, we could configure some sane
+ * defaults here and tuned values in setup() after the
+ * chip has been identified.
+ */
+
+ /* Read erase sizes from flash */
+ rc = sfc_reg_read(SFC_REG_ERASMS, &ct->small_er_size);
+ if (rc)
+ return rc;
+ sfc_validate_er_size(&ct->small_er_size);
+ rc = sfc_reg_read(SFC_REG_ERASLGS, &ct->large_er_size);
+ if (rc)
+ return rc;
+ sfc_validate_er_size(&ct->large_er_size);
+
+ /* No erase sizes we can cope with ? Ouch... */
+ if ((ct->small_er_size == 0 && ct->large_er_size == 0) ||
+ (ct->large_er_size && (ct->small_er_size > ct->large_er_size))) {
+ FL_ERR("SFC: No supported erase sizes !\n");
+ return FLASH_ERR_CTRL_CONFIG_MISMATCH;
+ }
+
+	FL_INF("SFC: Supported erase sizes:");
+ if (ct->small_er_size)
+ FL_INF(" %dKB", ct->small_er_size >> 10);
+ if (ct->large_er_size)
+ FL_INF(" %dKB", ct->large_er_size >> 10);
+ FL_INF("\n");
+
+ /* Read current state of 4 byte addressing */
+ rc = sfc_reg_read(SFC_REG_STATUS, &status);
+ if (rc)
+ return rc;
+ ct->mode_4b = !!(status & SFC_REG_STATUS_FOURBYTEAD);
+
+ return 0;
+}
+
+int sfc_open(struct spi_flash_ctrl **ctrl)
+{
+ struct sfc_ctrl *ct;
+ int rc;
+
+ *ctrl = NULL;
+ ct = malloc(sizeof(*ct));
+ if (!ct) {
+ FL_ERR("SFC: Failed to allocate\n");
+ return FLASH_ERR_MALLOC_FAILED;
+ }
+ memset(ct, 0, sizeof(*ct));
+ ct->ops.chip_id = sfc_chip_id;
+ ct->ops.setup = sfc_setup;
+ ct->ops.set_4b = sfc_set_4b;
+ ct->ops.read = sfc_read;
+ ct->ops.write = sfc_write;
+ ct->ops.erase = sfc_erase;
+
+ rc = sfc_init(ct);
+ if (rc)
+ goto fail;
+ *ctrl = &ct->ops;
+ return 0;
+ fail:
+ free(ct);
+ return rc;
+}
+
+void sfc_close(struct spi_flash_ctrl *ctrl)
+{
+ struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops);
+
+ /* Free the whole lot */
+ free(ct);
+}
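+
+/*
+ * Illustrative usage sketch (not taken from skiboot platform code): a
+ * platform would typically open the controller once and hand the returned
+ * ops structure to the flash layer, e.g.
+ *
+ *	struct spi_flash_ctrl *ctrl;
+ *
+ *	if (sfc_open(&ctrl) == 0) {
+ *		... register ctrl with the flash/libflash layer ...
+ *		sfc_close(ctrl);
+ *	}
+ */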
+
diff --git a/roms/skiboot/hw/slw.c b/roms/skiboot/hw/slw.c
new file mode 100644
index 000000000..56ba05b0a
--- /dev/null
+++ b/roms/skiboot/hw/slw.c
@@ -0,0 +1,1731 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Everything to do with deep power saving (stop) states:
+ * SLeep/Winkle, handling the ChipTOD chip & configuring core timebases
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <xscom-p8-regs.h>
+#include <xscom-p9-regs.h>
+#include <xscom-p10-regs.h>
+#include <io.h>
+#include <cpu.h>
+#include <chip.h>
+#include <mem_region.h>
+#include <chiptod.h>
+#include <interrupts.h>
+#include <timebase.h>
+#include <errorlog.h>
+#include <libfdt/libfdt.h>
+#include <opal-api.h>
+#include <nvram.h>
+#include <sbe-p8.h>
+#include <xive.h>
+
+#include <p10_stop_api.H>
+#include <p8_pore_table_gen_api.H>
+#include <sbe_xip_image.h>
+
+static uint32_t slw_saved_reset[0x100];
+
+static bool slw_current_le = false;
+
+enum wakeup_engine_states wakeup_engine_state = WAKEUP_ENGINE_NOT_PRESENT;
+bool has_deep_states = false;
+
+DEFINE_LOG_ENTRY(OPAL_RC_SLW_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_SLW,
+ OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SLW_SET, OPAL_PLATFORM_ERR_EVT, OPAL_SLW,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SLW_GET, OPAL_PLATFORM_ERR_EVT, OPAL_SLW,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_SLW_REG, OPAL_PLATFORM_ERR_EVT, OPAL_SLW,
+ OPAL_PLATFORM_FIRMWARE, OPAL_INFO,
+ OPAL_NA);
+
+static void slw_do_rvwinkle(void *data)
+{
+ struct cpu_thread *cpu = this_cpu();
+ struct cpu_thread *master = data;
+ uint64_t lpcr = mfspr(SPR_LPCR);
+ struct proc_chip *chip;
+
+ /* Setup our ICP to receive IPIs */
+ icp_prep_for_pm();
+
+ /* Setup LPCR to wakeup on external interrupts only */
+ mtspr(SPR_LPCR, ((lpcr & ~SPR_LPCR_P8_PECE) | SPR_LPCR_P8_PECE2));
+ isync();
+
+ prlog(PR_DEBUG, "SLW: CPU PIR 0x%04x going to rvwinkle...\n",
+ cpu->pir);
+
+ /* Tell that we got it */
+ cpu->state = cpu_state_rvwinkle;
+
+ enter_p8_pm_state(1);
+
+ /* Restore SPRs */
+ init_shared_sprs();
+ init_replicated_sprs();
+
+ /* Ok, it's ours again */
+ cpu->state = cpu_state_active;
+
+ prlog(PR_DEBUG, "SLW: CPU PIR 0x%04x woken up !\n", cpu->pir);
+
+ /* Cleanup our ICP */
+ reset_cpu_icp();
+
+ /* Resync timebase */
+ chiptod_wakeup_resync();
+
+ /* Restore LPCR */
+ mtspr(SPR_LPCR, lpcr);
+ isync();
+
+ /* If we are passed a master pointer we are the designated
+ * waker, let's proceed. If not, return, we are finished.
+ */
+ if (!master)
+ return;
+
+ prlog(PR_DEBUG, "SLW: CPU PIR 0x%04x waiting for master...\n",
+ cpu->pir);
+
+ /* Allriiiight... now wait for master to go down */
+ while(master->state != cpu_state_rvwinkle)
+ sync();
+
+ /* XXX Wait one second ! (should check xscom state ? ) */
+ time_wait_ms(1000);
+
+ for_each_chip(chip) {
+ struct cpu_thread *c;
+ uint64_t tmp;
+ for_each_available_core_in_chip(c, chip->id) {
+ xscom_read(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir),
+ EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &tmp);
+ prlog(PR_TRACE, "SLW: core %x:%x"
+ " history: 0x%016llx (mid2)\n",
+ chip->id, pir_to_core_id(c->pir),
+ tmp);
+ }
+ }
+
+ prlog(PR_DEBUG, "SLW: Waking master (PIR 0x%04x)...\n", master->pir);
+
+ /* Now poke all the secondary threads on the master's core */
+ for_each_cpu(cpu) {
+ if (!cpu_is_sibling(cpu, master) || (cpu == master))
+ continue;
+ icp_kick_cpu(cpu);
+
+ /* Wait for it to claim to be back (XXX ADD TIMEOUT) */
+ while(cpu->state != cpu_state_active)
+ sync();
+ }
+
+ /* Now poke the master and be gone */
+ icp_kick_cpu(master);
+}
+
+static void slw_patch_reset(void)
+{
+ uint32_t *src, *dst, *sav;
+
+ src = &reset_patch_start;
+ dst = (uint32_t *)0x100;
+ sav = slw_saved_reset;
+ while(src < &reset_patch_end) {
+ *(sav++) = *(dst);
+ *(dst++) = *(src++);
+ }
+ sync_icache();
+}
+
+static void slw_unpatch_reset(void)
+{
+ extern uint32_t reset_patch_start;
+ extern uint32_t reset_patch_end;
+ uint32_t *src, *dst, *sav;
+
+ src = &reset_patch_start;
+ dst = (uint32_t *)0x100;
+ sav = slw_saved_reset;
+ while(src < &reset_patch_end) {
+ *(dst++) = *(sav++);
+ src++;
+ }
+ sync_icache();
+}
+
+static bool slw_general_init(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint32_t core = pir_to_core_id(c->pir);
+ uint64_t tmp;
+ int rc;
+
+ /* PowerManagement GP0 clear PM_DISABLE */
+ rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP0), &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_INIT),
+ "SLW: Failed to read PM_GP0\n");
+ return false;
+ }
+ tmp = tmp & ~0x8000000000000000ULL;
+ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP0), tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_INIT),
+ "SLW: Failed to write PM_GP0\n");
+ return false;
+ }
+ prlog(PR_TRACE, "SLW: PMGP0 set to 0x%016llx\n", tmp);
+
+ /* Read back for debug */
+ rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP0), &tmp);
+ if (rc)
+ log_simple_error(&e_info(OPAL_RC_SLW_INIT),
+ "SLW: Failed to re-read PM_GP0. Continuing...\n");
+
+ prlog(PR_TRACE, "SLW: PMGP0 read 0x%016llx\n", tmp);
+
+ return true;
+}
+
+static bool slw_set_overrides(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint32_t core = pir_to_core_id(c->pir);
+ int rc;
+
+ rc = xscom_write(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SPECIAL_WAKEUP_PHYP),
+ 0);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_SET),
+ "SLW: Failed to write PM_SPECIAL_WAKEUP_PHYP\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool slw_set_overrides_p10(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint64_t tmp;
+ int rc;
+ uint32_t core = pir_to_core_id(c->pir);
+
+ /* Special wakeup bits that could hold power mgt */
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
+ &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_SET),
+ "SLW: Failed to read P10_QME_SPWU_HYP\n");
+ return false;
+ }
+ if (tmp & P10_SPWU_REQ)
+ prlog(PR_WARNING,
+ "SLW: core %d P10_QME_SPWU_HYP requested 0x%016llx\n",
+ core, tmp);
+
+ return true;
+}
+
+
+static bool slw_set_overrides_p9(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint64_t tmp;
+ int rc;
+ uint32_t core = pir_to_core_id(c->pir);
+
+ /* Special wakeup bits that could hold power mgt */
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P9_EC_SLAVE(core, EC_PPM_SPECIAL_WKUP_HYP),
+ &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_SET),
+ "SLW: Failed to read EC_PPM_SPECIAL_WKUP_HYP\n");
+ return false;
+ }
+ if (tmp)
+ prlog(PR_WARNING,
+ "SLW: core %d EC_PPM_SPECIAL_WKUP_HYP read 0x%016llx\n",
+ core, tmp);
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P9_EC_SLAVE(core, EC_PPM_SPECIAL_WKUP_OTR),
+ &tmp);
+ if (tmp)
+ prlog(PR_WARNING,
+ "SLW: core %d EC_PPM_SPECIAL_WKUP_OTR read 0x%016llx\n",
+ core, tmp);
+ return true;
+}
+
+static bool slw_unset_overrides(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint32_t core = pir_to_core_id(c->pir);
+
+ /* XXX FIXME: Save and restore the overrides */
+ prlog(PR_DEBUG, "SLW: slw_unset_overrides %x:%x\n", chip->id, core);
+ return true;
+}
+
+static bool slw_set_idle_mode(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint32_t core = pir_to_core_id(c->pir);
+ uint64_t tmp;
+ int rc;
+
+ /*
+ * PM GP1 allows fast/deep mode to be selected independently for sleep
+ * and winkle. Init PM GP1 so that sleep happens in fast mode and
+ * winkle happens in deep mode.
+ * Make use of the OR XSCOM for this since the OCC might be manipulating
+ * the PM_GP1 register as well. Before doing this ensure that the bits
+ * managing idle states are cleared so as to override any bits set at
+ * init time.
+ */
+
+ tmp = ~EX_PM_GP1_SLEEP_WINKLE_MASK;
+ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_CLEAR_GP1),
+ tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_SET),
+ "SLW: Failed to write PM_GP1\n");
+ return false;
+ }
+
+ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SET_GP1),
+ EX_PM_SETUP_GP1_FAST_SLEEP_DEEP_WINKLE);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_SET),
+ "SLW: Failed to write PM_GP1\n");
+ return false;
+ }
+
+ /* Read back for debug */
+ xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP1), &tmp);
+ prlog(PR_TRACE, "SLW: PMGP1 read 0x%016llx\n", tmp);
+ return true;
+}
+
+static bool slw_get_idle_state_history(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint32_t core = pir_to_core_id(c->pir);
+ uint64_t tmp;
+ int rc;
+
+ /* Cleanup history */
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_GET),
+ "SLW: Failed to read PM_IDLE_STATE_HISTORY\n");
+ return false;
+ }
+
+ prlog(PR_TRACE, "SLW: core %x:%x history: 0x%016llx (old1)\n",
+ chip->id, core, tmp);
+
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &tmp);
+
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_GET),
+ "SLW: Failed to read PM_IDLE_STATE_HISTORY\n");
+ return false;
+ }
+
+ prlog(PR_TRACE, "SLW: core %x:%x history: 0x%016llx (old2)\n",
+ chip->id, core, tmp);
+
+ return true;
+}
+
+static bool idle_prepare_core(struct proc_chip *chip, struct cpu_thread *c)
+{
+ prlog(PR_TRACE, "FASTSLEEP: Prepare core %x:%x\n",
+ chip->id, pir_to_core_id(c->pir));
+
+ if(!slw_general_init(chip, c))
+ return false;
+ if(!slw_set_overrides(chip, c))
+ return false;
+ if(!slw_set_idle_mode(chip, c))
+ return false;
+ if(!slw_get_idle_state_history(chip, c))
+ return false;
+
+ return true;
+
+}
+
+/* Define device-tree fields */
+#define MAX_NAME_LEN 16
+struct cpu_idle_states {
+ char name[MAX_NAME_LEN];
+ u32 latency_ns;
+ u32 residency_ns;
+ /*
+ * Register value/mask used to select different idle states.
+ * PMICR in POWER8 and PSSCR in POWER9
+ */
+ u64 pm_ctrl_reg_val;
+ u64 pm_ctrl_reg_mask;
+ u32 flags;
+};
+
+static struct cpu_idle_states nap_only_cpu_idle_states[] = {
+ { /* nap */
+ .name = "nap",
+ .latency_ns = 4000,
+ .residency_ns = 100000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_NAP_ENABLED \
+ | 0*OPAL_PM_SLEEP_ENABLED \
+ | 0*OPAL_PM_WINKLE_ENABLED \
+ | 0*OPAL_USE_PMICR,
+ .pm_ctrl_reg_val = 0,
+ .pm_ctrl_reg_mask = 0 },
+};
+
+static struct cpu_idle_states power8_cpu_idle_states[] = {
+ { /* nap */
+ .name = "nap",
+ .latency_ns = 4000,
+ .residency_ns = 100000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_NAP_ENABLED \
+ | 0*OPAL_USE_PMICR,
+ .pm_ctrl_reg_val = 0,
+ .pm_ctrl_reg_mask = 0 },
+ { /* fast sleep (with workaround) */
+ .name = "fastsleep_",
+ .latency_ns = 40000,
+ .residency_ns = 300000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_SLEEP_ENABLED_ER1 \
+ | 0*OPAL_USE_PMICR, /* Not enabled until deep
+ states are available */
+ .pm_ctrl_reg_val = OPAL_PM_FASTSLEEP_PMICR,
+ .pm_ctrl_reg_mask = OPAL_PM_SLEEP_PMICR_MASK },
+ { /* Winkle */
+ .name = "winkle",
+ .latency_ns = 10000000,
+ .residency_ns = 1000000000, /* Educated guess (not measured).
+ * Winkle is not currently used by
+ * linux cpuidle subsystem so we
+ * don't have real world user.
+ * However, this should be roughly
+ * accurate for when linux does
+ * use it. */
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_WINKLE_ENABLED \
+ | 0*OPAL_USE_PMICR, /* Currently choosing deep vs
+ fast via EX_PM_GP1 reg */
+ .pm_ctrl_reg_val = 0,
+ .pm_ctrl_reg_mask = 0 },
+};
+
+/*
+ * cpu_idle_states for key idle states of POWER9 that we want to
+ * exploit.
+ * Note latency_ns and residency_ns are estimated values for now.
+ */
+static struct cpu_idle_states power9_cpu_idle_states[] = {
+ {
+ .name = "stop0_lite", /* Enter stop0 with no state loss */
+ .latency_ns = 1000,
+ .residency_ns = 10000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 0*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3),
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop0",
+ .latency_ns = 2000,
+ .residency_ns = 20000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+ /* stop1_lite has been removed since it adds no additional benefit over stop0_lite */
+
+ {
+ .name = "stop1",
+ .latency_ns = 5000,
+ .residency_ns = 50000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(1) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ /*
+	 * stop2_lite has been removed since it currently adds minimal benefit over
+	 * stop2, and that benefit is eclipsed by the time required to ungate the clocks.
+ */
+
+ {
+ .name = "stop2",
+ .latency_ns = 10000,
+ .residency_ns = 100000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(2) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop4",
+ .latency_ns = 100000,
+ .residency_ns = 10000000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(4) \
+ | OPAL_PM_PSSCR_MTL(7) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop5",
+ .latency_ns = 200000,
+ .residency_ns = 20000000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(5) \
+ | OPAL_PM_PSSCR_MTL(7) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+ {
+ .name = "stop8",
+ .latency_ns = 2000000,
+ .residency_ns = 20000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(8) \
+ | OPAL_PM_PSSCR_MTL(11) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+ {
+ .name = "stop11",
+ .latency_ns = 10000000,
+ .residency_ns = 100000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(11) \
+ | OPAL_PM_PSSCR_MTL(11) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+};
+
+/*
+ * Prior to Mambo.7.8.21, mambo did set the MSR correctly for lite stop
+ * Prior to Mambo.7.8.21, mambo did not set the MSR correctly for lite stop
+ */
+static struct cpu_idle_states power9_mambo_cpu_idle_states[] = {
+ {
+ .name = "stop0",
+ .latency_ns = 2000,
+ .residency_ns = 20000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop1",
+ .latency_ns = 5000,
+ .residency_ns = 50000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(1) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop2",
+ .latency_ns = 10000,
+ .residency_ns = 100000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(2) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop4",
+ .latency_ns = 100000,
+ .residency_ns = 1000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(4) \
+ | OPAL_PM_PSSCR_MTL(7) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+ {
+ .name = "stop8",
+ .latency_ns = 2000000,
+ .residency_ns = 20000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(8) \
+ | OPAL_PM_PSSCR_MTL(11) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+ {
+ .name = "stop11",
+ .latency_ns = 10000000,
+ .residency_ns = 100000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(11) \
+ | OPAL_PM_PSSCR_MTL(11) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+};
+
+/*
+ * cpu_idle_states for fused core configuration
+ * These will be a subset of power9 idle states.
+ */
+static struct cpu_idle_states power9_fusedcore_cpu_idle_states[] = {
+ {
+ .name = "stop0_lite", /* Enter stop0 with no state loss */
+ .latency_ns = 1000,
+ .residency_ns = 10000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 0*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3),
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop0",
+ .latency_ns = 2000,
+ .residency_ns = 20000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+
+ /* stop1_lite has been removed since it adds no additional benefit over stop0_lite */
+
+ {
+ .name = "stop1",
+ .latency_ns = 5000,
+ .residency_ns = 50000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(1) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ /*
+ * stop2_lite has been removed since it currently adds minimal benefit over stop2,
+ * and whatever benefit it has is eclipsed by the time required to ungate the clocks.
+ */
+
+ {
+ .name = "stop2",
+ .latency_ns = 10000,
+ .residency_ns = 100000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(2) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+};
+
+/*
+ * Note latency_ns and residency_ns are estimated values for now.
+ */
+static struct cpu_idle_states power10_cpu_idle_states[] = {
+ {
+ .name = "stop0_lite", /* Enter stop0 with no state loss */
+ .latency_ns = 1000,
+ .residency_ns = 10000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 0*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(0) \
+ | OPAL_PM_PSSCR_TR(3),
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop0",
+ .latency_ns = 10000,
+ .residency_ns = 100000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(0) \
+ | OPAL_PM_PSSCR_MTL(0) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop2",
+ .latency_ns = 20000,
+ .residency_ns = 200000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(2) \
+ | OPAL_PM_PSSCR_MTL(2) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+ {
+ .name = "stop3",
+ .latency_ns = 45000,
+ .residency_ns = 450000,
+ .flags = 0*OPAL_PM_DEC_STOP \
+ | 0*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 0*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 0*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_FAST,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(3) \
+ | OPAL_PM_PSSCR_MTL(3) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+#if 0
+ {
+ .name = "stop11",
+ .latency_ns = 10000000,
+ .residency_ns = 100000000,
+ .flags = 1*OPAL_PM_DEC_STOP \
+ | 1*OPAL_PM_TIMEBASE_STOP \
+ | 1*OPAL_PM_LOSE_USER_CONTEXT \
+ | 1*OPAL_PM_LOSE_HYP_CONTEXT \
+ | 1*OPAL_PM_LOSE_FULL_CONTEXT \
+ | 1*OPAL_PM_STOP_INST_DEEP,
+ .pm_ctrl_reg_val = OPAL_PM_PSSCR_RL(11) \
+ | OPAL_PM_PSSCR_MTL(11) \
+ | OPAL_PM_PSSCR_TR(3) \
+ | OPAL_PM_PSSCR_ESL \
+ | OPAL_PM_PSSCR_EC,
+ .pm_ctrl_reg_mask = OPAL_PM_PSSCR_MASK },
+#endif
+};
+
+static void slw_late_init_p9(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+ int rc;
+
+ prlog(PR_INFO, "SLW: Configuring self-restore for HRMOR\n");
+ for_each_available_cpu(c) {
+ if (c->chip_id != chip->id)
+ continue;
+ /*
+ * Clear HRMOR. Strictly this only needs to be done for thread 0
+ * of each core, but do it for all threads anyway.
+ */
+ rc = p9_stop_save_cpureg((void *)chip->homer_base,
+ P9_STOP_SPR_HRMOR, 0,
+ c->pir);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Failed to set HRMOR for CPU %x,RC=0x%x\n",
+ c->pir, rc);
+ prlog(PR_ERR, "Disabling deep stop states\n");
+ }
+ }
+}
+
+static void slw_late_init_p10(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+ int rc;
+
+ prlog(PR_INFO, "SLW: Configuring self-restore for HRMOR\n");
+ for_each_available_cpu(c) {
+ if (c->chip_id != chip->id)
+ continue;
+ /*
+ * Clear HRMOR. Strictly this only needs to be done for thread 0
+ * of each core, but do it for all threads anyway.
+ */
+ rc = proc_stop_save_cpureg((void *)chip->homer_base,
+ PROC_STOP_SPR_HRMOR, 0,
+ c->pir);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Failed to set HRMOR for CPU %x,RC=0x%x\n",
+ c->pir, rc);
+ prlog(PR_ERR, "Disabling deep stop states\n");
+ }
+ }
+}
+
+/* Add device tree properties to describe idle states */
+void add_cpu_idle_state_properties(void)
+{
+ struct dt_node *power_mgt;
+ struct cpu_idle_states *states;
+ struct proc_chip *chip;
+ int nr_states;
+
+ bool can_sleep = true;
+ bool has_stop_inst = false;
+ u8 i;
+
+ fdt64_t *pm_ctrl_reg_val_buf;
+ fdt64_t *pm_ctrl_reg_mask_buf;
+ u32 supported_states_mask;
+ u32 opal_disabled_states_mask = ~0xFC000000; /* disable all but stop0-5 */
+ const char* nvram_disable_str;
+ u32 nvram_disabled_states_mask = 0x00;
+ u32 stop_levels;
+
+ /* Variables to track buffer length */
+ u8 name_buf_len;
+ u8 num_supported_idle_states;
+
+ /* Buffers to hold idle state properties */
+ char *name_buf, *alloced_name_buf;
+ fdt32_t *latency_ns_buf;
+ fdt32_t *residency_ns_buf;
+ fdt32_t *flags_buf;
+
+ prlog(PR_DEBUG, "CPU idle state device tree init\n");
+
+ /* Create /ibm,opal/power-mgt if it doesn't exist already */
+ power_mgt = dt_new_check(opal_node, "power-mgt");
+ if (!power_mgt) {
+ /**
+ * @fwts-label CreateDTPowerMgtNodeFail
+ * @fwts-advice OPAL failed to add the power-mgt device tree
+ * node. This could mean that firmware ran out of memory,
+ * or there's a bug somewhere.
+ */
+ prlog(PR_ERR, "creating dt node /ibm,opal/power-mgt failed\n");
+ return;
+ }
+
+ /*
+ * Choose the right state table for the chip
+ *
+ * XXX We use the first chip's version; we should probably look
+ * for the lowest version across all chips instead.
+ */
+ chip = next_chip(NULL);
+ assert(chip);
+ if (proc_gen >= proc_gen_p9) {
+ if (chip->type == PROC_CHIP_P9_NIMBUS ||
+ chip->type == PROC_CHIP_P9_CUMULUS ||
+ chip->type == PROC_CHIP_P9P) {
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) {
+ states = power9_mambo_cpu_idle_states;
+ nr_states = ARRAY_SIZE(power9_mambo_cpu_idle_states);
+ } else if (this_cpu()->is_fused_core) {
+ states = power9_fusedcore_cpu_idle_states;
+ nr_states = ARRAY_SIZE(power9_fusedcore_cpu_idle_states);
+ } else {
+ states = power9_cpu_idle_states;
+ nr_states = ARRAY_SIZE(power9_cpu_idle_states);
+ }
+ } else if (chip->type == PROC_CHIP_P10) {
+ states = power10_cpu_idle_states;
+ nr_states = ARRAY_SIZE(power10_cpu_idle_states);
+ } else {
+ prlog(PR_ERR, "SLW: Cannot determine chip type\n");
+ return;
+ }
+
+ has_stop_inst = true;
+ stop_levels = dt_prop_get_u32_def(power_mgt,
+ "ibm,enabled-stop-levels", 0);
+ if (!stop_levels) {
+ prerror("SLW: No stop levels available. Power saving is disabled!\n");
+ has_deep_states = false;
+ } else {
+ /* Iterate to see if we have deep states enabled */
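+ /*
+ * Note: the low PSSCR RL bits of pm_ctrl_reg_val hold the stop
+ * level, and ibm,enabled-stop-levels is treated as an MSB-first
+ * bitmask (1ul << 31 corresponds to stop0), hence the
+ * level = 31 - RL conversion below (stop11 maps to 1ul << 20).
+ */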
+ for (i = 0; i < nr_states; i++) {
+ u32 level = 31 - (states[i].pm_ctrl_reg_val &
+ OPAL_PM_PSSCR_RL_MASK);
+
+ if ((stop_levels & (1ul << level)) &&
+ (states[i].flags & OPAL_PM_STOP_INST_DEEP))
+ has_deep_states = true;
+ }
+ }
+ if ((wakeup_engine_state == WAKEUP_ENGINE_PRESENT) && has_deep_states) {
+ if (chip->type == PROC_CHIP_P9_NIMBUS ||
+ chip->type == PROC_CHIP_P9_CUMULUS) {
+ slw_late_init_p9(chip);
+ xive_late_init();
+ nx_p9_rng_late_init();
+ } else if (chip->type == PROC_CHIP_P10) {
+ slw_late_init_p10(chip);
+ xive2_late_init();
+ }
+ }
+ if (wakeup_engine_state != WAKEUP_ENGINE_PRESENT)
+ has_deep_states = false;
+ } else if (chip->type == PROC_CHIP_P8_MURANO ||
+ chip->type == PROC_CHIP_P8_VENICE ||
+ chip->type == PROC_CHIP_P8_NAPLES) {
+ const struct dt_property *p;
+
+ p = dt_find_property(dt_root, "ibm,enabled-idle-states");
+ if (p)
+ prlog(PR_NOTICE,
+ "SLW: HB-provided idle states property found\n");
+ states = power8_cpu_idle_states;
+ nr_states = ARRAY_SIZE(power8_cpu_idle_states);
+
+ /* Check if hostboot say we can sleep */
+ if (!p || !dt_prop_find_string(p, "fast-sleep")) {
+ prlog(PR_WARNING, "SLW: Sleep not enabled by HB"
+ " on this platform\n");
+ can_sleep = false;
+ }
+
+ /* Clip to NAP only on Murano and Venice DD1.x */
+ if ((chip->type == PROC_CHIP_P8_MURANO ||
+ chip->type == PROC_CHIP_P8_VENICE) &&
+ chip->ec_level < 0x20) {
+ prlog(PR_NOTICE, "SLW: Sleep not enabled on P8 DD1.x\n");
+ can_sleep = false;
+ }
+
+ } else {
+ states = nap_only_cpu_idle_states;
+ nr_states = ARRAY_SIZE(nap_only_cpu_idle_states);
+ }
+
+
+ /*
+ * Currently we can't append strings and cells to dt properties.
+ * So create buffers to which you can append values, then create
+ * dt properties with this buffer content.
+ */
+
+ /* Allocate memory to idle state property buffers. */
+ alloced_name_buf= malloc(nr_states * sizeof(char) * MAX_NAME_LEN);
+ name_buf = alloced_name_buf;
+ latency_ns_buf = malloc(nr_states * sizeof(u32));
+ residency_ns_buf= malloc(nr_states * sizeof(u32));
+ flags_buf = malloc(nr_states * sizeof(u32));
+ pm_ctrl_reg_val_buf = malloc(nr_states * sizeof(u64));
+ pm_ctrl_reg_mask_buf = malloc(nr_states * sizeof(u64));
+
+ name_buf_len = 0;
+ num_supported_idle_states = 0;
+
+ /*
+ * Create a mask with the flags of all supported idle states
+ * set. Use this to only add supported idle states to the
+ * device-tree
+ */
+ if (has_stop_inst) {
+ /* Power 9/10 / POWER ISA 3.0 and above */
+ supported_states_mask = OPAL_PM_STOP_INST_FAST;
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT)
+ supported_states_mask |= OPAL_PM_STOP_INST_DEEP;
+ } else {
+ /* Power 7 and Power 8 */
+ supported_states_mask = OPAL_PM_NAP_ENABLED;
+ if (can_sleep)
+ supported_states_mask |= OPAL_PM_SLEEP_ENABLED |
+ OPAL_PM_SLEEP_ENABLED_ER1;
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT)
+ supported_states_mask |= OPAL_PM_WINKLE_ENABLED;
+ }
+ nvram_disable_str = nvram_query_dangerous("opal-stop-state-disable-mask");
+ if (nvram_disable_str)
+ nvram_disabled_states_mask = strtol(nvram_disable_str, NULL, 0);
+ prlog(PR_DEBUG, "NVRAM stop disable mask: %x\n", nvram_disabled_states_mask);
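+ /*
+ * Note: a state is skipped when its bit is set in either mask, and
+ * an NVRAM-provided mask fully overrides the OPAL default: a level
+ * whose bit is clear in the NVRAM mask is enabled even if OPAL
+ * disables it by default.
+ */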
+ for (i = 0; i < nr_states; i++) {
+ /* For each state, check if it is one of the supported states. */
+ if (!(states[i].flags & supported_states_mask))
+ continue;
+
+ /* We can only use the stop levels that HB has made available */
+ if (has_stop_inst) {
+ u32 level = 31 - (states[i].pm_ctrl_reg_val &
+ OPAL_PM_PSSCR_RL_MASK);
+
+ if (!(stop_levels & (1ul << level)))
+ continue;
+
+ if ((opal_disabled_states_mask |
+ nvram_disabled_states_mask) &
+ (1ul << level)) {
+ if (nvram_disable_str &&
+ !(nvram_disabled_states_mask & (1ul << level))) {
+ prlog(PR_NOTICE, "SLW: Enabling: %s "
+ "(disabled in OPAL, forced by "
+ "NVRAM)\n",states[i].name);
+ } else {
+ prlog(PR_NOTICE, "SLW: Disabling: %s in OPAL\n",
+ states[i].name);
+ continue;
+ }
+ }
+ }
+
+ prlog(PR_INFO, "SLW: Enabling: %s\n", states[i].name);
+
+ /*
+ * If a state is supported add each of its property
+ * to its corresponding property buffer.
+ */
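+ /*
+ * The names are packed back to back as NUL-terminated strings,
+ * the usual encoding for a device tree string-list property,
+ * hence the pointer advances by strlen() + 1.
+ */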
+ strncpy(name_buf, states[i].name, MAX_NAME_LEN);
+ name_buf = name_buf + strlen(states[i].name) + 1;
+
+ *latency_ns_buf = cpu_to_fdt32(states[i].latency_ns);
+ latency_ns_buf++;
+
+ *residency_ns_buf = cpu_to_fdt32(states[i].residency_ns);
+ residency_ns_buf++;
+
+ *flags_buf = cpu_to_fdt32(states[i].flags);
+ flags_buf++;
+
+ *pm_ctrl_reg_val_buf = cpu_to_fdt64(states[i].pm_ctrl_reg_val);
+ pm_ctrl_reg_val_buf++;
+
+ *pm_ctrl_reg_mask_buf = cpu_to_fdt64(states[i].pm_ctrl_reg_mask);
+ pm_ctrl_reg_mask_buf++;
+
+ /* Increment buffer length trackers */
+ name_buf_len += strlen(states[i].name) + 1;
+ num_supported_idle_states++;
+
+ }
+
+ /* Point buffer pointers back to beginning of the buffer */
+ name_buf -= name_buf_len;
+ latency_ns_buf -= num_supported_idle_states;
+ residency_ns_buf -= num_supported_idle_states;
+ flags_buf -= num_supported_idle_states;
+ pm_ctrl_reg_val_buf -= num_supported_idle_states;
+ pm_ctrl_reg_mask_buf -= num_supported_idle_states;
+ /* Create dt properties with the buffer content */
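+ /*
+ * Illustrative sketch of the result (values come from the tables
+ * above; the exact set depends on chip type and enabled levels),
+ * e.g. for a P9 fused core with stop0-stop2 enabled:
+ * ibm,cpu-idle-state-names = "stop0_lite", "stop0", "stop1", "stop2"
+ * ibm,cpu-idle-state-latencies-ns = <1000 2000 5000 10000>
+ * ibm,cpu-idle-state-residency-ns = <10000 20000 50000 100000>
+ * plus matching flags and PSSCR value/mask entries.
+ */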
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-names", name_buf,
+ name_buf_len* sizeof(char));
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-latencies-ns",
+ latency_ns_buf, num_supported_idle_states * sizeof(u32));
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-residency-ns",
+ residency_ns_buf, num_supported_idle_states * sizeof(u32));
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-flags", flags_buf,
+ num_supported_idle_states * sizeof(u32));
+
+ if (has_stop_inst) {
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-psscr",
+ pm_ctrl_reg_val_buf,
+ num_supported_idle_states * sizeof(u64));
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-psscr-mask",
+ pm_ctrl_reg_mask_buf,
+ num_supported_idle_states * sizeof(u64));
+ } else {
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-pmicr",
+ pm_ctrl_reg_val_buf,
+ num_supported_idle_states * sizeof(u64));
+ dt_add_property(power_mgt, "ibm,cpu-idle-state-pmicr-mask",
+ pm_ctrl_reg_mask_buf,
+ num_supported_idle_states * sizeof(u64));
+ }
+ assert(alloced_name_buf == name_buf);
+ free(alloced_name_buf);
+ free(latency_ns_buf);
+ free(residency_ns_buf);
+ free(flags_buf);
+ free(pm_ctrl_reg_val_buf);
+ free(pm_ctrl_reg_mask_buf);
+}
+
+static void slw_cleanup_core(struct proc_chip *chip, struct cpu_thread *c)
+{
+ uint64_t tmp;
+ int rc;
+
+ /* Display history to check transition */
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir),
+ EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_GET),
+ "SLW: Failed to read PM_IDLE_STATE_HISTORY\n");
+ /* XXX error handling ? return false; */
+ }
+
+ prlog(PR_DEBUG, "SLW: core %x:%x history: 0x%016llx (new1)\n",
+ chip->id, pir_to_core_id(c->pir), tmp);
+
+ rc = xscom_read(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir),
+ EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &tmp);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_GET),
+ "SLW: Failed to read PM_IDLE_STATE_HISTORY\n");
+ /* XXX error handling ? return false; */
+ }
+
+ prlog(PR_DEBUG, "SLW: core %x:%x history: 0x%016llx (new2)\n",
+ chip->id, pir_to_core_id(c->pir), tmp);
+
+ /*
+ * XXX FIXME: Error out if the transition didn't reach rvwinkle ?
+ */
+
+ /*
+ * XXX FIXME: We should restore a bunch of the EX bits we
+ * overwrite to sane values here
+ */
+ slw_unset_overrides(chip, c);
+}
+
+static void slw_cleanup_chip(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+
+ for_each_available_core_in_chip(c, chip->id)
+ slw_cleanup_core(chip, c);
+}
+
+static void slw_patch_scans(struct proc_chip *chip, bool le_mode)
+{
+ int64_t rc;
+ uint64_t old_val, new_val;
+
+ rc = sbe_xip_get_scalar((void *)chip->slw_base,
+ "skip_ex_override_ring_scans", &old_val);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Failed to read scan override on chip %d\n",
+ chip->id);
+ return;
+ }
+
+ new_val = le_mode ? 0 : 1;
+
+ prlog(PR_TRACE, "SLW: Chip %d, LE value was: %lld, setting to %lld\n",
+ chip->id, old_val, new_val);
+
+ rc = sbe_xip_set_scalar((void *)chip->slw_base,
+ "skip_ex_override_ring_scans", new_val);
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Failed to set LE mode on chip %d\n", chip->id);
+ return;
+ }
+}
+
+int64_t slw_reinit(uint64_t flags)
+{
+ struct proc_chip *chip;
+ struct cpu_thread *cpu;
+ bool has_waker = false;
+ bool target_le = slw_current_le;
+
+ if (flags & OPAL_REINIT_CPUS_HILE_BE)
+ target_le = false;
+ if (flags & OPAL_REINIT_CPUS_HILE_LE)
+ target_le = true;
+
+ prlog(PR_TRACE, "SLW Reinit from CPU PIR 0x%04x,"
+ " HILE set to %s endian...\n",
+ this_cpu()->pir,
+ target_le ? "little" : "big");
+
+ /* Prepare chips/cores for rvwinkle */
+ for_each_chip(chip) {
+ if (!chip->slw_base) {
+ log_simple_error(&e_info(OPAL_RC_SLW_INIT),
+ "SLW: Not found on chip %d\n", chip->id);
+ return OPAL_HARDWARE;
+ }
+
+ slw_patch_scans(chip, target_le);
+ }
+ slw_current_le = target_le;
+
+ /* XXX Save HIDs ? Or do that in head.S ... */
+
+ slw_patch_reset();
+
+ /* rvwinkle everybody and pick one to wake me once I rvwinkle myself */
+ for_each_available_cpu(cpu) {
+ struct cpu_thread *master = NULL;
+
+ if (cpu == this_cpu())
+ continue;
+
+ /* Pick up a waker for myself: it must not be a sibling of
+ * the current CPU and must be a thread 0 (so it gets to
+ * sync its timebase before doing time_wait_ms())
+ */
+ if (!has_waker && !cpu_is_sibling(cpu, this_cpu()) &&
+ cpu_is_thread0(cpu)) {
+ has_waker = true;
+ master = this_cpu();
+ }
+ __cpu_queue_job(cpu, "slw_do_rvwinkle",
+ slw_do_rvwinkle, master, true);
+
+ /* Wait for it to claim to be down */
+ while(cpu->state != cpu_state_rvwinkle)
+ sync();
+ }
+
+ /* XXX Wait one second ! (should check xscom state ? ) */
+ prlog(PR_TRACE, "SLW: Waiting one second...\n");
+ time_wait_ms(1000);
+ prlog(PR_TRACE, "SLW: Done.\n");
+
+ for_each_chip(chip) {
+ struct cpu_thread *c;
+ uint64_t tmp;
+ for_each_available_core_in_chip(c, chip->id) {
+ xscom_read(chip->id,
+ XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir),
+ EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &tmp);
+ prlog(PR_DEBUG, "SLW: core %x:%x"
+ " history: 0x%016llx (mid)\n",
+ chip->id, pir_to_core_id(c->pir), tmp);
+ }
+ }
+
+
+ /* Wake everybody except on my core */
+ for_each_cpu(cpu) {
+ if (cpu->state != cpu_state_rvwinkle ||
+ cpu_is_sibling(cpu, this_cpu()))
+ continue;
+ icp_kick_cpu(cpu);
+
+ /* Wait for it to claim to be back (XXX ADD TIMEOUT) */
+ while(cpu->state != cpu_state_active)
+ sync();
+ }
+
+ /* Did we find a waker ? If we didn't, that means we had no
+ * other core in the system, we can't do it
+ */
+ if (!has_waker) {
+ prlog(PR_TRACE, "SLW: No candidate waker, giving up !\n");
+ return OPAL_HARDWARE;
+ }
+
+ /* Our siblings are rvwinkling, and our waker is waiting for us
+ * so let's just go down now
+ */
+ slw_do_rvwinkle(NULL);
+
+ slw_unpatch_reset();
+
+ for_each_chip(chip)
+ slw_cleanup_chip(chip);
+
+ prlog(PR_TRACE, "SLW Reinit complete !\n");
+
+ return OPAL_SUCCESS;
+}
+
+static void slw_patch_regs(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+ void *image = (void *)chip->slw_base;
+ int rc;
+
+ for_each_available_cpu(c) {
+ if (c->chip_id != chip->id)
+ continue;
+
+ /* Clear HRMOR */
+ rc = p8_pore_gen_cpureg_fixed(image, P8_SLW_MODEBUILD_SRAM,
+ P8_SPR_HRMOR, 0,
+ cpu_get_core_index(c),
+ cpu_get_thread_index(c));
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Failed to set HRMOR for CPU %x\n",
+ c->pir);
+ }
+
+ /* XXX Add HIDs etc... */
+ }
+}
+
+static void slw_init_chip_p9(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+
+ prlog(PR_DEBUG, "SLW: Init chip 0x%x\n", chip->id);
+
+ /* At power ON setup inits for power-mgt */
+ for_each_available_core_in_chip(c, chip->id)
+ slw_set_overrides_p9(chip, c);
+
+
+}
+
+static void slw_init_chip_p10(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+
+ prlog(PR_DEBUG, "SLW: Init chip 0x%x\n", chip->id);
+
+ /* At power ON setup inits for power-mgt */
+ for_each_available_core_in_chip(c, chip->id)
+ slw_set_overrides_p10(chip, c);
+
+
+}
+
+
+static bool slw_image_check_p9(struct proc_chip *chip)
+{
+
+ if (!chip->homer_base) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: HOMER base not set %x\n",
+ chip->id);
+ return false;
+ } else
+ return true;
+
+
+}
+
+static bool slw_image_check_p8(struct proc_chip *chip)
+{
+ int64_t rc;
+
+ prlog(PR_DEBUG, "SLW: slw_check chip 0x%x\n", chip->id);
+ if (!chip->slw_base) {
+ prerror("SLW: No image found !\n");
+ return false;
+ }
+
+ /* Check actual image size */
+ rc = sbe_xip_get_scalar((void *)chip->slw_base, "image_size",
+ &chip->slw_image_size);
+ if (rc != 0) {
+ log_simple_error(&e_info(OPAL_RC_SLW_INIT),
+ "SLW: Error %lld reading SLW image size\n", rc);
+ /* XXX Panic ? */
+ chip->slw_base = 0;
+ chip->slw_bar_size = 0;
+ chip->slw_image_size = 0;
+ return false;
+ }
+ prlog(PR_DEBUG, "SLW: Image size from image: 0x%llx\n",
+ chip->slw_image_size);
+
+ if (chip->slw_image_size > chip->slw_bar_size) {
+ log_simple_error(&e_info(OPAL_RC_SLW_INIT),
+ "SLW: Built-in image size larger than BAR size !\n");
+ /* XXX Panic ? */
+ return false;
+ }
+ return true;
+
+}
+
+static void slw_late_init_p8(struct proc_chip *chip)
+{
+
+ prlog(PR_DEBUG, "SLW: late Init chip 0x%x\n", chip->id);
+
+ /* Patch SLW image */
+ slw_patch_regs(chip);
+
+}
+static void slw_init_chip_p8(struct proc_chip *chip)
+{
+ struct cpu_thread *c;
+
+ prlog(PR_DEBUG, "SLW: Init chip 0x%x\n", chip->id);
+ /* At power ON setup inits for fast-sleep */
+ for_each_available_core_in_chip(c, chip->id) {
+ idle_prepare_core(chip, c);
+ }
+}
+
+/* Workarounds while entering fast-sleep */
+
+static void fast_sleep_enter(void)
+{
+ uint32_t core = pir_to_core_id(this_cpu()->pir);
+ uint32_t chip_id = this_cpu()->chip_id;
+ struct cpu_thread *primary_thread;
+ uint64_t tmp;
+ int rc;
+
+ primary_thread = this_cpu()->primary;
+
+ rc = xscom_read(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1),
+ &tmp);
+ if (rc) {
+ prlog(PR_WARNING, "fast_sleep_enter XSCOM failed(1):"
+ " rc=%d chip_id=%d core=%d\n",
+ rc, chip_id, core);
+ return;
+ }
+
+ primary_thread->save_l2_fir_action1 = tmp;
+ primary_thread->in_fast_sleep = true;
+
+ tmp = tmp & ~0x0200000000000000ULL;
+ rc = xscom_write(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1),
+ tmp);
+ if (rc) {
+ prlog(PR_WARNING, "fast_sleep_enter XSCOM failed(2):"
+ " rc=%d chip_id=%d core=%d\n",
+ rc, chip_id, core);
+ return;
+ }
+ rc = xscom_read(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1),
+ &tmp);
+ if (rc) {
+ prlog(PR_WARNING, "fast_sleep_enter XSCOM failed(3):"
+ " rc=%d chip_id=%d core=%d\n",
+ rc, chip_id, core);
+ return;
+ }
+
+}
+
+/* Workarounds while exiting fast-sleep */
+
+void fast_sleep_exit(void)
+{
+ uint32_t core = pir_to_core_id(this_cpu()->pir);
+ uint32_t chip_id = this_cpu()->chip_id;
+ struct cpu_thread *primary_thread;
+ int rc;
+
+ primary_thread = this_cpu()->primary;
+ primary_thread->in_fast_sleep = false;
+
+ rc = xscom_write(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1),
+ primary_thread->save_l2_fir_action1);
+ if (rc) {
+ prlog(PR_WARNING, "fast_sleep_exit XSCOM failed:"
+ " rc=%d chip_id=%d core=%d\n",
+ rc, chip_id, core);
+ return;
+ }
+}
+
+/*
+ * Setup and cleanup method for fast-sleep workarounds
+ * state = 1 fast-sleep
+ * enter = 1 Enter state
+ * enter = 0 Exit state
+ */
+
+static int64_t opal_config_cpu_idle_state(uint64_t state, uint64_t enter)
+{
+ /* Only fast-sleep for now */
+ if (state != 1)
+ return OPAL_PARAMETER;
+
+ switch(enter) {
+ case 1:
+ fast_sleep_enter();
+ break;
+ case 0:
+ fast_sleep_exit();
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+opal_call(OPAL_CONFIG_CPU_IDLE_STATE, opal_config_cpu_idle_state, 2);
+
+int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val)
+{
+
+ struct cpu_thread *c = find_cpu_by_pir(cpu_pir);
+ struct proc_chip *chip;
+ int rc;
+
+ if (!c) {
+ prerror("SLW: Unknown thread with pir %x\n", (u32) cpu_pir);
+ return OPAL_PARAMETER;
+ }
+
+ chip = get_chip(c->chip_id);
+ if (!chip) {
+ prerror("SLW: Unknown chip for thread with pir %x\n",
+ (u32) cpu_pir);
+ return OPAL_PARAMETER;
+ }
+
+ if (proc_gen >= proc_gen_p9) {
+ if (!has_deep_states) {
+ prlog(PR_INFO, "SLW: Deep states not enabled\n");
+ return OPAL_SUCCESS;
+ }
+
+ if (wakeup_engine_state != WAKEUP_ENGINE_PRESENT) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: wakeup_engine in bad state=%d chip=%x\n",
+ wakeup_engine_state,chip->id);
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (proc_gen == proc_gen_p9) {
+ rc = p9_stop_save_cpureg((void *)chip->homer_base,
+ sprn, val, cpu_pir);
+ } else {
+ rc = proc_stop_save_cpureg((void *)chip->homer_base,
+ sprn, val, cpu_pir);
+ }
+
+ } else if (proc_gen == proc_gen_p8) {
+ int spr_is_supported = 0;
+ void *image;
+ int i;
+
+ /* Check if the SPR is supported by libpore */
+ for (i = 0; i < SLW_SPR_REGS_SIZE ; i++) {
+ if (sprn == SLW_SPR_REGS[i].value) {
+ spr_is_supported = 1;
+ break;
+ }
+ }
+ if (!spr_is_supported) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Trying to set unsupported spr for CPU %x\n",
+ c->pir);
+ return OPAL_UNSUPPORTED;
+ }
+ image = (void *)chip->slw_base;
+ rc = p8_pore_gen_cpureg_fixed(image, P8_SLW_MODEBUILD_SRAM,
+ sprn, val,
+ cpu_get_core_index(c),
+ cpu_get_thread_index(c));
+ } else {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: proc_gen not supported\n");
+ return OPAL_UNSUPPORTED;
+
+ }
+
+ if (rc) {
+ log_simple_error(&e_info(OPAL_RC_SLW_REG),
+ "SLW: Failed to set spr %llx for CPU %x, RC=0x%x\n",
+ sprn, c->pir, rc);
+ return OPAL_INTERNAL_ERROR;
+ }
+ prlog(PR_DEBUG, "SLW: restore spr:0x%llx on c:0x%x with 0x%llx\n",
+ sprn, c->pir, val);
+ return OPAL_SUCCESS;
+
+}
+
+opal_call(OPAL_SLW_SET_REG, opal_slw_set_reg, 3);
+
+void slw_init(void)
+{
+ struct proc_chip *chip;
+
+ if (proc_chip_quirks & QUIRK_MAMBO_CALLOUTS) {
+ wakeup_engine_state = WAKEUP_ENGINE_NOT_PRESENT;
+ add_cpu_idle_state_properties();
+ return;
+ }
+ if (proc_gen == proc_gen_p8) {
+ for_each_chip(chip) {
+ slw_init_chip_p8(chip);
+ if(slw_image_check_p8(chip))
+ wakeup_engine_state = WAKEUP_ENGINE_PRESENT;
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT)
+ slw_late_init_p8(chip);
+ }
+ p8_sbe_init_timer();
+ } else if (proc_gen == proc_gen_p9) {
+ for_each_chip(chip) {
+ slw_init_chip_p9(chip);
+ if(slw_image_check_p9(chip))
+ wakeup_engine_state = WAKEUP_ENGINE_PRESENT;
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT)
+ slw_late_init_p9(chip);
+ }
+ } else if (proc_gen == proc_gen_p10) {
+ for_each_chip(chip) {
+ slw_init_chip_p10(chip);
+ if(slw_image_check_p9(chip))
+ wakeup_engine_state = WAKEUP_ENGINE_PRESENT;
+ if (wakeup_engine_state == WAKEUP_ENGINE_PRESENT) {
+ slw_late_init_p10(chip);
+ }
+ }
+ }
+ add_cpu_idle_state_properties();
+}
diff --git a/roms/skiboot/hw/test/Makefile.check b/roms/skiboot/hw/test/Makefile.check
new file mode 100644
index 000000000..45eb8072f
--- /dev/null
+++ b/roms/skiboot/hw/test/Makefile.check
@@ -0,0 +1,29 @@
+# -*-Makefile-*-
+SUBDIRS += hw/test/
+HW_TEST := hw/test/phys-map-test hw/test/run-port80h
+
+.PHONY : hw-check
+hw-check: $(HW_TEST:%=%-check)
+
+.PHONY : hw-coverage
+hw-coverage: $(HW_TEST:%=%-gcov-run)
+
+check: hw-check
+coverage: hw-coverage
+
+$(HW_TEST:%=%-gcov-run) : %-run: %
+ $(call QTEST, TEST-COVERAGE ,$< , $<)
+
+$(HW_TEST:%=%-check) : %-check: %
+ $(call QTEST, RUN-TEST ,$(VALGRIND) $<, $<)
+
+$(HW_TEST) : % : %.c hw/phys-map.o
+ $(call Q, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) -O0 -g -I include -I . -o $@ $<, $<)
+
+$(HW_TEST:%=%-gcov): %-gcov : %.c %
+ $(call QTEST, HOSTCC ,$(HOSTCC) $(HOSTCFLAGS) $(HOSTGCOVCFLAGS) -I include -I . -lgcov -o $@ $<, $<)
+
+clean: hw-clean
+
+hw-clean:
+ $(RM) -f hw/test/*.[od] $(HW_TEST) $(HW_TEST:%=%-gcov)
diff --git a/roms/skiboot/hw/test/phys-map-test.c b/roms/skiboot/hw/test/phys-map-test.c
new file mode 100644
index 000000000..d507175fe
--- /dev/null
+++ b/roms/skiboot/hw/test/phys-map-test.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Physical memory map test
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#include "../../core/test/stubs.c"
+#include "../phys-map.c"
+
+enum proc_gen proc_gen;
+
+static inline void print_entry(const struct phys_map_entry *e)
+{
+ printf("type:%i index:%i addr:%016lx size:%016lx",
+ e->type, e->index, e->addr, e->size);
+}
+
+/* Check table directly for overlaps */
+static void check_table_directly(void)
+{
+ const struct phys_map_entry *e, *prev;
+ uint64_t start, end, pstart, pend;
+ bool passed;
+
+ /* Loop over table entries ... */
+ for (e = phys_map->table; !phys_map_entry_null(e); e++) {
+
+ start = e->addr;
+ end = e->addr + e->size;
+ /* ... see if they overlap with previous entries */
+ for (prev = phys_map->table; prev != e; prev++) {
+ passed = true;
+ /* Check for overlapping regions */
+ pstart = prev->addr;
+ pend = prev->addr + prev->size;
+ if ((start > pstart) && (start < pend))
+ passed = false;
+ if ((end > pstart) && (end < pend))
+ passed = false;
+
+ /* Check for duplicate entries */
+ if ((e->type == prev->type) &&
+ (e->index == prev->index))
+ passed = false;
+
+ if (passed)
+ continue;
+
+ printf("Phys map direct test FAILED: Entry overlaps\n");
+ printf("First: ");
+ print_entry(prev);
+ printf("\n");
+ printf("Second: ");
+ print_entry(e);
+ printf("\n");
+ assert(0);
+ }
+ }
+}
+
+struct map_call_entry {
+ uint64_t start;
+ uint64_t end;
+};
+
+static inline bool map_call_entry_null(const struct map_call_entry *t)
+{
+ if ((t->start == 0) &&
+ (t->end == 0))
+ return true;
+ return false;
+}
+
+/* Check calls to map to see if they overlap.
+ * Creates a new table for each of the entries it gets to check against
+ */
+
+/* Pick a chip ID, any ID. */
+#define FAKE_CHIP_ID 8
+
+struct proc_chip *get_chip(uint32_t chip_id __unused)
+{
+ return NULL;
+}
+
+static void check_map_call(void)
+{
+ uint64_t start, size, end;
+ const struct phys_map_entry *e;
+ struct map_call_entry *tbl, *t, *tnext;
+ int tbl_size = 0;
+ bool passed;
+
+ for (e = phys_map->table; !phys_map_entry_null(e); e++)
+ tbl_size++;
+
+ tbl_size++; /* allow for null entry at end */
+ tbl_size *= sizeof(struct map_call_entry);
+ tbl = malloc(tbl_size);
+ assert(tbl != NULL);
+ memset(tbl, 0, tbl_size);
+
+ /* Loop over table entries ... */
+ for (e = phys_map->table; !phys_map_entry_null(e); e++) {
+ __phys_map_get(FAKE_CHIP_ID, FAKE_CHIP_ID, e->type, e->index, &start, &size);
+
+ /* Check for alignment */
+ if ((e->type != SYSTEM_MEM) && (e->type != RESV)) {
+ /* Size is power of 2? */
+ assert(__builtin_popcountl(size) == 1);
+ /* Start is aligned to size? */
+ assert((start % size) == 0);
+ }
+
+ end = start + size;
+ for (t = tbl; !map_call_entry_null(t); t++) {
+ passed = true;
+
+ /* Check for overlapping regions */
+ if ((start > t->start) && (start < t->end))
+ passed = false;
+ if ((end > t->start) && (end < t->end))
+ passed = false;
+
+ if (passed)
+ continue;
+
+ printf("Phys map call test FAILED: Entry overlaps\n");
+ printf("First: addr:%016lx size:%016lx\n",
+ t->start, t->end - t->start);
+ printf("Second: addr:%016lx size:%016lx\n ",
+ start, size);
+ print_entry(e);
+ printf("\n");
+ assert(0);
+ }
+ /* Insert entry at end of table */
+ t->start = start;
+ t->end = end;
+ }
+
+ for (t = tbl; !map_call_entry_null(t + 1); t++) {
+ tnext = t + 1;
+ /* Make sure the table is sorted */
+ if (t->start > tnext->start) {
+ printf("Phys map test FAILED: Entry not sorted\n");
+ printf("First: addr:%016lx size:%016lx\n",
+ t->start, t->end - t->start);
+ printf("Second: addr:%016lx size:%016lx\n",
+ tnext->start, tnext->end - tnext->start);
+ assert(0);
+ }
+
+ /* Look for holes in the table in MMIO region */
+ /* We assume over 1PB is MMIO. */
+ if ((t->end != tnext->start) &&
+ (t->start > 0x0004000000000000)) {
+ printf("Phys map test FAILED: Hole in map\n");
+ printf("First: addr:%016lx size:%016lx\n",
+ t->start, t->end - t->start);
+ printf("Second: addr:%016lx size:%016lx\n",
+ tnext->start, tnext->end - tnext->start);
+ assert(0);
+ }
+ }
+
+ free(tbl);
+}
+
+/* Fake PVR definitions. See include/processor.h */
+unsigned long fake_pvr[] = {
+ 0x004e0200, /* PVR_P9 */
+ 0x004f0100, /* PVR_P9P */
+ 0x00800100, /* PVR_P10 */
+};
+
+int main(void)
+{
+ for (int i = 0; i < ARRAY_SIZE(fake_pvr); i++) {
+ switch(PVR_TYPE(fake_pvr[i])) {
+ case PVR_TYPE_P9:
+ case PVR_TYPE_P9P:
+ proc_gen = proc_gen_p9;
+ break;
+ case PVR_TYPE_P10:
+ proc_gen = proc_gen_p10;
+ break;
+ default:
+ printf("Unknown PVR 0x%lx\n", fake_pvr[i]);
+ return 1;
+ break;
+ }
+
+ phys_map_init(fake_pvr[i]);
+
+ /* Run tests */
+ check_table_directly();
+ check_map_call();
+ }
+
+ return(0);
+}
diff --git a/roms/skiboot/hw/test/run-port80h.c b/roms/skiboot/hw/test/run-port80h.c
new file mode 100644
index 000000000..860a4244d
--- /dev/null
+++ b/roms/skiboot/hw/test/run-port80h.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * Test result of our LPC port 80h boot progress code
+ *
+ * Copyright 2018-2019 IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <assert.h>
+
+#define __unused __attribute__((unused))
+
+#define __LPC_H
+
+uint8_t port80;
+uint16_t port8x;
+
+static int64_t lpc_probe_write(int addr_type __unused, uint32_t addr,
+ uint32_t data, uint32_t sz)
+{
+ assert((addr - 0x80) <= 2);
+ assert(sz == 1);
+ if (addr == 0x80)
+ port80 = data;
+ if (addr == 0x81)
+ port8x = data << 8 | (port8x & 0xff);
+ if (addr == 0x82)
+ port8x = (port8x & 0xff00) | data;
+ return 0;
+}
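+
+/*
+ * The stub above records what the port 80h code writes: port 0x80 holds
+ * the single-byte progress code, while ports 0x81/0x82 form a 16-bit
+ * extended code (0x81 = high byte, 0x82 = low byte). The asserts in
+ * main() below check both against the expected encodings.
+ */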
+
+#include "op-panel.h"
+
+void op_display_lpc(enum op_severity s, enum op_module m, uint16_t c);
+
+#include "../lpc-port80h.c"
+#include "../../core/test/stubs.c"
+
+enum proc_chip_quirks proc_chip_quirks;
+
+int main(void)
+{
+ op_display_lpc(OP_LOG, OP_MOD_INIT, 0x00);
+ assert(port80 == 0x80);
+ assert(port8x == 0x8000);
+ op_display_lpc(OP_WARN, OP_MOD_INIT, 0x00);
+ assert(port80 == 0x82);
+ assert(port8x == 0x8002);
+ op_display_lpc(OP_ERROR, OP_MOD_INIT, 0x00);
+ assert(port80 == 0x81);
+ assert(port8x == 0x8001);
+ op_display_lpc(OP_FATAL, OP_MOD_INIT, 0x00);
+ assert(port80 == 0x83);
+ assert(port8x == 0x8003);
+ op_display_lpc(OP_FATAL, OP_MOD_INIT, 0x0f);
+ assert(port80 == 0xBF);
+ assert(port8x == 0x803F);
+ op_display_lpc(OP_LOG, OP_MOD_INIT, 0x0f);
+ assert(port80 == 0xBC);
+ assert(port8x == 0x803C);
+ op_display_lpc(OP_FATAL, OP_MOD_CORE, 0x6666);
+ assert(port80 == 0xBF);
+ assert(port8x == 0x803F);
+ op_display_lpc(OP_LOG, OP_MOD_INIT, 0x01);
+ assert(port80 == 0x84);
+ assert(port8x == 0x8004);
+ op_display_lpc(OP_LOG, OP_MOD_CPU, 0x05);
+ assert(port80 == 0xC4);
+ assert(port8x == 0xC014);
+ op_display_lpc(OP_LOG, OP_MOD_LOCK, 0x07);
+ assert(port80 == 0xDC);
+ assert(port8x == 0xD01C);
+ op_display_lpc(OP_FATAL, OP_MOD_LOCK, 0x07);
+ assert(port80 == 0xDF);
+ assert(port8x == 0xD01F);
+ op_display_lpc(OP_FATAL, OP_MOD_MEM, 0x07);
+ assert(port80 == 0xEF);
+ assert(port8x == 0xE01F);
+ op_display_lpc(OP_WARN, OP_MOD_MEM, 0x02);
+ assert(port80 == 0xEA);
+ assert(port8x == 0xE00A);
+ op_display_lpc(OP_WARN, OP_MOD_CHIPTOD, 0x02);
+ assert(port80 == 0xFA);
+ assert(port8x == 0xF00A);
+
+ /*
+ * We can't assert that OP_MOD_FSP is invalid as we'd end up
+ * trying to set port80 in the assert path
+ */
+ op_display_lpc(OP_LOG, OP_MOD_FSP, 0x00);
+ assert(port80 == 0x80);
+ assert(port8x == 0x8000);
+ op_display_lpc(OP_LOG, OP_MOD_FSPCON, 0x00);
+ assert(port80 == 0x80);
+ assert(port8x == 0x8000);
+ return 0;
+}
diff --git a/roms/skiboot/hw/vas.c b/roms/skiboot/hw/vas.c
new file mode 100644
index 000000000..0dbe0bcda
--- /dev/null
+++ b/roms/skiboot/hw/vas.c
@@ -0,0 +1,639 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/* Copyright 2013-2018 IBM Corp. */
+
+#include <skiboot.h>
+#include <chip.h>
+#include <phys-map.h>
+#include <xscom.h>
+#include <io.h>
+#include <xive.h>
+#include <interrupts.h>
+#include <nvram.h>
+#include <vas.h>
+
+#define vas_err(__fmt,...) prlog(PR_ERR,"VAS: " __fmt, ##__VA_ARGS__)
+
+#ifdef VAS_VERBOSE_DEBUG
+#define vas_vdbg(__x,__fmt,...) prlog(PR_DEBUG,"VAS: " __fmt, ##__VA_ARGS__)
+#else
+#define vas_vdbg(__x,__fmt,...) do { } while (0)
+#endif
+
+static int vas_initialized;
+
+struct vas {
+ uint32_t chip_id;
+ uint32_t vas_id;
+ uint64_t xscom_base;
+ uint64_t wcbs;
+ uint32_t vas_irq;
+ uint64_t vas_port;
+};
+
+static inline void get_hvwc_mmio_bar(int chipid, uint64_t *start, uint64_t *len)
+{
+ phys_map_get(chipid, VAS_HYP_WIN, 0, start, len);
+}
+
+static inline void get_uwc_mmio_bar(int chipid, uint64_t *start, uint64_t *len)
+{
+ phys_map_get(chipid, VAS_USER_WIN, 0, start, len);
+}
+
+static inline uint64_t compute_vas_scom_addr(struct vas *vas, uint64_t reg)
+{
+ return vas->xscom_base + reg;
+}
+
+static int vas_scom_write(struct proc_chip *chip, uint64_t reg, uint64_t val)
+{
+ int rc;
+ uint64_t addr;
+
+ addr = compute_vas_scom_addr(chip->vas, reg);
+
+ rc = xscom_write(chip->id, addr, val);
+ if (rc != OPAL_SUCCESS) {
+ vas_err("Error writing 0x%llx to 0x%llx, rc %d\n", val, addr,
+ rc);
+ }
+
+ return rc;
+}
+
+/*
+ * Return true if NX crypto/compression is enabled on this processor.
+ *
+ * On POWER8, NX-842 crypto and compression are allowed, but they do not
+ * use VAS (return true).
+ *
+ * On POWER9, NX 842 and GZIP compression use VAS, but the PASTE instruction
+ * (and hence VAS) is not enabled in the following revisions:
+ *
+ * - Nimbus DD1.X, DD2.01, DD2.1
+ * - Cumulus DD1.0
+ *
+ * Return false for these revisions. Return true otherwise.
+ */
+__attrconst inline bool vas_nx_enabled(void)
+{
+ uint32_t pvr;
+ int major, minor;
+ struct proc_chip *chip;
+
+ chip = next_chip(NULL);
+
+ pvr = mfspr(SPR_PVR);
+ major = PVR_VERS_MAJ(pvr);
+ minor = PVR_VERS_MIN(pvr);
+
+ switch (chip->type) {
+ case PROC_CHIP_P9_NIMBUS:
+ return (major > 2 || (major == 2 && minor > 1));
+ case PROC_CHIP_P9_CUMULUS:
+ return (major > 1 || minor > 0);
+ default:
+ return true;
+ }
+}
+
+/* Interface for NX - make sure VAS is fully initialized first */
+__attrconst inline uint64_t vas_get_hvwc_mmio_bar(const int chipid)
+{
+ uint64_t addr;
+
+ if (!vas_initialized)
+ return 0ULL;
+
+ get_hvwc_mmio_bar(chipid, &addr, NULL);
+
+ return addr;
+}
+
+/* Interface for NX - make sure VAS is fully initialized first */
+__attrconst uint64_t vas_get_wcbs_bar(int chipid)
+{
+ struct proc_chip *chip;
+
+ if (!vas_initialized)
+ return 0ULL;
+
+ chip = get_chip(chipid);
+ if (!chip)
+ return 0ULL;
+
+ return chip->vas->wcbs;
+}
+
+static int init_north_ctl(struct proc_chip *chip)
+{
+ uint64_t val = 0ULL;
+
+ val = SETFIELD(VAS_64K_MODE_MASK, val, true);
+ val = SETFIELD(VAS_ACCEPT_PASTE_MASK, val, true);
+ val = SETFIELD(VAS_ENABLE_WC_MMIO_BAR, val, true);
+ val = SETFIELD(VAS_ENABLE_UWC_MMIO_BAR, val, true);
+ val = SETFIELD(VAS_ENABLE_RMA_MMIO_BAR, val, true);
+
+ return vas_scom_write(chip, VAS_MISC_N_CTL, val);
+}
+
+/*
+ * Ensure paste instructions are not accepted and MMIO BARs are disabled.
+ */
+static inline int reset_north_ctl(struct proc_chip *chip)
+{
+ return vas_scom_write(chip, VAS_MISC_N_CTL, 0ULL);
+}
+
+static void reset_fir(struct proc_chip *chip)
+{
+ vas_scom_write(chip, VAS_FIR0, 0x0000000000000000ULL);
+ /* From VAS workbook */
+ vas_scom_write(chip, VAS_FIR_MASK, 0x000001000001ffffULL);
+ vas_scom_write(chip, VAS_FIR_ACTION0, 0xf800fdfc0001ffffull);
+ vas_scom_write(chip, VAS_FIR_ACTION1, 0xf8fffefffffc8000ull);
+}
+
+/* VAS workbook: Section 1.3.3.1: Send Message w/ Paste Commands (cl_rma_w) */
+/* P9 paste base address format */
+#define P9_RMA_LSMP_64K_SYS_ID PPC_BITMASK(8, 12)
+#define P9_RMA_LSMP_64K_NODE_ID PPC_BITMASK(15, 18)
+#define P9_RMA_LSMP_64K_CHIP_ID PPC_BITMASK(19, 21)
+
+/* Paste base address format (on P10 or later) */
+#define RMA_FOREIGN_ADDR_ENABLE PPC_BITMASK(8, 11)
+#define RMA_TOPOLOGY_INDEX PPC_BITMASK(15, 19)
+
+#define RMA_LSMP_WINID_START_BIT 32
+#define RMA_LSMP_WINID_NUM_BITS 16
+
+/*
+ * The start/base of the paste BAR is computed using the tables 1.1 through
+ * 1.4 in Section 1.3.3.1 (Send Message w/Paste Commands (cl_rma_w)) of VAS
+ * P9 Workbook.
+ *
+ * With 64K mode and Large SMP Mode the bits are used as follows:
+ *
+ * Bits Values Comments
+ * --------------------------------------
+ * 0:7 0b 0000_0000 Reserved
+ * 8:12 0b 0000_1 System id/Foreign Index 0:4
+ * 13:14 0b 00 Foreign Index 5:6
+ *
+ * 15:18 0 through 15 Node id (0 through 15)
+ * 19:21 0 through 7 Chip id (0 through 7)
+ * 22:23 0b 00 Unused, Foreign index 7:8
+ *
+ * 24:31 0b 0000_0000 RPN 0:7, Reserved
+ * 32:47 0 through 64K Send Window Id
+ * 48:51 0b 0000 Spare
+ *
+ * 52 0b 0 Reserved
+ * 53 0b 1 Report Enable (Set to 1 for NX).
+ * 54 0b 0 Reserved
+ *
+ * 55:56 0b 00 Snoop Bus
+ * 57:63 0b 0000_000 Reserved
+ *
+ * Except for a few bits, the small SMP mode computation is similar.
+ *
+ * TODO: Detect and compute address for small SMP mode.
+ *
+ * Example: For Node 0, Chip 0, Window id 4, Report Enable 1:
+ *
+ * Byte0 Byte1 Byte2 Byte3 Byte4 Byte5 Byte6 Byte7
+ * 00000000 00001000 00000000 00000000 00000000 00000100 00000100 00000000
+ * | || | | | |
+ * +-+-++++ +-------+-------+ v
+ * | | | Report Enable
+ * v v v
+ * Node Chip Window id 4
+ *
+ * Thus the paste address for window id 4 is 0x00080000_00040400 and
+ * the _base_ paste address for Node 0 Chip 0 is 0x00080000_00000000.
+ */
+
+static void p9_get_rma_bar(int chipid, uint64_t *val)
+{
+ uint64_t v;
+
+ v = 0ULL;
+ v = SETFIELD(P9_RMA_LSMP_64K_SYS_ID, v, 1);
+ v = SETFIELD(P9_RMA_LSMP_64K_NODE_ID, v, P9_GCID2NODEID(chipid));
+ v = SETFIELD(P9_RMA_LSMP_64K_CHIP_ID, v, P9_GCID2CHIPID(chipid));
+
+ *val = v;
+}
+
+/*
+ * The start/base of the paste BAR is computed using the tables 1.1 through
+ * 1.3 in Section 1.3.3.1 (Send Message w/Paste Commands (cl_rma_w)) of VAS
+ * P10 Workbook.
+ *
+ * With 64K mode and Large SMP Mode the bits are used as follows:
+ *
+ * Bits Values Comments
+ * --------------------------------------
+ * 0:7 0b 0000_0000 Reserved
+ * 8:11 0b 0001 Foreign Address Enable
+ * 12 0b 0 SMF
+ * 13:14 0b 00 Memory Select
+ *
+ * 15:19 0 through 16 Topology Index
+ * 20:23 0b 0000 Chip Internal Address
+ *
+ * 24:31 0b 0000_0000 RPN 0:7, Reserved
+ * 32:47 0 through 64K Send Window Id
+ * 48:51 0b 0000 Spare
+ *
+ * 52 0b 0 Reserved
+ * 53 0b 1 Report Enable (Set to 1 for NX).
+ * 54 0b 0 Reserved
+ *
+ * 55:56 0b 00 Snoop Bus
+ * 57:63 0b 0000_000 Reserved
+ *
+ * Example: For Node 0, Chip 0, Window id 4, Report Enable 1:
+ *
+ * Byte0 Byte1 Byte2 Byte3 Byte4 Byte5 Byte6 Byte7
+ * 00000000 00010000 00000000 00000000 00000000 00000100 00000100 00000000
+ * | | | | |
+ * +---+ +-------+-------+ v
+ * | | Report Enable
+ * v v
+ * Topology Index Window id 4
+ *
+ * Thus the paste address for window id 4 is 0x00100000_00040400 and
+ * the _base_ paste address for Node 0 Chip 0 is 0x00100000_00000000.
+ *
+ * Note: Bit 11 (Foreign Address Enable) is set only for paste base address.
+ * Not for VAS/NX RMA BAR. RA(0:12) = 0 for VAS/NX RMA BAR.
+ */
+
+static void get_rma_bar(struct proc_chip *chip, uint64_t *val)
+{
+ uint64_t v;
+
+ v = 0ULL;
+ v = SETFIELD(RMA_TOPOLOGY_INDEX, v, chip->primary_topology);
+
+ *val = v;
+}
+
+/* Interface for NX - make sure VAS is fully initialized first */
+__attrconst uint64_t vas_get_rma_bar(int chipid)
+{
+ struct proc_chip *chip;
+ uint64_t addr;
+
+ if (!vas_initialized)
+ return 0ULL;
+
+ chip = get_chip(chipid);
+ if (!chip)
+ return 0ULL;
+
+ get_rma_bar(chip, &addr);
+
+ return addr;
+}
+
+/*
+ * Initialize RMA BAR on this chip to correspond to its node/chip id.
+ * This will cause VAS to accept paste commands targeted for this chip.
+ * Initialize RMA Base Address Mask Register (BAMR) to its default value.
+ */
+static int init_rma(struct proc_chip *chip)
+{
+ int rc;
+ uint64_t val;
+
+ if (proc_gen == proc_gen_p9)
+ p9_get_rma_bar(chip->id, &val);
+ else
+ get_rma_bar(chip, &val);
+
+ rc = vas_scom_write(chip, VAS_RMA_BAR, val);
+ if (rc)
+ return rc;
+
+ val = SETFIELD(VAS_RMA_BAMR_ADDR_MASK, 0ULL, 0xFFFC0000000ULL);
+
+ return vas_scom_write(chip, VAS_RMA_BAMR, val);
+}
+
+/*
+ * get_paste_bar():
+ *
+ * Compute and return the "paste base address region" for @chipid. This
+ * BAR contains the "paste" addreses for all windows on the chip. Linux
+ * uses this paste BAR to compute the hardware paste address of a (send)
+ * window using:
+ *
+ * paste_addr = base + (winid << shift)
+ *
+ * where winid is the window index and shift is computed as:
+ *
+ * start = RMA_LSMP_WINID_START_BIT;
+ * nbits = RMA_LSMP_WINID_NUM_BITS;
+ * shift = 63 - (start + nbits - 1);
+ *
+ * See also get_paste_bitfield() below, which is used to export the 'start'
+ * and 'nbits' to Linux through the DT.
+ *
+ * Each chip supports VAS_WINDOWS_PER_CHIP (64K on Power9) windows. To
+ * provide proper isolation, the paste address for each window is on a
+ * separate page. Thus with a page size of 64K, the length of the paste
+ * BAR for a chip is VAS_WINDOWS_PER_CHIP times 64K (or 4GB for Power9).
+ *
+ */
+#define VAS_PASTE_BAR_LEN (1ULL << 32) /* 4GB - see above */
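+
+/*
+ * Worked example of the shift computation described above: with
+ * start = RMA_LSMP_WINID_START_BIT (32) and nbits = RMA_LSMP_WINID_NUM_BITS
+ * (16), shift = 63 - (32 + 16 - 1) = 16, so window id 4 pastes at
+ * base + (4 << 16) = base + 0x40000. That matches the workbook example
+ * address 0x..._00040400, whose extra 0x400 is the Report Enable bit
+ * (bit 53).
+ */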
+
+static inline void get_paste_bar(int chipid, uint64_t *start, uint64_t *len)
+{
+ struct proc_chip *chip;
+ uint64_t val;
+
+ if (proc_gen == proc_gen_p9)
+ p9_get_rma_bar(chipid, &val);
+ else {
+ chip = get_chip(chipid);
+ if (!chip)
+ return;
+
+ get_rma_bar(chip, &val);
+
+ /*
+ * RA(11) (Foreign Address Enable) is set only for paste
+ * base address.
+ */
+ val = SETFIELD(RMA_FOREIGN_ADDR_ENABLE, val, 1);
+ }
+
+ *start = val;
+ *len = VAS_PASTE_BAR_LEN;
+}
+
+/*
+ * get_paste_bitfield():
+ *
+ * As explained in the function header for get_paste_bar(), the window
+ * id is encoded in bits 32:47 of the paste address. Export this bitfield
+ * to Linux via the device tree as a reg property (with start bit and
+ * number of bits).
+ */
+static inline void get_paste_bitfield(uint64_t *start, uint64_t *n_bits)
+{
+ *start = (uint64_t)RMA_LSMP_WINID_START_BIT;
+ *n_bits = (uint64_t)RMA_LSMP_WINID_NUM_BITS;
+}
+
+/*
+ * Window Context MMIO (WCM) Region for each chip is assigned in the P9
+ * MMIO MAP spreadsheet. Write this value to the SCOM address associated
+ * with WCM_BAR.
+ */
+static int init_wcm(struct proc_chip *chip)
+{
+ uint64_t wcmbar;
+
+ get_hvwc_mmio_bar(chip->id, &wcmbar, NULL);
+
+ /*
+ * Write the entire WCMBAR address to the SCOM address. VAS will
+ * extract bits that it thinks are relevant i.e bits 8..38
+ */
+ return vas_scom_write(chip, VAS_WCM_BAR, wcmbar);
+}
+
+/*
+ * OS/User Window Context MMIO (UWCM) Region for each chip is assigned in the
+ * P9 MMIO MAP spreadsheet. Write this value to the SCOM address associated
+ * with UWCM_BAR.
+ */
+static int init_uwcm(struct proc_chip *chip)
+{
+ uint64_t uwcmbar;
+
+ get_uwc_mmio_bar(chip->id, &uwcmbar, NULL);
+
+ /*
+ * Write the entire UWCMBAR address to the SCOM address. VAS will
+ * extract bits that it thinks are relevant i.e bits 8..35.
+ */
+ return vas_scom_write(chip, VAS_UWCM_BAR, uwcmbar);
+}
+
+static inline void free_wcbs(struct proc_chip *chip)
+{
+ if (chip->vas->wcbs) {
+ free((void *)chip->vas->wcbs);
+ chip->vas->wcbs = 0ULL;
+ }
+}
+
+/*
+ * VAS needs a backing store for the 64K window contexts on a chip.
+ * (64K times 512 = 8MB). This region needs to be contiguous, so
+ * allocate during early boot. Then write the allocated address to
+ * the SCOM address for the Backing store BAR.
+ */
+static int alloc_init_wcbs(struct proc_chip *chip)
+{
+ int rc;
+ uint64_t wcbs;
+ size_t size;
+
+ /* align to the backing store size */
+ size = (size_t)VAS_WCBS_SIZE;
+ wcbs = (uint64_t)local_alloc(chip->id, size, size);
+ if (!wcbs) {
+ vas_err("Unable to allocate memory for backing store\n");
+ return -ENOMEM;
+ }
+ memset((void *)wcbs, 0ULL, size);
+
+ /*
+ * Write entire WCBS_BAR address to the SCOM address. VAS will extract
+ * relevant bits.
+ */
+ rc = vas_scom_write(chip, VAS_WCBS_BAR, wcbs);
+ if (rc != OPAL_SUCCESS)
+ goto out;
+
+ chip->vas->wcbs = wcbs;
+ return OPAL_SUCCESS;
+
+out:
+ free((void *)wcbs);
+ return rc;
+}
+
+static struct vas *alloc_vas(uint32_t chip_id, uint32_t vas_id, uint64_t base)
+{
+ struct vas *vas;
+
+ vas = zalloc(sizeof(struct vas));
+ assert(vas);
+
+ vas->chip_id = chip_id;
+ vas->vas_id = vas_id;
+ vas->xscom_base = base;
+
+ return vas;
+}
+
+static void create_mm_dt_node(struct proc_chip *chip)
+{
+ struct dt_node *dn;
+ struct vas *vas;
+ const char *compat;
+ uint64_t hvwc_start, hvwc_len;
+ uint64_t uwc_start, uwc_len;
+ uint64_t pbf_start, pbf_nbits;
+ uint64_t pbar_start = 0, pbar_len = 0;
+
+ vas = chip->vas;
+ get_hvwc_mmio_bar(chip->id, &hvwc_start, &hvwc_len);
+ get_uwc_mmio_bar(chip->id, &uwc_start, &uwc_len);
+ get_paste_bar(chip->id, &pbar_start, &pbar_len);
+ get_paste_bitfield(&pbf_start, &pbf_nbits);
+
+ if (proc_gen == proc_gen_p9)
+ compat = "ibm,power9-vas";
+ else
+ compat = "ibm,power10-vas";
+
+ dn = dt_new_addr(dt_root, "vas", hvwc_start);
+
+ dt_add_property_strings(dn, "compatible", compat,
+ "ibm,vas");
+
+ dt_add_property_u64s(dn, "reg", hvwc_start, hvwc_len,
+ uwc_start, uwc_len,
+ pbar_start, pbar_len,
+ pbf_start, pbf_nbits);
+
+ dt_add_property_cells(dn, "ibm,vas-id", vas->vas_id);
+ dt_add_property_cells(dn, "ibm,chip-id", chip->id);
+ if (vas->vas_irq) {
+ dt_add_property_cells(dn, "interrupts", vas->vas_irq, 0);
+ dt_add_property_cells(dn, "interrupt-parent",
+ get_ics_phandle());
+ dt_add_property_u64(dn, "ibm,vas-port", vas->vas_port);
+ }
+}
+
+/*
+ * Disable one VAS instance.
+ *
+ * Free memory and ensure chip does not accept paste instructions.
+ */
+static void disable_vas_inst(struct dt_node *np)
+{
+ struct proc_chip *chip;
+
+ chip = get_chip(dt_get_chip_id(np));
+
+ if (!chip->vas)
+ return;
+
+ free_wcbs(chip);
+
+ reset_north_ctl(chip);
+}
+
+static void vas_setup_irq(struct proc_chip *chip)
+{
+ uint64_t port;
+ uint32_t irq;
+
+ irq = xive_alloc_ipi_irqs(chip->id, 1, 64);
+ if (irq == XIVE_IRQ_ERROR) {
+ vas_err("Failed to allocate interrupt sources for chipID %d\n",
+ chip->id);
+ return;
+ }
+
+ vas_vdbg("trigger port: 0x%p\n", xive_get_trigger_port(irq));
+
+ port = (uint64_t)xive_get_trigger_port(irq);
+
+ chip->vas->vas_irq = irq;
+ chip->vas->vas_port = port;
+}
+
+/*
+ * Initialize one VAS instance and enable it if @enable is true.
+ */
+static int init_vas_inst(struct dt_node *np, bool enable)
+{
+ uint32_t vas_id;
+ uint64_t xscom_base;
+ struct proc_chip *chip;
+
+ chip = get_chip(dt_get_chip_id(np));
+ vas_id = dt_prop_get_u32(np, "ibm,vas-id");
+ xscom_base = dt_get_address(np, 0, NULL);
+
+ chip->vas = alloc_vas(chip->id, vas_id, xscom_base);
+
+ if (!enable) {
+ reset_north_ctl(chip);
+ return 0;
+ }
+
+ if (alloc_init_wcbs(chip))
+ return -1;
+
+ reset_fir(chip);
+
+ if (init_wcm(chip) || init_uwcm(chip) || init_north_ctl(chip) ||
+ init_rma(chip))
+ return -1;
+
+ /*
+ * Use NVRAM 'vas-user-space' config for backward compatibility
+ * to older kernels. Remove this option in future if not needed.
+ */
+ if (nvram_query_eq_dangerous("vas-user-space", "enable"))
+ vas_setup_irq(chip);
+
+ create_mm_dt_node(chip);
+
+ prlog(PR_INFO, "VAS: Initialized chip %d\n", chip->id);
+ return 0;
+
+}
+
+void vas_init(void)
+{
+ bool enabled;
+ struct dt_node *np;
+ const char *compat;
+
+ if (proc_gen == proc_gen_p9)
+ compat = "ibm,power9-vas-x";
+ else if (proc_gen == proc_gen_p10)
+ compat = "ibm,power10-vas-x";
+ else
+ return;
+
+ enabled = vas_nx_enabled();
+
+ dt_for_each_compatible(dt_root, np, compat) {
+ if (init_vas_inst(np, enabled))
+ goto out;
+ }
+
+ vas_initialized = enabled;
+ return;
+
+out:
+ dt_for_each_compatible(dt_root, np, compat)
+ disable_vas_inst(np);
+
+ vas_err("Disabled (failed initialization)\n");
+ return;
+}
diff --git a/roms/skiboot/hw/xive.c b/roms/skiboot/hw/xive.c
new file mode 100644
index 000000000..51b03549a
--- /dev/null
+++ b/roms/skiboot/hw/xive.c
@@ -0,0 +1,5234 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * XIVE: eXternal Interrupt Virtualization Engine. POWER9 interrupt
+ * controller
+ *
+ * Copyright (c) 2016-2019, IBM Corporation.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <chip.h>
+#include <io.h>
+#include <xive.h>
+#include <xive-p9-regs.h>
+#include <xscom-p9-regs.h>
+#include <interrupts.h>
+#include <timebase.h>
+#include <bitmap.h>
+#include <buddy.h>
+#include <phys-map.h>
+#include <p9_stop_api.H>
+
+/* Always notify from EQ to VP (no EOI on EQs). Will speed up
+ * EOIs at the expense of potentially higher powerbus traffic.
+ */
+#define EQ_ALWAYS_NOTIFY
+
+/* Verbose debug */
+#undef XIVE_VERBOSE_DEBUG
+
+/* Extra debug options used in debug builds */
+#ifdef DEBUG
+#define XIVE_DEBUG_DUPLICATES
+#define XIVE_PERCPU_LOG
+#define XIVE_DEBUG_INIT_CACHE_UPDATES
+#define XIVE_EXTRA_CHECK_INIT_CACHE
+#undef XIVE_CHECK_MISROUTED_IPI
+#define XIVE_CHECK_LOCKS
+#else
+#undef XIVE_DEBUG_DUPLICATES
+#undef XIVE_PERCPU_LOG
+#undef XIVE_DEBUG_INIT_CACHE_UPDATES
+#undef XIVE_EXTRA_CHECK_INIT_CACHE
+#undef XIVE_CHECK_MISROUTED_IPI
+#undef XIVE_CHECK_LOCKS
+#endif
+
+/*
+ *
+ * VSDs, blocks, set translation etc...
+ *
+ * This stuff confused me to no end so here's an attempt at explaining
+ * my understanding of it and how I use it in OPAL & Linux
+ *
+ * For the following data structures, the XIVE use a mechanism called
+ * Virtualization Structure Tables (VST) to manage the memory layout
+ * and access: ESBs (Event State Buffers, aka IPI sources), EAS/IVT
+ * (Event assignment structures), END/EQs (Notification descriptors
+ * aka event queues) and NVT/VPD (Notification Virtual Targets).
+ *
+ * These structures divide those tables into 16 "blocks". Each XIVE
+ * instance has a definition for all 16 blocks that can either represent
+ * an actual table in memory or a remote XIVE MMIO port to access a
+ * block that is owned by that remote XIVE.
+ *
+ * Our SW design will consist of allocating one block per chip (and thus
+ * per XIVE instance) for now, thus giving us up to 16 supported chips in
+ * the system. We may have to revisit that if we ever support systems with
+ * more than 16 chips but that isn't on our radar at the moment or if we
+ * want to do like pHyp on some machines and dedicate 2 blocks per chip
+ * for some structures.
+ *
+ * Thus we need to be careful that we never expose to Linux the concept
+ * of block and block boundaries, but instead we provide full number ranges
+ * so that consecutive blocks can be supported.
+ *
+ * We will pre-allocate some of the tables in order to support a "fallback"
+ * mode of operation where an old-style XICS is emulated via OPAL calls. This
+ * is achieved by having a default of one VP per physical thread associated
+ * with one EQ and one IPI. There are also enough EATs to cover all the PHBs.
+ *
+ * Similarly, for MMIO access, the BARs support what is called "set
+ * translation" which allows the BAR to be divided into a certain
+ * number of sets. The VC BAR (ESBs, ENDs, ...) supports 64 sets and
+ * the PC BAR supports 16. Each "set" can be routed to a specific
+ * block and offset within a block.
+ *
+ * For now, we will not use much of that functionality. We will use a
+ * fixed split between ESBs and ENDs for the VC BAR as defined by the
+ * constants below, and we will allocate all the PC BAR sets to the
+ * local block of that chip.
+ */
+
+#define XIVE_VSD_SIZE sizeof(u64)
+
+/* VC BAR contains set translations for the ESBs and the EQs.
+ *
+ * It's divided in 64 sets, each of which can be either ESB pages or EQ pages.
+ * The table configuring this is the EDT
+ *
+ * Additionally, the ESB pages come in pairs if Linux_Trig_Mode isn't enabled
+ * (which we won't enable for now as it assumes write-only permission, which
+ * the MMU doesn't support).
+ *
+ * To get started we just hard wire the following setup:
+ *
+ * VC_BAR size is 512G. We split it into 384G of ESBs (48 sets) and 128G
+ * of ENDs (16 sets) for the time being, i.e. each set is thus 8GB.
+ */
+
+#define VC_ESB_SETS 48
+#define VC_END_SETS 16
+#define VC_MAX_SETS 64
+
+/* The table configuring the PC set translation (16 sets) is the VDT */
+#define PC_MAX_SETS 16
+
+/* XXX This is currently the top limit on the number of ESB/SBE entries
+ * and EAS/IVT entries pre-allocated per chip. This should probably
+ * turn into a device-tree property or NVRAM setting, or maybe
+ * calculated from the amount of system RAM...
+ *
+ * This is currently set to 1M
+ *
+ * This is independent of the sizing of the MMIO space.
+ *
+ * WARNING: Due to how XICS emulation works, we cannot support more
+ * interrupts per chip at this stage as the full interrupt number
+ * (block + index) has to fit in a 24-bit number.
+ *
+ * That gives us a pre-allocated space of 256KB per chip for the state
+ * bits and 8M per chip for the EAS/IVT.
+ *
+ * Note: The HW interrupts from PCIe and similar other entities that
+ * use their own state bit array will have to share that IVT space,
+ * so we could potentially make the IVT size twice as big, but for now
+ * we will simply share it and ensure we don't hand out IPIs that
+ * overlap the HW interrupts.
+ *
+ * TODO: adjust the VC BAR range for IPI ESBs on this value
+ */
+
+#define XIVE_INT_ORDER 20 /* 1M interrupts */
+#define XIVE_INT_COUNT (1ul << XIVE_INT_ORDER)
+
+/*
+ * First interrupt number, also the first logical interrupt number
+ * allocated by Linux (the first numbers are reserved for ISA)
+ */
+#define XIVE_INT_FIRST 0x10
+
+/* Corresponding direct table sizes */
+
+#define SBE_PER_BYTE 4 /* PQ bits couples */
+#define SBE_SIZE (XIVE_INT_COUNT / SBE_PER_BYTE)
+#define IVT_SIZE (XIVE_INT_COUNT * sizeof(struct xive_ive))
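+/*
+ * In other words, with XIVE_INT_COUNT = 1M and 8-byte IVEs this comes to
+ * SBE_SIZE = 256K and IVT_SIZE = 8M per chip, matching the figures quoted
+ * above.
+ */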
+
+/* Use 64K for everything by default */
+#define XIVE_ESB_SHIFT (16 + 1) /* trigger + mgmt pages */
+#define XIVE_ESB_PAGE_SIZE (1ul << XIVE_ESB_SHIFT) /* 2 pages */
+
+/* Max number of EQs. We allocate an indirect table big enough so
+ * that when fully populated we can have that many EQs.
+ *
+ * The max number of EQs we support in our MMIO space is 128G/128K
+ * ie. 1M. Since one EQ is 8 words (32 bytes), a 64K page can hold
+ * 2K EQs. We need 512 pointers, ie, 4K of memory for the indirect
+ * table.
+ *
+ * TODO: adjust the VC BAR range for END ESBs on this value
+ */
+#define EQ_PER_PAGE (PAGE_SIZE / sizeof(struct xive_eq))
+
+#define XIVE_EQ_ORDER 20 /* 1M ENDs */
+#define XIVE_EQ_COUNT (1ul << XIVE_EQ_ORDER)
+#define XIVE_EQ_TABLE_SIZE ((XIVE_EQ_COUNT / EQ_PER_PAGE) * XIVE_VSD_SIZE)
+
+#define XIVE_EQ_SHIFT (16 + 1) /* ESn + ESe pages */
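+/*
+ * Working out the arithmetic above, assuming the 64K PAGE_SIZE and the
+ * 32-byte EQs described earlier:
+ *   EQ_PER_PAGE        = 64K / 32      = 2K
+ *   XIVE_EQ_TABLE_SIZE = (1M / 2K) * 8 = 4K  (512 VSD pointers)
+ */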
+
+/* Number of priorities (and thus EQDs) we allocate for each VP */
+#define NUM_INT_PRIORITIES 8
+
+/* Max priority number */
+#define XIVE_MAX_PRIO 7
+
+/* Priority used for the one queue in XICS emulation */
+#define XIVE_EMULATION_PRIO 7
+
+/* Priority used for gather/silent escalation (KVM) */
+#define XIVE_ESCALATION_PRIO 7
+
+/* Max number of VPs. We allocate an indirect table big enough so
+ * that when fully populated we can have that many VPs.
+ *
+ * The max number of VPs we support in our MMIO space is 64G/64K
+ * ie. 1M. Since one VP is 16 words (64 bytes), a 64K page can hold
+ * 1K VPs. We need 1024 pointers, i.e. 8K of memory for the indirect
+ * table.
+ *
+ * HOWEVER: A block supports only up to 512K VPs (19 bits of target
+ * in the EQ). Since we currently only support 1 block per chip,
+ * we will allocate half of the above. We might add support for
+ * 2 blocks per chip later if necessary.
+ *
+ * TODO: adjust the PC BAR range
+ */
+#define VP_PER_PAGE (PAGE_SIZE / sizeof(struct xive_vp))
+
+#define NVT_SHIFT 19 /* in sync with EQ_W6_NVT_INDEX */
+
+/*
+ * We use 8 priorities per VP and the number of EQs is configured to
+ * 1M. Therefore, our VP space is limited to 128k.
+ */
+#define XIVE_VP_ORDER (XIVE_EQ_ORDER - 3) /* 128k */
+#define XIVE_VP_COUNT (1ul << XIVE_VP_ORDER)
+#define XIVE_VP_TABLE_SIZE ((XIVE_VP_COUNT / VP_PER_PAGE) * XIVE_VSD_SIZE)
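+/*
+ * With the same 64K pages and the 64-byte VPs described above:
+ *   VP_PER_PAGE        = 64K / 64        = 1K
+ *   XIVE_VP_TABLE_SIZE = (128K / 1K) * 8 = 1K  (128 VSD pointers)
+ */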
+
+/*
+ * VP ids for HW threads.
+ *
+ * These values are hardcoded in the CAM line of the HW context and
+ * they depend on the thread id bits of the chip, 7 bits for P9.
+ *
+ * HW CAM Line |chip|000000000001|thrdid |
+ * 23bits 4 12 7
+ */
+#define XIVE_THREADID_SHIFT 7
+#define XIVE_HW_VP_BASE (1 << XIVE_THREADID_SHIFT)
+#define XIVE_HW_VP_COUNT (1 << XIVE_THREADID_SHIFT)
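+/*
+ * I.e. the VPs for HW threads occupy indices 0x80..0xff of each chip's
+ * block, one per possible 7-bit thread id (128 entries), matching the
+ * CAM line layout above.
+ */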
+
+/* The xive operation mode indicates the active "API" and corresponds
+ * to the "mode" parameter of the opal_xive_reset() call
+ */
+static enum {
+ XIVE_MODE_EMU = OPAL_XIVE_MODE_EMU,
+ XIVE_MODE_EXPL = OPAL_XIVE_MODE_EXPL,
+ XIVE_MODE_NONE,
+} xive_mode = XIVE_MODE_NONE;
+
+
+/* Each source controller has one of these. There's one embedded
+ * in the XIVE struct for IPIs
+ */
+struct xive_src {
+ struct irq_source is;
+ const struct irq_source_ops *orig_ops;
+ struct xive *xive;
+ void *esb_mmio;
+ uint32_t esb_base;
+ uint32_t esb_shift;
+ uint32_t flags;
+};
+
+#define LOG_TYPE_XIRR 0
+#define LOG_TYPE_XIRR2 1
+#define LOG_TYPE_POPQ 2
+#define LOG_TYPE_EOI 3
+#define LOG_TYPE_EQD 4
+
+struct xive_log_ent {
+ uint8_t type;
+ uint8_t cnt;
+ uint64_t tb;
+#define MAX_LOG_DATA 8
+ uint32_t data[MAX_LOG_DATA];
+};
+#define MAX_LOG_ENT 32
+
+struct xive_cpu_state {
+ struct xive *xive;
+ void *tm_ring1;
+
+#ifdef XIVE_PERCPU_LOG
+ struct xive_log_ent log[MAX_LOG_ENT];
+ uint32_t log_pos;
+#endif
+ /* Base HW VP and associated queues */
+ uint32_t vp_blk;
+ uint32_t vp_idx;
+ uint32_t eq_blk;
+ uint32_t eq_idx; /* Base eq index of a block of 8 */
+ void *eq_page;
+
+ /* Pre-allocated IPI */
+ uint32_t ipi_irq;
+
+ /* Used for XICS emulation */
+ struct lock lock;
+ uint8_t cppr;
+ uint8_t mfrr;
+ uint8_t pending;
+ uint8_t prev_cppr;
+ uint32_t *eqbuf;
+ uint32_t eqptr;
+ uint32_t eqmsk;
+ uint8_t eqgen;
+ void *eqmmio;
+ uint64_t total_irqs;
+};
+
+#ifdef XIVE_PERCPU_LOG
+
+static void log_add(struct xive_cpu_state *xs, uint8_t type,
+ uint8_t count, ...)
+{
+ struct xive_log_ent *e = &xs->log[xs->log_pos];
+ va_list args;
+ int i;
+
+ e->type = type;
+ e->cnt = count;
+ e->tb = mftb();
+ va_start(args, count);
+ for (i = 0; i < count; i++)
+ e->data[i] = va_arg(args, u32);
+ va_end(args);
+ xs->log_pos = xs->log_pos + 1;
+ if (xs->log_pos == MAX_LOG_ENT)
+ xs->log_pos = 0;
+}
+
+static void log_print(struct xive_cpu_state *xs)
+{
+ uint32_t pos = xs->log_pos;
+ uint8_t buf[256];
+ int i, j;
+ static const char *lts[] = {
+ ">XIRR",
+ "<XIRR",
+ " POPQ",
+ " EOI",
+ " EQD"
+ };
+ for (i = 0; i < MAX_LOG_ENT; i++) {
+ struct xive_log_ent *e = &xs->log[pos];
+ uint8_t *b = buf, *eb = &buf[255];
+
+ b += snprintf(b, eb-b, "%08llx %s ", e->tb,
+ lts[e->type]);
+ for (j = 0; j < e->cnt && b < eb; j++)
+ b += snprintf(b, eb-b, "%08x ", e->data[j]);
+ printf("%s\n", buf);
+ pos = pos + 1;
+ if (pos == MAX_LOG_ENT)
+ pos = 0;
+ }
+}
+
+#else /* XIVE_PERCPU_LOG */
+
+static inline void log_add(struct xive_cpu_state *xs __unused,
+ uint8_t type __unused,
+ uint8_t count __unused, ...) { }
+static inline void log_print(struct xive_cpu_state *xs __unused) { }
+
+#endif /* XIVE_PERCPU_LOG */
+
+struct xive {
+ uint32_t chip_id;
+ uint32_t block_id;
+ struct dt_node *x_node;
+
+ uint64_t xscom_base;
+
+ /* MMIO regions */
+ void *ic_base;
+ uint64_t ic_size;
+ uint32_t ic_shift;
+ void *tm_base;
+ uint64_t tm_size;
+ uint32_t tm_shift;
+ void *pc_base;
+ uint64_t pc_size;
+ void *vc_base;
+ uint64_t vc_size;
+
+ void *esb_mmio;
+ void *eq_mmio;
+
+ /* Set on XSCOM register access error */
+ bool last_reg_error;
+
+ /* Per-XIVE mutex */
+ struct lock lock;
+
+ /* Pre-allocated tables.
+ *
+ * We set up all the VSDs for actual tables (i.e. as opposed to
+ * forwarding ports) as either direct pre-allocated or indirect
+ * and partially populated.
+ *
+ * Currently, the ESB/SBE and the EAS/IVT tables are direct and
+ * fully pre-allocated based on XIVE_INT_COUNT.
+ *
+ * The other tables are indirect, we thus pre-allocate the indirect
+ * table (ie, pages of pointers) and populate enough of the pages
+ * for our basic setup using 64K pages.
+ *
+ * The size of the indirect tables are driven by XIVE_VP_COUNT and
+ * XIVE_EQ_COUNT. The number of pre-allocated ones are driven by
+ * XIVE_HW_VP_COUNT (number of EQ depends on number of VP) in block
+ * mode, otherwise we only preallocate INITIAL_BLK0_VP_COUNT on
+ * block 0.
+ */
+
+ /* Direct SBE and IVT tables */
+ void *sbe_base;
+ void *ivt_base;
+
+ /* Indirect END/EQ table. NULL entries are unallocated, count is
+ * the number of pointers (i.e. sub-page placeholders).
+ */
+ __be64 *eq_ind_base;
+ uint32_t eq_ind_count;
+
+ /* EQ allocation bitmap. Each bit represent 8 EQs */
+ bitmap_t *eq_map;
+
+ /* Indirect NVT/VP table. NULL entries are unallocated, count is
+ * the number of pointers (i.e. sub-page placeholders).
+ */
+ __be64 *vp_ind_base;
+ uint32_t vp_ind_count;
+
+ /* Pool of donated pages for provisioning indirect EQ and VP pages */
+ struct list_head donated_pages;
+
+ /* To ease a possible change to supporting more than one block of
+ * interrupts per chip, we store here the "base" global number
+ * and max number of interrupts for this chip. The global number
+ * encompass the block number and index.
+ */
+ uint32_t int_base;
+ uint32_t int_max;
+
+ /* Due to the overlap between IPIs and HW sources in the IVT table,
+ * we keep some kind of top-down allocator. It is used for HW sources
+ * to "allocate" interrupt entries and will limit what can be handed
+ * out as IPIs. Of course this assumes we "allocate" all HW sources
+ * before we start handing out IPIs.
+ *
+ * Note: The numbers here are global interrupt numbers so that we can
+ * potentially handle more than one block per chip in the future.
+ */
+ uint32_t int_hw_bot; /* Bottom of HW allocation */
+ uint32_t int_ipi_top; /* Highest IPI handed out so far + 1 */
+
+ /* The IPI allocation bitmap */
+ bitmap_t *ipi_alloc_map;
+
+ /* We keep track of which interrupts were ever enabled to
+ * speed up xive_reset
+ */
+ bitmap_t *int_enabled_map;
+
+ /* Embedded source IPIs */
+ struct xive_src ipis;
+
+ /* Embedded escalation interrupts */
+ struct xive_src esc_irqs;
+
+ /* In memory queue overflow */
+ void *q_ovf;
+};
+
+#define XIVE_CAN_STORE_EOI(x) XIVE_STORE_EOI_ENABLED
+
+/* Global DT node */
+static struct dt_node *xive_dt_node;
+
+
+/* Block <-> Chip conversions.
+ *
+ * As chipIDs may not be within the range of 16 block IDs supported by XIVE,
+ * we have a 2 way conversion scheme.
+ *
+ * From block to chip, use the global table below.
+ *
+ * From chip to block, a field in struct proc_chip contains the first block
+ * of that chip. For now we only support one block per chip but that might
+ * change in the future
+ */
+#define XIVE_INVALID_CHIP 0xffffffff
+#define XIVE_MAX_CHIPS 16
+static uint32_t xive_block_to_chip[XIVE_MAX_CHIPS];
+static uint32_t xive_block_count;
+
+static uint32_t xive_chip_to_block(uint32_t chip_id)
+{
+ struct proc_chip *c = get_chip(chip_id);
+
+ assert(c);
+ assert(c->xive);
+ return c->xive->block_id;
+}
+
+/* Conversion between GIRQ and block/index.
+ *
+ * ------------------------------------
+ * |0000000E|BLOC| INDEX|
+ * ------------------------------------
+ * 8 4 20
+ *
+ * the E bit indicates that this is an escalation interrupt, in
+ * that case, the BLOCK/INDEX points to the EQ descriptor associated
+ * with the escalation.
+ *
+ * Global interrupt numbers for non-escalation interrupts are thus
+ * limited to 24 bits because the XICS emulation encodes the CPPR
+ * value in the top (MSB) 8 bits. Hence, 4 bits are left for the XIVE
+ * block number and the remaining 20 bits for the interrupt index
+ * number.
+ */
+#define INT_SHIFT 20
+#define INT_ESC_SHIFT (INT_SHIFT + 4) /* 4bits block id */
+
+#if XIVE_INT_ORDER > INT_SHIFT
+#error "Too many ESBs for IRQ encoding"
+#endif
+
+#if XIVE_EQ_ORDER > INT_SHIFT
+#error "Too many EQs for escalation IRQ number encoding"
+#endif
+
+#define GIRQ_TO_BLK(__g) (((__g) >> INT_SHIFT) & 0xf)
+#define GIRQ_TO_IDX(__g) ((__g) & ((1 << INT_SHIFT) - 1))
+#define BLKIDX_TO_GIRQ(__b,__i) (((uint32_t)(__b)) << INT_SHIFT | (__i))
+#define GIRQ_IS_ESCALATION(__g) ((__g) & (1 << INT_ESC_SHIFT))
+#define MAKE_ESCALATION_GIRQ(__b,__i)(BLKIDX_TO_GIRQ(__b,__i) | (1 << INT_ESC_SHIFT))
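+/*
+ * Illustration of the encoding, using arbitrary example values:
+ *   BLKIDX_TO_GIRQ(2, 0x123)       = 0x00200123
+ *   GIRQ_TO_BLK(0x00200123)        = 2
+ *   GIRQ_TO_IDX(0x00200123)        = 0x123
+ *   MAKE_ESCALATION_GIRQ(2, 0x123) = 0x01200123 (E bit set)
+ */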
+
+/* Block/IRQ to chip# conversions */
+#define PC_BLK_TO_CHIP(__b) (xive_block_to_chip[__b])
+#define VC_BLK_TO_CHIP(__b) (xive_block_to_chip[__b])
+#define GIRQ_TO_CHIP(__isn) (VC_BLK_TO_CHIP(GIRQ_TO_BLK(__isn)))
+
+/* Routing of physical processors to VPs */
+#define PIR2VP_IDX(__pir) (XIVE_HW_VP_BASE | P9_PIR2LOCALCPU(__pir))
+#define PIR2VP_BLK(__pir) (xive_chip_to_block(P9_PIR2GCID(__pir)))
+#define VP2PIR(__blk, __idx) (P9_PIRFROMLOCALCPU(VC_BLK_TO_CHIP(__blk), (__idx) & 0x7f))
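+/* For example, local thread 5 of a chip maps to VP index 0x85 (0x80 | 5)
+ * in that chip's block, and VP2PIR() reverses the mapping by keeping only
+ * the low 7 bits of the index.
+ */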
+
+/* Decoding of OPAL API VP IDs. The VP IDs are encoded as follow
+ *
+ * Block group mode:
+ *
+ * -----------------------------------
+ * |GVEOOOOO| INDEX|
+ * -----------------------------------
+ * || |
+ * || Order
+ * |Virtual
+ * Group
+ *
+ * G (Group) : Set to 1 for a group VP (not currently supported)
+ * V (Virtual) : Set to 1 for an allocated VP (vs. a physical processor ID)
+ * E (Error) : Should never be 1, used internally for errors
+ * O (Order) : Allocation order of the VP block
+ *
+ * The conversion is thus done as follows (groups aren't implemented yet):
+ *
+ * If V=0, O must be 0 and the 24-bit INDEX value is the PIR
+ * If V=1, the order O group is allocated such that if N is the number of
+ * chip bits considered for allocation (*)
+ * then the INDEX is constructed as follows (bit numbers such that 0=LSB)
+ * - the bottom O-N bits are the index within the "VP block"
+ * - the next N bits are the XIVE blockID of the VP
+ * - the remaining bits are the per-chip "base"
+ * so the conversion consists of "extracting" the block ID and moving
+ * down the upper bits by N bits.
+ *
+ * In non-block-group mode, the difference is that the blockID is
+ * on the left of the index (the entire VP block is in a single
+ * block ID)
+ */
+
+/* VP allocation */
+static uint32_t xive_chips_alloc_bits = 0;
+static struct buddy *xive_vp_buddy;
+static struct lock xive_buddy_lock = LOCK_UNLOCKED;
+
+/* VP# decoding/encoding */
+static bool xive_decode_vp(uint32_t vp, uint32_t *blk, uint32_t *idx,
+ uint8_t *order, bool *group)
+{
+ uint32_t o = (vp >> 24) & 0x1f;
+ uint32_t n = xive_chips_alloc_bits;
+ uint32_t index = vp & 0x00ffffff;
+ uint32_t imask = (1 << (o - n)) - 1;
+
+ /* Groups not supported yet */
+ if ((vp >> 31) & 1)
+ return false;
+ if (group)
+ *group = false;
+
+ /* PIR case */
+ if (((vp >> 30) & 1) == 0) {
+ if (find_cpu_by_pir(index) == NULL)
+ return false;
+ if (blk)
+ *blk = PIR2VP_BLK(index);
+ if (idx)
+ *idx = PIR2VP_IDX(index);
+ return true;
+ }
+
+ /* Ensure o > n, we have *at least* 2 VPs per block */
+ if (o <= n)
+ return false;
+
+ /* Combine the index base and index */
+ if (idx)
+ *idx = ((index >> n) & ~imask) | (index & imask);
+ /* Extract block ID */
+ if (blk)
+ *blk = (index >> (o - n)) & ((1 << n) - 1);
+
+ /* Return order as well if asked for */
+ if (order)
+ *order = o;
+
+ return true;
+}
+
+static uint32_t xive_encode_vp(uint32_t blk, uint32_t idx, uint32_t order)
+{
+ uint32_t vp = 0x40000000 | (order << 24);
+ uint32_t n = xive_chips_alloc_bits;
+ uint32_t imask = (1 << (order - n)) - 1;
+
+ vp |= (idx & ~imask) << n;
+ vp |= blk << (order - n);
+ vp |= idx & imask;
+ return vp;
+}
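+/*
+ * Worked example with illustrative values: with two chips considered for
+ * allocation (xive_chips_alloc_bits = 1), order 5, blk = 1 and idx = 9:
+ *
+ *   xive_encode_vp(1, 9, 5) = 0x40000000 | (5 << 24) | (1 << 4) | 9
+ *                           = 0x45000019
+ *
+ * (the (idx & ~imask) << n term is 0 here), and xive_decode_vp() on
+ * 0x45000019 recovers blk = 1, idx = 9, order = 5.
+ */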
+
+#define xive_regw(__x, __r, __v) \
+ __xive_regw(__x, __r, X_##__r, __v, #__r)
+#define xive_regr(__x, __r) \
+ __xive_regr(__x, __r, X_##__r, #__r)
+#define xive_regwx(__x, __r, __v) \
+ __xive_regw(__x, 0, X_##__r, __v, #__r)
+#define xive_regrx(__x, __r) \
+ __xive_regr(__x, 0, X_##__r, #__r)
+
+#ifdef XIVE_VERBOSE_DEBUG
+#define xive_vdbg(__x,__fmt,...) prlog(PR_DEBUG,"XIVE[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_vdbg(__c,__fmt,...) prlog(PR_DEBUG,"XIVE[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#else
+#define xive_vdbg(x,fmt,...) do { } while(0)
+#define xive_cpu_vdbg(x,fmt,...) do { } while(0)
+#endif
+
+#define xive_dbg(__x,__fmt,...) prlog(PR_DEBUG,"XIVE[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_dbg(__c,__fmt,...) prlog(PR_DEBUG,"XIVE[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#define xive_warn(__x,__fmt,...) prlog(PR_WARNING,"XIVE[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_warn(__c,__fmt,...) prlog(PR_WARNING,"XIVE[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#define xive_err(__x,__fmt,...) prlog(PR_ERR,"XIVE[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_err(__c,__fmt,...) prlog(PR_ERR,"XIVE[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+
+static void __xive_regw(struct xive *x, uint32_t m_reg, uint32_t x_reg, uint64_t v,
+ const char *rname)
+{
+ bool use_xscom = (m_reg == 0) || !x->ic_base;
+ int64_t rc;
+
+ x->last_reg_error = false;
+
+ if (use_xscom) {
+ assert(x_reg != 0);
+ rc = xscom_write(x->chip_id, x->xscom_base + x_reg, v);
+ if (rc) {
+ if (!rname)
+ rname = "???";
+ xive_err(x, "Error writing register %s\n", rname);
+ /* Anything else we can do here ? */
+ x->last_reg_error = true;
+ }
+ } else {
+ out_be64(x->ic_base + m_reg, v);
+ }
+}
+
+static uint64_t __xive_regr(struct xive *x, uint32_t m_reg, uint32_t x_reg,
+ const char *rname)
+{
+ bool use_xscom = (m_reg == 0) || !x->ic_base;
+ int64_t rc;
+ uint64_t val;
+
+ x->last_reg_error = false;
+
+ if (use_xscom) {
+ assert(x_reg != 0);
+ rc = xscom_read(x->chip_id, x->xscom_base + x_reg, &val);
+ if (rc) {
+ if (!rname)
+ rname = "???";
+ xive_err(x, "Error reading register %s\n", rname);
+ /* Anything else we can do here ? */
+ x->last_reg_error = true;
+ return -1ull;
+ }
+ } else {
+ val = in_be64(x->ic_base + m_reg);
+ }
+ return val;
+}
+
+/* Locate a controller from an IRQ number */
+static struct xive *xive_from_isn(uint32_t isn)
+{
+ uint32_t chip_id = GIRQ_TO_CHIP(isn);
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c)
+ return NULL;
+ return c->xive;
+}
+
+static struct xive *xive_from_pc_blk(uint32_t blk)
+{
+ uint32_t chip_id = PC_BLK_TO_CHIP(blk);
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c)
+ return NULL;
+ return c->xive;
+}
+
+static struct xive *xive_from_vc_blk(uint32_t blk)
+{
+ uint32_t chip_id = VC_BLK_TO_CHIP(blk);
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c)
+ return NULL;
+ return c->xive;
+}
+
+static struct xive_eq *xive_get_eq(struct xive *x, unsigned int idx)
+{
+ struct xive_eq *p;
+
+ if (idx >= (x->eq_ind_count * EQ_PER_PAGE))
+ return NULL;
+ p = (struct xive_eq *)(be64_to_cpu(x->eq_ind_base[idx / EQ_PER_PAGE]) &
+ VSD_ADDRESS_MASK);
+ if (!p)
+ return NULL;
+
+ return &p[idx % EQ_PER_PAGE];
+}
+
+static struct xive_ive *xive_get_ive(struct xive *x, unsigned int isn)
+{
+ struct xive_ive *ivt;
+ uint32_t idx = GIRQ_TO_IDX(isn);
+
+ if (GIRQ_IS_ESCALATION(isn)) {
+ /* All right, an escalation IVE is buried inside an EQ, let's
+ * try to find it
+ */
+ struct xive_eq *eq;
+
+ if (x->chip_id != VC_BLK_TO_CHIP(GIRQ_TO_BLK(isn))) {
+ xive_err(x, "xive_get_ive, ESC ISN 0x%x not on right chip\n", isn);
+ return NULL;
+ }
+ eq = xive_get_eq(x, idx);
+ if (!eq) {
+ xive_err(x, "xive_get_ive, ESC ISN 0x%x EQ not found\n", isn);
+ return NULL;
+ }
+
+ /* If using single-escalation, don't let anybody get to the individual
+ * escalation interrupts
+ */
+ if (xive_get_field32(EQ_W0_UNCOND_ESCALATE, eq->w0))
+ return NULL;
+
+ /* Grab the buried IVE */
+ return (struct xive_ive *)(char *)&eq->w4;
+ } else {
+ /* Check the block matches */
+ if (isn < x->int_base || isn >= x->int_max) {
+ xive_err(x, "xive_get_ive, ISN 0x%x not on right chip\n", isn);
+ return NULL;
+ }
+ assert (idx < XIVE_INT_COUNT);
+
+ /* If we support >1 block per chip, this should still work as
+ * we are likely to make the table contiguous anyway
+ */
+ ivt = x->ivt_base;
+ assert(ivt);
+
+ return ivt + idx;
+ }
+}
+
+static struct xive_vp *xive_get_vp(struct xive *x, unsigned int idx)
+{
+ struct xive_vp *p;
+
+ assert(idx < (x->vp_ind_count * VP_PER_PAGE));
+ p = (struct xive_vp *)(be64_to_cpu(x->vp_ind_base[idx / VP_PER_PAGE]) &
+ VSD_ADDRESS_MASK);
+ if (!p)
+ return NULL;
+
+ return &p[idx % VP_PER_PAGE];
+}
+
+static void xive_init_default_vp(struct xive_vp *vp,
+ uint32_t eq_blk, uint32_t eq_idx)
+{
+ memset(vp, 0, sizeof(struct xive_vp));
+
+ /* Stash the EQ base in the pressure relief interrupt field */
+ vp->w1 = cpu_to_be32((eq_blk << 28) | eq_idx);
+ vp->w0 = xive_set_field32(VP_W0_VALID, 0, 1);
+}
+
+static void xive_init_emu_eq(uint32_t vp_blk, uint32_t vp_idx,
+ struct xive_eq *eq, void *backing_page,
+ uint8_t prio)
+{
+ memset(eq, 0, sizeof(struct xive_eq));
+
+ eq->w1 = xive_set_field32(EQ_W1_GENERATION, 0, 1);
+ eq->w3 = cpu_to_be32(((uint64_t)backing_page) & EQ_W3_OP_DESC_LO);
+ eq->w2 = cpu_to_be32((((uint64_t)backing_page) >> 32) & EQ_W2_OP_DESC_HI);
+ eq->w6 = xive_set_field32(EQ_W6_NVT_BLOCK, 0, vp_blk) |
+ xive_set_field32(EQ_W6_NVT_INDEX, 0, vp_idx);
+ eq->w7 = xive_set_field32(EQ_W7_F0_PRIORITY, 0, prio);
+ eq->w0 = xive_set_field32(EQ_W0_VALID, 0, 1) |
+ xive_set_field32(EQ_W0_ENQUEUE, 0, 1) |
+ xive_set_field32(EQ_W0_FIRMWARE, 0, 1) |
+ xive_set_field32(EQ_W0_QSIZE, 0, EQ_QSIZE_64K) |
+#ifdef EQ_ALWAYS_NOTIFY
+ xive_set_field32(EQ_W0_UCOND_NOTIFY, 0, 1) |
+#endif
+ 0 ;
+}
+
+static uint32_t *xive_get_eq_buf(uint32_t eq_blk, uint32_t eq_idx)
+{
+ struct xive *x = xive_from_vc_blk(eq_blk);
+ struct xive_eq *eq;
+ uint64_t addr;
+
+ assert(x);
+ eq = xive_get_eq(x, eq_idx);
+ assert(eq);
+ assert(xive_get_field32(EQ_W0_VALID, eq->w0));
+ addr = ((((uint64_t)be32_to_cpu(eq->w2)) & 0x0fffffff) << 32) | be32_to_cpu(eq->w3);
+
+ return (uint32_t *)addr;
+}
+
+static void *xive_get_donated_page(struct xive *x)
+{
+ return (void *)list_pop_(&x->donated_pages, 0);
+}
+
+#define XIVE_ALLOC_IS_ERR(_idx) ((_idx) >= 0xfffffff0)
+
+#define XIVE_ALLOC_NO_SPACE 0xffffffff /* No possible space */
+#define XIVE_ALLOC_NO_IND 0xfffffffe /* Indirect need provisioning */
+#define XIVE_ALLOC_NO_MEM 0xfffffffd /* Local allocation failed */
+
+static uint32_t xive_alloc_eq_set(struct xive *x, bool alloc_indirect)
+{
+ uint32_t ind_idx;
+ int idx;
+ int eq_base_idx;
+
+ xive_vdbg(x, "Allocating EQ set...\n");
+
+ assert(x->eq_map);
+
+ /* Allocate from the EQ bitmap. Each bit is 8 EQs */
+ idx = bitmap_find_zero_bit(*x->eq_map, 0, XIVE_EQ_COUNT >> 3);
+ if (idx < 0) {
+ xive_dbg(x, "Allocation from EQ bitmap failed !\n");
+ return XIVE_ALLOC_NO_SPACE;
+ }
+
+ eq_base_idx = idx << 3;
+
+ xive_vdbg(x, "Got EQs 0x%x..0x%x\n", eq_base_idx,
+ eq_base_idx + XIVE_MAX_PRIO);
+
+ /* Calculate the indirect page where the EQs reside */
+ ind_idx = eq_base_idx / EQ_PER_PAGE;
+
+ /* Is there an indirect page ? If not, check if we can provision it */
+ if (!x->eq_ind_base[ind_idx]) {
+ /* Default flags */
+ uint64_t vsd_flags = SETFIELD(VSD_TSIZE, 0ull, 4) |
+ SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+ void *page;
+
+ /* If alloc_indirect is set, allocate the memory from OPAL's own pool,
+ * otherwise try to provision from the donated pool
+ */
+ if (alloc_indirect) {
+ /* Allocate/provision indirect page during boot only */
+ xive_vdbg(x, "Indirect empty, provisioning from local pool\n");
+ page = local_alloc(x->chip_id, PAGE_SIZE, PAGE_SIZE);
+ if (!page) {
+ xive_dbg(x, "provisioning failed !\n");
+ return XIVE_ALLOC_NO_MEM;
+ }
+ vsd_flags |= VSD_FIRMWARE;
+ } else {
+ xive_vdbg(x, "Indirect empty, provisioning from donated pages\n");
+ page = xive_get_donated_page(x);
+ if (!page) {
+ xive_vdbg(x, "no idirect pages available !\n");
+ return XIVE_ALLOC_NO_IND;
+ }
+ }
+ memset(page, 0, PAGE_SIZE);
+ x->eq_ind_base[ind_idx] = cpu_to_be64(vsd_flags |
+ (((uint64_t)page) & VSD_ADDRESS_MASK));
+ /* Any cache scrub needed ? */
+ }
+
+ bitmap_set_bit(*x->eq_map, idx);
+ return eq_base_idx;
+}
+
+static void xive_free_eq_set(struct xive *x, uint32_t eqs)
+{
+ uint32_t idx;
+
+ xive_vdbg(x, "Freeing EQ 0x%x..0x%x\n", eqs, eqs + XIVE_MAX_PRIO);
+
+ assert((eqs & 7) == 0);
+ assert(x->eq_map);
+
+ idx = eqs >> 3;
+ bitmap_clr_bit(*x->eq_map, idx);
+}
+
+static bool xive_provision_vp_ind(struct xive *x, uint32_t vp_idx, uint32_t order)
+{
+ uint32_t pbase, pend, i;
+
+ pbase = vp_idx / VP_PER_PAGE;
+ pend = (vp_idx + (1 << order)) / VP_PER_PAGE;
+
+ for (i = pbase; i <= pend; i++) {
+ void *page;
+ u64 vsd;
+
+ /* Already provisioned ? */
+ if (x->vp_ind_base[i])
+ continue;
+
+ /* Try to grab a donated page */
+ page = xive_get_donated_page(x);
+ if (!page)
+ return false;
+
+ /* Install the page */
+ memset(page, 0, PAGE_SIZE);
+ vsd = ((uint64_t)page) & VSD_ADDRESS_MASK;
+ vsd |= SETFIELD(VSD_TSIZE, 0ull, 4);
+ vsd |= SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+ x->vp_ind_base[i] = cpu_to_be64(vsd);
+ }
+ return true;
+}
+
+static void xive_init_vp_allocator(void)
+{
+ /* Initialize chip alloc bits */
+ xive_chips_alloc_bits = ilog2(xive_block_count);
+
+ prlog(PR_INFO, "XIVE: %d chips considered for VP allocations\n",
+ 1 << xive_chips_alloc_bits);
+
+ /* Allocate a buddy big enough for XIVE_VP_ORDER allocations.
+ *
+ * each bit in the buddy represents 1 << xive_chips_alloc_bits
+ * VPs.
+ */
+ xive_vp_buddy = buddy_create(XIVE_VP_ORDER);
+ assert(xive_vp_buddy);
+
+ /* We reserve the whole range of VPs representing HW chips.
+ *
+ * These are 0x80..0xff, so order 7 starting at 0x80. This will
+ * reserve that range on each chip.
+ */
+ assert(buddy_reserve(xive_vp_buddy, XIVE_HW_VP_BASE,
+ XIVE_THREADID_SHIFT));
+}
+
+static uint32_t xive_alloc_vps(uint32_t order)
+{
+ uint32_t local_order, i;
+ int vp;
+
+ /* The minimum order is 2 VPs per chip */
+ if (order < (xive_chips_alloc_bits + 1))
+ order = xive_chips_alloc_bits + 1;
+
+ /* We split the allocation */
+ local_order = order - xive_chips_alloc_bits;
+
+ /* We grab that in the global buddy */
+ assert(xive_vp_buddy);
+ lock(&xive_buddy_lock);
+ vp = buddy_alloc(xive_vp_buddy, local_order);
+ unlock(&xive_buddy_lock);
+ if (vp < 0)
+ return XIVE_ALLOC_NO_SPACE;
+
+ /* Provision on every chip considered for allocation */
+ for (i = 0; i < (1 << xive_chips_alloc_bits); i++) {
+ struct xive *x = xive_from_pc_blk(i);
+ bool success;
+
+ /* Return internal error & log rather than assert ? */
+ assert(x);
+ lock(&x->lock);
+ success = xive_provision_vp_ind(x, vp, local_order);
+ unlock(&x->lock);
+ if (!success) {
+ lock(&xive_buddy_lock);
+ buddy_free(xive_vp_buddy, vp, local_order);
+ unlock(&xive_buddy_lock);
+ return XIVE_ALLOC_NO_IND;
+ }
+ }
+
+ /* Encode the VP number. "blk" is 0 as this represents
+ * all blocks and the allocation always starts at 0
+ */
+ return xive_encode_vp(0, vp, order);
+}
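+/*
+ * For instance (illustrative numbers only): on a two-chip system
+ * (xive_chips_alloc_bits = 1), a request for order 4 (16 VPs) becomes a
+ * local_order 3 buddy allocation, i.e. 8 consecutive indices reserved on
+ * each of the two blocks, and the result is encoded with
+ * xive_encode_vp(0, vp, 4).
+ */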
+
+static void xive_free_vps(uint32_t vp)
+{
+ uint32_t idx;
+ uint8_t order, local_order;
+
+ assert(xive_decode_vp(vp, NULL, &idx, &order, NULL));
+
+ /* We split the allocation */
+ local_order = order - xive_chips_alloc_bits;
+
+ /* Free that in the buddy */
+ lock(&xive_buddy_lock);
+ buddy_free(xive_vp_buddy, idx, local_order);
+ unlock(&xive_buddy_lock);
+}
+
+enum xive_cache_type {
+ xive_cache_ivc,
+ xive_cache_sbc,
+ xive_cache_eqc,
+ xive_cache_vpc,
+};
+
+static int64_t __xive_cache_watch(struct xive *x, enum xive_cache_type ctype,
+ uint64_t block, uint64_t idx,
+ uint32_t start_dword, uint32_t dword_count,
+ __be64 *new_data, bool light_watch,
+ bool synchronous);
+
+static void xive_scrub_workaround_vp(struct xive *x, uint32_t block, uint32_t idx __unused)
+{
+ /* VP variant of the workaround described in __xive_cache_scrub(),
+ * we need to be careful to use for that workaround an NVT that
+ * sits on the same xive but is NOT part of a donated indirect
+ * entry.
+ *
+ * The reason is that the dummy cache watch will re-create a
+ * dirty entry in the cache, even if the entry is marked
+ * invalid.
+ *
+ * Thus if we are about to dispose of the indirect entry backing
+ * it, we'll cause a checkstop later on when trying to write it
+ * out.
+ *
+ * Note: This means the workaround only works for block group
+ * mode.
+ */
+ __xive_cache_watch(x, xive_cache_vpc, block, XIVE_HW_VP_BASE, 0,
+ 0, NULL, true, false);
+}
+
+static void xive_scrub_workaround_eq(struct xive *x, uint32_t block __unused, uint32_t idx)
+{
+ void *mmio;
+
+ /* EQ variant of the workaround described in __xive_cache_scrub(),
+ * a simple non-side effect load from ESn will do
+ */
+ mmio = x->eq_mmio + idx * XIVE_ESB_PAGE_SIZE;
+
+ /* Ensure the above load has returned before we do anything else,
+ * i.e. the XIVE store queue is completely empty
+ */
+ load_wait(in_be64(mmio + XIVE_ESB_GET));
+}
+
+static int64_t __xive_cache_scrub(struct xive *x, enum xive_cache_type ctype,
+ uint64_t block, uint64_t idx,
+ bool want_inval, bool want_disable)
+{
+ uint64_t sreg, sregx, mreg, mregx;
+ uint64_t mval, sval;
+
+#ifdef XIVE_CHECK_LOCKS
+ assert(lock_held_by_me(&x->lock));
+#endif
+
+ /* Workaround a HW bug in XIVE where the scrub completion
+ * isn't ordered by loads, thus the data might still be
+ * in a queue and may not have reached coherency.
+ *
+ * The workaround is twofold: we force the scrub to also
+ * invalidate, then after the scrub, we do a dummy cache
+ * watch which will make the HW read the data back, which
+ * should be ordered behind all the preceding stores.
+ *
+ * Update: For EQs we can do a non-side effect ESB load instead
+ * which is faster.
+ */
+ want_inval = true;
+
+ switch (ctype) {
+ case xive_cache_ivc:
+ sreg = VC_IVC_SCRUB_TRIG;
+ sregx = X_VC_IVC_SCRUB_TRIG;
+ mreg = VC_IVC_SCRUB_MASK;
+ mregx = X_VC_IVC_SCRUB_MASK;
+ break;
+ case xive_cache_sbc:
+ sreg = VC_SBC_SCRUB_TRIG;
+ sregx = X_VC_SBC_SCRUB_TRIG;
+ mreg = VC_SBC_SCRUB_MASK;
+ mregx = X_VC_SBC_SCRUB_MASK;
+ break;
+ case xive_cache_eqc:
+ sreg = VC_EQC_SCRUB_TRIG;
+ sregx = X_VC_EQC_SCRUB_TRIG;
+ mreg = VC_EQC_SCRUB_MASK;
+ mregx = X_VC_EQC_SCRUB_MASK;
+ break;
+ case xive_cache_vpc:
+ sreg = PC_VPC_SCRUB_TRIG;
+ sregx = X_PC_VPC_SCRUB_TRIG;
+ mreg = PC_VPC_SCRUB_MASK;
+ mregx = X_PC_VPC_SCRUB_MASK;
+ break;
+ default:
+ return OPAL_INTERNAL_ERROR;
+ }
+ if (ctype == xive_cache_vpc) {
+ mval = PC_SCRUB_BLOCK_ID | PC_SCRUB_OFFSET;
+ sval = SETFIELD(PC_SCRUB_BLOCK_ID, idx, block) |
+ PC_SCRUB_VALID;
+ } else {
+ mval = VC_SCRUB_BLOCK_ID | VC_SCRUB_OFFSET;
+ sval = SETFIELD(VC_SCRUB_BLOCK_ID, idx, block) |
+ VC_SCRUB_VALID;
+ }
+ if (want_inval)
+ sval |= PC_SCRUB_WANT_INVAL;
+ if (want_disable)
+ sval |= PC_SCRUB_WANT_DISABLE;
+
+ __xive_regw(x, mreg, mregx, mval, NULL);
+ __xive_regw(x, sreg, sregx, sval, NULL);
+
+ /* XXX Add timeout !!! */
+ for (;;) {
+ sval = __xive_regr(x, sreg, sregx, NULL);
+ if (!(sval & VC_SCRUB_VALID))
+ break;
+ /* Small delay */
+ time_wait(100);
+ }
+ sync();
+
+ /* Workaround for HW bug described above (only applies to
+ * EQC and VPC)
+ */
+ if (ctype == xive_cache_eqc)
+ xive_scrub_workaround_eq(x, block, idx);
+ else if (ctype == xive_cache_vpc)
+ xive_scrub_workaround_vp(x, block, idx);
+
+ return 0;
+}
+
+static int64_t xive_ivc_scrub(struct xive *x, uint64_t block, uint64_t idx)
+{
+ /* IVC has no "want_inval" bit, it always invalidates */
+ return __xive_cache_scrub(x, xive_cache_ivc, block, idx, false, false);
+}
+
+static int64_t xive_vpc_scrub(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_vpc, block, idx, false, false);
+}
+
+static int64_t xive_vpc_scrub_clean(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_vpc, block, idx, true, false);
+}
+
+static int64_t xive_eqc_scrub(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_eqc, block, idx, false, false);
+}
+
+#define XIVE_CACHE_WATCH_MAX_RETRIES 10
+
+static int64_t __xive_cache_watch(struct xive *x, enum xive_cache_type ctype,
+ uint64_t block, uint64_t idx,
+ uint32_t start_dword, uint32_t dword_count,
+ __be64 *new_data, bool light_watch,
+ bool synchronous)
+{
+ uint64_t sreg, sregx, dreg0, dreg0x;
+ uint64_t dval0, sval, status;
+ int64_t i;
+ int retries = 0;
+
+#ifdef XIVE_CHECK_LOCKS
+ assert(lock_held_by_me(&x->lock));
+#endif
+ switch (ctype) {
+ case xive_cache_eqc:
+ sreg = VC_EQC_CWATCH_SPEC;
+ sregx = X_VC_EQC_CWATCH_SPEC;
+ dreg0 = VC_EQC_CWATCH_DAT0;
+ dreg0x = X_VC_EQC_CWATCH_DAT0;
+ sval = SETFIELD(VC_EQC_CWATCH_BLOCKID, idx, block);
+ break;
+ case xive_cache_vpc:
+ sreg = PC_VPC_CWATCH_SPEC;
+ sregx = X_PC_VPC_CWATCH_SPEC;
+ dreg0 = PC_VPC_CWATCH_DAT0;
+ dreg0x = X_PC_VPC_CWATCH_DAT0;
+ sval = SETFIELD(PC_VPC_CWATCH_BLOCKID, idx, block);
+ break;
+ default:
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ /* The full bit is in the same position for EQC and VPC */
+ if (!light_watch)
+ sval |= VC_EQC_CWATCH_FULL;
+
+ for (;;) {
+ /* Write the cache watch spec */
+ __xive_regw(x, sreg, sregx, sval, NULL);
+
+ /* Load data0 register to populate the watch */
+ dval0 = __xive_regr(x, dreg0, dreg0x, NULL);
+
+ /* If new_data is NULL, this is a dummy watch used as a
+ * workaround for a HW bug
+ */
+ if (!new_data) {
+ __xive_regw(x, dreg0, dreg0x, dval0, NULL);
+ return 0;
+ }
+
+ /* Write the words into the watch facility. We write in reverse
+ * order in case word 0 is part of it as it must be the last
+ * one written.
+ */
+ for (i = start_dword + dword_count - 1; i >= start_dword ;i--) {
+ uint64_t dw = be64_to_cpu(new_data[i - start_dword]);
+ __xive_regw(x, dreg0 + i * 8, dreg0x + i, dw, NULL);
+ }
+
+ /* Write data0 register to trigger the update if word 0 wasn't
+ * written above
+ */
+ if (start_dword > 0)
+ __xive_regw(x, dreg0, dreg0x, dval0, NULL);
+
+ /* This may not be necessary for light updates (it's possible
+ * that a sync is sufficient, TBD). Ensure the above is
+ * complete and check the status of the watch.
+ */
+ status = __xive_regr(x, sreg, sregx, NULL);
+
+ /* Bits FULL and CONFLICT are in the same position in
+ * EQC and VPC
+ */
+ if (!(status & VC_EQC_CWATCH_FULL) ||
+ !(status & VC_EQC_CWATCH_CONFLICT))
+ break;
+ if (!synchronous)
+ return OPAL_BUSY;
+
+ if (++retries == XIVE_CACHE_WATCH_MAX_RETRIES) {
+ xive_err(x, "Reached maximum retries %d when doing "
+ "a %s cache update\n", retries,
+ ctype == xive_cache_eqc ? "EQC" : "VPC");
+ return OPAL_BUSY;
+ }
+ }
+
+ /* Perform a scrub with "want_invalidate" set to false to push the
+ * cache updates to memory as well
+ */
+ return __xive_cache_scrub(x, ctype, block, idx, false, false);
+}
+
+static int64_t xive_escalation_ive_cache_update(struct xive *x, uint64_t block,
+ uint64_t idx, struct xive_ive *ive,
+ bool synchronous)
+{
+ return __xive_cache_watch(x, xive_cache_eqc, block, idx,
+ 2, 1, &ive->w, true, synchronous);
+}
+
+static int64_t xive_eqc_cache_update(struct xive *x, uint64_t block,
+ uint64_t idx, struct xive_eq *eq,
+ bool synchronous)
+{
+ return __xive_cache_watch(x, xive_cache_eqc, block, idx,
+ 0, 4, (__be64 *)eq, false, synchronous);
+}
+
+static int64_t xive_vpc_cache_update(struct xive *x, uint64_t block,
+ uint64_t idx, struct xive_vp *vp,
+ bool synchronous)
+{
+ return __xive_cache_watch(x, xive_cache_vpc, block, idx,
+ 0, 8, (__be64 *)vp, false, synchronous);
+}
+
+static bool xive_set_vsd(struct xive *x, uint32_t tbl, uint32_t idx, uint64_t v)
+{
+ /* Set VC version */
+ xive_regw(x, VC_VSD_TABLE_ADDR,
+ SETFIELD(VST_TABLE_SELECT, 0ull, tbl) |
+ SETFIELD(VST_TABLE_OFFSET, 0ull, idx));
+ if (x->last_reg_error)
+ return false;
+ xive_regw(x, VC_VSD_TABLE_DATA, v);
+ if (x->last_reg_error)
+ return false;
+
+ /* Except for IRQ table, also set PC version */
+ if (tbl == VST_TSEL_IRQ)
+ return true;
+
+ xive_regw(x, PC_VSD_TABLE_ADDR,
+ SETFIELD(VST_TABLE_SELECT, 0ull, tbl) |
+ SETFIELD(VST_TABLE_OFFSET, 0ull, idx));
+ if (x->last_reg_error)
+ return false;
+ xive_regw(x, PC_VSD_TABLE_DATA, v);
+ if (x->last_reg_error)
+ return false;
+ return true;
+}
+
+static bool xive_set_local_tables(struct xive *x)
+{
+ uint64_t base, i;
+
+ /* These have to be power of 2 sized */
+ assert(is_pow2(SBE_SIZE));
+ assert(is_pow2(IVT_SIZE));
+
+ /* All tables set as exclusive */
+ base = SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+
+ /* Set IVT as direct mode */
+ if (!xive_set_vsd(x, VST_TSEL_IVT, x->block_id, base |
+ (((uint64_t)x->ivt_base) & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(IVT_SIZE) - 12)))
+ return false;
+
+ /* Set SBE as direct mode */
+ if (!xive_set_vsd(x, VST_TSEL_SBE, x->block_id, base |
+ (((uint64_t)x->sbe_base) & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(SBE_SIZE) - 12)))
+ return false;
+
+ /* Set EQDT as indirect mode with 64K subpages */
+ if (!xive_set_vsd(x, VST_TSEL_EQDT, x->block_id, base |
+ (((uint64_t)x->eq_ind_base) & VSD_ADDRESS_MASK) |
+ VSD_INDIRECT | SETFIELD(VSD_TSIZE, 0ull, 4)))
+ return false;
+
+ /* Set VPDT as indirect mode with 64K subpages */
+ if (!xive_set_vsd(x, VST_TSEL_VPDT, x->block_id, base |
+ (((uint64_t)x->vp_ind_base) & VSD_ADDRESS_MASK) |
+ VSD_INDIRECT | SETFIELD(VSD_TSIZE, 0ull, 4)))
+ return false;
+
+ /* Setup queue overflows */
+ for (i = 0; i < VC_QUEUE_OVF_COUNT; i++) {
+ u64 addr = ((uint64_t)x->q_ovf) + i * PAGE_SIZE;
+ u64 cfg, sreg, sregx;
+
+ if (!xive_set_vsd(x, VST_TSEL_IRQ, i, base |
+ (addr & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, 4)))
+ return false;
+ sreg = VC_IRQ_CONFIG_IPI + i * 8;
+ sregx = X_VC_IRQ_CONFIG_IPI + i;
+ cfg = __xive_regr(x, sreg, sregx, NULL);
+ cfg |= VC_IRQ_CONFIG_MEMB_EN;
+ cfg = SETFIELD(VC_IRQ_CONFIG_MEMB_SZ, cfg, 4);
+ __xive_regw(x, sreg, sregx, cfg, NULL);
+ }
+
+ return true;
+}
+
+static bool xive_configure_bars(struct xive *x)
+{
+ uint64_t chip_id = x->chip_id;
+ uint64_t val;
+
+ /* IC BAR */
+ phys_map_get(chip_id, XIVE_IC, 0, (uint64_t *)&x->ic_base, &x->ic_size);
+ val = (uint64_t)x->ic_base | CQ_IC_BAR_VALID | CQ_IC_BAR_64K;
+ x->ic_shift = 16;
+
+ xive_regwx(x, CQ_IC_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /* TM BAR, only configure TM1. Note that this has the same address
+ * for each chip !!! Hence we create a fake chip 0 and use that for
+ * all phys_map_get(XIVE_TM) calls.
+ */
+ phys_map_get(0, XIVE_TM, 0, (uint64_t *)&x->tm_base, &x->tm_size);
+ val = (uint64_t)x->tm_base | CQ_TM_BAR_VALID | CQ_TM_BAR_64K;
+ x->tm_shift = 16;
+
+ xive_regwx(x, CQ_TM1_BAR, val);
+ if (x->last_reg_error)
+ return false;
+ xive_regwx(x, CQ_TM2_BAR, 0);
+ if (x->last_reg_error)
+ return false;
+
+ /* PC BAR. Clear first, write mask, then write value */
+ phys_map_get(chip_id, XIVE_PC, 0, (uint64_t *)&x->pc_base, &x->pc_size);
+ xive_regwx(x, CQ_PC_BAR, 0);
+ if (x->last_reg_error)
+ return false;
+ val = ~(x->pc_size - 1) & CQ_PC_BARM_MASK;
+ xive_regwx(x, CQ_PC_BARM, val);
+ if (x->last_reg_error)
+ return false;
+ val = (uint64_t)x->pc_base | CQ_PC_BAR_VALID;
+ xive_regwx(x, CQ_PC_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /* VC BAR. Clear first, write mask, then write value */
+ phys_map_get(chip_id, XIVE_VC, 0, (uint64_t *)&x->vc_base, &x->vc_size);
+ xive_regwx(x, CQ_VC_BAR, 0);
+ if (x->last_reg_error)
+ return false;
+ val = ~(x->vc_size - 1) & CQ_VC_BARM_MASK;
+ xive_regwx(x, CQ_VC_BARM, val);
+ if (x->last_reg_error)
+ return false;
+ val = (uint64_t)x->vc_base | CQ_VC_BAR_VALID;
+ xive_regwx(x, CQ_VC_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /* Calculate some MMIO bases in the VC BAR */
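+ /* The first VC_ESB_SETS sets (each vc_size / VC_MAX_SETS bytes) hold
+ * the IPI ESB pages; the END (EQ) ESB pages start right after them,
+ * which is what the eq_mmio computation below reflects.
+ */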
+ x->esb_mmio = x->vc_base;
+ x->eq_mmio = x->vc_base + (x->vc_size / VC_MAX_SETS) * VC_ESB_SETS;
+
+ /* Print things out */
+ xive_dbg(x, "IC: %14p [0x%012llx/%d]\n", x->ic_base, x->ic_size,
+ x->ic_shift);
+ xive_dbg(x, "TM: %14p [0x%012llx/%d]\n", x->tm_base, x->tm_size,
+ x->tm_shift);
+ xive_dbg(x, "PC: %14p [0x%012llx]\n", x->pc_base, x->pc_size);
+ xive_dbg(x, "VC: %14p [0x%012llx]\n", x->vc_base, x->vc_size);
+
+ return true;
+}
+
+static void xive_dump_mmio(struct xive *x)
+{
+ prlog(PR_DEBUG, " CQ_CFG_PB_GEN = %016llx\n",
+ in_be64(x->ic_base + CQ_CFG_PB_GEN));
+ prlog(PR_DEBUG, " CQ_MSGSND = %016llx\n",
+ in_be64(x->ic_base + CQ_MSGSND));
+}
+
+static bool xive_config_init(struct xive *x)
+{
+ uint64_t val;
+
+ /* Configure PC and VC page sizes and disable Linux trigger mode */
+ xive_regwx(x, CQ_PBI_CTL, CQ_PBI_PC_64K | CQ_PBI_VC_64K | CQ_PBI_FORCE_TM_LOCAL);
+ if (x->last_reg_error)
+ return false;
+
+ /*** The rest can use MMIO ***/
+
+ /* Enable indirect mode in VC config */
+ val = xive_regr(x, VC_GLOBAL_CONFIG);
+ val |= VC_GCONF_INDIRECT;
+ xive_regw(x, VC_GLOBAL_CONFIG, val);
+
+ /* Enable indirect mode in PC config */
+ val = xive_regr(x, PC_GLOBAL_CONFIG);
+ val |= PC_GCONF_INDIRECT;
+ val |= PC_GCONF_CHIPID_OVR;
+ val = SETFIELD(PC_GCONF_CHIPID, val, x->block_id);
+ xive_regw(x, PC_GLOBAL_CONFIG, val);
+ xive_dbg(x, "PC_GLOBAL_CONFIG=%016llx\n", val);
+
+ val = xive_regr(x, PC_TCTXT_CFG);
+ val |= PC_TCTXT_CFG_BLKGRP_EN | PC_TCTXT_CFG_HARD_CHIPID_BLK;
+ val |= PC_TCTXT_CHIPID_OVERRIDE;
+ val |= PC_TCTXT_CFG_TARGET_EN;
+ val = SETFIELD(PC_TCTXT_CHIPID, val, x->block_id);
+ val = SETFIELD(PC_TCTXT_INIT_AGE, val, 0x2);
+ val |= PC_TCTXT_CFG_LGS_EN;
+ /* Disable pressure relief as we hijack the field in the VPs */
+ val &= ~PC_TCTXT_CFG_STORE_ACK;
+ if (this_cpu()->is_fused_core)
+ val |= PC_TCTXT_CFG_FUSE_CORE_EN;
+ else
+ val &= ~PC_TCTXT_CFG_FUSE_CORE_EN;
+ xive_regw(x, PC_TCTXT_CFG, val);
+ xive_dbg(x, "PC_TCTXT_CFG=%016llx\n", val);
+
+ val = xive_regr(x, CQ_CFG_PB_GEN);
+ /* 1-block-per-chip mode */
+ val = SETFIELD(CQ_INT_ADDR_OPT, val, 2);
+ xive_regw(x, CQ_CFG_PB_GEN, val);
+
+ /* Enable StoreEOI */
+ val = xive_regr(x, VC_SBC_CONFIG);
+ if (XIVE_CAN_STORE_EOI(x))
+ val |= VC_SBC_CONF_CPLX_CIST | VC_SBC_CONF_CIST_BOTH;
+ else
+ xive_dbg(x, "store EOI is disabled\n");
+
+ val |= VC_SBC_CONF_NO_UPD_PRF;
+ xive_regw(x, VC_SBC_CONFIG, val);
+
+ /* Disable block tracking on Nimbus (we may want to enable
+ * it on Cumulus later). HW errata.
+ */
+ val = xive_regr(x, PC_TCTXT_TRACK);
+ val &= ~PC_TCTXT_TRACK_EN;
+ xive_regw(x, PC_TCTXT_TRACK, val);
+
+ /* Enable relaxed ordering of trigger forwarding */
+ val = xive_regr(x, VC_AIB_TX_ORDER_TAG2);
+ val |= VC_AIB_TX_ORDER_TAG2_REL_TF;
+ xive_regw(x, VC_AIB_TX_ORDER_TAG2, val);
+
+ /* Enable new END s and u bits for silent escalate */
+ val = xive_regr(x, VC_EQC_CONFIG);
+ val |= VC_EQC_CONF_ENABLE_END_s_BIT;
+ val |= VC_EQC_CONF_ENABLE_END_u_BIT;
+ xive_regw(x, VC_EQC_CONFIG, val);
+
+ /* Disable error reporting in the FIR for info errors
+ * from the VC.
+ */
+ xive_regw(x, CQ_FIRMASK_OR, CQ_FIR_VC_INFO_ERROR_0_1);
+
+ /* Mask CI Load and Store to bad location, as IPI trigger
+ * pages may be mapped to user space, and a read on the
+ * trigger page causes a checkstop
+ */
+ xive_regw(x, CQ_FIRMASK_OR, CQ_FIR_PB_RCMDX_CI_ERR1);
+
+ return true;
+}
+
+static bool xive_setup_set_xlate(struct xive *x)
+{
+ unsigned int i;
+
+ /* Configure EDT for ESBs (aka IPIs) */
+ xive_regw(x, CQ_TAR, CQ_TAR_TBL_AUTOINC | CQ_TAR_TSEL_EDT);
+ if (x->last_reg_error)
+ return false;
+ for (i = 0; i < VC_ESB_SETS; i++) {
+ xive_regw(x, CQ_TDR,
+ /* IPI type */
+ (1ull << 62) |
+ /* block ID */
+ (((uint64_t)x->block_id) << 48) |
+ /* offset */
+ (((uint64_t)i) << 32));
+ if (x->last_reg_error)
+ return false;
+ }
+
+ /* Configure EDT for ENDs (aka EQs) */
+ for (i = 0; i < VC_END_SETS; i++) {
+ xive_regw(x, CQ_TDR,
+ /* EQ type */
+ (2ull << 62) |
+ /* block ID */
+ (((uint64_t)x->block_id) << 48) |
+ /* offset */
+ (((uint64_t)i) << 32));
+ if (x->last_reg_error)
+ return false;
+ }
+
+ /* Configure VDT */
+ xive_regw(x, CQ_TAR, CQ_TAR_TBL_AUTOINC | CQ_TAR_TSEL_VDT);
+ if (x->last_reg_error)
+ return false;
+ for (i = 0; i < PC_MAX_SETS; i++) {
+ xive_regw(x, CQ_TDR,
+ /* Valid bit */
+ (1ull << 63) |
+ /* block ID */
+ (((uint64_t)x->block_id) << 48) |
+ /* offset */
+ (((uint64_t)i) << 32));
+ if (x->last_reg_error)
+ return false;
+ }
+ return true;
+}
+
+static bool xive_prealloc_tables(struct xive *x)
+{
+ uint32_t i, vp_init_count, vp_init_base;
+ uint32_t pbase, pend;
+ uint64_t al;
+
+ /* ESB/SBE has 4 entries per byte */
+ x->sbe_base = local_alloc(x->chip_id, SBE_SIZE, SBE_SIZE);
+ if (!x->sbe_base) {
+ xive_err(x, "Failed to allocate SBE\n");
+ return false;
+ }
+ /* SBEs are initialized to 0b01 which corresponds to "ints off" */
+ memset(x->sbe_base, 0x55, SBE_SIZE);
+ xive_dbg(x, "SBE at %p size 0x%lx\n", x->sbe_base, SBE_SIZE);
+
+ /* EAS/IVT entries are 8 bytes */
+ x->ivt_base = local_alloc(x->chip_id, IVT_SIZE, IVT_SIZE);
+ if (!x->ivt_base) {
+ xive_err(x, "Failed to allocate IVT\n");
+ return false;
+ }
+ /* We clear the entries (non-valid). They will be initialized
+ * when actually used
+ */
+ memset(x->ivt_base, 0, IVT_SIZE);
+ xive_dbg(x, "IVT at %p size 0x%lx\n", x->ivt_base, IVT_SIZE);
+
+ /* Indirect EQ table. Limited to one top page. */
+ al = ALIGN_UP(XIVE_EQ_TABLE_SIZE, PAGE_SIZE);
+ if (al > PAGE_SIZE) {
+ xive_err(x, "EQ indirect table is too big !\n");
+ return false;
+ }
+ x->eq_ind_base = local_alloc(x->chip_id, al, al);
+ if (!x->eq_ind_base) {
+ xive_err(x, "Failed to allocate EQ indirect table\n");
+ return false;
+ }
+ memset(x->eq_ind_base, 0, al);
+ xive_dbg(x, "EQi at %p size 0x%llx\n", x->eq_ind_base, al);
+ x->eq_ind_count = XIVE_EQ_TABLE_SIZE / XIVE_VSD_SIZE;
+
+ /* Indirect VP table. Limited to one top page. */
+ al = ALIGN_UP(XIVE_VP_TABLE_SIZE, PAGE_SIZE);
+ if (al > PAGE_SIZE) {
+ xive_err(x, "VP indirect table is too big !\n");
+ return false;
+ }
+ x->vp_ind_base = local_alloc(x->chip_id, al, al);
+ if (!x->vp_ind_base) {
+ xive_err(x, "Failed to allocate VP indirect table\n");
+ return false;
+ }
+ xive_dbg(x, "VPi at %p size 0x%llx\n", x->vp_ind_base, al);
+ x->vp_ind_count = XIVE_VP_TABLE_SIZE / XIVE_VSD_SIZE;
+ memset(x->vp_ind_base, 0, al);
+
+ /* Populate/initialize VP/EQs indirect backing */
+ vp_init_count = XIVE_HW_VP_COUNT;
+ vp_init_base = XIVE_HW_VP_BASE;
+
+ /* Allocate pages for some VPs in indirect mode */
+ pbase = vp_init_base / VP_PER_PAGE;
+ pend = (vp_init_base + vp_init_count) / VP_PER_PAGE;
+
+ xive_dbg(x, "Allocating pages %d to %d of VPs (for %d VPs)\n",
+ pbase, pend, vp_init_count);
+ for (i = pbase; i <= pend; i++) {
+ void *page;
+ u64 vsd;
+
+ /* Indirect entries have a VSD format */
+ page = local_alloc(x->chip_id, PAGE_SIZE, PAGE_SIZE);
+ if (!page) {
+ xive_err(x, "Failed to allocate VP page\n");
+ return false;
+ }
+ xive_dbg(x, "VP%d at %p size 0x%x\n", i, page, PAGE_SIZE);
+ memset(page, 0, PAGE_SIZE);
+ vsd = ((uint64_t)page) & VSD_ADDRESS_MASK;
+
+ vsd |= SETFIELD(VSD_TSIZE, 0ull, 4);
+ vsd |= SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+ vsd |= VSD_FIRMWARE;
+ x->vp_ind_base[i] = cpu_to_be64(vsd);
+ }
+
+ /* Allocate the queue overflow pages */
+ x->q_ovf = local_alloc(x->chip_id, VC_QUEUE_OVF_COUNT * PAGE_SIZE, PAGE_SIZE);
+ if (!x->q_ovf) {
+ xive_err(x, "Failed to allocate queue overflow\n");
+ return false;
+ }
+ return true;
+}
+
+static void xive_add_provisioning_properties(void)
+{
+ __be32 chips[XIVE_MAX_CHIPS];
+ uint32_t i, count;
+
+ dt_add_property_cells(xive_dt_node,
+ "ibm,xive-provision-page-size", PAGE_SIZE);
+
+ count = 1 << xive_chips_alloc_bits;
+ for (i = 0; i < count; i++)
+ chips[i] = cpu_to_be32(xive_block_to_chip[i]);
+ dt_add_property(xive_dt_node, "ibm,xive-provision-chips",
+ chips, 4 * count);
+}
+
+static void xive_create_mmio_dt_node(struct xive *x)
+{
+ uint64_t tb = (uint64_t)x->tm_base;
+ uint32_t stride = 1u << x->tm_shift;
+
+ xive_dt_node = dt_new_addr(dt_root, "interrupt-controller", tb);
+ assert(xive_dt_node);
+
+ dt_add_property_u64s(xive_dt_node, "reg",
+ tb + 0 * stride, stride,
+ tb + 1 * stride, stride,
+ tb + 2 * stride, stride,
+ tb + 3 * stride, stride);
+
+ dt_add_property_strings(xive_dt_node, "compatible",
+ "ibm,opal-xive-pe", "ibm,opal-intc");
+
+ dt_add_property_cells(xive_dt_node, "ibm,xive-eq-sizes",
+ 12, 16, 21, 24);
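+ /* Orders of the supported queue sizes (4K, 64K, 2M and 16M bytes) */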
+
+ dt_add_property_cells(xive_dt_node, "ibm,xive-#priorities",
+ NUM_INT_PRIORITIES);
+ dt_add_property(xive_dt_node, "single-escalation-support", NULL, 0);
+
+ xive_add_provisioning_properties();
+}
+
+static void xive_setup_forward_ports(struct xive *x, struct proc_chip *remote_chip)
+{
+ struct xive *remote_xive = remote_chip->xive;
+ uint64_t base = SETFIELD(VSD_MODE, 0ull, VSD_MODE_FORWARD);
+ uint32_t remote_id = remote_xive->block_id;
+ uint64_t nport;
+
+ /* ESB(SBE), EAS(IVT) and END(EQ) point to the notify port */
+ nport = ((uint64_t)remote_xive->ic_base) + (1ul << remote_xive->ic_shift);
+ if (!xive_set_vsd(x, VST_TSEL_IVT, remote_id, base | nport))
+ goto error;
+ if (!xive_set_vsd(x, VST_TSEL_SBE, remote_id, base | nport))
+ goto error;
+ if (!xive_set_vsd(x, VST_TSEL_EQDT, remote_id, base | nport))
+ goto error;
+
+ /* NVT/VPD points to the remote NVT MMIO sets */
+ if (!xive_set_vsd(x, VST_TSEL_VPDT, remote_id,
+ base | ((uint64_t)remote_xive->pc_base) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->pc_size) - 12)))
+ goto error;
+
+ return;
+
+ error:
+ xive_err(x, "Failure configuring forwarding ports\n");
+}
+
+static void late_init_one_xive(struct xive *x)
+{
+ struct proc_chip *chip;
+
+ /* We need to set up the cross-chip forward ports. Let's
+ * iterate over all chips and set them up accordingly
+ */
+ for_each_chip(chip) {
+ /* We skip ourselves or chips without a xive */
+ if (chip->xive == x || !chip->xive)
+ continue;
+
+ /* Setup our forward ports to that chip */
+ xive_setup_forward_ports(x, chip);
+ }
+}
+
+static bool xive_check_ipi_free(struct xive *x, uint32_t irq, uint32_t count)
+{
+ uint32_t i, idx = GIRQ_TO_IDX(irq);
+
+ for (i = 0; i < count; i++)
+ if (bitmap_tst_bit(*x->ipi_alloc_map, idx + i))
+ return false;
+ return true;
+}
+
+uint32_t xive_alloc_hw_irqs(uint32_t chip_id, uint32_t count, uint32_t align)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct xive *x;
+ uint32_t base, i;
+
+ assert(chip);
+ assert(is_pow2(align));
+
+ x = chip->xive;
+ assert(x);
+
+ lock(&x->lock);
+
+ /* Allocate the HW interrupts */
+ base = x->int_hw_bot - count;
+ base &= ~(align - 1);
+ if (base < x->int_ipi_top) {
+ xive_err(x,
+ "HW alloc request for %d interrupts aligned to %d failed\n",
+ count, align);
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+ if (!xive_check_ipi_free(x, base, count)) {
+ xive_err(x, "HWIRQ boot allocator request overlaps dynamic allocator\n");
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+
+ x->int_hw_bot = base;
+
+ /* Initialize the corresponding IVT entries to sane defaults,
+ * i.e. the entry is valid, masked and not routed, and the EQ data is
+ * set to the GIRQ number.
+ */
+ for (i = 0; i < count; i++) {
+ struct xive_ive *ive = xive_get_ive(x, base + i);
+
+ ive->w = xive_set_field64(IVE_VALID, 0ul, 1) |
+ xive_set_field64(IVE_MASKED, 0ul, 1) |
+ xive_set_field64(IVE_EQ_DATA, 0ul, base + i);
+ }
+
+ unlock(&x->lock);
+ return base;
+}
+
+uint32_t xive_alloc_ipi_irqs(uint32_t chip_id, uint32_t count, uint32_t align)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct xive *x;
+ uint32_t base, i;
+
+ assert(chip);
+ assert(is_pow2(align));
+
+ x = chip->xive;
+ assert(x);
+
+ lock(&x->lock);
+
+ /* Allocate the IPI interrupts */
+ base = x->int_ipi_top + (align - 1);
+ base &= ~(align - 1);
+ if (base >= x->int_hw_bot) {
+ xive_err(x,
+ "IPI alloc request for %d interrupts aligned to %d failed\n",
+ count, align);
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+ if (!xive_check_ipi_free(x, base, count)) {
+ xive_err(x, "IPI boot allocator request overlaps dynamic allocator\n");
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+
+ x->int_ipi_top = base + count;
+
+ /* Initialize the corresponding IVT entries to sane defaults,
+ * i.e. the entry is valid, masked and not routed, and the EQ data is
+ * set to the GIRQ number.
+ */
+ for (i = 0; i < count; i++) {
+ struct xive_ive *ive = xive_get_ive(x, base + i);
+
+ ive->w = xive_set_field64(IVE_VALID, 0ul, 1) |
+ xive_set_field64(IVE_MASKED, 0ul, 1) |
+ xive_set_field64(IVE_EQ_DATA, 0ul, base + i);
+ }
+
+ unlock(&x->lock);
+ return base;
+}
+
+void *xive_get_trigger_port(uint32_t girq)
+{
+ uint32_t idx = GIRQ_TO_IDX(girq);
+ struct xive *x;
+
+ /* Find XIVE on which the IVE resides */
+ x = xive_from_isn(girq);
+ if (!x)
+ return NULL;
+
+ if (GIRQ_IS_ESCALATION(girq)) {
+ /* There is no trigger page for escalation interrupts */
+ return NULL;
+ } else {
+ /* Make sure it's an IPI on that chip */
+ if (girq < x->int_base ||
+ girq >= x->int_ipi_top)
+ return NULL;
+
+ return x->esb_mmio + idx * XIVE_ESB_PAGE_SIZE;
+ }
+}
+
+uint64_t xive_get_notify_port(uint32_t chip_id, uint32_t ent)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct xive *x;
+ uint32_t offset = 0;
+
+ assert(chip);
+ x = chip->xive;
+ assert(x);
+
+ /* This is where we can assign a different HW queue to a different
+ * source by offsetting into the cache lines of the notify port
+ *
+ * For now we keep it very basic, this will have to be looked at
+ * again on real HW with some proper performance analysis.
+ *
+ * Here's what Florian says on the matter:
+ *
+ * <<
+ * The first 2k of the notify port page can all be used for PCIe triggers
+ *
+ * However the idea would be that we try to use the first 4 cache lines to
+ * balance the PCIe Interrupt requests to use the least used snoop buses
+ * (we went from 2 to 4 snoop buses for P9). snoop 0 is heavily used
+ * (I think TLBIs are using that in addition to the normal addresses),
+ * snoop 3 is used for all Int commands, so I think snoop 2 (CL 2 in the
+ * page) is the least used overall. So we probably should that one for
+ * the Int commands from PCIe.
+ *
+ * In addition, our EAS cache supports hashing to provide "private" cache
+ * areas for the PHBs in the shared 1k EAS cache. This allows e.g. to avoid
+ * that one "thrashing" PHB thrashes the EAS cache for everyone, or provide
+ * a PHB with a private area that would allow high cache hits in case of a
+ * device using very few interrupts. The hashing is based on the offset within
+ * the cache line. So using that, you can e.g. set the EAS cache up so that
+ * IPIs use 512 entries, the x16 PHB uses 256 entries and the x8 PHBs 128
+ * entries each - or IPIs using all entries and sharing with PHBs, so PHBs
+ * would use 512 entries and 256 entries respectively.
+ *
+ * This is a tuning we would probably do later in the lab, but as a "prep"
+ * we should set up the different PHBs such that they are using different
+ * 8B-aligned offsets within the cache line, so e.g.
+ * PH4_0 addr 0x100 (CL 2 DW0
+ * PH4_1 addr 0x108 (CL 2 DW1)
+ * PH4_2 addr 0x110 (CL 2 DW2)
+ * etc.
+ * >>
+ *
+ * I'm using snoop1 for PHB0 and snoop2 for everybody else.
+ */
+ switch(ent) {
+ case XIVE_HW_SRC_PHBn(0):
+ offset = 0x100;
+ break;
+ case XIVE_HW_SRC_PHBn(1):
+ offset = 0x208;
+ break;
+ case XIVE_HW_SRC_PHBn(2):
+ offset = 0x210;
+ break;
+ case XIVE_HW_SRC_PHBn(3):
+ offset = 0x218;
+ break;
+ case XIVE_HW_SRC_PHBn(4):
+ offset = 0x220;
+ break;
+ case XIVE_HW_SRC_PHBn(5):
+ offset = 0x228;
+ break;
+ case XIVE_HW_SRC_PSI:
+ offset = 0x230;
+ break;
+ default:
+ assert(false);
+ return 0;
+ }
+
+ /* Notify port is the second page of the IC BAR */
+ return ((uint64_t)x->ic_base) + (1ul << x->ic_shift) + offset;
+}
+
+/* Manufacture the powerbus packet bits 32:63 */
+__attrconst uint32_t xive_get_notify_base(uint32_t girq)
+{
+ return (GIRQ_TO_BLK(girq) << 28) | GIRQ_TO_IDX(girq);
+}
+
+static bool xive_get_irq_targetting(uint32_t isn, uint32_t *out_target,
+ uint8_t *out_prio, uint32_t *out_lirq)
+{
+ struct xive_ive *ive;
+ struct xive *x, *eq_x;
+ struct xive_eq *eq;
+ uint32_t eq_blk, eq_idx;
+ uint32_t vp_blk __unused, vp_idx;
+ uint32_t prio, server;
+ bool is_escalation = GIRQ_IS_ESCALATION(isn);
+
+ /* Find XIVE on which the IVE resides */
+ x = xive_from_isn(isn);
+ if (!x)
+ return false;
+ /* Grab the IVE */
+ ive = xive_get_ive(x, isn);
+ if (!ive)
+ return false;
+ if (!xive_get_field64(IVE_VALID, ive->w) && !is_escalation) {
+ xive_err(x, "ISN %x lead to invalid IVE !\n", isn);
+ return false;
+ }
+
+ if (out_lirq)
+ *out_lirq = xive_get_field64(IVE_EQ_DATA, ive->w);
+
+ /* Find the EQ and its xive instance */
+ eq_blk = xive_get_field64(IVE_EQ_BLOCK, ive->w);
+ eq_idx = xive_get_field64(IVE_EQ_INDEX, ive->w);
+ eq_x = xive_from_vc_blk(eq_blk);
+
+ /* This can fail if the interrupt hasn't been initialized yet
+ * but it should also be masked, so fail silently
+ */
+ if (!eq_x)
+ goto pick_default;
+ eq = xive_get_eq(eq_x, eq_idx);
+ if (!eq)
+ goto pick_default;
+
+ /* XXX Check valid and format 0 */
+
+ /* No priority conversion, return the actual one ! */
+ if (xive_get_field64(IVE_MASKED, ive->w))
+ prio = 0xff;
+ else
+ prio = xive_get_field32(EQ_W7_F0_PRIORITY, eq->w7);
+ if (out_prio)
+ *out_prio = prio;
+
+ vp_blk = xive_get_field32(EQ_W6_NVT_BLOCK, eq->w6);
+ vp_idx = xive_get_field32(EQ_W6_NVT_INDEX, eq->w6);
+ server = VP2PIR(vp_blk, vp_idx);
+
+ if (out_target)
+ *out_target = server;
+
+ xive_vdbg(eq_x, "EQ info for ISN %x: prio=%d, server=0x%x (VP %x/%x)\n",
+ isn, prio, server, vp_blk, vp_idx);
+ return true;
+
+pick_default:
+ xive_vdbg(eq_x, "EQ info for ISN %x: Using masked defaults\n", isn);
+
+ if (out_prio)
+ *out_prio = 0xff;
+ /* Pick a default; "me" (the current CPU) will be fine ... */
+ if (out_target)
+ *out_target = mfspr(SPR_PIR);
+ return true;
+}
+
+static inline bool xive_eq_for_target(uint32_t target, uint8_t prio,
+ uint32_t *out_eq_blk,
+ uint32_t *out_eq_idx)
+{
+ struct xive *x;
+ struct xive_vp *vp;
+ uint32_t vp_blk, vp_idx;
+ uint32_t eq_blk, eq_idx;
+
+ if (prio > XIVE_MAX_PRIO)
+ return false;
+
+ /* Get the VP block/index from the target word */
+ if (!xive_decode_vp(target, &vp_blk, &vp_idx, NULL, NULL))
+ return false;
+
+ /* Grab the target VP's XIVE */
+ x = xive_from_pc_blk(vp_blk);
+ if (!x)
+ return false;
+
+ /* Find the VP structure where we stashed the EQ number */
+ vp = xive_get_vp(x, vp_idx);
+ if (!vp)
+ return false;
+
+ /* Grab it, it's in the pressure relief interrupt field,
+ * top 4 bits are the block (word 1).
+ */
+ eq_blk = be32_to_cpu(vp->w1) >> 28;
+ eq_idx = be32_to_cpu(vp->w1) & 0x0fffffff;
+
+ /* Currently the EQ block and VP block should be the same */
+ if (eq_blk != vp_blk) {
+ xive_err(x, "eq_blk != vp_blk (%d vs. %d) for target 0x%08x/%d\n",
+ eq_blk, vp_blk, target, prio);
+ return false;
+ }
+
+ if (out_eq_blk)
+ *out_eq_blk = eq_blk;
+ if (out_eq_idx)
+ *out_eq_idx = eq_idx + prio;
+
+ return true;
+}
+
+static int64_t xive_set_irq_targetting(uint32_t isn, uint32_t target,
+ uint8_t prio, uint32_t lirq,
+ bool synchronous)
+{
+ struct xive *x;
+ struct xive_ive *ive, new_ive;
+ uint32_t eq_blk, eq_idx;
+ bool is_escalation = GIRQ_IS_ESCALATION(isn);
+ int64_t rc;
+
+ /* Find XIVE on which the IVE resides */
+ x = xive_from_isn(isn);
+ if (!x)
+ return OPAL_PARAMETER;
+ /* Grab the IVE */
+ ive = xive_get_ive(x, isn);
+ if (!ive)
+ return OPAL_PARAMETER;
+ if (!xive_get_field64(IVE_VALID, ive->w) && !is_escalation) {
+ xive_err(x, "ISN %x lead to invalid IVE !\n", isn);
+ return OPAL_PARAMETER;
+ }
+
+ lock(&x->lock);
+
+ /* If using emulation mode, fixup prio to the only supported one */
+ if (xive_mode == XIVE_MODE_EMU && prio != 0xff)
+ prio = XIVE_EMULATION_PRIO;
+
+ /* Read existing IVE */
+ new_ive = *ive;
+
+ /* Are we masking ? */
+ if (prio == 0xff && !is_escalation) {
+ new_ive.w = xive_set_field64(IVE_MASKED, new_ive.w, 1);
+ xive_vdbg(x, "ISN %x masked !\n", isn);
+
+ /* Put prio 7 in the EQ */
+ prio = XIVE_MAX_PRIO;
+ } else {
+ /* Unmasking */
+ new_ive.w = xive_set_field64(IVE_MASKED, new_ive.w, 0);
+ xive_vdbg(x, "ISN %x unmasked !\n", isn);
+
+ /* For normal interrupt sources, keep track of which ones
+ * we ever enabled since the last reset
+ */
+ if (!is_escalation)
+ bitmap_set_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn));
+ }
+
+ /* If prio isn't 0xff, re-target the IVE. First find the EQ
+ * corresponding to the target
+ */
+ if (prio != 0xff) {
+ if (!xive_eq_for_target(target, prio, &eq_blk, &eq_idx)) {
+ xive_err(x, "Can't find EQ for target/prio 0x%x/%d\n",
+ target, prio);
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+
+ /* Try to update it atomically to avoid an intermediate
+ * stale state
+ */
+ new_ive.w = xive_set_field64(IVE_EQ_BLOCK, new_ive.w, eq_blk);
+ new_ive.w = xive_set_field64(IVE_EQ_INDEX, new_ive.w, eq_idx);
+ }
+ new_ive.w = xive_set_field64(IVE_EQ_DATA, new_ive.w, lirq);
+
+ xive_vdbg(x,"ISN %x routed to eq %x/%x lirq=%08x IVE=%016llx !\n",
+ isn, eq_blk, eq_idx, lirq, be64_to_cpu(new_ive.w));
+
+ /* Updating the cache differs between real IVEs and escalation
+ * IVEs inside an EQ
+ */
+ if (is_escalation) {
+ rc = xive_escalation_ive_cache_update(x, x->block_id,
+ GIRQ_TO_IDX(isn), &new_ive, synchronous);
+ } else {
+ sync();
+ *ive = new_ive;
+ rc = xive_ivc_scrub(x, x->block_id, GIRQ_TO_IDX(isn));
+ }
+
+ unlock(&x->lock);
+ return rc;
+}
+
+static int64_t xive_source_get_xive(struct irq_source *is __unused,
+ uint32_t isn, uint16_t *server,
+ uint8_t *prio)
+{
+ uint32_t target_id;
+
+ if (xive_get_irq_targetting(isn, &target_id, prio, NULL)) {
+ *server = target_id << 2;
+ return OPAL_SUCCESS;
+ } else
+ return OPAL_PARAMETER;
+}
+
+static void xive_update_irq_mask(struct xive_src *s, uint32_t idx, bool masked)
+{
+ void *mmio_base = s->esb_mmio + (1ul << s->esb_shift) * idx;
+ uint32_t offset;
+
+ /* XXX FIXME: A quick mask/unmask can make us shoot an interrupt
+ * more than once to a queue. We need to keep track better
+ */
+ if (s->flags & XIVE_SRC_EOI_PAGE1)
+ mmio_base += 1ull << (s->esb_shift - 1);
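+ /* A source is masked by setting its ESB to the PQ=01 ("off") state
+ * and unmasked by resetting it to PQ=00 so new events are forwarded
+ * again
+ */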
+ if (masked)
+ offset = XIVE_ESB_SET_PQ_01;
+ else
+ offset = XIVE_ESB_SET_PQ_00;
+
+ in_be64(mmio_base + offset);
+}
+
+static int64_t xive_sync(struct xive *x)
+{
+ uint64_t r;
+ void *p;
+
+ lock(&x->lock);
+
+ /* Second 2K range of second page */
+ p = x->ic_base + (1 << x->ic_shift) + 0x800;
+
+ /* TODO: Make this more fine grained */
+ out_be64(p + (10 << 7), 0); /* Sync OS escalations */
+ out_be64(p + (11 << 7), 0); /* Sync Hyp escalations */
+ out_be64(p + (12 << 7), 0); /* Sync Redistribution */
+ out_be64(p + ( 8 << 7), 0); /* Sync IPI */
+ out_be64(p + ( 9 << 7), 0); /* Sync HW */
+
+#define SYNC_MASK \
+ (VC_EQC_CONF_SYNC_IPI | \
+ VC_EQC_CONF_SYNC_HW | \
+ VC_EQC_CONF_SYNC_ESC1 | \
+ VC_EQC_CONF_SYNC_ESC2 | \
+ VC_EQC_CONF_SYNC_REDI)
+
+ /* XXX Add timeout */
+ for (;;) {
+ r = xive_regr(x, VC_EQC_CONFIG);
+ if ((r & SYNC_MASK) == SYNC_MASK)
+ break;
+ cpu_relax();
+ }
+ xive_regw(x, VC_EQC_CONFIG, r & ~SYNC_MASK);
+
+ /* Work around a HW issue: read back before allowing a new sync */
+ xive_regr(x, VC_GLOBAL_CONFIG);
+
+ unlock(&x->lock);
+
+ return 0;
+}
+
+static int64_t __xive_set_irq_config(struct irq_source *is, uint32_t girq,
+ uint64_t vp, uint8_t prio, uint32_t lirq,
+ bool update_esb, bool sync)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ uint32_t old_target, vp_blk;
+ u8 old_prio;
+ int64_t rc;
+
+ /* Grab existing target */
+ if (!xive_get_irq_targetting(girq, &old_target, &old_prio, NULL))
+ return OPAL_PARAMETER;
+
+ /* Let XIVE configure the EQ. We do the update without the
+ * synchronous flag, thus a cache update failure will result
+ * in us returning OPAL_BUSY
+ */
+ rc = xive_set_irq_targetting(girq, vp, prio, lirq, false);
+ if (rc)
+ return rc;
+
+ /* Do we need to update the mask ? */
+ if (old_prio != prio && (old_prio == 0xff || prio == 0xff)) {
+ /* The source has special variants of masking/unmasking */
+ if (s->orig_ops && s->orig_ops->set_xive) {
+ /* We don't pass a server on source ops ! Targetting
+ * is handled by the XIVE
+ */
+ rc = s->orig_ops->set_xive(is, girq, 0, prio);
+ } else if (update_esb) {
+ /* Ensure it's enabled/disabled in the source
+ * controller
+ */
+ xive_update_irq_mask(s, girq - s->esb_base,
+ prio == 0xff);
+ }
+ }
+
+ /*
+ * Synchronize the source and old target XIVEs to ensure that
+ * all pending interrupts to the old target have reached their
+ * respective queue.
+ *
+ * WARNING: This assumes the VP and its queues are on the same
+ * XIVE instance !
+ */
+ if (!sync)
+ return OPAL_SUCCESS;
+ xive_sync(s->xive);
+ if (xive_decode_vp(old_target, &vp_blk, NULL, NULL, NULL)) {
+ struct xive *x = xive_from_pc_blk(vp_blk);
+ if (x)
+ xive_sync(x);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio,
+ uint32_t lirq, bool update_esb)
+{
+ struct irq_source *is = irq_find_source(girq);
+
+ return __xive_set_irq_config(is, girq, vp, prio, lirq, update_esb,
+ true);
+}
+
+static int64_t xive_source_set_xive(struct irq_source *is,
+ uint32_t isn, uint16_t server, uint8_t prio)
+{
+ /*
+ * WARNING: There is an inherent race with the use of the
+ * mask bit in the EAS/IVT. When masked, interrupts are "lost"
+ * but their P/Q bits are still set. So when unmasking, one has
+ * to check the P bit and possibly trigger a resend.
+ *
+ * We "deal" with it by relying on the fact that the OS will
+ * lazy disable MSIs. Thus mask will only be called if the
+ * interrupt occurred while already logically masked. Thus
+ * losing subsequent occurrences is of no consequence, we just
+ * need to "cleanup" P and Q when unmasking.
+ *
+ * This needs to be documented in the OPAL APIs
+ */
+
+ /* Unmangle server */
+ server >>= 2;
+
+ /* Set logical irq to match isn */
+ return __xive_set_irq_config(is, isn, server, prio, isn, true, true);
+}
+
+static void __xive_source_eoi(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ uint32_t idx = isn - s->esb_base;
+ struct xive_ive *ive;
+ void *mmio_base;
+ uint64_t eoi_val;
+
+ /* Grab the IVE */
+ ive = s->xive->ivt_base;
+ if (!ive)
+ return;
+ ive += GIRQ_TO_IDX(isn);
+
+ /* XXX To fix the races with mask/unmask potentially causing
+ * multiple queue entries, we need to keep track of EOIs here,
+ * before the masked test below
+ */
+
+ /* If it's invalid or masked, don't do anything */
+ if (xive_get_field64(IVE_MASKED, ive->w) || !xive_get_field64(IVE_VALID, ive->w))
+ return;
+
+ /* Grab MMIO control address for that ESB */
+ mmio_base = s->esb_mmio + (1ull << s->esb_shift) * idx;
+
+ /* If the XIVE supports the new "store EOI" facility, use it */
+ if (s->flags & XIVE_SRC_STORE_EOI)
+ out_be64(mmio_base + XIVE_ESB_STORE_EOI, 0);
+ else {
+ uint64_t offset;
+
+ /* Otherwise for EOI, we use the special MMIO that does
+ * a clear of both P and Q and returns the old Q.
+ *
+ * This allows us to then do a re-trigger if Q was set
+ * rather than synthesizing an interrupt in software
+ */
+ if (s->flags & XIVE_SRC_EOI_PAGE1)
+ mmio_base += 1ull << (s->esb_shift - 1);
+
+ /* LSIs don't need anything special, just EOI */
+ if (s->flags & XIVE_SRC_LSI)
+ in_be64(mmio_base);
+ else {
+ offset = XIVE_ESB_SET_PQ_00;
+ eoi_val = in_be64(mmio_base + offset);
+ xive_vdbg(s->xive, "ISN: %08x EOI=%llx\n",
+ isn, eoi_val);
+ if (!(eoi_val & 1))
+ return;
+
+ /* Re-trigger always on page0 or page1 ? */
+ out_be64(mmio_base + XIVE_ESB_STORE_TRIGGER, 0);
+ }
+ }
+}
+
+static void xive_source_eoi(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (s->orig_ops && s->orig_ops->eoi)
+ s->orig_ops->eoi(is, isn);
+ else
+ __xive_source_eoi(is, isn);
+}
+
+static void xive_source_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (!s->orig_ops || !s->orig_ops->interrupt)
+ return;
+ s->orig_ops->interrupt(is, isn);
+}
+
+static uint64_t xive_source_attributes(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (!s->orig_ops || !s->orig_ops->attributes)
+ return IRQ_ATTR_TARGET_LINUX;
+ return s->orig_ops->attributes(is, isn);
+}
+
+static char *xive_source_name(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (!s->orig_ops || !s->orig_ops->name)
+ return NULL;
+ return s->orig_ops->name(is, isn);
+}
+
+void xive_source_mask(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ xive_update_irq_mask(s, isn - s->esb_base, true);
+}
+
+static const struct irq_source_ops xive_irq_source_ops = {
+ .get_xive = xive_source_get_xive,
+ .set_xive = xive_source_set_xive,
+ .eoi = xive_source_eoi,
+ .interrupt = xive_source_interrupt,
+ .attributes = xive_source_attributes,
+ .name = xive_source_name,
+};
+
+static void __xive_register_source(struct xive *x, struct xive_src *s,
+ uint32_t base, uint32_t count,
+ uint32_t shift, void *mmio, uint32_t flags,
+ bool secondary, void *data,
+ const struct irq_source_ops *orig_ops)
+{
+ s->esb_base = base;
+ s->esb_shift = shift;
+ s->esb_mmio = mmio;
+ s->flags = flags;
+ s->orig_ops = orig_ops;
+ s->xive = x;
+ s->is.start = base;
+ s->is.end = base + count;
+ s->is.ops = &xive_irq_source_ops;
+ s->is.data = data;
+
+ __register_irq_source(&s->is, secondary);
+}
+
+void xive_register_hw_source(uint32_t base, uint32_t count, uint32_t shift,
+ void *mmio, uint32_t flags, void *data,
+ const struct irq_source_ops *ops)
+{
+ struct xive_src *s;
+ struct xive *x = xive_from_isn(base);
+
+ assert(x);
+
+ s = malloc(sizeof(struct xive_src));
+ assert(s);
+ __xive_register_source(x, s, base, count, shift, mmio, flags,
+ false, data, ops);
+}
+
+void xive_register_ipi_source(uint32_t base, uint32_t count, void *data,
+ const struct irq_source_ops *ops)
+{
+ struct xive_src *s;
+ struct xive *x = xive_from_isn(base);
+ uint32_t base_idx = GIRQ_TO_IDX(base);
+ void *mmio_base;
+ uint32_t flags = XIVE_SRC_EOI_PAGE1 | XIVE_SRC_TRIGGER_PAGE;
+
+ assert(x);
+ assert(base >= x->int_base && (base + count) <= x->int_ipi_top);
+
+ s = malloc(sizeof(struct xive_src));
+ assert(s);
+
+ /* Store EOI supported on DD2.0 */
+ if (XIVE_CAN_STORE_EOI(x))
+ flags |= XIVE_SRC_STORE_EOI;
+
+ /* Callbacks assume the MMIO base corresponds to the first
+ * interrupt of that source structure so adjust it
+ */
+ mmio_base = x->esb_mmio + (1ul << XIVE_ESB_SHIFT) * base_idx;
+ __xive_register_source(x, s, base, count, XIVE_ESB_SHIFT, mmio_base,
+ flags, false, data, ops);
+}
+
+static struct xive *init_one_xive(struct dt_node *np)
+{
+ struct xive *x;
+ struct proc_chip *chip;
+ uint32_t flags;
+
+ x = zalloc(sizeof(struct xive));
+ assert(x);
+ x->x_node = np;
+ x->xscom_base = dt_get_address(np, 0, NULL);
+ x->chip_id = dt_get_chip_id(np);
+
+ /* "Allocate" a new block ID for the chip */
+ x->block_id = xive_block_count++;
+ assert (x->block_id < XIVE_MAX_CHIPS);
+ xive_block_to_chip[x->block_id] = x->chip_id;
+ init_lock(&x->lock);
+
+ chip = get_chip(x->chip_id);
+ assert(chip);
+
+ /* All supported P9 are revision 2 (Nimbus DD2) */
+ switch (chip->type) {
+ case PROC_CHIP_P9_NIMBUS:
+ /* We should not be able to boot a P9N DD1 */
+ assert((chip->ec_level & 0xf0) != 0x10);
+ /* Fallthrough */
+ case PROC_CHIP_P9_CUMULUS:
+ case PROC_CHIP_P9P:
+ break;
+ default:
+ assert(0);
+ }
+
+ xive_dbg(x, "Initializing block ID %d...\n", x->block_id);
+ chip->xive = x;
+
+ list_head_init(&x->donated_pages);
+
+ /* Base interrupt numbers and allocator init */
+ /* XXX Consider allocating half as many ESBs as MMIO space
+ * so that HW sources land outside of ESB space...
+ */
+ x->int_base = BLKIDX_TO_GIRQ(x->block_id, 0);
+ x->int_max = x->int_base + XIVE_INT_COUNT;
+ x->int_hw_bot = x->int_max;
+ x->int_ipi_top = x->int_base;
+
+ /* Make sure we never hand out "2" as it's reserved for XICS emulation
+ * IPI returns. Generally start handing out at 0x10
+ */
+ if (x->int_ipi_top < XIVE_INT_FIRST)
+ x->int_ipi_top = XIVE_INT_FIRST;
+
+ /* Allocate a few bitmaps */
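+ /* EQs are handed out in sets of 8 (one per priority), so the EQ
+ * allocation bitmap only needs one bit per set of 8 entries
+ */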
+ x->eq_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_EQ_COUNT >> 3), PAGE_SIZE);
+ assert(x->eq_map);
+ memset(x->eq_map, 0, BITMAP_BYTES(XIVE_EQ_COUNT >> 3));
+
+ /* Make sure we don't hand out 0 */
+ bitmap_set_bit(*x->eq_map, 0);
+
+ x->int_enabled_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_INT_COUNT), PAGE_SIZE);
+ assert(x->int_enabled_map);
+ memset(x->int_enabled_map, 0, BITMAP_BYTES(XIVE_INT_COUNT));
+ x->ipi_alloc_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_INT_COUNT), PAGE_SIZE);
+ assert(x->ipi_alloc_map);
+ memset(x->ipi_alloc_map, 0, BITMAP_BYTES(XIVE_INT_COUNT));
+
+ xive_dbg(x, "Handling interrupts [%08x..%08x]\n",
+ x->int_base, x->int_max - 1);
+
+ /* Setup the BARs */
+ if (!xive_configure_bars(x))
+ goto fail;
+
+ /* Some basic global inits such as page sizes etc... */
+ if (!xive_config_init(x))
+ goto fail;
+
+ /* Configure the set translations for MMIO */
+ if (!xive_setup_set_xlate(x))
+ goto fail;
+
+ /* Dump some MMIO registers for diagnostics */
+ xive_dump_mmio(x);
+
+ /* Pre-allocate a number of tables */
+ if (!xive_prealloc_tables(x))
+ goto fail;
+
+ /* Configure local tables in VSDs (forward ports will be
+ * handled later)
+ */
+ if (!xive_set_local_tables(x))
+ goto fail;
+
+ /* Register built-in source controllers (aka IPIs) */
+ flags = XIVE_SRC_EOI_PAGE1 | XIVE_SRC_TRIGGER_PAGE;
+ if (XIVE_CAN_STORE_EOI(x))
+ flags |= XIVE_SRC_STORE_EOI;
+ __xive_register_source(x, &x->ipis, x->int_base,
+ x->int_hw_bot - x->int_base, XIVE_ESB_SHIFT,
+ x->esb_mmio, flags, true, NULL, NULL);
+
+ /* Register escalation sources */
+ __xive_register_source(x, &x->esc_irqs,
+ MAKE_ESCALATION_GIRQ(x->block_id, 0),
+ XIVE_EQ_COUNT, XIVE_EQ_SHIFT,
+ x->eq_mmio, XIVE_SRC_EOI_PAGE1,
+ false, NULL, NULL);
+
+
+ return x;
+ fail:
+ xive_err(x, "Initialization failed...\n");
+
+ /* Should this be fatal ? */
+ //assert(false);
+ return NULL;
+}
+
+/*
+ * XICS emulation
+ */
+static void xive_ipi_init(struct xive *x, struct cpu_thread *cpu)
+{
+ struct xive_cpu_state *xs = cpu->xstate;
+
+ assert(xs);
+
+ __xive_set_irq_config(&x->ipis.is, xs->ipi_irq, cpu->pir,
+ XIVE_EMULATION_PRIO, xs->ipi_irq,
+ true, true);
+}
+
+static void xive_ipi_eoi(struct xive *x, uint32_t idx)
+{
+ uint8_t *mm = x->esb_mmio + idx * XIVE_ESB_PAGE_SIZE;
+ uint8_t eoi_val;
+
+ /* For EOI, we use the special MMIO that does a clear of both
+ * P and Q and returns the old Q.
+ *
+ * This allows us to then do a re-trigger if Q was set rather
+ * than synthesizing an interrupt in software
+ */
+ eoi_val = in_8(mm + PAGE_SIZE + XIVE_ESB_SET_PQ_00);
+ if (eoi_val & 1) {
+ out_8(mm + XIVE_ESB_STORE_TRIGGER, 0);
+ }
+}
+
+static void xive_ipi_trigger(struct xive *x, uint32_t idx)
+{
+ uint8_t *mm = x->esb_mmio + idx * XIVE_ESB_PAGE_SIZE;
+
+ xive_vdbg(x, "Trigger IPI 0x%x\n", idx);
+
+ out_8(mm + XIVE_ESB_STORE_TRIGGER, 0);
+}
+
+
+static void xive_reset_enable_thread(struct cpu_thread *c)
+{
+ struct proc_chip *chip = get_chip(c->chip_id);
+ struct xive *x = chip->xive;
+ uint32_t fc, bit;
+ uint64_t enable;
+
+ /* Get fused core number */
+ fc = (c->pir >> 3) & 0xf;
+
+ /* Get bit in register */
+ bit = c->pir & 0x3f;
+
+ /* Get which register to access */
+ if (fc < 8) {
+ xive_regw(x, PC_THREAD_EN_REG0_CLR, PPC_BIT(bit));
+ xive_regw(x, PC_THREAD_EN_REG0_SET, PPC_BIT(bit));
+
+ /*
+ * To guarantee that the TIMA accesses will see the
+ * latest state of the enable register, add an extra
+ * load on PC_THREAD_EN_REG.
+ */
+ enable = xive_regr(x, PC_THREAD_EN_REG0);
+ if (!(enable & PPC_BIT(bit)))
+ xive_cpu_err(c, "Failed to enable thread\n");
+ } else {
+ xive_regw(x, PC_THREAD_EN_REG1_CLR, PPC_BIT(bit));
+ xive_regw(x, PC_THREAD_EN_REG1_SET, PPC_BIT(bit));
+
+ /* Same as above */
+ enable = xive_regr(x, PC_THREAD_EN_REG1);
+ if (!(enable & PPC_BIT(bit)))
+ xive_cpu_err(c, "Failed to enable thread\n");
+ }
+}
+
+void xive_cpu_callin(struct cpu_thread *cpu)
+{
+ struct xive_cpu_state *xs = cpu->xstate;
+ uint8_t old_w2 __unused, w2 __unused;
+
+ if (!xs)
+ return;
+
+ /* Reset the HW thread context and enable it */
+ xive_reset_enable_thread(cpu);
+
+ /* Set VT to 1 */
+ old_w2 = in_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2);
+ out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2, 0x80);
+ w2 = in_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2);
+
+ xive_cpu_vdbg(cpu, "Initialized TIMA VP=%x/%x W01=%016llx W2=%02x->%02x\n",
+ xs->vp_blk, xs->vp_idx,
+ in_be64(xs->tm_ring1 + TM_QW3_HV_PHYS),
+ old_w2, w2);
+}
+
+#ifdef XIVE_DEBUG_INIT_CACHE_UPDATES
+static bool xive_check_eq_update(struct xive *x, uint32_t idx, struct xive_eq *eq)
+{
+ struct xive_eq *eq_p = xive_get_eq(x, idx);
+ struct xive_eq eq2;
+
+ assert(eq_p);
+ eq2 = *eq_p;
+ if (memcmp(eq, &eq2, sizeof(struct xive_eq)) != 0) {
+ xive_err(x, "EQ update mismatch idx %d\n", idx);
+ xive_err(x, "want: %08x %08x %08x %08x\n",
+ be32_to_cpu(eq->w0), be32_to_cpu(eq->w1),
+ be32_to_cpu(eq->w2), be32_to_cpu(eq->w3));
+ xive_err(x, " %08x %08x %08x %08x\n",
+ be32_to_cpu(eq->w4), be32_to_cpu(eq->w5),
+ be32_to_cpu(eq->w6), be32_to_cpu(eq->w7));
+ xive_err(x, "got : %08x %08x %08x %08x\n",
+ be32_to_cpu(eq2.w0), be32_to_cpu(eq2.w1),
+ be32_to_cpu(eq2.w2), be32_to_cpu(eq2.w3));
+ xive_err(x, " %08x %08x %08x %08x\n",
+ be32_to_cpu(eq2.w4), be32_to_cpu(eq2.w5),
+ be32_to_cpu(eq2.w6), be32_to_cpu(eq2.w7));
+ return false;
+ }
+ return true;
+}
+
+static bool xive_check_vpc_update(struct xive *x, uint32_t idx, struct xive_vp *vp)
+{
+ struct xive_vp *vp_p = xive_get_vp(x, idx);
+ struct xive_vp vp2;
+
+ assert(vp_p);
+ vp2 = *vp_p;
+ if (memcmp(vp, &vp2, sizeof(struct xive_vp)) != 0) {
+ xive_err(x, "VP update mismatch idx %d\n", idx);
+ xive_err(x, "want: %08x %08x %08x %08x\n",
+ be32_to_cpu(vp->w0), be32_to_cpu(vp->w1),
+ be32_to_cpu(vp->w2), be32_to_cpu(vp->w3));
+ xive_err(x, " %08x %08x %08x %08x\n",
+ be32_to_cpu(vp->w4), be32_to_cpu(vp->w5),
+ be32_to_cpu(vp->w6), be32_to_cpu(vp->w7));
+ xive_err(x, "got : %08x %08x %08x %08x\n",
+ be32_to_cpu(vp2.w0), be32_to_cpu(vp2.w1),
+ be32_to_cpu(vp2.w2), be32_to_cpu(vp2.w3));
+ xive_err(x, " %08x %08x %08x %08x\n",
+ be32_to_cpu(vp2.w4), be32_to_cpu(vp2.w5),
+ be32_to_cpu(vp2.w6), be32_to_cpu(vp2.w7));
+ return false;
+ }
+ return true;
+}
+#else
+static inline bool xive_check_eq_update(struct xive *x __unused,
+ uint32_t idx __unused,
+ struct xive_eq *eq __unused)
+{
+ return true;
+}
+
+static inline bool xive_check_vpc_update(struct xive *x __unused,
+ uint32_t idx __unused,
+ struct xive_vp *vp __unused)
+{
+ return true;
+}
+#endif
+
+#ifdef XIVE_EXTRA_CHECK_INIT_CACHE
+static void xive_special_cache_check(struct xive *x, uint32_t blk, uint32_t idx)
+{
+ struct xive_vp vp = {0};
+ uint32_t i;
+
+ for (i = 0; i < 1000; i++) {
+ struct xive_vp *vp_m = xive_get_vp(x, idx);
+
+ memset(vp_m, (~i) & 0xff, sizeof(*vp_m));
+ sync();
+ vp.w1 = cpu_to_be32((i << 16) | i);
+ xive_vpc_cache_update(x, blk, idx, &vp, true);
+ if (!xive_check_vpc_update(x, idx, &vp)) {
+ xive_dbg(x, "Test failed at %d iterations\n", i);
+ return;
+ }
+ }
+ xive_dbg(x, "1000 iterations test success at %d/0x%x\n", blk, idx);
+}
+#else
+static inline void xive_special_cache_check(struct xive *x __unused,
+ uint32_t blk __unused,
+ uint32_t idx __unused)
+{
+}
+#endif
+
+static void xive_setup_hw_for_emu(struct xive_cpu_state *xs)
+{
+ struct xive_eq eq;
+ struct xive_vp vp;
+ struct xive *x_eq, *x_vp;
+
+ /* Grab the XIVE where the VP resides. It could be different from
+ * the local chip XIVE if not using block group mode
+ */
+ x_vp = xive_from_pc_blk(xs->vp_blk);
+ assert(x_vp);
+
+ /* Grab the XIVE where the EQ resides. It will be the same as the
+ * VP one with the current provisioning but I prefer not making
+ * this code depend on it.
+ */
+ x_eq = xive_from_vc_blk(xs->eq_blk);
+ assert(x_eq);
+
+ /* Initialize the structure */
+ xive_init_emu_eq(xs->vp_blk, xs->vp_idx, &eq,
+ xs->eq_page, XIVE_EMULATION_PRIO);
+
+ /* Use the cache watch to write it out */
+ lock(&x_eq->lock);
+ xive_eqc_cache_update(x_eq, xs->eq_blk, xs->eq_idx + XIVE_EMULATION_PRIO, &eq, true);
+ xive_check_eq_update(x_eq, xs->eq_idx + XIVE_EMULATION_PRIO, &eq);
+
+ /* Extra testing of cache watch & scrub facilities */
+ xive_special_cache_check(x_vp, xs->vp_blk, xs->vp_idx);
+ unlock(&x_eq->lock);
+
+ /* Initialize/enable the VP */
+ xive_init_default_vp(&vp, xs->eq_blk, xs->eq_idx);
+
+ /* Use the cache watch to write it out */
+ lock(&x_vp->lock);
+ xive_vpc_cache_update(x_vp, xs->vp_blk, xs->vp_idx, &vp, true);
+ xive_check_vpc_update(x_vp, xs->vp_idx, &vp);
+ unlock(&x_vp->lock);
+}
+
+static void xive_init_cpu_emulation(struct xive_cpu_state *xs,
+ struct cpu_thread *cpu)
+{
+ struct xive *x;
+
+ /* Setup HW EQ and VP */
+ xive_setup_hw_for_emu(xs);
+
+ /* Setup and unmask the IPI */
+ xive_ipi_init(xs->xive, cpu);
+
+ /* Initialize remaining state */
+ xs->cppr = 0;
+ xs->mfrr = 0xff;
+ xs->eqbuf = xive_get_eq_buf(xs->vp_blk,
+ xs->eq_idx + XIVE_EMULATION_PRIO);
+ assert(xs->eqbuf);
+ memset(xs->eqbuf, 0, PAGE_SIZE);
+
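+ /* The queue buffer is one page of 4-byte entries, so the ring
+ * index wraps at PAGE_SIZE / 4 entries
+ */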
+ xs->eqptr = 0;
+ xs->eqmsk = (PAGE_SIZE / 4) - 1;
+ xs->eqgen = 0;
+ x = xive_from_vc_blk(xs->eq_blk);
+ assert(x);
+ xs->eqmmio = x->eq_mmio + (xs->eq_idx + XIVE_EMULATION_PRIO) * XIVE_ESB_PAGE_SIZE;
+}
+
+static void xive_init_cpu_exploitation(struct xive_cpu_state *xs)
+{
+ struct xive_vp vp;
+ struct xive *x_vp;
+
+ /* Grab the XIVE where the VP resides. It could be different from
+ * the local chip XIVE if not using block group mode
+ */
+ x_vp = xive_from_pc_blk(xs->vp_blk);
+ assert(x_vp);
+
+ /* Initialize/enable the VP */
+ xive_init_default_vp(&vp, xs->eq_blk, xs->eq_idx);
+
+ /* Use the cache watch to write it out */
+ lock(&x_vp->lock);
+ xive_vpc_cache_update(x_vp, xs->vp_blk, xs->vp_idx, &vp, true);
+ unlock(&x_vp->lock);
+
+ /* Clean up remaining state */
+ xs->cppr = 0;
+ xs->mfrr = 0xff;
+ xs->eqbuf = NULL;
+ xs->eqptr = 0;
+ xs->eqmsk = 0;
+ xs->eqgen = 0;
+ xs->eqmmio = NULL;
+}
+
+static void xive_configure_ex_special_bar(struct xive *x, struct cpu_thread *c)
+{
+ uint64_t xa, val;
+ int64_t rc;
+
+ xive_cpu_vdbg(c, "Setting up special BAR\n");
+ xa = XSCOM_ADDR_P9_EX(pir_to_core_id(c->pir), P9X_EX_NCU_SPEC_BAR);
+ val = (uint64_t)x->tm_base | P9X_EX_NCU_SPEC_BAR_ENABLE;
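+ /* The TM area is 4 pages, one per ring, so with a 64K TM page
+ * size the special BAR has to cover 256K
+ */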
+ if (x->tm_shift == 16)
+ val |= P9X_EX_NCU_SPEC_BAR_256K;
+ xive_cpu_vdbg(c, "NCU_SPEC_BAR_XA[%08llx]=%016llx\n", xa, val);
+ rc = xscom_write(c->chip_id, xa, val);
+ if (rc) {
+ xive_cpu_err(c, "Failed to setup NCU_SPEC_BAR\n");
+ /* XXXX what to do now ? */
+ }
+}
+
+void xive_late_init(void)
+{
+ struct cpu_thread *c;
+
+ prlog(PR_INFO, "SLW: Configuring self-restore for NCU_SPEC_BAR\n");
+ for_each_present_cpu(c) {
+ if (cpu_is_thread0(c)) {
+ struct proc_chip *chip = get_chip(c->chip_id);
+ struct xive *x = chip->xive;
+ uint64_t xa, val, rc;
+ xa = XSCOM_ADDR_P9_EX(pir_to_core_id(c->pir),
+ P9X_EX_NCU_SPEC_BAR);
+ val = (uint64_t)x->tm_base | P9X_EX_NCU_SPEC_BAR_ENABLE;
+ /* Bail out if wakeup engine has already failed */
+ if (wakeup_engine_state != WAKEUP_ENGINE_PRESENT) {
+ prlog(PR_ERR, "XIVE p9_stop_api fail detected\n");
+ break;
+ }
+ rc = p9_stop_save_scom((void *)chip->homer_base, xa, val,
+ P9_STOP_SCOM_REPLACE, P9_STOP_SECTION_EQ_SCOM);
+ if (rc) {
+ xive_cpu_err(c, "p9_stop_api failed for NCU_SPEC_BAR rc=%lld\n",
+ rc);
+ wakeup_engine_state = WAKEUP_ENGINE_FAILED;
+ }
+ }
+ }
+}
+
+static void xive_provision_cpu(struct xive_cpu_state *xs, struct cpu_thread *c)
+{
+ struct xive *x;
+ void *p;
+
+ /* Physical VPs are pre-allocated */
+ xs->vp_blk = PIR2VP_BLK(c->pir);
+ xs->vp_idx = PIR2VP_IDX(c->pir);
+
+ /* For now we use identical block IDs for VC and PC but that might
+ * change. We allocate the EQs on the same XIVE as the VP.
+ */
+ xs->eq_blk = xs->vp_blk;
+
+ /* Grab the XIVE where the EQ resides. It could be different from
+ * the local chip XIVE if not using block group mode
+ */
+ x = xive_from_vc_blk(xs->eq_blk);
+ assert(x);
+
+ /* Allocate a set of EQs for that VP */
+ xs->eq_idx = xive_alloc_eq_set(x, true);
+ assert(!XIVE_ALLOC_IS_ERR(xs->eq_idx));
+
+ /* Provision one of the queues. Allocate the memory on the
+ * chip where the CPU resides
+ */
+ p = local_alloc(c->chip_id, PAGE_SIZE, PAGE_SIZE);
+ if (!p) {
+ xive_err(x, "Failed to allocate EQ backing store\n");
+ assert(false);
+ }
+ xs->eq_page = p;
+}
+
+static void xive_init_cpu(struct cpu_thread *c)
+{
+ struct proc_chip *chip = get_chip(c->chip_id);
+ struct xive *x = chip->xive;
+ struct xive_cpu_state *xs;
+
+ if (!x)
+ return;
+
+ /*
+ * Each core pair (EX) needs this special BAR setup to have the
+ * right powerbus cycle for the TM area (as it has the same address
+ * on all chips so it's somewhat special).
+ *
+ * Because we don't want to bother trying to figure out which core
+ * of a pair is present we just do the setup for each of them, which
+ * is harmless.
+ */
+ if (cpu_is_thread0(c) || cpu_is_core_chiplet_primary(c))
+ xive_configure_ex_special_bar(x, c);
+
+ /* Initialize the state structure */
+ c->xstate = xs = local_alloc(c->chip_id, sizeof(struct xive_cpu_state), 1);
+ assert(xs);
+ memset(xs, 0, sizeof(struct xive_cpu_state));
+ xs->xive = x;
+
+ init_lock(&xs->lock);
+
+ /* Shortcut to TM HV ring */
+ xs->tm_ring1 = x->tm_base + (1u << x->tm_shift);
+
+ /* Allocate an IPI */
+ xs->ipi_irq = xive_alloc_ipi_irqs(c->chip_id, 1, 1);
+
+ xive_cpu_vdbg(c, "CPU IPI is irq %08x\n", xs->ipi_irq);
+
+ /* Provision a VP and some EQDs for a physical CPU */
+ xive_provision_cpu(xs, c);
+
+ /* Initialize the XICS emulation related fields */
+ xive_init_cpu_emulation(xs, c);
+}
+
+static void xive_init_cpu_properties(struct cpu_thread *cpu)
+{
+ struct cpu_thread *t;
+ __be32 iprop[8][2] = { };
+ uint32_t i;
+
+ assert(cpu_thread_count <= 8);
+
+ if (!cpu->node)
+ return;
+ for (i = 0; i < cpu_thread_count; i++) {
+ t = (i == 0) ? cpu : find_cpu_by_pir(cpu->pir + i);
+ if (!t)
+ continue;
+ iprop[i][0] = cpu_to_be32(t->xstate->ipi_irq);
+ iprop[i][1] = 0; /* Edge */
+ }
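+ /* Two 4-byte cells per thread (the IPI number and the trigger
+ * type, 0 = edge), hence a property size of cpu_thread_count * 8
+ * bytes
+ */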
+ dt_add_property(cpu->node, "interrupts", iprop, cpu_thread_count * 8);
+ dt_add_property_cells(cpu->node, "interrupt-parent", get_ics_phandle());
+}
+
+#ifdef XIVE_DEBUG_DUPLICATES
+static uint32_t xive_count_irq_copies(struct xive_cpu_state *xs, uint32_t ref)
+{
+ uint32_t i, irq;
+ uint32_t cnt = 0;
+ uint32_t pos = xs->eqptr;
+ uint32_t gen = xs->eqgen;
+
+ for (i = 0; i < 0x3fff; i++) {
+ irq = xs->eqbuf[pos];
+ if ((irq >> 31) == gen)
+ break;
+ if (irq == ref)
+ cnt++;
+ pos = (pos + 1) & xs->eqmsk;
+ if (!pos)
+ gen ^= 1;
+ }
+ return cnt;
+}
+#else
+static inline uint32_t xive_count_irq_copies(struct xive_cpu_state *xs __unused,
+ uint32_t ref __unused)
+{
+ return 1;
+}
+#endif
+
+static uint32_t xive_read_eq(struct xive_cpu_state *xs, bool just_peek)
+{
+ uint32_t cur, copies;
+
+ xive_cpu_vdbg(this_cpu(), " EQ %s... IDX=%x MSK=%x G=%d\n",
+ just_peek ? "peek" : "read",
+ xs->eqptr, xs->eqmsk, xs->eqgen);
+ cur = xs->eqbuf[xs->eqptr];
+ xive_cpu_vdbg(this_cpu(), " cur: %08x [%08x %08x %08x ...]\n", cur,
+ xs->eqbuf[(xs->eqptr + 1) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 2) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 3) & xs->eqmsk]);
+ if ((cur >> 31) == xs->eqgen)
+ return 0;
+
+ /* Debug: check for duplicate interrupts in the queue */
+ copies = xive_count_irq_copies(xs, cur);
+ if (copies > 1) {
+ struct xive_eq *eq;
+
+ prerror("Wow ! Dups of irq %x, found %d copies !\n",
+ cur & 0x7fffffff, copies);
+ prerror("[%08x > %08x %08x %08x %08x ...] eqgen=%x eqptr=%x jp=%d\n",
+ xs->eqbuf[(xs->eqptr - 1) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 0) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 1) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 2) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 3) & xs->eqmsk],
+ xs->eqgen, xs->eqptr, just_peek);
+ lock(&xs->xive->lock);
+ __xive_cache_scrub(xs->xive, xive_cache_eqc, xs->eq_blk,
+ xs->eq_idx + XIVE_EMULATION_PRIO,
+ false, false);
+ unlock(&xs->xive->lock);
+ eq = xive_get_eq(xs->xive, xs->eq_idx + XIVE_EMULATION_PRIO);
+ prerror("EQ @%p W0=%08x W1=%08x qbuf @%p\n",
+ eq, be32_to_cpu(eq->w0), be32_to_cpu(eq->w1), xs->eqbuf);
+ }
+ log_add(xs, LOG_TYPE_POPQ, 7, cur,
+ xs->eqbuf[(xs->eqptr + 1) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 2) & xs->eqmsk],
+ copies,
+ xs->eqptr, xs->eqgen, just_peek);
+ if (!just_peek) {
+ xs->eqptr = (xs->eqptr + 1) & xs->eqmsk;
+ if (xs->eqptr == 0)
+ xs->eqgen ^= 1;
+ xs->total_irqs++;
+ }
+ return cur & 0x00ffffff;
+}
+
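+/*
+ * The emulation only uses a single actual priority, so collapse any
+ * CPPR other than the fully open (0xff) and fully closed (0) values
+ * onto XIVE_EMULATION_PRIO.
+ */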
+static uint8_t xive_sanitize_cppr(uint8_t cppr)
+{
+ if (cppr == 0xff || cppr == 0)
+ return cppr;
+ else
+ return XIVE_EMULATION_PRIO;
+}
+
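+/*
+ * Return the subset of pending priorities that can be presented to
+ * the CPU at the given CPPR.
+ */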
+static inline uint8_t opal_xive_check_pending(struct xive_cpu_state *xs,
+ uint8_t cppr)
+{
+ uint8_t mask = (cppr > 7) ? 0xff : ~((0x100 >> cppr) - 1);
+
+ return xs->pending & mask;
+}
+
+static void opal_xive_update_cppr(struct xive_cpu_state *xs, u8 cppr)
+{
+ /* Perform the update */
+ xs->cppr = cppr;
+ out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_CPPR, cppr);
+
+ /* Trigger the IPI if it's still more favored than the CPPR
+ *
+ * This can lead to a bunch of spurious retriggers if the
+ * IPI is queued up behind other interrupts but that's not
+ * a big deal and keeps the code simpler
+ */
+ if (xs->mfrr < cppr)
+ xive_ipi_trigger(xs->xive, GIRQ_TO_IDX(xs->ipi_irq));
+}
+
+static int64_t opal_xive_eoi(uint32_t xirr)
+{
+ struct cpu_thread *c = this_cpu();
+ struct xive_cpu_state *xs = c->xstate;
+ uint32_t isn = xirr & 0x00ffffff;
+ struct xive *src_x;
+ bool special_ipi = false;
+ uint8_t cppr;
+
+ /*
+ * In exploitation mode, this is supported as a way to perform
+ * an EOI via a FW call. This can be needed to work around HW
+ * implementation bugs, for example. In this case interrupts will
+ * have the OPAL_XIVE_IRQ_EOI_VIA_FW flag set.
+ *
+ * In that mode the entire "xirr" argument is interpreted as
+ * a global IRQ number (including the escalation bit), there is
+ * no split between the top 8 bits for CPPR and bottom 24 for
+ * the interrupt number.
+ */
+ if (xive_mode != XIVE_MODE_EMU)
+ return irq_source_eoi(xirr) ? OPAL_SUCCESS : OPAL_PARAMETER;
+
+ if (!xs)
+ return OPAL_INTERNAL_ERROR;
+
+ xive_cpu_vdbg(c, "EOI xirr=%08x cur_cppr=%d\n", xirr, xs->cppr);
+
+ /* Limit supported CPPR values from OS */
+ cppr = xive_sanitize_cppr(xirr >> 24);
+
+ lock(&xs->lock);
+
+ log_add(xs, LOG_TYPE_EOI, 3, isn, xs->eqptr, xs->eqgen);
+
+ /* If this was our magic IPI, convert to IRQ number */
+ if (isn == 2) {
+ isn = xs->ipi_irq;
+ special_ipi = true;
+ xive_cpu_vdbg(c, "User EOI for IPI !\n");
+ }
+
+ /* First check if we have stuff in that queue. If we do, don't bother with
+ * doing an EOI on the EQ. Just mark that priority pending, we'll come
+ * back later.
+ *
+ * If/when supporting multiple queues we would have to check them all
+ * in ascending prio order up to the passed-in CPPR value (exclusive).
+ */
+ if (xive_read_eq(xs, true)) {
+ xive_cpu_vdbg(c, " isn %08x, skip, queue non empty\n", xirr);
+ xs->pending |= 1 << XIVE_EMULATION_PRIO;
+ }
+#ifndef EQ_ALWAYS_NOTIFY
+ else {
+ uint8_t eoi_val;
+
+ /* Perform EQ level EOI. Only one EQ for now ...
+ *
+ * Note: We aren't doing an actual EOI. Instead we are clearing
+ * both P and Q and will re-check the queue if Q was set.
+ */
+ eoi_val = in_8(xs->eqmmio + XIVE_ESB_SET_PQ_00);
+ xive_cpu_vdbg(c, " isn %08x, eoi_val=%02x\n", xirr, eoi_val);
+
+ /* Q was set ? Check EQ again after doing a sync to ensure
+ * ordering.
+ */
+ if (eoi_val & 1) {
+ sync();
+ if (xive_read_eq(xs, true))
+ xs->pending |= 1 << XIVE_EMULATION_PRIO;
+ }
+ }
+#endif
+
+ /* Perform source level EOI if it's not our emulated MFRR IPI
+ * otherwise EOI ourselves
+ */
+ src_x = xive_from_isn(isn);
+ if (src_x) {
+ uint32_t idx = GIRQ_TO_IDX(isn);
+
+ /* Is it an IPI ? */
+ if (special_ipi) {
+ xive_ipi_eoi(src_x, idx);
+ } else {
+ /* Otherwise go through the source mechanism */
+ xive_vdbg(src_x, "EOI of IDX %x in EXT range\n", idx);
+ irq_source_eoi(isn);
+ }
+ } else {
+ xive_cpu_err(c, " EOI unknown ISN %08x\n", isn);
+ }
+
+ /* Finally restore CPPR */
+ opal_xive_update_cppr(xs, cppr);
+
+ xive_cpu_vdbg(c, " pending=0x%x cppr=%d\n", xs->pending, cppr);
+
+ unlock(&xs->lock);
+
+ /* Return whether something is pending that is suitable for
+ * delivery considering the new CPPR value. This can be done
+ * without lock as these fields are per-cpu.
+ */
+ return opal_xive_check_pending(xs, cppr) ? 1 : 0;
+}
+
+#ifdef XIVE_CHECK_MISROUTED_IPI
+static void xive_dump_eq(uint32_t eq_blk, uint32_t eq_idx)
+{
+ struct cpu_thread *me = this_cpu();
+ struct xive *x;
+ struct xive_eq *eq;
+
+ x = xive_from_vc_blk(eq_blk);
+ if (!x)
+ return;
+ eq = xive_get_eq(x, eq_idx);
+ if (!eq)
+ return;
+ xive_cpu_err(me, "EQ: %08x %08x %08x %08x (@%p)\n",
+ eq->w0, eq->w1, eq->w2, eq->w3, eq);
+ xive_cpu_err(me, " %08x %08x %08x %08x\n",
+ eq->w4, eq->w5, eq->w6, eq->w7);
+}
+static int64_t __opal_xive_dump_emu(struct xive_cpu_state *xs, uint32_t pir);
+
+static bool check_misrouted_ipi(struct cpu_thread *me, uint32_t irq)
+{
+ struct cpu_thread *c;
+
+ for_each_present_cpu(c) {
+ struct xive_cpu_state *xs = c->xstate;
+ struct xive_ive *ive;
+ uint32_t ipi_target, i, eq_blk, eq_idx;
+ struct proc_chip *chip;
+ struct xive *x;
+
+ if (!xs)
+ continue;
+ if (irq == xs->ipi_irq) {
+ xive_cpu_err(me, "misrouted IPI 0x%x, should"
+ " be aimed at CPU 0x%x\n",
+ irq, c->pir);
+ xive_cpu_err(me, " my eq_page=%p eqbuff=%p eq=0x%x/%x\n",
+ me->xstate->eq_page, me->xstate->eqbuf,
+ me->xstate->eq_blk, me->xstate->eq_idx + XIVE_EMULATION_PRIO);
+ xive_cpu_err(me, "tgt eq_page=%p eqbuff=%p eq=0x%x/%x\n",
+ c->xstate->eq_page, c->xstate->eqbuf,
+ c->xstate->eq_blk, c->xstate->eq_idx + XIVE_EMULATION_PRIO);
+ __opal_xive_dump_emu(me->xstate, me->pir);
+ __opal_xive_dump_emu(c->xstate, c->pir);
+ if (xive_get_irq_targetting(xs->ipi_irq, &ipi_target, NULL, NULL))
+ xive_cpu_err(me, "target=%08x\n", ipi_target);
+ else
+ xive_cpu_err(me, "target=???\n");
+ /* Find XIVE on which the IVE resides */
+ x = xive_from_isn(irq);
+ if (!x) {
+ xive_cpu_err(me, "no xive attached\n");
+ return true;
+ }
+ ive = xive_get_ive(x, irq);
+ if (!ive) {
+ xive_cpu_err(me, "no ive attached\n");
+ return true;
+ }
+ xive_cpu_err(me, "ive=%016llx\n", be64_to_cpu(ive->w));
+ for_each_chip(chip) {
+ x = chip->xive;
+ if (!x)
+ continue;
+ ive = x->ivt_base;
+ for (i = 0; i < XIVE_INT_COUNT; i++) {
+ if (xive_get_field64(IVE_EQ_DATA, ive[i].w) == irq) {
+ eq_blk = xive_get_field64(IVE_EQ_BLOCK, ive[i].w);
+ eq_idx = xive_get_field64(IVE_EQ_INDEX, ive[i].w);
+ xive_cpu_err(me, "Found source: 0x%x ive=%016llx\n"
+ " eq 0x%x/%x",
+ BLKIDX_TO_GIRQ(x->block_id, i),
+ be64_to_cpu(ive[i].w), eq_blk, eq_idx);
+ xive_dump_eq(eq_blk, eq_idx);
+ }
+ }
+ }
+ return true;
+ }
+ }
+ return false;
+}
+#else
+static inline bool check_misrouted_ipi(struct cpu_thread *c __unused,
+ uint32_t irq __unused)
+{
+ return false;
+}
+#endif
+
+static int64_t opal_xive_get_xirr(__be32 *out_xirr, bool just_poll)
+{
+ struct cpu_thread *c = this_cpu();
+ struct xive_cpu_state *xs = c->xstate;
+ uint16_t ack;
+ uint8_t active, old_cppr;
+
+ if (xive_mode != XIVE_MODE_EMU)
+ return OPAL_WRONG_STATE;
+ if (!xs)
+ return OPAL_INTERNAL_ERROR;
+ if (!out_xirr)
+ return OPAL_PARAMETER;
+
+ *out_xirr = 0;
+
+ lock(&xs->lock);
+
+ /*
+ * Due to the need to fetch multiple interrupts from the EQ, we
+ * need to play some tricks.
+ *
+ * The "pending" byte in "xs" keeps track of the priorities that
+ * are known to have stuff to read (currently we only use one).
+ *
+ * It is set in EOI and cleared when consumed here. We don't bother
+ * looking ahead here, EOI will do it.
+ *
+ * We still need to do an ACK every time in case a higher prio
+ * exception occurred (though we don't do prio yet... right ? still
+ * let's get the basic design right !).
+ *
+ * Note that if we haven't found anything via ack, but did find
+ * something in the queue, we must also raise CPPR back.
+ */
+
+ xive_cpu_vdbg(c, "get_xirr W01=%016llx W2=%08x\n",
+ __in_be64(xs->tm_ring1 + TM_QW3_HV_PHYS),
+ __in_be32(xs->tm_ring1 + TM_QW3_HV_PHYS + 8));
+
+ /* Perform the HV Ack cycle */
+ if (just_poll)
+ ack = __in_be64(xs->tm_ring1 + TM_QW3_HV_PHYS) >> 48;
+ else
+ ack = __in_be16(xs->tm_ring1 + TM_SPC_ACK_HV_REG);
+ sync();
+ xive_cpu_vdbg(c, "get_xirr,%s=%04x\n", just_poll ? "POLL" : "ACK", ack);
+
+ /* Capture the old CPPR which we will return with the interrupt */
+ old_cppr = xs->cppr;
+
+ switch(GETFIELD(TM_QW3_NSR_HE, (ack >> 8))) {
+ case TM_QW3_NSR_HE_NONE:
+ break;
+ case TM_QW3_NSR_HE_POOL:
+ break;
+ case TM_QW3_NSR_HE_PHYS:
+ /* Mark pending and keep track of the CPPR update */
+ if (!just_poll && (ack & 0xff) != 0xff) {
+ xs->cppr = ack & 0xff;
+ xs->pending |= 1 << xs->cppr;
+ }
+ break;
+ case TM_QW3_NSR_HE_LSI:
+ break;
+ }
+
+ /* Calculate "active" lines as being the pending interrupts
+ * masked by the "old" CPPR
+ */
+ active = opal_xive_check_pending(xs, old_cppr);
+
+ log_add(xs, LOG_TYPE_XIRR, 6, old_cppr, xs->cppr, xs->pending, active,
+ xs->eqptr, xs->eqgen);
+
+#ifdef XIVE_PERCPU_LOG
+ {
+ struct xive_eq *eq;
+ lock(&xs->xive->lock);
+ __xive_cache_scrub(xs->xive, xive_cache_eqc, xs->eq_blk,
+ xs->eq_idx + XIVE_EMULATION_PRIO,
+ false, false);
+ unlock(&xs->xive->lock);
+ eq = xive_get_eq(xs->xive, xs->eq_idx + XIVE_EMULATION_PRIO);
+ log_add(xs, LOG_TYPE_EQD, 2, be32_to_cpu(eq->w0), be32_to_cpu(eq->w1));
+ }
+#endif /* XIVE_PERCPU_LOG */
+
+ xive_cpu_vdbg(c, " cppr=%d->%d pending=0x%x active=%x\n",
+ old_cppr, xs->cppr, xs->pending, active);
+ if (active) {
+ /* Find highest pending */
+ uint8_t prio = ffs(active) - 1;
+ uint32_t val;
+
+ /* XXX Use "p" to select queue */
+ val = xive_read_eq(xs, just_poll);
+
+ if (val && val < XIVE_INT_FIRST)
+ xive_cpu_err(c, "Bogus interrupt 0x%x received !\n", val);
+
+ /* Convert to magic IPI if needed */
+ if (val == xs->ipi_irq)
+ val = 2;
+ if (check_misrouted_ipi(c, val))
+ val = 2;
+
+ *out_xirr = cpu_to_be32((old_cppr << 24) | val);
+
+ /* If we are polling, that's it */
+ if (just_poll)
+ goto skip;
+
+ /* Clear the pending bit. EOI will set it again if needed. We
+ * could check the queue but that's not really critical here.
+ */
+ xs->pending &= ~(1 << prio);
+
+ /* Spurious IPB bit, nothing to fetch, bring CPPR back */
+ if (!val)
+ prio = old_cppr;
+
+ /* We could have fetched a pending interrupt left over
+ * by a previous EOI, so the CPPR might need adjusting.
+ * The same applies if we had a spurious one.
+ */
+ if (xs->cppr != prio) {
+ xs->cppr = prio;
+ out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_CPPR, prio);
+ xive_cpu_vdbg(c, " adjusted CPPR to %d\n", prio);
+ }
+
+ if (val)
+ xive_cpu_vdbg(c, " found irq, prio=%d\n", prio);
+
+ } else {
+ /* Nothing was active, this is a fluke, restore CPPR */
+ opal_xive_update_cppr(xs, old_cppr);
+ xive_cpu_vdbg(c, " nothing active, restored CPPR to %d\n",
+ old_cppr);
+ }
+ skip:
+
+ log_add(xs, LOG_TYPE_XIRR2, 5, xs->cppr, xs->pending,
+ be32_to_cpu(*out_xirr), xs->eqptr, xs->eqgen);
+ xive_cpu_vdbg(c, " returning XIRR=%08x, pending=0x%x\n",
+ be32_to_cpu(*out_xirr), xs->pending);
+
+ unlock(&xs->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_set_cppr(uint8_t cppr)
+{
+ struct cpu_thread *c = this_cpu();
+ struct xive_cpu_state *xs = c->xstate;
+
+ if (xive_mode != XIVE_MODE_EMU)
+ return OPAL_WRONG_STATE;
+
+ /* Limit supported CPPR values */
+ cppr = xive_sanitize_cppr(cppr);
+
+ if (!xs)
+ return OPAL_INTERNAL_ERROR;
+ xive_cpu_vdbg(c, "CPPR setting to %d\n", cppr);
+
+ lock(&xs->lock);
+ opal_xive_update_cppr(xs, cppr);
+ unlock(&xs->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_set_mfrr(uint32_t cpu, uint8_t mfrr)
+{
+ struct cpu_thread *c = find_cpu_by_server(cpu);
+ struct xive_cpu_state *xs;
+ uint8_t old_mfrr;
+
+ if (xive_mode != XIVE_MODE_EMU)
+ return OPAL_WRONG_STATE;
+ if (!c)
+ return OPAL_PARAMETER;
+ xs = c->xstate;
+ if (!xs)
+ return OPAL_INTERNAL_ERROR;
+
+ lock(&xs->lock);
+ old_mfrr = xs->mfrr;
+ xive_cpu_vdbg(c, " Setting MFRR to %x, old is %x\n", mfrr, old_mfrr);
+ xs->mfrr = mfrr;
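+ /* Only trigger the IPI if the new MFRR is more favored than both
+ * the previous value and the current CPPR
+ */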
+ if (old_mfrr > mfrr && mfrr < xs->cppr)
+ xive_ipi_trigger(xs->xive, GIRQ_TO_IDX(xs->ipi_irq));
+ unlock(&xs->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static uint64_t xive_convert_irq_flags(uint64_t iflags)
+{
+ uint64_t oflags = 0;
+
+ if (iflags & XIVE_SRC_STORE_EOI)
+ oflags |= OPAL_XIVE_IRQ_STORE_EOI;
+
+ /* OPAL_XIVE_IRQ_TRIGGER_PAGE is only meant to be set if
+ * the interrupt has a *separate* trigger page.
+ */
+ if ((iflags & XIVE_SRC_EOI_PAGE1) &&
+ (iflags & XIVE_SRC_TRIGGER_PAGE))
+ oflags |= OPAL_XIVE_IRQ_TRIGGER_PAGE;
+
+ if (iflags & XIVE_SRC_LSI)
+ oflags |= OPAL_XIVE_IRQ_LSI;
+ return oflags;
+}
+
+static int64_t opal_xive_get_irq_info(uint32_t girq,
+ __be64 *out_flags,
+ __be64 *out_eoi_page,
+ __be64 *out_trig_page,
+ __be32 *out_esb_shift,
+ __be32 *out_src_chip)
+{
+ struct irq_source *is = irq_find_source(girq);
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ uint32_t idx;
+ uint64_t mm_base;
+ uint64_t eoi_page = 0, trig_page = 0;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (is == NULL || out_flags == NULL)
+ return OPAL_PARAMETER;
+ assert(is->ops == &xive_irq_source_ops);
+
+ if (out_flags)
+ *out_flags = cpu_to_be64(xive_convert_irq_flags(s->flags));
+
+ idx = girq - s->esb_base;
+
+ if (out_esb_shift)
+ *out_esb_shift = cpu_to_be32(s->esb_shift);
+
+ mm_base = (uint64_t)s->esb_mmio + (1ull << s->esb_shift) * idx;
+
+ /* The EOI page can either be the first or second page */
+ if (s->flags & XIVE_SRC_EOI_PAGE1) {
+ uint64_t p1off = 1ull << (s->esb_shift - 1);
+ eoi_page = mm_base + p1off;
+ } else
+ eoi_page = mm_base;
+
+ /* The trigger page, if it exists, is always the first page */
+ if (s->flags & XIVE_SRC_TRIGGER_PAGE)
+ trig_page = mm_base;
+
+ if (out_eoi_page)
+ *out_eoi_page = cpu_to_be64(eoi_page);
+ if (out_trig_page)
+ *out_trig_page = cpu_to_be64(trig_page);
+ if (out_src_chip)
+ *out_src_chip = cpu_to_be32(GIRQ_TO_CHIP(girq));
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_get_irq_config(uint32_t girq,
+ __be64 *out_vp,
+ uint8_t *out_prio,
+ __be32 *out_lirq)
+{
+ uint32_t vp;
+ uint32_t lirq;
+ uint8_t prio;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (xive_get_irq_targetting(girq, &vp, &prio, &lirq)) {
+ *out_vp = cpu_to_be64(vp);
+ *out_prio = prio;
+ *out_lirq = cpu_to_be32(lirq);
+ return OPAL_SUCCESS;
+ } else
+ return OPAL_PARAMETER;
+}
+
+static int64_t opal_xive_set_irq_config(uint32_t girq,
+ uint64_t vp,
+ uint8_t prio,
+ uint32_t lirq)
+{
+ /*
+ * This variant is meant for a XIVE-aware OS, thus it will
+ * *not* affect the ESB state of the interrupt. If used with
+ * a prio of FF, the IVT/EAS will be masked. In that case the
+ * races have to be handled by the OS.
+ *
+ * The exception to this rule is interrupts for which masking
+ * and unmasking is handled by firmware. In that case the ESB
+ * state isn't under OS control and will be dealt with here. This
+ * is currently only the case for LSIs, and only on P9 DD1.0, so
+ * it isn't an issue.
+ */
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ return xive_set_irq_config(girq, vp, prio, lirq, false);
+}
+
+static int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio,
+ __be64 *out_qpage,
+ __be64 *out_qsize,
+ __be64 *out_qeoi_page,
+ __be32 *out_escalate_irq,
+ __be64 *out_qflags)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_eq *eq;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!xive_eq_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ eq = xive_get_eq(x, idx);
+ if (!eq)
+ return OPAL_PARAMETER;
+
+ if (out_escalate_irq) {
+ uint32_t esc_idx = idx;
+
+ /* If escalations are routed to a single queue, fix up
+ * the escalation interrupt number here.
+ */
+ if (xive_get_field32(EQ_W0_UNCOND_ESCALATE, eq->w0))
+ esc_idx |= XIVE_ESCALATION_PRIO;
+
+ *out_escalate_irq =
+ cpu_to_be32(MAKE_ESCALATION_GIRQ(blk, esc_idx));
+ }
+
+ /* If this is a single-escalation gather queue, that's all
+ * there is to return
+ */
+ if (xive_get_field32(EQ_W0_SILENT_ESCALATE, eq->w0)) {
+ if (out_qflags)
+ *out_qflags = 0;
+ if (out_qpage)
+ *out_qpage = 0;
+ if (out_qsize)
+ *out_qsize = 0;
+ if (out_qeoi_page)
+ *out_qeoi_page = 0;
+ return OPAL_SUCCESS;
+ }
+
+ if (out_qpage) {
+ if (xive_get_field32(EQ_W0_ENQUEUE, eq->w0))
+ *out_qpage = cpu_to_be64(((uint64_t)xive_get_field32(EQ_W2_OP_DESC_HI, eq->w2) << 32) | be32_to_cpu(eq->w3));
+ else
+ *out_qpage = 0;
+ }
+ if (out_qsize) {
+ if (xive_get_field32(EQ_W0_ENQUEUE, eq->w0))
+ *out_qsize = cpu_to_be64(xive_get_field32(EQ_W0_QSIZE, eq->w0) + 12);
+ else
+ *out_qsize = 0;
+ }
+ if (out_qeoi_page) {
+ *out_qeoi_page =
+ cpu_to_be64((uint64_t)x->eq_mmio + idx * XIVE_ESB_PAGE_SIZE);
+ }
+ if (out_qflags) {
+ *out_qflags = 0;
+ if (xive_get_field32(EQ_W0_VALID, eq->w0))
+ *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ENABLED);
+ if (xive_get_field32(EQ_W0_UCOND_NOTIFY, eq->w0))
+ *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ALWAYS_NOTIFY);
+ if (xive_get_field32(EQ_W0_ESCALATE_CTL, eq->w0))
+ *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ESCALATE);
+ }
+
+ return OPAL_SUCCESS;
+}
+
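+/*
+ * Reset an EQ to its unconfigured state: only the firmware-owned
+ * flag bits in W0 are preserved, and both the ESn and ESe bits are
+ * left in the "Q" state.
+ */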
+static void xive_cleanup_eq(struct xive_eq *eq)
+{
+ eq->w0 = xive_set_field32(EQ_W0_FIRMWARE, 0, xive_get_field32(EQ_W0_FIRMWARE, eq->w0));
+ eq->w1 = cpu_to_be32(EQ_W1_ESe_Q | EQ_W1_ESn_Q);
+ eq->w2 = eq->w3 = eq->w4 = eq->w5 = eq->w6 = eq->w7 = 0;
+}
+
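+/*
+ * Configure (or disable) the queue for a given VP/priority. qsize is
+ * the log2 of the queue size in bytes (4K, 64K, 2M or 16M), or 0 to
+ * run without a queue page.
+ */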
+static int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio,
+ uint64_t qpage,
+ uint64_t qsize,
+ uint64_t qflags)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_eq *old_eq;
+ struct xive_eq eq;
+ uint32_t vp_blk, vp_idx;
+ bool group;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (!xive_eq_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ old_eq = xive_get_eq(x, idx);
+ if (!old_eq)
+ return OPAL_PARAMETER;
+
+ /* If this is a silent escalation queue, it cannot be
+ * configured directly
+ */
+ if (xive_get_field32(EQ_W0_SILENT_ESCALATE, old_eq->w0))
+ return OPAL_PARAMETER;
+
+ /* This shouldn't fail or xive_eq_for_target would have
+ * failed already
+ */
+ if (!xive_decode_vp(vp, &vp_blk, &vp_idx, NULL, &group))
+ return OPAL_PARAMETER;
+
+ /*
+ * Make a local copy which we will later try to commit using
+ * the cache watch facility
+ */
+ eq = *old_eq;
+
+ if (qflags & OPAL_XIVE_EQ_ENABLED) {
+ switch(qsize) {
+ /* Supported sizes */
+ case 12:
+ case 16:
+ case 21:
+ case 24:
+ eq.w3 = cpu_to_be32(((uint64_t)qpage) & EQ_W3_OP_DESC_LO);
+ eq.w2 = cpu_to_be32((((uint64_t)qpage) >> 32) & EQ_W2_OP_DESC_HI);
+ eq.w0 = xive_set_field32(EQ_W0_ENQUEUE, eq.w0, 1);
+ eq.w0 = xive_set_field32(EQ_W0_QSIZE, eq.w0, qsize - 12);
+ break;
+ case 0:
+ eq.w2 = eq.w3 = 0;
+ eq.w0 = xive_set_field32(EQ_W0_ENQUEUE, eq.w0, 0);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ /* Ensure the priority and target are correctly set (they will
+ * not be right after allocation)
+ */
+ eq.w6 = xive_set_field32(EQ_W6_NVT_BLOCK, 0, vp_blk) |
+ xive_set_field32(EQ_W6_NVT_INDEX, 0, vp_idx);
+ eq.w7 = xive_set_field32(EQ_W7_F0_PRIORITY, 0, prio);
+ /* XXX Handle group i bit when needed */
+
+ /* Always notify flag */
+ if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
+ eq.w0 = xive_set_field32(EQ_W0_UCOND_NOTIFY, eq.w0, 1);
+ else
+ eq.w0 = xive_set_field32(EQ_W0_UCOND_NOTIFY, eq.w0, 0);
+
+ /* Escalation flag */
+ if (qflags & OPAL_XIVE_EQ_ESCALATE)
+ eq.w0 = xive_set_field32(EQ_W0_ESCALATE_CTL, eq.w0, 1);
+ else
+ eq.w0 = xive_set_field32(EQ_W0_ESCALATE_CTL, eq.w0, 0);
+
+ /* Unconditionally clear the current queue pointer, set
+ * generation to 1 and disable escalation interrupts.
+ */
+ eq.w1 = xive_set_field32(EQ_W1_GENERATION, 0, 1) |
+ xive_set_field32(EQ_W1_ES, 0, xive_get_field32(EQ_W1_ES, old_eq->w1));
+
+ /* Enable. We always enable backlog for an enabled queue
+ * otherwise escalations won't work.
+ */
+ eq.w0 = xive_set_field32(EQ_W0_VALID, eq.w0, 1);
+ eq.w0 = xive_set_field32(EQ_W0_BACKLOG, eq.w0, 1);
+ } else
+ xive_cleanup_eq(&eq);
+
+ /* Update EQ, non-synchronous */
+ lock(&x->lock);
+ rc = xive_eqc_cache_update(x, blk, idx, &eq, false);
+ unlock(&x->lock);
+
+ return rc;
+}
+
+static int64_t opal_xive_get_queue_state(uint64_t vp, uint32_t prio,
+ __be32 *out_qtoggle,
+ __be32 *out_qindex)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_eq *eq;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!out_qtoggle || !out_qindex ||
+ !xive_eq_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ eq = xive_get_eq(x, idx);
+ if (!eq)
+ return OPAL_PARAMETER;
+
+ /* Scrub the queue */
+ lock(&x->lock);
+ rc = xive_eqc_scrub(x, blk, idx);
+ unlock(&x->lock);
+ if (rc)
+ return rc;
+
+ /* We don't deal with disabled queues */
+ if (!xive_get_field32(EQ_W0_VALID, eq->w0))
+ return OPAL_WRONG_STATE;
+
+ *out_qtoggle = cpu_to_be32(xive_get_field32(EQ_W1_GENERATION, eq->w1));
+ *out_qindex = cpu_to_be32(xive_get_field32(EQ_W1_PAGE_OFF, eq->w1));
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_set_queue_state(uint64_t vp, uint32_t prio,
+ uint32_t qtoggle, uint32_t qindex)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_eq *eq, new_eq;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!xive_eq_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ eq = xive_get_eq(x, idx);
+ if (!eq)
+ return OPAL_PARAMETER;
+
+ /* We don't deal with disabled queues */
+ if (!xive_get_field32(EQ_W0_VALID, eq->w0))
+ return OPAL_WRONG_STATE;
+
+ new_eq = *eq;
+
+ new_eq.w1 = xive_set_field32(EQ_W1_GENERATION, new_eq.w1, qtoggle);
+ new_eq.w1 = xive_set_field32(EQ_W1_PAGE_OFF, new_eq.w1, qindex);
+
+ lock(&x->lock);
+ rc = xive_eqc_cache_update(x, blk, idx, &new_eq, false);
+ unlock(&x->lock);
+
+ return rc;
+}
+
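+/*
+ * Accept a 64K-aligned page donated by the OS and queue it on the
+ * chip's list; these pages are consumed later when the XIVE code
+ * needs extra backing storage (e.g. VP/EQ provisioning).
+ */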
+static int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr)
+{
+ struct proc_chip *c = get_chip(chip_id);
+ struct list_node *n;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (!c)
+ return OPAL_PARAMETER;
+ if (!c->xive)
+ return OPAL_PARAMETER;
+ if (addr & 0xffff)
+ return OPAL_PARAMETER;
+
+ n = (struct list_node *)addr;
+ lock(&c->xive->lock);
+ list_add(&c->xive->donated_pages, n);
+ unlock(&c->xive->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_get_vp_info(uint64_t vp_id,
+ __be64 *out_flags,
+ __be64 *out_cam_value,
+ __be64 *out_report_cl_pair,
+ __be32 *out_chip_id)
+{
+ struct xive *x;
+ struct xive_vp *vp;
+ uint32_t blk, idx;
+ bool group;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, &group))
+ return OPAL_PARAMETER;
+ /* We don't do groups yet */
+ if (group)
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+
+ if (out_flags) {
+ uint32_t eq_blk, eq_idx;
+ struct xive_eq *eq;
+ struct xive *eq_x;
+ *out_flags = 0;
+
+ /* We would like to have a way to stash a SW bit in the VP to
+ * know whether silent escalation is enabled or not, but
+ * unlike what happens with EQs, the PC cache watch doesn't
+ * implement the reserved bit in the VPs... so we have to go
+ * look at EQ 7 instead.
+ */
+ /* Grab EQ for prio 7 to check for silent escalation */
+ if (!xive_eq_for_target(vp_id, XIVE_ESCALATION_PRIO,
+ &eq_blk, &eq_idx))
+ return OPAL_PARAMETER;
+
+ eq_x = xive_from_vc_blk(eq_blk);
+ if (!eq_x)
+ return OPAL_PARAMETER;
+
+ eq = xive_get_eq(eq_x, eq_idx);
+ if (!eq)
+ return OPAL_PARAMETER;
+ if (xive_get_field32(VP_W0_VALID, vp->w0))
+ *out_flags |= cpu_to_be64(OPAL_XIVE_VP_ENABLED);
+ if (xive_get_field32(EQ_W0_SILENT_ESCALATE, eq->w0))
+ *out_flags |= cpu_to_be64(OPAL_XIVE_VP_SINGLE_ESCALATION);
+ }
+
+ if (out_cam_value)
+ *out_cam_value = cpu_to_be64((blk << NVT_SHIFT) | idx);
+
+ if (out_report_cl_pair) {
+ *out_report_cl_pair = cpu_to_be64(((uint64_t)(be32_to_cpu(vp->w6) & 0x0fffffff)) << 32);
+ *out_report_cl_pair |= cpu_to_be64(be32_to_cpu(vp->w7) & 0xffffff00);
+ }
+
+ if (out_chip_id)
+ *out_chip_id = cpu_to_be32(xive_block_to_chip[blk]);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t xive_setup_silent_gather(uint64_t vp_id, bool enable)
+{
+ uint32_t blk, idx, i;
+ struct xive_eq *eq_orig;
+ struct xive_eq eq;
+ struct xive *x;
+ int64_t rc;
+
+ /* Get base EQ block */
+ if (!xive_eq_for_target(vp_id, 0, &blk, &idx))
+ return OPAL_PARAMETER;
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ /* Grab prio 7 */
+ eq_orig = xive_get_eq(x, idx + XIVE_ESCALATION_PRIO);
+ if (!eq_orig)
+ return OPAL_PARAMETER;
+
+ /* If trying to enable silent gather, make sure prio 7 is not
+ * already enabled as a normal queue
+ */
+ if (enable && xive_get_field32(EQ_W0_VALID, eq_orig->w0) &&
+ !xive_get_field32(EQ_W0_SILENT_ESCALATE, eq_orig->w0)) {
+ xive_dbg(x, "Attempt at enabling silent gather but"
+ " prio 7 queue already in use\n");
+ return OPAL_PARAMETER;
+ }
+
+ eq = *eq_orig;
+
+ if (enable) {
+ /* W0: Enabled and "s" set, no other bit */
+ eq.w0 = xive_set_field32(EQ_W0_FIRMWARE, 0, xive_get_field32(EQ_W0_FIRMWARE, eq.w0)) |
+ xive_set_field32(EQ_W0_VALID, 0, 1) |
+ xive_set_field32(EQ_W0_SILENT_ESCALATE, 0, 1) |
+ xive_set_field32(EQ_W0_ESCALATE_CTL, 0, 1) |
+ xive_set_field32(EQ_W0_BACKLOG, 0, 1);
+
+ /* W1: Mark ESn as 01, ESe as 00 */
+ eq.w1 = xive_set_field32(EQ_W1_ESn_P, eq.w1, 0);
+ eq.w1 = xive_set_field32(EQ_W1_ESn_Q, eq.w1, 1);
+ eq.w1 = xive_set_field32(EQ_W1_ESe, eq.w1, 0);
+ } else if (xive_get_field32(EQ_W0_SILENT_ESCALATE, eq.w0))
+ xive_cleanup_eq(&eq);
+
+ if (!memcmp(eq_orig, &eq, sizeof(eq)))
+ rc = 0;
+ else
+ rc = xive_eqc_cache_update(x, blk, idx + XIVE_ESCALATION_PRIO,
+ &eq, false);
+ if (rc)
+ return rc;
+
+ /* Mark/unmark all other prios with the new "u" bit and update
+ * escalation
+ */
+ for (i = 0; i < NUM_INT_PRIORITIES; i++) {
+ if (i == XIVE_ESCALATION_PRIO)
+ continue;
+ eq_orig = xive_get_eq(x, idx + i);
+ if (!eq_orig)
+ continue;
+ eq = *eq_orig;
+ if (enable) {
+ /* Set new "u" bit */
+ eq.w0 = xive_set_field32(EQ_W0_UNCOND_ESCALATE, eq.w0, 1);
+
+ /* Re-route escalation interrupt (previous
+ * route is lost !) to the gather queue
+ */
+ eq.w4 = xive_set_field32(EQ_W4_ESC_EQ_BLOCK, eq.w4, blk);
+ eq.w4 = xive_set_field32(EQ_W4_ESC_EQ_INDEX, eq.w4, idx + XIVE_ESCALATION_PRIO);
+ } else if (xive_get_field32(EQ_W0_UNCOND_ESCALATE, eq.w0)) {
+ /* Clear the "u" bit, disable escalations if it was set */
+ eq.w0 = xive_set_field32(EQ_W0_UNCOND_ESCALATE, eq.w0, 0);
+ eq.w0 = xive_set_field32(EQ_W0_ESCALATE_CTL, eq.w0, 0);
+ }
+ if (!memcmp(eq_orig, &eq, sizeof(eq)))
+ continue;
+ rc = xive_eqc_cache_update(x, blk, idx + i, &eq, false);
+ if (rc)
+ break;
+ }
+
+ return rc;
+}
+
+static int64_t opal_xive_set_vp_info(uint64_t vp_id,
+ uint64_t flags,
+ uint64_t report_cl_pair)
+{
+ struct xive *x;
+ struct xive_vp *vp, vp_new;
+ uint32_t blk, idx;
+ bool group;
+ int64_t rc;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, &group))
+ return OPAL_PARAMETER;
+ /* We don't do groups yet */
+ if (group)
+ return OPAL_PARAMETER;
+ if (report_cl_pair & 0xff)
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+
+ lock(&x->lock);
+
+ vp_new = *vp;
+ if (flags & OPAL_XIVE_VP_ENABLED) {
+ vp_new.w0 = xive_set_field32(VP_W0_VALID, vp_new.w0, 1);
+ vp_new.w6 = cpu_to_be32(report_cl_pair >> 32);
+ vp_new.w7 = cpu_to_be32(report_cl_pair & 0xffffffff);
+
+ if (flags & OPAL_XIVE_VP_SINGLE_ESCALATION)
+ rc = xive_setup_silent_gather(vp_id, true);
+ else
+ rc = xive_setup_silent_gather(vp_id, false);
+ } else {
+ vp_new.w0 = vp_new.w6 = vp_new.w7 = 0;
+ rc = xive_setup_silent_gather(vp_id, false);
+ }
+
+ if (rc) {
+ if (rc != OPAL_BUSY)
+ xive_dbg(x, "Silent gather setup failed with err %lld\n", rc);
+ goto bail;
+ }
+
+ rc = xive_vpc_cache_update(x, blk, idx, &vp_new, false);
+ if (rc)
+ goto bail;
+
+ /* When disabling, we scrub clean (invalidate the entry) so
+ * we can avoid cache ops in alloc/free
+ */
+ if (!(flags & OPAL_XIVE_VP_ENABLED))
+ xive_vpc_scrub_clean(x, blk, idx);
+
+bail:
+ unlock(&x->lock);
+ return rc;
+}
+
+static int64_t opal_xive_get_vp_state(uint64_t vp_id, __be64 *out_state)
+{
+ struct xive *x;
+ struct xive_vp *vp;
+ uint32_t blk, idx;
+ int64_t rc;
+ bool group;
+
+ if (!out_state || !xive_decode_vp(vp_id, &blk, &idx, NULL, &group))
+ return OPAL_PARAMETER;
+ if (group)
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+
+ /* Scrub the vp */
+ lock(&x->lock);
+ rc = xive_vpc_scrub(x, blk, idx);
+ unlock(&x->lock);
+ if (rc)
+ return rc;
+
+ if (!xive_get_field32(VP_W0_VALID, vp->w0))
+ return OPAL_WRONG_STATE;
+
+ /*
+ * Return word4 and word5 which contain the saved HW thread
+	 * context. The IPB register is all we care about for now on P9.
+ */
+ *out_state = cpu_to_be64((((uint64_t)be32_to_cpu(vp->w4)) << 32) | be32_to_cpu(vp->w5));
+
+ return OPAL_SUCCESS;
+}
+
+static void xive_cleanup_cpu_tima(struct cpu_thread *c)
+{
+ struct xive_cpu_state *xs = c->xstate;
+ struct xive *x = xs->xive;
+ void *ind_tm_base = x->ic_base + (4 << x->ic_shift);
+ uint8_t old_w2 __unused, w2 __unused;
+
+ /* Reset the HW context */
+ xive_reset_enable_thread(c);
+
+ /* Setup indirect access to the corresponding thread */
+ xive_regw(x, PC_TCTXT_INDIR0,
+ PC_TCTXT_INDIR_VALID |
+ SETFIELD(PC_TCTXT_INDIR_THRDID, 0ull, c->pir & 0xff));
+
+ /* Workaround for HW issue: Need to read the above register
+ * back before doing the subsequent accesses
+ */
+ xive_regr(x, PC_TCTXT_INDIR0);
+
+ /* Set VT to 1 */
+ old_w2 = in_8(ind_tm_base + TM_QW3_HV_PHYS + TM_WORD2);
+ out_8(ind_tm_base + TM_QW3_HV_PHYS + TM_WORD2, 0x80);
+ w2 = in_8(ind_tm_base + TM_QW3_HV_PHYS + TM_WORD2);
+
+ /* Dump HV state */
+ xive_cpu_vdbg(c, "[reset] VP TIMA VP=%x/%x W01=%016llx W2=%02x->%02x\n",
+ xs->vp_blk, xs->vp_idx,
+ in_be64(ind_tm_base + TM_QW3_HV_PHYS),
+ old_w2, w2);
+
+ /* Reset indirect access */
+ xive_regw(x, PC_TCTXT_INDIR0, 0);
+}
+
+static int64_t xive_vc_ind_cache_kill(struct xive *x, uint64_t type)
+{
+ uint64_t val;
+
+ /* We clear the whole thing */
+ xive_regw(x, VC_AT_MACRO_KILL_MASK, 0);
+ xive_regw(x, VC_AT_MACRO_KILL, VC_KILL_VALID |
+ SETFIELD(VC_KILL_TYPE, 0ull, type));
+
+ /* XXX SIMICS problem ? */
+ if (chip_quirk(QUIRK_SIMICS))
+ return 0;
+
+ /* XXX Add timeout */
+ for (;;) {
+ val = xive_regr(x, VC_AT_MACRO_KILL);
+ if (!(val & VC_KILL_VALID))
+ break;
+ }
+ return 0;
+}
+
+static int64_t xive_pc_ind_cache_kill(struct xive *x)
+{
+ uint64_t val;
+
+ /* We clear the whole thing */
+ xive_regw(x, PC_AT_KILL_MASK, 0);
+ xive_regw(x, PC_AT_KILL, PC_AT_KILL_VALID);
+
+ /* XXX SIMICS problem ? */
+ if (chip_quirk(QUIRK_SIMICS))
+ return 0;
+
+ /* XXX Add timeout */
+ for (;;) {
+ val = xive_regr(x, PC_AT_KILL);
+ if (!(val & PC_AT_KILL_VALID))
+ break;
+ }
+ return 0;
+}
+
+static void xive_cleanup_vp_ind(struct xive *x)
+{
+ int i;
+
+ xive_dbg(x, "Cleaning up %d VP ind entries...\n", x->vp_ind_count);
+ for (i = 0; i < x->vp_ind_count; i++) {
+ if (be64_to_cpu(x->vp_ind_base[i]) & VSD_FIRMWARE) {
+ xive_dbg(x, " %04x ... skip (firmware)\n", i);
+ continue;
+ }
+ if (x->vp_ind_base[i] != 0) {
+ x->vp_ind_base[i] = 0;
+ xive_dbg(x, " %04x ... cleaned\n", i);
+ }
+ }
+ xive_pc_ind_cache_kill(x);
+}
+
+static void xive_cleanup_eq_ind(struct xive *x)
+{
+ int i;
+
+ xive_dbg(x, "Cleaning up %d EQ ind entries...\n", x->eq_ind_count);
+ for (i = 0; i < x->eq_ind_count; i++) {
+ if (be64_to_cpu(x->eq_ind_base[i]) & VSD_FIRMWARE) {
+ xive_dbg(x, " %04x ... skip (firmware)\n", i);
+ continue;
+ }
+ if (x->eq_ind_base[i] != 0) {
+ x->eq_ind_base[i] = 0;
+ xive_dbg(x, " %04x ... cleaned\n", i);
+ }
+ }
+ xive_vc_ind_cache_kill(x, VC_KILL_EQD);
+}
+
+static void xive_reset_one(struct xive *x)
+{
+ struct cpu_thread *c;
+ bool eq_firmware;
+ int i;
+
+ xive_dbg(x, "Resetting one xive...\n");
+
+ lock(&x->lock);
+
+ /* Check all interrupts are disabled */
+ i = bitmap_find_one_bit(*x->int_enabled_map, 0, XIVE_INT_COUNT);
+ if (i >= 0)
+ xive_warn(x, "Interrupt %d (and maybe more) not disabled"
+ " at reset !\n", i);
+
+ /* Reset IPI allocation */
+ xive_dbg(x, "freeing alloc map %p/%p\n",
+ x->ipi_alloc_map, *x->ipi_alloc_map);
+ memset(x->ipi_alloc_map, 0, BITMAP_BYTES(XIVE_INT_COUNT));
+
+ xive_dbg(x, "Resetting EQs...\n");
+
+ /* Reset all allocated EQs and free the user ones */
+ bitmap_for_each_one(*x->eq_map, XIVE_EQ_COUNT >> 3, i) {
+ struct xive_eq eq0;
+ struct xive_eq *eq;
+ int j;
+
+ if (i == 0)
+ continue;
+ eq_firmware = false;
+ for (j = 0; j < NUM_INT_PRIORITIES; j++) {
+ uint32_t idx = (i << 3) | j;
+
+ eq = xive_get_eq(x, idx);
+ if (!eq)
+ continue;
+
+ /* We need to preserve the firmware bit, otherwise
+ * we will incorrectly free the EQs that are reserved
+ * for the physical CPUs
+ */
+ if (xive_get_field32(EQ_W0_VALID, eq->w0)) {
+ if (!xive_get_field32(EQ_W0_FIRMWARE, eq->w0))
+ xive_dbg(x, "EQ 0x%x:0x%x is valid at reset: %08x %08x\n",
+ x->block_id, idx, be32_to_cpu(eq->w0), be32_to_cpu(eq->w1));
+ eq0 = *eq;
+ xive_cleanup_eq(&eq0);
+ xive_eqc_cache_update(x, x->block_id, idx, &eq0, true);
+ }
+ if (xive_get_field32(EQ_W0_FIRMWARE, eq->w0))
+ eq_firmware = true;
+ }
+ if (!eq_firmware)
+ bitmap_clr_bit(*x->eq_map, i);
+ }
+
+ /* Take out all VPs from HW and reset all CPPRs to 0 */
+ for_each_present_cpu(c) {
+ if (c->chip_id != x->chip_id)
+ continue;
+ if (!c->xstate)
+ continue;
+ xive_cleanup_cpu_tima(c);
+ }
+
+	/* Reset all user-allocated VPs. This is inefficient; we should
+	 * either keep a bitmap of allocated VPs or add an iterator to
+	 * the buddy, which is trickier but doable.
+ */
+ for (i = 0; i < XIVE_VP_COUNT; i++) {
+ struct xive_vp *vp;
+ struct xive_vp vp0 = {0};
+
+ /* Ignore the physical CPU VPs */
+ if (i >= XIVE_HW_VP_BASE &&
+ i < (XIVE_HW_VP_BASE + XIVE_HW_VP_COUNT))
+ continue;
+
+ /* Is the VP valid ? */
+ vp = xive_get_vp(x, i);
+ if (!vp || !xive_get_field32(VP_W0_VALID, vp->w0))
+ continue;
+
+ /* Clear it */
+ xive_dbg(x, "VP 0x%x:0x%x is valid at reset\n", x->block_id, i);
+ xive_vpc_cache_update(x, x->block_id, i, &vp0, true);
+ }
+
+ /* Forget about remaining donated pages */
+ list_head_init(&x->donated_pages);
+
+ /* And cleanup donated indirect VP and EQ pages */
+ xive_cleanup_vp_ind(x);
+ xive_cleanup_eq_ind(x);
+
+ /* The rest must not be called with the lock held */
+ unlock(&x->lock);
+
+ /* Re-configure VPs and emulation */
+ for_each_present_cpu(c) {
+ struct xive_cpu_state *xs = c->xstate;
+
+ if (c->chip_id != x->chip_id || !xs)
+ continue;
+
+ if (xive_mode == XIVE_MODE_EMU)
+ xive_init_cpu_emulation(xs, c);
+ else
+ xive_init_cpu_exploitation(xs);
+ }
+}
+
+static void xive_reset_mask_source_cb(struct irq_source *is,
+ void *data __unused)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ struct xive *x;
+ uint32_t isn;
+
+ if (is->ops != &xive_irq_source_ops)
+ return;
+
+ /* Skip escalation sources */
+ if (GIRQ_IS_ESCALATION(is->start))
+ return;
+
+ x = s->xive;
+
+ /* Iterate all interrupts */
+ for (isn = is->start; isn < is->end; isn++) {
+ /* Has it ever been enabled ? */
+ if (!bitmap_tst_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn)))
+ continue;
+ /* Mask it and clear the enabled map bit */
+ xive_vdbg(x, "[reset] disabling source 0x%x\n", isn);
+ __xive_set_irq_config(is, isn, 0, 0xff, isn, true, false);
+ bitmap_clr_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn));
+ }
+}
+
+void xive_cpu_reset(void)
+{
+ struct cpu_thread *c = this_cpu();
+ struct xive_cpu_state *xs = c->xstate;
+
+ xs->cppr = 0;
+ out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_CPPR, 0);
+
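+	/* Note: the MMIO read below is what pulls (removes) the pool VP
+	 * context from this thread; the returned value is not needed.
+	 */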
+ in_be64(xs->tm_ring1 + TM_SPC_PULL_POOL_CTX);
+}
+
+static int64_t __xive_reset(uint64_t version)
+{
+ struct proc_chip *chip;
+
+ xive_mode = version;
+
+ /* Mask all interrupt sources */
+ irq_for_each_source(xive_reset_mask_source_cb, NULL);
+
+ /* For each XIVE do a sync... */
+ for_each_chip(chip) {
+ if (!chip->xive)
+ continue;
+ xive_sync(chip->xive);
+ }
+
+ /* For each XIVE reset everything else... */
+ for_each_chip(chip) {
+ if (!chip->xive)
+ continue;
+ xive_reset_one(chip->xive);
+ }
+
+ /* Cleanup global VP allocator */
+ buddy_reset(xive_vp_buddy);
+
+ /* We reserve the whole range of VPs representing HW chips.
+ *
+ * These are 0x80..0xff, so order 7 starting at 0x80. This will
+ * reserve that range on each chip.
+ */
+ assert(buddy_reserve(xive_vp_buddy, XIVE_HW_VP_BASE,
+ XIVE_THREADID_SHIFT));
+
+ return OPAL_SUCCESS;
+}
+
+/* Called by fast reboot */
+int64_t xive_reset(void)
+{
+ if (xive_mode == XIVE_MODE_NONE)
+ return OPAL_SUCCESS;
+ return __xive_reset(XIVE_MODE_EMU);
+}
+
+static int64_t opal_xive_reset(uint64_t version)
+{
+ prlog(PR_DEBUG, "XIVE reset, version: %d...\n", (int)version);
+
+ if (version > 1)
+ return OPAL_PARAMETER;
+
+ return __xive_reset(version);
+}
+
+static int64_t opal_xive_free_vp_block(uint64_t vp_base)
+{
+ uint32_t blk, idx, i, j, count;
+ uint8_t order;
+ bool group;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!xive_decode_vp(vp_base, &blk, &idx, &order, &group))
+ return OPAL_PARAMETER;
+ if (group)
+ return OPAL_PARAMETER;
+ if (blk)
+ return OPAL_PARAMETER;
+ if (order < (xive_chips_alloc_bits + 1))
+ return OPAL_PARAMETER;
+ if (idx & ((1 << (order - xive_chips_alloc_bits)) - 1))
+ return OPAL_PARAMETER;
+
+ count = 1 << order;
+ for (i = 0; i < count; i++) {
+ uint32_t vp_id = vp_base + i;
+ uint32_t blk, idx, eq_blk, eq_idx;
+ struct xive *x;
+ struct xive_vp *vp;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, NULL)) {
+ prerror("XIVE: Couldn't decode VP id %u\n", vp_id);
+ return OPAL_INTERNAL_ERROR;
+ }
+ x = xive_from_pc_blk(blk);
+ if (!x) {
+ prerror("XIVE: Instance not found for deallocated VP"
+ " block %d\n", blk);
+ return OPAL_INTERNAL_ERROR;
+ }
+ vp = xive_get_vp(x, idx);
+ if (!vp) {
+			prerror("XIVE: VP not found for deallocation!\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ /* VP must be disabled */
+ if (xive_get_field32(VP_W0_VALID, vp->w0)) {
+ prlog(PR_ERR, "XIVE: freeing active VP %d\n", vp_id);
+ return OPAL_XIVE_FREE_ACTIVE;
+ }
+
+ /* Not populated */
+ if (vp->w1 == 0)
+ continue;
+ eq_blk = be32_to_cpu(vp->w1) >> 28;
+ eq_idx = be32_to_cpu(vp->w1) & 0x0fffffff;
+
+ lock(&x->lock);
+
+ /* Ensure EQs are disabled and cleaned up. Ideally the caller
+ * should have done it but we double check it here
+ */
+ for (j = 0; j < NUM_INT_PRIORITIES; j++) {
+ struct xive *eq_x = xive_from_vc_blk(eq_blk);
+ struct xive_eq eq, *orig_eq = xive_get_eq(eq_x, eq_idx + j);
+
+ if (!xive_get_field32(EQ_W0_VALID, orig_eq->w0))
+ continue;
+
+ prlog(PR_WARNING, "XIVE: freeing VP %d with queue %d active\n",
+ vp_id, j);
+ eq = *orig_eq;
+ xive_cleanup_eq(&eq);
+ xive_eqc_cache_update(x, eq_blk, eq_idx + j, &eq, true);
+ }
+
+ /* Mark it not populated so we don't try to free it again */
+ vp->w1 = 0;
+
+ if (eq_blk != blk) {
+ prerror("XIVE: Block mismatch trying to free EQs\n");
+ unlock(&x->lock);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ xive_free_eq_set(x, eq_idx);
+ unlock(&x->lock);
+ }
+
+ xive_free_vps(vp_base);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_alloc_vp_block(uint32_t alloc_order)
+{
+ uint32_t vp_base, eqs, count, i;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ prlog(PR_TRACE, "opal_xive_alloc_vp_block(%d)\n", alloc_order);
+
+ vp_base = xive_alloc_vps(alloc_order);
+ if (XIVE_ALLOC_IS_ERR(vp_base)) {
+ if (vp_base == XIVE_ALLOC_NO_IND)
+ return OPAL_XIVE_PROVISIONING;
+ return OPAL_RESOURCE;
+ }
+
+ /* Allocate EQs and initialize VPs */
+ count = 1 << alloc_order;
+ for (i = 0; i < count; i++) {
+ uint32_t vp_id = vp_base + i;
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_vp *vp;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, NULL)) {
+ prerror("XIVE: Couldn't decode VP id %u\n", vp_id);
+ return OPAL_INTERNAL_ERROR;
+ }
+ x = xive_from_pc_blk(blk);
+ if (!x) {
+ prerror("XIVE: Instance not found for allocated VP"
+ " block %d\n", blk);
+ rc = OPAL_INTERNAL_ERROR;
+ goto fail;
+ }
+ vp = xive_get_vp(x, idx);
+ if (!vp) {
+			prerror("XIVE: VP not found after allocation!\n");
+ rc = OPAL_INTERNAL_ERROR;
+ goto fail;
+ }
+
+ /* Allocate EQs, if fails, free the VPs and return */
+ lock(&x->lock);
+ eqs = xive_alloc_eq_set(x, false);
+ unlock(&x->lock);
+ if (XIVE_ALLOC_IS_ERR(eqs)) {
+ if (eqs == XIVE_ALLOC_NO_IND)
+ rc = OPAL_XIVE_PROVISIONING;
+ else
+ rc = OPAL_RESOURCE;
+ goto fail;
+ }
+
+		/* Initialize the VP structure. We don't use a cache watch
+		 * as we made sure to scrub the entries out of the cache
+		 * when freeing them.
+ */
+ memset(vp, 0, sizeof(*vp));
+ vp->w1 = cpu_to_be32((blk << 28) | eqs);
+ }
+ return vp_base;
+ fail:
+ opal_xive_free_vp_block(vp_base);
+
+ return rc;
+}
+
+static int64_t xive_try_allocate_irq(struct xive *x)
+{
+ int idx, base_idx, max_count, girq;
+ struct xive_ive *ive;
+
+ lock(&x->lock);
+
+ base_idx = x->int_ipi_top - x->int_base;
+ max_count = x->int_hw_bot - x->int_ipi_top;
+
+ idx = bitmap_find_zero_bit(*x->ipi_alloc_map, base_idx, max_count);
+ if (idx < 0) {
+ unlock(&x->lock);
+ return OPAL_RESOURCE;
+ }
+ bitmap_set_bit(*x->ipi_alloc_map, idx);
+ girq = x->int_base + idx;
+
+ /* Mark the IVE valid. Don't bother with the HW cache, it's
+ * still masked anyway, the cache will be updated when unmasked
+ * and configured.
+ */
+ ive = xive_get_ive(x, girq);
+ if (!ive) {
+ bitmap_clr_bit(*x->ipi_alloc_map, idx);
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+ ive->w = xive_set_field64(IVE_VALID, 0ul, 1) |
+ xive_set_field64(IVE_MASKED, 0ul, 1) |
+ xive_set_field64(IVE_EQ_DATA, 0ul, girq);
+ unlock(&x->lock);
+
+ return girq;
+}
+
+static int64_t opal_xive_allocate_irq(uint32_t chip_id)
+{
+ struct proc_chip *chip;
+ bool try_all = false;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (chip_id == OPAL_XIVE_ANY_CHIP) {
+ try_all = true;
+ chip_id = this_cpu()->chip_id;
+ }
+ chip = get_chip(chip_id);
+ if (!chip)
+ return OPAL_PARAMETER;
+
+ /* Try initial target chip */
+ if (!chip->xive)
+ rc = OPAL_PARAMETER;
+ else
+ rc = xive_try_allocate_irq(chip->xive);
+ if (rc >= 0 || !try_all)
+ return rc;
+
+ /* Failed and we try all... do so */
+ for_each_chip(chip) {
+ if (!chip->xive)
+ continue;
+ rc = xive_try_allocate_irq(chip->xive);
+ if (rc >= 0)
+ break;
+ }
+ return rc;
+}
+
+static int64_t opal_xive_free_irq(uint32_t girq)
+{
+ struct irq_source *is = irq_find_source(girq);
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ struct xive *x = xive_from_isn(girq);
+ struct xive_ive *ive;
+ uint32_t idx;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (!x || !is)
+ return OPAL_PARAMETER;
+
+ idx = GIRQ_TO_IDX(girq);
+
+ lock(&x->lock);
+
+ ive = xive_get_ive(x, girq);
+ if (!ive) {
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+
+ /* Mask the interrupt source */
+ xive_update_irq_mask(s, girq - s->esb_base, true);
+
+ /* Mark the IVE masked and invalid */
+ ive->w = xive_set_field64(IVE_VALID, 0ul, 1) |
+ xive_set_field64(IVE_MASKED, 0ul, 1);
+ xive_ivc_scrub(x, x->block_id, idx);
+
+ /* Free it */
+ if (!bitmap_tst_bit(*x->ipi_alloc_map, idx)) {
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+ bitmap_clr_bit(*x->ipi_alloc_map, idx);
+ bitmap_clr_bit(*x->int_enabled_map, idx);
+ unlock(&x->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_dump_tm(uint32_t offset, const char *n, uint32_t pir)
+{
+ struct cpu_thread *c = find_cpu_by_pir(pir);
+ struct xive_cpu_state *xs;
+ struct xive *x;
+ void *ind_tm_base;
+ uint64_t v0,v1;
+
+ if (!c)
+ return OPAL_PARAMETER;
+ xs = c->xstate;
+ if (!xs || !xs->tm_ring1)
+ return OPAL_INTERNAL_ERROR;
+ x = xs->xive;
+ ind_tm_base = x->ic_base + (4 << x->ic_shift);
+
+ lock(&x->lock);
+
+ /* Setup indirect access to the corresponding thread */
+ xive_regw(x, PC_TCTXT_INDIR0,
+ PC_TCTXT_INDIR_VALID |
+ SETFIELD(PC_TCTXT_INDIR_THRDID, 0ull, pir & 0xff));
+
+ /* Workaround for HW issue: Need to read the above register
+ * back before doing the subsequent accesses
+ */
+ xive_regr(x, PC_TCTXT_INDIR0);
+
+ v0 = in_be64(ind_tm_base + offset);
+ if (offset == TM_QW3_HV_PHYS) {
+ v1 = in_8(ind_tm_base + offset + 8);
+ v1 <<= 56;
+ } else {
+ v1 = in_be32(ind_tm_base + offset + 8);
+ v1 <<= 32;
+ }
+ prlog(PR_INFO, "CPU[%04x]: TM state for QW %s\n", pir, n);
+ prlog(PR_INFO, "CPU[%04x]: NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
+ " W2 W3\n", pir);
+ prlog(PR_INFO, "CPU[%04x]: %02x %02x %02x %02x %02x "
+ "%02x %02x %02x %08x %08x\n", pir,
+ (uint8_t)(v0 >> 58) & 0xff, (uint8_t)(v0 >> 48) & 0xff,
+ (uint8_t)(v0 >> 40) & 0xff, (uint8_t)(v0 >> 32) & 0xff,
+ (uint8_t)(v0 >> 24) & 0xff, (uint8_t)(v0 >> 16) & 0xff,
+ (uint8_t)(v0 >> 8) & 0xff, (uint8_t)(v0 ) & 0xff,
+ (uint32_t)(v1 >> 32) & 0xffffffff,
+ (uint32_t)(v1 & 0xffffffff));
+
+
+ xive_regw(x, PC_TCTXT_INDIR0, 0);
+ unlock(&x->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_dump_vp(uint32_t vp_id)
+{
+ uint32_t blk, idx;
+ uint8_t order;
+ bool group;
+ struct xive *x;
+ struct xive_vp *vp;
+ uint32_t *vpw;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, &order, &group))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+ lock(&x->lock);
+
+ xive_vpc_scrub_clean(x, blk, idx);
+
+ vpw = ((uint32_t *)vp) + (group ? 8 : 0);
+ prlog(PR_INFO, "VP[%08x]: 0..3: %08x %08x %08x %08x\n", vp_id,
+ vpw[0], vpw[1], vpw[2], vpw[3]);
+ prlog(PR_INFO, "VP[%08x]: 4..7: %08x %08x %08x %08x\n", vp_id,
+ vpw[4], vpw[5], vpw[6], vpw[7]);
+ unlock(&x->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t __opal_xive_dump_emu(struct xive_cpu_state *xs, uint32_t pir)
+{
+ struct xive_eq *eq;
+ uint32_t ipi_target;
+ uint8_t *mm, pq;
+
+ prlog(PR_INFO, "CPU[%04x]: XIVE emulation state\n", pir);
+
+ prlog(PR_INFO, "CPU[%04x]: cppr=%02x mfrr=%02x pend=%02x"
+ " prev_cppr=%02x total_irqs=%llx\n", pir,
+ xs->cppr, xs->mfrr, xs->pending, xs->prev_cppr, xs->total_irqs);
+
+ prlog(PR_INFO, "CPU[%04x]: EQ IDX=%x MSK=%x G=%d [%08x %08x %08x > %08x %08x %08x %08x ...]\n",
+ pir, xs->eqptr, xs->eqmsk, xs->eqgen,
+ xs->eqbuf[(xs->eqptr - 3) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr - 2) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr - 1) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 0) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 1) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 2) & xs->eqmsk],
+ xs->eqbuf[(xs->eqptr + 3) & xs->eqmsk]);
+
+ mm = xs->xive->esb_mmio + GIRQ_TO_IDX(xs->ipi_irq) * XIVE_ESB_PAGE_SIZE;
+ pq = in_8(mm + 0x10800);
+ if (xive_get_irq_targetting(xs->ipi_irq, &ipi_target, NULL, NULL))
+ prlog(PR_INFO, "CPU[%04x]: IPI #%08x PQ=%x target=%08x\n",
+ pir, xs->ipi_irq, pq, ipi_target);
+ else
+ prlog(PR_INFO, "CPU[%04x]: IPI #%08x PQ=%x target=??\n",
+ pir, xs->ipi_irq, pq);
+
+
+
+ __xive_cache_scrub(xs->xive, xive_cache_eqc, xs->eq_blk,
+ xs->eq_idx + XIVE_EMULATION_PRIO,
+ false, false);
+ eq = xive_get_eq(xs->xive, xs->eq_idx + XIVE_EMULATION_PRIO);
+ prlog(PR_INFO, "CPU[%04x]: EQ @%p W0=%08x W1=%08x qbuf @%p\n",
+ pir, eq, be32_to_cpu(eq->w0), be32_to_cpu(eq->w1), xs->eqbuf);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_dump_emu(uint32_t pir)
+{
+ struct cpu_thread *c = find_cpu_by_pir(pir);
+ struct xive_cpu_state *xs;
+ int64_t rc;
+
+ if (!c)
+ return OPAL_PARAMETER;
+
+ xs = c->xstate;
+ if (!xs) {
+ prlog(PR_INFO, " <none>\n");
+ return OPAL_SUCCESS;
+ }
+ lock(&xs->lock);
+ rc = __opal_xive_dump_emu(xs, pir);
+ log_print(xs);
+ unlock(&xs->lock);
+
+ return rc;
+}
+
+static int64_t opal_xive_sync_irq_src(uint32_t girq)
+{
+ struct xive *x = xive_from_isn(girq);
+
+ if (!x)
+ return OPAL_PARAMETER;
+ return xive_sync(x);
+}
+
+static int64_t opal_xive_sync_irq_target(uint32_t girq)
+{
+ uint32_t target, vp_blk;
+ struct xive *x;
+
+ if (!xive_get_irq_targetting(girq, &target, NULL, NULL))
+ return OPAL_PARAMETER;
+ if (!xive_decode_vp(target, &vp_blk, NULL, NULL, NULL))
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(vp_blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ return xive_sync(x);
+}
+
+static int64_t opal_xive_sync(uint32_t type, uint32_t id)
+{
+	int64_t rc = OPAL_SUCCESS;
+
+ if (type & XIVE_SYNC_EAS)
+ rc = opal_xive_sync_irq_src(id);
+ if (rc)
+ return rc;
+ if (type & XIVE_SYNC_QUEUE)
+ rc = opal_xive_sync_irq_target(id);
+ if (rc)
+ return rc;
+
+ /* Add more ... */
+
+ return rc;
+}
+
+static int64_t opal_xive_dump(uint32_t type, uint32_t id)
+{
+ switch (type) {
+ case XIVE_DUMP_TM_HYP:
+ return opal_xive_dump_tm(TM_QW3_HV_PHYS, "PHYS", id);
+ case XIVE_DUMP_TM_POOL:
+ return opal_xive_dump_tm(TM_QW2_HV_POOL, "POOL", id);
+ case XIVE_DUMP_TM_OS:
+ return opal_xive_dump_tm(TM_QW1_OS, "OS ", id);
+ case XIVE_DUMP_TM_USER:
+ return opal_xive_dump_tm(TM_QW0_USER, "USER", id);
+ case XIVE_DUMP_VP:
+ return opal_xive_dump_vp(id);
+ case XIVE_DUMP_EMU_STATE:
+ return opal_xive_dump_emu(id);
+ default:
+ return OPAL_PARAMETER;
+ }
+}
+
+static void xive_init_globals(void)
+{
+ uint32_t i;
+
+ for (i = 0; i < XIVE_MAX_CHIPS; i++)
+ xive_block_to_chip[i] = XIVE_INVALID_CHIP;
+}
+
+void init_xive(void)
+{
+ struct dt_node *np;
+ struct proc_chip *chip;
+ struct cpu_thread *cpu;
+ struct xive *one_xive;
+ bool first = true;
+
+ /* Look for xive nodes and do basic inits */
+ dt_for_each_compatible(dt_root, np, "ibm,power9-xive-x") {
+ struct xive *x;
+
+ /* Initialize some global stuff */
+ if (first)
+ xive_init_globals();
+
+ /* Create/initialize the xive instance */
+ x = init_one_xive(np);
+ if (first)
+ one_xive = x;
+ first = false;
+ }
+ if (first)
+ return;
+
+ xive_mode = XIVE_MODE_EMU;
+
+ /* Init VP allocator */
+ xive_init_vp_allocator();
+
+ /* Create a device-tree node for Linux use */
+ xive_create_mmio_dt_node(one_xive);
+
+	/* Some inits must be done after all XIVEs have been created
+ * such as setting up the forwarding ports
+ */
+ for_each_chip(chip) {
+ if (chip->xive)
+ late_init_one_xive(chip->xive);
+ }
+
+ /* Initialize XICS emulation per-cpu structures */
+ for_each_present_cpu(cpu) {
+ xive_init_cpu(cpu);
+ }
+	/* Add the interrupts property to each CPU node */
+ for_each_present_cpu(cpu) {
+ if (cpu_is_thread0(cpu))
+ xive_init_cpu_properties(cpu);
+ }
+
+	/* Call in the boot CPU */
+ xive_cpu_callin(this_cpu());
+
+ /* Register XICS emulation calls */
+ opal_register(OPAL_INT_GET_XIRR, opal_xive_get_xirr, 2);
+ opal_register(OPAL_INT_SET_CPPR, opal_xive_set_cppr, 1);
+ opal_register(OPAL_INT_EOI, opal_xive_eoi, 1);
+ opal_register(OPAL_INT_SET_MFRR, opal_xive_set_mfrr, 2);
+
+ /* Register XIVE exploitation calls */
+ opal_register(OPAL_XIVE_RESET, opal_xive_reset, 1);
+ opal_register(OPAL_XIVE_GET_IRQ_INFO, opal_xive_get_irq_info, 6);
+ opal_register(OPAL_XIVE_GET_IRQ_CONFIG, opal_xive_get_irq_config, 4);
+ opal_register(OPAL_XIVE_SET_IRQ_CONFIG, opal_xive_set_irq_config, 4);
+ opal_register(OPAL_XIVE_GET_QUEUE_INFO, opal_xive_get_queue_info, 7);
+ opal_register(OPAL_XIVE_SET_QUEUE_INFO, opal_xive_set_queue_info, 5);
+ opal_register(OPAL_XIVE_DONATE_PAGE, opal_xive_donate_page, 2);
+ opal_register(OPAL_XIVE_ALLOCATE_IRQ, opal_xive_allocate_irq, 1);
+ opal_register(OPAL_XIVE_FREE_IRQ, opal_xive_free_irq, 1);
+ opal_register(OPAL_XIVE_ALLOCATE_VP_BLOCK, opal_xive_alloc_vp_block, 1);
+ opal_register(OPAL_XIVE_FREE_VP_BLOCK, opal_xive_free_vp_block, 1);
+ opal_register(OPAL_XIVE_GET_VP_INFO, opal_xive_get_vp_info, 5);
+ opal_register(OPAL_XIVE_SET_VP_INFO, opal_xive_set_vp_info, 3);
+ opal_register(OPAL_XIVE_SYNC, opal_xive_sync, 2);
+ opal_register(OPAL_XIVE_DUMP, opal_xive_dump, 2);
+ opal_register(OPAL_XIVE_GET_QUEUE_STATE, opal_xive_get_queue_state, 4);
+ opal_register(OPAL_XIVE_SET_QUEUE_STATE, opal_xive_set_queue_state, 4);
+ opal_register(OPAL_XIVE_GET_VP_STATE, opal_xive_get_vp_state, 2);
+}
+
diff --git a/roms/skiboot/hw/xive2.c b/roms/skiboot/hw/xive2.c
new file mode 100644
index 000000000..d5814bcbf
--- /dev/null
+++ b/roms/skiboot/hw/xive2.c
@@ -0,0 +1,4666 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * XIVE2: eXternal Interrupt Virtualization Engine. POWER10 interrupt
+ * controller
+ *
+ * Copyright (c) 2016-2019, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "XIVE: " fmt
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <chip.h>
+#include <io.h>
+#include <xive.h>
+#include <xive2-regs.h>
+#include <xscom-p10-regs.h>
+#include <interrupts.h>
+#include <timebase.h>
+#include <bitmap.h>
+#include <buddy.h>
+#include <phys-map.h>
+#include <p10_stop_api.H>
+
+
+/* Verbose debug */
+#undef XIVE_VERBOSE_DEBUG
+#undef DEBUG
+
+/* Extra debug options used in debug builds */
+#ifdef DEBUG
+#define XIVE_CHECK_LOCKS
+#define XIVE_DEBUG_INIT_CACHE_UPDATES
+#define XIVE_EXTRA_CHECK_INIT_CACHE
+#else
+#undef XIVE_CHECK_LOCKS
+#undef XIVE_DEBUG_INIT_CACHE_UPDATES
+#undef XIVE_EXTRA_CHECK_INIT_CACHE
+#endif
+
+/*
+ * VSDs, blocks, set translation etc...
+ *
+ * For the following data structures, the XIVE uses a mechanism called
+ * Virtualization Structure Tables (VST) to manage the memory layout
+ * and access: ESBs (Event State Buffers), EAS (Event assignment
+ * structures), ENDs (Event Notification Descriptors) and NVT/NVP
+ * (Notification Virtual Targets/Processors).
+ *
+ * These structures divide those tables into 16 "blocks". Each XIVE
+ * instance has a definition for all 16 blocks that can either represent
+ * an actual table in memory or a remote XIVE MMIO port to access a
+ * block that is owned by that remote XIVE.
+ *
+ * Our SW design will consist of allocating one block per chip (and thus
+ * per XIVE instance) for now, thus giving us up to 16 supported chips in
+ * the system. We may have to revisit that if we ever support systems with
+ * more than 16 chips, or if we want to do what pHyp does on some machines
+ * and dedicate 2 blocks per chip for some structures, but neither is on
+ * our radar at the moment.
+ *
+ * Thus we need to be careful that we never expose to Linux the concept
+ * of block and block boundaries, but instead we provide full number ranges
+ * so that consecutive blocks can be supported.
+ *
+ * Similarly, for MMIO access, the BARs support what is called "set
+ * translation" which allows the BAR to be divided into a certain
+ * number of sets. Each "set" can be routed to a specific block and
+ * offset within a block.
+ */
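+
+/*
+ * Illustration of the scheme above (not from the HW spec): on a 4-chip
+ * system, the XIVE on chip 2 owns block 2, i.e. its VSDs for block 2
+ * point at real tables in local memory, while its VSDs for blocks 0, 1
+ * and 3 are forwarding ports to the remote XIVEs owning those blocks.
+ */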
+
+#define XIVE_MAX_BLOCKS 16
+#define XIVE_VSD_SIZE 8
+
+/*
+ * Max number of ESBs. (direct table)
+ *
+ * The max number of ESBs supported in the P10 MMIO space is 1TB/128K: 8M.
+ *
+ * 1M is our current top limit of ESB entries and EAS entries
+ * pre-allocated per chip. That allocates 256KB per chip for the state
+ * bits and 8M per chip for the EAS.
+ */
+
+#define XIVE_INT_ORDER 20 /* 1M interrupts */
+#define XIVE_INT_COUNT (1ul << XIVE_INT_ORDER)
+
+/*
+ * First interrupt number, also the first logical interrupt number
+ * allocated by Linux (maximum ISA interrupt number + 1)
+ */
+#define XIVE_INT_FIRST 0x10
+
+/* Corresponding direct table sizes */
+#define XIVE_ESB_SIZE (XIVE_INT_COUNT / 4)
+#define XIVE_EAT_SIZE (XIVE_INT_COUNT * 8)
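+/* i.e. 2 bits of ESB state per interrupt (256KB for 1M interrupts) and
+ * one 8-byte EAS per interrupt (8MB), matching the per-chip figures
+ * given above.
+ */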
+
+/* Use 64K for everything by default */
+#define XIVE_ESB_SHIFT (16 + 1) /* trigger + mgmt pages */
+#define XIVE_ESB_PAGE_SIZE (1ul << XIVE_ESB_SHIFT) /* 2 pages */
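+/* i.e. each interrupt source gets a 64K trigger page plus a 64K
+ * management page, 128K of ESB MMIO space per source, which is where
+ * the 1TB/128K figure above comes from.
+ */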
+
+/*
+ * Max number of ENDs. (indirect table)
+ *
+ * The max number of ENDs supported in the P10 MMIO space is 2TB/128K: 16M.
+ * Since one END is 32 bytes, a 64K indirect subpage can hold 2K ENDs.
+ * We need 8192 subpages, ie, 64K of memory for the indirect table.
+ */
+#define END_PER_PAGE (PAGE_SIZE / sizeof(struct xive_end))
+
+#define XIVE_END_ORDER 23 /* 8M ENDs */
+#define XIVE_END_COUNT (1ul << XIVE_END_ORDER)
+#define XIVE_END_TABLE_SIZE ((XIVE_END_COUNT / END_PER_PAGE) * XIVE_VSD_SIZE)
+
+#define XIVE_END_SHIFT (16 + 1) /* ESn + ESe pages */
+
+/* One bit in the END bitmap per group of #priorities ENDs */
+#define xive_end_bitmap_size(x) (XIVE_END_COUNT >> xive_cfg_vp_prio_shift(x))
+
+/* Number of priorities (and thus ENDs) we allocate for each VP */
+#define xive_cfg_vp_prio_shift(x) GETFIELD(CQ_XIVE_CFG_VP_INT_PRIO, (x)->config)
+#define xive_cfg_vp_prio(x) (1 << xive_cfg_vp_prio_shift(x))
+
+/* Max priority number */
+#define xive_max_prio(x) (xive_cfg_vp_prio(x) - 1)
+
+/* Priority used for gather/silent escalation (KVM) */
+#define xive_escalation_prio(x) xive_max_prio(x)
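+
+/* For example, assuming the IC is configured for 8 interrupt priorities
+ * per VP (CQ_XIVE_CFG_VP_INT_PRIO field of 3): xive_cfg_vp_prio_shift()
+ * is 3, xive_cfg_vp_prio() is 8, xive_max_prio() is 7 and priority 7 is
+ * reserved as the escalation/gather priority.
+ */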
+
+/*
+ * Max number of VPs. (indirect table)
+ *
+ * The max number of NVPs we support in our MMIO space is 1TB/128K: 8M.
+ * Since one NVP is 32 bytes, a 64K indirect subpage can hold 2K NVPs.
+ * We need 4096 pointers, ie, 32K of memory for the indirect table.
+ *
+ * However, we use 8 priorities (by default) per NVP and the number of
+ * ENDs is configured to 8M. Therefore, our VP space is limited to 1M.
+ */
+#define VP_PER_PAGE (PAGE_SIZE / sizeof(struct xive_nvp))
+
+#define XIVE_VP_ORDER(x) (XIVE_END_ORDER - xive_cfg_vp_prio_shift(x))
+#define XIVE_VP_COUNT(x) (1ul << XIVE_VP_ORDER(x))
+#define XIVE_VP_TABLE_SIZE(x) ((XIVE_VP_COUNT(x) / VP_PER_PAGE) * XIVE_VSD_SIZE)
+
+#define XIVE_NVP_SHIFT 17 /* NVPG BAR: two pages, even NVP, odd NVG */
+
+/* VP Space maximums in Gen1 and Gen2 modes */
+#define VP_SHIFT_GEN1 19 /* in sync with END_W6_VP_OFFSET_GEN1 */
+#define VP_SHIFT_GEN2 24 /* in sync with END_W6_VP_OFFSET */
+
+/*
+ * VP ids for HW threads.
+ *
+ * Depends on the thread id bits configuration of the IC. 8bit is the
+ * default for P10 and 7bit for P9.
+ *
+ * These values are global because they should be common to all chips
+ */
+static uint32_t xive_threadid_shift;
+static uint32_t xive_hw_vp_base;
+static uint32_t xive_hw_vp_count;
+
+/*
+ * The XIVE operation mode indicates the active "API" and corresponds
+ * to the "version/mode" parameter of the opal_xive_reset() call
+ */
+static enum {
+ /* No XICS emulation */
+ XIVE_MODE_EXPL = OPAL_XIVE_MODE_EXPL, /* default */
+ XIVE_MODE_NONE,
+} xive_mode = XIVE_MODE_NONE;
+
+/*
+ * The XIVE exploitation mode options indicate the active features and
+ * are part of the mode parameter of the opal_xive_reset() call
+ */
+static uint64_t xive_expl_options;
+
+#define XIVE_EXPL_ALL_OPTIONS 0
+
+/*
+ * Each source controller has one of these. There's one embedded in
+ * the XIVE struct for IPIs
+ */
+struct xive_src {
+ struct irq_source is;
+ const struct irq_source_ops *orig_ops;
+ struct xive *xive;
+ void *esb_mmio;
+ uint32_t esb_base;
+ uint32_t esb_shift;
+ uint32_t flags;
+};
+
+struct xive_cpu_state {
+ struct xive *xive;
+ void *tm_ring1;
+
+ /* Base HW VP and associated queues */
+ uint32_t vp_blk;
+ uint32_t vp_idx;
+ uint32_t end_blk;
+ uint32_t end_idx; /* Base end index of a block of 8 */
+
+ struct lock lock;
+};
+
+enum xive_generation {
+ XIVE_GEN1 = 1, /* P9 compat mode */
+ XIVE_GEN2 = 2, /* P10 default */
+};
+
+enum xive_quirks {
+ /* HW527671 - 8bits Hardwired Thread Id range not implemented */
+ XIVE_QUIRK_THREADID_7BITS = 0x00000001,
+ /* HW542974 - interrupt command priority checker not working properly */
+ XIVE_QUIRK_BROKEN_PRIO_CHECK = 0x00000002,
+};
+
+struct xive {
+ uint32_t chip_id;
+ uint32_t block_id;
+ struct dt_node *x_node;
+
+ enum xive_generation generation;
+ uint64_t capabilities;
+ uint64_t config;
+
+ uint64_t xscom_base;
+
+ /* MMIO regions */
+ void *ic_base;
+ uint64_t ic_size;
+ uint32_t ic_shift;
+ void *ic_tm_direct_base;
+
+ void *tm_base;
+ uint64_t tm_size;
+ uint32_t tm_shift;
+ void *nvp_base;
+ uint64_t nvp_size;
+ void *esb_base;
+ uint64_t esb_size;
+ void *end_base;
+ uint64_t end_size;
+
+ /* Set on XSCOM register access error */
+ bool last_reg_error;
+
+ /* Per-XIVE mutex */
+ struct lock lock;
+
+ /* Pre-allocated tables.
+ *
+	 * We set up all the VSDs for actual tables (ie, as opposed to
+ * forwarding ports) as either direct pre-allocated or indirect
+ * and partially populated.
+ *
+ * Currently, the ESB and the EAS tables are direct and fully
+ * pre-allocated based on XIVE_INT_COUNT.
+ *
+ * The other tables are indirect, we thus pre-allocate the indirect
+ * table (ie, pages of pointers) and populate enough of the pages
+ * for our basic setup using 64K subpages.
+ *
+ * The size of the indirect tables are driven by XIVE_VP_COUNT
+ * and XIVE_END_COUNT. The number of pre-allocated ones are
+ * driven by xive_hw_vp_count for the HW threads. The number
+ * of END depends on number of VP.
+ */
+
+ /* Direct SBE and EAT tables */
+ void *sbe_base;
+ void *eat_base;
+
+ /* Indirect END table. NULL entries are unallocated, count is
+	 * the number of pointers (ie, sub page placeholders).
+ */
+ beint64_t *end_ind_base;
+ uint32_t end_ind_count;
+ uint64_t end_ind_size;
+
+ /* END allocation bitmap. Each bit represent #priority ENDs */
+ bitmap_t *end_map;
+
+ /* Indirect NVT/VP table. NULL entries are unallocated, count is
+	 * the number of pointers (ie, sub page placeholders).
+ */
+ beint64_t *vp_ind_base;
+ uint32_t vp_ind_count;
+ uint64_t vp_ind_size;
+
+ /* VP space size. Depends on Gen1/2 mode */
+ uint32_t vp_shift;
+
+ /* Pool of donated pages for provisioning indirect END and VP pages */
+ struct list_head donated_pages;
+
+ /* To ease a possible change to supporting more than one block of
+ * interrupts per chip, we store here the "base" global number
+ * and max number of interrupts for this chip. The global number
+	 * encompasses the block number and index.
+ */
+ uint32_t int_base;
+ uint32_t int_count;
+
+ /* Due to the overlap between IPIs and HW sources in the EAS table,
+ * we keep some kind of top-down allocator. It is used for HW sources
+ * to "allocate" interrupt entries and will limit what can be handed
+ * out as IPIs. Of course this assumes we "allocate" all HW sources
+ * before we start handing out IPIs.
+ *
+ * Note: The numbers here are global interrupt numbers so that we can
+ * potentially handle more than one block per chip in the future.
+ */
+ uint32_t int_hw_bot; /* Bottom of HW allocation */
+ uint32_t int_ipi_top; /* Highest IPI handed out so far + 1 */
+
+ /* The IPI allocation bitmap */
+ bitmap_t *ipi_alloc_map;
+
+ /* We keep track of which interrupts were ever enabled to
+ * speed up xive_reset
+ */
+ bitmap_t *int_enabled_map;
+
+ /* Embedded source IPIs */
+ struct xive_src ipis;
+
+ /* Embedded escalation interrupts */
+ struct xive_src esc_irqs;
+
+ /* In memory queue overflow */
+ void *q_ovf;
+
+ /* Cache/sync injection */
+ uint64_t sync_inject_size;
+ void *sync_inject;
+
+ /* INT HW Errata */
+ uint64_t quirks;
+};
+
+/* First XIVE unit configured on the system */
+static struct xive *one_xive;
+
+/* Global DT node */
+static struct dt_node *xive_dt_node;
+
+/* Block <-> Chip conversions.
+ *
+ * As chipIDs may not be within the range of 16 block IDs supported by XIVE,
+ * we have a 2 way conversion scheme.
+ *
+ * From block to chip, use the global table below.
+ *
+ * From chip to block, a field in struct proc_chip contains the first block
+ * of that chip. For now we only support one block per chip but that might
+ * change in the future
+ */
+#define XIVE_INVALID_CHIP 0xffffffff
+#define XIVE_MAX_CHIPS 16
+static uint32_t xive_block_to_chip[XIVE_MAX_CHIPS];
+static uint32_t xive_block_count;
+
+static uint32_t xive_chip_to_block(uint32_t chip_id)
+{
+ struct proc_chip *c = get_chip(chip_id);
+
+ assert(c);
+ assert(c->xive);
+ return c->xive->block_id;
+}
+
+/*
+ * Conversion between GIRQ and block/index.
+ *
+ * ------------------------------------
+ * |000E|BLOC| INDEX|
+ * ------------------------------------
+ * 4 4 24
+ *
+ * the E bit indicates that this is an escalation interrupt; in
+ * that case, the BLOC/INDEX represents the END containing the
+ * corresponding escalation descriptor.
+ *
+ * Global interrupt numbers for non-escalation interrupts are thus
+ * limited to 28 bits.
+ */
+
+#define INT_SHIFT 24
+#define INT_ESC_SHIFT (INT_SHIFT + 4) /* 4bits block id */
+
+#if XIVE_INT_ORDER > INT_SHIFT
+#error "Too many ESBs for IRQ encoding"
+#endif
+
+#if XIVE_END_ORDER > INT_SHIFT
+#error "Too many ENDs for escalation IRQ number encoding"
+#endif
+
+#define GIRQ_TO_BLK(__g) (((__g) >> INT_SHIFT) & 0xf)
+#define GIRQ_TO_IDX(__g) ((__g) & ((1 << INT_SHIFT) - 1))
+#define BLKIDX_TO_GIRQ(__b,__i) (((uint32_t)(__b)) << INT_SHIFT | (__i))
+
+#define GIRQ_IS_ESCALATION(__g) ((__g) & (1 << INT_ESC_SHIFT))
+#define MAKE_ESCALATION_GIRQ(__b,__i)(BLKIDX_TO_GIRQ(__b,__i) | (1 << INT_ESC_SHIFT))
+
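+/* Worked example of the encoding above: BLKIDX_TO_GIRQ(2, 0x123) gives
+ * GIRQ 0x02000123; GIRQ_TO_BLK() recovers 2 and GIRQ_TO_IDX() recovers
+ * 0x123; MAKE_ESCALATION_GIRQ(2, 0x123) sets the E bit and yields
+ * 0x12000123.
+ */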
+
+/* Block/IRQ to chip# conversions */
+#define PC_BLK_TO_CHIP(__b) (xive_block_to_chip[__b])
+#define VC_BLK_TO_CHIP(__b) (xive_block_to_chip[__b])
+#define GIRQ_TO_CHIP(__isn) (VC_BLK_TO_CHIP(GIRQ_TO_BLK(__isn)))
+
+/* Routing of physical processors to VPs */
+#define PIR2VP_IDX( __pir) (xive_hw_vp_base | P10_PIR2LOCALCPU(__pir))
+#define PIR2VP_BLK(__pir) (xive_chip_to_block(P10_PIR2GCID(__pir)))
+#define VP2PIR(__blk, __idx) (P10_PIRFROMLOCALCPU(VC_BLK_TO_CHIP(__blk), (__idx) & 0xff))
+
+/* Decoding of OPAL API VP IDs. The VP IDs are encoded as follows
+ *
+ * Block group mode:
+ *
+ * -----------------------------------
+ * |GVEOOOOO| INDEX|
+ * -----------------------------------
+ * || |
+ * || Order
+ * |Virtual
+ * Group
+ *
+ * G (Group) : Set to 1 for a group VP (not currently supported)
+ * V (Virtual) : Set to 1 for an allocated VP (vs. a physical processor ID)
+ * E (Error) : Should never be 1, used internally for errors
+ * O (Order) : Allocation order of the VP block
+ *
+ * The conversion is thus done as follows (groups aren't implemented yet)
+ *
+ * If V=0, O must be 0 and 24-bit INDEX value is the PIR
+ * If V=1, the order O group is allocated such that if N is the number of
+ * chip bits considered for allocation (*)
+ * then the INDEX is constructed as follows (bit numbers such as 0=LSB)
+ * - bottom O-N bits is the index within the "VP block"
+ * - next N bits is the XIVE blockID of the VP
+ * - the remaining bits is the per-chip "base"
+ * so the conversion consists of "extracting" the block ID and moving
+ * down the upper bits by N bits.
+ *
+ * In non-block-group mode, the difference is that the blockID is
+ * on the left of the index (the entire VP block is in a single
+ * block ID)
+ */
+
+#define VP_GROUP_SHIFT 31
+#define VP_VIRTUAL_SHIFT 30
+#define VP_ERROR_SHIFT 29
+#define VP_ORDER_SHIFT 24
+
+#define vp_group(vp) (((vp) >> VP_GROUP_SHIFT) & 1)
+#define vp_virtual(vp) (((vp) >> VP_VIRTUAL_SHIFT) & 1)
+#define vp_order(vp) (((vp) >> VP_ORDER_SHIFT) & 0x1f)
+#define vp_index(vp) ((vp) & ((1 << VP_ORDER_SHIFT) - 1))
+
+/* VP allocation */
+static uint32_t xive_chips_alloc_bits = 0;
+static struct buddy *xive_vp_buddy;
+static struct lock xive_buddy_lock = LOCK_UNLOCKED;
+
+/* VP# decoding/encoding */
+static bool xive_decode_vp(uint32_t vp, uint32_t *blk, uint32_t *idx,
+ uint8_t *order, bool *group)
+{
+ uint32_t o = vp_order(vp);
+ uint32_t n = xive_chips_alloc_bits;
+ uint32_t index = vp_index(vp);
+ uint32_t imask = (1 << (o - n)) - 1;
+
+ /* Groups not supported yet */
+ if (vp_group(vp))
+ return false;
+ if (group)
+ *group = false;
+
+ /* PIR case */
+ if (!vp_virtual(vp)) {
+ if (find_cpu_by_pir(index) == NULL)
+ return false;
+ if (blk)
+ *blk = PIR2VP_BLK(index);
+ if (idx)
+ *idx = PIR2VP_IDX(index);
+ return true;
+ }
+
+ /* Ensure o > n, we have *at least* 2 VPs per block */
+ if (o <= n)
+ return false;
+
+ /* Combine the index base and index */
+ if (idx)
+ *idx = ((index >> n) & ~imask) | (index & imask);
+ /* Extract block ID */
+ if (blk)
+ *blk = (index >> (o - n)) & ((1 << n) - 1);
+
+ /* Return order as well if asked for */
+ if (order)
+ *order = o;
+
+ return true;
+}
+
+static uint32_t xive_encode_vp(uint32_t blk, uint32_t idx, uint32_t order)
+{
+ uint32_t vp = (1 << VP_VIRTUAL_SHIFT) | (order << VP_ORDER_SHIFT);
+ uint32_t n = xive_chips_alloc_bits;
+ uint32_t imask = (1 << (order - n)) - 1;
+
+ vp |= (idx & ~imask) << n;
+ vp |= blk << (order - n);
+ vp |= idx & imask;
+ return vp;
+}
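+
+/*
+ * Worked example of the VP id encoding (assuming a hypothetical 2-chip
+ * system, i.e. xive_chips_alloc_bits == 1): xive_encode_vp(1, 0x12, 5)
+ * packs the within-block index (0x2), the block id (1) and the per-chip
+ * base (0x1) into index 0x32, giving VP id 0x45000032; xive_decode_vp()
+ * on that value recovers blk=1, idx=0x12 and order=5.
+ */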
+
+/*
+ * XSCOM/MMIO helpers
+ */
+#define XIVE_NO_MMIO -1
+
+#define xive_regw(__x, __r, __v) \
+ __xive_regw(__x, __r, X_##__r, __v, #__r)
+#define xive_regr(__x, __r) \
+ __xive_regr(__x, __r, X_##__r, #__r)
+#define xive_regwx(__x, __r, __v) \
+ __xive_regw(__x, XIVE_NO_MMIO, X_##__r, __v, #__r)
+#define xive_regrx(__x, __r) \
+ __xive_regr(__x, XIVE_NO_MMIO, X_##__r, #__r)
+
+#ifdef XIVE_VERBOSE_DEBUG
+#define xive_vdbg(__x,__fmt,...) prlog(PR_DEBUG,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_vdbg(__c,__fmt,...) prlog(PR_DEBUG,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#else
+#define xive_vdbg(x,fmt,...) do { } while(0)
+#define xive_cpu_vdbg(x,fmt,...) do { } while(0)
+#endif
+
+#define xive_dbg(__x,__fmt,...) prlog(PR_DEBUG,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_dbg(__c,__fmt,...) prlog(PR_DEBUG,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#define xive_notice(__x,__fmt,...) prlog(PR_NOTICE,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_notice(__c,__fmt,...) prlog(PR_NOTICE,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#define xive_warn(__x,__fmt,...) prlog(PR_WARNING,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_warn(__c,__fmt,...) prlog(PR_WARNING,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+#define xive_err(__x,__fmt,...) prlog(PR_ERR,"[ IC %02x ] " __fmt, (__x)->chip_id, ##__VA_ARGS__)
+#define xive_cpu_err(__c,__fmt,...) prlog(PR_ERR,"[CPU %04x] " __fmt, (__c)->pir, ##__VA_ARGS__)
+
+/*
+ * The XIVE subengine being accessed can be deduced from the XSCOM
+ * reg, and from there, the page offset in the IC BAR.
+ */
+static void* xive_ic_page(struct xive *x, uint32_t x_reg)
+{
+ uint64_t pgoff = (x_reg >> 8) & 0x3;
+
+ return x->ic_base + (pgoff << x->ic_shift);
+}
+
+static void __xive_regw(struct xive *x, uint32_t m_reg, uint32_t x_reg, uint64_t v,
+ const char *rname)
+{
+ bool use_xscom = (m_reg == XIVE_NO_MMIO) || !x->ic_base;
+ int64_t rc;
+
+ x->last_reg_error = false;
+
+ assert(x_reg != 0);
+
+ if (use_xscom) {
+ rc = xscom_write(x->chip_id, x->xscom_base + x_reg, v);
+ if (rc) {
+ if (!rname)
+ rname = "???";
+ xive_err(x, "Error writing register %s\n", rname);
+ /* Anything else we can do here ? */
+ x->last_reg_error = true;
+ }
+ } else {
+ out_be64(xive_ic_page(x, x_reg) + m_reg, v);
+ }
+}
+
+static uint64_t __xive_regr(struct xive *x, uint32_t m_reg, uint32_t x_reg,
+ const char *rname)
+{
+ bool use_xscom = (m_reg == XIVE_NO_MMIO) || !x->ic_base;
+ int64_t rc;
+ uint64_t val;
+
+ x->last_reg_error = false;
+
+ assert(x_reg != 0);
+
+ if (use_xscom) {
+ rc = xscom_read(x->chip_id, x->xscom_base + x_reg, &val);
+ if (rc) {
+ if (!rname)
+ rname = "???";
+ xive_err(x, "Error reading register %s\n", rname);
+ /* Anything else we can do here ? */
+ x->last_reg_error = true;
+ return -1ull;
+ }
+ } else {
+ val = in_be64(xive_ic_page(x, x_reg) + m_reg);
+ }
+ return val;
+}
+
+/* Locate a controller from an IRQ number */
+static struct xive *xive_from_isn(uint32_t isn)
+{
+ uint32_t chip_id = GIRQ_TO_CHIP(isn);
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c)
+ return NULL;
+ return c->xive;
+}
+
+static struct xive *xive_from_pc_blk(uint32_t blk)
+{
+ uint32_t chip_id = PC_BLK_TO_CHIP(blk);
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c)
+ return NULL;
+ return c->xive;
+}
+
+static struct xive *xive_from_vc_blk(uint32_t blk)
+{
+ uint32_t chip_id = VC_BLK_TO_CHIP(blk);
+ struct proc_chip *c = get_chip(chip_id);
+
+ if (!c)
+ return NULL;
+ return c->xive;
+}
+
+static struct xive_end *xive_get_end(struct xive *x, unsigned int idx)
+{
+ struct xive_end *p;
+
+ if (idx >= (x->end_ind_count * END_PER_PAGE))
+ return NULL;
+ p = (struct xive_end *)(be64_to_cpu(x->end_ind_base[idx / END_PER_PAGE]) &
+ VSD_ADDRESS_MASK);
+ if (!p)
+ return NULL;
+
+ return &p[idx % END_PER_PAGE];
+}
+
+static struct xive_eas *xive_get_eas(struct xive *x, unsigned int isn)
+{
+ struct xive_eas *eat;
+ uint32_t idx = GIRQ_TO_IDX(isn);
+
+ if (GIRQ_IS_ESCALATION(isn)) {
+		/* Alright, an escalation EAS is buried inside an END, let's
+ * try to find it
+ */
+ struct xive_end *end;
+
+ if (x->chip_id != VC_BLK_TO_CHIP(GIRQ_TO_BLK(isn))) {
+ xive_err(x, "%s, ESC ISN 0x%x not on right chip\n",
+ __func__, isn);
+ return NULL;
+ }
+ end = xive_get_end(x, idx);
+ if (!end) {
+ xive_err(x, "%s, ESC ISN 0x%x END not found\n",
+ __func__, isn);
+ return NULL;
+ }
+
+ /* If using single-escalation, don't let anybody get
+ * to the individual escalation interrupts
+ */
+ if (xive_get_field32(END_W0_UNCOND_ESCALATE, end->w0))
+ return NULL;
+
+ /* Grab the escalation END */
+ return (struct xive_eas *)(char *)&end->w4;
+ } else {
+ /* Check the block matches */
+ if (isn < x->int_base || isn >= x->int_count) {
+ xive_err(x, "%s, ISN 0x%x not on right chip\n",
+ __func__, isn);
+ return NULL;
+ }
+ assert (idx < XIVE_INT_COUNT);
+
+ /* If we support >1 block per chip, this should still
+ * work as we are likely to make the table contiguous
+ * anyway
+ */
+ eat = x->eat_base;
+ assert(eat);
+
+ return eat + idx;
+ }
+}
+
+static struct xive_nvp *xive_get_vp(struct xive *x, unsigned int idx)
+{
+ struct xive_nvp *p;
+
+ assert(idx < (x->vp_ind_count * VP_PER_PAGE));
+ p = (struct xive_nvp *)(be64_to_cpu(x->vp_ind_base[idx / VP_PER_PAGE]) &
+ VSD_ADDRESS_MASK);
+ if (!p)
+ return NULL;
+
+ return &p[idx % VP_PER_PAGE];
+}
+
+/*
+ * Store the END base of the VP in W5, using the new architected field
+ * in P10. Used to be the pressure relief interrupt field on P9.
+ */
+static void xive_vp_set_end_base(struct xive_nvp *vp,
+ uint32_t end_blk, uint32_t end_idx)
+{
+ vp->w5 = xive_set_field32(NVP_W5_VP_END_BLOCK, 0, end_blk) |
+ xive_set_field32(NVP_W5_VP_END_INDEX, 0, end_idx);
+
+	/* This is the criterion used to know whether a VP was allocated */
+ assert(vp->w5 != 0);
+}
+
+static void xive_init_default_vp(struct xive_nvp *vp,
+ uint32_t end_blk, uint32_t end_idx)
+{
+ memset(vp, 0, sizeof(struct xive_nvp));
+
+ xive_vp_set_end_base(vp, end_blk, end_idx);
+
+ vp->w0 = xive_set_field32(NVP_W0_VALID, 0, 1);
+}
+
+/*
+ * VPs of the HW threads have their own set of ENDs which is allocated
+ * when XIVE is initialized. These are tagged with a FIRMWARE bit so
+ * that they can be identified when the driver is reset (kexec).
+ */
+static void xive_init_hw_end(struct xive_end *end)
+{
+ memset(end, 0, sizeof(struct xive_end));
+ end->w0 = xive_set_field32(END_W0_FIRMWARE1, 0, 1);
+}
+
+static void *xive_get_donated_page(struct xive *x)
+{
+ return (void *)list_pop_(&x->donated_pages, 0);
+}
+
+#define XIVE_ALLOC_IS_ERR(_idx) ((_idx) >= 0xfffffff0)
+
+#define XIVE_ALLOC_NO_SPACE 0xffffffff /* No possible space */
+#define XIVE_ALLOC_NO_IND 0xfffffffe /* Indirect need provisioning */
+#define XIVE_ALLOC_NO_MEM 0xfffffffd /* Local allocation failed */
+
+static uint32_t xive_alloc_end_set(struct xive *x, bool alloc_indirect)
+{
+ uint32_t ind_idx;
+ int idx;
+ int end_base_idx;
+
+ xive_vdbg(x, "Allocating END set...\n");
+
+ assert(x->end_map);
+
+	/* Allocate from the END bitmap. Each bit covers #priorities ENDs */
+ idx = bitmap_find_zero_bit(*x->end_map, 0, xive_end_bitmap_size(x));
+ if (idx < 0) {
+ xive_dbg(x, "Allocation from END bitmap failed !\n");
+ return XIVE_ALLOC_NO_SPACE;
+ }
+
+ end_base_idx = idx << xive_cfg_vp_prio_shift(x);
+
+ xive_vdbg(x, "Got ENDs 0x%x..0x%x\n", end_base_idx,
+ end_base_idx + xive_max_prio(x));
+
+ /* Calculate the indirect page where the ENDs reside */
+ ind_idx = end_base_idx / END_PER_PAGE;
+
+ /* Is there an indirect page ? If not, check if we can provision it */
+ if (!x->end_ind_base[ind_idx]) {
+ /* Default flags */
+ uint64_t vsd_flags = SETFIELD(VSD_TSIZE, 0ull, 4) |
+ SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+ void *page;
+
+		/* If alloc_indirect is set, allocate the memory from OPAL's
+		 * own pool, otherwise try to provision from the donated pool
+ */
+ if (alloc_indirect) {
+ /* Allocate/provision indirect page during boot only */
+ xive_vdbg(x, "Indirect empty, provisioning from local pool\n");
+ page = local_alloc(x->chip_id, PAGE_SIZE, PAGE_SIZE);
+ if (!page) {
+ xive_dbg(x, "provisioning failed !\n");
+ return XIVE_ALLOC_NO_MEM;
+ }
+ vsd_flags |= VSD_FIRMWARE;
+ } else {
+ xive_vdbg(x, "Indirect empty, provisioning from donated pages\n");
+ page = xive_get_donated_page(x);
+ if (!page) {
+				xive_vdbg(x, "no indirect pages available!\n");
+ return XIVE_ALLOC_NO_IND;
+ }
+ }
+ memset(page, 0, PAGE_SIZE);
+ x->end_ind_base[ind_idx] = cpu_to_be64(vsd_flags |
+ (((uint64_t)page) & VSD_ADDRESS_MASK));
+ /* Any cache scrub needed ? */
+ }
+
+ bitmap_set_bit(*x->end_map, idx);
+ return end_base_idx;
+}
+
+static void xive_free_end_set(struct xive *x, uint32_t ends)
+{
+ uint32_t idx;
+ uint8_t prio_mask = xive_max_prio(x);
+
+ xive_vdbg(x, "Freeing END 0x%x..0x%x\n", ends, ends + xive_max_prio(x));
+
+ assert((ends & prio_mask) == 0);
+ assert(x->end_map);
+
+ idx = ends >> xive_cfg_vp_prio_shift(x);
+ bitmap_clr_bit(*x->end_map, idx);
+}
+
+static bool xive_provision_vp_ind(struct xive *x, uint32_t vp_idx, uint32_t order)
+{
+ uint32_t pbase, pend, i;
+
+ pbase = vp_idx / VP_PER_PAGE;
+ pend = (vp_idx + (1 << order)) / VP_PER_PAGE;
+
+ for (i = pbase; i <= pend; i++) {
+ void *page;
+ u64 vsd;
+
+ /* Already provisioned ? */
+ if (x->vp_ind_base[i])
+ continue;
+
+ /* Try to grab a donated page */
+ page = xive_get_donated_page(x);
+ if (!page)
+ return false;
+
+ /* Install the page */
+ memset(page, 0, PAGE_SIZE);
+ vsd = ((uint64_t)page) & VSD_ADDRESS_MASK;
+ vsd |= SETFIELD(VSD_TSIZE, 0ull, 4);
+ vsd |= SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+ x->vp_ind_base[i] = cpu_to_be64(vsd);
+ }
+ return true;
+}
+
+static void xive_init_vp_allocator(void)
+{
+ /* Initialize chip alloc bits */
+ xive_chips_alloc_bits = ilog2(xive_block_count);
+
+ prlog(PR_INFO, "%d chips considered for VP allocations\n",
+ 1 << xive_chips_alloc_bits);
+
+ /* Allocate a buddy big enough for XIVE_VP_ORDER allocations.
+ *
+	 * Each bit in the buddy represents 1 << xive_chips_alloc_bits
+ * VPs.
+ */
+ xive_vp_buddy = buddy_create(XIVE_VP_ORDER(one_xive));
+ assert(xive_vp_buddy);
+
+ /*
+ * We reserve the whole range of VP ids representing HW threads.
+ */
+ assert(buddy_reserve(xive_vp_buddy, xive_hw_vp_base,
+ xive_threadid_shift));
+}
+
+static uint32_t xive_alloc_vps(uint32_t order)
+{
+ uint32_t local_order, i;
+ int vp;
+
+ /* The minimum order is 2 VPs per chip */
+ if (order < (xive_chips_alloc_bits + 1))
+ order = xive_chips_alloc_bits + 1;
+
+ /* We split the allocation */
+ local_order = order - xive_chips_alloc_bits;
+
+ /* We grab that in the global buddy */
+ assert(xive_vp_buddy);
+ lock(&xive_buddy_lock);
+ vp = buddy_alloc(xive_vp_buddy, local_order);
+ unlock(&xive_buddy_lock);
+ if (vp < 0)
+ return XIVE_ALLOC_NO_SPACE;
+
+ /* Provision on every chip considered for allocation */
+ for (i = 0; i < (1 << xive_chips_alloc_bits); i++) {
+ struct xive *x = xive_from_pc_blk(i);
+ bool success;
+
+ /* Return internal error & log rather than assert ? */
+ assert(x);
+ lock(&x->lock);
+ success = xive_provision_vp_ind(x, vp, local_order);
+ unlock(&x->lock);
+ if (!success) {
+ lock(&xive_buddy_lock);
+ buddy_free(xive_vp_buddy, vp, local_order);
+ unlock(&xive_buddy_lock);
+ return XIVE_ALLOC_NO_IND;
+ }
+ }
+
+ /* Encode the VP number. "blk" is 0 as this represents
+ * all blocks and the allocation always starts at 0
+ */
+ return xive_encode_vp(0, vp, order);
+}
+
+static void xive_free_vps(uint32_t vp)
+{
+ uint32_t idx;
+ uint8_t order, local_order;
+
+ assert(xive_decode_vp(vp, NULL, &idx, &order, NULL));
+
+ /* We split the allocation */
+ local_order = order - xive_chips_alloc_bits;
+
+ /* Free that in the buddy */
+ lock(&xive_buddy_lock);
+ buddy_free(xive_vp_buddy, idx, local_order);
+ unlock(&xive_buddy_lock);
+}
+
+enum xive_cache_type {
+ xive_cache_easc,
+ xive_cache_esbc,
+ xive_cache_endc,
+ xive_cache_nxc,
+};
+
+/*
+ * Cache update
+ */
+
+#define FLUSH_CTRL_POLL_VALID PPC_BIT(0) /* POLL bit is the same for all */
+
+static int64_t __xive_cache_scrub(struct xive *x,
+ enum xive_cache_type ctype,
+ uint64_t block, uint64_t idx,
+ bool want_inval __unused, bool want_disable __unused)
+{
+ uint64_t ctrl_reg, x_ctrl_reg;
+ uint64_t poll_val, ctrl_val;
+
+#ifdef XIVE_CHECK_LOCKS
+ assert(lock_held_by_me(&x->lock));
+#endif
+ switch (ctype) {
+ case xive_cache_easc:
+ poll_val =
+ SETFIELD(VC_EASC_FLUSH_POLL_BLOCK_ID, 0ll, block) |
+ SETFIELD(VC_EASC_FLUSH_POLL_OFFSET, 0ll, idx) |
+ VC_EASC_FLUSH_POLL_BLOCK_ID_MASK |
+ VC_EASC_FLUSH_POLL_OFFSET_MASK;
+ xive_regw(x, VC_EASC_FLUSH_POLL, poll_val);
+ ctrl_reg = VC_EASC_FLUSH_CTRL;
+ x_ctrl_reg = X_VC_EASC_FLUSH_CTRL;
+ break;
+ case xive_cache_esbc:
+ poll_val =
+ SETFIELD(VC_ESBC_FLUSH_POLL_BLOCK_ID, 0ll, block) |
+ SETFIELD(VC_ESBC_FLUSH_POLL_OFFSET, 0ll, idx) |
+ VC_ESBC_FLUSH_POLL_BLOCK_ID_MASK |
+ VC_ESBC_FLUSH_POLL_OFFSET_MASK;
+ xive_regw(x, VC_ESBC_FLUSH_POLL, poll_val);
+ ctrl_reg = VC_ESBC_FLUSH_CTRL;
+ x_ctrl_reg = X_VC_ESBC_FLUSH_CTRL;
+ break;
+ case xive_cache_endc:
+ poll_val =
+ SETFIELD(VC_ENDC_FLUSH_POLL_BLOCK_ID, 0ll, block) |
+ SETFIELD(VC_ENDC_FLUSH_POLL_OFFSET, 0ll, idx) |
+ VC_ENDC_FLUSH_POLL_BLOCK_ID_MASK |
+ VC_ENDC_FLUSH_POLL_OFFSET_MASK;
+ xive_regw(x, VC_ENDC_FLUSH_POLL, poll_val);
+ ctrl_reg = VC_ENDC_FLUSH_CTRL;
+ x_ctrl_reg = X_VC_ENDC_FLUSH_CTRL;
+ break;
+ case xive_cache_nxc:
+ poll_val =
+ SETFIELD(PC_NXC_FLUSH_POLL_BLOCK_ID, 0ll, block) |
+ SETFIELD(PC_NXC_FLUSH_POLL_OFFSET, 0ll, idx) |
+ PC_NXC_FLUSH_POLL_BLOCK_ID_MASK |
+ PC_NXC_FLUSH_POLL_OFFSET_MASK;
+ xive_regw(x, PC_NXC_FLUSH_POLL, poll_val);
+ ctrl_reg = PC_NXC_FLUSH_CTRL;
+ x_ctrl_reg = X_PC_NXC_FLUSH_CTRL;
+ break;
+ default:
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ /* XXX Add timeout !!! */
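+ /*
+ * A bounded variant (sketch only, not wired in) could mirror the
+ * retry counter used by the cache watch below, with an arbitrary
+ * maximum:
+ *
+ * int retries = 0;
+ * while (__xive_regr(x, ctrl_reg, x_ctrl_reg, NULL) &
+ * FLUSH_CTRL_POLL_VALID) {
+ * if (++retries == 1000)
+ * return OPAL_BUSY;
+ * time_wait(100);
+ * }
+ */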
+ for (;;) {
+ ctrl_val = __xive_regr(x, ctrl_reg, x_ctrl_reg, NULL);
+ if (!(ctrl_val & FLUSH_CTRL_POLL_VALID))
+ break;
+ /* Small delay */
+ time_wait(100);
+ }
+ sync();
+ return 0;
+}
+
+static int64_t xive_easc_scrub(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_easc, block, idx, false, false);
+}
+
+static int64_t xive_nxc_scrub(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_nxc, block, idx, false, false);
+}
+
+static int64_t xive_nxc_scrub_clean(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_nxc, block, idx, true, false);
+}
+
+static int64_t xive_endc_scrub(struct xive *x, uint64_t block, uint64_t idx)
+{
+ return __xive_cache_scrub(x, xive_cache_endc, block, idx, false, false);
+}
+
+#define XIVE_CACHE_WATCH_MAX_RETRIES 10
+
+static int64_t __xive_cache_watch(struct xive *x, enum xive_cache_type ctype,
+ uint64_t block, uint64_t idx,
+ uint32_t start_dword, uint32_t dword_count,
+ beint64_t *new_data, bool light_watch,
+ bool synchronous)
+{
+ uint64_t sreg, sregx, dreg0, dreg0x;
+ uint64_t dval0, sval, status;
+ int64_t i;
+ int retries = 0;
+
+#ifdef XIVE_CHECK_LOCKS
+ assert(lock_held_by_me(&x->lock));
+#endif
+ switch (ctype) {
+ case xive_cache_endc:
+ sreg = VC_ENDC_WATCH0_SPEC;
+ sregx = X_VC_ENDC_WATCH0_SPEC;
+ dreg0 = VC_ENDC_WATCH0_DATA0;
+ dreg0x = X_VC_ENDC_WATCH0_DATA0;
+ sval = SETFIELD(VC_ENDC_WATCH_BLOCK_ID, idx, block);
+ break;
+ case xive_cache_nxc:
+ sreg = PC_NXC_WATCH0_SPEC;
+ sregx = X_PC_NXC_WATCH0_SPEC;
+ dreg0 = PC_NXC_WATCH0_DATA0;
+ dreg0x = X_PC_NXC_WATCH0_DATA0;
+ sval = SETFIELD(PC_NXC_WATCH_BLOCK_ID, idx, block);
+ break;
+ default:
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ /* The full bit is in the same position for ENDC and NXC */
+ if (!light_watch)
+ sval |= VC_ENDC_WATCH_FULL;
+
+ for (;;) {
+ /* Write the cache watch spec */
+ __xive_regw(x, sreg, sregx, sval, NULL);
+
+ /* Load data0 register to populate the watch */
+ dval0 = __xive_regr(x, dreg0, dreg0x, NULL);
+
+ /* If new_data is NULL, this is a dummy watch used as a
+ * workaround for a HW bug
+ */
+ if (!new_data) {
+ __xive_regw(x, dreg0, dreg0x, dval0, NULL);
+ return 0;
+ }
+
+ /* Write the words into the watch facility. We write in reverse
+ * order in case word 0 is part of it as it must be the last
+ * one written.
+ */
+ for (i = start_dword + dword_count - 1; i >= start_dword ;i--) {
+ uint64_t dw = be64_to_cpu(new_data[i - start_dword]);
+ __xive_regw(x, dreg0 + i * 8, dreg0x + i, dw, NULL);
+ }
+
+ /* Write data0 register to trigger the update if word 0 wasn't
+ * written above
+ */
+ if (start_dword > 0)
+ __xive_regw(x, dreg0, dreg0x, dval0, NULL);
+
+ /* This may not be necessary for light updates (it's possible
+ * that a sync is sufficient, TBD). Ensure the above is
+ * complete and check the status of the watch.
+ */
+ status = __xive_regr(x, sreg, sregx, NULL);
+
+ /* Bits FULL and CONFLICT are in the same position in
+ * ENDC and NXC
+ */
+ if (!(status & VC_ENDC_WATCH_FULL) ||
+ !(status & VC_ENDC_WATCH_CONFLICT))
+ break;
+ if (!synchronous)
+ return OPAL_BUSY;
+
+ if (++retries == XIVE_CACHE_WATCH_MAX_RETRIES) {
+ xive_err(x, "Reached maximum retries %d when doing "
+ "a %s cache update\n", retries,
+ ctype == xive_cache_endc ? "ENDC" : "NXC");
+ return OPAL_BUSY;
+ }
+ }
+
+ /* Perform a scrub with "want_inval" set to false to push the
+ * cache updates to memory as well
+ */
+ return __xive_cache_scrub(x, ctype, block, idx, false, false);
+}
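+
+/*
+ * Note on the watch data registers: dword i of the target entry goes
+ * through WATCH0_DATA0 + i * 8 (MMIO) / + i (XSCOM). For instance the
+ * escalation update below uses start_dword = 2 and dword_count = 1, so
+ * only DATA2 is written (32-bit words 4 and 5 of the END, which hold
+ * the escalation EAS), followed by a rewrite of DATA0 with the value
+ * read earlier to trigger the update since word 0 itself was not
+ * touched.
+ */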
+
+#ifdef XIVE_DEBUG_INIT_CACHE_UPDATES
+static bool xive_check_endc_update(struct xive *x, uint32_t idx, struct xive_end *end)
+{
+ struct xive_end *end_p = xive_get_end(x, idx);
+ struct xive_end end2;
+
+ assert(end_p);
+ end2 = *end_p;
+ if (memcmp(end, &end2, sizeof(struct xive_end)) != 0) {
+ xive_err(x, "END update mismatch idx %d\n", idx);
+ xive_err(x, "want: %08x %08x %08x %08x\n",
+ end->w0, end->w1, end->w2, end->w3);
+ xive_err(x, " %08x %08x %08x %08x\n",
+ end->w4, end->w5, end->w6, end->w7);
+ xive_err(x, "got : %08x %08x %08x %08x\n",
+ end2.w0, end2.w1, end2.w2, end2.w3);
+ xive_err(x, " %08x %08x %08x %08x\n",
+ end2.w4, end2.w5, end2.w6, end2.w7);
+ return false;
+ }
+ return true;
+}
+
+static bool xive_check_nxc_update(struct xive *x, uint32_t idx, struct xive_nvp *vp)
+{
+ struct xive_nvp *vp_p = xive_get_vp(x, idx);
+ struct xive_nvp vp2;
+
+ assert(vp_p);
+ vp2 = *vp_p;
+ if (memcmp(vp, &vp2, sizeof(struct xive_nvp)) != 0) {
+ xive_err(x, "VP update mismatch idx %d\n", idx);
+ xive_err(x, "want: %08x %08x %08x %08x\n",
+ vp->w0, vp->w1, vp->w2, vp->w3);
+ xive_err(x, " %08x %08x %08x %08x\n",
+ vp->w4, vp->w5, vp->w6, vp->w7);
+ xive_err(x, "got : %08x %08x %08x %08x\n",
+ vp2.w0, vp2.w1, vp2.w2, vp2.w3);
+ xive_err(x, " %08x %08x %08x %08x\n",
+ vp2.w4, vp2.w5, vp2.w6, vp2.w7);
+ return false;
+ }
+ return true;
+}
+#else
+static inline bool xive_check_endc_update(struct xive *x __unused,
+ uint32_t idx __unused,
+ struct xive_end *end __unused)
+{
+ return true;
+}
+
+static inline bool xive_check_nxc_update(struct xive *x __unused,
+ uint32_t idx __unused,
+ struct xive_nvp *vp __unused)
+{
+ return true;
+}
+#endif
+
+static int64_t xive_escalation_ive_cache_update(struct xive *x, uint64_t block,
+ uint64_t idx, struct xive_eas *eas,
+ bool synchronous)
+{
+ return __xive_cache_watch(x, xive_cache_endc, block, idx,
+ 2, 1, &eas->w, true, synchronous);
+}
+
+static int64_t xive_endc_cache_update(struct xive *x, uint64_t block,
+ uint64_t idx, struct xive_end *end,
+ bool synchronous)
+{
+ int64_t ret;
+
+ ret = __xive_cache_watch(x, xive_cache_endc, block, idx,
+ 0, 4, (beint64_t *)end, false, synchronous);
+ xive_check_endc_update(x, idx, end);
+ return ret;
+}
+
+static int64_t xive_nxc_cache_update(struct xive *x, uint64_t block,
+ uint64_t idx, struct xive_nvp *vp,
+ bool synchronous)
+{
+ int64_t ret;
+
+ ret = __xive_cache_watch(x, xive_cache_nxc, block, idx,
+ 0, 4, (beint64_t *)vp, false, synchronous);
+ xive_check_nxc_update(x, idx, vp);
+ return ret;
+}
+
+/*
+ * VSD
+ */
+static bool xive_set_vsd(struct xive *x, uint32_t tbl, uint32_t idx, uint64_t v)
+{
+ /* Set VC subengine */
+ xive_regw(x, VC_VSD_TABLE_ADDR,
+ SETFIELD(VC_VSD_TABLE_SELECT, 0ull, tbl) |
+ SETFIELD(VC_VSD_TABLE_ADDRESS, 0ull, idx));
+ if (x->last_reg_error)
+ return false;
+ xive_regw(x, VC_VSD_TABLE_DATA, v);
+ if (x->last_reg_error)
+ return false;
+
+ /* also set PC subengine if table is used */
+ if (tbl == VST_EAS || tbl == VST_ERQ || tbl == VST_IC)
+ return true;
+
+ xive_regw(x, PC_VSD_TABLE_ADDR,
+ SETFIELD(PC_VSD_TABLE_SELECT, 0ull, tbl) |
+ SETFIELD(PC_VSD_TABLE_ADDRESS, 0ull, idx));
+ if (x->last_reg_error)
+ return false;
+ xive_regw(x, PC_VSD_TABLE_DATA, v);
+ if (x->last_reg_error)
+ return false;
+ return true;
+}
+
+static bool xive_set_local_tables(struct xive *x)
+{
+ uint64_t base, i;
+
+ /* These have to be power of 2 sized */
+ assert(is_pow2(XIVE_ESB_SIZE));
+ assert(is_pow2(XIVE_EAT_SIZE));
+
+ /* All tables set as exclusive */
+ base = SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+
+ /* ESB: direct mode */
+ if (!xive_set_vsd(x, VST_ESB, x->block_id, base |
+ (((uint64_t)x->sbe_base) & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(XIVE_ESB_SIZE) - 12)))
+ return false;
+
+ /* EAS: direct mode */
+ if (!xive_set_vsd(x, VST_EAS, x->block_id, base |
+ (((uint64_t)x->eat_base) & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(XIVE_EAT_SIZE) - 12)))
+ return false;
+
+ /* END: indirect mode with 64K subpages */
+ if (!xive_set_vsd(x, VST_END, x->block_id, base |
+ (((uint64_t)x->end_ind_base) & VSD_ADDRESS_MASK) |
+ VSD_INDIRECT | SETFIELD(VSD_TSIZE, 0ull,
+ ilog2(x->end_ind_size) - 12)))
+ return false;
+
+ /* NVP: indirect mode with 64K subpages */
+ if (!xive_set_vsd(x, VST_NVP, x->block_id, base |
+ (((uint64_t)x->vp_ind_base) & VSD_ADDRESS_MASK) |
+ VSD_INDIRECT | SETFIELD(VSD_TSIZE, 0ull,
+ ilog2(x->vp_ind_size) - 12)))
+ return false;
+
+ /* NVG: not used */
+ /* NVC: not used */
+
+ /* INT and SYNC: indexed with the Topology# */
+ if (!xive_set_vsd(x, VST_IC, x->chip_id, base |
+ (((uint64_t)x->ic_base) & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->ic_size) - 12)))
+ return false;
+
+ if (!xive_set_vsd(x, VST_SYNC, x->chip_id, base |
+ (((uint64_t)x->sync_inject) & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->sync_inject_size) - 12)))
+ return false;
+
+ /*
+ * ERQ: one 64K page for each queue overflow. Indexed with :
+ *
+ * 0:IPI, 1:HWD, 2:NxC, 3:INT, 4:OS-Queue, 5:Pool-Queue, 6:Hard-Queue
+ */
+ for (i = 0; i < VC_QUEUE_COUNT; i++) {
+ u64 addr = ((uint64_t)x->q_ovf) + i * PAGE_SIZE;
+ u64 cfg, sreg, sregx;
+
+ if (!xive_set_vsd(x, VST_ERQ, i, base |
+ (addr & VSD_ADDRESS_MASK) |
+ SETFIELD(VSD_TSIZE, 0ull, 4)))
+ return false;
+
+ sreg = VC_QUEUES_CFG_REM0 + i * 8;
+ sregx = X_VC_QUEUES_CFG_REM0 + i;
+ cfg = __xive_regr(x, sreg, sregx, NULL);
+ cfg |= VC_QUEUES_CFG_MEMB_EN;
+ cfg = SETFIELD(VC_QUEUES_CFG_MEMB_SZ, cfg, 4);
+ __xive_regw(x, sreg, sregx, cfg, NULL);
+ }
+
+ return true;
+}
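+
+/*
+ * VSD_TSIZE encodes the table size as 4K << TSIZE, which is why the
+ * code above programs ilog2(size) - 12: a 64K table (ilog2 = 16) gives
+ * TSIZE = 4, the same value used directly for the 64K ERQ pages.
+ */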
+
+
+/*
+ * IC BAR layout
+ *
+ * Page 0: Internal CQ register accesses (reads & writes)
+ * Page 1: Internal PC register accesses (reads & writes)
+ * Page 2: Internal VC register accesses (reads & writes)
+ * Page 3: Internal TCTXT (TIMA) reg accesses (read & writes)
+ * Page 4: Notify Port page (writes only, w/data),
+ * Page 5: Reserved
+ * Page 6: Sync Poll page (writes only, dataless)
+ * Page 7: Sync Inject page (writes only, dataless)
+ * Page 8: LSI Trigger page (writes only, dataless)
+ * Page 9: LSI SB Management page (reads & writes dataless)
+ * Pages 10-255: Reserved
+ * Pages 256-383: Direct mapped Thread Context Area (reads & writes)
+ * covering the 128 threads in P10.
+ * Pages 384-511: Reserved
+ */
+
+#define XIVE_IC_CQ_PGOFF 0
+#define XIVE_IC_PC_PGOFF 1
+#define XIVE_IC_VC_PGOFF 2
+#define XIVE_IC_TCTXT_PGOFF 3
+#define XIVE_NOTIFY_PGOFF 4
+#define XIVE_SYNC_POLL_PGOFF 6
+#define XIVE_SYNC_INJECT_PGOFF 7
+#define XIVE_LSI_TRIGGER_PGOFF 8
+#define XIVE_LSI_MGMT_PGOFF 9
+#define XIVE_IC_TM_DIRECT_PGOFF 256
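+
+/*
+ * A sub-page of the IC BAR is reached at ic_base + (pgoff << ic_shift).
+ * With the 64K pages configured below (ic_shift = 16), the notify port
+ * page for instance sits at ic_base + 0x40000 and the sync poll page at
+ * ic_base + 0x60000.
+ */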
+
+static bool xive_configure_ic_bars(struct xive *x)
+{
+ uint64_t chip_id = x->chip_id;
+ uint64_t val;
+
+ /* Reset all bars to zero */
+ xive_regwx(x, CQ_RST_CTL, CQ_RST_PB_BAR_RESET);
+
+ /* IC BAR */
+ phys_map_get(chip_id, XIVE_IC, 0, (uint64_t *)&x->ic_base, &x->ic_size);
+ val = (uint64_t)x->ic_base | CQ_IC_BAR_VALID | CQ_IC_BAR_64K;
+ x->ic_shift = 16;
+
+ xive_regwx(x, CQ_IC_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /*
+ * TM BAR, same address for each chip. Hence we create a fake
+ * chip 0 and use that for all phys_map_get(XIVE_TM) calls.
+ */
+ phys_map_get(0, XIVE_TM, 0, (uint64_t *)&x->tm_base, &x->tm_size);
+ val = (uint64_t)x->tm_base | CQ_TM_BAR_VALID | CQ_TM_BAR_64K;
+ x->tm_shift = 16;
+
+ xive_regwx(x, CQ_TM_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /* IC BAR sub-pages shortcuts */
+ x->ic_tm_direct_base = x->ic_base +
+ (XIVE_IC_TM_DIRECT_PGOFF << x->ic_shift);
+
+ return true;
+}
+
+/*
+ * NVPG, NVC, ESB, END BARs have common attributes: 64k page and only
+ * one set covering the whole BAR.
+ */
+static bool xive_configure_bars(struct xive *x)
+{
+ uint64_t chip_id = x->chip_id;
+ uint64_t val;
+ uint64_t esb_size;
+ uint64_t end_size;
+ uint64_t nvp_size;
+
+ x->nvp_size = XIVE_VP_COUNT(x) << XIVE_NVP_SHIFT;
+ x->esb_size = XIVE_INT_COUNT << XIVE_ESB_SHIFT;
+ x->end_size = XIVE_END_COUNT << XIVE_END_SHIFT;
+
+ /*
+ * NVC BAR is not configured because we do not use the XIVE2
+ * Crowd capability.
+ */
+
+ /* NVPG BAR: two pages, even NVP, odd NVG */
+ phys_map_get(chip_id, XIVE_NVPG, 0, (uint64_t *)&x->nvp_base, &nvp_size);
+ if (x->nvp_size > nvp_size) {
+ xive_err(x, "NVP table is larger than default: "
+ "0x%012llx > 0x%012llx\n", x->nvp_size, nvp_size);
+ return false;
+ }
+
+ val = (uint64_t)x->nvp_base | CQ_BAR_VALID | CQ_BAR_64K |
+ SETFIELD(CQ_BAR_RANGE, 0ull, ilog2(x->nvp_size) - 24);
+ xive_regwx(x, CQ_NVPG_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /* ESB BAR */
+ phys_map_get(chip_id, XIVE_ESB, 0, (uint64_t *)&x->esb_base, &esb_size);
+ if (x->esb_size > esb_size) {
+ xive_err(x, "ESB table is larger than default: "
+ "0x%012llx > 0x%012llx\n", x->esb_size, esb_size);
+ return false;
+ }
+
+ val = (uint64_t)x->esb_base | CQ_BAR_VALID | CQ_BAR_64K |
+ SETFIELD(CQ_BAR_RANGE, 0ull, ilog2(x->esb_size) - 24);
+ xive_regwx(x, CQ_ESB_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ /* END BAR */
+ phys_map_get(chip_id, XIVE_END, 0, (uint64_t *)&x->end_base, &end_size);
+ if (x->end_size > end_size) {
+ xive_err(x, "END table is larger than default: "
+ "0x%012llx > 0x%012llx\n", x->end_size, end_size);
+ return false;
+ }
+
+ val = (uint64_t)x->end_base | CQ_BAR_VALID | CQ_BAR_64K |
+ SETFIELD(CQ_BAR_RANGE, 0ull, ilog2(x->end_size) - 24);
+ xive_regwx(x, CQ_END_BAR, val);
+ if (x->last_reg_error)
+ return false;
+
+ xive_dbg(x, "IC: %14p [0x%012llx]\n", x->ic_base, x->ic_size);
+ xive_dbg(x, "TM: %14p [0x%012llx]\n", x->tm_base, x->tm_size);
+ xive_dbg(x, "NVP: %14p [0x%012llx]\n", x->nvp_base, x->nvp_size);
+ xive_dbg(x, "ESB: %14p [0x%012llx]\n", x->esb_base, x->esb_size);
+ xive_dbg(x, "END: %14p [0x%012llx]\n", x->end_base, x->end_size);
+ xive_dbg(x, "OVF: %14p [0x%012x]\n", x->q_ovf,
+ VC_QUEUE_COUNT * PAGE_SIZE);
+
+ return true;
+}
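+
+/*
+ * CQ_BAR_RANGE is programmed as ilog2(size) - 24, i.e. the BAR covers
+ * 16M << RANGE. As a purely illustrative example, a 1G ESB space would
+ * give RANGE = 6.
+ */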
+
+static void xive_dump_mmio(struct xive *x)
+{
+ prlog(PR_DEBUG, " CQ_CFG_PB_GEN = %016llx\n",
+ in_be64(x->ic_base + CQ_CFG_PB_GEN));
+ prlog(PR_DEBUG, " CQ_MSGSND = %016llx\n",
+ in_be64(x->ic_base + CQ_MSGSND));
+}
+
+static const struct {
+ uint64_t bitmask;
+ const char *name;
+} xive_capabilities[] = {
+ { CQ_XIVE_CAP_PHB_PQ_DISABLE, "PHB PQ disable mode support" },
+ { CQ_XIVE_CAP_PHB_ABT, "PHB address based trigger mode support" },
+ { CQ_XIVE_CAP_EXPLOITATION_MODE, "Exploitation mode" },
+ { CQ_XIVE_CAP_STORE_EOI, "StoreEOI mode support" },
+ { CQ_XIVE_CAP_VP_SAVE_RESTORE, "VP Context Save and Restore" },
+};
+
+static void xive_dump_capabilities(struct xive *x, uint64_t cap_val)
+{
+ int i;
+
+ xive_dbg(x, "capabilities: %016llx\n", cap_val);
+ xive_dbg(x, "\tVersion: %lld\n",
+ GETFIELD(CQ_XIVE_CAP_VERSION, cap_val));
+ xive_dbg(x, "\tUser interrupt priorities: [ 1 - %d ]\n",
+ 1 << GETFIELD(CQ_XIVE_CAP_USER_INT_PRIO, cap_val));
+ xive_dbg(x, "\tVP interrupt priorities: [ %d - 8 ]\n",
+ 1 << GETFIELD(CQ_XIVE_CAP_VP_INT_PRIO, cap_val));
+ xive_dbg(x, "\tExtended Blockid bits: %lld\n",
+ 4 + GETFIELD(CQ_XIVE_CAP_BLOCK_ID_WIDTH, cap_val));
+
+ for (i = 0; i < ARRAY_SIZE(xive_capabilities); i++) {
+ if (xive_capabilities[i].bitmask & cap_val)
+ xive_dbg(x, "\t%s\n", xive_capabilities[i].name);
+ }
+}
+
+static const struct {
+ uint64_t bitmask;
+ const char *name;
+} xive_configs[] = {
+ { CQ_XIVE_CFG_GEN1_TIMA_OS, "Gen1 mode TIMA OS" },
+ { CQ_XIVE_CFG_GEN1_TIMA_HYP, "Gen1 mode TIMA Hyp" },
+ { CQ_XIVE_CFG_GEN1_TIMA_HYP_BLK0, "Gen1 mode TIMA General Hypervisor Block0" },
+ { CQ_XIVE_CFG_GEN1_TIMA_CROWD_DIS, "Gen1 mode TIMA Crowd disable" },
+ { CQ_XIVE_CFG_GEN1_END_ESX, "Gen1 mode END ESx" },
+ { CQ_XIVE_CFG_EN_VP_SAVE_RESTORE, "VP Context Save and Restore" },
+ { CQ_XIVE_CFG_EN_VP_SAVE_REST_STRICT, "VP Context Save and Restore strict" },
+};
+
+static void xive_dump_configuration(struct xive *x, const char *prefix,
+ uint64_t cfg_val)
+{
+ int i;
+
+ xive_dbg(x, "%s configuration: %016llx\n", prefix, cfg_val);
+ xive_dbg(x, "\tHardwired Thread Id range: %lld bits\n",
+ 7 + GETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, cfg_val));
+ xive_dbg(x, "\tUser Interrupt priorities: [ 1 - %d ]\n",
+ 1 << GETFIELD(CQ_XIVE_CFG_USER_INT_PRIO, cfg_val));
+ xive_dbg(x, "\tVP Interrupt priorities: [ 0 - %d ]\n", xive_max_prio(x));
+ xive_dbg(x, "\tBlockId bits: %lld bits\n",
+ 4 + GETFIELD(CQ_XIVE_CFG_BLOCK_ID_WIDTH, cfg_val));
+ if (CQ_XIVE_CFG_HYP_HARD_BLKID_OVERRIDE & cfg_val)
+ xive_dbg(x, "\tHardwired BlockId: %lld\n",
+ GETFIELD(CQ_XIVE_CFG_HYP_HARD_BLOCK_ID, cfg_val));
+
+ for (i = 0; i < ARRAY_SIZE(xive_configs); i++) {
+ if (xive_configs[i].bitmask & cfg_val)
+ xive_dbg(x, "\t%s\n", xive_configs[i].name);
+ }
+}
+
+/*
+ * Default XIVE configuration
+ */
+#define XIVE_CONFIGURATION \
+ (SETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, 0ull, CQ_XIVE_CFG_THREADID_8BITS) | \
+ SETFIELD(CQ_XIVE_CFG_VP_INT_PRIO, 0ull, CQ_XIVE_CFG_INT_PRIO_8))
+
+/*
+ * Gen1 configuration for tests (QEMU)
+ */
+#define XIVE_CONFIGURATION_GEN1 \
+ (SETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, 0ull, CQ_XIVE_CFG_THREADID_7BITS) | \
+ SETFIELD(CQ_XIVE_CFG_VP_INT_PRIO, 0ull, CQ_XIVE_CFG_INT_PRIO_8) | \
+ CQ_XIVE_CFG_GEN1_TIMA_OS | \
+ CQ_XIVE_CFG_GEN1_TIMA_HYP | \
+ CQ_XIVE_CFG_GEN1_TIMA_HYP_BLK0 | \
+ CQ_XIVE_CFG_GEN1_TIMA_CROWD_DIS | \
+ CQ_XIVE_CFG_GEN1_END_ESX)
+
+static bool xive_has_cap(struct xive *x, uint64_t cap)
+{
+ return !!x && !!(x->capabilities & cap);
+}
+
+#define XIVE_CAN_STORE_EOI(x) xive_has_cap(x, CQ_XIVE_CAP_STORE_EOI)
+
+static bool xive_cfg_save_restore(struct xive *x)
+{
+ return !!(x->config & CQ_XIVE_CFG_EN_VP_SAVE_RESTORE);
+}
+
+/*
+ * When PQ_disable is available, configure the ESB cache to improve
+ * performance for PHB ESBs.
+ *
+ * split_mode :
+ * 1/3rd of the cache is reserved for PHB ESBs and the rest to
+ * IPIs. This is sufficient to keep all the PHB ESBs in cache and
+ * avoid ESB cache misses during IO interrupt processing.
+ *
+ * hash_array_enable :
+ * Internal cache hashing optimization. The hash_array tracks for
+ * ESBs where the original trigger came from so that we avoid
+ * getting the EAS into the cache twice.
+ */
+static void xive_config_esb_cache(struct xive *x)
+{
+ uint64_t val = xive_regr(x, VC_ESBC_CFG);
+
+ if (xive_has_cap(x, CQ_XIVE_CAP_PHB_PQ_DISABLE)) {
+ val |= VC_ESBC_CFG_SPLIT_MODE | VC_ESBC_CFG_HASH_ARRAY_ENABLE;
+ val = SETFIELD(VC_ESBC_CFG_MAX_ENTRIES_IN_MODIFIED, val, 0xE);
+ xive_dbg(x, "ESB cache configured with split mode "
+ "and hash array. VC_ESBC_CFG=%016llx\n", val);
+ } else
+ val &= ~VC_ESBC_CFG_SPLIT_MODE;
+
+ xive_regw(x, VC_ESBC_CFG, val);
+}
+
+static void xive_config_fused_core(struct xive *x)
+{
+ uint64_t val = xive_regr(x, TCTXT_CFG);
+
+ if (this_cpu()->is_fused_core) {
+ val |= TCTXT_CFG_FUSE_CORE_EN;
+ xive_dbg(x, "configured for fused cores. "
+ "PC_TCTXT_CFG=%016llx\n", val);
+ } else
+ val &= ~TCTXT_CFG_FUSE_CORE_EN;
+ xive_regw(x, TCTXT_CFG, val);
+}
+
+static void xive_config_reduced_priorities_fixup(struct xive *x)
+{
+ if (xive_cfg_vp_prio_shift(x) < CQ_XIVE_CFG_INT_PRIO_8 &&
+ x->quirks & XIVE_QUIRK_BROKEN_PRIO_CHECK) {
+ uint64_t val = xive_regr(x, PC_ERR1_CFG1);
+
+ val &= ~PC_ERR1_CFG1_INTERRUPT_INVALID_PRIO;
+ xive_dbg(x, "workaround for reduced priorities. "
+ "PC_ERR1_CFG1=%016llx\n", val);
+ xive_regw(x, PC_ERR1_CFG1, val);
+ }
+}
+
+static bool xive_config_init(struct xive *x)
+{
+ x->capabilities = xive_regr(x, CQ_XIVE_CAP);
+ xive_dump_capabilities(x, x->capabilities);
+
+ x->generation = GETFIELD(CQ_XIVE_CAP_VERSION, x->capabilities);
+
+ /*
+ * Allow QEMU to override version for tests
+ */
+ if (x->generation != XIVE_GEN2 && !chip_quirk(QUIRK_QEMU)) {
+ xive_err(x, "Invalid XIVE controller version %d\n",
+ x->generation);
+ return false;
+ }
+
+ x->config = xive_regr(x, CQ_XIVE_CFG);
+ xive_dump_configuration(x, "default", x->config);
+
+ /* Start with default settings */
+ x->config = x->generation == XIVE_GEN1 ? XIVE_CONFIGURATION_GEN1 :
+ XIVE_CONFIGURATION;
+
+ if (x->quirks & XIVE_QUIRK_THREADID_7BITS)
+ x->config = SETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE, x->config,
+ CQ_XIVE_CFG_THREADID_7BITS);
+
+ /*
+ * Hardwire the block ID. The default value is the topology ID
+ * of the chip which is different from the block.
+ */
+ x->config |= CQ_XIVE_CFG_HYP_HARD_BLKID_OVERRIDE |
+ SETFIELD(CQ_XIVE_CFG_HYP_HARD_BLOCK_ID, 0ull, x->block_id);
+
+ /*
+ * Enable "VP Context Save and Restore" by default. It is
+ * compatible with KVM, which currently does the context
+ * save & restore in the entry/exit path of the vCPU.
+ */
+ if (x->capabilities & CQ_XIVE_CAP_VP_SAVE_RESTORE)
+ x->config |= CQ_XIVE_CFG_EN_VP_SAVE_RESTORE;
+
+ xive_dump_configuration(x, "new", x->config);
+ xive_regw(x, CQ_XIVE_CFG, x->config);
+ if (xive_regr(x, CQ_XIVE_CFG) != x->config) {
+ xive_err(x, "configuration setting failed\n");
+ }
+
+ /*
+ * Disable error reporting in the FIR for info errors from the VC.
+ */
+ xive_regw(x, CQ_FIRMASK_OR, CQ_FIR_VC_INFO_ERROR_0_2);
+
+ /*
+ * Mask CI Load and Store to bad location, as IPI trigger
+ * pages may be mapped to user space, and a read on the
+ * trigger page causes a checkstop
+ */
+ xive_regw(x, CQ_FIRMASK_OR, CQ_FIR_PB_RCMDX_CI_ERR1);
+
+ /*
+ * VP space settings. P9 mode is 19 bits.
+ */
+ x->vp_shift = x->generation == XIVE_GEN1 ?
+ VP_SHIFT_GEN1 : VP_SHIFT_GEN2;
+
+ /*
+ * VP ids for HW threads. These values are hardcoded in the
+ * CAM line of the HW context
+ *
+ * POWER10 |chip|0000000000000001|threadid|
+ * 28bits 4 16 8
+ *
+ * POWER9 |chip|000000000001|thrdid |
+ * 23bits 4 12 7
+ */
+
+ /* TODO (cosmetic): set VP ids for HW threads only once */
+ xive_threadid_shift = 7 + GETFIELD(CQ_XIVE_CFG_HYP_HARD_RANGE,
+ x->config);
+
+ xive_hw_vp_base = 1 << xive_threadid_shift;
+ xive_hw_vp_count = 1 << xive_threadid_shift;
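+
+ /*
+ * Example, assuming the default 8-bit hardwired thread id range set
+ * above (CQ_XIVE_CFG_THREADID_8BITS): xive_threadid_shift is 8, so
+ * the HW thread VPs use indices 0x100..0x1ff (base 0x100, count 256),
+ * matching the POWER10 |chip|...0001|threadid| CAM layout shown
+ * above.
+ */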
+
+ xive_dbg(x, "store EOI is %savailable\n",
+ XIVE_CAN_STORE_EOI(x) ? "" : "not ");
+
+ xive_config_fused_core(x);
+
+ xive_config_esb_cache(x);
+
+ xive_config_reduced_priorities_fixup(x);
+
+ return true;
+}
+
+/* Set Translation tables : 1 block per chip */
+static bool xive_setup_set_xlate(struct xive *x)
+{
+ unsigned int i;
+
+ /* Configure ESBs */
+ xive_regw(x, CQ_TAR,
+ CQ_TAR_AUTOINC | SETFIELD(CQ_TAR_SELECT, 0ull, CQ_TAR_ESB));
+ if (x->last_reg_error)
+ return false;
+ for (i = 0; i < XIVE_MAX_BLOCKS; i++) {
+ xive_regw(x, CQ_TDR, CQ_TDR_VALID |
+ SETFIELD(CQ_TDR_BLOCK_ID, 0ull, x->block_id));
+ if (x->last_reg_error)
+ return false;
+ }
+
+ /* Configure ENDs */
+ xive_regw(x, CQ_TAR,
+ CQ_TAR_AUTOINC | SETFIELD(CQ_TAR_SELECT, 0ull, CQ_TAR_END));
+ if (x->last_reg_error)
+ return false;
+ for (i = 0; i < XIVE_MAX_BLOCKS; i++) {
+ xive_regw(x, CQ_TDR, CQ_TDR_VALID |
+ SETFIELD(CQ_TDR_BLOCK_ID, 0ull, x->block_id));
+ if (x->last_reg_error)
+ return false;
+ }
+
+ /* Configure NVPs */
+ xive_regw(x, CQ_TAR,
+ CQ_TAR_AUTOINC | SETFIELD(CQ_TAR_SELECT, 0ull, CQ_TAR_NVPG));
+ if (x->last_reg_error)
+ return false;
+ for (i = 0; i < XIVE_MAX_BLOCKS; i++) {
+ xive_regw(x, CQ_TDR, CQ_TDR_VALID |
+ SETFIELD(CQ_TDR_BLOCK_ID, 0ull, x->block_id));
+ if (x->last_reg_error)
+ return false;
+ }
+ return true;
+}
+
+static bool xive_prealloc_tables(struct xive *x)
+{
+ uint32_t i;
+ uint32_t pbase, pend;
+
+ /* ESB has 4 entries per byte */
+ x->sbe_base = local_alloc(x->chip_id, XIVE_ESB_SIZE, XIVE_ESB_SIZE);
+ if (!x->sbe_base) {
+ xive_err(x, "Failed to allocate SBE\n");
+ return false;
+ }
+
+ /* PQs are initialized to 0b01 which corresponds to "ints off" */
+ memset(x->sbe_base, 0x55, XIVE_ESB_SIZE);
+ xive_dbg(x, "SBE at %p size 0x%lx\n", x->sbe_base, XIVE_ESB_SIZE);
+
+ /* EAS entries are 8 bytes */
+ x->eat_base = local_alloc(x->chip_id, XIVE_EAT_SIZE, XIVE_EAT_SIZE);
+ if (!x->eat_base) {
+ xive_err(x, "Failed to allocate EAS\n");
+ return false;
+ }
+
+ /*
+ * We clear the entries (non-valid). They will be initialized
+ * when actually used
+ */
+ memset(x->eat_base, 0, XIVE_EAT_SIZE);
+ xive_dbg(x, "EAT at %p size 0x%lx\n", x->eat_base, XIVE_EAT_SIZE);
+
+ /* Indirect END table. Limited to one top page. */
+ x->end_ind_size = ALIGN_UP(XIVE_END_TABLE_SIZE, PAGE_SIZE);
+ if (x->end_ind_size > PAGE_SIZE) {
+ xive_err(x, "END indirect table is too big !\n");
+ return false;
+ }
+ x->end_ind_base = local_alloc(x->chip_id, x->end_ind_size,
+ x->end_ind_size);
+ if (!x->end_ind_base) {
+ xive_err(x, "Failed to allocate END indirect table\n");
+ return false;
+ }
+ memset(x->end_ind_base, 0, x->end_ind_size);
+ xive_dbg(x, "ENDi at %p size 0x%llx #%ld entries\n", x->end_ind_base,
+ x->end_ind_size, XIVE_END_COUNT);
+ x->end_ind_count = XIVE_END_TABLE_SIZE / XIVE_VSD_SIZE;
+
+ /* Indirect VP table. Limited to one top page. */
+ x->vp_ind_size = ALIGN_UP(XIVE_VP_TABLE_SIZE(x), PAGE_SIZE);
+ if (x->vp_ind_size > PAGE_SIZE) {
+ xive_err(x, "VP indirect table is too big !\n");
+ return false;
+ }
+ x->vp_ind_base = local_alloc(x->chip_id, x->vp_ind_size,
+ x->vp_ind_size);
+ if (!x->vp_ind_base) {
+ xive_err(x, "Failed to allocate VP indirect table\n");
+ return false;
+ }
+ xive_dbg(x, "VPi at %p size 0x%llx #%ld entries\n", x->vp_ind_base,
+ x->vp_ind_size, XIVE_VP_COUNT(x));
+ x->vp_ind_count = XIVE_VP_TABLE_SIZE(x) / XIVE_VSD_SIZE;
+ memset(x->vp_ind_base, 0, x->vp_ind_size);
+
+ /* Allocate pages for the VP ids representing HW threads */
+ pbase = xive_hw_vp_base / VP_PER_PAGE;
+ pend = (xive_hw_vp_base + xive_hw_vp_count) / VP_PER_PAGE;
+
+ xive_dbg(x, "Allocating pages %d to %d of VPs (for %d VPs)\n",
+ pbase, pend, xive_hw_vp_count);
+ for (i = pbase; i <= pend; i++) {
+ void *page;
+ u64 vsd;
+
+ /* Indirect entries have a VSD format */
+ page = local_alloc(x->chip_id, PAGE_SIZE, PAGE_SIZE);
+ if (!page) {
+ xive_err(x, "Failed to allocate VP page\n");
+ return false;
+ }
+ xive_dbg(x, "VP%d at %p size 0x%x\n", i, page, PAGE_SIZE);
+ memset(page, 0, PAGE_SIZE);
+ vsd = ((uint64_t)page) & VSD_ADDRESS_MASK;
+
+ vsd |= SETFIELD(VSD_TSIZE, 0ull, 4);
+ vsd |= SETFIELD(VSD_MODE, 0ull, VSD_MODE_EXCLUSIVE);
+ vsd |= VSD_FIRMWARE;
+ x->vp_ind_base[i] = cpu_to_be64(vsd);
+ }
+
+ /*
+ * Allocate a page for cache and sync injection (512 * 128 HW
+ * threads) plus one extra page for future use
+ */
+ x->sync_inject_size = PAGE_SIZE + PAGE_SIZE;
+ x->sync_inject = local_alloc(x->chip_id, x->sync_inject_size,
+ x->sync_inject_size);
+ if (!x->sync_inject) {
+ xive_err(x, "Failed to allocate sync pages\n");
+ return false;
+ }
+
+ /*
+ * The Memory Coherence Directory uses 16M "granule" to track
+ * shared copies of a cache line. If any cache line within the
+ * 16M range gets touched by someone outside of the group, the
+ * MCD forces accesses to any cache line within the range to
+ * include everyone that might have a shared copy.
+ */
+#define QUEUE_OVF_ALIGN (16 << 20) /* MCD granule size */
+
+ /*
+ * Allocate the queue overflow pages and use a 16M alignment
+ * to avoid sharing with other structures and reduce traffic
+ * on the PowerBus.
+ */
+ x->q_ovf = local_alloc(x->chip_id, VC_QUEUE_COUNT * PAGE_SIZE,
+ QUEUE_OVF_ALIGN);
+ if (!x->q_ovf) {
+ xive_err(x, "Failed to allocate queue overflow\n");
+ return false;
+ }
+ return true;
+}
+
+static void xive_add_provisioning_properties(void)
+{
+ beint32_t chips[XIVE_MAX_CHIPS];
+ uint32_t i, count;
+
+ dt_add_property_cells(xive_dt_node,
+ "ibm,xive-provision-page-size", PAGE_SIZE);
+
+ count = 1 << xive_chips_alloc_bits;
+ for (i = 0; i < count; i++)
+ chips[i] = cpu_to_be32(xive_block_to_chip[i]);
+ dt_add_property(xive_dt_node, "ibm,xive-provision-chips",
+ chips, 4 * count);
+}
+
+static void xive_create_mmio_dt_node(struct xive *x)
+{
+ uint64_t tb = (uint64_t)x->tm_base;
+ uint32_t stride = 1u << x->tm_shift;
+
+ xive_dt_node = dt_new_addr(dt_root, "interrupt-controller", tb);
+ assert(xive_dt_node);
+
+ dt_add_property_u64s(xive_dt_node, "reg",
+ tb + 0 * stride, stride,
+ tb + 1 * stride, stride,
+ tb + 2 * stride, stride,
+ tb + 3 * stride, stride);
+
+ dt_add_property_strings(xive_dt_node, "compatible",
+ "ibm,opal-xive-pe", "ibm,opal-intc");
+
+ dt_add_property_cells(xive_dt_node, "ibm,xive-eq-sizes",
+ 12, 16, 21, 24);
+
+ dt_add_property_cells(xive_dt_node, "ibm,xive-#priorities",
+ xive_cfg_vp_prio(x));
+
+ dt_add_property(xive_dt_node, "single-escalation-support", NULL, 0);
+
+ if (XIVE_CAN_STORE_EOI(x))
+ dt_add_property(xive_dt_node, "store-eoi", NULL, 0);
+
+ if (xive_cfg_save_restore(x))
+ dt_add_property(xive_dt_node, "vp-save-restore", NULL, 0);
+
+ xive_add_provisioning_properties();
+}
+
+static void xive_setup_forward_ports(struct xive *x, struct proc_chip *remote_chip)
+{
+ struct xive *remote_xive = remote_chip->xive;
+ uint64_t base = SETFIELD(VSD_MODE, 0ull, VSD_MODE_FORWARD);
+
+ if (!xive_set_vsd(x, VST_ESB, remote_xive->block_id,
+ base | ((uint64_t)remote_xive->esb_base) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->esb_size) - 12)))
+ goto error;
+
+ /* EAS: No remote */
+
+ if (!xive_set_vsd(x, VST_END, remote_xive->block_id,
+ base | ((uint64_t)remote_xive->end_base) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->end_size) - 12)))
+ goto error;
+
+ if (!xive_set_vsd(x, VST_NVP, remote_xive->block_id,
+ base | ((uint64_t)remote_xive->nvp_base) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->nvp_size) - 12)))
+ goto error;
+
+ /* NVG: not used */
+ /* NVC: not used */
+
+ if (!xive_set_vsd(x, VST_IC, remote_xive->chip_id,
+ base | ((uint64_t)remote_xive->ic_base) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->ic_size) - 12)))
+ goto error;
+
+ if (!xive_set_vsd(x, VST_SYNC, remote_xive->chip_id,
+ base | ((uint64_t)remote_xive->sync_inject) |
+ SETFIELD(VSD_TSIZE, 0ull, ilog2(x->sync_inject_size) - 12)))
+ goto error;
+
+ /* ERQ: No remote */
+
+ return;
+
+ error:
+ xive_err(x, "Failure configuring forwarding ports\n");
+}
+
+static void late_init_one_xive(struct xive *x)
+{
+ struct proc_chip *chip;
+
+ /* We need to setup the cross-chip forward ports. Let's
+ * iterate over all chips and set them up accordingly
+ */
+ for_each_chip(chip) {
+ /* We skip ourselves or chips without a xive */
+ if (chip->xive == x || !chip->xive)
+ continue;
+
+ /* Setup our forward ports to that chip */
+ xive_setup_forward_ports(x, chip);
+ }
+}
+
+static bool xive_check_ipi_free(struct xive *x, uint32_t irq, uint32_t count)
+{
+ uint32_t i, idx = GIRQ_TO_IDX(irq);
+
+ for (i = 0; i < count; i++)
+ if (bitmap_tst_bit(*x->ipi_alloc_map, idx + i))
+ return false;
+ return true;
+}
+
+uint32_t xive2_alloc_hw_irqs(uint32_t chip_id, uint32_t count,
+ uint32_t align)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct xive *x;
+ uint32_t base, i;
+
+ assert(chip);
+ assert(is_pow2(align));
+
+ x = chip->xive;
+ assert(x);
+
+ lock(&x->lock);
+
+ /* Allocate the HW interrupts */
+ base = x->int_hw_bot - count;
+ base &= ~(align - 1);
+ if (base < x->int_ipi_top) {
+ xive_err(x,
+ "HW alloc request for %d interrupts aligned to %d failed\n",
+ count, align);
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+ if (!xive_check_ipi_free(x, base, count)) {
+ xive_err(x, "HWIRQ boot allocator request overlaps dynamic allocator\n");
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+
+ x->int_hw_bot = base;
+
+ /* Initialize the corresponding EAS entries to sane defaults,
+ * i.e. the entry is valid, not routed and masked, and the END
+ * data is set to the GIRQ number.
+ */
+ for (i = 0; i < count; i++) {
+ struct xive_eas *eas = xive_get_eas(x, base + i);
+
+ eas->w = xive_set_field64(EAS_VALID, 0, 1) |
+ xive_set_field64(EAS_MASKED, 0, 1) |
+ xive_set_field64(EAS_END_DATA, 0, base + i);
+ }
+
+ unlock(&x->lock);
+ return base;
+}
+
+uint32_t xive2_alloc_ipi_irqs(uint32_t chip_id, uint32_t count,
+ uint32_t align)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct xive *x;
+ uint32_t base, i;
+
+ assert(chip);
+ assert(is_pow2(align));
+
+ x = chip->xive;
+ assert(x);
+
+ lock(&x->lock);
+
+ /* Allocate the IPI interrupts */
+ base = x->int_ipi_top + (align - 1);
+ base &= ~(align - 1);
+ if (base >= x->int_hw_bot) {
+ xive_err(x,
+ "IPI alloc request for %d interrupts aligned to %d failed\n",
+ count, align);
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+ if (!xive_check_ipi_free(x, base, count)) {
+ xive_err(x, "IPI boot allocator request overlaps dynamic allocator\n");
+ unlock(&x->lock);
+ return XIVE_IRQ_ERROR;
+ }
+
+ x->int_ipi_top = base + count;
+
+ /* Initialize the corresponding EAS entries to sane defaults,
+ * i.e. the entry is valid, not routed and masked, and the END data is set
+ * to the GIRQ number.
+ */
+ for (i = 0; i < count; i++) {
+ struct xive_eas *eas = xive_get_eas(x, base + i);
+
+ eas->w = xive_set_field64(EAS_VALID, 0, 1) |
+ xive_set_field64(EAS_MASKED, 0, 1) |
+ xive_set_field64(EAS_END_DATA, 0, base + i);
+ }
+
+ unlock(&x->lock);
+ return base;
+}
+
+void *xive2_get_trigger_port(uint32_t girq)
+{
+ uint32_t idx = GIRQ_TO_IDX(girq);
+ struct xive *x;
+
+ /* Find XIVE on which the EAS resides */
+ x = xive_from_isn(girq);
+ if (!x)
+ return NULL;
+
+ if (GIRQ_IS_ESCALATION(girq)) {
+ /* There is no trigger page for escalation interrupts */
+ return NULL;
+ } else {
+ /* Make sure it's an IPI on that chip */
+ if (girq < x->int_base ||
+ girq >= x->int_ipi_top)
+ return NULL;
+
+ return x->esb_base + idx * XIVE_ESB_PAGE_SIZE;
+ }
+}
+
+/*
+ * Notify Port page (writes only, w/data), separated into two
+ * categories, both sent to VC:
+ * - IPI queue (Addr bit 52 = 0) (for NPU)
+ * - HW queue (Addr bit 52 = 1)
+ */
+uint64_t xive2_get_notify_port(uint32_t chip_id, uint32_t ent)
+{
+ struct proc_chip *chip = get_chip(chip_id);
+ struct xive *x;
+ uint32_t offset = 0;
+
+ assert(chip);
+ x = chip->xive;
+ assert(x);
+
+ /* This is where we can assign a different HW queue to a different
+ * source by offsetting into the cache lines of the notify port
+ *
+ * For now we keep it very basic, this will have to be looked at
+ * again on real HW with some proper performance analysis.
+ *
+ * Here's what Florian says on the matter:
+ *
+ * <<
+ * The first 2k of the notify port page can all be used for PCIe triggers
+ *
+ * However the idea would be that we try to use the first 4 cache lines to
+ * balance the PCIe Interrupt requests to use the least used snoop buses
+ * (we went from 2 to 4 snoop buses for P9). snoop 0 is heavily used
+ * (I think TLBIs are using that in addition to the normal addresses),
+ * snoop 3 is used for all Int commands, so I think snoop 2 (CL 2 in the
+ * page) is the least used overall. So we should probably use that one for
+ * the Int commands from PCIe.
+ *
+ * In addition, our EAS cache supports hashing to provide "private" cache
+ * areas for the PHBs in the shared 1k EAS cache. This allows e.g. to avoid
+ * that one "thrashing" PHB thrashes the EAS cache for everyone, or provide
+ * a PHB with a private area that would allow high cache hits in case of a
+ * device using very few interrupts. The hashing is based on the offset within
+ * the cache line. So using that, you can e.g. set the EAS cache up so that
+ * IPIs use 512 entries, the x16 PHB uses 256 entries and the x8 PHBs 128
+ * entries each - or IPIs using all entries and sharing with PHBs, so PHBs
+ * would use 512 entries and 256 entries respectively.
+ *
+ * This is a tuning we would probably do later in the lab, but as a "prep"
+ * we should set up the different PHBs such that they are using different
+ * 8B-aligned offsets within the cache line, so e.g.
+ * PH4_0 addr 0x100 (CL 2 DW0)
+ * PH4_1 addr 0x108 (CL 2 DW1)
+ * PH4_2 addr 0x110 (CL 2 DW2)
+ * etc.
+ * >>
+ *
+ * I'm using snoop1 for PHB0 and snoop2 for everybody else.
+ */
+
+ /* Florian adds :
+ *
+ * we just set them up for a start to have different offsets
+ * within the cache line so that we could use the allocation
+ * restrictions that can be enforced in the interrupt
+ * controller
+ *
+ * P10 might now be randomizing the cache line bits in HW to
+ * balance snoop bus usage
+ */
+ switch(ent) {
+ case XIVE_HW_SRC_PHBn(0):
+ offset = 0x800;
+ break;
+ case XIVE_HW_SRC_PHBn(1):
+ offset = 0x908;
+ break;
+ case XIVE_HW_SRC_PHBn(2):
+ offset = 0x910;
+ break;
+ case XIVE_HW_SRC_PHBn(3):
+ offset = 0x918;
+ break;
+ case XIVE_HW_SRC_PHBn(4):
+ offset = 0x920;
+ break;
+ case XIVE_HW_SRC_PHBn(5):
+ offset = 0x928;
+ break;
+ case XIVE_HW_SRC_PSI:
+ offset = 0x930;
+ break;
+ default:
+ assert(false);
+ return 0;
+ }
+
+ return ((uint64_t)x->ic_base) +
+ (XIVE_NOTIFY_PGOFF << x->ic_shift) + offset;
+}
+
+/* Manufacture the powerbus packet bits 32:63 */
+__attrconst uint32_t xive2_get_notify_base(uint32_t girq)
+{
+ return (GIRQ_TO_BLK(girq) << 28) | GIRQ_TO_IDX(girq);
+}
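+
+/*
+ * e.g. a girq in block 1 with index 0x23 yields a notify base of
+ * (1 << 28) | 0x23 = 0x10000023.
+ */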
+
+static bool xive_get_irq_targetting(uint32_t isn, uint32_t *out_target,
+ uint8_t *out_prio, uint32_t *out_lirq)
+{
+ struct xive_eas *eas;
+ struct xive *x, *end_x;
+ struct xive_end *end;
+ uint32_t end_blk, end_idx;
+ uint32_t vp_blk, vp_idx;
+ uint32_t prio, server;
+ bool is_escalation = GIRQ_IS_ESCALATION(isn);
+
+ /* Find XIVE on which the EAS resides */
+ x = xive_from_isn(isn);
+ if (!x)
+ return false;
+ /* Grab the EAS */
+ eas = xive_get_eas(x, isn);
+ if (!eas)
+ return false;
+ if (!xive_get_field64(EAS_VALID, eas->w) && !is_escalation) {
+ xive_err(x, "ISN %x lead to invalid EAS !\n", isn);
+ return false;
+ }
+
+ if (out_lirq)
+ *out_lirq = xive_get_field64(EAS_END_DATA, eas->w);
+
+ /* Find the END and its xive instance */
+ end_blk = xive_get_field64(EAS_END_BLOCK, eas->w);
+ end_idx = xive_get_field64(EAS_END_INDEX, eas->w);
+ end_x = xive_from_vc_blk(end_blk);
+
+ /* This can fail if the interrupt hasn't been initialized yet
+ * but it should also be masked, so fail silently
+ */
+ if (!end_x)
+ goto pick_default;
+ end = xive_get_end(end_x, end_idx);
+ if (!end)
+ goto pick_default;
+
+ /* XXX Check valid and format 0 */
+
+ /* No priority conversion, return the actual one ! */
+ if (xive_get_field64(EAS_MASKED, eas->w))
+ prio = 0xff;
+ else
+ prio = xive_get_field32(END_W7_F0_PRIORITY, end->w7);
+ if (out_prio)
+ *out_prio = prio;
+
+ vp_blk = xive_get_field32(END_W6_VP_BLOCK, end->w6);
+ vp_idx = xive_get_field32(END_W6_VP_OFFSET, end->w6);
+ server = VP2PIR(vp_blk, vp_idx);
+
+ if (out_target)
+ *out_target = server;
+
+ xive_vdbg(end_x, "END info for ISN %x: prio=%d, server=0x%x (VP %x/%x)\n",
+ isn, prio, server, vp_blk, vp_idx);
+ return true;
+
+pick_default:
+ xive_vdbg(end_x, "END info for ISN %x: Using masked defaults\n", isn);
+
+ if (out_prio)
+ *out_prio = 0xff;
+ /* Pick a default: the current CPU (me) will be fine ... */
+ if (out_target)
+ *out_target = mfspr(SPR_PIR);
+ return true;
+}
+
+static inline bool xive_end_for_target(uint32_t target, uint8_t prio,
+ uint32_t *out_end_blk,
+ uint32_t *out_end_idx)
+{
+ struct xive *x;
+ struct xive_nvp *vp;
+ uint32_t vp_blk, vp_idx;
+ uint32_t end_blk, end_idx;
+
+ if (prio > xive_max_prio(one_xive))
+ return false;
+
+ /* Get the VP block/index from the target word */
+ if (!xive_decode_vp(target, &vp_blk, &vp_idx, NULL, NULL))
+ return false;
+
+ /* Grab the target VP's XIVE */
+ x = xive_from_pc_blk(vp_blk);
+ if (!x)
+ return false;
+
+ /* Find the VP structure where we stashed the END number */
+ vp = xive_get_vp(x, vp_idx);
+ if (!vp)
+ return false;
+
+ end_blk = xive_get_field32(NVP_W5_VP_END_BLOCK, vp->w5);
+ end_idx = xive_get_field32(NVP_W5_VP_END_INDEX, vp->w5);
+
+ /* Currently the END block and VP block should be the same */
+ if (end_blk != vp_blk) {
+ xive_err(x, "end_blk != vp_blk (%d vs. %d) for target 0x%08x/%d\n",
+ end_blk, vp_blk, target, prio);
+ assert(false);
+ }
+
+ if (out_end_blk)
+ *out_end_blk = end_blk;
+ if (out_end_idx)
+ *out_end_idx = end_idx + prio;
+
+ return true;
+}
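+
+/*
+ * In other words, a VP whose word 5 stashes END block b and index i is
+ * served, for priority p, by END (b, i + p). This is why a contiguous
+ * set of ENDs is allocated per HW thread (see xive_alloc_end_set() in
+ * xive_provision_cpu() below).
+ */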
+
+static int64_t xive_set_irq_targetting(uint32_t isn, uint32_t target,
+ uint8_t prio, uint32_t lirq,
+ bool synchronous)
+{
+ struct xive *x;
+ struct xive_eas *eas, new_eas;
+ uint32_t end_blk, end_idx;
+ bool is_escalation = GIRQ_IS_ESCALATION(isn);
+ int64_t rc;
+
+ /* Find XIVE on which the EAS resides */
+ x = xive_from_isn(isn);
+ if (!x)
+ return OPAL_PARAMETER;
+ /* Grab the EAS */
+ eas = xive_get_eas(x, isn);
+ if (!eas)
+ return OPAL_PARAMETER;
+ if (!xive_get_field64(EAS_VALID, eas->w) && !is_escalation) {
+ xive_err(x, "ISN %x lead to invalid EAS !\n", isn);
+ return OPAL_PARAMETER;
+ }
+
+ lock(&x->lock);
+
+ /* Read existing EAS */
+ new_eas = *eas;
+
+ /* Are we masking ? */
+ if (prio == 0xff && !is_escalation) {
+ new_eas.w = xive_set_field64(EAS_MASKED, new_eas.w, 1);
+ xive_vdbg(x, "ISN %x masked !\n", isn);
+
+ /* Put prio 7 in the END */
+ prio = xive_max_prio(x);
+ } else {
+ /* Unmasking */
+ new_eas.w = xive_set_field64(EAS_MASKED, new_eas.w, 0);
+ xive_vdbg(x, "ISN %x unmasked !\n", isn);
+
+ /* For normal interrupt sources, keep track of which ones
+ * we ever enabled since the last reset
+ */
+ if (!is_escalation)
+ bitmap_set_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn));
+ }
+
+ /* If prio isn't 0xff, re-target the EAS. First find the END
+ * corresponding to the target
+ */
+ if (prio != 0xff) {
+ if (!xive_end_for_target(target, prio, &end_blk, &end_idx)) {
+ xive_err(x, "Can't find END for target/prio 0x%x/%d\n",
+ target, prio);
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+
+ /* Try to update it atomically to avoid an intermediary
+ * stale state
+ */
+ new_eas.w = xive_set_field64(EAS_END_BLOCK, new_eas.w, end_blk);
+ new_eas.w = xive_set_field64(EAS_END_INDEX, new_eas.w, end_idx);
+ }
+ new_eas.w = xive_set_field64(EAS_END_DATA, new_eas.w, lirq);
+
+ xive_vdbg(x,"ISN %x routed to end %x/%x lirq=%08x EAS=%016llx !\n",
+ isn, end_blk, end_idx, lirq, new_eas.w);
+
+ /* Updating the cache differs between real EAS and escalation
+ * EAS inside an END
+ */
+ if (is_escalation) {
+ rc = xive_escalation_ive_cache_update(x, x->block_id,
+ GIRQ_TO_IDX(isn), &new_eas, synchronous);
+ } else {
+ sync();
+ *eas = new_eas;
+ rc = xive_easc_scrub(x, x->block_id, GIRQ_TO_IDX(isn));
+ }
+
+ unlock(&x->lock);
+ return rc;
+}
+
+static void xive_update_irq_mask(struct xive_src *s, uint32_t idx, bool masked)
+{
+ void *mmio_base = s->esb_mmio + (1ul << s->esb_shift) * idx;
+ uint32_t offset;
+
+ /* XXX FIXME: A quick mask/unmask can make us shoot an interrupt
+ * more than once to a queue. We need to keep better track
+ */
+ if (s->flags & XIVE_SRC_EOI_PAGE1)
+ mmio_base += 1ull << (s->esb_shift - 1);
+ if (masked)
+ offset = XIVE_ESB_SET_PQ_01;
+ else
+ offset = XIVE_ESB_SET_PQ_00;
+
+ in_be64(mmio_base + offset);
+}
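+
+/*
+ * The ESB load offsets used above force the PQ bits of the source:
+ * SET_PQ_01 leaves it in the 0b01 "ints off" state (the same state the
+ * SBEs are initialized to in xive_prealloc_tables()), while SET_PQ_00
+ * re-arms it so a new trigger can be notified again.
+ */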
+
+#define XIVE_SYNC_IPI 0x000
+#define XIVE_SYNC_HW 0x080
+#define XIVE_SYNC_NxC 0x100
+#define XIVE_SYNC_INT 0x180
+#define XIVE_SYNC_OS_ESC 0x200
+#define XIVE_SYNC_POOL_ESC 0x280
+#define XIVE_SYNC_HARD_ESC 0x300
+
+static int64_t xive_sync(struct xive *x)
+{
+ uint64_t r;
+ void *sync_base;
+
+ lock(&x->lock);
+
+ sync_base = x->ic_base + (XIVE_SYNC_POLL_PGOFF << x->ic_shift);
+
+ out_be64(sync_base + XIVE_SYNC_IPI, 0);
+ out_be64(sync_base + XIVE_SYNC_HW, 0);
+ out_be64(sync_base + XIVE_SYNC_NxC, 0);
+ out_be64(sync_base + XIVE_SYNC_INT, 0);
+ out_be64(sync_base + XIVE_SYNC_OS_ESC, 0);
+ out_be64(sync_base + XIVE_SYNC_POOL_ESC, 0);
+ out_be64(sync_base + XIVE_SYNC_HARD_ESC, 0);
+
+ /* XXX Add timeout */
+ for (;;) {
+ r = xive_regr(x, VC_ENDC_SYNC_DONE);
+ if ((r & VC_ENDC_SYNC_POLL_DONE) == VC_ENDC_SYNC_POLL_DONE)
+ break;
+ cpu_relax();
+ }
+ xive_regw(x, VC_ENDC_SYNC_DONE, r & ~VC_ENDC_SYNC_POLL_DONE);
+
+ /*
+ * Do a read after clearing the sync done bit to prevent any
+ * race between CI write and next sync command
+ */
+ xive_regr(x, VC_ENDC_SYNC_DONE);
+
+ unlock(&x->lock);
+ return 0;
+}
+
+static int64_t __xive_set_irq_config(struct irq_source *is, uint32_t girq,
+ uint64_t vp, uint8_t prio, uint32_t lirq,
+ bool update_esb, bool sync)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ uint32_t old_target, vp_blk;
+ u8 old_prio;
+ int64_t rc;
+
+ /* Grab existing target */
+ if (!xive_get_irq_targetting(girq, &old_target, &old_prio, NULL))
+ return OPAL_PARAMETER;
+
+ /* Let XIVE configure the END. We do the update without the
+ * synchronous flag, thus a cache update failure will result
+ * in us returning OPAL_BUSY
+ */
+ rc = xive_set_irq_targetting(girq, vp, prio, lirq, false);
+ if (rc)
+ return rc;
+
+ /* Do we need to update the mask ? */
+ if (old_prio != prio && (old_prio == 0xff || prio == 0xff)) {
+ /* The source has special variants of masking/unmasking */
+ if (update_esb) {
+ /* Ensure it's enabled/disabled in the source
+ * controller
+ */
+ xive_update_irq_mask(s, girq - s->esb_base,
+ prio == 0xff);
+ }
+ }
+
+ /*
+ * Synchronize the source and old target XIVEs to ensure that
+ * all pending interrupts to the old target have reached their
+ * respective queue.
+ *
+ * WARNING: This assumes the VP and its queues are on the same
+ * XIVE instance !
+ */
+ if (!sync)
+ return OPAL_SUCCESS;
+ xive_sync(s->xive);
+ if (xive_decode_vp(old_target, &vp_blk, NULL, NULL, NULL)) {
+ struct xive *x = xive_from_pc_blk(vp_blk);
+ if (x)
+ xive_sync(x);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio,
+ uint32_t lirq, bool update_esb)
+{
+ struct irq_source *is = irq_find_source(girq);
+
+ return __xive_set_irq_config(is, girq, vp, prio, lirq, update_esb,
+ true);
+}
+
+static void xive_source_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (!s->orig_ops || !s->orig_ops->interrupt)
+ return;
+ s->orig_ops->interrupt(is, isn);
+}
+
+static uint64_t xive_source_attributes(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (!s->orig_ops || !s->orig_ops->attributes)
+ return IRQ_ATTR_TARGET_LINUX;
+ return s->orig_ops->attributes(is, isn);
+}
+
+static char *xive_source_name(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ if (!s->orig_ops || !s->orig_ops->name)
+ return NULL;
+ return s->orig_ops->name(is, isn);
+}
+
+void xive2_source_mask(struct irq_source *is, uint32_t isn)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+
+ xive_update_irq_mask(s, isn - s->esb_base, true);
+}
+
+static const struct irq_source_ops xive_irq_source_ops = {
+ .interrupt = xive_source_interrupt,
+ .attributes = xive_source_attributes,
+ .name = xive_source_name,
+};
+
+static void __xive_register_source(struct xive *x, struct xive_src *s,
+ uint32_t base, uint32_t count,
+ uint32_t shift, void *mmio, uint32_t flags,
+ bool secondary, void *data,
+ const struct irq_source_ops *orig_ops)
+{
+ s->esb_base = base;
+ s->esb_shift = shift;
+ s->esb_mmio = mmio;
+ s->flags = flags;
+ s->orig_ops = orig_ops;
+ s->xive = x;
+ s->is.start = base;
+ s->is.end = base + count;
+ s->is.ops = &xive_irq_source_ops;
+ s->is.data = data;
+
+ __register_irq_source(&s->is, secondary);
+}
+
+void xive2_register_hw_source(uint32_t base, uint32_t count, uint32_t shift,
+ void *mmio, uint32_t flags, void *data,
+ const struct irq_source_ops *ops)
+{
+ struct xive_src *s;
+ struct xive *x = xive_from_isn(base);
+
+ assert(x);
+
+ s = malloc(sizeof(struct xive_src));
+ assert(s);
+ __xive_register_source(x, s, base, count, shift, mmio, flags,
+ false, data, ops);
+}
+
+static void __xive2_register_esb_source(uint32_t base, uint32_t count,
+ void *data, const struct irq_source_ops *ops)
+{
+ struct xive_src *s;
+ struct xive *x = xive_from_isn(base);
+ uint32_t base_idx = GIRQ_TO_IDX(base);
+ void *mmio_base;
+ uint32_t flags = XIVE_SRC_EOI_PAGE1 | XIVE_SRC_TRIGGER_PAGE;
+
+ assert(x);
+
+ s = malloc(sizeof(struct xive_src));
+ assert(s);
+
+ if (XIVE_CAN_STORE_EOI(x))
+ flags |= XIVE_SRC_STORE_EOI;
+
+ /* Callbacks assume the MMIO base corresponds to the first
+ * interrupt of that source structure so adjust it
+ */
+ mmio_base = x->esb_base + (1ul << XIVE_ESB_SHIFT) * base_idx;
+ __xive_register_source(x, s, base, count, XIVE_ESB_SHIFT, mmio_base,
+ flags, false, data, ops);
+}
+
+/*
+ * Check that IPI sources have interrupt numbers in the IPI interrupt
+ * number range
+ */
+void xive2_register_ipi_source(uint32_t base, uint32_t count, void *data,
+ const struct irq_source_ops *ops)
+{
+ struct xive *x = xive_from_isn(base);
+
+ assert(x);
+ assert(base >= x->int_base && (base + count) <= x->int_ipi_top);
+
+ __xive2_register_esb_source(base, count, data, ops);
+}
+
+/*
+ * Some HW sources (PHB) can disable the use of their own ESB pages
+ * and offload all the checks on ESB pages of the IC. The interrupt
+ * numbers are not necessarily in the IPI range.
+ */
+void xive2_register_esb_source(uint32_t base, uint32_t count)
+{
+ __xive2_register_esb_source(base, count, NULL, NULL);
+}
+
+uint64_t xive2_get_esb_base(uint32_t base)
+{
+ struct xive *x = xive_from_isn(base);
+ uint32_t base_idx = GIRQ_TO_IDX(base);
+
+ assert(x);
+
+ return (uint64_t) x->esb_base + (1ul << XIVE_ESB_SHIFT) * base_idx;
+}
+
+static void xive_set_quirks(struct xive *x, struct proc_chip *chip __unused)
+{
+ uint64_t quirks = 0;
+
+ /* This extension is dropped for P10 */
+ if (proc_gen == proc_gen_p10)
+ quirks |= XIVE_QUIRK_THREADID_7BITS;
+
+ /* Broken check on invalid priority when reduced priorities is in use */
+ if (proc_gen == proc_gen_p10)
+ quirks |= XIVE_QUIRK_BROKEN_PRIO_CHECK;
+
+ xive_dbg(x, "setting XIVE quirks to %016llx\n", quirks);
+ x->quirks = quirks;
+}
+
+static struct xive *init_one_xive(struct dt_node *np)
+{
+ struct xive *x;
+ struct proc_chip *chip;
+ uint32_t flags;
+
+ x = zalloc(sizeof(struct xive));
+ assert(x);
+ x->x_node = np;
+ x->xscom_base = dt_get_address(np, 0, NULL);
+ x->chip_id = dt_get_chip_id(np);
+
+ /* "Allocate" a new block ID for the chip */
+ x->block_id = xive_block_count++;
+ assert (x->block_id < XIVE_MAX_CHIPS);
+ xive_block_to_chip[x->block_id] = x->chip_id;
+ init_lock(&x->lock);
+
+ chip = get_chip(x->chip_id);
+ assert(chip);
+
+ xive_notice(x, "Initializing XIVE block ID %d...\n", x->block_id);
+ chip->xive = x;
+
+ xive_set_quirks(x, chip);
+
+ list_head_init(&x->donated_pages);
+
+ /* Base interrupt numbers and allocator init */
+
+ x->int_base = BLKIDX_TO_GIRQ(x->block_id, 0);
+ x->int_count = x->int_base + XIVE_INT_COUNT;
+ x->int_hw_bot = x->int_count;
+ x->int_ipi_top = x->int_base;
+
+ if (x->int_ipi_top < XIVE_INT_FIRST)
+ x->int_ipi_top = XIVE_INT_FIRST;
+
+ /* Allocate a few bitmaps */
+ x->end_map = local_alloc(x->chip_id, BITMAP_BYTES(xive_end_bitmap_size(x)), PAGE_SIZE);
+ assert(x->end_map);
+ memset(x->end_map, 0, BITMAP_BYTES(xive_end_bitmap_size(x)));
+
+ /*
+ * Allocate END index 0 to make sure it cannot be used as an
+ * END base for a VP. This is the criterion to know if a VP was
+ * allocated.
+ */
+ bitmap_set_bit(*x->end_map, 0);
+
+ x->int_enabled_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_INT_COUNT), PAGE_SIZE);
+ assert(x->int_enabled_map);
+ memset(x->int_enabled_map, 0, BITMAP_BYTES(XIVE_INT_COUNT));
+ x->ipi_alloc_map = local_alloc(x->chip_id, BITMAP_BYTES(XIVE_INT_COUNT), PAGE_SIZE);
+ assert(x->ipi_alloc_map);
+ memset(x->ipi_alloc_map, 0, BITMAP_BYTES(XIVE_INT_COUNT));
+
+ xive_dbg(x, "Handling interrupts [%08x..%08x]\n",
+ x->int_base, x->int_count - 1);
+
+ /* Setup the IC BARs */
+ if (!xive_configure_ic_bars(x))
+ goto fail;
+
+ /* Some basic global inits such as page sizes etc... */
+ if (!xive_config_init(x))
+ goto fail;
+
+ /* Configure the set translations for MMIO */
+ if (!xive_setup_set_xlate(x))
+ goto fail;
+
+ /* Dump some MMIO registers for diagnostics */
+ xive_dump_mmio(x);
+
+ /* Pre-allocate a number of tables */
+ if (!xive_prealloc_tables(x))
+ goto fail;
+
+ /* Set up the BARs for the XIVE structures */
+ if (!xive_configure_bars(x))
+ goto fail;
+
+ /*
+ * Configure local tables in VSDs (forward ports will be
+ * handled later)
+ */
+ if (!xive_set_local_tables(x))
+ goto fail;
+
+ /* Register built-in source controllers (aka IPIs) */
+ flags = XIVE_SRC_EOI_PAGE1 | XIVE_SRC_TRIGGER_PAGE;
+ if (XIVE_CAN_STORE_EOI(x))
+ flags |= XIVE_SRC_STORE_EOI;
+ __xive_register_source(x, &x->ipis, x->int_base,
+ x->int_hw_bot - x->int_base, XIVE_ESB_SHIFT,
+ x->esb_base, flags, true, NULL, NULL);
+
+ /* Register escalation sources (ENDs)
+ *
+ * The ESe PQ bits are used for coalescing and the END ESB for
+ * interrupt management. The word 4&5 of the END is the EAS
+ * for the escalation source and the indexing is the same as
+ * the END.
+ *
+ * This is an OPAL primary source, IPIs are secondary.
+ */
+ __xive_register_source(x, &x->esc_irqs,
+ MAKE_ESCALATION_GIRQ(x->block_id, 0),
+ XIVE_END_COUNT, XIVE_END_SHIFT,
+ x->end_base, XIVE_SRC_EOI_PAGE1,
+ false, NULL, NULL);
+
+ return x;
+ fail:
+ xive_err(x, "Initialization failed...\n");
+
+ /* Should this be fatal ? */
+ //assert(false);
+ return NULL;
+}
+
+static void xive_reset_enable_thread(struct cpu_thread *c)
+{
+ struct proc_chip *chip = get_chip(c->chip_id);
+ struct xive *x = chip->xive;
+ uint32_t fc, bit;
+ uint64_t enable;
+
+ /* Get fused core number */
+ fc = (c->pir >> 3) & 0xf;
+
+ /* Get bit in register */
+ bit = c->pir & 0x3f;
+
+ /* Get which register to access */
+ if (fc < 8) {
+ xive_regw(x, TCTXT_EN0_RESET, PPC_BIT(bit));
+ xive_regw(x, TCTXT_EN0_SET, PPC_BIT(bit));
+
+ enable = xive_regr(x, TCTXT_EN0);
+ if (!(enable & PPC_BIT(bit)))
+ xive_cpu_err(c, "Failed to enable thread\n");
+ } else {
+ xive_regw(x, TCTXT_EN1_RESET, PPC_BIT(bit));
+ xive_regw(x, TCTXT_EN1_SET, PPC_BIT(bit));
+
+ enable = xive_regr(x, TCTXT_EN1);
+ if (!(enable & PPC_BIT(bit)))
+ xive_cpu_err(c, "Failed to enable thread\n");
+ }
+}
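+
+/*
+ * Example of the PIR decode above: PIR 0x47 gives fc = (0x47 >> 3) & 0xf
+ * = 8, so the thread is controlled through the TCTXT_EN1 registers, at
+ * bit 0x47 & 0x3f = 7.
+ */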
+
+void xive2_cpu_callin(struct cpu_thread *cpu)
+{
+ struct xive_cpu_state *xs = cpu->xstate;
+ uint8_t old_w2 __unused, w2 __unused;
+
+ if (!xs)
+ return;
+
+ /* Reset the HW thread context and enable it */
+ xive_reset_enable_thread(cpu);
+
+ /* Set VT to 1 */
+ old_w2 = in_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2);
+ out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2, 0x80);
+ w2 = in_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_WORD2);
+
+ xive_cpu_vdbg(cpu, "Initialized TIMA VP=%x/%x W01=%016llx W2=%02x->%02x\n",
+ xs->vp_blk, xs->vp_idx,
+ in_be64(xs->tm_ring1 + TM_QW3_HV_PHYS),
+ old_w2, w2);
+}
+
+#ifdef XIVE_EXTRA_CHECK_INIT_CACHE
+#define CHECK_INIT_CACHE_LOOP 0x100
+static void xive_special_cache_check(struct xive *x, uint32_t blk, uint32_t idx)
+{
+ struct xive_nvp vp = {0};
+ uint32_t i;
+
+ /*
+ * SIMICS checks the value of reserved fields
+ */
+ if (chip_quirk(QUIRK_SIMICS))
+ return;
+
+ for (i = 0; i < CHECK_INIT_CACHE_LOOP; i++) {
+ struct xive_nvp *vp_m = xive_get_vp(x, idx);
+
+ memset(vp_m, (~i) & 0xff, sizeof(*vp_m));
+ sync();
+ vp.w1 = (i << 16) | i;
+ assert(!xive_nxc_cache_update(x, blk, idx, &vp, true));
+ if (!xive_check_nxc_update(x, idx, &vp)) {
+ xive_dbg(x, "NXC update test failed at %d iterations\n", i);
+ return;
+ }
+ }
+ xive_dbg(x, "NXC update test passed for %d/0x%x\n", blk, idx);
+}
+#else
+static inline void xive_special_cache_check(struct xive *x __unused,
+ uint32_t blk __unused,
+ uint32_t idx __unused)
+{
+}
+#endif
+
+static void xive_init_cpu_exploitation(struct xive_cpu_state *xs)
+{
+ struct xive_end end;
+ struct xive_nvp vp;
+ struct xive *x_vp, *x_end;
+ int i;
+
+ /* Grab the XIVE where the VP resides. It could be different from
+ * the local chip XIVE if not using block group mode
+ */
+ x_vp = xive_from_pc_blk(xs->vp_blk);
+ assert(x_vp);
+
+ /* Grab the XIVE where the END resides. It should be the same
+ * as the VP.
+ */
+ x_end = xive_from_vc_blk(xs->end_blk);
+ assert(x_end);
+
+ xive_init_hw_end(&end);
+
+ /* Use the cache watch to update all ENDs reserved for HW VPs */
+ lock(&x_end->lock);
+ for (i = 0; i < xive_cfg_vp_prio(x_end); i++)
+ xive_endc_cache_update(x_end, xs->end_blk, xs->end_idx + i,
+ &end, true);
+ unlock(&x_end->lock);
+
+ /* Initialize/enable the VP */
+ xive_init_default_vp(&vp, xs->end_blk, xs->end_idx);
+
+ /* Use the cache watch to write it out */
+ lock(&x_vp->lock);
+ xive_special_cache_check(x_vp, xs->vp_blk, xs->vp_idx);
+ xive_nxc_cache_update(x_vp, xs->vp_blk, xs->vp_idx, &vp, true);
+ unlock(&x_vp->lock);
+}
+
+static void xive_configure_ex_special_bar(struct xive *x, struct cpu_thread *c)
+{
+ uint64_t xa, val;
+ int64_t rc;
+
+ xive_cpu_vdbg(c, "Setting up special BAR\n");
+ xa = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir), P10_NCU_SPEC_BAR);
+ val = (uint64_t)x->tm_base | P10_NCU_SPEC_BAR_ENABLE;
+ if (x->tm_shift == 16)
+ val |= P10_NCU_SPEC_BAR_256K;
+ xive_cpu_vdbg(c, "NCU_SPEC_BAR_XA[%08llx]=%016llx\n", xa, val);
+ rc = xscom_write(c->chip_id, xa, val);
+ if (rc) {
+ xive_cpu_err(c, "Failed to setup NCU_SPEC_BAR\n");
+ /* XXX what to do now ? */
+ }
+}
+
+void xive2_late_init(void)
+{
+ struct cpu_thread *c;
+
+ prlog(PR_INFO, "SLW: Configuring self-restore for NCU_SPEC_BAR\n");
+ for_each_present_cpu(c) {
+ if (cpu_is_thread0(c)) {
+ struct proc_chip *chip = get_chip(c->chip_id);
+ struct xive *x = chip->xive;
+ uint64_t xa, val, rc;
+ xa = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir), P10_NCU_SPEC_BAR);
+ val = (uint64_t)x->tm_base | P10_NCU_SPEC_BAR_ENABLE;
+ /* Bail out if wakeup engine has already failed */
+ if (wakeup_engine_state != WAKEUP_ENGINE_PRESENT) {
+ prlog(PR_ERR, "XIVE proc_stop_api fail detected\n");
+ break;
+ }
+ rc = proc_stop_save_scom((void *)chip->homer_base, xa, val,
+ PROC_STOP_SCOM_REPLACE, PROC_STOP_SECTION_L3);
+ if (rc) {
+ xive_cpu_err(c, "proc_stop_save_scom failed for NCU_SPEC_BAR rc=%lld\n",
+ rc);
+ wakeup_engine_state = WAKEUP_ENGINE_FAILED;
+ }
+ }
+ }
+}
+
+static void xive_provision_cpu(struct xive_cpu_state *xs, struct cpu_thread *c)
+{
+ struct xive *x;
+
+ /* VP ids for HW threads are pre-allocated */
+ xs->vp_blk = PIR2VP_BLK(c->pir);
+ xs->vp_idx = PIR2VP_IDX(c->pir);
+
+ /* For now we use identical block IDs for VC and PC but that might
+ * change. We allocate the ENDs on the same XIVE as the VP.
+ */
+ xs->end_blk = xs->vp_blk;
+
+ /* Grab the XIVE where the END resides. It could be different from
+ * the local chip XIVE if not using block group mode
+ */
+ x = xive_from_vc_blk(xs->end_blk);
+ assert(x);
+
+ /* Allocate a set of ENDs for that VP */
+ xs->end_idx = xive_alloc_end_set(x, true);
+ assert(!XIVE_ALLOC_IS_ERR(xs->end_idx));
+}
+
+static void xive_init_cpu(struct cpu_thread *c)
+{
+ struct proc_chip *chip = get_chip(c->chip_id);
+ struct xive *x = chip->xive;
+ struct xive_cpu_state *xs;
+
+ if (!x)
+ return;
+
+ /*
+ * Each core pair (EX) needs this special BAR setup to have the
+ * right powerbus cycle for the TM area (as it has the same address
+ * on all chips so it's somewhat special).
+ *
+ * Because we don't want to bother trying to figure out which core
+ * of a pair is present we just do the setup for each of them, which
+ * is harmless.
+ */
+ if (cpu_is_thread0(c) || cpu_is_core_chiplet_primary(c))
+ xive_configure_ex_special_bar(x, c);
+
+ /* Initialize the state structure */
+ c->xstate = xs = local_alloc(c->chip_id, sizeof(struct xive_cpu_state), 1);
+ assert(xs);
+ memset(xs, 0, sizeof(struct xive_cpu_state));
+ xs->xive = x;
+
+ init_lock(&xs->lock);
+
+ /* Shortcut to TM HV ring */
+ xs->tm_ring1 = x->tm_base + (1u << x->tm_shift);
+
+ /* Provision a VP id and some ENDs for a HW thread */
+ xive_provision_cpu(xs, c);
+
+ xive_init_cpu_exploitation(xs);
+}
+
+static uint64_t xive_convert_irq_flags(uint64_t iflags)
+{
+ uint64_t oflags = 0;
+
+ if (iflags & XIVE_SRC_STORE_EOI)
+ oflags |= OPAL_XIVE_IRQ_STORE_EOI2;
+
+ /* OPAL_XIVE_IRQ_TRIGGER_PAGE is only meant to be set if
+ * the interrupt has a *separate* trigger page.
+ */
+ if ((iflags & XIVE_SRC_EOI_PAGE1) &&
+ (iflags & XIVE_SRC_TRIGGER_PAGE))
+ oflags |= OPAL_XIVE_IRQ_TRIGGER_PAGE;
+
+ if (iflags & XIVE_SRC_LSI)
+ oflags |= OPAL_XIVE_IRQ_LSI;
+
+ return oflags;
+}
+
+static int64_t opal_xive_get_irq_info(uint32_t girq,
+ beint64_t *out_flags,
+ beint64_t *out_eoi_page,
+ beint64_t *out_trig_page,
+ beint32_t *out_esb_shift,
+ beint32_t *out_src_chip)
+{
+ struct irq_source *is = irq_find_source(girq);
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ uint32_t idx;
+ uint64_t mm_base;
+ uint64_t eoi_page = 0, trig_page = 0;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (is == NULL || out_flags == NULL)
+ return OPAL_PARAMETER;
+ assert(is->ops == &xive_irq_source_ops);
+
+ if (out_flags)
+ *out_flags = cpu_to_be64(xive_convert_irq_flags(s->flags));
+
+ idx = girq - s->esb_base;
+
+ if (out_esb_shift)
+ *out_esb_shift = cpu_to_be32(s->esb_shift);
+
+ mm_base = (uint64_t)s->esb_mmio + (1ull << s->esb_shift) * idx;
+
+ /* The EOI page can either be the first or second page */
+ if (s->flags & XIVE_SRC_EOI_PAGE1) {
+ uint64_t p1off = 1ull << (s->esb_shift - 1);
+ eoi_page = mm_base + p1off;
+ } else
+ eoi_page = mm_base;
+
+ /* The trigger page, if it exists, is always the first page */
+ if (s->flags & XIVE_SRC_TRIGGER_PAGE)
+ trig_page = mm_base;
+
+ if (out_eoi_page)
+ *out_eoi_page = cpu_to_be64(eoi_page);
+ if (out_trig_page)
+ *out_trig_page = cpu_to_be64(trig_page);
+ if (out_src_chip)
+ *out_src_chip = cpu_to_be32(GIRQ_TO_CHIP(girq));
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_get_irq_config(uint32_t girq,
+ beint64_t *out_vp,
+ uint8_t *out_prio,
+ beint32_t *out_lirq)
+{
+ uint32_t vp;
+ uint32_t lirq;
+ uint8_t prio;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (xive_get_irq_targetting(girq, &vp, &prio, &lirq)) {
+ *out_vp = cpu_to_be64(vp);
+ *out_prio = prio;
+ *out_lirq = cpu_to_be32(lirq);
+ return OPAL_SUCCESS;
+ } else
+ return OPAL_PARAMETER;
+}
+
+static int64_t opal_xive_set_irq_config(uint32_t girq,
+ uint64_t vp,
+ uint8_t prio,
+ uint32_t lirq)
+{
+ /*
+ * This variant is meant for a XIVE-aware OS, thus it will
+ * *not* affect the ESB state of the interrupt. If used with
+ * a prio of FF, the EAS will be masked. In that case the
+ * races have to be handled by the OS.
+ */
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ return xive_set_irq_config(girq, vp, prio, lirq, false);
+}
+
+static int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio,
+ beint64_t *out_qpage,
+ beint64_t *out_qsize,
+ beint64_t *out_qeoi_page,
+ beint32_t *out_escalate_irq,
+ beint64_t *out_qflags)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_end *end;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!xive_end_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ end = xive_get_end(x, idx);
+ if (!end)
+ return OPAL_PARAMETER;
+
+ if (out_escalate_irq) {
+ uint32_t esc_idx = idx;
+
+ /* If escalations are routed to a single queue, fix up
+ * the escalation interrupt number here.
+ */
+ if (xive_get_field32(END_W0_UNCOND_ESCALATE, end->w0))
+ esc_idx |= xive_escalation_prio(x);
+ *out_escalate_irq =
+ cpu_to_be32(MAKE_ESCALATION_GIRQ(blk, esc_idx));
+ }
+
+ /* If this is a single-escalation gather queue, that's all
+ * there is to return
+ */
+ if (xive_get_field32(END_W0_SILENT_ESCALATE, end->w0)) {
+ if (out_qflags)
+ *out_qflags = 0;
+ if (out_qpage)
+ *out_qpage = 0;
+ if (out_qsize)
+ *out_qsize = 0;
+ if (out_qeoi_page)
+ *out_qeoi_page = 0;
+ return OPAL_SUCCESS;
+ }
+
+ if (out_qpage) {
+ if (xive_get_field32(END_W0_ENQUEUE, end->w0))
+ *out_qpage = cpu_to_be64(
+ ((uint64_t)xive_get_field32(END_W2_EQ_ADDR_HI, end->w2) << 32) |
+ xive_get_field32(END_W3_EQ_ADDR_LO, end->w3));
+ else
+ *out_qpage = 0;
+ }
+ if (out_qsize) {
+ if (xive_get_field32(END_W0_ENQUEUE, end->w0))
+ *out_qsize = cpu_to_be64(xive_get_field32(END_W3_QSIZE, end->w3) + 12);
+ else
+ *out_qsize = 0;
+ }
+ if (out_qeoi_page) {
+ *out_qeoi_page = cpu_to_be64(
+ (uint64_t)x->end_base + idx * XIVE_ESB_PAGE_SIZE);
+ }
+ if (out_qflags) {
+ *out_qflags = 0;
+ if (xive_get_field32(END_W0_VALID, end->w0))
+ *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ENABLED);
+ if (xive_get_field32(END_W0_UCOND_NOTIFY, end->w0))
+ *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ALWAYS_NOTIFY);
+ if (xive_get_field32(END_W0_ESCALATE_CTL, end->w0))
+ *out_qflags |= cpu_to_be64(OPAL_XIVE_EQ_ESCALATE);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static void xive_cleanup_end(struct xive_end *end)
+{
+ end->w0 = xive_set_field32(END_W0_FIRMWARE1, 0, xive_end_is_firmware1(end));
+ end->w1 = xive_set_field32(END_W1_ESe_Q, 0, 1) |
+ xive_set_field32(END_W1_ESn_Q, 0, 1);
+ end->w2 = end->w3 = end->w4 = end->w5 = end->w6 = end->w7 = 0;
+}
+
+static int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio,
+ uint64_t qpage,
+ uint64_t qsize,
+ uint64_t qflags)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_end *old_end;
+ struct xive_end end;
+ uint32_t vp_blk, vp_idx;
+ bool group;
+ int64_t rc;
+
+ if (!xive_end_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ old_end = xive_get_end(x, idx);
+ if (!old_end)
+ return OPAL_PARAMETER;
+
+ /* If this is a silent escalation queue, it cannot be
+ * configured directly
+ */
+ if (xive_get_field32(END_W0_SILENT_ESCALATE, old_end->w0))
+ return OPAL_PARAMETER;
+
+ /* This shouldn't fail or xive_end_for_target would have
+ * failed already
+ */
+ if (!xive_decode_vp(vp, &vp_blk, &vp_idx, NULL, &group))
+ return OPAL_PARAMETER;
+
+ /*
+ * Make a local copy which we will later try to commit using
+ * the cache watch facility
+ */
+ end = *old_end;
+
+ if (qflags & OPAL_XIVE_EQ_ENABLED) {
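+		/*
+		 * qsize is log2 of the queue size in bytes (e.g. 16 for a
+		 * 64kB queue); the END encodes it biased by 12, i.e. in
+		 * units of 4kB pages, matching the "+ 12" decoding in
+		 * opal_xive_get_queue_info().
+		 */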
+ switch(qsize) {
+ /* Supported sizes */
+ case 12:
+ case 16:
+ case 21:
+ case 24:
+ end.w3 = cpu_to_be32(qpage & END_W3_EQ_ADDR_LO);
+ end.w2 = cpu_to_be32((qpage >> 32) & END_W2_EQ_ADDR_HI);
+ end.w3 = xive_set_field32(END_W3_QSIZE, end.w3, qsize - 12);
+ end.w0 = xive_set_field32(END_W0_ENQUEUE, end.w0, 1);
+ break;
+ case 0:
+ end.w2 = end.w3 = 0;
+ end.w0 = xive_set_field32(END_W0_ENQUEUE, end.w0, 0);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+		/* Ensure the priority and target are correctly set (they will
+		 * not be right after allocation)
+ */
+ end.w6 = xive_set_field32(END_W6_VP_BLOCK, 0, vp_blk) |
+ xive_set_field32(END_W6_VP_OFFSET, 0, vp_idx);
+ end.w7 = xive_set_field32(END_W7_F0_PRIORITY, 0, prio);
+ /* XXX Handle group i bit when needed */
+
+ /* Always notify flag */
+ if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
+ end.w0 = xive_set_field32(END_W0_UCOND_NOTIFY, end.w0, 1);
+ else
+ end.w0 = xive_set_field32(END_W0_UCOND_NOTIFY, end.w0, 0);
+
+ /* Escalation flag */
+ if (qflags & OPAL_XIVE_EQ_ESCALATE)
+ end.w0 = xive_set_field32(END_W0_ESCALATE_CTL, end.w0, 1);
+ else
+ end.w0 = xive_set_field32(END_W0_ESCALATE_CTL, end.w0, 0);
+
+ /* Unconditionally clear the current queue pointer, set
+ * generation to 1 and disable escalation interrupts.
+ */
+ end.w1 = xive_set_field32(END_W1_GENERATION, 0, 1) |
+ xive_set_field32(END_W1_ES, 0, xive_get_field32(END_W1_ES, old_end->w1));
+
+ /* Enable. We always enable backlog for an enabled queue
+ * otherwise escalations won't work.
+ */
+ end.w0 = xive_set_field32(END_W0_VALID, end.w0, 1);
+ end.w0 = xive_set_field32(END_W0_BACKLOG, end.w0, 1);
+ } else
+ xive_cleanup_end(&end);
+
+ /* Update END, non-synchronous */
+ lock(&x->lock);
+ rc = xive_endc_cache_update(x, blk, idx, &end, false);
+ unlock(&x->lock);
+
+ return rc;
+}
+
+static int64_t opal_xive_get_queue_state(uint64_t vp, uint32_t prio,
+ beint32_t *out_qtoggle,
+ beint32_t *out_qindex)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_end *end;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!out_qtoggle || !out_qindex ||
+ !xive_end_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ end = xive_get_end(x, idx);
+ if (!end)
+ return OPAL_PARAMETER;
+
+ /* Scrub the queue */
+ lock(&x->lock);
+ rc = xive_endc_scrub(x, blk, idx);
+ unlock(&x->lock);
+ if (rc)
+ return rc;
+
+	/* We don't support disabled queues */
+ if (!xive_get_field32(END_W0_VALID, end->w0))
+ return OPAL_WRONG_STATE;
+
+ *out_qtoggle = cpu_to_be32(xive_get_field32(END_W1_GENERATION, end->w1));
+ *out_qindex = cpu_to_be32(xive_get_field32(END_W1_PAGE_OFF, end->w1));
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_set_queue_state(uint64_t vp, uint32_t prio,
+ uint32_t qtoggle, uint32_t qindex)
+{
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_end *end, new_end;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!xive_end_for_target(vp, prio, &blk, &idx))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+
+ end = xive_get_end(x, idx);
+ if (!end)
+ return OPAL_PARAMETER;
+
+	/* We don't support disabled queues */
+ if (!xive_get_field32(END_W0_VALID, end->w0))
+ return OPAL_WRONG_STATE;
+
+ new_end = *end;
+
+ new_end.w1 = xive_set_field32(END_W1_GENERATION, new_end.w1, qtoggle);
+ new_end.w1 = xive_set_field32(END_W1_PAGE_OFF, new_end.w1, qindex);
+
+ lock(&x->lock);
+ rc = xive_endc_cache_update(x, blk, idx, &new_end, false);
+ unlock(&x->lock);
+
+ return rc;
+}
+
+static int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr)
+{
+ struct proc_chip *c = get_chip(chip_id);
+ struct list_node *n;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (!c)
+ return OPAL_PARAMETER;
+ if (!c->xive)
+ return OPAL_PARAMETER;
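+	/* Donated pages must be 64kB aligned (low 16 address bits clear) */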
+ if (addr & 0xffff)
+ return OPAL_PARAMETER;
+
+ n = (struct list_node *)addr;
+ lock(&c->xive->lock);
+ list_add(&c->xive->donated_pages, n);
+ unlock(&c->xive->lock);
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_get_vp_info(uint64_t vp_id,
+ beint64_t *out_flags,
+ beint64_t *out_cam_value,
+ beint64_t *out_report_cl_pair,
+ beint32_t *out_chip_id)
+{
+ struct xive *x;
+ struct xive_nvp *vp;
+ uint32_t blk, idx;
+ bool group;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, &group))
+ return OPAL_PARAMETER;
+ /* We don't do groups yet */
+ if (group)
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+
+ if (out_flags) {
+ uint32_t end_blk, end_idx;
+ struct xive_end *end;
+ struct xive *end_x;
+ *out_flags = 0;
+
+ /*
+		 * We would like a way to stash a SW bit in the VP
+ * to know whether silent escalation is enabled or
+ * not, but unlike what happens with ENDs, the PC
+ * cache watch doesn't implement the reserved bit in
+ * the VPs... so we have to go look at END 7 instead.
+ */
+
+ /* Grab END for prio 7 to check for silent escalation */
+ if (!xive_end_for_target(vp_id, xive_escalation_prio(x),
+ &end_blk, &end_idx))
+ return OPAL_PARAMETER;
+
+ end_x = xive_from_vc_blk(end_blk);
+ if (!end_x)
+ return OPAL_PARAMETER;
+
+ end = xive_get_end(x, end_idx);
+ if (!end)
+ return OPAL_PARAMETER;
+ if (xive_get_field32(NVP_W0_VALID, vp->w0))
+ *out_flags |= cpu_to_be64(OPAL_XIVE_VP_ENABLED);
+ if (xive_cfg_save_restore(x))
+ *out_flags |= cpu_to_be64(OPAL_XIVE_VP_SAVE_RESTORE);
+ if (xive_get_field32(END_W0_SILENT_ESCALATE, end->w0))
+ *out_flags |= cpu_to_be64(OPAL_XIVE_VP_SINGLE_ESCALATION);
+ }
+
+ if (out_cam_value) {
+ uint64_t cam_value;
+
+ cam_value = (blk << x->vp_shift) | idx;
+
+ /*
+ * If save-restore is enabled, force the CAM line
+ * value with the H bit.
+ */
+ if (xive_cfg_save_restore(x))
+ cam_value |= TM10_QW1W2_HO;
+
+ *out_cam_value = cpu_to_be64(cam_value);
+ }
+
+ if (out_report_cl_pair) {
+ uint64_t report_cl_pair;
+
+ report_cl_pair = ((uint64_t)(be32_to_cpu(vp->w6) & 0x0fffffff)) << 32;
+ report_cl_pair |= be32_to_cpu(vp->w7) & 0xffffff00;
+
+ *out_report_cl_pair = cpu_to_be64(report_cl_pair);
+ }
+
+ if (out_chip_id)
+ *out_chip_id = cpu_to_be32(xive_block_to_chip[blk]);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t xive_setup_silent_gather(uint64_t vp_id, bool enable)
+{
+ uint32_t blk, idx, i;
+ struct xive_end *end_orig;
+ struct xive_end end;
+ struct xive *x;
+ int64_t rc;
+
+ /* Get base END block */
+ if (!xive_end_for_target(vp_id, 0, &blk, &idx)) {
+ prlog(PR_ERR, "%s: Invalid VP 0x%08llx\n", __func__, vp_id);
+ return OPAL_PARAMETER;
+ }
+ x = xive_from_vc_blk(blk);
+ if (!x) {
+ prlog(PR_ERR, "%s: VP 0x%08llx has invalid block %d\n", __func__,
+ vp_id, blk);
+ return OPAL_PARAMETER;
+ }
+
+ /* Grab prio 7 */
+ end_orig = xive_get_end(x, idx + xive_escalation_prio(x));
+ if (!end_orig) {
+ xive_err(x, "Failed to get silent gather END 0x%x for VP 0x%08llx\n",
+ idx + xive_escalation_prio(x), vp_id);
+ return OPAL_PARAMETER;
+ }
+
+ /* If trying to enable silent gather, make sure prio 7 is not
+ * already enabled as a normal queue
+ */
+ if (enable && xive_get_field32(END_W0_VALID, end_orig->w0) &&
+ !xive_get_field32(END_W0_SILENT_ESCALATE, end_orig->w0)) {
+ xive_err(x, "silent gather END 0x%x already in use\n",
+ idx + xive_escalation_prio(x));
+ return OPAL_PARAMETER;
+ }
+
+ end = *end_orig;
+
+ if (enable) {
+ /* W0: Enabled and "s" set, no other bit */
+ end.w0 = xive_set_field32(END_W0_FIRMWARE1, end.w0, 0);
+ end.w0 = xive_set_field32(END_W0_VALID, end.w0, 1);
+ end.w0 = xive_set_field32(END_W0_SILENT_ESCALATE, end.w0, 1);
+ end.w0 = xive_set_field32(END_W0_ESCALATE_CTL, end.w0, 1);
+ end.w0 = xive_set_field32(END_W0_BACKLOG, end.w0, 1);
+
+ /* Set new "N" for END escalation (vs. ESB) */
+ end.w0 = xive_set_field32(END_W0_ESCALATE_END, end.w0, 1);
+
+ /* W1: Mark ESn as 01, ESe as 00 */
+ end.w1 = xive_set_field32(END_W1_ESn_P, end.w1, 0);
+ end.w1 = xive_set_field32(END_W1_ESn_Q, end.w1, 1);
+ end.w1 = xive_set_field32(END_W1_ESe, end.w1, 0);
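+		/*
+		 * In ESB terms this presumably leaves the notification ESB
+		 * (ESn) in the "off" state (PQ=01) and the escalation ESB
+		 * (ESe) in the "reset" state (PQ=00).
+		 */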
+ } else if (xive_get_field32(END_W0_SILENT_ESCALATE, end.w0))
+ xive_cleanup_end(&end);
+
+ if (!memcmp(end_orig, &end, sizeof(end)))
+ rc = 0;
+ else
+ rc = xive_endc_cache_update(x, blk, idx + xive_escalation_prio(x),
+ &end, false);
+ if (rc)
+ return rc;
+
+ /* Mark/unmark all other prios with the new "u" bit and update
+ * escalation
+ */
+ for (i = 0; i < xive_cfg_vp_prio(x); i++) {
+ if (i == xive_escalation_prio(x))
+ continue;
+ end_orig = xive_get_end(x, idx + i);
+ if (!end_orig)
+ continue;
+ end = *end_orig;
+ if (enable) {
+ /* Set "u" bit */
+ end.w0 = xive_set_field32(END_W0_UNCOND_ESCALATE, end.w0, 1);
+
+ /* Set new "N" for END escalation (vs. ESB) */
+ /* TODO (Gen2+) : use ESB escalation configuration */
+ end.w0 = xive_set_field32(END_W0_ESCALATE_END, end.w0, 1);
+
+ /* Re-route escalation interrupt (previous
+ * route is lost !) to the gather queue
+ */
+ end.w4 = xive_set_field32(END_W4_END_BLOCK, end.w4, blk);
+ end.w4 = xive_set_field32(END_W4_ESC_END_INDEX,
+ end.w4, idx + xive_escalation_prio(x));
+ } else if (xive_get_field32(END_W0_UNCOND_ESCALATE, end.w0)) {
+ /* Clear the "u" bit, disable escalations if it was set */
+ end.w0 = xive_set_field32(END_W0_UNCOND_ESCALATE, end.w0, 0);
+ end.w0 = xive_set_field32(END_W0_ESCALATE_CTL, end.w0, 0);
+ }
+ if (!memcmp(end_orig, &end, sizeof(end)))
+ continue;
+ rc = xive_endc_cache_update(x, blk, idx + i, &end, false);
+ if (rc)
+ break;
+ }
+
+ return rc;
+}
+
+static int64_t opal_xive_set_vp_info(uint64_t vp_id,
+ uint64_t flags,
+ uint64_t report_cl_pair)
+{
+ struct xive *x;
+ struct xive_nvp *vp, vp_new;
+ uint32_t blk, idx;
+ bool group;
+ int64_t rc;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, &group))
+ return OPAL_PARAMETER;
+ /* We don't do groups yet */
+ if (group)
+ return OPAL_PARAMETER;
+ if (report_cl_pair & 0xff)
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+
+ /* Consistency check. */
+ if ((flags & OPAL_XIVE_VP_SAVE_RESTORE) && !xive_cfg_save_restore(x))
+ return OPAL_PARAMETER;
+
+ lock(&x->lock);
+
+ vp_new = *vp;
+ if (flags & OPAL_XIVE_VP_ENABLED) {
+ vp_new.w0 = xive_set_field32(NVP_W0_VALID, vp_new.w0, 1);
+ vp_new.w6 = cpu_to_be32(report_cl_pair >> 32);
+ vp_new.w7 = cpu_to_be32(report_cl_pair & 0xffffffff);
+
+ if (flags & OPAL_XIVE_VP_SINGLE_ESCALATION)
+ rc = xive_setup_silent_gather(vp_id, true);
+ else
+ rc = xive_setup_silent_gather(vp_id, false);
+
+ /*
+ * Prepare NVP to be HW owned for automatic save-restore
+ */
+ if (xive_cfg_save_restore(x)) {
+ /*
+ * Set NVP privilege level. Default to OS.
+ * This check only makes sense for KVM guests
+ * currently. We would need an extra flag to
+ * distinguish from pool level.
+ */
+ vp_new.w0 = xive_set_field32(NVP_W0_VPRIV, vp_new.w0, 0);
+
+ vp_new.w2 = xive_set_field32(NVP_W2_CPPR, vp_new.w2, 0xFF);
+ vp_new.w0 = xive_set_field32(NVP_W0_HW, vp_new.w0, 1);
+ }
+ } else {
+ /*
+ * TODO (kvm): disabling a VP invalidates the associated ENDs.
+ *
+ * The loads then return all 1s which can be an issue for the
+ * Linux code to handle.
+ */
+
+ vp_new.w0 = vp_new.w6 = vp_new.w7 = 0;
+ rc = xive_setup_silent_gather(vp_id, false);
+ }
+
+ if (rc) {
+ if (rc != OPAL_BUSY)
+ xive_dbg(x, "Silent gather setup failed with err %lld\n", rc);
+ goto bail;
+ }
+
+ rc = xive_nxc_cache_update(x, blk, idx, &vp_new, false);
+ if (rc)
+ goto bail;
+
+ /* When disabling, we scrub clean (invalidate the entry) so
+ * we can avoid cache ops in alloc/free
+ */
+ if (!(flags & OPAL_XIVE_VP_ENABLED))
+ xive_nxc_scrub_clean(x, blk, idx);
+
+bail:
+ unlock(&x->lock);
+ return rc;
+}
+
+static int64_t opal_xive_get_vp_state(uint64_t vp_id, beint64_t *out_state)
+{
+ struct xive *x;
+ struct xive_nvp *vp;
+ uint32_t blk, idx;
+ int64_t rc;
+ bool group;
+
+ if (!out_state || !xive_decode_vp(vp_id, &blk, &idx, NULL, &group))
+ return OPAL_PARAMETER;
+ if (group)
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+
+ /* Scrub the vp */
+ lock(&x->lock);
+ rc = xive_nxc_scrub(x, blk, idx);
+ unlock(&x->lock);
+ if (rc)
+ return rc;
+
+ if (!xive_get_field32(NVP_W0_VALID, vp->w0))
+ return OPAL_WRONG_STATE;
+
+	/*
+	 * Return a state matching the layout of words 0-1 of the TIMA,
+	 * as this is what the current implementation expects.
+	 */
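+	/*
+	 * Resulting bit layout: CPPR in bits [55:48], IPB in [47:40],
+	 * LSMFB in [39:32]; the NSR byte [63:56] and the low word are
+	 * returned as zero.
+	 */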
+ *out_state = cpu_to_be64(((uint64_t) 0x0) << 54 |
+ (uint64_t)xive_get_field32(NVP_W2_CPPR, vp->w2) << 48 |
+ (uint64_t)xive_get_field32(NVP_W2_IPB, vp->w2) << 40 |
+ (uint64_t)xive_get_field32(NVP_W2_LSMFB, vp->w2) << 32);
+
+ return OPAL_SUCCESS;
+}
+
+static void *xive_cpu_get_tima(struct cpu_thread *c)
+{
+ struct xive_cpu_state *xs = c->xstate;
+ struct xive *x = xs->xive;
+
+ return x->ic_tm_direct_base + ((c->pir & 0xff) << x->ic_shift);
+}
+
+static void xive_cleanup_cpu_tima(struct cpu_thread *c)
+{
+ struct xive_cpu_state *xs __unused = c->xstate;
+ void *cpu_tm_base = xive_cpu_get_tima(c);
+ uint8_t old_w2 __unused, w2 __unused;
+
+ /* Reset the HW context */
+ xive_reset_enable_thread(c);
+
+ /* Set VT to 1 */
+ old_w2 = in_8(cpu_tm_base + TM_QW3_HV_PHYS + TM_WORD2);
+ out_8(cpu_tm_base + TM_QW3_HV_PHYS + TM_WORD2, 0x80);
+ w2 = in_8(cpu_tm_base + TM_QW3_HV_PHYS + TM_WORD2);
+
+ /* Dump HV state */
+ xive_cpu_vdbg(c, "[reset] VP TIMA VP=%x/%x W01=%016llx W2=%02x->%02x\n",
+ xs->vp_blk, xs->vp_idx,
+ in_be64(cpu_tm_base + TM_QW3_HV_PHYS),
+ old_w2, w2);
+}
+
+static int64_t xive_vc_ind_cache_kill(struct xive *x, uint64_t type)
+{
+ uint64_t val;
+
+ /* We clear the whole thing */
+ xive_regw(x, VC_AT_MACRO_KILL_MASK, 0);
+ xive_regw(x, VC_AT_MACRO_KILL, VC_AT_MACRO_KILL_VALID |
+ SETFIELD(VC_AT_MACRO_KILL_VSD, 0ull, type));
+
+ /* XXX Add timeout */
+ for (;;) {
+ val = xive_regr(x, VC_AT_MACRO_KILL);
+ if (!(val & VC_AT_MACRO_KILL_VALID))
+ break;
+ }
+ return 0;
+}
+
+static int64_t xive_pc_ind_cache_kill(struct xive *x)
+{
+ uint64_t val;
+
+ /* We clear the whole thing */
+ xive_regw(x, PC_AT_KILL_MASK, 0);
+ xive_regw(x, PC_AT_KILL, PC_AT_KILL_VALID |
+ SETFIELD(VC_AT_MACRO_KILL_VSD, 0ull, VST_NVP));
+
+ /* XXX Add timeout */
+ for (;;) {
+ val = xive_regr(x, PC_AT_KILL);
+ if (!(val & PC_AT_KILL_VALID))
+ break;
+ }
+ return 0;
+}
+
+static void xive_cleanup_vp_ind(struct xive *x)
+{
+ int i;
+
+ xive_dbg(x, "Cleaning up %d VP ind entries...\n", x->vp_ind_count);
+ for (i = 0; i < x->vp_ind_count; i++) {
+ if (be64_to_cpu(x->vp_ind_base[i]) & VSD_FIRMWARE) {
+ xive_dbg(x, " %04x ... skip (firmware)\n", i);
+ continue;
+ }
+ if (x->vp_ind_base[i] != 0) {
+ x->vp_ind_base[i] = 0;
+ xive_dbg(x, " %04x ... cleaned\n", i);
+ }
+ }
+ xive_pc_ind_cache_kill(x);
+}
+
+static void xive_cleanup_end_ind(struct xive *x)
+{
+ int i;
+
+ xive_dbg(x, "Cleaning up %d END ind entries...\n", x->end_ind_count);
+ for (i = 0; i < x->end_ind_count; i++) {
+ if (be64_to_cpu(x->end_ind_base[i]) & VSD_FIRMWARE) {
+ xive_dbg(x, " %04x ... skip (firmware)\n", i);
+ continue;
+ }
+ if (x->end_ind_base[i] != 0) {
+ x->end_ind_base[i] = 0;
+ xive_dbg(x, " %04x ... cleaned\n", i);
+ }
+ }
+ xive_vc_ind_cache_kill(x, VST_END);
+}
+
+static void xive_reset_one(struct xive *x)
+{
+ struct cpu_thread *c;
+ bool end_firmware;
+ int i;
+
+ xive_notice(x, "Resetting one xive...\n");
+
+ lock(&x->lock);
+
+ /* Check all interrupts are disabled */
+ i = bitmap_find_one_bit(*x->int_enabled_map, 0, XIVE_INT_COUNT);
+ if (i >= 0)
+ xive_warn(x, "Interrupt %d (and maybe more) not disabled"
+ " at reset !\n", i);
+
+ /* Reset IPI allocation */
+ xive_dbg(x, "freeing alloc map %p/%p\n",
+ x->ipi_alloc_map, *x->ipi_alloc_map);
+ memset(x->ipi_alloc_map, 0, BITMAP_BYTES(XIVE_INT_COUNT));
+
+ xive_dbg(x, "Resetting ENDs...\n");
+
+ /* Reset all allocated ENDs and free the user ones */
+ bitmap_for_each_one(*x->end_map, xive_end_bitmap_size(x), i) {
+ struct xive_end end0;
+ struct xive_end *end;
+ int j;
+
+ if (i == 0)
+ continue;
+ end_firmware = false;
+ for (j = 0; j < xive_cfg_vp_prio(x); j++) {
+ uint32_t idx = (i << xive_cfg_vp_prio_shift(x)) | j;
+
+ end = xive_get_end(x, idx);
+ if (!end)
+ continue;
+
+ /* We need to preserve the firmware bit, otherwise
+ * we will incorrectly free the ENDs that are reserved
+ * for the physical CPUs
+ */
+ if (xive_get_field32(END_W0_VALID, end->w0)) {
+ if (!xive_end_is_firmware1(end))
+ xive_dbg(x, "END 0x%x:0x%x is valid at reset: %08x %08x\n",
+ x->block_id, idx, end->w0, end->w1);
+ end0 = *end;
+ xive_cleanup_end(&end0);
+ xive_endc_cache_update(x, x->block_id, idx, &end0, true);
+ }
+ if (xive_end_is_firmware1(end))
+ end_firmware = true;
+ }
+ if (!end_firmware)
+ bitmap_clr_bit(*x->end_map, i);
+ }
+
+ /* Take out all VPs from HW and reset all CPPRs to 0 */
+ for_each_present_cpu(c) {
+ if (c->chip_id != x->chip_id)
+ continue;
+ if (!c->xstate)
+ continue;
+ xive_cleanup_cpu_tima(c);
+ }
+
+	/* Reset all user-allocated VPs. This is inefficient; we should
+	 * either keep a bitmap of allocated VPs or add an iterator to
+	 * the buddy, which is trickier but doable.
+ */
+ for (i = 0; i < XIVE_VP_COUNT(x); i++) {
+ struct xive_nvp *vp;
+ struct xive_nvp vp0 = {0};
+
+ /* Ignore the physical CPU VPs */
+		if (i >= xive_hw_vp_base &&
+ i < (xive_hw_vp_base + xive_hw_vp_count))
+ continue;
+
+ /* Is the VP valid ? */
+ vp = xive_get_vp(x, i);
+ if (!vp || !xive_get_field32(NVP_W0_VALID, vp->w0))
+ continue;
+
+ /* Clear it */
+ xive_dbg(x, "VP 0x%x:0x%x is valid at reset\n", x->block_id, i);
+ xive_nxc_cache_update(x, x->block_id, i, &vp0, true);
+ }
+
+ /* Forget about remaining donated pages */
+ list_head_init(&x->donated_pages);
+
+ /* And cleanup donated indirect VP and END pages */
+ xive_cleanup_vp_ind(x);
+ xive_cleanup_end_ind(x);
+
+ /* The rest must not be called with the lock held */
+ unlock(&x->lock);
+
+ /* Re-configure VPs */
+ for_each_present_cpu(c) {
+ struct xive_cpu_state *xs = c->xstate;
+
+ if (c->chip_id != x->chip_id || !xs)
+ continue;
+
+ xive_init_cpu_exploitation(xs);
+ }
+}
+
+static void xive_reset_mask_source_cb(struct irq_source *is,
+ void *data __unused)
+{
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ struct xive *x;
+ uint32_t isn;
+
+ if (is->ops != &xive_irq_source_ops)
+ return;
+
+ /* Skip escalation sources */
+ if (GIRQ_IS_ESCALATION(is->start))
+ return;
+
+ x = s->xive;
+
+ /* Iterate all interrupts */
+ for (isn = is->start; isn < is->end; isn++) {
+ /* Has it ever been enabled ? */
+ if (!bitmap_tst_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn)))
+ continue;
+ /* Mask it and clear the enabled map bit */
+ xive_vdbg(x, "[reset] disabling source 0x%x\n", isn);
+ __xive_set_irq_config(is, isn, 0, 0xff, isn, true, false);
+ bitmap_clr_bit(*x->int_enabled_map, GIRQ_TO_IDX(isn));
+ }
+}
+
+void xive2_cpu_reset(void)
+{
+ struct cpu_thread *c = this_cpu();
+ struct xive_cpu_state *xs = c->xstate;
+
+ out_8(xs->tm_ring1 + TM_QW3_HV_PHYS + TM_CPPR, 0);
+
+ in_be64(xs->tm_ring1 + TM_SPC_PULL_POOL_CTX);
+}
+
+static int64_t __xive_reset(uint64_t mode)
+{
+ struct proc_chip *chip;
+
+ xive_mode = mode;
+
+ /* Mask all interrupt sources */
+ irq_for_each_source(xive_reset_mask_source_cb, NULL);
+
+ /* For each XIVE do a sync... */
+ for_each_chip(chip) {
+ if (!chip->xive)
+ continue;
+ xive_sync(chip->xive);
+ }
+
+ /* For each XIVE reset everything else... */
+ for_each_chip(chip) {
+ if (!chip->xive)
+ continue;
+ xive_reset_one(chip->xive);
+ }
+
+ /* Cleanup global VP allocator */
+ buddy_reset(xive_vp_buddy);
+
+ /*
+ * We reserve the whole range of VP ids for HW threads.
+ */
+ assert(buddy_reserve(xive_vp_buddy, xive_hw_vp_base, xive_threadid_shift));
+
+ return OPAL_SUCCESS;
+}
+
+/* Called by fast reboot */
+int64_t xive2_reset(void)
+{
+ if (xive_mode == XIVE_MODE_NONE)
+ return OPAL_SUCCESS;
+ return __xive_reset(XIVE_MODE_EXPL);
+}
+
+static int64_t opal_xive_reset(uint64_t mode)
+{
+ prlog(PR_DEBUG, "XIVE reset. mode = %llx\n", mode);
+
+ if (!(mode & XIVE_MODE_EXPL)) {
+ prlog(PR_NOTICE, "No emulation mode. XIVE exploitation mode "
+ "is the default\n");
+ }
+
+ xive_expl_options = mode & ~XIVE_MODE_EXPL;
+ if (xive_expl_options & ~XIVE_EXPL_ALL_OPTIONS) {
+ prerror("invalid XIVE exploitation mode option %016llx\n",
+ xive_expl_options);
+ return OPAL_PARAMETER;
+ }
+
+ return __xive_reset(XIVE_MODE_EXPL);
+}
+
+static int64_t opal_xive_free_vp_block(uint64_t vp_base)
+{
+ uint32_t blk, idx, i, j, count;
+ uint8_t order;
+ bool group;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (!xive_decode_vp(vp_base, &blk, &idx, &order, &group))
+ return OPAL_PARAMETER;
+ if (group)
+ return OPAL_PARAMETER;
+ if (blk)
+ return OPAL_PARAMETER;
+ if (order < (xive_chips_alloc_bits + 1))
+ return OPAL_PARAMETER;
+ if (idx & ((1 << (order - xive_chips_alloc_bits)) - 1))
+ return OPAL_PARAMETER;
+
+ count = 1 << order;
+ for (i = 0; i < count; i++) {
+ uint32_t vp_id = vp_base + i;
+ uint32_t blk, idx, end_blk, end_idx;
+ struct xive *x;
+ struct xive_nvp *vp;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, NULL)) {
+ prerror("Couldn't decode VP id %u\n", vp_id);
+ return OPAL_INTERNAL_ERROR;
+ }
+ x = xive_from_pc_blk(blk);
+ if (!x) {
+ prerror("Instance not found for deallocated VP"
+ " block %d\n", blk);
+ return OPAL_INTERNAL_ERROR;
+ }
+ vp = xive_get_vp(x, idx);
+ if (!vp) {
+ prerror("VP not found for deallocation !");
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ /* VP must be disabled */
+ if (xive_get_field32(NVP_W0_VALID, vp->w0)) {
+ prlog(PR_ERR, "freeing active VP %d\n", vp_id);
+ return OPAL_XIVE_FREE_ACTIVE;
+ }
+
+ /* Not populated */
+ if (vp->w5 == 0)
+ continue;
+
+ end_blk = xive_get_field32(NVP_W5_VP_END_BLOCK, vp->w5);
+ end_idx = xive_get_field32(NVP_W5_VP_END_INDEX, vp->w5);
+
+ lock(&x->lock);
+
+ /* Ensure ENDs are disabled and cleaned up. Ideally the caller
+ * should have done it but we double check it here
+ */
+ for (j = 0; j < xive_cfg_vp_prio(x); j++) {
+ struct xive *end_x = xive_from_vc_blk(end_blk);
+ struct xive_end end, *orig_end = xive_get_end(end_x, end_idx + j);
+
+ if (!xive_get_field32(END_W0_VALID, orig_end->w0))
+ continue;
+
+ prlog(PR_WARNING, "freeing VP %d with queue %d active\n",
+ vp_id, j);
+ end = *orig_end;
+ xive_cleanup_end(&end);
+ xive_endc_cache_update(x, end_blk, end_idx + j, &end, true);
+ }
+
+ /* Mark it not populated so we don't try to free it again */
+ vp->w5 = 0;
+
+ if (end_blk != blk) {
+ prerror("Block mismatch trying to free ENDs\n");
+ unlock(&x->lock);
+ return OPAL_INTERNAL_ERROR;
+ }
+
+ xive_free_end_set(x, end_idx);
+ unlock(&x->lock);
+ }
+
+ xive_free_vps(vp_base);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_alloc_vp_block(uint32_t alloc_order)
+{
+ uint32_t vp_base, ends, count, i;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ prlog(PR_TRACE, "opal_xive_alloc_vp_block(%d)\n", alloc_order);
+
+ vp_base = xive_alloc_vps(alloc_order);
+ if (XIVE_ALLOC_IS_ERR(vp_base)) {
+ if (vp_base == XIVE_ALLOC_NO_IND)
+ return OPAL_XIVE_PROVISIONING;
+ return OPAL_RESOURCE;
+ }
+
+ /* Allocate ENDs and initialize VPs */
+ count = 1 << alloc_order;
+ for (i = 0; i < count; i++) {
+ uint32_t vp_id = vp_base + i;
+ uint32_t blk, idx;
+ struct xive *x;
+ struct xive_nvp *vp;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, NULL, NULL)) {
+ prerror("Couldn't decode VP id %u\n", vp_id);
+ return OPAL_INTERNAL_ERROR;
+ }
+ x = xive_from_pc_blk(blk);
+ if (!x) {
+ prerror("Instance not found for allocated VP"
+ " block %d\n", blk);
+ rc = OPAL_INTERNAL_ERROR;
+ goto fail;
+ }
+ vp = xive_get_vp(x, idx);
+ if (!vp) {
+ prerror("VP not found after allocation !");
+ rc = OPAL_INTERNAL_ERROR;
+ goto fail;
+ }
+
+ /* Allocate ENDs, if fails, free the VPs and return */
+ lock(&x->lock);
+ ends = xive_alloc_end_set(x, false);
+ unlock(&x->lock);
+ if (XIVE_ALLOC_IS_ERR(ends)) {
+ if (ends == XIVE_ALLOC_NO_IND)
+ rc = OPAL_XIVE_PROVISIONING;
+ else
+ rc = OPAL_RESOURCE;
+ goto fail;
+ }
+
+ /* Initialize the VP structure. We don't use a cache watch
+		 * as we made sure to scrub the entries out of the cache
+		 * when freeing them.
+ */
+ memset(vp, 0, sizeof(*vp));
+
+ /* Store the END base of the VP in W5 (new in p10) */
+ xive_vp_set_end_base(vp, blk, ends);
+ }
+ return vp_base;
+ fail:
+ opal_xive_free_vp_block(vp_base);
+
+ return rc;
+}
+
+static int64_t xive_try_allocate_irq(struct xive *x)
+{
+ int idx, base_idx, max_count, girq;
+ struct xive_eas *eas;
+
+ lock(&x->lock);
+
+ base_idx = x->int_ipi_top - x->int_base;
+ max_count = x->int_hw_bot - x->int_ipi_top;
+
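+	/*
+	 * Dynamically allocated IPIs come from the window between
+	 * int_ipi_top (first interrupt available for allocation) and
+	 * int_hw_bot (bottom of the HW source range).
+	 */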
+ idx = bitmap_find_zero_bit(*x->ipi_alloc_map, base_idx, max_count);
+ if (idx < 0) {
+ unlock(&x->lock);
+ return OPAL_RESOURCE;
+ }
+ bitmap_set_bit(*x->ipi_alloc_map, idx);
+ girq = x->int_base + idx;
+
+ /* Mark the EAS valid. Don't bother with the HW cache, it's
+ * still masked anyway, the cache will be updated when unmasked
+ * and configured.
+ */
+ eas = xive_get_eas(x, girq);
+ if (!eas) {
+ bitmap_clr_bit(*x->ipi_alloc_map, idx);
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+ eas->w = xive_set_field64(EAS_VALID, 0, 1) |
+ xive_set_field64(EAS_MASKED, 0, 1) |
+ xive_set_field64(EAS_END_DATA, 0, girq);
+ unlock(&x->lock);
+
+ return girq;
+}
+
+static int64_t opal_xive_allocate_irq(uint32_t chip_id)
+{
+ struct proc_chip *chip;
+ bool try_all = false;
+ int64_t rc;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+
+ if (chip_id == OPAL_XIVE_ANY_CHIP) {
+ try_all = true;
+ chip_id = this_cpu()->chip_id;
+ }
+ chip = get_chip(chip_id);
+ if (!chip)
+ return OPAL_PARAMETER;
+
+ /* Try initial target chip */
+ if (!chip->xive)
+ rc = OPAL_PARAMETER;
+ else
+ rc = xive_try_allocate_irq(chip->xive);
+ if (rc >= 0 || !try_all)
+ return rc;
+
+ /* Failed and we try all... do so */
+ for_each_chip(chip) {
+ if (!chip->xive)
+ continue;
+ rc = xive_try_allocate_irq(chip->xive);
+ if (rc >= 0)
+ break;
+ }
+ return rc;
+}
+
+static int64_t opal_xive_free_irq(uint32_t girq)
+{
+ struct irq_source *is = irq_find_source(girq);
+ struct xive_src *s = container_of(is, struct xive_src, is);
+ struct xive *x = xive_from_isn(girq);
+ struct xive_eas *eas;
+ uint32_t idx;
+
+ if (xive_mode != XIVE_MODE_EXPL)
+ return OPAL_WRONG_STATE;
+ if (!x || !is)
+ return OPAL_PARAMETER;
+
+ idx = GIRQ_TO_IDX(girq);
+
+ lock(&x->lock);
+
+ eas = xive_get_eas(x, girq);
+ if (!eas) {
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+
+ /* Mask the interrupt source */
+ xive_update_irq_mask(s, girq - s->esb_base, true);
+
+ /* Mark the EAS masked and invalid */
+ eas->w = xive_set_field64(EAS_VALID, 0, 1) |
+ xive_set_field64(EAS_MASKED, 0, 1);
+ xive_easc_scrub(x, x->block_id, idx);
+
+ /* Free it */
+ if (!bitmap_tst_bit(*x->ipi_alloc_map, idx)) {
+ unlock(&x->lock);
+ return OPAL_PARAMETER;
+ }
+ bitmap_clr_bit(*x->ipi_alloc_map, idx);
+ bitmap_clr_bit(*x->int_enabled_map, idx);
+ unlock(&x->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_dump_tm(uint32_t offset, const char *n, uint32_t pir)
+{
+ struct cpu_thread *c = find_cpu_by_pir(pir);
+ struct xive_cpu_state *xs;
+ struct xive *x;
+ void *cpu_tm_base;
+ uint64_t v0,v1;
+
+ if (!c)
+ return OPAL_PARAMETER;
+ xs = c->xstate;
+ if (!xs || !xs->tm_ring1)
+ return OPAL_INTERNAL_ERROR;
+ x = xs->xive;
+ cpu_tm_base = xive_cpu_get_tima(c);
+
+ lock(&x->lock);
+ v0 = in_be64(cpu_tm_base + offset);
+ if (offset == TM_QW3_HV_PHYS) {
+ v1 = in_8(cpu_tm_base + offset + 8);
+ v1 <<= 56;
+ } else {
+ v1 = in_be32(cpu_tm_base + offset + 8);
+ v1 <<= 32;
+ }
+ prlog(PR_INFO, "CPU[%04x]: TM state for QW %s\n", pir, n);
+ prlog(PR_INFO, "CPU[%04x]: NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
+ " W2 W3\n", pir);
+ prlog(PR_INFO, "CPU[%04x]: %02x %02x %02x %02x %02x "
+ "%02x %02x %02x %08x %08x\n", pir,
+		(uint8_t)(v0 >> 56) & 0xff, (uint8_t)(v0 >> 48) & 0xff,
+ (uint8_t)(v0 >> 40) & 0xff, (uint8_t)(v0 >> 32) & 0xff,
+ (uint8_t)(v0 >> 24) & 0xff, (uint8_t)(v0 >> 16) & 0xff,
+ (uint8_t)(v0 >> 8) & 0xff, (uint8_t)(v0 ) & 0xff,
+ (uint32_t)(v1 >> 32) & 0xffffffff,
+ (uint32_t)(v1 & 0xffffffff));
+ unlock(&x->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_dump_vp(uint32_t vp_id)
+{
+ uint32_t blk, idx;
+ uint8_t order;
+ bool group;
+ struct xive *x;
+ struct xive_nvp *vp;
+ uint32_t *vpw;
+
+ if (!xive_decode_vp(vp_id, &blk, &idx, &order, &group))
+ return OPAL_PARAMETER;
+
+ x = xive_from_vc_blk(blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ vp = xive_get_vp(x, idx);
+ if (!vp)
+ return OPAL_PARAMETER;
+ lock(&x->lock);
+
+ xive_nxc_scrub_clean(x, blk, idx);
+
+ vpw = ((uint32_t *)vp) + (group ? 8 : 0);
+ prlog(PR_INFO, "VP[%08x]: 0..3: %08x %08x %08x %08x\n", vp_id,
+ vpw[0], vpw[1], vpw[2], vpw[3]);
+ prlog(PR_INFO, "VP[%08x]: 4..7: %08x %08x %08x %08x\n", vp_id,
+ vpw[4], vpw[5], vpw[6], vpw[7]);
+ unlock(&x->lock);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_xive_sync_irq_src(uint32_t girq)
+{
+ struct xive *x = xive_from_isn(girq);
+
+ if (!x)
+ return OPAL_PARAMETER;
+ return xive_sync(x);
+}
+
+static int64_t opal_xive_sync_irq_target(uint32_t girq)
+{
+ uint32_t target, vp_blk;
+ struct xive *x;
+
+ if (!xive_get_irq_targetting(girq, &target, NULL, NULL))
+ return OPAL_PARAMETER;
+ if (!xive_decode_vp(target, &vp_blk, NULL, NULL, NULL))
+ return OPAL_PARAMETER;
+ x = xive_from_pc_blk(vp_blk);
+ if (!x)
+ return OPAL_PARAMETER;
+ return xive_sync(x);
+}
+
+static int64_t opal_xive_sync(uint32_t type, uint32_t id)
+{
+	int64_t rc = OPAL_SUCCESS;
+
+ if (type & XIVE_SYNC_EAS)
+ rc = opal_xive_sync_irq_src(id);
+ if (rc)
+ return rc;
+ if (type & XIVE_SYNC_QUEUE)
+ rc = opal_xive_sync_irq_target(id);
+ if (rc)
+ return rc;
+
+ /* Add more ... */
+
+ return rc;
+}
+
+static int64_t opal_xive_dump(uint32_t type, uint32_t id)
+{
+ switch (type) {
+ case XIVE_DUMP_TM_HYP:
+ return opal_xive_dump_tm(TM_QW3_HV_PHYS, "PHYS", id);
+ case XIVE_DUMP_TM_POOL:
+ return opal_xive_dump_tm(TM_QW2_HV_POOL, "POOL", id);
+ case XIVE_DUMP_TM_OS:
+ return opal_xive_dump_tm(TM_QW1_OS, "OS ", id);
+ case XIVE_DUMP_TM_USER:
+ return opal_xive_dump_tm(TM_QW0_USER, "USER", id);
+ case XIVE_DUMP_VP:
+ return opal_xive_dump_vp(id);
+ default:
+ return OPAL_PARAMETER;
+ }
+}
+
+static void xive_init_globals(void)
+{
+ uint32_t i;
+
+ for (i = 0; i < XIVE_MAX_CHIPS; i++)
+ xive_block_to_chip[i] = XIVE_INVALID_CHIP;
+}
+
+/*
+ * The global availability of some capabilities used in other drivers
+ * (PHB, PSI) is deduced from the capabilities of the first XIVE chip
+ * of the system. It should be common to all chips.
+ */
+bool xive2_cap_phb_pq_disable(void)
+{
+ return xive_has_cap(one_xive, CQ_XIVE_CAP_PHB_PQ_DISABLE);
+}
+
+bool xive2_cap_phb_abt(void)
+{
+ if (!xive_has_cap(one_xive, CQ_XIVE_CAP_PHB_ABT))
+ return false;
+
+ /*
+ * We need 'PQ disable' to use ABT mode, else the OS will use
+ * two different sets of ESB pages (PHB and IC) to control the
+ * interrupt sources. Can not work.
+ */
+ if (!xive2_cap_phb_pq_disable()) {
+ prlog_once(PR_ERR, "ABT mode is set without PQ disable. "
+ "Ignoring bogus configuration\n");
+ return false;
+ }
+
+ return true;
+}
+
+bool xive2_cap_store_eoi(void)
+{
+ return xive_has_cap(one_xive, CQ_XIVE_CAP_STORE_EOI);
+}
+
+void xive2_init(void)
+{
+ struct dt_node *np;
+ struct proc_chip *chip;
+ struct cpu_thread *cpu;
+ bool first = true;
+
+ /* Look for xive nodes and do basic inits */
+ dt_for_each_compatible(dt_root, np, "ibm,power10-xive-x") {
+ struct xive *x;
+
+ /* Initialize some global stuff */
+ if (first)
+ xive_init_globals();
+
+ /* Create/initialize the xive instance */
+ x = init_one_xive(np);
+ if (first)
+ one_xive = x;
+ first = false;
+ }
+ if (first)
+ return;
+
+ /*
+ * P8 emulation is not supported on P10 anymore. Exploitation
+ * is the default XIVE mode. We might introduce a GEN2 mode.
+ */
+ xive_mode = XIVE_MODE_EXPL;
+
+ /* Init VP allocator */
+ xive_init_vp_allocator();
+
+ /* Create a device-tree node for Linux use */
+ xive_create_mmio_dt_node(one_xive);
+
+ /* Some inits must be done after all xive have been created
+ * such as setting up the forwarding ports
+ */
+ for_each_chip(chip) {
+ if (chip->xive)
+ late_init_one_xive(chip->xive);
+ }
+
+ /* Initialize per-cpu structures */
+ for_each_present_cpu(cpu) {
+ xive_init_cpu(cpu);
+ }
+
+ /* Calling boot CPU */
+ xive2_cpu_callin(this_cpu());
+
+ /* Register XIVE exploitation calls */
+ opal_register(OPAL_XIVE_RESET, opal_xive_reset, 1);
+ opal_register(OPAL_XIVE_GET_IRQ_INFO, opal_xive_get_irq_info, 6);
+ opal_register(OPAL_XIVE_GET_IRQ_CONFIG, opal_xive_get_irq_config, 4);
+ opal_register(OPAL_XIVE_SET_IRQ_CONFIG, opal_xive_set_irq_config, 4);
+ opal_register(OPAL_XIVE_GET_QUEUE_INFO, opal_xive_get_queue_info, 7);
+ opal_register(OPAL_XIVE_SET_QUEUE_INFO, opal_xive_set_queue_info, 5);
+ opal_register(OPAL_XIVE_DONATE_PAGE, opal_xive_donate_page, 2);
+ opal_register(OPAL_XIVE_ALLOCATE_IRQ, opal_xive_allocate_irq, 1);
+ opal_register(OPAL_XIVE_FREE_IRQ, opal_xive_free_irq, 1);
+ opal_register(OPAL_XIVE_ALLOCATE_VP_BLOCK, opal_xive_alloc_vp_block, 1);
+ opal_register(OPAL_XIVE_FREE_VP_BLOCK, opal_xive_free_vp_block, 1);
+ opal_register(OPAL_XIVE_GET_VP_INFO, opal_xive_get_vp_info, 5);
+ opal_register(OPAL_XIVE_SET_VP_INFO, opal_xive_set_vp_info, 3);
+ opal_register(OPAL_XIVE_SYNC, opal_xive_sync, 2);
+ opal_register(OPAL_XIVE_DUMP, opal_xive_dump, 2);
+ opal_register(OPAL_XIVE_GET_QUEUE_STATE, opal_xive_get_queue_state, 4);
+ opal_register(OPAL_XIVE_SET_QUEUE_STATE, opal_xive_set_queue_state, 4);
+ opal_register(OPAL_XIVE_GET_VP_STATE, opal_xive_get_vp_state, 2);
+}
diff --git a/roms/skiboot/hw/xscom.c b/roms/skiboot/hw/xscom.c
new file mode 100644
index 000000000..347457242
--- /dev/null
+++ b/roms/skiboot/hw/xscom.c
@@ -0,0 +1,1019 @@
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+/*
+ * XSCOM driver
+ *
+ * Copyright 2013-2019 IBM Corp.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <processor.h>
+#include <device.h>
+#include <chip.h>
+#include <centaur.h>
+#include <errorlog.h>
+#include <opal-api.h>
+#include <timebase.h>
+#include <nvram.h>
+
+/* Mask of bits to clear in HMER before an access */
+#define HMER_CLR_MASK (~(SPR_HMER_XSCOM_FAIL | \
+ SPR_HMER_XSCOM_DONE | \
+ SPR_HMER_XSCOM_STATUS))
+
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RW, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_INDIRECT_RW, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
+/* xscom details to trigger xstop */
+static struct {
+ uint64_t addr;
+ uint64_t fir_bit;
+} xstop_xscom;
+
+/*
+ * Locking notes:
+ *
+ * We used to have a per-target lock. However due to errata HW822317
+ * We used to have a per-target lock. However, due to erratum HW822317
+ * we can have issues on the issuer side if multiple threads try to
+ * send XSCOMs simultaneously (HMER responses get mixed up), so just
+ * use a global lock instead.
+static struct lock xscom_lock = LOCK_UNLOCKED;
+
+static inline void *xscom_addr(uint32_t gcid, uint32_t pcb_addr)
+{
+ struct proc_chip *chip = get_chip(gcid);
+ uint64_t addr;
+
+ assert(chip);
+ addr = chip->xscom_base;
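+	/*
+	 * P8 swizzles the PCB address bits (see the masks below); P9 and
+	 * later simply use an 8-byte stride, i.e. pcb_addr << 3.
+	 */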
+ if (proc_gen == proc_gen_p8) {
+ addr |= ((uint64_t)pcb_addr << 4) & ~0xfful;
+ addr |= (pcb_addr << 3) & 0x78;
+ } else
+ addr |= ((uint64_t)pcb_addr << 3);
+ return (void *)addr;
+}
+
+static uint64_t xscom_wait_done(void)
+{
+ uint64_t hmer;
+
+ do
+ hmer = mfspr(SPR_HMER);
+ while(!(hmer & SPR_HMER_XSCOM_DONE));
+
+ /*
+ * HW822317: We need to read a second time as the actual
+ * status can be delayed by 1 cycle after DONE
+ */
+ return mfspr(SPR_HMER);
+}
+
+static void xscom_reset(uint32_t gcid, bool need_delay)
+{
+ u64 hmer;
+ uint32_t recv_status_reg, log_reg, err_reg;
+ struct timespec ts;
+
+ /* Clear errors in HMER */
+ mtspr(SPR_HMER, HMER_CLR_MASK);
+
+ /* Setup local and target scom addresses */
+ if (proc_gen == proc_gen_p10) {
+ recv_status_reg = 0x00090018;
+ log_reg = 0x0090012;
+ err_reg = 0x0090013;
+ } else if (proc_gen == proc_gen_p9) {
+ recv_status_reg = 0x00090018;
+ log_reg = 0x0090012;
+ err_reg = 0x0090013;
+ } else {
+ recv_status_reg = 0x202000f;
+ log_reg = 0x2020007;
+ err_reg = 0x2020009;
+ }
+
+ /* First we need to write 0 to a register on our chip */
+ out_be64(xscom_addr(this_cpu()->chip_id, recv_status_reg), 0);
+ hmer = xscom_wait_done();
+ if (hmer & SPR_HMER_XSCOM_FAIL)
+ goto fail;
+
+ /* Then we need to clear those two other registers on the target */
+ out_be64(xscom_addr(gcid, log_reg), 0);
+ hmer = xscom_wait_done();
+ if (hmer & SPR_HMER_XSCOM_FAIL)
+ goto fail;
+ out_be64(xscom_addr(gcid, err_reg), 0);
+ hmer = xscom_wait_done();
+ if (hmer & SPR_HMER_XSCOM_FAIL)
+ goto fail;
+
+ if (need_delay) {
+ /*
+		 * It has been observed that an immediate retry of an
+		 * XSCOM operation sometimes returns wrong data, so add a
+		 * delay to let the XSCOM reset take effect. A delay of
+		 * 10 ms was found to work fine experimentally.
+		 * FIXME: Replace the 10ms delay with the exact delay needed,
+		 * or some other method of confirming XSCOM reset
+		 * completion, after checking with the HW folks.
+ */
+ ts.tv_sec = 0;
+ ts.tv_nsec = 10 * 1000;
+ nanosleep_nopoll(&ts, NULL);
+ }
+ return;
+ fail:
+ /* Fatal error resetting XSCOM */
+ log_simple_error(&e_info(OPAL_RC_XSCOM_RESET),
+ "XSCOM: Fatal error resetting engine after failed access !\n");
+
+ /* XXX Generate error log ? attn ? panic ?
+ * If we decide to panic, change the above severity to PANIC
+ */
+}
+
+static int xscom_clear_error(uint32_t gcid, uint32_t pcb_addr)
+{
+ u64 hmer;
+ uint32_t base_xscom_addr;
+ uint32_t xscom_clear_reg = 0x20010800;
+
+ /* only in case of p9 */
+ if (proc_gen != proc_gen_p9)
+ return 0;
+
+/* xscom clear address range/mask */
+#define XSCOM_CLEAR_RANGE_START 0x20010A00
+#define XSCOM_CLEAR_RANGE_END 0x20010ABF
+#define XSCOM_CLEAR_RANGE_MASK 0x200FFBFF
+
+	/*
+	 * Due to a hardware issue, a core whose response to a SCOM was
+	 * delayed by thread reconfiguration can leave the SCOM logic in a
+	 * state where subsequent SCOMs to that core get errors. This
+	 * affects the Core PC SCOM registers in the range
+	 * 20010A80-20010ABF.
+	 *
+	 * The workaround: if an XSCOM timeout occurs on one of the Core PC
+	 * SCOM registers in the range 20010A80-20010ABF, a clearing SCOM
+	 * write of '0x00000000' is done to 0x20010800. That write will
+	 * also time out, but it clears the SCOM logic errors. Once the
+	 * clearing write is done, the original SCOM operation can be
+	 * retried.
+	 *
+	 * The SCOM timeout is reported as status 0x4 (invalid address)
+	 * in HMER[21-23].
+	 */
+
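+	/*
+	 * For example, a (hypothetical) per-core copy such as 0x25010A85
+	 * masks down to 0x20010A85, which falls inside the
+	 * XSCOM_CLEAR_RANGE_START/END window and so triggers the clearing
+	 * write below.
+	 */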
+ base_xscom_addr = pcb_addr & XSCOM_CLEAR_RANGE_MASK;
+ if (!((base_xscom_addr >= XSCOM_CLEAR_RANGE_START) &&
+ (base_xscom_addr <= XSCOM_CLEAR_RANGE_END)))
+ return 0;
+
+ /*
+	 * Reset the XSCOM or the next SCOM operation will fail.
+	 * We also need a small delay before we go ahead with the clearing
+	 * write; we have observed that without a delay the clearing write
+	 * reports a wrong status.
+ */
+ xscom_reset(gcid, true);
+
+ /* Clear errors in HMER */
+ mtspr(SPR_HMER, HMER_CLR_MASK);
+
+ /* Write 0 to clear the xscom logic errors on target chip */
+ out_be64(xscom_addr(gcid, xscom_clear_reg), 0);
+ hmer = xscom_wait_done();
+
+ /*
+	 * The clearing xscom write above will time out and report an
+	 * invalid-address error since there is no register at that address.
+	 * This xscom operation only serves to clear the xscom logic error.
+ *
+ * On failure, reset the XSCOM or we'll hang on the next access
+ */
+ if (hmer & SPR_HMER_XSCOM_FAIL)
+ xscom_reset(gcid, true);
+
+ return 1;
+}
+
+static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
+ bool is_write, int64_t retries,
+ int64_t *xscom_clear_retries)
+{
+ unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
+ int64_t rc = OPAL_HARDWARE;
+
+ /* XXX Figure out error codes from doc and error
+ * recovery procedures
+ */
+ switch(stat) {
+ case 1:
+ /*
+ * XSCOM engine is blocked, need to retry. Reset XSCOM
+ * engine after crossing retry threshold before
+ * retrying again.
+ */
+ if (retries && !(retries % XSCOM_BUSY_RESET_THRESHOLD)) {
+ prlog(PR_NOTICE, "XSCOM: Busy even after %d retries, "
+ "resetting XSCOM now. Total retries = %lld\n",
+ XSCOM_BUSY_RESET_THRESHOLD, retries);
+ xscom_reset(gcid, true);
+
+ }
+
+ /* Log error if we have retried enough and its still busy */
+ if (retries == XSCOM_BUSY_MAX_RETRIES)
+ log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY),
+ "XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x "
+ "stat=0x%x\n", is_write ? "write" : "read",
+ gcid, pcb_addr, stat);
+ return OPAL_XSCOM_BUSY;
+
+ case 2: /* CPU is asleep, reset XSCOM engine and return */
+ xscom_reset(gcid, false);
+ return OPAL_XSCOM_CHIPLET_OFF;
+ case 3: /* Partial good */
+ rc = OPAL_XSCOM_PARTIAL_GOOD;
+ break;
+ case 4: /* Invalid address / address error */
+ rc = OPAL_XSCOM_ADDR_ERROR;
+ if (xscom_clear_error(gcid, pcb_addr)) {
+ /* return busy if retries still pending. */
+ if ((*xscom_clear_retries)--)
+ return OPAL_XSCOM_BUSY;
+
+ prlog(PR_DEBUG, "XSCOM: error recovery failed for "
+ "gcid=0x%x pcb_addr=0x%x\n", gcid, pcb_addr);
+
+ }
+ break;
+ case 5: /* Clock error */
+ rc = OPAL_XSCOM_CLOCK_ERROR;
+ break;
+ case 6: /* Parity error */
+ rc = OPAL_XSCOM_PARITY_ERROR;
+ break;
+ case 7: /* Time out */
+ rc = OPAL_XSCOM_TIMEOUT;
+ break;
+ }
+
+ /*
+	 * If we're in an XSCOM OPAL call then squash the error;
+	 * we assume that the caller (probably opal-prd) will
+	 * handle logging it.
+ */
+ if (this_cpu()->current_token != OPAL_XSCOM_READ &&
+ this_cpu()->current_token != OPAL_XSCOM_WRITE) {
+ log_simple_error(&e_info(OPAL_RC_XSCOM_RW),
+ "XSCOM: %s error gcid=0x%x pcb_addr=0x%x stat=0x%x\n",
+ is_write ? "write" : "read", gcid, pcb_addr, stat);
+ }
+
+ /* We need to reset the XSCOM or we'll hang on the next access */
+ xscom_reset(gcid, false);
+
+ /* Non recovered ... just fail */
+ return rc;
+}
+
+static void xscom_handle_ind_error(uint64_t data, uint32_t gcid,
+ uint64_t pcb_addr, bool is_write)
+{
+ unsigned int stat = GETFIELD(XSCOM_DATA_IND_ERR, data);
+ bool timeout = !(data & XSCOM_DATA_IND_COMPLETE);
+
+ /* XXX: Create error log entry ? */
+ if (timeout)
+ log_simple_error(&e_info(OPAL_RC_XSCOM_INDIRECT_RW),
+ "XSCOM: indirect %s timeout, gcid=0x%x pcb_addr=0x%llx"
+ " stat=0x%x\n",
+ is_write ? "write" : "read", gcid, pcb_addr, stat);
+ else
+ log_simple_error(&e_info(OPAL_RC_XSCOM_INDIRECT_RW),
+ "XSCOM: indirect %s error, gcid=0x%x pcb_addr=0x%llx"
+ " stat=0x%x\n",
+ is_write ? "write" : "read", gcid, pcb_addr, stat);
+}
+
+static bool xscom_gcid_ok(uint32_t gcid)
+{
+ return get_chip(gcid) != NULL;
+}
+
+/* Determine if SCOM address is multicast */
+static inline bool xscom_is_multicast_addr(uint32_t addr)
+{
+ return (((addr >> 30) & 0x1) == 0x1);
+}
+
+/*
+ * Low level XSCOM access functions, perform a single direct xscom
+ * access via MMIO
+ */
+static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
+{
+ uint64_t hmer;
+ int64_t ret, retries;
+ int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES;
+
+ if (!xscom_gcid_ok(gcid)) {
+ prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
+ return OPAL_PARAMETER;
+ }
+
+ for (retries = 0; retries <= XSCOM_BUSY_MAX_RETRIES; retries++) {
+		/* Clear status bits in HMER (HMER is special:
+		 * writing to it *ands* bits)
+		 */
+ mtspr(SPR_HMER, HMER_CLR_MASK);
+
+ /* Read value from SCOM */
+ *val = in_be64(xscom_addr(gcid, pcb_addr));
+
+ /* Wait for done bit */
+ hmer = xscom_wait_done();
+
+ /* Check for error */
+ if (!(hmer & SPR_HMER_XSCOM_FAIL))
+ return OPAL_SUCCESS;
+
+ /* Handle error and possibly eventually retry */
+ ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries,
+ &xscom_clear_retries);
+ if (ret != OPAL_BUSY)
+ break;
+ }
+
+ /* Do not print error message for multicast SCOMS */
+ if (xscom_is_multicast_addr(pcb_addr) && ret == OPAL_XSCOM_CHIPLET_OFF)
+ return ret;
+
+ /*
+	 * Workaround on P9: PRD does operations it *knows* will fail with this
+	 * error to work around a hardware issue where accesses via the PIB
+	 * (FSI or OCC) work as expected, but accesses via the ADU (which
+	 * xscom goes through) do not. The chip logic will always return all
+	 * FFs if there is any error on the scom.
+ */
+ if (proc_gen == proc_gen_p9 && ret == OPAL_XSCOM_CHIPLET_OFF)
+ return ret;
+
+ /*
+ * If an OPAL call XSCOM read fails, then the OPAL-PRD will
+ * handle logging the error. Hence just print an
+ * informational message here.
+ */
+ if (this_cpu()->current_token == OPAL_XSCOM_READ)
+ prlog(PR_INFO, "XSCOM: Read failed, ret = %lld\n", ret);
+ else
+ prerror("XSCOM: Read failed, ret = %lld\n", ret);
+
+ return ret;
+}
+
+static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
+{
+ uint64_t hmer;
+ int64_t ret, retries = 0;
+ int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES;
+
+ if (!xscom_gcid_ok(gcid)) {
+ prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
+ return OPAL_PARAMETER;
+ }
+
+ for (retries = 0; retries <= XSCOM_BUSY_MAX_RETRIES; retries++) {
+		/* Clear status bits in HMER (HMER is special:
+		 * writing to it *ands* bits)
+		 */
+ mtspr(SPR_HMER, HMER_CLR_MASK);
+
+ /* Write value to SCOM */
+ out_be64(xscom_addr(gcid, pcb_addr), val);
+
+ /* Wait for done bit */
+ hmer = xscom_wait_done();
+
+ /* Check for error */
+ if (!(hmer & SPR_HMER_XSCOM_FAIL))
+ return OPAL_SUCCESS;
+
+ /* Handle error and possibly eventually retry */
+ ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries,
+ &xscom_clear_retries);
+ if (ret != OPAL_BUSY)
+ break;
+ }
+
+ /* Do not print error message for multicast SCOMS */
+ if (xscom_is_multicast_addr(pcb_addr) && ret == OPAL_XSCOM_CHIPLET_OFF)
+ return ret;
+
+ /*
+	 * Workaround on P9: PRD does operations it *knows* will fail with this
+	 * error to work around a hardware issue where accesses via the PIB
+	 * (FSI or OCC) work as expected, but accesses via the ADU (which
+	 * xscom goes through) do not. The chip logic will always return all
+	 * FFs if there is any error on the scom.
+ */
+ if (proc_gen == proc_gen_p9 && ret == OPAL_XSCOM_CHIPLET_OFF)
+ return ret;
+ /*
+ * If an OPAL call XSCOM write fails, then the OPAL-PRD will
+ * handle logging the error. Hence just print an
+ * informational message here.
+ */
+ if (this_cpu()->current_token == OPAL_XSCOM_WRITE)
+ prlog(PR_INFO, "XSCOM: Write failed, ret = %lld\n", ret);
+ else
+ prerror("XSCOM: Write failed, ret = %lld\n", ret);
+
+ return ret;
+}
+
+/*
+ * Indirect XSCOM access functions
+ */
+static int xscom_indirect_read_form0(uint32_t gcid, uint64_t pcb_addr,
+ uint64_t *val)
+{
+ uint32_t addr;
+ uint64_t data;
+ int rc, retries;
+
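+	/*
+	 * Form 0 indirect access: write the indirect address (with the
+	 * read bit set) to the base SCOM register, then poll that same
+	 * register until the COMPLETE bit is set. The data comes back in
+	 * the low bits and any error in the IND_ERR field.
+	 */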
+ /* Write indirect address */
+ addr = pcb_addr & 0x7fffffff;
+ data = XSCOM_DATA_IND_READ |
+ (pcb_addr & XSCOM_ADDR_IND_ADDR);
+ rc = __xscom_write(gcid, addr, data);
+ if (rc)
+ goto bail;
+
+ /* Wait for completion */
+ for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) {
+ rc = __xscom_read(gcid, addr, &data);
+ if (rc)
+ goto bail;
+ if ((data & XSCOM_DATA_IND_COMPLETE) &&
+ ((data & XSCOM_DATA_IND_ERR) == 0)) {
+ *val = data & XSCOM_DATA_IND_DATA;
+ break;
+ }
+ if ((data & XSCOM_DATA_IND_COMPLETE) ||
+ (retries >= XSCOM_IND_MAX_RETRIES)) {
+ xscom_handle_ind_error(data, gcid, pcb_addr,
+ false);
+ rc = OPAL_HARDWARE;
+ goto bail;
+ }
+ }
+ bail:
+ if (rc)
+ *val = (uint64_t)-1;
+ return rc;
+}
+
+static int xscom_indirect_form(uint64_t pcb_addr)
+{
+ return (pcb_addr >> 60) & 1;
+}
+
+static int xscom_indirect_read(uint32_t gcid, uint64_t pcb_addr, uint64_t *val)
+{
+ uint64_t form = xscom_indirect_form(pcb_addr);
+
+ if ((proc_gen >= proc_gen_p9) && (form == 1))
+ return OPAL_UNSUPPORTED;
+
+ return xscom_indirect_read_form0(gcid, pcb_addr, val);
+}
+
+static int xscom_indirect_write_form0(uint32_t gcid, uint64_t pcb_addr,
+ uint64_t val)
+{
+ uint32_t addr;
+ uint64_t data;
+ int rc, retries;
+
+ /* Only 16 bit data with indirect */
+ if (val & ~(XSCOM_ADDR_IND_DATA))
+ return OPAL_PARAMETER;
+
+ /* Write indirect address & data */
+ addr = pcb_addr & 0x7fffffff;
+ data = pcb_addr & XSCOM_ADDR_IND_ADDR;
+ data |= val & XSCOM_ADDR_IND_DATA;
+
+ rc = __xscom_write(gcid, addr, data);
+ if (rc)
+ goto bail;
+
+ /* Wait for completion */
+ for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) {
+ rc = __xscom_read(gcid, addr, &data);
+ if (rc)
+ goto bail;
+ if ((data & XSCOM_DATA_IND_COMPLETE) &&
+ ((data & XSCOM_DATA_IND_ERR) == 0))
+ break;
+ if ((data & XSCOM_DATA_IND_COMPLETE) ||
+ (retries >= XSCOM_IND_MAX_RETRIES)) {
+ xscom_handle_ind_error(data, gcid, pcb_addr,
+ true);
+ rc = OPAL_HARDWARE;
+ goto bail;
+ }
+ }
+ bail:
+ return rc;
+}
+
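+/*
+ * Form-1 indirect write (P9 and later only): the upper address bits
+ * are folded into the data doubleword, so the whole operation goes
+ * out as a single direct write with no completion polling.
+ */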
+static int xscom_indirect_write_form1(uint32_t gcid, uint64_t pcb_addr,
+ uint64_t val)
+{
+ uint32_t addr;
+ uint64_t data;
+
+ if (proc_gen < proc_gen_p9)
+ return OPAL_UNSUPPORTED;
+ if (val & ~(XSCOM_DATA_IND_FORM1_DATA))
+ return OPAL_PARAMETER;
+
+ /* Mangle address and data for form1 */
+ addr = (pcb_addr & 0x000ffffffffUL);
+ data = (pcb_addr & 0xfff00000000UL) << 20;
+ data |= val;
+ return __xscom_write(gcid, addr, data);
+}
+
+static int xscom_indirect_write(uint32_t gcid, uint64_t pcb_addr, uint64_t val)
+{
+ uint64_t form = xscom_indirect_form(pcb_addr);
+
+ if ((proc_gen >= proc_gen_p9) && (form == 1))
+ return xscom_indirect_write_form1(gcid, pcb_addr, val);
+
+ return xscom_indirect_write_form0(gcid, pcb_addr, val);
+}
+
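+/*
+ * Decode an EX-chiplet partid (top nibble of 4): the middle bits give
+ * the chip id and the low nibble the core number. On P8 the core is
+ * folded into the PCB slave address; on P9 and later this is not
+ * supported and *pcb_addr is set to 0 (the read path turns that into
+ * OPAL_UNSUPPORTED).
+ */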
+static uint32_t xscom_decode_chiplet(uint32_t partid, uint64_t *pcb_addr)
+{
+ uint32_t gcid = (partid & 0x0fffffff) >> 4;
+ uint32_t core = partid & 0xf;
+
+ if (proc_gen >= proc_gen_p9) {
+ /* XXX Not supported */
+ *pcb_addr = 0;
+ } else {
+ *pcb_addr |= P8_EX_PCB_SLAVE_BASE;
+ *pcb_addr |= core << 24;
+ }
+
+ return gcid;
+}
+
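+/*
+ * Expose the global XSCOM lock. The HW822317 workaround requires all
+ * XSCOM traffic to be globally serialised, so a caller that needs to
+ * issue several accesses in one critical section can take the lock
+ * explicitly and use _xscom_read()/_xscom_write() with take_lock set
+ * to false.
+ */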
+void _xscom_lock(void)
+{
+ lock(&xscom_lock);
+}
+
+void _xscom_unlock(void)
+{
+ unlock(&xscom_lock);
+}
+
+/* sorted by the scom controller's partid */
+static LIST_HEAD(scom_list);
+
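+/*
+ * Register an external scom controller (a Centaur memory buffer, for
+ * instance) so that the partid dispatch in _xscom_read()/_xscom_write()
+ * can route accesses to it. The list is kept sorted by part_id, and a
+ * duplicate part_id is refused with OPAL_BUSY. Illustrative sketch
+ * only; the names below are hypothetical:
+ *
+ *	static struct scom_controller my_ctrl = {
+ *		.part_id = 0x80000001,
+ *		.read = my_read,	// called as read(scom, partid, addr, &val)
+ *		.write = my_write,	// called as write(scom, partid, addr, val)
+ *	};
+ *	scom_register(&my_ctrl);
+ */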
+int64_t scom_register(struct scom_controller *new)
+{
+ struct scom_controller *cur;
+
+ list_for_each(&scom_list, cur, link) {
+ if (cur->part_id == new->part_id) {
+ prerror("Attempted to add duplicate scom, partid %x\n",
+ new->part_id);
+ return OPAL_BUSY;
+ }
+
+ if (cur->part_id > new->part_id) {
+ list_add_before(&scom_list, &new->link, &cur->link);
+ return 0;
+ }
+ }
+
+ /* If we never found a larger partid, this one is the largest */
+ list_add_tail(&scom_list, &new->link);
+
+ return 0;
+}
+
+static struct scom_controller *scom_find(uint32_t partid)
+{
+ struct scom_controller *cur;
+
+ list_for_each(&scom_list, cur, link)
+ if (partid == cur->part_id)
+ return cur;
+
+ return NULL;
+}
+
+static int64_t scom_read(struct scom_controller *scom, uint32_t partid,
+ uint64_t pcbaddr, uint64_t *val)
+{
+ int64_t rc = scom->read(scom, partid, pcbaddr, val);
+
+ if (rc) {
+ prerror("%s: to %x off: %llx rc = %lld\n",
+ __func__, partid, pcbaddr, rc);
+ }
+
+ return rc;
+}
+
+static int64_t scom_write(struct scom_controller *scom, uint32_t partid,
+ uint64_t pcbaddr, uint64_t val)
+{
+ int64_t rc = scom->write(scom, partid, pcbaddr, val);
+
+ if (rc) {
+ prerror("%s: to %x off: %llx rc = %lld\n",
+ __func__, partid, pcbaddr, rc);
+ }
+
+ return rc;
+}
+
+/*
+ * External API
+ */
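+/*
+ * partid dispatch: a top nibble of 0 addresses a processor chip
+ * directly, 4 addresses an EX chiplet (P8 only), and anything else is
+ * looked up among the registered scom controllers. Because of
+ * HW822317 the access is done under the global XSCOM lock unless the
+ * caller already holds it (take_lock == false).
+ */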
+int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val, bool take_lock)
+{
+ struct scom_controller *scom;
+ uint32_t gcid;
+ int rc;
+
+ if (!opal_addr_valid(val))
+ return OPAL_PARAMETER;
+
+ /* Due to a bug in some versions of the PRD wrapper app, errors
+ * might not be properly forwarded to PRD, in which case the data
+ * set here will be used. Rather than leaving a random value,
+ * initialize the data to a known clean state.
+ */
+ *val = 0xdeadbeefdeadbeefull;
+
+ /* Handle part ID decoding */
+ switch(partid >> 28) {
+ case 0: /* Normal processor chip */
+ gcid = partid;
+ break;
+ case 4: /* EX chiplet */
+ gcid = xscom_decode_chiplet(partid, &pcb_addr);
+ if (pcb_addr == 0)
+ return OPAL_UNSUPPORTED;
+ break;
+ default:
+ /* is it one of our hacks? */
+ scom = scom_find(partid);
+ if (scom)
+ return scom_read(scom, partid, pcb_addr, val);
+
+ /**
+ * @fwts-label XSCOMReadInvalidPartID
+ * @fwts-advice xscom_read was called with an invalid partid.
+ * There's likely a bug somewhere in the stack that's causing
+ * someone to try an xscom_read on something that isn't a
+ * processor, Centaur or EX chiplet.
+ */
+ prerror("%s: invalid XSCOM partid 0x%x\n", __func__, partid);
+ return OPAL_PARAMETER;
+ }
+
+ /* HW822317 requires us to do global locking */
+ if (take_lock)
+ lock(&xscom_lock);
+
+ /* Direct vs indirect access */
+ if (pcb_addr & XSCOM_ADDR_IND_FLAG)
+ rc = xscom_indirect_read(gcid, pcb_addr, val);
+ else
+ rc = __xscom_read(gcid, pcb_addr & 0x7fffffff, val);
+
+ /* Unlock it */
+ if (take_lock)
+ unlock(&xscom_lock);
+ return rc;
+}
+
+static int64_t opal_xscom_read(uint32_t partid, uint64_t pcb_addr, __be64 *__val)
+{
+ uint64_t val;
+ int64_t rc;
+
+ rc = xscom_read(partid, pcb_addr, &val);
+ *__val = cpu_to_be64(val);
+
+ return rc;
+}
+opal_call(OPAL_XSCOM_READ, opal_xscom_read, 3);
+
+int _xscom_write(uint32_t partid, uint64_t pcb_addr, uint64_t val, bool take_lock)
+{
+ struct scom_controller *scom;
+ uint32_t gcid;
+ int rc;
+
+ /* Handle part ID decoding */
+ switch(partid >> 28) {
+ case 0: /* Normal processor chip */
+ gcid = partid;
+ break;
+ case 4: /* EX chiplet */
+ gcid = xscom_decode_chiplet(partid, &pcb_addr);
+ break;
+ default:
+ /* is it one of our hacks? */
+ scom = scom_find(partid);
+ if (scom)
+ return scom_write(scom, partid, pcb_addr, val);
+
+ /**
+ * @fwts-label XSCOMWriteInvalidPartID
+ * @fwts-advice xscom_write was called with an invalid partid.
+ * There's likely a bug somewhere in the stack that's causing
+ * someone to try an xscom_write on something that isn't a
+ * processor, Centaur or EX chiplet.
+ */
+ prerror("%s: invalid XSCOM partid 0x%x\n", __func__, partid);
+ return OPAL_PARAMETER;
+ }
+
+ /* HW822317 requires us to do global locking */
+ if (take_lock)
+ lock(&xscom_lock);
+
+ /* Direct vs indirect access */
+ if (pcb_addr & XSCOM_ADDR_IND_FLAG)
+ rc = xscom_indirect_write(gcid, pcb_addr, val);
+ else
+ rc = __xscom_write(gcid, pcb_addr & 0x7fffffff, val);
+
+ /* Unlock it */
+ if (take_lock)
+ unlock(&xscom_lock);
+ return rc;
+}
+
+static int64_t opal_xscom_write(uint32_t partid, uint64_t pcb_addr, uint64_t val)
+{
+ return xscom_write(partid, pcb_addr, val);
+}
+opal_call(OPAL_XSCOM_WRITE, opal_xscom_write, 3);
+
+/*
+ * Perform a xscom read-modify-write.
+ */
+int xscom_write_mask(uint32_t partid, uint64_t pcb_addr, uint64_t val, uint64_t mask)
+{
+ int rc;
+ uint64_t old_val;
+
+ rc = xscom_read(partid, pcb_addr, &old_val);
+ if (rc)
+ return rc;
+ val = (old_val & ~mask) | (val & mask);
+ return xscom_write(partid, pcb_addr, val);
+}
+
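+/* Convenience wrappers targeting the chip the calling CPU runs on. */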
+int xscom_readme(uint64_t pcb_addr, uint64_t *val)
+{
+ return xscom_read(this_cpu()->chip_id, pcb_addr, val);
+}
+
+int xscom_writeme(uint64_t pcb_addr, uint64_t val)
+{
+ return xscom_write(this_cpu()->chip_id, pcb_addr, val);
+}
+
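+/*
+ * Read the CFAM chip id from SCOM 0xf000f; the id lives in the top
+ * bits of that register. Simulated chips flagged with QUIRK_NO_F000F
+ * do not implement it, so a plausible hard-coded value is returned
+ * for them instead.
+ */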
+int64_t xscom_read_cfam_chipid(uint32_t partid, uint32_t *chip_id)
+{
+ uint64_t val;
+ int64_t rc = OPAL_SUCCESS;
+
+ /* The Mambo chip model lacks the f000f register, so just make
+ * something up
+ */
+ if (chip_quirk(QUIRK_NO_F000F)) {
+ if (proc_gen == proc_gen_p10)
+ val = 0x120DA04980000000UL; /* P10 DD1.0 */
+ else if (proc_gen == proc_gen_p9)
+ val = 0x203D104980000000UL; /* P9 Nimbus DD2.3 */
+ else
+ val = 0x221EF04980000000UL; /* P8 Murano DD2.1 */
+ } else
+ rc = xscom_read(partid, 0xf000f, &val);
+
+ /* Extract CFAM id */
+ if (rc == OPAL_SUCCESS)
+ *chip_id = (uint32_t)(val >> 44);
+
+ return rc;
+}
+
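+/*
+ * Use the CFAM id to fill in the chip type and EC (DD) level and, on
+ * P9, the sub-revision derived from the ECID bits at SCOM 0x18002.
+ */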
+static void xscom_init_chip_info(struct proc_chip *chip)
+{
+ uint32_t val;
+ int64_t rc;
+
+ rc = xscom_read_cfam_chipid(chip->id, &val);
+ if (rc) {
+ prerror("XSCOM: Error %lld reading 0xf000f register\n", rc);
+ /* We leave chip type to UNKNOWN */
+ return;
+ }
+
+ /* Identify chip */
+ switch(val & 0xff) {
+ case 0xef:
+ chip->type = PROC_CHIP_P8_MURANO;
+ assert(proc_gen == proc_gen_p8);
+ break;
+ case 0xea:
+ chip->type = PROC_CHIP_P8_VENICE;
+ assert(proc_gen == proc_gen_p8);
+ break;
+ case 0xd3:
+ chip->type = PROC_CHIP_P8_NAPLES;
+ assert(proc_gen == proc_gen_p8);
+ break;
+ case 0xd1:
+ chip->type = PROC_CHIP_P9_NIMBUS;
+ assert(proc_gen == proc_gen_p9);
+ break;
+ case 0xd4:
+ chip->type = PROC_CHIP_P9_CUMULUS;
+ assert(proc_gen == proc_gen_p9);
+ break;
+ case 0xd9:
+ chip->type = PROC_CHIP_P9P;
+ assert(proc_gen == proc_gen_p9);
+ break;
+ case 0xda:
+ chip->type = PROC_CHIP_P10;
+ assert(proc_gen == proc_gen_p10);
+ break;
+ default:
+ printf("CHIP: Unknown chip type 0x%02x !!!\n",
+ (unsigned char)(val & 0xff));
+ }
+
+ /* Get EC level from CFAM ID */
+ chip->ec_level = ((val >> 16) & 0xf) << 4;
+ chip->ec_level |= (val >> 8) & 0xf;
+
+ /*
+ * On P9, grab the ECID bits to differentiate
+ * DD1.01, 1.02, 2.00, etc.
+ */
+ if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
+ chip->ec_rev = 0;
+ } else if (proc_gen == proc_gen_p9) {
+ uint64_t ecid2 = 0;
+ uint8_t rev;
+ xscom_read(chip->id, 0x18002, &ecid2);
+ switch((ecid2 >> 45) & 7) {
+ case 0:
+ rev = 0;
+ break;
+ case 1:
+ rev = 1;
+ break;
+ case 3:
+ rev = 2;
+ break;
+ case 7:
+ rev = 3;
+ break;
+ default:
+ rev = 0;
+ }
+ prlog(PR_INFO, "P9 DD%i.%i%d detected\n", 0xf & (chip->ec_level >> 4),
+ chip->ec_level & 0xf, rev);
+ chip->ec_rev = rev;
+ } /* XXX P10 */
+}
+
+/*
+ * Trigger a checkstop (xstop) by writing to the software checkstop
+ * FIR via XSCOM. The machine enters the xstop state once the write
+ * completes.
+ */
+int64_t xscom_trigger_xstop(void)
+{
+ int rc = OPAL_UNSUPPORTED;
+ bool xstop_disabled = false;
+
+ if (nvram_query_eq_dangerous("opal-sw-xstop", "disable"))
+ xstop_disabled = true;
+
+ if (xstop_disabled) {
+ prlog(PR_NOTICE, "Software initiated checkstop disabled.\n");
+ return rc;
+ }
+
+ if (xstop_xscom.addr)
+ rc = xscom_writeme(xstop_xscom.addr,
+ PPC_BIT(xstop_xscom.fir_bit));
+
+ return rc;
+}
+
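+/*
+ * Walk every "ibm,xscom" node in the device tree, record each chip's
+ * XSCOM base address, identify its type and EC level, and pick up the
+ * "ibm,sw-checkstop-fir" property used by xscom_trigger_xstop().
+ */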
+void xscom_init(void)
+{
+ struct dt_node *xn;
+ const struct dt_property *p;
+
+ dt_for_each_compatible(dt_root, xn, "ibm,xscom") {
+ uint32_t gcid = dt_get_chip_id(xn);
+ const struct dt_property *reg;
+ struct proc_chip *chip;
+ const char *chip_name;
+ static const char *chip_names[] = {
+ "UNKNOWN", "P8E", "P8", "P8NVL", "P9N", "P9C", "P9P",
+ "P10",
+ };
+
+ chip = get_chip(gcid);
+ assert(chip);
+
+ /* XXX We need proper address parsing. For now, we just
+ * "know" that we are looking at a u64
+ */
+ reg = dt_find_property(xn, "reg");
+ assert(reg);
+
+ chip->xscom_base = dt_translate_address(xn, 0, NULL);
+
+ /* Grab processor type and EC level */
+ xscom_init_chip_info(chip);
+
+ if (chip->type >= ARRAY_SIZE(chip_names))
+ chip_name = "INVALID";
+ else
+ chip_name = chip_names[chip->type];
+
+ /* We keep a "CHIP" prefix to make the log more user-friendly */
+ prlog(PR_NOTICE, "CHIP: Chip ID %04x type: %s DD%x.%x%d\n",
+ gcid, chip_name, chip->ec_level >> 4,
+ chip->ec_level & 0xf, chip->ec_rev);
+ prlog(PR_DEBUG, "XSCOM: Base address: 0x%llx\n", chip->xscom_base);
+ }
+
+ /* Collect details to trigger xstop via XSCOM write */
+ p = dt_find_property(dt_root, "ibm,sw-checkstop-fir");
+ if (p) {
+ xstop_xscom.addr = dt_property_get_cell(p, 0);
+ xstop_xscom.fir_bit = dt_property_get_cell(p, 1);
+ prlog(PR_DEBUG, "XSTOP: XSCOM addr = 0x%llx, FIR bit = %lld\n",
+ xstop_xscom.addr, xstop_xscom.fir_bit);
+ } else
+ prlog(PR_DEBUG, "XSTOP: ibm,sw-checkstop-fir prop not found\n");
+}
+
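+/*
+ * Called when the console path starts using XSCOM: flag the global
+ * lock as being in the console path, then take and release it once so
+ * that any holder from before the flag was set has drained.
+ */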
+void xscom_used_by_console(void)
+{
+ xscom_lock.in_con_path = true;
+
+ /*
+ * Some other processor might hold it without having
+ * disabled the console locally, so let's make sure that
+ * is over by taking and releasing the lock ourselves.
+ */
+ lock(&xscom_lock);
+ unlock(&xscom_lock);
+}
+
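+/*
+ * True when it is safe for the calling CPU to issue an XSCOM, i.e. it
+ * does not already hold the global XSCOM lock.
+ */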
+bool xscom_ok(void)
+{
+ return !lock_held_by_me(&xscom_lock);
+}